• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 #include <Python.h>
2 #include <errcode.h>
3 
4 #include "pycore_pyerrors.h"      // _PyErr_ProgramDecodedTextObject()
5 #include "lexer/state.h"
6 #include "lexer/lexer.h"
7 #include "pegen.h"
8 
9 // TOKENIZER ERRORS
10 
11 void
_PyPegen_raise_tokenizer_init_error(PyObject * filename)12 _PyPegen_raise_tokenizer_init_error(PyObject *filename)
13 {
14     if (!(PyErr_ExceptionMatches(PyExc_LookupError)
15           || PyErr_ExceptionMatches(PyExc_SyntaxError)
16           || PyErr_ExceptionMatches(PyExc_ValueError)
17           || PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))) {
18         return;
19     }
20     PyObject *errstr = NULL;
21     PyObject *tuple = NULL;
22     PyObject *type;
23     PyObject *value;
24     PyObject *tback;
25     PyErr_Fetch(&type, &value, &tback);
26     errstr = PyObject_Str(value);
27     if (!errstr) {
28         goto error;
29     }
30 
31     PyObject *tmp = Py_BuildValue("(OiiO)", filename, 0, -1, Py_None);
32     if (!tmp) {
33         goto error;
34     }
35 
36     tuple = PyTuple_Pack(2, errstr, tmp);
37     Py_DECREF(tmp);
38     if (!value) {
39         goto error;
40     }
41     PyErr_SetObject(PyExc_SyntaxError, tuple);
42 
43 error:
44     Py_XDECREF(type);
45     Py_XDECREF(value);
46     Py_XDECREF(tback);
47     Py_XDECREF(errstr);
48     Py_XDECREF(tuple);
49 }
50 
51 static inline void
raise_unclosed_parentheses_error(Parser * p)52 raise_unclosed_parentheses_error(Parser *p) {
53        int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
54        int error_col = p->tok->parencolstack[p->tok->level-1];
55        RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError,
56                                   error_lineno, error_col, error_lineno, -1,
57                                   "'%c' was never closed",
58                                   p->tok->parenstack[p->tok->level-1]);
59 }
60 
61 int
_Pypegen_tokenizer_error(Parser * p)62 _Pypegen_tokenizer_error(Parser *p)
63 {
64     if (PyErr_Occurred()) {
65         return -1;
66     }
67 
68     const char *msg = NULL;
69     PyObject* errtype = PyExc_SyntaxError;
70     Py_ssize_t col_offset = -1;
71     p->error_indicator = 1;
72     switch (p->tok->done) {
73         case E_TOKEN:
74             msg = "invalid token";
75             break;
76         case E_EOF:
77             if (p->tok->level) {
78                 raise_unclosed_parentheses_error(p);
79             } else {
80                 RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
81             }
82             return -1;
83         case E_DEDENT:
84             RAISE_INDENTATION_ERROR("unindent does not match any outer indentation level");
85             return -1;
86         case E_INTR:
87             if (!PyErr_Occurred()) {
88                 PyErr_SetNone(PyExc_KeyboardInterrupt);
89             }
90             return -1;
91         case E_NOMEM:
92             PyErr_NoMemory();
93             return -1;
94         case E_TABSPACE:
95             errtype = PyExc_TabError;
96             msg = "inconsistent use of tabs and spaces in indentation";
97             break;
98         case E_TOODEEP:
99             errtype = PyExc_IndentationError;
100             msg = "too many levels of indentation";
101             break;
102         case E_LINECONT: {
103             col_offset = p->tok->cur - p->tok->buf - 1;
104             msg = "unexpected character after line continuation character";
105             break;
106         }
107         case E_COLUMNOVERFLOW:
108             PyErr_SetString(PyExc_OverflowError,
109                     "Parser column offset overflow - source line is too big");
110             return -1;
111         default:
112             msg = "unknown parsing error";
113     }
114 
115     RAISE_ERROR_KNOWN_LOCATION(p, errtype, p->tok->lineno,
116                                col_offset >= 0 ? col_offset : 0,
117                                p->tok->lineno, -1, msg);
118     return -1;
119 }
120 
121 int
_Pypegen_raise_decode_error(Parser * p)122 _Pypegen_raise_decode_error(Parser *p)
123 {
124     assert(PyErr_Occurred());
125     const char *errtype = NULL;
126     if (PyErr_ExceptionMatches(PyExc_UnicodeError)) {
127         errtype = "unicode error";
128     }
129     else if (PyErr_ExceptionMatches(PyExc_ValueError)) {
130         errtype = "value error";
131     }
132     if (errtype) {
133         PyObject *type;
134         PyObject *value;
135         PyObject *tback;
136         PyObject *errstr;
137         PyErr_Fetch(&type, &value, &tback);
138         errstr = PyObject_Str(value);
139         if (errstr) {
140             RAISE_SYNTAX_ERROR("(%s) %U", errtype, errstr);
141             Py_DECREF(errstr);
142         }
143         else {
144             PyErr_Clear();
145             RAISE_SYNTAX_ERROR("(%s) unknown error", errtype);
146         }
147         Py_XDECREF(type);
148         Py_XDECREF(value);
149         Py_XDECREF(tback);
150     }
151 
152     return -1;
153 }
154 
155 static int
_PyPegen_tokenize_full_source_to_check_for_errors(Parser * p)156 _PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) {
157     // Tokenize the whole input to see if there are any tokenization
158     // errors such as mistmatching parentheses. These will get priority
159     // over generic syntax errors only if the line number of the error is
160     // before the one that we had for the generic error.
161 
162     // We don't want to tokenize to the end for interactive input
163     if (p->tok->prompt != NULL) {
164         return 0;
165     }
166 
167     PyObject *type, *value, *traceback;
168     PyErr_Fetch(&type, &value, &traceback);
169 
170     Token *current_token = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1];
171     Py_ssize_t current_err_line = current_token->lineno;
172 
173     int ret = 0;
174     struct token new_token;
175     _PyToken_Init(&new_token);
176 
177     for (;;) {
178         switch (_PyTokenizer_Get(p->tok, &new_token)) {
179             case ERRORTOKEN:
180                 if (PyErr_Occurred()) {
181                     ret = -1;
182                     goto exit;
183                 }
184                 if (p->tok->level != 0) {
185                     int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
186                     if (current_err_line > error_lineno) {
187                         raise_unclosed_parentheses_error(p);
188                         ret = -1;
189                         goto exit;
190                     }
191                 }
192                 break;
193             case ENDMARKER:
194                 break;
195             default:
196                 continue;
197         }
198         break;
199     }
200 
201 
202 exit:
203     _PyToken_Free(&new_token);
204     // If we're in an f-string, we want the syntax error in the expression part
205     // to propagate, so that tokenizer errors (like expecting '}') that happen afterwards
206     // do not swallow it.
207     if (PyErr_Occurred() && p->tok->tok_mode_stack_index <= 0) {
208         Py_XDECREF(value);
209         Py_XDECREF(type);
210         Py_XDECREF(traceback);
211     } else {
212         PyErr_Restore(type, value, traceback);
213     }
214     return ret;
215 }
216 
217 // PARSER ERRORS
218 
219 void *
_PyPegen_raise_error(Parser * p,PyObject * errtype,int use_mark,const char * errmsg,...)220 _PyPegen_raise_error(Parser *p, PyObject *errtype, int use_mark, const char *errmsg, ...)
221 {
222     // Bail out if we already have an error set.
223     if (p->error_indicator && PyErr_Occurred()) {
224         return NULL;
225     }
226     if (p->fill == 0) {
227         va_list va;
228         va_start(va, errmsg);
229         _PyPegen_raise_error_known_location(p, errtype, 0, 0, 0, -1, errmsg, va);
230         va_end(va);
231         return NULL;
232     }
233     if (use_mark && p->mark == p->fill && _PyPegen_fill_token(p) < 0) {
234         p->error_indicator = 1;
235         return NULL;
236     }
237     Token *t = p->known_err_token != NULL
238                    ? p->known_err_token
239                    : p->tokens[use_mark ? p->mark : p->fill - 1];
240     Py_ssize_t col_offset;
241     Py_ssize_t end_col_offset = -1;
242     if (t->col_offset == -1) {
243         if (p->tok->cur == p->tok->buf) {
244             col_offset = 0;
245         } else {
246             const char* start = p->tok->buf  ? p->tok->line_start : p->tok->buf;
247             col_offset = Py_SAFE_DOWNCAST(p->tok->cur - start, intptr_t, int);
248         }
249     } else {
250         col_offset = t->col_offset + 1;
251     }
252 
253     if (t->end_col_offset != -1) {
254         end_col_offset = t->end_col_offset + 1;
255     }
256 
257     va_list va;
258     va_start(va, errmsg);
259     _PyPegen_raise_error_known_location(p, errtype, t->lineno, col_offset, t->end_lineno, end_col_offset, errmsg, va);
260     va_end(va);
261 
262     return NULL;
263 }
264 
265 static PyObject *
get_error_line_from_tokenizer_buffers(Parser * p,Py_ssize_t lineno)266 get_error_line_from_tokenizer_buffers(Parser *p, Py_ssize_t lineno)
267 {
268     /* If the file descriptor is interactive, the source lines of the current
269      * (multi-line) statement are stored in p->tok->interactive_src_start.
270      * If not, we're parsing from a string, which means that the whole source
271      * is stored in p->tok->str. */
272     assert((p->tok->fp == NULL && p->tok->str != NULL) || p->tok->fp != NULL);
273 
274     char *cur_line = p->tok->fp_interactive ? p->tok->interactive_src_start : p->tok->str;
275     if (cur_line == NULL) {
276         assert(p->tok->fp_interactive);
277         // We can reach this point if the tokenizer buffers for interactive source have not been
278         // initialized because we failed to decode the original source with the given locale.
279         return PyUnicode_FromStringAndSize("", 0);
280     }
281 
282     Py_ssize_t relative_lineno = p->starting_lineno ? lineno - p->starting_lineno + 1 : lineno;
283     const char* buf_end = p->tok->fp_interactive ? p->tok->interactive_src_end : p->tok->inp;
284 
285     if (buf_end < cur_line) {
286         buf_end = cur_line + strlen(cur_line);
287     }
288 
289     for (int i = 0; i < relative_lineno - 1; i++) {
290         char *new_line = strchr(cur_line, '\n');
291         // The assert is here for debug builds but the conditional that
292         // follows is there so in release builds we do not crash at the cost
293         // to report a potentially wrong line.
294         assert(new_line != NULL && new_line + 1 < buf_end);
295         if (new_line == NULL || new_line + 1 > buf_end) {
296             break;
297         }
298         cur_line = new_line + 1;
299     }
300 
301     char *next_newline;
302     if ((next_newline = strchr(cur_line, '\n')) == NULL) { // This is the last line
303         next_newline = cur_line + strlen(cur_line);
304     }
305     return PyUnicode_DecodeUTF8(cur_line, next_newline - cur_line, "replace");
306 }
307 
308 void *
_PyPegen_raise_error_known_location(Parser * p,PyObject * errtype,Py_ssize_t lineno,Py_ssize_t col_offset,Py_ssize_t end_lineno,Py_ssize_t end_col_offset,const char * errmsg,va_list va)309 _PyPegen_raise_error_known_location(Parser *p, PyObject *errtype,
310                                     Py_ssize_t lineno, Py_ssize_t col_offset,
311                                     Py_ssize_t end_lineno, Py_ssize_t end_col_offset,
312                                     const char *errmsg, va_list va)
313 {
314     // Bail out if we already have an error set.
315     if (p->error_indicator && PyErr_Occurred()) {
316         return NULL;
317     }
318     PyObject *value = NULL;
319     PyObject *errstr = NULL;
320     PyObject *error_line = NULL;
321     PyObject *tmp = NULL;
322     p->error_indicator = 1;
323 
324     if (end_lineno == CURRENT_POS) {
325         end_lineno = p->tok->lineno;
326     }
327     if (end_col_offset == CURRENT_POS) {
328         end_col_offset = p->tok->cur - p->tok->line_start;
329     }
330 
331     errstr = PyUnicode_FromFormatV(errmsg, va);
332     if (!errstr) {
333         goto error;
334     }
335 
336     if (p->tok->fp_interactive && p->tok->interactive_src_start != NULL) {
337         error_line = get_error_line_from_tokenizer_buffers(p, lineno);
338     }
339     else if (p->start_rule == Py_file_input) {
340         error_line = _PyErr_ProgramDecodedTextObject(p->tok->filename,
341                                                      (int) lineno, p->tok->encoding);
342     }
343 
344     if (!error_line) {
345         /* PyErr_ProgramTextObject was not called or returned NULL. If it was not called,
346            then we need to find the error line from some other source, because
347            p->start_rule != Py_file_input. If it returned NULL, then it either unexpectedly
348            failed or we're parsing from a string or the REPL. There's a third edge case where
349            we're actually parsing from a file, which has an E_EOF SyntaxError and in that case
350            `PyErr_ProgramTextObject` fails because lineno points to last_file_line + 1, which
351            does not physically exist */
352         assert(p->tok->fp == NULL || p->tok->fp == stdin || p->tok->done == E_EOF);
353 
354         if (p->tok->lineno <= lineno && p->tok->inp > p->tok->buf) {
355             Py_ssize_t size = p->tok->inp - p->tok->buf;
356             error_line = PyUnicode_DecodeUTF8(p->tok->buf, size, "replace");
357         }
358         else if (p->tok->fp == NULL || p->tok->fp == stdin) {
359             error_line = get_error_line_from_tokenizer_buffers(p, lineno);
360         }
361         else {
362             error_line = PyUnicode_FromStringAndSize("", 0);
363         }
364         if (!error_line) {
365             goto error;
366         }
367     }
368 
369     Py_ssize_t col_number = col_offset;
370     Py_ssize_t end_col_number = end_col_offset;
371 
372     col_number = _PyPegen_byte_offset_to_character_offset(error_line, col_offset);
373     if (col_number < 0) {
374         goto error;
375     }
376 
377     if (end_col_offset > 0) {
378         end_col_number = _PyPegen_byte_offset_to_character_offset(error_line, end_col_offset);
379         if (end_col_number < 0) {
380             goto error;
381         }
382     }
383 
384     tmp = Py_BuildValue("(OnnNnn)", p->tok->filename, lineno, col_number, error_line, end_lineno, end_col_number);
385     if (!tmp) {
386         goto error;
387     }
388     value = PyTuple_Pack(2, errstr, tmp);
389     Py_DECREF(tmp);
390     if (!value) {
391         goto error;
392     }
393     PyErr_SetObject(errtype, value);
394 
395     Py_DECREF(errstr);
396     Py_DECREF(value);
397     return NULL;
398 
399 error:
400     Py_XDECREF(errstr);
401     Py_XDECREF(error_line);
402     return NULL;
403 }
404 
405 void
_Pypegen_set_syntax_error(Parser * p,Token * last_token)406 _Pypegen_set_syntax_error(Parser* p, Token* last_token) {
407     // Existing syntax error
408     if (PyErr_Occurred()) {
409         // Prioritize tokenizer errors to custom syntax errors raised
410         // on the second phase only if the errors come from the parser.
411         int is_tok_ok = (p->tok->done == E_DONE || p->tok->done == E_OK);
412         if (is_tok_ok && PyErr_ExceptionMatches(PyExc_SyntaxError)) {
413             _PyPegen_tokenize_full_source_to_check_for_errors(p);
414         }
415         // Propagate the existing syntax error.
416         return;
417     }
418     // Initialization error
419     if (p->fill == 0) {
420         RAISE_SYNTAX_ERROR("error at start before reading any input");
421     }
422     // Parser encountered EOF (End of File) unexpectedtly
423     if (last_token->type == ERRORTOKEN && p->tok->done == E_EOF) {
424         if (p->tok->level) {
425             raise_unclosed_parentheses_error(p);
426         } else {
427             RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
428         }
429         return;
430     }
431     // Indentation error in the tokenizer
432     if (last_token->type == INDENT || last_token->type == DEDENT) {
433         RAISE_INDENTATION_ERROR(last_token->type == INDENT ? "unexpected indent" : "unexpected unindent");
434         return;
435     }
436     // Unknown error (generic case)
437 
438     // Use the last token we found on the first pass to avoid reporting
439     // incorrect locations for generic syntax errors just because we reached
440     // further away when trying to find specific syntax errors in the second
441     // pass.
442     RAISE_SYNTAX_ERROR_KNOWN_LOCATION(last_token, "invalid syntax");
443     // _PyPegen_tokenize_full_source_to_check_for_errors will override the existing
444     // generic SyntaxError we just raised if errors are found.
445     _PyPegen_tokenize_full_source_to_check_for_errors(p);
446 }
447 
448 void
_Pypegen_stack_overflow(Parser * p)449 _Pypegen_stack_overflow(Parser *p)
450 {
451     p->error_indicator = 1;
452     PyErr_SetString(PyExc_MemoryError,
453         "Parser stack overflowed - Python source too complex to parse");
454 }
455