• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 #include <Python.h>
2 #include <errcode.h>
3 
4 #include "tokenizer.h"
5 #include "pegen.h"
6 
// TOKENIZER ERRORS
8 
9 void
_PyPegen_raise_tokenizer_init_error(PyObject * filename)10 _PyPegen_raise_tokenizer_init_error(PyObject *filename)
11 {
12     if (!(PyErr_ExceptionMatches(PyExc_LookupError)
13           || PyErr_ExceptionMatches(PyExc_SyntaxError)
14           || PyErr_ExceptionMatches(PyExc_ValueError)
15           || PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))) {
16         return;
17     }
18     PyObject *errstr = NULL;
19     PyObject *tuple = NULL;
20     PyObject *type;
21     PyObject *value;
22     PyObject *tback;
23     PyErr_Fetch(&type, &value, &tback);
24     errstr = PyObject_Str(value);
25     if (!errstr) {
26         goto error;
27     }
28 
29     PyObject *tmp = Py_BuildValue("(OiiO)", filename, 0, -1, Py_None);
30     if (!tmp) {
31         goto error;
32     }
33 
34     tuple = PyTuple_Pack(2, errstr, tmp);
35     Py_DECREF(tmp);
36     if (!value) {
37         goto error;
38     }
39     PyErr_SetObject(PyExc_SyntaxError, tuple);
40 
41 error:
42     Py_XDECREF(type);
43     Py_XDECREF(value);
44     Py_XDECREF(tback);
45     Py_XDECREF(errstr);
46     Py_XDECREF(tuple);
47 }
48 
49 static inline void
raise_unclosed_parentheses_error(Parser * p)50 raise_unclosed_parentheses_error(Parser *p) {
51        int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
52        int error_col = p->tok->parencolstack[p->tok->level-1];
53        RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError,
54                                   error_lineno, error_col, error_lineno, -1,
55                                   "'%c' was never closed",
56                                   p->tok->parenstack[p->tok->level-1]);
57 }
58 
59 int
_Pypegen_tokenizer_error(Parser * p)60 _Pypegen_tokenizer_error(Parser *p)
61 {
62     if (PyErr_Occurred()) {
63         return -1;
64     }
65 
66     const char *msg = NULL;
67     PyObject* errtype = PyExc_SyntaxError;
68     Py_ssize_t col_offset = -1;
69     switch (p->tok->done) {
70         case E_TOKEN:
71             msg = "invalid token";
72             break;
73         case E_EOF:
74             if (p->tok->level) {
75                 raise_unclosed_parentheses_error(p);
76             } else {
77                 RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
78             }
79             return -1;
80         case E_DEDENT:
81             RAISE_INDENTATION_ERROR("unindent does not match any outer indentation level");
82             return -1;
83         case E_INTR:
84             if (!PyErr_Occurred()) {
85                 PyErr_SetNone(PyExc_KeyboardInterrupt);
86             }
87             return -1;
88         case E_NOMEM:
89             PyErr_NoMemory();
90             return -1;
91         case E_TABSPACE:
92             errtype = PyExc_TabError;
93             msg = "inconsistent use of tabs and spaces in indentation";
94             break;
95         case E_TOODEEP:
96             errtype = PyExc_IndentationError;
97             msg = "too many levels of indentation";
98             break;
99         case E_LINECONT: {
100             col_offset = p->tok->cur - p->tok->buf - 1;
101             msg = "unexpected character after line continuation character";
102             break;
103         }
104         default:
105             msg = "unknown parsing error";
106     }
107 
108     RAISE_ERROR_KNOWN_LOCATION(p, errtype, p->tok->lineno,
109                                col_offset >= 0 ? col_offset : 0,
110                                p->tok->lineno, -1, msg);
111     return -1;
112 }
113 
114 int
_Pypegen_raise_decode_error(Parser * p)115 _Pypegen_raise_decode_error(Parser *p)
116 {
117     assert(PyErr_Occurred());
118     const char *errtype = NULL;
119     if (PyErr_ExceptionMatches(PyExc_UnicodeError)) {
120         errtype = "unicode error";
121     }
122     else if (PyErr_ExceptionMatches(PyExc_ValueError)) {
123         errtype = "value error";
124     }
125     if (errtype) {
126         PyObject *type;
127         PyObject *value;
128         PyObject *tback;
129         PyObject *errstr;
130         PyErr_Fetch(&type, &value, &tback);
131         errstr = PyObject_Str(value);
132         if (errstr) {
133             RAISE_SYNTAX_ERROR("(%s) %U", errtype, errstr);
134             Py_DECREF(errstr);
135         }
136         else {
137             PyErr_Clear();
138             RAISE_SYNTAX_ERROR("(%s) unknown error", errtype);
139         }
140         Py_XDECREF(type);
141         Py_XDECREF(value);
142         Py_XDECREF(tback);
143     }
144 
145     return -1;
146 }
147 
148 static int
_PyPegen_tokenize_full_source_to_check_for_errors(Parser * p)149 _PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) {
150     // Tokenize the whole input to see if there are any tokenization
151     // errors such as mistmatching parentheses. These will get priority
152     // over generic syntax errors only if the line number of the error is
153     // before the one that we had for the generic error.
154 
155     // We don't want to tokenize to the end for interactive input
156     if (p->tok->prompt != NULL) {
157         return 0;
158     }
159 
160     PyObject *type, *value, *traceback;
161     PyErr_Fetch(&type, &value, &traceback);
162 
163     Token *current_token = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1];
164     Py_ssize_t current_err_line = current_token->lineno;
165 
166     int ret = 0;
167 
168     for (;;) {
169         const char *start;
170         const char *end;
171         switch (_PyTokenizer_Get(p->tok, &start, &end)) {
172             case ERRORTOKEN:
173                 if (p->tok->level != 0) {
174                     int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
175                     if (current_err_line > error_lineno) {
176                         raise_unclosed_parentheses_error(p);
177                         ret = -1;
178                         goto exit;
179                     }
180                 }
181                 break;
182             case ENDMARKER:
183                 break;
184             default:
185                 continue;
186         }
187         break;
188     }
189 
190 
191 exit:
192     if (PyErr_Occurred()) {
193         Py_XDECREF(value);
194         Py_XDECREF(type);
195         Py_XDECREF(traceback);
196     } else {
197         PyErr_Restore(type, value, traceback);
198     }
199     return ret;
200 }
201 
// PARSER ERRORS
203 
204 void *
_PyPegen_raise_error(Parser * p,PyObject * errtype,const char * errmsg,...)205 _PyPegen_raise_error(Parser *p, PyObject *errtype, const char *errmsg, ...)
206 {
207     if (p->fill == 0) {
208         va_list va;
209         va_start(va, errmsg);
210         _PyPegen_raise_error_known_location(p, errtype, 0, 0, 0, -1, errmsg, va);
211         va_end(va);
212         return NULL;
213     }
214 
215     Token *t = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1];
216     Py_ssize_t col_offset;
217     Py_ssize_t end_col_offset = -1;
218     if (t->col_offset == -1) {
219         if (p->tok->cur == p->tok->buf) {
220             col_offset = 0;
221         } else {
222             const char* start = p->tok->buf  ? p->tok->line_start : p->tok->buf;
223             col_offset = Py_SAFE_DOWNCAST(p->tok->cur - start, intptr_t, int);
224         }
225     } else {
226         col_offset = t->col_offset + 1;
227     }
228 
229     if (t->end_col_offset != -1) {
230         end_col_offset = t->end_col_offset + 1;
231     }
232 
233     va_list va;
234     va_start(va, errmsg);
235     _PyPegen_raise_error_known_location(p, errtype, t->lineno, col_offset, t->end_lineno, end_col_offset, errmsg, va);
236     va_end(va);
237 
238     return NULL;
239 }
240 
241 static PyObject *
get_error_line_from_tokenizer_buffers(Parser * p,Py_ssize_t lineno)242 get_error_line_from_tokenizer_buffers(Parser *p, Py_ssize_t lineno)
243 {
244     /* If the file descriptor is interactive, the source lines of the current
245      * (multi-line) statement are stored in p->tok->interactive_src_start.
246      * If not, we're parsing from a string, which means that the whole source
247      * is stored in p->tok->str. */
248     assert((p->tok->fp == NULL && p->tok->str != NULL) || p->tok->fp == stdin);
249 
250     char *cur_line = p->tok->fp_interactive ? p->tok->interactive_src_start : p->tok->str;
251     assert(cur_line != NULL);
252 
253     for (int i = 0; i < lineno - 1; i++) {
254         cur_line = strchr(cur_line, '\n') + 1;
255     }
256 
257     char *next_newline;
258     if ((next_newline = strchr(cur_line, '\n')) == NULL) { // This is the last line
259         next_newline = cur_line + strlen(cur_line);
260     }
261     return PyUnicode_DecodeUTF8(cur_line, next_newline - cur_line, "replace");
262 }
263 
264 void *
_PyPegen_raise_error_known_location(Parser * p,PyObject * errtype,Py_ssize_t lineno,Py_ssize_t col_offset,Py_ssize_t end_lineno,Py_ssize_t end_col_offset,const char * errmsg,va_list va)265 _PyPegen_raise_error_known_location(Parser *p, PyObject *errtype,
266                                     Py_ssize_t lineno, Py_ssize_t col_offset,
267                                     Py_ssize_t end_lineno, Py_ssize_t end_col_offset,
268                                     const char *errmsg, va_list va)
269 {
270     PyObject *value = NULL;
271     PyObject *errstr = NULL;
272     PyObject *error_line = NULL;
273     PyObject *tmp = NULL;
274     p->error_indicator = 1;
275 
276     if (end_lineno == CURRENT_POS) {
277         end_lineno = p->tok->lineno;
278     }
279     if (end_col_offset == CURRENT_POS) {
280         end_col_offset = p->tok->cur - p->tok->line_start;
281     }
282 
283     if (p->start_rule == Py_fstring_input) {
284         const char *fstring_msg = "f-string: ";
285         Py_ssize_t len = strlen(fstring_msg) + strlen(errmsg);
286 
287         char *new_errmsg = PyMem_Malloc(len + 1); // Lengths of both strings plus NULL character
288         if (!new_errmsg) {
289             return (void *) PyErr_NoMemory();
290         }
291 
292         // Copy both strings into new buffer
293         memcpy(new_errmsg, fstring_msg, strlen(fstring_msg));
294         memcpy(new_errmsg + strlen(fstring_msg), errmsg, strlen(errmsg));
295         new_errmsg[len] = 0;
296         errmsg = new_errmsg;
297     }
298     errstr = PyUnicode_FromFormatV(errmsg, va);
299     if (!errstr) {
300         goto error;
301     }
302 
303     if (p->tok->fp_interactive) {
304         error_line = get_error_line_from_tokenizer_buffers(p, lineno);
305     }
306     else if (p->start_rule == Py_file_input) {
307         error_line = _PyErr_ProgramDecodedTextObject(p->tok->filename,
308                                                      (int) lineno, p->tok->encoding);
309     }
310 
311     if (!error_line) {
312         /* PyErr_ProgramTextObject was not called or returned NULL. If it was not called,
313            then we need to find the error line from some other source, because
314            p->start_rule != Py_file_input. If it returned NULL, then it either unexpectedly
315            failed or we're parsing from a string or the REPL. There's a third edge case where
316            we're actually parsing from a file, which has an E_EOF SyntaxError and in that case
317            `PyErr_ProgramTextObject` fails because lineno points to last_file_line + 1, which
318            does not physically exist */
319         assert(p->tok->fp == NULL || p->tok->fp == stdin || p->tok->done == E_EOF);
320 
321         if (p->tok->lineno <= lineno && p->tok->inp > p->tok->buf) {
322             Py_ssize_t size = p->tok->inp - p->tok->buf;
323             error_line = PyUnicode_DecodeUTF8(p->tok->buf, size, "replace");
324         }
325         else if (p->tok->fp == NULL || p->tok->fp == stdin) {
326             error_line = get_error_line_from_tokenizer_buffers(p, lineno);
327         }
328         else {
329             error_line = PyUnicode_FromStringAndSize("", 0);
330         }
331         if (!error_line) {
332             goto error;
333         }
334     }
335 
336     if (p->start_rule == Py_fstring_input) {
337         col_offset -= p->starting_col_offset;
338         end_col_offset -= p->starting_col_offset;
339     }
340 
341     Py_ssize_t col_number = col_offset;
342     Py_ssize_t end_col_number = end_col_offset;
343 
344     if (p->tok->encoding != NULL) {
345         col_number = _PyPegen_byte_offset_to_character_offset(error_line, col_offset);
346         if (col_number < 0) {
347             goto error;
348         }
349         if (end_col_number > 0) {
350             Py_ssize_t end_col_offset = _PyPegen_byte_offset_to_character_offset(error_line, end_col_number);
351             if (end_col_offset < 0) {
352                 goto error;
353             } else {
354                 end_col_number = end_col_offset;
355             }
356         }
357     }
358     tmp = Py_BuildValue("(OiiNii)", p->tok->filename, lineno, col_number, error_line, end_lineno, end_col_number);
359     if (!tmp) {
360         goto error;
361     }
362     value = PyTuple_Pack(2, errstr, tmp);
363     Py_DECREF(tmp);
364     if (!value) {
365         goto error;
366     }
367     PyErr_SetObject(errtype, value);
368 
369     Py_DECREF(errstr);
370     Py_DECREF(value);
371     if (p->start_rule == Py_fstring_input) {
372         PyMem_Free((void *)errmsg);
373     }
374     return NULL;
375 
376 error:
377     Py_XDECREF(errstr);
378     Py_XDECREF(error_line);
379     if (p->start_rule == Py_fstring_input) {
380         PyMem_Free((void *)errmsg);
381     }
382     return NULL;
383 }
384 
385 void
_Pypegen_set_syntax_error(Parser * p,Token * last_token)386 _Pypegen_set_syntax_error(Parser* p, Token* last_token) {
387     // Existing sintax error
388     if (PyErr_Occurred()) {
389         // Prioritize tokenizer errors to custom syntax errors raised
390         // on the second phase only if the errors come from the parser.
391         if (p->tok->done == E_DONE && PyErr_ExceptionMatches(PyExc_SyntaxError)) {
392             _PyPegen_tokenize_full_source_to_check_for_errors(p);
393         }
394         // Propagate the existing syntax error.
395         return;
396     }
397     // Initialization error
398     if (p->fill == 0) {
399         RAISE_SYNTAX_ERROR("error at start before reading any input");
400     }
401     // Parser encountered EOF (End of File) unexpectedtly
402     if (last_token->type == ERRORTOKEN && p->tok->done == E_EOF) {
403         if (p->tok->level) {
404             raise_unclosed_parentheses_error(p);
405         } else {
406             RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
407         }
408         return;
409     }
410     // Indentation error in the tokenizer
411     if (last_token->type == INDENT || last_token->type == DEDENT) {
412         RAISE_INDENTATION_ERROR(last_token->type == INDENT ? "unexpected indent" : "unexpected unindent");
413         return;
414     }
415     // Unknown error (generic case)
416 
417     // Use the last token we found on the first pass to avoid reporting
418     // incorrect locations for generic syntax errors just because we reached
419     // further away when trying to find specific syntax errors in the second
420     // pass.
421     RAISE_SYNTAX_ERROR_KNOWN_LOCATION(last_token, "invalid syntax");
422     // _PyPegen_tokenize_full_source_to_check_for_errors will override the existing
423     // generic SyntaxError we just raised if errors are found.
424     _PyPegen_tokenize_full_source_to_check_for_errors(p);
425 }
426