1 #include <Python.h>
2 #include <errcode.h>
3
4 #include "pycore_pyerrors.h" // _PyErr_ProgramDecodedTextObject()
5 #include "lexer/state.h"
6 #include "lexer/lexer.h"
7 #include "pegen.h"
8
9 // TOKENIZER ERRORS
10
11 void
_PyPegen_raise_tokenizer_init_error(PyObject * filename)12 _PyPegen_raise_tokenizer_init_error(PyObject *filename)
13 {
14 if (!(PyErr_ExceptionMatches(PyExc_LookupError)
15 || PyErr_ExceptionMatches(PyExc_SyntaxError)
16 || PyErr_ExceptionMatches(PyExc_ValueError)
17 || PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))) {
18 return;
19 }
20 PyObject *errstr = NULL;
21 PyObject *tuple = NULL;
22 PyObject *type;
23 PyObject *value;
24 PyObject *tback;
25 PyErr_Fetch(&type, &value, &tback);
26 errstr = PyObject_Str(value);
27 if (!errstr) {
28 goto error;
29 }
30
31 PyObject *tmp = Py_BuildValue("(OiiO)", filename, 0, -1, Py_None);
32 if (!tmp) {
33 goto error;
34 }
35
36 tuple = PyTuple_Pack(2, errstr, tmp);
37 Py_DECREF(tmp);
38 if (!value) {
39 goto error;
40 }
41 PyErr_SetObject(PyExc_SyntaxError, tuple);
42
43 error:
44 Py_XDECREF(type);
45 Py_XDECREF(value);
46 Py_XDECREF(tback);
47 Py_XDECREF(errstr);
48 Py_XDECREF(tuple);
49 }
50
51 static inline void
raise_unclosed_parentheses_error(Parser * p)52 raise_unclosed_parentheses_error(Parser *p) {
53 int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
54 int error_col = p->tok->parencolstack[p->tok->level-1];
55 RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError,
56 error_lineno, error_col, error_lineno, -1,
57 "'%c' was never closed",
58 p->tok->parenstack[p->tok->level-1]);
59 }
60
// Translate the tokenizer's status code (p->tok->done) into a raised
// Python exception. Always returns -1 so callers can write
// `return _Pypegen_tokenizer_error(p);`. Sets p->error_indicator.
int
_Pypegen_tokenizer_error(Parser *p)
{
    // If the tokenizer already set an exception, just propagate it.
    if (PyErr_Occurred()) {
        return -1;
    }

    const char *msg = NULL;
    PyObject* errtype = PyExc_SyntaxError;
    Py_ssize_t col_offset = -1;
    p->error_indicator = 1;
    switch (p->tok->done) {
        case E_TOKEN:
            msg = "invalid token";
            break;
        case E_EOF:
            // EOF with open brackets: point at the unclosed bracket
            // rather than the end of input.
            if (p->tok->level) {
                raise_unclosed_parentheses_error(p);
            } else {
                RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
            }
            return -1;
        case E_DEDENT:
            RAISE_INDENTATION_ERROR("unindent does not match any outer indentation level");
            return -1;
        case E_INTR:
            // Interrupted: raise KeyboardInterrupt unless the signal
            // handler already set its own exception.
            if (!PyErr_Occurred()) {
                PyErr_SetNone(PyExc_KeyboardInterrupt);
            }
            return -1;
        case E_NOMEM:
            PyErr_NoMemory();
            return -1;
        case E_TABSPACE:
            errtype = PyExc_TabError;
            msg = "inconsistent use of tabs and spaces in indentation";
            break;
        case E_TOODEEP:
            errtype = PyExc_IndentationError;
            msg = "too many levels of indentation";
            break;
        case E_LINECONT: {
            // Point at the offending character, just before the
            // tokenizer's current position.
            col_offset = p->tok->cur - p->tok->buf - 1;
            msg = "unexpected character after line continuation character";
            break;
        }
        case E_COLUMNOVERFLOW:
            PyErr_SetString(PyExc_OverflowError,
                            "Parser column offset overflow - source line is too big");
            return -1;
        default:
            msg = "unknown parsing error";
    }

    // Fall-through cases: raise `errtype` at the tokenizer's current
    // line, using col_offset when a case computed one (else column 0).
    RAISE_ERROR_KNOWN_LOCATION(p, errtype, p->tok->lineno,
                               col_offset >= 0 ? col_offset : 0,
                               p->tok->lineno, -1, msg);
    return -1;
}
120
121 int
_Pypegen_raise_decode_error(Parser * p)122 _Pypegen_raise_decode_error(Parser *p)
123 {
124 assert(PyErr_Occurred());
125 const char *errtype = NULL;
126 if (PyErr_ExceptionMatches(PyExc_UnicodeError)) {
127 errtype = "unicode error";
128 }
129 else if (PyErr_ExceptionMatches(PyExc_ValueError)) {
130 errtype = "value error";
131 }
132 if (errtype) {
133 PyObject *type;
134 PyObject *value;
135 PyObject *tback;
136 PyObject *errstr;
137 PyErr_Fetch(&type, &value, &tback);
138 errstr = PyObject_Str(value);
139 if (errstr) {
140 RAISE_SYNTAX_ERROR("(%s) %U", errtype, errstr);
141 Py_DECREF(errstr);
142 }
143 else {
144 PyErr_Clear();
145 RAISE_SYNTAX_ERROR("(%s) unknown error", errtype);
146 }
147 Py_XDECREF(type);
148 Py_XDECREF(value);
149 Py_XDECREF(tback);
150 }
151
152 return -1;
153 }
154
// Tokenize the rest of the input looking for tokenizer-level errors
// (e.g. an unclosed bracket) that should take priority over a generic
// SyntaxError already raised by the parser. Returns -1 if it raised a
// better error, 0 otherwise. The currently-set exception is saved and
// restored unless a higher-priority one replaces it.
static int
_PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) {
    // Tokenize the whole input to see if there are any tokenization
    // errors such as mismatching parentheses. These will get priority
    // over generic syntax errors only if the line number of the error is
    // before the one that we had for the generic error.

    // We don't want to tokenize to the end for interactive input
    if (p->tok->prompt != NULL) {
        return 0;
    }

    // Stash the pending exception so tokenization can run cleanly.
    PyObject *type, *value, *traceback;
    PyErr_Fetch(&type, &value, &traceback);

    // Line of the error already reported by the parser; a tokenizer
    // error only wins if it occurred on an earlier line.
    Token *current_token = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1];
    Py_ssize_t current_err_line = current_token->lineno;

    int ret = 0;
    struct token new_token;
    _PyToken_Init(&new_token);

    for (;;) {
        switch (_PyTokenizer_Get(p->tok, &new_token)) {
            case ERRORTOKEN:
                if (PyErr_Occurred()) {
                    ret = -1;
                    goto exit;
                }
                if (p->tok->level != 0) {
                    int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
                    if (current_err_line > error_lineno) {
                        // Unclosed bracket precedes the generic error:
                        // report it instead.
                        raise_unclosed_parentheses_error(p);
                        ret = -1;
                        goto exit;
                    }
                }
                break;
            case ENDMARKER:
                break;
            default:
                continue;
        }
        break;
    }


exit:
    _PyToken_Free(&new_token);
    // If we're in an f-string, we want the syntax error in the expression part
    // to propagate, so that tokenizer errors (like expecting '}') that happen afterwards
    // do not swallow it.
    if (PyErr_Occurred() && p->tok->tok_mode_stack_index <= 0) {
        // A new (higher-priority) exception is set: drop the saved one.
        Py_XDECREF(value);
        Py_XDECREF(type);
        Py_XDECREF(traceback);
    } else {
        // No better error found: restore the original exception.
        PyErr_Restore(type, value, traceback);
    }
    return ret;
}
216
217 // PARSER ERRORS
218
// Raise `errtype` with a printf-style message at the location of the
// current error token. If use_mark is true, the token at p->mark is
// used (filling it first if needed); otherwise the last token read.
// Always returns NULL so callers can `return RAISE_...;` from rules.
void *
_PyPegen_raise_error(Parser *p, PyObject *errtype, int use_mark, const char *errmsg, ...)
{
    // Bail out if we already have an error set.
    if (p->error_indicator && PyErr_Occurred()) {
        return NULL;
    }
    // No tokens read yet: report at position (0, 0).
    if (p->fill == 0) {
        va_list va;
        va_start(va, errmsg);
        _PyPegen_raise_error_known_location(p, errtype, 0, 0, 0, -1, errmsg, va);
        va_end(va);
        return NULL;
    }
    // Ensure the token at p->mark exists before indexing it below.
    if (use_mark && p->mark == p->fill && _PyPegen_fill_token(p) < 0) {
        p->error_indicator = 1;
        return NULL;
    }
    Token *t = p->known_err_token != NULL
                   ? p->known_err_token
                   : p->tokens[use_mark ? p->mark : p->fill - 1];
    Py_ssize_t col_offset;
    Py_ssize_t end_col_offset = -1;
    if (t->col_offset == -1) {
        // Token has no recorded column: derive one from the tokenizer's
        // current buffer position instead.
        if (p->tok->cur == p->tok->buf) {
            col_offset = 0;
        } else {
            const char* start = p->tok->buf ? p->tok->line_start : p->tok->buf;
            col_offset = Py_SAFE_DOWNCAST(p->tok->cur - start, intptr_t, int);
        }
    } else {
        // SyntaxError columns are 1-based; token offsets are 0-based.
        col_offset = t->col_offset + 1;
    }

    if (t->end_col_offset != -1) {
        end_col_offset = t->end_col_offset + 1;
    }

    va_list va;
    va_start(va, errmsg);
    _PyPegen_raise_error_known_location(p, errtype, t->lineno, col_offset, t->end_lineno, end_col_offset, errmsg, va);
    va_end(va);

    return NULL;
}
264
// Return the source text of line `lineno` as a new str object, read
// from the tokenizer's in-memory buffers (interactive buffer or input
// string). Returns NULL with an exception set on decode failure.
static PyObject *
get_error_line_from_tokenizer_buffers(Parser *p, Py_ssize_t lineno)
{
    /* If the file descriptor is interactive, the source lines of the current
     * (multi-line) statement are stored in p->tok->interactive_src_start.
     * If not, we're parsing from a string, which means that the whole source
     * is stored in p->tok->str. */
    assert((p->tok->fp == NULL && p->tok->str != NULL) || p->tok->fp != NULL);

    char *cur_line = p->tok->fp_interactive ? p->tok->interactive_src_start : p->tok->str;
    if (cur_line == NULL) {
        assert(p->tok->fp_interactive);
        // We can reach this point if the tokenizer buffers for interactive source have not been
        // initialized because we failed to decode the original source with the given locale.
        return PyUnicode_FromStringAndSize("", 0);
    }

    // The buffer may not start at line 1 (e.g. compile(..., starting at
    // an offset); translate to a line number relative to the buffer.
    Py_ssize_t relative_lineno = p->starting_lineno ? lineno - p->starting_lineno + 1 : lineno;
    const char* buf_end = p->tok->fp_interactive ? p->tok->interactive_src_end : p->tok->inp;

    // Defensive: if the recorded end precedes the start, fall back to
    // treating the buffer as a NUL-terminated string.
    if (buf_end < cur_line) {
        buf_end = cur_line + strlen(cur_line);
    }

    // Advance to the start of the requested line.
    for (int i = 0; i < relative_lineno - 1; i++) {
        char *new_line = strchr(cur_line, '\n');
        // The assert is here for debug builds but the conditional that
        // follows is there so in release builds we do not crash at the cost
        // to report a potentially wrong line.
        assert(new_line != NULL && new_line + 1 < buf_end);
        if (new_line == NULL || new_line + 1 > buf_end) {
            break;
        }
        cur_line = new_line + 1;
    }

    char *next_newline;
    if ((next_newline = strchr(cur_line, '\n')) == NULL) { // This is the last line
        next_newline = cur_line + strlen(cur_line);
    }
    // "replace" so undecodable bytes never make error reporting fail.
    return PyUnicode_DecodeUTF8(cur_line, next_newline - cur_line, "replace");
}
307
// Raise `errtype` at an explicit (lineno, col_offset)-(end_lineno,
// end_col_offset) range, building the full SyntaxError state tuple
// (msg, (filename, lineno, col, text, end_lineno, end_col)). Offsets
// are byte offsets and are converted to character offsets against the
// recovered source line. CURRENT_POS end coordinates are resolved from
// the tokenizer's current position. Always returns NULL.
void *
_PyPegen_raise_error_known_location(Parser *p, PyObject *errtype,
                                    Py_ssize_t lineno, Py_ssize_t col_offset,
                                    Py_ssize_t end_lineno, Py_ssize_t end_col_offset,
                                    const char *errmsg, va_list va)
{
    // Bail out if we already have an error set.
    if (p->error_indicator && PyErr_Occurred()) {
        return NULL;
    }
    PyObject *value = NULL;
    PyObject *errstr = NULL;
    PyObject *error_line = NULL;
    PyObject *tmp = NULL;
    p->error_indicator = 1;

    if (end_lineno == CURRENT_POS) {
        end_lineno = p->tok->lineno;
    }
    if (end_col_offset == CURRENT_POS) {
        end_col_offset = p->tok->cur - p->tok->line_start;
    }

    errstr = PyUnicode_FromFormatV(errmsg, va);
    if (!errstr) {
        goto error;
    }

    // Recover the offending source line: from the interactive buffer,
    // from the file on disk, or (below) from the tokenizer buffers.
    if (p->tok->fp_interactive && p->tok->interactive_src_start != NULL) {
        error_line = get_error_line_from_tokenizer_buffers(p, lineno);
    }
    else if (p->start_rule == Py_file_input) {
        error_line = _PyErr_ProgramDecodedTextObject(p->tok->filename,
                                                     (int) lineno, p->tok->encoding);
    }

    if (!error_line) {
        /* PyErr_ProgramTextObject was not called or returned NULL. If it was not called,
           then we need to find the error line from some other source, because
           p->start_rule != Py_file_input. If it returned NULL, then it either unexpectedly
           failed or we're parsing from a string or the REPL. There's a third edge case where
           we're actually parsing from a file, which has an E_EOF SyntaxError and in that case
           `PyErr_ProgramTextObject` fails because lineno points to last_file_line + 1, which
           does not physically exist */
        assert(p->tok->fp == NULL || p->tok->fp == stdin || p->tok->done == E_EOF);

        if (p->tok->lineno <= lineno && p->tok->inp > p->tok->buf) {
            // E_EOF case: use the last line held in the line buffer.
            Py_ssize_t size = p->tok->inp - p->tok->buf;
            error_line = PyUnicode_DecodeUTF8(p->tok->buf, size, "replace");
        }
        else if (p->tok->fp == NULL || p->tok->fp == stdin) {
            error_line = get_error_line_from_tokenizer_buffers(p, lineno);
        }
        else {
            // Give up on recovering text; report an empty line.
            error_line = PyUnicode_FromStringAndSize("", 0);
        }
        if (!error_line) {
            goto error;
        }
    }

    Py_ssize_t col_number = col_offset;
    Py_ssize_t end_col_number = end_col_offset;

    // Byte offsets -> character offsets (the line may contain
    // multi-byte UTF-8 sequences).
    col_number = _PyPegen_byte_offset_to_character_offset(error_line, col_offset);
    if (col_number < 0) {
        goto error;
    }

    if (end_col_offset > 0) {
        end_col_number = _PyPegen_byte_offset_to_character_offset(error_line, end_col_offset);
        if (end_col_number < 0) {
            goto error;
        }
    }

    // "N" steals the reference to error_line; tmp owns it from here on.
    tmp = Py_BuildValue("(OnnNnn)", p->tok->filename, lineno, col_number, error_line, end_lineno, end_col_number);
    if (!tmp) {
        goto error;
    }
    value = PyTuple_Pack(2, errstr, tmp);
    Py_DECREF(tmp);
    if (!value) {
        goto error;
    }
    PyErr_SetObject(errtype, value);

    Py_DECREF(errstr);
    Py_DECREF(value);
    return NULL;

error:
    Py_XDECREF(errstr);
    Py_XDECREF(error_line);
    return NULL;
}
404
// Ensure a SyntaxError (or more specific subclass) is set after the
// parser failed, choosing the most informative source: an existing
// exception, a tokenizer condition, or a generic "invalid syntax" at
// `last_token` — then let a full re-tokenization upgrade it if an
// earlier tokenizer error exists.
void
_Pypegen_set_syntax_error(Parser* p, Token* last_token) {
    // Existing syntax error
    if (PyErr_Occurred()) {
        // Prioritize tokenizer errors to custom syntax errors raised
        // on the second phase only if the errors come from the parser.
        int is_tok_ok = (p->tok->done == E_DONE || p->tok->done == E_OK);
        if (is_tok_ok && PyErr_ExceptionMatches(PyExc_SyntaxError)) {
            _PyPegen_tokenize_full_source_to_check_for_errors(p);
        }
        // Propagate the existing syntax error.
        return;
    }
    // Initialization error
    if (p->fill == 0) {
        RAISE_SYNTAX_ERROR("error at start before reading any input");
    }
    // Parser encountered EOF (End of File) unexpectedly
    if (last_token->type == ERRORTOKEN && p->tok->done == E_EOF) {
        if (p->tok->level) {
            raise_unclosed_parentheses_error(p);
        } else {
            RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
        }
        return;
    }
    // Indentation error in the tokenizer
    if (last_token->type == INDENT || last_token->type == DEDENT) {
        RAISE_INDENTATION_ERROR(last_token->type == INDENT ? "unexpected indent" : "unexpected unindent");
        return;
    }
    // Unknown error (generic case)

    // Use the last token we found on the first pass to avoid reporting
    // incorrect locations for generic syntax errors just because we reached
    // further away when trying to find specific syntax errors in the second
    // pass.
    RAISE_SYNTAX_ERROR_KNOWN_LOCATION(last_token, "invalid syntax");
    // _PyPegen_tokenize_full_source_to_check_for_errors will override the existing
    // generic SyntaxError we just raised if errors are found.
    _PyPegen_tokenize_full_source_to_check_for_errors(p);
}
447
// Report that the parser's recursion/stack limit was exceeded: mark the
// parser as failed and set a MemoryError (matching historical behavior
// for overly complex source).
void
_Pypegen_stack_overflow(Parser *p)
{
    p->error_indicator = 1;
    PyErr_SetString(PyExc_MemoryError,
        "Parser stack overflowed - Python source too complex to parse");
}
455