• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1  #include <Python.h>
2  #include <errcode.h>
3  
4  #include "tokenizer.h"
5  #include "pegen.h"
6  
7  // TOKENIZER ERRORS
8  
9  void
_PyPegen_raise_tokenizer_init_error(PyObject * filename)10  _PyPegen_raise_tokenizer_init_error(PyObject *filename)
11  {
12      if (!(PyErr_ExceptionMatches(PyExc_LookupError)
13            || PyErr_ExceptionMatches(PyExc_SyntaxError)
14            || PyErr_ExceptionMatches(PyExc_ValueError)
15            || PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))) {
16          return;
17      }
18      PyObject *errstr = NULL;
19      PyObject *tuple = NULL;
20      PyObject *type;
21      PyObject *value;
22      PyObject *tback;
23      PyErr_Fetch(&type, &value, &tback);
24      errstr = PyObject_Str(value);
25      if (!errstr) {
26          goto error;
27      }
28  
29      PyObject *tmp = Py_BuildValue("(OiiO)", filename, 0, -1, Py_None);
30      if (!tmp) {
31          goto error;
32      }
33  
34      tuple = PyTuple_Pack(2, errstr, tmp);
35      Py_DECREF(tmp);
36      if (!value) {
37          goto error;
38      }
39      PyErr_SetObject(PyExc_SyntaxError, tuple);
40  
41  error:
42      Py_XDECREF(type);
43      Py_XDECREF(value);
44      Py_XDECREF(tback);
45      Py_XDECREF(errstr);
46      Py_XDECREF(tuple);
47  }
48  
49  static inline void
raise_unclosed_parentheses_error(Parser * p)50  raise_unclosed_parentheses_error(Parser *p) {
51         int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
52         int error_col = p->tok->parencolstack[p->tok->level-1];
53         RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError,
54                                    error_lineno, error_col, error_lineno, -1,
55                                    "'%c' was never closed",
56                                    p->tok->parenstack[p->tok->level-1]);
57  }
58  
59  int
_Pypegen_tokenizer_error(Parser * p)60  _Pypegen_tokenizer_error(Parser *p)
61  {
62      if (PyErr_Occurred()) {
63          return -1;
64      }
65  
66      const char *msg = NULL;
67      PyObject* errtype = PyExc_SyntaxError;
68      Py_ssize_t col_offset = -1;
69      switch (p->tok->done) {
70          case E_TOKEN:
71              msg = "invalid token";
72              break;
73          case E_EOF:
74              if (p->tok->level) {
75                  raise_unclosed_parentheses_error(p);
76              } else {
77                  RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
78              }
79              return -1;
80          case E_DEDENT:
81              RAISE_INDENTATION_ERROR("unindent does not match any outer indentation level");
82              return -1;
83          case E_INTR:
84              if (!PyErr_Occurred()) {
85                  PyErr_SetNone(PyExc_KeyboardInterrupt);
86              }
87              return -1;
88          case E_NOMEM:
89              PyErr_NoMemory();
90              return -1;
91          case E_TABSPACE:
92              errtype = PyExc_TabError;
93              msg = "inconsistent use of tabs and spaces in indentation";
94              break;
95          case E_TOODEEP:
96              errtype = PyExc_IndentationError;
97              msg = "too many levels of indentation";
98              break;
99          case E_LINECONT: {
100              col_offset = p->tok->cur - p->tok->buf - 1;
101              msg = "unexpected character after line continuation character";
102              break;
103          }
104          default:
105              msg = "unknown parsing error";
106      }
107  
108      RAISE_ERROR_KNOWN_LOCATION(p, errtype, p->tok->lineno,
109                                 col_offset >= 0 ? col_offset : 0,
110                                 p->tok->lineno, -1, msg);
111      return -1;
112  }
113  
114  int
_Pypegen_raise_decode_error(Parser * p)115  _Pypegen_raise_decode_error(Parser *p)
116  {
117      assert(PyErr_Occurred());
118      const char *errtype = NULL;
119      if (PyErr_ExceptionMatches(PyExc_UnicodeError)) {
120          errtype = "unicode error";
121      }
122      else if (PyErr_ExceptionMatches(PyExc_ValueError)) {
123          errtype = "value error";
124      }
125      if (errtype) {
126          PyObject *type;
127          PyObject *value;
128          PyObject *tback;
129          PyObject *errstr;
130          PyErr_Fetch(&type, &value, &tback);
131          errstr = PyObject_Str(value);
132          if (errstr) {
133              RAISE_SYNTAX_ERROR("(%s) %U", errtype, errstr);
134              Py_DECREF(errstr);
135          }
136          else {
137              PyErr_Clear();
138              RAISE_SYNTAX_ERROR("(%s) unknown error", errtype);
139          }
140          Py_XDECREF(type);
141          Py_XDECREF(value);
142          Py_XDECREF(tback);
143      }
144  
145      return -1;
146  }
147  
148  static int
_PyPegen_tokenize_full_source_to_check_for_errors(Parser * p)149  _PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) {
150      // Tokenize the whole input to see if there are any tokenization
151      // errors such as mistmatching parentheses. These will get priority
152      // over generic syntax errors only if the line number of the error is
153      // before the one that we had for the generic error.
154  
155      // We don't want to tokenize to the end for interactive input
156      if (p->tok->prompt != NULL) {
157          return 0;
158      }
159  
160      PyObject *type, *value, *traceback;
161      PyErr_Fetch(&type, &value, &traceback);
162  
163      Token *current_token = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1];
164      Py_ssize_t current_err_line = current_token->lineno;
165  
166      int ret = 0;
167  
168      for (;;) {
169          const char *start;
170          const char *end;
171          switch (_PyTokenizer_Get(p->tok, &start, &end)) {
172              case ERRORTOKEN:
173                  if (p->tok->level != 0) {
174                      int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
175                      if (current_err_line > error_lineno) {
176                          raise_unclosed_parentheses_error(p);
177                          ret = -1;
178                          goto exit;
179                      }
180                  }
181                  break;
182              case ENDMARKER:
183                  break;
184              default:
185                  continue;
186          }
187          break;
188      }
189  
190  
191  exit:
192      if (PyErr_Occurred()) {
193          Py_XDECREF(value);
194          Py_XDECREF(type);
195          Py_XDECREF(traceback);
196      } else {
197          PyErr_Restore(type, value, traceback);
198      }
199      return ret;
200  }
201  
202  // PARSER ERRORS
203  
204  void *
_PyPegen_raise_error(Parser * p,PyObject * errtype,const char * errmsg,...)205  _PyPegen_raise_error(Parser *p, PyObject *errtype, const char *errmsg, ...)
206  {
207      if (p->fill == 0) {
208          va_list va;
209          va_start(va, errmsg);
210          _PyPegen_raise_error_known_location(p, errtype, 0, 0, 0, -1, errmsg, va);
211          va_end(va);
212          return NULL;
213      }
214  
215      Token *t = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1];
216      Py_ssize_t col_offset;
217      Py_ssize_t end_col_offset = -1;
218      if (t->col_offset == -1) {
219          if (p->tok->cur == p->tok->buf) {
220              col_offset = 0;
221          } else {
222              const char* start = p->tok->buf  ? p->tok->line_start : p->tok->buf;
223              col_offset = Py_SAFE_DOWNCAST(p->tok->cur - start, intptr_t, int);
224          }
225      } else {
226          col_offset = t->col_offset + 1;
227      }
228  
229      if (t->end_col_offset != -1) {
230          end_col_offset = t->end_col_offset + 1;
231      }
232  
233      va_list va;
234      va_start(va, errmsg);
235      _PyPegen_raise_error_known_location(p, errtype, t->lineno, col_offset, t->end_lineno, end_col_offset, errmsg, va);
236      va_end(va);
237  
238      return NULL;
239  }
240  
241  static PyObject *
get_error_line_from_tokenizer_buffers(Parser * p,Py_ssize_t lineno)242  get_error_line_from_tokenizer_buffers(Parser *p, Py_ssize_t lineno)
243  {
244      /* If the file descriptor is interactive, the source lines of the current
245       * (multi-line) statement are stored in p->tok->interactive_src_start.
246       * If not, we're parsing from a string, which means that the whole source
247       * is stored in p->tok->str. */
248      assert((p->tok->fp == NULL && p->tok->str != NULL) || p->tok->fp == stdin);
249  
250      char *cur_line = p->tok->fp_interactive ? p->tok->interactive_src_start : p->tok->str;
251      assert(cur_line != NULL);
252  
253      for (int i = 0; i < lineno - 1; i++) {
254          cur_line = strchr(cur_line, '\n') + 1;
255      }
256  
257      char *next_newline;
258      if ((next_newline = strchr(cur_line, '\n')) == NULL) { // This is the last line
259          next_newline = cur_line + strlen(cur_line);
260      }
261      return PyUnicode_DecodeUTF8(cur_line, next_newline - cur_line, "replace");
262  }
263  
264  void *
_PyPegen_raise_error_known_location(Parser * p,PyObject * errtype,Py_ssize_t lineno,Py_ssize_t col_offset,Py_ssize_t end_lineno,Py_ssize_t end_col_offset,const char * errmsg,va_list va)265  _PyPegen_raise_error_known_location(Parser *p, PyObject *errtype,
266                                      Py_ssize_t lineno, Py_ssize_t col_offset,
267                                      Py_ssize_t end_lineno, Py_ssize_t end_col_offset,
268                                      const char *errmsg, va_list va)
269  {
270      PyObject *value = NULL;
271      PyObject *errstr = NULL;
272      PyObject *error_line = NULL;
273      PyObject *tmp = NULL;
274      p->error_indicator = 1;
275  
276      if (end_lineno == CURRENT_POS) {
277          end_lineno = p->tok->lineno;
278      }
279      if (end_col_offset == CURRENT_POS) {
280          end_col_offset = p->tok->cur - p->tok->line_start;
281      }
282  
283      if (p->start_rule == Py_fstring_input) {
284          const char *fstring_msg = "f-string: ";
285          Py_ssize_t len = strlen(fstring_msg) + strlen(errmsg);
286  
287          char *new_errmsg = PyMem_Malloc(len + 1); // Lengths of both strings plus NULL character
288          if (!new_errmsg) {
289              return (void *) PyErr_NoMemory();
290          }
291  
292          // Copy both strings into new buffer
293          memcpy(new_errmsg, fstring_msg, strlen(fstring_msg));
294          memcpy(new_errmsg + strlen(fstring_msg), errmsg, strlen(errmsg));
295          new_errmsg[len] = 0;
296          errmsg = new_errmsg;
297      }
298      errstr = PyUnicode_FromFormatV(errmsg, va);
299      if (!errstr) {
300          goto error;
301      }
302  
303      if (p->tok->fp_interactive) {
304          error_line = get_error_line_from_tokenizer_buffers(p, lineno);
305      }
306      else if (p->start_rule == Py_file_input) {
307          error_line = _PyErr_ProgramDecodedTextObject(p->tok->filename,
308                                                       (int) lineno, p->tok->encoding);
309      }
310  
311      if (!error_line) {
312          /* PyErr_ProgramTextObject was not called or returned NULL. If it was not called,
313             then we need to find the error line from some other source, because
314             p->start_rule != Py_file_input. If it returned NULL, then it either unexpectedly
315             failed or we're parsing from a string or the REPL. There's a third edge case where
316             we're actually parsing from a file, which has an E_EOF SyntaxError and in that case
317             `PyErr_ProgramTextObject` fails because lineno points to last_file_line + 1, which
318             does not physically exist */
319          assert(p->tok->fp == NULL || p->tok->fp == stdin || p->tok->done == E_EOF);
320  
321          if (p->tok->lineno <= lineno && p->tok->inp > p->tok->buf) {
322              Py_ssize_t size = p->tok->inp - p->tok->buf;
323              error_line = PyUnicode_DecodeUTF8(p->tok->buf, size, "replace");
324          }
325          else if (p->tok->fp == NULL || p->tok->fp == stdin) {
326              error_line = get_error_line_from_tokenizer_buffers(p, lineno);
327          }
328          else {
329              error_line = PyUnicode_FromStringAndSize("", 0);
330          }
331          if (!error_line) {
332              goto error;
333          }
334      }
335  
336      if (p->start_rule == Py_fstring_input) {
337          col_offset -= p->starting_col_offset;
338          end_col_offset -= p->starting_col_offset;
339      }
340  
341      Py_ssize_t col_number = col_offset;
342      Py_ssize_t end_col_number = end_col_offset;
343  
344      if (p->tok->encoding != NULL) {
345          col_number = _PyPegen_byte_offset_to_character_offset(error_line, col_offset);
346          if (col_number < 0) {
347              goto error;
348          }
349          if (end_col_number > 0) {
350              Py_ssize_t end_col_offset = _PyPegen_byte_offset_to_character_offset(error_line, end_col_number);
351              if (end_col_offset < 0) {
352                  goto error;
353              } else {
354                  end_col_number = end_col_offset;
355              }
356          }
357      }
358      tmp = Py_BuildValue("(OiiNii)", p->tok->filename, lineno, col_number, error_line, end_lineno, end_col_number);
359      if (!tmp) {
360          goto error;
361      }
362      value = PyTuple_Pack(2, errstr, tmp);
363      Py_DECREF(tmp);
364      if (!value) {
365          goto error;
366      }
367      PyErr_SetObject(errtype, value);
368  
369      Py_DECREF(errstr);
370      Py_DECREF(value);
371      if (p->start_rule == Py_fstring_input) {
372          PyMem_Free((void *)errmsg);
373      }
374      return NULL;
375  
376  error:
377      Py_XDECREF(errstr);
378      Py_XDECREF(error_line);
379      if (p->start_rule == Py_fstring_input) {
380          PyMem_Free((void *)errmsg);
381      }
382      return NULL;
383  }
384  
385  void
_Pypegen_set_syntax_error(Parser * p,Token * last_token)386  _Pypegen_set_syntax_error(Parser* p, Token* last_token) {
387      // Existing sintax error
388      if (PyErr_Occurred()) {
389          // Prioritize tokenizer errors to custom syntax errors raised
390          // on the second phase only if the errors come from the parser.
391          if (p->tok->done == E_DONE && PyErr_ExceptionMatches(PyExc_SyntaxError)) {
392              _PyPegen_tokenize_full_source_to_check_for_errors(p);
393          }
394          // Propagate the existing syntax error.
395          return;
396      }
397      // Initialization error
398      if (p->fill == 0) {
399          RAISE_SYNTAX_ERROR("error at start before reading any input");
400      }
401      // Parser encountered EOF (End of File) unexpectedtly
402      if (last_token->type == ERRORTOKEN && p->tok->done == E_EOF) {
403          if (p->tok->level) {
404              raise_unclosed_parentheses_error(p);
405          } else {
406              RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
407          }
408          return;
409      }
410      // Indentation error in the tokenizer
411      if (last_token->type == INDENT || last_token->type == DEDENT) {
412          RAISE_INDENTATION_ERROR(last_token->type == INDENT ? "unexpected indent" : "unexpected unindent");
413          return;
414      }
415      // Unknown error (generic case)
416  
417      // Use the last token we found on the first pass to avoid reporting
418      // incorrect locations for generic syntax errors just because we reached
419      // further away when trying to find specific syntax errors in the second
420      // pass.
421      RAISE_SYNTAX_ERROR_KNOWN_LOCATION(last_token, "invalid syntax");
422      // _PyPegen_tokenize_full_source_to_check_for_errors will override the existing
423      // generic SyntaxError we just raised if errors are found.
424      _PyPegen_tokenize_full_source_to_check_for_errors(p);
425  }
426