/*
 * Tokenizer helper routines (CPython parser): syntax-error reporting,
 * string buffer helpers, and PEP 263 source-encoding detection.
 */
1 #include "Python.h"
2 #include "errcode.h"
3 #include "pycore_token.h"
4 
5 #include "../lexer/state.h"
6 
7 
8 /* ############## ERRORS ############## */
9 
/* Format and set a SyntaxError for the current tokenizer position.
 *
 * Builds the (msg, (filename, lineno, col_offset, text, end_lineno,
 * end_col_offset)) tuple that PyExc_SyntaxError expects, sets it as the
 * current exception, marks the tokenizer as errored (tok->done = E_ERROR)
 * and returns ERRORTOKEN.
 *
 * A col_offset / end_col_offset of -1 means "use the current column"
 * (the length, in code points, of the line scanned so far).
 */
static int
_syntaxerror_range(struct tok_state *tok, const char *format,
                   int col_offset, int end_col_offset,
                   va_list vargs)
{
    // In release builds, we don't want to overwrite a previous error, but in debug builds we
    // want to fail if we are not doing it so we can fix it.
    assert(tok->done != E_ERROR);
    if (tok->done == E_ERROR) {
        return ERRORTOKEN;
    }
    PyObject *errmsg, *errtext, *args;
    errmsg = PyUnicode_FromFormatV(format, vargs);
    if (!errmsg) {
        goto error;
    }

    // Decode only what has been scanned so far on the current line; its
    // code-point length supplies the default column offset below.
    errtext = PyUnicode_DecodeUTF8(tok->line_start, tok->cur - tok->line_start,
                                   "replace");
    if (!errtext) {
        goto error;
    }

    if (col_offset == -1) {
        col_offset = (int)PyUnicode_GET_LENGTH(errtext);
    }
    if (end_col_offset == -1) {
        end_col_offset = col_offset;
    }

    // If the cursor is not at the end of the physical line, re-decode the
    // whole line so the SyntaxError shows the complete source line.
    Py_ssize_t line_len = strcspn(tok->line_start, "\n");
    if (line_len != tok->cur - tok->line_start) {
        Py_DECREF(errtext);
        errtext = PyUnicode_DecodeUTF8(tok->line_start, line_len,
                                       "replace");
    }
    if (!errtext) {
        goto error;
    }

    // "N" steals the reference to errtext, so it is not decref'd below.
    args = Py_BuildValue("(O(OiiNii))", errmsg, tok->filename, tok->lineno,
                         col_offset, errtext, tok->lineno, end_col_offset);
    if (args) {
        PyErr_SetObject(PyExc_SyntaxError, args);
        Py_DECREF(args);
    }

error:
    // Shared exit for both success and failure: drop our errmsg reference
    // and flag the tokenizer as errored either way.
    Py_XDECREF(errmsg);
    tok->done = E_ERROR;
    return ERRORTOKEN;
}
62 
/* Report a syntax error at the current position (no explicit column range).
 * Forwards to _syntaxerror_range() with -1/-1 offsets; always returns
 * ERRORTOKEN. */
int
_PyTokenizer_syntaxerror(struct tok_state *tok, const char *format, ...)
{
    // These errors are cleaned on startup. TODO: fix it.
    va_list vargs;
    va_start(vargs, format);
    int ret = _syntaxerror_range(tok, format, -1, -1, vargs);
    va_end(vargs);
    return ret;
}
73 
/* Report a syntax error covering an explicit [col_offset, end_col_offset]
 * range on the current line.  Always returns ERRORTOKEN. */
int
_PyTokenizer_syntaxerror_known_range(struct tok_state *tok,
                        int col_offset, int end_col_offset,
                        const char *format, ...)
{
    va_list args;
    int result;

    va_start(args, format);
    result = _syntaxerror_range(tok, format, col_offset, end_col_offset, args);
    va_end(args);

    return result;
}
85 
86 int
_PyTokenizer_indenterror(struct tok_state * tok)87 _PyTokenizer_indenterror(struct tok_state *tok)
88 {
89     tok->done = E_TABSPACE;
90     tok->cur = tok->inp;
91     return ERRORTOKEN;
92 }
93 
94 char *
_PyTokenizer_error_ret(struct tok_state * tok)95 _PyTokenizer_error_ret(struct tok_state *tok) /* XXX */
96 {
97     tok->decoding_erred = 1;
98     if ((tok->fp != NULL || tok->readline != NULL) && tok->buf != NULL) {/* see _PyTokenizer_Free */
99         PyMem_Free(tok->buf);
100     }
101     tok->buf = tok->cur = tok->inp = NULL;
102     tok->start = NULL;
103     tok->end = NULL;
104     tok->done = E_DECODE;
105     return NULL;                /* as if it were EOF */
106 }
107 
/* Emit a SyntaxWarning for an invalid string escape sequence such as "\w".
 * Returns 0 on success and -1 on error.  If warnings are configured to
 * raise (e.g. -W error), the pending SyntaxWarning is replaced by a
 * SyntaxError so the report points at the offending token. */
int
_PyTokenizer_warn_invalid_escape_sequence(struct tok_state *tok, int first_invalid_escape_char)
{
    // Warnings are suppressed e.g. while re-tokenizing for other purposes.
    if (!tok->report_warnings) {
        return 0;
    }

    PyObject *msg = PyUnicode_FromFormat(
        "invalid escape sequence '\\%c'",
        (char) first_invalid_escape_char
    );

    if (msg == NULL) {
        return -1;
    }

    if (PyErr_WarnExplicitObject(PyExc_SyntaxWarning, msg, tok->filename,
                                 tok->lineno, NULL, NULL) < 0) {
        Py_DECREF(msg);

        if (PyErr_ExceptionMatches(PyExc_SyntaxWarning)) {
            /* Replace the SyntaxWarning exception with a SyntaxError
               to get a more accurate error report */
            PyErr_Clear();
            return _PyTokenizer_syntaxerror(tok, "invalid escape sequence '\\%c'", (char) first_invalid_escape_char);
        }

        // Some other exception was raised by the warning machinery.
        return -1;
    }

    Py_DECREF(msg);
    return 0;
}
141 
/* Emit a warning of the given `category` at the current tokenizer location.
 * Returns 0 on success.  Returns -1 and marks the tokenizer errored
 * (tok->done = E_ERROR) on failure; if the warning itself was escalated to
 * an exception (e.g. -W error), it is replaced by a SyntaxError for a more
 * accurate error report. */
int
_PyTokenizer_parser_warn(struct tok_state *tok, PyObject *category, const char *format, ...)
{
    // Warnings are suppressed e.g. while re-tokenizing for other purposes.
    if (!tok->report_warnings) {
        return 0;
    }

    PyObject *errmsg;
    va_list vargs;
    va_start(vargs, format);
    errmsg = PyUnicode_FromFormatV(format, vargs);
    va_end(vargs);
    if (!errmsg) {
        goto error;
    }

    if (PyErr_WarnExplicitObject(category, errmsg, tok->filename,
                                 tok->lineno, NULL, NULL) < 0) {
        if (PyErr_ExceptionMatches(category)) {
            /* The warning was escalated to an exception of `category`
               (e.g. DeprecationWarning under -W error): replace it with
               a SyntaxError to get a more accurate error report */
            PyErr_Clear();
            _PyTokenizer_syntaxerror(tok, "%U", errmsg);
        }
        goto error;
    }
    Py_DECREF(errmsg);
    return 0;

error:
    Py_XDECREF(errmsg);
    tok->done = E_ERROR;
    return -1;
}
176 
177 
178 /* ############## STRING MANIPULATION ############## */
179 
180 char *
_PyTokenizer_new_string(const char * s,Py_ssize_t len,struct tok_state * tok)181 _PyTokenizer_new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
182 {
183     char* result = (char *)PyMem_Malloc(len + 1);
184     if (!result) {
185         tok->done = E_NOMEM;
186         return NULL;
187     }
188     memcpy(result, s, len);
189     result[len] = '\0';
190     return result;
191 }
192 
193 PyObject *
_PyTokenizer_translate_into_utf8(const char * str,const char * enc)194 _PyTokenizer_translate_into_utf8(const char* str, const char* enc) {
195     PyObject *utf8;
196     PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
197     if (buf == NULL)
198         return NULL;
199     utf8 = PyUnicode_AsUTF8String(buf);
200     Py_DECREF(buf);
201     return utf8;
202 }
203 
204 char *
_PyTokenizer_translate_newlines(const char * s,int exec_input,int preserve_crlf,struct tok_state * tok)205 _PyTokenizer_translate_newlines(const char *s, int exec_input, int preserve_crlf,
206                    struct tok_state *tok) {
207     int skip_next_lf = 0;
208     size_t needed_length = strlen(s) + 2, final_length;
209     char *buf, *current;
210     char c = '\0';
211     buf = PyMem_Malloc(needed_length);
212     if (buf == NULL) {
213         tok->done = E_NOMEM;
214         return NULL;
215     }
216     for (current = buf; *s; s++, current++) {
217         c = *s;
218         if (skip_next_lf) {
219             skip_next_lf = 0;
220             if (c == '\n') {
221                 c = *++s;
222                 if (!c)
223                     break;
224             }
225         }
226         if (!preserve_crlf && c == '\r') {
227             skip_next_lf = 1;
228             c = '\n';
229         }
230         *current = c;
231     }
232     /* If this is exec input, add a newline to the end of the string if
233        there isn't one already. */
234     if (exec_input && c != '\n' && c != '\0') {
235         *current = '\n';
236         current++;
237     }
238     *current = '\0';
239     final_length = current - buf + 1;
240     if (final_length < needed_length && final_length) {
241         /* should never fail */
242         char* result = PyMem_Realloc(buf, final_length);
243         if (result == NULL) {
244             PyMem_Free(buf);
245         }
246         buf = result;
247     }
248     return buf;
249 }
250 
251 /* ############## ENCODING STUFF ############## */
252 
253 
/* See whether the input starts with a UTF-8 BOM (EF BB BF).  If it does,
   consume it and record "utf-8" as tok->encoding; otherwise push the
   inspected byte(s) back via unget_char.  Return 1 on success, 0 on
   failure (out of memory).  set_readline is accepted for symmetry but not
   invoked here: BOM-marked input is already UTF-8. */
int
_PyTokenizer_check_bom(int get_char(struct tok_state *),
          void unget_char(int, struct tok_state *),
          int set_readline(struct tok_state *, const char *),
          struct tok_state *tok)
{
    int ch1, ch2, ch3;
    ch1 = get_char(tok);
    // After the (possible) BOM comes the coding-spec search phase.
    tok->decoding_state = STATE_SEEK_CODING;
    if (ch1 == EOF) {
        return 1;
    } else if (ch1 == 0xEF) {
        ch2 = get_char(tok);
        if (ch2 != 0xBB) {
            // Not a BOM: push bytes back in reverse read order.
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
        ch3 = get_char(tok);
        if (ch3 != 0xBF) {
            unget_char(ch3, tok);
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
    } else {
        unget_char(ch1, tok);
        return 1;
    }
    // Full BOM consumed: the encoding is definitively UTF-8.  Replace any
    // previously recorded encoding.
    if (tok->encoding != NULL)
        PyMem_Free(tok->encoding);
    tok->encoding = _PyTokenizer_new_string("utf-8", 5, tok);
    if (!tok->encoding)
        return 0;
    /* No need to set_readline: input is already utf-8 */
    return 1;
}
294 
/* Canonicalize an encoding name for the two encodings the tokenizer
 * special-cases: any spelling of UTF-8 maps to "utf-8" and any spelling of
 * Latin-1 maps to "iso-8859-1"; other names are returned unchanged.  Only
 * the first 12 characters are significant (underscores become hyphens,
 * letters are lowercased). */
static const char *
get_normal_name(const char *s)  /* for utf-8 and latin-1 */
{
    char lowered[13];
    int n = 0;

    while (n < 12 && s[n] != '\0') {
        int c = s[n];
        lowered[n] = (c == '_') ? '-' : Py_TOLOWER(c);
        n++;
    }
    lowered[n] = '\0';

    if (strcmp(lowered, "utf-8") == 0 ||
        strncmp(lowered, "utf-8-", 6) == 0) {
        return "utf-8";
    }
    if (strcmp(lowered, "latin-1") == 0 ||
        strcmp(lowered, "iso-8859-1") == 0 ||
        strcmp(lowered, "iso-latin-1") == 0 ||
        strncmp(lowered, "latin-1-", 8) == 0 ||
        strncmp(lowered, "iso-8859-1-", 11) == 0 ||
        strncmp(lowered, "iso-latin-1-", 12) == 0) {
        return "iso-8859-1";
    }
    return s;
}
323 
/* Look for a PEP 263 coding spec in line `s` of length `size`.  On return,
   *spec is a PyMem-allocated copy of the normalized encoding name, or NULL
   if the line holds no coding spec.  Return 1 on success (including "no
   spec found"); return 0 only on allocation failure (tok->done is then set
   by _PyTokenizer_new_string). */
static int
get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
{
    Py_ssize_t i;
    *spec = NULL;
    /* Coding spec must be in a comment, and that comment must be
     * the only statement on the source code line. */
    for (i = 0; i < size - 6; i++) {
        if (s[i] == '#')
            break;
        if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
            return 1;  // real code before any '#': no coding spec possible
    }
    for (; i < size - 6; i++) { /* XXX inefficient search */
        const char* t = s + i;
        if (memcmp(t, "coding", 6) == 0) {
            const char* begin = NULL;
            t += 6;
            // "coding" must be immediately followed by ':' or '='.
            if (t[0] != ':' && t[0] != '=')
                continue;
            // Skip whitespace between the separator and the name.
            do {
                t++;
            } while (t[0] == ' ' || t[0] == '\t');

            // Encoding name: alphanumerics plus '-', '_' and '.'.
            begin = t;
            while (Py_ISALNUM(t[0]) ||
                   t[0] == '-' || t[0] == '_' || t[0] == '.')
                t++;

            if (begin < t) {
                char* r = _PyTokenizer_new_string(begin, t - begin, tok);
                const char* q;
                if (!r)
                    return 0;
                // Canonicalize utf-8 / latin-1 spellings.
                q = get_normal_name(r);
                if (r != q) {
                    PyMem_Free(r);
                    r = _PyTokenizer_new_string(q, strlen(q), tok);
                    if (!r)
                        return 0;
                }
                *spec = r;
                break;
            }
        }
    }
    return 1;
}
373 
/* Check whether the line contains a coding spec (PEP 263). If it names an
   encoding other than utf-8, invoke set_readline(tok, encoding) to install
   the decoding readline.  Return 1 on success, 0 on failure (a SyntaxError
   is set and the tokenizer is marked errored). */
int
_PyTokenizer_check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
                  int set_readline(struct tok_state *, const char *))
{
    char *cs;
    if (tok->cont_line) {
        /* It's a continuation line, so it can't be a coding spec. */
        tok->decoding_state = STATE_NORMAL;
        return 1;
    }
    if (!get_coding_spec(line, &cs, size, tok)) {
        return 0;
    }
    if (!cs) {
        // No spec on this line; keep looking on the next line only if this
        // one is blank or a pure comment line.
        Py_ssize_t i;
        for (i = 0; i < size; i++) {
            if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
                break;
            if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
                /* Stop checking coding spec after a line containing
                 * anything except a comment. */
                tok->decoding_state = STATE_NORMAL;
                break;
            }
        }
        return 1;
    }
    tok->decoding_state = STATE_NORMAL;
    if (tok->encoding == NULL) {
        assert(tok->decoding_readline == NULL);
        // utf-8 input needs no re-decoding; any other encoding must
        // install a decoding readline via set_readline().
        if (strcmp(cs, "utf-8") != 0 && !set_readline(tok, cs)) {
            _PyTokenizer_error_ret(tok);
            PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
            PyMem_Free(cs);
            return 0;
        }
        tok->encoding = cs;  // ownership of cs transfers to tok
    } else {                /* then, compare cs with BOM */
        // A BOM already fixed the encoding; the spec must agree with it.
        if (strcmp(tok->encoding, cs) != 0) {
            _PyTokenizer_error_ret(tok);
            PyErr_Format(PyExc_SyntaxError,
                         "encoding problem: %s with BOM", cs);
            PyMem_Free(cs);
            return 0;
        }
        PyMem_Free(cs);
    }
    return 1;
}
427 
/* Check whether the bytes at s begin a well-formed UTF-8 sequence.
   Return the sequence length (1-4) if so, 0 if not.  The special cases
   mirror stringlib/codecs.h:utf8_decode: overlong encodings, surrogates
   (U+D800-U+DFFF, see RFC 3629) and code points above U+10FFFF are all
   rejected. */
static int
valid_utf8(const unsigned char* s)
{
    unsigned char lead = *s;
    int continuations;

    if (lead < 0x80) {
        /* ASCII: single-byte sequence */
        return 1;
    }
    if (lead < 0xC2) {
        /* \x80-\xBF: stray continuation byte;
           \xC0-\xC1: overlong encoding of 0000-007F */
        return 0;
    }
    if (lead < 0xE0) {
        /* \xC2\x80-\xDF\xBF -- U+0080-U+07FF */
        continuations = 1;
    }
    else if (lead < 0xF0) {
        /* \xE0\xA0\x80-\xEF\xBF\xBF -- U+0800-U+FFFF */
        if (lead == 0xE0 && s[1] < 0xA0) {
            /* overlong encoding of U+0000-U+07FF */
            return 0;
        }
        if (lead == 0xED && s[1] >= 0xA0) {
            /* \xED\xA0\x80-\xED\xBF\xBF would decode to surrogates
               U+D800-U+DFFF, which are invalid in UTF-8 */
            return 0;
        }
        continuations = 2;
    }
    else if (lead < 0xF5) {
        /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- U+10000-U+10FFFF */
        if (s[1] < 0x90 ? lead == 0xF0 : lead == 0xF4) {
            /* \xF0\x8x...: overlong encoding below U+10000;
               \xF4\x9x...: above U+10FFFF */
            return 0;
        }
        continuations = 3;
    }
    else {
        /* \xF5-\xFF can never start a sequence */
        return 0;
    }

    for (int i = 1; i <= continuations; i++) {
        if ((s[i] & 0xC0) != 0x80) {
            /* not a continuation byte */
            return 0;
        }
    }
    return continuations + 1;
}
489 
490 int
_PyTokenizer_ensure_utf8(char * line,struct tok_state * tok)491 _PyTokenizer_ensure_utf8(char *line, struct tok_state *tok)
492 {
493     int badchar = 0;
494     unsigned char *c;
495     int length;
496     for (c = (unsigned char *)line; *c; c += length) {
497         if (!(length = valid_utf8(c))) {
498             badchar = *c;
499             break;
500         }
501     }
502     if (badchar) {
503         PyErr_Format(PyExc_SyntaxError,
504                      "Non-UTF-8 code starting with '\\x%.2x' "
505                      "in file %U on line %i, "
506                      "but no encoding declared; "
507                      "see https://peps.python.org/pep-0263/ for details",
508                      badchar, tok->filename, tok->lineno);
509         return 0;
510     }
511     return 1;
512 }
513 
514 
515 /* ############## DEBUGGING STUFF ############## */
516 
517 #ifdef Py_DEBUG
518 void
_PyTokenizer_print_escape(FILE * f,const char * s,Py_ssize_t size)519 _PyTokenizer_print_escape(FILE *f, const char *s, Py_ssize_t size)
520 {
521     if (s == NULL) {
522         fputs("NULL", f);
523         return;
524     }
525     putc('"', f);
526     while (size-- > 0) {
527         unsigned char c = *s++;
528         switch (c) {
529             case '\n': fputs("\\n", f); break;
530             case '\r': fputs("\\r", f); break;
531             case '\t': fputs("\\t", f); break;
532             case '\f': fputs("\\f", f); break;
533             case '\'': fputs("\\'", f); break;
534             case '"': fputs("\\\"", f); break;
535             default:
536                 if (0x20 <= c && c <= 0x7f)
537                     putc(c, f);
538                 else
539                     fprintf(f, "\\x%02x", c);
540         }
541     }
542     putc('"', f);
543 }
544 
/* Debug helper: print token `type`'s name to stderr, followed by the
 * token's source text [start, end) for token kinds that carry one
 * (NAME/NUMBER/STRING/OP). */
void
_PyTokenizer_tok_dump(int type, char *start, char *end)
{
    fprintf(stderr, "%s", _PyParser_TokenNames[type]);
    if (type == NAME || type == NUMBER || type == STRING || type == OP)
        fprintf(stderr, "(%.*s)", (int)(end - start), start);
}
552 #endif
553