1 #include "Python.h"
2 #include "errcode.h"
3 #include "pycore_token.h"
4
5 #include "../lexer/state.h"
6
7
8 /* ############## ERRORS ############## */
9
/* Set a SyntaxError on the current line of `tok`, with its message built
   from `format`/`vargs`, spanning columns [col_offset, end_col_offset].
   A column offset of -1 means "use the current cursor position".
   Always marks the tokenizer as errored and returns ERRORTOKEN. */
static int
_syntaxerror_range(struct tok_state *tok, const char *format,
                   int col_offset, int end_col_offset,
                   va_list vargs)
{
    // In release builds, we don't want to overwrite a previous error, but in debug builds we
    // want to fail if we are not doing it so we can fix it.
    assert(tok->done != E_ERROR);
    if (tok->done == E_ERROR) {
        return ERRORTOKEN;
    }
    PyObject *errmsg, *errtext, *args;
    errmsg = PyUnicode_FromFormatV(format, vargs);
    if (!errmsg) {
        goto error;
    }

    // Decode the current line (line start up to the cursor) so the
    // exception can display the offending text.
    errtext = PyUnicode_DecodeUTF8(tok->line_start, tok->cur - tok->line_start,
                                   "replace");
    if (!errtext) {
        goto error;
    }

    // Default the error range to the cursor column.
    if (col_offset == -1) {
        col_offset = (int)PyUnicode_GET_LENGTH(errtext);
    }
    if (end_col_offset == -1) {
        end_col_offset = col_offset;
    }

    // If the cursor is not at the end of the physical line, re-decode the
    // whole line (up to the newline) so the full line is reported.
    Py_ssize_t line_len = strcspn(tok->line_start, "\n");
    if (line_len != tok->cur - tok->line_start) {
        Py_DECREF(errtext);
        errtext = PyUnicode_DecodeUTF8(tok->line_start, line_len,
                                       "replace");
    }
    if (!errtext) {
        goto error;
    }

    // Standard SyntaxError argument tuple:
    // (msg, (filename, lineno, col, text, end_lineno, end_col)).
    // NOTE: the "N" format code steals the reference to errtext.
    args = Py_BuildValue("(O(OiiNii))", errmsg, tok->filename, tok->lineno,
                         col_offset, errtext, tok->lineno, end_col_offset);
    if (args) {
        PyErr_SetObject(PyExc_SyntaxError, args);
        Py_DECREF(args);
    }

error:
    Py_XDECREF(errmsg);
    tok->done = E_ERROR;
    return ERRORTOKEN;
}
62
/* Report a syntax error at the tokenizer's current position.
   Always returns ERRORTOKEN. */
int
_PyTokenizer_syntaxerror(struct tok_state *tok, const char *format, ...)
{
    // These errors are cleaned on startup. TODO: fix that.
    va_list args;
    int result;

    va_start(args, format);
    result = _syntaxerror_range(tok, format, -1, -1, args);
    va_end(args);

    return result;
}
73
/* Report a syntax error covering the explicit column range
   [col_offset, end_col_offset].  Always returns ERRORTOKEN. */
int
_PyTokenizer_syntaxerror_known_range(struct tok_state *tok,
                                     int col_offset, int end_col_offset,
                                     const char *format, ...)
{
    va_list args;
    int result;

    va_start(args, format);
    result = _syntaxerror_range(tok, format, col_offset, end_col_offset, args);
    va_end(args);

    return result;
}
85
86 int
_PyTokenizer_indenterror(struct tok_state * tok)87 _PyTokenizer_indenterror(struct tok_state *tok)
88 {
89 tok->done = E_TABSPACE;
90 tok->cur = tok->inp;
91 return ERRORTOKEN;
92 }
93
94 char *
_PyTokenizer_error_ret(struct tok_state * tok)95 _PyTokenizer_error_ret(struct tok_state *tok) /* XXX */
96 {
97 tok->decoding_erred = 1;
98 if ((tok->fp != NULL || tok->readline != NULL) && tok->buf != NULL) {/* see _PyTokenizer_Free */
99 PyMem_Free(tok->buf);
100 }
101 tok->buf = tok->cur = tok->inp = NULL;
102 tok->start = NULL;
103 tok->end = NULL;
104 tok->done = E_DECODE;
105 return NULL; /* as if it were EOF */
106 }
107
108 int
_PyTokenizer_warn_invalid_escape_sequence(struct tok_state * tok,int first_invalid_escape_char)109 _PyTokenizer_warn_invalid_escape_sequence(struct tok_state *tok, int first_invalid_escape_char)
110 {
111 if (!tok->report_warnings) {
112 return 0;
113 }
114
115 PyObject *msg = PyUnicode_FromFormat(
116 "invalid escape sequence '\\%c'",
117 (char) first_invalid_escape_char
118 );
119
120 if (msg == NULL) {
121 return -1;
122 }
123
124 if (PyErr_WarnExplicitObject(PyExc_SyntaxWarning, msg, tok->filename,
125 tok->lineno, NULL, NULL) < 0) {
126 Py_DECREF(msg);
127
128 if (PyErr_ExceptionMatches(PyExc_SyntaxWarning)) {
129 /* Replace the SyntaxWarning exception with a SyntaxError
130 to get a more accurate error report */
131 PyErr_Clear();
132 return _PyTokenizer_syntaxerror(tok, "invalid escape sequence '\\%c'", (char) first_invalid_escape_char);
133 }
134
135 return -1;
136 }
137
138 Py_DECREF(msg);
139 return 0;
140 }
141
142 int
_PyTokenizer_parser_warn(struct tok_state * tok,PyObject * category,const char * format,...)143 _PyTokenizer_parser_warn(struct tok_state *tok, PyObject *category, const char *format, ...)
144 {
145 if (!tok->report_warnings) {
146 return 0;
147 }
148
149 PyObject *errmsg;
150 va_list vargs;
151 va_start(vargs, format);
152 errmsg = PyUnicode_FromFormatV(format, vargs);
153 va_end(vargs);
154 if (!errmsg) {
155 goto error;
156 }
157
158 if (PyErr_WarnExplicitObject(category, errmsg, tok->filename,
159 tok->lineno, NULL, NULL) < 0) {
160 if (PyErr_ExceptionMatches(category)) {
161 /* Replace the DeprecationWarning exception with a SyntaxError
162 to get a more accurate error report */
163 PyErr_Clear();
164 _PyTokenizer_syntaxerror(tok, "%U", errmsg);
165 }
166 goto error;
167 }
168 Py_DECREF(errmsg);
169 return 0;
170
171 error:
172 Py_XDECREF(errmsg);
173 tok->done = E_ERROR;
174 return -1;
175 }
176
177
178 /* ############## STRING MANIPULATION ############## */
179
180 char *
_PyTokenizer_new_string(const char * s,Py_ssize_t len,struct tok_state * tok)181 _PyTokenizer_new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
182 {
183 char* result = (char *)PyMem_Malloc(len + 1);
184 if (!result) {
185 tok->done = E_NOMEM;
186 return NULL;
187 }
188 memcpy(result, s, len);
189 result[len] = '\0';
190 return result;
191 }
192
193 PyObject *
_PyTokenizer_translate_into_utf8(const char * str,const char * enc)194 _PyTokenizer_translate_into_utf8(const char* str, const char* enc) {
195 PyObject *utf8;
196 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
197 if (buf == NULL)
198 return NULL;
199 utf8 = PyUnicode_AsUTF8String(buf);
200 Py_DECREF(buf);
201 return utf8;
202 }
203
/* Return a heap copy of `s` with line endings normalized: "\r\n" and a
   bare "\r" both become "\n" (unless preserve_crlf is true).  When
   exec_input is true and the result is non-empty, a trailing newline is
   appended if missing.  On allocation failure set tok->done = E_NOMEM
   and return NULL.  The caller owns the returned buffer (PyMem_Free). */
char *
_PyTokenizer_translate_newlines(const char *s, int exec_input, int preserve_crlf,
                                struct tok_state *tok) {
    int skip_next_lf = 0;
    // +2: room for a possibly-appended '\n' plus the terminating NUL.
    size_t needed_length = strlen(s) + 2, final_length;
    char *buf, *current;
    char c = '\0';
    buf = PyMem_Malloc(needed_length);
    if (buf == NULL) {
        tok->done = E_NOMEM;
        return NULL;
    }
    for (current = buf; *s; s++, current++) {
        c = *s;
        if (skip_next_lf) {
            // Previous char was a '\r' already rewritten to '\n'; drop a
            // following '\n' so "\r\n" collapses to a single newline.
            skip_next_lf = 0;
            if (c == '\n') {
                c = *++s;
                if (!c)
                    break;
            }
        }
        if (!preserve_crlf && c == '\r') {
            skip_next_lf = 1;
            c = '\n';
        }
        *current = c;
    }
    /* If this is exec input, add a newline to the end of the string if
       there isn't one already. */
    if (exec_input && c != '\n' && c != '\0') {
        *current = '\n';
        current++;
    }
    *current = '\0';
    final_length = current - buf + 1;
    if (final_length < needed_length && final_length) {
        /* should never fail: this only shrinks the allocation */
        char* result = PyMem_Realloc(buf, final_length);
        if (result == NULL) {
            PyMem_Free(buf);
        }
        buf = result;
    }
    return buf;
}
250
251 /* ############## ENCODING STUFF ############## */
252
253
/* See whether the file starts with a BOM. If it does,
   invoke the set_readline function with the new encoding.
   Return 1 on success, 0 on failure. */
int
_PyTokenizer_check_bom(int get_char(struct tok_state *),
                       void unget_char(int, struct tok_state *),
                       int set_readline(struct tok_state *, const char *),
                       struct tok_state *tok)
{
    int ch1, ch2, ch3;
    ch1 = get_char(tok);
    tok->decoding_state = STATE_SEEK_CODING;
    if (ch1 == EOF) {
        return 1;
    } else if (ch1 == 0xEF) {
        // Possible UTF-8 BOM (\xEF\xBB\xBF); check the next two bytes.
        ch2 = get_char(tok);
        if (ch2 != 0xBB) {
            // Not a BOM: push the bytes back in reverse order.
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
        ch3 = get_char(tok);
        if (ch3 != 0xBF) {
            unget_char(ch3, tok);
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
    } else {
        // First byte cannot start a BOM; leave the stream untouched.
        unget_char(ch1, tok);
        return 1;
    }
    // Full UTF-8 BOM consumed: record the encoding on the tokenizer.
    if (tok->encoding != NULL)
        PyMem_Free(tok->encoding);
    tok->encoding = _PyTokenizer_new_string("utf-8", 5, tok);
    if (!tok->encoding)
        return 0;
    /* No need to set_readline: input is already utf-8 */
    return 1;
}
294
/* Canonicalize encoding name `s`: the common spellings of utf-8 and
   latin-1 (case-insensitive, with '_' treated as '-') map to "utf-8"
   and "iso-8859-1" respectively.  Only the first 12 characters are
   examined; unrecognized names are returned unchanged. */
static const char *
get_normal_name(const char *s)
{
    char lowered[13];
    int n = 0;

    while (n < 12 && s[n] != '\0') {
        int c = s[n];
        lowered[n] = (c == '_') ? '-' : Py_TOLOWER(c);
        n++;
    }
    lowered[n] = '\0';

    if (strcmp(lowered, "utf-8") == 0 ||
        strncmp(lowered, "utf-8-", 6) == 0) {
        return "utf-8";
    }
    if (strcmp(lowered, "latin-1") == 0 ||
        strcmp(lowered, "iso-8859-1") == 0 ||
        strcmp(lowered, "iso-latin-1") == 0 ||
        strncmp(lowered, "latin-1-", 8) == 0 ||
        strncmp(lowered, "iso-8859-1-", 11) == 0 ||
        strncmp(lowered, "iso-latin-1-", 12) == 0) {
        return "iso-8859-1";
    }
    return s;
}
323
/* Scan line S (SIZE bytes) for a PEP 263 coding declaration and store a
   heap copy of the normalized encoding name in *spec (NULL if none was
   found).  The declaration must live in a comment that is the only
   content on the line.  Return 1 on success (including "no spec"), 0 on
   allocation failure (tok->done is set by _PyTokenizer_new_string). */
static int
get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
{
    Py_ssize_t i;
    *spec = NULL;
    /* Coding spec must be in a comment, and that comment must be
     * the only statement on the source code line. */
    for (i = 0; i < size - 6; i++) {
        if (s[i] == '#')
            break;
        if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
            return 1;
    }
    for (; i < size - 6; i++) { /* XXX inefficient search */
        const char* t = s + i;
        if (memcmp(t, "coding", 6) == 0) {
            const char* begin = NULL;
            t += 6;
            // The token must be followed by ':' or '=' ("coding: x").
            if (t[0] != ':' && t[0] != '=')
                continue;
            // Skip whitespace between "coding:" and the encoding name.
            do {
                t++;
            } while (t[0] == ' ' || t[0] == '\t');

            begin = t;
            // Encoding names are alphanumerics plus '-', '_', '.'.
            while (Py_ISALNUM(t[0]) ||
                   t[0] == '-' || t[0] == '_' || t[0] == '.')
                t++;

            if (begin < t) {
                char* r = _PyTokenizer_new_string(begin, t - begin, tok);
                const char* q;
                if (!r)
                    return 0;
                // Canonicalize the common utf-8 / latin-1 aliases.
                q = get_normal_name(r);
                if (r != q) {
                    PyMem_Free(r);
                    r = _PyTokenizer_new_string(q, strlen(q), tok);
                    if (!r)
                        return 0;
                }
                *spec = r;
                break;
            }
        }
    }
    return 1;
}
373
/* Check whether the line contains a coding spec. If it does,
   invoke the set_readline function for the new encoding.
   This function receives the tok_state and the new encoding.
   Return 1 on success, 0 on failure. */
int
_PyTokenizer_check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
                               int set_readline(struct tok_state *, const char *))
{
    char *cs;
    if (tok->cont_line) {
        /* It's a continuation line, so it can't be a coding spec. */
        tok->decoding_state = STATE_NORMAL;
        return 1;
    }
    if (!get_coding_spec(line, &cs, size, tok)) {
        return 0;
    }
    if (!cs) {
        // No coding spec found on this line.  A coding spec may only
        // appear in the first two lines, and only before any statement.
        Py_ssize_t i;
        for (i = 0; i < size; i++) {
            if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
                break;
            if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
                /* Stop checking coding spec after a line containing
                 * anything except a comment. */
                tok->decoding_state = STATE_NORMAL;
                break;
            }
        }
        return 1;
    }
    tok->decoding_state = STATE_NORMAL;
    if (tok->encoding == NULL) {
        assert(tok->decoding_readline == NULL);
        // Non-UTF-8 encodings need a decoding readline; on failure
        // report an encoding problem.  On success, ownership of cs
        // transfers to tok->encoding.
        if (strcmp(cs, "utf-8") != 0 && !set_readline(tok, cs)) {
            _PyTokenizer_error_ret(tok);
            PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
            PyMem_Free(cs);
            return 0;
        }
        tok->encoding = cs;
    } else { /* then, compare cs with BOM */
        // A BOM already fixed the encoding; the declared spec must agree.
        if (strcmp(tok->encoding, cs) != 0) {
            _PyTokenizer_error_ret(tok);
            PyErr_Format(PyExc_SyntaxError,
                         "encoding problem: %s with BOM", cs);
            PyMem_Free(cs);
            return 0;
        }
        PyMem_Free(cs);
    }
    return 1;
}
427
/* Check whether the bytes at `s` begin a well-formed UTF-8 sequence.
   Return the length of the sequence (1-4) if so, 0 if not.  The special
   cases mirror those in stringlib/codecs.h:utf8_decode. */
static int
valid_utf8(const unsigned char *s)
{
    unsigned char lead = *s;
    int follow;   /* number of continuation bytes expected */

    if (lead < 0x80) {
        /* single-byte (ASCII) code */
        return 1;
    }
    if (lead < 0xC2) {
        /* \x80-\xBF: stray continuation byte;
           \xC0-\xC1: overlong encoding of 0000-007F */
        return 0;
    }
    if (lead < 0xE0) {
        /* \xC2\x80-\xDF\xBF -- 0080-07FF */
        follow = 1;
    }
    else if (lead < 0xF0) {
        /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */
        if (lead == 0xE0 && s[1] < 0xA0) {
            /* overlong: \xE0\x80\x80-\xE0\x9F\xBF encodes 0000-07FF */
            return 0;
        }
        if (lead == 0xED && s[1] >= 0xA0) {
            /* \xED\xA0\x80-\xED\xBF\xBF would decode to surrogates
               D800-DFFF, which are not valid UTF-8; see
               https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and RFC 3629 */
            return 0;
        }
        follow = 2;
    }
    else if (lead < 0xF5) {
        /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */
        if (s[1] < 0x90 ? lead == 0xF0 : lead == 0xF4) {
            /* either overlong (\xF0\x80..-\xF0\x8F.. fakes 0000-FFFF)
               or past U+10FFFF (\xF4\x90\x80\x80-) */
            return 0;
        }
        follow = 3;
    }
    else {
        /* \xF5-\xFF can never start a sequence */
        return 0;
    }

    /* every continuation byte must lie in \x80-\xBF */
    for (int i = follow; i > 0; i--) {
        if (s[i] < 0x80 || s[i] >= 0xC0) {
            return 0;
        }
    }
    return follow + 1;
}
489
490 int
_PyTokenizer_ensure_utf8(char * line,struct tok_state * tok)491 _PyTokenizer_ensure_utf8(char *line, struct tok_state *tok)
492 {
493 int badchar = 0;
494 unsigned char *c;
495 int length;
496 for (c = (unsigned char *)line; *c; c += length) {
497 if (!(length = valid_utf8(c))) {
498 badchar = *c;
499 break;
500 }
501 }
502 if (badchar) {
503 PyErr_Format(PyExc_SyntaxError,
504 "Non-UTF-8 code starting with '\\x%.2x' "
505 "in file %U on line %i, "
506 "but no encoding declared; "
507 "see https://peps.python.org/pep-0263/ for details",
508 badchar, tok->filename, tok->lineno);
509 return 0;
510 }
511 return 1;
512 }
513
514
515 /* ############## DEBUGGING STUFF ############## */
516
517 #ifdef Py_DEBUG
518 void
_PyTokenizer_print_escape(FILE * f,const char * s,Py_ssize_t size)519 _PyTokenizer_print_escape(FILE *f, const char *s, Py_ssize_t size)
520 {
521 if (s == NULL) {
522 fputs("NULL", f);
523 return;
524 }
525 putc('"', f);
526 while (size-- > 0) {
527 unsigned char c = *s++;
528 switch (c) {
529 case '\n': fputs("\\n", f); break;
530 case '\r': fputs("\\r", f); break;
531 case '\t': fputs("\\t", f); break;
532 case '\f': fputs("\\f", f); break;
533 case '\'': fputs("\\'", f); break;
534 case '"': fputs("\\\"", f); break;
535 default:
536 if (0x20 <= c && c <= 0x7f)
537 putc(c, f);
538 else
539 fprintf(f, "\\x%02x", c);
540 }
541 }
542 putc('"', f);
543 }
544
545 void
_PyTokenizer_tok_dump(int type,char * start,char * end)546 _PyTokenizer_tok_dump(int type, char *start, char *end)
547 {
548 fprintf(stderr, "%s", _PyParser_TokenNames[type]);
549 if (type == NAME || type == NUMBER || type == STRING || type == OP)
550 fprintf(stderr, "(%.*s)", (int)(end - start), start);
551 }
552 #endif
553