/* Parser-tokenizer link implementation */

#include "Python.h"

#include <stdint.h>

#include "tokenizer.h"
#include "node.h"
#include "grammar.h"
#include "parser.h"
#include "parsetok.h"
#include "errcode.h"
#include "graminit.h"
12
13
14 /* Forward */
15 static node *parsetok(struct tok_state *, grammar *, int, perrdetail *, int *);
16 static int initerr(perrdetail *err_ret, PyObject * filename);
17
/* Growable array of (lineno, comment-text) pairs.  parsetok() uses it to
   collect "type: ignore" comments seen during tokenization so they can be
   attached to the parse tree's ENDMARKER after parsing finishes. */
typedef struct {
    struct {
        int lineno;      /* line on which the comment appeared */
        char *comment;   /* owned by the array; released with PyObject_FREE */
    } *items;
    size_t size;         /* allocated capacity, in items */
    size_t num_items;    /* number of slots currently in use */
} growable_comment_array;
26
27 static int
growable_comment_array_init(growable_comment_array * arr,size_t initial_size)28 growable_comment_array_init(growable_comment_array *arr, size_t initial_size) {
29 assert(initial_size > 0);
30 arr->items = malloc(initial_size * sizeof(*arr->items));
31 arr->size = initial_size;
32 arr->num_items = 0;
33
34 return arr->items != NULL;
35 }
36
37 static int
growable_comment_array_add(growable_comment_array * arr,int lineno,char * comment)38 growable_comment_array_add(growable_comment_array *arr, int lineno, char *comment) {
39 if (arr->num_items >= arr->size) {
40 size_t new_size = arr->size * 2;
41 void *new_items_array = realloc(arr->items, new_size * sizeof(*arr->items));
42 if (!new_items_array) {
43 return 0;
44 }
45 arr->items = new_items_array;
46 arr->size = new_size;
47 }
48
49 arr->items[arr->num_items].lineno = lineno;
50 arr->items[arr->num_items].comment = comment;
51 arr->num_items++;
52 return 1;
53 }
54
55 static void
growable_comment_array_deallocate(growable_comment_array * arr)56 growable_comment_array_deallocate(growable_comment_array *arr) {
57 for (unsigned i = 0; i < arr->num_items; i++) {
58 PyObject_FREE(arr->items[i].comment);
59 }
60 free(arr->items);
61 }
62
/* Parse input coming from a string. Return error code, print some errors. */

/* Convenience wrapper: parse 's' with no filename and no flags.
   Error details are recorded in '*err_ret'; returns the tree or NULL. */
node *
PyParser_ParseString(const char *s, grammar *g, int start, perrdetail *err_ret)
{
    return PyParser_ParseStringFlagsFilename(s, NULL, g, start, err_ret, 0);
}
69
/* Like PyParser_ParseString but with explicit PyPARSE_* flags.
   Forwards with a NULL filename. */
node *
PyParser_ParseStringFlags(const char *s, grammar *g, int start,
                          perrdetail *err_ret, int flags)
{
    return PyParser_ParseStringFlagsFilename(s, NULL,
                                             g, start, err_ret, flags);
}
77
/* Adapter from by-value flags to the Ex variant, which takes flags by
   pointer (so the parser can report flags it turned on back to the
   caller; this wrapper discards that feedback). */
node *
PyParser_ParseStringFlagsFilename(const char *s, const char *filename,
                                  grammar *g, int start,
                                  perrdetail *err_ret, int flags)
{
    int iflags = flags;
    return PyParser_ParseStringFlagsFilenameEx(s, filename, g, start,
                                               err_ret, &iflags);
}
87
/* Parse the UTF-8/encoded buffer 's' using grammar 'g' starting at symbol
   'start'.  'filename' (may be NULL; initerr substitutes "<string>") is
   used for error reporting.  '*flags' is read for PyPARSE_* options and
   may be written back by parsetok().  Returns the tree or NULL, with the
   failure cause recorded in err_ret->error. */
node *
PyParser_ParseStringObject(const char *s, PyObject *filename,
                           grammar *g, int start,
                           perrdetail *err_ret, int *flags)
{
    struct tok_state *tok;
    /* file_input gets "exec input" tokenization (e.g. final-newline rules). */
    int exec_input = start == file_input;

    if (initerr(err_ret, filename) < 0)
        return NULL;

    /* Give audit hooks a chance to reject the compilation. */
    if (PySys_Audit("compile", "yO", s, err_ret->filename) < 0) {
        err_ret->error = E_ERROR;
        return NULL;
    }

    /* PyPARSE_IGNORE_COOKIE: treat 's' as UTF-8 and skip the
       "coding:" cookie scan; otherwise honor the cookie. */
    if (*flags & PyPARSE_IGNORE_COOKIE)
        tok = PyTokenizer_FromUTF8(s, exec_input);
    else
        tok = PyTokenizer_FromString(s, exec_input);
    if (tok == NULL) {
        /* A pending exception means the source failed to decode;
           otherwise the tokenizer allocation itself failed. */
        err_ret->error = PyErr_Occurred() ? E_DECODE : E_NOMEM;
        return NULL;
    }
    if (*flags & PyPARSE_TYPE_COMMENTS) {
        tok->type_comments = 1;
    }

    /* The tokenizer holds its own reference to the filename. */
    Py_INCREF(err_ret->filename);
    tok->filename = err_ret->filename;
    if (*flags & PyPARSE_ASYNC_HACKS)
        tok->async_hacks = 1;
    /* parsetok() takes ownership of 'tok' and frees it on all paths. */
    return parsetok(tok, g, start, err_ret, flags);
}
122
123 node *
PyParser_ParseStringFlagsFilenameEx(const char * s,const char * filename_str,grammar * g,int start,perrdetail * err_ret,int * flags)124 PyParser_ParseStringFlagsFilenameEx(const char *s, const char *filename_str,
125 grammar *g, int start,
126 perrdetail *err_ret, int *flags)
127 {
128 node *n;
129 PyObject *filename = NULL;
130 if (filename_str != NULL) {
131 filename = PyUnicode_DecodeFSDefault(filename_str);
132 if (filename == NULL) {
133 err_ret->error = E_ERROR;
134 return NULL;
135 }
136 }
137 n = PyParser_ParseStringObject(s, filename, g, start, err_ret, flags);
138 Py_XDECREF(filename);
139 return n;
140 }
141
/* Parse input coming from a file. Return error code, print some errors. */

/* Convenience wrapper: parse from 'fp' with default encoding and no flags.
   ps1/ps2 are the interactive prompts (may be NULL for non-interactive). */
node *
PyParser_ParseFile(FILE *fp, const char *filename, grammar *g, int start,
                   const char *ps1, const char *ps2,
                   perrdetail *err_ret)
{
    return PyParser_ParseFileFlags(fp, filename, NULL,
                                   g, start, ps1, ps2, err_ret, 0);
}
152
/* Adapter from by-value flags to the Ex variant, which takes flags by
   pointer; this wrapper discards any flags reported back by the parser. */
node *
PyParser_ParseFileFlags(FILE *fp, const char *filename, const char *enc,
                        grammar *g, int start,
                        const char *ps1, const char *ps2,
                        perrdetail *err_ret, int flags)
{
    int iflags = flags;
    return PyParser_ParseFileFlagsEx(fp, filename, enc, g, start, ps1,
                                     ps2, err_ret, &iflags);
}
163
164 node *
PyParser_ParseFileObject(FILE * fp,PyObject * filename,const char * enc,grammar * g,int start,const char * ps1,const char * ps2,perrdetail * err_ret,int * flags)165 PyParser_ParseFileObject(FILE *fp, PyObject *filename,
166 const char *enc, grammar *g, int start,
167 const char *ps1, const char *ps2,
168 perrdetail *err_ret, int *flags)
169 {
170 struct tok_state *tok;
171
172 if (initerr(err_ret, filename) < 0)
173 return NULL;
174
175 if (PySys_Audit("compile", "OO", Py_None, err_ret->filename) < 0) {
176 return NULL;
177 }
178
179 if ((tok = PyTokenizer_FromFile(fp, enc, ps1, ps2)) == NULL) {
180 err_ret->error = E_NOMEM;
181 return NULL;
182 }
183 if (*flags & PyPARSE_TYPE_COMMENTS) {
184 tok->type_comments = 1;
185 }
186 Py_INCREF(err_ret->filename);
187 tok->filename = err_ret->filename;
188 return parsetok(tok, g, start, err_ret, flags);
189 }
190
191 node *
PyParser_ParseFileFlagsEx(FILE * fp,const char * filename,const char * enc,grammar * g,int start,const char * ps1,const char * ps2,perrdetail * err_ret,int * flags)192 PyParser_ParseFileFlagsEx(FILE *fp, const char *filename,
193 const char *enc, grammar *g, int start,
194 const char *ps1, const char *ps2,
195 perrdetail *err_ret, int *flags)
196 {
197 node *n;
198 PyObject *fileobj = NULL;
199 if (filename != NULL) {
200 fileobj = PyUnicode_DecodeFSDefault(filename);
201 if (fileobj == NULL) {
202 err_ret->error = E_ERROR;
203 return NULL;
204 }
205 }
206 n = PyParser_ParseFileObject(fp, fileobj, enc, g,
207 start, ps1, ps2, err_ret, flags);
208 Py_XDECREF(fileobj);
209 return n;
210 }
211
/* Parse input coming from the given tokenizer structure.
   Return error code.

   Core driver: pulls tokens from 'tok' one at a time and shifts them into
   a fresh parser state until the parser reports completion (E_DONE) or an
   error.  Takes ownership of 'tok' and frees it on every path.  On success
   returns the tree (possibly wrapped in an encoding_decl node); on failure
   returns NULL and fills err_ret (error code, lineno, offset, text). */

static node *
parsetok(struct tok_state *tok, grammar *g, int start, perrdetail *err_ret,
         int *flags)
{
    parser_state *ps;
    node *n;
    int started = 0;    /* seen any token before the final ENDMARKER? */
    int col_offset, end_col_offset;
    growable_comment_array type_ignores;   /* collected "type: ignore" comments */

    if (!growable_comment_array_init(&type_ignores, 10)) {
        err_ret->error = E_NOMEM;
        PyTokenizer_Free(tok);
        return NULL;
    }

    if ((ps = PyParser_New(g, start)) == NULL) {
        err_ret->error = E_NOMEM;
        growable_comment_array_deallocate(&type_ignores);
        PyTokenizer_Free(tok);
        return NULL;
    }
#ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
    /* Propagate requested compile flags into the parser state. */
    if (*flags & PyPARSE_BARRY_AS_BDFL)
        ps->p_flags |= CO_FUTURE_BARRY_AS_BDFL;
    if (*flags & PyPARSE_TYPE_COMMENTS)
        ps->p_flags |= PyCF_TYPE_COMMENTS;
#endif

    /* Tokenize-and-shift loop: runs until PyParser_AddToken returns
       something other than E_OK, or a tokenizer/allocation error. */
    for (;;) {
        const char *a, *b;   /* start/end of the token's text in tok's buffer */
        int type;
        size_t len;
        char *str;           /* heap copy of the token text */
        col_offset = -1;
        int lineno;
        const char *line_start;

        type = PyTokenizer_Get(tok, &a, &b);

        /* Copy the token text into a PyObject_MALLOC'd NUL-terminated
           buffer; on a successful AddToken the tree takes ownership. */
        len = (a != NULL && b != NULL) ? b - a : 0;
        str = (char *) PyObject_MALLOC(len + 1);
        if (str == NULL) {
            err_ret->error = E_NOMEM;
            break;
        }
        if (len > 0)
            strncpy(str, a, len);
        str[len] = '\0';

#ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
        if (type == NOTEQUAL) {
            /* barry_as_FLUFL swaps the accepted spelling of the
               inequality operator between "!=" and "<>". */
            if (!(ps->p_flags & CO_FUTURE_BARRY_AS_BDFL) &&
                            strcmp(str, "!=")) {
                PyObject_FREE(str);
                err_ret->error = E_SYNTAX;
                break;
            }
            else if ((ps->p_flags & CO_FUTURE_BARRY_AS_BDFL) &&
                            strcmp(str, "<>")) {
                PyObject_FREE(str);
                err_ret->expected = NOTEQUAL;
                err_ret->error = E_SYNTAX;
                break;
            }
        }
#endif

        /* Nodes of type STRING, especially multi line strings
           must be handled differently in order to get both
           the starting line number and the column offset right.
           (cf. issue 16806) */
        lineno = type == STRING ? tok->first_lineno : tok->lineno;
        line_start = type == STRING ? tok->multi_line_start : tok->line_start;
        if (a != NULL && a >= line_start) {
            col_offset = Py_SAFE_DOWNCAST(a - line_start,
                                          intptr_t, int);
        }
        else {
            col_offset = -1;
        }

        if (b != NULL && b >= tok->line_start) {
            end_col_offset = Py_SAFE_DOWNCAST(b - tok->line_start,
                                              intptr_t, int);
        }
        else {
            end_col_offset = -1;
        }

        /* "type: ignore" comments are not fed to the parser; stash them
           so they can be attached to the tree's ENDMARKER afterwards.
           On success the array takes ownership of 'str'. */
        if (type == TYPE_IGNORE) {
            if (!growable_comment_array_add(&type_ignores, tok->lineno, str)) {
                /* NOTE(review): 'str' appears to leak on this path -- confirm. */
                err_ret->error = E_NOMEM;
                break;
            }
            continue;
        }

        if (type == ERRORTOKEN) {
            /* Tokenizer-level failure; tok->done carries the E_* code.
               NOTE(review): 'str' appears to leak on this path -- confirm. */
            err_ret->error = tok->done;
            break;
        }
        if (type == ENDMARKER && started) {
            type = NEWLINE; /* Add an extra newline */
            started = 0;
            /* Add the right number of dedent tokens,
               except if a certain flag is given --
               codeop.py uses this. */
            if (tok->indent &&
                !(*flags & PyPARSE_DONT_IMPLY_DEDENT))
            {
                tok->pendin = -tok->indent;
                tok->indent = 0;
            }
        }
        else {
            started = 1;
        }

        /* Shift the token into the parser.  E_DONE means the parse is
           complete and accepted; anything else non-E_OK is an error.
           In the error (non-E_DONE) case 'str' was not consumed. */
        if ((err_ret->error =
             PyParser_AddToken(ps, (int)type, str,
                               lineno, col_offset, tok->lineno, end_col_offset,
                               &(err_ret->expected))) != E_OK) {
            if (tok->done == E_EOF && !ISWHITESPACE(type)) {
                tok->done = E_SYNTAX;
            }
            if (err_ret->error != E_DONE) {
                PyObject_FREE(str);
                err_ret->token = type;
            }
            break;
        }
    }

    if (err_ret->error == E_DONE) {
        /* Success: steal the finished tree out of the parser state. */
        n = ps->p_tree;
        ps->p_tree = NULL;

        if (n->n_type == file_input) {
            /* Put type_ignore nodes in the ENDMARKER of file_input. */
            int num;
            node *ch;
            size_t i;

            num = NCH(n);
            ch = CHILD(n, num - 1);
            REQ(ch, ENDMARKER);

            for (i = 0; i < type_ignores.num_items; i++) {
                int res = PyNode_AddChild(ch, TYPE_IGNORE, type_ignores.items[i].comment,
                                          type_ignores.items[i].lineno, 0,
                                          type_ignores.items[i].lineno, 0);
                if (res != 0) {
                    err_ret->error = res;
                    PyNode_Free(n);
                    n = NULL;
                    break;
                }
                /* Ownership of the comment moved into the tree; clear the
                   slot so deallocate() below won't double-free it. */
                type_ignores.items[i].comment = NULL;
            }
        }

        /* Check that the source for a single input statement really
           is a single statement by looking at what is left in the
           buffer after parsing.  Trailing whitespace and comments
           are OK. */
        if (err_ret->error == E_DONE && start == single_input) {
            const char *cur = tok->cur;
            char c = *tok->cur;

            for (;;) {
                /* Skip whitespace (space, tab, newline, form feed). */
                while (c == ' ' || c == '\t' || c == '\n' || c == '\014')
                    c = *++cur;

                if (!c)
                    break;

                if (c != '#') {
                    err_ret->error = E_BADSINGLE;
                    PyNode_Free(n);
                    n = NULL;
                    break;
                }

                /* Suck up comment. */
                while (c && c != '\n')
                    c = *++cur;
            }
        }
    }
    else
        n = NULL;

    growable_comment_array_deallocate(&type_ignores);

#ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
    /* Report back any flags the parser itself turned on. */
    *flags = ps->p_flags;
#endif
    PyParser_Delete(ps);

    if (n == NULL) {
        /* Failure: fill in the error details for the caller. */
        if (tok->done == E_EOF)
            err_ret->error = E_EOF;
        err_ret->lineno = tok->lineno;
        if (tok->buf != NULL) {
            size_t len;
            assert(tok->cur - tok->buf < INT_MAX);
            /* if we've managed to parse a token, point the offset to its start,
             * else use the current reading position of the tokenizer
             */
            err_ret->offset = col_offset != -1 ? col_offset + 1 : ((int)(tok->cur - tok->buf));
            len = tok->inp - tok->buf;
            /* Keep a copy of the offending source line(s) for the report. */
            err_ret->text = (char *) PyObject_MALLOC(len + 1);
            if (err_ret->text != NULL) {
                if (len > 0)
                    strncpy(err_ret->text, tok->buf, len);
                err_ret->text[len] = '\0';
            }
        }
    } else if (tok->encoding != NULL) {
        /* 'nodes->n_str' uses PyObject_*, while 'tok->encoding' was
         * allocated using PyMem_
         */
        /* Wrap the tree in an encoding_decl node that records the source
           encoding, copying the string into PyObject_MALLOC'd storage to
           match the allocator node strings are freed with. */
        node* r = PyNode_New(encoding_decl);
        if (r)
            r->n_str = PyObject_MALLOC(strlen(tok->encoding)+1);
        if (!r || !r->n_str) {
            err_ret->error = E_NOMEM;
            if (r)
                PyObject_FREE(r);
            n = NULL;
            goto done;
        }
        strcpy(r->n_str, tok->encoding);
        PyMem_FREE(tok->encoding);
        tok->encoding = NULL;
        r->n_nchildren = 1;
        r->n_child = n;
        n = r;
    }

  done:
    PyTokenizer_Free(tok);

    if (n != NULL) {
        _PyNode_FinalizeEndPos(n);
    }
    return n;
}
464
465 static int
initerr(perrdetail * err_ret,PyObject * filename)466 initerr(perrdetail *err_ret, PyObject *filename)
467 {
468 err_ret->error = E_OK;
469 err_ret->lineno = 0;
470 err_ret->offset = 0;
471 err_ret->text = NULL;
472 err_ret->token = -1;
473 err_ret->expected = -1;
474 if (filename) {
475 Py_INCREF(filename);
476 err_ret->filename = filename;
477 }
478 else {
479 err_ret->filename = PyUnicode_FromString("<string>");
480 if (err_ret->filename == NULL) {
481 err_ret->error = E_ERROR;
482 return -1;
483 }
484 }
485 return 0;
486 }
487