1
2 /* Parser implementation */
3
4 /* For a description, see the comments at end of this file */
5
6 /* XXX To do: error recovery */
7
8 #include "Python.h"
9 #include "token.h"
10 #include "grammar.h"
11 #include "node.h"
12 #include "parser.h"
13 #include "errcode.h"
14 #include "graminit.h"
15
16
17 #ifdef Py_DEBUG
18 extern int Py_DebugFlag;
19 #define D(x) if (!Py_DebugFlag); else x
20 #else
21 #define D(x)
22 #endif
23
24
25 /* STACK DATA TYPE */
26
27 static void s_reset(stack *);
28
29 static void
s_reset(stack * s)30 s_reset(stack *s)
31 {
32 s->s_top = &s->s_base[MAXSTACK];
33 }
34
35 #define s_empty(s) ((s)->s_top == &(s)->s_base[MAXSTACK])
36
37 static int
s_push(stack * s,const dfa * d,node * parent)38 s_push(stack *s, const dfa *d, node *parent)
39 {
40 stackentry *top;
41 if (s->s_top == s->s_base) {
42 fprintf(stderr, "s_push: parser stack overflow\n");
43 return E_NOMEM;
44 }
45 top = --s->s_top;
46 top->s_dfa = d;
47 top->s_parent = parent;
48 top->s_state = 0;
49 return 0;
50 }
51
52 #ifdef Py_DEBUG
53
54 static void
s_pop(stack * s)55 s_pop(stack *s)
56 {
57 if (s_empty(s)) {
58 Py_FatalError("parser stack underflow");
59 }
60 s->s_top++;
61 }
62
63 #else /* !Py_DEBUG */
64
65 #define s_pop(s) (s)->s_top++
66
67 #endif
68
69
70 /* PARSER CREATION */
71
72 parser_state *
PyParser_New(grammar * g,int start)73 PyParser_New(grammar *g, int start)
74 {
75 parser_state *ps;
76
77 if (!g->g_accel)
78 PyGrammar_AddAccelerators(g);
79 ps = (parser_state *)PyMem_MALLOC(sizeof(parser_state));
80 if (ps == NULL)
81 return NULL;
82 ps->p_grammar = g;
83 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
84 ps->p_flags = 0;
85 #endif
86 ps->p_tree = PyNode_New(start);
87 if (ps->p_tree == NULL) {
88 PyMem_FREE(ps);
89 return NULL;
90 }
91 s_reset(&ps->p_stack);
92 (void) s_push(&ps->p_stack, PyGrammar_FindDFA(g, start), ps->p_tree);
93 return ps;
94 }
95
96 void
PyParser_Delete(parser_state * ps)97 PyParser_Delete(parser_state *ps)
98 {
99 /* NB If you want to save the parse tree,
100 you must set p_tree to NULL before calling delparser! */
101 PyNode_Free(ps->p_tree);
102 PyMem_FREE(ps);
103 }
104
105
106 /* PARSER STACK OPERATIONS */
107
108 static int
shift(stack * s,int type,char * str,int newstate,int lineno,int col_offset,int end_lineno,int end_col_offset)109 shift(stack *s, int type, char *str, int newstate, int lineno, int col_offset,
110 int end_lineno, int end_col_offset)
111 {
112 int err;
113 assert(!s_empty(s));
114 err = PyNode_AddChild(s->s_top->s_parent, type, str, lineno, col_offset,
115 end_lineno, end_col_offset);
116 if (err)
117 return err;
118 s->s_top->s_state = newstate;
119 return 0;
120 }
121
122 static int
push(stack * s,int type,const dfa * d,int newstate,int lineno,int col_offset,int end_lineno,int end_col_offset)123 push(stack *s, int type, const dfa *d, int newstate, int lineno, int col_offset,
124 int end_lineno, int end_col_offset)
125 {
126 int err;
127 node *n;
128 n = s->s_top->s_parent;
129 assert(!s_empty(s));
130 err = PyNode_AddChild(n, type, (char *)NULL, lineno, col_offset,
131 end_lineno, end_col_offset);
132 if (err)
133 return err;
134 s->s_top->s_state = newstate;
135 return s_push(s, d, CHILD(n, NCH(n)-1));
136 }
137
138
139 /* PARSER PROPER */
140
141 static int
classify(parser_state * ps,int type,const char * str)142 classify(parser_state *ps, int type, const char *str)
143 {
144 grammar *g = ps->p_grammar;
145 int n = g->g_ll.ll_nlabels;
146
147 if (type == NAME) {
148 const label *l = g->g_ll.ll_label;
149 int i;
150 for (i = n; i > 0; i--, l++) {
151 if (l->lb_type != NAME || l->lb_str == NULL ||
152 l->lb_str[0] != str[0] ||
153 strcmp(l->lb_str, str) != 0)
154 continue;
155 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
156 #if 0
157 /* Leaving this in as an example */
158 if (!(ps->p_flags & CO_FUTURE_WITH_STATEMENT)) {
159 if (str[0] == 'w' && strcmp(str, "with") == 0)
160 break; /* not a keyword yet */
161 else if (str[0] == 'a' && strcmp(str, "as") == 0)
162 break; /* not a keyword yet */
163 }
164 #endif
165 #endif
166 D(printf("It's a keyword\n"));
167 return n - i;
168 }
169 }
170
171 {
172 const label *l = g->g_ll.ll_label;
173 int i;
174 for (i = n; i > 0; i--, l++) {
175 if (l->lb_type == type && l->lb_str == NULL) {
176 D(printf("It's a token we know\n"));
177 return n - i;
178 }
179 }
180 }
181
182 D(printf("Illegal token\n"));
183 return -1;
184 }
185
186 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
187 #if 0
188 /* Leaving this in as an example */
189 static void
190 future_hack(parser_state *ps)
191 {
192 node *n = ps->p_stack.s_top->s_parent;
193 node *ch, *cch;
194 int i;
195
196 /* from __future__ import ..., must have at least 4 children */
197 n = CHILD(n, 0);
198 if (NCH(n) < 4)
199 return;
200 ch = CHILD(n, 0);
201 if (STR(ch) == NULL || strcmp(STR(ch), "from") != 0)
202 return;
203 ch = CHILD(n, 1);
204 if (NCH(ch) == 1 && STR(CHILD(ch, 0)) &&
205 strcmp(STR(CHILD(ch, 0)), "__future__") != 0)
206 return;
207 ch = CHILD(n, 3);
208 /* ch can be a star, a parenthesis or import_as_names */
209 if (TYPE(ch) == STAR)
210 return;
211 if (TYPE(ch) == LPAR)
212 ch = CHILD(n, 4);
213
214 for (i = 0; i < NCH(ch); i += 2) {
215 cch = CHILD(ch, i);
216 if (NCH(cch) >= 1 && TYPE(CHILD(cch, 0)) == NAME) {
217 char *str_ch = STR(CHILD(cch, 0));
218 if (strcmp(str_ch, FUTURE_WITH_STATEMENT) == 0) {
219 ps->p_flags |= CO_FUTURE_WITH_STATEMENT;
220 } else if (strcmp(str_ch, FUTURE_PRINT_FUNCTION) == 0) {
221 ps->p_flags |= CO_FUTURE_PRINT_FUNCTION;
222 } else if (strcmp(str_ch, FUTURE_UNICODE_LITERALS) == 0) {
223 ps->p_flags |= CO_FUTURE_UNICODE_LITERALS;
224 }
225 }
226 }
227 }
228 #endif
229 #endif /* future keyword */
230
231 int
PyParser_AddToken(parser_state * ps,int type,char * str,int lineno,int col_offset,int end_lineno,int end_col_offset,int * expected_ret)232 PyParser_AddToken(parser_state *ps, int type, char *str,
233 int lineno, int col_offset,
234 int end_lineno, int end_col_offset,
235 int *expected_ret)
236 {
237 int ilabel;
238 int err;
239
240 D(printf("Token %s/'%s' ... ", _PyParser_TokenNames[type], str));
241
242 /* Find out which label this token is */
243 ilabel = classify(ps, type, str);
244 if (ilabel < 0)
245 return E_SYNTAX;
246
247 /* Loop until the token is shifted or an error occurred */
248 for (;;) {
249 /* Fetch the current dfa and state */
250 const dfa *d = ps->p_stack.s_top->s_dfa;
251 state *s = &d->d_state[ps->p_stack.s_top->s_state];
252
253 D(printf(" DFA '%s', state %d:",
254 d->d_name, ps->p_stack.s_top->s_state));
255
256 /* Check accelerator */
257 if (s->s_lower <= ilabel && ilabel < s->s_upper) {
258 int x = s->s_accel[ilabel - s->s_lower];
259 if (x != -1) {
260 if (x & (1<<7)) {
261 /* Push non-terminal */
262 int nt = (x >> 8) + NT_OFFSET;
263 int arrow = x & ((1<<7)-1);
264 if (nt == func_body_suite && !(ps->p_flags & PyCF_TYPE_COMMENTS)) {
265 /* When parsing type comments is not requested,
266 we can provide better errors about bad indentation
267 by using 'suite' for the body of a funcdef */
268 D(printf(" [switch func_body_suite to suite]"));
269 nt = suite;
270 }
271 const dfa *d1 = PyGrammar_FindDFA(
272 ps->p_grammar, nt);
273 if ((err = push(&ps->p_stack, nt, d1,
274 arrow, lineno, col_offset,
275 end_lineno, end_col_offset)) > 0) {
276 D(printf(" MemError: push\n"));
277 return err;
278 }
279 D(printf(" Push '%s'\n", d1->d_name));
280 continue;
281 }
282
283 /* Shift the token */
284 if ((err = shift(&ps->p_stack, type, str,
285 x, lineno, col_offset,
286 end_lineno, end_col_offset)) > 0) {
287 D(printf(" MemError: shift.\n"));
288 return err;
289 }
290 D(printf(" Shift.\n"));
291 /* Pop while we are in an accept-only state */
292 while (s = &d->d_state
293 [ps->p_stack.s_top->s_state],
294 s->s_accept && s->s_narcs == 1) {
295 D(printf(" DFA '%s', state %d: "
296 "Direct pop.\n",
297 d->d_name,
298 ps->p_stack.s_top->s_state));
299 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
300 #if 0
301 if (d->d_name[0] == 'i' &&
302 strcmp(d->d_name,
303 "import_stmt") == 0)
304 future_hack(ps);
305 #endif
306 #endif
307 s_pop(&ps->p_stack);
308 if (s_empty(&ps->p_stack)) {
309 D(printf(" ACCEPT.\n"));
310 return E_DONE;
311 }
312 d = ps->p_stack.s_top->s_dfa;
313 }
314 return E_OK;
315 }
316 }
317
318 if (s->s_accept) {
319 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
320 #if 0
321 if (d->d_name[0] == 'i' &&
322 strcmp(d->d_name, "import_stmt") == 0)
323 future_hack(ps);
324 #endif
325 #endif
326 /* Pop this dfa and try again */
327 s_pop(&ps->p_stack);
328 D(printf(" Pop ...\n"));
329 if (s_empty(&ps->p_stack)) {
330 D(printf(" Error: bottom of stack.\n"));
331 return E_SYNTAX;
332 }
333 continue;
334 }
335
336 /* Stuck, report syntax error */
337 D(printf(" Error.\n"));
338 if (expected_ret) {
339 if (s->s_lower == s->s_upper - 1) {
340 /* Only one possible expected token */
341 *expected_ret = ps->p_grammar->
342 g_ll.ll_label[s->s_lower].lb_type;
343 }
344 else
345 *expected_ret = -1;
346 }
347 return E_SYNTAX;
348 }
349 }
350
351
352 #ifdef Py_DEBUG
353
354 /* DEBUG OUTPUT */
355
356 void
dumptree(grammar * g,node * n)357 dumptree(grammar *g, node *n)
358 {
359 int i;
360
361 if (n == NULL)
362 printf("NIL");
363 else {
364 label l;
365 l.lb_type = TYPE(n);
366 l.lb_str = STR(n);
367 printf("%s", PyGrammar_LabelRepr(&l));
368 if (ISNONTERMINAL(TYPE(n))) {
369 printf("(");
370 for (i = 0; i < NCH(n); i++) {
371 if (i > 0)
372 printf(",");
373 dumptree(g, CHILD(n, i));
374 }
375 printf(")");
376 }
377 }
378 }
379
380 void
showtree(grammar * g,node * n)381 showtree(grammar *g, node *n)
382 {
383 int i;
384
385 if (n == NULL)
386 return;
387 if (ISNONTERMINAL(TYPE(n))) {
388 for (i = 0; i < NCH(n); i++)
389 showtree(g, CHILD(n, i));
390 }
391 else if (ISTERMINAL(TYPE(n))) {
392 printf("%s", _PyParser_TokenNames[TYPE(n)]);
393 if (TYPE(n) == NUMBER || TYPE(n) == NAME)
394 printf("(%s)", STR(n));
395 printf(" ");
396 }
397 else
398 printf("? ");
399 }
400
401 void
printtree(parser_state * ps)402 printtree(parser_state *ps)
403 {
404 if (Py_DebugFlag) {
405 printf("Parse tree:\n");
406 dumptree(ps->p_grammar, ps->p_tree);
407 printf("\n");
408 printf("Tokens:\n");
409 showtree(ps->p_grammar, ps->p_tree);
410 printf("\n");
411 }
412 printf("Listing:\n");
413 PyNode_ListTree(ps->p_tree);
414 printf("\n");
415 }
416
417 #endif /* Py_DEBUG */
418
419 /*
420
421 Description
422 -----------
423
424 The parser's interface is different than usual: the function addtoken()
425 must be called for each token in the input. This makes it possible to
426 turn it into an incremental parsing system later. The parsing system
427 constructs a parse tree as it goes.
428
429 A parsing rule is represented as a Deterministic Finite-state Automaton
430 (DFA). A node in a DFA represents a state of the parser; an arc represents
431 a transition. Transitions are either labeled with terminal symbols or
432 with non-terminals. When the parser decides to follow an arc labeled
433 with a non-terminal, it is invoked recursively with the DFA representing
434 the parsing rule for that as its initial state; when that DFA accepts,
435 the parser that invoked it continues. The parse tree constructed by the
436 recursively called parser is inserted as a child in the current parse tree.
437
438 The DFA's can be constructed automatically from a more conventional
439 language description. An extended LL(1) grammar (ELL(1)) is suitable.
440 Certain restrictions make the parser's life easier: rules that can produce
441 the empty string should be outlawed (there are other ways to put loops
442 or optional parts in the language). To avoid the need to construct
443 FIRST sets, we can require that all but the last alternative of a rule
444 (really: arc going out of a DFA's state) must begin with a terminal
445 symbol.
446
447 As an example, consider this grammar:
448
449 expr: term (OP term)*
450 term: CONSTANT | '(' expr ')'
451
452 The DFA corresponding to the rule for expr is:
453
454 ------->.---term-->.------->
455 ^ |
456 | |
457 \----OP----/
458
459 The parse tree generated for the input a+b is:
460
461 (expr: (term: (NAME: a)), (OP: +), (term: (NAME: b)))
462
463 */
464