/src/Python-3.8.3/Parser/parser.c
Line | Count | Source (jump to first uncovered line) |
1 | | |
2 | | /* Parser implementation */ |
3 | | |
4 | | /* For a description, see the comments at end of this file */ |
5 | | |
6 | | /* XXX To do: error recovery */ |
7 | | |
8 | | #include "Python.h" |
9 | | #include "token.h" |
10 | | #include "grammar.h" |
11 | | #include "node.h" |
12 | | #include "parser.h" |
13 | | #include "errcode.h" |
14 | | #include "graminit.h" |
15 | | |
16 | | |
17 | | #ifdef Py_DEBUG |
18 | | extern int Py_DebugFlag; |
19 | | #define D(x) if (!Py_DebugFlag); else x |
20 | | #else |
21 | | #define D(x) |
22 | | #endif |
23 | | |
24 | | |
25 | | /* STACK DATA TYPE */ |
26 | | |
27 | | static void s_reset(stack *); |
28 | | |
29 | | static void |
30 | | s_reset(stack *s) |
31 | 16 | { |
32 | 16 | s->s_top = &s->s_base[MAXSTACK]; |
33 | 16 | } |
34 | | |
35 | 6.62k | #define s_empty(s) ((s)->s_top == &(s)->s_base[MAXSTACK]) |
36 | | |
37 | | static int |
38 | | s_push(stack *s, const dfa *d, node *parent) |
39 | 6.62k | { |
40 | 6.62k | stackentry *top; |
41 | 6.62k | if (s->s_top == s->s_base) { |
42 | 0 | fprintf(stderr, "s_push: parser stack overflow\n"); |
43 | 0 | return E_NOMEM; |
44 | 0 | } |
45 | 6.62k | top = --s->s_top; |
46 | 6.62k | top->s_dfa = d; |
47 | 6.62k | top->s_parent = parent; |
48 | 6.62k | top->s_state = 0; |
49 | 6.62k | return 0; |
50 | 6.62k | } |
51 | | |
52 | | #ifdef Py_DEBUG |
53 | | |
54 | | static void |
55 | | s_pop(stack *s) |
56 | | { |
57 | | if (s_empty(s)) |
58 | | Py_FatalError("s_pop: parser stack underflow -- FATAL"); |
59 | | s->s_top++; |
60 | | } |
61 | | |
62 | | #else /* !Py_DEBUG */ |
63 | | |
64 | 6.62k | #define s_pop(s) (s)->s_top++ |
65 | | |
66 | | #endif |
67 | | |
68 | | |
69 | | /* PARSER CREATION */ |
70 | | |
71 | | parser_state * |
72 | | PyParser_New(grammar *g, int start) |
73 | 16 | { |
74 | 16 | parser_state *ps; |
75 | | |
76 | 16 | if (!g->g_accel) |
77 | 14 | PyGrammar_AddAccelerators(g); |
78 | 16 | ps = (parser_state *)PyMem_MALLOC(sizeof(parser_state)); |
79 | 16 | if (ps == NULL) |
80 | 0 | return NULL; |
81 | 16 | ps->p_grammar = g; |
82 | 16 | #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD |
83 | 16 | ps->p_flags = 0; |
84 | 16 | #endif |
85 | 16 | ps->p_tree = PyNode_New(start); |
86 | 16 | if (ps->p_tree == NULL) { |
87 | 0 | PyMem_FREE(ps); |
88 | 0 | return NULL; |
89 | 0 | } |
90 | 16 | s_reset(&ps->p_stack); |
91 | 16 | (void) s_push(&ps->p_stack, PyGrammar_FindDFA(g, start), ps->p_tree); |
92 | 16 | return ps; |
93 | 16 | } |
94 | | |
95 | | void |
96 | | PyParser_Delete(parser_state *ps) |
97 | 16 | { |
98 | | /* NB If you want to save the parse tree, |
99 | | you must set p_tree to NULL before calling delparser! */ |
100 | 16 | PyNode_Free(ps->p_tree); |
101 | 16 | PyMem_FREE(ps); |
102 | 16 | } |
103 | | |
104 | | |
105 | | /* PARSER STACK OPERATIONS */ |
106 | | |
107 | | static int |
108 | | shift(stack *s, int type, char *str, int newstate, int lineno, int col_offset, |
109 | | int end_lineno, int end_col_offset) |
110 | 1.41k | { |
111 | 1.41k | int err; |
112 | 1.41k | assert(!s_empty(s)); |
113 | 1.41k | err = PyNode_AddChild(s->s_top->s_parent, type, str, lineno, col_offset, |
114 | 1.41k | end_lineno, end_col_offset); |
115 | 1.41k | if (err) |
116 | 0 | return err; |
117 | 1.41k | s->s_top->s_state = newstate; |
118 | 1.41k | return 0; |
119 | 1.41k | } |
120 | | |
121 | | static int |
122 | | push(stack *s, int type, const dfa *d, int newstate, int lineno, int col_offset, |
123 | | int end_lineno, int end_col_offset) |
124 | 6.60k | { |
125 | 6.60k | int err; |
126 | 6.60k | node *n; |
127 | 6.60k | n = s->s_top->s_parent; |
128 | 6.60k | assert(!s_empty(s)); |
129 | 6.60k | err = PyNode_AddChild(n, type, (char *)NULL, lineno, col_offset, |
130 | 6.60k | end_lineno, end_col_offset); |
131 | 6.60k | if (err) |
132 | 0 | return err; |
133 | 6.60k | s->s_top->s_state = newstate; |
134 | 6.60k | return s_push(s, d, CHILD(n, NCH(n)-1)); |
135 | 6.60k | } |
136 | | |
137 | | |
138 | | /* PARSER PROPER */ |
139 | | |
140 | | static int |
141 | | classify(parser_state *ps, int type, const char *str) |
142 | 1.41k | { |
143 | 1.41k | grammar *g = ps->p_grammar; |
144 | 1.41k | int n = g->g_ll.ll_nlabels; |
145 | | |
146 | 1.41k | if (type == NAME) { |
147 | 524 | const label *l = g->g_ll.ll_label; |
148 | 524 | int i; |
149 | 80.6k | for (i = n; i > 0; i--, l++) { |
150 | 80.2k | if (l->lb_type != NAME || l->lb_str == NULL || |
151 | 80.2k | l->lb_str[0] != str[0] || |
152 | 80.2k | strcmp(l->lb_str, str) != 0) |
153 | 80.1k | continue; |
154 | 134 | #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD |
155 | | #if 0 |
156 | | /* Leaving this in as an example */ |
157 | | if (!(ps->p_flags & CO_FUTURE_WITH_STATEMENT)) { |
158 | | if (str[0] == 'w' && strcmp(str, "with") == 0) |
159 | | break; /* not a keyword yet */ |
160 | | else if (str[0] == 'a' && strcmp(str, "as") == 0) |
161 | | break; /* not a keyword yet */ |
162 | | } |
163 | | #endif |
164 | 134 | #endif |
165 | 134 | D(printf("It's a keyword\n")); |
166 | 134 | return n - i; |
167 | 80.2k | } |
168 | 524 | } |
169 | | |
170 | 1.27k | { |
171 | 1.27k | const label *l = g->g_ll.ll_label; |
172 | 1.27k | int i; |
173 | 66.7k | for (i = n; i > 0; i--, l++) { |
174 | 66.7k | if (l->lb_type == type && l->lb_str == NULL) { |
175 | 1.27k | D(printf("It's a token we know\n")); |
176 | 1.27k | return n - i; |
177 | 1.27k | } |
178 | 66.7k | } |
179 | 1.27k | } |
180 | | |
181 | 0 | D(printf("Illegal token\n")); |
182 | 0 | return -1; |
183 | 1.27k | } |
184 | | |
185 | | #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD |
186 | | #if 0 |
187 | | /* Leaving this in as an example */ |
188 | | static void |
189 | | future_hack(parser_state *ps) |
190 | | { |
191 | | node *n = ps->p_stack.s_top->s_parent; |
192 | | node *ch, *cch; |
193 | | int i; |
194 | | |
195 | | /* from __future__ import ..., must have at least 4 children */ |
196 | | n = CHILD(n, 0); |
197 | | if (NCH(n) < 4) |
198 | | return; |
199 | | ch = CHILD(n, 0); |
200 | | if (STR(ch) == NULL || strcmp(STR(ch), "from") != 0) |
201 | | return; |
202 | | ch = CHILD(n, 1); |
203 | | if (NCH(ch) == 1 && STR(CHILD(ch, 0)) && |
204 | | strcmp(STR(CHILD(ch, 0)), "__future__") != 0) |
205 | | return; |
206 | | ch = CHILD(n, 3); |
207 | | /* ch can be a star, a parenthesis or import_as_names */ |
208 | | if (TYPE(ch) == STAR) |
209 | | return; |
210 | | if (TYPE(ch) == LPAR) |
211 | | ch = CHILD(n, 4); |
212 | | |
213 | | for (i = 0; i < NCH(ch); i += 2) { |
214 | | cch = CHILD(ch, i); |
215 | | if (NCH(cch) >= 1 && TYPE(CHILD(cch, 0)) == NAME) { |
216 | | char *str_ch = STR(CHILD(cch, 0)); |
217 | | if (strcmp(str_ch, FUTURE_WITH_STATEMENT) == 0) { |
218 | | ps->p_flags |= CO_FUTURE_WITH_STATEMENT; |
219 | | } else if (strcmp(str_ch, FUTURE_PRINT_FUNCTION) == 0) { |
220 | | ps->p_flags |= CO_FUTURE_PRINT_FUNCTION; |
221 | | } else if (strcmp(str_ch, FUTURE_UNICODE_LITERALS) == 0) { |
222 | | ps->p_flags |= CO_FUTURE_UNICODE_LITERALS; |
223 | | } |
224 | | } |
225 | | } |
226 | | } |
227 | | #endif |
228 | | #endif /* future keyword */ |
229 | | |
230 | | int |
231 | | PyParser_AddToken(parser_state *ps, int type, char *str, |
232 | | int lineno, int col_offset, |
233 | | int end_lineno, int end_col_offset, |
234 | | int *expected_ret) |
235 | 1.41k | { |
236 | 1.41k | int ilabel; |
237 | 1.41k | int err; |
238 | | |
239 | 1.41k | D(printf("Token %s/'%s' ... ", _PyParser_TokenNames[type], str)); |
240 | | |
241 | | /* Find out which label this token is */ |
242 | 1.41k | ilabel = classify(ps, type, str); |
243 | 1.41k | if (ilabel < 0) |
244 | 0 | return E_SYNTAX; |
245 | | |
246 | | /* Loop until the token is shifted or an error occurred */ |
247 | 13.7k | for (;;) { |
248 | | /* Fetch the current dfa and state */ |
249 | 13.7k | const dfa *d = ps->p_stack.s_top->s_dfa; |
250 | 13.7k | state *s = &d->d_state[ps->p_stack.s_top->s_state]; |
251 | | |
252 | 13.7k | D(printf(" DFA '%s', state %d:", |
253 | 13.7k | d->d_name, ps->p_stack.s_top->s_state)); |
254 | | |
255 | | /* Check accelerator */ |
256 | 13.7k | if (s->s_lower <= ilabel && ilabel < s->s_upper) { |
257 | 8.85k | int x = s->s_accel[ilabel - s->s_lower]; |
258 | 8.85k | if (x != -1) { |
259 | 8.01k | if (x & (1<<7)) { |
260 | | /* Push non-terminal */ |
261 | 6.60k | int nt = (x >> 8) + NT_OFFSET; |
262 | 6.60k | int arrow = x & ((1<<7)-1); |
263 | 6.60k | if (nt == func_body_suite && !(ps->p_flags & PyCF_TYPE_COMMENTS)) { |
264 | | /* When parsing type comments is not requested, |
265 | | we can provide better errors about bad indentation |
266 | | by using 'suite' for the body of a funcdef */ |
267 | 4 | D(printf(" [switch func_body_suite to suite]")); |
268 | 4 | nt = suite; |
269 | 4 | } |
270 | 6.60k | const dfa *d1 = PyGrammar_FindDFA( |
271 | 6.60k | ps->p_grammar, nt); |
272 | 6.60k | if ((err = push(&ps->p_stack, nt, d1, |
273 | 6.60k | arrow, lineno, col_offset, |
274 | 6.60k | end_lineno, end_col_offset)) > 0) { |
275 | 0 | D(printf(" MemError: push\n")); |
276 | 0 | return err; |
277 | 0 | } |
278 | 6.60k | D(printf(" Push '%s'\n", d1->d_name)); |
279 | 6.60k | continue; |
280 | 6.60k | } |
281 | | |
282 | | /* Shift the token */ |
283 | 1.41k | if ((err = shift(&ps->p_stack, type, str, |
284 | 1.41k | x, lineno, col_offset, |
285 | 1.41k | end_lineno, end_col_offset)) > 0) { |
286 | 0 | D(printf(" MemError: shift.\n")); |
287 | 0 | return err; |
288 | 0 | } |
289 | 1.41k | D(printf(" Shift.\n")); |
290 | | /* Pop while we are in an accept-only state */ |
291 | 2.27k | while (s = &d->d_state |
292 | 2.27k | [ps->p_stack.s_top->s_state], |
293 | 2.27k | s->s_accept && s->s_narcs == 1) { |
294 | 883 | D(printf(" DFA '%s', state %d: " |
295 | 883 | "Direct pop.\n", |
296 | 883 | d->d_name, |
297 | 883 | ps->p_stack.s_top->s_state)); |
298 | 883 | #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD |
299 | | #if 0 |
300 | | if (d->d_name[0] == 'i' && |
301 | | strcmp(d->d_name, |
302 | | "import_stmt") == 0) |
303 | | future_hack(ps); |
304 | | #endif |
305 | 883 | #endif |
306 | 883 | s_pop(&ps->p_stack); |
307 | 883 | if (s_empty(&ps->p_stack)) { |
308 | 16 | D(printf(" ACCEPT.\n")); |
309 | 16 | return E_DONE; |
310 | 16 | } |
311 | 867 | d = ps->p_stack.s_top->s_dfa; |
312 | 867 | } |
313 | 1.39k | return E_OK; |
314 | 1.41k | } |
315 | 8.85k | } |
316 | | |
317 | 5.73k | if (s->s_accept) { |
318 | 5.73k | #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD |
319 | | #if 0 |
320 | | if (d->d_name[0] == 'i' && |
321 | | strcmp(d->d_name, "import_stmt") == 0) |
322 | | future_hack(ps); |
323 | | #endif |
324 | 5.73k | #endif |
325 | | /* Pop this dfa and try again */ |
326 | 5.73k | s_pop(&ps->p_stack); |
327 | 5.73k | D(printf(" Pop ...\n")); |
328 | 5.73k | if (s_empty(&ps->p_stack)) { |
329 | 0 | D(printf(" Error: bottom of stack.\n")); |
330 | 0 | return E_SYNTAX; |
331 | 0 | } |
332 | 5.73k | continue; |
333 | 5.73k | } |
334 | | |
335 | | /* Stuck, report syntax error */ |
336 | 0 | D(printf(" Error.\n")); |
337 | 0 | if (expected_ret) { |
338 | 0 | if (s->s_lower == s->s_upper - 1) { |
339 | | /* Only one possible expected token */ |
340 | 0 | *expected_ret = ps->p_grammar-> |
341 | 0 | g_ll.ll_label[s->s_lower].lb_type; |
342 | 0 | } |
343 | 0 | else |
344 | 0 | *expected_ret = -1; |
345 | 0 | } |
346 | 0 | return E_SYNTAX; |
347 | 5.73k | } |
348 | 1.41k | } |
349 | | |
350 | | |
351 | | #ifdef Py_DEBUG |
352 | | |
353 | | /* DEBUG OUTPUT */ |
354 | | |
355 | | void |
356 | | dumptree(grammar *g, node *n) |
357 | | { |
358 | | int i; |
359 | | |
360 | | if (n == NULL) |
361 | | printf("NIL"); |
362 | | else { |
363 | | label l; |
364 | | l.lb_type = TYPE(n); |
365 | | l.lb_str = STR(n); |
366 | | printf("%s", PyGrammar_LabelRepr(&l)); |
367 | | if (ISNONTERMINAL(TYPE(n))) { |
368 | | printf("("); |
369 | | for (i = 0; i < NCH(n); i++) { |
370 | | if (i > 0) |
371 | | printf(","); |
372 | | dumptree(g, CHILD(n, i)); |
373 | | } |
374 | | printf(")"); |
375 | | } |
376 | | } |
377 | | } |
378 | | |
379 | | void |
380 | | showtree(grammar *g, node *n) |
381 | | { |
382 | | int i; |
383 | | |
384 | | if (n == NULL) |
385 | | return; |
386 | | if (ISNONTERMINAL(TYPE(n))) { |
387 | | for (i = 0; i < NCH(n); i++) |
388 | | showtree(g, CHILD(n, i)); |
389 | | } |
390 | | else if (ISTERMINAL(TYPE(n))) { |
391 | | printf("%s", _PyParser_TokenNames[TYPE(n)]); |
392 | | if (TYPE(n) == NUMBER || TYPE(n) == NAME) |
393 | | printf("(%s)", STR(n)); |
394 | | printf(" "); |
395 | | } |
396 | | else |
397 | | printf("? "); |
398 | | } |
399 | | |
400 | | void |
401 | | printtree(parser_state *ps) |
402 | | { |
403 | | if (Py_DebugFlag) { |
404 | | printf("Parse tree:\n"); |
405 | | dumptree(ps->p_grammar, ps->p_tree); |
406 | | printf("\n"); |
407 | | printf("Tokens:\n"); |
408 | | showtree(ps->p_grammar, ps->p_tree); |
409 | | printf("\n"); |
410 | | } |
411 | | printf("Listing:\n"); |
412 | | PyNode_ListTree(ps->p_tree); |
413 | | printf("\n"); |
414 | | } |
415 | | |
416 | | #endif /* Py_DEBUG */ |
417 | | |
418 | | /* |
419 | | |
420 | | Description |
421 | | ----------- |
422 | | |
423 | | The parser's interface is different than usual: the function addtoken() |
424 | | must be called for each token in the input. This makes it possible to |
425 | | turn it into an incremental parsing system later. The parsing system |
426 | | constructs a parse tree as it goes. |
427 | | |
428 | | A parsing rule is represented as a Deterministic Finite-state Automaton |
429 | | (DFA). A node in a DFA represents a state of the parser; an arc represents |
430 | | a transition. Transitions are either labeled with terminal symbols or |
431 | | with non-terminals. When the parser decides to follow an arc labeled |
432 | | with a non-terminal, it is invoked recursively with the DFA representing |
433 | | the parsing rule for that as its initial state; when that DFA accepts, |
434 | | the parser that invoked it continues. The parse tree constructed by the |
435 | | recursively called parser is inserted as a child in the current parse tree. |
436 | | |
437 | | The DFA's can be constructed automatically from a more conventional |
438 | | language description. An extended LL(1) grammar (ELL(1)) is suitable. |
439 | | Certain restrictions make the parser's life easier: rules that can produce |
440 | | the empty string should be outlawed (there are other ways to put loops |
441 | | or optional parts in the language). To avoid the need to construct |
442 | | FIRST sets, we can require that all but the last alternative of a rule |
443 | | (really: arc going out of a DFA's state) must begin with a terminal |
444 | | symbol. |
445 | | |
446 | | As an example, consider this grammar: |
447 | | |
448 | | expr: term (OP term)* |
449 | | term: CONSTANT | '(' expr ')' |
450 | | |
451 | | The DFA corresponding to the rule for expr is: |
452 | | |
453 | | ------->.---term-->.-------> |
454 | | ^ | |
455 | | | | |
456 | | \----OP----/ |
457 | | |
458 | | The parse tree generated for the input a+b is: |
459 | | |
460 | | (expr: (term: (NAME: a)), (OP: +), (term: (NAME: b))) |
461 | | |
462 | | */ |