/src/cpython/Parser/pegen_errors.c
Line | Count | Source |
1 | | #include <Python.h> |
2 | | #include <errcode.h> |
3 | | |
4 | | #include "pycore_pyerrors.h" // _PyErr_ProgramDecodedTextObject() |
5 | | #include "pycore_runtime.h" // _Py_ID() |
6 | | #include "lexer/state.h" |
7 | | #include "lexer/lexer.h" |
8 | | #include "pegen.h" |
9 | | |
10 | | // TOKENIZER ERRORS |
11 | | |
12 | | void |
13 | | _PyPegen_raise_tokenizer_init_error(PyObject *filename) |
14 | 2.63k | { |
15 | 2.63k | if (!(PyErr_ExceptionMatches(PyExc_LookupError) |
16 | 2.47k | || PyErr_ExceptionMatches(PyExc_SyntaxError) |
17 | 1.87k | || PyErr_ExceptionMatches(PyExc_ValueError) |
18 | 60 | || PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))) { |
19 | 60 | return; |
20 | 60 | } |
21 | 2.57k | PyObject *errstr = NULL; |
22 | 2.57k | PyObject *tuple = NULL; |
23 | 2.57k | PyObject *type; |
24 | 2.57k | PyObject *value; |
25 | 2.57k | PyObject *tback; |
26 | 2.57k | PyErr_Fetch(&type, &value, &tback); |
27 | 2.57k | if (PyErr_GivenExceptionMatches(value, PyExc_SyntaxError)) { |
28 | 608 | if (PyObject_SetAttr(value, &_Py_ID(filename), filename)) { |
29 | 0 | goto error; |
30 | 0 | } |
31 | 608 | PyErr_Restore(type, value, tback); |
32 | 608 | return; |
33 | 608 | } |
34 | 1.96k | errstr = PyObject_Str(value); |
35 | 1.96k | if (!errstr) { |
36 | 0 | goto error; |
37 | 0 | } |
38 | | |
39 | 1.96k | PyObject *tmp = Py_BuildValue("(OiiO)", filename, 0, -1, Py_None); |
40 | 1.96k | if (!tmp) { |
41 | 0 | goto error; |
42 | 0 | } |
43 | | |
44 | 1.96k | tuple = PyTuple_Pack(2, errstr, tmp); |
45 | 1.96k | Py_DECREF(tmp); |
46 | 1.96k | if (!value) { |
47 | 0 | goto error; |
48 | 0 | } |
49 | 1.96k | PyErr_SetObject(PyExc_SyntaxError, tuple); |
50 | | |
51 | 1.96k | error: |
52 | 1.96k | Py_XDECREF(type); |
53 | 1.96k | Py_XDECREF(value); |
54 | 1.96k | Py_XDECREF(tback); |
55 | 1.96k | Py_XDECREF(errstr); |
56 | 1.96k | Py_XDECREF(tuple); |
57 | 1.96k | } |
58 | | |
59 | | static inline void |
60 | 1.80k | raise_unclosed_parentheses_error(Parser *p) { |
61 | 1.80k | int error_lineno = p->tok->parenlinenostack[p->tok->level-1]; |
62 | 1.80k | int error_col = p->tok->parencolstack[p->tok->level-1]; |
63 | 1.80k | RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError, |
64 | 1.80k | error_lineno, error_col, error_lineno, -1, |
65 | 1.80k | "'%c' was never closed", |
66 | 1.80k | p->tok->parenstack[p->tok->level-1]); |
67 | 1.80k | } |
68 | | |
69 | | int |
70 | | _Pypegen_tokenizer_error(Parser *p) |
71 | 3.78k | { |
72 | 3.78k | if (PyErr_Occurred()) { |
73 | 1.96k | return -1; |
74 | 1.96k | } |
75 | | |
76 | 1.81k | const char *msg = NULL; |
77 | 1.81k | PyObject* errtype = PyExc_SyntaxError; |
78 | 1.81k | Py_ssize_t col_offset = -1; |
79 | 1.81k | p->error_indicator = 1; |
80 | 1.81k | switch (p->tok->done) { |
81 | 0 | case E_TOKEN: |
82 | 0 | msg = "invalid token"; |
83 | 0 | break; |
84 | 1.76k | case E_EOF: |
85 | 1.76k | if (p->tok->level) { |
86 | 1.72k | raise_unclosed_parentheses_error(p); |
87 | 1.72k | } else { |
88 | 43 | RAISE_SYNTAX_ERROR("unexpected EOF while parsing"); |
89 | 43 | } |
90 | 1.76k | return -1; |
91 | 3 | case E_DEDENT: |
92 | 3 | RAISE_INDENTATION_ERROR("unindent does not match any outer indentation level"); |
93 | 3 | return -1; |
94 | 0 | case E_INTR: |
95 | 0 | if (!PyErr_Occurred()) { |
96 | 0 | PyErr_SetNone(PyExc_KeyboardInterrupt); |
97 | 0 | } |
98 | 0 | return -1; |
99 | 0 | case E_NOMEM: |
100 | 0 | PyErr_NoMemory(); |
101 | 0 | return -1; |
102 | 2 | case E_TABSPACE: |
103 | 2 | errtype = PyExc_TabError; |
104 | 2 | msg = "inconsistent use of tabs and spaces in indentation"; |
105 | 2 | break; |
106 | 0 | case E_TOODEEP: |
107 | 0 | errtype = PyExc_IndentationError; |
108 | 0 | msg = "too many levels of indentation"; |
109 | 0 | break; |
110 | 43 | case E_LINECONT: { |
111 | 43 | col_offset = p->tok->cur - p->tok->buf - 1; |
112 | 43 | msg = "unexpected character after line continuation character"; |
113 | 43 | break; |
114 | 0 | } |
115 | 0 | case E_COLUMNOVERFLOW: |
116 | 0 | PyErr_SetString(PyExc_OverflowError, |
117 | 0 | "Parser column offset overflow - source line is too big"); |
118 | 0 | return -1; |
119 | 0 | default: |
120 | 0 | msg = "unknown parsing error"; |
121 | 1.81k | } |
122 | | |
123 | 45 | RAISE_ERROR_KNOWN_LOCATION(p, errtype, p->tok->lineno, |
124 | 45 | col_offset >= 0 ? col_offset : 0, |
125 | 45 | p->tok->lineno, -1, msg); |
126 | 45 | return -1; |
127 | 1.81k | } |
128 | | |
129 | | int |
130 | | _Pypegen_raise_decode_error(Parser *p) |
131 | 112 | { |
132 | 112 | assert(PyErr_Occurred()); |
133 | 112 | const char *errtype = NULL; |
134 | 112 | if (PyErr_ExceptionMatches(PyExc_UnicodeError)) { |
135 | 104 | errtype = "unicode error"; |
136 | 104 | } |
137 | 8 | else if (PyErr_ExceptionMatches(PyExc_ValueError)) { |
138 | 4 | errtype = "value error"; |
139 | 4 | } |
140 | 112 | if (errtype) { |
141 | 108 | PyObject *type; |
142 | 108 | PyObject *value; |
143 | 108 | PyObject *tback; |
144 | 108 | PyObject *errstr; |
145 | 108 | PyErr_Fetch(&type, &value, &tback); |
146 | 108 | errstr = PyObject_Str(value); |
147 | 108 | if (errstr) { |
148 | 108 | RAISE_SYNTAX_ERROR("(%s) %U", errtype, errstr); |
149 | 108 | Py_DECREF(errstr); |
150 | 108 | } |
151 | 0 | else { |
152 | 0 | PyErr_Clear(); |
153 | 0 | RAISE_SYNTAX_ERROR("(%s) unknown error", errtype); |
154 | 0 | } |
155 | 108 | Py_XDECREF(type); |
156 | 108 | Py_XDECREF(value); |
157 | 108 | Py_XDECREF(tback); |
158 | 108 | } |
159 | | |
160 | 112 | return -1; |
161 | 112 | } |
162 | | |
/* Re-tokenize the remaining input looking for tokenizer errors (e.g. an
 * unclosed bracket) that should take priority over an already-raised
 * generic syntax error.  Returns -1 if an error ends up raised by the
 * scan, 0 otherwise; the previously-set exception may be restored.
 */
static int
_PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) {
    // Tokenize the whole input to see if there are any tokenization
    // errors such as mismatching parentheses. These will get priority
    // over generic syntax errors only if the line number of the error is
    // before the one that we had for the generic error.

    // We don't want to tokenize to the end for interactive input
    if (p->tok->prompt != NULL) {
        return 0;
    }

    // Stash the currently-set exception so the scan below starts clean.
    PyObject *type, *value, *traceback;
    PyErr_Fetch(&type, &value, &traceback);

    // Line of the error we already know about; an unclosed-bracket error
    // only wins if its opening bracket is on an earlier line.
    Token *current_token = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1];
    Py_ssize_t current_err_line = current_token->lineno;

    int ret = 0;
    struct token new_token;
    _PyToken_Init(&new_token);

    // ERRORTOKEN and ENDMARKER fall out of the switch and then out of the
    // loop via the trailing break; every other token keeps scanning.
    for (;;) {
        switch (_PyTokenizer_Get(p->tok, &new_token)) {
            case ERRORTOKEN:
                if (PyErr_Occurred()) {
                    ret = -1;
                    goto exit;
                }
                if (p->tok->level != 0) {
                    int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
                    if (current_err_line > error_lineno) {
                        raise_unclosed_parentheses_error(p);
                        ret = -1;
                        goto exit;
                    }
                }
                break;
            case ENDMARKER:
                break;
            default:
                continue;
        }
        break;
    }


exit:
    _PyToken_Free(&new_token);
    // If we're in an f-string, we want the syntax error in the expression part
    // to propagate, so that tokenizer errors (like expecting '}') that happen afterwards
    // do not swallow it.
    if (PyErr_Occurred() && p->tok->tok_mode_stack_index <= 0) {
        // The scan raised a better error: keep it, drop the stashed one.
        Py_XDECREF(value);
        Py_XDECREF(type);
        Py_XDECREF(traceback);
    } else {
        // No new error (or inside an f-string): restore the original.
        PyErr_Restore(type, value, traceback);
    }
    return ret;
}
224 | | |
225 | | // PARSER ERRORS |
226 | | |
/* Raise `errtype` with a printf-style message, locating the error at the
 * most relevant token: p->known_err_token if set, otherwise the token at
 * p->mark (when use_mark is non-zero) or the last token read (p->fill - 1).
 * Token column offsets are converted to 1-based columns before delegating
 * to _PyPegen_raise_error_known_location().  Always returns NULL so it can
 * be used in `return`-position by the RAISE_* macros.
 */
void *
_PyPegen_raise_error(Parser *p, PyObject *errtype, int use_mark, const char *errmsg, ...)
{
    // Bail out if we already have an error set.
    if (p->error_indicator && PyErr_Occurred()) {
        return NULL;
    }
    // No tokens read yet: report at the very start of the input.
    if (p->fill == 0) {
        va_list va;
        va_start(va, errmsg);
        _PyPegen_raise_error_known_location(p, errtype, 0, 0, 0, -1, errmsg, va);
        va_end(va);
        return NULL;
    }
    // p->mark may point one past the tokens read so far; make sure that
    // token exists before indexing it below.
    if (use_mark && p->mark == p->fill && _PyPegen_fill_token(p) < 0) {
        p->error_indicator = 1;
        return NULL;
    }
    Token *t = p->known_err_token != NULL
        ? p->known_err_token
        : p->tokens[use_mark ? p->mark : p->fill - 1];
    Py_ssize_t col_offset;
    Py_ssize_t end_col_offset = -1;
    if (t->col_offset == -1) {
        // The token carries no column: derive one from the tokenizer cursor.
        if (p->tok->cur == p->tok->buf) {
            col_offset = 0;
        } else {
            // NOTE(review): the condition tests `buf` but selects
            // `line_start`; the else arm yields a NULL start and looks
            // unreachable after the cur == buf check above — confirm intent.
            const char* start = p->tok->buf ? p->tok->line_start : p->tok->buf;
            col_offset = Py_SAFE_DOWNCAST(p->tok->cur - start, intptr_t, int);
        }
    } else {
        // Convert the token's 0-based byte offset to a 1-based column.
        col_offset = t->col_offset + 1;
    }

    if (t->end_col_offset != -1) {
        end_col_offset = t->end_col_offset + 1;
    }

    va_list va;
    va_start(va, errmsg);
    _PyPegen_raise_error_known_location(p, errtype, t->lineno, col_offset, t->end_lineno, end_col_offset, errmsg, va);
    va_end(va);

    return NULL;
}
272 | | |
273 | | static PyObject * |
274 | | get_error_line_from_tokenizer_buffers(Parser *p, Py_ssize_t lineno) |
275 | 172 | { |
276 | | /* If the file descriptor is interactive, the source lines of the current |
277 | | * (multi-line) statement are stored in p->tok->interactive_src_start. |
278 | | * If not, we're parsing from a string, which means that the whole source |
279 | | * is stored in p->tok->str. */ |
280 | 172 | assert((p->tok->fp == NULL && p->tok->str != NULL) || p->tok->fp != NULL); |
281 | | |
282 | 172 | char *cur_line = p->tok->fp_interactive ? p->tok->interactive_src_start : p->tok->str; |
283 | 172 | if (cur_line == NULL) { |
284 | 0 | assert(p->tok->fp_interactive); |
285 | | // We can reach this point if the tokenizer buffers for interactive source have not been |
286 | | // initialized because we failed to decode the original source with the given locale. |
287 | 0 | return Py_GetConstant(Py_CONSTANT_EMPTY_STR); |
288 | 0 | } |
289 | | |
290 | 172 | Py_ssize_t relative_lineno = p->starting_lineno ? lineno - p->starting_lineno + 1 : lineno; |
291 | 172 | const char* buf_end = p->tok->fp_interactive ? p->tok->interactive_src_end : p->tok->inp; |
292 | | |
293 | 172 | if (buf_end < cur_line) { |
294 | 12 | buf_end = cur_line + strlen(cur_line); |
295 | 12 | } |
296 | | |
297 | 2.03k | for (int i = 0; i < relative_lineno - 1; i++) { |
298 | 1.86k | char *new_line = strchr(cur_line, '\n'); |
299 | | // The assert is here for debug builds but the conditional that |
300 | | // follows is there so in release builds we do not crash at the cost |
301 | | // to report a potentially wrong line. |
302 | 1.86k | assert(new_line != NULL && new_line + 1 < buf_end); |
303 | 1.86k | if (new_line == NULL || new_line + 1 > buf_end) { |
304 | 0 | break; |
305 | 0 | } |
306 | 1.86k | cur_line = new_line + 1; |
307 | 1.86k | } |
308 | | |
309 | 172 | char *next_newline; |
310 | 172 | if ((next_newline = strchr(cur_line, '\n')) == NULL) { // This is the last line |
311 | 0 | next_newline = cur_line + strlen(cur_line); |
312 | 0 | } |
313 | 172 | return PyUnicode_DecodeUTF8(cur_line, next_newline - cur_line, "replace"); |
314 | 172 | } |
315 | | |
/* Build and set an exception of type `errtype` (a SyntaxError or subclass)
 * at an explicit (lineno, col_offset)-(end_lineno, end_col_offset) range.
 *
 * The exception value is the 2-tuple SyntaxError expects:
 *   (message, (filename, lineno, col, text, end_lineno, end_col)).
 * Byte offsets are translated to character offsets against the recovered
 * error line.  Always returns NULL for `return`-position use by macros.
 */
void *
_PyPegen_raise_error_known_location(Parser *p, PyObject *errtype,
                                    Py_ssize_t lineno, Py_ssize_t col_offset,
                                    Py_ssize_t end_lineno, Py_ssize_t end_col_offset,
                                    const char *errmsg, va_list va)
{
    // Bail out if we already have an error set.
    if (p->error_indicator && PyErr_Occurred()) {
        return NULL;
    }
    PyObject *value = NULL;
    PyObject *errstr = NULL;
    PyObject *error_line = NULL;
    PyObject *tmp = NULL;
    p->error_indicator = 1;

    // CURRENT_POS sentinels mean "wherever the tokenizer is right now".
    if (end_lineno == CURRENT_POS) {
        end_lineno = p->tok->lineno;
    }
    if (end_col_offset == CURRENT_POS) {
        end_col_offset = p->tok->cur - p->tok->line_start;
    }

    errstr = PyUnicode_FromFormatV(errmsg, va);
    if (!errstr) {
        goto error;
    }

    // Recover the source line: prefer the interactive buffers, then the
    // decoded file contents for whole-file parses.
    if (p->tok->fp_interactive && p->tok->interactive_src_start != NULL) {
        error_line = get_error_line_from_tokenizer_buffers(p, lineno);
    }
    else if (p->start_rule == Py_file_input) {
        error_line = _PyErr_ProgramDecodedTextObject(p->tok->filename,
                                                     (int) lineno, p->tok->encoding);
    }

    if (!error_line) {
        /* PyErr_ProgramTextObject was not called or returned NULL. If it was not called,
           then we need to find the error line from some other source, because
           p->start_rule != Py_file_input. If it returned NULL, then it either unexpectedly
           failed or we're parsing from a string or the REPL. There's a third edge case where
           we're actually parsing from a file, which has an E_EOF SyntaxError and in that case
           `PyErr_ProgramTextObject` fails because lineno points to last_file_line + 1, which
           does not physically exist */
        assert(p->tok->fp == NULL || p->tok->fp == stdin || p->tok->done == E_EOF);

        if (p->tok->lineno <= lineno && p->tok->inp > p->tok->buf) {
            // The tokenizer's line buffer still holds the offending line.
            Py_ssize_t size = p->tok->inp - p->tok->line_start;
            error_line = PyUnicode_DecodeUTF8(p->tok->line_start, size, "replace");
        }
        else if (p->tok->fp == NULL || p->tok->fp == stdin) {
            error_line = get_error_line_from_tokenizer_buffers(p, lineno);
        }
        else {
            error_line = Py_GetConstant(Py_CONSTANT_EMPTY_STR);
        }
        if (!error_line) {
            goto error;
        }
    }

    Py_ssize_t col_number = col_offset;
    Py_ssize_t end_col_number = end_col_offset;

    // SyntaxError reports character columns, but we track byte offsets.
    col_number = _PyPegen_byte_offset_to_character_offset(error_line, col_offset);
    if (col_number < 0) {
        goto error;
    }

    if (end_col_offset > 0) {
        end_col_number = _PyPegen_byte_offset_to_character_offset(error_line, end_col_offset);
        if (end_col_number < 0) {
            goto error;
        }
    }

    // The "N" format unit steals the reference to error_line, so on success
    // it needs no separate DECREF here.
    tmp = Py_BuildValue("(OnnNnn)", p->tok->filename, lineno, col_number, error_line, end_lineno, end_col_number);
    if (!tmp) {
        goto error;
    }
    value = PyTuple_Pack(2, errstr, tmp);
    Py_DECREF(tmp);
    if (!value) {
        goto error;
    }
    PyErr_SetObject(errtype, value);

    Py_DECREF(errstr);
    Py_DECREF(value);
    return NULL;

error:
    Py_XDECREF(errstr);
    // NOTE(review): if PyTuple_Pack fails after `tmp` consumed error_line,
    // this XDECREF looks like a second decref of error_line — confirm.
    Py_XDECREF(error_line);
    return NULL;
}
412 | | |
413 | | void |
414 | 12.0k | _Pypegen_set_syntax_error(Parser* p, Token* last_token) { |
415 | | // Existing syntax error |
416 | 12.0k | if (PyErr_Occurred()) { |
417 | | // Prioritize tokenizer errors to custom syntax errors raised |
418 | | // on the second phase only if the errors come from the parser. |
419 | 5.53k | int is_tok_ok = (p->tok->done == E_DONE || p->tok->done == E_OK); |
420 | 5.53k | if (is_tok_ok && PyErr_ExceptionMatches(PyExc_SyntaxError)) { |
421 | 1.58k | _PyPegen_tokenize_full_source_to_check_for_errors(p); |
422 | 1.58k | } |
423 | | // Propagate the existing syntax error. |
424 | 5.53k | return; |
425 | 5.53k | } |
426 | | // Initialization error |
427 | 6.48k | if (p->fill == 0) { |
428 | 0 | RAISE_SYNTAX_ERROR("error at start before reading any input"); |
429 | 0 | } |
430 | | // Parser encountered EOF (End of File) unexpectedtly |
431 | 6.48k | if (last_token->type == ERRORTOKEN && p->tok->done == E_EOF) { |
432 | 0 | if (p->tok->level) { |
433 | 0 | raise_unclosed_parentheses_error(p); |
434 | 0 | } else { |
435 | 0 | RAISE_SYNTAX_ERROR("unexpected EOF while parsing"); |
436 | 0 | } |
437 | 0 | return; |
438 | 0 | } |
439 | | // Indentation error in the tokenizer |
440 | 6.48k | if (last_token->type == INDENT || last_token->type == DEDENT) { |
441 | 127 | RAISE_INDENTATION_ERROR(last_token->type == INDENT ? "unexpected indent" : "unexpected unindent"); |
442 | 127 | return; |
443 | 127 | } |
444 | | // Unknown error (generic case) |
445 | | |
446 | | // Use the last token we found on the first pass to avoid reporting |
447 | | // incorrect locations for generic syntax errors just because we reached |
448 | | // further away when trying to find specific syntax errors in the second |
449 | | // pass. |
450 | 6.35k | RAISE_SYNTAX_ERROR_KNOWN_LOCATION(last_token, "invalid syntax"); |
451 | | // _PyPegen_tokenize_full_source_to_check_for_errors will override the existing |
452 | | // generic SyntaxError we just raised if errors are found. |
453 | 6.35k | _PyPegen_tokenize_full_source_to_check_for_errors(p); |
454 | 6.35k | } |
455 | | |
456 | | void |
457 | | _Pypegen_stack_overflow(Parser *p) |
458 | 67 | { |
459 | 67 | p->error_indicator = 1; |
460 | 67 | PyErr_SetString(PyExc_MemoryError, |
461 | 67 | "Parser stack overflowed - Python source too complex to parse"); |
462 | 67 | } |