/src/cpython/Parser/pegen_errors.c
Line | Count | Source |
1 | | #include <Python.h> |
2 | | #include <errcode.h> |
3 | | |
4 | | #include "pycore_pyerrors.h" // _PyErr_ProgramDecodedTextObject() |
5 | | #include "pycore_runtime.h" // _Py_ID() |
6 | | #include "pycore_tuple.h" // _PyTuple_FromPair |
7 | | #include "lexer/state.h" |
8 | | #include "lexer/lexer.h" |
9 | | #include "pegen.h" |
10 | | |
11 | | // TOKENIZER ERRORS |
12 | | |
void
_PyPegen_raise_tokenizer_init_error(PyObject *filename)
{
    // Convert an exception raised while initializing the tokenizer into a
    // SyntaxError that carries `filename`.  Only LookupError, SyntaxError,
    // ValueError and UnicodeDecodeError are converted; any other pending
    // exception is left untouched and propagates unchanged.
    if (!(PyErr_ExceptionMatches(PyExc_LookupError)
          || PyErr_ExceptionMatches(PyExc_SyntaxError)
          || PyErr_ExceptionMatches(PyExc_ValueError)
          || PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))) {
        return;
    }
    PyObject *errstr = NULL;
    PyObject *tuple = NULL;
    PyObject *type;
    PyObject *value;
    PyObject *tback;
    PyErr_Fetch(&type, &value, &tback);
    if (PyErr_GivenExceptionMatches(value, PyExc_SyntaxError)) {
        // Already a SyntaxError: just attach the filename and re-raise it.
        if (PyObject_SetAttr(value, &_Py_ID(filename), filename)) {
            goto error;
        }
        PyErr_Restore(type, value, tback);
        return;
    }
    errstr = PyObject_Str(value);
    if (!errstr) {
        goto error;
    }

    // SyntaxError's args are (msg, (filename, lineno, offset, text)); there
    // is no useful source position here, so report line 0, offset -1, and no
    // source text.
    PyObject *tmp = Py_BuildValue("(OiiO)", filename, 0, -1, Py_None);
    if (!tmp) {
        goto error;
    }

    tuple = _PyTuple_FromPair(errstr, tmp);
    Py_DECREF(tmp);
    if (!tuple) {
        goto error;
    }
    PyErr_SetObject(PyExc_SyntaxError, tuple);

error:
    // Intentional fall-through from the success path as well: the originally
    // fetched exception (type/value/tback) is discarded in both cases, since
    // PyErr_SetObject above installed the replacement SyntaxError.
    Py_XDECREF(type);
    Py_XDECREF(value);
    Py_XDECREF(tback);
    Py_XDECREF(errstr);
    Py_XDECREF(tuple);
}
59 | | |
60 | | static inline void |
61 | 1.68k | raise_unclosed_parentheses_error(Parser *p) { |
62 | 1.68k | int error_lineno = p->tok->parenlinenostack[p->tok->level-1]; |
63 | 1.68k | int error_col = p->tok->parencolstack[p->tok->level-1]; |
64 | 1.68k | RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError, |
65 | 1.68k | error_lineno, error_col, error_lineno, -1, |
66 | 1.68k | "'%c' was never closed", |
67 | 1.68k | p->tok->parenstack[p->tok->level-1]); |
68 | 1.68k | } |
69 | | |
int
_Pypegen_tokenizer_error(Parser *p)
{
    // Translate the tokenizer's `done` status code into a Python exception.
    // Always returns -1 so callers can `return _Pypegen_tokenizer_error(p);`.
    // If an exception is already set, it is simply propagated.
    if (PyErr_Occurred()) {
        return -1;
    }

    const char *msg = NULL;
    PyObject* errtype = PyExc_SyntaxError;
    Py_ssize_t col_offset = -1;
    p->error_indicator = 1;
    switch (p->tok->done) {
        case E_TOKEN:
            msg = "invalid token";
            break;
        case E_EOF:
            // EOF with brackets still open gets the dedicated
            // "'%c' was never closed" error instead of a generic one.
            if (p->tok->level) {
                raise_unclosed_parentheses_error(p);
            } else {
                RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
            }
            return -1;
        case E_DEDENT:
            RAISE_INDENTATION_ERROR("unindent does not match any outer indentation level");
            return -1;
        case E_INTR:
            if (!PyErr_Occurred()) {
                PyErr_SetNone(PyExc_KeyboardInterrupt);
            }
            return -1;
        case E_NOMEM:
            PyErr_NoMemory();
            return -1;
        case E_TABSPACE:
            errtype = PyExc_TabError;
            msg = "inconsistent use of tabs and spaces in indentation";
            break;
        case E_TOODEEP:
            errtype = PyExc_IndentationError;
            msg = "too many levels of indentation";
            break;
        case E_LINECONT: {
            // Point at the offending character, one before the cursor.
            col_offset = p->tok->cur - p->tok->buf - 1;
            msg = "unexpected character after line continuation character";
            break;
        }
        case E_COLUMNOVERFLOW:
            PyErr_SetString(PyExc_OverflowError,
                            "Parser column offset overflow - source line is too big");
            return -1;
        default:
            msg = "unknown parsing error";
    }

    // Cases that `break` out of the switch share this generic raise path;
    // col_offset of -1 means "unknown" and is reported as column 0.
    RAISE_ERROR_KNOWN_LOCATION(p, errtype, p->tok->lineno,
                               col_offset >= 0 ? col_offset : 0,
                               p->tok->lineno, -1, msg);
    return -1;
}
129 | | |
int
_Pypegen_raise_decode_error(Parser *p)
{
    // Re-raise a pending UnicodeError or ValueError as a SyntaxError of the
    // form "(unicode error) <original message>".  Any other pending exception
    // type is left as-is.  Always returns -1.
    assert(PyErr_Occurred());
    const char *errtype = NULL;
    if (PyErr_ExceptionMatches(PyExc_UnicodeError)) {
        errtype = "unicode error";
    }
    else if (PyErr_ExceptionMatches(PyExc_ValueError)) {
        errtype = "value error";
    }
    if (errtype) {
        PyObject *type;
        PyObject *value;
        PyObject *tback;
        PyObject *errstr;
        PyErr_Fetch(&type, &value, &tback);
        errstr = PyObject_Str(value);
        if (errstr) {
            RAISE_SYNTAX_ERROR("(%s) %U", errtype, errstr);
            Py_DECREF(errstr);
        }
        else {
            // str() of the original exception failed: clear that failure and
            // fall back to a generic message.
            PyErr_Clear();
            RAISE_SYNTAX_ERROR("(%s) unknown error", errtype);
        }
        // The fetched original exception is intentionally dropped; the
        // SyntaxError raised above replaces it.
        Py_XDECREF(type);
        Py_XDECREF(value);
        Py_XDECREF(tback);
    }

    return -1;
}
163 | | |
static int
_PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) {
    // Tokenize the whole input to see if there are any tokenization
    // errors such as mismatching parentheses. These will get priority
    // over generic syntax errors only if the line number of the error is
    // before the one that we had for the generic error.
    //
    // Returns -1 if a (higher-priority) tokenizer error was raised, 0
    // otherwise; in the 0 case any previously pending exception is restored.

    // We don't want to tokenize to the end for interactive input
    if (p->tok->prompt != NULL) {
        return 0;
    }

    // Stash the currently pending exception (if any) so the tokenizer can
    // run; it is restored below unless a better error is found.
    PyObject *type, *value, *traceback;
    PyErr_Fetch(&type, &value, &traceback);

    // Line of the error we already know about; a tokenizer error only wins
    // if it occurs on an earlier line than this.
    Token *current_token = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1];
    Py_ssize_t current_err_line = current_token->lineno;

    int ret = 0;
    struct token new_token;
    _PyToken_Init(&new_token);

    for (;;) {
        switch (_PyTokenizer_Get(p->tok, &new_token)) {
            case ERRORTOKEN:
                if (PyErr_Occurred()) {
                    ret = -1;
                    goto exit;
                }
                // No exception set: check for an unclosed bracket that
                // precedes the known error location.
                if (p->tok->level != 0) {
                    int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
                    if (current_err_line > error_lineno) {
                        raise_unclosed_parentheses_error(p);
                        ret = -1;
                        goto exit;
                    }
                }
                break;
            case ENDMARKER:
                break;
            default:
                continue;
        }
        // Reached only via `break` above (ERRORTOKEN without a winning
        // error, or ENDMARKER): stop tokenizing.
        break;
    }


exit:
    _PyToken_Free(&new_token);
    // If we're in an f-string, we want the syntax error in the expression part
    // to propagate, so that tokenizer errors (like expecting '}') that happen afterwards
    // do not swallow it.
    if (PyErr_Occurred() && p->tok->tok_mode_stack_index <= 0) {
        // The new tokenizer error wins: drop the stashed exception.
        Py_XDECREF(value);
        Py_XDECREF(type);
        Py_XDECREF(traceback);
    } else {
        // Otherwise put the original exception (possibly none) back.
        PyErr_Restore(type, value, traceback);
    }
    return ret;
}
225 | | |
226 | | // PARSER ERRORS |
227 | | |
void *
_PyPegen_raise_error(Parser *p, PyObject *errtype, int use_mark, const char *errmsg, ...)
{
    // Raise `errtype` with a printf-style message, locating the error at the
    // known-error token (if set), else at the mark/last-filled token.
    // Always returns NULL so callers can `return _PyPegen_raise_error(...)`.
    //
    // Bail out if we already have an error set.
    if (p->error_indicator && PyErr_Occurred()) {
        return NULL;
    }
    if (p->fill == 0) {
        // No tokens were ever read: report position (0, 0).
        va_list va;
        va_start(va, errmsg);
        _PyPegen_raise_error_known_location(p, errtype, 0, 0, 0, -1, errmsg, va);
        va_end(va);
        return NULL;
    }
    // With use_mark the error is at the mark position, which may require
    // pulling one more token from the tokenizer.
    if (use_mark && p->mark == p->fill && _PyPegen_fill_token(p) < 0) {
        p->error_indicator = 1;
        return NULL;
    }
    Token *t = p->known_err_token != NULL
                   ? p->known_err_token
                   : p->tokens[use_mark ? p->mark : p->fill - 1];
    Py_ssize_t col_offset;
    Py_ssize_t end_col_offset = -1;
    if (t->col_offset == -1) {
        // Token has no recorded column: derive one from the tokenizer cursor.
        if (p->tok->cur == p->tok->buf) {
            col_offset = 0;
        } else {
            // NOTE(review): the condition tests `buf` but both arms would be
            // NULL when buf is NULL; it looks like `line_start` was the
            // intended test — confirm against upstream before changing.
            const char* start = p->tok->buf ? p->tok->line_start : p->tok->buf;
            col_offset = Py_SAFE_DOWNCAST(p->tok->cur - start, intptr_t, int);
        }
    } else {
        // Error offsets are 1-based; token offsets are 0-based.
        col_offset = t->col_offset + 1;
    }

    if (t->end_col_offset != -1) {
        end_col_offset = t->end_col_offset + 1;
    }

    va_list va;
    va_start(va, errmsg);
    _PyPegen_raise_error_known_location(p, errtype, t->lineno, col_offset, t->end_lineno, end_col_offset, errmsg, va);
    va_end(va);

    return NULL;
}
273 | | |
/* Return the text of source line `lineno` (1-based) as a new unicode object,
 * decoded as UTF-8 with "replace" error handling, by scanning the tokenizer's
 * in-memory buffers. */
static PyObject *
get_error_line_from_tokenizer_buffers(Parser *p, Py_ssize_t lineno)
{
    /* If the file descriptor is interactive, the source lines of the current
     * (multi-line) statement are stored in p->tok->interactive_src_start.
     * If not, we're parsing from a string, which means that the whole source
     * is stored in p->tok->str. */
    assert((p->tok->fp == NULL && p->tok->str != NULL) || p->tok->fp != NULL);

    char *cur_line = p->tok->fp_interactive ? p->tok->interactive_src_start : p->tok->str;
    if (cur_line == NULL) {
        assert(p->tok->fp_interactive);
        // We can reach this point if the tokenizer buffers for interactive source have not been
        // initialized because we failed to decode the original source with the given locale.
        return Py_GetConstant(Py_CONSTANT_EMPTY_STR);
    }

    // When parsing started at a non-default line (starting_lineno), translate
    // the absolute lineno into an offset within the buffer.
    Py_ssize_t relative_lineno = p->starting_lineno ? lineno - p->starting_lineno + 1 : lineno;
    const char* buf_end = p->tok->fp_interactive ? p->tok->interactive_src_end : p->tok->inp;

    if (buf_end < cur_line) {
        buf_end = cur_line + strlen(cur_line);
    }

    // Advance cur_line to the start of the requested line, one '\n' at a time.
    for (int i = 0; i < relative_lineno - 1; i++) {
        char *new_line = strchr(cur_line, '\n');
        // The assert is here for debug builds but the conditional that
        // follows is there so in release builds we do not crash at the cost
        // to report a potentially wrong line.
        // NOTE(review): the assert uses `< buf_end` while the release check
        // uses `> buf_end` (allowing equality) — confirm the asymmetry is
        // intentional before tightening either.
        assert(new_line != NULL && new_line + 1 < buf_end);
        if (new_line == NULL || new_line + 1 > buf_end) {
            break;
        }
        cur_line = new_line + 1;
    }

    char *next_newline;
    if ((next_newline = strchr(cur_line, '\n')) == NULL) { // This is the last line
        next_newline = cur_line + strlen(cur_line);
    }
    return PyUnicode_DecodeUTF8(cur_line, next_newline - cur_line, "replace");
}
316 | | |
/* Raise `errtype` at an explicit (lineno, col_offset)-(end_lineno,
 * end_col_offset) range, attaching the relevant source line text.  Byte
 * offsets are converted to character offsets for the exception.  Always
 * returns NULL so callers can `return _PyPegen_raise_error_known_location(...)`. */
void *
_PyPegen_raise_error_known_location(Parser *p, PyObject *errtype,
                                    Py_ssize_t lineno, Py_ssize_t col_offset,
                                    Py_ssize_t end_lineno, Py_ssize_t end_col_offset,
                                    const char *errmsg, va_list va)
{
    // Bail out if we already have an error set.
    if (p->error_indicator && PyErr_Occurred()) {
        return NULL;
    }
    PyObject *value = NULL;
    PyObject *errstr = NULL;
    PyObject *error_line = NULL;
    PyObject *tmp = NULL;
    p->error_indicator = 1;

    // CURRENT_POS end markers mean "wherever the tokenizer is right now".
    if (end_lineno == CURRENT_POS) {
        end_lineno = p->tok->lineno;
    }
    if (end_col_offset == CURRENT_POS) {
        end_col_offset = p->tok->cur - p->tok->line_start;
    }

    errstr = PyUnicode_FromFormatV(errmsg, va);
    if (!errstr) {
        goto error;
    }

    // Try the cheapest sources of the offending line first: the interactive
    // buffer, then the on-disk file for whole-file parses.
    if (p->tok->fp_interactive && p->tok->interactive_src_start != NULL) {
        error_line = get_error_line_from_tokenizer_buffers(p, lineno);
    }
    else if (p->start_rule == Py_file_input) {
        error_line = _PyErr_ProgramDecodedTextObject(p->tok->filename,
                                                     (int) lineno, p->tok->encoding);
    }

    if (!error_line) {
        /* PyErr_ProgramTextObject was not called or returned NULL. If it was not called,
           then we need to find the error line from some other source, because
           p->start_rule != Py_file_input. If it returned NULL, then it either unexpectedly
           failed or we're parsing from a string or the REPL. There's a third edge case where
           we're actually parsing from a file, which has an E_EOF SyntaxError and in that case
           `PyErr_ProgramTextObject` fails because lineno points to last_file_line + 1, which
           does not physically exist */
        assert(p->tok->fp == NULL || p->tok->fp == stdin || p->tok->done == E_EOF);

        if (p->tok->lineno <= lineno && p->tok->inp > p->tok->buf) {
            // The tokenizer still holds the current line: slice it out of the
            // input buffer directly.
            Py_ssize_t size = p->tok->inp - p->tok->line_start;
            error_line = PyUnicode_DecodeUTF8(p->tok->line_start, size, "replace");
        }
        else if (p->tok->fp == NULL || p->tok->fp == stdin) {
            error_line = get_error_line_from_tokenizer_buffers(p, lineno);
        }
        else {
            // Last resort: no source text available.
            error_line = Py_GetConstant(Py_CONSTANT_EMPTY_STR);
        }
        if (!error_line) {
            goto error;
        }
    }

    // Convert byte offsets into character offsets within error_line, since
    // SyntaxError columns are measured in characters.
    Py_ssize_t col_number = col_offset;
    Py_ssize_t end_col_number = end_col_offset;

    col_number = _PyPegen_byte_offset_to_character_offset(error_line, col_offset);
    if (col_number < 0) {
        goto error;
    }

    if (end_col_offset > 0) {
        end_col_number = _PyPegen_byte_offset_to_character_offset(error_line, end_col_offset);
        if (end_col_number < 0) {
            goto error;
        }
    }

    // SyntaxError args: (msg, (filename, lineno, col, text, end_lineno, end_col)).
    // "N" transfers ownership of error_line into tmp.
    tmp = Py_BuildValue("(OnnNnn)", p->tok->filename, lineno, col_number, error_line, end_lineno, end_col_number);
    if (!tmp) {
        goto error;
    }
    value = _PyTuple_FromPair(errstr, tmp);
    Py_DECREF(tmp);
    if (!value) {
        goto error;
    }
    PyErr_SetObject(errtype, value);

    Py_DECREF(errstr);
    Py_DECREF(value);
    return NULL;

error:
    Py_XDECREF(errstr);
    Py_XDECREF(error_line);
    return NULL;
}
413 | | |
/* Ensure a syntax-related exception is set after a failed parse, choosing the
 * most specific error available (existing error, EOF/bracket error,
 * indentation error, or a generic "invalid syntax" at `last_token`). */
void
_Pypegen_set_syntax_error(Parser* p, Token* last_token) {
    // Existing syntax error
    if (PyErr_Occurred()) {
        // Prioritize tokenizer errors to custom syntax errors raised
        // on the second phase only if the errors come from the parser.
        int is_tok_ok = (p->tok->done == E_DONE || p->tok->done == E_OK);
        if (is_tok_ok && PyErr_ExceptionMatches(PyExc_SyntaxError)) {
            _PyPegen_tokenize_full_source_to_check_for_errors(p);
        }
        // Propagate the existing syntax error.
        return;
    }
    // Initialization error
    if (p->fill == 0) {
        RAISE_SYNTAX_ERROR("error at start before reading any input");
    }
    // Parser encountered EOF (End of File) unexpectedly
    if (last_token->type == ERRORTOKEN && p->tok->done == E_EOF) {
        if (p->tok->level) {
            raise_unclosed_parentheses_error(p);
        } else {
            RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
        }
        return;
    }
    // Indentation error in the tokenizer
    if (last_token->type == INDENT || last_token->type == DEDENT) {
        RAISE_INDENTATION_ERROR(last_token->type == INDENT ? "unexpected indent" : "unexpected unindent");
        return;
    }
    // Unknown error (generic case)

    // Use the last token we found on the first pass to avoid reporting
    // incorrect locations for generic syntax errors just because we reached
    // further away when trying to find specific syntax errors in the second
    // pass.
    RAISE_SYNTAX_ERROR_KNOWN_LOCATION(last_token, "invalid syntax");
    // _PyPegen_tokenize_full_source_to_check_for_errors will override the existing
    // generic SyntaxError we just raised if errors are found.
    _PyPegen_tokenize_full_source_to_check_for_errors(p);
}
456 | | |
457 | | void |
458 | | _Pypegen_stack_overflow(Parser *p) |
459 | 54 | { |
460 | 54 | p->error_indicator = 1; |
461 | 54 | PyErr_SetString(PyExc_MemoryError, |
462 | 54 | "Parser stack overflowed - Python source too complex to parse"); |
463 | 54 | } |