/src/cpython/Parser/pegen_errors.c
Line | Count | Source (jump to first uncovered line) |
1 | | #include <Python.h> |
2 | | #include <errcode.h> |
3 | | |
4 | | #include "pycore_pyerrors.h" // _PyErr_ProgramDecodedTextObject() |
5 | | #include "lexer/state.h" |
6 | | #include "lexer/lexer.h" |
7 | | #include "pegen.h" |
8 | | |
9 | | // TOKENIZER ERRORS |
10 | | |
11 | | void |
12 | | _PyPegen_raise_tokenizer_init_error(PyObject *filename) |
13 | 1.84k | { |
14 | 1.84k | if (!(PyErr_ExceptionMatches(PyExc_LookupError) |
15 | 1.84k | || PyErr_ExceptionMatches(PyExc_SyntaxError) |
16 | 1.84k | || PyErr_ExceptionMatches(PyExc_ValueError) |
17 | 1.84k | || PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))) { |
18 | 56 | return; |
19 | 56 | } |
20 | 1.78k | PyObject *errstr = NULL; |
21 | 1.78k | PyObject *tuple = NULL; |
22 | 1.78k | PyObject *type; |
23 | 1.78k | PyObject *value; |
24 | 1.78k | PyObject *tback; |
25 | 1.78k | PyErr_Fetch(&type, &value, &tback); |
26 | 1.78k | errstr = PyObject_Str(value); |
27 | 1.78k | if (!errstr) { |
28 | 0 | goto error; |
29 | 0 | } |
30 | | |
31 | 1.78k | PyObject *tmp = Py_BuildValue("(OiiO)", filename, 0, -1, Py_None); |
32 | 1.78k | if (!tmp) { |
33 | 0 | goto error; |
34 | 0 | } |
35 | | |
36 | 1.78k | tuple = PyTuple_Pack(2, errstr, tmp); |
37 | 1.78k | Py_DECREF(tmp); |
38 | 1.78k | if (!value) { |
39 | 0 | goto error; |
40 | 0 | } |
41 | 1.78k | PyErr_SetObject(PyExc_SyntaxError, tuple); |
42 | | |
43 | 1.78k | error: |
44 | 1.78k | Py_XDECREF(type); |
45 | 1.78k | Py_XDECREF(value); |
46 | 1.78k | Py_XDECREF(tback); |
47 | 1.78k | Py_XDECREF(errstr); |
48 | 1.78k | Py_XDECREF(tuple); |
49 | 1.78k | } |
50 | | |
51 | | static inline void |
52 | 1.91k | raise_unclosed_parentheses_error(Parser *p) { |
53 | 1.91k | int error_lineno = p->tok->parenlinenostack[p->tok->level-1]; |
54 | 1.91k | int error_col = p->tok->parencolstack[p->tok->level-1]; |
55 | 1.91k | RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError, |
56 | 1.91k | error_lineno, error_col, error_lineno, -1, |
57 | 1.91k | "'%c' was never closed", |
58 | 1.91k | p->tok->parenstack[p->tok->level-1]); |
59 | 1.91k | } |
60 | | |
int
_Pypegen_tokenizer_error(Parser *p)
{
    // Map the tokenizer's `p->tok->done` status code onto the appropriate
    // Python exception. Always returns -1 so callers can propagate failure
    // with `return _Pypegen_tokenizer_error(p);`.
    if (PyErr_Occurred()) {
        return -1;
    }

    const char *msg = NULL;
    PyObject* errtype = PyExc_SyntaxError;
    Py_ssize_t col_offset = -1;
    p->error_indicator = 1;
    switch (p->tok->done) {
        case E_TOKEN:
            msg = "invalid token";
            break;
        case E_EOF:
            // EOF inside an open bracket gets the dedicated
            // "'X' was never closed" error; otherwise a generic
            // unexpected-EOF SyntaxError.
            if (p->tok->level) {
                raise_unclosed_parentheses_error(p);
            } else {
                RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
            }
            return -1;
        case E_DEDENT:
            RAISE_INDENTATION_ERROR("unindent does not match any outer indentation level");
            return -1;
        case E_INTR:
            if (!PyErr_Occurred()) {
                PyErr_SetNone(PyExc_KeyboardInterrupt);
            }
            return -1;
        case E_NOMEM:
            PyErr_NoMemory();
            return -1;
        case E_TABSPACE:
            errtype = PyExc_TabError;
            msg = "inconsistent use of tabs and spaces in indentation";
            break;
        case E_TOODEEP:
            errtype = PyExc_IndentationError;
            msg = "too many levels of indentation";
            break;
        case E_LINECONT: {
            // Point at the character just after the line-continuation
            // backslash (byte offset within the current buffer).
            col_offset = p->tok->cur - p->tok->buf - 1;
            msg = "unexpected character after line continuation character";
            break;
        }
        case E_COLUMNOVERFLOW:
            PyErr_SetString(PyExc_OverflowError,
                            "Parser column offset overflow - source line is too big");
            return -1;
        default:
            msg = "unknown parsing error";
    }

    // Cases that fall through the switch raise at the tokenizer's current
    // line, using col_offset when one was computed and 0 otherwise.
    RAISE_ERROR_KNOWN_LOCATION(p, errtype, p->tok->lineno,
                               col_offset >= 0 ? col_offset : 0,
                               p->tok->lineno, -1, msg);
    return -1;
}
120 | | |
121 | | int |
122 | | _Pypegen_raise_decode_error(Parser *p) |
123 | 1.17k | { |
124 | 1.17k | assert(PyErr_Occurred()); |
125 | 1.17k | const char *errtype = NULL; |
126 | 1.17k | if (PyErr_ExceptionMatches(PyExc_UnicodeError)) { |
127 | 1.16k | errtype = "unicode error"; |
128 | 1.16k | } |
129 | 18 | else if (PyErr_ExceptionMatches(PyExc_ValueError)) { |
130 | 5 | errtype = "value error"; |
131 | 5 | } |
132 | 1.17k | if (errtype) { |
133 | 1.16k | PyObject *type; |
134 | 1.16k | PyObject *value; |
135 | 1.16k | PyObject *tback; |
136 | 1.16k | PyObject *errstr; |
137 | 1.16k | PyErr_Fetch(&type, &value, &tback); |
138 | 1.16k | errstr = PyObject_Str(value); |
139 | 1.16k | if (errstr) { |
140 | 1.16k | RAISE_SYNTAX_ERROR("(%s) %U", errtype, errstr); |
141 | 1.16k | Py_DECREF(errstr); |
142 | 1.16k | } |
143 | 0 | else { |
144 | 0 | PyErr_Clear(); |
145 | 0 | RAISE_SYNTAX_ERROR("(%s) unknown error", errtype); |
146 | 0 | } |
147 | 1.16k | Py_XDECREF(type); |
148 | 1.16k | Py_XDECREF(value); |
149 | 1.16k | Py_XDECREF(tback); |
150 | 1.16k | } |
151 | | |
152 | 1.17k | return -1; |
153 | 1.17k | } |
154 | | |
static int
_PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) {
    // Tokenize the whole input to see if there are any tokenization
    // errors such as mismatching parentheses. These will get priority
    // over generic syntax errors only if the line number of the error is
    // before the one that we had for the generic error.
    // Returns -1 when a (possibly better) error was raised, 0 otherwise.

    // We don't want to tokenize to the end for interactive input
    if (p->tok->prompt != NULL) {
        return 0;
    }

    // Stash the currently-set exception so it can be restored if this
    // second pass finds nothing better to report.
    PyObject *type, *value, *traceback;
    PyErr_Fetch(&type, &value, &traceback);

    Token *current_token = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1];
    Py_ssize_t current_err_line = current_token->lineno;

    int ret = 0;
    struct token new_token;
    _PyToken_Init(&new_token);

    for (;;) {
        switch (_PyTokenizer_Get(p->tok, &new_token)) {
            case ERRORTOKEN:
                if (PyErr_Occurred()) {
                    ret = -1;
                    goto exit;
                }
                // An unclosed bracket only wins over the generic error
                // when it was opened on an earlier line.
                if (p->tok->level != 0) {
                    int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
                    if (current_err_line > error_lineno) {
                        raise_unclosed_parentheses_error(p);
                        ret = -1;
                        goto exit;
                    }
                }
                break;
            case ENDMARKER:
                break;
            default:
                continue;
        }
        break;
    }


exit:
    _PyToken_Free(&new_token);
    // If we're in an f-string, we want the syntax error in the expression part
    // to propagate, so that tokenizer errors (like expecting '}') that happen afterwards
    // do not swallow it.
    if (PyErr_Occurred() && p->tok->tok_mode_stack_index <= 0) {
        // Keep the new (earlier/more specific) error; drop the saved one.
        Py_XDECREF(value);
        Py_XDECREF(type);
        Py_XDECREF(traceback);
    } else {
        PyErr_Restore(type, value, traceback);
    }
    return ret;
}
216 | | |
217 | | // PARSER ERRORS |
218 | | |
void *
_PyPegen_raise_error(Parser *p, PyObject *errtype, int use_mark, const char *errmsg, ...)
{
    // Raise `errtype` at the position of the current error token (or the
    // token at `p->mark` when `use_mark` is nonzero), formatting `errmsg`
    // with the trailing varargs. Always returns NULL so rule actions can
    // `return _PyPegen_raise_error(...)`.

    // Bail out if we already have an error set.
    if (p->error_indicator && PyErr_Occurred()) {
        return NULL;
    }
    if (p->fill == 0) {
        // No token has been read yet: report at position (0, 0).
        va_list va;
        va_start(va, errmsg);
        _PyPegen_raise_error_known_location(p, errtype, 0, 0, 0, -1, errmsg, va);
        va_end(va);
        return NULL;
    }
    if (use_mark && p->mark == p->fill && _PyPegen_fill_token(p) < 0) {
        p->error_indicator = 1;
        return NULL;
    }
    Token *t = p->known_err_token != NULL
        ? p->known_err_token
        : p->tokens[use_mark ? p->mark : p->fill - 1];
    Py_ssize_t col_offset;
    Py_ssize_t end_col_offset = -1;
    if (t->col_offset == -1) {
        // Token carries no column info: fall back to the tokenizer cursor.
        if (p->tok->cur == p->tok->buf) {
            col_offset = 0;
        } else {
            // NOTE(review): the condition tests `buf` although both arms
            // only differ when `buf` is NULL; possibly meant to test
            // `line_start` — confirm upstream intent before changing.
            const char* start = p->tok->buf ? p->tok->line_start : p->tok->buf;
            col_offset = Py_SAFE_DOWNCAST(p->tok->cur - start, intptr_t, int);
        }
    } else {
        // SyntaxError columns are reported 1-based.
        col_offset = t->col_offset + 1;
    }

    if (t->end_col_offset != -1) {
        end_col_offset = t->end_col_offset + 1;
    }

    va_list va;
    va_start(va, errmsg);
    _PyPegen_raise_error_known_location(p, errtype, t->lineno, col_offset, t->end_lineno, end_col_offset, errmsg, va);
    va_end(va);

    return NULL;
}
264 | | |
static PyObject *
get_error_line_from_tokenizer_buffers(Parser *p, Py_ssize_t lineno)
{
    /* Return the text of source line `lineno` as a new str object, decoded
     * from the tokenizer's in-memory buffers as UTF-8 with "replace"
     * error handling.
     *
     * If the file descriptor is interactive, the source lines of the current
     * (multi-line) statement are stored in p->tok->interactive_src_start.
     * If not, we're parsing from a string, which means that the whole source
     * is stored in p->tok->str. */
    assert((p->tok->fp == NULL && p->tok->str != NULL) || p->tok->fp != NULL);

    char *cur_line = p->tok->fp_interactive ? p->tok->interactive_src_start : p->tok->str;
    if (cur_line == NULL) {
        assert(p->tok->fp_interactive);
        // We can reach this point if the tokenizer buffers for interactive source have not been
        // initialized because we failed to decode the original source with the given locale.
        return Py_GetConstant(Py_CONSTANT_EMPTY_STR);
    }

    // Translate an absolute line number into one relative to the start of
    // the buffer when parsing began at `starting_lineno`.
    Py_ssize_t relative_lineno = p->starting_lineno ? lineno - p->starting_lineno + 1 : lineno;
    const char* buf_end = p->tok->fp_interactive ? p->tok->interactive_src_end : p->tok->inp;

    if (buf_end < cur_line) {
        buf_end = cur_line + strlen(cur_line);
    }

    // Advance cur_line to the start of the requested line.
    for (int i = 0; i < relative_lineno - 1; i++) {
        char *new_line = strchr(cur_line, '\n');
        // The assert is here for debug builds but the conditional that
        // follows is there so in release builds we do not crash at the cost
        // to report a potentially wrong line.
        // NOTE(review): the guard uses `> buf_end` while the assert uses
        // `< buf_end`, so `new_line + 1 == buf_end` passes the guard but
        // trips the assert — confirm which bound is intended.
        assert(new_line != NULL && new_line + 1 < buf_end);
        if (new_line == NULL || new_line + 1 > buf_end) {
            break;
        }
        cur_line = new_line + 1;
    }

    char *next_newline;
    if ((next_newline = strchr(cur_line, '\n')) == NULL) { // This is the last line
        next_newline = cur_line + strlen(cur_line);
    }
    return PyUnicode_DecodeUTF8(cur_line, next_newline - cur_line, "replace");
}
307 | | |
void *
_PyPegen_raise_error_known_location(Parser *p, PyObject *errtype,
                                    Py_ssize_t lineno, Py_ssize_t col_offset,
                                    Py_ssize_t end_lineno, Py_ssize_t end_col_offset,
                                    const char *errmsg, va_list va)
{
    // Raise `errtype` with a SyntaxError-style value of
    // (msg, (filename, lineno, col, text, end_lineno, end_col)) at an
    // explicit source location. Always returns NULL so callers can
    // propagate failure directly.

    // Bail out if we already have an error set.
    if (p->error_indicator && PyErr_Occurred()) {
        return NULL;
    }
    PyObject *value = NULL;
    PyObject *errstr = NULL;
    PyObject *error_line = NULL;
    PyObject *tmp = NULL;
    p->error_indicator = 1;

    // CURRENT_POS sentinels mean "wherever the tokenizer currently is".
    if (end_lineno == CURRENT_POS) {
        end_lineno = p->tok->lineno;
    }
    if (end_col_offset == CURRENT_POS) {
        end_col_offset = p->tok->cur - p->tok->line_start;
    }

    errstr = PyUnicode_FromFormatV(errmsg, va);
    if (!errstr) {
        goto error;
    }

    // Try to recover the text of the offending line: interactive buffers
    // first, then the file on disk for whole-file parses.
    if (p->tok->fp_interactive && p->tok->interactive_src_start != NULL) {
        error_line = get_error_line_from_tokenizer_buffers(p, lineno);
    }
    else if (p->start_rule == Py_file_input) {
        error_line = _PyErr_ProgramDecodedTextObject(p->tok->filename,
                                                     (int) lineno, p->tok->encoding);
    }

    if (!error_line) {
        /* PyErr_ProgramTextObject was not called or returned NULL. If it was not called,
           then we need to find the error line from some other source, because
           p->start_rule != Py_file_input. If it returned NULL, then it either unexpectedly
           failed or we're parsing from a string or the REPL. There's a third edge case where
           we're actually parsing from a file, which has an E_EOF SyntaxError and in that case
           `PyErr_ProgramTextObject` fails because lineno points to last_file_line + 1, which
           does not physically exist */
        assert(p->tok->fp == NULL || p->tok->fp == stdin || p->tok->done == E_EOF);

        if (p->tok->lineno <= lineno && p->tok->inp > p->tok->buf) {
            // The error line is still in the tokenizer's input buffer.
            Py_ssize_t size = p->tok->inp - p->tok->line_start;
            error_line = PyUnicode_DecodeUTF8(p->tok->line_start, size, "replace");
        }
        else if (p->tok->fp == NULL || p->tok->fp == stdin) {
            error_line = get_error_line_from_tokenizer_buffers(p, lineno);
        }
        else {
            error_line = Py_GetConstant(Py_CONSTANT_EMPTY_STR);
        }
        if (!error_line) {
            goto error;
        }
    }

    // Convert byte offsets (from the tokenizer) into character offsets
    // (what SyntaxError expects).
    Py_ssize_t col_number = col_offset;
    Py_ssize_t end_col_number = end_col_offset;

    col_number = _PyPegen_byte_offset_to_character_offset(error_line, col_offset);
    if (col_number < 0) {
        goto error;
    }

    if (end_col_offset > 0) {
        end_col_number = _PyPegen_byte_offset_to_character_offset(error_line, end_col_offset);
        if (end_col_number < 0) {
            goto error;
        }
    }

    // "N" transfers ownership of error_line to the tuple, so it must not
    // be decref'd on the success path below.
    tmp = Py_BuildValue("(OnnNnn)", p->tok->filename, lineno, col_number, error_line, end_lineno, end_col_number);
    if (!tmp) {
        goto error;
    }
    value = PyTuple_Pack(2, errstr, tmp);
    Py_DECREF(tmp);
    if (!value) {
        goto error;
    }
    PyErr_SetObject(errtype, value);

    Py_DECREF(errstr);
    Py_DECREF(value);
    return NULL;

error:
    Py_XDECREF(errstr);
    Py_XDECREF(error_line);
    return NULL;
}
404 | | |
405 | | void |
406 | 13.6k | _Pypegen_set_syntax_error(Parser* p, Token* last_token) { |
407 | | // Existing syntax error |
408 | 13.6k | if (PyErr_Occurred()) { |
409 | | // Prioritize tokenizer errors to custom syntax errors raised |
410 | | // on the second phase only if the errors come from the parser. |
411 | 6.79k | int is_tok_ok = (p->tok->done == E_DONE || p->tok->done == E_OK); |
412 | 6.79k | if (is_tok_ok && PyErr_ExceptionMatches(PyExc_SyntaxError)) { |
413 | 2.04k | _PyPegen_tokenize_full_source_to_check_for_errors(p); |
414 | 2.04k | } |
415 | | // Propagate the existing syntax error. |
416 | 6.79k | return; |
417 | 6.79k | } |
418 | | // Initialization error |
419 | 6.85k | if (p->fill == 0) { |
420 | 0 | RAISE_SYNTAX_ERROR("error at start before reading any input"); |
421 | 0 | } |
422 | | // Parser encountered EOF (End of File) unexpectedtly |
423 | 6.85k | if (last_token->type == ERRORTOKEN && p->tok->done == E_EOF) { |
424 | 0 | if (p->tok->level) { |
425 | 0 | raise_unclosed_parentheses_error(p); |
426 | 0 | } else { |
427 | 0 | RAISE_SYNTAX_ERROR("unexpected EOF while parsing"); |
428 | 0 | } |
429 | 0 | return; |
430 | 0 | } |
431 | | // Indentation error in the tokenizer |
432 | 6.85k | if (last_token->type == INDENT || last_token->type == DEDENT) { |
433 | 151 | RAISE_INDENTATION_ERROR(last_token->type == INDENT ? "unexpected indent" : "unexpected unindent"); |
434 | 151 | return; |
435 | 151 | } |
436 | | // Unknown error (generic case) |
437 | | |
438 | | // Use the last token we found on the first pass to avoid reporting |
439 | | // incorrect locations for generic syntax errors just because we reached |
440 | | // further away when trying to find specific syntax errors in the second |
441 | | // pass. |
442 | 6.70k | RAISE_SYNTAX_ERROR_KNOWN_LOCATION(last_token, "invalid syntax"); |
443 | | // _PyPegen_tokenize_full_source_to_check_for_errors will override the existing |
444 | | // generic SyntaxError we just raised if errors are found. |
445 | 6.70k | _PyPegen_tokenize_full_source_to_check_for_errors(p); |
446 | 6.70k | } |
447 | | |
448 | | void |
449 | | _Pypegen_stack_overflow(Parser *p) |
450 | 65 | { |
451 | 65 | p->error_indicator = 1; |
452 | 65 | PyErr_SetString(PyExc_MemoryError, |
453 | 65 | "Parser stack overflowed - Python source too complex to parse"); |
454 | 65 | } |