/src/cpython/Parser/pegen_errors.c
Line | Count | Source |
1 | | #include <Python.h> |
2 | | #include <errcode.h> |
3 | | |
4 | | #include "pycore_pyerrors.h" // _PyErr_ProgramDecodedTextObject() |
5 | | #include "pycore_runtime.h" // _Py_ID() |
6 | | #include "pycore_tuple.h" // _PyTuple_FromPair |
7 | | #include "lexer/state.h" |
8 | | #include "lexer/lexer.h" |
9 | | #include "pegen.h" |
10 | | |
11 | | // TOKENIZER ERRORS |
12 | | |
13 | | static inline void |
14 | 1.81k | raise_unclosed_parentheses_error(Parser *p) { |
15 | 1.81k | int error_lineno = p->tok->parenlinenostack[p->tok->level-1]; |
16 | 1.81k | int error_col = p->tok->parencolstack[p->tok->level-1]; |
17 | 1.81k | RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError, |
18 | 1.81k | error_lineno, error_col, error_lineno, -1, |
19 | 1.81k | "'%c' was never closed", |
20 | 1.81k | p->tok->parenstack[p->tok->level-1]); |
21 | 1.81k | } |
22 | | |
23 | | int |
24 | | _Pypegen_tokenizer_error(Parser *p) |
25 | 3.72k | { |
26 | 3.72k | if (PyErr_Occurred()) { |
27 | 1.87k | return -1; |
28 | 1.87k | } |
29 | | |
30 | 1.85k | const char *msg = NULL; |
31 | 1.85k | PyObject* errtype = PyExc_SyntaxError; |
32 | 1.85k | Py_ssize_t col_offset = -1; |
33 | 1.85k | p->error_indicator = 1; |
34 | 1.85k | switch (p->tok->done) { |
35 | 0 | case E_TOKEN: |
36 | 0 | msg = "invalid token"; |
37 | 0 | break; |
38 | 1.79k | case E_EOF: |
39 | 1.79k | if (p->tok->level) { |
40 | 1.75k | raise_unclosed_parentheses_error(p); |
41 | 1.75k | } else { |
42 | 40 | RAISE_SYNTAX_ERROR("unexpected EOF while parsing"); |
43 | 40 | } |
44 | 1.79k | return -1; |
45 | 8 | case E_DEDENT: |
46 | 8 | RAISE_INDENTATION_ERROR("unindent does not match any outer indentation level"); |
47 | 8 | return -1; |
48 | 0 | case E_INTR: |
49 | 0 | if (!PyErr_Occurred()) { |
50 | 0 | PyErr_SetNone(PyExc_KeyboardInterrupt); |
51 | 0 | } |
52 | 0 | return -1; |
53 | 0 | case E_NOMEM: |
54 | 0 | PyErr_NoMemory(); |
55 | 0 | return -1; |
56 | 2 | case E_TABSPACE: |
57 | 2 | errtype = PyExc_TabError; |
58 | 2 | msg = "inconsistent use of tabs and spaces in indentation"; |
59 | 2 | break; |
60 | 0 | case E_TOODEEP: |
61 | 0 | errtype = PyExc_IndentationError; |
62 | 0 | msg = "too many levels of indentation"; |
63 | 0 | break; |
64 | 52 | case E_LINECONT: { |
65 | 52 | col_offset = p->tok->cur - p->tok->buf - 1; |
66 | 52 | msg = "unexpected character after line continuation character"; |
67 | 52 | break; |
68 | 0 | } |
69 | 0 | case E_COLUMNOVERFLOW: |
70 | 0 | PyErr_SetString(PyExc_OverflowError, |
71 | 0 | "Parser column offset overflow - source line is too big"); |
72 | 0 | return -1; |
73 | 0 | default: |
74 | 0 | msg = "unknown parsing error"; |
75 | 1.85k | } |
76 | | |
77 | 54 | RAISE_ERROR_KNOWN_LOCATION(p, errtype, p->tok->lineno, |
78 | 54 | col_offset >= 0 ? col_offset : 0, |
79 | 54 | p->tok->lineno, -1, msg); |
80 | 54 | return -1; |
81 | 1.85k | } |
82 | | |
83 | | int |
84 | | _Pypegen_raise_decode_error(Parser *p) |
85 | 125 | { |
86 | 125 | assert(PyErr_Occurred()); |
87 | 125 | const char *errtype = NULL; |
88 | 125 | if (PyErr_ExceptionMatches(PyExc_UnicodeError)) { |
89 | 117 | errtype = "unicode error"; |
90 | 117 | } |
91 | 8 | else if (PyErr_ExceptionMatches(PyExc_ValueError)) { |
92 | 6 | errtype = "value error"; |
93 | 6 | } |
94 | 125 | if (errtype) { |
95 | 123 | PyObject *type; |
96 | 123 | PyObject *value; |
97 | 123 | PyObject *tback; |
98 | 123 | PyObject *errstr; |
99 | 123 | PyErr_Fetch(&type, &value, &tback); |
100 | 123 | errstr = PyObject_Str(value); |
101 | 123 | if (errstr) { |
102 | 123 | RAISE_SYNTAX_ERROR("(%s) %U", errtype, errstr); |
103 | 123 | Py_DECREF(errstr); |
104 | 123 | } |
105 | 0 | else { |
106 | 0 | PyErr_Clear(); |
107 | 0 | RAISE_SYNTAX_ERROR("(%s) unknown error", errtype); |
108 | 0 | } |
109 | 123 | Py_XDECREF(type); |
110 | 123 | Py_XDECREF(value); |
111 | 123 | Py_XDECREF(tback); |
112 | 123 | } |
113 | | |
114 | 125 | return -1; |
115 | 125 | } |
116 | | |
117 | | static int |
118 | 96.1k | _PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) { |
119 | | // Tokenize the whole input to see if there are any tokenization |
120 | | // errors such as mismatching parentheses. These will get priority |
121 | | // over generic syntax errors only if the line number of the error is |
122 | | // before the one that we had for the generic error. |
123 | | |
124 | | // We don't want to tokenize to the end for interactive input |
125 | 96.1k | if (p->tok->prompt != NULL) { |
126 | 0 | return 0; |
127 | 0 | } |
128 | | |
129 | 96.1k | PyObject *type, *value, *traceback; |
130 | 96.1k | PyErr_Fetch(&type, &value, &traceback); |
131 | | |
132 | 96.1k | Token *current_token = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1]; |
133 | 96.1k | Py_ssize_t current_err_line = current_token->lineno; |
134 | | |
135 | 96.1k | int ret = 0; |
136 | 96.1k | struct token new_token; |
137 | 96.1k | _PyToken_Init(&new_token); |
138 | | |
139 | 407k | for (;;) { |
140 | 407k | switch (_PyTokenizer_Get(p->tok, &new_token)) { |
141 | 2.85k | case ERRORTOKEN: |
142 | 2.85k | if (PyErr_Occurred()) { |
143 | 604 | ret = -1; |
144 | 604 | goto exit; |
145 | 604 | } |
146 | 2.25k | if (p->tok->level != 0) { |
147 | 2.23k | int error_lineno = p->tok->parenlinenostack[p->tok->level-1]; |
148 | 2.23k | if (current_err_line > error_lineno) { |
149 | 63 | raise_unclosed_parentheses_error(p); |
150 | 63 | ret = -1; |
151 | 63 | goto exit; |
152 | 63 | } |
153 | 2.23k | } |
154 | 2.19k | break; |
155 | 93.2k | case ENDMARKER: |
156 | 93.2k | break; |
157 | 311k | default: |
158 | 311k | continue; |
159 | 407k | } |
160 | 95.4k | break; |
161 | 407k | } |
162 | | |
163 | | |
164 | 96.1k | exit: |
165 | 96.1k | _PyToken_Free(&new_token); |
166 | | // If we're in an f-string, we want the syntax error in the expression part |
167 | | // to propagate, so that tokenizer errors (like expecting '}') that happen afterwards |
168 | | // do not swallow it. |
169 | 96.1k | if (PyErr_Occurred() && p->tok->tok_mode_stack_index <= 0) { |
170 | 507 | Py_XDECREF(value); |
171 | 507 | Py_XDECREF(type); |
172 | 507 | Py_XDECREF(traceback); |
173 | 95.6k | } else { |
174 | 95.6k | PyErr_Restore(type, value, traceback); |
175 | 95.6k | } |
176 | 96.1k | return ret; |
177 | 96.1k | } |
178 | | |
179 | | // PARSER ERRORS |
180 | | |
181 | | void * |
182 | | _PyPegen_raise_error(Parser *p, PyObject *errtype, int use_mark, const char *errmsg, ...) |
183 | 1.10k | { |
184 | | // Bail out if we already have an error set. |
185 | 1.10k | if (p->error_indicator && PyErr_Occurred()) { |
186 | 253 | return NULL; |
187 | 253 | } |
188 | 853 | if (p->fill == 0) { |
189 | 0 | va_list va; |
190 | 0 | va_start(va, errmsg); |
191 | 0 | _PyPegen_raise_error_known_location(p, errtype, 0, 0, 0, -1, errmsg, va); |
192 | 0 | va_end(va); |
193 | 0 | return NULL; |
194 | 0 | } |
195 | 853 | if (use_mark && p->mark == p->fill && _PyPegen_fill_token(p) < 0) { |
196 | 0 | p->error_indicator = 1; |
197 | 0 | return NULL; |
198 | 0 | } |
199 | 853 | Token *t = p->known_err_token != NULL |
200 | 853 | ? p->known_err_token |
201 | 853 | : p->tokens[use_mark ? p->mark : p->fill - 1]; |
202 | 853 | Py_ssize_t col_offset; |
203 | 853 | Py_ssize_t end_col_offset = -1; |
204 | 853 | if (t->col_offset == -1) { |
205 | 230 | if (p->tok->cur == p->tok->buf) { |
206 | 3 | col_offset = 0; |
207 | 227 | } else { |
208 | 227 | const char* start = p->tok->buf ? p->tok->line_start : p->tok->buf; |
209 | 227 | col_offset = Py_SAFE_DOWNCAST(p->tok->cur - start, intptr_t, int); |
210 | 227 | } |
211 | 623 | } else { |
212 | 623 | col_offset = t->col_offset + 1; |
213 | 623 | } |
214 | | |
215 | 853 | if (t->end_col_offset != -1) { |
216 | 623 | end_col_offset = t->end_col_offset + 1; |
217 | 623 | } |
218 | | |
219 | 853 | va_list va; |
220 | 853 | va_start(va, errmsg); |
221 | 853 | _PyPegen_raise_error_known_location(p, errtype, t->lineno, col_offset, t->end_lineno, end_col_offset, errmsg, va); |
222 | 853 | va_end(va); |
223 | | |
224 | 853 | return NULL; |
225 | 853 | } |
226 | | |
227 | | static PyObject * |
228 | | get_error_line_from_tokenizer_buffers(Parser *p, Py_ssize_t lineno) |
229 | 190 | { |
230 | | /* If the file descriptor is interactive, the source lines of the current |
231 | | * (multi-line) statement are stored in p->tok->interactive_src_start. |
232 | | * If not, we're parsing from a string, which means that the whole source |
233 | | * is stored in p->tok->str. */ |
234 | 190 | assert((p->tok->fp == NULL && p->tok->str != NULL) || p->tok->fp != NULL); |
235 | | |
236 | 190 | char *cur_line = p->tok->fp_interactive ? p->tok->interactive_src_start : p->tok->str; |
237 | 190 | if (cur_line == NULL) { |
238 | 0 | assert(p->tok->fp_interactive); |
239 | | // We can reach this point if the tokenizer buffers for interactive source have not been |
240 | | // initialized because we failed to decode the original source with the given locale. |
241 | 0 | return Py_GetConstant(Py_CONSTANT_EMPTY_STR); |
242 | 0 | } |
243 | | |
244 | 190 | Py_ssize_t relative_lineno = p->starting_lineno ? lineno - p->starting_lineno + 1 : lineno; |
245 | 190 | const char* buf_end = p->tok->fp_interactive ? p->tok->interactive_src_end : p->tok->inp; |
246 | | |
247 | 190 | if (buf_end < cur_line) { |
248 | 0 | buf_end = cur_line + strlen(cur_line); |
249 | 0 | } |
250 | | |
251 | 2.25k | for (int i = 0; i < relative_lineno - 1; i++) { |
252 | 2.06k | char *new_line = strchr(cur_line, '\n'); |
253 | | // The assert is here for debug builds but the conditional that |
254 | | // follows is there so in release builds we do not crash at the cost |
255 | | // to report a potentially wrong line. |
256 | 2.06k | assert(new_line != NULL && new_line + 1 < buf_end); |
257 | 2.06k | if (new_line == NULL || new_line + 1 > buf_end) { |
258 | 0 | break; |
259 | 0 | } |
260 | 2.06k | cur_line = new_line + 1; |
261 | 2.06k | } |
262 | | |
263 | 190 | char *next_newline; |
264 | 190 | if ((next_newline = strchr(cur_line, '\n')) == NULL) { // This is the last line |
265 | 0 | next_newline = cur_line + strlen(cur_line); |
266 | 0 | } |
267 | 190 | return PyUnicode_DecodeUTF8(cur_line, next_newline - cur_line, "replace"); |
268 | 190 | } |
269 | | |
270 | | void * |
271 | | _PyPegen_raise_error_known_location(Parser *p, PyObject *errtype, |
272 | | Py_ssize_t lineno, Py_ssize_t col_offset, |
273 | | Py_ssize_t end_lineno, Py_ssize_t end_col_offset, |
274 | | const char *errmsg, va_list va) |
275 | 98.6k | { |
276 | | // Bail out if we already have an error set. |
277 | 98.6k | if (p->error_indicator && PyErr_Occurred()) { |
278 | 466 | return NULL; |
279 | 466 | } |
280 | 98.2k | PyObject *value = NULL; |
281 | 98.2k | PyObject *errstr = NULL; |
282 | 98.2k | PyObject *error_line = NULL; |
283 | 98.2k | PyObject *tmp = NULL; |
284 | 98.2k | p->error_indicator = 1; |
285 | | |
286 | 98.2k | if (end_lineno == CURRENT_POS) { |
287 | 25 | end_lineno = p->tok->lineno; |
288 | 25 | } |
289 | 98.2k | if (end_col_offset == CURRENT_POS) { |
290 | 25 | end_col_offset = p->tok->cur - p->tok->line_start; |
291 | 25 | } |
292 | | |
293 | 98.2k | errstr = PyUnicode_FromFormatV(errmsg, va); |
294 | 98.2k | if (!errstr) { |
295 | 0 | goto error; |
296 | 0 | } |
297 | | |
298 | 98.2k | if (p->tok->fp_interactive && p->tok->interactive_src_start != NULL) { |
299 | 0 | error_line = get_error_line_from_tokenizer_buffers(p, lineno); |
300 | 0 | } |
301 | 98.2k | else if (p->start_rule == Py_file_input) { |
302 | 98.2k | error_line = _PyErr_ProgramDecodedTextObject(p->tok->filename, |
303 | 98.2k | (int) lineno, p->tok->encoding); |
304 | 98.2k | } |
305 | | |
306 | 98.2k | if (!error_line) { |
307 | | /* PyErr_ProgramTextObject was not called or returned NULL. If it was not called, |
308 | | then we need to find the error line from some other source, because |
309 | | p->start_rule != Py_file_input. If it returned NULL, then it either unexpectedly |
310 | | failed or we're parsing from a string or the REPL. There's a third edge case where |
311 | | we're actually parsing from a file, which has an E_EOF SyntaxError and in that case |
312 | | `PyErr_ProgramTextObject` fails because lineno points to last_file_line + 1, which |
313 | | does not physically exist */ |
314 | 98.2k | assert(p->tok->fp == NULL || p->tok->fp == stdin || p->tok->done == E_EOF); |
315 | | |
316 | 98.2k | if (p->tok->lineno <= lineno && p->tok->inp > p->tok->buf) { |
317 | 98.0k | Py_ssize_t size = p->tok->inp - p->tok->line_start; |
318 | 98.0k | error_line = PyUnicode_DecodeUTF8(p->tok->line_start, size, "replace"); |
319 | 98.0k | } |
320 | 190 | else if (p->tok->fp == NULL || p->tok->fp == stdin) { |
321 | 190 | error_line = get_error_line_from_tokenizer_buffers(p, lineno); |
322 | 190 | } |
323 | 0 | else { |
324 | 0 | error_line = Py_GetConstant(Py_CONSTANT_EMPTY_STR); |
325 | 0 | } |
326 | 98.2k | if (!error_line) { |
327 | 0 | goto error; |
328 | 0 | } |
329 | 98.2k | } |
330 | | |
331 | 98.2k | Py_ssize_t col_number = col_offset; |
332 | 98.2k | Py_ssize_t end_col_number = end_col_offset; |
333 | | |
334 | 98.2k | col_number = _PyPegen_byte_offset_to_character_offset(error_line, col_offset); |
335 | 98.2k | if (col_number < 0) { |
336 | 0 | goto error; |
337 | 0 | } |
338 | | |
339 | 98.2k | if (end_col_offset > 0) { |
340 | 96.0k | end_col_number = _PyPegen_byte_offset_to_character_offset(error_line, end_col_offset); |
341 | 96.0k | if (end_col_number < 0) { |
342 | 0 | goto error; |
343 | 0 | } |
344 | 96.0k | } |
345 | | |
346 | 98.2k | tmp = Py_BuildValue("(OnnNnn)", p->tok->filename, lineno, col_number, error_line, end_lineno, end_col_number); |
347 | 98.2k | if (!tmp) { |
348 | 0 | goto error; |
349 | 0 | } |
350 | 98.2k | value = _PyTuple_FromPair(errstr, tmp); |
351 | 98.2k | Py_DECREF(tmp); |
352 | 98.2k | if (!value) { |
353 | 0 | goto error; |
354 | 0 | } |
355 | 98.2k | PyErr_SetObject(errtype, value); |
356 | | |
357 | 98.2k | Py_DECREF(errstr); |
358 | 98.2k | Py_DECREF(value); |
359 | 98.2k | return NULL; |
360 | | |
361 | 0 | error: |
362 | 0 | Py_XDECREF(errstr); |
363 | 0 | Py_XDECREF(error_line); |
364 | 0 | return NULL; |
365 | 98.2k | } |
366 | | |
367 | | void |
368 | 100k | _Pypegen_set_syntax_error(Parser* p, Token* last_token) { |
369 | | // Existing syntax error |
370 | 100k | if (PyErr_Occurred()) { |
371 | | // Prioritize tokenizer errors to custom syntax errors raised |
372 | | // on the second phase only if the errors come from the parser. |
373 | 5.54k | int is_tok_ok = (p->tok->done == E_DONE || p->tok->done == E_OK); |
374 | 5.54k | if (is_tok_ok && PyErr_ExceptionMatches(PyExc_SyntaxError)) { |
375 | 1.66k | _PyPegen_tokenize_full_source_to_check_for_errors(p); |
376 | 1.66k | } |
377 | | // Propagate the existing syntax error. |
378 | 5.54k | return; |
379 | 5.54k | } |
380 | | // Initialization error |
381 | 94.5k | if (p->fill == 0) { |
382 | 0 | RAISE_SYNTAX_ERROR("error at start before reading any input"); |
383 | 0 | } |
384 | | // Parser encountered EOF (End of File) unexpectedtly |
385 | 94.5k | if (last_token->type == ERRORTOKEN && p->tok->done == E_EOF) { |
386 | 0 | if (p->tok->level) { |
387 | 0 | raise_unclosed_parentheses_error(p); |
388 | 0 | } else { |
389 | 0 | RAISE_SYNTAX_ERROR("unexpected EOF while parsing"); |
390 | 0 | } |
391 | 0 | return; |
392 | 0 | } |
393 | | // Indentation error in the tokenizer |
394 | 94.5k | if (last_token->type == INDENT || last_token->type == DEDENT) { |
395 | 86 | RAISE_INDENTATION_ERROR(last_token->type == INDENT ? "unexpected indent" : "unexpected unindent"); |
396 | 86 | return; |
397 | 86 | } |
398 | | // Unknown error (generic case) |
399 | | |
400 | | // Use the last token we found on the first pass to avoid reporting |
401 | | // incorrect locations for generic syntax errors just because we reached |
402 | | // further away when trying to find specific syntax errors in the second |
403 | | // pass. |
404 | 94.4k | RAISE_SYNTAX_ERROR_KNOWN_LOCATION(last_token, "invalid syntax"); |
405 | | // _PyPegen_tokenize_full_source_to_check_for_errors will override the existing |
406 | | // generic SyntaxError we just raised if errors are found. |
407 | 94.4k | _PyPegen_tokenize_full_source_to_check_for_errors(p); |
408 | 94.4k | } |
409 | | |
410 | | void |
411 | | _Pypegen_stack_overflow(Parser *p) |
412 | 61 | { |
413 | 61 | p->error_indicator = 1; |
414 | 61 | PyErr_SetString(PyExc_MemoryError, |
415 | 61 | "Parser stack overflowed - Python source too complex to parse"); |
416 | 61 | } |