Coverage Report

Created: 2026-06-21 06:15

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/cpython/Parser/pegen_errors.c
Line
Count
Source
1
#include <Python.h>
2
#include <errcode.h>
3
4
#include "pycore_pyerrors.h"      // _PyErr_ProgramDecodedTextObject()
5
#include "pycore_runtime.h"       // _Py_ID()
6
#include "pycore_tuple.h"         // _PyTuple_FromPair
7
#include "lexer/state.h"
8
#include "lexer/lexer.h"
9
#include "pegen.h"
10
11
// TOKENIZER ERRORS
12
13
static inline void
14
1.81k
raise_unclosed_parentheses_error(Parser *p) {
15
1.81k
       int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
16
1.81k
       int error_col = p->tok->parencolstack[p->tok->level-1];
17
1.81k
       RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError,
18
1.81k
                                  error_lineno, error_col, error_lineno, -1,
19
1.81k
                                  "'%c' was never closed",
20
1.81k
                                  p->tok->parenstack[p->tok->level-1]);
21
1.81k
}
22
23
int
24
_Pypegen_tokenizer_error(Parser *p)
25
3.72k
{
26
3.72k
    if (PyErr_Occurred()) {
27
1.87k
        return -1;
28
1.87k
    }
29
30
1.85k
    const char *msg = NULL;
31
1.85k
    PyObject* errtype = PyExc_SyntaxError;
32
1.85k
    Py_ssize_t col_offset = -1;
33
1.85k
    p->error_indicator = 1;
34
1.85k
    switch (p->tok->done) {
35
0
        case E_TOKEN:
36
0
            msg = "invalid token";
37
0
            break;
38
1.79k
        case E_EOF:
39
1.79k
            if (p->tok->level) {
40
1.75k
                raise_unclosed_parentheses_error(p);
41
1.75k
            } else {
42
40
                RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
43
40
            }
44
1.79k
            return -1;
45
8
        case E_DEDENT:
46
8
            RAISE_INDENTATION_ERROR("unindent does not match any outer indentation level");
47
8
            return -1;
48
0
        case E_INTR:
49
0
            if (!PyErr_Occurred()) {
50
0
                PyErr_SetNone(PyExc_KeyboardInterrupt);
51
0
            }
52
0
            return -1;
53
0
        case E_NOMEM:
54
0
            PyErr_NoMemory();
55
0
            return -1;
56
2
        case E_TABSPACE:
57
2
            errtype = PyExc_TabError;
58
2
            msg = "inconsistent use of tabs and spaces in indentation";
59
2
            break;
60
0
        case E_TOODEEP:
61
0
            errtype = PyExc_IndentationError;
62
0
            msg = "too many levels of indentation";
63
0
            break;
64
52
        case E_LINECONT: {
65
52
            col_offset = p->tok->cur - p->tok->buf - 1;
66
52
            msg = "unexpected character after line continuation character";
67
52
            break;
68
0
        }
69
0
        case E_COLUMNOVERFLOW:
70
0
            PyErr_SetString(PyExc_OverflowError,
71
0
                    "Parser column offset overflow - source line is too big");
72
0
            return -1;
73
0
        default:
74
0
            msg = "unknown parsing error";
75
1.85k
    }
76
77
54
    RAISE_ERROR_KNOWN_LOCATION(p, errtype, p->tok->lineno,
78
54
                               col_offset >= 0 ? col_offset : 0,
79
54
                               p->tok->lineno, -1, msg);
80
54
    return -1;
81
1.85k
}
82
83
int
84
_Pypegen_raise_decode_error(Parser *p)
85
125
{
86
125
    assert(PyErr_Occurred());
87
125
    const char *errtype = NULL;
88
125
    if (PyErr_ExceptionMatches(PyExc_UnicodeError)) {
89
117
        errtype = "unicode error";
90
117
    }
91
8
    else if (PyErr_ExceptionMatches(PyExc_ValueError)) {
92
6
        errtype = "value error";
93
6
    }
94
125
    if (errtype) {
95
123
        PyObject *type;
96
123
        PyObject *value;
97
123
        PyObject *tback;
98
123
        PyObject *errstr;
99
123
        PyErr_Fetch(&type, &value, &tback);
100
123
        errstr = PyObject_Str(value);
101
123
        if (errstr) {
102
123
            RAISE_SYNTAX_ERROR("(%s) %U", errtype, errstr);
103
123
            Py_DECREF(errstr);
104
123
        }
105
0
        else {
106
0
            PyErr_Clear();
107
0
            RAISE_SYNTAX_ERROR("(%s) unknown error", errtype);
108
0
        }
109
123
        Py_XDECREF(type);
110
123
        Py_XDECREF(value);
111
123
        Py_XDECREF(tback);
112
123
    }
113
114
125
    return -1;
115
125
}
116
117
static int
118
96.1k
_PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) {
119
    // Tokenize the whole input to see if there are any tokenization
120
    // errors such as mismatching parentheses. These will get priority
121
    // over generic syntax errors only if the line number of the error is
122
    // before the one that we had for the generic error.
123
124
    // We don't want to tokenize to the end for interactive input
125
96.1k
    if (p->tok->prompt != NULL) {
126
0
        return 0;
127
0
    }
128
129
96.1k
    PyObject *type, *value, *traceback;
130
96.1k
    PyErr_Fetch(&type, &value, &traceback);
131
132
96.1k
    Token *current_token = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1];
133
96.1k
    Py_ssize_t current_err_line = current_token->lineno;
134
135
96.1k
    int ret = 0;
136
96.1k
    struct token new_token;
137
96.1k
    _PyToken_Init(&new_token);
138
139
407k
    for (;;) {
140
407k
        switch (_PyTokenizer_Get(p->tok, &new_token)) {
141
2.85k
            case ERRORTOKEN:
142
2.85k
                if (PyErr_Occurred()) {
143
604
                    ret = -1;
144
604
                    goto exit;
145
604
                }
146
2.25k
                if (p->tok->level != 0) {
147
2.23k
                    int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
148
2.23k
                    if (current_err_line > error_lineno) {
149
63
                        raise_unclosed_parentheses_error(p);
150
63
                        ret = -1;
151
63
                        goto exit;
152
63
                    }
153
2.23k
                }
154
2.19k
                break;
155
93.2k
            case ENDMARKER:
156
93.2k
                break;
157
311k
            default:
158
311k
                continue;
159
407k
        }
160
95.4k
        break;
161
407k
    }
162
163
164
96.1k
exit:
165
96.1k
    _PyToken_Free(&new_token);
166
    // If we're in an f-string, we want the syntax error in the expression part
167
    // to propagate, so that tokenizer errors (like expecting '}') that happen afterwards
168
    // do not swallow it.
169
96.1k
    if (PyErr_Occurred() && p->tok->tok_mode_stack_index <= 0) {
170
507
        Py_XDECREF(value);
171
507
        Py_XDECREF(type);
172
507
        Py_XDECREF(traceback);
173
95.6k
    } else {
174
95.6k
        PyErr_Restore(type, value, traceback);
175
95.6k
    }
176
96.1k
    return ret;
177
96.1k
}
178
179
// PARSER ERRORS
180
181
void *
182
_PyPegen_raise_error(Parser *p, PyObject *errtype, int use_mark, const char *errmsg, ...)
183
1.10k
{
184
    // Bail out if we already have an error set.
185
1.10k
    if (p->error_indicator && PyErr_Occurred()) {
186
253
        return NULL;
187
253
    }
188
853
    if (p->fill == 0) {
189
0
        va_list va;
190
0
        va_start(va, errmsg);
191
0
        _PyPegen_raise_error_known_location(p, errtype, 0, 0, 0, -1, errmsg, va);
192
0
        va_end(va);
193
0
        return NULL;
194
0
    }
195
853
    if (use_mark && p->mark == p->fill && _PyPegen_fill_token(p) < 0) {
196
0
        p->error_indicator = 1;
197
0
        return NULL;
198
0
    }
199
853
    Token *t = p->known_err_token != NULL
200
853
                   ? p->known_err_token
201
853
                   : p->tokens[use_mark ? p->mark : p->fill - 1];
202
853
    Py_ssize_t col_offset;
203
853
    Py_ssize_t end_col_offset = -1;
204
853
    if (t->col_offset == -1) {
205
230
        if (p->tok->cur == p->tok->buf) {
206
3
            col_offset = 0;
207
227
        } else {
208
227
            const char* start = p->tok->buf  ? p->tok->line_start : p->tok->buf;
209
227
            col_offset = Py_SAFE_DOWNCAST(p->tok->cur - start, intptr_t, int);
210
227
        }
211
623
    } else {
212
623
        col_offset = t->col_offset + 1;
213
623
    }
214
215
853
    if (t->end_col_offset != -1) {
216
623
        end_col_offset = t->end_col_offset + 1;
217
623
    }
218
219
853
    va_list va;
220
853
    va_start(va, errmsg);
221
853
    _PyPegen_raise_error_known_location(p, errtype, t->lineno, col_offset, t->end_lineno, end_col_offset, errmsg, va);
222
853
    va_end(va);
223
224
853
    return NULL;
225
853
}
226
227
static PyObject *
228
get_error_line_from_tokenizer_buffers(Parser *p, Py_ssize_t lineno)
229
190
{
230
    /* If the file descriptor is interactive, the source lines of the current
231
     * (multi-line) statement are stored in p->tok->interactive_src_start.
232
     * If not, we're parsing from a string, which means that the whole source
233
     * is stored in p->tok->str. */
234
190
    assert((p->tok->fp == NULL && p->tok->str != NULL) || p->tok->fp != NULL);
235
236
190
    char *cur_line = p->tok->fp_interactive ? p->tok->interactive_src_start : p->tok->str;
237
190
    if (cur_line == NULL) {
238
0
        assert(p->tok->fp_interactive);
239
        // We can reach this point if the tokenizer buffers for interactive source have not been
240
        // initialized because we failed to decode the original source with the given locale.
241
0
        return Py_GetConstant(Py_CONSTANT_EMPTY_STR);
242
0
    }
243
244
190
    Py_ssize_t relative_lineno = p->starting_lineno ? lineno - p->starting_lineno + 1 : lineno;
245
190
    const char* buf_end = p->tok->fp_interactive ? p->tok->interactive_src_end : p->tok->inp;
246
247
190
    if (buf_end < cur_line) {
248
0
        buf_end = cur_line + strlen(cur_line);
249
0
    }
250
251
2.25k
    for (int i = 0; i < relative_lineno - 1; i++) {
252
2.06k
        char *new_line = strchr(cur_line, '\n');
253
        // The assert is here for debug builds but the conditional that
254
        // follows is there so in release builds we do not crash at the cost
255
        // to report a potentially wrong line.
256
2.06k
        assert(new_line != NULL && new_line + 1 < buf_end);
257
2.06k
        if (new_line == NULL || new_line + 1 > buf_end) {
258
0
            break;
259
0
        }
260
2.06k
        cur_line = new_line + 1;
261
2.06k
    }
262
263
190
    char *next_newline;
264
190
    if ((next_newline = strchr(cur_line, '\n')) == NULL) { // This is the last line
265
0
        next_newline = cur_line + strlen(cur_line);
266
0
    }
267
190
    return PyUnicode_DecodeUTF8(cur_line, next_newline - cur_line, "replace");
268
190
}
269
270
void *
271
_PyPegen_raise_error_known_location(Parser *p, PyObject *errtype,
272
                                    Py_ssize_t lineno, Py_ssize_t col_offset,
273
                                    Py_ssize_t end_lineno, Py_ssize_t end_col_offset,
274
                                    const char *errmsg, va_list va)
275
98.6k
{
276
    // Bail out if we already have an error set.
277
98.6k
    if (p->error_indicator && PyErr_Occurred()) {
278
466
        return NULL;
279
466
    }
280
98.2k
    PyObject *value = NULL;
281
98.2k
    PyObject *errstr = NULL;
282
98.2k
    PyObject *error_line = NULL;
283
98.2k
    PyObject *tmp = NULL;
284
98.2k
    p->error_indicator = 1;
285
286
98.2k
    if (end_lineno == CURRENT_POS) {
287
25
        end_lineno = p->tok->lineno;
288
25
    }
289
98.2k
    if (end_col_offset == CURRENT_POS) {
290
25
        end_col_offset = p->tok->cur - p->tok->line_start;
291
25
    }
292
293
98.2k
    errstr = PyUnicode_FromFormatV(errmsg, va);
294
98.2k
    if (!errstr) {
295
0
        goto error;
296
0
    }
297
298
98.2k
    if (p->tok->fp_interactive && p->tok->interactive_src_start != NULL) {
299
0
        error_line = get_error_line_from_tokenizer_buffers(p, lineno);
300
0
    }
301
98.2k
    else if (p->start_rule == Py_file_input) {
302
98.2k
        error_line = _PyErr_ProgramDecodedTextObject(p->tok->filename,
303
98.2k
                                                     (int) lineno, p->tok->encoding);
304
98.2k
    }
305
306
98.2k
    if (!error_line) {
307
        /* PyErr_ProgramTextObject was not called or returned NULL. If it was not called,
308
           then we need to find the error line from some other source, because
309
           p->start_rule != Py_file_input. If it returned NULL, then it either unexpectedly
310
           failed or we're parsing from a string or the REPL. There's a third edge case where
311
           we're actually parsing from a file, which has an E_EOF SyntaxError and in that case
312
           `PyErr_ProgramTextObject` fails because lineno points to last_file_line + 1, which
313
           does not physically exist */
314
98.2k
        assert(p->tok->fp == NULL || p->tok->fp == stdin || p->tok->done == E_EOF);
315
316
98.2k
        if (p->tok->lineno <= lineno && p->tok->inp > p->tok->buf) {
317
98.0k
            Py_ssize_t size = p->tok->inp - p->tok->line_start;
318
98.0k
            error_line = PyUnicode_DecodeUTF8(p->tok->line_start, size, "replace");
319
98.0k
        }
320
190
        else if (p->tok->fp == NULL || p->tok->fp == stdin) {
321
190
            error_line = get_error_line_from_tokenizer_buffers(p, lineno);
322
190
        }
323
0
        else {
324
0
            error_line = Py_GetConstant(Py_CONSTANT_EMPTY_STR);
325
0
        }
326
98.2k
        if (!error_line) {
327
0
            goto error;
328
0
        }
329
98.2k
    }
330
331
98.2k
    Py_ssize_t col_number = col_offset;
332
98.2k
    Py_ssize_t end_col_number = end_col_offset;
333
334
98.2k
    col_number = _PyPegen_byte_offset_to_character_offset(error_line, col_offset);
335
98.2k
    if (col_number < 0) {
336
0
        goto error;
337
0
    }
338
339
98.2k
    if (end_col_offset > 0) {
340
96.0k
        end_col_number = _PyPegen_byte_offset_to_character_offset(error_line, end_col_offset);
341
96.0k
        if (end_col_number < 0) {
342
0
            goto error;
343
0
        }
344
96.0k
    }
345
346
98.2k
    tmp = Py_BuildValue("(OnnNnn)", p->tok->filename, lineno, col_number, error_line, end_lineno, end_col_number);
347
98.2k
    if (!tmp) {
348
0
        goto error;
349
0
    }
350
98.2k
    value = _PyTuple_FromPair(errstr, tmp);
351
98.2k
    Py_DECREF(tmp);
352
98.2k
    if (!value) {
353
0
        goto error;
354
0
    }
355
98.2k
    PyErr_SetObject(errtype, value);
356
357
98.2k
    Py_DECREF(errstr);
358
98.2k
    Py_DECREF(value);
359
98.2k
    return NULL;
360
361
0
error:
362
0
    Py_XDECREF(errstr);
363
0
    Py_XDECREF(error_line);
364
0
    return NULL;
365
98.2k
}
366
367
void
368
100k
_Pypegen_set_syntax_error(Parser* p, Token* last_token) {
369
    // Existing syntax error
370
100k
    if (PyErr_Occurred()) {
371
        // Prioritize tokenizer errors to custom syntax errors raised
372
        // on the second phase only if the errors come from the parser.
373
5.54k
        int is_tok_ok = (p->tok->done == E_DONE || p->tok->done == E_OK);
374
5.54k
        if (is_tok_ok && PyErr_ExceptionMatches(PyExc_SyntaxError)) {
375
1.66k
            _PyPegen_tokenize_full_source_to_check_for_errors(p);
376
1.66k
        }
377
        // Propagate the existing syntax error.
378
5.54k
        return;
379
5.54k
    }
380
    // Initialization error
381
94.5k
    if (p->fill == 0) {
382
0
        RAISE_SYNTAX_ERROR("error at start before reading any input");
383
0
    }
384
    // Parser encountered EOF (End of File) unexpectedtly
385
94.5k
    if (last_token->type == ERRORTOKEN && p->tok->done == E_EOF) {
386
0
        if (p->tok->level) {
387
0
            raise_unclosed_parentheses_error(p);
388
0
        } else {
389
0
            RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
390
0
        }
391
0
        return;
392
0
    }
393
    // Indentation error in the tokenizer
394
94.5k
    if (last_token->type == INDENT || last_token->type == DEDENT) {
395
86
        RAISE_INDENTATION_ERROR(last_token->type == INDENT ? "unexpected indent" : "unexpected unindent");
396
86
        return;
397
86
    }
398
    // Unknown error (generic case)
399
400
    // Use the last token we found on the first pass to avoid reporting
401
    // incorrect locations for generic syntax errors just because we reached
402
    // further away when trying to find specific syntax errors in the second
403
    // pass.
404
94.4k
    RAISE_SYNTAX_ERROR_KNOWN_LOCATION(last_token, "invalid syntax");
405
    // _PyPegen_tokenize_full_source_to_check_for_errors will override the existing
406
    // generic SyntaxError we just raised if errors are found.
407
94.4k
    _PyPegen_tokenize_full_source_to_check_for_errors(p);
408
94.4k
}
409
410
void
411
_Pypegen_stack_overflow(Parser *p)
412
61
{
413
61
    p->error_indicator = 1;
414
61
    PyErr_SetString(PyExc_MemoryError,
415
61
        "Parser stack overflowed - Python source too complex to parse");
416
61
}