Coverage Report

Created: 2026-06-09 06:53

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/cpython/Parser/pegen_errors.c
Line
Count
Source
1
#include <Python.h>
2
#include <errcode.h>
3
4
#include "pycore_pyerrors.h"      // _PyErr_ProgramDecodedTextObject()
5
#include "pycore_runtime.h"       // _Py_ID()
6
#include "pycore_tuple.h"         // _PyTuple_FromPair
7
#include "lexer/state.h"
8
#include "lexer/lexer.h"
9
#include "pegen.h"
10
11
// TOKENIZER ERRORS
12
13
void
14
_PyPegen_raise_tokenizer_init_error(PyObject *filename)
15
2.60k
{
16
2.60k
    if (!(PyErr_ExceptionMatches(PyExc_LookupError)
17
2.40k
          || PyErr_ExceptionMatches(PyExc_SyntaxError)
18
1.79k
          || PyErr_ExceptionMatches(PyExc_ValueError)
19
62
          || PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))) {
20
62
        return;
21
62
    }
22
2.54k
    PyObject *errstr = NULL;
23
2.54k
    PyObject *tuple = NULL;
24
2.54k
    PyObject *type;
25
2.54k
    PyObject *value;
26
2.54k
    PyObject *tback;
27
2.54k
    PyErr_Fetch(&type, &value, &tback);
28
2.54k
    if (PyErr_GivenExceptionMatches(value, PyExc_SyntaxError)) {
29
615
        if (PyObject_SetAttr(value, &_Py_ID(filename), filename)) {
30
0
            goto error;
31
0
        }
32
615
        PyErr_Restore(type, value, tback);
33
615
        return;
34
615
    }
35
1.93k
    errstr = PyObject_Str(value);
36
1.93k
    if (!errstr) {
37
0
        goto error;
38
0
    }
39
40
1.93k
    PyObject *tmp = Py_BuildValue("(OiiO)", filename, 0, -1, Py_None);
41
1.93k
    if (!tmp) {
42
0
        goto error;
43
0
    }
44
45
1.93k
    tuple = _PyTuple_FromPair(errstr, tmp);
46
1.93k
    Py_DECREF(tmp);
47
1.93k
    if (!tuple) {
48
0
        goto error;
49
0
    }
50
1.93k
    PyErr_SetObject(PyExc_SyntaxError, tuple);
51
52
1.93k
error:
53
1.93k
    Py_XDECREF(type);
54
1.93k
    Py_XDECREF(value);
55
1.93k
    Py_XDECREF(tback);
56
1.93k
    Py_XDECREF(errstr);
57
1.93k
    Py_XDECREF(tuple);
58
1.93k
}
59
60
static inline void
61
1.90k
raise_unclosed_parentheses_error(Parser *p) {
62
1.90k
       int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
63
1.90k
       int error_col = p->tok->parencolstack[p->tok->level-1];
64
1.90k
       RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError,
65
1.90k
                                  error_lineno, error_col, error_lineno, -1,
66
1.90k
                                  "'%c' was never closed",
67
1.90k
                                  p->tok->parenstack[p->tok->level-1]);
68
1.90k
}
69
70
int
71
_Pypegen_tokenizer_error(Parser *p)
72
3.83k
{
73
3.83k
    if (PyErr_Occurred()) {
74
1.90k
        return -1;
75
1.90k
    }
76
77
1.92k
    const char *msg = NULL;
78
1.92k
    PyObject* errtype = PyExc_SyntaxError;
79
1.92k
    Py_ssize_t col_offset = -1;
80
1.92k
    p->error_indicator = 1;
81
1.92k
    switch (p->tok->done) {
82
0
        case E_TOKEN:
83
0
            msg = "invalid token";
84
0
            break;
85
1.87k
        case E_EOF:
86
1.87k
            if (p->tok->level) {
87
1.83k
                raise_unclosed_parentheses_error(p);
88
1.83k
            } else {
89
39
                RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
90
39
            }
91
1.87k
            return -1;
92
7
        case E_DEDENT:
93
7
            RAISE_INDENTATION_ERROR("unindent does not match any outer indentation level");
94
7
            return -1;
95
0
        case E_INTR:
96
0
            if (!PyErr_Occurred()) {
97
0
                PyErr_SetNone(PyExc_KeyboardInterrupt);
98
0
            }
99
0
            return -1;
100
0
        case E_NOMEM:
101
0
            PyErr_NoMemory();
102
0
            return -1;
103
2
        case E_TABSPACE:
104
2
            errtype = PyExc_TabError;
105
2
            msg = "inconsistent use of tabs and spaces in indentation";
106
2
            break;
107
0
        case E_TOODEEP:
108
0
            errtype = PyExc_IndentationError;
109
0
            msg = "too many levels of indentation";
110
0
            break;
111
49
        case E_LINECONT: {
112
49
            col_offset = p->tok->cur - p->tok->buf - 1;
113
49
            msg = "unexpected character after line continuation character";
114
49
            break;
115
0
        }
116
0
        case E_COLUMNOVERFLOW:
117
0
            PyErr_SetString(PyExc_OverflowError,
118
0
                    "Parser column offset overflow - source line is too big");
119
0
            return -1;
120
0
        default:
121
0
            msg = "unknown parsing error";
122
1.92k
    }
123
124
51
    RAISE_ERROR_KNOWN_LOCATION(p, errtype, p->tok->lineno,
125
51
                               col_offset >= 0 ? col_offset : 0,
126
51
                               p->tok->lineno, -1, msg);
127
51
    return -1;
128
1.92k
}
129
130
int
131
_Pypegen_raise_decode_error(Parser *p)
132
124
{
133
124
    assert(PyErr_Occurred());
134
124
    const char *errtype = NULL;
135
124
    if (PyErr_ExceptionMatches(PyExc_UnicodeError)) {
136
118
        errtype = "unicode error";
137
118
    }
138
6
    else if (PyErr_ExceptionMatches(PyExc_ValueError)) {
139
4
        errtype = "value error";
140
4
    }
141
124
    if (errtype) {
142
122
        PyObject *type;
143
122
        PyObject *value;
144
122
        PyObject *tback;
145
122
        PyObject *errstr;
146
122
        PyErr_Fetch(&type, &value, &tback);
147
122
        errstr = PyObject_Str(value);
148
122
        if (errstr) {
149
122
            RAISE_SYNTAX_ERROR("(%s) %U", errtype, errstr);
150
122
            Py_DECREF(errstr);
151
122
        }
152
0
        else {
153
0
            PyErr_Clear();
154
0
            RAISE_SYNTAX_ERROR("(%s) unknown error", errtype);
155
0
        }
156
122
        Py_XDECREF(type);
157
122
        Py_XDECREF(value);
158
122
        Py_XDECREF(tback);
159
122
    }
160
161
124
    return -1;
162
124
}
163
164
static int
165
89.1k
_PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) {
166
    // Tokenize the whole input to see if there are any tokenization
167
    // errors such as mismatching parentheses. These will get priority
168
    // over generic syntax errors only if the line number of the error is
169
    // before the one that we had for the generic error.
170
171
    // We don't want to tokenize to the end for interactive input
172
89.1k
    if (p->tok->prompt != NULL) {
173
0
        return 0;
174
0
    }
175
176
89.1k
    PyObject *type, *value, *traceback;
177
89.1k
    PyErr_Fetch(&type, &value, &traceback);
178
179
89.1k
    Token *current_token = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1];
180
89.1k
    Py_ssize_t current_err_line = current_token->lineno;
181
182
89.1k
    int ret = 0;
183
89.1k
    struct token new_token;
184
89.1k
    _PyToken_Init(&new_token);
185
186
379k
    for (;;) {
187
379k
        switch (_PyTokenizer_Get(p->tok, &new_token)) {
188
2.92k
            case ERRORTOKEN:
189
2.92k
                if (PyErr_Occurred()) {
190
604
                    ret = -1;
191
604
                    goto exit;
192
604
                }
193
2.31k
                if (p->tok->level != 0) {
194
2.29k
                    int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
195
2.29k
                    if (current_err_line > error_lineno) {
196
78
                        raise_unclosed_parentheses_error(p);
197
78
                        ret = -1;
198
78
                        goto exit;
199
78
                    }
200
2.29k
                }
201
2.23k
                break;
202
86.2k
            case ENDMARKER:
203
86.2k
                break;
204
289k
            default:
205
289k
                continue;
206
379k
        }
207
88.4k
        break;
208
379k
    }
209
210
211
89.1k
exit:
212
89.1k
    _PyToken_Free(&new_token);
213
    // If we're in an f-string, we want the syntax error in the expression part
214
    // to propagate, so that tokenizer errors (like expecting '}') that happen afterwards
215
    // do not swallow it.
216
89.1k
    if (PyErr_Occurred() && p->tok->tok_mode_stack_index <= 0) {
217
507
        Py_XDECREF(value);
218
507
        Py_XDECREF(type);
219
507
        Py_XDECREF(traceback);
220
88.6k
    } else {
221
88.6k
        PyErr_Restore(type, value, traceback);
222
88.6k
    }
223
89.1k
    return ret;
224
89.1k
}
225
226
// PARSER ERRORS
227
228
void *
229
_PyPegen_raise_error(Parser *p, PyObject *errtype, int use_mark, const char *errmsg, ...)
230
1.12k
{
231
    // Bail out if we already have an error set.
232
1.12k
    if (p->error_indicator && PyErr_Occurred()) {
233
262
        return NULL;
234
262
    }
235
865
    if (p->fill == 0) {
236
0
        va_list va;
237
0
        va_start(va, errmsg);
238
0
        _PyPegen_raise_error_known_location(p, errtype, 0, 0, 0, -1, errmsg, va);
239
0
        va_end(va);
240
0
        return NULL;
241
0
    }
242
865
    if (use_mark && p->mark == p->fill && _PyPegen_fill_token(p) < 0) {
243
0
        p->error_indicator = 1;
244
0
        return NULL;
245
0
    }
246
865
    Token *t = p->known_err_token != NULL
247
865
                   ? p->known_err_token
248
865
                   : p->tokens[use_mark ? p->mark : p->fill - 1];
249
865
    Py_ssize_t col_offset;
250
865
    Py_ssize_t end_col_offset = -1;
251
865
    if (t->col_offset == -1) {
252
232
        if (p->tok->cur == p->tok->buf) {
253
3
            col_offset = 0;
254
229
        } else {
255
229
            const char* start = p->tok->buf  ? p->tok->line_start : p->tok->buf;
256
229
            col_offset = Py_SAFE_DOWNCAST(p->tok->cur - start, intptr_t, int);
257
229
        }
258
633
    } else {
259
633
        col_offset = t->col_offset + 1;
260
633
    }
261
262
865
    if (t->end_col_offset != -1) {
263
633
        end_col_offset = t->end_col_offset + 1;
264
633
    }
265
266
865
    va_list va;
267
865
    va_start(va, errmsg);
268
865
    _PyPegen_raise_error_known_location(p, errtype, t->lineno, col_offset, t->end_lineno, end_col_offset, errmsg, va);
269
865
    va_end(va);
270
271
865
    return NULL;
272
865
}
273
274
static PyObject *
275
get_error_line_from_tokenizer_buffers(Parser *p, Py_ssize_t lineno)
276
207
{
277
    /* If the file descriptor is interactive, the source lines of the current
278
     * (multi-line) statement are stored in p->tok->interactive_src_start.
279
     * If not, we're parsing from a string, which means that the whole source
280
     * is stored in p->tok->str. */
281
207
    assert((p->tok->fp == NULL && p->tok->str != NULL) || p->tok->fp != NULL);
282
283
207
    char *cur_line = p->tok->fp_interactive ? p->tok->interactive_src_start : p->tok->str;
284
207
    if (cur_line == NULL) {
285
0
        assert(p->tok->fp_interactive);
286
        // We can reach this point if the tokenizer buffers for interactive source have not been
287
        // initialized because we failed to decode the original source with the given locale.
288
0
        return Py_GetConstant(Py_CONSTANT_EMPTY_STR);
289
0
    }
290
291
207
    Py_ssize_t relative_lineno = p->starting_lineno ? lineno - p->starting_lineno + 1 : lineno;
292
207
    const char* buf_end = p->tok->fp_interactive ? p->tok->interactive_src_end : p->tok->inp;
293
294
207
    if (buf_end < cur_line) {
295
0
        buf_end = cur_line + strlen(cur_line);
296
0
    }
297
298
1.49k
    for (int i = 0; i < relative_lineno - 1; i++) {
299
1.29k
        char *new_line = strchr(cur_line, '\n');
300
        // The assert is here for debug builds but the conditional that
301
        // follows is there so in release builds we do not crash at the cost
302
        // to report a potentially wrong line.
303
1.29k
        assert(new_line != NULL && new_line + 1 < buf_end);
304
1.29k
        if (new_line == NULL || new_line + 1 > buf_end) {
305
0
            break;
306
0
        }
307
1.29k
        cur_line = new_line + 1;
308
1.29k
    }
309
310
207
    char *next_newline;
311
207
    if ((next_newline = strchr(cur_line, '\n')) == NULL) { // This is the last line
312
0
        next_newline = cur_line + strlen(cur_line);
313
0
    }
314
207
    return PyUnicode_DecodeUTF8(cur_line, next_newline - cur_line, "replace");
315
207
}
316
317
void *
318
_PyPegen_raise_error_known_location(Parser *p, PyObject *errtype,
319
                                    Py_ssize_t lineno, Py_ssize_t col_offset,
320
                                    Py_ssize_t end_lineno, Py_ssize_t end_col_offset,
321
                                    const char *errmsg, va_list va)
322
92.2k
{
323
    // Bail out if we already have an error set.
324
92.2k
    if (p->error_indicator && PyErr_Occurred()) {
325
926
        return NULL;
326
926
    }
327
91.3k
    PyObject *value = NULL;
328
91.3k
    PyObject *errstr = NULL;
329
91.3k
    PyObject *error_line = NULL;
330
91.3k
    PyObject *tmp = NULL;
331
91.3k
    p->error_indicator = 1;
332
333
91.3k
    if (end_lineno == CURRENT_POS) {
334
24
        end_lineno = p->tok->lineno;
335
24
    }
336
91.3k
    if (end_col_offset == CURRENT_POS) {
337
24
        end_col_offset = p->tok->cur - p->tok->line_start;
338
24
    }
339
340
91.3k
    errstr = PyUnicode_FromFormatV(errmsg, va);
341
91.3k
    if (!errstr) {
342
0
        goto error;
343
0
    }
344
345
91.3k
    if (p->tok->fp_interactive && p->tok->interactive_src_start != NULL) {
346
0
        error_line = get_error_line_from_tokenizer_buffers(p, lineno);
347
0
    }
348
91.3k
    else if (p->start_rule == Py_file_input) {
349
91.3k
        error_line = _PyErr_ProgramDecodedTextObject(p->tok->filename,
350
91.3k
                                                     (int) lineno, p->tok->encoding);
351
91.3k
    }
352
353
91.3k
    if (!error_line) {
354
        /* PyErr_ProgramTextObject was not called or returned NULL. If it was not called,
355
           then we need to find the error line from some other source, because
356
           p->start_rule != Py_file_input. If it returned NULL, then it either unexpectedly
357
           failed or we're parsing from a string or the REPL. There's a third edge case where
358
           we're actually parsing from a file, which has an E_EOF SyntaxError and in that case
359
           `PyErr_ProgramTextObject` fails because lineno points to last_file_line + 1, which
360
           does not physically exist */
361
91.3k
        assert(p->tok->fp == NULL || p->tok->fp == stdin || p->tok->done == E_EOF);
362
363
91.3k
        if (p->tok->lineno <= lineno && p->tok->inp > p->tok->buf) {
364
91.1k
            Py_ssize_t size = p->tok->inp - p->tok->line_start;
365
91.1k
            error_line = PyUnicode_DecodeUTF8(p->tok->line_start, size, "replace");
366
91.1k
        }
367
207
        else if (p->tok->fp == NULL || p->tok->fp == stdin) {
368
207
            error_line = get_error_line_from_tokenizer_buffers(p, lineno);
369
207
        }
370
0
        else {
371
0
            error_line = Py_GetConstant(Py_CONSTANT_EMPTY_STR);
372
0
        }
373
91.3k
        if (!error_line) {
374
0
            goto error;
375
0
        }
376
91.3k
    }
377
378
91.3k
    Py_ssize_t col_number = col_offset;
379
91.3k
    Py_ssize_t end_col_number = end_col_offset;
380
381
91.3k
    col_number = _PyPegen_byte_offset_to_character_offset(error_line, col_offset);
382
91.3k
    if (col_number < 0) {
383
0
        goto error;
384
0
    }
385
386
91.3k
    if (end_col_offset > 0) {
387
89.0k
        end_col_number = _PyPegen_byte_offset_to_character_offset(error_line, end_col_offset);
388
89.0k
        if (end_col_number < 0) {
389
0
            goto error;
390
0
        }
391
89.0k
    }
392
393
91.3k
    tmp = Py_BuildValue("(OnnNnn)", p->tok->filename, lineno, col_number, error_line, end_lineno, end_col_number);
394
91.3k
    if (!tmp) {
395
0
        goto error;
396
0
    }
397
91.3k
    value = _PyTuple_FromPair(errstr, tmp);
398
91.3k
    Py_DECREF(tmp);
399
91.3k
    if (!value) {
400
0
        goto error;
401
0
    }
402
91.3k
    PyErr_SetObject(errtype, value);
403
404
91.3k
    Py_DECREF(errstr);
405
91.3k
    Py_DECREF(value);
406
91.3k
    return NULL;
407
408
0
error:
409
0
    Py_XDECREF(errstr);
410
0
    Py_XDECREF(error_line);
411
0
    return NULL;
412
91.3k
}
413
414
void
415
93.1k
_Pypegen_set_syntax_error(Parser* p, Token* last_token) {
416
    // Existing syntax error
417
93.1k
    if (PyErr_Occurred()) {
418
        // Prioritize tokenizer errors to custom syntax errors raised
419
        // on the second phase only if the errors come from the parser.
420
5.67k
        int is_tok_ok = (p->tok->done == E_DONE || p->tok->done == E_OK);
421
5.67k
        if (is_tok_ok && PyErr_ExceptionMatches(PyExc_SyntaxError)) {
422
1.69k
            _PyPegen_tokenize_full_source_to_check_for_errors(p);
423
1.69k
        }
424
        // Propagate the existing syntax error.
425
5.67k
        return;
426
5.67k
    }
427
    // Initialization error
428
87.5k
    if (p->fill == 0) {
429
0
        RAISE_SYNTAX_ERROR("error at start before reading any input");
430
0
    }
431
    // Parser encountered EOF (End of File) unexpectedtly
432
87.5k
    if (last_token->type == ERRORTOKEN && p->tok->done == E_EOF) {
433
0
        if (p->tok->level) {
434
0
            raise_unclosed_parentheses_error(p);
435
0
        } else {
436
0
            RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
437
0
        }
438
0
        return;
439
0
    }
440
    // Indentation error in the tokenizer
441
87.5k
    if (last_token->type == INDENT || last_token->type == DEDENT) {
442
91
        RAISE_INDENTATION_ERROR(last_token->type == INDENT ? "unexpected indent" : "unexpected unindent");
443
91
        return;
444
91
    }
445
    // Unknown error (generic case)
446
447
    // Use the last token we found on the first pass to avoid reporting
448
    // incorrect locations for generic syntax errors just because we reached
449
    // further away when trying to find specific syntax errors in the second
450
    // pass.
451
87.4k
    RAISE_SYNTAX_ERROR_KNOWN_LOCATION(last_token, "invalid syntax");
452
    // _PyPegen_tokenize_full_source_to_check_for_errors will override the existing
453
    // generic SyntaxError we just raised if errors are found.
454
87.4k
    _PyPegen_tokenize_full_source_to_check_for_errors(p);
455
87.4k
}
456
457
void
458
_Pypegen_stack_overflow(Parser *p)
459
55
{
460
55
    p->error_indicator = 1;
461
55
    PyErr_SetString(PyExc_MemoryError,
462
55
        "Parser stack overflowed - Python source too complex to parse");
463
55
}