Coverage Report

Created: 2025-10-12 06:48

/src/cpython/Parser/pegen_errors.c
Line |  Count | Source
-----+--------+------------------------------------------------------------
   1 |        | #include <Python.h>
   2 |        | #include <errcode.h>
   3 |        |
   4 |        | #include "pycore_pyerrors.h"      // _PyErr_ProgramDecodedTextObject()
   5 |        | #include "pycore_runtime.h"       // _Py_ID()
   6 |        | #include "lexer/state.h"
   7 |        | #include "lexer/lexer.h"
   8 |        | #include "pegen.h"
   9 |        |
  10 |        | // TOKENIZER ERRORS
  11 |        |
  12 |        | void
  13 |        | _PyPegen_raise_tokenizer_init_error(PyObject *filename)
  14 |  2.63k | {
  15 |  2.63k |     if (!(PyErr_ExceptionMatches(PyExc_LookupError)
  16 |  2.47k |           || PyErr_ExceptionMatches(PyExc_SyntaxError)
  17 |  1.87k |           || PyErr_ExceptionMatches(PyExc_ValueError)
  18 |     60 |           || PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))) {
  19 |     60 |         return;
  20 |     60 |     }
  21 |  2.57k |     PyObject *errstr = NULL;
  22 |  2.57k |     PyObject *tuple = NULL;
  23 |  2.57k |     PyObject *type;
  24 |  2.57k |     PyObject *value;
  25 |  2.57k |     PyObject *tback;
  26 |  2.57k |     PyErr_Fetch(&type, &value, &tback);
  27 |  2.57k |     if (PyErr_GivenExceptionMatches(value, PyExc_SyntaxError)) {
  28 |    608 |         if (PyObject_SetAttr(value, &_Py_ID(filename), filename)) {
  29 |      0 |             goto error;
  30 |      0 |         }
  31 |    608 |         PyErr_Restore(type, value, tback);
  32 |    608 |         return;
  33 |    608 |     }
  34 |  1.96k |     errstr = PyObject_Str(value);
  35 |  1.96k |     if (!errstr) {
  36 |      0 |         goto error;
  37 |      0 |     }
  38 |        |
  39 |  1.96k |     PyObject *tmp = Py_BuildValue("(OiiO)", filename, 0, -1, Py_None);
  40 |  1.96k |     if (!tmp) {
  41 |      0 |         goto error;
  42 |      0 |     }
  43 |        |
  44 |  1.96k |     tuple = PyTuple_Pack(2, errstr, tmp);
  45 |  1.96k |     Py_DECREF(tmp);
  46 |  1.96k |     if (!value) {
  47 |      0 |         goto error;
  48 |      0 |     }
  49 |  1.96k |     PyErr_SetObject(PyExc_SyntaxError, tuple);
  50 |        |
  51 |  1.96k | error:
  52 |  1.96k |     Py_XDECREF(type);
  53 |  1.96k |     Py_XDECREF(value);
  54 |  1.96k |     Py_XDECREF(tback);
  55 |  1.96k |     Py_XDECREF(errstr);
  56 |  1.96k |     Py_XDECREF(tuple);
  57 |  1.96k | }
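
Note: _PyPegen_raise_tokenizer_init_error above rewrites whatever exception tokenizer
initialization left pending into a SyntaxError whose value is the usual
(message, (filename, lineno, offset, text)) tuple. Below is a minimal standalone
sketch of the same fetch/wrap/set pattern; the helper name
set_syntax_error_from_pending and the placeholder location values are illustrative
and are not part of pegen_errors.c.

    #include <Python.h>

    /* Hypothetical helper: convert the currently pending exception into a
     * SyntaxError attributed to `filename`, mirroring the fetch/wrap/set
     * pattern of _PyPegen_raise_tokenizer_init_error. */
    static void
    set_syntax_error_from_pending(PyObject *filename)
    {
        PyObject *type, *value, *tback;
        PyErr_Fetch(&type, &value, &tback);   /* take ownership of the pending exception */

        PyObject *errstr = value ? PyObject_Str(value) : NULL;
        if (errstr != NULL) {
            /* Location is unknown here, so use the same placeholders the parser
             * uses: lineno 0, offset -1, no source text. */
            PyObject *loc = Py_BuildValue("(OiiO)", filename, 0, -1, Py_None);
            if (loc != NULL) {
                PyObject *args = PyTuple_Pack(2, errstr, loc);
                Py_DECREF(loc);
                if (args != NULL) {
                    PyErr_SetObject(PyExc_SyntaxError, args);
                    Py_DECREF(args);
                }
            }
            Py_DECREF(errstr);
        }
        Py_XDECREF(type);
        Py_XDECREF(value);
        Py_XDECREF(tback);
    }

Unlike the function above, this sketch always replaces the fetched exception; the
original restores a pre-existing SyntaxError untouched after attaching the filename.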
  58 |        |
  59 |        | static inline void
  60 |  1.80k | raise_unclosed_parentheses_error(Parser *p) {
  61 |  1.80k |        int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
  62 |  1.80k |        int error_col = p->tok->parencolstack[p->tok->level-1];
  63 |  1.80k |        RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError,
  64 |  1.80k |                                   error_lineno, error_col, error_lineno, -1,
  65 |  1.80k |                                   "'%c' was never closed",
  66 |  1.80k |                                   p->tok->parenstack[p->tok->level-1]);
  67 |  1.80k | }
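
Note: raise_unclosed_parentheses_error points at the opening bracket recorded on the
tokenizer's paren stack rather than at the position where tokenization stopped. A
small embedding sketch that is expected to exercise it; the input and the file name
<unclosed> are made up for illustration.

    #include <Python.h>

    int
    main(void)
    {
        Py_Initialize();

        /* The '(' on line 1 is never closed, so compilation should fail with
         * "SyntaxError: '(' was never closed", attributed to line 1. */
        PyObject *code = Py_CompileString("x = (1 +\n", "<unclosed>", Py_file_input);
        if (code == NULL) {
            PyErr_Print();      /* prints the SyntaxError raised by the parser */
        }
        else {
            Py_DECREF(code);
        }
        return Py_FinalizeEx();
    }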
  68 |        |
  69 |        | int
  70 |        | _Pypegen_tokenizer_error(Parser *p)
  71 |  3.78k | {
  72 |  3.78k |     if (PyErr_Occurred()) {
  73 |  1.96k |         return -1;
  74 |  1.96k |     }
  75 |        |
  76 |  1.81k |     const char *msg = NULL;
  77 |  1.81k |     PyObject* errtype = PyExc_SyntaxError;
  78 |  1.81k |     Py_ssize_t col_offset = -1;
  79 |  1.81k |     p->error_indicator = 1;
  80 |  1.81k |     switch (p->tok->done) {
  81 |      0 |         case E_TOKEN:
  82 |      0 |             msg = "invalid token";
  83 |      0 |             break;
  84 |  1.76k |         case E_EOF:
  85 |  1.76k |             if (p->tok->level) {
  86 |  1.72k |                 raise_unclosed_parentheses_error(p);
  87 |  1.72k |             } else {
  88 |     43 |                 RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
  89 |     43 |             }
  90 |  1.76k |             return -1;
  91 |      3 |         case E_DEDENT:
  92 |      3 |             RAISE_INDENTATION_ERROR("unindent does not match any outer indentation level");
  93 |      3 |             return -1;
  94 |      0 |         case E_INTR:
  95 |      0 |             if (!PyErr_Occurred()) {
  96 |      0 |                 PyErr_SetNone(PyExc_KeyboardInterrupt);
  97 |      0 |             }
  98 |      0 |             return -1;
  99 |      0 |         case E_NOMEM:
 100 |      0 |             PyErr_NoMemory();
 101 |      0 |             return -1;
 102 |      2 |         case E_TABSPACE:
 103 |      2 |             errtype = PyExc_TabError;
 104 |      2 |             msg = "inconsistent use of tabs and spaces in indentation";
 105 |      2 |             break;
 106 |      0 |         case E_TOODEEP:
 107 |      0 |             errtype = PyExc_IndentationError;
 108 |      0 |             msg = "too many levels of indentation";
 109 |      0 |             break;
 110 |     43 |         case E_LINECONT: {
 111 |     43 |             col_offset = p->tok->cur - p->tok->buf - 1;
 112 |     43 |             msg = "unexpected character after line continuation character";
 113 |     43 |             break;
 114 |      0 |         }
 115 |      0 |         case E_COLUMNOVERFLOW:
 116 |      0 |             PyErr_SetString(PyExc_OverflowError,
 117 |      0 |                     "Parser column offset overflow - source line is too big");
 118 |      0 |             return -1;
 119 |      0 |         default:
 120 |      0 |             msg = "unknown parsing error";
 121 |  1.81k |     }
 122 |        |
 123 |     45 |     RAISE_ERROR_KNOWN_LOCATION(p, errtype, p->tok->lineno,
 124 |     45 |                                col_offset >= 0 ? col_offset : 0,
 125 |     45 |                                p->tok->lineno, -1, msg);
 126 |     45 |     return -1;
 127 |  1.81k | }
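
Note: several arms of the switch above correspond to ordinary source-level mistakes.
The inputs below are assumed mappings (not derived from this coverage run) and can be
substituted for the source string in the embedding sketch shown earlier, after
raise_unclosed_parentheses_error.

    /* Assumed inputs, one per tokenizer state reachable from plain source text: */
    static const char *tokenizer_error_samples[] = {
        "x = (1 +\n",                       /* E_EOF with an open '(': "'(' was never closed" */
        "if x:\n\ty = 1\n        z = 2\n",  /* E_TABSPACE: TabError about tabs vs. spaces */
        "x = 1 \\ + 2\n",                   /* E_LINECONT: unexpected character after '\\' */
    };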
 128 |        |
 129 |        | int
 130 |        | _Pypegen_raise_decode_error(Parser *p)
 131 |    112 | {
 132 |    112 |     assert(PyErr_Occurred());
 133 |    112 |     const char *errtype = NULL;
 134 |    112 |     if (PyErr_ExceptionMatches(PyExc_UnicodeError)) {
 135 |    104 |         errtype = "unicode error";
 136 |    104 |     }
 137 |      8 |     else if (PyErr_ExceptionMatches(PyExc_ValueError)) {
 138 |      4 |         errtype = "value error";
 139 |      4 |     }
 140 |    112 |     if (errtype) {
 141 |    108 |         PyObject *type;
 142 |    108 |         PyObject *value;
 143 |    108 |         PyObject *tback;
 144 |    108 |         PyObject *errstr;
 145 |    108 |         PyErr_Fetch(&type, &value, &tback);
 146 |    108 |         errstr = PyObject_Str(value);
 147 |    108 |         if (errstr) {
 148 |    108 |             RAISE_SYNTAX_ERROR("(%s) %U", errtype, errstr);
 149 |    108 |             Py_DECREF(errstr);
 150 |    108 |         }
 151 |      0 |         else {
 152 |      0 |             PyErr_Clear();
 153 |      0 |             RAISE_SYNTAX_ERROR("(%s) unknown error", errtype);
 154 |      0 |         }
 155 |    108 |         Py_XDECREF(type);
 156 |    108 |         Py_XDECREF(value);
 157 |    108 |         Py_XDECREF(tback);
 158 |    108 |     }
 159 |        |
 160 |    112 |     return -1;
 161 |    112 | }
 162 |        |
 163 |        | static int
 164 |  7.94k | _PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) {
 165 |        |     // Tokenize the whole input to see if there are any tokenization
 166 |        |     // errors such as mismatching parentheses. These will get priority
 167 |        |     // over generic syntax errors only if the line number of the error is
 168 |        |     // before the one that we had for the generic error.
 169 |        |
 170 |        |     // We don't want to tokenize to the end for interactive input
 171 |  7.94k |     if (p->tok->prompt != NULL) {
 172 |      0 |         return 0;
 173 |      0 |     }
 174 |        |
 175 |  7.94k |     PyObject *type, *value, *traceback;
 176 |  7.94k |     PyErr_Fetch(&type, &value, &traceback);
 177 |        |
 178 |  7.94k |     Token *current_token = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1];
 179 |  7.94k |     Py_ssize_t current_err_line = current_token->lineno;
 180 |        |
 181 |  7.94k |     int ret = 0;
 182 |  7.94k |     struct token new_token;
 183 |  7.94k |     _PyToken_Init(&new_token);
 184 |        |
 185 |  43.4k |     for (;;) {
 186 |  43.4k |         switch (_PyTokenizer_Get(p->tok, &new_token)) {
 187 |  2.79k |             case ERRORTOKEN:
 188 |  2.79k |                 if (PyErr_Occurred()) {
 189 |    467 |                     ret = -1;
 190 |    467 |                     goto exit;
 191 |    467 |                 }
 192 |  2.32k |                 if (p->tok->level != 0) {
 193 |  2.30k |                     int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
 194 |  2.30k |                     if (current_err_line > error_lineno) {
 195 |     76 |                         raise_unclosed_parentheses_error(p);
 196 |     76 |                         ret = -1;
 197 |     76 |                         goto exit;
 198 |     76 |                     }
 199 |  2.30k |                 }
 200 |  2.24k |                 break;
 201 |  5.15k |             case ENDMARKER:
 202 |  5.15k |                 break;
 203 |  35.4k |             default:
 204 |  35.4k |                 continue;
 205 |  43.4k |         }
 206 |  7.39k |         break;
 207 |  43.4k |     }
 208 |        |
 209 |        |
 210 |  7.94k | exit:
 211 |  7.94k |     _PyToken_Free(&new_token);
 212 |        |     // If we're in an f-string, we want the syntax error in the expression part
 213 |        |     // to propagate, so that tokenizer errors (like expecting '}') that happen afterwards
 214 |        |     // do not swallow it.
 215 |  7.94k |     if (PyErr_Occurred() && p->tok->tok_mode_stack_index <= 0) {
 216 |    366 |         Py_XDECREF(value);
 217 |    366 |         Py_XDECREF(type);
 218 |    366 |         Py_XDECREF(traceback);
 219 |  7.57k |     } else {
 220 |  7.57k |         PyErr_Restore(type, value, traceback);
 221 |  7.57k |     }
 222 |  7.94k |     return ret;
 223 |  7.94k | }
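
Note: the second tokenization pass above only replaces an already-raised generic error
when the mismatched bracket was opened on an earlier line than that error
(current_err_line > error_lineno). A hedged example of input where that should kick
in; the expected outcome is inferred from the code, not from the coverage data.

    #include <Python.h>

    int
    main(void)
    {
        Py_Initialize();

        /* The parser's generic "invalid syntax" would point inside the `def`
         * statement, but the '(' left open on line 1 comes earlier, so the
         * re-tokenization pass is expected to replace it with
         * "SyntaxError: '(' was never closed" at line 1. */
        const char *src =
            "values = foo(1, 2\n"
            "def g():\n"
            "    return 3\n";
        if (Py_CompileString(src, "<priority>", Py_file_input) == NULL) {
            PyErr_Print();
        }
        return Py_FinalizeEx();
    }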
 224 |        |
 225 |        | // PARSER ERRORS
 226 |        |
 227 |        | void *
 228 |        | _PyPegen_raise_error(Parser *p, PyObject *errtype, int use_mark, const char *errmsg, ...)
 229 |  1.21k | {
 230 |        |     // Bail out if we already have an error set.
 231 |  1.21k |     if (p->error_indicator && PyErr_Occurred()) {
 232 |    231 |         return NULL;
 233 |    231 |     }
 234 |    982 |     if (p->fill == 0) {
 235 |      0 |         va_list va;
 236 |      0 |         va_start(va, errmsg);
 237 |      0 |         _PyPegen_raise_error_known_location(p, errtype, 0, 0, 0, -1, errmsg, va);
 238 |      0 |         va_end(va);
 239 |      0 |         return NULL;
 240 |      0 |     }
 241 |    982 |     if (use_mark && p->mark == p->fill && _PyPegen_fill_token(p) < 0) {
 242 |      0 |         p->error_indicator = 1;
 243 |      0 |         return NULL;
 244 |      0 |     }
 245 |    982 |     Token *t = p->known_err_token != NULL
 246 |    982 |                    ? p->known_err_token
 247 |    982 |                    : p->tokens[use_mark ? p->mark : p->fill - 1];
 248 |    982 |     Py_ssize_t col_offset;
 249 |    982 |     Py_ssize_t end_col_offset = -1;
 250 |    982 |     if (t->col_offset == -1) {
 251 |    283 |         if (p->tok->cur == p->tok->buf) {
 252 |      3 |             col_offset = 0;
 253 |    280 |         } else {
 254 |    280 |             const char* start = p->tok->buf  ? p->tok->line_start : p->tok->buf;
 255 |    280 |             col_offset = Py_SAFE_DOWNCAST(p->tok->cur - start, intptr_t, int);
 256 |    280 |         }
 257 |    699 |     } else {
 258 |    699 |         col_offset = t->col_offset + 1;
 259 |    699 |     }
 260 |        |
 261 |    982 |     if (t->end_col_offset != -1) {
 262 |    699 |         end_col_offset = t->end_col_offset + 1;
 263 |    699 |     }
 264 |        |
 265 |    982 |     va_list va;
 266 |    982 |     va_start(va, errmsg);
 267 |    982 |     _PyPegen_raise_error_known_location(p, errtype, t->lineno, col_offset, t->end_lineno, end_col_offset, errmsg, va);
 268 |    982 |     va_end(va);
 269 |        |
 270 |    982 |     return NULL;
 271 |    982 | }
 272 |        |
 273 |        | static PyObject *
 274 |        | get_error_line_from_tokenizer_buffers(Parser *p, Py_ssize_t lineno)
 275 |    172 | {
 276 |        |     /* If the file descriptor is interactive, the source lines of the current
 277 |        |      * (multi-line) statement are stored in p->tok->interactive_src_start.
 278 |        |      * If not, we're parsing from a string, which means that the whole source
 279 |        |      * is stored in p->tok->str. */
 280 |    172 |     assert((p->tok->fp == NULL && p->tok->str != NULL) || p->tok->fp != NULL);
 281 |        |
 282 |    172 |     char *cur_line = p->tok->fp_interactive ? p->tok->interactive_src_start : p->tok->str;
 283 |    172 |     if (cur_line == NULL) {
 284 |      0 |         assert(p->tok->fp_interactive);
 285 |        |         // We can reach this point if the tokenizer buffers for interactive source have not been
 286 |        |         // initialized because we failed to decode the original source with the given locale.
 287 |      0 |         return Py_GetConstant(Py_CONSTANT_EMPTY_STR);
 288 |      0 |     }
 289 |        |
 290 |    172 |     Py_ssize_t relative_lineno = p->starting_lineno ? lineno - p->starting_lineno + 1 : lineno;
 291 |    172 |     const char* buf_end = p->tok->fp_interactive ? p->tok->interactive_src_end : p->tok->inp;
 292 |        |
 293 |    172 |     if (buf_end < cur_line) {
 294 |     12 |         buf_end = cur_line + strlen(cur_line);
 295 |     12 |     }
 296 |        |
 297 |  2.03k |     for (int i = 0; i < relative_lineno - 1; i++) {
 298 |  1.86k |         char *new_line = strchr(cur_line, '\n');
 299 |        |         // The assert is here for debug builds but the conditional that
 300 |        |         // follows is there so in release builds we do not crash at the cost
 301 |        |         // to report a potentially wrong line.
 302 |  1.86k |         assert(new_line != NULL && new_line + 1 < buf_end);
 303 |  1.86k |         if (new_line == NULL || new_line + 1 > buf_end) {
 304 |      0 |             break;
 305 |      0 |         }
 306 |  1.86k |         cur_line = new_line + 1;
 307 |  1.86k |     }
 308 |        |
 309 |    172 |     char *next_newline;
 310 |    172 |     if ((next_newline = strchr(cur_line, '\n')) == NULL) { // This is the last line
 311 |      0 |         next_newline = cur_line + strlen(cur_line);
 312 |      0 |     }
 313 |    172 |     return PyUnicode_DecodeUTF8(cur_line, next_newline - cur_line, "replace");
 314 |    172 | }
 315 |        |
 316 |        | void *
 317 |        | _PyPegen_raise_error_known_location(Parser *p, PyObject *errtype,
 318 |        |                                     Py_ssize_t lineno, Py_ssize_t col_offset,
 319 |        |                                     Py_ssize_t end_lineno, Py_ssize_t end_col_offset,
 320 |        |                                     const char *errmsg, va_list va)
 321 |  10.9k | {
 322 |        |     // Bail out if we already have an error set.
 323 |  10.9k |     if (p->error_indicator && PyErr_Occurred()) {
 324 |    927 |         return NULL;
 325 |    927 |     }
 326 |  10.0k |     PyObject *value = NULL;
 327 |  10.0k |     PyObject *errstr = NULL;
 328 |  10.0k |     PyObject *error_line = NULL;
 329 |  10.0k |     PyObject *tmp = NULL;
 330 |  10.0k |     p->error_indicator = 1;
 331 |        |
 332 |  10.0k |     if (end_lineno == CURRENT_POS) {
 333 |     23 |         end_lineno = p->tok->lineno;
 334 |     23 |     }
 335 |  10.0k |     if (end_col_offset == CURRENT_POS) {
 336 |     23 |         end_col_offset = p->tok->cur - p->tok->line_start;
 337 |     23 |     }
 338 |        |
 339 |  10.0k |     errstr = PyUnicode_FromFormatV(errmsg, va);
 340 |  10.0k |     if (!errstr) {
 341 |      0 |         goto error;
 342 |      0 |     }
 343 |        |
 344 |  10.0k |     if (p->tok->fp_interactive && p->tok->interactive_src_start != NULL) {
 345 |      0 |         error_line = get_error_line_from_tokenizer_buffers(p, lineno);
 346 |      0 |     }
 347 |  10.0k |     else if (p->start_rule == Py_file_input) {
 348 |  10.0k |         error_line = _PyErr_ProgramDecodedTextObject(p->tok->filename,
 349 |  10.0k |                                                      (int) lineno, p->tok->encoding);
 350 |  10.0k |     }
 351 |        |
 352 |  10.0k |     if (!error_line) {
 353 |        |         /* PyErr_ProgramTextObject was not called or returned NULL. If it was not called,
 354 |        |            then we need to find the error line from some other source, because
 355 |        |            p->start_rule != Py_file_input. If it returned NULL, then it either unexpectedly
 356 |        |            failed or we're parsing from a string or the REPL. There's a third edge case where
 357 |        |            we're actually parsing from a file, which has an E_EOF SyntaxError and in that case
 358 |        |            `PyErr_ProgramTextObject` fails because lineno points to last_file_line + 1, which
 359 |        |            does not physically exist */
 360 |  10.0k |         assert(p->tok->fp == NULL || p->tok->fp == stdin || p->tok->done == E_EOF);
 361 |        |
 362 |  10.0k |         if (p->tok->lineno <= lineno && p->tok->inp > p->tok->buf) {
 363 |  9.89k |             Py_ssize_t size = p->tok->inp - p->tok->line_start;
 364 |  9.89k |             error_line = PyUnicode_DecodeUTF8(p->tok->line_start, size, "replace");
 365 |  9.89k |         }
 366 |    172 |         else if (p->tok->fp == NULL || p->tok->fp == stdin) {
 367 |    172 |             error_line = get_error_line_from_tokenizer_buffers(p, lineno);
 368 |    172 |         }
 369 |      0 |         else {
 370 |      0 |             error_line = Py_GetConstant(Py_CONSTANT_EMPTY_STR);
 371 |      0 |         }
 372 |  10.0k |         if (!error_line) {
 373 |      0 |             goto error;
 374 |      0 |         }
 375 |  10.0k |     }
 376 |        |
 377 |  10.0k |     Py_ssize_t col_number = col_offset;
 378 |  10.0k |     Py_ssize_t end_col_number = end_col_offset;
 379 |        |
 380 |  10.0k |     col_number = _PyPegen_byte_offset_to_character_offset(error_line, col_offset);
 381 |  10.0k |     if (col_number < 0) {
 382 |      0 |         goto error;
 383 |      0 |     }
 384 |        |
 385 |  10.0k |     if (end_col_offset > 0) {
 386 |  7.83k |         end_col_number = _PyPegen_byte_offset_to_character_offset(error_line, end_col_offset);
 387 |  7.83k |         if (end_col_number < 0) {
 388 |      0 |             goto error;
 389 |      0 |         }
 390 |  7.83k |     }
 391 |        |
 392 |  10.0k |     tmp = Py_BuildValue("(OnnNnn)", p->tok->filename, lineno, col_number, error_line, end_lineno, end_col_number);
 393 |  10.0k |     if (!tmp) {
 394 |      0 |         goto error;
 395 |      0 |     }
 396 |  10.0k |     value = PyTuple_Pack(2, errstr, tmp);
 397 |  10.0k |     Py_DECREF(tmp);
 398 |  10.0k |     if (!value) {
 399 |      0 |         goto error;
 400 |      0 |     }
 401 |  10.0k |     PyErr_SetObject(errtype, value);
 402 |        |
 403 |  10.0k |     Py_DECREF(errstr);
 404 |  10.0k |     Py_DECREF(value);
 405 |  10.0k |     return NULL;
 406 |        |
 407 |      0 | error:
 408 |      0 |     Py_XDECREF(errstr);
 409 |      0 |     Py_XDECREF(error_line);
 410 |      0 |     return NULL;
 411 |  10.0k | }
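
Note: the tuple built above becomes the SyntaxError's argument, and both offsets are
converted from byte offsets into character offsets so that columns stay correct when
the error line contains non-ASCII text. Below is a hedged sketch of reading those
fields back from C after a failed compile; the input and the file name <offsets> are
illustrative, while lineno/offset/text are the documented SyntaxError attributes.

    #include <Python.h>

    int
    main(void)
    {
        Py_Initialize();

        /* "s = 'é' 1" is a syntax error after a non-ASCII literal; `offset`
         * below is expected to count characters, not UTF-8 bytes. */
        if (Py_CompileString("s = '\xc3\xa9' 1\n", "<offsets>", Py_file_input) == NULL) {
            PyObject *type, *value, *tback;
            PyErr_Fetch(&type, &value, &tback);
            PyErr_NormalizeException(&type, &value, &tback);
            if (value != NULL) {
                PyObject *lineno = PyObject_GetAttrString(value, "lineno");
                PyObject *offset = PyObject_GetAttrString(value, "offset");
                PyObject *text = PyObject_GetAttrString(value, "text");
                printf("lineno=%ld offset=%ld text=%s\n",
                       lineno ? PyLong_AsLong(lineno) : -1L,
                       offset ? PyLong_AsLong(offset) : -1L,
                       (text && PyUnicode_Check(text)) ? PyUnicode_AsUTF8(text) : "?");
                Py_XDECREF(lineno);
                Py_XDECREF(offset);
                Py_XDECREF(text);
            }
            Py_XDECREF(type);
            Py_XDECREF(value);
            Py_XDECREF(tback);
        }
        return Py_FinalizeEx();
    }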
 412 |        |
 413 |        | void
 414 |  12.0k | _Pypegen_set_syntax_error(Parser* p, Token* last_token) {
 415 |        |     // Existing syntax error
 416 |  12.0k |     if (PyErr_Occurred()) {
 417 |        |         // Prioritize tokenizer errors to custom syntax errors raised
 418 |        |         // on the second phase only if the errors come from the parser.
 419 |  5.53k |         int is_tok_ok = (p->tok->done == E_DONE || p->tok->done == E_OK);
 420 |  5.53k |         if (is_tok_ok && PyErr_ExceptionMatches(PyExc_SyntaxError)) {
 421 |  1.58k |             _PyPegen_tokenize_full_source_to_check_for_errors(p);
 422 |  1.58k |         }
 423 |        |         // Propagate the existing syntax error.
 424 |  5.53k |         return;
 425 |  5.53k |     }
 426 |        |     // Initialization error
 427 |  6.48k |     if (p->fill == 0) {
 428 |      0 |         RAISE_SYNTAX_ERROR("error at start before reading any input");
 429 |      0 |     }
 430 |        |     // Parser encountered EOF (End of File) unexpectedly
 431 |  6.48k |     if (last_token->type == ERRORTOKEN && p->tok->done == E_EOF) {
 432 |      0 |         if (p->tok->level) {
 433 |      0 |             raise_unclosed_parentheses_error(p);
 434 |      0 |         } else {
 435 |      0 |             RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
 436 |      0 |         }
 437 |      0 |         return;
 438 |      0 |     }
 439 |        |     // Indentation error in the tokenizer
 440 |  6.48k |     if (last_token->type == INDENT || last_token->type == DEDENT) {
 441 |    127 |         RAISE_INDENTATION_ERROR(last_token->type == INDENT ? "unexpected indent" : "unexpected unindent");
 442 |    127 |         return;
 443 |    127 |     }
 444 |        |     // Unknown error (generic case)
 445 |        |
 446 |        |     // Use the last token we found on the first pass to avoid reporting
 447 |        |     // incorrect locations for generic syntax errors just because we reached
 448 |        |     // further away when trying to find specific syntax errors in the second
 449 |        |     // pass.
 450 |  6.35k |     RAISE_SYNTAX_ERROR_KNOWN_LOCATION(last_token, "invalid syntax");
 451 |        |     // _PyPegen_tokenize_full_source_to_check_for_errors will override the existing
 452 |        |     // generic SyntaxError we just raised if errors are found.
 453 |  6.35k |     _PyPegen_tokenize_full_source_to_check_for_errors(p);
 454 |  6.35k | }
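
Note: when no exception is pending, the branches above try progressively more
specific diagnoses before falling back to the generic "invalid syntax". For example,
a stray indent leaves the parser holding an INDENT token, which is expected to surface
as an IndentationError through the INDENT/DEDENT branch above (illustrative input,
not taken from this coverage run).

    #include <Python.h>

    int
    main(void)
    {
        Py_Initialize();

        /* Leading whitespace before the first statement: the offending token is
         * an INDENT, so the expected result is
         * "IndentationError: unexpected indent". */
        if (Py_CompileString("    x = 1\n", "<indent>", Py_file_input) == NULL) {
            PyErr_Print();
        }
        return Py_FinalizeEx();
    }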
 455 |        |
 456 |        | void
 457 |        | _Pypegen_stack_overflow(Parser *p)
 458 |     67 | {
 459 |     67 |     p->error_indicator = 1;
 460 |     67 |     PyErr_SetString(PyExc_MemoryError,
 461 |     67 |         "Parser stack overflowed - Python source too complex to parse");
 462 |     67 | }
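
Note: _Pypegen_stack_overflow is the guard the generated parser calls when rule
recursion exceeds its internal depth limit. One way this is commonly provoked is a
very long chain of nested prefix operators; the depth used below and the exact
outcome are assumptions, not something taken from this coverage run.

    #include <Python.h>
    #include <stdlib.h>
    #include <string.h>

    int
    main(void)
    {
        Py_Initialize();

        /* 100000 unary minuses in front of a literal: each '-' adds one level of
         * rule recursion, which is assumed to exceed the parser's limit and end
         * in "MemoryError: Parser stack overflowed - Python source too complex
         * to parse". */
        const size_t depth = 100000;
        char *src = malloc(depth + 2);
        if (src != NULL) {
            memset(src, '-', depth);
            src[depth] = '0';
            src[depth + 1] = '\0';
            if (Py_CompileString(src, "<deep>", Py_eval_input) == NULL) {
                PyErr_Print();
            }
            free(src);
        }
        return Py_FinalizeEx();
    }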