Coverage Report

Created: 2025-07-04 06:49

/src/cpython/Parser/pegen_errors.c
Counts are execution hit counts for the run; a count of 0 marks a line that was never executed, and lines with no count are not executable.

Line   Count  Source
   1          #include <Python.h>
   2          #include <errcode.h>
   3
   4          #include "pycore_pyerrors.h"      // _PyErr_ProgramDecodedTextObject()
   5          #include "lexer/state.h"
   6          #include "lexer/lexer.h"
   7          #include "pegen.h"
   8
   9          // TOKENIZER ERRORS
  10
  11          void
  12          _PyPegen_raise_tokenizer_init_error(PyObject *filename)
  13   1.84k  {
  14   1.84k      if (!(PyErr_ExceptionMatches(PyExc_LookupError)
  15   1.84k            || PyErr_ExceptionMatches(PyExc_SyntaxError)
  16   1.84k            || PyErr_ExceptionMatches(PyExc_ValueError)
  17   1.84k            || PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))) {
  18      56          return;
  19      56      }
  20   1.78k      PyObject *errstr = NULL;
  21   1.78k      PyObject *tuple = NULL;
  22   1.78k      PyObject *type;
  23   1.78k      PyObject *value;
  24   1.78k      PyObject *tback;
  25   1.78k      PyErr_Fetch(&type, &value, &tback);
  26   1.78k      errstr = PyObject_Str(value);
  27   1.78k      if (!errstr) {
  28       0          goto error;
  29       0      }
  30
  31   1.78k      PyObject *tmp = Py_BuildValue("(OiiO)", filename, 0, -1, Py_None);
  32   1.78k      if (!tmp) {
  33       0          goto error;
  34       0      }
  35
  36   1.78k      tuple = PyTuple_Pack(2, errstr, tmp);
  37   1.78k      Py_DECREF(tmp);
  38   1.78k      if (!value) {
  39       0          goto error;
  40       0      }
  41   1.78k      PyErr_SetObject(PyExc_SyntaxError, tuple);
  42
  43   1.78k  error:
  44   1.78k      Py_XDECREF(type);
  45   1.78k      Py_XDECREF(value);
  46   1.78k      Py_XDECREF(tback);
  47   1.78k      Py_XDECREF(errstr);
  48   1.78k      Py_XDECREF(tuple);
  49   1.78k  }
  50
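
Note: _PyPegen_raise_tokenizer_init_error converts a tokenizer set-up failure (for example a LookupError from an unknown encoding declaration) into a SyntaxError. A minimal Python sketch of one input that should take this path (assuming a recent CPython; the exact message wording is not guaranteed):

    # An unknown coding cookie makes tokenizer initialization fail with a
    # LookupError, which this function re-raises as a SyntaxError.
    src = b"# -*- coding: no-such-codec -*-\nx = 1\n"
    try:
        compile(src, "<example>", "exec")
    except SyntaxError as exc:
        print(exc.msg)   # e.g. "unknown encoding: no-such-codec"
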
  51          static inline void
  52   1.91k  raise_unclosed_parentheses_error(Parser *p) {
  53   1.91k         int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
  54   1.91k         int error_col = p->tok->parencolstack[p->tok->level-1];
  55   1.91k         RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError,
  56   1.91k                                    error_lineno, error_col, error_lineno, -1,
  57   1.91k                                    "'%c' was never closed",
  58   1.91k                                    p->tok->parenstack[p->tok->level-1]);
  59   1.91k  }
  60
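
raise_unclosed_parentheses_error reports the opening bracket's own position, which is why the 1.91k hits above correspond to messages like "'(' was never closed". A quick illustration from Python:

    try:
        compile("values = (1, 2", "<example>", "exec")
    except SyntaxError as exc:
        print(exc.msg, exc.lineno)   # "'(' was never closed" 1
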
  61          int
  62          _Pypegen_tokenizer_error(Parser *p)
  63   3.77k  {
  64   3.77k      if (PyErr_Occurred()) {
  65   1.90k          return -1;
  66   1.90k      }
  67
  68   1.87k      const char *msg = NULL;
  69   1.87k      PyObject* errtype = PyExc_SyntaxError;
  70   1.87k      Py_ssize_t col_offset = -1;
  71   1.87k      p->error_indicator = 1;
  72   1.87k      switch (p->tok->done) {
  73       0          case E_TOKEN:
  74       0              msg = "invalid token";
  75       0              break;
  76   1.82k          case E_EOF:
  77   1.82k              if (p->tok->level) {
  78   1.79k                  raise_unclosed_parentheses_error(p);
  79   1.79k              } else {
  80      28                  RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
  81      28              }
  82   1.82k              return -1;
  83       4          case E_DEDENT:
  84       4              RAISE_INDENTATION_ERROR("unindent does not match any outer indentation level");
  85       4              return -1;
  86       0          case E_INTR:
  87       0              if (!PyErr_Occurred()) {
  88       0                  PyErr_SetNone(PyExc_KeyboardInterrupt);
  89       0              }
  90       0              return -1;
  91       0          case E_NOMEM:
  92       0              PyErr_NoMemory();
  93       0              return -1;
  94       2          case E_TABSPACE:
  95       2              errtype = PyExc_TabError;
  96       2              msg = "inconsistent use of tabs and spaces in indentation";
  97       2              break;
  98       0          case E_TOODEEP:
  99       0              errtype = PyExc_IndentationError;
 100       0              msg = "too many levels of indentation";
 101       0              break;
 102      43          case E_LINECONT: {
 103      43              col_offset = p->tok->cur - p->tok->buf - 1;
 104      43              msg = "unexpected character after line continuation character";
 105      43              break;
 106       0          }
 107       0          case E_COLUMNOVERFLOW:
 108       0              PyErr_SetString(PyExc_OverflowError,
 109       0                      "Parser column offset overflow - source line is too big");
 110       0              return -1;
 111       0          default:
 112       0              msg = "unknown parsing error";
 113   1.87k      }
 114
 115      45      RAISE_ERROR_KNOWN_LOCATION(p, errtype, p->tok->lineno,
 116      45                                 col_offset >= 0 ? col_offset : 0,
 117      45                                 p->tok->lineno, -1, msg);
 118      45      return -1;
 119   1.87k  }
 120
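
Of the switch arms above, only E_EOF, E_DEDENT, E_TABSPACE and E_LINECONT were exercised. Two of them are easy to trigger from Python; a hedged sketch (exact wording can vary between CPython versions):

    # E_LINECONT: stray text after a backslash line continuation.
    try:
        compile("x = 1 \\ + 2\n", "<example>", "exec")
    except SyntaxError as exc:
        print(exc.msg)   # "unexpected character after line continuation character"

    # E_TABSPACE: ambiguous mix of tabs and spaces in indentation.
    try:
        compile("if True:\n\t x = 1\n \ty = 2\n", "<example>", "exec")
    except TabError as exc:
        print(exc.msg)   # "inconsistent use of tabs and spaces in indentation"
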
 121          int
 122          _Pypegen_raise_decode_error(Parser *p)
 123   1.17k  {
 124   1.17k      assert(PyErr_Occurred());
 125   1.17k      const char *errtype = NULL;
 126   1.17k      if (PyErr_ExceptionMatches(PyExc_UnicodeError)) {
 127   1.16k          errtype = "unicode error";
 128   1.16k      }
 129      18      else if (PyErr_ExceptionMatches(PyExc_ValueError)) {
 130       5          errtype = "value error";
 131       5      }
 132   1.17k      if (errtype) {
 133   1.16k          PyObject *type;
 134   1.16k          PyObject *value;
 135   1.16k          PyObject *tback;
 136   1.16k          PyObject *errstr;
 137   1.16k          PyErr_Fetch(&type, &value, &tback);
 138   1.16k          errstr = PyObject_Str(value);
 139   1.16k          if (errstr) {
 140   1.16k              RAISE_SYNTAX_ERROR("(%s) %U", errtype, errstr);
 141   1.16k              Py_DECREF(errstr);
 142   1.16k          }
 143       0          else {
 144       0              PyErr_Clear();
 145       0              RAISE_SYNTAX_ERROR("(%s) unknown error", errtype);
 146       0          }
 147   1.16k          Py_XDECREF(type);
 148   1.16k          Py_XDECREF(value);
 149   1.16k          Py_XDECREF(tback);
 150   1.16k      }
 151
 152   1.17k      return -1;
 153   1.17k  }
 154
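
_Pypegen_raise_decode_error wraps decoding failures using the "(%s) %U" format above, which is where the familiar "(unicode error) ..." prefix comes from. An example that should reach the UnicodeError branch (message wording may differ slightly by version):

    # A truncated \uXXXX escape fails to decode inside a string literal.
    try:
        compile(r'x = "\u"', "<example>", "exec")
    except SyntaxError as exc:
        print(exc.msg)   # "(unicode error) 'unicodeescape' codec can't decode bytes ..."
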
 155          static int
 156   8.75k  _PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) {
 157              // Tokenize the whole input to see if there are any tokenization
 158              // errors such as mismatching parentheses. These will get priority
 159              // over generic syntax errors only if the line number of the error is
 160              // before the one that we had for the generic error.
 161
 162              // We don't want to tokenize to the end for interactive input
 163   8.75k      if (p->tok->prompt != NULL) {
 164       0          return 0;
 165       0      }
 166
 167   8.75k      PyObject *type, *value, *traceback;
 168   8.75k      PyErr_Fetch(&type, &value, &traceback);
 169
 170   8.75k      Token *current_token = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1];
 171   8.75k      Py_ssize_t current_err_line = current_token->lineno;
 172
 173   8.75k      int ret = 0;
 174   8.75k      struct token new_token;
 175   8.75k      _PyToken_Init(&new_token);
 176
 177   46.5k      for (;;) {
 178   46.5k          switch (_PyTokenizer_Get(p->tok, &new_token)) {
 179   3.56k              case ERRORTOKEN:
 180   3.56k                  if (PyErr_Occurred()) {
 181   1.01k                      ret = -1;
 182   1.01k                      goto exit;
 183   1.01k                  }
 184   2.55k                  if (p->tok->level != 0) {
 185   2.52k                      int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
 186   2.52k                      if (current_err_line > error_lineno) {
 187     117                          raise_unclosed_parentheses_error(p);
 188     117                          ret = -1;
 189     117                          goto exit;
 190     117                      }
 191   2.52k                  }
 192   2.43k                  break;
 193   5.18k              case ENDMARKER:
 194   5.18k                  break;
 195   37.8k              default:
 196   37.8k                  continue;
 197   46.5k          }
 198   7.61k          break;
 199   46.5k      }
 200
 201
 202   8.75k  exit:
 203   8.75k      _PyToken_Free(&new_token);
 204              // If we're in an f-string, we want the syntax error in the expression part
 205              // to propagate, so that tokenizer errors (like expecting '}') that happen afterwards
 206              // do not swallow it.
 207   8.75k      if (PyErr_Occurred() && p->tok->tok_mode_stack_index <= 0) {
 208     885          Py_XDECREF(value);
 209     885          Py_XDECREF(type);
 210     885          Py_XDECREF(traceback);
 211   7.86k      } else {
 212   7.86k          PyErr_Restore(type, value, traceback);
 213   7.86k      }
 214   8.75k      return ret;
 215   8.75k  }
 216
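
This second tokenization pass is why an unclosed bracket early in a file wins over a generic error reported further down: the bracket error is preferred only when its line number is smaller than the line of the error already found. A hedged illustration (observed on recent CPython; not guaranteed verbatim):

    # The generic error is detected on line 2, but re-tokenizing finds the
    # unclosed '[' from line 1, so that is what gets reported.
    src = "foo = [1, 2\nprint(foo)\n"
    try:
        compile(src, "<example>", "exec")
    except SyntaxError as exc:
        print(exc.lineno, exc.msg)   # expected: 1 "'[' was never closed"
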
 217          // PARSER ERRORS
 218
 219          void *
 220          _PyPegen_raise_error(Parser *p, PyObject *errtype, int use_mark, const char *errmsg, ...)
 221   2.39k  {
 222              // Bail out if we already have an error set.
 223   2.39k      if (p->error_indicator && PyErr_Occurred()) {
 224     221          return NULL;
 225     221      }
 226   2.17k      if (p->fill == 0) {
 227       0          va_list va;
 228       0          va_start(va, errmsg);
 229       0          _PyPegen_raise_error_known_location(p, errtype, 0, 0, 0, -1, errmsg, va);
 230       0          va_end(va);
 231       0          return NULL;
 232       0      }
 233   2.17k      if (use_mark && p->mark == p->fill && _PyPegen_fill_token(p) < 0) {
 234       0          p->error_indicator = 1;
 235       0          return NULL;
 236       0      }
 237   2.17k      Token *t = p->known_err_token != NULL
 238   2.17k                     ? p->known_err_token
 239   2.17k                     : p->tokens[use_mark ? p->mark : p->fill - 1];
 240   2.17k      Py_ssize_t col_offset;
 241   2.17k      Py_ssize_t end_col_offset = -1;
 242   2.17k      if (t->col_offset == -1) {
 243   1.09k          if (p->tok->cur == p->tok->buf) {
 244       2              col_offset = 0;
 245   1.09k          } else {
 246   1.09k              const char* start = p->tok->buf  ? p->tok->line_start : p->tok->buf;
 247   1.09k              col_offset = Py_SAFE_DOWNCAST(p->tok->cur - start, intptr_t, int);
 248   1.09k          }
 249   1.09k      } else {
 250   1.08k          col_offset = t->col_offset + 1;
 251   1.08k      }
 252
 253   2.17k      if (t->end_col_offset != -1) {
 254   1.08k          end_col_offset = t->end_col_offset + 1;
 255   1.08k      }
 256
 257   2.17k      va_list va;
 258   2.17k      va_start(va, errmsg);
 259   2.17k      _PyPegen_raise_error_known_location(p, errtype, t->lineno, col_offset, t->end_lineno, end_col_offset, errmsg, va);
 260   2.17k      va_end(va);
 261
 262   2.17k      return NULL;
 263   2.17k  }
 264
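
Note the `t->col_offset + 1` above: token columns are zero-based internally, while SyntaxError.offset and end_offset are one-based columns into the reported line. A small check from Python (illustrative input):

    try:
        compile("x = 1 +* 2\n", "<example>", "exec")
    except SyntaxError as exc:
        # offset and end_offset are 1-based indices into exc.text
        print(exc.offset, exc.end_offset, repr(exc.text))
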
 265          static PyObject *
 266          get_error_line_from_tokenizer_buffers(Parser *p, Py_ssize_t lineno)
 267     259  {
 268              /* If the file descriptor is interactive, the source lines of the current
 269               * (multi-line) statement are stored in p->tok->interactive_src_start.
 270               * If not, we're parsing from a string, which means that the whole source
 271               * is stored in p->tok->str. */
 272     259      assert((p->tok->fp == NULL && p->tok->str != NULL) || p->tok->fp != NULL);
 273
 274     259      char *cur_line = p->tok->fp_interactive ? p->tok->interactive_src_start : p->tok->str;
 275     259      if (cur_line == NULL) {
 276       0          assert(p->tok->fp_interactive);
 277                  // We can reach this point if the tokenizer buffers for interactive source have not been
 278                  // initialized because we failed to decode the original source with the given locale.
 279       0          return Py_GetConstant(Py_CONSTANT_EMPTY_STR);
 280       0      }
 281
 282     259      Py_ssize_t relative_lineno = p->starting_lineno ? lineno - p->starting_lineno + 1 : lineno;
 283     259      const char* buf_end = p->tok->fp_interactive ? p->tok->interactive_src_end : p->tok->inp;
 284
 285     259      if (buf_end < cur_line) {
 286       9          buf_end = cur_line + strlen(cur_line);
 287       9      }
 288
 289   2.40k      for (int i = 0; i < relative_lineno - 1; i++) {
 290   2.14k          char *new_line = strchr(cur_line, '\n');
 291                  // The assert is here for debug builds but the conditional that
 292                  // follows is there so in release builds we do not crash at the cost
 293                  // to report a potentially wrong line.
 294   2.14k          assert(new_line != NULL && new_line + 1 < buf_end);
 295   2.14k          if (new_line == NULL || new_line + 1 > buf_end) {
 296       0              break;
 297       0          }
 298   2.14k          cur_line = new_line + 1;
 299   2.14k      }
 300
 301     259      char *next_newline;
 302     259      if ((next_newline = strchr(cur_line, '\n')) == NULL) { // This is the last line
 303       0          next_newline = cur_line + strlen(cur_line);
 304       0      }
 305     259      return PyUnicode_DecodeUTF8(cur_line, next_newline - cur_line, "replace");
 306     259  }
 307
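
For string input there is no file to re-read, so the offending line is recovered from the tokenizer's in-memory buffers (p->tok->str here, or the current line buffer in the caller); that is what ends up in SyntaxError.text. A sketch with a hypothetical three-line source:

    src = "a = 1\nb = 2\nc = !\n"
    try:
        compile(src, "<string>", "exec")
    except SyntaxError as exc:
        print(exc.lineno, repr(exc.text))   # 3 'c = !\n'
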
 308          void *
 309          _PyPegen_raise_error_known_location(Parser *p, PyObject *errtype,
 310                                              Py_ssize_t lineno, Py_ssize_t col_offset,
 311                                              Py_ssize_t end_lineno, Py_ssize_t end_col_offset,
 312                                              const char *errmsg, va_list va)
 313   12.9k  {
 314              // Bail out if we already have an error set.
 315   12.9k      if (p->error_indicator && PyErr_Occurred()) {
 316   1.18k          return NULL;
 317   1.18k      }
 318   11.8k      PyObject *value = NULL;
 319   11.8k      PyObject *errstr = NULL;
 320   11.8k      PyObject *error_line = NULL;
 321   11.8k      PyObject *tmp = NULL;
 322   11.8k      p->error_indicator = 1;
 323
 324   11.8k      if (end_lineno == CURRENT_POS) {
 325      27          end_lineno = p->tok->lineno;
 326      27      }
 327   11.8k      if (end_col_offset == CURRENT_POS) {
 328      27          end_col_offset = p->tok->cur - p->tok->line_start;
 329      27      }
 330
 331   11.8k      errstr = PyUnicode_FromFormatV(errmsg, va);
 332   11.8k      if (!errstr) {
 333       0          goto error;
 334       0      }
 335
 336   11.8k      if (p->tok->fp_interactive && p->tok->interactive_src_start != NULL) {
 337       0          error_line = get_error_line_from_tokenizer_buffers(p, lineno);
 338       0      }
 339   11.8k      else if (p->start_rule == Py_file_input) {
 340   11.8k          error_line = _PyErr_ProgramDecodedTextObject(p->tok->filename,
 341   11.8k                                                       (int) lineno, p->tok->encoding);
 342   11.8k      }
 343
 344   11.8k      if (!error_line) {
 345                  /* PyErr_ProgramTextObject was not called or returned NULL. If it was not called,
 346                     then we need to find the error line from some other source, because
 347                     p->start_rule != Py_file_input. If it returned NULL, then it either unexpectedly
 348                     failed or we're parsing from a string or the REPL. There's a third edge case where
 349                     we're actually parsing from a file, which has an E_EOF SyntaxError and in that case
 350                     `PyErr_ProgramTextObject` fails because lineno points to last_file_line + 1, which
 351                     does not physically exist */
 352   11.8k          assert(p->tok->fp == NULL || p->tok->fp == stdin || p->tok->done == E_EOF);
 353
 354   11.8k          if (p->tok->lineno <= lineno && p->tok->inp > p->tok->buf) {
 355   11.5k              Py_ssize_t size = p->tok->inp - p->tok->line_start;
 356   11.5k              error_line = PyUnicode_DecodeUTF8(p->tok->line_start, size, "replace");
 357   11.5k          }
 358     259          else if (p->tok->fp == NULL || p->tok->fp == stdin) {
 359     259              error_line = get_error_line_from_tokenizer_buffers(p, lineno);
 360     259          }
 361       0          else {
 362       0              error_line = Py_GetConstant(Py_CONSTANT_EMPTY_STR);
 363       0          }
 364   11.8k          if (!error_line) {
 365       0              goto error;
 366       0          }
 367   11.8k      }
 368
 369   11.8k      Py_ssize_t col_number = col_offset;
 370   11.8k      Py_ssize_t end_col_number = end_col_offset;
 371
 372   11.8k      col_number = _PyPegen_byte_offset_to_character_offset(error_line, col_offset);
 373   11.8k      if (col_number < 0) {
 374       0          goto error;
 375       0      }
 376
 377   11.8k      if (end_col_offset > 0) {
 378   8.62k          end_col_number = _PyPegen_byte_offset_to_character_offset(error_line, end_col_offset);
 379   8.62k          if (end_col_number < 0) {
 380       0              goto error;
 381       0          }
 382   8.62k      }
 383
 384   11.8k      tmp = Py_BuildValue("(OnnNnn)", p->tok->filename, lineno, col_number, error_line, end_lineno, end_col_number);
 385   11.8k      if (!tmp) {
 386       0          goto error;
 387       0      }
 388   11.8k      value = PyTuple_Pack(2, errstr, tmp);
 389   11.8k      Py_DECREF(tmp);
 390   11.8k      if (!value) {
 391       0          goto error;
 392       0      }
 393   11.8k      PyErr_SetObject(errtype, value);
 394
 395   11.8k      Py_DECREF(errstr);
 396   11.8k      Py_DECREF(value);
 397   11.8k      return NULL;
 398
 399       0  error:
 400       0      Py_XDECREF(errstr);
 401       0      Py_XDECREF(error_line);
 402       0      return NULL;
 403   11.8k  }
 404
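
The (errstr, (filename, lineno, offset, text, end_lineno, end_offset)) pair assembled with Py_BuildValue and PyTuple_Pack above is exactly what PyErr_SetObject passes to the exception type, and it remains visible from Python as exc.args. A quick look (illustrative; the precise values depend on the input):

    try:
        compile("1 +\n", "<example>", "exec")
    except SyntaxError as exc:
        msg, details = exc.args
        # details should unpack as (filename, lineno, offset, text, end_lineno, end_offset)
        print(msg, details)
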
 405          void
 406   13.6k  _Pypegen_set_syntax_error(Parser* p, Token* last_token) {
 407              // Existing syntax error
 408   13.6k      if (PyErr_Occurred()) {
 409                  // Prioritize tokenizer errors to custom syntax errors raised
 410                  // on the second phase only if the errors come from the parser.
 411   6.79k          int is_tok_ok = (p->tok->done == E_DONE || p->tok->done == E_OK);
 412   6.79k          if (is_tok_ok && PyErr_ExceptionMatches(PyExc_SyntaxError)) {
 413   2.04k              _PyPegen_tokenize_full_source_to_check_for_errors(p);
 414   2.04k          }
 415                  // Propagate the existing syntax error.
 416   6.79k          return;
 417   6.79k      }
 418              // Initialization error
 419   6.85k      if (p->fill == 0) {
 420       0          RAISE_SYNTAX_ERROR("error at start before reading any input");
 421       0      }
 422              // Parser encountered EOF (End of File) unexpectedtly
 423   6.85k      if (last_token->type == ERRORTOKEN && p->tok->done == E_EOF) {
 424       0          if (p->tok->level) {
 425       0              raise_unclosed_parentheses_error(p);
 426       0          } else {
 427       0              RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
 428       0          }
 429       0          return;
 430       0      }
 431              // Indentation error in the tokenizer
 432   6.85k      if (last_token->type == INDENT || last_token->type == DEDENT) {
 433     151          RAISE_INDENTATION_ERROR(last_token->type == INDENT ? "unexpected indent" : "unexpected unindent");
 434     151          return;
 435     151      }
 436              // Unknown error (generic case)
 437
 438              // Use the last token we found on the first pass to avoid reporting
 439              // incorrect locations for generic syntax errors just because we reached
 440              // further away when trying to find specific syntax errors in the second
 441              // pass.
 442   6.70k      RAISE_SYNTAX_ERROR_KNOWN_LOCATION(last_token, "invalid syntax");
 443              // _PyPegen_tokenize_full_source_to_check_for_errors will override the existing
 444              // generic SyntaxError we just raised if errors are found.
 445   6.70k      _PyPegen_tokenize_full_source_to_check_for_errors(p);
 446   6.70k  }
 447
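
The INDENT/DEDENT branch above (151 hits) produces the bare "unexpected indent" / "unexpected unindent" messages. A minimal trigger from Python:

    try:
        compile("x = 1\n    y = 2\n", "<example>", "exec")
    except IndentationError as exc:
        print(exc.msg)   # "unexpected indent"
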
 448          void
 449          _Pypegen_stack_overflow(Parser *p)
 450      65  {
 451      65      p->error_indicator = 1;
 452      65      PyErr_SetString(PyExc_MemoryError,
 453      65          "Parser stack overflowed - Python source too complex to parse");
 454      65  }
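
_Pypegen_stack_overflow raises MemoryError per the listing, and its 65 hits suggest the corpus contains inputs deep enough to exhaust the parser stack. A heavily hedged repro sketch: the operator chain below and the depth needed to overflow are assumptions and depend on platform, build and recursion limits.

    # A very deep chain of unary operators may exhaust the PEG parser's stack.
    deep = "-" * 1_000_000 + "1"
    try:
        compile(deep, "<example>", "eval")
    except (MemoryError, RecursionError) as exc:
        print(type(exc).__name__)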