/src/cpython/Parser/pegen_errors.c

Source
#include <Python.h>
#include <errcode.h>

#include "pycore_pyerrors.h"      // _PyErr_ProgramDecodedTextObject()
#include "pycore_runtime.h"       // _Py_ID()
#include "pycore_tuple.h"         // _PyTuple_FromPair
#include "lexer/state.h"
#include "lexer/lexer.h"
#include "pegen.h"

// TOKENIZER ERRORS

void
_PyPegen_raise_tokenizer_init_error(PyObject *filename)
{
    if (!(PyErr_ExceptionMatches(PyExc_LookupError)
          || PyErr_ExceptionMatches(PyExc_SyntaxError)
          || PyErr_ExceptionMatches(PyExc_ValueError)
          || PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))) {
        return;
    }
    PyObject *errstr = NULL;
    PyObject *tuple = NULL;
    PyObject *type;
    PyObject *value;
    PyObject *tback;
    PyErr_Fetch(&type, &value, &tback);
    if (PyErr_GivenExceptionMatches(value, PyExc_SyntaxError)) {
        if (PyObject_SetAttr(value, &_Py_ID(filename), filename)) {
            goto error;
        }
        PyErr_Restore(type, value, tback);
        return;
    }
    errstr = PyObject_Str(value);
    if (!errstr) {
        goto error;
    }

    PyObject *tmp = Py_BuildValue("(OiiO)", filename, 0, -1, Py_None);
    if (!tmp) {
        goto error;
    }

    tuple = _PyTuple_FromPair(errstr, tmp);
    Py_DECREF(tmp);
    if (!tuple) {
        goto error;
    }
    PyErr_SetObject(PyExc_SyntaxError, tuple);

error:
    Py_XDECREF(type);
    Py_XDECREF(value);
    Py_XDECREF(tback);
    Py_XDECREF(errstr);
    Py_XDECREF(tuple);
}

static inline void
raise_unclosed_parentheses_error(Parser *p) {
       int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
       int error_col = p->tok->parencolstack[p->tok->level-1];
       RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError,
                                  error_lineno, error_col, error_lineno, -1,
                                  "'%c' was never closed",
                                  p->tok->parenstack[p->tok->level-1]);
}

int
_Pypegen_tokenizer_error(Parser *p)
{
    if (PyErr_Occurred()) {
        return -1;
    }

    const char *msg = NULL;
    PyObject* errtype = PyExc_SyntaxError;
    Py_ssize_t col_offset = -1;
    p->error_indicator = 1;
    switch (p->tok->done) {
        case E_TOKEN:
            msg = "invalid token";
            break;
        case E_EOF:
            if (p->tok->level) {
                raise_unclosed_parentheses_error(p);
            } else {
                RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
            }
            return -1;
        case E_DEDENT:
            RAISE_INDENTATION_ERROR("unindent does not match any outer indentation level");
            return -1;
        case E_INTR:
            if (!PyErr_Occurred()) {
                PyErr_SetNone(PyExc_KeyboardInterrupt);
            }
            return -1;
        case E_NOMEM:
            PyErr_NoMemory();
            return -1;
        case E_TABSPACE:
            errtype = PyExc_TabError;
            msg = "inconsistent use of tabs and spaces in indentation";
            break;
        case E_TOODEEP:
            errtype = PyExc_IndentationError;
            msg = "too many levels of indentation";
            break;
        case E_LINECONT: {
            col_offset = p->tok->cur - p->tok->buf - 1;
            msg = "unexpected character after line continuation character";
            break;
        }
        case E_COLUMNOVERFLOW:
            PyErr_SetString(PyExc_OverflowError,
                    "Parser column offset overflow - source line is too big");
            return -1;
        default:
            msg = "unknown parsing error";
    }

    RAISE_ERROR_KNOWN_LOCATION(p, errtype, p->tok->lineno,
                               col_offset >= 0 ? col_offset : 0,
                               p->tok->lineno, -1, msg);
    return -1;
}

int
_Pypegen_raise_decode_error(Parser *p)
{
    assert(PyErr_Occurred());
    const char *errtype = NULL;
    if (PyErr_ExceptionMatches(PyExc_UnicodeError)) {
        errtype = "unicode error";
    }
    else if (PyErr_ExceptionMatches(PyExc_ValueError)) {
        errtype = "value error";
    }
    if (errtype) {
        PyObject *type;
        PyObject *value;
        PyObject *tback;
        PyObject *errstr;
        PyErr_Fetch(&type, &value, &tback);
        errstr = PyObject_Str(value);
        if (errstr) {
            RAISE_SYNTAX_ERROR("(%s) %U", errtype, errstr);
            Py_DECREF(errstr);
        }
        else {
            PyErr_Clear();
            RAISE_SYNTAX_ERROR("(%s) unknown error", errtype);
        }
        Py_XDECREF(type);
        Py_XDECREF(value);
        Py_XDECREF(tback);
    }

    return -1;
}

static int
_PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) {
    // Tokenize the whole input to see if there are any tokenization
    // errors such as mismatching parentheses. These will get priority
    // over generic syntax errors only if the line number of the error is
    // before the one that we had for the generic error.

    // We don't want to tokenize to the end for interactive input
    if (p->tok->prompt != NULL) {
        return 0;
    }

    PyObject *type, *value, *traceback;
    PyErr_Fetch(&type, &value, &traceback);

    Token *current_token = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1];
    Py_ssize_t current_err_line = current_token->lineno;

    int ret = 0;
    struct token new_token;
    _PyToken_Init(&new_token);

    for (;;) {
        switch (_PyTokenizer_Get(p->tok, &new_token)) {
            case ERRORTOKEN:
                if (PyErr_Occurred()) {
                    ret = -1;
                    goto exit;
                }
                if (p->tok->level != 0) {
                    int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
                    if (current_err_line > error_lineno) {
                        raise_unclosed_parentheses_error(p);
                        ret = -1;
                        goto exit;
                    }
                }
                break;
            case ENDMARKER:
                break;
            default:
                continue;
        }
        break;
    }


exit:
    _PyToken_Free(&new_token);
    // If we're in an f-string, we want the syntax error in the expression part
    // to propagate, so that tokenizer errors (like expecting '}') that happen afterwards
    // do not swallow it.
    if (PyErr_Occurred() && p->tok->tok_mode_stack_index <= 0) {
        Py_XDECREF(value);
        Py_XDECREF(type);
        Py_XDECREF(traceback);
    } else {
        PyErr_Restore(type, value, traceback);
    }
    return ret;
}

// PARSER ERRORS

void *
_PyPegen_raise_error(Parser *p, PyObject *errtype, int use_mark, const char *errmsg, ...)
{
    // Bail out if we already have an error set.
    if (p->error_indicator && PyErr_Occurred()) {
        return NULL;
    }
    if (p->fill == 0) {
        va_list va;
        va_start(va, errmsg);
        _PyPegen_raise_error_known_location(p, errtype, 0, 0, 0, -1, errmsg, va);
        va_end(va);
        return NULL;
    }
    if (use_mark && p->mark == p->fill && _PyPegen_fill_token(p) < 0) {
        p->error_indicator = 1;
        return NULL;
    }
    Token *t = p->known_err_token != NULL
                   ? p->known_err_token
                   : p->tokens[use_mark ? p->mark : p->fill - 1];
    Py_ssize_t col_offset;
    Py_ssize_t end_col_offset = -1;
    if (t->col_offset == -1) {
        if (p->tok->cur == p->tok->buf) {
            col_offset = 0;
        } else {
            const char* start = p->tok->buf  ? p->tok->line_start : p->tok->buf;
            col_offset = Py_SAFE_DOWNCAST(p->tok->cur - start, intptr_t, int);
        }
    } else {
        col_offset = t->col_offset + 1;
    }

    if (t->end_col_offset != -1) {
        end_col_offset = t->end_col_offset + 1;
    }

    va_list va;
    va_start(va, errmsg);
    _PyPegen_raise_error_known_location(p, errtype, t->lineno, col_offset, t->end_lineno, end_col_offset, errmsg, va);
    va_end(va);

    return NULL;
}

static PyObject *
get_error_line_from_tokenizer_buffers(Parser *p, Py_ssize_t lineno)
{
    /* If the file descriptor is interactive, the source lines of the current
     * (multi-line) statement are stored in p->tok->interactive_src_start.
     * If not, we're parsing from a string, which means that the whole source
     * is stored in p->tok->str. */
    assert((p->tok->fp == NULL && p->tok->str != NULL) || p->tok->fp != NULL);

    char *cur_line = p->tok->fp_interactive ? p->tok->interactive_src_start : p->tok->str;
    if (cur_line == NULL) {
        assert(p->tok->fp_interactive);
        // We can reach this point if the tokenizer buffers for interactive source have not been
        // initialized because we failed to decode the original source with the given locale.
        return Py_GetConstant(Py_CONSTANT_EMPTY_STR);
    }

    Py_ssize_t relative_lineno = p->starting_lineno ? lineno - p->starting_lineno + 1 : lineno;
    const char* buf_end = p->tok->fp_interactive ? p->tok->interactive_src_end : p->tok->inp;

    if (buf_end < cur_line) {
        buf_end = cur_line + strlen(cur_line);
    }

    for (int i = 0; i < relative_lineno - 1; i++) {
        char *new_line = strchr(cur_line, '\n');
        // The assert is here for debug builds but the conditional that
        // follows is there so in release builds we do not crash at the cost
        // to report a potentially wrong line.
        assert(new_line != NULL && new_line + 1 < buf_end);
        if (new_line == NULL || new_line + 1 > buf_end) {
            break;
        }
        cur_line = new_line + 1;
    }

    char *next_newline;
    if ((next_newline = strchr(cur_line, '\n')) == NULL) { // This is the last line
        next_newline = cur_line + strlen(cur_line);
    }
    return PyUnicode_DecodeUTF8(cur_line, next_newline - cur_line, "replace");
}

void *
_PyPegen_raise_error_known_location(Parser *p, PyObject *errtype,
                                    Py_ssize_t lineno, Py_ssize_t col_offset,
                                    Py_ssize_t end_lineno, Py_ssize_t end_col_offset,
                                    const char *errmsg, va_list va)
{
    // Bail out if we already have an error set.
    if (p->error_indicator && PyErr_Occurred()) {
        return NULL;
    }
    PyObject *value = NULL;
    PyObject *errstr = NULL;
    PyObject *error_line = NULL;
    PyObject *tmp = NULL;
    p->error_indicator = 1;

    if (end_lineno == CURRENT_POS) {
        end_lineno = p->tok->lineno;
    }
    if (end_col_offset == CURRENT_POS) {
        end_col_offset = p->tok->cur - p->tok->line_start;
    }

    errstr = PyUnicode_FromFormatV(errmsg, va);
    if (!errstr) {
        goto error;
    }

    if (p->tok->fp_interactive && p->tok->interactive_src_start != NULL) {
        error_line = get_error_line_from_tokenizer_buffers(p, lineno);
    }
    else if (p->start_rule == Py_file_input) {
        error_line = _PyErr_ProgramDecodedTextObject(p->tok->filename,
                                                     (int) lineno, p->tok->encoding);
    }

    if (!error_line) {
        /* PyErr_ProgramTextObject was not called or returned NULL. If it was not called,
           then we need to find the error line from some other source, because
           p->start_rule != Py_file_input. If it returned NULL, then it either unexpectedly
           failed or we're parsing from a string or the REPL. There's a third edge case where
           we're actually parsing from a file, which has an E_EOF SyntaxError and in that case
           `PyErr_ProgramTextObject` fails because lineno points to last_file_line + 1, which
           does not physically exist */
        assert(p->tok->fp == NULL || p->tok->fp == stdin || p->tok->done == E_EOF);

        if (p->tok->lineno <= lineno && p->tok->inp > p->tok->buf) {
            Py_ssize_t size = p->tok->inp - p->tok->line_start;
            error_line = PyUnicode_DecodeUTF8(p->tok->line_start, size, "replace");
        }
        else if (p->tok->fp == NULL || p->tok->fp == stdin) {
            error_line = get_error_line_from_tokenizer_buffers(p, lineno);
        }
        else {
            error_line = Py_GetConstant(Py_CONSTANT_EMPTY_STR);
        }
        if (!error_line) {
            goto error;
        }
    }

    Py_ssize_t col_number = col_offset;
    Py_ssize_t end_col_number = end_col_offset;

    col_number = _PyPegen_byte_offset_to_character_offset(error_line, col_offset);
    if (col_number < 0) {
        goto error;
    }

    if (end_col_offset > 0) {
        end_col_number = _PyPegen_byte_offset_to_character_offset(error_line, end_col_offset);
        if (end_col_number < 0) {
            goto error;
        }
    }

    tmp = Py_BuildValue("(OnnNnn)", p->tok->filename, lineno, col_number, error_line, end_lineno, end_col_number);
    if (!tmp) {
        goto error;
    }
    value = _PyTuple_FromPair(errstr, tmp);
    Py_DECREF(tmp);
    if (!value) {
        goto error;
    }
    PyErr_SetObject(errtype, value);

    Py_DECREF(errstr);
    Py_DECREF(value);
    return NULL;

error:
    Py_XDECREF(errstr);
    Py_XDECREF(error_line);
    return NULL;
}

void
_Pypegen_set_syntax_error(Parser* p, Token* last_token) {
    // Existing syntax error
    if (PyErr_Occurred()) {
        // Prioritize tokenizer errors to custom syntax errors raised
        // on the second phase only if the errors come from the parser.
        int is_tok_ok = (p->tok->done == E_DONE || p->tok->done == E_OK);
        if (is_tok_ok && PyErr_ExceptionMatches(PyExc_SyntaxError)) {
            _PyPegen_tokenize_full_source_to_check_for_errors(p);
        }
        // Propagate the existing syntax error.
        return;
    }
    // Initialization error
    if (p->fill == 0) {
        RAISE_SYNTAX_ERROR("error at start before reading any input");
    }
    // Parser encountered EOF (End of File) unexpectedtly
    if (last_token->type == ERRORTOKEN && p->tok->done == E_EOF) {
        if (p->tok->level) {
            raise_unclosed_parentheses_error(p);
        } else {
            RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
        }
        return;
    }
    // Indentation error in the tokenizer
    if (last_token->type == INDENT || last_token->type == DEDENT) {
        RAISE_INDENTATION_ERROR(last_token->type == INDENT ? "unexpected indent" : "unexpected unindent");
        return;
    }
    // Unknown error (generic case)

    // Use the last token we found on the first pass to avoid reporting
    // incorrect locations for generic syntax errors just because we reached
    // further away when trying to find specific syntax errors in the second
    // pass.
    RAISE_SYNTAX_ERROR_KNOWN_LOCATION(last_token, "invalid syntax");
    // _PyPegen_tokenize_full_source_to_check_for_errors will override the existing
    // generic SyntaxError we just raised if errors are found.
    _PyPegen_tokenize_full_source_to_check_for_errors(p);
}

void
_Pypegen_stack_overflow(Parser *p)
{
    p->error_indicator = 1;
    PyErr_SetString(PyExc_MemoryError,
        "Parser stack overflowed - Python source too complex to parse");
}

Coverage Report

Created: 2026-03-23 06:45

Line	Count	Source
1		#include <Python.h>
2		#include <errcode.h>
3
4		#include "pycore_pyerrors.h" // _PyErr_ProgramDecodedTextObject()
5		#include "pycore_runtime.h" // _Py_ID()
6		#include "pycore_tuple.h" // _PyTuple_FromPair
7		#include "lexer/state.h"
8		#include "lexer/lexer.h"
9		#include "pegen.h"
10
11		// TOKENIZER ERRORS
12
13		void
14		_PyPegen_raise_tokenizer_init_error(PyObject *filename)
15	2.06k	{
16	2.06k	if (!(PyErr_ExceptionMatches(PyExc_LookupError)
17	1.95k	\|\| PyErr_ExceptionMatches(PyExc_SyntaxError)
18	1.43k	\|\| PyErr_ExceptionMatches(PyExc_ValueError)
19	53	\|\| PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))) {
20	53	return;
21	53	}
22	2.01k	PyObject *errstr = NULL;
23	2.01k	PyObject *tuple = NULL;
24	2.01k	PyObject *type;
25	2.01k	PyObject *value;
26	2.01k	PyObject *tback;
27	2.01k	PyErr_Fetch(&type, &value, &tback);
28	2.01k	if (PyErr_GivenExceptionMatches(value, PyExc_SyntaxError)) {
29	519	if (PyObject_SetAttr(value, &_Py_ID(filename), filename)) {
30	0	goto error;
31	0	}
32	519	PyErr_Restore(type, value, tback);
33	519	return;
34	519	}
35	1.49k	errstr = PyObject_Str(value);
36	1.49k	if (!errstr) {
37	0	goto error;
38	0	}
39
40	1.49k	PyObject *tmp = Py_BuildValue("(OiiO)", filename, 0, -1, Py_None);
41	1.49k	if (!tmp) {
42	0	goto error;
43	0	}
44
45	1.49k	tuple = _PyTuple_FromPair(errstr, tmp);
46	1.49k	Py_DECREF(tmp);
47	1.49k	if (!tuple) {
48	0	goto error;
49	0	}
50	1.49k	PyErr_SetObject(PyExc_SyntaxError, tuple);
51
52	1.49k	error:
53	1.49k	Py_XDECREF(type);
54	1.49k	Py_XDECREF(value);
55	1.49k	Py_XDECREF(tback);
56	1.49k	Py_XDECREF(errstr);
57	1.49k	Py_XDECREF(tuple);
58	1.49k	}
59
60		static inline void
61	1.68k	raise_unclosed_parentheses_error(Parser *p) {
62	1.68k	int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
63	1.68k	int error_col = p->tok->parencolstack[p->tok->level-1];
64	1.68k	RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError,
65	1.68k	error_lineno, error_col, error_lineno, -1,
66	1.68k	"'%c' was never closed",
67	1.68k	p->tok->parenstack[p->tok->level-1]);
68	1.68k	}
69
70		int
71		_Pypegen_tokenizer_error(Parser *p)
72	3.40k	{
73	3.40k	if (PyErr_Occurred()) {
74	1.72k	return -1;
75	1.72k	}
76
77	1.68k	const char *msg = NULL;
78	1.68k	PyObject* errtype = PyExc_SyntaxError;
79	1.68k	Py_ssize_t col_offset = -1;
80	1.68k	p->error_indicator = 1;
81	1.68k	switch (p->tok->done) {
82	0	case E_TOKEN:
83	0	msg = "invalid token";
84	0	break;
85	1.64k	case E_EOF:
86	1.64k	if (p->tok->level) {
87	1.61k	raise_unclosed_parentheses_error(p);
88	1.61k	} else {
89	32	RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
90	32	}
91	1.64k	return -1;
92	6	case E_DEDENT:
93	6	RAISE_INDENTATION_ERROR("unindent does not match any outer indentation level");
94	6	return -1;
95	0	case E_INTR:
96	0	if (!PyErr_Occurred()) {
97	0	PyErr_SetNone(PyExc_KeyboardInterrupt);
98	0	}
99	0	return -1;
100	0	case E_NOMEM:
101	0	PyErr_NoMemory();
102	0	return -1;
103	2	case E_TABSPACE:
104	2	errtype = PyExc_TabError;
105	2	msg = "inconsistent use of tabs and spaces in indentation";
106	2	break;
107	0	case E_TOODEEP:
108	0	errtype = PyExc_IndentationError;
109	0	msg = "too many levels of indentation";
110	0	break;
111	29	case E_LINECONT: {
112	29	col_offset = p->tok->cur - p->tok->buf - 1;
113	29	msg = "unexpected character after line continuation character";
114	29	break;
115	0	}
116	0	case E_COLUMNOVERFLOW:
117	0	PyErr_SetString(PyExc_OverflowError,
118	0	"Parser column offset overflow - source line is too big");
119	0	return -1;
120	0	default:
121	0	msg = "unknown parsing error";
122	1.68k	}
123
124	31	RAISE_ERROR_KNOWN_LOCATION(p, errtype, p->tok->lineno,
125	31	col_offset >= 0 ? col_offset : 0,
126	31	p->tok->lineno, -1, msg);
127	31	return -1;
128	1.68k	}
129
130		int
131		_Pypegen_raise_decode_error(Parser *p)
132	97	{
133	97	assert(PyErr_Occurred());
134	97	const char *errtype = NULL;
135	97	if (PyErr_ExceptionMatches(PyExc_UnicodeError)) {
136	90	errtype = "unicode error";
137	90	}
138	7	else if (PyErr_ExceptionMatches(PyExc_ValueError)) {
139	5	errtype = "value error";
140	5	}
141	97	if (errtype) {
142	95	PyObject *type;
143	95	PyObject *value;
144	95	PyObject *tback;
145	95	PyObject *errstr;
146	95	PyErr_Fetch(&type, &value, &tback);
147	95	errstr = PyObject_Str(value);
148	95	if (errstr) {
149	95	RAISE_SYNTAX_ERROR("(%s) %U", errtype, errstr);
150	95	Py_DECREF(errstr);
151	95	}
152	0	else {
153	0	PyErr_Clear();
154	0	RAISE_SYNTAX_ERROR("(%s) unknown error", errtype);
155	0	}
156	95	Py_XDECREF(type);
157	95	Py_XDECREF(value);
158	95	Py_XDECREF(tback);
159	95	}
160
161	97	return -1;
162	97	}
163
164		static int
165	80.5k	_PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) {
166		// Tokenize the whole input to see if there are any tokenization
167		// errors such as mismatching parentheses. These will get priority
168		// over generic syntax errors only if the line number of the error is
169		// before the one that we had for the generic error.
170
171		// We don't want to tokenize to the end for interactive input
172	80.5k	if (p->tok->prompt != NULL) {
173	0	return 0;
174	0	}
175
176	80.5k	PyObject type, value, *traceback;
177	80.5k	PyErr_Fetch(&type, &value, &traceback);
178
179	80.5k	Token *current_token = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1];
180	80.5k	Py_ssize_t current_err_line = current_token->lineno;
181
182	80.5k	int ret = 0;
183	80.5k	struct token new_token;
184	80.5k	_PyToken_Init(&new_token);
185
186	344k	for (;;) {
187	344k	switch (_PyTokenizer_Get(p->tok, &new_token)) {
188	2.63k	case ERRORTOKEN:
189	2.63k	if (PyErr_Occurred()) {
190	541	ret = -1;
191	541	goto exit;
192	541	}
193	2.09k	if (p->tok->level != 0) {
194	2.07k	int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
195	2.07k	if (current_err_line > error_lineno) {
196	69	raise_unclosed_parentheses_error(p);
197	69	ret = -1;
198	69	goto exit;
199	69	}
200	2.07k	}
201	2.02k	break;
202	77.9k	case ENDMARKER:
203	77.9k	break;
204	264k	default:
205	264k	continue;
206	344k	}
207	79.9k	break;
208	344k	}
209
210
211	80.5k	exit:
212	80.5k	_PyToken_Free(&new_token);
213		// If we're in an f-string, we want the syntax error in the expression part
214		// to propagate, so that tokenizer errors (like expecting '}') that happen afterwards
215		// do not swallow it.
216	80.5k	if (PyErr_Occurred() && p->tok->tok_mode_stack_index <= 0) {
217	442	Py_XDECREF(value);
218	442	Py_XDECREF(type);
219	442	Py_XDECREF(traceback);
220	80.1k	} else {
221	80.1k	PyErr_Restore(type, value, traceback);
222	80.1k	}
223	80.5k	return ret;
224	80.5k	}
225
226		// PARSER ERRORS
227
228		void *
229		_PyPegen_raise_error(Parser p, PyObject errtype, int use_mark, const char *errmsg, ...)
230	985	{
231		// Bail out if we already have an error set.
232	985	if (p->error_indicator && PyErr_Occurred()) {
233	114	return NULL;
234	114	}
235	871	if (p->fill == 0) {
236	0	va_list va;
237	0	va_start(va, errmsg);
238	0	_PyPegen_raise_error_known_location(p, errtype, 0, 0, 0, -1, errmsg, va);
239	0	va_end(va);
240	0	return NULL;
241	0	}
242	871	if (use_mark && p->mark == p->fill && _PyPegen_fill_token(p) < 0) {
243	0	p->error_indicator = 1;
244	0	return NULL;
245	0	}
246	871	Token *t = p->known_err_token != NULL
247	871	? p->known_err_token
248	871	: p->tokens[use_mark ? p->mark : p->fill - 1];
249	871	Py_ssize_t col_offset;
250	871	Py_ssize_t end_col_offset = -1;
251	871	if (t->col_offset == -1) {
252	243	if (p->tok->cur == p->tok->buf) {
253	5	col_offset = 0;
254	238	} else {
255	238	const char* start = p->tok->buf ? p->tok->line_start : p->tok->buf;
256	238	col_offset = Py_SAFE_DOWNCAST(p->tok->cur - start, intptr_t, int);
257	238	}
258	628	} else {
259	628	col_offset = t->col_offset + 1;
260	628	}
261
262	871	if (t->end_col_offset != -1) {
263	628	end_col_offset = t->end_col_offset + 1;
264	628	}
265
266	871	va_list va;
267	871	va_start(va, errmsg);
268	871	_PyPegen_raise_error_known_location(p, errtype, t->lineno, col_offset, t->end_lineno, end_col_offset, errmsg, va);
269	871	va_end(va);
270
271	871	return NULL;
272	871	}
273
274		static PyObject *
275		get_error_line_from_tokenizer_buffers(Parser *p, Py_ssize_t lineno)
276	177	{
277		/* If the file descriptor is interactive, the source lines of the current
278		* (multi-line) statement are stored in p->tok->interactive_src_start.
279		* If not, we're parsing from a string, which means that the whole source
280		* is stored in p->tok->str. */
281	177	assert((p->tok->fp == NULL && p->tok->str != NULL) \|\| p->tok->fp != NULL);
282
283	177	char *cur_line = p->tok->fp_interactive ? p->tok->interactive_src_start : p->tok->str;
284	177	if (cur_line == NULL) {
285	0	assert(p->tok->fp_interactive);
286		// We can reach this point if the tokenizer buffers for interactive source have not been
287		// initialized because we failed to decode the original source with the given locale.
288	0	return Py_GetConstant(Py_CONSTANT_EMPTY_STR);
289	0	}
290
291	177	Py_ssize_t relative_lineno = p->starting_lineno ? lineno - p->starting_lineno + 1 : lineno;
292	177	const char* buf_end = p->tok->fp_interactive ? p->tok->interactive_src_end : p->tok->inp;
293
294	177	if (buf_end < cur_line) {
295	0	buf_end = cur_line + strlen(cur_line);
296	0	}
297
298	2.19k	for (int i = 0; i < relative_lineno - 1; i++) {
299	2.02k	char *new_line = strchr(cur_line, '\n');
300		// The assert is here for debug builds but the conditional that
301		// follows is there so in release builds we do not crash at the cost
302		// to report a potentially wrong line.
303	2.02k	assert(new_line != NULL && new_line + 1 < buf_end);
304	2.02k	if (new_line == NULL \|\| new_line + 1 > buf_end) {
305	0	break;
306	0	}
307	2.02k	cur_line = new_line + 1;
308	2.02k	}
309
310	177	char *next_newline;
311	177	if ((next_newline = strchr(cur_line, '\n')) == NULL) { // This is the last line
312	0	next_newline = cur_line + strlen(cur_line);
313	0	}
314	177	return PyUnicode_DecodeUTF8(cur_line, next_newline - cur_line, "replace");
315	177	}
316
317		void *
318		_PyPegen_raise_error_known_location(Parser p, PyObject errtype,
319		Py_ssize_t lineno, Py_ssize_t col_offset,
320		Py_ssize_t end_lineno, Py_ssize_t end_col_offset,
321		const char *errmsg, va_list va)
322	82.9k	{
323		// Bail out if we already have an error set.
324	82.9k	if (p->error_indicator && PyErr_Occurred()) {
325	456	return NULL;
326	456	}
327	82.5k	PyObject *value = NULL;
328	82.5k	PyObject *errstr = NULL;
329	82.5k	PyObject *error_line = NULL;
330	82.5k	PyObject *tmp = NULL;
331	82.5k	p->error_indicator = 1;
332
333	82.5k	if (end_lineno == CURRENT_POS) {
334	27	end_lineno = p->tok->lineno;
335	27	}
336	82.5k	if (end_col_offset == CURRENT_POS) {
337	27	end_col_offset = p->tok->cur - p->tok->line_start;
338	27	}
339
340	82.5k	errstr = PyUnicode_FromFormatV(errmsg, va);
341	82.5k	if (!errstr) {
342	0	goto error;
343	0	}
344
345	82.5k	if (p->tok->fp_interactive && p->tok->interactive_src_start != NULL) {
346	0	error_line = get_error_line_from_tokenizer_buffers(p, lineno);
347	0	}
348	82.5k	else if (p->start_rule == Py_file_input) {
349	82.5k	error_line = _PyErr_ProgramDecodedTextObject(p->tok->filename,
350	82.5k	(int) lineno, p->tok->encoding);
351	82.5k	}
352
353	82.5k	if (!error_line) {
354		/* PyErr_ProgramTextObject was not called or returned NULL. If it was not called,
355		then we need to find the error line from some other source, because
356		p->start_rule != Py_file_input. If it returned NULL, then it either unexpectedly
357		failed or we're parsing from a string or the REPL. There's a third edge case where
358		we're actually parsing from a file, which has an E_EOF SyntaxError and in that case
359		`PyErr_ProgramTextObject` fails because lineno points to last_file_line + 1, which
360		does not physically exist */
361	82.5k	assert(p->tok->fp == NULL \|\| p->tok->fp == stdin \|\| p->tok->done == E_EOF);
362
363	82.5k	if (p->tok->lineno <= lineno && p->tok->inp > p->tok->buf) {
364	82.3k	Py_ssize_t size = p->tok->inp - p->tok->line_start;
365	82.3k	error_line = PyUnicode_DecodeUTF8(p->tok->line_start, size, "replace");
366	82.3k	}
367	177	else if (p->tok->fp == NULL \|\| p->tok->fp == stdin) {
368	177	error_line = get_error_line_from_tokenizer_buffers(p, lineno);
369	177	}
370	0	else {
371	0	error_line = Py_GetConstant(Py_CONSTANT_EMPTY_STR);
372	0	}
373	82.5k	if (!error_line) {
374	0	goto error;
375	0	}
376	82.5k	}
377
378	82.5k	Py_ssize_t col_number = col_offset;
379	82.5k	Py_ssize_t end_col_number = end_col_offset;
380
381	82.5k	col_number = _PyPegen_byte_offset_to_character_offset(error_line, col_offset);
382	82.5k	if (col_number < 0) {
383	0	goto error;
384	0	}
385
386	82.5k	if (end_col_offset > 0) {
387	80.5k	end_col_number = _PyPegen_byte_offset_to_character_offset(error_line, end_col_offset);
388	80.5k	if (end_col_number < 0) {
389	0	goto error;
390	0	}
391	80.5k	}
392
393	82.5k	tmp = Py_BuildValue("(OnnNnn)", p->tok->filename, lineno, col_number, error_line, end_lineno, end_col_number);
394	82.5k	if (!tmp) {
395	0	goto error;
396	0	}
397	82.5k	value = _PyTuple_FromPair(errstr, tmp);
398	82.5k	Py_DECREF(tmp);
399	82.5k	if (!value) {
400	0	goto error;
401	0	}
402	82.5k	PyErr_SetObject(errtype, value);
403
404	82.5k	Py_DECREF(errstr);
405	82.5k	Py_DECREF(value);
406	82.5k	return NULL;
407
408	0	error:
409	0	Py_XDECREF(errstr);
410	0	Py_XDECREF(error_line);
411	0	return NULL;
412	82.5k	}
413
414		void
415	84.2k	_Pypegen_set_syntax_error(Parser* p, Token* last_token) {
416		// Existing syntax error
417	84.2k	if (PyErr_Occurred()) {
418		// Prioritize tokenizer errors to custom syntax errors raised
419		// on the second phase only if the errors come from the parser.
420	5.11k	int is_tok_ok = (p->tok->done == E_DONE \|\| p->tok->done == E_OK);
421	5.11k	if (is_tok_ok && PyErr_ExceptionMatches(PyExc_SyntaxError)) {
422	1.56k	_PyPegen_tokenize_full_source_to_check_for_errors(p);
423	1.56k	}
424		// Propagate the existing syntax error.
425	5.11k	return;
426	5.11k	}
427		// Initialization error
428	79.1k	if (p->fill == 0) {
429	0	RAISE_SYNTAX_ERROR("error at start before reading any input");
430	0	}
431		// Parser encountered EOF (End of File) unexpectedtly
432	79.1k	if (last_token->type == ERRORTOKEN && p->tok->done == E_EOF) {
433	0	if (p->tok->level) {
434	0	raise_unclosed_parentheses_error(p);
435	0	} else {
436	0	RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
437	0	}
438	0	return;
439	0	}
440		// Indentation error in the tokenizer
441	79.1k	if (last_token->type == INDENT \|\| last_token->type == DEDENT) {
442	102	RAISE_INDENTATION_ERROR(last_token->type == INDENT ? "unexpected indent" : "unexpected unindent");
443	102	return;
444	102	}
445		// Unknown error (generic case)
446
447		// Use the last token we found on the first pass to avoid reporting
448		// incorrect locations for generic syntax errors just because we reached
449		// further away when trying to find specific syntax errors in the second
450		// pass.
451	79.0k	RAISE_SYNTAX_ERROR_KNOWN_LOCATION(last_token, "invalid syntax");
452		// _PyPegen_tokenize_full_source_to_check_for_errors will override the existing
453		// generic SyntaxError we just raised if errors are found.
454	79.0k	_PyPegen_tokenize_full_source_to_check_for_errors(p);
455	79.0k	}
456
457		void
458		_Pypegen_stack_overflow(Parser *p)
459	54	{
460	54	p->error_indicator = 1;
461	54	PyErr_SetString(PyExc_MemoryError,
462	54	"Parser stack overflowed - Python source too complex to parse");
463	54	}