/src/cpython/Python/Python-tokenize.c

Source
#include "Python.h"
#include "errcode.h"
#include "internal/pycore_critical_section.h"   // Py_BEGIN_CRITICAL_SECTION
#include "internal/pycore_tuple.h"              // _PyTuple_FromPair
#include "../Parser/lexer/state.h"
#include "../Parser/lexer/lexer.h"
#include "../Parser/tokenizer/tokenizer.h"
#include "../Parser/pegen.h"                    // _PyPegen_byte_offset_to_character_offset()

static struct PyModuleDef _tokenizemodule;

/* Per-module state of the _tokenize extension module. */
typedef struct {
    /* The TokenizerIter type object; assigned in tokenizemodule_exec()
       and released in tokenizemodule_clear(). */
    PyTypeObject *TokenizerIter;
} tokenize_state;

/* Return the state block attached to `module`, i.e. whatever
   PyModule_GetState() reports, viewed as a tokenize_state. */
static tokenize_state *
get_tokenize_state(PyObject *module) {
    void *raw_state = PyModule_GetState(module);
    return (tokenize_state *)raw_state;
}

/* Look up the module state starting from a type object: the module is found
   with PyType_GetModuleByDef(type, &_tokenizemodule) and then handed to
   get_tokenize_state(). Referenced by the Argument Clinic class declaration
   further down in this file. */
#define _tokenize_get_state_by_type(type) \
    get_tokenize_state(PyType_GetModuleByDef(type, &_tokenizemodule))

#include "pycore_runtime.h"
#include "clinic/Python-tokenize.c.h"

/*[clinic input]
module _tokenizer
class _tokenizer.tokenizeriter "tokenizeriterobject *" "_tokenize_get_state_by_type(type)->TokenizerIter"
[clinic start generated code]*/
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=96d98ee2fef7a8bc]*/

/* Instance layout of TokenizerIter objects. */
typedef struct
{
    /* Tokenizer state; created in tokenizeriter_new_impl() and released with
       _PyTokenizer_Free() in tokenizeriter_dealloc(). */
    PyObject_HEAD struct tok_state *tok;
    /* Non-zero once iteration is over: set when ENDMARKER has been seen or
       StopIteration has been raised by tokenizeriter_next(). */
    int done;

    /* Needed to cache line for performance */
    /* Decoded text of the most recently fetched line (owned reference), or
       NULL if nothing has been cached yet / decoding failed. */
    PyObject *last_line;
    /* Start line number of the last token processed by _get_col_offsets();
       compared against tok->lineno to decide whether the cache is stale. */
    Py_ssize_t last_lineno;
    /* End line number of the last token processed; written by
       _get_col_offsets() and not read elsewhere in this file. */
    Py_ssize_t last_end_lineno;
    /* Running difference between byte offsets and character offsets on the
       cached line, maintained by _get_col_offsets(). */
    Py_ssize_t byte_col_offset_diff;
} tokenizeriterobject;

/*[clinic input]
@classmethod
_tokenizer.tokenizeriter.__new__ as tokenizeriter_new

    readline: object
    /
    *
    extra_tokens: bool
    encoding: str(c_default="NULL") = 'utf-8'
[clinic start generated code]*/

static PyObject *
tokenizeriter_new_impl(PyTypeObject *type, PyObject *readline,
                       int extra_tokens, const char *encoding)
/*[clinic end generated code: output=7501a1211683ce16 input=f7dddf8a613ae8bd]*/
{
    /* Create a TokenizerIter reading source lines from `readline`.
     *
     * `extra_tokens` (bool) turns on tok_extra_tokens in the tokenizer state;
     * `encoding` is forwarded to _PyTokenizer_FromReadline().
     * Returns a new reference, or NULL with an exception set on failure.
     *
     * The filename and tokenizer state are built *before* the iterator object
     * is allocated.  Allocating the object first would require releasing it
     * on every later failure, and tokenizeriter_dealloc() is not prepared for
     * an object whose `tok` is still NULL; simply returning NULL (as was done
     * previously) leaked the freshly allocated object.
     */
    PyObject *filename = PyUnicode_FromString("<string>");
    if (filename == NULL) {
        return NULL;
    }

    struct tok_state *tok = _PyTokenizer_FromReadline(readline, encoding, 1, 1);
    if (tok == NULL) {
        Py_DECREF(filename);
        return NULL;
    }
    // From here on the tokenizer state carries the filename reference; it is
    // released together with the state by _PyTokenizer_Free(), exactly as it
    // is when a fully constructed iterator is deallocated.
    tok->filename = filename;
    if (extra_tokens) {
        tok->tok_extra_tokens = 1;
    }

    tokenizeriterobject *self = (tokenizeriterobject *)type->tp_alloc(type, 0);
    if (self == NULL) {
        _PyTokenizer_Free(tok);
        return NULL;
    }

    self->tok = tok;
    self->done = 0;

    // Nothing has been tokenized yet, so the line cache starts out empty.
    self->last_line = NULL;
    self->byte_col_offset_diff = 0;
    self->last_lineno = 0;
    self->last_end_lineno = 0;

    return (PyObject *)self;
}

/* Turn the tokenizer's error state (it->tok->done) into a Python exception.
 *
 * The caller must hold the critical section on `it`.  If an exception is
 * already pending it is left untouched and -1 is returned.  E_EOF, E_INTR and
 * E_NOMEM raise directly and return -1.  Every other code raises `errtype`
 * with the arguments (msg, (filename, lineno, offset, line, None, None)) and
 * returns 0, or -1 if building those arguments failed.
 */
static int
_tokenizer_error(tokenizeriterobject *it)
{
    _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(it);
    if (PyErr_Occurred()) {
        return -1;
    }

    struct tok_state *tok = it->tok;
    PyObject *errtype = PyExc_SyntaxError;
    const char *msg = NULL;

    switch (tok->done) {
        /* Codes that raise immediately, without the location tuple. */
        case E_NOMEM:
            PyErr_NoMemory();
            return -1;
        case E_INTR:
            if (!PyErr_Occurred()) {
                PyErr_SetNone(PyExc_KeyboardInterrupt);
            }
            return -1;
        case E_EOF: {
            PyErr_SetString(PyExc_SyntaxError, "unexpected EOF in multi-line statement");
            Py_ssize_t consumed = tok->inp - tok->buf;
            PyErr_SyntaxLocationObject(tok->filename, tok->lineno,
                                       consumed < 0 ? 0 : (int)consumed);
            return -1;
        }

        /* Codes that only select a message and/or exception type. */
        case E_TOKEN:
            msg = "invalid token";
            break;
        case E_LINECONT:
            msg = "unexpected character after line continuation character";
            break;
        case E_DEDENT:
            errtype = PyExc_IndentationError;
            msg = "unindent does not match any outer indentation level";
            break;
        case E_TOODEEP:
            errtype = PyExc_IndentationError;
            msg = "too many levels of indentation";
            break;
        case E_TABSPACE:
            errtype = PyExc_TabError;
            msg = "inconsistent use of tabs and spaces in indentation";
            break;
        default:
            msg = "unknown tokenization error";
    }

    /* Pessimistic default: only flipped to 0 once the exception is set. */
    int result = -1;
    PyObject *errstr = NULL;
    PyObject *error_line = NULL;
    PyObject *location = NULL;
    PyObject *value = NULL;

    Py_ssize_t size = tok->inp - tok->buf;
    assert(tok->buf[size-1] == '\n');
    size -= 1; // Drop the trailing newline from the reported line
    error_line = PyUnicode_DecodeUTF8(tok->buf, size, "replace");
    if (error_line == NULL) {
        goto exit;
    }

    Py_ssize_t offset = _PyPegen_byte_offset_to_character_offset(error_line, tok->inp - tok->buf);
    if (offset == -1) {
        goto exit;
    }

    location = Py_BuildValue("(OnnOOO)", tok->filename, tok->lineno, offset, error_line, Py_None, Py_None);
    if (location == NULL) {
        goto exit;
    }

    errstr = PyUnicode_FromString(msg);
    if (errstr == NULL) {
        goto exit;
    }

    value = _PyTuple_FromPair(errstr, location);
    if (value == NULL) {
        goto exit;
    }

    PyErr_SetObject(errtype, value);
    result = 0;

exit:
    Py_XDECREF(errstr);
    Py_XDECREF(error_line);
    Py_XDECREF(location);
    Py_XDECREF(value);
    return result;
}

/* Return the text of the current line as a str, using the per-iterator cache.
 *
 * When the tokenizer's line number equals it->last_lineno the cached object
 * is returned and *line_changed is set to 0.  Otherwise the cache is replaced
 * by a fresh decode of `size` bytes at `line_start`, the byte/column
 * difference is reset, and *line_changed is left as the caller set it.
 *
 * The result is the pointer stored in it->last_line: no extra reference is
 * taken for the caller.  It is NULL if decoding failed.  The caller must hold
 * the critical section on `it`.
 */
static PyObject *
_get_current_line(tokenizeriterobject *it, const char *line_start, Py_ssize_t size,
                  int *line_changed)
{
    _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(it);

    if (it->tok->lineno == it->last_lineno) {
        // Still on the cached line: reuse it.
        *line_changed = 0;
        return it->last_line;
    }

    // New line: drop the stale cache entry and decode the new text.
    Py_XDECREF(it->last_line);
    it->last_line = PyUnicode_DecodeUTF8(line_start, size, "replace");
    it->byte_col_offset_diff = 0;
    return it->last_line;
}

/* Compute the character column offsets of `token` and store them through
 * `col_offset` / `end_col_offset`.
 *
 * Each output is written only when the corresponding token pointer is
 * non-NULL and not before its line start; otherwise the caller's value is
 * kept.  it->byte_col_offset_diff is updated along the way so that later
 * tokens on the same cached line can derive their column from a byte offset
 * without rescanning the line.  it->last_lineno and it->last_end_lineno are
 * always updated.  The caller must hold the critical section on `it`.
 */
static void
_get_col_offsets(tokenizeriterobject *it, struct token token, const char *line_start,
                 PyObject *line, int line_changed, Py_ssize_t lineno, Py_ssize_t end_lineno,
                 Py_ssize_t *col_offset, Py_ssize_t *end_col_offset)
{
    _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(it);

    /* Start column. */
    Py_ssize_t start_bytes = -1;
    if (token.start != NULL && token.start >= line_start) {
        start_bytes = token.start - line_start;
        if (!line_changed) {
            // Same line as the previous token: reuse the running difference.
            *col_offset = start_bytes - it->byte_col_offset_diff;
        }
        else {
            *col_offset = _PyPegen_byte_offset_to_character_offset_line(line, 0, start_bytes);
            it->byte_col_offset_diff = start_bytes - *col_offset;
        }
    }

    /* End column; always measured against the tokenizer's current line. */
    const char *cur_line_start = it->tok->line_start;
    if (token.end != NULL && token.end >= cur_line_start) {
        const Py_ssize_t end_bytes = token.end - cur_line_start;
        if (lineno != end_lineno) {
            // Token spans several lines: measure from the raw buffer of the
            // line it ends on.
            *end_col_offset = _PyPegen_byte_offset_to_character_offset_raw(cur_line_start, end_bytes);
            it->byte_col_offset_diff += end_bytes - *end_col_offset;
        }
        else {
            // Token sits on one line: only the token's own span is converted,
            // which avoids rescanning very long lines from the beginning.
            const Py_ssize_t token_chars =
                _PyPegen_byte_offset_to_character_offset_line(line, start_bytes, end_bytes);
            *end_col_offset = *col_offset + token_chars;
            it->byte_col_offset_diff += token.end - token.start - token_chars;
        }
    }

    it->last_lineno = lineno;
    it->last_end_lineno = end_lineno;
}

/* tp_iternext: produce the next token as the tuple
 *     (type, string, (lineno, col_offset), (end_lineno, end_col_offset), line)
 *
 * Returns NULL with an exception set on a tokenizer error, on allocation
 * failure, or (StopIteration) once the iterator is exhausted.  All work is
 * done inside a critical section on the iterator.
 *
 * Reference ownership inside this function:
 *   - `str` is an owned reference; Py_BuildValue("N") consumes it.
 *   - `line` is NOT owned: it is either the pointer cached in it->last_line
 *     (see _get_current_line) or the empty-string constant, and
 *     Py_BuildValue("O") takes its own reference.  It must therefore never be
 *     Py_DECREF'ed here.
 */
static PyObject *
tokenizeriter_next(PyObject *op)
{
    tokenizeriterobject *it = (tokenizeriterobject*)op;
    PyObject* result = NULL;

    Py_BEGIN_CRITICAL_SECTION(it);

    struct token token;
    _PyToken_Init(&token);

    int type = _PyTokenizer_Get(it->tok, &token);
    if (type == ERRORTOKEN) {
        if(!PyErr_Occurred()) {
            _tokenizer_error(it);
            assert(PyErr_Occurred());
        }
        goto exit;
    }
    // ERRORTOKEN has been fully handled above, so only `done` matters here.
    if (it->done) {
        PyErr_SetString(PyExc_StopIteration, "EOF");
        it->done = 1;
        goto exit;
    }
    PyObject *str = NULL;
    if (token.start == NULL || token.end == NULL) {
        str = Py_GetConstant(Py_CONSTANT_EMPTY_STR);
    }
    else {
        str = PyUnicode_FromStringAndSize(token.start, token.end - token.start);
    }
    if (str == NULL) {
        goto exit;
    }

    // ENDMARKER, and DEDENTs emitted at end of input, do not belong to any
    // source line.
    int is_trailing_token = 0;
    if (type == ENDMARKER || (type == DEDENT && it->tok->done == E_EOF)) {
        is_trailing_token = 1;
    }

    const char *line_start = ISSTRINGLIT(type) ? it->tok->multi_line_start : it->tok->line_start;
    PyObject* line = NULL;
    int line_changed = 1;
    if (it->tok->tok_extra_tokens && is_trailing_token) {
        line = Py_GetConstant(Py_CONSTANT_EMPTY_STR);
    } else {
        Py_ssize_t size = it->tok->inp - line_start;
        if (size >= 1 && it->tok->implicit_newline) {
            // Do not expose the newline the tokenizer appended on its own.
            size -= 1;
        }

        line = _get_current_line(it, line_start, size, &line_changed);
    }
    if (line == NULL) {
        Py_DECREF(str);
        goto exit;
    }

    Py_ssize_t lineno = ISSTRINGLIT(type) ? it->tok->first_lineno : it->tok->lineno;
    Py_ssize_t end_lineno = it->tok->lineno;
    Py_ssize_t col_offset = -1;
    Py_ssize_t end_col_offset = -1;
    _get_col_offsets(it, token, line_start, line, line_changed,
                     lineno, end_lineno, &col_offset, &end_col_offset);

    if (it->tok->tok_extra_tokens) {
        if (is_trailing_token) {
            lineno = end_lineno = lineno + 1;
            col_offset = end_col_offset = 0;
        }
        // Necessary adjustments to match the original Python tokenize
        // implementation
        if (type > DEDENT && type < OP) {
            type = OP;
        }
        else if (type == NEWLINE) {
            // Only give up the current string when it is actually being
            // replaced.  Releasing it unconditionally left `str` pointing at
            // an object we no longer owned whenever the newline was implicit.
            if (!it->tok->implicit_newline) {
                Py_DECREF(str);
                if (it->tok->start[0] == '\r') {
                    str = PyUnicode_FromString("\r\n");
                } else {
                    str = PyUnicode_FromString("\n");
                }
            }
            end_col_offset++;
        }
        else if (type == NL) {
            if (it->tok->implicit_newline) {
                Py_DECREF(str);
                str = Py_GetConstant(Py_CONSTANT_EMPTY_STR);
            }
        }

        if (str == NULL) {
            // `line` is borrowed (see the function comment); releasing it
            // here would steal the reference held by it->last_line.
            goto exit;
        }
    }

    result = Py_BuildValue("(iN(nn)(nn)O)", type, str, lineno, col_offset, end_lineno, end_col_offset, line);
exit:
    _PyToken_Free(&token);
    if (type == ENDMARKER) {
        // The next call must raise StopIteration.
        it->done = 1;
    }

    Py_END_CRITICAL_SECTION();
    return result;
}

/* tp_dealloc: release the cached line and the tokenizer state, free the
   instance through the type's tp_free, then drop one reference on the type. */
static void
tokenizeriter_dealloc(PyObject *op)
{
    tokenizeriterobject *self = (tokenizeriterobject *)op;
    // Fetch the type up front: it is still needed after the instance memory
    // has been handed back.
    PyTypeObject *heap_type = Py_TYPE(self);

    Py_XDECREF(self->last_line);
    _PyTokenizer_Free(self->tok);

    heap_type->tp_free(self);
    Py_DECREF(heap_type);
}

/* Slot table for the TokenizerIter type: construction, destruction, generic
   attribute lookup, and the iterator protocol (iter(x) is x; next(x) yields
   tokens via tokenizeriter_next). `tokenizeriter_new` is not defined in this
   file -- presumably it is the Argument Clinic wrapper from the included
   clinic header that calls tokenizeriter_new_impl(). */
static PyType_Slot tokenizeriter_slots[] = {
    {Py_tp_new, tokenizeriter_new},
    {Py_tp_dealloc, tokenizeriter_dealloc},
    {Py_tp_getattro, PyObject_GenericGetAttr},
    {Py_tp_iter, PyObject_SelfIter},
    {Py_tp_iternext, tokenizeriter_next},
    {0, NULL},  /* Sentinel */
};

/* Type specification passed to PyType_FromModuleAndSpec() in
   tokenizemodule_exec() to create the TokenizerIter type. */
static PyType_Spec tokenizeriter_spec = {
    .name = "_tokenize.TokenizerIter",
    .basicsize = sizeof(tokenizeriterobject),
    .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE),
    .slots = tokenizeriter_slots,
};

/* Py_mod_exec handler: create the TokenizerIter type, remember it in the
   module state and add it to the module.  Returns 0 on success, -1 on
   failure. */
static int
tokenizemodule_exec(PyObject *m)
{
    tokenize_state *st = get_tokenize_state(m);
    if (!st) {
        return -1;
    }

    PyObject *iter_type = PyType_FromModuleAndSpec(m, &tokenizeriter_spec, NULL);
    st->TokenizerIter = (PyTypeObject *)iter_type;
    if (!iter_type) {
        return -1;
    }

    return (PyModule_AddType(m, st->TokenizerIter) < 0) ? -1 : 0;
}

/* Module-level function table: contains only the terminating sentinel, so the
   module defines no functions here. */
static PyMethodDef tokenize_methods[] = {
    {NULL, NULL, 0, NULL} /* Sentinel */
};

/* Module definition slots: tokenizemodule_exec() is run as the exec step, and
   the module declares the Py_MOD_PER_INTERPRETER_GIL_SUPPORTED and
   Py_MOD_GIL_NOT_USED settings. */
static PyModuleDef_Slot tokenizemodule_slots[] = {
    {Py_mod_exec, tokenizemodule_exec},
    {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
    {Py_mod_gil, Py_MOD_GIL_NOT_USED},
    {0, NULL}  /* Sentinel */
};

/* m_traverse: report the type object held in the module state to the
   visitor.  Py_VISIT relies on the parameters being named `visit` and `arg`. */
static int
tokenizemodule_traverse(PyObject *m, visitproc visit, void *arg)
{
    tokenize_state *st = get_tokenize_state(m);
    Py_VISIT(st->TokenizerIter);
    return 0;
}

/* m_clear: drop the module state's reference to the TokenizerIter type and
   reset the field to NULL.  Always returns 0. */
static int
tokenizemodule_clear(PyObject *m)
{
    tokenize_state *st = get_tokenize_state(m);
    Py_CLEAR(st->TokenizerIter);
    return 0;
}

static void
tokenizemodule_free(void *m)
{
    tokenizemodule_clear((PyObject *)m);
}

/* Definition of the _tokenize module: it reserves sizeof(tokenize_state)
   bytes of per-module state and is initialised through tokenizemodule_slots
   (see PyInit__tokenize below). */
static struct PyModuleDef _tokenizemodule = {
    PyModuleDef_HEAD_INIT,
    .m_name = "_tokenize",
    .m_size = sizeof(tokenize_state),
    .m_slots = tokenizemodule_slots,
    .m_methods = tokenize_methods,
    .m_traverse = tokenizemodule_traverse,
    .m_clear = tokenizemodule_clear,
    .m_free = tokenizemodule_free,
};

PyMODINIT_FUNC
PyInit__tokenize(void)
{
    return PyModuleDef_Init(&_tokenizemodule);
}

Coverage Report

Created: 2026-03-23 06:45

Line	Count	Source
1		#include "Python.h"
2		#include "errcode.h"
3		#include "internal/pycore_critical_section.h" // Py_BEGIN_CRITICAL_SECTION
4		#include "internal/pycore_tuple.h" // _PyTuple_FromPair
5		#include "../Parser/lexer/state.h"
6		#include "../Parser/lexer/lexer.h"
7		#include "../Parser/tokenizer/tokenizer.h"
8		#include "../Parser/pegen.h" // _PyPegen_byte_offset_to_character_offset()
9
10		static struct PyModuleDef _tokenizemodule;
11
12		typedef struct {
13		PyTypeObject *TokenizerIter;
14		} tokenize_state;
15
16		static tokenize_state *
17	322	get_tokenize_state(PyObject *module) {
18	322	return (tokenize_state *)PyModule_GetState(module);
19	322	}
20
21		#define _tokenize_get_state_by_type(type) \
22		get_tokenize_state(PyType_GetModuleByDef(type, &_tokenizemodule))
23
24		#include "pycore_runtime.h"
25		#include "clinic/Python-tokenize.c.h"
26
27		/*[clinic input]
28		module _tokenizer
29		class _tokenizer.tokenizeriter "tokenizeriterobject *" "_tokenize_get_state_by_type(type)->TokenizerIter"
30		[clinic start generated code]*/
31		/[clinic end generated code: output=da39a3ee5e6b4b0d input=96d98ee2fef7a8bc]/
32
33		typedef struct
34		{
35		PyObject_HEAD struct tok_state *tok;
36		int done;
37
38		/* Needed to cache line for performance */
39		PyObject *last_line;
40		Py_ssize_t last_lineno;
41		Py_ssize_t last_end_lineno;
42		Py_ssize_t byte_col_offset_diff;
43		} tokenizeriterobject;
44
45		/*[clinic input]
46		@classmethod
47		_tokenizer.tokenizeriter.__new__ as tokenizeriter_new
48
49		readline: object
50		/
51		*
52		extra_tokens: bool
53		encoding: str(c_default="NULL") = 'utf-8'
54		[clinic start generated code]*/
55
56		static PyObject *
57		tokenizeriter_new_impl(PyTypeObject type, PyObject readline,
58		int extra_tokens, const char *encoding)
59		/[clinic end generated code: output=7501a1211683ce16 input=f7dddf8a613ae8bd]/
60	20	{
61	20	tokenizeriterobject self = (tokenizeriterobject )type->tp_alloc(type, 0);
62	20	if (self == NULL) {
63	0	return NULL;
64	0	}
65	20	PyObject *filename = PyUnicode_FromString("<string>");
66	20	if (filename == NULL) {
67	0	return NULL;
68	0	}
69	20	self->tok = _PyTokenizer_FromReadline(readline, encoding, 1, 1);
70	20	if (self->tok == NULL) {
71	0	Py_DECREF(filename);
72	0	return NULL;
73	0	}
74	20	self->tok->filename = filename;
75	20	if (extra_tokens) {
76	20	self->tok->tok_extra_tokens = 1;
77	20	}
78	20	self->done = 0;
79
80	20	self->last_line = NULL;
81	20	self->byte_col_offset_diff = 0;
82	20	self->last_lineno = 0;
83	20	self->last_end_lineno = 0;
84
85	20	return (PyObject *)self;
86	20	}
87
88		static int
89		_tokenizer_error(tokenizeriterobject *it)
90	0	{
91	0	_Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(it);
92	0	if (PyErr_Occurred()) {
93	0	return -1;
94	0	}
95
96	0	const char *msg = NULL;
97	0	PyObject* errtype = PyExc_SyntaxError;
98	0	struct tok_state *tok = it->tok;
99	0	switch (tok->done) {
100	0	case E_TOKEN:
101	0	msg = "invalid token";
102	0	break;
103	0	case E_EOF:
104	0	PyErr_SetString(PyExc_SyntaxError, "unexpected EOF in multi-line statement");
105	0	PyErr_SyntaxLocationObject(tok->filename, tok->lineno,
106	0	tok->inp - tok->buf < 0 ? 0 : (int)(tok->inp - tok->buf));
107	0	return -1;
108	0	case E_DEDENT:
109	0	msg = "unindent does not match any outer indentation level";
110	0	errtype = PyExc_IndentationError;
111	0	break;
112	0	case E_INTR:
113	0	if (!PyErr_Occurred()) {
114	0	PyErr_SetNone(PyExc_KeyboardInterrupt);
115	0	}
116	0	return -1;
117	0	case E_NOMEM:
118	0	PyErr_NoMemory();
119	0	return -1;
120	0	case E_TABSPACE:
121	0	errtype = PyExc_TabError;
122	0	msg = "inconsistent use of tabs and spaces in indentation";
123	0	break;
124	0	case E_TOODEEP:
125	0	errtype = PyExc_IndentationError;
126	0	msg = "too many levels of indentation";
127	0	break;
128	0	case E_LINECONT: {
129	0	msg = "unexpected character after line continuation character";
130	0	break;
131	0	}
132	0	default:
133	0	msg = "unknown tokenization error";
134	0	}
135
136	0	PyObject* errstr = NULL;
137	0	PyObject* error_line = NULL;
138	0	PyObject* tmp = NULL;
139	0	PyObject* value = NULL;
140	0	int result = 0;
141
142	0	Py_ssize_t size = tok->inp - tok->buf;
143	0	assert(tok->buf[size-1] == '\n');
144	0	size -= 1; // Remove the newline character from the end of the line
145	0	error_line = PyUnicode_DecodeUTF8(tok->buf, size, "replace");
146	0	if (!error_line) {
147	0	result = -1;
148	0	goto exit;
149	0	}
150
151	0	Py_ssize_t offset = _PyPegen_byte_offset_to_character_offset(error_line, tok->inp - tok->buf);
152	0	if (offset == -1) {
153	0	result = -1;
154	0	goto exit;
155	0	}
156	0	tmp = Py_BuildValue("(OnnOOO)", tok->filename, tok->lineno, offset, error_line, Py_None, Py_None);
157	0	if (!tmp) {
158	0	result = -1;
159	0	goto exit;
160	0	}
161
162	0	errstr = PyUnicode_FromString(msg);
163	0	if (!errstr) {
164	0	result = -1;
165	0	goto exit;
166	0	}
167
168	0	value = _PyTuple_FromPair(errstr, tmp);
169	0	if (!value) {
170	0	result = -1;
171	0	goto exit;
172	0	}
173
174	0	PyErr_SetObject(errtype, value);
175
176	0	exit:
177	0	Py_XDECREF(errstr);
178	0	Py_XDECREF(error_line);
179	0	Py_XDECREF(tmp);
180	0	Py_XDECREF(value);
181	0	return result;
182	0	}
183
184		static PyObject *
185		_get_current_line(tokenizeriterobject it, const char line_start, Py_ssize_t size,
186		int *line_changed)
187	2.04k	{
188	2.04k	_Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(it);
189	2.04k	PyObject *line;
190	2.04k	if (it->tok->lineno != it->last_lineno) {
191		// Line has changed since last token, so we fetch the new line and cache it
192		// in the iter object.
193	320	Py_XDECREF(it->last_line);
194	320	line = PyUnicode_DecodeUTF8(line_start, size, "replace");
195	320	it->last_line = line;
196	320	it->byte_col_offset_diff = 0;
197	320	}
198	1.72k	else {
199	1.72k	line = it->last_line;
200	1.72k	*line_changed = 0;
201	1.72k	}
202	2.04k	return line;
203	2.04k	}
204
205		static void
206		_get_col_offsets(tokenizeriterobject it, struct token token, const char line_start,
207		PyObject *line, int line_changed, Py_ssize_t lineno, Py_ssize_t end_lineno,
208		Py_ssize_t col_offset, Py_ssize_t end_col_offset)
209	2.06k	{
210	2.06k	_Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(it);
211	2.06k	Py_ssize_t byte_offset = -1;
212	2.06k	if (token.start != NULL && token.start >= line_start) {
213	2.05k	byte_offset = token.start - line_start;
214	2.05k	if (line_changed) {
215	332	*col_offset = _PyPegen_byte_offset_to_character_offset_line(line, 0, byte_offset);
216	332	it->byte_col_offset_diff = byte_offset - *col_offset;
217	332	}
218	1.72k	else {
219	1.72k	*col_offset = byte_offset - it->byte_col_offset_diff;
220	1.72k	}
221	2.05k	}
222
223	2.06k	if (token.end != NULL && token.end >= it->tok->line_start) {
224	2.05k	Py_ssize_t end_byte_offset = token.end - it->tok->line_start;
225	2.05k	if (lineno == end_lineno) {
226		// If the whole token is at the same line, we can just use the token.start
227		// buffer for figuring out the new column offset, since using line is not
228		// performant for very long lines.
229	2.05k	Py_ssize_t token_col_offset = _PyPegen_byte_offset_to_character_offset_line(line, byte_offset, end_byte_offset);
230	2.05k	end_col_offset = col_offset + token_col_offset;
231	2.05k	it->byte_col_offset_diff += token.end - token.start - token_col_offset;
232	2.05k	}
233	4	else {
234	4	*end_col_offset = _PyPegen_byte_offset_to_character_offset_raw(it->tok->line_start, end_byte_offset);
235	4	it->byte_col_offset_diff += end_byte_offset - *end_col_offset;
236	4	}
237	2.05k	}
238	2.06k	it->last_lineno = lineno;
239	2.06k	it->last_end_lineno = end_lineno;
240	2.06k	}
241
242		static PyObject *
243		tokenizeriter_next(PyObject *op)
244	2.06k	{
245	2.06k	tokenizeriterobject it = (tokenizeriterobject)op;
246	2.06k	PyObject* result = NULL;
247
248	2.06k	Py_BEGIN_CRITICAL_SECTION(it);
249
250	2.06k	struct token token;
251	2.06k	_PyToken_Init(&token);
252
253	2.06k	int type = _PyTokenizer_Get(it->tok, &token);
254	2.06k	if (type == ERRORTOKEN) {
255	0	if(!PyErr_Occurred()) {
256	0	_tokenizer_error(it);
257	0	assert(PyErr_Occurred());
258	0	}
259	0	goto exit;
260	0	}
261	2.06k	if (it->done \|\| type == ERRORTOKEN) {
262	4	PyErr_SetString(PyExc_StopIteration, "EOF");
263	4	it->done = 1;
264	4	goto exit;
265	4	}
266	2.06k	PyObject *str = NULL;
267	2.06k	if (token.start == NULL \|\| token.end == NULL) {
268	8	str = Py_GetConstant(Py_CONSTANT_EMPTY_STR);
269	8	}
270	2.05k	else {
271	2.05k	str = PyUnicode_FromStringAndSize(token.start, token.end - token.start);
272	2.05k	}
273	2.06k	if (str == NULL) {
274	0	goto exit;
275	0	}
276
277	2.06k	int is_trailing_token = 0;
278	2.06k	if (type == ENDMARKER \|\| (type == DEDENT && it->tok->done == E_EOF)) {
279	20	is_trailing_token = 1;
280	20	}
281
282	2.06k	const char *line_start = ISSTRINGLIT(type) ? it->tok->multi_line_start : it->tok->line_start;
283	2.06k	PyObject* line = NULL;
284	2.06k	int line_changed = 1;
285	2.06k	if (it->tok->tok_extra_tokens && is_trailing_token) {
286	20	line = Py_GetConstant(Py_CONSTANT_EMPTY_STR);
287	2.04k	} else {
288	2.04k	Py_ssize_t size = it->tok->inp - line_start;
289	2.04k	if (size >= 1 && it->tok->implicit_newline) {
290	52	size -= 1;
291	52	}
292
293	2.04k	line = _get_current_line(it, line_start, size, &line_changed);
294	2.04k	}
295	2.06k	if (line == NULL) {
296	0	Py_DECREF(str);
297	0	goto exit;
298	0	}
299
300	2.06k	Py_ssize_t lineno = ISSTRINGLIT(type) ? it->tok->first_lineno : it->tok->lineno;
301	2.06k	Py_ssize_t end_lineno = it->tok->lineno;
302	2.06k	Py_ssize_t col_offset = -1;
303	2.06k	Py_ssize_t end_col_offset = -1;
304	2.06k	_get_col_offsets(it, token, line_start, line, line_changed,
305	2.06k	lineno, end_lineno, &col_offset, &end_col_offset);
306
307	2.06k	if (it->tok->tok_extra_tokens) {
308	2.06k	if (is_trailing_token) {
309	20	lineno = end_lineno = lineno + 1;
310	20	col_offset = end_col_offset = 0;
311	20	}
312		// Necessary adjustments to match the original Python tokenize
313		// implementation
314	2.06k	if (type > DEDENT && type < OP) {
315	796	type = OP;
316	796	}
317	1.26k	else if (type == NEWLINE) {
318	164	Py_DECREF(str);
319	164	if (!it->tok->implicit_newline) {
320	160	if (it->tok->start[0] == '\r') {
321	0	str = PyUnicode_FromString("\r\n");
322	160	} else {
323	160	str = PyUnicode_FromString("\n");
324	160	}
325	160	}
326	164	end_col_offset++;
327	164	}
328	1.10k	else if (type == NL) {
329	140	if (it->tok->implicit_newline) {
330	0	Py_DECREF(str);
331	0	str = Py_GetConstant(Py_CONSTANT_EMPTY_STR);
332	0	}
333	140	}
334
335	2.06k	if (str == NULL) {
336	0	Py_DECREF(line);
337	0	goto exit;
338	0	}
339	2.06k	}
340
341	2.06k	result = Py_BuildValue("(iN(nn)(nn)O)", type, str, lineno, col_offset, end_lineno, end_col_offset, line);
342	2.06k	exit:
343	2.06k	_PyToken_Free(&token);
344	2.06k	if (type == ENDMARKER) {
345	12	it->done = 1;
346	12	}
347
348	2.06k	Py_END_CRITICAL_SECTION();
349	2.06k	return result;
350	2.06k	}
351
352		static void
353		tokenizeriter_dealloc(PyObject *op)
354	20	{
355	20	tokenizeriterobject it = (tokenizeriterobject)op;
356	20	PyTypeObject *tp = Py_TYPE(it);
357	20	Py_XDECREF(it->last_line);
358	20	_PyTokenizer_Free(it->tok);
359	20	tp->tp_free(it);
360	20	Py_DECREF(tp);
361	20	}
362
363		static PyType_Slot tokenizeriter_slots[] = {
364		{Py_tp_new, tokenizeriter_new},
365		{Py_tp_dealloc, tokenizeriter_dealloc},
366		{Py_tp_getattro, PyObject_GenericGetAttr},
367		{Py_tp_iter, PyObject_SelfIter},
368		{Py_tp_iternext, tokenizeriter_next},
369		{0, NULL},
370		};
371
372		static PyType_Spec tokenizeriter_spec = {
373		.name = "_tokenize.TokenizerIter",
374		.basicsize = sizeof(tokenizeriterobject),
375		.flags = (Py_TPFLAGS_DEFAULT \| Py_TPFLAGS_IMMUTABLETYPE),
376		.slots = tokenizeriter_slots,
377		};
378
379		static int
380		tokenizemodule_exec(PyObject *m)
381	6	{
382	6	tokenize_state *state = get_tokenize_state(m);
383	6	if (state == NULL) {
384	0	return -1;
385	0	}
386
387	6	state->TokenizerIter = (PyTypeObject *)PyType_FromModuleAndSpec(m, &tokenizeriter_spec, NULL);
388	6	if (state->TokenizerIter == NULL) {
389	0	return -1;
390	0	}
391	6	if (PyModule_AddType(m, state->TokenizerIter) < 0) {
392	0	return -1;
393	0	}
394
395	6	return 0;
396	6	}
397
398		static PyMethodDef tokenize_methods[] = {
399		{NULL, NULL, 0, NULL} /* Sentinel */
400		};
401
402		static PyModuleDef_Slot tokenizemodule_slots[] = {
403		{Py_mod_exec, tokenizemodule_exec},
404		{Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
405		{Py_mod_gil, Py_MOD_GIL_NOT_USED},
406		{0, NULL}
407		};
408
409		static int
410		tokenizemodule_traverse(PyObject m, visitproc visit, void arg)
411	316	{
412	316	tokenize_state *state = get_tokenize_state(m);
413	316	Py_VISIT(state->TokenizerIter);
414	316	return 0;
415	316	}
416
417		static int
418		tokenizemodule_clear(PyObject *m)
419	0	{
420	0	tokenize_state *state = get_tokenize_state(m);
421	0	Py_CLEAR(state->TokenizerIter);
422	0	return 0;
423	0	}
424
425		static void
426		tokenizemodule_free(void *m)
427	0	{
428	0	tokenizemodule_clear((PyObject *)m);
429	0	}
430
431		static struct PyModuleDef _tokenizemodule = {
432		PyModuleDef_HEAD_INIT,
433		.m_name = "_tokenize",
434		.m_size = sizeof(tokenize_state),
435		.m_slots = tokenizemodule_slots,
436		.m_methods = tokenize_methods,
437		.m_traverse = tokenizemodule_traverse,
438		.m_clear = tokenizemodule_clear,
439		.m_free = tokenizemodule_free,
440		};
441
442		PyMODINIT_FUNC
443		PyInit__tokenize(void)
444	6	{
445	6	return PyModuleDef_Init(&_tokenizemodule);
446	6	}