Coverage Report

Created: 2026-03-23 06:45

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/cpython/Python/Python-tokenize.c
Line
Count
Source
1
#include "Python.h"
2
#include "errcode.h"
3
#include "internal/pycore_critical_section.h"   // Py_BEGIN_CRITICAL_SECTION
4
#include "internal/pycore_tuple.h"              // _PyTuple_FromPair
5
#include "../Parser/lexer/state.h"
6
#include "../Parser/lexer/lexer.h"
7
#include "../Parser/tokenizer/tokenizer.h"
8
#include "../Parser/pegen.h"                    // _PyPegen_byte_offset_to_character_offset()
9
10
static struct PyModuleDef _tokenizemodule;
11
12
typedef struct {
13
    PyTypeObject *TokenizerIter;
14
} tokenize_state;
15
16
static tokenize_state *
17
322
get_tokenize_state(PyObject *module) {
18
322
    return (tokenize_state *)PyModule_GetState(module);
19
322
}
20
21
/* Look up the module state starting from a type object: resolve the defining
   module through _tokenizemodule, then fetch that module's state. */
#define _tokenize_get_state_by_type(type) \
    get_tokenize_state(PyType_GetModuleByDef((type), &_tokenizemodule))
23
24
#include "pycore_runtime.h"
25
#include "clinic/Python-tokenize.c.h"
26
27
/*[clinic input]
28
module _tokenizer
29
class _tokenizer.tokenizeriter "tokenizeriterobject *" "_tokenize_get_state_by_type(type)->TokenizerIter"
30
[clinic start generated code]*/
31
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=96d98ee2fef7a8bc]*/
32
33
typedef struct
34
{
35
    PyObject_HEAD struct tok_state *tok;
36
    int done;
37
38
    /* Needed to cache line for performance */
39
    PyObject *last_line;
40
    Py_ssize_t last_lineno;
41
    Py_ssize_t last_end_lineno;
42
    Py_ssize_t byte_col_offset_diff;
43
} tokenizeriterobject;
44
45
/*[clinic input]
46
@classmethod
47
_tokenizer.tokenizeriter.__new__ as tokenizeriter_new
48
49
    readline: object
50
    /
51
    *
52
    extra_tokens: bool
53
    encoding: str(c_default="NULL") = 'utf-8'
54
[clinic start generated code]*/
55
56
static PyObject *
57
tokenizeriter_new_impl(PyTypeObject *type, PyObject *readline,
58
                       int extra_tokens, const char *encoding)
59
/*[clinic end generated code: output=7501a1211683ce16 input=f7dddf8a613ae8bd]*/
60
20
{
61
20
    tokenizeriterobject *self = (tokenizeriterobject *)type->tp_alloc(type, 0);
62
20
    if (self == NULL) {
63
0
        return NULL;
64
0
    }
65
20
    PyObject *filename = PyUnicode_FromString("<string>");
66
20
    if (filename == NULL) {
67
0
        return NULL;
68
0
    }
69
20
    self->tok = _PyTokenizer_FromReadline(readline, encoding, 1, 1);
70
20
    if (self->tok == NULL) {
71
0
        Py_DECREF(filename);
72
0
        return NULL;
73
0
    }
74
20
    self->tok->filename = filename;
75
20
    if (extra_tokens) {
76
20
        self->tok->tok_extra_tokens = 1;
77
20
    }
78
20
    self->done = 0;
79
80
20
    self->last_line = NULL;
81
20
    self->byte_col_offset_diff = 0;
82
20
    self->last_lineno = 0;
83
20
    self->last_end_lineno = 0;
84
85
20
    return (PyObject *)self;
86
20
}
87
88
static int
89
_tokenizer_error(tokenizeriterobject *it)
90
0
{
91
0
    _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(it);
92
0
    if (PyErr_Occurred()) {
93
0
        return -1;
94
0
    }
95
96
0
    const char *msg = NULL;
97
0
    PyObject* errtype = PyExc_SyntaxError;
98
0
    struct tok_state *tok = it->tok;
99
0
    switch (tok->done) {
100
0
        case E_TOKEN:
101
0
            msg = "invalid token";
102
0
            break;
103
0
        case E_EOF:
104
0
            PyErr_SetString(PyExc_SyntaxError, "unexpected EOF in multi-line statement");
105
0
            PyErr_SyntaxLocationObject(tok->filename, tok->lineno,
106
0
                                       tok->inp - tok->buf < 0 ? 0 : (int)(tok->inp - tok->buf));
107
0
            return -1;
108
0
        case E_DEDENT:
109
0
            msg = "unindent does not match any outer indentation level";
110
0
            errtype = PyExc_IndentationError;
111
0
            break;
112
0
        case E_INTR:
113
0
            if (!PyErr_Occurred()) {
114
0
                PyErr_SetNone(PyExc_KeyboardInterrupt);
115
0
            }
116
0
            return -1;
117
0
        case E_NOMEM:
118
0
            PyErr_NoMemory();
119
0
            return -1;
120
0
        case E_TABSPACE:
121
0
            errtype = PyExc_TabError;
122
0
            msg = "inconsistent use of tabs and spaces in indentation";
123
0
            break;
124
0
        case E_TOODEEP:
125
0
            errtype = PyExc_IndentationError;
126
0
            msg = "too many levels of indentation";
127
0
            break;
128
0
        case E_LINECONT: {
129
0
            msg = "unexpected character after line continuation character";
130
0
            break;
131
0
        }
132
0
        default:
133
0
            msg = "unknown tokenization error";
134
0
    }
135
136
0
    PyObject* errstr = NULL;
137
0
    PyObject* error_line = NULL;
138
0
    PyObject* tmp = NULL;
139
0
    PyObject* value = NULL;
140
0
    int result = 0;
141
142
0
    Py_ssize_t size = tok->inp - tok->buf;
143
0
    assert(tok->buf[size-1] == '\n');
144
0
    size -= 1; // Remove the newline character from the end of the line
145
0
    error_line = PyUnicode_DecodeUTF8(tok->buf, size, "replace");
146
0
    if (!error_line) {
147
0
        result = -1;
148
0
        goto exit;
149
0
    }
150
151
0
    Py_ssize_t offset = _PyPegen_byte_offset_to_character_offset(error_line, tok->inp - tok->buf);
152
0
    if (offset == -1) {
153
0
        result = -1;
154
0
        goto exit;
155
0
    }
156
0
    tmp = Py_BuildValue("(OnnOOO)", tok->filename, tok->lineno, offset, error_line, Py_None, Py_None);
157
0
    if (!tmp) {
158
0
        result = -1;
159
0
        goto exit;
160
0
    }
161
162
0
    errstr = PyUnicode_FromString(msg);
163
0
    if (!errstr) {
164
0
        result = -1;
165
0
        goto exit;
166
0
    }
167
168
0
    value = _PyTuple_FromPair(errstr, tmp);
169
0
    if (!value) {
170
0
        result = -1;
171
0
        goto exit;
172
0
    }
173
174
0
    PyErr_SetObject(errtype, value);
175
176
0
exit:
177
0
    Py_XDECREF(errstr);
178
0
    Py_XDECREF(error_line);
179
0
    Py_XDECREF(tmp);
180
0
    Py_XDECREF(value);
181
0
    return result;
182
0
}
183
184
static PyObject *
185
_get_current_line(tokenizeriterobject *it, const char *line_start, Py_ssize_t size,
186
                  int *line_changed)
187
2.04k
{
188
2.04k
    _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(it);
189
2.04k
    PyObject *line;
190
2.04k
    if (it->tok->lineno != it->last_lineno) {
191
        // Line has changed since last token, so we fetch the new line and cache it
192
        // in the iter object.
193
320
        Py_XDECREF(it->last_line);
194
320
        line = PyUnicode_DecodeUTF8(line_start, size, "replace");
195
320
        it->last_line = line;
196
320
        it->byte_col_offset_diff = 0;
197
320
    }
198
1.72k
    else {
199
1.72k
        line = it->last_line;
200
1.72k
        *line_changed = 0;
201
1.72k
    }
202
2.04k
    return line;
203
2.04k
}
204
205
static void
206
_get_col_offsets(tokenizeriterobject *it, struct token token, const char *line_start,
207
                 PyObject *line, int line_changed, Py_ssize_t lineno, Py_ssize_t end_lineno,
208
                 Py_ssize_t *col_offset, Py_ssize_t *end_col_offset)
209
2.06k
{
210
2.06k
    _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(it);
211
2.06k
    Py_ssize_t byte_offset = -1;
212
2.06k
    if (token.start != NULL && token.start >= line_start) {
213
2.05k
        byte_offset = token.start - line_start;
214
2.05k
        if (line_changed) {
215
332
            *col_offset = _PyPegen_byte_offset_to_character_offset_line(line, 0, byte_offset);
216
332
            it->byte_col_offset_diff = byte_offset - *col_offset;
217
332
        }
218
1.72k
        else {
219
1.72k
            *col_offset = byte_offset - it->byte_col_offset_diff;
220
1.72k
        }
221
2.05k
    }
222
223
2.06k
    if (token.end != NULL && token.end >= it->tok->line_start) {
224
2.05k
        Py_ssize_t end_byte_offset = token.end - it->tok->line_start;
225
2.05k
        if (lineno == end_lineno) {
226
            // If the whole token is at the same line, we can just use the token.start
227
            // buffer for figuring out the new column offset, since using line is not
228
            // performant for very long lines.
229
2.05k
            Py_ssize_t token_col_offset = _PyPegen_byte_offset_to_character_offset_line(line, byte_offset, end_byte_offset);
230
2.05k
            *end_col_offset = *col_offset + token_col_offset;
231
2.05k
            it->byte_col_offset_diff += token.end - token.start - token_col_offset;
232
2.05k
        }
233
4
        else {
234
4
            *end_col_offset = _PyPegen_byte_offset_to_character_offset_raw(it->tok->line_start, end_byte_offset);
235
4
            it->byte_col_offset_diff += end_byte_offset - *end_col_offset;
236
4
        }
237
2.05k
    }
238
2.06k
    it->last_lineno = lineno;
239
2.06k
    it->last_end_lineno = end_lineno;
240
2.06k
}
241
242
/* tp_iternext: produce the next token as
 *     (type, string, (lineno, col_offset), (end_lineno, end_col_offset), line)
 *
 * Returns a new tuple, or NULL with an exception set (StopIteration once the
 * iterator is done, or the error raised for an ERRORTOKEN).  The whole body
 * runs inside a critical section on the iterator.
 *
 * Reference handling inside this function:
 *   - `str` is always an owned reference and is handed to Py_BuildValue()
 *     through the "N" format unit.
 *   - `line` is always an owned reference too.  _get_current_line() returns
 *     the iterator's cached line without creating a reference, so one is
 *     taken here; it is released on every path after use.
 */
static PyObject *
tokenizeriter_next(PyObject *op)
{
    tokenizeriterobject *it = (tokenizeriterobject*)op;
    PyObject* result = NULL;

    Py_BEGIN_CRITICAL_SECTION(it);

    struct token token;
    _PyToken_Init(&token);

    int type = _PyTokenizer_Get(it->tok, &token);
    if (type == ERRORTOKEN) {
        if(!PyErr_Occurred()) {
            _tokenizer_error(it);
            assert(PyErr_Occurred());
        }
        goto exit;
    }
    /* ERRORTOKEN has already been handled above, so only `done` matters. */
    if (it->done) {
        PyErr_SetString(PyExc_StopIteration, "EOF");
        goto exit;
    }
    PyObject *str = NULL;
    if (token.start == NULL || token.end == NULL) {
        str = Py_GetConstant(Py_CONSTANT_EMPTY_STR);
    }
    else {
        str = PyUnicode_FromStringAndSize(token.start, token.end - token.start);
    }
    if (str == NULL) {
        goto exit;
    }

    int is_trailing_token = 0;
    if (type == ENDMARKER || (type == DEDENT && it->tok->done == E_EOF)) {
        is_trailing_token = 1;
    }

    const char *line_start = ISSTRINGLIT(type) ? it->tok->multi_line_start : it->tok->line_start;
    PyObject* line = NULL;
    int line_changed = 1;
    if (it->tok->tok_extra_tokens && is_trailing_token) {
        line = Py_GetConstant(Py_CONSTANT_EMPTY_STR);
    } else {
        Py_ssize_t size = it->tok->inp - line_start;
        if (size >= 1 && it->tok->implicit_newline) {
            size -= 1;
        }

        /* The cached line belongs to the iterator; take our own reference so
           that releasing `line` below can never drop the cache's reference. */
        line = Py_XNewRef(_get_current_line(it, line_start, size, &line_changed));
    }
    if (line == NULL) {
        Py_DECREF(str);
        goto exit;
    }

    Py_ssize_t lineno = ISSTRINGLIT(type) ? it->tok->first_lineno : it->tok->lineno;
    Py_ssize_t end_lineno = it->tok->lineno;
    Py_ssize_t col_offset = -1;
    Py_ssize_t end_col_offset = -1;
    _get_col_offsets(it, token, line_start, line, line_changed,
                     lineno, end_lineno, &col_offset, &end_col_offset);

    if (it->tok->tok_extra_tokens) {
        if (is_trailing_token) {
            lineno = end_lineno = lineno + 1;
            col_offset = end_col_offset = 0;
        }
        // Necessary adjustments to match the original Python tokenize
        // implementation
        if (type > DEDENT && type < OP) {
            type = OP;
        }
        else if (type == NEWLINE) {
            /* Only release `str` when it is actually being replaced; for an
               implicit newline the original token text is kept as-is. */
            if (!it->tok->implicit_newline) {
                Py_DECREF(str);
                if (it->tok->start[0] == '\r') {
                    str = PyUnicode_FromString("\r\n");
                } else {
                    str = PyUnicode_FromString("\n");
                }
            }
            end_col_offset++;
        }
        else if (type == NL) {
            if (it->tok->implicit_newline) {
                Py_DECREF(str);
                str = Py_GetConstant(Py_CONSTANT_EMPTY_STR);
            }
        }

        if (str == NULL) {
            Py_DECREF(line);
            goto exit;
        }
    }

    /* "N" consumes our reference to `str`; "O" takes its own reference to
       `line`, so ours is released right after. */
    result = Py_BuildValue("(iN(nn)(nn)O)", type, str, lineno, col_offset, end_lineno, end_col_offset, line);
    Py_DECREF(line);
exit:
    _PyToken_Free(&token);
    if (type == ENDMARKER) {
        it->done = 1;
    }

    Py_END_CRITICAL_SECTION();
    return result;
}
351
352
static void
353
tokenizeriter_dealloc(PyObject *op)
354
20
{
355
20
    tokenizeriterobject *it = (tokenizeriterobject*)op;
356
20
    PyTypeObject *tp = Py_TYPE(it);
357
20
    Py_XDECREF(it->last_line);
358
20
    _PyTokenizer_Free(it->tok);
359
20
    tp->tp_free(it);
360
20
    Py_DECREF(tp);
361
20
}
362
363
static PyType_Slot tokenizeriter_slots[] = {
364
    {Py_tp_new, tokenizeriter_new},
365
    {Py_tp_dealloc, tokenizeriter_dealloc},
366
    {Py_tp_getattro, PyObject_GenericGetAttr},
367
    {Py_tp_iter, PyObject_SelfIter},
368
    {Py_tp_iternext, tokenizeriter_next},
369
    {0, NULL},
370
};
371
372
static PyType_Spec tokenizeriter_spec = {
373
    .name = "_tokenize.TokenizerIter",
374
    .basicsize = sizeof(tokenizeriterobject),
375
    .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE),
376
    .slots = tokenizeriter_slots,
377
};
378
379
static int
380
tokenizemodule_exec(PyObject *m)
381
6
{
382
6
    tokenize_state *state = get_tokenize_state(m);
383
6
    if (state == NULL) {
384
0
        return -1;
385
0
    }
386
387
6
    state->TokenizerIter = (PyTypeObject *)PyType_FromModuleAndSpec(m, &tokenizeriter_spec, NULL);
388
6
    if (state->TokenizerIter == NULL) {
389
0
        return -1;
390
0
    }
391
6
    if (PyModule_AddType(m, state->TokenizerIter) < 0) {
392
0
        return -1;
393
0
    }
394
395
6
    return 0;
396
6
}
397
398
static PyMethodDef tokenize_methods[] = {
399
    {NULL, NULL, 0, NULL} /* Sentinel */
400
};
401
402
static PyModuleDef_Slot tokenizemodule_slots[] = {
403
    {Py_mod_exec, tokenizemodule_exec},
404
    {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
405
    {Py_mod_gil, Py_MOD_GIL_NOT_USED},
406
    {0, NULL}
407
};
408
409
static int
410
tokenizemodule_traverse(PyObject *m, visitproc visit, void *arg)
411
316
{
412
316
    tokenize_state *state = get_tokenize_state(m);
413
316
    Py_VISIT(state->TokenizerIter);
414
316
    return 0;
415
316
}
416
417
static int
418
tokenizemodule_clear(PyObject *m)
419
0
{
420
0
    tokenize_state *state = get_tokenize_state(m);
421
0
    Py_CLEAR(state->TokenizerIter);
422
0
    return 0;
423
0
}
424
425
static void
426
tokenizemodule_free(void *m)
427
0
{
428
0
    tokenizemodule_clear((PyObject *)m);
429
0
}
430
431
static struct PyModuleDef _tokenizemodule = {
432
    PyModuleDef_HEAD_INIT,
433
    .m_name = "_tokenize",
434
    .m_size = sizeof(tokenize_state),
435
    .m_slots = tokenizemodule_slots,
436
    .m_methods = tokenize_methods,
437
    .m_traverse = tokenizemodule_traverse,
438
    .m_clear = tokenizemodule_clear,
439
    .m_free = tokenizemodule_free,
440
};
441
442
PyMODINIT_FUNC
443
PyInit__tokenize(void)
444
6
{
445
6
    return PyModuleDef_Init(&_tokenizemodule);
446
6
}