Coverage Report

Created: 2025-07-04 06:49

/src/cpython/Python/Python-tokenize.c
Every instrumented line in this file was reported with an execution count of 0; nothing in Python-tokenize.c was exercised during this run. The source is listed below.
#include "Python.h"
#include "errcode.h"
#include "internal/pycore_critical_section.h"   // Py_BEGIN_CRITICAL_SECTION
#include "../Parser/lexer/state.h"
#include "../Parser/lexer/lexer.h"
#include "../Parser/tokenizer/tokenizer.h"
#include "../Parser/pegen.h"                    // _PyPegen_byte_offset_to_character_offset()

static struct PyModuleDef _tokenizemodule;

typedef struct {
    PyTypeObject *TokenizerIter;
} tokenize_state;

static tokenize_state *
get_tokenize_state(PyObject *module) {
    return (tokenize_state *)PyModule_GetState(module);
}

#define _tokenize_get_state_by_type(type) \
    get_tokenize_state(PyType_GetModuleByDef(type, &_tokenizemodule))

#include "pycore_runtime.h"
#include "clinic/Python-tokenize.c.h"

/*[clinic input]
module _tokenizer
class _tokenizer.tokenizeriter "tokenizeriterobject *" "_tokenize_get_state_by_type(type)->TokenizerIter"
[clinic start generated code]*/
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=96d98ee2fef7a8bc]*/

typedef struct
{
    PyObject_HEAD struct tok_state *tok;
    int done;

    /* Needed to cache line for performance */
    PyObject *last_line;
    Py_ssize_t last_lineno;
    Py_ssize_t last_end_lineno;
    Py_ssize_t byte_col_offset_diff;
} tokenizeriterobject;

/*[clinic input]
@classmethod
_tokenizer.tokenizeriter.__new__ as tokenizeriter_new

    readline: object
    /
    *
    extra_tokens: bool
    encoding: str(c_default="NULL") = 'utf-8'
[clinic start generated code]*/

static PyObject *
tokenizeriter_new_impl(PyTypeObject *type, PyObject *readline,
                       int extra_tokens, const char *encoding)
/*[clinic end generated code: output=7501a1211683ce16 input=f7dddf8a613ae8bd]*/
{
    tokenizeriterobject *self = (tokenizeriterobject *)type->tp_alloc(type, 0);
    if (self == NULL) {
        return NULL;
    }
    PyObject *filename = PyUnicode_FromString("<string>");
    if (filename == NULL) {
        return NULL;
    }
    self->tok = _PyTokenizer_FromReadline(readline, encoding, 1, 1);
    if (self->tok == NULL) {
        Py_DECREF(filename);
        return NULL;
    }
    self->tok->filename = filename;
    if (extra_tokens) {
        self->tok->tok_extra_tokens = 1;
    }
    self->done = 0;

    self->last_line = NULL;
    self->byte_col_offset_diff = 0;
    self->last_lineno = 0;
    self->last_end_lineno = 0;

    return (PyObject *)self;
}

static int
_tokenizer_error(tokenizeriterobject *it)
{
    _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(it);
    if (PyErr_Occurred()) {
        return -1;
    }

    const char *msg = NULL;
    PyObject* errtype = PyExc_SyntaxError;
    struct tok_state *tok = it->tok;
    switch (tok->done) {
        case E_TOKEN:
            msg = "invalid token";
            break;
        case E_EOF:
            PyErr_SetString(PyExc_SyntaxError, "unexpected EOF in multi-line statement");
            PyErr_SyntaxLocationObject(tok->filename, tok->lineno,
                                       tok->inp - tok->buf < 0 ? 0 : (int)(tok->inp - tok->buf));
            return -1;
        case E_DEDENT:
            msg = "unindent does not match any outer indentation level";
            errtype = PyExc_IndentationError;
            break;
        case E_INTR:
            if (!PyErr_Occurred()) {
                PyErr_SetNone(PyExc_KeyboardInterrupt);
            }
            return -1;
        case E_NOMEM:
            PyErr_NoMemory();
            return -1;
        case E_TABSPACE:
            errtype = PyExc_TabError;
            msg = "inconsistent use of tabs and spaces in indentation";
            break;
        case E_TOODEEP:
            errtype = PyExc_IndentationError;
            msg = "too many levels of indentation";
            break;
        case E_LINECONT: {
            msg = "unexpected character after line continuation character";
            break;
        }
        default:
            msg = "unknown tokenization error";
    }

    PyObject* errstr = NULL;
    PyObject* error_line = NULL;
    PyObject* tmp = NULL;
    PyObject* value = NULL;
    int result = 0;

    Py_ssize_t size = tok->inp - tok->buf;
    assert(tok->buf[size-1] == '\n');
    size -= 1; // Remove the newline character from the end of the line
    error_line = PyUnicode_DecodeUTF8(tok->buf, size, "replace");
    if (!error_line) {
        result = -1;
        goto exit;
    }

    Py_ssize_t offset = _PyPegen_byte_offset_to_character_offset(error_line, tok->inp - tok->buf);
    if (offset == -1) {
        result = -1;
        goto exit;
    }
    tmp = Py_BuildValue("(OnnOOO)", tok->filename, tok->lineno, offset, error_line, Py_None, Py_None);
    if (!tmp) {
        result = -1;
        goto exit;
    }

    errstr = PyUnicode_FromString(msg);
    if (!errstr) {
        result = -1;
        goto exit;
    }

    value = PyTuple_Pack(2, errstr, tmp);
    if (!value) {
        result = -1;
        goto exit;
    }

    PyErr_SetObject(errtype, value);

exit:
    Py_XDECREF(errstr);
    Py_XDECREF(error_line);
    Py_XDECREF(tmp);
    Py_XDECREF(value);
    return result;
}

static PyObject *
_get_current_line(tokenizeriterobject *it, const char *line_start, Py_ssize_t size,
                  int *line_changed)
{
    _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(it);
    PyObject *line;
    if (it->tok->lineno != it->last_lineno) {
        // Line has changed since last token, so we fetch the new line and cache it
        // in the iter object.
        Py_XDECREF(it->last_line);
        line = PyUnicode_DecodeUTF8(line_start, size, "replace");
        it->last_line = line;
        it->byte_col_offset_diff = 0;
    }
    else {
        line = it->last_line;
        *line_changed = 0;
    }
    return line;
}

static void
_get_col_offsets(tokenizeriterobject *it, struct token token, const char *line_start,
                 PyObject *line, int line_changed, Py_ssize_t lineno, Py_ssize_t end_lineno,
                 Py_ssize_t *col_offset, Py_ssize_t *end_col_offset)
{
    _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(it);
    Py_ssize_t byte_offset = -1;
    if (token.start != NULL && token.start >= line_start) {
        byte_offset = token.start - line_start;
        if (line_changed) {
            *col_offset = _PyPegen_byte_offset_to_character_offset_line(line, 0, byte_offset);
            it->byte_col_offset_diff = byte_offset - *col_offset;
        }
        else {
            *col_offset = byte_offset - it->byte_col_offset_diff;
        }
    }

    if (token.end != NULL && token.end >= it->tok->line_start) {
        Py_ssize_t end_byte_offset = token.end - it->tok->line_start;
        if (lineno == end_lineno) {
            // If the whole token is at the same line, we can just use the token.start
            // buffer for figuring out the new column offset, since using line is not
            // performant for very long lines.
            Py_ssize_t token_col_offset = _PyPegen_byte_offset_to_character_offset_line(line, byte_offset, end_byte_offset);
            *end_col_offset = *col_offset + token_col_offset;
            it->byte_col_offset_diff += token.end - token.start - token_col_offset;
        }
        else {
            *end_col_offset = _PyPegen_byte_offset_to_character_offset_raw(it->tok->line_start, end_byte_offset);
            it->byte_col_offset_diff += end_byte_offset - *end_col_offset;
        }
    }
    it->last_lineno = lineno;
    it->last_end_lineno = end_lineno;
}

static PyObject *
tokenizeriter_next(PyObject *op)
{
    tokenizeriterobject *it = (tokenizeriterobject*)op;
    PyObject* result = NULL;

    Py_BEGIN_CRITICAL_SECTION(it);

    struct token token;
    _PyToken_Init(&token);

    int type = _PyTokenizer_Get(it->tok, &token);
    if (type == ERRORTOKEN) {
        if(!PyErr_Occurred()) {
            _tokenizer_error(it);
            assert(PyErr_Occurred());
        }
        goto exit;
    }
    if (it->done || type == ERRORTOKEN) {
        PyErr_SetString(PyExc_StopIteration, "EOF");
        it->done = 1;
        goto exit;
    }
    PyObject *str = NULL;
    if (token.start == NULL || token.end == NULL) {
        str = Py_GetConstant(Py_CONSTANT_EMPTY_STR);
    }
    else {
        str = PyUnicode_FromStringAndSize(token.start, token.end - token.start);
    }
    if (str == NULL) {
        goto exit;
    }

    int is_trailing_token = 0;
    if (type == ENDMARKER || (type == DEDENT && it->tok->done == E_EOF)) {
        is_trailing_token = 1;
    }

    const char *line_start = ISSTRINGLIT(type) ? it->tok->multi_line_start : it->tok->line_start;
    PyObject* line = NULL;
    int line_changed = 1;
    if (it->tok->tok_extra_tokens && is_trailing_token) {
        line = Py_GetConstant(Py_CONSTANT_EMPTY_STR);
    } else {
        Py_ssize_t size = it->tok->inp - line_start;
        if (size >= 1 && it->tok->implicit_newline) {
            size -= 1;
        }

        line = _get_current_line(it, line_start, size, &line_changed);
    }
    if (line == NULL) {
        Py_DECREF(str);
        goto exit;
    }

    Py_ssize_t lineno = ISSTRINGLIT(type) ? it->tok->first_lineno : it->tok->lineno;
    Py_ssize_t end_lineno = it->tok->lineno;
    Py_ssize_t col_offset = -1;
    Py_ssize_t end_col_offset = -1;
    _get_col_offsets(it, token, line_start, line, line_changed,
                     lineno, end_lineno, &col_offset, &end_col_offset);

    if (it->tok->tok_extra_tokens) {
        if (is_trailing_token) {
            lineno = end_lineno = lineno + 1;
            col_offset = end_col_offset = 0;
        }
        // Necessary adjustments to match the original Python tokenize
        // implementation
        if (type > DEDENT && type < OP) {
            type = OP;
        }
        else if (type == NEWLINE) {
            Py_DECREF(str);
            if (!it->tok->implicit_newline) {
                if (it->tok->start[0] == '\r') {
                    str = PyUnicode_FromString("\r\n");
                } else {
                    str = PyUnicode_FromString("\n");
                }
            }
            end_col_offset++;
        }
        else if (type == NL) {
            if (it->tok->implicit_newline) {
                Py_DECREF(str);
                str = Py_GetConstant(Py_CONSTANT_EMPTY_STR);
            }
        }

        if (str == NULL) {
            Py_DECREF(line);
            goto exit;
        }
    }

    result = Py_BuildValue("(iN(nn)(nn)O)", type, str, lineno, col_offset, end_lineno, end_col_offset, line);
exit:
    _PyToken_Free(&token);
    if (type == ENDMARKER) {
        it->done = 1;
    }

    Py_END_CRITICAL_SECTION();
    return result;
}

static void
tokenizeriter_dealloc(PyObject *op)
{
    tokenizeriterobject *it = (tokenizeriterobject*)op;
    PyTypeObject *tp = Py_TYPE(it);
    Py_XDECREF(it->last_line);
    _PyTokenizer_Free(it->tok);
    tp->tp_free(it);
    Py_DECREF(tp);
}

static PyType_Slot tokenizeriter_slots[] = {
    {Py_tp_new, tokenizeriter_new},
    {Py_tp_dealloc, tokenizeriter_dealloc},
    {Py_tp_getattro, PyObject_GenericGetAttr},
    {Py_tp_iter, PyObject_SelfIter},
    {Py_tp_iternext, tokenizeriter_next},
    {0, NULL},
};

static PyType_Spec tokenizeriter_spec = {
    .name = "_tokenize.TokenizerIter",
    .basicsize = sizeof(tokenizeriterobject),
    .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE),
    .slots = tokenizeriter_slots,
};

static int
tokenizemodule_exec(PyObject *m)
{
    tokenize_state *state = get_tokenize_state(m);
    if (state == NULL) {
        return -1;
    }

    state->TokenizerIter = (PyTypeObject *)PyType_FromModuleAndSpec(m, &tokenizeriter_spec, NULL);
    if (state->TokenizerIter == NULL) {
        return -1;
    }
    if (PyModule_AddType(m, state->TokenizerIter) < 0) {
        return -1;
    }

    return 0;
}

static PyMethodDef tokenize_methods[] = {
    {NULL, NULL, 0, NULL} /* Sentinel */
};

static PyModuleDef_Slot tokenizemodule_slots[] = {
    {Py_mod_exec, tokenizemodule_exec},
    {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
    {Py_mod_gil, Py_MOD_GIL_NOT_USED},
    {0, NULL}
};

static int
tokenizemodule_traverse(PyObject *m, visitproc visit, void *arg)
{
    tokenize_state *state = get_tokenize_state(m);
    Py_VISIT(state->TokenizerIter);
    return 0;
}

static int
tokenizemodule_clear(PyObject *m)
{
    tokenize_state *state = get_tokenize_state(m);
    Py_CLEAR(state->TokenizerIter);
    return 0;
}

static void
tokenizemodule_free(void *m)
{
    tokenizemodule_clear((PyObject *)m);
}

static struct PyModuleDef _tokenizemodule = {
    PyModuleDef_HEAD_INIT,
    .m_name = "_tokenize",
    .m_size = sizeof(tokenize_state),
    .m_slots = tokenizemodule_slots,
    .m_methods = tokenize_methods,
    .m_traverse = tokenizemodule_traverse,
    .m_clear = tokenizemodule_clear,
    .m_free = tokenizemodule_free,
};

PyMODINIT_FUNC
PyInit__tokenize(void)
{
    return PyModuleDef_Init(&_tokenizemodule);
}
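
The TokenizerIter type defined above is only reachable from Python code, so the zero hit counts mean no test in this run imported and iterated it. Below is a minimal, hypothetical Python sketch of how the iterator could be driven directly: the constructor arguments come from the clinic block (readline, /, *, extra_tokens, encoding='utf-8') and the tuple layout from Py_BuildValue("(iN(nn)(nn)O)") in tokenizeriter_next. That a bytes-producing readline pairs with an explicit encoding is an assumption, not something stated in this file; CPython's own caller lives in Lib/tokenize.py.

import io
import _tokenize

# Assumption: when an explicit encoding is passed, the readline callable yields bytes lines.
readline = io.BytesIO(b"x = 1\nprint(x)\n").readline

# Signature per the clinic input above: (readline, /, *, extra_tokens, encoding='utf-8').
it = _tokenize.TokenizerIter(readline, extra_tokens=False, encoding="utf-8")

# Each item mirrors Py_BuildValue("(iN(nn)(nn)O)", ...) in tokenizeriter_next:
# (token_type, token_string, (start_line, start_col), (end_line, end_col), source_line)
for tok_type, tok_str, start, end, line in it:
    print(tok_type, repr(tok_str), start, end, repr(line))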