/src/cpython/Python/Python-tokenize.c
Line | Count | Source |
1 | | #include "Python.h" |
2 | | #include "errcode.h" |
3 | | #include "internal/pycore_critical_section.h" // Py_BEGIN_CRITICAL_SECTION |
4 | | #include "../Parser/lexer/state.h" |
5 | | #include "../Parser/lexer/lexer.h" |
6 | | #include "../Parser/tokenizer/tokenizer.h" |
7 | | #include "../Parser/pegen.h" // _PyPegen_byte_offset_to_character_offset() |
8 | | |
9 | | static struct PyModuleDef _tokenizemodule; |
10 | | |
11 | | typedef struct { |
12 | | PyTypeObject *TokenizerIter; |
13 | | } tokenize_state; |
14 | | |
15 | | static tokenize_state * |
16 | 0 | get_tokenize_state(PyObject *module) { |
17 | 0 | return (tokenize_state *)PyModule_GetState(module); |
18 | 0 | } |
19 | | |
20 | | #define _tokenize_get_state_by_type(type) \ |
21 | | get_tokenize_state(PyType_GetModuleByDef(type, &_tokenizemodule)) |
22 | | |
23 | | #include "pycore_runtime.h" |
24 | | #include "clinic/Python-tokenize.c.h" |
25 | | |
26 | | /*[clinic input] |
27 | | module _tokenizer |
28 | | class _tokenizer.tokenizeriter "tokenizeriterobject *" "_tokenize_get_state_by_type(type)->TokenizerIter" |
29 | | [clinic start generated code]*/ |
30 | | /*[clinic end generated code: output=da39a3ee5e6b4b0d input=96d98ee2fef7a8bc]*/ |
31 | | |
32 | | typedef struct |
33 | | { |
34 | | PyObject_HEAD struct tok_state *tok; |
35 | | int done; |
36 | | |
37 | | /* Needed to cache line for performance */ |
38 | | PyObject *last_line; |
39 | | Py_ssize_t last_lineno; |
40 | | Py_ssize_t last_end_lineno; |
41 | | Py_ssize_t byte_col_offset_diff; |
42 | | } tokenizeriterobject; |
43 | | |
44 | | /*[clinic input] |
45 | | @classmethod |
46 | | _tokenizer.tokenizeriter.__new__ as tokenizeriter_new |
47 | | |
48 | | readline: object |
49 | | / |
50 | | * |
51 | | extra_tokens: bool |
52 | | encoding: str(c_default="NULL") = 'utf-8' |
53 | | [clinic start generated code]*/ |
54 | | |
55 | | static PyObject * |
56 | | tokenizeriter_new_impl(PyTypeObject *type, PyObject *readline, |
57 | | int extra_tokens, const char *encoding) |
58 | | /*[clinic end generated code: output=7501a1211683ce16 input=f7dddf8a613ae8bd]*/ |
59 | 0 | { |
60 | 0 | tokenizeriterobject *self = (tokenizeriterobject *)type->tp_alloc(type, 0); |
61 | 0 | if (self == NULL) { |
62 | 0 | return NULL; |
63 | 0 | } |
64 | 0 | PyObject *filename = PyUnicode_FromString("<string>"); |
65 | 0 | if (filename == NULL) { |
66 | 0 | return NULL; |
67 | 0 | } |
68 | 0 | self->tok = _PyTokenizer_FromReadline(readline, encoding, 1, 1); |
69 | 0 | if (self->tok == NULL) { |
70 | 0 | Py_DECREF(filename); |
71 | 0 | return NULL; |
72 | 0 | } |
73 | 0 | self->tok->filename = filename; |
74 | 0 | if (extra_tokens) { |
75 | 0 | self->tok->tok_extra_tokens = 1; |
76 | 0 | } |
77 | 0 | self->done = 0; |
78 | |
79 | 0 | self->last_line = NULL; |
80 | 0 | self->byte_col_offset_diff = 0; |
81 | 0 | self->last_lineno = 0; |
82 | 0 | self->last_end_lineno = 0; |
83 | |
84 | 0 | return (PyObject *)self; |
85 | 0 | } |
86 | | |
87 | | static int |
88 | | _tokenizer_error(tokenizeriterobject *it) |
89 | 0 | { |
90 | 0 | _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(it); |
91 | 0 | if (PyErr_Occurred()) { |
92 | 0 | return -1; |
93 | 0 | } |
94 | | |
95 | 0 | const char *msg = NULL; |
96 | 0 | PyObject* errtype = PyExc_SyntaxError; |
97 | 0 | struct tok_state *tok = it->tok; |
98 | 0 | switch (tok->done) { |
99 | 0 | case E_TOKEN: |
100 | 0 | msg = "invalid token"; |
101 | 0 | break; |
102 | 0 | case E_EOF: |
103 | 0 | PyErr_SetString(PyExc_SyntaxError, "unexpected EOF in multi-line statement"); |
104 | 0 | PyErr_SyntaxLocationObject(tok->filename, tok->lineno, |
105 | 0 | tok->inp - tok->buf < 0 ? 0 : (int)(tok->inp - tok->buf)); |
106 | 0 | return -1; |
107 | 0 | case E_DEDENT: |
108 | 0 | msg = "unindent does not match any outer indentation level"; |
109 | 0 | errtype = PyExc_IndentationError; |
110 | 0 | break; |
111 | 0 | case E_INTR: |
112 | 0 | if (!PyErr_Occurred()) { |
113 | 0 | PyErr_SetNone(PyExc_KeyboardInterrupt); |
114 | 0 | } |
115 | 0 | return -1; |
116 | 0 | case E_NOMEM: |
117 | 0 | PyErr_NoMemory(); |
118 | 0 | return -1; |
119 | 0 | case E_TABSPACE: |
120 | 0 | errtype = PyExc_TabError; |
121 | 0 | msg = "inconsistent use of tabs and spaces in indentation"; |
122 | 0 | break; |
123 | 0 | case E_TOODEEP: |
124 | 0 | errtype = PyExc_IndentationError; |
125 | 0 | msg = "too many levels of indentation"; |
126 | 0 | break; |
127 | 0 | case E_LINECONT: { |
128 | 0 | msg = "unexpected character after line continuation character"; |
129 | 0 | break; |
130 | 0 | } |
131 | 0 | default: |
132 | 0 | msg = "unknown tokenization error"; |
133 | 0 | } |
134 | | |
135 | 0 | PyObject* errstr = NULL; |
136 | 0 | PyObject* error_line = NULL; |
137 | 0 | PyObject* tmp = NULL; |
138 | 0 | PyObject* value = NULL; |
139 | 0 | int result = 0; |
140 | |
141 | 0 | Py_ssize_t size = tok->inp - tok->buf; |
142 | 0 | assert(tok->buf[size-1] == '\n'); |
143 | 0 | size -= 1; // Remove the newline character from the end of the line |
144 | 0 | error_line = PyUnicode_DecodeUTF8(tok->buf, size, "replace"); |
145 | 0 | if (!error_line) { |
146 | 0 | result = -1; |
147 | 0 | goto exit; |
148 | 0 | } |
149 | | |
150 | 0 | Py_ssize_t offset = _PyPegen_byte_offset_to_character_offset(error_line, tok->inp - tok->buf); |
151 | 0 | if (offset == -1) { |
152 | 0 | result = -1; |
153 | 0 | goto exit; |
154 | 0 | } |
155 | 0 | tmp = Py_BuildValue("(OnnOOO)", tok->filename, tok->lineno, offset, error_line, Py_None, Py_None); |
156 | 0 | if (!tmp) { |
157 | 0 | result = -1; |
158 | 0 | goto exit; |
159 | 0 | } |
160 | | |
161 | 0 | errstr = PyUnicode_FromString(msg); |
162 | 0 | if (!errstr) { |
163 | 0 | result = -1; |
164 | 0 | goto exit; |
165 | 0 | } |
166 | | |
167 | 0 | value = PyTuple_Pack(2, errstr, tmp); |
168 | 0 | if (!value) { |
169 | 0 | result = -1; |
170 | 0 | goto exit; |
171 | 0 | } |
172 | | |
173 | 0 | PyErr_SetObject(errtype, value); |
174 | |
175 | 0 | exit: |
176 | 0 | Py_XDECREF(errstr); |
177 | 0 | Py_XDECREF(error_line); |
178 | 0 | Py_XDECREF(tmp); |
179 | 0 | Py_XDECREF(value); |
180 | 0 | return result; |
181 | 0 | } |
182 | | |
183 | | static PyObject * |
184 | | _get_current_line(tokenizeriterobject *it, const char *line_start, Py_ssize_t size, |
185 | | int *line_changed) |
186 | 0 | { |
187 | 0 | _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(it); |
188 | 0 | PyObject *line; |
189 | 0 | if (it->tok->lineno != it->last_lineno) { |
190 | | // Line has changed since last token, so we fetch the new line and cache it |
191 | | // in the iter object. |
192 | 0 | Py_XDECREF(it->last_line); |
193 | 0 | line = PyUnicode_DecodeUTF8(line_start, size, "replace"); |
194 | 0 | it->last_line = line; |
195 | 0 | it->byte_col_offset_diff = 0; |
196 | 0 | } |
197 | 0 | else { |
198 | 0 | line = it->last_line; |
199 | 0 | *line_changed = 0; |
200 | 0 | } |
201 | 0 | return line; |
202 | 0 | } |
203 | | |
204 | | static void |
205 | | _get_col_offsets(tokenizeriterobject *it, struct token token, const char *line_start, |
206 | | PyObject *line, int line_changed, Py_ssize_t lineno, Py_ssize_t end_lineno, |
207 | | Py_ssize_t *col_offset, Py_ssize_t *end_col_offset) |
208 | 0 | { |
209 | 0 | _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(it); |
210 | 0 | Py_ssize_t byte_offset = -1; |
211 | 0 | if (token.start != NULL && token.start >= line_start) { |
212 | 0 | byte_offset = token.start - line_start; |
213 | 0 | if (line_changed) { |
214 | 0 | *col_offset = _PyPegen_byte_offset_to_character_offset_line(line, 0, byte_offset); |
215 | 0 | it->byte_col_offset_diff = byte_offset - *col_offset; |
216 | 0 | } |
217 | 0 | else { |
218 | 0 | *col_offset = byte_offset - it->byte_col_offset_diff; |
219 | 0 | } |
220 | 0 | } |
221 | |
222 | 0 | if (token.end != NULL && token.end >= it->tok->line_start) { |
223 | 0 | Py_ssize_t end_byte_offset = token.end - it->tok->line_start; |
224 | 0 | if (lineno == end_lineno) { |
225 | | // If the whole token is at the same line, we can just use the token.start |
226 | | // buffer for figuring out the new column offset, since using line is not |
227 | | // performant for very long lines. |
228 | 0 | Py_ssize_t token_col_offset = _PyPegen_byte_offset_to_character_offset_line(line, byte_offset, end_byte_offset); |
229 | 0 | *end_col_offset = *col_offset + token_col_offset; |
230 | 0 | it->byte_col_offset_diff += token.end - token.start - token_col_offset; |
231 | 0 | } |
232 | 0 | else { |
233 | 0 | *end_col_offset = _PyPegen_byte_offset_to_character_offset_raw(it->tok->line_start, end_byte_offset); |
234 | 0 | it->byte_col_offset_diff += end_byte_offset - *end_col_offset; |
235 | 0 | } |
236 | 0 | } |
237 | 0 | it->last_lineno = lineno; |
238 | 0 | it->last_end_lineno = end_lineno; |
239 | 0 | } |
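
A note on the column bookkeeping in _get_col_offsets() above: token boundaries are tracked as byte offsets into the UTF-8 line buffer, while the reported col_offset/end_col_offset are character (code point) offsets, so _PyPegen_byte_offset_to_character_offset_line() has to scan the line prefix. Caching byte_col_offset_diff lets every later token on the same line be converted with a plain subtraction instead of a rescan. A minimal standalone sketch of that idea follows (illustration only, not part of this module; utf8_chars is a hypothetical helper):

// Standalone illustration: byte offsets vs. character offsets in UTF-8,
// and the cached-difference trick used by _get_col_offsets() above.
#include <stdio.h>
#include <string.h>

// Count code points in the first nbytes of a UTF-8 buffer by skipping
// continuation bytes (0b10xxxxxx).
static long
utf8_chars(const char *s, long nbytes)
{
    long chars = 0;
    for (long i = 0; i < nbytes; i++) {
        if (((unsigned char)s[i] & 0xC0) != 0x80) {
            chars++;
        }
    }
    return chars;
}

int
main(void)
{
    const char *line = "caf\xC3\xA9 = 1";           // "café = 1"; 'é' is 2 bytes
    long byte_offset = (long)strlen("caf\xC3\xA9 "); // token starts after "café "
    long col_offset = utf8_chars(line, byte_offset); // 5 chars vs. 6 bytes
    long diff = byte_offset - col_offset;            // stays valid for this line
    // A later token on the same line can use col_offset = byte_offset - diff,
    // with no rescan of the line prefix.
    printf("bytes=%ld chars=%ld diff=%ld\n", byte_offset, col_offset, diff);
    return 0;
}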
240 | | |
241 | | static PyObject * |
242 | | tokenizeriter_next(PyObject *op) |
243 | 0 | { |
244 | 0 | tokenizeriterobject *it = (tokenizeriterobject*)op; |
245 | 0 | PyObject* result = NULL; |
246 | |
247 | 0 | Py_BEGIN_CRITICAL_SECTION(it); |
248 | |
249 | 0 | struct token token; |
250 | 0 | _PyToken_Init(&token); |
251 | |
252 | 0 | int type = _PyTokenizer_Get(it->tok, &token); |
253 | 0 | if (type == ERRORTOKEN) { |
254 | 0 | if(!PyErr_Occurred()) { |
255 | 0 | _tokenizer_error(it); |
256 | 0 | assert(PyErr_Occurred()); |
257 | 0 | } |
258 | 0 | goto exit; |
259 | 0 | } |
260 | 0 | if (it->done || type == ERRORTOKEN) { |
261 | 0 | PyErr_SetString(PyExc_StopIteration, "EOF"); |
262 | 0 | it->done = 1; |
263 | 0 | goto exit; |
264 | 0 | } |
265 | 0 | PyObject *str = NULL; |
266 | 0 | if (token.start == NULL || token.end == NULL) { |
267 | 0 | str = Py_GetConstant(Py_CONSTANT_EMPTY_STR); |
268 | 0 | } |
269 | 0 | else { |
270 | 0 | str = PyUnicode_FromStringAndSize(token.start, token.end - token.start); |
271 | 0 | } |
272 | 0 | if (str == NULL) { |
273 | 0 | goto exit; |
274 | 0 | } |
275 | | |
276 | 0 | int is_trailing_token = 0; |
277 | 0 | if (type == ENDMARKER || (type == DEDENT && it->tok->done == E_EOF)) { |
278 | 0 | is_trailing_token = 1; |
279 | 0 | } |
280 | |
281 | 0 | const char *line_start = ISSTRINGLIT(type) ? it->tok->multi_line_start : it->tok->line_start; |
282 | 0 | PyObject* line = NULL; |
283 | 0 | int line_changed = 1; |
284 | 0 | if (it->tok->tok_extra_tokens && is_trailing_token) { |
285 | 0 | line = Py_GetConstant(Py_CONSTANT_EMPTY_STR); |
286 | 0 | } else { |
287 | 0 | Py_ssize_t size = it->tok->inp - line_start; |
288 | 0 | if (size >= 1 && it->tok->implicit_newline) { |
289 | 0 | size -= 1; |
290 | 0 | } |
291 | |
292 | 0 | line = _get_current_line(it, line_start, size, &line_changed); |
293 | 0 | } |
294 | 0 | if (line == NULL) { |
295 | 0 | Py_DECREF(str); |
296 | 0 | goto exit; |
297 | 0 | } |
298 | | |
299 | 0 | Py_ssize_t lineno = ISSTRINGLIT(type) ? it->tok->first_lineno : it->tok->lineno; |
300 | 0 | Py_ssize_t end_lineno = it->tok->lineno; |
301 | 0 | Py_ssize_t col_offset = -1; |
302 | 0 | Py_ssize_t end_col_offset = -1; |
303 | 0 | _get_col_offsets(it, token, line_start, line, line_changed, |
304 | 0 | lineno, end_lineno, &col_offset, &end_col_offset); |
305 | |
306 | 0 | if (it->tok->tok_extra_tokens) { |
307 | 0 | if (is_trailing_token) { |
308 | 0 | lineno = end_lineno = lineno + 1; |
309 | 0 | col_offset = end_col_offset = 0; |
310 | 0 | } |
311 | | // Necessary adjustments to match the original Python tokenize |
312 | | // implementation |
313 | 0 | if (type > DEDENT && type < OP) { |
314 | 0 | type = OP; |
315 | 0 | } |
316 | 0 | else if (type == NEWLINE) { |
317 | 0 | Py_DECREF(str); |
318 | 0 | if (!it->tok->implicit_newline) { |
319 | 0 | if (it->tok->start[0] == '\r') { |
320 | 0 | str = PyUnicode_FromString("\r\n"); |
321 | 0 | } else { |
322 | 0 | str = PyUnicode_FromString("\n"); |
323 | 0 | } |
324 | 0 | } |
325 | 0 | end_col_offset++; |
326 | 0 | } |
327 | 0 | else if (type == NL) { |
328 | 0 | if (it->tok->implicit_newline) { |
329 | 0 | Py_DECREF(str); |
330 | 0 | str = Py_GetConstant(Py_CONSTANT_EMPTY_STR); |
331 | 0 | } |
332 | 0 | } |
333 | |
334 | 0 | if (str == NULL) { |
335 | 0 | Py_DECREF(line); |
336 | 0 | goto exit; |
337 | 0 | } |
338 | 0 | } |
339 | | |
340 | 0 | result = Py_BuildValue("(iN(nn)(nn)O)", type, str, lineno, col_offset, end_lineno, end_col_offset, line); |
341 | 0 | exit: |
342 | 0 | _PyToken_Free(&token); |
343 | 0 | if (type == ENDMARKER) { |
344 | 0 | it->done = 1; |
345 | 0 | } |
346 | |
347 | 0 | Py_END_CRITICAL_SECTION(); |
348 | 0 | return result; |
349 | 0 | } |
350 | | |
351 | | static void |
352 | | tokenizeriter_dealloc(PyObject *op) |
353 | 0 | { |
354 | 0 | tokenizeriterobject *it = (tokenizeriterobject*)op; |
355 | 0 | PyTypeObject *tp = Py_TYPE(it); |
356 | 0 | Py_XDECREF(it->last_line); |
357 | 0 | _PyTokenizer_Free(it->tok); |
358 | 0 | tp->tp_free(it); |
359 | 0 | Py_DECREF(tp); |
360 | 0 | } |
361 | | |
362 | | static PyType_Slot tokenizeriter_slots[] = { |
363 | | {Py_tp_new, tokenizeriter_new}, |
364 | | {Py_tp_dealloc, tokenizeriter_dealloc}, |
365 | | {Py_tp_getattro, PyObject_GenericGetAttr}, |
366 | | {Py_tp_iter, PyObject_SelfIter}, |
367 | | {Py_tp_iternext, tokenizeriter_next}, |
368 | | {0, NULL}, |
369 | | }; |
370 | | |
371 | | static PyType_Spec tokenizeriter_spec = { |
372 | | .name = "_tokenize.TokenizerIter", |
373 | | .basicsize = sizeof(tokenizeriterobject), |
374 | | .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE), |
375 | | .slots = tokenizeriter_slots, |
376 | | }; |
377 | | |
378 | | static int |
379 | | tokenizemodule_exec(PyObject *m) |
380 | 0 | { |
381 | 0 | tokenize_state *state = get_tokenize_state(m); |
382 | 0 | if (state == NULL) { |
383 | 0 | return -1; |
384 | 0 | } |
385 | | |
386 | 0 | state->TokenizerIter = (PyTypeObject *)PyType_FromModuleAndSpec(m, &tokenizeriter_spec, NULL); |
387 | 0 | if (state->TokenizerIter == NULL) { |
388 | 0 | return -1; |
389 | 0 | } |
390 | 0 | if (PyModule_AddType(m, state->TokenizerIter) < 0) { |
391 | 0 | return -1; |
392 | 0 | } |
393 | | |
394 | 0 | return 0; |
395 | 0 | } |
396 | | |
397 | | static PyMethodDef tokenize_methods[] = { |
398 | | {NULL, NULL, 0, NULL} /* Sentinel */ |
399 | | }; |
400 | | |
401 | | static PyModuleDef_Slot tokenizemodule_slots[] = { |
402 | | {Py_mod_exec, tokenizemodule_exec}, |
403 | | {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED}, |
404 | | {Py_mod_gil, Py_MOD_GIL_NOT_USED}, |
405 | | {0, NULL} |
406 | | }; |
407 | | |
408 | | static int |
409 | | tokenizemodule_traverse(PyObject *m, visitproc visit, void *arg) |
410 | 0 | { |
411 | 0 | tokenize_state *state = get_tokenize_state(m); |
412 | 0 | Py_VISIT(state->TokenizerIter); |
413 | 0 | return 0; |
414 | 0 | } |
415 | | |
416 | | static int |
417 | | tokenizemodule_clear(PyObject *m) |
418 | 0 | { |
419 | 0 | tokenize_state *state = get_tokenize_state(m); |
420 | 0 | Py_CLEAR(state->TokenizerIter); |
421 | 0 | return 0; |
422 | 0 | } |
423 | | |
424 | | static void |
425 | | tokenizemodule_free(void *m) |
426 | 0 | { |
427 | 0 | tokenizemodule_clear((PyObject *)m); |
428 | 0 | } |
429 | | |
430 | | static struct PyModuleDef _tokenizemodule = { |
431 | | PyModuleDef_HEAD_INIT, |
432 | | .m_name = "_tokenize", |
433 | | .m_size = sizeof(tokenize_state), |
434 | | .m_slots = tokenizemodule_slots, |
435 | | .m_methods = tokenize_methods, |
436 | | .m_traverse = tokenizemodule_traverse, |
437 | | .m_clear = tokenizemodule_clear, |
438 | | .m_free = tokenizemodule_free, |
439 | | }; |
440 | | |
441 | | PyMODINIT_FUNC |
442 | | PyInit__tokenize(void) |
443 | 0 | { |
444 | 0 | return PyModuleDef_Init(&_tokenizemodule); |
445 | 0 | } |
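
For context on how this type is driven: each __next__ call yields a 5-tuple (type, string, (lineno, col_offset), (end_lineno, end_col_offset), line), matching the Py_BuildValue("(iN(nn)(nn)O)") call in tokenizeriter_next, and iteration stops once ENDMARKER has been produced. Below is a hedged embedding sketch, assuming a CPython build where the internal _tokenize module is importable; error handling is abbreviated for brevity:

// Illustrative embedding sketch (not part of this file): driving
// _tokenize.TokenizerIter through the C API.
#include <Python.h>

int
main(void)
{
    Py_Initialize();

    // Build a readline callable over an in-memory source buffer.
    PyObject *io = PyImport_ImportModule("io");
    PyObject *stream = PyObject_CallMethod(io, "StringIO", "s", "x = 1\n");
    PyObject *readline = PyObject_GetAttrString(stream, "readline");

    // TokenizerIter(readline, *, extra_tokens): extra_tokens is a required
    // keyword-only argument (see the clinic input above).
    PyObject *mod = PyImport_ImportModule("_tokenize");
    PyObject *cls = PyObject_GetAttrString(mod, "TokenizerIter");
    PyObject *args = PyTuple_Pack(1, readline);
    PyObject *kwargs = Py_BuildValue("{s:O}", "extra_tokens", Py_False);
    PyObject *it = PyObject_Call(cls, args, kwargs);

    // Each item is (type, string, (lineno, col), (end_lineno, end_col), line).
    PyObject *item;
    while ((item = PyIter_Next(it)) != NULL) {
        PyObject_Print(item, stdout, Py_PRINT_RAW);
        fputc('\n', stdout);
        Py_DECREF(item);
    }

    Py_XDECREF(it);
    Py_XDECREF(kwargs);
    Py_XDECREF(args);
    Py_XDECREF(cls);
    Py_XDECREF(mod);
    Py_XDECREF(readline);
    Py_XDECREF(stream);
    Py_XDECREF(io);
    if (PyErr_Occurred()) {
        PyErr_Print();  // a real tokenization error, not normal exhaustion
    }
    return Py_FinalizeEx();
}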