/src/cpython/Parser/tokenizer/file_tokenizer.c
Line | Count | Source (jump to first uncovered line) |
1 | | #include "Python.h" |
2 | | #include "pycore_call.h" // _PyObject_CallNoArgs() |
3 | | #include "pycore_fileutils.h" // _Py_UniversalNewlineFgetsWithSize() |
4 | | #include "pycore_runtime.h" // _Py_ID() |
5 | | |
6 | | #include "errcode.h" // E_NOMEM |
7 | | |
8 | | #ifdef HAVE_UNISTD_H |
9 | | # include <unistd.h> // lseek(), read() |
10 | | #endif |
11 | | |
12 | | #include "helpers.h" |
13 | | #include "../lexer/state.h" |
14 | | #include "../lexer/lexer.h" |
15 | | #include "../lexer/buffer.h" |
16 | | |
17 | | |
/* Append LINE to the tokenizer's accumulated interactive source buffer
   (tok->interactive_src_start .. tok->interactive_src_end), so the full
   text typed so far can be recovered later.

   If LINE does not end in '\n', one is synthesized at the end and
   tok->implicit_newline is set to 1; otherwise it is reset to 0.

   Return 0 on success (a NULL LINE is a no-op success); return -1 on
   allocation failure, in which case the accumulated buffer is freed,
   both pointers are cleared and tok->done is set to E_NOMEM. */
static int
tok_concatenate_interactive_new_line(struct tok_state *tok, const char *line) {
    assert(tok->fp_interactive);

    if (!line) {
        return 0;
    }

    Py_ssize_t current_size = tok->interactive_src_end - tok->interactive_src_start;
    Py_ssize_t line_size = strlen(line);
    /* For an empty line this reads line[0] == '\0', which is != '\n',
       so the implicit-newline path below is taken. */
    char last_char = line[line_size > 0 ? line_size - 1 : line_size];
    if (last_char != '\n') {
        line_size += 1;  /* reserve one extra byte for the faked '\n' */
    }
    char* new_str = tok->interactive_src_start;

    new_str = PyMem_Realloc(new_str, current_size + line_size + 1);
    if (!new_str) {
        /* Realloc failure leaves the old block allocated: free it
           explicitly before clearing the pointers so nothing leaks. */
        if (tok->interactive_src_start) {
            PyMem_Free(tok->interactive_src_start);
        }
        tok->interactive_src_start = NULL;
        tok->interactive_src_end = NULL;
        tok->done = E_NOMEM;
        return -1;
    }
    /* strcpy also writes the terminating NUL, which the '\n' fix-up
       below may overwrite and re-add one byte later. */
    strcpy(new_str + current_size, line);
    tok->implicit_newline = 0;
    if (last_char != '\n') {
        /* Last line does not end in \n, fake one */
        new_str[current_size + line_size - 1] = '\n';
        new_str[current_size + line_size] = '\0';
        tok->implicit_newline = 1;
    }
    tok->interactive_src_start = new_str;
    tok->interactive_src_end = new_str + current_size + line_size;
    return 0;
}
56 | | |
/* Read the next line of raw (undecoded) input from tok->fp into the
   tokenizer buffer at tok->inp, growing the buffer in BUFSIZ chunks
   until a complete line (ending in '\n') or EOF has been read.

   Return 1 on success or EOF, 0 on error (buffer growth failure, a
   failed interactive-source append, or an empty read into a fresh
   buffer). */
static int
tok_readline_raw(struct tok_state *tok)
{
    do {
        /* Ensure at least BUFSIZ free bytes past tok->inp. */
        if (!_PyLexer_tok_reserve_buf(tok, BUFSIZ)) {
            return 0;
        }
        int n_chars = (int)(tok->end - tok->inp);
        size_t line_size = 0;
        char *line = _Py_UniversalNewlineFgetsWithSize(tok->inp, n_chars, tok->fp, NULL, &line_size);
        if (line == NULL) {
            /* EOF (or read failure): nothing more to append. */
            return 1;
        }
        if (tok->fp_interactive &&
            tok_concatenate_interactive_new_line(tok, line) == -1) {
            return 0;
        }
        tok->inp += line_size;
        if (tok->inp == tok->buf) {
            /* Nothing was read at all. */
            return 0;
        }
    } while (tok->inp[-1] != '\n');  /* loop until the full line is in */
    return 1;
}
81 | | |
/* Read one line via the codec readline callable (tok->decoding_readline),
   recode it to UTF-8 and append it to the tokenizer buffer at tok->inp.
   A line already buffered in tok->decoding_buffer (left there by the
   BOM / coding-spec checks) is consumed first instead of reading.

   Return 1 on success, 0 on error (a Python exception may be set and
   tok's error state updated via _PyTokenizer_error_ret). */
static int
tok_readline_recode(struct tok_state *tok) {
    PyObject *line;
    const char *buf;
    Py_ssize_t buflen;
    line = tok->decoding_buffer;
    if (line == NULL) {
        line = PyObject_CallNoArgs(tok->decoding_readline);
        if (line == NULL) {
            _PyTokenizer_error_ret(tok);
            goto error;
        }
    }
    else {
        /* Take ownership of the buffered line; it must not be
           consumed twice. */
        tok->decoding_buffer = NULL;
    }
    buf = PyUnicode_AsUTF8AndSize(line, &buflen);
    if (buf == NULL) {
        _PyTokenizer_error_ret(tok);
        goto error;
    }
    // Make room for the null terminator *and* potentially
    // an extra newline character that we may need to artificially
    // add.
    size_t buffer_size = buflen + 2;
    if (!_PyLexer_tok_reserve_buf(tok, buffer_size)) {
        goto error;
    }
    memcpy(tok->inp, buf, buflen);
    tok->inp += buflen;
    *tok->inp = '\0';
    if (tok->fp_interactive &&
        tok_concatenate_interactive_new_line(tok, buf) == -1) {
        goto error;
    }
    Py_DECREF(line);
    return 1;
error:
    Py_XDECREF(line);
    return 0;
}
123 | | |
124 | | /* Fetch the next byte from TOK. */ |
125 | 0 | static int fp_getc(struct tok_state *tok) { |
126 | 0 | return getc(tok->fp); |
127 | 0 | } |
128 | | |
129 | | /* Unfetch the last byte back into TOK. */ |
130 | 0 | static void fp_ungetc(int c, struct tok_state *tok) { |
131 | 0 | ungetc(c, tok->fp); |
132 | 0 | } |
133 | | |
/* Set the readline function for TOK to a StreamReader's
   readline function. The StreamReader is named ENC.

   This function is called from _PyTokenizer_check_bom and
   _PyTokenizer_check_coding_spec.

   ENC is usually identical to the future value of tok->encoding,
   except for the (currently unsupported) case of UTF-16.

   Return 1 on success, 0 on failure. */
static int
fp_setreadl(struct tok_state *tok, const char* enc)
{
    PyObject *readline, *open, *stream;
    int fd;
    long pos;

    fd = fileno(tok->fp);
    /* Due to buffering the file offset for fd can be different from the file
     * position of tok->fp. If tok->fp was opened in text mode on Windows,
     * its file position counts CRLF as one char and can't be directly mapped
     * to the file offset for fd. Instead we step back one byte and read to
     * the end of line.*/
    pos = ftell(tok->fp);
    if (pos == -1 ||
        lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
        PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
        return 0;
    }

    open = PyImport_ImportModuleAttrString("io", "open");
    if (open == NULL) {
        return 0;
    }
    /* io.open(fd, "r", -1, enc, None, None, False): text mode with the
       requested encoding; the final False is closefd — fd is borrowed. */
    stream = PyObject_CallFunction(open, "isisOOO",
                                   fd, "r", -1, enc, Py_None, Py_None, Py_False);
    Py_DECREF(open);
    if (stream == NULL) {
        return 0;
    }

    readline = PyObject_GetAttr(stream, &_Py_ID(readline));
    Py_DECREF(stream);
    if (readline == NULL) {
        return 0;
    }
    /* Replace any previous readline callable on the tokenizer. */
    Py_XSETREF(tok->decoding_readline, readline);

    if (pos > 0) {
        /* Consume the partial line we stepped back over so the stream
           is positioned at the start of the next full line. */
        PyObject *bufobj = _PyObject_CallNoArgs(readline);
        if (bufobj == NULL) {
            return 0;
        }
        Py_DECREF(bufobj);
    }

    return 1;
}
191 | | |
/* Underflow handler for interactive (REPL) input: prompt for and read
   one more line, recode it to UTF-8 if needed, and install it in the
   tokenizer buffer.

   Return 1 if a new line is available (or when interactive underflow is
   configured to stop), 0 on EOF, interrupt, or error (tok->done is set
   accordingly). */
static int
tok_underflow_interactive(struct tok_state *tok) {
    if (tok->interactive_underflow == IUNDERFLOW_STOP) {
        /* Caller asked us not to block for more input. */
        tok->done = E_INTERACT_STOP;
        return 1;
    }
    char *newtok = PyOS_Readline(tok->fp ? tok->fp : stdin, stdout, tok->prompt);
    if (newtok != NULL) {
        /* Normalize \r\n and \r line endings to \n; ownership of the
           raw buffer transfers to translate and is released here. */
        char *translated = _PyTokenizer_translate_newlines(newtok, 0, 0, tok);
        PyMem_Free(newtok);
        if (translated == NULL) {
            return 0;
        }
        newtok = translated;
    }
    if (tok->encoding && newtok && *newtok) {
        /* Recode to UTF-8 */
        Py_ssize_t buflen;
        const char* buf;
        PyObject *u = _PyTokenizer_translate_into_utf8(newtok, tok->encoding);
        PyMem_Free(newtok);
        if (u == NULL) {
            tok->done = E_DECODE;
            return 0;
        }
        buflen = PyBytes_GET_SIZE(u);
        buf = PyBytes_AS_STRING(u);
        newtok = PyMem_Malloc(buflen+1);
        if (newtok == NULL) {
            Py_DECREF(u);
            tok->done = E_NOMEM;
            return 0;
        }
        strcpy(newtok, buf);
        Py_DECREF(u);
    }
    if (tok->fp_interactive &&
        tok_concatenate_interactive_new_line(tok, newtok) == -1) {
        PyMem_Free(newtok);
        return 0;
    }
    if (tok->nextprompt != NULL) {
        /* After the first line, switch to the continuation prompt. */
        tok->prompt = tok->nextprompt;
    }
    if (newtok == NULL) {
        /* PyOS_Readline returned NULL: treat as interrupt. */
        tok->done = E_INTR;
    }
    else if (*newtok == '\0') {
        PyMem_Free(newtok);
        tok->done = E_EOF;
    }
    else if (tok->start != NULL) {
        /* A token is in progress: append the new line to the existing
           buffer, preserving positions that point into it. */
        Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;
        _PyLexer_remember_fstring_buffers(tok);
        size_t size = strlen(newtok);
        ADVANCE_LINENO();
        if (!_PyLexer_tok_reserve_buf(tok, size + 1)) {
            PyMem_Free(tok->buf);
            tok->buf = NULL;
            PyMem_Free(newtok);
            return 0;
        }
        memcpy(tok->cur, newtok, size + 1);
        PyMem_Free(newtok);
        tok->inp += size;
        /* Re-anchor saved positions: the buffer may have moved. */
        tok->multi_line_start = tok->buf + cur_multi_line_start;
        _PyLexer_restore_fstring_buffers(tok);
    }
    else {
        /* No token in progress: the new line simply becomes the buffer
           (ownership of newtok transfers to tok->buf). */
        _PyLexer_remember_fstring_buffers(tok);
        ADVANCE_LINENO();
        PyMem_Free(tok->buf);
        tok->buf = newtok;
        tok->cur = tok->buf;
        tok->line_start = tok->buf;
        tok->inp = strchr(tok->buf, '\0');
        tok->end = tok->inp + 1;
        _PyLexer_restore_fstring_buffers(tok);
    }
    if (tok->done != E_OK) {
        if (tok->prompt != NULL) {
            /* Keep the terminal tidy after ^C / ^D. */
            PySys_WriteStderr("\n");
        }
        return 0;
    }

    if (tok->tok_mode_stack_index && !_PyLexer_update_ftstring_expr(tok, 0)) {
        return 0;
    }
    return 1;
}
283 | | |
/* Underflow handler for file input: read the next line into the buffer,
   determining the encoding (BOM / coding cookie) on the first lines and
   switching to a codec-based reader when one is found.

   Return 1 if a new line is available, 0 on EOF or error (tok->done is
   set accordingly). */
static int
tok_underflow_file(struct tok_state *tok) {
    if (tok->start == NULL && !INSIDE_FSTRING(tok)) {
        /* No token in progress: reuse the buffer from the start. */
        tok->cur = tok->inp = tok->buf;
    }
    if (tok->decoding_state == STATE_INIT) {
        /* We have not yet determined the encoding.
           If an encoding is found, use the file-pointer
           reader functions from now on. */
        if (!_PyTokenizer_check_bom(fp_getc, fp_ungetc, fp_setreadl, tok)) {
            _PyTokenizer_error_ret(tok);
            return 0;
        }
        assert(tok->decoding_state != STATE_INIT);
    }
    /* Read until '\n' or EOF */
    if (tok->decoding_readline != NULL) {
        /* We already have a codec associated with this input. */
        if (!tok_readline_recode(tok)) {
            return 0;
        }
    }
    else {
        /* We want a 'raw' read. */
        if (!tok_readline_raw(tok)) {
            return 0;
        }
    }
    if (tok->inp == tok->cur) {
        /* Nothing new was read: end of file. */
        tok->done = E_EOF;
        return 0;
    }
    tok->implicit_newline = 0;
    if (tok->inp[-1] != '\n') {
        assert(tok->inp + 1 < tok->end);
        /* Last line does not end in \n, fake one */
        *tok->inp++ = '\n';
        *tok->inp = '\0';
        tok->implicit_newline = 1;
    }

    if (tok->tok_mode_stack_index && !_PyLexer_update_ftstring_expr(tok, 0)) {
        return 0;
    }

    ADVANCE_LINENO();
    if (tok->decoding_state != STATE_NORMAL) {
        /* A coding cookie may only appear on line 1 or 2; past that,
           stop looking and settle on the current decoding. */
        if (tok->lineno > 2) {
            tok->decoding_state = STATE_NORMAL;
        }
        else if (!_PyTokenizer_check_coding_spec(tok->cur, strlen(tok->cur),
                                                 tok, fp_setreadl))
        {
            return 0;
        }
    }
    /* The default encoding is UTF-8, so make sure we don't have any
       non-UTF-8 sequences in it. */
    if (!tok->encoding && !_PyTokenizer_ensure_utf8(tok->cur, tok)) {
        _PyTokenizer_error_ret(tok);
        return 0;
    }
    assert(tok->done == E_OK);
    return tok->done == E_OK;
}
349 | | |
350 | | /* Set up tokenizer for file */ |
351 | | struct tok_state * |
352 | | _PyTokenizer_FromFile(FILE *fp, const char* enc, |
353 | | const char *ps1, const char *ps2) |
354 | 0 | { |
355 | 0 | struct tok_state *tok = _PyTokenizer_tok_new(); |
356 | 0 | if (tok == NULL) |
357 | 0 | return NULL; |
358 | 0 | if ((tok->buf = (char *)PyMem_Malloc(BUFSIZ)) == NULL) { |
359 | 0 | _PyTokenizer_Free(tok); |
360 | 0 | return NULL; |
361 | 0 | } |
362 | 0 | tok->cur = tok->inp = tok->buf; |
363 | 0 | tok->end = tok->buf + BUFSIZ; |
364 | 0 | tok->fp = fp; |
365 | 0 | tok->prompt = ps1; |
366 | 0 | tok->nextprompt = ps2; |
367 | 0 | if (ps1 || ps2) { |
368 | 0 | tok->underflow = &tok_underflow_interactive; |
369 | 0 | } else { |
370 | 0 | tok->underflow = &tok_underflow_file; |
371 | 0 | } |
372 | 0 | if (enc != NULL) { |
373 | | /* Must copy encoding declaration since it |
374 | | gets copied into the parse tree. */ |
375 | 0 | tok->encoding = _PyTokenizer_new_string(enc, strlen(enc), tok); |
376 | 0 | if (!tok->encoding) { |
377 | 0 | _PyTokenizer_Free(tok); |
378 | 0 | return NULL; |
379 | 0 | } |
380 | 0 | tok->decoding_state = STATE_NORMAL; |
381 | 0 | } |
382 | 0 | return tok; |
383 | 0 | } |
384 | | |
#if defined(__wasi__) || (defined(__EMSCRIPTEN__) && (__EMSCRIPTEN_major__ >= 3))
// fdopen() with borrowed fd. WASI does not provide dup() and Emscripten's
// dup() emulation with open() is slow.
typedef union {
    void *cookie;
    int fd;
} borrowed;

// Read callback for fopencookie(): recover the borrowed fd from the
// cookie and forward to read().
static ssize_t
borrow_read(void *cookie, char *buf, size_t size)
{
    borrowed b = {.cookie = cookie};
    return read(b.fd, (void *)buf, size);
}

// Wrap a borrowed fd in a read-only FILE* without duplicating it.
static FILE *
fdopen_borrow(int fd) {
    // supports only reading. seek fails. close and write are no-ops.
    cookie_io_functions_t io_cb = {borrow_read, NULL, NULL, NULL};
    borrowed b = {.fd = fd};
    return fopencookie(b.cookie, "r", io_cb);
}
#else
// Wrap a borrowed fd in a read-only FILE*. The fd is dup()'ed so that
// fclose() on the returned stream does not close the caller's fd.
static FILE *
fdopen_borrow(int fd) {
    fd = _Py_dup(fd);
    if (fd < 0) {
        return NULL;
    }
    return fdopen(fd, "r");
}
#endif
417 | | |
/* Get the encoding of a Python file. Check for the coding cookie and check if
   the file starts with a BOM.

   _PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
   encoding in the first or second line of the file (in which case the encoding
   should be assumed to be UTF-8).

   The char* returned is malloc'ed via PyMem_Malloc() and thus must be freed
   by the caller. */
char *
_PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
{
    struct tok_state *tok;
    FILE *fp;
    char *encoding = NULL;

    /* Borrow fd so closing the FILE* does not close the caller's fd. */
    fp = fdopen_borrow(fd);
    if (fp == NULL) {
        return NULL;
    }
    tok = _PyTokenizer_FromFile(fp, NULL, NULL, NULL);
    if (tok == NULL) {
        fclose(fp);
        return NULL;
    }
    if (filename != NULL) {
        tok->filename = Py_NewRef(filename);
    }
    else {
        /* Fall back to a placeholder name for error reporting. */
        tok->filename = PyUnicode_FromString("<string>");
        if (tok->filename == NULL) {
            fclose(fp);
            _PyTokenizer_Free(tok);
            return encoding;  /* still NULL here */
        }
    }
    struct token token;
    // We don't want to report warnings here because it could cause infinite recursion
    // if fetching the encoding shows a warning.
    tok->report_warnings = 0;
    /* Tokenize just the first two lines: a coding cookie may only
       appear there. */
    while (tok->lineno < 2 && tok->done == E_OK) {
        _PyToken_Init(&token);
        _PyTokenizer_Get(tok, &token);
        _PyToken_Free(&token);
    }
    fclose(fp);
    if (tok->encoding) {
        /* Copy out: tok->encoding is freed with the tokenizer below. */
        encoding = (char *)PyMem_Malloc(strlen(tok->encoding) + 1);
        if (encoding) {
            strcpy(encoding, tok->encoding);
        }
    }
    _PyTokenizer_Free(tok);
    return encoding;
}