/src/cpython/Parser/tokenizer/file_tokenizer.c
Line | Count | Source |
1 | | #include "Python.h" |
2 | | #include "pycore_call.h" // _PyObject_CallNoArgs() |
3 | | #include "pycore_fileutils.h" // _Py_UniversalNewlineFgetsWithSize() |
4 | | #include "pycore_runtime.h" // _Py_ID() |
5 | | |
6 | | #include "errcode.h" // E_NOMEM |
7 | | |
8 | | #ifdef HAVE_UNISTD_H |
9 | | # include <unistd.h> // lseek(), read() |
10 | | #endif |
11 | | |
12 | | #include "helpers.h" |
13 | | #include "../lexer/state.h" |
14 | | #include "../lexer/lexer.h" |
15 | | #include "../lexer/buffer.h" |
16 | | |
17 | | |
18 | | static int |
19 | 0 | tok_concatenate_interactive_new_line(struct tok_state *tok, const char *line) { |
20 | 0 | assert(tok->fp_interactive); |
21 | |
|
22 | 0 | if (!line) { |
23 | 0 | return 0; |
24 | 0 | } |
25 | | |
26 | 0 | Py_ssize_t current_size = tok->interactive_src_end - tok->interactive_src_start; |
27 | 0 | Py_ssize_t line_size = strlen(line); |
28 | 0 | char last_char = line[line_size > 0 ? line_size - 1 : line_size]; |
29 | 0 | if (last_char != '\n') { |
30 | 0 | line_size += 1; |
31 | 0 | } |
32 | 0 | char* new_str = tok->interactive_src_start; |
33 | |
|
34 | 0 | new_str = PyMem_Realloc(new_str, current_size + line_size + 1); |
35 | 0 | if (!new_str) { |
36 | 0 | if (tok->interactive_src_start) { |
37 | 0 | PyMem_Free(tok->interactive_src_start); |
38 | 0 | } |
39 | 0 | tok->interactive_src_start = NULL; |
40 | 0 | tok->interactive_src_end = NULL; |
41 | 0 | tok->done = E_NOMEM; |
42 | 0 | return -1; |
43 | 0 | } |
44 | 0 | strcpy(new_str + current_size, line); |
45 | 0 | tok->implicit_newline = 0; |
46 | 0 | if (last_char != '\n') { |
47 | | /* Last line does not end in \n, fake one */ |
48 | 0 | new_str[current_size + line_size - 1] = '\n'; |
49 | 0 | new_str[current_size + line_size] = '\0'; |
50 | 0 | tok->implicit_newline = 1; |
51 | 0 | } |
52 | 0 | tok->interactive_src_start = new_str; |
53 | 0 | tok->interactive_src_end = new_str + current_size + line_size; |
54 | 0 | return 0; |
55 | 0 | } |
56 | | |
57 | | static int |
58 | | tok_readline_raw(struct tok_state *tok) |
59 | 0 | { |
60 | 0 | do { |
61 | 0 | if (!_PyLexer_tok_reserve_buf(tok, BUFSIZ)) { |
62 | 0 | return 0; |
63 | 0 | } |
64 | 0 | int n_chars = (int)(tok->end - tok->inp); |
65 | 0 | size_t line_size = 0; |
66 | 0 | char *line = _Py_UniversalNewlineFgetsWithSize(tok->inp, n_chars, tok->fp, NULL, &line_size); |
67 | 0 | if (line == NULL) { |
68 | 0 | return 1; |
69 | 0 | } |
70 | 0 | if (tok->fp_interactive && |
71 | 0 | tok_concatenate_interactive_new_line(tok, line) == -1) { |
72 | 0 | return 0; |
73 | 0 | } |
74 | 0 | tok->inp += line_size; |
75 | 0 | if (tok->inp == tok->buf) { |
76 | 0 | return 0; |
77 | 0 | } |
78 | 0 | } while (tok->inp[-1] != '\n'); |
79 | 0 | return 1; |
80 | 0 | } |
81 | | |
/* Read one line via tok->decoding_readline (a codec stream's readline),
   take its UTF-8 form, and append it to the token buffer. A line already
   buffered in tok->decoding_buffer is consumed first. Returns 1 on
   success, 0 on failure (an error is reported via _PyTokenizer_error_ret
   or is already set). */
static int
tok_readline_recode(struct tok_state *tok) {
    PyObject *line;
    const char *buf;
    Py_ssize_t buflen;
    line = tok->decoding_buffer;
    if (line == NULL) {
        line = PyObject_CallNoArgs(tok->decoding_readline);
        if (line == NULL) {
            _PyTokenizer_error_ret(tok);
            goto error;
        }
    }
    else {
        /* Take ownership of the previously buffered line. */
        tok->decoding_buffer = NULL;
    }
    /* buf is borrowed from `line` and stays valid while we hold it. */
    buf = PyUnicode_AsUTF8AndSize(line, &buflen);
    if (buf == NULL) {
        _PyTokenizer_error_ret(tok);
        goto error;
    }
    // Make room for the null terminator *and* potentially
    // an extra newline character that we may need to artificially
    // add.
    size_t buffer_size = buflen + 2;
    if (!_PyLexer_tok_reserve_buf(tok, buffer_size)) {
        goto error;
    }
    memcpy(tok->inp, buf, buflen);
    tok->inp += buflen;
    *tok->inp = '\0';
    if (tok->fp_interactive &&
        tok_concatenate_interactive_new_line(tok, buf) == -1) {
        goto error;
    }
    Py_DECREF(line);
    return 1;
error:
    Py_XDECREF(line);
    return 0;
}
123 | | |
124 | | /* Fetch the next byte from TOK. */ |
125 | 0 | static int fp_getc(struct tok_state *tok) { |
126 | 0 | return getc(tok->fp); |
127 | 0 | } |
128 | | |
129 | | /* Unfetch the last byte back into TOK. */ |
130 | 0 | static void fp_ungetc(int c, struct tok_state *tok) { |
131 | 0 | ungetc(c, tok->fp); |
132 | 0 | } |
133 | | |
/* Set the readline function for TOK to a StreamReader's
   readline function. The StreamReader is named ENC.

   This function is called from _PyTokenizer_check_bom and _PyTokenizer_check_coding_spec.

   ENC is usually identical to the future value of tok->encoding,
   except for the (currently unsupported) case of UTF-16.

   Return 1 on success, 0 on failure. */
static int
fp_setreadl(struct tok_state *tok, const char* enc)
{
    PyObject *readline, *open, *stream;
    int fd;
    long pos;

    fd = fileno(tok->fp);
    /* Due to buffering the file offset for fd can be different from the file
     * position of tok->fp. If tok->fp was opened in text mode on Windows,
     * its file position counts CRLF as one char and can't be directly mapped
     * to the file offset for fd. Instead we step back one byte and read to
     * the end of line.*/
    pos = ftell(tok->fp);
    if (pos == -1 ||
        lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
        PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
        return 0;
    }

    /* Equivalent of io.open(fd, "r", -1, enc, None, None, False):
       a text-mode reader over the borrowed fd; closefd=False so tok->fp
       keeps ownership of the descriptor. */
    open = PyImport_ImportModuleAttrString("io", "open");
    if (open == NULL) {
        return 0;
    }
    stream = PyObject_CallFunction(open, "isisOOO",
                                   fd, "r", -1, enc, Py_None, Py_None, Py_False);
    Py_DECREF(open);
    if (stream == NULL) {
        return 0;
    }

    readline = PyObject_GetAttr(stream, &_Py_ID(readline));
    Py_DECREF(stream);
    if (readline == NULL) {
        return 0;
    }
    /* Replace any previous readline; tok now owns the reference. */
    Py_XSETREF(tok->decoding_readline, readline);

    /* If we stepped back one byte above, consume the remainder of that
       line so the next readline() starts at the real current position. */
    if (pos > 0) {
        PyObject *bufobj = _PyObject_CallNoArgs(readline);
        if (bufobj == NULL) {
            return 0;
        }
        Py_DECREF(bufobj);
    }

    return 1;
}
191 | | |
/* Underflow handler for interactive (REPL) input: prompt for one more
   line, recode it to UTF-8 if an explicit encoding is set, and splice it
   into the token buffer. Returns 1 on success, 0 on failure (tok->done
   records the reason: E_INTR, E_EOF, E_NOMEM, E_DECODE, ...). */
static int
tok_underflow_interactive(struct tok_state *tok) {
    if (tok->interactive_underflow == IUNDERFLOW_STOP) {
        /* Caller asked us not to prompt for more input. */
        tok->done = E_INTERACT_STOP;
        return 1;
    }
    char *newtok = PyOS_Readline(tok->fp ? tok->fp : stdin, stdout, tok->prompt);
    if (newtok != NULL) {
        /* Normalize \r\n and \r line endings to \n. */
        char *translated = _PyTokenizer_translate_newlines(newtok, 0, 0, tok);
        PyMem_Free(newtok);
        if (translated == NULL) {
            return 0;
        }
        newtok = translated;
    }
    if (tok->encoding && newtok && *newtok) {
        /* Recode to UTF-8 */
        Py_ssize_t buflen;
        const char* buf;
        PyObject *u = _PyTokenizer_translate_into_utf8(newtok, tok->encoding);
        PyMem_Free(newtok);
        if (u == NULL) {
            tok->done = E_DECODE;
            return 0;
        }
        buflen = PyBytes_GET_SIZE(u);
        buf = PyBytes_AS_STRING(u);
        /* Copy out of the bytes object so newtok outlives it. */
        newtok = PyMem_Malloc(buflen+1);
        if (newtok == NULL) {
            Py_DECREF(u);
            tok->done = E_NOMEM;
            return 0;
        }
        strcpy(newtok, buf);
        Py_DECREF(u);
    }
    if (tok->fp_interactive &&
        tok_concatenate_interactive_new_line(tok, newtok) == -1) {
        PyMem_Free(newtok);
        return 0;
    }
    if (tok->nextprompt != NULL) {
        /* After the first line, switch to the continuation prompt (ps2). */
        tok->prompt = tok->nextprompt;
    }
    if (newtok == NULL) {
        /* PyOS_Readline returned NULL: interrupted. */
        tok->done = E_INTR;
    }
    else if (*newtok == '\0') {
        /* Empty string signals end of input. */
        PyMem_Free(newtok);
        tok->done = E_EOF;
    }
    else if (tok->start != NULL) {
        /* A token is in progress: append the new line to the existing
           buffer. Save buffer-relative offsets first, because
           _PyLexer_tok_reserve_buf may reallocate tok->buf. */
        Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;
        _PyLexer_remember_fstring_buffers(tok);
        size_t size = strlen(newtok);
        ADVANCE_LINENO();
        if (!_PyLexer_tok_reserve_buf(tok, size + 1)) {
            PyMem_Free(tok->buf);
            tok->buf = NULL;
            PyMem_Free(newtok);
            return 0;
        }
        memcpy(tok->cur, newtok, size + 1);
        PyMem_Free(newtok);
        tok->inp += size;
        /* Re-derive saved pointers against the (possibly moved) buffer. */
        tok->multi_line_start = tok->buf + cur_multi_line_start;
        _PyLexer_restore_fstring_buffers(tok);
    }
    else {
        /* No token in progress: the new line becomes the whole buffer
           (ownership of newtok transfers to tok->buf). */
        _PyLexer_remember_fstring_buffers(tok);
        ADVANCE_LINENO();
        PyMem_Free(tok->buf);
        tok->buf = newtok;
        tok->cur = tok->buf;
        tok->line_start = tok->buf;
        tok->inp = strchr(tok->buf, '\0');
        tok->end = tok->inp + 1;
        _PyLexer_restore_fstring_buffers(tok);
    }
    if (tok->done != E_OK) {
        if (tok->prompt != NULL) {
            PySys_WriteStderr("\n");
        }
        return 0;
    }

    /* Keep f-string expression buffers in sync with the new input. */
    if (tok->tok_mode_stack_index && !_PyLexer_update_ftstring_expr(tok, 0)) {
        return 0;
    }
    return 1;
}
283 | | |
/* Underflow handler for file input: read the next line into the token
   buffer, driving encoding detection (BOM / coding cookie) over the
   first two lines. Returns 1 on success, 0 on error or EOF (tok->done
   is set accordingly). */
static int
tok_underflow_file(struct tok_state *tok)
{
    if (tok->decoding_state == STATE_INIT) {
        /* We have not yet determined the encoding.
           If an encoding is found, use the file-pointer
           reader functions from now on. */
        if (!_PyTokenizer_check_bom(fp_getc, fp_ungetc, fp_setreadl, tok)) {
            _PyTokenizer_error_ret(tok);
            return 0;
        }
        assert(tok->decoding_state != STATE_INIT);
    }
    /* raw == no codec attached yet: read bytes directly. */
    int raw = tok->decoding_readline == NULL;
    if (raw && tok->decoding_state != STATE_NORMAL) {
        /* Keep the first line in the buffer to validate it later if
         * the encoding has not yet been determined. */
    }
    else if (tok->start == NULL && !INSIDE_FSTRING(tok)) {
        /* No token in progress: reuse the buffer from the start. */
        tok->cur = tok->inp = tok->buf;
    }
    /* Read until '\n' or EOF */
    if (!raw) {
        /* We already have a codec associated with this input. */
        if (!tok_readline_recode(tok)) {
            return 0;
        }
    }
    else {
        /* We want a 'raw' read. */
        if (!tok_readline_raw(tok)) {
            return 0;
        }
    }
    if (tok->inp == tok->cur) {
        /* Nothing was read: end of file. */
        tok->done = E_EOF;
        return 0;
    }
    tok->implicit_newline = 0;
    if (tok->inp[-1] != '\n') {
        assert(tok->inp + 1 < tok->end);
        /* Last line does not end in \n, fake one */
        *tok->inp++ = '\n';
        *tok->inp = '\0';
        tok->implicit_newline = 1;
    }

    /* Keep f-string expression buffers in sync with the new input. */
    if (tok->tok_mode_stack_index && !_PyLexer_update_ftstring_expr(tok, 0)) {
        return 0;
    }

    ADVANCE_LINENO();
    if (tok->decoding_state != STATE_NORMAL) {
        /* Still scanning for a coding cookie. */
        if (!_PyTokenizer_check_coding_spec(tok->cur, strlen(tok->cur),
                                            tok, fp_setreadl))
        {
            return 0;
        }
        if (tok->lineno >= 2) {
            /* A coding cookie may only appear on the first two lines. */
            tok->decoding_state = STATE_NORMAL;
        }
    }
    if (raw && tok->decoding_state == STATE_NORMAL) {
        /* On lines 1-2 the buffer may still hold both lines, so validate
           from tok->buf; afterwards only the freshly read line. */
        const char *line = tok->lineno <= 2 ? tok->buf : tok->cur;
        int lineno = tok->lineno <= 2 ? 1 : tok->lineno;
        if (!tok->encoding) {
            /* The default encoding is UTF-8, so make sure we don't have any
               non-UTF-8 sequences in it. */
            if (!_PyTokenizer_ensure_utf8(line, tok, lineno)) {
                _PyTokenizer_error_ret(tok);
                return 0;
            }
        }
        else {
            /* Verify the line decodes under the declared encoding. */
            PyObject *tmp = PyUnicode_Decode(line, strlen(line),
                                             tok->encoding, NULL);
            if (tmp == NULL) {
                _PyTokenizer_error_ret(tok);
                return 0;
            }
            Py_DECREF(tmp);
        }
    }
    assert(tok->done == E_OK);
    return tok->done == E_OK;
}
370 | | |
371 | | /* Set up tokenizer for file */ |
372 | | struct tok_state * |
373 | | _PyTokenizer_FromFile(FILE *fp, const char* enc, |
374 | | const char *ps1, const char *ps2) |
375 | 0 | { |
376 | 0 | struct tok_state *tok = _PyTokenizer_tok_new(); |
377 | 0 | if (tok == NULL) |
378 | 0 | return NULL; |
379 | 0 | if ((tok->buf = (char *)PyMem_Malloc(BUFSIZ)) == NULL) { |
380 | 0 | _PyTokenizer_Free(tok); |
381 | 0 | return NULL; |
382 | 0 | } |
383 | 0 | tok->cur = tok->inp = tok->buf; |
384 | 0 | tok->end = tok->buf + BUFSIZ; |
385 | 0 | tok->fp = fp; |
386 | 0 | tok->prompt = ps1; |
387 | 0 | tok->nextprompt = ps2; |
388 | 0 | if (ps1 || ps2) { |
389 | 0 | tok->underflow = &tok_underflow_interactive; |
390 | 0 | } else { |
391 | 0 | tok->underflow = &tok_underflow_file; |
392 | 0 | } |
393 | 0 | if (enc != NULL) { |
394 | | /* Must copy encoding declaration since it |
395 | | gets copied into the parse tree. */ |
396 | 0 | tok->encoding = _PyTokenizer_new_string(enc, strlen(enc), tok); |
397 | 0 | if (!tok->encoding) { |
398 | 0 | _PyTokenizer_Free(tok); |
399 | 0 | return NULL; |
400 | 0 | } |
401 | 0 | tok->decoding_state = STATE_NORMAL; |
402 | 0 | } |
403 | 0 | return tok; |
404 | 0 | } |
405 | | |
#if defined(__wasi__) || (defined(__EMSCRIPTEN__) && (__EMSCRIPTEN_major__ >= 3))
// fdopen() with borrowed fd. WASI does not provide dup() and Emscripten's
// dup() emulation with open() is slow.
typedef union {
    void *cookie;
    int fd;
} borrowed;

// fopencookie() read callback: recover the fd from the cookie and
// forward to read().
static ssize_t
borrow_read(void *cookie, char *buf, size_t size)
{
    borrowed b = {.cookie = cookie};
    return read(b.fd, (void *)buf, size);
}

static FILE *
fdopen_borrow(int fd) {
    // supports only reading. seek fails. close and write are no-ops.
    cookie_io_functions_t io_cb = {borrow_read, NULL, NULL, NULL};
    borrowed b = {.fd = fd};
    return fopencookie(b.cookie, "r", io_cb);
}
#else
// Duplicate the fd so that fclose() on the returned stream does not
// close the caller's descriptor.
static FILE *
fdopen_borrow(int fd) {
    fd = _Py_dup(fd);
    if (fd < 0) {
        return NULL;
    }
    return fdopen(fd, "r");
}
#endif
438 | | |
/* Get the encoding of a Python file. Check for the coding cookie and check if
   the file starts with a BOM.

   _PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
   encoding in the first or second line of the file (in which case the encoding
   should be assumed to be UTF-8).

   The char* returned is malloc'ed via PyMem_Malloc() and thus must be freed
   by the caller. */
char *
_PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
{
    struct tok_state *tok;
    FILE *fp;
    char *encoding = NULL;

    /* Use a borrowed stream so the caller's fd stays open after fclose(). */
    fp = fdopen_borrow(fd);
    if (fp == NULL) {
        return NULL;
    }
    tok = _PyTokenizer_FromFile(fp, NULL, NULL, NULL);
    if (tok == NULL) {
        fclose(fp);
        return NULL;
    }
    if (filename != NULL) {
        tok->filename = Py_NewRef(filename);
    }
    else {
        tok->filename = PyUnicode_FromString("<string>");
        if (tok->filename == NULL) {
            fclose(fp);
            _PyTokenizer_Free(tok);
            return encoding;
        }
    }
    struct token token;
    // We don't want to report warnings here because it could cause infinite recursion
    // if fetching the encoding shows a warning.
    tok->report_warnings = 0;
    /* Tokenize through the first two lines; a BOM or coding cookie, if
       present, will have been recorded in tok->encoding by then. */
    while (tok->lineno < 2 && tok->done == E_OK) {
        _PyToken_Init(&token);
        _PyTokenizer_Get(tok, &token);
        _PyToken_Free(&token);
    }
    fclose(fp);
    if (tok->encoding) {
        /* Copy the encoding out before _PyTokenizer_Free releases it. */
        encoding = (char *)PyMem_Malloc(strlen(tok->encoding) + 1);
        if (encoding) {
            strcpy(encoding, tok->encoding);
        }
    }
    _PyTokenizer_Free(tok);
    return encoding;
}