/src/Python-3.8.3/Parser/tokenizer.c
Line | Count | Source |
1 | | |
2 | | /* Tokenizer implementation */ |
3 | | |
4 | | #define PY_SSIZE_T_CLEAN |
5 | | #include "Python.h" |
6 | | |
7 | | #include <ctype.h> |
8 | | #include <assert.h> |
9 | | |
10 | | #include "tokenizer.h" |
11 | | #include "errcode.h" |
12 | | |
13 | | #include "unicodeobject.h" |
14 | | #include "bytesobject.h" |
15 | | #include "fileobject.h" |
16 | | #include "codecs.h" |
17 | | #include "abstract.h" |
18 | | |
19 | | /* Alternate tab spacing */ |
20 | 0 | #define ALTTABSIZE 1 |
21 | | |
22 | 1.35k | #define is_potential_identifier_start(c) (\ |
23 | 1.35k | (c >= 'a' && c <= 'z')\ |
24 | 1.35k | || (c >= 'A' && c <= 'Z')\ |
25 | 1.35k | || c == '_'\ |
26 | 1.35k | || (c >= 128)) |
27 | | |
28 | 3.04k | #define is_potential_identifier_char(c) (\ |
29 | 3.04k | (c >= 'a' && c <= 'z')\ |
30 | 3.04k | || (c >= 'A' && c <= 'Z')\ |
31 | 3.04k | || (c >= '0' && c <= '9')\ |
32 | 3.04k | || c == '_'\ |
33 | 3.04k | || (c >= 128)) |
34 | | |
35 | | extern char *PyOS_Readline(FILE *, FILE *, const char *); |
36 | | /* Return malloc'ed string including trailing \n; |
37 | | empty malloc'ed string for EOF; |
38 | | NULL if interrupted */ |
39 | | |
40 | | /* Don't ever change this -- it would break the portability of Python code */ |
41 | 16 | #define TABSIZE 8 |
42 | | |
43 | | /* Forward */ |
44 | | static struct tok_state *tok_new(void); |
45 | | static int tok_nextc(struct tok_state *tok); |
46 | | static void tok_backup(struct tok_state *tok, int c); |
47 | | |
48 | | |
49 | | /* Spaces in this constant are treated as "zero or more spaces or tabs" when |
50 | | tokenizing. */ |
51 | | static const char* type_comment_prefix = "# type: "; |
52 | | |
53 | | /* Create and initialize a new tok_state structure */ |
54 | | |
55 | | static struct tok_state * |
56 | | tok_new(void) |
57 | 16 | { |
58 | 16 | struct tok_state *tok = (struct tok_state *)PyMem_MALLOC( |
59 | 16 | sizeof(struct tok_state)); |
60 | 16 | if (tok == NULL) |
61 | 0 | return NULL; |
62 | 16 | tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL; |
63 | 16 | tok->done = E_OK; |
64 | 16 | tok->fp = NULL; |
65 | 16 | tok->input = NULL; |
66 | 16 | tok->tabsize = TABSIZE; |
67 | 16 | tok->indent = 0; |
68 | 16 | tok->indstack[0] = 0; |
69 | | |
70 | 16 | tok->atbol = 1; |
71 | 16 | tok->pendin = 0; |
72 | 16 | tok->prompt = tok->nextprompt = NULL; |
73 | 16 | tok->lineno = 0; |
74 | 16 | tok->level = 0; |
75 | 16 | tok->altindstack[0] = 0; |
76 | 16 | tok->decoding_state = STATE_INIT; |
77 | 16 | tok->decoding_erred = 0; |
78 | 16 | tok->read_coding_spec = 0; |
79 | 16 | tok->enc = NULL; |
80 | 16 | tok->encoding = NULL; |
81 | 16 | tok->cont_line = 0; |
82 | 16 | tok->filename = NULL; |
83 | 16 | tok->decoding_readline = NULL; |
84 | 16 | tok->decoding_buffer = NULL; |
85 | 16 | tok->type_comments = 0; |
86 | | |
87 | 16 | tok->async_hacks = 0; |
88 | 16 | tok->async_def = 0; |
89 | 16 | tok->async_def_indent = 0; |
90 | 16 | tok->async_def_nl = 0; |
91 | | |
92 | 16 | return tok; |
93 | 16 | } |
94 | | |
95 | | static char * |
96 | | new_string(const char *s, Py_ssize_t len, struct tok_state *tok) |
97 | 0 | { |
98 | 0 | char* result = (char *)PyMem_MALLOC(len + 1); |
99 | 0 | if (!result) { |
100 | 0 | tok->done = E_NOMEM; |
101 | 0 | return NULL; |
102 | 0 | } |
103 | 0 | memcpy(result, s, len); |
104 | 0 | result[len] = '\0'; |
105 | 0 | return result; |
106 | 0 | } |
107 | | |
108 | | static char * |
109 | | error_ret(struct tok_state *tok) /* XXX */ |
110 | 0 | { |
111 | 0 | tok->decoding_erred = 1; |
112 | 0 | if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */ |
113 | 0 | PyMem_FREE(tok->buf); |
114 | 0 | tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL; |
115 | 0 | tok->done = E_DECODE; |
116 | 0 | return NULL; /* as if it were EOF */ |
117 | 0 | } |
118 | | |
119 | | |
120 | | static const char * |
121 | | get_normal_name(const char *s) /* for utf-8 and latin-1 */ |
122 | 0 | { |
123 | 0 | char buf[13]; |
124 | 0 | int i; |
125 | 0 | for (i = 0; i < 12; i++) { |
126 | 0 | int c = s[i]; |
127 | 0 | if (c == '\0') |
128 | 0 | break; |
129 | 0 | else if (c == '_') |
130 | 0 | buf[i] = '-'; |
131 | 0 | else |
132 | 0 | buf[i] = tolower(c); |
133 | 0 | } |
134 | 0 | buf[i] = '\0'; |
135 | 0 | if (strcmp(buf, "utf-8") == 0 || |
136 | 0 | strncmp(buf, "utf-8-", 6) == 0) |
137 | 0 | return "utf-8"; |
138 | 0 | else if (strcmp(buf, "latin-1") == 0 || |
139 | 0 | strcmp(buf, "iso-8859-1") == 0 || |
140 | 0 | strcmp(buf, "iso-latin-1") == 0 || |
141 | 0 | strncmp(buf, "latin-1-", 8) == 0 || |
142 | 0 | strncmp(buf, "iso-8859-1-", 11) == 0 || |
143 | 0 | strncmp(buf, "iso-latin-1-", 12) == 0) |
144 | 0 | return "iso-8859-1"; |
145 | 0 | else |
146 | 0 | return s; |
147 | 0 | } |
148 | | |
149 | | /* Return the coding spec in S, or NULL if none is found. */ |
150 | | |
151 | | static int |
152 | | get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok) |
153 | 28 | { |
154 | 28 | Py_ssize_t i; |
155 | 28 | *spec = NULL; |
156 | | /* Coding spec must be in a comment, and that comment must be |
157 | | * the only statement on the source code line. */ |
158 | 28 | for (i = 0; i < size - 6; i++) { |
159 | 4 | if (s[i] == '#') |
160 | 0 | break; |
161 | 4 | if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014') |
162 | 4 | return 1; |
163 | 4 | } |
164 | 24 | for (; i < size - 6; i++) { /* XXX inefficient search */ |
165 | 0 | const char* t = s + i; |
166 | 0 | if (strncmp(t, "coding", 6) == 0) { |
167 | 0 | const char* begin = NULL; |
168 | 0 | t += 6; |
169 | 0 | if (t[0] != ':' && t[0] != '=') |
170 | 0 | continue; |
171 | 0 | do { |
172 | 0 | t++; |
173 | 0 | } while (t[0] == '\x20' || t[0] == '\t'); |
174 | |
175 | 0 | begin = t; |
176 | 0 | while (Py_ISALNUM(t[0]) || |
177 | 0 | t[0] == '-' || t[0] == '_' || t[0] == '.') |
178 | 0 | t++; |
179 | |
180 | 0 | if (begin < t) { |
181 | 0 | char* r = new_string(begin, t - begin, tok); |
182 | 0 | const char* q; |
183 | 0 | if (!r) |
184 | 0 | return 0; |
185 | 0 | q = get_normal_name(r); |
186 | 0 | if (r != q) { |
187 | 0 | PyMem_FREE(r); |
188 | 0 | r = new_string(q, strlen(q), tok); |
189 | 0 | if (!r) |
190 | 0 | return 0; |
191 | 0 | } |
192 | 0 | *spec = r; |
193 | 0 | break; |
194 | 0 | } |
195 | 0 | } |
196 | 0 | } |
197 | 24 | return 1; |
198 | 24 | } |
199 | | |
200 | | /* Check whether the line contains a coding spec. If it does, |
201 | | invoke the set_readline function for the new encoding. |
202 | | This function receives the tok_state and the new encoding. |
203 | | Return 1 on success, 0 on failure. */ |
204 | | |
205 | | static int |
206 | | check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok, |
207 | | int set_readline(struct tok_state *, const char *)) |
208 | 28 | { |
209 | 28 | char *cs; |
210 | 28 | int r = 1; |
211 | | |
212 | 28 | if (tok->cont_line) { |
213 | | /* It's a continuation line, so it can't be a coding spec. */ |
214 | 0 | tok->read_coding_spec = 1; |
215 | 0 | return 1; |
216 | 0 | } |
217 | 28 | if (!get_coding_spec(line, &cs, size, tok)) |
218 | 0 | return 0; |
219 | 28 | if (!cs) { |
220 | 28 | Py_ssize_t i; |
221 | 28 | for (i = 0; i < size; i++) { |
222 | 14 | if (line[i] == '#' || line[i] == '\n' || line[i] == '\r') |
223 | 0 | break; |
224 | 14 | if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') { |
225 | | /* Stop checking coding spec after a line containing |
226 | | * anything except a comment. */ |
227 | 14 | tok->read_coding_spec = 1; |
228 | 14 | break; |
229 | 14 | } |
230 | 14 | } |
231 | 28 | return 1; |
232 | 28 | } |
233 | 0 | tok->read_coding_spec = 1; |
234 | 0 | if (tok->encoding == NULL) { |
235 | 0 | assert(tok->decoding_state == STATE_RAW); |
236 | 0 | if (strcmp(cs, "utf-8") == 0) { |
237 | 0 | tok->encoding = cs; |
238 | 0 | } else { |
239 | 0 | r = set_readline(tok, cs); |
240 | 0 | if (r) { |
241 | 0 | tok->encoding = cs; |
242 | 0 | tok->decoding_state = STATE_NORMAL; |
243 | 0 | } |
244 | 0 | else { |
245 | 0 | PyErr_Format(PyExc_SyntaxError, |
246 | 0 | "encoding problem: %s", cs); |
247 | 0 | PyMem_FREE(cs); |
248 | 0 | } |
249 | 0 | } |
250 | 0 | } else { /* then, compare cs with BOM */ |
251 | 0 | r = (strcmp(tok->encoding, cs) == 0); |
252 | 0 | if (!r) |
253 | 0 | PyErr_Format(PyExc_SyntaxError, |
254 | 0 | "encoding problem: %s with BOM", cs); |
255 | 0 | PyMem_FREE(cs); |
256 | 0 | } |
257 | 0 | return r; |
258 | 28 | } |
259 | | |
260 | | /* See whether the file starts with a BOM. If it does, |
261 | | invoke the set_readline function with the new encoding. |
262 | | Return 1 on success, 0 on failure. */ |
263 | | |
264 | | static int |
265 | | check_bom(int get_char(struct tok_state *), |
266 | | void unget_char(int, struct tok_state *), |
267 | | int set_readline(struct tok_state *, const char *), |
268 | | struct tok_state *tok) |
269 | 14 | { |
270 | 14 | int ch1, ch2, ch3; |
271 | 14 | ch1 = get_char(tok); |
272 | 14 | tok->decoding_state = STATE_RAW; |
273 | 14 | if (ch1 == EOF) { |
274 | 0 | return 1; |
275 | 14 | } else if (ch1 == 0xEF) { |
276 | 0 | ch2 = get_char(tok); |
277 | 0 | if (ch2 != 0xBB) { |
278 | 0 | unget_char(ch2, tok); |
279 | 0 | unget_char(ch1, tok); |
280 | 0 | return 1; |
281 | 0 | } |
282 | 0 | ch3 = get_char(tok); |
283 | 0 | if (ch3 != 0xBF) { |
284 | 0 | unget_char(ch3, tok); |
285 | 0 | unget_char(ch2, tok); |
286 | 0 | unget_char(ch1, tok); |
287 | 0 | return 1; |
288 | 0 | } |
289 | | #if 0 |
290 | | /* Disable support for UTF-16 BOMs until a decision |
291 | | is made whether this needs to be supported. */ |
292 | | } else if (ch1 == 0xFE) { |
293 | | ch2 = get_char(tok); |
294 | | if (ch2 != 0xFF) { |
295 | | unget_char(ch2, tok); |
296 | | unget_char(ch1, tok); |
297 | | return 1; |
298 | | } |
299 | | if (!set_readline(tok, "utf-16-be")) |
300 | | return 0; |
301 | | tok->decoding_state = STATE_NORMAL; |
302 | | } else if (ch1 == 0xFF) { |
303 | | ch2 = get_char(tok); |
304 | | if (ch2 != 0xFE) { |
305 | | unget_char(ch2, tok); |
306 | | unget_char(ch1, tok); |
307 | | return 1; |
308 | | } |
309 | | if (!set_readline(tok, "utf-16-le")) |
310 | | return 0; |
311 | | tok->decoding_state = STATE_NORMAL; |
312 | | #endif |
313 | 14 | } else { |
314 | 14 | unget_char(ch1, tok); |
315 | 14 | return 1; |
316 | 14 | } |
317 | 0 | if (tok->encoding != NULL) |
318 | 0 | PyMem_FREE(tok->encoding); |
319 | 0 | tok->encoding = new_string("utf-8", 5, tok); |
320 | 0 | if (!tok->encoding) |
321 | 0 | return 0; |
322 | | /* No need to set_readline: input is already utf-8 */ |
323 | 0 | return 1; |
324 | 0 | } |
325 | | |
326 | | /* Read a line of text from TOK into S, using the stream in TOK. |
327 | | Return NULL on failure, else S. |
328 | | |
329 | | On entry, tok->decoding_buffer will be one of: |
330 | | 1) NULL: need to call tok->decoding_readline to get a new line |
331 | | 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and |
332 | | stored the result in tok->decoding_buffer |
333 | | 3) PyByteArrayObject *: previous call to fp_readl did not have enough room |
334 | | (in the s buffer) to copy entire contents of the line read |
335 | | by tok->decoding_readline. tok->decoding_buffer has the overflow. |
336 | | In this case, fp_readl is called in a loop (with an expanded buffer) |
337 | | until the buffer ends with a '\n' (or until the end of the file is |
338 | | reached): see tok_nextc and its calls to decoding_fgets. |
339 | | */ |
340 | | |
341 | | static char * |
342 | | fp_readl(char *s, int size, struct tok_state *tok) |
343 | 0 | { |
344 | 0 | PyObject* bufobj; |
345 | 0 | const char *buf; |
346 | 0 | Py_ssize_t buflen; |
347 | | |
348 | | /* Ask for one less byte so we can terminate it */ |
349 | 0 | assert(size > 0); |
350 | 0 | size--; |
351 | |
352 | 0 | if (tok->decoding_buffer) { |
353 | 0 | bufobj = tok->decoding_buffer; |
354 | 0 | Py_INCREF(bufobj); |
355 | 0 | } |
356 | 0 | else |
357 | 0 | { |
358 | 0 | bufobj = _PyObject_CallNoArg(tok->decoding_readline); |
359 | 0 | if (bufobj == NULL) |
360 | 0 | goto error; |
361 | 0 | } |
362 | 0 | if (PyUnicode_CheckExact(bufobj)) |
363 | 0 | { |
364 | 0 | buf = PyUnicode_AsUTF8AndSize(bufobj, &buflen); |
365 | 0 | if (buf == NULL) { |
366 | 0 | goto error; |
367 | 0 | } |
368 | 0 | } |
369 | 0 | else |
370 | 0 | { |
371 | 0 | buf = PyByteArray_AsString(bufobj); |
372 | 0 | if (buf == NULL) { |
373 | 0 | goto error; |
374 | 0 | } |
375 | 0 | buflen = PyByteArray_GET_SIZE(bufobj); |
376 | 0 | } |
377 | | |
378 | 0 | Py_XDECREF(tok->decoding_buffer); |
379 | 0 | if (buflen > size) { |
380 | | /* Too many chars, the rest goes into tok->decoding_buffer */ |
381 | 0 | tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size, |
382 | 0 | buflen-size); |
383 | 0 | if (tok->decoding_buffer == NULL) |
384 | 0 | goto error; |
385 | 0 | buflen = size; |
386 | 0 | } |
387 | 0 | else |
388 | 0 | tok->decoding_buffer = NULL; |
389 | | |
390 | 0 | memcpy(s, buf, buflen); |
391 | 0 | s[buflen] = '\0'; |
392 | 0 | if (buflen == 0) /* EOF */ |
393 | 0 | s = NULL; |
394 | 0 | Py_DECREF(bufobj); |
395 | 0 | return s; |
396 | | |
397 | 0 | error: |
398 | 0 | Py_XDECREF(bufobj); |
399 | 0 | return error_ret(tok); |
400 | 0 | } |
401 | | |
402 | | /* Set the readline function for TOK to a StreamReader's |
403 | | readline function. The StreamReader is named ENC. |
404 | | |
405 | | This function is called from check_bom and check_coding_spec. |
406 | | |
407 | | ENC is usually identical to the future value of tok->encoding, |
408 | | except for the (currently unsupported) case of UTF-16. |
409 | | |
410 | | Return 1 on success, 0 on failure. */ |
411 | | |
412 | | static int |
413 | | fp_setreadl(struct tok_state *tok, const char* enc) |
414 | 0 | { |
415 | 0 | PyObject *readline, *io, *stream; |
416 | 0 | _Py_IDENTIFIER(open); |
417 | 0 | _Py_IDENTIFIER(readline); |
418 | 0 | int fd; |
419 | 0 | long pos; |
420 | |
421 | 0 | fd = fileno(tok->fp); |
422 | | /* Due to buffering the file offset for fd can be different from the file |
423 | | * position of tok->fp. If tok->fp was opened in text mode on Windows, |
424 | | * its file position counts CRLF as one char and can't be directly mapped |
425 | | * to the file offset for fd. Instead we step back one byte and read to |
426 | | * the end of line.*/ |
427 | 0 | pos = ftell(tok->fp); |
428 | 0 | if (pos == -1 || |
429 | 0 | lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) { |
430 | 0 | PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL); |
431 | 0 | return 0; |
432 | 0 | } |
433 | | |
434 | 0 | io = PyImport_ImportModuleNoBlock("io"); |
435 | 0 | if (io == NULL) |
436 | 0 | return 0; |
437 | | |
438 | 0 | stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO", |
439 | 0 | fd, "r", -1, enc, Py_None, Py_None, Py_False); |
440 | 0 | Py_DECREF(io); |
441 | 0 | if (stream == NULL) |
442 | 0 | return 0; |
443 | | |
444 | 0 | readline = _PyObject_GetAttrId(stream, &PyId_readline); |
445 | 0 | Py_DECREF(stream); |
446 | 0 | if (readline == NULL) |
447 | 0 | return 0; |
448 | 0 | Py_XSETREF(tok->decoding_readline, readline); |
449 | |
450 | 0 | if (pos > 0) { |
451 | 0 | PyObject *bufobj = _PyObject_CallNoArg(readline); |
452 | 0 | if (bufobj == NULL) |
453 | 0 | return 0; |
454 | 0 | Py_DECREF(bufobj); |
455 | 0 | } |
456 | | |
457 | 0 | return 1; |
458 | 0 | } |
459 | | |
460 | | /* Fetch the next byte from TOK. */ |
461 | | |
462 | 0 | static int fp_getc(struct tok_state *tok) { |
463 | 0 | return getc(tok->fp); |
464 | 0 | } |
465 | | |
466 | | /* Unfetch the last byte back into TOK. */ |
467 | | |
468 | 0 | static void fp_ungetc(int c, struct tok_state *tok) { |
469 | 0 | ungetc(c, tok->fp); |
470 | 0 | } |
471 | | |
472 | | /* Check whether the characters at s start a valid |
473 | | UTF-8 sequence. Return the number of characters forming |
474 | | the sequence if yes, 0 if not. */ |
475 | | static int valid_utf8(const unsigned char* s) |
476 | 0 | { |
477 | 0 | int expected = 0; |
478 | 0 | int length; |
479 | 0 | if (*s < 0x80) |
480 | | /* single-byte code */ |
481 | 0 | return 1; |
482 | 0 | if (*s < 0xc0) |
483 | | /* following byte */ |
484 | 0 | return 0; |
485 | 0 | if (*s < 0xE0) |
486 | 0 | expected = 1; |
487 | 0 | else if (*s < 0xF0) |
488 | 0 | expected = 2; |
489 | 0 | else if (*s < 0xF8) |
490 | 0 | expected = 3; |
491 | 0 | else |
492 | 0 | return 0; |
493 | 0 | length = expected + 1; |
494 | 0 | for (; expected; expected--) |
495 | 0 | if (s[expected] < 0x80 || s[expected] >= 0xC0) |
496 | 0 | return 0; |
497 | 0 | return length; |
498 | 0 | } |
499 | | |
500 | | /* Read a line of input from TOK. Determine encoding |
501 | | if necessary. */ |
502 | | |
503 | | static char * |
504 | | decoding_fgets(char *s, int size, struct tok_state *tok) |
505 | 0 | { |
506 | 0 | char *line = NULL; |
507 | 0 | int badchar = 0; |
508 | 0 | for (;;) { |
509 | 0 | if (tok->decoding_state == STATE_NORMAL) { |
510 | | /* We already have a codec associated with |
511 | | this input. */ |
512 | 0 | line = fp_readl(s, size, tok); |
513 | 0 | break; |
514 | 0 | } else if (tok->decoding_state == STATE_RAW) { |
515 | | /* We want a 'raw' read. */ |
516 | 0 | line = Py_UniversalNewlineFgets(s, size, |
517 | 0 | tok->fp, NULL); |
518 | 0 | break; |
519 | 0 | } else { |
520 | | /* We have not yet determined the encoding. |
521 | | If an encoding is found, use the file-pointer |
522 | | reader functions from now on. */ |
523 | 0 | if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok)) |
524 | 0 | return error_ret(tok); |
525 | 0 | assert(tok->decoding_state != STATE_INIT); |
526 | 0 | } |
527 | 0 | } |
528 | 0 | if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) { |
529 | 0 | if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) { |
530 | 0 | return error_ret(tok); |
531 | 0 | } |
532 | 0 | } |
533 | | /* The default encoding is UTF-8, so make sure we don't have any |
534 | | non-UTF-8 sequences in it. */ |
535 | 0 | if (line && !tok->encoding) { |
536 | 0 | unsigned char *c; |
537 | 0 | int length; |
538 | 0 | for (c = (unsigned char *)line; *c; c += length) |
539 | 0 | if (!(length = valid_utf8(c))) { |
540 | 0 | badchar = *c; |
541 | 0 | break; |
542 | 0 | } |
543 | 0 | } |
544 | 0 | if (badchar) { |
545 | | /* Need to add 1 to the line number, since this line |
546 | | has not been counted, yet. */ |
547 | 0 | PyErr_Format(PyExc_SyntaxError, |
548 | 0 | "Non-UTF-8 code starting with '\\x%.2x' " |
549 | 0 | "in file %U on line %i, " |
550 | 0 | "but no encoding declared; " |
551 | 0 | "see http://python.org/dev/peps/pep-0263/ for details", |
552 | 0 | badchar, tok->filename, tok->lineno + 1); |
553 | 0 | return error_ret(tok); |
554 | 0 | } |
555 | 0 | return line; |
556 | 0 | } |
557 | | |
558 | | static int |
559 | | decoding_feof(struct tok_state *tok) |
560 | 0 | { |
561 | 0 | if (tok->decoding_state != STATE_NORMAL) { |
562 | 0 | return feof(tok->fp); |
563 | 0 | } else { |
564 | 0 | PyObject* buf = tok->decoding_buffer; |
565 | 0 | if (buf == NULL) { |
566 | 0 | buf = _PyObject_CallNoArg(tok->decoding_readline); |
567 | 0 | if (buf == NULL) { |
568 | 0 | error_ret(tok); |
569 | 0 | return 1; |
570 | 0 | } else { |
571 | 0 | tok->decoding_buffer = buf; |
572 | 0 | } |
573 | 0 | } |
574 | 0 | return PyObject_Length(buf) == 0; |
575 | 0 | } |
576 | 0 | } |
577 | | |
578 | | /* Fetch a byte from TOK, using the string buffer. */ |
579 | | |
580 | | static int |
581 | 14 | buf_getc(struct tok_state *tok) { |
582 | 14 | return Py_CHARMASK(*tok->str++); |
583 | 14 | } |
584 | | |
585 | | /* Unfetch a byte from TOK, using the string buffer. */ |
586 | | |
587 | | static void |
588 | 14 | buf_ungetc(int c, struct tok_state *tok) { |
589 | 14 | tok->str--; |
590 | 14 | assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */ |
591 | 14 | } |
592 | | |
593 | | /* Set the readline function for TOK to ENC. For the string-based |
594 | | tokenizer, this means to just record the encoding. */ |
595 | | |
596 | | static int |
597 | 0 | buf_setreadl(struct tok_state *tok, const char* enc) { |
598 | 0 | tok->enc = enc; |
599 | 0 | return 1; |
600 | 0 | } |
601 | | |
602 | | /* Return a UTF-8 encoding Python string object from the |
603 | | C byte string STR, which is encoded with ENC. */ |
604 | | |
605 | | static PyObject * |
606 | 0 | translate_into_utf8(const char* str, const char* enc) { |
607 | 0 | PyObject *utf8; |
608 | 0 | PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL); |
609 | 0 | if (buf == NULL) |
610 | 0 | return NULL; |
611 | 0 | utf8 = PyUnicode_AsUTF8String(buf); |
612 | 0 | Py_DECREF(buf); |
613 | 0 | return utf8; |
614 | 0 | } |
615 | | |
616 | | |
617 | | static char * |
618 | 16 | translate_newlines(const char *s, int exec_input, struct tok_state *tok) { |
619 | 16 | int skip_next_lf = 0; |
620 | 16 | size_t needed_length = strlen(s) + 2, final_length; |
621 | 16 | char *buf, *current; |
622 | 16 | char c = '\0'; |
623 | 16 | buf = PyMem_MALLOC(needed_length); |
624 | 16 | if (buf == NULL) { |
625 | 0 | tok->done = E_NOMEM; |
626 | 0 | return NULL; |
627 | 0 | } |
628 | 7.53k | for (current = buf; *s; s++, current++) { |
629 | 7.51k | c = *s; |
630 | 7.51k | if (skip_next_lf) { |
631 | 0 | skip_next_lf = 0; |
632 | 0 | if (c == '\n') { |
633 | 0 | c = *++s; |
634 | 0 | if (!c) |
635 | 0 | break; |
636 | 0 | } |
637 | 0 | } |
638 | 7.51k | if (c == '\r') { |
639 | 0 | skip_next_lf = 1; |
640 | 0 | c = '\n'; |
641 | 0 | } |
642 | 7.51k | *current = c; |
643 | 7.51k | } |
644 | | /* If this is exec input, add a newline to the end of the string if |
645 | | there isn't one already. */ |
646 | 16 | if (exec_input && c != '\n') { |
647 | 2 | *current = '\n'; |
648 | 2 | current++; |
649 | 2 | } |
650 | 16 | *current = '\0'; |
651 | 16 | final_length = current - buf + 1; |
652 | 16 | if (final_length < needed_length && final_length) { |
653 | | /* should never fail */ |
654 | 14 | char* result = PyMem_REALLOC(buf, final_length); |
655 | 14 | if (result == NULL) { |
656 | 0 | PyMem_FREE(buf); |
657 | 0 | } |
658 | 14 | buf = result; |
659 | 14 | } |
660 | 16 | return buf; |
661 | 16 | } |
662 | | |
663 | | /* Decode a byte string STR for use as the buffer of TOK. |
664 | | Look for encoding declarations inside STR, and record them |
665 | | inside TOK. */ |
666 | | |
667 | | static const char * |
668 | | decode_str(const char *input, int single, struct tok_state *tok) |
669 | 14 | { |
670 | 14 | PyObject* utf8 = NULL; |
671 | 14 | const char *str; |
672 | 14 | const char *s; |
673 | 14 | const char *newl[2] = {NULL, NULL}; |
674 | 14 | int lineno = 0; |
675 | 14 | tok->input = str = translate_newlines(input, single, tok); |
676 | 14 | if (str == NULL) |
677 | 0 | return NULL; |
678 | 14 | tok->enc = NULL; |
679 | 14 | tok->str = str; |
680 | 14 | if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok)) |
681 | 0 | return error_ret(tok); |
682 | 14 | str = tok->str; /* string after BOM if any */ |
683 | 14 | assert(str); |
684 | 14 | if (tok->enc != NULL) { |
685 | 0 | utf8 = translate_into_utf8(str, tok->enc); |
686 | 0 | if (utf8 == NULL) |
687 | 0 | return error_ret(tok); |
688 | 0 | str = PyBytes_AsString(utf8); |
689 | 0 | } |
690 | 174 | for (s = str;; s++) { |
691 | 174 | if (*s == '\0') break; |
692 | 174 | else if (*s == '\n') { |
693 | 28 | assert(lineno < 2); |
694 | 28 | newl[lineno] = s; |
695 | 28 | lineno++; |
696 | 28 | if (lineno == 2) break; |
697 | 28 | } |
698 | 174 | } |
699 | 14 | tok->enc = NULL; |
700 | | /* need to check line 1 and 2 separately since check_coding_spec |
701 | | assumes a single line as input */ |
702 | 14 | if (newl[0]) { |
703 | 14 | if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) |
704 | 0 | return error_ret(tok); |
705 | 14 | if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) { |
706 | 14 | if (!check_coding_spec(newl[0]+1, newl[1] - newl[0], |
707 | 14 | tok, buf_setreadl)) |
708 | 0 | return error_ret(tok); |
709 | 14 | } |
710 | 14 | } |
711 | 14 | if (tok->enc != NULL) { |
712 | 0 | assert(utf8 == NULL); |
713 | 0 | utf8 = translate_into_utf8(str, tok->enc); |
714 | 0 | if (utf8 == NULL) |
715 | 0 | return error_ret(tok); |
716 | 0 | str = PyBytes_AS_STRING(utf8); |
717 | 0 | } |
718 | 14 | assert(tok->decoding_buffer == NULL); |
719 | 14 | tok->decoding_buffer = utf8; /* CAUTION */ |
720 | 14 | return str; |
721 | 14 | } |
722 | | |
723 | | /* Set up tokenizer for string */ |
724 | | |
725 | | struct tok_state * |
726 | | PyTokenizer_FromString(const char *str, int exec_input) |
727 | 14 | { |
728 | 14 | struct tok_state *tok = tok_new(); |
729 | 14 | if (tok == NULL) |
730 | 0 | return NULL; |
731 | 14 | str = decode_str(str, exec_input, tok); |
732 | 14 | if (str == NULL) { |
733 | 0 | PyTokenizer_Free(tok); |
734 | 0 | return NULL; |
735 | 0 | } |
736 | | |
737 | | /* XXX: constify members. */ |
738 | 14 | tok->buf = tok->cur = tok->end = tok->inp = (char*)str; |
739 | 14 | return tok; |
740 | 14 | } |
741 | | |
742 | | struct tok_state * |
743 | | PyTokenizer_FromUTF8(const char *str, int exec_input) |
744 | 2 | { |
745 | 2 | struct tok_state *tok = tok_new(); |
746 | 2 | if (tok == NULL) |
747 | 0 | return NULL; |
748 | 2 | tok->input = str = translate_newlines(str, exec_input, tok); |
749 | 2 | if (str == NULL) { |
750 | 0 | PyTokenizer_Free(tok); |
751 | 0 | return NULL; |
752 | 0 | } |
753 | 2 | tok->decoding_state = STATE_RAW; |
754 | 2 | tok->read_coding_spec = 1; |
755 | 2 | tok->enc = NULL; |
756 | 2 | tok->str = str; |
757 | 2 | tok->encoding = (char *)PyMem_MALLOC(6); |
758 | 2 | if (!tok->encoding) { |
759 | 0 | PyTokenizer_Free(tok); |
760 | 0 | return NULL; |
761 | 0 | } |
762 | 2 | strcpy(tok->encoding, "utf-8"); |
763 | | |
764 | | /* XXX: constify members. */ |
765 | 2 | tok->buf = tok->cur = tok->end = tok->inp = (char*)str; |
766 | 2 | return tok; |
767 | 2 | } |
768 | | |
769 | | /* Set up tokenizer for file */ |
770 | | |
771 | | struct tok_state * |
772 | | PyTokenizer_FromFile(FILE *fp, const char* enc, |
773 | | const char *ps1, const char *ps2) |
774 | 0 | { |
775 | 0 | struct tok_state *tok = tok_new(); |
776 | 0 | if (tok == NULL) |
777 | 0 | return NULL; |
778 | 0 | if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) { |
779 | 0 | PyTokenizer_Free(tok); |
780 | 0 | return NULL; |
781 | 0 | } |
782 | 0 | tok->cur = tok->inp = tok->buf; |
783 | 0 | tok->end = tok->buf + BUFSIZ; |
784 | 0 | tok->fp = fp; |
785 | 0 | tok->prompt = ps1; |
786 | 0 | tok->nextprompt = ps2; |
787 | 0 | if (enc != NULL) { |
788 | | /* Must copy encoding declaration since it |
789 | | gets copied into the parse tree. */ |
790 | 0 | tok->encoding = PyMem_MALLOC(strlen(enc)+1); |
791 | 0 | if (!tok->encoding) { |
792 | 0 | PyTokenizer_Free(tok); |
793 | 0 | return NULL; |
794 | 0 | } |
795 | 0 | strcpy(tok->encoding, enc); |
796 | 0 | tok->decoding_state = STATE_NORMAL; |
797 | 0 | } |
798 | 0 | return tok; |
799 | 0 | } |
800 | | |
801 | | |
802 | | /* Free a tok_state structure */ |
803 | | |
804 | | void |
805 | | PyTokenizer_Free(struct tok_state *tok) |
806 | 16 | { |
807 | 16 | if (tok->encoding != NULL) |
808 | 0 | PyMem_FREE(tok->encoding); |
809 | 16 | Py_XDECREF(tok->decoding_readline); |
810 | 16 | Py_XDECREF(tok->decoding_buffer); |
811 | 16 | Py_XDECREF(tok->filename); |
812 | 16 | if (tok->fp != NULL && tok->buf != NULL) |
813 | 0 | PyMem_FREE(tok->buf); |
814 | 16 | if (tok->input) |
815 | 16 | PyMem_FREE((char *)tok->input); |
816 | 16 | PyMem_FREE(tok); |
817 | 16 | } |
818 | | |
819 | | /* Get next char, updating state; error code goes into tok->done */ |
820 | | |
821 | | static int |
822 | | tok_nextc(struct tok_state *tok) |
823 | 10.3k | { |
824 | 10.3k | for (;;) { |
825 | 10.3k | if (tok->cur != tok->inp) { |
826 | 9.98k | return Py_CHARMASK(*tok->cur++); /* Fast path */ |
827 | 9.98k | } |
828 | 332 | if (tok->done != E_OK) |
829 | 64 | return EOF; |
830 | 268 | if (tok->fp == NULL) { |
831 | 268 | char *end = strchr(tok->inp, '\n'); |
832 | 268 | if (end != NULL) |
833 | 252 | end++; |
834 | 16 | else { |
835 | 16 | end = strchr(tok->inp, '\0'); |
836 | 16 | if (end == tok->inp) { |
837 | 16 | tok->done = E_EOF; |
838 | 16 | return EOF; |
839 | 16 | } |
840 | 16 | } |
841 | 252 | if (tok->start == NULL) |
842 | 252 | tok->buf = tok->cur; |
843 | 252 | tok->line_start = tok->cur; |
844 | 252 | tok->lineno++; |
845 | 252 | tok->inp = end; |
846 | 252 | return Py_CHARMASK(*tok->cur++); |
847 | 268 | } |
848 | 0 | if (tok->prompt != NULL) { |
849 | 0 | char *newtok = PyOS_Readline(stdin, stdout, tok->prompt); |
850 | 0 | if (newtok != NULL) { |
851 | 0 | char *translated = translate_newlines(newtok, 0, tok); |
852 | 0 | PyMem_FREE(newtok); |
853 | 0 | if (translated == NULL) |
854 | 0 | return EOF; |
855 | 0 | newtok = translated; |
856 | 0 | } |
857 | 0 | if (tok->encoding && newtok && *newtok) { |
858 | | /* Recode to UTF-8 */ |
859 | 0 | Py_ssize_t buflen; |
860 | 0 | const char* buf; |
861 | 0 | PyObject *u = translate_into_utf8(newtok, tok->encoding); |
862 | 0 | PyMem_FREE(newtok); |
863 | 0 | if (!u) { |
864 | 0 | tok->done = E_DECODE; |
865 | 0 | return EOF; |
866 | 0 | } |
867 | 0 | buflen = PyBytes_GET_SIZE(u); |
868 | 0 | buf = PyBytes_AS_STRING(u); |
869 | 0 | newtok = PyMem_MALLOC(buflen+1); |
870 | 0 | if (newtok == NULL) { |
871 | 0 | Py_DECREF(u); |
872 | 0 | tok->done = E_NOMEM; |
873 | 0 | return EOF; |
874 | 0 | } |
875 | 0 | strcpy(newtok, buf); |
876 | 0 | Py_DECREF(u); |
877 | 0 | } |
878 | 0 | if (tok->nextprompt != NULL) |
879 | 0 | tok->prompt = tok->nextprompt; |
880 | 0 | if (newtok == NULL) |
881 | 0 | tok->done = E_INTR; |
882 | 0 | else if (*newtok == '\0') { |
883 | 0 | PyMem_FREE(newtok); |
884 | 0 | tok->done = E_EOF; |
885 | 0 | } |
886 | 0 | else if (tok->start != NULL) { |
887 | 0 | size_t start = tok->start - tok->buf; |
888 | 0 | size_t oldlen = tok->cur - tok->buf; |
889 | 0 | size_t newlen = oldlen + strlen(newtok); |
890 | 0 | Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf; |
891 | 0 | char *buf = tok->buf; |
892 | 0 | buf = (char *)PyMem_REALLOC(buf, newlen+1); |
893 | 0 | tok->lineno++; |
894 | 0 | if (buf == NULL) { |
895 | 0 | PyMem_FREE(tok->buf); |
896 | 0 | tok->buf = NULL; |
897 | 0 | PyMem_FREE(newtok); |
898 | 0 | tok->done = E_NOMEM; |
899 | 0 | return EOF; |
900 | 0 | } |
901 | 0 | tok->buf = buf; |
902 | 0 | tok->cur = tok->buf + oldlen; |
903 | 0 | tok->multi_line_start = tok->buf + cur_multi_line_start; |
904 | 0 | tok->line_start = tok->cur; |
905 | 0 | strcpy(tok->buf + oldlen, newtok); |
906 | 0 | PyMem_FREE(newtok); |
907 | 0 | tok->inp = tok->buf + newlen; |
908 | 0 | tok->end = tok->inp + 1; |
909 | 0 | tok->start = tok->buf + start; |
910 | 0 | } |
911 | 0 | else { |
912 | 0 | tok->lineno++; |
913 | 0 | if (tok->buf != NULL) |
914 | 0 | PyMem_FREE(tok->buf); |
915 | 0 | tok->buf = newtok; |
916 | 0 | tok->cur = tok->buf; |
917 | 0 | tok->line_start = tok->buf; |
918 | 0 | tok->inp = strchr(tok->buf, '\0'); |
919 | 0 | tok->end = tok->inp + 1; |
920 | 0 | } |
921 | 0 | } |
922 | 0 | else { |
923 | 0 | int done = 0; |
924 | 0 | Py_ssize_t cur = 0; |
925 | 0 | char *pt; |
926 | 0 | if (tok->start == NULL) { |
927 | 0 | if (tok->buf == NULL) { |
928 | 0 | tok->buf = (char *) |
929 | 0 | PyMem_MALLOC(BUFSIZ); |
930 | 0 | if (tok->buf == NULL) { |
931 | 0 | tok->done = E_NOMEM; |
932 | 0 | return EOF; |
933 | 0 | } |
934 | 0 | tok->end = tok->buf + BUFSIZ; |
935 | 0 | } |
936 | 0 | if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf), |
937 | 0 | tok) == NULL) { |
938 | 0 | if (!tok->decoding_erred) |
939 | 0 | tok->done = E_EOF; |
940 | 0 | done = 1; |
941 | 0 | } |
942 | 0 | else { |
943 | 0 | tok->done = E_OK; |
944 | 0 | tok->inp = strchr(tok->buf, '\0'); |
945 | 0 | done = tok->inp == tok->buf || tok->inp[-1] == '\n'; |
946 | 0 | } |
947 | 0 | } |
948 | 0 | else { |
949 | 0 | cur = tok->cur - tok->buf; |
950 | 0 | if (decoding_feof(tok)) { |
951 | 0 | tok->done = E_EOF; |
952 | 0 | done = 1; |
953 | 0 | } |
954 | 0 | else |
955 | 0 | tok->done = E_OK; |
956 | 0 | } |
957 | 0 | tok->lineno++; |
958 | | /* Read until '\n' or EOF */ |
959 | 0 | while (!done) { |
960 | 0 | Py_ssize_t curstart = tok->start == NULL ? -1 : |
961 | 0 | tok->start - tok->buf; |
962 | 0 | Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf; |
963 | 0 | Py_ssize_t curvalid = tok->inp - tok->buf; |
964 | 0 | Py_ssize_t newsize = curvalid + BUFSIZ; |
965 | 0 | char *newbuf = tok->buf; |
966 | 0 | newbuf = (char *)PyMem_REALLOC(newbuf, |
967 | 0 | newsize); |
968 | 0 | if (newbuf == NULL) { |
969 | 0 | tok->done = E_NOMEM; |
970 | 0 | tok->cur = tok->inp; |
971 | 0 | return EOF; |
972 | 0 | } |
973 | 0 | tok->buf = newbuf; |
974 | 0 | tok->cur = tok->buf + cur; |
975 | 0 | tok->multi_line_start = tok->buf + cur_multi_line_start; |
976 | 0 | tok->line_start = tok->cur; |
977 | 0 | tok->inp = tok->buf + curvalid; |
978 | 0 | tok->end = tok->buf + newsize; |
979 | 0 | tok->start = curstart < 0 ? NULL : |
980 | 0 | tok->buf + curstart; |
981 | 0 | if (decoding_fgets(tok->inp, |
982 | 0 | (int)(tok->end - tok->inp), |
983 | 0 | tok) == NULL) { |
984 | | /* Break out early on decoding |
985 | | errors, as tok->buf will be NULL |
986 | | */ |
987 | 0 | if (tok->decoding_erred) |
988 | 0 | return EOF; |
989 | | /* Last line does not end in \n, |
990 | | fake one */ |
991 | 0 | if (tok->inp[-1] != '\n') |
992 | 0 | strcpy(tok->inp, "\n"); |
993 | 0 | } |
994 | 0 | tok->inp = strchr(tok->inp, '\0'); |
995 | 0 | done = tok->inp[-1] == '\n'; |
996 | 0 | } |
997 | 0 | if (tok->buf != NULL) { |
998 | 0 | tok->cur = tok->buf + cur; |
999 | 0 | tok->line_start = tok->cur; |
1000 | | /* replace "\r\n" with "\n" */ |
1001 | | /* For Mac leave the \r, giving a syntax error */ |
1002 | 0 | pt = tok->inp - 2; |
1003 | 0 | if (pt >= tok->buf && *pt == '\r') { |
1004 | 0 | *pt++ = '\n'; |
1005 | 0 | *pt = '\0'; |
1006 | 0 | tok->inp = pt; |
1007 | 0 | } |
1008 | 0 | } |
1009 | 0 | } |
1010 | 0 | if (tok->done != E_OK) { |
1011 | 0 | if (tok->prompt != NULL) |
1012 | 0 | PySys_WriteStderr("\n"); |
1013 | 0 | tok->cur = tok->inp; |
1014 | 0 | return EOF; |
1015 | 0 | } |
1016 | 0 | } |
1017 | | /*NOTREACHED*/ |
1018 | 10.3k | } |
1019 | | |
1020 | | |
1021 | | /* Back-up one character */ |
1022 | | |
1023 | | static void |
1024 | | tok_backup(struct tok_state *tok, int c) |
1025 | 2.76k | { |
1026 | 2.76k | if (c != EOF) { |
1027 | 2.71k | if (--tok->cur < tok->buf) |
1028 | 0 | Py_FatalError("tok_backup: beginning of buffer"); |
1029 | 2.71k | if (*tok->cur != c) |
1030 | 0 | *tok->cur = c; |
1031 | 2.71k | } |
1032 | 2.76k | } |
1033 | | |
1034 | | |
1035 | | static int |
1036 | | syntaxerror(struct tok_state *tok, const char *format, ...) |
1037 | 0 | { |
1038 | 0 | PyObject *errmsg, *errtext, *args; |
1039 | 0 | va_list vargs; |
1040 | 0 | #ifdef HAVE_STDARG_PROTOTYPES |
1041 | 0 | va_start(vargs, format); |
1042 | | #else |
1043 | | va_start(vargs); |
1044 | | #endif |
1045 | 0 | errmsg = PyUnicode_FromFormatV(format, vargs); |
1046 | 0 | va_end(vargs); |
1047 | 0 | if (!errmsg) { |
1048 | 0 | goto error; |
1049 | 0 | } |
1050 | | |
1051 | 0 | errtext = PyUnicode_DecodeUTF8(tok->line_start, tok->cur - tok->line_start, |
1052 | 0 | "replace"); |
1053 | 0 | if (!errtext) { |
1054 | 0 | goto error; |
1055 | 0 | } |
1056 | 0 | int offset = (int)PyUnicode_GET_LENGTH(errtext); |
1057 | 0 | Py_ssize_t line_len = strcspn(tok->line_start, "\n"); |
1058 | 0 | if (line_len != tok->cur - tok->line_start) { |
1059 | 0 | Py_DECREF(errtext); |
1060 | 0 | errtext = PyUnicode_DecodeUTF8(tok->line_start, line_len, |
1061 | 0 | "replace"); |
1062 | 0 | } |
1063 | 0 | if (!errtext) { |
1064 | 0 | goto error; |
1065 | 0 | } |
1066 | | |
1067 | 0 | args = Py_BuildValue("(O(OiiN))", errmsg, |
1068 | 0 | tok->filename, tok->lineno, offset, errtext); |
1069 | 0 | if (args) { |
1070 | 0 | PyErr_SetObject(PyExc_SyntaxError, args); |
1071 | 0 | Py_DECREF(args); |
1072 | 0 | } |
1073 | |
1074 | 0 | error: |
1075 | 0 | Py_XDECREF(errmsg); |
1076 | 0 | tok->done = E_ERROR; |
1077 | 0 | return ERRORTOKEN; |
1078 | 0 | } |
1079 | | |
1080 | | static int |
1081 | | indenterror(struct tok_state *tok) |
1082 | 0 | { |
1083 | 0 | tok->done = E_TABSPACE; |
1084 | 0 | tok->cur = tok->inp; |
1085 | 0 | return ERRORTOKEN; |
1086 | 0 | } |
1087 | | |
1088 | | /* Verify that the identifier follows PEP 3131. |
1089 | | All identifier strings are guaranteed to be "ready" unicode objects. |
1090 | | */ |
1091 | | static int |
1092 | | verify_identifier(struct tok_state *tok) |
1093 | 0 | { |
1094 | 0 | PyObject *s; |
1095 | 0 | int result; |
1096 | 0 | if (tok->decoding_erred) |
1097 | 0 | return 0; |
1098 | 0 | s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL); |
1099 | 0 | if (s == NULL) { |
1100 | 0 | if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) { |
1101 | 0 | PyErr_Clear(); |
1102 | 0 | tok->done = E_IDENTIFIER; |
1103 | 0 | } else { |
1104 | 0 | tok->done = E_ERROR; |
1105 | 0 | } |
1106 | 0 | return 0; |
1107 | 0 | } |
1108 | 0 | result = PyUnicode_IsIdentifier(s); |
1109 | 0 | Py_DECREF(s); |
1110 | 0 | if (result == 0) |
1111 | 0 | tok->done = E_IDENTIFIER; |
1112 | 0 | return result; |
1113 | 0 | } |
1114 | | |
1115 | | static int |
1116 | | tok_decimal_tail(struct tok_state *tok) |
1117 | 18 | { |
1118 | 18 | int c; |
1119 | | |
1120 | 18 | while (1) { |
1121 | 30 | do { |
1122 | 30 | c = tok_nextc(tok); |
1123 | 30 | } while (isdigit(c)); |
1124 | 18 | if (c != '_') { |
1125 | 18 | break; |
1126 | 18 | } |
1127 | 0 | c = tok_nextc(tok); |
1128 | 0 | if (!isdigit(c)) { |
1129 | 0 | tok_backup(tok, c); |
1130 | 0 | syntaxerror(tok, "invalid decimal literal"); |
1131 | 0 | return 0; |
1132 | 0 | } |
1133 | 0 | } |
1134 | 18 | return c; |
1135 | 18 | } |
1136 | | |
1137 | | /* Get next token, after space stripping etc. */ |
1138 | | |
1139 | | static int |
1140 | | tok_get(struct tok_state *tok, char **p_start, char **p_end) |
1141 | 1.41k | { |
1142 | 1.41k | int c; |
1143 | 1.41k | int blankline, nonascii; |
1144 | | |
1145 | 1.41k | *p_start = *p_end = NULL; |
1146 | 1.50k | nextline: |
1147 | 1.50k | tok->start = NULL; |
1148 | 1.50k | blankline = 0; |
1149 | | |
1150 | | /* Get indentation level */ |
1151 | 1.50k | if (tok->atbol) { |
1152 | 268 | int col = 0; |
1153 | 268 | int altcol = 0; |
1154 | 268 | tok->atbol = 0; |
1155 | 1.43k | for (;;) { |
1156 | 1.43k | c = tok_nextc(tok); |
1157 | 1.43k | if (c == ' ') { |
1158 | 1.16k | col++, altcol++; |
1159 | 1.16k | } |
1160 | 268 | else if (c == '\t') { |
1161 | 0 | col = (col / tok->tabsize + 1) * tok->tabsize; |
1162 | 0 | altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE; |
1163 | 0 | } |
1164 | 268 | else if (c == '\014') {/* Control-L (formfeed) */ |
1165 | 0 | col = altcol = 0; /* For Emacs users */ |
1166 | 0 | } |
1167 | 268 | else { |
1168 | 268 | break; |
1169 | 268 | } |
1170 | 1.43k | } |
1171 | 268 | tok_backup(tok, c); |
1172 | 268 | if (c == '#' || c == '\n') { |
1173 | | /* Lines with only whitespace and/or comments |
1174 | | shouldn't affect the indentation and are |
1175 | | not passed to the parser as NEWLINE tokens, |
1176 | | except *totally* empty lines in interactive |
1177 | | mode, which signal the end of a command group. */ |
1178 | 78 | if (col == 0 && c == '\n' && tok->prompt != NULL) { |
1179 | 0 | blankline = 0; /* Let it through */ |
1180 | 0 | } |
1181 | 78 | else if (tok->prompt != NULL && tok->lineno == 1) { |
1182 | | /* In interactive mode, if the first line contains |
1183 | | only spaces and/or a comment, let it through. */ |
1184 | 0 | blankline = 0; |
1185 | 0 | col = altcol = 0; |
1186 | 0 | } |
1187 | 78 | else { |
1188 | 78 | blankline = 1; /* Ignore completely */ |
1189 | 78 | } |
1190 | | /* We can't jump back right here since we still |
1191 | | may need to skip to the end of a comment */ |
1192 | 78 | } |
1193 | 268 | if (!blankline && tok->level == 0) { |
1194 | 176 | if (col == tok->indstack[tok->indent]) { |
1195 | | /* No change */ |
1196 | 74 | if (altcol != tok->altindstack[tok->indent]) { |
1197 | 0 | return indenterror(tok); |
1198 | 0 | } |
1199 | 74 | } |
1200 | 102 | else if (col > tok->indstack[tok->indent]) { |
1201 | | /* Indent -- always one */ |
1202 | 58 | if (tok->indent+1 >= MAXINDENT) { |
1203 | 0 | tok->done = E_TOODEEP; |
1204 | 0 | tok->cur = tok->inp; |
1205 | 0 | return ERRORTOKEN; |
1206 | 0 | } |
1207 | 58 | if (altcol <= tok->altindstack[tok->indent]) { |
1208 | 0 | return indenterror(tok); |
1209 | 0 | } |
1210 | 58 | tok->pendin++; |
1211 | 58 | tok->indstack[++tok->indent] = col; |
1212 | 58 | tok->altindstack[tok->indent] = altcol; |
1213 | 58 | } |
1214 | 44 | else /* col < tok->indstack[tok->indent] */ { |
1215 | | /* Dedent -- any number, must be consistent */ |
1216 | 102 | while (tok->indent > 0 && |
1217 | 102 | col < tok->indstack[tok->indent]) { |
1218 | 58 | tok->pendin--; |
1219 | 58 | tok->indent--; |
1220 | 58 | } |
1221 | 44 | if (col != tok->indstack[tok->indent]) { |
1222 | 0 | tok->done = E_DEDENT; |
1223 | 0 | tok->cur = tok->inp; |
1224 | 0 | return ERRORTOKEN; |
1225 | 0 | } |
1226 | 44 | if (altcol != tok->altindstack[tok->indent]) { |
1227 | 0 | return indenterror(tok); |
1228 | 0 | } |
1229 | 44 | } |
1230 | 176 | } |
1231 | 268 | } |
1232 | | |
1233 | 1.50k | tok->start = tok->cur; |
1234 | | |
1235 | | /* Return pending indents/dedents */ |
1236 | 1.50k | if (tok->pendin != 0) { |
1237 | 116 | if (tok->pendin < 0) { |
1238 | 58 | tok->pendin++; |
1239 | 58 | return DEDENT; |
1240 | 58 | } |
1241 | 58 | else { |
1242 | 58 | tok->pendin--; |
1243 | 58 | return INDENT; |
1244 | 58 | } |
1245 | 116 | } |
1246 | | |
1247 | | /* Peek ahead at the next character */ |
1248 | 1.38k | c = tok_nextc(tok); |
1249 | 1.38k | tok_backup(tok, c); |
1250 | | /* Check if we are closing an async function */ |
1251 | 1.38k | if (tok->async_def |
1252 | 1.38k | && !blankline |
1253 | | /* Due to some implementation artifacts of type comments, |
1254 | | * a TYPE_COMMENT at the start of a function won't set an |
1255 | | * indentation level and it will produce a NEWLINE after it. |
1256 | | * To avoid spuriously ending an async function due to this, |
1257 | | * wait until we have some non-newline char in front of us. */ |
1258 | 1.38k | && c != '\n' |
1259 | 1.38k | && tok->level == 0 |
1260 | | /* There was a NEWLINE after ASYNC DEF, |
1261 | | so we're past the signature. */ |
1262 | 1.38k | && tok->async_def_nl |
1263 | | /* Current indentation level is less than where |
1264 | | the async function was defined */ |
1265 | 1.38k | && tok->async_def_indent >= tok->indent) |
1266 | 0 | { |
1267 | 0 | tok->async_def = 0; |
1268 | 0 | tok->async_def_indent = 0; |
1269 | 0 | tok->async_def_nl = 0; |
1270 | 0 | } |
1271 | | |
1272 | 1.38k | again: |
1273 | 1.38k | tok->start = NULL; |
1274 | | /* Skip spaces */ |
1275 | 1.72k | do { |
1276 | 1.72k | c = tok_nextc(tok); |
1277 | 1.72k | } while (c == ' ' || c == '\t' || c == '\014'); |
1278 | | |
1279 | | /* Set start of current token */ |
1280 | 1.38k | tok->start = tok->cur - 1; |
1281 | | |
1282 | | /* Skip comment, unless it's a type comment */ |
1283 | 1.38k | if (c == '#') { |
1284 | 66 | const char *prefix, *p, *type_start; |
1285 | | |
1286 | 2.32k | while (c != EOF && c != '\n') { |
1287 | 2.25k | c = tok_nextc(tok); |
1288 | 2.25k | } |
1289 | | |
1290 | 66 | if (tok->type_comments) { |
1291 | 0 | p = tok->start; |
1292 | 0 | prefix = type_comment_prefix; |
1293 | 0 | while (*prefix && p < tok->cur) { |
1294 | 0 | if (*prefix == ' ') { |
1295 | 0 | while (*p == ' ' || *p == '\t') { |
1296 | 0 | p++; |
1297 | 0 | } |
1298 | 0 | } else if (*prefix == *p) { |
1299 | 0 | p++; |
1300 | 0 | } else { |
1301 | 0 | break; |
1302 | 0 | } |
1303 | | |
1304 | 0 | prefix++; |
1305 | 0 | } |
1306 | | |
1307 | | /* This is a type comment if we matched all of type_comment_prefix. */ |
1308 | 0 | if (!*prefix) { |
1309 | 0 | int is_type_ignore = 1; |
1310 | 0 | const char *ignore_end = p + 6; |
1311 | 0 | tok_backup(tok, c); /* don't eat the newline or EOF */ |
1312 | |
1313 | 0 | type_start = p; |
1314 | | |
1315 | | /* A TYPE_IGNORE is "type: ignore" followed by the end of the token |
1316 | | * or anything ASCII and non-alphanumeric. */ |
1317 | 0 | is_type_ignore = ( |
1318 | 0 | tok->cur >= ignore_end && memcmp(p, "ignore", 6) == 0 |
1319 | 0 | && !(tok->cur > ignore_end |
1320 | 0 | && ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0])))); |
1321 | |
1322 | 0 | if (is_type_ignore) { |
1323 | 0 | *p_start = (char *) ignore_end; |
1324 | 0 | *p_end = tok->cur; |
1325 | | |
1326 | | /* If this type ignore is the only thing on the line, consume the newline also. */ |
1327 | 0 | if (blankline) { |
1328 | 0 | tok_nextc(tok); |
1329 | 0 | tok->atbol = 1; |
1330 | 0 | } |
1331 | 0 | return TYPE_IGNORE; |
1332 | 0 | } else { |
1333 | 0 | *p_start = (char *) type_start; /* after type_comment_prefix */ |
1334 | 0 | *p_end = tok->cur; |
1335 | 0 | return TYPE_COMMENT; |
1336 | 0 | } |
1337 | 0 | } |
1338 | 0 | } |
1339 | 66 | } |
1340 | | |
1341 | | /* Check for EOF and errors now */ |
1342 | 1.38k | if (c == EOF) { |
1343 | 32 | return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN; |
1344 | 32 | } |
1345 | | |
1346 | | /* Identifier (most frequent token!) */ |
1347 | 1.35k | nonascii = 0; |
1348 | 1.35k | if (is_potential_identifier_start(c)) { |
1349 | | /* Process the various legal combinations of b"", r"", u"", and f"". */ |
1350 | 530 | int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0; |
1351 | 584 | while (1) { |
1352 | 584 | if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B')) |
1353 | 6 | saw_b = 1; |
1354 | | /* Since this is a backwards compatibility support literal we don't |
1355 | | want to support it in arbitrary order like byte literals. */ |
1356 | 578 | else if (!(saw_b || saw_u || saw_r || saw_f) |
1357 | 578 | && (c == 'u'|| c == 'U')) { |
1358 | 2 | saw_u = 1; |
1359 | 2 | } |
1360 | | /* ur"" and ru"" are not supported */ |
1361 | 576 | else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) { |
1362 | 22 | saw_r = 1; |
1363 | 22 | } |
1364 | 554 | else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) { |
1365 | 30 | saw_f = 1; |
1366 | 30 | } |
1367 | 524 | else { |
1368 | 524 | break; |
1369 | 524 | } |
1370 | 60 | c = tok_nextc(tok); |
1371 | 60 | if (c == '"' || c == '\'') { |
1372 | 6 | goto letter_quote; |
1373 | 6 | } |
1374 | 60 | } |
1375 | 3.04k | while (is_potential_identifier_char(c)) { |
1376 | 2.52k | if (c >= 128) { |
1377 | 0 | nonascii = 1; |
1378 | 0 | } |
1379 | 2.52k | c = tok_nextc(tok); |
1380 | 2.52k | } |
1381 | 524 | tok_backup(tok, c); |
1382 | 524 | if (nonascii && !verify_identifier(tok)) { |
1383 | 0 | return ERRORTOKEN; |
1384 | 0 | } |
1385 | 524 | *p_start = tok->start; |
1386 | 524 | *p_end = tok->cur; |
1387 | | |
1388 | | /* async/await parsing block. */ |
1389 | 524 | if (tok->cur - tok->start == 5 && tok->start[0] == 'a') { |
1390 | | /* May be an 'async' or 'await' token. For Python 3.7 or |
1391 | | later we recognize them unconditionally. For Python |
1392 | | 3.5 or 3.6 we recognize 'async' in front of 'def', and |
1393 | | either one inside of 'async def'. (Technically we |
1394 | | shouldn't recognize these at all for 3.4 or earlier, |
1395 | | but there's no *valid* Python 3.4 code that would be |
1396 | | rejected, and async functions will be rejected in a |
1397 | | later phase.) */ |
1398 | 0 | if (!tok->async_hacks || tok->async_def) { |
1399 | | /* Always recognize the keywords. */ |
1400 | 0 | if (memcmp(tok->start, "async", 5) == 0) { |
1401 | 0 | return ASYNC; |
1402 | 0 | } |
1403 | 0 | if (memcmp(tok->start, "await", 5) == 0) { |
1404 | 0 | return AWAIT; |
1405 | 0 | } |
1406 | 0 | } |
1407 | 0 | else if (memcmp(tok->start, "async", 5) == 0) { |
1408 | | /* The current token is 'async'. |
1409 | | Look ahead one token to see if that is 'def'. */ |
1410 | |
1411 | 0 | struct tok_state ahead_tok; |
1412 | 0 | char *ahead_tok_start = NULL, *ahead_tok_end = NULL; |
1413 | 0 | int ahead_tok_kind; |
1414 | |
1415 | 0 | memcpy(&ahead_tok, tok, sizeof(ahead_tok)); |
1416 | 0 | ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start, |
1417 | 0 | &ahead_tok_end); |
1418 | |
1419 | 0 | if (ahead_tok_kind == NAME |
1420 | 0 | && ahead_tok.cur - ahead_tok.start == 3 |
1421 | 0 | && memcmp(ahead_tok.start, "def", 3) == 0) |
1422 | 0 | { |
1423 | | /* The next token is going to be 'def', so instead of |
1424 | | returning a plain NAME token, return ASYNC. */ |
1425 | 0 | tok->async_def_indent = tok->indent; |
1426 | 0 | tok->async_def = 1; |
1427 | 0 | return ASYNC; |
1428 | 0 | } |
1429 | 0 | } |
1430 | 0 | } |
1431 | | |
1432 | 524 | return NAME; |
1433 | 524 | } |
1434 | | |
1435 | | /* Newline */ |
1436 | 826 | if (c == '\n') { |
1437 | 252 | tok->atbol = 1; |
1438 | 252 | if (blankline || tok->level > 0) { |
1439 | 92 | goto nextline; |
1440 | 92 | } |
1441 | 160 | *p_start = tok->start; |
1442 | 160 | *p_end = tok->cur - 1; /* Leave '\n' out of the string */ |
1443 | 160 | tok->cont_line = 0; |
1444 | 160 | if (tok->async_def) { |
1445 | | /* We're somewhere inside an 'async def' function, and |
1446 | | we've encountered a NEWLINE after its signature. */ |
1447 | 0 | tok->async_def_nl = 1; |
1448 | 0 | } |
1449 | 160 | return NEWLINE; |
1450 | 252 | } |
1451 | | |
1452 | | /* Period or number starting with period? */ |
1453 | 574 | if (c == '.') { |
1454 | 60 | c = tok_nextc(tok); |
1455 | 60 | if (isdigit(c)) { |
1456 | 0 | goto fraction; |
1457 | 60 | } else if (c == '.') { |
1458 | 0 | c = tok_nextc(tok); |
1459 | 0 | if (c == '.') { |
1460 | 0 | *p_start = tok->start; |
1461 | 0 | *p_end = tok->cur; |
1462 | 0 | return ELLIPSIS; |
1463 | 0 | } |
1464 | 0 | else { |
1465 | 0 | tok_backup(tok, c); |
1466 | 0 | } |
1467 | 0 | tok_backup(tok, '.'); |
1468 | 0 | } |
1469 | 60 | else { |
1470 | 60 | tok_backup(tok, c); |
1471 | 60 | } |
1472 | 60 | *p_start = tok->start; |
1473 | 60 | *p_end = tok->cur; |
1474 | 60 | return DOT; |
1475 | 60 | } |
1476 | | |
1477 | | /* Number */ |
1478 | 514 | if (isdigit(c)) { |
1479 | 30 | if (c == '0') { |
1480 | | /* Hex, octal or binary -- maybe. */ |
1481 | 14 | c = tok_nextc(tok); |
1482 | 14 | if (c == 'x' || c == 'X') { |
1483 | | /* Hex */ |
1484 | 0 | c = tok_nextc(tok); |
1485 | 0 | do { |
1486 | 0 | if (c == '_') { |
1487 | 0 | c = tok_nextc(tok); |
1488 | 0 | } |
1489 | 0 | if (!isxdigit(c)) { |
1490 | 0 | tok_backup(tok, c); |
1491 | 0 | return syntaxerror(tok, "invalid hexadecimal literal"); |
1492 | 0 | } |
1493 | 0 | do { |
1494 | 0 | c = tok_nextc(tok); |
1495 | 0 | } while (isxdigit(c)); |
1496 | 0 | } while (c == '_'); |
1497 | 0 | } |
1498 | 14 | else if (c == 'o' || c == 'O') { |
1499 | | /* Octal */ |
1500 | 0 | c = tok_nextc(tok); |
1501 | 0 | do { |
1502 | 0 | if (c == '_') { |
1503 | 0 | c = tok_nextc(tok); |
1504 | 0 | } |
1505 | 0 | if (c < '0' || c >= '8') { |
1506 | 0 | tok_backup(tok, c); |
1507 | 0 | if (isdigit(c)) { |
1508 | 0 | return syntaxerror(tok, |
1509 | 0 | "invalid digit '%c' in octal literal", c); |
1510 | 0 | } |
1511 | 0 | else { |
1512 | 0 | return syntaxerror(tok, "invalid octal literal"); |
1513 | 0 | } |
1514 | 0 | } |
1515 | 0 | do { |
1516 | 0 | c = tok_nextc(tok); |
1517 | 0 | } while ('0' <= c && c < '8'); |
1518 | 0 | } while (c == '_'); |
1519 | 0 | if (isdigit(c)) { |
1520 | 0 | return syntaxerror(tok, |
1521 | 0 | "invalid digit '%c' in octal literal", c); |
1522 | 0 | } |
1523 | 0 | } |
1524 | 14 | else if (c == 'b' || c == 'B') { |
1525 | | /* Binary */ |
1526 | 0 | c = tok_nextc(tok); |
1527 | 0 | do { |
1528 | 0 | if (c == '_') { |
1529 | 0 | c = tok_nextc(tok); |
1530 | 0 | } |
1531 | 0 | if (c != '0' && c != '1') { |
1532 | 0 | tok_backup(tok, c); |
1533 | 0 | if (isdigit(c)) { |
1534 | 0 | return syntaxerror(tok, |
1535 | 0 | "invalid digit '%c' in binary literal", c); |
1536 | 0 | } |
1537 | 0 | else { |
1538 | 0 | return syntaxerror(tok, "invalid binary literal"); |
1539 | 0 | } |
1540 | 0 | } |
1541 | 0 | do { |
1542 | 0 | c = tok_nextc(tok); |
1543 | 0 | } while (c == '0' || c == '1'); |
1544 | 0 | } while (c == '_'); |
1545 | 0 | if (isdigit(c)) { |
1546 | 0 | return syntaxerror(tok, |
1547 | 0 | "invalid digit '%c' in binary literal", c); |
1548 | 0 | } |
1549 | 0 | } |
1550 | 14 | else { |
1551 | 14 | int nonzero = 0; |
1552 | | /* maybe old-style octal; c is first char of it */ |
1553 | | /* in any case, allow '0' as a literal */ |
1554 | 14 | while (1) { |
1555 | 14 | if (c == '_') { |
1556 | 0 | c = tok_nextc(tok); |
1557 | 0 | if (!isdigit(c)) { |
1558 | 0 | tok_backup(tok, c); |
1559 | 0 | return syntaxerror(tok, "invalid decimal literal"); |
1560 | 0 | } |
1561 | 0 | } |
1562 | 14 | if (c != '0') { |
1563 | 14 | break; |
1564 | 14 | } |
1565 | 0 | c = tok_nextc(tok); |
1566 | 0 | } |
1567 | 14 | if (isdigit(c)) { |
1568 | 0 | nonzero = 1; |
1569 | 0 | c = tok_decimal_tail(tok); |
1570 | 0 | if (c == 0) { |
1571 | 0 | return ERRORTOKEN; |
1572 | 0 | } |
1573 | 0 | } |
1574 | 14 | if (c == '.') { |
1575 | 2 | c = tok_nextc(tok); |
1576 | 2 | goto fraction; |
1577 | 2 | } |
1578 | 12 | else if (c == 'e' || c == 'E') { |
1579 | 0 | goto exponent; |
1580 | 0 | } |
1581 | 12 | else if (c == 'j' || c == 'J') { |
1582 | 0 | goto imaginary; |
1583 | 0 | } |
1584 | 12 | else if (nonzero) { |
1585 | | /* Old-style octal: now disallowed. */ |
1586 | 0 | tok_backup(tok, c); |
1587 | 0 | return syntaxerror(tok, |
1588 | 0 | "leading zeros in decimal integer " |
1589 | 0 | "literals are not permitted; " |
1590 | 0 | "use an 0o prefix for octal integers"); |
1591 | 0 | } |
1592 | 14 | } |
1593 | 14 | } |
1594 | 16 | else { |
1595 | | /* Decimal */ |
1596 | 16 | c = tok_decimal_tail(tok); |
1597 | 16 | if (c == 0) { |
1598 | 0 | return ERRORTOKEN; |
1599 | 0 | } |
1600 | 16 | { |
1601 | | /* Accept floating point numbers. */ |
1602 | 16 | if (c == '.') { |
1603 | 0 | c = tok_nextc(tok); |
1604 | 2 | fraction: |
1605 | | /* Fraction */ |
1606 | 2 | if (isdigit(c)) { |
1607 | 2 | c = tok_decimal_tail(tok); |
1608 | 2 | if (c == 0) { |
1609 | 0 | return ERRORTOKEN; |
1610 | 0 | } |
1611 | 2 | } |
1612 | 2 | } |
1613 | 18 | if (c == 'e' || c == 'E') { |
1614 | 0 | int e; |
1615 | 0 | exponent: |
1616 | 0 | e = c; |
1617 | | /* Exponent part */ |
1618 | 0 | c = tok_nextc(tok); |
1619 | 0 | if (c == '+' || c == '-') { |
1620 | 0 | c = tok_nextc(tok); |
1621 | 0 | if (!isdigit(c)) { |
1622 | 0 | tok_backup(tok, c); |
1623 | 0 | return syntaxerror(tok, "invalid decimal literal"); |
1624 | 0 | } |
1625 | 0 | } else if (!isdigit(c)) { |
1626 | 0 | tok_backup(tok, c); |
1627 | 0 | tok_backup(tok, e); |
1628 | 0 | *p_start = tok->start; |
1629 | 0 | *p_end = tok->cur; |
1630 | 0 | return NUMBER; |
1631 | 0 | } |
1632 | 0 | c = tok_decimal_tail(tok); |
1633 | 0 | if (c == 0) { |
1634 | 0 | return ERRORTOKEN; |
1635 | 0 | } |
1636 | 0 | } |
1637 | 18 | if (c == 'j' || c == 'J') { |
1638 | | /* Imaginary part */ |
1639 | 0 | imaginary: |
1640 | 0 | c = tok_nextc(tok); |
1641 | 0 | } |
1642 | 18 | } |
1643 | 18 | } |
1644 | 30 | tok_backup(tok, c); |
1645 | 30 | *p_start = tok->start; |
1646 | 30 | *p_end = tok->cur; |
1647 | 30 | return NUMBER; |
1648 | 30 | } |
1649 | | |
1650 | 490 | letter_quote: |
1651 | | /* String */ |
1652 | 490 | if (c == '\'' || c == '"') { |
1653 | 24 | int quote = c; |
1654 | 24 | int quote_size = 1; /* 1 or 3 */ |
1655 | 24 | int end_quote_size = 0; |
1656 | | |
1657 | | /* Nodes of type STRING, especially multi line strings |
1658 | | must be handled differently in order to get both |
1659 | | the starting line number and the column offset right. |
1660 | | (cf. issue 16806) */ |
1661 | 24 | tok->first_lineno = tok->lineno; |
1662 | 24 | tok->multi_line_start = tok->line_start; |
1663 | | |
1664 | | /* Find the quote size and start of string */ |
1665 | 24 | c = tok_nextc(tok); |
1666 | 24 | if (c == quote) { |
1667 | 0 | c = tok_nextc(tok); |
1668 | 0 | if (c == quote) { |
1669 | 0 | quote_size = 3; |
1670 | 0 | } |
1671 | 0 | else { |
1672 | 0 | end_quote_size = 1; /* empty string found */ |
1673 | 0 | } |
1674 | 0 | } |
1675 | 24 | if (c != quote) { |
1676 | 24 | tok_backup(tok, c); |
1677 | 24 | } |
1678 | | |
1679 | | /* Get rest of string */ |
1680 | 342 | while (end_quote_size != quote_size) { |
1681 | 318 | c = tok_nextc(tok); |
1682 | 318 | if (c == EOF) { |
1683 | 0 | if (quote_size == 3) { |
1684 | 0 | tok->done = E_EOFS; |
1685 | 0 | } |
1686 | 0 | else { |
1687 | 0 | tok->done = E_EOLS; |
1688 | 0 | } |
1689 | 0 | tok->cur = tok->inp; |
1690 | 0 | return ERRORTOKEN; |
1691 | 0 | } |
1692 | 318 | if (quote_size == 1 && c == '\n') { |
1693 | 0 | tok->done = E_EOLS; |
1694 | 0 | tok->cur = tok->inp; |
1695 | 0 | return ERRORTOKEN; |
1696 | 0 | } |
1697 | 318 | if (c == quote) { |
1698 | 24 | end_quote_size += 1; |
1699 | 24 | } |
1700 | 294 | else { |
1701 | 294 | end_quote_size = 0; |
1702 | 294 | if (c == '\\') { |
1703 | 6 | tok_nextc(tok); /* skip escaped char */ |
1704 | 6 | } |
1705 | 294 | } |
1706 | 318 | } |
1707 | | |
1708 | 24 | *p_start = tok->start; |
1709 | 24 | *p_end = tok->cur; |
1710 | 24 | return STRING; |
1711 | 24 | } |
1712 | | |
1713 | | /* Line continuation */ |
1714 | 466 | if (c == '\\') { |
1715 | 0 | c = tok_nextc(tok); |
1716 | 0 | if (c != '\n') { |
1717 | 0 | tok->done = E_LINECONT; |
1718 | 0 | tok->cur = tok->inp; |
1719 | 0 | return ERRORTOKEN; |
1720 | 0 | } |
1721 | 0 | c = tok_nextc(tok); |
1722 | 0 | if (c == EOF) { |
1723 | 0 | tok->done = E_EOF; |
1724 | 0 | tok->cur = tok->inp; |
1725 | 0 | return ERRORTOKEN; |
1726 | 0 | } else { |
1727 | 0 | tok_backup(tok, c); |
1728 | 0 | } |
1729 | 0 | tok->cont_line = 1; |
1730 | 0 | goto again; /* Read next line */ |
1731 | 0 | } |
1732 | | |
1733 | | /* Check for two-character token */ |
1734 | 466 | { |
1735 | 466 | int c2 = tok_nextc(tok); |
1736 | 466 | int token = PyToken_TwoChars(c, c2); |
1737 | 466 | if (token != OP) { |
1738 | 14 | int c3 = tok_nextc(tok); |
1739 | 14 | int token3 = PyToken_ThreeChars(c, c2, c3); |
1740 | 14 | if (token3 != OP) { |
1741 | 0 | token = token3; |
1742 | 0 | } |
1743 | 14 | else { |
1744 | 14 | tok_backup(tok, c3); |
1745 | 14 | } |
1746 | 14 | *p_start = tok->start; |
1747 | 14 | *p_end = tok->cur; |
1748 | 14 | return token; |
1749 | 14 | } |
1750 | 452 | tok_backup(tok, c2); |
1751 | 452 | } |
1752 | | |
1753 | | /* Keep track of parentheses nesting level */ |
1754 | 0 | switch (c) { |
1755 | 98 | case '(': |
1756 | 116 | case '[': |
1757 | 122 | case '{': |
1758 | 122 | if (tok->level >= MAXLEVEL) { |
1759 | 0 | return syntaxerror(tok, "too many nested parentheses"); |
1760 | 0 | } |
1761 | 122 | tok->parenstack[tok->level] = c; |
1762 | 122 | tok->parenlinenostack[tok->level] = tok->lineno; |
1763 | 122 | tok->level++; |
1764 | 122 | break; |
1765 | 98 | case ')': |
1766 | 116 | case ']': |
1767 | 122 | case '}': |
1768 | 122 | if (!tok->level) { |
1769 | 0 | return syntaxerror(tok, "unmatched '%c'", c); |
1770 | 0 | } |
1771 | 122 | tok->level--; |
1772 | 122 | int opening = tok->parenstack[tok->level]; |
1773 | 122 | if (!((opening == '(' && c == ')') || |
1774 | 122 | (opening == '[' && c == ']') || |
1775 | 122 | (opening == '{' && c == '}'))) |
1776 | 0 | { |
1777 | 0 | if (tok->parenlinenostack[tok->level] != tok->lineno) { |
1778 | 0 | return syntaxerror(tok, |
1779 | 0 | "closing parenthesis '%c' does not match " |
1780 | 0 | "opening parenthesis '%c' on line %d", |
1781 | 0 | c, opening, tok->parenlinenostack[tok->level]); |
1782 | 0 | } |
1783 | 0 | else { |
1784 | 0 | return syntaxerror(tok, |
1785 | 0 | "closing parenthesis '%c' does not match " |
1786 | 0 | "opening parenthesis '%c'", |
1787 | 0 | c, opening); |
1788 | 0 | } |
1789 | 0 | } |
1790 | 122 | break; |
1791 | 452 | } |
1792 | | |
1793 | | /* Punctuation character */ |
1794 | 452 | *p_start = tok->start; |
1795 | 452 | *p_end = tok->cur; |
1796 | 452 | return PyToken_OneChar(c); |
1797 | 452 | } |
1798 | | |
1799 | | int |
1800 | | PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end) |
1801 | 1.41k | { |
1802 | 1.41k | int result = tok_get(tok, p_start, p_end); |
1803 | 1.41k | if (tok->decoding_erred) { |
1804 | 0 | result = ERRORTOKEN; |
1805 | 0 | tok->done = E_DECODE; |
1806 | 0 | } |
1807 | 1.41k | return result; |
1808 | 1.41k | } |
1809 | | |
1810 | | /* Get the encoding of a Python file. Check for the coding cookie and check if |
1811 | | the file starts with a BOM. |
1812 | | |
1813 | | PyTokenizer_FindEncodingFilename() returns NULL when it can't find the |
1814 | | encoding in the first or second line of the file (in which case the encoding |
1815 | | should be assumed to be UTF-8). |
1816 | | |
1817 | | The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed |
1818 | | by the caller. */ |
1819 | | |
1820 | | char * |
1821 | | PyTokenizer_FindEncodingFilename(int fd, PyObject *filename) |
1822 | 0 | { |
1823 | 0 | struct tok_state *tok; |
1824 | 0 | FILE *fp; |
1825 | 0 | char *p_start =NULL , *p_end =NULL , *encoding = NULL; |
1826 | |
|
1827 | 0 | fd = _Py_dup(fd); |
1828 | 0 | if (fd < 0) { |
1829 | 0 | return NULL; |
1830 | 0 | } |
1831 | | |
1832 | 0 | fp = fdopen(fd, "r"); |
1833 | 0 | if (fp == NULL) { |
1834 | 0 | return NULL; |
1835 | 0 | } |
1836 | 0 | tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL); |
1837 | 0 | if (tok == NULL) { |
1838 | 0 | fclose(fp); |
1839 | 0 | return NULL; |
1840 | 0 | } |
1841 | 0 | if (filename != NULL) { |
1842 | 0 | Py_INCREF(filename); |
1843 | 0 | tok->filename = filename; |
1844 | 0 | } |
1845 | 0 | else { |
1846 | 0 | tok->filename = PyUnicode_FromString("<string>"); |
1847 | 0 | if (tok->filename == NULL) { |
1848 | 0 | fclose(fp); |
1849 | 0 | PyTokenizer_Free(tok); |
1850 | 0 | return encoding; |
1851 | 0 | } |
1852 | 0 | } |
1853 | 0 | while (tok->lineno < 2 && tok->done == E_OK) { |
1854 | 0 | PyTokenizer_Get(tok, &p_start, &p_end); |
1855 | 0 | } |
1856 | 0 | fclose(fp); |
1857 | 0 | if (tok->encoding) { |
1858 | 0 | encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1); |
1859 | 0 | if (encoding) |
1860 | 0 | strcpy(encoding, tok->encoding); |
1861 | 0 | } |
1862 | 0 | PyTokenizer_Free(tok); |
1863 | 0 | return encoding; |
1864 | 0 | } |
1865 | | |
1866 | | char * |
1867 | | PyTokenizer_FindEncoding(int fd) |
1868 | 0 | { |
1869 | 0 | return PyTokenizer_FindEncodingFilename(fd, NULL); |
1870 | 0 | } |
1871 | | |
1872 | | #ifdef Py_DEBUG |
1873 | | |
1874 | | void |
1875 | | tok_dump(int type, char *start, char *end) |
1876 | | { |
1877 | | printf("%s", _PyParser_TokenNames[type]); |
1878 | | if (type == NAME || type == NUMBER || type == STRING || type == OP) |
1879 | | printf("(%.*s)", (int)(end - start), start); |
1880 | | } |
1881 | | |
1882 | | #endif |
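
A minimal usage sketch (not part of the coverage listing above): it drives the string tokenizer implemented in this file through PyTokenizer_FromString, PyTokenizer_Get and PyTokenizer_Free, which are the entry points exercised by the counts shown for decode_str and tok_get. It assumes compilation inside the CPython 3.8 source tree (so "Python.h", Parser/tokenizer.h and Include/token.h resolve) and an initialized interpreter; dump_tokens, main and the sample input string are illustrative names only, not part of tokenizer.c.

    #define PY_SSIZE_T_CLEAN
    #include "Python.h"
    #include "tokenizer.h"   /* struct tok_state, PyTokenizer_* (Parser/) */
    #include <stdio.h>       /* token type constants come via token.h,
                                which tokenizer.h already includes */

    static void
    dump_tokens(const char *source)
    {
        /* exec_input=1: a trailing '\n' is appended if missing, mirroring
           exec-style input (see translate_newlines above). */
        struct tok_state *tok = PyTokenizer_FromString(source, 1);
        if (tok == NULL) {
            return;                      /* decoding or memory error */
        }
        for (;;) {
            char *start = NULL, *end = NULL;
            int type = PyTokenizer_Get(tok, &start, &end);
            if (type == ENDMARKER || type == ERRORTOKEN) {
                break;                   /* end of input or tokenizer error */
            }
            if (start != NULL && end != NULL) {
                /* start/end point into tok's buffer and are only valid
                   until the next PyTokenizer_Get call. */
                printf("%-3d %.*s\n", type, (int)(end - start), start);
            }
            else {
                printf("%-3d\n", type);  /* e.g. INDENT/DEDENT carry no text */
            }
        }
        PyTokenizer_Free(tok);
    }

    int
    main(void)
    {
        Py_Initialize();                 /* PyMem_* and PyUnicode_* need a live runtime */
        dump_tokens("x = 1 + 2\n");
        Py_FinalizeEx();
        return 0;
    }

On the sample input this walks the fast path of tok_nextc (tok->fp is NULL, so lines are sliced out of the in-memory buffer set up by decode_str), which is why the file-reading branches above show zero counts in this report.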