/src/cpython/Parser/tokenizer/helpers.c
Line | Count | Source |
1 | | #include "Python.h" |
2 | | #include "errcode.h" |
3 | | #include "pycore_runtime.h" // _Py_ID() |
4 | | #include "pycore_token.h" |
5 | | #include "pycore_tuple.h" // _PyTuple_FromPair |
6 | | |
7 | | #include "../lexer/state.h" |
8 | | |
9 | | |
10 | | /* ############## ERRORS ############## */ |
11 | | |
12 | | static int |
13 | | _syntaxerror_range(struct tok_state *tok, const char *format, |
14 | | int col_offset, int end_col_offset, |
15 | | va_list vargs) |
16 | 3.07k | { |
17 | | // In release builds, we don't want to overwrite a previous error, but in debug builds we |
18 | | // want to fail if we are not doing it so we can fix it. |
19 | 3.07k | assert(tok->done != E_ERROR); |
20 | 3.07k | if (tok->done == E_ERROR) { |
21 | 0 | return ERRORTOKEN; |
22 | 0 | } |
23 | 3.07k | PyObject *errmsg, *errtext, *args; |
24 | 3.07k | errmsg = PyUnicode_FromFormatV(format, vargs); |
25 | 3.07k | if (!errmsg) { |
26 | 0 | goto error; |
27 | 0 | } |
28 | | |
29 | 3.07k | errtext = PyUnicode_DecodeUTF8(tok->line_start, tok->cur - tok->line_start, |
30 | 3.07k | "replace"); |
31 | 3.07k | if (!errtext) { |
32 | 0 | goto error; |
33 | 0 | } |
34 | | |
35 | 3.07k | if (col_offset == -1) { |
36 | 2.42k | col_offset = (int)PyUnicode_GET_LENGTH(errtext); |
37 | 2.42k | } |
38 | 3.07k | if (end_col_offset == -1) { |
39 | 2.42k | end_col_offset = col_offset; |
40 | 2.42k | } |
41 | | |
42 | 3.07k | Py_ssize_t line_len = strcspn(tok->line_start, "\n"); |
43 | 3.07k | if (line_len != tok->cur - tok->line_start) { |
44 | 2.32k | Py_DECREF(errtext); |
45 | 2.32k | errtext = PyUnicode_DecodeUTF8(tok->line_start, line_len, |
46 | 2.32k | "replace"); |
47 | 2.32k | } |
48 | 3.07k | if (!errtext) { |
49 | 0 | goto error; |
50 | 0 | } |
51 | | |
52 | 3.07k | args = Py_BuildValue("(O(OiiNii))", errmsg, |
53 | 3.07k | tok->filename ? tok->filename : Py_None, |
54 | 3.07k | tok->lineno, col_offset, errtext, |
55 | 3.07k | tok->lineno, end_col_offset); |
56 | 3.07k | if (args) { |
57 | 3.07k | PyErr_SetObject(PyExc_SyntaxError, args); |
58 | 3.07k | Py_DECREF(args); |
59 | 3.07k | } |
60 | | |
61 | 3.07k | error: |
62 | 3.07k | Py_XDECREF(errmsg); |
63 | 3.07k | tok->done = E_ERROR; |
64 | 3.07k | return ERRORTOKEN; |
65 | 3.07k | } |
66 | | |
/* Report a SyntaxError at the tokenizer's current position.

   Both column offsets are passed to _syntaxerror_range() as -1, so the
   column is derived from tok->cur.  Returns whatever _syntaxerror_range()
   returns. */
int
_PyTokenizer_syntaxerror(struct tok_state *tok, const char *format, ...)
{
    // These errors are cleaned on startup. Todo: Fix it.
    va_list ap;
    int token;

    va_start(ap, format);
    token = _syntaxerror_range(tok, format, -1, -1, ap);
    va_end(ap);

    return token;
}
77 | | |
/* Report a SyntaxError covering an explicit column range.

   `col_offset` and `end_col_offset` are forwarded unchanged to
   _syntaxerror_range(); the message is built from `format` and the
   variadic arguments.  Returns whatever _syntaxerror_range() returns. */
int
_PyTokenizer_syntaxerror_known_range(struct tok_state *tok,
                                     int col_offset, int end_col_offset,
                                     const char *format, ...)
{
    va_list ap;
    int token;

    va_start(ap, format);
    token = _syntaxerror_range(tok, format, col_offset, end_col_offset, ap);
    va_end(ap);

    return token;
}
89 | | |
90 | | int |
91 | | _PyTokenizer_indenterror(struct tok_state *tok) |
92 | 4 | { |
93 | 4 | tok->done = E_TABSPACE; |
94 | 4 | tok->cur = tok->inp; |
95 | 4 | return ERRORTOKEN; |
96 | 4 | } |
97 | | |
98 | | char * |
99 | | _PyTokenizer_error_ret(struct tok_state *tok) /* XXX */ |
100 | 2.59k | { |
101 | 2.59k | tok->decoding_erred = 1; |
102 | 2.59k | if ((tok->fp != NULL || tok->readline != NULL) && tok->buf != NULL) {/* see _PyTokenizer_Free */ |
103 | 0 | PyMem_Free(tok->buf); |
104 | 0 | } |
105 | 2.59k | tok->buf = tok->cur = tok->inp = NULL; |
106 | 2.59k | tok->start = NULL; |
107 | 2.59k | tok->end = NULL; |
108 | 2.59k | tok->done = E_DECODE; |
109 | 2.59k | return NULL; /* as if it were EOF */ |
110 | 2.59k | } |
111 | | |
112 | | int |
113 | | _PyTokenizer_warn_invalid_escape_sequence(struct tok_state *tok, int first_invalid_escape_char) |
114 | 579 | { |
115 | 579 | if (!tok->report_warnings) { |
116 | 0 | return 0; |
117 | 0 | } |
118 | | |
119 | 579 | PyObject *msg = PyUnicode_FromFormat( |
120 | 579 | "\"\\%c\" is an invalid escape sequence. " |
121 | 579 | "Such sequences will not work in the future. " |
122 | 579 | "Did you mean \"\\\\%c\"? A raw string is also an option.", |
123 | 579 | (char) first_invalid_escape_char, |
124 | 579 | (char) first_invalid_escape_char |
125 | 579 | ); |
126 | | |
127 | 579 | if (msg == NULL) { |
128 | 0 | return -1; |
129 | 0 | } |
130 | | |
131 | 579 | if (PyErr_WarnExplicitObject(PyExc_SyntaxWarning, msg, tok->filename, |
132 | 579 | tok->lineno, tok->module, NULL) < 0) { |
133 | 0 | Py_DECREF(msg); |
134 | |
|
135 | 0 | if (PyErr_ExceptionMatches(PyExc_SyntaxWarning)) { |
136 | | /* Replace the SyntaxWarning exception with a SyntaxError |
137 | | to get a more accurate error report */ |
138 | 0 | PyErr_Clear(); |
139 | |
|
140 | 0 | return _PyTokenizer_syntaxerror(tok, |
141 | 0 | "\"\\%c\" is an invalid escape sequence. " |
142 | 0 | "Did you mean \"\\\\%c\"? A raw string is also an option.", |
143 | 0 | (char) first_invalid_escape_char, |
144 | 0 | (char) first_invalid_escape_char); |
145 | 0 | } |
146 | | |
147 | 0 | return -1; |
148 | 0 | } |
149 | | |
150 | 579 | Py_DECREF(msg); |
151 | 579 | return 0; |
152 | 579 | } |
153 | | |
154 | | void |
155 | | _PyTokenizer_raise_init_error(PyObject *filename) |
156 | 2.59k | { |
157 | 2.59k | if (!(PyErr_ExceptionMatches(PyExc_LookupError) |
158 | 2.38k | || PyErr_ExceptionMatches(PyExc_SyntaxError) |
159 | 1.78k | || PyErr_ExceptionMatches(PyExc_ValueError) |
160 | 60 | || PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))) { |
161 | 60 | return; |
162 | 60 | } |
163 | 2.53k | PyObject *errstr = NULL; |
164 | 2.53k | PyObject *tuple = NULL; |
165 | 2.53k | PyObject *type; |
166 | 2.53k | PyObject *value; |
167 | 2.53k | PyObject *tback; |
168 | 2.53k | PyErr_Fetch(&type, &value, &tback); |
169 | 2.53k | if (PyErr_GivenExceptionMatches(value, PyExc_SyntaxError)) { |
170 | 598 | if (PyObject_SetAttr(value, &_Py_ID(filename), filename)) { |
171 | 0 | goto error; |
172 | 0 | } |
173 | 598 | PyErr_Restore(type, value, tback); |
174 | 598 | return; |
175 | 598 | } |
176 | 1.94k | errstr = PyObject_Str(value); |
177 | 1.94k | if (!errstr) { |
178 | 0 | goto error; |
179 | 0 | } |
180 | | |
181 | 1.94k | PyObject *tmp = Py_BuildValue("(OiiO)", filename, 0, -1, Py_None); |
182 | 1.94k | if (!tmp) { |
183 | 0 | goto error; |
184 | 0 | } |
185 | | |
186 | 1.94k | tuple = _PyTuple_FromPair(errstr, tmp); |
187 | 1.94k | Py_DECREF(tmp); |
188 | 1.94k | if (!tuple) { |
189 | 0 | goto error; |
190 | 0 | } |
191 | 1.94k | PyErr_SetObject(PyExc_SyntaxError, tuple); |
192 | | |
193 | 1.94k | error: |
194 | 1.94k | Py_XDECREF(type); |
195 | 1.94k | Py_XDECREF(value); |
196 | 1.94k | Py_XDECREF(tback); |
197 | 1.94k | Py_XDECREF(errstr); |
198 | 1.94k | Py_XDECREF(tuple); |
199 | 1.94k | } |
200 | | |
201 | | int |
202 | | _PyTokenizer_parser_warn(struct tok_state *tok, PyObject *category, const char *format, ...) |
203 | 10.3k | { |
204 | 10.3k | if (!tok->report_warnings) { |
205 | 0 | return 0; |
206 | 0 | } |
207 | | |
208 | 10.3k | PyObject *errmsg; |
209 | 10.3k | va_list vargs; |
210 | 10.3k | va_start(vargs, format); |
211 | 10.3k | errmsg = PyUnicode_FromFormatV(format, vargs); |
212 | 10.3k | va_end(vargs); |
213 | 10.3k | if (!errmsg) { |
214 | 0 | goto error; |
215 | 0 | } |
216 | | |
217 | 10.3k | if (PyErr_WarnExplicitObject(category, errmsg, tok->filename, |
218 | 10.3k | tok->lineno, tok->module, NULL) < 0) { |
219 | 0 | if (PyErr_ExceptionMatches(category)) { |
220 | | /* Replace the DeprecationWarning exception with a SyntaxError |
221 | | to get a more accurate error report */ |
222 | 0 | PyErr_Clear(); |
223 | 0 | _PyTokenizer_syntaxerror(tok, "%U", errmsg); |
224 | 0 | } |
225 | 0 | goto error; |
226 | 0 | } |
227 | 10.3k | Py_DECREF(errmsg); |
228 | 10.3k | return 0; |
229 | | |
230 | 0 | error: |
231 | 0 | Py_XDECREF(errmsg); |
232 | 0 | tok->done = E_ERROR; |
233 | 0 | return -1; |
234 | 10.3k | } |
235 | | |
236 | | |
237 | | /* ############## STRING MANIPULATION ############## */ |
238 | | |
239 | | char * |
240 | | _PyTokenizer_new_string(const char *s, Py_ssize_t len, struct tok_state *tok) |
241 | 93.4k | { |
242 | 93.4k | char* result = (char *)PyMem_Malloc(len + 1); |
243 | 93.4k | if (!result) { |
244 | 0 | tok->done = E_NOMEM; |
245 | 0 | PyErr_NoMemory(); |
246 | 0 | return NULL; |
247 | 0 | } |
248 | 93.4k | memcpy(result, s, len); |
249 | 93.4k | result[len] = '\0'; |
250 | 93.4k | return result; |
251 | 93.4k | } |
252 | | |
253 | | PyObject * |
254 | 4.14k | _PyTokenizer_translate_into_utf8(const char* str, const char* enc) { |
255 | 4.14k | PyObject *utf8; |
256 | 4.14k | PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL); |
257 | 4.14k | if (buf == NULL) |
258 | 1.94k | return NULL; |
259 | 2.19k | utf8 = PyUnicode_AsUTF8String(buf); |
260 | 2.19k | Py_DECREF(buf); |
261 | 2.19k | return utf8; |
262 | 4.14k | } |
263 | | |
264 | | char * |
265 | | _PyTokenizer_translate_newlines(const char *s, int exec_input, int preserve_crlf, |
266 | 112k | struct tok_state *tok) { |
267 | 112k | int skip_next_lf = 0; |
268 | 112k | size_t needed_length = strlen(s) + 2, final_length; |
269 | 112k | char *buf, *current; |
270 | 112k | char c = '\0'; |
271 | 112k | buf = PyMem_Malloc(needed_length); |
272 | 112k | if (buf == NULL) { |
273 | 0 | tok->done = E_NOMEM; |
274 | 0 | PyErr_NoMemory(); |
275 | 0 | return NULL; |
276 | 0 | } |
277 | 7.67M | for (current = buf; *s; s++, current++) { |
278 | 7.56M | c = *s; |
279 | 7.56M | if (skip_next_lf) { |
280 | 10.6k | skip_next_lf = 0; |
281 | 10.6k | if (c == '\n') { |
282 | 527 | c = *++s; |
283 | 527 | if (!c) |
284 | 10 | break; |
285 | 527 | } |
286 | 10.6k | } |
287 | 7.56M | if (!preserve_crlf && c == '\r') { |
288 | 10.6k | skip_next_lf = 1; |
289 | 10.6k | c = '\n'; |
290 | 10.6k | } |
291 | 7.56M | *current = c; |
292 | 7.56M | } |
293 | | /* If this is exec input, add a newline to the end of the string if |
294 | | there isn't one already. */ |
295 | 112k | if (exec_input && c != '\n' && c != '\0') { |
296 | 109k | *current = '\n'; |
297 | 109k | current++; |
298 | 109k | } |
299 | 112k | *current = '\0'; |
300 | 112k | final_length = current - buf + 1; |
301 | 112k | if (final_length < needed_length && final_length) { |
302 | | /* should never fail */ |
303 | 3.08k | char* result = PyMem_Realloc(buf, final_length); |
304 | 3.08k | if (result == NULL) { |
305 | 0 | PyMem_Free(buf); |
306 | 0 | } |
307 | 3.08k | buf = result; |
308 | 3.08k | } |
309 | 112k | return buf; |
310 | 112k | } |
311 | | |
312 | | /* ############## ENCODING STUFF ############## */ |
313 | | |
314 | | |
315 | | /* See whether the file starts with a BOM. If it does, |
316 | | invoke the set_readline function with the new encoding. |
317 | | Return 1 on success, 0 on failure. */ |
318 | | int |
319 | | _PyTokenizer_check_bom(int get_char(struct tok_state *), |
320 | | void unget_char(int, struct tok_state *), |
321 | | int set_readline(struct tok_state *, const char *), |
322 | | struct tok_state *tok) |
323 | 20.8k | { |
324 | 20.8k | int ch1, ch2, ch3; |
325 | 20.8k | ch1 = get_char(tok); |
326 | 20.8k | tok->decoding_state = STATE_SEEK_CODING; |
327 | 20.8k | if (ch1 == EOF) { |
328 | 0 | return 1; |
329 | 20.8k | } else if (ch1 == 0xEF) { |
330 | 64 | ch2 = get_char(tok); |
331 | 64 | if (ch2 != 0xBB) { |
332 | 53 | unget_char(ch2, tok); |
333 | 53 | unget_char(ch1, tok); |
334 | 53 | return 1; |
335 | 53 | } |
336 | 11 | ch3 = get_char(tok); |
337 | 11 | if (ch3 != 0xBF) { |
338 | 5 | unget_char(ch3, tok); |
339 | 5 | unget_char(ch2, tok); |
340 | 5 | unget_char(ch1, tok); |
341 | 5 | return 1; |
342 | 5 | } |
343 | 20.7k | } else { |
344 | 20.7k | unget_char(ch1, tok); |
345 | 20.7k | return 1; |
346 | 20.7k | } |
347 | 6 | if (tok->encoding != NULL) |
348 | 0 | PyMem_Free(tok->encoding); |
349 | 6 | tok->encoding = _PyTokenizer_new_string("utf-8", 5, tok); |
350 | 6 | if (!tok->encoding) |
351 | 0 | return 0; |
352 | | /* No need to set_readline: input is already utf-8 */ |
353 | 6 | return 1; |
354 | 6 | } |
355 | | |
/* Map common spellings of the UTF-8 and Latin-1 codec names to a canonical
   literal ("utf-8" or "iso-8859-1").

   Only the first 12 characters of `s` are examined, after lower-casing
   and turning '_' into '-'.  If no known spelling matches, `s` itself is
   returned, so callers can detect "no change" by pointer comparison. */
static const char *
get_normal_name(const char *s)  /* for utf-8 and latin-1 */
{
    char folded[13];
    int n = 0;

    while (n < 12 && s[n] != '\0') {
        int ch = s[n];
        folded[n] = (ch == '_') ? '-' : Py_TOLOWER(ch);
        n++;
    }
    folded[n] = '\0';

    if (strcmp(folded, "utf-8") == 0
        || strncmp(folded, "utf-8-", 6) == 0) {
        return "utf-8";
    }
    if (strcmp(folded, "latin-1") == 0
        || strcmp(folded, "iso-8859-1") == 0
        || strcmp(folded, "iso-latin-1") == 0
        || strncmp(folded, "latin-1-", 8) == 0
        || strncmp(folded, "iso-8859-1-", 11) == 0
        || strncmp(folded, "iso-latin-1-", 12) == 0) {
        return "iso-8859-1";
    }
    return s;
}
384 | | |
385 | | /* Return the coding spec in S, or NULL if none is found. */ |
386 | | static int |
387 | | get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok) |
388 | 21.3k | { |
389 | 21.3k | Py_ssize_t i; |
390 | 21.3k | *spec = NULL; |
391 | | /* Coding spec must be in a comment, and that comment must be |
392 | | * the only statement on the source code line. */ |
393 | 23.0k | for (i = 0; i < size - 6; i++) { |
394 | 18.6k | if (s[i] == '#') |
395 | 4.33k | break; |
396 | 14.3k | if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014') |
397 | 12.6k | return 1; |
398 | 14.3k | } |
399 | 243k | for (; i < size - 6; i++) { /* XXX inefficient search */ |
400 | 238k | const char* t = s + i; |
401 | 238k | if (memcmp(t, "coding", 6) == 0) { |
402 | 4.84k | const char* begin = NULL; |
403 | 4.84k | t += 6; |
404 | 4.84k | if (t[0] != ':' && t[0] != '=') |
405 | 418 | continue; |
406 | 5.21k | do { |
407 | 5.21k | t++; |
408 | 5.21k | } while (t[0] == ' ' || t[0] == '\t'); |
409 | | |
410 | 4.43k | begin = t; |
411 | 33.5k | while (Py_ISALNUM(t[0]) || |
412 | 7.32k | t[0] == '-' || t[0] == '_' || t[0] == '.') |
413 | 29.1k | t++; |
414 | | |
415 | 4.43k | if (begin < t) { |
416 | 4.14k | char* r = _PyTokenizer_new_string(begin, t - begin, tok); |
417 | 4.14k | const char* q; |
418 | 4.14k | if (!r) |
419 | 0 | return 0; |
420 | 4.14k | q = get_normal_name(r); |
421 | 4.14k | if (r != q) { |
422 | 19 | PyMem_Free(r); |
423 | 19 | r = _PyTokenizer_new_string(q, strlen(q), tok); |
424 | 19 | if (!r) |
425 | 0 | return 0; |
426 | 19 | } |
427 | 4.14k | *spec = r; |
428 | 4.14k | break; |
429 | 4.14k | } |
430 | 4.43k | } |
431 | 238k | } |
432 | 8.71k | return 1; |
433 | 8.71k | } |
434 | | |
435 | | /* Check whether the line contains a coding spec. If it does, |
436 | | invoke the set_readline function for the new encoding. |
437 | | This function receives the tok_state and the new encoding. |
438 | | Return 1 on success, 0 on failure. */ |
439 | | int |
440 | | _PyTokenizer_check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok, |
441 | | int set_readline(struct tok_state *, const char *)) |
442 | 21.3k | { |
443 | 21.3k | char *cs; |
444 | 21.3k | if (tok->cont_line) { |
445 | | /* It's a continuation line, so it can't be a coding spec. */ |
446 | 0 | tok->decoding_state = STATE_NORMAL; |
447 | 0 | return 1; |
448 | 0 | } |
449 | 21.3k | if (!get_coding_spec(line, &cs, size, tok)) { |
450 | 0 | return 0; |
451 | 0 | } |
452 | 21.3k | if (!cs) { |
453 | 17.2k | Py_ssize_t i; |
454 | 19.1k | for (i = 0; i < size; i++) { |
455 | 18.7k | if (line[i] == '#' || line[i] == '\n' || line[i] == '\r') |
456 | 447 | break; |
457 | 18.3k | if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') { |
458 | | /* Stop checking coding spec after a line containing |
459 | | * anything except a comment. */ |
460 | 16.4k | tok->decoding_state = STATE_NORMAL; |
461 | 16.4k | break; |
462 | 16.4k | } |
463 | 18.3k | } |
464 | 17.2k | return 1; |
465 | 17.2k | } |
466 | 4.14k | tok->decoding_state = STATE_NORMAL; |
467 | 4.14k | if (tok->encoding == NULL) { |
468 | 4.14k | assert(tok->decoding_readline == NULL); |
469 | 4.14k | if (strcmp(cs, "utf-8") != 0 && !set_readline(tok, cs)) { |
470 | 0 | _PyTokenizer_raise_init_error(tok->filename); |
471 | 0 | _PyTokenizer_error_ret(tok); |
472 | 0 | PyMem_Free(cs); |
473 | 0 | return 0; |
474 | 0 | } |
475 | 4.14k | tok->encoding = cs; |
476 | 4.14k | } else { /* then, compare cs with BOM */ |
477 | 3 | if (strcmp(tok->encoding, cs) != 0) { |
478 | 2 | tok->line_start = line; |
479 | 2 | tok->cur = (char *)line; |
480 | 2 | assert(size <= INT_MAX); |
481 | 2 | _PyTokenizer_syntaxerror_known_range(tok, 0, (int)size, |
482 | 2 | "encoding problem: %s with BOM", cs); |
483 | 2 | PyMem_Free(cs); |
484 | 2 | _PyTokenizer_error_ret(tok); |
485 | 2 | return 0; |
486 | 2 | } |
487 | 1 | PyMem_Free(cs); |
488 | 1 | } |
489 | 4.14k | return 1; |
490 | 4.14k | } |
491 | | |
/* Check whether the bytes at s start a valid UTF-8 sequence.
   Return the number of bytes forming the sequence (1-4) if yes,
   0 if not.  The special cases match those in
   stringlib/codecs.h:utf8_decode.
*/
static int
valid_utf8(const unsigned char* s)
{
    const unsigned char lead = s[0];
    int trailing;

    if (lead < 0x80) {
        /* single-byte code */
        return 1;
    }
    if (lead < 0xC2) {
        /* invalid sequence
           \x80-\xBF -- continuation byte
           \xC0-\xC1 -- fake 0000-007F */
        return 0;
    }

    if (lead < 0xE0) {
        /* \xC2\x80-\xDF\xBF -- 0080-07FF */
        trailing = 1;
    }
    else if (lead < 0xF0) {
        /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */
        if (lead == 0xE0 && s[1] < 0xA0) {
            /* invalid sequence
               \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */
            return 0;
        }
        if (lead == 0xED && s[1] >= 0xA0) {
            /* \xED\xA0\x80-\xED\xBF\xBF would decode to surrogates
               D800-DFFF, which are not valid UTF-8.
               See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            return 0;
        }
        trailing = 2;
    }
    else if (lead < 0xF5) {
        /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */
        if ((lead == 0xF0 && s[1] < 0x90) ||
            (lead == 0xF4 && s[1] >= 0x90)) {
            /* invalid sequence -- one of:
               \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF
               \xF4\x90\x80\x80- -- 110000- overflow */
            return 0;
        }
        trailing = 3;
    }
    else {
        /* invalid start byte */
        return 0;
    }

    /* Every trailing byte must have the form 10xxxxxx. */
    for (int k = 1; k <= trailing; k++) {
        if ((s[k] & 0xC0) != 0x80) {
            return 0;
        }
    }
    return trailing + 1;
}
555 | | |
556 | | int |
557 | | _PyTokenizer_ensure_utf8(const char *line, struct tok_state *tok, int lineno) |
558 | 17.0k | { |
559 | 17.0k | const char *badchar = NULL; |
560 | 17.0k | const char *c; |
561 | 17.0k | int length; |
562 | 17.0k | int col_offset = 0; |
563 | 17.0k | const char *line_start = line; |
564 | 4.21M | for (c = line; *c; c += length) { |
565 | 4.20M | if (!(length = valid_utf8((const unsigned char *)c))) { |
566 | 596 | badchar = c; |
567 | 596 | break; |
568 | 596 | } |
569 | 4.20M | col_offset++; |
570 | 4.20M | if (*c == '\n') { |
571 | 180k | lineno++; |
572 | 180k | col_offset = 0; |
573 | 180k | line_start = c + 1; |
574 | 180k | } |
575 | 4.20M | } |
576 | 17.0k | if (badchar) { |
577 | 596 | tok->lineno = lineno; |
578 | 596 | tok->line_start = line_start; |
579 | 596 | tok->cur = (char *)badchar; |
580 | 596 | _PyTokenizer_syntaxerror_known_range(tok, |
581 | 596 | col_offset + 1, col_offset + 1, |
582 | 596 | "Non-UTF-8 code starting with '\\x%.2x'" |
583 | 596 | "%s%V on line %i, " |
584 | 596 | "but no encoding declared; " |
585 | 596 | "see https://peps.python.org/pep-0263/ for details", |
586 | 596 | (unsigned char)*badchar, |
587 | 596 | tok->filename ? " in file " : "", tok->filename, "", |
588 | 596 | lineno); |
589 | 596 | return 0; |
590 | 596 | } |
591 | 16.4k | return 1; |
592 | 17.0k | } |
593 | | |
594 | | |
595 | | /* ############## DEBUGGING STUFF ############## */ |
596 | | |
597 | | #ifdef Py_DEBUG |
598 | | void |
599 | | _PyTokenizer_print_escape(FILE *f, const char *s, Py_ssize_t size) |
600 | | { |
601 | | if (s == NULL) { |
602 | | fputs("NULL", f); |
603 | | return; |
604 | | } |
605 | | putc('"', f); |
606 | | while (size-- > 0) { |
607 | | unsigned char c = *s++; |
608 | | switch (c) { |
609 | | case '\n': fputs("\\n", f); break; |
610 | | case '\r': fputs("\\r", f); break; |
611 | | case '\t': fputs("\\t", f); break; |
612 | | case '\f': fputs("\\f", f); break; |
613 | | case '\'': fputs("\\'", f); break; |
614 | | case '"': fputs("\\\"", f); break; |
615 | | default: |
616 | | if (0x20 <= c && c <= 0x7f) |
617 | | putc(c, f); |
618 | | else |
619 | | fprintf(f, "\\x%02x", c); |
620 | | } |
621 | | } |
622 | | putc('"', f); |
623 | | } |
624 | | |
625 | | void |
626 | | _PyTokenizer_tok_dump(int type, char *start, char *end) |
627 | | { |
628 | | fprintf(stderr, "%s", _PyParser_TokenNames[type]); |
629 | | if (type == NAME || type == NUMBER || type == STRING || type == OP) |
630 | | fprintf(stderr, "(%.*s)", (int)(end - start), start); |
631 | | } |
632 | | #endif |