/src/cpython/Parser/tokenizer/helpers.c
Line | Count | Source |
1 | | #include "Python.h" |
2 | | #include "errcode.h" |
3 | | #include "pycore_token.h" |
4 | | |
5 | | #include "../lexer/state.h" |
6 | | |
7 | | |
8 | | /* ############## ERRORS ############## */ |
9 | | |
10 | | static int |
11 | | _syntaxerror_range(struct tok_state *tok, const char *format, |
12 | | int col_offset, int end_col_offset, |
13 | | va_list vargs) |
14 | 3.12k | { |
15 | | // In release builds, we don't want to overwrite a previous error, but in debug builds we |
16 | | // want to fail if we are not doing it so we can fix it. |
17 | 3.12k | assert(tok->done != E_ERROR); |
18 | 3.12k | if (tok->done == E_ERROR) { |
19 | 0 | return ERRORTOKEN; |
20 | 0 | } |
21 | 3.12k | PyObject *errmsg, *errtext, *args; |
22 | 3.12k | errmsg = PyUnicode_FromFormatV(format, vargs); |
23 | 3.12k | if (!errmsg) { |
24 | 0 | goto error; |
25 | 0 | } |
26 | | |
27 | 3.12k | errtext = PyUnicode_DecodeUTF8(tok->line_start, tok->cur - tok->line_start, |
28 | 3.12k | "replace"); |
29 | 3.12k | if (!errtext) { |
30 | 0 | goto error; |
31 | 0 | } |
32 | | |
33 | 3.12k | if (col_offset == -1) { |
34 | 2.46k | col_offset = (int)PyUnicode_GET_LENGTH(errtext); |
35 | 2.46k | } |
36 | 3.12k | if (end_col_offset == -1) { |
37 | 2.46k | end_col_offset = col_offset; |
38 | 2.46k | } |
39 | | |
40 | 3.12k | Py_ssize_t line_len = strcspn(tok->line_start, "\n"); |
41 | 3.12k | if (line_len != tok->cur - tok->line_start) { |
42 | 2.35k | Py_DECREF(errtext); |
43 | 2.35k | errtext = PyUnicode_DecodeUTF8(tok->line_start, line_len, |
44 | 2.35k | "replace"); |
45 | 2.35k | } |
46 | 3.12k | if (!errtext) { |
47 | 0 | goto error; |
48 | 0 | } |
49 | | |
50 | 3.12k | args = Py_BuildValue("(O(OiiNii))", errmsg, |
51 | 3.12k | tok->filename ? tok->filename : Py_None, |
52 | 3.12k | tok->lineno, col_offset, errtext, |
53 | 3.12k | tok->lineno, end_col_offset); |
54 | 3.12k | if (args) { |
55 | 3.12k | PyErr_SetObject(PyExc_SyntaxError, args); |
56 | 3.12k | Py_DECREF(args); |
57 | 3.12k | } |
58 | | |
59 | 3.12k | error: |
60 | 3.12k | Py_XDECREF(errmsg); |
61 | 3.12k | tok->done = E_ERROR; |
62 | 3.12k | return ERRORTOKEN; |
63 | 3.12k | } |
64 | | |
/* Report a SyntaxError with a printf-style message, letting
   _syntaxerror_range() derive both column offsets (they are passed as -1).
   Returns whatever _syntaxerror_range() returns. */
int
_PyTokenizer_syntaxerror(struct tok_state *tok, const char *format, ...)
{
    /* Pre-existing note: these errors are cleaned on startup.  Todo: fix it. */
    va_list ap;
    int token;

    va_start(ap, format);
    token = _syntaxerror_range(tok, format, -1, -1, ap);
    va_end(ap);

    return token;
}
75 | | |
/* Report a SyntaxError with a printf-style message and an explicit column
   range.  col_offset and end_col_offset are forwarded unchanged to
   _syntaxerror_range(), whose return value is returned. */
int
_PyTokenizer_syntaxerror_known_range(struct tok_state *tok,
                                     int col_offset, int end_col_offset,
                                     const char *format, ...)
{
    va_list ap;
    int token;

    va_start(ap, format);
    token = _syntaxerror_range(tok, format, col_offset, end_col_offset, ap);
    va_end(ap);

    return token;
}
87 | | |
88 | | int |
89 | | _PyTokenizer_indenterror(struct tok_state *tok) |
90 | 4 | { |
91 | 4 | tok->done = E_TABSPACE; |
92 | 4 | tok->cur = tok->inp; |
93 | 4 | return ERRORTOKEN; |
94 | 4 | } |
95 | | |
96 | | char * |
97 | | _PyTokenizer_error_ret(struct tok_state *tok) /* XXX */ |
98 | 2.60k | { |
99 | 2.60k | tok->decoding_erred = 1; |
100 | 2.60k | if ((tok->fp != NULL || tok->readline != NULL) && tok->buf != NULL) {/* see _PyTokenizer_Free */ |
101 | 0 | PyMem_Free(tok->buf); |
102 | 0 | } |
103 | 2.60k | tok->buf = tok->cur = tok->inp = NULL; |
104 | 2.60k | tok->start = NULL; |
105 | 2.60k | tok->end = NULL; |
106 | 2.60k | tok->done = E_DECODE; |
107 | 2.60k | return NULL; /* as if it were EOF */ |
108 | 2.60k | } |
109 | | |
110 | | int |
111 | | _PyTokenizer_warn_invalid_escape_sequence(struct tok_state *tok, int first_invalid_escape_char) |
112 | 624 | { |
113 | 624 | if (!tok->report_warnings) { |
114 | 0 | return 0; |
115 | 0 | } |
116 | | |
117 | 624 | PyObject *msg = PyUnicode_FromFormat( |
118 | 624 | "\"\\%c\" is an invalid escape sequence. " |
119 | 624 | "Such sequences will not work in the future. " |
120 | 624 | "Did you mean \"\\\\%c\"? A raw string is also an option.", |
121 | 624 | (char) first_invalid_escape_char, |
122 | 624 | (char) first_invalid_escape_char |
123 | 624 | ); |
124 | | |
125 | 624 | if (msg == NULL) { |
126 | 0 | return -1; |
127 | 0 | } |
128 | | |
129 | 624 | if (PyErr_WarnExplicitObject(PyExc_SyntaxWarning, msg, tok->filename, |
130 | 624 | tok->lineno, tok->module, NULL) < 0) { |
131 | 0 | Py_DECREF(msg); |
132 | |
|
133 | 0 | if (PyErr_ExceptionMatches(PyExc_SyntaxWarning)) { |
134 | | /* Replace the SyntaxWarning exception with a SyntaxError |
135 | | to get a more accurate error report */ |
136 | 0 | PyErr_Clear(); |
137 | |
|
138 | 0 | return _PyTokenizer_syntaxerror(tok, |
139 | 0 | "\"\\%c\" is an invalid escape sequence. " |
140 | 0 | "Did you mean \"\\\\%c\"? A raw string is also an option.", |
141 | 0 | (char) first_invalid_escape_char, |
142 | 0 | (char) first_invalid_escape_char); |
143 | 0 | } |
144 | | |
145 | 0 | return -1; |
146 | 0 | } |
147 | | |
148 | 624 | Py_DECREF(msg); |
149 | 624 | return 0; |
150 | 624 | } |
151 | | |
152 | | int |
153 | | _PyTokenizer_parser_warn(struct tok_state *tok, PyObject *category, const char *format, ...) |
154 | 11.2k | { |
155 | 11.2k | if (!tok->report_warnings) { |
156 | 0 | return 0; |
157 | 0 | } |
158 | | |
159 | 11.2k | PyObject *errmsg; |
160 | 11.2k | va_list vargs; |
161 | 11.2k | va_start(vargs, format); |
162 | 11.2k | errmsg = PyUnicode_FromFormatV(format, vargs); |
163 | 11.2k | va_end(vargs); |
164 | 11.2k | if (!errmsg) { |
165 | 0 | goto error; |
166 | 0 | } |
167 | | |
168 | 11.2k | if (PyErr_WarnExplicitObject(category, errmsg, tok->filename, |
169 | 11.2k | tok->lineno, tok->module, NULL) < 0) { |
170 | 0 | if (PyErr_ExceptionMatches(category)) { |
171 | | /* Replace the DeprecationWarning exception with a SyntaxError |
172 | | to get a more accurate error report */ |
173 | 0 | PyErr_Clear(); |
174 | 0 | _PyTokenizer_syntaxerror(tok, "%U", errmsg); |
175 | 0 | } |
176 | 0 | goto error; |
177 | 0 | } |
178 | 11.2k | Py_DECREF(errmsg); |
179 | 11.2k | return 0; |
180 | | |
181 | 0 | error: |
182 | 0 | Py_XDECREF(errmsg); |
183 | 0 | tok->done = E_ERROR; |
184 | 0 | return -1; |
185 | 11.2k | } |
186 | | |
187 | | |
188 | | /* ############## STRING MANIPULATION ############## */ |
189 | | |
190 | | char * |
191 | | _PyTokenizer_new_string(const char *s, Py_ssize_t len, struct tok_state *tok) |
192 | 86.2k | { |
193 | 86.2k | char* result = (char *)PyMem_Malloc(len + 1); |
194 | 86.2k | if (!result) { |
195 | 0 | tok->done = E_NOMEM; |
196 | 0 | PyErr_NoMemory(); |
197 | 0 | return NULL; |
198 | 0 | } |
199 | 86.2k | memcpy(result, s, len); |
200 | 86.2k | result[len] = '\0'; |
201 | 86.2k | return result; |
202 | 86.2k | } |
203 | | |
204 | | PyObject * |
205 | 4.08k | _PyTokenizer_translate_into_utf8(const char* str, const char* enc) { |
206 | 4.08k | PyObject *utf8; |
207 | 4.08k | PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL); |
208 | 4.08k | if (buf == NULL) |
209 | 1.93k | return NULL; |
210 | 2.14k | utf8 = PyUnicode_AsUTF8String(buf); |
211 | 2.14k | Py_DECREF(buf); |
212 | 2.14k | return utf8; |
213 | 4.08k | } |
214 | | |
215 | | char * |
216 | | _PyTokenizer_translate_newlines(const char *s, int exec_input, int preserve_crlf, |
217 | 105k | struct tok_state *tok) { |
218 | 105k | int skip_next_lf = 0; |
219 | 105k | size_t needed_length = strlen(s) + 2, final_length; |
220 | 105k | char *buf, *current; |
221 | 105k | char c = '\0'; |
222 | 105k | buf = PyMem_Malloc(needed_length); |
223 | 105k | if (buf == NULL) { |
224 | 0 | tok->done = E_NOMEM; |
225 | 0 | PyErr_NoMemory(); |
226 | 0 | return NULL; |
227 | 0 | } |
228 | 7.93M | for (current = buf; *s; s++, current++) { |
229 | 7.82M | c = *s; |
230 | 7.82M | if (skip_next_lf) { |
231 | 10.7k | skip_next_lf = 0; |
232 | 10.7k | if (c == '\n') { |
233 | 524 | c = *++s; |
234 | 524 | if (!c) |
235 | 8 | break; |
236 | 524 | } |
237 | 10.7k | } |
238 | 7.82M | if (!preserve_crlf && c == '\r') { |
239 | 10.8k | skip_next_lf = 1; |
240 | 10.8k | c = '\n'; |
241 | 10.8k | } |
242 | 7.82M | *current = c; |
243 | 7.82M | } |
244 | | /* If this is exec input, add a newline to the end of the string if |
245 | | there isn't one already. */ |
246 | 105k | if (exec_input && c != '\n' && c != '\0') { |
247 | 102k | *current = '\n'; |
248 | 102k | current++; |
249 | 102k | } |
250 | 105k | *current = '\0'; |
251 | 105k | final_length = current - buf + 1; |
252 | 105k | if (final_length < needed_length && final_length) { |
253 | | /* should never fail */ |
254 | 3.07k | char* result = PyMem_Realloc(buf, final_length); |
255 | 3.07k | if (result == NULL) { |
256 | 0 | PyMem_Free(buf); |
257 | 0 | } |
258 | 3.07k | buf = result; |
259 | 3.07k | } |
260 | 105k | return buf; |
261 | 105k | } |
262 | | |
263 | | /* ############## ENCODING STUFF ############## */ |
264 | | |
265 | | |
266 | | /* See whether the file starts with a BOM. If it does, |
267 | | invoke the set_readline function with the new encoding. |
268 | | Return 1 on success, 0 on failure. */ |
269 | | int |
270 | | _PyTokenizer_check_bom(int get_char(struct tok_state *), |
271 | | void unget_char(int, struct tok_state *), |
272 | | int set_readline(struct tok_state *, const char *), |
273 | | struct tok_state *tok) |
274 | 21.2k | { |
275 | 21.2k | int ch1, ch2, ch3; |
276 | 21.2k | ch1 = get_char(tok); |
277 | 21.2k | tok->decoding_state = STATE_SEEK_CODING; |
278 | 21.2k | if (ch1 == EOF) { |
279 | 0 | return 1; |
280 | 21.2k | } else if (ch1 == 0xEF) { |
281 | 64 | ch2 = get_char(tok); |
282 | 64 | if (ch2 != 0xBB) { |
283 | 56 | unget_char(ch2, tok); |
284 | 56 | unget_char(ch1, tok); |
285 | 56 | return 1; |
286 | 56 | } |
287 | 8 | ch3 = get_char(tok); |
288 | 8 | if (ch3 != 0xBF) { |
289 | 3 | unget_char(ch3, tok); |
290 | 3 | unget_char(ch2, tok); |
291 | 3 | unget_char(ch1, tok); |
292 | 3 | return 1; |
293 | 3 | } |
294 | 21.1k | } else { |
295 | 21.1k | unget_char(ch1, tok); |
296 | 21.1k | return 1; |
297 | 21.1k | } |
298 | 5 | if (tok->encoding != NULL) |
299 | 0 | PyMem_Free(tok->encoding); |
300 | 5 | tok->encoding = _PyTokenizer_new_string("utf-8", 5, tok); |
301 | 5 | if (!tok->encoding) |
302 | 0 | return 0; |
303 | | /* No need to set_readline: input is already utf-8 */ |
304 | 5 | return 1; |
305 | 5 | } |
306 | | |
/* Map common spellings of the utf-8 and latin-1 codec names to a canonical
   name.

   Only the first 12 characters of `s` are examined, lower-cased and with
   '_' replaced by '-'.  Returns the literal "utf-8" or "iso-8859-1" when
   that prefix matches one of the known spellings (optionally followed by
   "-<suffix>"); otherwise returns `s` itself, unchanged. */
static const char *
get_normal_name(const char *s)  /* for utf-8 and latin-1 */
{
    char lowered[13];
    int n = 0;

    while (n < 12 && s[n] != '\0') {
        int ch = s[n];
        if (ch == '_') {
            lowered[n] = '-';
        }
        else {
            lowered[n] = Py_TOLOWER(ch);
        }
        n++;
    }
    lowered[n] = '\0';

    if (strcmp(lowered, "utf-8") == 0 ||
        strncmp(lowered, "utf-8-", 6) == 0)
    {
        return "utf-8";
    }

    if (strcmp(lowered, "latin-1") == 0 ||
        strcmp(lowered, "iso-8859-1") == 0 ||
        strcmp(lowered, "iso-latin-1") == 0 ||
        strncmp(lowered, "latin-1-", 8) == 0 ||
        strncmp(lowered, "iso-8859-1-", 11) == 0 ||
        strncmp(lowered, "iso-latin-1-", 12) == 0)
    {
        return "iso-8859-1";
    }

    return s;
}
335 | | |
336 | | /* Return the coding spec in S, or NULL if none is found. */ |
337 | | static int |
338 | | get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok) |
339 | 21.7k | { |
340 | 21.7k | Py_ssize_t i; |
341 | 21.7k | *spec = NULL; |
342 | | /* Coding spec must be in a comment, and that comment must be |
343 | | * the only statement on the source code line. */ |
344 | 24.5k | for (i = 0; i < size - 6; i++) { |
345 | 20.0k | if (s[i] == '#') |
346 | 4.28k | break; |
347 | 15.7k | if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014') |
348 | 12.9k | return 1; |
349 | 15.7k | } |
350 | 250k | for (; i < size - 6; i++) { /* XXX inefficient search */ |
351 | 245k | const char* t = s + i; |
352 | 245k | if (memcmp(t, "coding", 6) == 0) { |
353 | 4.84k | const char* begin = NULL; |
354 | 4.84k | t += 6; |
355 | 4.84k | if (t[0] != ':' && t[0] != '=') |
356 | 353 | continue; |
357 | 5.22k | do { |
358 | 5.22k | t++; |
359 | 5.22k | } while (t[0] == ' ' || t[0] == '\t'); |
360 | | |
361 | 4.49k | begin = t; |
362 | 32.6k | while (Py_ISALNUM(t[0]) || |
363 | 7.36k | t[0] == '-' || t[0] == '_' || t[0] == '.') |
364 | 28.1k | t++; |
365 | | |
366 | 4.49k | if (begin < t) { |
367 | 4.08k | char* r = _PyTokenizer_new_string(begin, t - begin, tok); |
368 | 4.08k | const char* q; |
369 | 4.08k | if (!r) |
370 | 0 | return 0; |
371 | 4.08k | q = get_normal_name(r); |
372 | 4.08k | if (r != q) { |
373 | 19 | PyMem_Free(r); |
374 | 19 | r = _PyTokenizer_new_string(q, strlen(q), tok); |
375 | 19 | if (!r) |
376 | 0 | return 0; |
377 | 19 | } |
378 | 4.08k | *spec = r; |
379 | 4.08k | break; |
380 | 4.08k | } |
381 | 4.49k | } |
382 | 245k | } |
383 | 8.86k | return 1; |
384 | 8.86k | } |
385 | | |
386 | | /* Check whether the line contains a coding spec. If it does, |
387 | | invoke the set_readline function for the new encoding. |
388 | | This function receives the tok_state and the new encoding. |
389 | | Return 1 on success, 0 on failure. */ |
390 | | int |
391 | | _PyTokenizer_check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok, |
392 | | int set_readline(struct tok_state *, const char *)) |
393 | 21.7k | { |
394 | 21.7k | char *cs; |
395 | 21.7k | if (tok->cont_line) { |
396 | | /* It's a continuation line, so it can't be a coding spec. */ |
397 | 0 | tok->decoding_state = STATE_NORMAL; |
398 | 0 | return 1; |
399 | 0 | } |
400 | 21.7k | if (!get_coding_spec(line, &cs, size, tok)) { |
401 | 0 | return 0; |
402 | 0 | } |
403 | 21.7k | if (!cs) { |
404 | 17.7k | Py_ssize_t i; |
405 | 20.6k | for (i = 0; i < size; i++) { |
406 | 20.3k | if (line[i] == '#' || line[i] == '\n' || line[i] == '\r') |
407 | 469 | break; |
408 | 19.8k | if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') { |
409 | | /* Stop checking coding spec after a line containing |
410 | | * anything except a comment. */ |
411 | 16.9k | tok->decoding_state = STATE_NORMAL; |
412 | 16.9k | break; |
413 | 16.9k | } |
414 | 19.8k | } |
415 | 17.7k | return 1; |
416 | 17.7k | } |
417 | 4.08k | tok->decoding_state = STATE_NORMAL; |
418 | 4.08k | if (tok->encoding == NULL) { |
419 | 4.08k | assert(tok->decoding_readline == NULL); |
420 | 4.08k | if (strcmp(cs, "utf-8") != 0 && !set_readline(tok, cs)) { |
421 | 0 | _PyTokenizer_error_ret(tok); |
422 | 0 | PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs); |
423 | 0 | PyMem_Free(cs); |
424 | 0 | return 0; |
425 | 0 | } |
426 | 4.08k | tok->encoding = cs; |
427 | 4.08k | } else { /* then, compare cs with BOM */ |
428 | 3 | if (strcmp(tok->encoding, cs) != 0) { |
429 | 2 | tok->line_start = line; |
430 | 2 | tok->cur = (char *)line; |
431 | 2 | assert(size <= INT_MAX); |
432 | 2 | _PyTokenizer_syntaxerror_known_range(tok, 0, (int)size, |
433 | 2 | "encoding problem: %s with BOM", cs); |
434 | 2 | PyMem_Free(cs); |
435 | 2 | _PyTokenizer_error_ret(tok); |
436 | 2 | return 0; |
437 | 2 | } |
438 | 1 | PyMem_Free(cs); |
439 | 1 | } |
440 | 4.08k | return 1; |
441 | 4.08k | } |
442 | | |
/* Check whether the bytes at s start a valid UTF-8 sequence.
   Return the number of bytes forming the sequence if yes, 0 if not.
   s must point into a NUL-terminated buffer: a NUL byte is never a valid
   continuation byte, so the scan cannot run past the terminator.
   The special cases match those in stringlib/codecs.h:utf8_decode.
*/
static int
valid_utf8(const unsigned char* s)
{
    const unsigned char lead = s[0];
    int trailing;

    if (lead < 0x80) {
        /* single-byte code */
        return 1;
    }
    if (lead < 0xC2) {
        /* invalid sequence
           \x80-\xBF -- continuation byte
           \xC0-\xC1 -- fake 0000-007F */
        return 0;
    }

    if (lead < 0xE0) {
        /* \xC2\x80-\xDF\xBF -- 0080-07FF */
        trailing = 1;
    }
    else if (lead < 0xF0) {
        /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */
        if (lead == 0xE0 && s[1] < 0xA0) {
            /* invalid sequence
               \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */
            return 0;
        }
        if (lead == 0xED && s[1] >= 0xA0) {
            /* \xED\xA0\x80-\xED\xBF\xBF would decode to the surrogates
               D800-DFFF, which are not valid UTF-8 and are rejected.
               See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            return 0;
        }
        trailing = 2;
    }
    else if (lead < 0xF5) {
        /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */
        if (s[1] < 0x90) {
            if (lead == 0xF0) {
                /* \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */
                return 0;
            }
        }
        else if (lead == 0xF4) {
            /* \xF4\x90\x80\x80- -- 110000- overflow */
            return 0;
        }
        trailing = 3;
    }
    else {
        /* invalid start byte */
        return 0;
    }

    /* Every trailing byte must have the form 10xxxxxx. */
    for (int k = 1; k <= trailing; k++) {
        if ((s[k] & 0xC0) != 0x80) {
            return 0;
        }
    }
    return trailing + 1;
}
506 | | |
507 | | int |
508 | | _PyTokenizer_ensure_utf8(const char *line, struct tok_state *tok, int lineno) |
509 | 17.4k | { |
510 | 17.4k | const char *badchar = NULL; |
511 | 17.4k | const char *c; |
512 | 17.4k | int length; |
513 | 17.4k | int col_offset = 0; |
514 | 17.4k | const char *line_start = line; |
515 | 4.71M | for (c = line; *c; c += length) { |
516 | 4.70M | if (!(length = valid_utf8((const unsigned char *)c))) { |
517 | 613 | badchar = c; |
518 | 613 | break; |
519 | 613 | } |
520 | 4.69M | col_offset++; |
521 | 4.69M | if (*c == '\n') { |
522 | 194k | lineno++; |
523 | 194k | col_offset = 0; |
524 | 194k | line_start = c + 1; |
525 | 194k | } |
526 | 4.69M | } |
527 | 17.4k | if (badchar) { |
528 | 613 | tok->lineno = lineno; |
529 | 613 | tok->line_start = line_start; |
530 | 613 | tok->cur = (char *)badchar; |
531 | 613 | _PyTokenizer_syntaxerror_known_range(tok, |
532 | 613 | col_offset + 1, col_offset + 1, |
533 | 613 | "Non-UTF-8 code starting with '\\x%.2x'" |
534 | 613 | "%s%V on line %i, " |
535 | 613 | "but no encoding declared; " |
536 | 613 | "see https://peps.python.org/pep-0263/ for details", |
537 | 613 | (unsigned char)*badchar, |
538 | 613 | tok->filename ? " in file " : "", tok->filename, "", |
539 | 613 | lineno); |
540 | 613 | return 0; |
541 | 613 | } |
542 | 16.8k | return 1; |
543 | 17.4k | } |
544 | | |
545 | | |
546 | | /* ############## DEBUGGING STUFF ############## */ |
547 | | |
548 | | #ifdef Py_DEBUG |
549 | | void |
550 | | _PyTokenizer_print_escape(FILE *f, const char *s, Py_ssize_t size) |
551 | | { |
552 | | if (s == NULL) { |
553 | | fputs("NULL", f); |
554 | | return; |
555 | | } |
556 | | putc('"', f); |
557 | | while (size-- > 0) { |
558 | | unsigned char c = *s++; |
559 | | switch (c) { |
560 | | case '\n': fputs("\\n", f); break; |
561 | | case '\r': fputs("\\r", f); break; |
562 | | case '\t': fputs("\\t", f); break; |
563 | | case '\f': fputs("\\f", f); break; |
564 | | case '\'': fputs("\\'", f); break; |
565 | | case '"': fputs("\\\"", f); break; |
566 | | default: |
567 | | if (0x20 <= c && c <= 0x7f) |
568 | | putc(c, f); |
569 | | else |
570 | | fprintf(f, "\\x%02x", c); |
571 | | } |
572 | | } |
573 | | putc('"', f); |
574 | | } |
575 | | |
576 | | void |
577 | | _PyTokenizer_tok_dump(int type, char *start, char *end) |
578 | | { |
579 | | fprintf(stderr, "%s", _PyParser_TokenNames[type]); |
580 | | if (type == NAME || type == NUMBER || type == STRING || type == OP) |
581 | | fprintf(stderr, "(%.*s)", (int)(end - start), start); |
582 | | } |
583 | | #endif |