/src/cpython/Parser/tokenizer/helpers.c
Line | Count | Source |
1 | | #include "Python.h" |
2 | | #include "errcode.h" |
3 | | #include "pycore_token.h" |
4 | | |
5 | | #include "../lexer/state.h" |
6 | | |
7 | | |
8 | | /* ############## ERRORS ############## */ |
9 | | |
/* Set a SyntaxError covering columns [col_offset, end_col_offset] of the
   current line, mark the tokenizer as failed (tok->done = E_ERROR) and
   return ERRORTOKEN.  A col_offset or end_col_offset of -1 means "use the
   column the tokenizer has currently reached". */
static int
_syntaxerror_range(struct tok_state *tok, const char *format,
                   int col_offset, int end_col_offset,
                   va_list vargs)
{
    // In release builds, we don't want to overwrite a previous error, but in debug builds we
    // want to fail if we are not doing it so we can fix it.
    assert(tok->done != E_ERROR);
    if (tok->done == E_ERROR) {
        return ERRORTOKEN;
    }
    PyObject *errmsg, *errtext, *args;
    errmsg = PyUnicode_FromFormatV(format, vargs);
    if (!errmsg) {
        goto error;
    }

    /* Text consumed so far on the offending line; used to derive the
       default column when col_offset is -1. */
    errtext = PyUnicode_DecodeUTF8(tok->line_start, tok->cur - tok->line_start,
                                   "replace");
    if (!errtext) {
        goto error;
    }

    if (col_offset == -1) {
        col_offset = (int)PyUnicode_GET_LENGTH(errtext);
    }
    if (end_col_offset == -1) {
        end_col_offset = col_offset;
    }

    /* If the tokenizer has not consumed the whole physical line, re-decode
       the full line so the traceback display shows all of it. */
    Py_ssize_t line_len = strcspn(tok->line_start, "\n");
    if (line_len != tok->cur - tok->line_start) {
        Py_DECREF(errtext);
        errtext = PyUnicode_DecodeUTF8(tok->line_start, line_len,
                                       "replace");
    }
    if (!errtext) {
        goto error;
    }

    /* The "N" format code steals the reference to errtext. */
    args = Py_BuildValue("(O(OiiNii))", errmsg,
                         tok->filename ? tok->filename : Py_None,
                         tok->lineno, col_offset, errtext,
                         tok->lineno, end_col_offset);
    if (args) {
        PyErr_SetObject(PyExc_SyntaxError, args);
        Py_DECREF(args);
    }

error:
    Py_XDECREF(errmsg);
    tok->done = E_ERROR;
    return ERRORTOKEN;
}
64 | | |
/* Report a SyntaxError at the tokenizer's current position (the column
   range is derived from tok->cur).  Always returns ERRORTOKEN. */
int
_PyTokenizer_syntaxerror(struct tok_state *tok, const char *format, ...)
{
    // These errors are cleaned on startup.  TODO: fix that.
    va_list args;
    va_start(args, format);
    int result = _syntaxerror_range(tok, format, -1, -1, args);
    va_end(args);
    return result;
}
75 | | |
/* Like _PyTokenizer_syntaxerror(), but with an explicit column range for
   the error location.  Always returns ERRORTOKEN. */
int
_PyTokenizer_syntaxerror_known_range(struct tok_state *tok,
                                     int col_offset, int end_col_offset,
                                     const char *format, ...)
{
    va_list args;
    va_start(args, format);
    int result = _syntaxerror_range(tok, format, col_offset, end_col_offset, args);
    va_end(args);
    return result;
}
87 | | |
88 | | int |
89 | | _PyTokenizer_indenterror(struct tok_state *tok) |
90 | 4 | { |
91 | 4 | tok->done = E_TABSPACE; |
92 | 4 | tok->cur = tok->inp; |
93 | 4 | return ERRORTOKEN; |
94 | 4 | } |
95 | | |
96 | | char * |
97 | | _PyTokenizer_error_ret(struct tok_state *tok) /* XXX */ |
98 | 2.63k | { |
99 | 2.63k | tok->decoding_erred = 1; |
100 | 2.63k | if ((tok->fp != NULL || tok->readline != NULL) && tok->buf != NULL) {/* see _PyTokenizer_Free */ |
101 | 0 | PyMem_Free(tok->buf); |
102 | 0 | } |
103 | 2.63k | tok->buf = tok->cur = tok->inp = NULL; |
104 | 2.63k | tok->start = NULL; |
105 | 2.63k | tok->end = NULL; |
106 | 2.63k | tok->done = E_DECODE; |
107 | 2.63k | return NULL; /* as if it were EOF */ |
108 | 2.63k | } |
109 | | |
/* Emit a SyntaxWarning for an invalid escape sequence ("\<char>") in a
   string literal.  Returns 0 on success (warning emitted or suppressed),
   -1 (or ERRORTOKEN via _PyTokenizer_syntaxerror) on error.  If the
   warning filter turns SyntaxWarning into an exception, it is replaced by
   a SyntaxError so the report points at the right location. */
int
_PyTokenizer_warn_invalid_escape_sequence(struct tok_state *tok, int first_invalid_escape_char)
{
    if (!tok->report_warnings) {
        return 0;
    }

    PyObject *msg = PyUnicode_FromFormat(
        "\"\\%c\" is an invalid escape sequence. "
        "Such sequences will not work in the future. "
        "Did you mean \"\\\\%c\"? A raw string is also an option.",
        (char) first_invalid_escape_char,
        (char) first_invalid_escape_char
    );

    if (msg == NULL) {
        return -1;
    }

    if (PyErr_WarnExplicitObject(PyExc_SyntaxWarning, msg, tok->filename,
                                 tok->lineno, NULL, NULL) < 0) {
        Py_DECREF(msg);

        if (PyErr_ExceptionMatches(PyExc_SyntaxWarning)) {
            /* Replace the SyntaxWarning exception with a SyntaxError
               to get a more accurate error report */
            PyErr_Clear();

            return _PyTokenizer_syntaxerror(tok,
                "\"\\%c\" is an invalid escape sequence. "
                "Did you mean \"\\\\%c\"? A raw string is also an option.",
                (char) first_invalid_escape_char,
                (char) first_invalid_escape_char);
        }

        /* Some other exception was raised while warning; propagate it. */
        return -1;
    }

    Py_DECREF(msg);
    return 0;
}
151 | | |
/* Emit a warning of the given `category` at the tokenizer's current line.
   Returns 0 on success, -1 on failure (with tok->done set to E_ERROR).
   If the warning filter raises the warning as an exception, it is replaced
   by a SyntaxError carrying the same message. */
int
_PyTokenizer_parser_warn(struct tok_state *tok, PyObject *category, const char *format, ...)
{
    if (!tok->report_warnings) {
        return 0;
    }

    PyObject *errmsg;
    va_list vargs;
    va_start(vargs, format);
    errmsg = PyUnicode_FromFormatV(format, vargs);
    va_end(vargs);
    if (!errmsg) {
        goto error;
    }

    if (PyErr_WarnExplicitObject(category, errmsg, tok->filename,
                                 tok->lineno, NULL, NULL) < 0) {
        if (PyErr_ExceptionMatches(category)) {
            /* Replace the DeprecationWarning exception with a SyntaxError
               to get a more accurate error report */
            PyErr_Clear();
            _PyTokenizer_syntaxerror(tok, "%U", errmsg);
        }
        goto error;
    }
    Py_DECREF(errmsg);
    return 0;

error:
    Py_XDECREF(errmsg);
    tok->done = E_ERROR;
    return -1;
}
186 | | |
187 | | |
188 | | /* ############## STRING MANIPULATION ############## */ |
189 | | |
190 | | char * |
191 | | _PyTokenizer_new_string(const char *s, Py_ssize_t len, struct tok_state *tok) |
192 | 4.73k | { |
193 | 4.73k | char* result = (char *)PyMem_Malloc(len + 1); |
194 | 4.73k | if (!result) { |
195 | 0 | tok->done = E_NOMEM; |
196 | 0 | return NULL; |
197 | 0 | } |
198 | 4.73k | memcpy(result, s, len); |
199 | 4.73k | result[len] = '\0'; |
200 | 4.73k | return result; |
201 | 4.73k | } |
202 | | |
203 | | PyObject * |
204 | 4.63k | _PyTokenizer_translate_into_utf8(const char* str, const char* enc) { |
205 | 4.63k | PyObject *utf8; |
206 | 4.63k | PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL); |
207 | 4.63k | if (buf == NULL) |
208 | 1.98k | return NULL; |
209 | 2.64k | utf8 = PyUnicode_AsUTF8String(buf); |
210 | 2.64k | Py_DECREF(buf); |
211 | 2.64k | return utf8; |
212 | 4.63k | } |
213 | | |
214 | | char * |
215 | | _PyTokenizer_translate_newlines(const char *s, int exec_input, int preserve_crlf, |
216 | 21.6k | struct tok_state *tok) { |
217 | 21.6k | int skip_next_lf = 0; |
218 | 21.6k | size_t needed_length = strlen(s) + 2, final_length; |
219 | 21.6k | char *buf, *current; |
220 | 21.6k | char c = '\0'; |
221 | 21.6k | buf = PyMem_Malloc(needed_length); |
222 | 21.6k | if (buf == NULL) { |
223 | 0 | tok->done = E_NOMEM; |
224 | 0 | return NULL; |
225 | 0 | } |
226 | 7.17M | for (current = buf; *s; s++, current++) { |
227 | 7.15M | c = *s; |
228 | 7.15M | if (skip_next_lf) { |
229 | 8.79k | skip_next_lf = 0; |
230 | 8.79k | if (c == '\n') { |
231 | 258 | c = *++s; |
232 | 258 | if (!c) |
233 | 3 | break; |
234 | 258 | } |
235 | 8.79k | } |
236 | 7.15M | if (!preserve_crlf && c == '\r') { |
237 | 8.83k | skip_next_lf = 1; |
238 | 8.83k | c = '\n'; |
239 | 8.83k | } |
240 | 7.15M | *current = c; |
241 | 7.15M | } |
242 | | /* If this is exec input, add a newline to the end of the string if |
243 | | there isn't one already. */ |
244 | 21.6k | if (exec_input && c != '\n' && c != '\0') { |
245 | 21.0k | *current = '\n'; |
246 | 21.0k | current++; |
247 | 21.0k | } |
248 | 21.6k | *current = '\0'; |
249 | 21.6k | final_length = current - buf + 1; |
250 | 21.6k | if (final_length < needed_length && final_length) { |
251 | | /* should never fail */ |
252 | 656 | char* result = PyMem_Realloc(buf, final_length); |
253 | 656 | if (result == NULL) { |
254 | 0 | PyMem_Free(buf); |
255 | 0 | } |
256 | 656 | buf = result; |
257 | 656 | } |
258 | 21.6k | return buf; |
259 | 21.6k | } |
260 | | |
261 | | /* ############## ENCODING STUFF ############## */ |
262 | | |
263 | | |
264 | | /* See whether the file starts with a BOM. If it does, |
265 | | invoke the set_readline function with the new encoding. |
266 | | Return 1 on success, 0 on failure. */ |
267 | | int |
268 | | _PyTokenizer_check_bom(int get_char(struct tok_state *), |
269 | | void unget_char(int, struct tok_state *), |
270 | | int set_readline(struct tok_state *, const char *), |
271 | | struct tok_state *tok) |
272 | 21.5k | { |
273 | 21.5k | int ch1, ch2, ch3; |
274 | 21.5k | ch1 = get_char(tok); |
275 | 21.5k | tok->decoding_state = STATE_SEEK_CODING; |
276 | 21.5k | if (ch1 == EOF) { |
277 | 0 | return 1; |
278 | 21.5k | } else if (ch1 == 0xEF) { |
279 | 64 | ch2 = get_char(tok); |
280 | 64 | if (ch2 != 0xBB) { |
281 | 56 | unget_char(ch2, tok); |
282 | 56 | unget_char(ch1, tok); |
283 | 56 | return 1; |
284 | 56 | } |
285 | 8 | ch3 = get_char(tok); |
286 | 8 | if (ch3 != 0xBF) { |
287 | 3 | unget_char(ch3, tok); |
288 | 3 | unget_char(ch2, tok); |
289 | 3 | unget_char(ch1, tok); |
290 | 3 | return 1; |
291 | 3 | } |
292 | 21.4k | } else { |
293 | 21.4k | unget_char(ch1, tok); |
294 | 21.4k | return 1; |
295 | 21.4k | } |
296 | 5 | if (tok->encoding != NULL) |
297 | 0 | PyMem_Free(tok->encoding); |
298 | 5 | tok->encoding = _PyTokenizer_new_string("utf-8", 5, tok); |
299 | 5 | if (!tok->encoding) |
300 | 0 | return 0; |
301 | | /* No need to set_readline: input is already utf-8 */ |
302 | 5 | return 1; |
303 | 5 | } |
304 | | |
/* Canonicalize an encoding name: lowercase up to the first 12 characters
   (mapping '_' to '-') and fold the utf-8 and latin-1 families onto their
   canonical spellings.  Any other name is returned unchanged. */
static const char *
get_normal_name(const char *s) /* for utf-8 and latin-1 */
{
    char lowered[13];
    int n = 0;
    while (n < 12 && s[n] != '\0') {
        int c = s[n];
        lowered[n] = (c == '_') ? '-' : Py_TOLOWER(c);
        n++;
    }
    lowered[n] = '\0';

    if (strcmp(lowered, "utf-8") == 0 ||
        strncmp(lowered, "utf-8-", 6) == 0) {
        return "utf-8";
    }
    if (strcmp(lowered, "latin-1") == 0 ||
        strcmp(lowered, "iso-8859-1") == 0 ||
        strcmp(lowered, "iso-latin-1") == 0 ||
        strncmp(lowered, "latin-1-", 8) == 0 ||
        strncmp(lowered, "iso-8859-1-", 11) == 0 ||
        strncmp(lowered, "iso-latin-1-", 12) == 0) {
        return "iso-8859-1";
    }
    return s;
}
333 | | |
/* Look for a PEP 263 coding spec ("coding[:=] <name>") inside a comment on
   line S of length SIZE.  On success return 1, storing a heap-allocated
   canonical encoding name in *spec, or leaving *spec NULL if the line has
   no coding spec.  Return 0 only on allocation failure (tok->done is set
   by _PyTokenizer_new_string). */
static int
get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
{
    Py_ssize_t i;
    *spec = NULL;
    /* Coding spec must be in a comment, and that comment must be
     * the only statement on the source code line. */
    for (i = 0; i < size - 6; i++) {
        if (s[i] == '#')
            break;
        if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
            return 1;  /* code precedes any comment: no spec here */
    }
    for (; i < size - 6; i++) { /* XXX inefficient search */
        const char* t = s + i;
        if (memcmp(t, "coding", 6) == 0) {
            const char* begin = NULL;
            t += 6;
            if (t[0] != ':' && t[0] != '=')
                continue;
            /* Skip the separator and any following blanks. */
            do {
                t++;
            } while (t[0] == ' ' || t[0] == '\t');

            begin = t;
            /* Encoding names consist of alphanumerics, '-', '_' and '.'. */
            while (Py_ISALNUM(t[0]) ||
                   t[0] == '-' || t[0] == '_' || t[0] == '.')
                t++;

            if (begin < t) {
                char* r = _PyTokenizer_new_string(begin, t - begin, tok);
                const char* q;
                if (!r)
                    return 0;
                /* Canonicalize, e.g. "UTF_8" -> "utf-8". */
                q = get_normal_name(r);
                if (r != q) {
                    PyMem_Free(r);
                    r = _PyTokenizer_new_string(q, strlen(q), tok);
                    if (!r)
                        return 0;
                }
                *spec = r;
                break;
            }
        }
    }
    return 1;
}
383 | | |
/* Check whether the line contains a coding spec. If it does,
   invoke the set_readline function for the new encoding.
   This function receives the tok_state and the new encoding.
   Return 1 on success, 0 on failure. */
int
_PyTokenizer_check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
                               int set_readline(struct tok_state *, const char *))
{
    char *cs;
    if (tok->cont_line) {
        /* It's a continuation line, so it can't be a coding spec. */
        tok->decoding_state = STATE_NORMAL;
        return 1;
    }
    if (!get_coding_spec(line, &cs, size, tok)) {
        return 0;  /* allocation failure; tok->done already set */
    }
    if (!cs) {
        /* No spec on this line: keep looking only if the line is blank
           or comment-only. */
        Py_ssize_t i;
        for (i = 0; i < size; i++) {
            if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
                break;
            if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
                /* Stop checking coding spec after a line containing
                 * anything except a comment. */
                tok->decoding_state = STATE_NORMAL;
                break;
            }
        }
        return 1;
    }
    tok->decoding_state = STATE_NORMAL;
    if (tok->encoding == NULL) {
        assert(tok->decoding_readline == NULL);
        /* utf-8 needs no re-decoding; any other encoding installs a
           decoding readline hook via set_readline. */
        if (strcmp(cs, "utf-8") != 0 && !set_readline(tok, cs)) {
            _PyTokenizer_error_ret(tok);
            PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
            PyMem_Free(cs);
            return 0;
        }
        tok->encoding = cs;  /* ownership of cs transfers to tok */
    } else {                /* then, compare cs with BOM */
        /* A BOM already fixed the encoding to utf-8; the declared
           encoding must agree with it. */
        if (strcmp(tok->encoding, cs) != 0) {
            tok->line_start = line;
            tok->cur = (char *)line;
            assert(size <= INT_MAX);
            _PyTokenizer_syntaxerror_known_range(tok, 0, (int)size,
                                                 "encoding problem: %s with BOM", cs);
            PyMem_Free(cs);
            _PyTokenizer_error_ret(tok);
            return 0;
        }
        PyMem_Free(cs);
    }
    return 1;
}
440 | | |
/* Check whether the characters at s start a valid
   UTF-8 sequence. Return the number of characters forming
   the sequence if yes, 0 if not. The special cases match
   those in stringlib/codecs.h:utf8_decode.
*/
static int
valid_utf8(const unsigned char* s)
{
    unsigned char lead = s[0];
    int cont;  /* number of continuation bytes expected after the lead */

    if (lead < 0x80) {
        /* single-byte (ASCII) code */
        return 1;
    }
    if (lead < 0xC2) {
        /* \x80-\xBF: stray continuation byte;
           \xC0-\xC1: overlong encoding of 0000-007F */
        return 0;
    }
    if (lead < 0xE0) {
        /* \xC2\x80-\xDF\xBF -- 0080-07FF */
        cont = 1;
    }
    else if (lead < 0xF0) {
        /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */
        if (lead == 0xE0 && s[1] < 0xA0) {
            /* overlong: \xE0\x80\x80-\xE0\x9F\xBF would fake 0000-0800 */
            return 0;
        }
        if (lead == 0xED && s[1] >= 0xA0) {
            /* \xED\xA0\x80-\xED\xBF\xBF would decode to surrogates
               D800-DFFF, which are not valid UTF-8.
               See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            return 0;
        }
        cont = 2;
    }
    else if (lead < 0xF5) {
        /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */
        if (s[1] < 0x90 ? lead == 0xF0 : lead == 0xF4) {
            /* \xF0\x80..-\xF0\x8F..: overlong fake of 0000-FFFF;
               \xF4\x90..-: beyond 10FFFF */
            return 0;
        }
        cont = 3;
    }
    else {
        /* \xF5-\xFF: invalid start byte */
        return 0;
    }

    /* Every continuation byte must be in \x80-\xBF. */
    for (int i = cont; i >= 1; i--) {
        if (s[i] < 0x80 || s[i] >= 0xC0) {
            return 0;
        }
    }
    return cont + 1;
}
502 | | |
503 | | int |
504 | | _PyTokenizer_ensure_utf8(const char *line, struct tok_state *tok, int lineno) |
505 | 16.9k | { |
506 | 16.9k | const char *badchar = NULL; |
507 | 16.9k | const char *c; |
508 | 16.9k | int length; |
509 | 16.9k | int col_offset = 0; |
510 | 16.9k | const char *line_start = line; |
511 | 6.46M | for (c = line; *c; c += length) { |
512 | 6.44M | if (!(length = valid_utf8((const unsigned char *)c))) { |
513 | 605 | badchar = c; |
514 | 605 | break; |
515 | 605 | } |
516 | 6.44M | col_offset++; |
517 | 6.44M | if (*c == '\n') { |
518 | 217k | lineno++; |
519 | 217k | col_offset = 0; |
520 | 217k | line_start = c + 1; |
521 | 217k | } |
522 | 6.44M | } |
523 | 16.9k | if (badchar) { |
524 | 605 | tok->lineno = lineno; |
525 | 605 | tok->line_start = line_start; |
526 | 605 | tok->cur = (char *)badchar; |
527 | 605 | _PyTokenizer_syntaxerror_known_range(tok, |
528 | 605 | col_offset + 1, col_offset + 1, |
529 | 605 | "Non-UTF-8 code starting with '\\x%.2x'" |
530 | 605 | "%s%V on line %i, " |
531 | 605 | "but no encoding declared; " |
532 | 605 | "see https://peps.python.org/pep-0263/ for details", |
533 | 605 | (unsigned char)*badchar, |
534 | 605 | tok->filename ? " in file " : "", tok->filename, "", |
535 | 605 | lineno); |
536 | 605 | return 0; |
537 | 605 | } |
538 | 16.3k | return 1; |
539 | 16.9k | } |
540 | | |
541 | | |
542 | | /* ############## DEBUGGING STUFF ############## */ |
543 | | |
544 | | #ifdef Py_DEBUG |
545 | | void |
546 | | _PyTokenizer_print_escape(FILE *f, const char *s, Py_ssize_t size) |
547 | | { |
548 | | if (s == NULL) { |
549 | | fputs("NULL", f); |
550 | | return; |
551 | | } |
552 | | putc('"', f); |
553 | | while (size-- > 0) { |
554 | | unsigned char c = *s++; |
555 | | switch (c) { |
556 | | case '\n': fputs("\\n", f); break; |
557 | | case '\r': fputs("\\r", f); break; |
558 | | case '\t': fputs("\\t", f); break; |
559 | | case '\f': fputs("\\f", f); break; |
560 | | case '\'': fputs("\\'", f); break; |
561 | | case '"': fputs("\\\"", f); break; |
562 | | default: |
563 | | if (0x20 <= c && c <= 0x7f) |
564 | | putc(c, f); |
565 | | else |
566 | | fprintf(f, "\\x%02x", c); |
567 | | } |
568 | | } |
569 | | putc('"', f); |
570 | | } |
571 | | |
572 | | void |
573 | | _PyTokenizer_tok_dump(int type, char *start, char *end) |
574 | | { |
575 | | fprintf(stderr, "%s", _PyParser_TokenNames[type]); |
576 | | if (type == NAME || type == NUMBER || type == STRING || type == OP) |
577 | | fprintf(stderr, "(%.*s)", (int)(end - start), start); |
578 | | } |
579 | | #endif |