/src/cpython/Parser/tokenizer/string_tokenizer.c
Line | Count | Source |
1 | | #include "Python.h" |
2 | | #include "errcode.h" |
3 | | |
4 | | #include "helpers.h" |
5 | | #include "../lexer/state.h" |
6 | | |
7 | | static int |
8 | 169k | tok_underflow_string(struct tok_state *tok) { |
9 | 169k | char *end = strchr(tok->inp, '\n'); |
10 | 169k | if (end != NULL) { |
11 | 155k | end++; |
12 | 155k | } |
13 | 13.8k | else { |
14 | 13.8k | end = strchr(tok->inp, '\0'); |
15 | 13.8k | if (end == tok->inp) { |
16 | 13.8k | tok->done = E_EOF; |
17 | 13.8k | return 0; |
18 | 13.8k | } |
19 | 13.8k | } |
20 | 155k | if (tok->start == NULL) { |
21 | 147k | tok->buf = tok->cur; |
22 | 147k | } |
23 | 155k | tok->line_start = tok->cur; |
24 | 155k | ADVANCE_LINENO(); |
25 | 155k | tok->inp = end; |
26 | 155k | return 1; |
27 | 169k | } |
28 | | |
29 | | /* Fetch a byte from TOK, using the string buffer. */ |
30 | | static int |
31 | 18.4k | buf_getc(struct tok_state *tok) { |
32 | 18.4k | return Py_CHARMASK(*tok->str++); |
33 | 18.4k | } |
34 | | |
35 | | /* Unfetch a byte from TOK, using the string buffer. */ |
36 | | static void |
37 | 18.4k | buf_ungetc(int c, struct tok_state *tok) { |
38 | 18.4k | tok->str--; |
39 | 18.4k | assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */ |
40 | 18.4k | } |
41 | | |
42 | | /* Set the readline function for TOK to ENC. For the string-based |
43 | | tokenizer, this means to just record the encoding. */ |
44 | | static int |
45 | 3.32k | buf_setreadl(struct tok_state *tok, const char* enc) { |
46 | 3.32k | tok->enc = enc; |
47 | 3.32k | return 1; |
48 | 3.32k | } |
49 | | |
50 | | /* Decode a byte string STR for use as the buffer of TOK. |
51 | | Look for encoding declarations inside STR, and record them |
52 | | inside TOK. */ |
/* Decode a byte string STR for use as the buffer of TOK.
   Look for encoding declarations inside STR, and record them
   inside TOK.
   Returns the decoded UTF-8 buffer (also stored in tok->str), or NULL
   on failure.  On success tok->input owns the live buffer. */
static char *
decode_str(const char *input, int single, struct tok_state *tok, int preserve_crlf)
{
    PyObject* utf8 = NULL;
    char *str;
    const char *s;
    /* Start positions of up to the first two '\n' bytes: a PEP 263
       coding spec may only appear on line 1 or line 2. */
    const char *newl[2] = {NULL, NULL};
    int lineno = 0;
    /* NOTE(review): presumably normalizes \r\n / \r line endings unless
       preserve_crlf — confirm against the helper's definition. */
    tok->input = str = _PyTokenizer_translate_newlines(input, single, preserve_crlf, tok);
    if (str == NULL)
        return NULL;
    tok->enc = NULL;
    tok->str = str;
    /* A BOM, if present, is consumed here and may record an encoding
       via buf_setreadl (which sets tok->enc). */
    if (!_PyTokenizer_check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
        return _PyTokenizer_error_ret(tok);
    str = tok->str; /* string after BOM if any */
    assert(str);
    if (tok->enc != NULL) {
        /* BOM declared an encoding: recode into UTF-8 right away. */
        utf8 = _PyTokenizer_translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return _PyTokenizer_error_ret(tok);
        str = PyBytes_AsString(utf8);
    }
    /* Locate the first two newlines (stop early at end of input). */
    for (s = str;; s++) {
        if (*s == '\0') break;
        else if (*s == '\n') {
            assert(lineno < 2);
            newl[lineno] = s;
            lineno++;
            if (lineno == 2) break;
        }
    }
    tok->enc = NULL;
    /* need to check line 1 and 2 separately since check_coding_spec
       assumes a single line as input */
    if (newl[0]) {
        tok->lineno = 1;
        if (!_PyTokenizer_check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) {
            return NULL;
        }
        /* Only consult line 2 when line 1 did not settle the encoding. */
        if (tok->enc == NULL && tok->decoding_state != STATE_NORMAL && newl[1]) {
            tok->lineno = 2;
            if (!_PyTokenizer_check_coding_spec(newl[0]+1, newl[1] - newl[0],
                                                tok, buf_setreadl))
                return NULL;
        }
    }
    tok->lineno = 0;
    if (tok->enc != NULL) {
        /* A coding spec was found (so the BOM path above did not run:
           utf8 must still be NULL).  Recode the buffer into UTF-8. */
        assert(utf8 == NULL);
        utf8 = _PyTokenizer_translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return _PyTokenizer_error_ret(tok);
        str = PyBytes_AS_STRING(utf8);
    }
    else if (!_PyTokenizer_ensure_utf8(str, tok, 1)) {
        /* No declared encoding: the raw bytes must already be UTF-8. */
        return _PyTokenizer_error_ret(tok);
    }
    if (utf8 != NULL) {
        /* Recoding may have reintroduced \r\n; translate again and make
           tok->input own the new buffer, releasing the old one and the
           temporary bytes object. */
        char *translated = _PyTokenizer_translate_newlines(
            str, single, preserve_crlf, tok);
        if (translated == NULL) {
            Py_DECREF(utf8);
            return _PyTokenizer_error_ret(tok);
        }
        PyMem_Free(tok->input);
        tok->input = translated;
        str = translated;
        Py_CLEAR(utf8);
    }
    tok->str = str;
    assert(tok->decoding_buffer == NULL);
    tok->decoding_buffer = utf8; /* CAUTION */
    return str;
}
128 | | |
129 | | /* Set up tokenizer for string */ |
130 | | struct tok_state * |
131 | | _PyTokenizer_FromString(const char *str, int exec_input, int preserve_crlf) |
132 | 18.3k | { |
133 | 18.3k | struct tok_state *tok = _PyTokenizer_tok_new(); |
134 | 18.3k | char *decoded; |
135 | | |
136 | 18.3k | if (tok == NULL) |
137 | 0 | return NULL; |
138 | 18.3k | decoded = decode_str(str, exec_input, tok, preserve_crlf); |
139 | 18.3k | if (decoded == NULL) { |
140 | 2.07k | _PyTokenizer_Free(tok); |
141 | 2.07k | return NULL; |
142 | 2.07k | } |
143 | | |
144 | 16.3k | tok->buf = tok->cur = tok->inp = decoded; |
145 | 16.3k | tok->end = decoded; |
146 | 16.3k | tok->underflow = &tok_underflow_string; |
147 | 16.3k | return tok; |
148 | 18.3k | } |