/src/cpython/Parser/tokenizer/string_tokenizer.c
Line | Count | Source |
1 | | #include "Python.h" |
2 | | #include "errcode.h" |
3 | | |
4 | | #include "helpers.h" |
5 | | #include "../lexer/state.h" |
6 | | |
7 | | static int |
8 | 243k | tok_underflow_string(struct tok_state *tok) { |
9 | 243k | char *end = strchr(tok->inp, '\n'); |
10 | 243k | if (end != NULL) { |
11 | 226k | end++; |
12 | 226k | } |
13 | 16.5k | else { |
14 | 16.5k | end = strchr(tok->inp, '\0'); |
15 | 16.5k | if (end == tok->inp) { |
16 | 16.3k | tok->done = E_EOF; |
17 | 16.3k | return 0; |
18 | 16.3k | } |
19 | 16.5k | } |
20 | 226k | if (tok->start == NULL) { |
21 | 209k | tok->buf = tok->cur; |
22 | 209k | } |
23 | 226k | tok->line_start = tok->cur; |
24 | 226k | ADVANCE_LINENO(); |
25 | 226k | tok->inp = end; |
26 | 226k | return 1; |
27 | 243k | } |
28 | | |
29 | | /* Fetch a byte from TOK, using the string buffer. */ |
30 | | static int |
31 | 21.6k | buf_getc(struct tok_state *tok) { |
32 | 21.6k | return Py_CHARMASK(*tok->str++); |
33 | 21.6k | } |
34 | | |
35 | | /* Unfetch a byte from TOK, using the string buffer. */ |
36 | | static void |
37 | 21.6k | buf_ungetc(int c, struct tok_state *tok) { |
38 | 21.6k | tok->str--; |
39 | 21.6k | assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */ |
40 | 21.6k | } |
41 | | |
/* Set the readline function for TOK to ENC.  For the string-based
   tokenizer there is no actual readline to install: the whole input is
   already in memory, so this just records the encoding for the later
   translate-into-UTF-8 pass (see decode_str).  Always succeeds. */
static int
buf_setreadl(struct tok_state *tok, const char* enc) {
    tok->enc = enc;
    return 1;
}
49 | | |
/* Decode a byte string STR for use as the buffer of TOK.
   Look for encoding declarations inside STR, and record them
   inside TOK.

   Pipeline: (1) translate newlines, (2) strip/record a UTF-8 BOM,
   (3) scan lines 1-2 for a PEP 263 coding spec, (4) re-encode the
   input to UTF-8 if a non-UTF-8 encoding was declared, else verify
   it already is UTF-8.

   Returns the decoded buffer (owned via tok->input/tok->decoding_buffer),
   or NULL on error. */
static char *
decode_str(const char *input, int single, struct tok_state *tok, int preserve_crlf)
{
    PyObject* utf8 = NULL;
    char *str;
    const char *s;
    /* Positions of the first two '\n' bytes; a coding spec may only
       appear on line 1 or line 2. */
    const char *newl[2] = {NULL, NULL};
    int lineno = 0;
    tok->input = str = _PyTokenizer_translate_newlines(input, single, preserve_crlf, tok);
    if (str == NULL)
        return NULL;
    tok->enc = NULL;
    tok->str = str;
    /* check_bom consumes a leading BOM (if any) through buf_getc/buf_ungetc
       and may record "utf-8" via buf_setreadl. */
    if (!_PyTokenizer_check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
        return _PyTokenizer_error_ret(tok);
    str = tok->str;             /* string after BOM if any */
    assert(str);
    /* BOM-declared encoding: convert immediately so the coding-spec scan
       below reads UTF-8 text. */
    if (tok->enc != NULL) {
        utf8 = _PyTokenizer_translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return _PyTokenizer_error_ret(tok);
        str = PyBytes_AsString(utf8);
    }
    /* Locate the ends of the first two lines (stop early after two). */
    for (s = str;; s++) {
        if (*s == '\0') break;
        else if (*s == '\n') {
            assert(lineno < 2);
            newl[lineno] = s;
            lineno++;
            if (lineno == 2) break;
        }
    }
    tok->enc = NULL;
    /* need to check line 1 and 2 separately since check_coding_spec
       assumes a single line as input */
    if (newl[0]) {
        tok->lineno = 1;
        if (!_PyTokenizer_check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) {
            return NULL;
        }
        /* Line 2 is only consulted when line 1 decided nothing
           (decoding_state still not NORMAL and no encoding found). */
        if (tok->enc == NULL && tok->decoding_state != STATE_NORMAL && newl[1]) {
            tok->lineno = 2;
            /* NOTE(review): newl[1] - newl[0] includes line 1's '\n' in the
               length passed for line 2 (one more byte than newl[0] - str does
               for line 1) — presumably check_coding_spec tolerates a trailing
               newline; confirm against its implementation. */
            if (!_PyTokenizer_check_coding_spec(newl[0]+1, newl[1] - newl[0],
                                                tok, buf_setreadl))
                return NULL;
        }
    }
    tok->lineno = 0;
    /* A coding spec named a non-UTF-8 encoding: re-encode the whole input. */
    if (tok->enc != NULL) {
        /* utf8 can only be non-NULL on the BOM path, which forces UTF-8,
           in which case no re-encoding is requested here. */
        assert(utf8 == NULL);
        utf8 = _PyTokenizer_translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return _PyTokenizer_error_ret(tok);
        str = PyBytes_AS_STRING(utf8);
    }
    /* No declared encoding: input must already be valid UTF-8. */
    else if (!_PyTokenizer_ensure_utf8(str, tok, 1)) {
        return _PyTokenizer_error_ret(tok);
    }
    assert(tok->decoding_buffer == NULL);
    /* CAUTION: tok takes ownership of the utf8 bytes object (may be NULL);
       str may point into it, so it must outlive the returned buffer. */
    tok->decoding_buffer = utf8;
    return str;
}
115 | | |
116 | | /* Set up tokenizer for string */ |
117 | | struct tok_state * |
118 | | _PyTokenizer_FromString(const char *str, int exec_input, int preserve_crlf) |
119 | 21.5k | { |
120 | 21.5k | struct tok_state *tok = _PyTokenizer_tok_new(); |
121 | 21.5k | char *decoded; |
122 | | |
123 | 21.5k | if (tok == NULL) |
124 | 0 | return NULL; |
125 | 21.5k | decoded = decode_str(str, exec_input, tok, preserve_crlf); |
126 | 21.5k | if (decoded == NULL) { |
127 | 2.63k | _PyTokenizer_Free(tok); |
128 | 2.63k | return NULL; |
129 | 2.63k | } |
130 | | |
131 | 18.9k | tok->buf = tok->cur = tok->inp = decoded; |
132 | 18.9k | tok->end = decoded; |
133 | 18.9k | tok->underflow = &tok_underflow_string; |
134 | 18.9k | return tok; |
135 | 21.5k | } |