/src/cpython/Parser/tokenizer/string_tokenizer.c
Line | Count | Source |
1 | | #include "Python.h" |
2 | | #include "errcode.h" |
3 | | |
4 | | #include "helpers.h" |
5 | | #include "../lexer/state.h" |
6 | | |
7 | | static int |
8 | 241k | tok_underflow_string(struct tok_state *tok) { |
9 | 241k | char *end = strchr(tok->inp, '\n'); |
10 | 241k | if (end != NULL) { |
11 | 224k | end++; |
12 | 224k | } |
13 | 17.8k | else { |
14 | 17.8k | end = strchr(tok->inp, '\0'); |
15 | 17.8k | if (end == tok->inp) { |
16 | 17.6k | tok->done = E_EOF; |
17 | 17.6k | return 0; |
18 | 17.6k | } |
19 | 17.8k | } |
20 | 224k | if (tok->start == NULL) { |
21 | 208k | tok->buf = tok->cur; |
22 | 208k | } |
23 | 224k | tok->line_start = tok->cur; |
24 | 224k | ADVANCE_LINENO(); |
25 | 224k | tok->inp = end; |
26 | 224k | return 1; |
27 | 241k | } |
28 | | |
29 | | /* Fetch a byte from TOK, using the string buffer. */ |
30 | | static int |
31 | 23.4k | buf_getc(struct tok_state *tok) { |
32 | 23.4k | return Py_CHARMASK(*tok->str++); |
33 | 23.4k | } |
34 | | |
35 | | /* Unfetch a byte from TOK, using the string buffer. */ |
36 | | static void |
37 | 23.4k | buf_ungetc(int c, struct tok_state *tok) { |
38 | 23.4k | tok->str--; |
39 | 23.4k | assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */ |
40 | 23.4k | } |
41 | | |
42 | | /* Set the readline function for TOK to ENC. For the string-based |
43 | | tokenizer, this means to just record the encoding. */ |
44 | | static int |
45 | 4.33k | buf_setreadl(struct tok_state *tok, const char* enc) { |
46 | 4.33k | tok->enc = enc; |
47 | 4.33k | return 1; |
48 | 4.33k | } |
49 | | |
50 | | /* Decode a byte string STR for use as the buffer of TOK. |
51 | | Look for encoding declarations inside STR, and record them |
52 | | inside TOK. */ |
53 | | static char * |
54 | | decode_str(const char *input, int single, struct tok_state *tok, int preserve_crlf) |
55 | 23.3k | { |
56 | 23.3k | PyObject* utf8 = NULL; |
57 | 23.3k | char *str; |
58 | 23.3k | const char *s; |
59 | 23.3k | const char *newl[2] = {NULL, NULL}; |
60 | 23.3k | int lineno = 0; |
61 | 23.3k | tok->input = str = _PyTokenizer_translate_newlines(input, single, preserve_crlf, tok); |
62 | 23.3k | if (str == NULL) |
63 | 0 | return NULL; |
64 | 23.3k | tok->enc = NULL; |
65 | 23.3k | tok->str = str; |
66 | 23.3k | if (!_PyTokenizer_check_bom(buf_getc, buf_ungetc, buf_setreadl, tok)) |
67 | 0 | return _PyTokenizer_error_ret(tok); |
68 | 23.3k | str = tok->str; /* string after BOM if any */ |
69 | 23.3k | assert(str); |
70 | 23.3k | if (tok->enc != NULL) { |
71 | 0 | utf8 = _PyTokenizer_translate_into_utf8(str, tok->enc); |
72 | 0 | if (utf8 == NULL) |
73 | 0 | return _PyTokenizer_error_ret(tok); |
74 | 0 | str = PyBytes_AsString(utf8); |
75 | 0 | } |
76 | 2.18M | for (s = str;; s++) { |
77 | 2.18M | if (*s == '\0') break; |
78 | 2.16M | else if (*s == '\n') { |
79 | 29.2k | assert(lineno < 2); |
80 | 29.2k | newl[lineno] = s; |
81 | 29.2k | lineno++; |
82 | 29.2k | if (lineno == 2) break; |
83 | 29.2k | } |
84 | 2.18M | } |
85 | 23.3k | tok->enc = NULL; |
86 | | /* need to check line 1 and 2 separately since check_coding_spec |
87 | | assumes a single line as input */ |
88 | 23.3k | if (newl[0]) { |
89 | 23.3k | if (!_PyTokenizer_check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) { |
90 | 2 | return NULL; |
91 | 2 | } |
92 | 23.3k | if (tok->enc == NULL && tok->decoding_state != STATE_NORMAL && newl[1]) { |
93 | 828 | if (!_PyTokenizer_check_coding_spec(newl[0]+1, newl[1] - newl[0], |
94 | 828 | tok, buf_setreadl)) |
95 | 1 | return NULL; |
96 | 828 | } |
97 | 23.3k | } |
98 | 23.3k | if (tok->enc != NULL) { |
99 | 4.33k | assert(utf8 == NULL); |
100 | 4.33k | utf8 = _PyTokenizer_translate_into_utf8(str, tok->enc); |
101 | 4.33k | if (utf8 == NULL) |
102 | 1.83k | return _PyTokenizer_error_ret(tok); |
103 | 2.49k | str = PyBytes_AS_STRING(utf8); |
104 | 2.49k | } |
105 | 21.5k | assert(tok->decoding_buffer == NULL); |
106 | 21.5k | tok->decoding_buffer = utf8; /* CAUTION */ |
107 | 21.5k | return str; |
108 | 23.3k | } |
109 | | |
110 | | /* Set up tokenizer for string */ |
111 | | struct tok_state * |
112 | | _PyTokenizer_FromString(const char *str, int exec_input, int preserve_crlf) |
113 | 23.3k | { |
114 | 23.3k | struct tok_state *tok = _PyTokenizer_tok_new(); |
115 | 23.3k | char *decoded; |
116 | | |
117 | 23.3k | if (tok == NULL) |
118 | 0 | return NULL; |
119 | 23.3k | decoded = decode_str(str, exec_input, tok, preserve_crlf); |
120 | 23.3k | if (decoded == NULL) { |
121 | 1.84k | _PyTokenizer_Free(tok); |
122 | 1.84k | return NULL; |
123 | 1.84k | } |
124 | | |
125 | 21.5k | tok->buf = tok->cur = tok->inp = decoded; |
126 | 21.5k | tok->end = decoded; |
127 | 21.5k | tok->underflow = &tok_underflow_string; |
128 | 21.5k | return tok; |
129 | 23.3k | } |
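
tok_underflow_string() above is the hook the lexer's byte-fetch loop calls whenever tok->cur has caught up with tok->inp: each successful call exposes one more line of the already in-memory string and returns 1, while a return of 0 with tok->done set to E_EOF means the string is exhausted. A minimal sketch of that calling contract, with a hypothetical next_byte() helper rather than the real lexer loop in Parser/lexer/:

    /* Sketch of the underflow contract (illustration, not the real lexer).
       Bytes between tok->cur and tok->inp are available; once they are
       consumed, the registered underflow hook must be asked for more. */
    static int
    next_byte(struct tok_state *tok)
    {
        while (tok->cur == tok->inp) {      /* buffered line fully consumed */
            if (tok->done != E_OK) {
                return EOF;                 /* error or end already reached */
            }
            if (!tok->underflow(tok)) {     /* e.g. tok_underflow_string() */
                return EOF;                 /* nothing left: tok->done == E_EOF */
            }
        }
        return Py_CHARMASK(*tok->cur++);    /* hand out one byte */
    }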
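
buf_getc(), buf_ungetc() and buf_setreadl() exist only so the shared helpers _PyTokenizer_check_bom() and _PyTokenizer_check_coding_spec() can read raw bytes and record an encoding through the same callback interface every tokenizer flavour provides. A hypothetical consumer of such a get/unget pair (illustrative only, not the real BOM check) could look like:

    /* Peek at one raw byte and push it back unless it could start a
       UTF-8 BOM; the real helper also verifies the following 0xBB 0xBF. */
    static int
    peek_for_bom(struct tok_state *tok,
                 int (*get_char)(struct tok_state *),
                 void (*unget_char)(int, struct tok_state *))
    {
        int ch = get_char(tok);             /* e.g. buf_getc() */
        if (ch != 0xEF) {
            unget_char(ch, tok);            /* leave the stream untouched */
            return 0;
        }
        return 1;
    }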
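
decode_str() records at most the first two newline positions because a PEP 263 coding declaration is only honoured on line 1 or line 2, and _PyTokenizer_check_coding_spec() assumes a single line as input (hence the separate newl[0] and newl[1] checks). A purely illustrative input whose cookie sits on the second line, the case the newl[1] branch handles:

    /* Illustrative buffer only: decode_str() would find the cookie on
       line 2, record "latin-1" via buf_setreadl(), and re-decode the
       whole buffer to UTF-8 before tokenization. */
    static const char example_source[] =
        "#!/usr/bin/env python3\n"
        "# -*- coding: latin-1 -*-\n"
        "s = '\xe9'\n";                     /* 0xE9 is 'é' in Latin-1 */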
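
_PyTokenizer_FromString() is the constructor the rest of the parser uses when compiling from an in-memory string. A minimal usage sketch, restricted to the entry points visible in this file and assuming the Parser-private headers are on the include path (fetching individual tokens goes through the Parser-internal lexer API, which is outside this listing):

    /* Construct a string tokenizer for SOURCE and release it again.
       Returns 0 if decoding failed (bad coding cookie, bad UTF-8, OOM). */
    static int
    can_make_tokenizer(const char *source)
    {
        struct tok_state *tok = _PyTokenizer_FromString(source,
                                                        /*exec_input=*/1,
                                                        /*preserve_crlf=*/0);
        if (tok == NULL) {
            return 0;
        }
        /* ... drive the lexer through tok->underflow here ... */
        _PyTokenizer_Free(tok);
        return 1;
    }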