/src/cpython/Objects/stringlib/codecs.h
Line | Count | Source (jump to first uncovered line) |
1 | | /* stringlib: codec implementations */ |
2 | | |
3 | | #if !STRINGLIB_IS_UNICODE |
4 | | # error "codecs.h is specific to Unicode" |
5 | | #endif |
6 | | |
7 | | #include "pycore_bitutils.h" // _Py_bswap32() |
8 | | |
9 | | /* Mask to quickly check whether a C 'size_t' contains a |
10 | | non-ASCII, UTF8-encoded char. */ |
11 | | #if (SIZEOF_SIZE_T == 8) |
12 | 330M | # define ASCII_CHAR_MASK 0x8080808080808080ULL |
13 | | #elif (SIZEOF_SIZE_T == 4) |
14 | | # define ASCII_CHAR_MASK 0x80808080U |
15 | | #else |
16 | | # error C 'size_t' size should be either 4 or 8! |
17 | | #endif |
18 | | |
19 | | /* 10xxxxxx */ |
20 | 102M | #define IS_CONTINUATION_BYTE(ch) ((ch) >= 0x80 && (ch) < 0xC0) |
21 | | |
22 | | Py_LOCAL_INLINE(Py_UCS4) |
23 | | STRINGLIB(utf8_decode)(const char **inptr, const char *end, |
24 | | STRINGLIB_CHAR *dest, |
25 | | Py_ssize_t *outpos) |
26 | 167M | { |
27 | 167M | Py_UCS4 ch; |
28 | 167M | const char *s = *inptr; |
29 | 167M | STRINGLIB_CHAR *p = dest + *outpos; |
30 | | |
31 | 332M | while (s < end) { |
32 | 332M | ch = (unsigned char)*s; |
33 | | |
34 | 332M | if (ch < 0x80) { |
35 | | /* Fast path for runs of ASCII characters. Given that common UTF-8 |
36 | | input will consist of an overwhelming majority of ASCII |
37 | | characters, we try to optimize for this case by checking |
38 | | as many characters as a C 'size_t' can contain. |
39 | | First, check if we can do an aligned read, as most CPUs have |
40 | | a penalty for unaligned reads. |
41 | | */ |
42 | 131M | if (_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) { |
43 | | /* Help register allocation */ |
44 | 17.1M | const char *_s = s; |
45 | 17.1M | STRINGLIB_CHAR *_p = p; |
46 | 330M | while (_s + SIZEOF_SIZE_T <= end) { |
47 | | /* Read a whole size_t at a time (either 4 or 8 bytes), |
48 | | and do a fast unrolled copy if it only contains ASCII |
49 | | characters. */ |
50 | 330M | size_t value = *(const size_t *) _s; |
51 | 330M | if (value & ASCII_CHAR_MASK) |
52 | 16.9M | break; |
53 | 313M | #if PY_LITTLE_ENDIAN |
54 | 313M | _p[0] = (STRINGLIB_CHAR)(value & 0xFFu); |
55 | 313M | _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); |
56 | 313M | _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); |
57 | 313M | _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); |
58 | 313M | # if SIZEOF_SIZE_T == 8 |
59 | 313M | _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); |
60 | 313M | _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); |
61 | 313M | _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); |
62 | 313M | _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); |
63 | 313M | # endif |
64 | | #else |
65 | | # if SIZEOF_SIZE_T == 8 |
66 | | _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); |
67 | | _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); |
68 | | _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); |
69 | | _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); |
70 | | _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); |
71 | | _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); |
72 | | _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); |
73 | | _p[7] = (STRINGLIB_CHAR)(value & 0xFFu); |
74 | | # else |
75 | | _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); |
76 | | _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); |
77 | | _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); |
78 | | _p[3] = (STRINGLIB_CHAR)(value & 0xFFu); |
79 | | # endif |
80 | | #endif |
81 | 313M | _s += SIZEOF_SIZE_T; |
82 | 313M | _p += SIZEOF_SIZE_T; |
83 | 313M | } |
84 | 17.1M | s = _s; |
85 | 17.1M | p = _p; |
86 | 17.1M | if (s == end) |
87 | 10.9k | break; |
88 | 17.0M | ch = (unsigned char)*s; |
89 | 17.0M | } |
90 | 131M | if (ch < 0x80) { |
91 | 130M | s++; |
92 | 130M | *p++ = ch; |
93 | 130M | continue; |
94 | 130M | } |
95 | 131M | } |
96 | | |
97 | 201M | if (ch < 0xE0) { |
98 | | /* \xC2\x80-\xDF\xBF -- 0080-07FF */ |
99 | 89.5M | Py_UCS4 ch2; |
100 | 89.5M | if (ch < 0xC2) { |
101 | | /* invalid sequence |
102 | | \x80-\xBF -- continuation byte |
103 | | \xC0-\xC1 -- fake 0000-007F */ |
104 | 64.5M | goto InvalidStart; |
105 | 64.5M | } |
106 | 25.0M | if (end - s < 2) { |
107 | | /* unexpected end of data: the caller will decide whether |
108 | | it's an error or not */ |
109 | 11.7k | break; |
110 | 11.7k | } |
111 | 25.0M | ch2 = (unsigned char)s[1]; |
112 | 25.0M | if (!IS_CONTINUATION_BYTE(ch2)) |
113 | | /* invalid continuation byte */ |
114 | 20.7M | goto InvalidContinuation1; |
115 | 4.27M | ch = (ch << 6) + ch2 - |
116 | 4.27M | ((0xC0 << 6) + 0x80); |
117 | 4.27M | assert ((ch > 0x007F) && (ch <= 0x07FF)); |
118 | 4.27M | s += 2; |
119 | 4.27M | if (STRINGLIB_MAX_CHAR <= 0x007F || |
120 | 4.27M | (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR)) |
121 | | /* Out-of-range */ |
122 | 86.1k | goto Return; |
123 | 4.18M | *p++ = ch; |
124 | 4.18M | continue; |
125 | 4.27M | } |
126 | | |
127 | 112M | if (ch < 0xF0) { |
128 | | /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */ |
129 | 40.3M | Py_UCS4 ch2, ch3; |
130 | 40.3M | if (end - s < 3) { |
131 | | /* unexpected end of data: the caller will decide whether |
132 | | it's an error or not */ |
133 | 12.1k | if (end - s < 2) |
134 | 4.65k | break; |
135 | 7.48k | ch2 = (unsigned char)s[1]; |
136 | 7.48k | if (!IS_CONTINUATION_BYTE(ch2) || |
137 | 7.48k | (ch2 < 0xA0 ? ch == 0xE0 : ch == 0xED)) |
138 | | /* for clarification see comments below */ |
139 | 5.09k | goto InvalidContinuation1; |
140 | 2.39k | break; |
141 | 7.48k | } |
142 | 40.3M | ch2 = (unsigned char)s[1]; |
143 | 40.3M | ch3 = (unsigned char)s[2]; |
144 | 40.3M | if (!IS_CONTINUATION_BYTE(ch2)) { |
145 | | /* invalid continuation byte */ |
146 | 9.17M | goto InvalidContinuation1; |
147 | 9.17M | } |
148 | 31.1M | if (ch == 0xE0) { |
149 | 92.3k | if (ch2 < 0xA0) |
150 | | /* invalid sequence |
151 | | \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */ |
152 | 38.8k | goto InvalidContinuation1; |
153 | 31.0M | } else if (ch == 0xED && ch2 >= 0xA0) { |
154 | | /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF |
155 | | will result in surrogates in range D800-DFFF. Surrogates are |
156 | | not valid UTF-8 so they are rejected. |
157 | | See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf |
158 | | (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ |
159 | 59.4k | goto InvalidContinuation1; |
160 | 59.4k | } |
161 | 31.0M | if (!IS_CONTINUATION_BYTE(ch3)) { |
162 | | /* invalid continuation byte */ |
163 | 785k | goto InvalidContinuation2; |
164 | 785k | } |
165 | 30.2M | ch = (ch << 12) + (ch2 << 6) + ch3 - |
166 | 30.2M | ((0xE0 << 12) + (0x80 << 6) + 0x80); |
167 | 30.2M | assert ((ch > 0x07FF) && (ch <= 0xFFFF)); |
168 | 30.2M | s += 3; |
169 | 30.2M | if (STRINGLIB_MAX_CHAR <= 0x07FF || |
170 | 30.2M | (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR)) |
171 | | /* Out-of-range */ |
172 | 188k | goto Return; |
173 | 30.0M | *p++ = ch; |
174 | 30.0M | continue; |
175 | 30.2M | } |
176 | | |
177 | 71.7M | if (ch < 0xF5) { |
178 | | /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */ |
179 | 4.16M | Py_UCS4 ch2, ch3, ch4; |
180 | 4.16M | if (end - s < 4) { |
181 | | /* unexpected end of data: the caller will decide whether |
182 | | it's an error or not */ |
183 | 19.2k | if (end - s < 2) |
184 | 5.51k | break; |
185 | 13.7k | ch2 = (unsigned char)s[1]; |
186 | 13.7k | if (!IS_CONTINUATION_BYTE(ch2) || |
187 | 13.7k | (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4)) |
188 | | /* for clarification see comments below */ |
189 | 9.31k | goto InvalidContinuation1; |
190 | 4.42k | if (end - s < 3) |
191 | 1.57k | break; |
192 | 2.84k | ch3 = (unsigned char)s[2]; |
193 | 2.84k | if (!IS_CONTINUATION_BYTE(ch3)) |
194 | 1.80k | goto InvalidContinuation2; |
195 | 1.04k | break; |
196 | 2.84k | } |
197 | 4.14M | ch2 = (unsigned char)s[1]; |
198 | 4.14M | ch3 = (unsigned char)s[2]; |
199 | 4.14M | ch4 = (unsigned char)s[3]; |
200 | 4.14M | if (!IS_CONTINUATION_BYTE(ch2)) { |
201 | | /* invalid continuation byte */ |
202 | 2.94M | goto InvalidContinuation1; |
203 | 2.94M | } |
204 | 1.19M | if (ch == 0xF0) { |
205 | 611k | if (ch2 < 0x90) |
206 | | /* invalid sequence |
207 | | \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */ |
208 | 38.8k | goto InvalidContinuation1; |
209 | 611k | } else if (ch == 0xF4 && ch2 >= 0x90) { |
210 | | /* invalid sequence |
211 | | \xF4\x90\x80\x80- -- 110000- overflow */ |
212 | 60.0k | goto InvalidContinuation1; |
213 | 60.0k | } |
214 | 1.09M | if (!IS_CONTINUATION_BYTE(ch3)) { |
215 | | /* invalid continuation byte */ |
216 | 315k | goto InvalidContinuation2; |
217 | 315k | } |
218 | 779k | if (!IS_CONTINUATION_BYTE(ch4)) { |
219 | | /* invalid continuation byte */ |
220 | 103k | goto InvalidContinuation3; |
221 | 103k | } |
222 | 676k | ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 - |
223 | 676k | ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80); |
224 | 676k | assert ((ch > 0xFFFF) && (ch <= 0x10FFFF)); |
225 | 676k | s += 4; |
226 | 676k | if (STRINGLIB_MAX_CHAR <= 0xFFFF || |
227 | 676k | (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR)) |
228 | | /* Out-of-range */ |
229 | 32.0k | goto Return; |
230 | 643k | *p++ = ch; |
231 | 643k | continue; |
232 | 676k | } |
233 | 67.5M | goto InvalidStart; |
234 | 71.7M | } |
235 | 345k | ch = 0; |
236 | 167M | Return: |
237 | 167M | *inptr = s; |
238 | 167M | *outpos = p - dest; |
239 | 167M | return ch; |
240 | 132M | InvalidStart: |
241 | 132M | ch = 1; |
242 | 132M | goto Return; |
243 | 33.0M | InvalidContinuation1: |
244 | 33.0M | ch = 2; |
245 | 33.0M | goto Return; |
246 | 1.10M | InvalidContinuation2: |
247 | 1.10M | ch = 3; |
248 | 1.10M | goto Return; |
249 | 103k | InvalidContinuation3: |
250 | 103k | ch = 4; |
251 | 103k | goto Return; |
252 | 345k | } unicodeobject.c:asciilib_utf8_decode Line | Count | Source | 26 | 318k | { | 27 | 318k | Py_UCS4 ch; | 28 | 318k | const char *s = *inptr; | 29 | 318k | STRINGLIB_CHAR *p = dest + *outpos; | 30 | | | 31 | 318k | while (s < end) { | 32 | 318k | ch = (unsigned char)*s; | 33 | | | 34 | 318k | if (ch < 0x80) { | 35 | | /* Fast path for runs of ASCII characters. Given that common UTF-8 | 36 | | input will consist of an overwhelming majority of ASCII | 37 | | characters, we try to optimize for this case by checking | 38 | | as many characters as a C 'size_t' can contain. | 39 | | First, check if we can do an aligned read, as most CPUs have | 40 | | a penalty for unaligned reads. | 41 | | */ | 42 | 0 | if (_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) { | 43 | | /* Help register allocation */ | 44 | 0 | const char *_s = s; | 45 | 0 | STRINGLIB_CHAR *_p = p; | 46 | 0 | while (_s + SIZEOF_SIZE_T <= end) { | 47 | | /* Read a whole size_t at a time (either 4 or 8 bytes), | 48 | | and do a fast unrolled copy if it only contains ASCII | 49 | | characters. 
*/ | 50 | 0 | size_t value = *(const size_t *) _s; | 51 | 0 | if (value & ASCII_CHAR_MASK) | 52 | 0 | break; | 53 | 0 | #if PY_LITTLE_ENDIAN | 54 | 0 | _p[0] = (STRINGLIB_CHAR)(value & 0xFFu); | 55 | 0 | _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 56 | 0 | _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 57 | 0 | _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 58 | 0 | # if SIZEOF_SIZE_T == 8 | 59 | 0 | _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 60 | 0 | _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 61 | 0 | _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 62 | 0 | _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 63 | 0 | # endif | 64 | | #else | 65 | | # if SIZEOF_SIZE_T == 8 | 66 | | _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 67 | | _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 68 | | _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 69 | | _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 70 | | _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 71 | | _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 72 | | _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 73 | | _p[7] = (STRINGLIB_CHAR)(value & 0xFFu); | 74 | | # else | 75 | | _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 76 | | _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 77 | | _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 78 | | _p[3] = (STRINGLIB_CHAR)(value & 0xFFu); | 79 | | # endif | 80 | | #endif | 81 | 0 | _s += SIZEOF_SIZE_T; | 82 | 0 | _p += SIZEOF_SIZE_T; | 83 | 0 | } | 84 | 0 | s = _s; | 85 | 0 | p = _p; | 86 | 0 | if (s == end) | 87 | 0 | break; | 88 | 0 | ch = (unsigned char)*s; | 89 | 0 | } | 90 | 0 | if (ch < 0x80) { | 91 | 0 | s++; | 92 | 0 | *p++ = ch; | 93 | 0 | continue; | 94 | 0 | } | 95 | 0 | } | 96 | | | 97 | 318k | if (ch < 0xE0) { | 98 | | /* \xC2\x80-\xDF\xBF -- 0080-07FF */ | 99 | 106k | Py_UCS4 ch2; | 100 | 106k | if (ch < 0xC2) { | 101 | | /* invalid sequence | 102 | | \x80-\xBF -- continuation byte | 103 | | \xC0-\xC1 -- 
fake 0000-007F */ | 104 | 15.4k | goto InvalidStart; | 105 | 15.4k | } | 106 | 91.2k | if (end - s < 2) { | 107 | | /* unexpected end of data: the caller will decide whether | 108 | | it's an error or not */ | 109 | 1.29k | break; | 110 | 1.29k | } | 111 | 89.9k | ch2 = (unsigned char)s[1]; | 112 | 89.9k | if (!IS_CONTINUATION_BYTE(ch2)) | 113 | | /* invalid continuation byte */ | 114 | 5.40k | goto InvalidContinuation1; | 115 | 84.5k | ch = (ch << 6) + ch2 - | 116 | 84.5k | ((0xC0 << 6) + 0x80); | 117 | 84.5k | assert ((ch > 0x007F) && (ch <= 0x07FF)); | 118 | 84.5k | s += 2; | 119 | 84.5k | if (STRINGLIB_MAX_CHAR <= 0x007F || | 120 | 84.5k | (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR)) | 121 | | /* Out-of-range */ | 122 | 84.5k | goto Return; | 123 | 0 | *p++ = ch; | 124 | 0 | continue; | 125 | 84.5k | } | 126 | | | 127 | 212k | if (ch < 0xF0) { | 128 | | /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */ | 129 | 180k | Py_UCS4 ch2, ch3; | 130 | 180k | if (end - s < 3) { | 131 | | /* unexpected end of data: the caller will decide whether | 132 | | it's an error or not */ | 133 | 2.66k | if (end - s < 2) | 134 | 1.11k | break; | 135 | 1.54k | ch2 = (unsigned char)s[1]; | 136 | 1.54k | if (!IS_CONTINUATION_BYTE(ch2) || | 137 | 1.54k | (ch2 < 0xA0 ? 
ch == 0xE0 : ch == 0xED)) | 138 | | /* for clarification see comments below */ | 139 | 1.02k | goto InvalidContinuation1; | 140 | 522 | break; | 141 | 1.54k | } | 142 | 177k | ch2 = (unsigned char)s[1]; | 143 | 177k | ch3 = (unsigned char)s[2]; | 144 | 177k | if (!IS_CONTINUATION_BYTE(ch2)) { | 145 | | /* invalid continuation byte */ | 146 | 3.18k | goto InvalidContinuation1; | 147 | 3.18k | } | 148 | 174k | if (ch == 0xE0) { | 149 | 1.10k | if (ch2 < 0xA0) | 150 | | /* invalid sequence | 151 | | \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */ | 152 | 287 | goto InvalidContinuation1; | 153 | 173k | } else if (ch == 0xED && ch2 >= 0xA0) { | 154 | | /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF | 155 | | will result in surrogates in range D800-DFFF. Surrogates are | 156 | | not valid UTF-8 so they are rejected. | 157 | | See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf | 158 | | (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ | 159 | 218 | goto InvalidContinuation1; | 160 | 218 | } | 161 | 173k | if (!IS_CONTINUATION_BYTE(ch3)) { | 162 | | /* invalid continuation byte */ | 163 | 2.41k | goto InvalidContinuation2; | 164 | 2.41k | } | 165 | 171k | ch = (ch << 12) + (ch2 << 6) + ch3 - | 166 | 171k | ((0xE0 << 12) + (0x80 << 6) + 0x80); | 167 | 171k | assert ((ch > 0x07FF) && (ch <= 0xFFFF)); | 168 | 171k | s += 3; | 169 | 171k | if (STRINGLIB_MAX_CHAR <= 0x07FF || | 170 | 171k | (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR)) | 171 | | /* Out-of-range */ | 172 | 171k | goto Return; | 173 | 0 | *p++ = ch; | 174 | 0 | continue; | 175 | 171k | } | 176 | | | 177 | 32.0k | if (ch < 0xF5) { | 178 | | /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */ | 179 | 22.3k | Py_UCS4 ch2, ch3, ch4; | 180 | 22.3k | if (end - s < 4) { | 181 | | /* unexpected end of data: the caller will decide whether | 182 | | it's an error or not */ | 183 | 7.81k | if (end - s < 2) | 184 | 2.87k | break; | 185 | 4.93k | ch2 = (unsigned char)s[1]; | 
186 | 4.93k | if (!IS_CONTINUATION_BYTE(ch2) || | 187 | 4.93k | (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4)) | 188 | | /* for clarification see comments below */ | 189 | 3.90k | goto InvalidContinuation1; | 190 | 1.03k | if (end - s < 3) | 191 | 435 | break; | 192 | 596 | ch3 = (unsigned char)s[2]; | 193 | 596 | if (!IS_CONTINUATION_BYTE(ch3)) | 194 | 475 | goto InvalidContinuation2; | 195 | 121 | break; | 196 | 596 | } | 197 | 14.5k | ch2 = (unsigned char)s[1]; | 198 | 14.5k | ch3 = (unsigned char)s[2]; | 199 | 14.5k | ch4 = (unsigned char)s[3]; | 200 | 14.5k | if (!IS_CONTINUATION_BYTE(ch2)) { | 201 | | /* invalid continuation byte */ | 202 | 2.66k | goto InvalidContinuation1; | 203 | 2.66k | } | 204 | 11.8k | if (ch == 0xF0) { | 205 | 2.45k | if (ch2 < 0x90) | 206 | | /* invalid sequence | 207 | | \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */ | 208 | 93 | goto InvalidContinuation1; | 209 | 9.44k | } else if (ch == 0xF4 && ch2 >= 0x90) { | 210 | | /* invalid sequence | 211 | | \xF4\x90\x80\x80- -- 110000- overflow */ | 212 | 483 | goto InvalidContinuation1; | 213 | 483 | } | 214 | 11.3k | if (!IS_CONTINUATION_BYTE(ch3)) { | 215 | | /* invalid continuation byte */ | 216 | 1.26k | goto InvalidContinuation2; | 217 | 1.26k | } | 218 | 10.0k | if (!IS_CONTINUATION_BYTE(ch4)) { | 219 | | /* invalid continuation byte */ | 220 | 449 | goto InvalidContinuation3; | 221 | 449 | } | 222 | 9.61k | ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 - | 223 | 9.61k | ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80); | 224 | 9.61k | assert ((ch > 0xFFFF) && (ch <= 0x10FFFF)); | 225 | 9.61k | s += 4; | 226 | 9.61k | if (STRINGLIB_MAX_CHAR <= 0xFFFF || | 227 | 9.61k | (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR)) | 228 | | /* Out-of-range */ | 229 | 9.61k | goto Return; | 230 | 0 | *p++ = ch; | 231 | 0 | continue; | 232 | 9.61k | } | 233 | 9.65k | goto InvalidStart; | 234 | 32.0k | } | 235 | 6.37k | ch = 0; | 236 | 318k | Return: | 237 | 318k | *inptr = s; | 238 
| 318k | *outpos = p - dest; | 239 | 318k | return ch; | 240 | 25.0k | InvalidStart: | 241 | 25.0k | ch = 1; | 242 | 25.0k | goto Return; | 243 | 17.2k | InvalidContinuation1: | 244 | 17.2k | ch = 2; | 245 | 17.2k | goto Return; | 246 | 4.15k | InvalidContinuation2: | 247 | 4.15k | ch = 3; | 248 | 4.15k | goto Return; | 249 | 449 | InvalidContinuation3: | 250 | 449 | ch = 4; | 251 | 449 | goto Return; | 252 | 6.37k | } |
unicodeobject.c:ucs1lib_utf8_decode Line | Count | Source | 26 | 95.6k | { | 27 | 95.6k | Py_UCS4 ch; | 28 | 95.6k | const char *s = *inptr; | 29 | 95.6k | STRINGLIB_CHAR *p = dest + *outpos; | 30 | | | 31 | 1.23M | while (s < end) { | 32 | 1.19M | ch = (unsigned char)*s; | 33 | | | 34 | 1.19M | if (ch < 0x80) { | 35 | | /* Fast path for runs of ASCII characters. Given that common UTF-8 | 36 | | input will consist of an overwhelming majority of ASCII | 37 | | characters, we try to optimize for this case by checking | 38 | | as many characters as a C 'size_t' can contain. | 39 | | First, check if we can do an aligned read, as most CPUs have | 40 | | a penalty for unaligned reads. | 41 | | */ | 42 | 809k | if (_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) { | 43 | | /* Help register allocation */ | 44 | 113k | const char *_s = s; | 45 | 113k | STRINGLIB_CHAR *_p = p; | 46 | 8.57M | while (_s + SIZEOF_SIZE_T <= end) { | 47 | | /* Read a whole size_t at a time (either 4 or 8 bytes), | 48 | | and do a fast unrolled copy if it only contains ASCII | 49 | | characters. 
*/ | 50 | 8.54M | size_t value = *(const size_t *) _s; | 51 | 8.54M | if (value & ASCII_CHAR_MASK) | 52 | 91.3k | break; | 53 | 8.45M | #if PY_LITTLE_ENDIAN | 54 | 8.45M | _p[0] = (STRINGLIB_CHAR)(value & 0xFFu); | 55 | 8.45M | _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 56 | 8.45M | _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 57 | 8.45M | _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 58 | 8.45M | # if SIZEOF_SIZE_T == 8 | 59 | 8.45M | _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 60 | 8.45M | _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 61 | 8.45M | _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 62 | 8.45M | _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 63 | 8.45M | # endif | 64 | | #else | 65 | | # if SIZEOF_SIZE_T == 8 | 66 | | _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 67 | | _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 68 | | _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 69 | | _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 70 | | _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 71 | | _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 72 | | _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 73 | | _p[7] = (STRINGLIB_CHAR)(value & 0xFFu); | 74 | | # else | 75 | | _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 76 | | _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 77 | | _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 78 | | _p[3] = (STRINGLIB_CHAR)(value & 0xFFu); | 79 | | # endif | 80 | | #endif | 81 | 8.45M | _s += SIZEOF_SIZE_T; | 82 | 8.45M | _p += SIZEOF_SIZE_T; | 83 | 8.45M | } | 84 | 113k | s = _s; | 85 | 113k | p = _p; | 86 | 113k | if (s == end) | 87 | 2.79k | break; | 88 | 110k | ch = (unsigned char)*s; | 89 | 110k | } | 90 | 807k | if (ch < 0x80) { | 91 | 782k | s++; | 92 | 782k | *p++ = ch; | 93 | 782k | continue; | 94 | 782k | } | 95 | 807k | } | 96 | | | 97 | 414k | if (ch < 0xE0) { | 98 | | /* \xC2\x80-\xDF\xBF -- 0080-07FF */ | 99 | 383k | Py_UCS4 ch2; | 100 | 383k | if (ch 
< 0xC2) { | 101 | | /* invalid sequence | 102 | | \x80-\xBF -- continuation byte | 103 | | \xC0-\xC1 -- fake 0000-007F */ | 104 | 2.36k | goto InvalidStart; | 105 | 2.36k | } | 106 | 381k | if (end - s < 2) { | 107 | | /* unexpected end of data: the caller will decide whether | 108 | | it's an error or not */ | 109 | 923 | break; | 110 | 923 | } | 111 | 380k | ch2 = (unsigned char)s[1]; | 112 | 380k | if (!IS_CONTINUATION_BYTE(ch2)) | 113 | | /* invalid continuation byte */ | 114 | 24.4k | goto InvalidContinuation1; | 115 | 356k | ch = (ch << 6) + ch2 - | 116 | 356k | ((0xC0 << 6) + 0x80); | 117 | 356k | assert ((ch > 0x007F) && (ch <= 0x07FF)); | 118 | 356k | s += 2; | 119 | 356k | if (STRINGLIB_MAX_CHAR <= 0x007F || | 120 | 356k | (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR)) | 121 | | /* Out-of-range */ | 122 | 1.55k | goto Return; | 123 | 354k | *p++ = ch; | 124 | 354k | continue; | 125 | 356k | } | 126 | | | 127 | 30.4k | if (ch < 0xF0) { | 128 | | /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */ | 129 | 21.3k | Py_UCS4 ch2, ch3; | 130 | 21.3k | if (end - s < 3) { | 131 | | /* unexpected end of data: the caller will decide whether | 132 | | it's an error or not */ | 133 | 1.69k | if (end - s < 2) | 134 | 365 | break; | 135 | 1.33k | ch2 = (unsigned char)s[1]; | 136 | 1.33k | if (!IS_CONTINUATION_BYTE(ch2) || | 137 | 1.33k | (ch2 < 0xA0 ? 
ch == 0xE0 : ch == 0xED)) | 138 | | /* for clarification see comments below */ | 139 | 881 | goto InvalidContinuation1; | 140 | 451 | break; | 141 | 1.33k | } | 142 | 19.6k | ch2 = (unsigned char)s[1]; | 143 | 19.6k | ch3 = (unsigned char)s[2]; | 144 | 19.6k | if (!IS_CONTINUATION_BYTE(ch2)) { | 145 | | /* invalid continuation byte */ | 146 | 1.26k | goto InvalidContinuation1; | 147 | 1.26k | } | 148 | 18.4k | if (ch == 0xE0) { | 149 | 626 | if (ch2 < 0xA0) | 150 | | /* invalid sequence | 151 | | \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */ | 152 | 165 | goto InvalidContinuation1; | 153 | 17.8k | } else if (ch == 0xED && ch2 >= 0xA0) { | 154 | | /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF | 155 | | will result in surrogates in range D800-DFFF. Surrogates are | 156 | | not valid UTF-8 so they are rejected. | 157 | | See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf | 158 | | (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ | 159 | 445 | goto InvalidContinuation1; | 160 | 445 | } | 161 | 17.8k | if (!IS_CONTINUATION_BYTE(ch3)) { | 162 | | /* invalid continuation byte */ | 163 | 545 | goto InvalidContinuation2; | 164 | 545 | } | 165 | 17.2k | ch = (ch << 12) + (ch2 << 6) + ch3 - | 166 | 17.2k | ((0xE0 << 12) + (0x80 << 6) + 0x80); | 167 | 17.2k | assert ((ch > 0x07FF) && (ch <= 0xFFFF)); | 168 | 17.2k | s += 3; | 169 | 17.2k | if (STRINGLIB_MAX_CHAR <= 0x07FF || | 170 | 17.2k | (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR)) | 171 | | /* Out-of-range */ | 172 | 17.2k | goto Return; | 173 | 0 | *p++ = ch; | 174 | 0 | continue; | 175 | 17.2k | } | 176 | | | 177 | 9.01k | if (ch < 0xF5) { | 178 | | /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */ | 179 | 8.37k | Py_UCS4 ch2, ch3, ch4; | 180 | 8.37k | if (end - s < 4) { | 181 | | /* unexpected end of data: the caller will decide whether | 182 | | it's an error or not */ | 183 | 1.65k | if (end - s < 2) | 184 | 311 | break; | 185 | 1.34k | ch2 = (unsigned 
char)s[1]; | 186 | 1.34k | if (!IS_CONTINUATION_BYTE(ch2) || | 187 | 1.34k | (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4)) | 188 | | /* for clarification see comments below */ | 189 | 937 | goto InvalidContinuation1; | 190 | 403 | if (end - s < 3) | 191 | 82 | break; | 192 | 321 | ch3 = (unsigned char)s[2]; | 193 | 321 | if (!IS_CONTINUATION_BYTE(ch3)) | 194 | 258 | goto InvalidContinuation2; | 195 | 63 | break; | 196 | 321 | } | 197 | 6.71k | ch2 = (unsigned char)s[1]; | 198 | 6.71k | ch3 = (unsigned char)s[2]; | 199 | 6.71k | ch4 = (unsigned char)s[3]; | 200 | 6.71k | if (!IS_CONTINUATION_BYTE(ch2)) { | 201 | | /* invalid continuation byte */ | 202 | 672 | goto InvalidContinuation1; | 203 | 672 | } | 204 | 6.04k | if (ch == 0xF0) { | 205 | 1.23k | if (ch2 < 0x90) | 206 | | /* invalid sequence | 207 | | \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */ | 208 | 132 | goto InvalidContinuation1; | 209 | 4.80k | } else if (ch == 0xF4 && ch2 >= 0x90) { | 210 | | /* invalid sequence | 211 | | \xF4\x90\x80\x80- -- 110000- overflow */ | 212 | 329 | goto InvalidContinuation1; | 213 | 329 | } | 214 | 5.58k | if (!IS_CONTINUATION_BYTE(ch3)) { | 215 | | /* invalid continuation byte */ | 216 | 2.25k | goto InvalidContinuation2; | 217 | 2.25k | } | 218 | 3.32k | if (!IS_CONTINUATION_BYTE(ch4)) { | 219 | | /* invalid continuation byte */ | 220 | 368 | goto InvalidContinuation3; | 221 | 368 | } | 222 | 2.96k | ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 - | 223 | 2.96k | ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80); | 224 | 2.96k | assert ((ch > 0xFFFF) && (ch <= 0x10FFFF)); | 225 | 2.96k | s += 4; | 226 | 2.96k | if (STRINGLIB_MAX_CHAR <= 0xFFFF || | 227 | 2.96k | (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR)) | 228 | | /* Out-of-range */ | 229 | 2.96k | goto Return; | 230 | 0 | *p++ = ch; | 231 | 0 | continue; | 232 | 2.96k | } | 233 | 645 | goto InvalidStart; | 234 | 9.01k | } | 235 | 38.1k | ch = 0; | 236 | 95.6k | Return: | 237 | 95.6k | *inptr = s; | 
238 | 95.6k | *outpos = p - dest; | 239 | 95.6k | return ch; | 240 | 3.00k | InvalidStart: | 241 | 3.00k | ch = 1; | 242 | 3.00k | goto Return; | 243 | 29.2k | InvalidContinuation1: | 244 | 29.2k | ch = 2; | 245 | 29.2k | goto Return; | 246 | 3.06k | InvalidContinuation2: | 247 | 3.06k | ch = 3; | 248 | 3.06k | goto Return; | 249 | 368 | InvalidContinuation3: | 250 | 368 | ch = 4; | 251 | 368 | goto Return; | 252 | 38.1k | } |
unicodeobject.c:ucs2lib_utf8_decode Line | Count | Source | 26 | 84.1M | { | 27 | 84.1M | Py_UCS4 ch; | 28 | 84.1M | const char *s = *inptr; | 29 | 84.1M | STRINGLIB_CHAR *p = dest + *outpos; | 30 | | | 31 | 156M | while (s < end) { | 32 | 156M | ch = (unsigned char)*s; | 33 | | | 34 | 156M | if (ch < 0x80) { | 35 | | /* Fast path for runs of ASCII characters. Given that common UTF-8 | 36 | | input will consist of an overwhelming majority of ASCII | 37 | | characters, we try to optimize for this case by checking | 38 | | as many characters as a C 'size_t' can contain. | 39 | | First, check if we can do an aligned read, as most CPUs have | 40 | | a penalty for unaligned reads. | 41 | | */ | 42 | 55.6M | if (_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) { | 43 | | /* Help register allocation */ | 44 | 7.44M | const char *_s = s; | 45 | 7.44M | STRINGLIB_CHAR *_p = p; | 46 | 176M | while (_s + SIZEOF_SIZE_T <= end) { | 47 | | /* Read a whole size_t at a time (either 4 or 8 bytes), | 48 | | and do a fast unrolled copy if it only contains ASCII | 49 | | characters. 
*/ | 50 | 176M | size_t value = *(const size_t *) _s; | 51 | 176M | if (value & ASCII_CHAR_MASK) | 52 | 7.35M | break; | 53 | 169M | #if PY_LITTLE_ENDIAN | 54 | 169M | _p[0] = (STRINGLIB_CHAR)(value & 0xFFu); | 55 | 169M | _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 56 | 169M | _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 57 | 169M | _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 58 | 169M | # if SIZEOF_SIZE_T == 8 | 59 | 169M | _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 60 | 169M | _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 61 | 169M | _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 62 | 169M | _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 63 | 169M | # endif | 64 | | #else | 65 | | # if SIZEOF_SIZE_T == 8 | 66 | | _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 67 | | _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 68 | | _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 69 | | _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 70 | | _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 71 | | _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 72 | | _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 73 | | _p[7] = (STRINGLIB_CHAR)(value & 0xFFu); | 74 | | # else | 75 | | _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 76 | | _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 77 | | _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 78 | | _p[3] = (STRINGLIB_CHAR)(value & 0xFFu); | 79 | | # endif | 80 | | #endif | 81 | 169M | _s += SIZEOF_SIZE_T; | 82 | 169M | _p += SIZEOF_SIZE_T; | 83 | 169M | } | 84 | 7.44M | s = _s; | 85 | 7.44M | p = _p; | 86 | 7.44M | if (s == end) | 87 | 5.64k | break; | 88 | 7.43M | ch = (unsigned char)*s; | 89 | 7.43M | } | 90 | 55.6M | if (ch < 0x80) { | 91 | 55.4M | s++; | 92 | 55.4M | *p++ = ch; | 93 | 55.4M | continue; | 94 | 55.4M | } | 95 | 55.6M | } | 96 | | | 97 | 100M | if (ch < 0xE0) { | 98 | | /* \xC2\x80-\xDF\xBF -- 0080-07FF */ | 99 | 42.7M | Py_UCS4 ch2; | 100 | 42.7M | if (ch < 
0xC2) { | 101 | | /* invalid sequence | 102 | | \x80-\xBF -- continuation byte | 103 | | \xC0-\xC1 -- fake 0000-007F */ | 104 | 30.3M | goto InvalidStart; | 105 | 30.3M | } | 106 | 12.3M | if (end - s < 2) { | 107 | | /* unexpected end of data: the caller will decide whether | 108 | | it's an error or not */ | 109 | 8.24k | break; | 110 | 8.24k | } | 111 | 12.3M | ch2 = (unsigned char)s[1]; | 112 | 12.3M | if (!IS_CONTINUATION_BYTE(ch2)) | 113 | | /* invalid continuation byte */ | 114 | 11.2M | goto InvalidContinuation1; | 115 | 1.06M | ch = (ch << 6) + ch2 - | 116 | 1.06M | ((0xC0 << 6) + 0x80); | 117 | 1.06M | assert ((ch > 0x007F) && (ch <= 0x07FF)); | 118 | 1.06M | s += 2; | 119 | 1.06M | if (STRINGLIB_MAX_CHAR <= 0x007F || | 120 | 1.06M | (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR)) | 121 | | /* Out-of-range */ | 122 | 0 | goto Return; | 123 | 1.06M | *p++ = ch; | 124 | 1.06M | continue; | 125 | 1.06M | } | 126 | | | 127 | 58.1M | if (ch < 0xF0) { | 128 | | /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */ | 129 | 20.7M | Py_UCS4 ch2, ch3; | 130 | 20.7M | if (end - s < 3) { | 131 | | /* unexpected end of data: the caller will decide whether | 132 | | it's an error or not */ | 133 | 4.26k | if (end - s < 2) | 134 | 1.92k | break; | 135 | 2.33k | ch2 = (unsigned char)s[1]; | 136 | 2.33k | if (!IS_CONTINUATION_BYTE(ch2) || | 137 | 2.33k | (ch2 < 0xA0 ? 
ch == 0xE0 : ch == 0xED)) | 138 | | /* for clarification see comments below */ | 139 | 1.82k | goto InvalidContinuation1; | 140 | 508 | break; | 141 | 2.33k | } | 142 | 20.7M | ch2 = (unsigned char)s[1]; | 143 | 20.7M | ch3 = (unsigned char)s[2]; | 144 | 20.7M | if (!IS_CONTINUATION_BYTE(ch2)) { | 145 | | /* invalid continuation byte */ | 146 | 4.71M | goto InvalidContinuation1; | 147 | 4.71M | } | 148 | 16.0M | if (ch == 0xE0) { | 149 | 21.5k | if (ch2 < 0xA0) | 150 | | /* invalid sequence | 151 | | \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */ | 152 | 7.16k | goto InvalidContinuation1; | 153 | 15.9M | } else if (ch == 0xED && ch2 >= 0xA0) { | 154 | | /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF | 155 | | will result in surrogates in range D800-DFFF. Surrogates are | 156 | | not valid UTF-8 so they are rejected. | 157 | | See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf | 158 | | (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ | 159 | 7.66k | goto InvalidContinuation1; | 160 | 7.66k | } | 161 | 15.9M | if (!IS_CONTINUATION_BYTE(ch3)) { | 162 | | /* invalid continuation byte */ | 163 | 94.1k | goto InvalidContinuation2; | 164 | 94.1k | } | 165 | 15.8M | ch = (ch << 12) + (ch2 << 6) + ch3 - | 166 | 15.8M | ((0xE0 << 12) + (0x80 << 6) + 0x80); | 167 | 15.8M | assert ((ch > 0x07FF) && (ch <= 0xFFFF)); | 168 | 15.8M | s += 3; | 169 | 15.8M | if (STRINGLIB_MAX_CHAR <= 0x07FF || | 170 | 15.8M | (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR)) | 171 | | /* Out-of-range */ | 172 | 0 | goto Return; | 173 | 15.8M | *p++ = ch; | 174 | 15.8M | continue; | 175 | 15.8M | } | 176 | | | 177 | 37.4M | if (ch < 0xF5) { | 178 | | /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */ | 179 | 956k | Py_UCS4 ch2, ch3, ch4; | 180 | 956k | if (end - s < 4) { | 181 | | /* unexpected end of data: the caller will decide whether | 182 | | it's an error or not */ | 183 | 6.09k | if (end - s < 2) | 184 | 1.60k | break; | 185 | 4.48k | ch2 = 
(unsigned char)s[1]; | 186 | 4.48k | if (!IS_CONTINUATION_BYTE(ch2) || | 187 | 4.48k | (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4)) | 188 | | /* for clarification see comments below */ | 189 | 2.73k | goto InvalidContinuation1; | 190 | 1.75k | if (end - s < 3) | 191 | 531 | break; | 192 | 1.22k | ch3 = (unsigned char)s[2]; | 193 | 1.22k | if (!IS_CONTINUATION_BYTE(ch3)) | 194 | 547 | goto InvalidContinuation2; | 195 | 678 | break; | 196 | 1.22k | } | 197 | 950k | ch2 = (unsigned char)s[1]; | 198 | 950k | ch3 = (unsigned char)s[2]; | 199 | 950k | ch4 = (unsigned char)s[3]; | 200 | 950k | if (!IS_CONTINUATION_BYTE(ch2)) { | 201 | | /* invalid continuation byte */ | 202 | 846k | goto InvalidContinuation1; | 203 | 846k | } | 204 | 103k | if (ch == 0xF0) { | 205 | 20.0k | if (ch2 < 0x90) | 206 | | /* invalid sequence | 207 | | \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */ | 208 | 5.16k | goto InvalidContinuation1; | 209 | 83.7k | } else if (ch == 0xF4 && ch2 >= 0x90) { | 210 | | /* invalid sequence | 211 | | \xF4\x90\x80\x80- -- 110000- overflow */ | 212 | 8.79k | goto InvalidContinuation1; | 213 | 8.79k | } | 214 | 89.8k | if (!IS_CONTINUATION_BYTE(ch3)) { | 215 | | /* invalid continuation byte */ | 216 | 56.6k | goto InvalidContinuation2; | 217 | 56.6k | } | 218 | 33.2k | if (!IS_CONTINUATION_BYTE(ch4)) { | 219 | | /* invalid continuation byte */ | 220 | 13.7k | goto InvalidContinuation3; | 221 | 13.7k | } | 222 | 19.5k | ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 - | 223 | 19.5k | ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80); | 224 | 19.5k | assert ((ch > 0xFFFF) && (ch <= 0x10FFFF)); | 225 | 19.5k | s += 4; | 226 | 19.5k | if (STRINGLIB_MAX_CHAR <= 0xFFFF || | 227 | 19.5k | (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR)) | 228 | | /* Out-of-range */ | 229 | 19.5k | goto Return; | 230 | 0 | *p++ = ch; | 231 | 0 | continue; | 232 | 19.5k | } | 233 | 36.4M | goto InvalidStart; | 234 | 37.4M | } | 235 | 259k | ch = 0; | 236 | 84.1M | Return: 
| 237 | 84.1M | *inptr = s; | 238 | 84.1M | *outpos = p - dest; | 239 | 84.1M | return ch; | 240 | 66.7M | InvalidStart: | 241 | 66.7M | ch = 1; | 242 | 66.7M | goto Return; | 243 | 16.8M | InvalidContinuation1: | 244 | 16.8M | ch = 2; | 245 | 16.8M | goto Return; | 246 | 151k | InvalidContinuation2: | 247 | 151k | ch = 3; | 248 | 151k | goto Return; | 249 | 13.7k | InvalidContinuation3: | 250 | 13.7k | ch = 4; | 251 | 13.7k | goto Return; | 252 | 259k | } |
unicodeobject.c:ucs4lib_utf8_decode Line | Count | Source | 26 | 82.5M | { | 27 | 82.5M | Py_UCS4 ch; | 28 | 82.5M | const char *s = *inptr; | 29 | 82.5M | STRINGLIB_CHAR *p = dest + *outpos; | 30 | | | 31 | 174M | while (s < end) { | 32 | 174M | ch = (unsigned char)*s; | 33 | | | 34 | 174M | if (ch < 0x80) { | 35 | | /* Fast path for runs of ASCII characters. Given that common UTF-8 | 36 | | input will consist of an overwhelming majority of ASCII | 37 | | characters, we try to optimize for this case by checking | 38 | | as many characters as a C 'size_t' can contain. | 39 | | First, check if we can do an aligned read, as most CPUs have | 40 | | a penalty for unaligned reads. | 41 | | */ | 42 | 74.6M | if (_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) { | 43 | | /* Help register allocation */ | 44 | 9.55M | const char *_s = s; | 45 | 9.55M | STRINGLIB_CHAR *_p = p; | 46 | 145M | while (_s + SIZEOF_SIZE_T <= end) { | 47 | | /* Read a whole size_t at a time (either 4 or 8 bytes), | 48 | | and do a fast unrolled copy if it only contains ASCII | 49 | | characters. 
*/ | 50 | 145M | size_t value = *(const size_t *) _s; | 51 | 145M | if (value & ASCII_CHAR_MASK) | 52 | 9.52M | break; | 53 | 136M | #if PY_LITTLE_ENDIAN | 54 | 136M | _p[0] = (STRINGLIB_CHAR)(value & 0xFFu); | 55 | 136M | _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 56 | 136M | _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 57 | 136M | _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 58 | 136M | # if SIZEOF_SIZE_T == 8 | 59 | 136M | _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 60 | 136M | _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 61 | 136M | _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 62 | 136M | _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 63 | 136M | # endif | 64 | | #else | 65 | | # if SIZEOF_SIZE_T == 8 | 66 | | _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 67 | | _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 68 | | _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 69 | | _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 70 | | _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 71 | | _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 72 | | _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 73 | | _p[7] = (STRINGLIB_CHAR)(value & 0xFFu); | 74 | | # else | 75 | | _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 76 | | _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 77 | | _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 78 | | _p[3] = (STRINGLIB_CHAR)(value & 0xFFu); | 79 | | # endif | 80 | | #endif | 81 | 136M | _s += SIZEOF_SIZE_T; | 82 | 136M | _p += SIZEOF_SIZE_T; | 83 | 136M | } | 84 | 9.55M | s = _s; | 85 | 9.55M | p = _p; | 86 | 9.55M | if (s == end) | 87 | 2.53k | break; | 88 | 9.54M | ch = (unsigned char)*s; | 89 | 9.54M | } | 90 | 74.6M | if (ch < 0x80) { | 91 | 74.3M | s++; | 92 | 74.3M | *p++ = ch; | 93 | 74.3M | continue; | 94 | 74.3M | } | 95 | 74.6M | } | 96 | | | 97 | 100M | if (ch < 0xE0) { | 98 | | /* \xC2\x80-\xDF\xBF -- 0080-07FF */ | 99 | 46.3M | Py_UCS4 ch2; | 100 | 46.3M | if (ch < 
0xC2) { | 101 | | /* invalid sequence | 102 | | \x80-\xBF -- continuation byte | 103 | | \xC0-\xC1 -- fake 0000-007F */ | 104 | 34.1M | goto InvalidStart; | 105 | 34.1M | } | 106 | 12.2M | if (end - s < 2) { | 107 | | /* unexpected end of data: the caller will decide whether | 108 | | it's an error or not */ | 109 | 1.31k | break; | 110 | 1.31k | } | 111 | 12.1M | ch2 = (unsigned char)s[1]; | 112 | 12.1M | if (!IS_CONTINUATION_BYTE(ch2)) | 113 | | /* invalid continuation byte */ | 114 | 9.43M | goto InvalidContinuation1; | 115 | 2.76M | ch = (ch << 6) + ch2 - | 116 | 2.76M | ((0xC0 << 6) + 0x80); | 117 | 2.76M | assert ((ch > 0x007F) && (ch <= 0x07FF)); | 118 | 2.76M | s += 2; | 119 | 2.76M | if (STRINGLIB_MAX_CHAR <= 0x007F || | 120 | 2.76M | (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR)) | 121 | | /* Out-of-range */ | 122 | 0 | goto Return; | 123 | 2.76M | *p++ = ch; | 124 | 2.76M | continue; | 125 | 2.76M | } | 126 | | | 127 | 53.6M | if (ch < 0xF0) { | 128 | | /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */ | 129 | 19.3M | Py_UCS4 ch2, ch3; | 130 | 19.3M | if (end - s < 3) { | 131 | | /* unexpected end of data: the caller will decide whether | 132 | | it's an error or not */ | 133 | 3.50k | if (end - s < 2) | 134 | 1.24k | break; | 135 | 2.26k | ch2 = (unsigned char)s[1]; | 136 | 2.26k | if (!IS_CONTINUATION_BYTE(ch2) || | 137 | 2.26k | (ch2 < 0xA0 ? 
ch == 0xE0 : ch == 0xED)) | 138 | | /* for clarification see comments below */ | 139 | 1.35k | goto InvalidContinuation1; | 140 | 909 | break; | 141 | 2.26k | } | 142 | 19.3M | ch2 = (unsigned char)s[1]; | 143 | 19.3M | ch3 = (unsigned char)s[2]; | 144 | 19.3M | if (!IS_CONTINUATION_BYTE(ch2)) { | 145 | | /* invalid continuation byte */ | 146 | 4.45M | goto InvalidContinuation1; | 147 | 4.45M | } | 148 | 14.9M | if (ch == 0xE0) { | 149 | 69.0k | if (ch2 < 0xA0) | 150 | | /* invalid sequence | 151 | | \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */ | 152 | 31.1k | goto InvalidContinuation1; | 153 | 14.8M | } else if (ch == 0xED && ch2 >= 0xA0) { | 154 | | /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF | 155 | | will result in surrogates in range D800-DFFF. Surrogates are | 156 | | not valid UTF-8 so they are rejected. | 157 | | See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf | 158 | | (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ | 159 | 51.1k | goto InvalidContinuation1; | 160 | 51.1k | } | 161 | 14.8M | if (!IS_CONTINUATION_BYTE(ch3)) { | 162 | | /* invalid continuation byte */ | 163 | 688k | goto InvalidContinuation2; | 164 | 688k | } | 165 | 14.1M | ch = (ch << 12) + (ch2 << 6) + ch3 - | 166 | 14.1M | ((0xE0 << 12) + (0x80 << 6) + 0x80); | 167 | 14.1M | assert ((ch > 0x07FF) && (ch <= 0xFFFF)); | 168 | 14.1M | s += 3; | 169 | 14.1M | if (STRINGLIB_MAX_CHAR <= 0x07FF || | 170 | 14.1M | (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR)) | 171 | | /* Out-of-range */ | 172 | 0 | goto Return; | 173 | 14.1M | *p++ = ch; | 174 | 14.1M | continue; | 175 | 14.1M | } | 176 | | | 177 | 34.2M | if (ch < 0xF5) { | 178 | | /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */ | 179 | 3.17M | Py_UCS4 ch2, ch3, ch4; | 180 | 3.17M | if (end - s < 4) { | 181 | | /* unexpected end of data: the caller will decide whether | 182 | | it's an error or not */ | 183 | 3.68k | if (end - s < 2) | 184 | 716 | break; | 185 | 2.96k | ch2 = 
(unsigned char)s[1]; | 186 | 2.96k | if (!IS_CONTINUATION_BYTE(ch2) || | 187 | 2.96k | (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4)) | 188 | | /* for clarification see comments below */ | 189 | 1.73k | goto InvalidContinuation1; | 190 | 1.23k | if (end - s < 3) | 191 | 528 | break; | 192 | 705 | ch3 = (unsigned char)s[2]; | 193 | 705 | if (!IS_CONTINUATION_BYTE(ch3)) | 194 | 524 | goto InvalidContinuation2; | 195 | 181 | break; | 196 | 705 | } | 197 | 3.17M | ch2 = (unsigned char)s[1]; | 198 | 3.17M | ch3 = (unsigned char)s[2]; | 199 | 3.17M | ch4 = (unsigned char)s[3]; | 200 | 3.17M | if (!IS_CONTINUATION_BYTE(ch2)) { | 201 | | /* invalid continuation byte */ | 202 | 2.09M | goto InvalidContinuation1; | 203 | 2.09M | } | 204 | 1.07M | if (ch == 0xF0) { | 205 | 587k | if (ch2 < 0x90) | 206 | | /* invalid sequence | 207 | | \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */ | 208 | 33.4k | goto InvalidContinuation1; | 209 | 587k | } else if (ch == 0xF4 && ch2 >= 0x90) { | 210 | | /* invalid sequence | 211 | | \xF4\x90\x80\x80- -- 110000- overflow */ | 212 | 50.4k | goto InvalidContinuation1; | 213 | 50.4k | } | 214 | 988k | if (!IS_CONTINUATION_BYTE(ch3)) { | 215 | | /* invalid continuation byte */ | 216 | 255k | goto InvalidContinuation2; | 217 | 255k | } | 218 | 733k | if (!IS_CONTINUATION_BYTE(ch4)) { | 219 | | /* invalid continuation byte */ | 220 | 89.1k | goto InvalidContinuation3; | 221 | 89.1k | } | 222 | 643k | ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 - | 223 | 643k | ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80); | 224 | 643k | assert ((ch > 0xFFFF) && (ch <= 0x10FFFF)); | 225 | 643k | s += 4; | 226 | 643k | if (STRINGLIB_MAX_CHAR <= 0xFFFF || | 227 | 643k | (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR)) | 228 | | /* Out-of-range */ | 229 | 0 | goto Return; | 230 | 643k | *p++ = ch; | 231 | 643k | continue; | 232 | 643k | } | 233 | 31.1M | goto InvalidStart; | 234 | 34.2M | } | 235 | 41.1k | ch = 0; | 236 | 82.5M | Return: | 237 | 
82.5M | *inptr = s; | 238 | 82.5M | *outpos = p - dest; | 239 | 82.5M | return ch; | 240 | 65.3M | InvalidStart: | 241 | 65.3M | ch = 1; | 242 | 65.3M | goto Return; | 243 | 16.1M | InvalidContinuation1: | 244 | 16.1M | ch = 2; | 245 | 16.1M | goto Return; | 246 | 944k | InvalidContinuation2: | 247 | 944k | ch = 3; | 248 | 944k | goto Return; | 249 | 89.1k | InvalidContinuation3: | 250 | 89.1k | ch = 4; | 251 | 89.1k | goto Return; | 252 | 41.1k | } |
|
253 | | |
254 | | #undef ASCII_CHAR_MASK |
255 | | |
256 | | |
257 | | /* UTF-8 encoder specialized for a Unicode kind to avoid the slow |
258 | | PyUnicode_READ() macro. Parts of the code are compiled out depending on the |
259 | | kind: UCS-1 strings do not need to handle surrogates for example. |
| | |
| | Encodes the `size` code units at `data` as UTF-8 into `writer`. |
| | |
| | writer: initialised here with _PyBytesWriter_Init(); size * max_char_size |
| | bytes are reserved up front, so the plain encoding branches write |
| | through `p` without further size checks. |
| | unicode: only handed to the error-handling helpers (backslashreplace, |
| | xmlcharrefreplace, unicode_encode_call_errorhandler, |
| | raise_encode_exception); presumably the str object that `data` |
| | belongs to -- confirm against the caller. |
| | data, size: the code units to encode and their count (size >= 0 is asserted). |
| | error_handler: handler applied to surrogates; _Py_ERROR_UNKNOWN means it is |
| | looked up from `errors` when the first surrogate is met. |
| | errors: handler name, passed to _Py_GetErrorHandler() and to the generic |
| | error-handler call. |
| | |
| | Returns the next free byte in the writer buffer on success, or NULL when |
| | size * max_char_size would overflow (PyErr_NoMemory() is called), when a |
| | writer allocation fails, or when error handling fails. The writer is not |
| | released here on failure. */ |
260 | | Py_LOCAL_INLINE(char *) |
261 | | STRINGLIB(utf8_encoder)(_PyBytesWriter *writer, |
262 | | PyObject *unicode, |
263 | | const STRINGLIB_CHAR *data, |
264 | | Py_ssize_t size, |
265 | | _Py_error_handler error_handler, |
266 | | const char *errors) |
267 | 6.56M | { |
268 | 6.56M | Py_ssize_t i; /* index into data of next input character */ |
269 | 6.56M | char *p; /* next free byte in output buffer */ |
270 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
271 | | PyObject *error_handler_obj = NULL; |
272 | | PyObject *exc = NULL; |
273 | | PyObject *rep = NULL; |
274 | | #endif |
275 | | #if STRINGLIB_SIZEOF_CHAR == 1 |
276 | | const Py_ssize_t max_char_size = 2; |
277 | | #elif STRINGLIB_SIZEOF_CHAR == 2 |
278 | | const Py_ssize_t max_char_size = 3; |
279 | | #else /* STRINGLIB_SIZEOF_CHAR == 4 */ |
280 | | const Py_ssize_t max_char_size = 4; |
281 | | #endif |
282 | | /* max_char_size is the largest number of bytes any branch below writes for one code unit of this kind; it sizes the up-front allocation. */ |
283 | 6.56M | assert(size >= 0); |
284 | 6.56M | if (size > PY_SSIZE_T_MAX / max_char_size) { |
285 | | /* integer overflow: size * max_char_size would not fit in Py_ssize_t */ |
286 | 0 | PyErr_NoMemory(); |
287 | 0 | return NULL; |
288 | 0 | } |
289 | | |
290 | 6.56M | _PyBytesWriter_Init(writer); |
291 | 6.56M | p = _PyBytesWriter_Alloc(writer, size * max_char_size); |
292 | 6.56M | if (p == NULL) |
293 | 0 | return NULL; |
294 | | /* No increment clause: i is advanced by data[i++] and by the surrogate paths, which may skip a whole run or jump to the position chosen by an error handler. */ |
295 | 2.89G | for (i = 0; i < size;) { |
296 | 2.89G | Py_UCS4 ch = data[i++]; |
297 | | |
298 | 2.89G | if (ch < 0x80) { |
299 | | /* Encode ASCII: one byte, copied as is */ |
300 | 2.76G | *p++ = (char) ch; |
301 | | |
302 | 2.76G | } |
303 | 47.3M | else |
304 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
305 | 47.3M | if (ch < 0x0800) |
306 | 861k | #endif |
307 | 86.1M | { |
308 | | /* Two-byte sequence for U+0080..U+07FF. For the 1-byte kind the test above is compiled out, so this branch takes every non-ASCII (Latin-1) character. */ |
309 | 86.1M | *p++ = (char)(0xc0 | (ch >> 6)); |
310 | 86.1M | *p++ = (char)(0x80 | (ch & 0x3f)); |
311 | 86.1M | } |
312 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
313 | 46.5M | else if (Py_UNICODE_IS_SURROGATE(ch)) { |
314 | 369k | Py_ssize_t startpos, endpos, newpos; |
315 | 369k | Py_ssize_t k; |
316 | 369k | if (error_handler == _Py_ERROR_UNKNOWN) { /* resolve the handler from its name only once a surrogate is actually seen */ |
317 | 202k | error_handler = _Py_GetErrorHandler(errors); |
318 | 202k | } |
319 | | |
320 | 369k | startpos = i-1; /* i already points one past ch */ |
321 | 369k | endpos = startpos+1; |
322 | | /* Extend [startpos, endpos) over every directly following surrogate so the whole run is handled in one go. */ |
323 | 15.2M | while ((endpos < size) && Py_UNICODE_IS_SURROGATE(data[endpos])) |
324 | 14.9M | endpos++; |
325 | | |
326 | | /* Only overallocate the buffer if it's not the last write */ |
327 | 369k | writer->overallocate = (endpos < size); |
328 | | |
329 | 369k | switch (error_handler) |
330 | 369k | { |
331 | 0 | case _Py_ERROR_REPLACE: /* one '?' per surrogate in the run, then share the skip with IGNORE */ |
332 | 0 | memset(p, '?', endpos - startpos); |
333 | 0 | p += (endpos - startpos); |
334 | 0 | _Py_FALLTHROUGH; |
335 | 0 | case _Py_ERROR_IGNORE: |
336 | 0 | i += (endpos - startpos - 1); /* i == startpos + 1 here, so this leaves i == endpos */ |
337 | 0 | break; |
338 | | |
339 | 0 | case _Py_ERROR_SURROGATEPASS: /* emit each surrogate as an ordinary three-byte sequence */ |
340 | 0 | for (k=startpos; k<endpos; k++) { |
341 | 0 | ch = data[k]; |
342 | 0 | *p++ = (char)(0xe0 | (ch >> 12)); |
343 | 0 | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); |
344 | 0 | *p++ = (char)(0x80 | (ch & 0x3f)); |
345 | 0 | } |
346 | 0 | i += (endpos - startpos - 1); |
347 | 0 | break; |
348 | | |
349 | 0 | case _Py_ERROR_BACKSLASHREPLACE: |
350 | | /* subtract preallocated bytes (the helper receives the writer; presumably it reserves its own space -- confirm in backslashreplace) */ |
351 | 0 | writer->min_size -= max_char_size * (endpos - startpos); |
352 | 0 | p = backslashreplace(writer, p, |
353 | 0 | unicode, startpos, endpos); |
354 | 0 | if (p == NULL) |
355 | 0 | goto error; |
356 | 0 | i += (endpos - startpos - 1); |
357 | 0 | break; |
358 | | |
359 | 0 | case _Py_ERROR_XMLCHARREFREPLACE: |
360 | | /* subtract preallocated bytes (same bookkeeping as for backslashreplace above) */ |
361 | 0 | writer->min_size -= max_char_size * (endpos - startpos); |
362 | 0 | p = xmlcharrefreplace(writer, p, |
363 | 0 | unicode, startpos, endpos); |
364 | 0 | if (p == NULL) |
365 | 0 | goto error; |
366 | 0 | i += (endpos - startpos - 1); |
367 | 0 | break; |
368 | | |
369 | 220k | case _Py_ERROR_SURROGATEESCAPE: /* U+DC80..U+DCFF are written back as the single bytes 0x80..0xFF */ |
370 | 11.1M | for (k=startpos; k<endpos; k++) { |
371 | 10.9M | ch = data[k]; |
372 | 10.9M | if (!(0xDC80 <= ch && ch <= 0xDCFF)) |
373 | 22 | break; |
374 | 10.9M | *p++ = (char)(ch & 0xff); |
375 | 10.9M | } |
376 | 220k | if (k >= endpos) { /* the whole run was escapable */ |
377 | 220k | i += (endpos - startpos - 1); |
378 | 220k | break; |
379 | 220k | } |
380 | 22 | startpos = k; /* the rest of the run, [k, endpos), goes to the generic handler below */ |
381 | 22 | assert(startpos < endpos); |
382 | 22 | _Py_FALLTHROUGH; |
383 | 148k | default: /* generic path: call the error handler; it yields a replacement object and the position (newpos) at which to resume */ |
384 | 148k | rep = unicode_encode_call_errorhandler( |
385 | 148k | errors, &error_handler_obj, "utf-8", "surrogates not allowed", |
386 | 148k | unicode, &exc, startpos, endpos, &newpos); |
387 | 148k | if (!rep) |
388 | 148k | goto error; |
389 | | |
390 | 0 | if (newpos < startpos) { /* the handler moved backwards: those characters get encoded again, so reserve room for them */ |
391 | 0 | writer->overallocate = 1; |
392 | 0 | p = _PyBytesWriter_Prepare(writer, p, |
393 | 0 | max_char_size * (startpos - newpos)); |
394 | 0 | if (p == NULL) |
395 | 0 | goto error; |
396 | 0 | } |
397 | 0 | else { |
398 | | /* subtract preallocated bytes */ |
399 | 0 | writer->min_size -= max_char_size * (newpos - startpos); |
400 | | /* Only overallocate the buffer if it's not the last write */ |
401 | 0 | writer->overallocate = (newpos < size); |
402 | 0 | } |
403 | | |
404 | 0 | if (PyBytes_Check(rep)) { /* a bytes replacement is copied to the output unchanged */ |
405 | 0 | p = _PyBytesWriter_WriteBytes(writer, p, |
406 | 0 | PyBytes_AS_STRING(rep), |
407 | 0 | PyBytes_GET_SIZE(rep)); |
408 | 0 | } |
409 | 0 | else { |
410 | | /* rep is unicode: only an ASCII replacement is accepted, anything else raises the encode error */ |
411 | 0 | if (!PyUnicode_IS_ASCII(rep)) { |
412 | 0 | raise_encode_exception(&exc, "utf-8", unicode, |
413 | 0 | startpos, endpos, |
414 | 0 | "surrogates not allowed"); |
415 | 0 | goto error; |
416 | 0 | } |
417 | | /* ASCII replacement: its character data is written out as bytes, one per character */ |
418 | 0 | p = _PyBytesWriter_WriteBytes(writer, p, |
419 | 0 | PyUnicode_DATA(rep), |
420 | 0 | PyUnicode_GET_LENGTH(rep)); |
421 | 0 | } |
422 | | |
423 | 0 | if (p == NULL) |
424 | 0 | goto error; |
425 | 0 | Py_CLEAR(rep); |
426 | |
|
427 | 0 | i = newpos; /* resume where the handler asked */ |
428 | 369k | } |
429 | | |
430 | | /* If overallocation was disabled, ensure that it was the last |
431 | | write. Otherwise, we missed an optimization */ |
432 | 220k | assert(writer->overallocate || i == size); |
433 | 220k | } |
434 | 20.4M | else |
435 | | #if STRINGLIB_SIZEOF_CHAR > 2 |
436 | 20.4M | if (ch < 0x10000) |
437 | 20.3M | #endif |
438 | 46.0M | { /* three-byte sequence; for the 2-byte kind the test above is compiled out and this is the final branch */ |
439 | 46.0M | *p++ = (char)(0xe0 | (ch >> 12)); |
440 | 46.0M | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); |
441 | 46.0M | *p++ = (char)(0x80 | (ch & 0x3f)); |
442 | 46.0M | } |
443 | | #if STRINGLIB_SIZEOF_CHAR > 2 |
444 | | else /* ch >= 0x10000 */ |
445 | 130k | { |
446 | 130k | assert(ch <= MAX_UNICODE); |
447 | | /* Encode UCS4 Unicode ordinals as a four-byte sequence */ |
448 | 130k | *p++ = (char)(0xf0 | (ch >> 18)); |
449 | 130k | *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); |
450 | 130k | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); |
451 | 130k | *p++ = (char)(0x80 | (ch & 0x3f)); |
452 | 130k | } |
453 | | #endif /* STRINGLIB_SIZEOF_CHAR > 2 */ |
454 | | #endif /* STRINGLIB_SIZEOF_CHAR > 1 */ |
455 | 2.89G | } |
456 | | /* Success: drop the references the error-handling path may have acquired and return the write position. */ |
457 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
458 | 1.42M | Py_XDECREF(error_handler_obj); |
459 | 1.42M | Py_XDECREF(exc); |
460 | | #endif |
461 | 1.42M | return p; |
462 | | /* The error label exists only for kinds wider than one byte; the 1-byte kind has no surrogate handling and never jumps here. */ |
463 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
464 | 148k | error: |
465 | 148k | Py_XDECREF(rep); |
466 | 148k | Py_XDECREF(error_handler_obj); |
467 | 148k | Py_XDECREF(exc); |
468 | 148k | return NULL; |
469 | | #endif |
470 | 1.57M | } unicodeobject.c:ucs1lib_utf8_encoder Line | Count | Source | 267 | 4.98M | { | 268 | 4.98M | Py_ssize_t i; /* index into data of next input character */ | 269 | 4.98M | char *p; /* next free byte in output buffer */ | 270 | | #if STRINGLIB_SIZEOF_CHAR > 1 | 271 | | PyObject *error_handler_obj = NULL; | 272 | | PyObject *exc = NULL; | 273 | | PyObject *rep = NULL; | 274 | | #endif | 275 | 4.98M | #if STRINGLIB_SIZEOF_CHAR == 1 | 276 | 4.98M | const Py_ssize_t max_char_size = 2; | 277 | | #elif STRINGLIB_SIZEOF_CHAR == 2 | 278 | | const Py_ssize_t max_char_size = 3; | 279 | | #else /* STRINGLIB_SIZEOF_CHAR == 4 */ | 280 | | const Py_ssize_t max_char_size = 4; | 281 | | #endif | 282 | | | 283 | 4.98M | assert(size >= 0); | 284 | 4.98M | if (size > PY_SSIZE_T_MAX / max_char_size) { | 285 | | /* integer overflow */ | 286 | 0 | PyErr_NoMemory(); | 287 | 0 | return NULL; | 288 | 0 | } | 289 | | | 290 | 4.98M | _PyBytesWriter_Init(writer); | 291 | 4.98M | p = _PyBytesWriter_Alloc(writer, size * max_char_size); | 292 | 4.98M | if (p == NULL) | 293 | 0 | return NULL; | 294 | | | 295 | 622M | for (i = 0; i < size;) { | 296 | 617M | Py_UCS4 ch = data[i++]; | 297 | | | 298 | 617M | if (ch < 0x80) { | 299 | | /* Encode ASCII */ | 300 | 532M | *p++ = (char) ch; | 301 | | | 302 | 532M | } | 303 | 85.2M | else | 304 | | #if STRINGLIB_SIZEOF_CHAR > 1 | 305 | | if (ch < 0x0800) | 306 | | #endif | 307 | 85.2M | { | 308 | | /* Encode Latin-1 */ | 309 | 85.2M | *p++ = (char)(0xc0 | (ch >> 6)); | 310 | 85.2M | *p++ = (char)(0x80 | (ch & 0x3f)); | 311 | 85.2M | } | 312 | | #if STRINGLIB_SIZEOF_CHAR > 1 | 313 | | else if (Py_UNICODE_IS_SURROGATE(ch)) { | 314 | | Py_ssize_t startpos, endpos, newpos; | 315 | | Py_ssize_t k; | 316 | | if (error_handler == _Py_ERROR_UNKNOWN) { | 317 | | error_handler = _Py_GetErrorHandler(errors); | 318 | | } | 319 | | | 320 | | startpos = i-1; | 321 | | endpos = startpos+1; | 322 | | | 323 | | while ((endpos < size) &&
Py_UNICODE_IS_SURROGATE(data[endpos])) | 324 | | endpos++; | 325 | | | 326 | | /* Only overallocate the buffer if it's not the last write */ | 327 | | writer->overallocate = (endpos < size); | 328 | | | 329 | | switch (error_handler) | 330 | | { | 331 | | case _Py_ERROR_REPLACE: | 332 | | memset(p, '?', endpos - startpos); | 333 | | p += (endpos - startpos); | 334 | | _Py_FALLTHROUGH; | 335 | | case _Py_ERROR_IGNORE: | 336 | | i += (endpos - startpos - 1); | 337 | | break; | 338 | | | 339 | | case _Py_ERROR_SURROGATEPASS: | 340 | | for (k=startpos; k<endpos; k++) { | 341 | | ch = data[k]; | 342 | | *p++ = (char)(0xe0 | (ch >> 12)); | 343 | | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 344 | | *p++ = (char)(0x80 | (ch & 0x3f)); | 345 | | } | 346 | | i += (endpos - startpos - 1); | 347 | | break; | 348 | | | 349 | | case _Py_ERROR_BACKSLASHREPLACE: | 350 | | /* subtract preallocated bytes */ | 351 | | writer->min_size -= max_char_size * (endpos - startpos); | 352 | | p = backslashreplace(writer, p, | 353 | | unicode, startpos, endpos); | 354 | | if (p == NULL) | 355 | | goto error; | 356 | | i += (endpos - startpos - 1); | 357 | | break; | 358 | | | 359 | | case _Py_ERROR_XMLCHARREFREPLACE: | 360 | | /* subtract preallocated bytes */ | 361 | | writer->min_size -= max_char_size * (endpos - startpos); | 362 | | p = xmlcharrefreplace(writer, p, | 363 | | unicode, startpos, endpos); | 364 | | if (p == NULL) | 365 | | goto error; | 366 | | i += (endpos - startpos - 1); | 367 | | break; | 368 | | | 369 | | case _Py_ERROR_SURROGATEESCAPE: | 370 | | for (k=startpos; k<endpos; k++) { | 371 | | ch = data[k]; | 372 | | if (!(0xDC80 <= ch && ch <= 0xDCFF)) | 373 | | break; | 374 | | *p++ = (char)(ch & 0xff); | 375 | | } | 376 | | if (k >= endpos) { | 377 | | i += (endpos - startpos - 1); | 378 | | break; | 379 | | } | 380 | | startpos = k; | 381 | | assert(startpos < endpos); | 382 | | _Py_FALLTHROUGH; | 383 | | default: | 384 | | rep = unicode_encode_call_errorhandler( | 385 | 
| errors, &error_handler_obj, "utf-8", "surrogates not allowed", | 386 | | unicode, &exc, startpos, endpos, &newpos); | 387 | | if (!rep) | 388 | | goto error; | 389 | | | 390 | | if (newpos < startpos) { | 391 | | writer->overallocate = 1; | 392 | | p = _PyBytesWriter_Prepare(writer, p, | 393 | | max_char_size * (startpos - newpos)); | 394 | | if (p == NULL) | 395 | | goto error; | 396 | | } | 397 | | else { | 398 | | /* subtract preallocated bytes */ | 399 | | writer->min_size -= max_char_size * (newpos - startpos); | 400 | | /* Only overallocate the buffer if it's not the last write */ | 401 | | writer->overallocate = (newpos < size); | 402 | | } | 403 | | | 404 | | if (PyBytes_Check(rep)) { | 405 | | p = _PyBytesWriter_WriteBytes(writer, p, | 406 | | PyBytes_AS_STRING(rep), | 407 | | PyBytes_GET_SIZE(rep)); | 408 | | } | 409 | | else { | 410 | | /* rep is unicode */ | 411 | | if (!PyUnicode_IS_ASCII(rep)) { | 412 | | raise_encode_exception(&exc, "utf-8", unicode, | 413 | | startpos, endpos, | 414 | | "surrogates not allowed"); | 415 | | goto error; | 416 | | } | 417 | | | 418 | | p = _PyBytesWriter_WriteBytes(writer, p, | 419 | | PyUnicode_DATA(rep), | 420 | | PyUnicode_GET_LENGTH(rep)); | 421 | | } | 422 | | | 423 | | if (p == NULL) | 424 | | goto error; | 425 | | Py_CLEAR(rep); | 426 | | | 427 | | i = newpos; | 428 | | } | 429 | | | 430 | | /* If overallocation was disabled, ensure that it was the last | 431 | | write. 
Otherwise, we missed an optimization */ | 432 | | assert(writer->overallocate || i == size); | 433 | | } | 434 | | else | 435 | | #if STRINGLIB_SIZEOF_CHAR > 2 | 436 | | if (ch < 0x10000) | 437 | | #endif | 438 | | { | 439 | | *p++ = (char)(0xe0 | (ch >> 12)); | 440 | | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 441 | | *p++ = (char)(0x80 | (ch & 0x3f)); | 442 | | } | 443 | | #if STRINGLIB_SIZEOF_CHAR > 2 | 444 | | else /* ch >= 0x10000 */ | 445 | | { | 446 | | assert(ch <= MAX_UNICODE); | 447 | | /* Encode UCS4 Unicode ordinals */ | 448 | | *p++ = (char)(0xf0 | (ch >> 18)); | 449 | | *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); | 450 | | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 451 | | *p++ = (char)(0x80 | (ch & 0x3f)); | 452 | | } | 453 | | #endif /* STRINGLIB_SIZEOF_CHAR > 2 */ | 454 | | #endif /* STRINGLIB_SIZEOF_CHAR > 1 */ | 455 | 617M | } | 456 | | | 457 | | #if STRINGLIB_SIZEOF_CHAR > 1 | 458 | | Py_XDECREF(error_handler_obj); | 459 | | Py_XDECREF(exc); | 460 | | #endif | 461 | 4.98M | return p; | 462 | | | 463 | | #if STRINGLIB_SIZEOF_CHAR > 1 | 464 | | error: | 465 | | Py_XDECREF(rep); | 466 | | Py_XDECREF(error_handler_obj); | 467 | | Py_XDECREF(exc); | 468 | | return NULL; | 469 | | #endif | 470 | 4.98M | } |
unicodeobject.c:ucs2lib_utf8_encoder Line | Count | Source | 267 | 1.50M | { | 268 | 1.50M | Py_ssize_t i; /* index into data of next input character */ | 269 | 1.50M | char *p; /* next free byte in output buffer */ | 270 | 1.50M | #if STRINGLIB_SIZEOF_CHAR > 1 | 271 | 1.50M | PyObject *error_handler_obj = NULL; | 272 | 1.50M | PyObject *exc = NULL; | 273 | 1.50M | PyObject *rep = NULL; | 274 | 1.50M | #endif | 275 | | #if STRINGLIB_SIZEOF_CHAR == 1 | 276 | | const Py_ssize_t max_char_size = 2; | 277 | | #elif STRINGLIB_SIZEOF_CHAR == 2 | 278 | | const Py_ssize_t max_char_size = 3; | 279 | | #else /* STRINGLIB_SIZEOF_CHAR == 4 */ | 280 | | const Py_ssize_t max_char_size = 4; | 281 | | #endif | 282 | | | 283 | 1.50M | assert(size >= 0); | 284 | 1.50M | if (size > PY_SSIZE_T_MAX / max_char_size) { | 285 | | /* integer overflow */ | 286 | 0 | PyErr_NoMemory(); | 287 | 0 | return NULL; | 288 | 0 | } | 289 | | | 290 | 1.50M | _PyBytesWriter_Init(writer); | 291 | 1.50M | p = _PyBytesWriter_Alloc(writer, size * max_char_size); | 292 | 1.50M | if (p == NULL) | 293 | 0 | return NULL; | 294 | | | 295 | 1.03G | for (i = 0; i < size;) { | 296 | 1.03G | Py_UCS4 ch = data[i++]; | 297 | | | 298 | 1.03G | if (ch < 0x80) { | 299 | | /* Encode ASCII */ | 300 | 1.00G | *p++ = (char) ch; | 301 | | | 302 | 1.00G | } | 303 | 26.3M | else | 304 | 26.3M | #if STRINGLIB_SIZEOF_CHAR > 1 | 305 | 26.3M | if (ch < 0x0800) | 306 | 289k | #endif | 307 | 289k | { | 308 | | /* Encode Latin-1 */ | 309 | 289k | *p++ = (char)(0xc0 | (ch >> 6)); | 310 | 289k | *p++ = (char)(0x80 | (ch & 0x3f)); | 311 | 289k | } | 312 | 26.0M | #if STRINGLIB_SIZEOF_CHAR > 1 | 313 | 26.0M | else if (Py_UNICODE_IS_SURROGATE(ch)) { | 314 | 351k | Py_ssize_t startpos, endpos, newpos; | 315 | 351k | Py_ssize_t k; | 316 | 351k | if (error_handler == _Py_ERROR_UNKNOWN) { | 317 | 196k | error_handler = _Py_GetErrorHandler(errors); | 318 | 196k | } | 319 | | | 320 | 351k | startpos = i-1; | 321 | 351k | endpos = startpos+1; | 
322 | | | 323 | 15.1M | while ((endpos < size) && Py_UNICODE_IS_SURROGATE(data[endpos])) | 324 | 14.8M | endpos++; | 325 | | | 326 | | /* Only overallocate the buffer if it's not the last write */ | 327 | 351k | writer->overallocate = (endpos < size); | 328 | | | 329 | 351k | switch (error_handler) | 330 | 351k | { | 331 | 0 | case _Py_ERROR_REPLACE: | 332 | 0 | memset(p, '?', endpos - startpos); | 333 | 0 | p += (endpos - startpos); | 334 | 0 | _Py_FALLTHROUGH; | 335 | 0 | case _Py_ERROR_IGNORE: | 336 | 0 | i += (endpos - startpos - 1); | 337 | 0 | break; | 338 | | | 339 | 0 | case _Py_ERROR_SURROGATEPASS: | 340 | 0 | for (k=startpos; k<endpos; k++) { | 341 | 0 | ch = data[k]; | 342 | 0 | *p++ = (char)(0xe0 | (ch >> 12)); | 343 | 0 | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 344 | 0 | *p++ = (char)(0x80 | (ch & 0x3f)); | 345 | 0 | } | 346 | 0 | i += (endpos - startpos - 1); | 347 | 0 | break; | 348 | | | 349 | 0 | case _Py_ERROR_BACKSLASHREPLACE: | 350 | | /* subtract preallocated bytes */ | 351 | 0 | writer->min_size -= max_char_size * (endpos - startpos); | 352 | 0 | p = backslashreplace(writer, p, | 353 | 0 | unicode, startpos, endpos); | 354 | 0 | if (p == NULL) | 355 | 0 | goto error; | 356 | 0 | i += (endpos - startpos - 1); | 357 | 0 | break; | 358 | | | 359 | 0 | case _Py_ERROR_XMLCHARREFREPLACE: | 360 | | /* subtract preallocated bytes */ | 361 | 0 | writer->min_size -= max_char_size * (endpos - startpos); | 362 | 0 | p = xmlcharrefreplace(writer, p, | 363 | 0 | unicode, startpos, endpos); | 364 | 0 | if (p == NULL) | 365 | 0 | goto error; | 366 | 0 | i += (endpos - startpos - 1); | 367 | 0 | break; | 368 | | | 369 | 206k | case _Py_ERROR_SURROGATEESCAPE: | 370 | 11.0M | for (k=startpos; k<endpos; k++) { | 371 | 10.8M | ch = data[k]; | 372 | 10.8M | if (!(0xDC80 <= ch && ch <= 0xDCFF)) | 373 | 14 | break; | 374 | 10.8M | *p++ = (char)(ch & 0xff); | 375 | 10.8M | } | 376 | 206k | if (k >= endpos) { | 377 | 206k | i += (endpos - startpos - 1); | 378 | 
206k | break; | 379 | 206k | } | 380 | 14 | startpos = k; | 381 | 14 | assert(startpos < endpos); | 382 | 14 | _Py_FALLTHROUGH; | 383 | 144k | default: | 384 | 144k | rep = unicode_encode_call_errorhandler( | 385 | 144k | errors, &error_handler_obj, "utf-8", "surrogates not allowed", | 386 | 144k | unicode, &exc, startpos, endpos, &newpos); | 387 | 144k | if (!rep) | 388 | 144k | goto error; | 389 | | | 390 | 0 | if (newpos < startpos) { | 391 | 0 | writer->overallocate = 1; | 392 | 0 | p = _PyBytesWriter_Prepare(writer, p, | 393 | 0 | max_char_size * (startpos - newpos)); | 394 | 0 | if (p == NULL) | 395 | 0 | goto error; | 396 | 0 | } | 397 | 0 | else { | 398 | | /* subtract preallocated bytes */ | 399 | 0 | writer->min_size -= max_char_size * (newpos - startpos); | 400 | | /* Only overallocate the buffer if it's not the last write */ | 401 | 0 | writer->overallocate = (newpos < size); | 402 | 0 | } | 403 | | | 404 | 0 | if (PyBytes_Check(rep)) { | 405 | 0 | p = _PyBytesWriter_WriteBytes(writer, p, | 406 | 0 | PyBytes_AS_STRING(rep), | 407 | 0 | PyBytes_GET_SIZE(rep)); | 408 | 0 | } | 409 | 0 | else { | 410 | | /* rep is unicode */ | 411 | 0 | if (!PyUnicode_IS_ASCII(rep)) { | 412 | 0 | raise_encode_exception(&exc, "utf-8", unicode, | 413 | 0 | startpos, endpos, | 414 | 0 | "surrogates not allowed"); | 415 | 0 | goto error; | 416 | 0 | } | 417 | | | 418 | 0 | p = _PyBytesWriter_WriteBytes(writer, p, | 419 | 0 | PyUnicode_DATA(rep), | 420 | 0 | PyUnicode_GET_LENGTH(rep)); | 421 | 0 | } | 422 | | | 423 | 0 | if (p == NULL) | 424 | 0 | goto error; | 425 | 0 | Py_CLEAR(rep); | 426 | |
| 427 | 0 | i = newpos; | 428 | 351k | } | 429 | | | 430 | | /* If overallocation was disabled, ensure that it was the last | 431 | | write. Otherwise, we missed an optimization */ | 432 | 206k | assert(writer->overallocate || i == size); | 433 | 206k | } | 434 | 25.7M | else | 435 | | #if STRINGLIB_SIZEOF_CHAR > 2 | 436 | | if (ch < 0x10000) | 437 | | #endif | 438 | 25.7M | { | 439 | 25.7M | *p++ = (char)(0xe0 | (ch >> 12)); | 440 | 25.7M | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 441 | 25.7M | *p++ = (char)(0x80 | (ch & 0x3f)); | 442 | 25.7M | } | 443 | | #if STRINGLIB_SIZEOF_CHAR > 2 | 444 | | else /* ch >= 0x10000 */ | 445 | | { | 446 | | assert(ch <= MAX_UNICODE); | 447 | | /* Encode UCS4 Unicode ordinals */ | 448 | | *p++ = (char)(0xf0 | (ch >> 18)); | 449 | | *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); | 450 | | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 451 | | *p++ = (char)(0x80 | (ch & 0x3f)); | 452 | | } | 453 | | #endif /* STRINGLIB_SIZEOF_CHAR > 2 */ | 454 | 1.03G | #endif /* STRINGLIB_SIZEOF_CHAR > 1 */ | 455 | 1.03G | } | 456 | | | 457 | 1.36M | #if STRINGLIB_SIZEOF_CHAR > 1 | 458 | 1.36M | Py_XDECREF(error_handler_obj); | 459 | 1.36M | Py_XDECREF(exc); | 460 | 1.36M | #endif | 461 | 1.36M | return p; | 462 | | | 463 | 0 | #if STRINGLIB_SIZEOF_CHAR > 1 | 464 | 144k | error: | 465 | 144k | Py_XDECREF(rep); | 466 | 144k | Py_XDECREF(error_handler_obj); | 467 | 144k | Py_XDECREF(exc); | 468 | 144k | return NULL; | 469 | 1.50M | #endif | 470 | 1.50M | } |
unicodeobject.c:ucs4lib_utf8_encoder Line | Count | Source | 267 | 66.5k | { | 268 | 66.5k | Py_ssize_t i; /* index into data of next input character */ | 269 | 66.5k | char *p; /* next free byte in output buffer */ | 270 | 66.5k | #if STRINGLIB_SIZEOF_CHAR > 1 | 271 | 66.5k | PyObject *error_handler_obj = NULL; | 272 | 66.5k | PyObject *exc = NULL; | 273 | 66.5k | PyObject *rep = NULL; | 274 | 66.5k | #endif | 275 | | #if STRINGLIB_SIZEOF_CHAR == 1 | 276 | | const Py_ssize_t max_char_size = 2; | 277 | | #elif STRINGLIB_SIZEOF_CHAR == 2 | 278 | | const Py_ssize_t max_char_size = 3; | 279 | | #else /* STRINGLIB_SIZEOF_CHAR == 4 */ | 280 | 66.5k | const Py_ssize_t max_char_size = 4; | 281 | 66.5k | #endif | 282 | | | 283 | 66.5k | assert(size >= 0); | 284 | 66.5k | if (size > PY_SSIZE_T_MAX / max_char_size) { | 285 | | /* integer overflow */ | 286 | 0 | PyErr_NoMemory(); | 287 | 0 | return NULL; | 288 | 0 | } | 289 | | | 290 | 66.5k | _PyBytesWriter_Init(writer); | 291 | 66.5k | p = _PyBytesWriter_Alloc(writer, size * max_char_size); | 292 | 66.5k | if (p == NULL) | 293 | 0 | return NULL; | 294 | | | 295 | 1.24G | for (i = 0; i < size;) { | 296 | 1.24G | Py_UCS4 ch = data[i++]; | 297 | | | 298 | 1.24G | if (ch < 0x80) { | 299 | | /* Encode ASCII */ | 300 | 1.22G | *p++ = (char) ch; | 301 | | | 302 | 1.22G | } | 303 | 21.0M | else | 304 | 21.0M | #if STRINGLIB_SIZEOF_CHAR > 1 | 305 | 21.0M | if (ch < 0x0800) | 306 | 571k | #endif | 307 | 571k | { | 308 | | /* Encode Latin-1 */ | 309 | 571k | *p++ = (char)(0xc0 | (ch >> 6)); | 310 | 571k | *p++ = (char)(0x80 | (ch & 0x3f)); | 311 | 571k | } | 312 | 20.4M | #if STRINGLIB_SIZEOF_CHAR > 1 | 313 | 20.4M | else if (Py_UNICODE_IS_SURROGATE(ch)) { | 314 | 17.6k | Py_ssize_t startpos, endpos, newpos; | 315 | 17.6k | Py_ssize_t k; | 316 | 17.6k | if (error_handler == _Py_ERROR_UNKNOWN) { | 317 | 5.97k | error_handler = _Py_GetErrorHandler(errors); | 318 | 5.97k | } | 319 | | | 320 | 17.6k | startpos = i-1; | 321 | 17.6k | 
endpos = startpos+1; | 322 | | | 323 | 115k | while ((endpos < size) && Py_UNICODE_IS_SURROGATE(data[endpos])) | 324 | 97.9k | endpos++; | 325 | | | 326 | | /* Only overallocate the buffer if it's not the last write */ | 327 | 17.6k | writer->overallocate = (endpos < size); | 328 | | | 329 | 17.6k | switch (error_handler) | 330 | 17.6k | { | 331 | 0 | case _Py_ERROR_REPLACE: | 332 | 0 | memset(p, '?', endpos - startpos); | 333 | 0 | p += (endpos - startpos); | 334 | 0 | _Py_FALLTHROUGH; | 335 | 0 | case _Py_ERROR_IGNORE: | 336 | 0 | i += (endpos - startpos - 1); | 337 | 0 | break; | 338 | | | 339 | 0 | case _Py_ERROR_SURROGATEPASS: | 340 | 0 | for (k=startpos; k<endpos; k++) { | 341 | 0 | ch = data[k]; | 342 | 0 | *p++ = (char)(0xe0 | (ch >> 12)); | 343 | 0 | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 344 | 0 | *p++ = (char)(0x80 | (ch & 0x3f)); | 345 | 0 | } | 346 | 0 | i += (endpos - startpos - 1); | 347 | 0 | break; | 348 | | | 349 | 0 | case _Py_ERROR_BACKSLASHREPLACE: | 350 | | /* subtract preallocated bytes */ | 351 | 0 | writer->min_size -= max_char_size * (endpos - startpos); | 352 | 0 | p = backslashreplace(writer, p, | 353 | 0 | unicode, startpos, endpos); | 354 | 0 | if (p == NULL) | 355 | 0 | goto error; | 356 | 0 | i += (endpos - startpos - 1); | 357 | 0 | break; | 358 | | | 359 | 0 | case _Py_ERROR_XMLCHARREFREPLACE: | 360 | | /* subtract preallocated bytes */ | 361 | 0 | writer->min_size -= max_char_size * (endpos - startpos); | 362 | 0 | p = xmlcharrefreplace(writer, p, | 363 | 0 | unicode, startpos, endpos); | 364 | 0 | if (p == NULL) | 365 | 0 | goto error; | 366 | 0 | i += (endpos - startpos - 1); | 367 | 0 | break; | 368 | | | 369 | 13.8k | case _Py_ERROR_SURROGATEESCAPE: | 370 | 120k | for (k=startpos; k<endpos; k++) { | 371 | 106k | ch = data[k]; | 372 | 106k | if (!(0xDC80 <= ch && ch <= 0xDCFF)) | 373 | 8 | break; | 374 | 106k | *p++ = (char)(ch & 0xff); | 375 | 106k | } | 376 | 13.8k | if (k >= endpos) { | 377 | 13.8k | i += (endpos - 
startpos - 1); | 378 | 13.8k | break; | 379 | 13.8k | } | 380 | 8 | startpos = k; | 381 | 8 | assert(startpos < endpos); | 382 | 8 | _Py_FALLTHROUGH; | 383 | 3.85k | default: | 384 | 3.85k | rep = unicode_encode_call_errorhandler( | 385 | 3.85k | errors, &error_handler_obj, "utf-8", "surrogates not allowed", | 386 | 3.85k | unicode, &exc, startpos, endpos, &newpos); | 387 | 3.85k | if (!rep) | 388 | 3.85k | goto error; | 389 | | | 390 | 0 | if (newpos < startpos) { | 391 | 0 | writer->overallocate = 1; | 392 | 0 | p = _PyBytesWriter_Prepare(writer, p, | 393 | 0 | max_char_size * (startpos - newpos)); | 394 | 0 | if (p == NULL) | 395 | 0 | goto error; | 396 | 0 | } | 397 | 0 | else { | 398 | | /* subtract preallocated bytes */ | 399 | 0 | writer->min_size -= max_char_size * (newpos - startpos); | 400 | | /* Only overallocate the buffer if it's not the last write */ | 401 | 0 | writer->overallocate = (newpos < size); | 402 | 0 | } | 403 | | | 404 | 0 | if (PyBytes_Check(rep)) { | 405 | 0 | p = _PyBytesWriter_WriteBytes(writer, p, | 406 | 0 | PyBytes_AS_STRING(rep), | 407 | 0 | PyBytes_GET_SIZE(rep)); | 408 | 0 | } | 409 | 0 | else { | 410 | | /* rep is unicode */ | 411 | 0 | if (!PyUnicode_IS_ASCII(rep)) { | 412 | 0 | raise_encode_exception(&exc, "utf-8", unicode, | 413 | 0 | startpos, endpos, | 414 | 0 | "surrogates not allowed"); | 415 | 0 | goto error; | 416 | 0 | } | 417 | | | 418 | 0 | p = _PyBytesWriter_WriteBytes(writer, p, | 419 | 0 | PyUnicode_DATA(rep), | 420 | 0 | PyUnicode_GET_LENGTH(rep)); | 421 | 0 | } | 422 | | | 423 | 0 | if (p == NULL) | 424 | 0 | goto error; | 425 | 0 | Py_CLEAR(rep); | 426 | |
| 427 | 0 | i = newpos; | 428 | 17.6k | } | 429 | | | 430 | | /* If overallocation was disabled, ensure that it was the last | 431 | | write. Otherwise, we missed an optimization */ | 432 | 13.8k | assert(writer->overallocate || i == size); | 433 | 13.8k | } | 434 | 20.4M | else | 435 | 20.4M | #if STRINGLIB_SIZEOF_CHAR > 2 | 436 | 20.4M | if (ch < 0x10000) | 437 | 20.3M | #endif | 438 | 20.3M | { | 439 | 20.3M | *p++ = (char)(0xe0 | (ch >> 12)); | 440 | 20.3M | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 441 | 20.3M | *p++ = (char)(0x80 | (ch & 0x3f)); | 442 | 20.3M | } | 443 | 130k | #if STRINGLIB_SIZEOF_CHAR > 2 | 444 | 130k | else /* ch >= 0x10000 */ | 445 | 130k | { | 446 | 130k | assert(ch <= MAX_UNICODE); | 447 | | /* Encode UCS4 Unicode ordinals */ | 448 | 130k | *p++ = (char)(0xf0 | (ch >> 18)); | 449 | 130k | *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); | 450 | 130k | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 451 | 130k | *p++ = (char)(0x80 | (ch & 0x3f)); | 452 | 130k | } | 453 | 1.24G | #endif /* STRINGLIB_SIZEOF_CHAR > 2 */ | 454 | 1.24G | #endif /* STRINGLIB_SIZEOF_CHAR > 1 */ | 455 | 1.24G | } | 456 | | | 457 | 62.7k | #if STRINGLIB_SIZEOF_CHAR > 1 | 458 | 62.7k | Py_XDECREF(error_handler_obj); | 459 | 62.7k | Py_XDECREF(exc); | 460 | 62.7k | #endif | 461 | 62.7k | return p; | 462 | | | 463 | 0 | #if STRINGLIB_SIZEOF_CHAR > 1 | 464 | 3.85k | error: | 465 | 3.85k | Py_XDECREF(rep); | 466 | 3.85k | Py_XDECREF(error_handler_obj); | 467 | 3.85k | Py_XDECREF(exc); | 468 | 3.85k | return NULL; | 469 | 66.5k | #endif | 470 | 66.5k | } |
Unexecuted instantiation: unicodeobject.c:asciilib_utf8_encoder |
471 | | |
472 | | /* The pattern for constructing UCS2-repeated masks. */ |
473 | | #if SIZEOF_LONG == 8 |
474 | 401k | # define UCS2_REPEAT_MASK 0x0001000100010001ul |
475 | | #elif SIZEOF_LONG == 4 |
476 | | # define UCS2_REPEAT_MASK 0x00010001ul |
477 | | #else |
478 | | # error C 'long' size should be either 4 or 8! |
479 | | #endif |
480 | | |
481 | | /* The mask for fast checking. */ |
482 | | #if STRINGLIB_SIZEOF_CHAR == 1 |
483 | | /* The mask for fast checking of whether a C 'long' contains a |
484 | | non-ASCII or non-Latin1 UTF16-encoded characters. */ |
485 | 13.0k | # define FAST_CHAR_MASK (UCS2_REPEAT_MASK * (0xFFFFu & ~STRINGLIB_MAX_CHAR)) |
486 | | #else |
487 | | /* The mask for fast checking of whether a C 'long' may contain |
488 | | UTF16-encoded surrogate characters. This is an efficient heuristic, |
489 | | assuming that non-surrogate characters with a code point >= 0x8000 are |
490 | | rare in most input. |
491 | | */ |
492 | 364k | # define FAST_CHAR_MASK (UCS2_REPEAT_MASK * 0x8000u) |
493 | | #endif |
494 | | /* The mask for fast byte-swapping. */ |
495 | 24.9k | #define STRIPPED_MASK (UCS2_REPEAT_MASK * 0x00FFu) |
496 | | /* Swap bytes. */ |
497 | 12.4k | #define SWAB(value) ((((value) >> 8) & STRIPPED_MASK) | \ |
498 | 12.4k | (((value) & STRIPPED_MASK) << 8)) |
499 | | |
500 | | Py_LOCAL_INLINE(Py_UCS4) |
501 | | STRINGLIB(utf16_decode)(const unsigned char **inptr, const unsigned char *e, |
502 | | STRINGLIB_CHAR *dest, Py_ssize_t *outpos, |
503 | | int native_ordering) |
504 | 71.9k | { |
505 | 71.9k | Py_UCS4 ch; |
506 | 71.9k | const unsigned char *q = *inptr; |
507 | 71.9k | STRINGLIB_CHAR *p = dest + *outpos; |
508 | | /* Offsets from q for retrieving byte pairs in the right order. */ |
509 | 71.9k | #if PY_LITTLE_ENDIAN |
510 | 71.9k | int ihi = !!native_ordering, ilo = !native_ordering; |
511 | | #else |
512 | | int ihi = !native_ordering, ilo = !!native_ordering; |
513 | | #endif |
514 | 71.9k | --e; |
515 | | |
516 | 444k | while (q < e) { |
517 | 439k | Py_UCS4 ch2; |
518 | | /* First check for possible aligned read of a C 'long'. Unaligned |
519 | | reads are more expensive, better to defer to another iteration. */ |
520 | 439k | if (_Py_IS_ALIGNED(q, ALIGNOF_LONG)) { |
521 | | /* Fast path for runs of in-range non-surrogate chars. */ |
522 | 118k | const unsigned char *_q = q; |
523 | 399k | while (_q + SIZEOF_LONG <= e) { |
524 | 386k | unsigned long block = * (const unsigned long *) _q; |
525 | 386k | if (native_ordering) { |
526 | | /* Can use buffer directly */ |
527 | 377k | if (block & FAST_CHAR_MASK) |
528 | 100k | break; |
529 | 377k | } |
530 | 9.03k | else { |
531 | | /* Need to byte-swap */ |
532 | 9.03k | if (block & SWAB(FAST_CHAR_MASK)) |
533 | 4.57k | break; |
534 | | #if STRINGLIB_SIZEOF_CHAR == 1 |
535 | 1.00k | block >>= 8; |
536 | | #else |
537 | 3.45k | block = SWAB(block); |
538 | | #endif |
539 | 3.45k | } |
540 | 281k | #if PY_LITTLE_ENDIAN |
541 | | # if SIZEOF_LONG == 4 |
542 | | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); |
543 | | p[1] = (STRINGLIB_CHAR)(block >> 16); |
544 | | # elif SIZEOF_LONG == 8 |
545 | 281k | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); |
546 | 281k | p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); |
547 | 281k | p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); |
548 | 281k | p[3] = (STRINGLIB_CHAR)(block >> 48); |
549 | 281k | # endif |
550 | | #else |
551 | | # if SIZEOF_LONG == 4 |
552 | | p[0] = (STRINGLIB_CHAR)(block >> 16); |
553 | | p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu); |
554 | | # elif SIZEOF_LONG == 8 |
555 | | p[0] = (STRINGLIB_CHAR)(block >> 48); |
556 | | p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); |
557 | | p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); |
558 | | p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu); |
559 | | # endif |
560 | | #endif |
561 | 281k | _q += SIZEOF_LONG; |
562 | 281k | p += SIZEOF_LONG / 2; |
563 | 281k | } |
564 | 118k | q = _q; |
565 | 118k | if (q >= e) |
566 | 954 | break; |
567 | 118k | } |
568 | | |
569 | 438k | ch = (q[ihi] << 8) | q[ilo]; |
570 | 438k | q += 2; |
571 | 438k | if (!Py_UNICODE_IS_SURROGATE(ch)) { |
572 | | #if STRINGLIB_SIZEOF_CHAR < 2 |
573 | 29.8k | if (ch > STRINGLIB_MAX_CHAR) |
574 | | /* Out-of-range */ |
575 | 17.4k | goto Return; |
576 | 12.4k | #endif |
577 | 12.4k | *p++ = (STRINGLIB_CHAR)ch; |
578 | 12.4k | continue; |
579 | 388k | } |
580 | | |
581 | | /* UTF-16 code pair: */ |
582 | 49.5k | if (!Py_UNICODE_IS_HIGH_SURROGATE(ch)) |
583 | 26.8k | goto IllegalEncoding; |
584 | 22.6k | if (q >= e) |
585 | 1.92k | goto UnexpectedEnd; |
586 | 20.7k | ch2 = (q[ihi] << 8) | q[ilo]; |
587 | 20.7k | q += 2; |
588 | 20.7k | if (!Py_UNICODE_IS_LOW_SURROGATE(ch2)) |
589 | 13.2k | goto IllegalSurrogate; |
590 | 7.55k | ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2); |
591 | | #if STRINGLIB_SIZEOF_CHAR < 4 |
592 | | /* Out-of-range */ |
593 | 6.81k | goto Return; |
594 | | #else |
595 | | *p++ = (STRINGLIB_CHAR)ch; |
596 | | #endif |
597 | 742 | } |
598 | 5.74k | ch = 0; |
599 | 71.9k | Return: |
600 | 71.9k | *inptr = q; |
601 | 71.9k | *outpos = p - dest; |
602 | 71.9k | return ch; |
603 | 1.92k | UnexpectedEnd: |
604 | 1.92k | ch = 1; |
605 | 1.92k | goto Return; |
606 | 26.8k | IllegalEncoding: |
607 | 26.8k | ch = 2; |
608 | 26.8k | goto Return; |
609 | 13.2k | IllegalSurrogate: |
610 | 13.2k | ch = 3; |
611 | 13.2k | goto Return; |
612 | 5.74k | } unicodeobject.c:asciilib_utf16_decode Line | Count | Source | 504 | 18.6k | { | 505 | 18.6k | Py_UCS4 ch; | 506 | 18.6k | const unsigned char *q = *inptr; | 507 | 18.6k | STRINGLIB_CHAR *p = dest + *outpos; | 508 | | /* Offsets from q for retrieving byte pairs in the right order. */ | 509 | 18.6k | #if PY_LITTLE_ENDIAN | 510 | 18.6k | int ihi = !!native_ordering, ilo = !native_ordering; | 511 | | #else | 512 | | int ihi = !native_ordering, ilo = !!native_ordering; | 513 | | #endif | 514 | 18.6k | --e; | 515 | | | 516 | 26.3k | while (q < e) { | 517 | 25.9k | Py_UCS4 ch2; | 518 | | /* First check for possible aligned read of a C 'long'. Unaligned | 519 | | reads are more expensive, better to defer to another iteration. */ | 520 | 25.9k | if (_Py_IS_ALIGNED(q, ALIGNOF_LONG)) { | 521 | | /* Fast path for runs of in-range non-surrogate chars. */ | 522 | 17.9k | const unsigned char *_q = q; | 523 | 20.6k | while (_q + SIZEOF_LONG <= e) { | 524 | 13.2k | unsigned long block = * (const unsigned long *) _q; | 525 | 13.2k | if (native_ordering) { | 526 | | /* Can use buffer directly */ | 527 | 11.2k | if (block & FAST_CHAR_MASK) | 528 | 9.25k | break; | 529 | 11.2k | } | 530 | 2.02k | else { | 531 | | /* Need to byte-swap */ | 532 | 2.02k | if (block & SWAB(FAST_CHAR_MASK)) | 533 | 1.29k | break; | 534 | 732 | #if STRINGLIB_SIZEOF_CHAR == 1 | 535 | 732 | block >>= 8; | 536 | | #else | 537 | | block = SWAB(block); | 538 | | #endif | 539 | 732 | } | 540 | 2.71k | #if PY_LITTLE_ENDIAN | 541 | | # if SIZEOF_LONG == 4 | 542 | | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 543 | | p[1] = (STRINGLIB_CHAR)(block >> 16); | 544 | | # elif SIZEOF_LONG == 8 | 545 | 2.71k | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 546 | 2.71k | p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); | 547 | 2.71k | p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); | 548 | 2.71k | p[3] = (STRINGLIB_CHAR)(block >> 48); | 549 | 2.71k | # endif | 550 | | #else | 551 | | # if SIZEOF_LONG == 4 | 
552 | | p[0] = (STRINGLIB_CHAR)(block >> 16); | 553 | | p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 554 | | # elif SIZEOF_LONG == 8 | 555 | | p[0] = (STRINGLIB_CHAR)(block >> 48); | 556 | | p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); | 557 | | p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); | 558 | | p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 559 | | # endif | 560 | | #endif | 561 | 2.71k | _q += SIZEOF_LONG; | 562 | 2.71k | p += SIZEOF_LONG / 2; | 563 | 2.71k | } | 564 | 17.9k | q = _q; | 565 | 17.9k | if (q >= e) | 566 | 302 | break; | 567 | 17.9k | } | 568 | | | 569 | 25.6k | ch = (q[ihi] << 8) | q[ilo]; | 570 | 25.6k | q += 2; | 571 | 25.6k | if (!Py_UNICODE_IS_SURROGATE(ch)) { | 572 | 23.8k | #if STRINGLIB_SIZEOF_CHAR < 2 | 573 | 23.8k | if (ch > STRINGLIB_MAX_CHAR) | 574 | | /* Out-of-range */ | 575 | 16.1k | goto Return; | 576 | 7.71k | #endif | 577 | 7.71k | *p++ = (STRINGLIB_CHAR)ch; | 578 | 7.71k | continue; | 579 | 23.8k | } | 580 | | | 581 | | /* UTF-16 code pair: */ | 582 | 1.72k | if (!Py_UNICODE_IS_HIGH_SURROGATE(ch)) | 583 | 864 | goto IllegalEncoding; | 584 | 860 | if (q >= e) | 585 | 242 | goto UnexpectedEnd; | 586 | 618 | ch2 = (q[ihi] << 8) | q[ilo]; | 587 | 618 | q += 2; | 588 | 618 | if (!Py_UNICODE_IS_LOW_SURROGATE(ch2)) | 589 | 262 | goto IllegalSurrogate; | 590 | 356 | ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2); | 591 | 356 | #if STRINGLIB_SIZEOF_CHAR < 4 | 592 | | /* Out-of-range */ | 593 | 356 | goto Return; | 594 | | #else | 595 | | *p++ = (STRINGLIB_CHAR)ch; | 596 | | #endif | 597 | 618 | } | 598 | 777 | ch = 0; | 599 | 18.6k | Return: | 600 | 18.6k | *inptr = q; | 601 | 18.6k | *outpos = p - dest; | 602 | 18.6k | return ch; | 603 | 242 | UnexpectedEnd: | 604 | 242 | ch = 1; | 605 | 242 | goto Return; | 606 | 864 | IllegalEncoding: | 607 | 864 | ch = 2; | 608 | 864 | goto Return; | 609 | 262 | IllegalSurrogate: | 610 | 262 | ch = 3; | 611 | 262 | goto Return; | 612 | 777 | } |
unicodeobject.c:ucs1lib_utf16_decode Line | Count | Source | 504 | 2.79k | { | 505 | 2.79k | Py_UCS4 ch; | 506 | 2.79k | const unsigned char *q = *inptr; | 507 | 2.79k | STRINGLIB_CHAR *p = dest + *outpos; | 508 | | /* Offsets from q for retrieving byte pairs in the right order. */ | 509 | 2.79k | #if PY_LITTLE_ENDIAN | 510 | 2.79k | int ihi = !!native_ordering, ilo = !native_ordering; | 511 | | #else | 512 | | int ihi = !native_ordering, ilo = !!native_ordering; | 513 | | #endif | 514 | 2.79k | --e; | 515 | | | 516 | 7.48k | while (q < e) { | 517 | 7.33k | Py_UCS4 ch2; | 518 | | /* First check for possible aligned read of a C 'long'. Unaligned | 519 | | reads are more expensive, better to defer to another iteration. */ | 520 | 7.33k | if (_Py_IS_ALIGNED(q, ALIGNOF_LONG)) { | 521 | | /* Fast path for runs of in-range non-surrogate chars. */ | 522 | 1.23k | const unsigned char *_q = q; | 523 | 2.41k | while (_q + SIZEOF_LONG <= e) { | 524 | 2.22k | unsigned long block = * (const unsigned long *) _q; | 525 | 2.22k | if (native_ordering) { | 526 | | /* Can use buffer directly */ | 527 | 1.76k | if (block & FAST_CHAR_MASK) | 528 | 864 | break; | 529 | 1.76k | } | 530 | 451 | else { | 531 | | /* Need to byte-swap */ | 532 | 451 | if (block & SWAB(FAST_CHAR_MASK)) | 533 | 174 | break; | 534 | 277 | #if STRINGLIB_SIZEOF_CHAR == 1 | 535 | 277 | block >>= 8; | 536 | | #else | 537 | | block = SWAB(block); | 538 | | #endif | 539 | 277 | } | 540 | 1.18k | #if PY_LITTLE_ENDIAN | 541 | | # if SIZEOF_LONG == 4 | 542 | | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 543 | | p[1] = (STRINGLIB_CHAR)(block >> 16); | 544 | | # elif SIZEOF_LONG == 8 | 545 | 1.18k | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 546 | 1.18k | p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); | 547 | 1.18k | p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); | 548 | 1.18k | p[3] = (STRINGLIB_CHAR)(block >> 48); | 549 | 1.18k | # endif | 550 | | #else | 551 | | # if SIZEOF_LONG == 4 | 552 | | p[0] = 
(STRINGLIB_CHAR)(block >> 16); | 553 | | p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 554 | | # elif SIZEOF_LONG == 8 | 555 | | p[0] = (STRINGLIB_CHAR)(block >> 48); | 556 | | p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); | 557 | | p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); | 558 | | p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 559 | | # endif | 560 | | #endif | 561 | 1.18k | _q += SIZEOF_LONG; | 562 | 1.18k | p += SIZEOF_LONG / 2; | 563 | 1.18k | } | 564 | 1.23k | q = _q; | 565 | 1.23k | if (q >= e) | 566 | 122 | break; | 567 | 1.23k | } | 568 | | | 569 | 7.21k | ch = (q[ihi] << 8) | q[ilo]; | 570 | 7.21k | q += 2; | 571 | 7.21k | if (!Py_UNICODE_IS_SURROGATE(ch)) { | 572 | 5.97k | #if STRINGLIB_SIZEOF_CHAR < 2 | 573 | 5.97k | if (ch > STRINGLIB_MAX_CHAR) | 574 | | /* Out-of-range */ | 575 | 1.28k | goto Return; | 576 | 4.69k | #endif | 577 | 4.69k | *p++ = (STRINGLIB_CHAR)ch; | 578 | 4.69k | continue; | 579 | 5.97k | } | 580 | | | 581 | | /* UTF-16 code pair: */ | 582 | 1.23k | if (!Py_UNICODE_IS_HIGH_SURROGATE(ch)) | 583 | 156 | goto IllegalEncoding; | 584 | 1.08k | if (q >= e) | 585 | 51 | goto UnexpectedEnd; | 586 | 1.03k | ch2 = (q[ihi] << 8) | q[ilo]; | 587 | 1.03k | q += 2; | 588 | 1.03k | if (!Py_UNICODE_IS_LOW_SURROGATE(ch2)) | 589 | 802 | goto IllegalSurrogate; | 590 | 229 | ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2); | 591 | 229 | #if STRINGLIB_SIZEOF_CHAR < 4 | 592 | | /* Out-of-range */ | 593 | 229 | goto Return; | 594 | | #else | 595 | | *p++ = (STRINGLIB_CHAR)ch; | 596 | | #endif | 597 | 1.03k | } | 598 | 275 | ch = 0; | 599 | 2.79k | Return: | 600 | 2.79k | *inptr = q; | 601 | 2.79k | *outpos = p - dest; | 602 | 2.79k | return ch; | 603 | 51 | UnexpectedEnd: | 604 | 51 | ch = 1; | 605 | 51 | goto Return; | 606 | 156 | IllegalEncoding: | 607 | 156 | ch = 2; | 608 | 156 | goto Return; | 609 | 802 | IllegalSurrogate: | 610 | 802 | ch = 3; | 611 | 802 | goto Return; | 612 | 275 | } |
unicodeobject.c:ucs2lib_utf16_decode Line | Count | Source | 504 | 17.6k | { | 505 | 17.6k | Py_UCS4 ch; | 506 | 17.6k | const unsigned char *q = *inptr; | 507 | 17.6k | STRINGLIB_CHAR *p = dest + *outpos; | 508 | | /* Offsets from q for retrieving byte pairs in the right order. */ | 509 | 17.6k | #if PY_LITTLE_ENDIAN | 510 | 17.6k | int ihi = !!native_ordering, ilo = !native_ordering; | 511 | | #else | 512 | | int ihi = !native_ordering, ilo = !!native_ordering; | 513 | | #endif | 514 | 17.6k | --e; | 515 | | | 516 | 292k | while (q < e) { | 517 | 289k | Py_UCS4 ch2; | 518 | | /* First check for possible aligned read of a C 'long'. Unaligned | 519 | | reads are more expensive, better to defer to another iteration. */ | 520 | 289k | if (_Py_IS_ALIGNED(q, ALIGNOF_LONG)) { | 521 | | /* Fast path for runs of in-range non-surrogate chars. */ | 522 | 68.8k | const unsigned char *_q = q; | 523 | 327k | while (_q + SIZEOF_LONG <= e) { | 524 | 324k | unsigned long block = * (const unsigned long *) _q; | 525 | 324k | if (native_ordering) { | 526 | | /* Can use buffer directly */ | 527 | 320k | if (block & FAST_CHAR_MASK) | 528 | 63.3k | break; | 529 | 320k | } | 530 | 4.50k | else { | 531 | | /* Need to byte-swap */ | 532 | 4.50k | if (block & SWAB(FAST_CHAR_MASK)) | 533 | 2.07k | break; | 534 | | #if STRINGLIB_SIZEOF_CHAR == 1 | 535 | | block >>= 8; | 536 | | #else | 537 | 2.43k | block = SWAB(block); | 538 | 2.43k | #endif | 539 | 2.43k | } | 540 | 259k | #if PY_LITTLE_ENDIAN | 541 | | # if SIZEOF_LONG == 4 | 542 | | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 543 | | p[1] = (STRINGLIB_CHAR)(block >> 16); | 544 | | # elif SIZEOF_LONG == 8 | 545 | 259k | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 546 | 259k | p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); | 547 | 259k | p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); | 548 | 259k | p[3] = (STRINGLIB_CHAR)(block >> 48); | 549 | 259k | # endif | 550 | | #else | 551 | | # if SIZEOF_LONG == 4 | 552 | | p[0] = 
(STRINGLIB_CHAR)(block >> 16); | 553 | | p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 554 | | # elif SIZEOF_LONG == 8 | 555 | | p[0] = (STRINGLIB_CHAR)(block >> 48); | 556 | | p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); | 557 | | p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); | 558 | | p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 559 | | # endif | 560 | | #endif | 561 | 259k | _q += SIZEOF_LONG; | 562 | 259k | p += SIZEOF_LONG / 2; | 563 | 259k | } | 564 | 68.8k | q = _q; | 565 | 68.8k | if (q >= e) | 566 | 411 | break; | 567 | 68.8k | } | 568 | | | 569 | 288k | ch = (q[ihi] << 8) | q[ilo]; | 570 | 288k | q += 2; | 571 | 288k | if (!Py_UNICODE_IS_SURROGATE(ch)) { | 572 | | #if STRINGLIB_SIZEOF_CHAR < 2 | 573 | | if (ch > STRINGLIB_MAX_CHAR) | 574 | | /* Out-of-range */ | 575 | | goto Return; | 576 | | #endif | 577 | 275k | *p++ = (STRINGLIB_CHAR)ch; | 578 | 275k | continue; | 579 | 275k | } | 580 | | | 581 | | /* UTF-16 code pair: */ | 582 | 13.8k | if (!Py_UNICODE_IS_HIGH_SURROGATE(ch)) | 583 | 1.88k | goto IllegalEncoding; | 584 | 11.9k | if (q >= e) | 585 | 281 | goto UnexpectedEnd; | 586 | 11.6k | ch2 = (q[ihi] << 8) | q[ilo]; | 587 | 11.6k | q += 2; | 588 | 11.6k | if (!Py_UNICODE_IS_LOW_SURROGATE(ch2)) | 589 | 5.44k | goto IllegalSurrogate; | 590 | 6.22k | ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2); | 591 | 6.22k | #if STRINGLIB_SIZEOF_CHAR < 4 | 592 | | /* Out-of-range */ | 593 | 6.22k | goto Return; | 594 | | #else | 595 | | *p++ = (STRINGLIB_CHAR)ch; | 596 | | #endif | 597 | 11.6k | } | 598 | 3.78k | ch = 0; | 599 | 17.6k | Return: | 600 | 17.6k | *inptr = q; | 601 | 17.6k | *outpos = p - dest; | 602 | 17.6k | return ch; | 603 | 281 | UnexpectedEnd: | 604 | 281 | ch = 1; | 605 | 281 | goto Return; | 606 | 1.88k | IllegalEncoding: | 607 | 1.88k | ch = 2; | 608 | 1.88k | goto Return; | 609 | 5.44k | IllegalSurrogate: | 610 | 5.44k | ch = 3; | 611 | 5.44k | goto Return; | 612 | 3.78k | } |
unicodeobject.c:ucs4lib_utf16_decode Line | Count | Source | 504 | 32.8k | { | 505 | 32.8k | Py_UCS4 ch; | 506 | 32.8k | const unsigned char *q = *inptr; | 507 | 32.8k | STRINGLIB_CHAR *p = dest + *outpos; | 508 | | /* Offsets from q for retrieving byte pairs in the right order. */ | 509 | 32.8k | #if PY_LITTLE_ENDIAN | 510 | 32.8k | int ihi = !!native_ordering, ilo = !native_ordering; | 511 | | #else | 512 | | int ihi = !native_ordering, ilo = !!native_ordering; | 513 | | #endif | 514 | 32.8k | --e; | 515 | | | 516 | 117k | while (q < e) { | 517 | 116k | Py_UCS4 ch2; | 518 | | /* First check for possible aligned read of a C 'long'. Unaligned | 519 | | reads are more expensive, better to defer to another iteration. */ | 520 | 116k | if (_Py_IS_ALIGNED(q, ALIGNOF_LONG)) { | 521 | | /* Fast path for runs of in-range non-surrogate chars. */ | 522 | 30.2k | const unsigned char *_q = q; | 523 | 48.3k | while (_q + SIZEOF_LONG <= e) { | 524 | 45.9k | unsigned long block = * (const unsigned long *) _q; | 525 | 45.9k | if (native_ordering) { | 526 | | /* Can use buffer directly */ | 527 | 43.9k | if (block & FAST_CHAR_MASK) | 528 | 26.8k | break; | 529 | 43.9k | } | 530 | 2.05k | else { | 531 | | /* Need to byte-swap */ | 532 | 2.05k | if (block & SWAB(FAST_CHAR_MASK)) | 533 | 1.03k | break; | 534 | | #if STRINGLIB_SIZEOF_CHAR == 1 | 535 | | block >>= 8; | 536 | | #else | 537 | 1.01k | block = SWAB(block); | 538 | 1.01k | #endif | 539 | 1.01k | } | 540 | 18.0k | #if PY_LITTLE_ENDIAN | 541 | | # if SIZEOF_LONG == 4 | 542 | | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 543 | | p[1] = (STRINGLIB_CHAR)(block >> 16); | 544 | | # elif SIZEOF_LONG == 8 | 545 | 18.0k | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 546 | 18.0k | p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); | 547 | 18.0k | p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); | 548 | 18.0k | p[3] = (STRINGLIB_CHAR)(block >> 48); | 549 | 18.0k | # endif | 550 | | #else | 551 | | # if SIZEOF_LONG == 4 | 552 | | p[0] = 
(STRINGLIB_CHAR)(block >> 16); | 553 | | p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 554 | | # elif SIZEOF_LONG == 8 | 555 | | p[0] = (STRINGLIB_CHAR)(block >> 48); | 556 | | p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); | 557 | | p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); | 558 | | p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 559 | | # endif | 560 | | #endif | 561 | 18.0k | _q += SIZEOF_LONG; | 562 | 18.0k | p += SIZEOF_LONG / 2; | 563 | 18.0k | } | 564 | 30.2k | q = _q; | 565 | 30.2k | if (q >= e) | 566 | 119 | break; | 567 | 30.2k | } | 568 | | | 569 | 116k | ch = (q[ihi] << 8) | q[ilo]; | 570 | 116k | q += 2; | 571 | 116k | if (!Py_UNICODE_IS_SURROGATE(ch)) { | 572 | | #if STRINGLIB_SIZEOF_CHAR < 2 | 573 | | if (ch > STRINGLIB_MAX_CHAR) | 574 | | /* Out-of-range */ | 575 | | goto Return; | 576 | | #endif | 577 | 84.0k | *p++ = (STRINGLIB_CHAR)ch; | 578 | 84.0k | continue; | 579 | 84.0k | } | 580 | | | 581 | | /* UTF-16 code pair: */ | 582 | 32.7k | if (!Py_UNICODE_IS_HIGH_SURROGATE(ch)) | 583 | 23.9k | goto IllegalEncoding; | 584 | 8.78k | if (q >= e) | 585 | 1.34k | goto UnexpectedEnd; | 586 | 7.43k | ch2 = (q[ihi] << 8) | q[ilo]; | 587 | 7.43k | q += 2; | 588 | 7.43k | if (!Py_UNICODE_IS_LOW_SURROGATE(ch2)) | 589 | 6.69k | goto IllegalSurrogate; | 590 | 742 | ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2); | 591 | | #if STRINGLIB_SIZEOF_CHAR < 4 | 592 | | /* Out-of-range */ | 593 | | goto Return; | 594 | | #else | 595 | 742 | *p++ = (STRINGLIB_CHAR)ch; | 596 | 742 | #endif | 597 | 742 | } | 598 | 909 | ch = 0; | 599 | 32.8k | Return: | 600 | 32.8k | *inptr = q; | 601 | 32.8k | *outpos = p - dest; | 602 | 32.8k | return ch; | 603 | 1.34k | UnexpectedEnd: | 604 | 1.34k | ch = 1; | 605 | 1.34k | goto Return; | 606 | 23.9k | IllegalEncoding: | 607 | 23.9k | ch = 2; | 608 | 23.9k | goto Return; | 609 | 6.69k | IllegalSurrogate: | 610 | 6.69k | ch = 3; | 611 | 6.69k | goto Return; | 612 | 909 | } |
|
613 | | #undef UCS2_REPEAT_MASK |
614 | | #undef FAST_CHAR_MASK |
615 | | #undef STRIPPED_MASK |
616 | | #undef SWAB |
617 | | |
618 | | |
619 | | #if STRINGLIB_MAX_CHAR >= 0x80 |
620 | | Py_LOCAL_INLINE(Py_ssize_t) |
621 | | STRINGLIB(utf16_encode)(const STRINGLIB_CHAR *in, |
622 | | Py_ssize_t len, |
623 | | unsigned short **outptr, |
624 | | int native_ordering) |
625 | 0 | { |
626 | 0 | unsigned short *out = *outptr; |
627 | 0 | const STRINGLIB_CHAR *end = in + len; |
628 | | #if STRINGLIB_SIZEOF_CHAR == 1 |
629 | 0 | if (native_ordering) { |
630 | 0 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); |
631 | 0 | while (in < unrolled_end) { |
632 | 0 | out[0] = in[0]; |
633 | 0 | out[1] = in[1]; |
634 | 0 | out[2] = in[2]; |
635 | 0 | out[3] = in[3]; |
636 | 0 | in += 4; out += 4; |
637 | 0 | } |
638 | 0 | while (in < end) { |
639 | 0 | *out++ = *in++; |
640 | 0 | } |
641 | 0 | } else { |
642 | 0 | # define SWAB2(CH) ((CH) << 8) /* high byte is zero */ |
643 | 0 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); |
644 | 0 | while (in < unrolled_end) { |
645 | 0 | out[0] = SWAB2(in[0]); |
646 | 0 | out[1] = SWAB2(in[1]); |
647 | 0 | out[2] = SWAB2(in[2]); |
648 | 0 | out[3] = SWAB2(in[3]); |
649 | 0 | in += 4; out += 4; |
650 | 0 | } |
651 | 0 | while (in < end) { |
652 | 0 | Py_UCS4 ch = *in++; |
653 | 0 | *out++ = SWAB2((Py_UCS2)ch); |
654 | 0 | } |
655 | 0 | #undef SWAB2 |
656 | 0 | } |
657 | | *outptr = out; |
658 | | return len; |
659 | | #else |
660 | 0 | if (native_ordering) { |
661 | | #if STRINGLIB_MAX_CHAR < 0x10000 |
662 | 0 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); |
663 | 0 | while (in < unrolled_end) { |
664 | | /* check if any character is a surrogate character */ |
665 | 0 | if (((in[0] ^ 0xd800) & |
666 | 0 | (in[1] ^ 0xd800) & |
667 | 0 | (in[2] ^ 0xd800) & |
668 | 0 | (in[3] ^ 0xd800) & 0xf800) == 0) |
669 | 0 | break; |
670 | 0 | out[0] = in[0]; |
671 | 0 | out[1] = in[1]; |
672 | 0 | out[2] = in[2]; |
673 | 0 | out[3] = in[3]; |
674 | 0 | in += 4; out += 4; |
675 | 0 | } |
676 | | #endif |
677 | 0 | while (in < end) { |
678 | 0 | Py_UCS4 ch; |
679 | 0 | ch = *in++; |
680 | 0 | if (ch < 0xd800) |
681 | 0 | *out++ = ch; |
682 | 0 | else if (ch < 0xe000) |
683 | | /* reject surrogate characters (U+D800-U+DFFF) */ |
684 | 0 | goto fail; |
685 | | #if STRINGLIB_MAX_CHAR >= 0x10000 |
686 | 0 | else if (ch >= 0x10000) { |
687 | 0 | out[0] = Py_UNICODE_HIGH_SURROGATE(ch); |
688 | 0 | out[1] = Py_UNICODE_LOW_SURROGATE(ch); |
689 | 0 | out += 2; |
690 | 0 | } |
691 | 0 | #endif |
692 | 0 | else |
693 | 0 | *out++ = ch; |
694 | 0 | } |
695 | 0 | } else { |
696 | 0 | #define SWAB2(CH) (((CH) << 8) | ((CH) >> 8)) |
697 | | #if STRINGLIB_MAX_CHAR < 0x10000 |
698 | 0 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); |
699 | 0 | while (in < unrolled_end) { |
700 | | /* check if any character is a surrogate character */ |
701 | 0 | if (((in[0] ^ 0xd800) & |
702 | 0 | (in[1] ^ 0xd800) & |
703 | 0 | (in[2] ^ 0xd800) & |
704 | 0 | (in[3] ^ 0xd800) & 0xf800) == 0) |
705 | 0 | break; |
706 | 0 | out[0] = SWAB2(in[0]); |
707 | 0 | out[1] = SWAB2(in[1]); |
708 | 0 | out[2] = SWAB2(in[2]); |
709 | 0 | out[3] = SWAB2(in[3]); |
710 | 0 | in += 4; out += 4; |
711 | 0 | } |
712 | | #endif |
713 | 0 | while (in < end) { |
714 | 0 | Py_UCS4 ch = *in++; |
715 | 0 | if (ch < 0xd800) |
716 | 0 | *out++ = SWAB2((Py_UCS2)ch); |
717 | 0 | else if (ch < 0xe000) |
718 | | /* reject surrogate characters (U+D800-U+DFFF) */ |
719 | 0 | goto fail; |
720 | | #if STRINGLIB_MAX_CHAR >= 0x10000 |
721 | 0 | else if (ch >= 0x10000) { |
722 | 0 | Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch); |
723 | 0 | Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch); |
724 | 0 | out[0] = SWAB2(ch1); |
725 | 0 | out[1] = SWAB2(ch2); |
726 | 0 | out += 2; |
727 | 0 | } |
728 | 0 | #endif |
729 | 0 | else |
730 | 0 | *out++ = SWAB2((Py_UCS2)ch); |
731 | 0 | } |
732 | 0 | #undef SWAB2 |
733 | 0 | } |
734 | 0 | *outptr = out; |
735 | 0 | return len; |
736 | 0 | fail: |
737 | 0 | *outptr = out; |
738 | 0 | return len - (end - in + 1); |
739 | | #endif |
740 | 0 | } Unexecuted instantiation: unicodeobject.c:ucs1lib_utf16_encode Unexecuted instantiation: unicodeobject.c:ucs2lib_utf16_encode Unexecuted instantiation: unicodeobject.c:ucs4lib_utf16_encode |
741 | | |
742 | | static inline uint32_t |
743 | | STRINGLIB(SWAB4)(STRINGLIB_CHAR ch) |
744 | 0 | { |
745 | 0 | uint32_t word = ch; |
746 | | #if STRINGLIB_SIZEOF_CHAR == 1 |
747 | | /* high bytes are zero */ |
748 | | return (word << 24); |
749 | | #elif STRINGLIB_SIZEOF_CHAR == 2 |
750 | | /* high bytes are zero */ |
751 | | return ((word & 0x00FFu) << 24) | ((word & 0xFF00u) << 8); |
752 | | #else |
753 | | return _Py_bswap32(word); |
754 | | #endif |
755 | 0 | } Unexecuted instantiation: unicodeobject.c:ucs1lib_SWAB4 Unexecuted instantiation: unicodeobject.c:ucs2lib_SWAB4 Unexecuted instantiation: unicodeobject.c:ucs4lib_SWAB4 |
756 | | |
757 | | Py_LOCAL_INLINE(Py_ssize_t) |
758 | | STRINGLIB(utf32_encode)(const STRINGLIB_CHAR *in, |
759 | | Py_ssize_t len, |
760 | | uint32_t **outptr, |
761 | | int native_ordering) |
762 | 0 | { |
763 | 0 | uint32_t *out = *outptr; |
764 | 0 | const STRINGLIB_CHAR *end = in + len; |
765 | 0 | if (native_ordering) { |
766 | 0 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); |
767 | 0 | while (in < unrolled_end) { |
768 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
769 | | /* check if any character is a surrogate character */ |
770 | 0 | if (((in[0] ^ 0xd800) & |
771 | 0 | (in[1] ^ 0xd800) & |
772 | 0 | (in[2] ^ 0xd800) & |
773 | 0 | (in[3] ^ 0xd800) & 0xf800) == 0) |
774 | 0 | break; |
775 | 0 | #endif |
776 | 0 | out[0] = in[0]; |
777 | 0 | out[1] = in[1]; |
778 | 0 | out[2] = in[2]; |
779 | 0 | out[3] = in[3]; |
780 | 0 | in += 4; out += 4; |
781 | 0 | } |
782 | 0 | while (in < end) { |
783 | 0 | Py_UCS4 ch; |
784 | 0 | ch = *in++; |
785 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
786 | 0 | if (Py_UNICODE_IS_SURROGATE(ch)) { |
787 | | /* reject surrogate characters (U+D800-U+DFFF) */ |
788 | 0 | goto fail; |
789 | 0 | } |
790 | 0 | #endif |
791 | 0 | *out++ = ch; |
792 | 0 | } |
793 | 0 | } else { |
794 | 0 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); |
795 | 0 | while (in < unrolled_end) { |
796 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
797 | | /* check if any character is a surrogate character */ |
798 | 0 | if (((in[0] ^ 0xd800) & |
799 | 0 | (in[1] ^ 0xd800) & |
800 | 0 | (in[2] ^ 0xd800) & |
801 | 0 | (in[3] ^ 0xd800) & 0xf800) == 0) |
802 | 0 | break; |
803 | 0 | #endif |
804 | 0 | out[0] = STRINGLIB(SWAB4)(in[0]); |
805 | 0 | out[1] = STRINGLIB(SWAB4)(in[1]); |
806 | 0 | out[2] = STRINGLIB(SWAB4)(in[2]); |
807 | 0 | out[3] = STRINGLIB(SWAB4)(in[3]); |
808 | 0 | in += 4; out += 4; |
809 | 0 | } |
810 | 0 | while (in < end) { |
811 | 0 | Py_UCS4 ch = *in++; |
812 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
813 | 0 | if (Py_UNICODE_IS_SURROGATE(ch)) { |
814 | | /* reject surrogate characters (U+D800-U+DFFF) */ |
815 | 0 | goto fail; |
816 | 0 | } |
817 | 0 | #endif |
818 | 0 | *out++ = STRINGLIB(SWAB4)(ch); |
819 | 0 | } |
820 | 0 | } |
821 | 0 | *outptr = out; |
822 | 0 | return len; |
823 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
824 | 0 | fail: |
825 | 0 | *outptr = out; |
826 | 0 | return len - (end - in + 1); |
827 | | #endif |
828 | 0 | } Unexecuted instantiation: unicodeobject.c:ucs1lib_utf32_encode Unexecuted instantiation: unicodeobject.c:ucs2lib_utf32_encode Unexecuted instantiation: unicodeobject.c:ucs4lib_utf32_encode |
829 | | |
830 | | #endif |