/src/cpython3/Objects/stringlib/codecs.h
Line | Count | Source (jump to first uncovered line) |
1 | | /* stringlib: codec implementations */ |
2 | | |
3 | | #if !STRINGLIB_IS_UNICODE |
4 | | # error "codecs.h is specific to Unicode" |
5 | | #endif |
6 | | |
7 | | #include "pycore_bitutils.h" // _Py_bswap32() |
8 | | |
9 | | /* Mask to quickly check whether a C 'size_t' contains a |
10 | | non-ASCII, UTF8-encoded char. */ |
11 | | #if (SIZEOF_SIZE_T == 8) |
12 | 93.1M | # define ASCII_CHAR_MASK 0x8080808080808080ULL |
13 | | #elif (SIZEOF_SIZE_T == 4) |
14 | | # define ASCII_CHAR_MASK 0x80808080U |
15 | | #else |
16 | | # error C 'size_t' size should be either 4 or 8! |
17 | | #endif |
18 | | |
19 | | /* 10xxxxxx */ |
20 | 695M | #define IS_CONTINUATION_BYTE(ch) ((ch) >= 0x80 && (ch) < 0xC0) |
21 | | |
22 | | Py_LOCAL_INLINE(Py_UCS4) |
23 | | STRINGLIB(utf8_decode)(const char **inptr, const char *end, |
24 | | STRINGLIB_CHAR *dest, |
25 | | Py_ssize_t *outpos) |
26 | 6.23M | { |
27 | 6.23M | Py_UCS4 ch; |
28 | 6.23M | const char *s = *inptr; |
29 | 6.23M | STRINGLIB_CHAR *p = dest + *outpos; |
30 | | |
31 | 558M | while (s < end) { |
32 | 555M | ch = (unsigned char)*s; |
33 | | |
34 | 555M | if (ch < 0x80) { |
35 | | /* Fast path for runs of ASCII characters. Given that common UTF-8 |
36 | | input will consist of an overwhelming majority of ASCII |
37 | | characters, we try to optimize for this case by checking |
38 | | as many characters as a C 'size_t' can contain. |
39 | | First, check if we can do an aligned read, as most CPUs have |
40 | | a penalty for unaligned reads. |
41 | | */ |
42 | 35.3M | if (_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) { |
43 | | /* Help register allocation */ |
44 | 4.78M | const char *_s = s; |
45 | 4.78M | STRINGLIB_CHAR *_p = p; |
46 | 94.0M | while (_s + SIZEOF_SIZE_T <= end) { |
47 | | /* Read a whole size_t at a time (either 4 or 8 bytes), |
48 | | and do a fast unrolled copy if it only contains ASCII |
49 | | characters. */ |
50 | 93.1M | size_t value = *(const size_t *) _s; |
51 | 93.1M | if (value & ASCII_CHAR_MASK) |
52 | 3.81M | break; |
53 | 89.2M | #if PY_LITTLE_ENDIAN |
54 | 89.2M | _p[0] = (STRINGLIB_CHAR)(value & 0xFFu); |
55 | 89.2M | _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); |
56 | 89.2M | _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); |
57 | 89.2M | _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); |
58 | 89.2M | # if SIZEOF_SIZE_T == 8 |
59 | 89.2M | _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); |
60 | 89.2M | _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); |
61 | 89.2M | _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); |
62 | 89.2M | _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); |
63 | 89.2M | # endif |
64 | | #else |
65 | | # if SIZEOF_SIZE_T == 8 |
66 | | _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); |
67 | | _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); |
68 | | _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); |
69 | | _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); |
70 | | _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); |
71 | | _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); |
72 | | _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); |
73 | | _p[7] = (STRINGLIB_CHAR)(value & 0xFFu); |
74 | | # else |
75 | | _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); |
76 | | _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); |
77 | | _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); |
78 | | _p[3] = (STRINGLIB_CHAR)(value & 0xFFu); |
79 | | # endif |
80 | | #endif |
81 | 89.2M | _s += SIZEOF_SIZE_T; |
82 | 89.2M | _p += SIZEOF_SIZE_T; |
83 | 89.2M | } |
84 | 4.78M | s = _s; |
85 | 4.78M | p = _p; |
86 | 4.78M | if (s == end) |
87 | 38.6k | break; |
88 | 4.74M | ch = (unsigned char)*s; |
89 | 4.74M | } |
90 | 35.3M | if (ch < 0x80) { |
91 | 35.0M | s++; |
92 | 35.0M | *p++ = ch; |
93 | 35.0M | continue; |
94 | 35.0M | } |
95 | 35.3M | } |
96 | | |
97 | 520M | if (ch < 0xE0) { |
98 | | /* \xC2\x80-\xDF\xBF -- 0080-07FF */ |
99 | 353M | Py_UCS4 ch2; |
100 | 353M | if (ch < 0xC2) { |
101 | | /* invalid sequence |
102 | | \x80-\xBF -- continuation byte |
103 | | \xC0-\xC1 -- fake 0000-007F */ |
104 | 30.3k | goto InvalidStart; |
105 | 30.3k | } |
106 | 353M | if (end - s < 2) { |
107 | | /* unexpected end of data: the caller will decide whether |
108 | | it's an error or not */ |
109 | 709 | break; |
110 | 709 | } |
111 | 353M | ch2 = (unsigned char)s[1]; |
112 | 353M | if (!IS_CONTINUATION_BYTE(ch2)) |
113 | | /* invalid continuation byte */ |
114 | 8.09k | goto InvalidContinuation1; |
115 | 353M | ch = (ch << 6) + ch2 - |
116 | 353M | ((0xC0 << 6) + 0x80); |
117 | 353M | assert ((ch > 0x007F) && (ch <= 0x07FF)); |
118 | 353M | s += 2; |
119 | 353M | if (STRINGLIB_MAX_CHAR <= 0x007F || |
120 | 353M | (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR)) |
121 | | /* Out-of-range */ |
122 | 2.50M | goto Return; |
123 | 350M | *p++ = ch; |
124 | 350M | continue; |
125 | 353M | } |
126 | | |
127 | 167M | if (ch < 0xF0) { |
128 | | /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */ |
129 | 159M | Py_UCS4 ch2, ch3; |
130 | 159M | if (end - s < 3) { |
131 | | /* unexpected end of data: the caller will decide whether |
132 | | it's an error or not */ |
133 | 2.13k | if (end - s < 2) |
134 | 1.07k | break; |
135 | 1.06k | ch2 = (unsigned char)s[1]; |
136 | 1.06k | if (!IS_CONTINUATION_BYTE(ch2) || |
137 | 1.06k | (ch2 < 0xA0 ? ch == 0xE0 : ch == 0xED)) |
138 | | /* for clarification see comments below */ |
139 | 419 | goto InvalidContinuation1; |
140 | 641 | break; |
141 | 1.06k | } |
142 | 159M | ch2 = (unsigned char)s[1]; |
143 | 159M | ch3 = (unsigned char)s[2]; |
144 | 159M | if (!IS_CONTINUATION_BYTE(ch2)) { |
145 | | /* invalid continuation byte */ |
146 | 12.7k | goto InvalidContinuation1; |
147 | 12.7k | } |
148 | 159M | if (ch == 0xE0) { |
149 | 99.5M | if (ch2 < 0xA0) |
150 | | /* invalid sequence |
151 | | \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */ |
152 | 764 | goto InvalidContinuation1; |
153 | 99.5M | } else if (ch == 0xED && ch2 >= 0xA0) { |
154 | | /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF |
155 | | will result in surrogates in range D800-DFFF. Surrogates are |
156 | | not valid UTF-8 so they are rejected. |
157 | | See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf |
158 | | (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ |
159 | 1.95k | goto InvalidContinuation1; |
160 | 1.95k | } |
161 | 159M | if (!IS_CONTINUATION_BYTE(ch3)) { |
162 | | /* invalid continuation byte */ |
163 | 2.65k | goto InvalidContinuation2; |
164 | 2.65k | } |
165 | 159M | ch = (ch << 12) + (ch2 << 6) + ch3 - |
166 | 159M | ((0xE0 << 12) + (0x80 << 6) + 0x80); |
167 | 159M | assert ((ch > 0x07FF) && (ch <= 0xFFFF)); |
168 | 159M | s += 3; |
169 | 159M | if (STRINGLIB_MAX_CHAR <= 0x07FF || |
170 | 159M | (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR)) |
171 | | /* Out-of-range */ |
172 | 692k | goto Return; |
173 | 159M | *p++ = ch; |
174 | 159M | continue; |
175 | 159M | } |
176 | | |
177 | 7.50M | if (ch < 0xF5) { |
178 | | /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */ |
179 | 7.48M | Py_UCS4 ch2, ch3, ch4; |
180 | 7.48M | if (end - s < 4) { |
181 | | /* unexpected end of data: the caller will decide whether |
182 | | it's an error or not */ |
183 | 1.47k | if (end - s < 2) |
184 | 395 | break; |
185 | 1.07k | ch2 = (unsigned char)s[1]; |
186 | 1.07k | if (!IS_CONTINUATION_BYTE(ch2) || |
187 | 1.07k | (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4)) |
188 | | /* for clarification see comments below */ |
189 | 519 | goto InvalidContinuation1; |
190 | 556 | if (end - s < 3) |
191 | 215 | break; |
192 | 341 | ch3 = (unsigned char)s[2]; |
193 | 341 | if (!IS_CONTINUATION_BYTE(ch3)) |
194 | 163 | goto InvalidContinuation2; |
195 | 178 | break; |
196 | 341 | } |
197 | 7.48M | ch2 = (unsigned char)s[1]; |
198 | 7.48M | ch3 = (unsigned char)s[2]; |
199 | 7.48M | ch4 = (unsigned char)s[3]; |
200 | 7.48M | if (!IS_CONTINUATION_BYTE(ch2)) { |
201 | | /* invalid continuation byte */ |
202 | 13.4k | goto InvalidContinuation1; |
203 | 13.4k | } |
204 | 7.47M | if (ch == 0xF0) { |
205 | 159k | if (ch2 < 0x90) |
206 | | /* invalid sequence |
207 | | \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */ |
208 | 917 | goto InvalidContinuation1; |
209 | 7.31M | } else if (ch == 0xF4 && ch2 >= 0x90) { |
210 | | /* invalid sequence |
211 | | \xF4\x90\x80\x80- -- 110000- overflow */ |
212 | 818 | goto InvalidContinuation1; |
213 | 818 | } |
214 | 7.47M | if (!IS_CONTINUATION_BYTE(ch3)) { |
215 | | /* invalid continuation byte */ |
216 | 3.14k | goto InvalidContinuation2; |
217 | 3.14k | } |
218 | 7.46M | if (!IS_CONTINUATION_BYTE(ch4)) { |
219 | | /* invalid continuation byte */ |
220 | 2.36k | goto InvalidContinuation3; |
221 | 2.36k | } |
222 | 7.46M | ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 - |
223 | 7.46M | ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80); |
224 | 7.46M | assert ((ch > 0xFFFF) && (ch <= 0x10FFFF)); |
225 | 7.46M | s += 4; |
226 | 7.46M | if (STRINGLIB_MAX_CHAR <= 0xFFFF || |
227 | 7.46M | (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR)) |
228 | | /* Out-of-range */ |
229 | 370k | goto Return; |
230 | 7.09M | *p++ = ch; |
231 | 7.09M | continue; |
232 | 7.46M | } |
233 | 12.7k | goto InvalidStart; |
234 | 7.50M | } |
235 | 2.58M | ch = 0; |
236 | 6.23M | Return: |
237 | 6.23M | *inptr = s; |
238 | 6.23M | *outpos = p - dest; |
239 | 6.23M | return ch; |
240 | 43.0k | InvalidStart: |
241 | 43.0k | ch = 1; |
242 | 43.0k | goto Return; |
243 | 39.6k | InvalidContinuation1: |
244 | 39.6k | ch = 2; |
245 | 39.6k | goto Return; |
246 | 5.96k | InvalidContinuation2: |
247 | 5.96k | ch = 3; |
248 | 5.96k | goto Return; |
249 | 2.36k | InvalidContinuation3: |
250 | 2.36k | ch = 4; |
251 | 2.36k | goto Return; |
252 | 2.58M | } unicodeobject.c:asciilib_utf8_decode Line | Count | Source | 26 | 2.87M | { | 27 | 2.87M | Py_UCS4 ch; | 28 | 2.87M | const char *s = *inptr; | 29 | 2.87M | STRINGLIB_CHAR *p = dest + *outpos; | 30 | | | 31 | 2.87M | while (s < end) { | 32 | 2.87M | ch = (unsigned char)*s; | 33 | | | 34 | 2.87M | if (ch < 0x80) { | 35 | | /* Fast path for runs of ASCII characters. Given that common UTF-8 | 36 | | input will consist of an overwhelming majority of ASCII | 37 | | characters, we try to optimize for this case by checking | 38 | | as many characters as a C 'size_t' can contain. | 39 | | First, check if we can do an aligned read, as most CPUs have | 40 | | a penalty for unaligned reads. | 41 | | */ | 42 | 0 | if (_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) { | 43 | | /* Help register allocation */ | 44 | 0 | const char *_s = s; | 45 | 0 | STRINGLIB_CHAR *_p = p; | 46 | 0 | while (_s + SIZEOF_SIZE_T <= end) { | 47 | | /* Read a whole size_t at a time (either 4 or 8 bytes), | 48 | | and do a fast unrolled copy if it only contains ASCII | 49 | | characters. 
*/ | 50 | 0 | size_t value = *(const size_t *) _s; | 51 | 0 | if (value & ASCII_CHAR_MASK) | 52 | 0 | break; | 53 | 0 | #if PY_LITTLE_ENDIAN | 54 | 0 | _p[0] = (STRINGLIB_CHAR)(value & 0xFFu); | 55 | 0 | _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 56 | 0 | _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 57 | 0 | _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 58 | 0 | # if SIZEOF_SIZE_T == 8 | 59 | 0 | _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 60 | 0 | _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 61 | 0 | _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 62 | 0 | _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 63 | 0 | # endif | 64 | | #else | 65 | | # if SIZEOF_SIZE_T == 8 | 66 | | _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 67 | | _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 68 | | _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 69 | | _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 70 | | _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 71 | | _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 72 | | _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 73 | | _p[7] = (STRINGLIB_CHAR)(value & 0xFFu); | 74 | | # else | 75 | | _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 76 | | _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 77 | | _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 78 | | _p[3] = (STRINGLIB_CHAR)(value & 0xFFu); | 79 | | # endif | 80 | | #endif | 81 | 0 | _s += SIZEOF_SIZE_T; | 82 | 0 | _p += SIZEOF_SIZE_T; | 83 | 0 | } | 84 | 0 | s = _s; | 85 | 0 | p = _p; | 86 | 0 | if (s == end) | 87 | 0 | break; | 88 | 0 | ch = (unsigned char)*s; | 89 | 0 | } | 90 | 0 | if (ch < 0x80) { | 91 | 0 | s++; | 92 | 0 | *p++ = ch; | 93 | 0 | continue; | 94 | 0 | } | 95 | 0 | } | 96 | | | 97 | 2.87M | if (ch < 0xE0) { | 98 | | /* \xC2\x80-\xDF\xBF -- 0080-07FF */ | 99 | 2.25M | Py_UCS4 ch2; | 100 | 2.25M | if (ch < 0xC2) { | 101 | | /* invalid sequence | 102 | | \x80-\xBF -- continuation byte | 103 | | \xC0-\xC1 
-- fake 0000-007F */ | 104 | 2.15k | goto InvalidStart; | 105 | 2.15k | } | 106 | 2.25M | if (end - s < 2) { | 107 | | /* unexpected end of data: the caller will decide whether | 108 | | it's an error or not */ | 109 | 151 | break; | 110 | 151 | } | 111 | 2.25M | ch2 = (unsigned char)s[1]; | 112 | 2.25M | if (!IS_CONTINUATION_BYTE(ch2)) | 113 | | /* invalid continuation byte */ | 114 | 399 | goto InvalidContinuation1; | 115 | 2.25M | ch = (ch << 6) + ch2 - | 116 | 2.25M | ((0xC0 << 6) + 0x80); | 117 | 2.25M | assert ((ch > 0x007F) && (ch <= 0x07FF)); | 118 | 2.25M | s += 2; | 119 | 2.25M | if (STRINGLIB_MAX_CHAR <= 0x007F || | 120 | 2.25M | (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR)) | 121 | | /* Out-of-range */ | 122 | 2.25M | goto Return; | 123 | 0 | *p++ = ch; | 124 | 0 | continue; | 125 | 2.25M | } | 126 | | | 127 | 618k | if (ch < 0xF0) { | 128 | | /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */ | 129 | 368k | Py_UCS4 ch2, ch3; | 130 | 368k | if (end - s < 3) { | 131 | | /* unexpected end of data: the caller will decide whether | 132 | | it's an error or not */ | 133 | 741 | if (end - s < 2) | 134 | 480 | break; | 135 | 261 | ch2 = (unsigned char)s[1]; | 136 | 261 | if (!IS_CONTINUATION_BYTE(ch2) || | 137 | 261 | (ch2 < 0xA0 ? 
ch == 0xE0 : ch == 0xED)) | 138 | | /* for clarification see comments below */ | 139 | 59 | goto InvalidContinuation1; | 140 | 202 | break; | 141 | 261 | } | 142 | 367k | ch2 = (unsigned char)s[1]; | 143 | 367k | ch3 = (unsigned char)s[2]; | 144 | 367k | if (!IS_CONTINUATION_BYTE(ch2)) { | 145 | | /* invalid continuation byte */ | 146 | 221 | goto InvalidContinuation1; | 147 | 221 | } | 148 | 367k | if (ch == 0xE0) { | 149 | 58.7k | if (ch2 < 0xA0) | 150 | | /* invalid sequence | 151 | | \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */ | 152 | 33 | goto InvalidContinuation1; | 153 | 308k | } else if (ch == 0xED && ch2 >= 0xA0) { | 154 | | /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF | 155 | | will result in surrogates in range D800-DFFF. Surrogates are | 156 | | not valid UTF-8 so they are rejected. | 157 | | See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf | 158 | | (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ | 159 | 18 | goto InvalidContinuation1; | 160 | 18 | } | 161 | 367k | if (!IS_CONTINUATION_BYTE(ch3)) { | 162 | | /* invalid continuation byte */ | 163 | 68 | goto InvalidContinuation2; | 164 | 68 | } | 165 | 367k | ch = (ch << 12) + (ch2 << 6) + ch3 - | 166 | 367k | ((0xE0 << 12) + (0x80 << 6) + 0x80); | 167 | 367k | assert ((ch > 0x07FF) && (ch <= 0xFFFF)); | 168 | 367k | s += 3; | 169 | 367k | if (STRINGLIB_MAX_CHAR <= 0x07FF || | 170 | 367k | (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR)) | 171 | | /* Out-of-range */ | 172 | 367k | goto Return; | 173 | 0 | *p++ = ch; | 174 | 0 | continue; | 175 | 367k | } | 176 | | | 177 | 250k | if (ch < 0xF5) { | 178 | | /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */ | 179 | 249k | Py_UCS4 ch2, ch3, ch4; | 180 | 249k | if (end - s < 4) { | 181 | | /* unexpected end of data: the caller will decide whether | 182 | | it's an error or not */ | 183 | 195 | if (end - s < 2) | 184 | 79 | break; | 185 | 116 | ch2 = (unsigned char)s[1]; | 186 | 116 | if 
(!IS_CONTINUATION_BYTE(ch2) || | 187 | 116 | (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4)) | 188 | | /* for clarification see comments below */ | 189 | 67 | goto InvalidContinuation1; | 190 | 49 | if (end - s < 3) | 191 | 20 | break; | 192 | 29 | ch3 = (unsigned char)s[2]; | 193 | 29 | if (!IS_CONTINUATION_BYTE(ch3)) | 194 | 18 | goto InvalidContinuation2; | 195 | 11 | break; | 196 | 29 | } | 197 | 249k | ch2 = (unsigned char)s[1]; | 198 | 249k | ch3 = (unsigned char)s[2]; | 199 | 249k | ch4 = (unsigned char)s[3]; | 200 | 249k | if (!IS_CONTINUATION_BYTE(ch2)) { | 201 | | /* invalid continuation byte */ | 202 | 209 | goto InvalidContinuation1; | 203 | 209 | } | 204 | 249k | if (ch == 0xF0) { | 205 | 17.6k | if (ch2 < 0x90) | 206 | | /* invalid sequence | 207 | | \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */ | 208 | 15 | goto InvalidContinuation1; | 209 | 231k | } else if (ch == 0xF4 && ch2 >= 0x90) { | 210 | | /* invalid sequence | 211 | | \xF4\x90\x80\x80- -- 110000- overflow */ | 212 | 9 | goto InvalidContinuation1; | 213 | 9 | } | 214 | 249k | if (!IS_CONTINUATION_BYTE(ch3)) { | 215 | | /* invalid continuation byte */ | 216 | 46 | goto InvalidContinuation2; | 217 | 46 | } | 218 | 249k | if (!IS_CONTINUATION_BYTE(ch4)) { | 219 | | /* invalid continuation byte */ | 220 | 54 | goto InvalidContinuation3; | 221 | 54 | } | 222 | 249k | ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 - | 223 | 249k | ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80); | 224 | 249k | assert ((ch > 0xFFFF) && (ch <= 0x10FFFF)); | 225 | 249k | s += 4; | 226 | 249k | if (STRINGLIB_MAX_CHAR <= 0xFFFF || | 227 | 249k | (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR)) | 228 | | /* Out-of-range */ | 229 | 249k | goto Return; | 230 | 0 | *p++ = ch; | 231 | 0 | continue; | 232 | 249k | } | 233 | 451 | goto InvalidStart; | 234 | 250k | } | 235 | 943 | ch = 0; | 236 | 2.87M | Return: | 237 | 2.87M | *inptr = s; | 238 | 2.87M | *outpos = p - dest; | 239 | 2.87M | return ch; | 240 | 
2.60k | InvalidStart: | 241 | 2.60k | ch = 1; | 242 | 2.60k | goto Return; | 243 | 1.03k | InvalidContinuation1: | 244 | 1.03k | ch = 2; | 245 | 1.03k | goto Return; | 246 | 132 | InvalidContinuation2: | 247 | 132 | ch = 3; | 248 | 132 | goto Return; | 249 | 54 | InvalidContinuation3: | 250 | 54 | ch = 4; | 251 | 54 | goto Return; | 252 | 943 | } |
unicodeobject.c:ucs1lib_utf8_decode Line | Count | Source | 26 | 1.26M | { | 27 | 1.26M | Py_UCS4 ch; | 28 | 1.26M | const char *s = *inptr; | 29 | 1.26M | STRINGLIB_CHAR *p = dest + *outpos; | 30 | | | 31 | 197M | while (s < end) { | 32 | 196M | ch = (unsigned char)*s; | 33 | | | 34 | 196M | if (ch < 0x80) { | 35 | | /* Fast path for runs of ASCII characters. Given that common UTF-8 | 36 | | input will consist of an overwhelming majority of ASCII | 37 | | characters, we try to optimize for this case by checking | 38 | | as many characters as a C 'size_t' can contain. | 39 | | First, check if we can do an aligned read, as most CPUs have | 40 | | a penalty for unaligned reads. | 41 | | */ | 42 | 4.96M | if (_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) { | 43 | | /* Help register allocation */ | 44 | 503k | const char *_s = s; | 45 | 503k | STRINGLIB_CHAR *_p = p; | 46 | 12.8M | while (_s + SIZEOF_SIZE_T <= end) { | 47 | | /* Read a whole size_t at a time (either 4 or 8 bytes), | 48 | | and do a fast unrolled copy if it only contains ASCII | 49 | | characters. 
*/ | 50 | 12.7M | size_t value = *(const size_t *) _s; | 51 | 12.7M | if (value & ASCII_CHAR_MASK) | 52 | 387k | break; | 53 | 12.3M | #if PY_LITTLE_ENDIAN | 54 | 12.3M | _p[0] = (STRINGLIB_CHAR)(value & 0xFFu); | 55 | 12.3M | _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 56 | 12.3M | _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 57 | 12.3M | _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 58 | 12.3M | # if SIZEOF_SIZE_T == 8 | 59 | 12.3M | _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 60 | 12.3M | _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 61 | 12.3M | _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 62 | 12.3M | _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 63 | 12.3M | # endif | 64 | | #else | 65 | | # if SIZEOF_SIZE_T == 8 | 66 | | _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 67 | | _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 68 | | _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 69 | | _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 70 | | _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 71 | | _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 72 | | _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 73 | | _p[7] = (STRINGLIB_CHAR)(value & 0xFFu); | 74 | | # else | 75 | | _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 76 | | _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 77 | | _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 78 | | _p[3] = (STRINGLIB_CHAR)(value & 0xFFu); | 79 | | # endif | 80 | | #endif | 81 | 12.3M | _s += SIZEOF_SIZE_T; | 82 | 12.3M | _p += SIZEOF_SIZE_T; | 83 | 12.3M | } | 84 | 503k | s = _s; | 85 | 503k | p = _p; | 86 | 503k | if (s == end) | 87 | 6.20k | break; | 88 | 497k | ch = (unsigned char)*s; | 89 | 497k | } | 90 | 4.95M | if (ch < 0x80) { | 91 | 4.93M | s++; | 92 | 4.93M | *p++ = ch; | 93 | 4.93M | continue; | 94 | 4.93M | } | 95 | 4.95M | } | 96 | | | 97 | 191M | if (ch < 0xE0) { | 98 | | /* \xC2\x80-\xDF\xBF -- 0080-07FF */ | 99 | 191M | Py_UCS4 ch2; | 100 | 191M | 
if (ch < 0xC2) { | 101 | | /* invalid sequence | 102 | | \x80-\xBF -- continuation byte | 103 | | \xC0-\xC1 -- fake 0000-007F */ | 104 | 113 | goto InvalidStart; | 105 | 113 | } | 106 | 191M | if (end - s < 2) { | 107 | | /* unexpected end of data: the caller will decide whether | 108 | | it's an error or not */ | 109 | 87 | break; | 110 | 87 | } | 111 | 191M | ch2 = (unsigned char)s[1]; | 112 | 191M | if (!IS_CONTINUATION_BYTE(ch2)) | 113 | | /* invalid continuation byte */ | 114 | 106 | goto InvalidContinuation1; | 115 | 191M | ch = (ch << 6) + ch2 - | 116 | 191M | ((0xC0 << 6) + 0x80); | 117 | 191M | assert ((ch > 0x007F) && (ch <= 0x07FF)); | 118 | 191M | s += 2; | 119 | 191M | if (STRINGLIB_MAX_CHAR <= 0x007F || | 120 | 191M | (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR)) | 121 | | /* Out-of-range */ | 122 | 249k | goto Return; | 123 | 191M | *p++ = ch; | 124 | 191M | continue; | 125 | 191M | } | 126 | | | 127 | 342k | if (ch < 0xF0) { | 128 | | /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */ | 129 | 325k | Py_UCS4 ch2, ch3; | 130 | 325k | if (end - s < 3) { | 131 | | /* unexpected end of data: the caller will decide whether | 132 | | it's an error or not */ | 133 | 206 | if (end - s < 2) | 134 | 51 | break; | 135 | 155 | ch2 = (unsigned char)s[1]; | 136 | 155 | if (!IS_CONTINUATION_BYTE(ch2) || | 137 | 155 | (ch2 < 0xA0 ? 
ch == 0xE0 : ch == 0xED)) | 138 | | /* for clarification see comments below */ | 139 | 70 | goto InvalidContinuation1; | 140 | 85 | break; | 141 | 155 | } | 142 | 325k | ch2 = (unsigned char)s[1]; | 143 | 325k | ch3 = (unsigned char)s[2]; | 144 | 325k | if (!IS_CONTINUATION_BYTE(ch2)) { | 145 | | /* invalid continuation byte */ | 146 | 53 | goto InvalidContinuation1; | 147 | 53 | } | 148 | 325k | if (ch == 0xE0) { | 149 | 296k | if (ch2 < 0xA0) | 150 | | /* invalid sequence | 151 | | \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */ | 152 | 42 | goto InvalidContinuation1; | 153 | 296k | } else if (ch == 0xED && ch2 >= 0xA0) { | 154 | | /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF | 155 | | will result in surrogates in range D800-DFFF. Surrogates are | 156 | | not valid UTF-8 so they are rejected. | 157 | | See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf | 158 | | (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ | 159 | 21 | goto InvalidContinuation1; | 160 | 21 | } | 161 | 325k | if (!IS_CONTINUATION_BYTE(ch3)) { | 162 | | /* invalid continuation byte */ | 163 | 63 | goto InvalidContinuation2; | 164 | 63 | } | 165 | 325k | ch = (ch << 12) + (ch2 << 6) + ch3 - | 166 | 325k | ((0xE0 << 12) + (0x80 << 6) + 0x80); | 167 | 325k | assert ((ch > 0x07FF) && (ch <= 0xFFFF)); | 168 | 325k | s += 3; | 169 | 325k | if (STRINGLIB_MAX_CHAR <= 0x07FF || | 170 | 325k | (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR)) | 171 | | /* Out-of-range */ | 172 | 325k | goto Return; | 173 | 0 | *p++ = ch; | 174 | 0 | continue; | 175 | 325k | } | 176 | | | 177 | 17.2k | if (ch < 0xF5) { | 178 | | /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */ | 179 | 17.1k | Py_UCS4 ch2, ch3, ch4; | 180 | 17.1k | if (end - s < 4) { | 181 | | /* unexpected end of data: the caller will decide whether | 182 | | it's an error or not */ | 183 | 215 | if (end - s < 2) | 184 | 30 | break; | 185 | 185 | ch2 = (unsigned char)s[1]; | 186 | 185 | if 
(!IS_CONTINUATION_BYTE(ch2) || | 187 | 185 | (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4)) | 188 | | /* for clarification see comments below */ | 189 | 74 | goto InvalidContinuation1; | 190 | 111 | if (end - s < 3) | 191 | 36 | break; | 192 | 75 | ch3 = (unsigned char)s[2]; | 193 | 75 | if (!IS_CONTINUATION_BYTE(ch3)) | 194 | 40 | goto InvalidContinuation2; | 195 | 35 | break; | 196 | 75 | } | 197 | 16.9k | ch2 = (unsigned char)s[1]; | 198 | 16.9k | ch3 = (unsigned char)s[2]; | 199 | 16.9k | ch4 = (unsigned char)s[3]; | 200 | 16.9k | if (!IS_CONTINUATION_BYTE(ch2)) { | 201 | | /* invalid continuation byte */ | 202 | 56 | goto InvalidContinuation1; | 203 | 56 | } | 204 | 16.9k | if (ch == 0xF0) { | 205 | 1.95k | if (ch2 < 0x90) | 206 | | /* invalid sequence | 207 | | \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */ | 208 | 30 | goto InvalidContinuation1; | 209 | 14.9k | } else if (ch == 0xF4 && ch2 >= 0x90) { | 210 | | /* invalid sequence | 211 | | \xF4\x90\x80\x80- -- 110000- overflow */ | 212 | 22 | goto InvalidContinuation1; | 213 | 22 | } | 214 | 16.8k | if (!IS_CONTINUATION_BYTE(ch3)) { | 215 | | /* invalid continuation byte */ | 216 | 56 | goto InvalidContinuation2; | 217 | 56 | } | 218 | 16.7k | if (!IS_CONTINUATION_BYTE(ch4)) { | 219 | | /* invalid continuation byte */ | 220 | 50 | goto InvalidContinuation3; | 221 | 50 | } | 222 | 16.7k | ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 - | 223 | 16.7k | ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80); | 224 | 16.7k | assert ((ch > 0xFFFF) && (ch <= 0x10FFFF)); | 225 | 16.7k | s += 4; | 226 | 16.7k | if (STRINGLIB_MAX_CHAR <= 0xFFFF || | 227 | 16.7k | (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR)) | 228 | | /* Out-of-range */ | 229 | 16.7k | goto Return; | 230 | 0 | *p++ = ch; | 231 | 0 | continue; | 232 | 16.7k | } | 233 | 51 | goto InvalidStart; | 234 | 17.2k | } | 235 | 667k | ch = 0; | 236 | 1.26M | Return: | 237 | 1.26M | *inptr = s; | 238 | 1.26M | *outpos = p - dest; | 239 | 1.26M | 
return ch; | 240 | 164 | InvalidStart: | 241 | 164 | ch = 1; | 242 | 164 | goto Return; | 243 | 474 | InvalidContinuation1: | 244 | 474 | ch = 2; | 245 | 474 | goto Return; | 246 | 159 | InvalidContinuation2: | 247 | 159 | ch = 3; | 248 | 159 | goto Return; | 249 | 50 | InvalidContinuation3: | 250 | 50 | ch = 4; | 251 | 50 | goto Return; | 252 | 667k | } |
unicodeobject.c:ucs2lib_utf8_decode Line | Count | Source | 26 | 1.75M | { | 27 | 1.75M | Py_UCS4 ch; | 28 | 1.75M | const char *s = *inptr; | 29 | 1.75M | STRINGLIB_CHAR *p = dest + *outpos; | 30 | | | 31 | 190M | while (s < end) { | 32 | 188M | ch = (unsigned char)*s; | 33 | | | 34 | 188M | if (ch < 0x80) { | 35 | | /* Fast path for runs of ASCII characters. Given that common UTF-8 | 36 | | input will consist of an overwhelming majority of ASCII | 37 | | characters, we try to optimize for this case by checking | 38 | | as many characters as a C 'size_t' can contain. | 39 | | First, check if we can do an aligned read, as most CPUs have | 40 | | a penalty for unaligned reads. | 41 | | */ | 42 | 16.3M | if (_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) { | 43 | | /* Help register allocation */ | 44 | 2.40M | const char *_s = s; | 45 | 2.40M | STRINGLIB_CHAR *_p = p; | 46 | 42.7M | while (_s + SIZEOF_SIZE_T <= end) { | 47 | | /* Read a whole size_t at a time (either 4 or 8 bytes), | 48 | | and do a fast unrolled copy if it only contains ASCII | 49 | | characters. 
*/ | 50 | 42.0M | size_t value = *(const size_t *) _s; | 51 | 42.0M | if (value & ASCII_CHAR_MASK) | 52 | 1.67M | break; | 53 | 40.3M | #if PY_LITTLE_ENDIAN | 54 | 40.3M | _p[0] = (STRINGLIB_CHAR)(value & 0xFFu); | 55 | 40.3M | _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 56 | 40.3M | _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 57 | 40.3M | _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 58 | 40.3M | # if SIZEOF_SIZE_T == 8 | 59 | 40.3M | _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 60 | 40.3M | _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 61 | 40.3M | _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 62 | 40.3M | _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 63 | 40.3M | # endif | 64 | | #else | 65 | | # if SIZEOF_SIZE_T == 8 | 66 | | _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 67 | | _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 68 | | _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 69 | | _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 70 | | _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 71 | | _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 72 | | _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 73 | | _p[7] = (STRINGLIB_CHAR)(value & 0xFFu); | 74 | | # else | 75 | | _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 76 | | _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 77 | | _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 78 | | _p[3] = (STRINGLIB_CHAR)(value & 0xFFu); | 79 | | # endif | 80 | | #endif | 81 | 40.3M | _s += SIZEOF_SIZE_T; | 82 | 40.3M | _p += SIZEOF_SIZE_T; | 83 | 40.3M | } | 84 | 2.40M | s = _s; | 85 | 2.40M | p = _p; | 86 | 2.40M | if (s == end) | 87 | 21.6k | break; | 88 | 2.38M | ch = (unsigned char)*s; | 89 | 2.38M | } | 90 | 16.3M | if (ch < 0x80) { | 91 | 16.1M | s++; | 92 | 16.1M | *p++ = ch; | 93 | 16.1M | continue; | 94 | 16.1M | } | 95 | 16.3M | } | 96 | | | 97 | 172M | if (ch < 0xE0) { | 98 | | /* \xC2\x80-\xDF\xBF -- 0080-07FF */ | 99 | 127M | Py_UCS4 ch2; | 100 | 
127M | if (ch < 0xC2) { | 101 | | /* invalid sequence | 102 | | \x80-\xBF -- continuation byte | 103 | | \xC0-\xC1 -- fake 0000-007F */ | 104 | 8.07k | goto InvalidStart; | 105 | 8.07k | } | 106 | 127M | if (end - s < 2) { | 107 | | /* unexpected end of data: the caller will decide whether | 108 | | it's an error or not */ | 109 | 376 | break; | 110 | 376 | } | 111 | 127M | ch2 = (unsigned char)s[1]; | 112 | 127M | if (!IS_CONTINUATION_BYTE(ch2)) | 113 | | /* invalid continuation byte */ | 114 | 4.87k | goto InvalidContinuation1; | 115 | 127M | ch = (ch << 6) + ch2 - | 116 | 127M | ((0xC0 << 6) + 0x80); | 117 | 127M | assert ((ch > 0x007F) && (ch <= 0x07FF)); | 118 | 127M | s += 2; | 119 | 127M | if (STRINGLIB_MAX_CHAR <= 0x007F || | 120 | 127M | (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR)) | 121 | | /* Out-of-range */ | 122 | 0 | goto Return; | 123 | 127M | *p++ = ch; | 124 | 127M | continue; | 125 | 127M | } | 126 | | | 127 | 44.5M | if (ch < 0xF0) { | 128 | | /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */ | 129 | 44.4M | Py_UCS4 ch2, ch3; | 130 | 44.4M | if (end - s < 3) { | 131 | | /* unexpected end of data: the caller will decide whether | 132 | | it's an error or not */ | 133 | 760 | if (end - s < 2) | 134 | 369 | break; | 135 | 391 | ch2 = (unsigned char)s[1]; | 136 | 391 | if (!IS_CONTINUATION_BYTE(ch2) || | 137 | 391 | (ch2 < 0xA0 ? 
ch == 0xE0 : ch == 0xED)) | 138 | | /* for clarification see comments below */ | 139 | 176 | goto InvalidContinuation1; | 140 | 215 | break; | 141 | 391 | } | 142 | 44.4M | ch2 = (unsigned char)s[1]; | 143 | 44.4M | ch3 = (unsigned char)s[2]; | 144 | 44.4M | if (!IS_CONTINUATION_BYTE(ch2)) { | 145 | | /* invalid continuation byte */ | 146 | 9.05k | goto InvalidContinuation1; | 147 | 9.05k | } | 148 | 44.3M | if (ch == 0xE0) { | 149 | 33.6M | if (ch2 < 0xA0) | 150 | | /* invalid sequence | 151 | | \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */ | 152 | 404 | goto InvalidContinuation1; | 153 | 33.6M | } else if (ch == 0xED && ch2 >= 0xA0) { | 154 | | /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF | 155 | | will result in surrogates in range D800-DFFF. Surrogates are | 156 | | not valid UTF-8 so they are rejected. | 157 | | See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf | 158 | | (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ | 159 | 708 | goto InvalidContinuation1; | 160 | 708 | } | 161 | 44.3M | if (!IS_CONTINUATION_BYTE(ch3)) { | 162 | | /* invalid continuation byte */ | 163 | 1.40k | goto InvalidContinuation2; | 164 | 1.40k | } | 165 | 44.3M | ch = (ch << 12) + (ch2 << 6) + ch3 - | 166 | 44.3M | ((0xE0 << 12) + (0x80 << 6) + 0x80); | 167 | 44.3M | assert ((ch > 0x07FF) && (ch <= 0xFFFF)); | 168 | 44.3M | s += 3; | 169 | 44.3M | if (STRINGLIB_MAX_CHAR <= 0x07FF || | 170 | 44.3M | (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR)) | 171 | | /* Out-of-range */ | 172 | 0 | goto Return; | 173 | 44.3M | *p++ = ch; | 174 | 44.3M | continue; | 175 | 44.3M | } | 176 | | | 177 | 122k | if (ch < 0xF5) { | 178 | | /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */ | 179 | 115k | Py_UCS4 ch2, ch3, ch4; | 180 | 115k | if (end - s < 4) { | 181 | | /* unexpected end of data: the caller will decide whether | 182 | | it's an error or not */ | 183 | 450 | if (end - s < 2) | 184 | 113 | break; | 185 | 337 | ch2 = (unsigned 
char)s[1]; | 186 | 337 | if (!IS_CONTINUATION_BYTE(ch2) || | 187 | 337 | (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4)) | 188 | | /* for clarification see comments below */ | 189 | 169 | goto InvalidContinuation1; | 190 | 168 | if (end - s < 3) | 191 | 62 | break; | 192 | 106 | ch3 = (unsigned char)s[2]; | 193 | 106 | if (!IS_CONTINUATION_BYTE(ch3)) | 194 | 51 | goto InvalidContinuation2; | 195 | 55 | break; | 196 | 106 | } | 197 | 115k | ch2 = (unsigned char)s[1]; | 198 | 115k | ch3 = (unsigned char)s[2]; | 199 | 115k | ch4 = (unsigned char)s[3]; | 200 | 115k | if (!IS_CONTINUATION_BYTE(ch2)) { | 201 | | /* invalid continuation byte */ | 202 | 7.78k | goto InvalidContinuation1; | 203 | 7.78k | } | 204 | 107k | if (ch == 0xF0) { | 205 | 22.5k | if (ch2 < 0x90) | 206 | | /* invalid sequence | 207 | | \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */ | 208 | 431 | goto InvalidContinuation1; | 209 | 84.9k | } else if (ch == 0xF4 && ch2 >= 0x90) { | 210 | | /* invalid sequence | 211 | | \xF4\x90\x80\x80- -- 110000- overflow */ | 212 | 354 | goto InvalidContinuation1; | 213 | 354 | } | 214 | 106k | if (!IS_CONTINUATION_BYTE(ch3)) { | 215 | | /* invalid continuation byte */ | 216 | 1.60k | goto InvalidContinuation2; | 217 | 1.60k | } | 218 | 105k | if (!IS_CONTINUATION_BYTE(ch4)) { | 219 | | /* invalid continuation byte */ | 220 | 1.08k | goto InvalidContinuation3; | 221 | 1.08k | } | 222 | 104k | ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 - | 223 | 104k | ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80); | 224 | 104k | assert ((ch > 0xFFFF) && (ch <= 0x10FFFF)); | 225 | 104k | s += 4; | 226 | 104k | if (STRINGLIB_MAX_CHAR <= 0xFFFF || | 227 | 104k | (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR)) | 228 | | /* Out-of-range */ | 229 | 104k | goto Return; | 230 | 0 | *p++ = ch; | 231 | 0 | continue; | 232 | 104k | } | 233 | 6.48k | goto InvalidStart; | 234 | 122k | } | 235 | 1.60M | ch = 0; | 236 | 1.75M | Return: | 237 | 1.75M | *inptr = s; | 238 | 1.75M 
| *outpos = p - dest; | 239 | 1.75M | return ch; | 240 | 14.5k | InvalidStart: | 241 | 14.5k | ch = 1; | 242 | 14.5k | goto Return; | 243 | 23.9k | InvalidContinuation1: | 244 | 23.9k | ch = 2; | 245 | 23.9k | goto Return; | 246 | 3.06k | InvalidContinuation2: | 247 | 3.06k | ch = 3; | 248 | 3.06k | goto Return; | 249 | 1.08k | InvalidContinuation3: | 250 | 1.08k | ch = 4; | 251 | 1.08k | goto Return; | 252 | 1.60M | } |
unicodeobject.c:ucs4lib_utf8_decode Line | Count | Source | 26 | 347k | { | 27 | 347k | Py_UCS4 ch; | 28 | 347k | const char *s = *inptr; | 29 | 347k | STRINGLIB_CHAR *p = dest + *outpos; | 30 | | | 31 | 167M | while (s < end) { | 32 | 167M | ch = (unsigned char)*s; | 33 | | | 34 | 167M | if (ch < 0x80) { | 35 | | /* Fast path for runs of ASCII characters. Given that common UTF-8 | 36 | | input will consist of an overwhelming majority of ASCII | 37 | | characters, we try to optimize for this case by checking | 38 | | as many characters as a C 'size_t' can contain. | 39 | | First, check if we can do an aligned read, as most CPUs have | 40 | | a penalty for unaligned reads. | 41 | | */ | 42 | 14.0M | if (_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) { | 43 | | /* Help register allocation */ | 44 | 1.87M | const char *_s = s; | 45 | 1.87M | STRINGLIB_CHAR *_p = p; | 46 | 38.4M | while (_s + SIZEOF_SIZE_T <= end) { | 47 | | /* Read a whole size_t at a time (either 4 or 8 bytes), | 48 | | and do a fast unrolled copy if it only contains ASCII | 49 | | characters. 
*/ | 50 | 38.2M | size_t value = *(const size_t *) _s; | 51 | 38.2M | if (value & ASCII_CHAR_MASK) | 52 | 1.75M | break; | 53 | 36.5M | #if PY_LITTLE_ENDIAN | 54 | 36.5M | _p[0] = (STRINGLIB_CHAR)(value & 0xFFu); | 55 | 36.5M | _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 56 | 36.5M | _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 57 | 36.5M | _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 58 | 36.5M | # if SIZEOF_SIZE_T == 8 | 59 | 36.5M | _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 60 | 36.5M | _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 61 | 36.5M | _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 62 | 36.5M | _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 63 | 36.5M | # endif | 64 | | #else | 65 | | # if SIZEOF_SIZE_T == 8 | 66 | | _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 67 | | _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 68 | | _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 69 | | _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 70 | | _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 71 | | _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 72 | | _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 73 | | _p[7] = (STRINGLIB_CHAR)(value & 0xFFu); | 74 | | # else | 75 | | _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 76 | | _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 77 | | _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 78 | | _p[3] = (STRINGLIB_CHAR)(value & 0xFFu); | 79 | | # endif | 80 | | #endif | 81 | 36.5M | _s += SIZEOF_SIZE_T; | 82 | 36.5M | _p += SIZEOF_SIZE_T; | 83 | 36.5M | } | 84 | 1.87M | s = _s; | 85 | 1.87M | p = _p; | 86 | 1.87M | if (s == end) | 87 | 10.7k | break; | 88 | 1.86M | ch = (unsigned char)*s; | 89 | 1.86M | } | 90 | 14.0M | if (ch < 0x80) { | 91 | 13.8M | s++; | 92 | 13.8M | *p++ = ch; | 93 | 13.8M | continue; | 94 | 13.8M | } | 95 | 14.0M | } | 96 | | | 97 | 153M | if (ch < 0xE0) { | 98 | | /* \xC2\x80-\xDF\xBF -- 0080-07FF */ | 99 | 31.7M | Py_UCS4 ch2; | 100 | 
31.7M | if (ch < 0xC2) { | 101 | | /* invalid sequence | 102 | | \x80-\xBF -- continuation byte | 103 | | \xC0-\xC1 -- fake 0000-007F */ | 104 | 19.9k | goto InvalidStart; | 105 | 19.9k | } | 106 | 31.7M | if (end - s < 2) { | 107 | | /* unexpected end of data: the caller will decide whether | 108 | | it's an error or not */ | 109 | 95 | break; | 110 | 95 | } | 111 | 31.7M | ch2 = (unsigned char)s[1]; | 112 | 31.7M | if (!IS_CONTINUATION_BYTE(ch2)) | 113 | | /* invalid continuation byte */ | 114 | 2.71k | goto InvalidContinuation1; | 115 | 31.7M | ch = (ch << 6) + ch2 - | 116 | 31.7M | ((0xC0 << 6) + 0x80); | 117 | 31.7M | assert ((ch > 0x007F) && (ch <= 0x07FF)); | 118 | 31.7M | s += 2; | 119 | 31.7M | if (STRINGLIB_MAX_CHAR <= 0x007F || | 120 | 31.7M | (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR)) | 121 | | /* Out-of-range */ | 122 | 0 | goto Return; | 123 | 31.7M | *p++ = ch; | 124 | 31.7M | continue; | 125 | 31.7M | } | 126 | | | 127 | 121M | if (ch < 0xF0) { | 128 | | /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */ | 129 | 114M | Py_UCS4 ch2, ch3; | 130 | 114M | if (end - s < 3) { | 131 | | /* unexpected end of data: the caller will decide whether | 132 | | it's an error or not */ | 133 | 423 | if (end - s < 2) | 134 | 170 | break; | 135 | 253 | ch2 = (unsigned char)s[1]; | 136 | 253 | if (!IS_CONTINUATION_BYTE(ch2) || | 137 | 253 | (ch2 < 0xA0 ? 
ch == 0xE0 : ch == 0xED)) | 138 | | /* for clarification see comments below */ | 139 | 114 | goto InvalidContinuation1; | 140 | 139 | break; | 141 | 253 | } | 142 | 114M | ch2 = (unsigned char)s[1]; | 143 | 114M | ch3 = (unsigned char)s[2]; | 144 | 114M | if (!IS_CONTINUATION_BYTE(ch2)) { | 145 | | /* invalid continuation byte */ | 146 | 3.40k | goto InvalidContinuation1; | 147 | 3.40k | } | 148 | 114M | if (ch == 0xE0) { | 149 | 65.5M | if (ch2 < 0xA0) | 150 | | /* invalid sequence | 151 | | \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */ | 152 | 285 | goto InvalidContinuation1; | 153 | 65.5M | } else if (ch == 0xED && ch2 >= 0xA0) { | 154 | | /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF | 155 | | will result in surrogates in range D800-DFFF. Surrogates are | 156 | | not valid UTF-8 so they are rejected. | 157 | | See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf | 158 | | (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ | 159 | 1.21k | goto InvalidContinuation1; | 160 | 1.21k | } | 161 | 114M | if (!IS_CONTINUATION_BYTE(ch3)) { | 162 | | /* invalid continuation byte */ | 163 | 1.11k | goto InvalidContinuation2; | 164 | 1.11k | } | 165 | 114M | ch = (ch << 12) + (ch2 << 6) + ch3 - | 166 | 114M | ((0xE0 << 12) + (0x80 << 6) + 0x80); | 167 | 114M | assert ((ch > 0x07FF) && (ch <= 0xFFFF)); | 168 | 114M | s += 3; | 169 | 114M | if (STRINGLIB_MAX_CHAR <= 0x07FF || | 170 | 114M | (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR)) | 171 | | /* Out-of-range */ | 172 | 0 | goto Return; | 173 | 114M | *p++ = ch; | 174 | 114M | continue; | 175 | 114M | } | 176 | | | 177 | 7.11M | if (ch < 0xF5) { | 178 | | /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */ | 179 | 7.10M | Py_UCS4 ch2, ch3, ch4; | 180 | 7.10M | if (end - s < 4) { | 181 | | /* unexpected end of data: the caller will decide whether | 182 | | it's an error or not */ | 183 | 610 | if (end - s < 2) | 184 | 173 | break; | 185 | 437 | ch2 = (unsigned char)s[1]; | 
186 | 437 | if (!IS_CONTINUATION_BYTE(ch2) || | 187 | 437 | (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4)) | 188 | | /* for clarification see comments below */ | 189 | 209 | goto InvalidContinuation1; | 190 | 228 | if (end - s < 3) | 191 | 97 | break; | 192 | 131 | ch3 = (unsigned char)s[2]; | 193 | 131 | if (!IS_CONTINUATION_BYTE(ch3)) | 194 | 54 | goto InvalidContinuation2; | 195 | 77 | break; | 196 | 131 | } | 197 | 7.10M | ch2 = (unsigned char)s[1]; | 198 | 7.10M | ch3 = (unsigned char)s[2]; | 199 | 7.10M | ch4 = (unsigned char)s[3]; | 200 | 7.10M | if (!IS_CONTINUATION_BYTE(ch2)) { | 201 | | /* invalid continuation byte */ | 202 | 5.42k | goto InvalidContinuation1; | 203 | 5.42k | } | 204 | 7.10M | if (ch == 0xF0) { | 205 | 117k | if (ch2 < 0x90) | 206 | | /* invalid sequence | 207 | | \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */ | 208 | 441 | goto InvalidContinuation1; | 209 | 6.98M | } else if (ch == 0xF4 && ch2 >= 0x90) { | 210 | | /* invalid sequence | 211 | | \xF4\x90\x80\x80- -- 110000- overflow */ | 212 | 433 | goto InvalidContinuation1; | 213 | 433 | } | 214 | 7.09M | if (!IS_CONTINUATION_BYTE(ch3)) { | 215 | | /* invalid continuation byte */ | 216 | 1.43k | goto InvalidContinuation2; | 217 | 1.43k | } | 218 | 7.09M | if (!IS_CONTINUATION_BYTE(ch4)) { | 219 | | /* invalid continuation byte */ | 220 | 1.17k | goto InvalidContinuation3; | 221 | 1.17k | } | 222 | 7.09M | ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 - | 223 | 7.09M | ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80); | 224 | 7.09M | assert ((ch > 0xFFFF) && (ch <= 0x10FFFF)); | 225 | 7.09M | s += 4; | 226 | 7.09M | if (STRINGLIB_MAX_CHAR <= 0xFFFF || | 227 | 7.09M | (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR)) | 228 | | /* Out-of-range */ | 229 | 0 | goto Return; | 230 | 7.09M | *p++ = ch; | 231 | 7.09M | continue; | 232 | 7.09M | } | 233 | 5.76k | goto InvalidStart; | 234 | 7.11M | } | 235 | 303k | ch = 0; | 236 | 347k | Return: | 237 | 347k | *inptr = s; | 238 | 
347k | *outpos = p - dest; | 239 | 347k | return ch; | 240 | 25.7k | InvalidStart: | 241 | 25.7k | ch = 1; | 242 | 25.7k | goto Return; | 243 | 14.2k | InvalidContinuation1: | 244 | 14.2k | ch = 2; | 245 | 14.2k | goto Return; | 246 | 2.60k | InvalidContinuation2: | 247 | 2.60k | ch = 3; | 248 | 2.60k | goto Return; | 249 | 1.17k | InvalidContinuation3: | 250 | 1.17k | ch = 4; | 251 | 1.17k | goto Return; | 252 | 303k | } |
|
253 | | |
254 | | #undef ASCII_CHAR_MASK |
255 | | |
256 | | |
257 | | /* UTF-8 encoder specialized for a Unicode kind to avoid the slow |
258 | | PyUnicode_READ() macro. Parts of the code are compiled out depending on the kind: |
259 | | UCS-1 strings don't need to handle surrogates, for example. Encodes the `size` characters at `data` into `writer`, which is initialized here and preallocated with size * max_char_size bytes. `unicode`, `error_handler` and `errors` are only consulted when a surrogate is encountered. Returns a pointer to the next free byte of the output buffer, or NULL on failure. */ |
260 | | Py_LOCAL_INLINE(char *) |
261 | | STRINGLIB(utf8_encoder)(_PyBytesWriter *writer, |
262 | | PyObject *unicode, |
263 | | const STRINGLIB_CHAR *data, |
264 | | Py_ssize_t size, |
265 | | _Py_error_handler error_handler, |
266 | | const char *errors) |
267 | 11.5k | { |
268 | 11.5k | Py_ssize_t i; /* index into data of next input character */ |
269 | 11.5k | char *p; /* next free byte in output buffer */ |
270 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
271 | | PyObject *error_handler_obj = NULL; |
272 | | PyObject *exc = NULL; |
273 | | PyObject *rep = NULL; |
274 | | #endif |
275 | | #if STRINGLIB_SIZEOF_CHAR == 1 |
276 | | const Py_ssize_t max_char_size = 2; |
277 | | #elif STRINGLIB_SIZEOF_CHAR == 2 |
278 | | const Py_ssize_t max_char_size = 3; |
279 | | #else /* STRINGLIB_SIZEOF_CHAR == 4 */ |
280 | | const Py_ssize_t max_char_size = 4; |
281 | | #endif |
282 | | /* max_char_size: largest number of UTF-8 bytes one character of this kind can need; used to size the initial allocation */ |
283 | 11.5k | assert(size >= 0); |
284 | 11.5k | if (size > PY_SSIZE_T_MAX / max_char_size) { |
285 | | /* size * max_char_size would overflow Py_ssize_t */ |
286 | 0 | PyErr_NoMemory(); |
287 | 0 | return NULL; |
288 | 0 | } |
289 | | |
290 | 11.5k | _PyBytesWriter_Init(writer); |
291 | 11.5k | p = _PyBytesWriter_Alloc(writer, size * max_char_size); |
292 | 11.5k | if (p == NULL) |
293 | 0 | return NULL; |
294 | | /* Main loop: i is advanced past ch as soon as it is read */ |
295 | 18.6M | for (i = 0; i < size;) { |
296 | 18.5M | Py_UCS4 ch = data[i++]; |
297 | | |
298 | 18.5M | if (ch < 0x80) { |
299 | | /* Encode ASCII: a single byte, copied unchanged */ |
300 | 17.2M | *p++ = (char) ch; |
301 | | |
302 | 17.2M | } |
303 | 992k | else |
304 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
305 | 992k | if (ch < 0x0800) |
306 | 293k | #endif |
307 | 637k | { |
308 | | /* Two-byte sequence: every non-ASCII Latin-1 character and, for wider kinds, any ch below 0x0800 */ |
309 | 637k | *p++ = (char)(0xc0 | (ch >> 6)); |
310 | 637k | *p++ = (char)(0x80 | (ch & 0x3f)); |
311 | 637k | } |
312 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
313 | 699k | else if (Py_UNICODE_IS_SURROGATE(ch)) { |
314 | 29 | Py_ssize_t startpos, endpos, newpos; |
315 | 29 | Py_ssize_t k; |
316 | 29 | if (error_handler == _Py_ERROR_UNKNOWN) { |
317 | 23 | error_handler = _Py_GetErrorHandler(errors); |
318 | 23 | } |
319 | | /* [startpos, endpos) is the run of consecutive surrogates that begins at ch */ |
320 | 29 | startpos = i-1; |
321 | 29 | endpos = startpos+1; |
322 | | |
323 | 43 | while ((endpos < size) && Py_UNICODE_IS_SURROGATE(data[endpos])) |
324 | 14 | endpos++; |
325 | | |
326 | | /* Only overallocate the buffer if it's not the last write */ |
327 | 29 | writer->overallocate = (endpos < size); |
328 | | |
329 | 29 | switch (error_handler) |
330 | 29 | { |
331 | 0 | case _Py_ERROR_REPLACE: |
332 | 0 | memset(p, '?', endpos - startpos); |
333 | 0 | p += (endpos - startpos); |
334 | 0 | _Py_FALLTHROUGH; |
335 | 0 | case _Py_ERROR_IGNORE: |
336 | 0 | i += (endpos - startpos - 1); |
337 | 0 | break; |
338 | | /* surrogatepass: write each surrogate of the run as an ordinary three-byte sequence */ |
339 | 0 | case _Py_ERROR_SURROGATEPASS: |
340 | 0 | for (k=startpos; k<endpos; k++) { |
341 | 0 | ch = data[k]; |
342 | 0 | *p++ = (char)(0xe0 | (ch >> 12)); |
343 | 0 | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); |
344 | 0 | *p++ = (char)(0x80 | (ch & 0x3f)); |
345 | 0 | } |
346 | 0 | i += (endpos - startpos - 1); |
347 | 0 | break; |
348 | | |
349 | 0 | case _Py_ERROR_BACKSLASHREPLACE: |
350 | | /* subtract preallocated bytes */ |
351 | 0 | writer->min_size -= max_char_size * (endpos - startpos); |
352 | 0 | p = backslashreplace(writer, p, |
353 | 0 | unicode, startpos, endpos); |
354 | 0 | if (p == NULL) |
355 | 0 | goto error; |
356 | 0 | i += (endpos - startpos - 1); |
357 | 0 | break; |
358 | | |
359 | 0 | case _Py_ERROR_XMLCHARREFREPLACE: |
360 | | /* subtract preallocated bytes */ |
361 | 0 | writer->min_size -= max_char_size * (endpos - startpos); |
362 | 0 | p = xmlcharrefreplace(writer, p, |
363 | 0 | unicode, startpos, endpos); |
364 | 0 | if (p == NULL) |
365 | 0 | goto error; |
366 | 0 | i += (endpos - startpos - 1); |
367 | 0 | break; |
368 | | /* surrogateescape: surrogates in 0xDC80..0xDCFF are written as the single byte (ch & 0xff); the first surrogate outside that range stops the loop and the rest of the run falls through to the generic handler below */ |
369 | 0 | case _Py_ERROR_SURROGATEESCAPE: |
370 | 0 | for (k=startpos; k<endpos; k++) { |
371 | 0 | ch = data[k]; |
372 | 0 | if (!(0xDC80 <= ch && ch <= 0xDCFF)) |
373 | 0 | break; |
374 | 0 | *p++ = (char)(ch & 0xff); |
375 | 0 | } |
376 | 0 | if (k >= endpos) { |
377 | 0 | i += (endpos - startpos - 1); |
378 | 0 | break; |
379 | 0 | } |
380 | 0 | startpos = k; |
381 | 0 | assert(startpos < endpos); |
382 | 0 | _Py_FALLTHROUGH; |
383 | 29 | default: |
384 | 29 | rep = unicode_encode_call_errorhandler( |
385 | 29 | errors, &error_handler_obj, "utf-8", "surrogates not allowed", |
386 | 29 | unicode, &exc, startpos, endpos, &newpos); |
387 | 29 | if (!rep) |
388 | 29 | goto error; |
389 | | /* newpos is the position the handler reported; i is set to it below. If it lies before startpos, input will be read again, so room for those characters is reserved first */ |
390 | 0 | if (newpos < startpos) { |
391 | 0 | writer->overallocate = 1; |
392 | 0 | p = _PyBytesWriter_Prepare(writer, p, |
393 | 0 | max_char_size * (startpos - newpos)); |
394 | 0 | if (p == NULL) |
395 | 0 | goto error; |
396 | 0 | } |
397 | 0 | else { |
398 | | /* subtract preallocated bytes */ |
399 | 0 | writer->min_size -= max_char_size * (newpos - startpos); |
400 | | /* Only overallocate the buffer if it's not the last write */ |
401 | 0 | writer->overallocate = (newpos < size); |
402 | 0 | } |
403 | | /* Append the replacement: bytes are copied as-is, anything else is treated as str and must be pure ASCII */ |
404 | 0 | if (PyBytes_Check(rep)) { |
405 | 0 | p = _PyBytesWriter_WriteBytes(writer, p, |
406 | 0 | PyBytes_AS_STRING(rep), |
407 | 0 | PyBytes_GET_SIZE(rep)); |
408 | 0 | } |
409 | 0 | else { |
410 | | /* rep is unicode */ |
411 | 0 | if (!PyUnicode_IS_ASCII(rep)) { |
412 | 0 | raise_encode_exception(&exc, "utf-8", unicode, |
413 | 0 | startpos, endpos, |
414 | 0 | "surrogates not allowed"); |
415 | 0 | goto error; |
416 | 0 | } |
417 | | |
418 | 0 | p = _PyBytesWriter_WriteBytes(writer, p, |
419 | 0 | PyUnicode_DATA(rep), |
420 | 0 | PyUnicode_GET_LENGTH(rep)); |
421 | 0 | } |
422 | | |
423 | 0 | if (p == NULL) |
424 | 0 | goto error; |
425 | 0 | Py_CLEAR(rep); |
426 | |

427 | 0 | i = newpos; |
428 | 29 | } |
429 | | |
430 | | /* If overallocation was disabled, ensure that it was the last |
431 | | write. Otherwise, we missed an optimization */ |
432 | 0 | assert(writer->overallocate || i == size); |
433 | 0 | } |
434 | 59.9k | else |
435 | | #if STRINGLIB_SIZEOF_CHAR > 2 |
436 | 59.9k | if (ch < 0x10000) |
437 | 49.9k | #endif |
438 | 689k | { /* Three-byte sequence: non-surrogate ch from 0x0800 up to 0xFFFF */ |
439 | 689k | *p++ = (char)(0xe0 | (ch >> 12)); |
440 | 689k | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); |
441 | 689k | *p++ = (char)(0x80 | (ch & 0x3f)); |
442 | 689k | } |
443 | | #if STRINGLIB_SIZEOF_CHAR > 2 |
444 | | else /* ch >= 0x10000 */ |
445 | 10.0k | { |
446 | 10.0k | assert(ch <= MAX_UNICODE); |
447 | | /* Encode UCS4 Unicode ordinals as a four-byte sequence */ |
448 | 10.0k | *p++ = (char)(0xf0 | (ch >> 18)); |
449 | 10.0k | *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); |
450 | 10.0k | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); |
451 | 10.0k | *p++ = (char)(0x80 | (ch & 0x3f)); |
452 | 10.0k | } |
453 | | #endif /* STRINGLIB_SIZEOF_CHAR > 2 */ |
454 | | #endif /* STRINGLIB_SIZEOF_CHAR > 1 */ |
455 | 18.5M | } |
456 | | |
457 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
458 | 7.21k | Py_XDECREF(error_handler_obj); |
459 | 7.21k | Py_XDECREF(exc); |
460 | | #endif |
461 | 7.21k | return p; |
462 | | |
463 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
464 | 29 | error: |
465 | 29 | Py_XDECREF(rep); |
466 | 29 | Py_XDECREF(error_handler_obj); |
467 | 29 | Py_XDECREF(exc); |
468 | 29 | return NULL; |
469 | | #endif |
470 | 7.24k | } unicodeobject.c:ucs1lib_utf8_encoder Line | Count | Source | 267 | 4.28k | { | 268 | 4.28k | Py_ssize_t i; /* index into data of next input character */ | 269 | 4.28k | char *p; /* next free byte in output buffer */ | 270 | | #if STRINGLIB_SIZEOF_CHAR > 1 | 271 | | PyObject *error_handler_obj = NULL; | 272 | | PyObject *exc = NULL; | 273 | | PyObject *rep = NULL; | 274 | | #endif | 275 | 4.28k | #if STRINGLIB_SIZEOF_CHAR == 1 | 276 | 4.28k | const Py_ssize_t max_char_size = 2; | 277 | | #elif STRINGLIB_SIZEOF_CHAR == 2 | 278 | | const Py_ssize_t max_char_size = 3; | 279 | | #else /* STRINGLIB_SIZEOF_CHAR == 4 */ | 280 | | const Py_ssize_t max_char_size = 4; | 281 | | #endif | 282 | | | 283 | 4.28k | assert(size >= 0); | 284 | 4.28k | if (size > PY_SSIZE_T_MAX / max_char_size) { | 285 | | /* integer overflow */ | 286 | 0 | PyErr_NoMemory(); | 287 | 0 | return NULL; | 288 | 0 | } | 289 | | | 290 | 4.28k | _PyBytesWriter_Init(writer); | 291 | 4.28k | p = _PyBytesWriter_Alloc(writer, size * max_char_size); | 292 | 4.28k | if (p == NULL) | 293 | 0 | return NULL; | 294 | | | 295 | 1.54M | for (i = 0; i < size;) { | 296 | 1.54M | Py_UCS4 ch = data[i++]; | 297 | | | 298 | 1.54M | if (ch < 0x80) { | 299 | | /* Encode ASCII */ | 300 | 1.19M | *p++ = (char) ch; | 301 | | | 302 | 1.19M | } | 303 | 344k | else | 304 | | #if STRINGLIB_SIZEOF_CHAR > 1 | 305 | | if (ch < 0x0800) | 306 | | #endif | 307 | 344k | { | 308 | | /* Encode Latin-1 */ | 309 | 344k | *p++ = (char)(0xc0 | (ch >> 6)); | 310 | 344k | *p++ = (char)(0x80 | (ch & 0x3f)); | 311 | 344k | } | 312 | | #if STRINGLIB_SIZEOF_CHAR > 1 | 313 | | else if (Py_UNICODE_IS_SURROGATE(ch)) { | 314 | | Py_ssize_t startpos, endpos, newpos; | 315 | | Py_ssize_t k; | 316 | | if (error_handler == _Py_ERROR_UNKNOWN) { | 317 | | error_handler = _Py_GetErrorHandler(errors); | 318 | | } | 319 | | | 320 | | startpos = i-1; | 321 | | endpos = startpos+1; | 322 | | | 323 | | while ((endpos < size) && 
Py_UNICODE_IS_SURROGATE(data[endpos])) | 324 | | endpos++; | 325 | | | 326 | | /* Only overallocate the buffer if it's not the last write */ | 327 | | writer->overallocate = (endpos < size); | 328 | | | 329 | | switch (error_handler) | 330 | | { | 331 | | case _Py_ERROR_REPLACE: | 332 | | memset(p, '?', endpos - startpos); | 333 | | p += (endpos - startpos); | 334 | | _Py_FALLTHROUGH; | 335 | | case _Py_ERROR_IGNORE: | 336 | | i += (endpos - startpos - 1); | 337 | | break; | 338 | | | 339 | | case _Py_ERROR_SURROGATEPASS: | 340 | | for (k=startpos; k<endpos; k++) { | 341 | | ch = data[k]; | 342 | | *p++ = (char)(0xe0 | (ch >> 12)); | 343 | | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 344 | | *p++ = (char)(0x80 | (ch & 0x3f)); | 345 | | } | 346 | | i += (endpos - startpos - 1); | 347 | | break; | 348 | | | 349 | | case _Py_ERROR_BACKSLASHREPLACE: | 350 | | /* subtract preallocated bytes */ | 351 | | writer->min_size -= max_char_size * (endpos - startpos); | 352 | | p = backslashreplace(writer, p, | 353 | | unicode, startpos, endpos); | 354 | | if (p == NULL) | 355 | | goto error; | 356 | | i += (endpos - startpos - 1); | 357 | | break; | 358 | | | 359 | | case _Py_ERROR_XMLCHARREFREPLACE: | 360 | | /* subtract preallocated bytes */ | 361 | | writer->min_size -= max_char_size * (endpos - startpos); | 362 | | p = xmlcharrefreplace(writer, p, | 363 | | unicode, startpos, endpos); | 364 | | if (p == NULL) | 365 | | goto error; | 366 | | i += (endpos - startpos - 1); | 367 | | break; | 368 | | | 369 | | case _Py_ERROR_SURROGATEESCAPE: | 370 | | for (k=startpos; k<endpos; k++) { | 371 | | ch = data[k]; | 372 | | if (!(0xDC80 <= ch && ch <= 0xDCFF)) | 373 | | break; | 374 | | *p++ = (char)(ch & 0xff); | 375 | | } | 376 | | if (k >= endpos) { | 377 | | i += (endpos - startpos - 1); | 378 | | break; | 379 | | } | 380 | | startpos = k; | 381 | | assert(startpos < endpos); | 382 | | _Py_FALLTHROUGH; | 383 | | default: | 384 | | rep = unicode_encode_call_errorhandler( | 385 | 
| errors, &error_handler_obj, "utf-8", "surrogates not allowed", | 386 | | unicode, &exc, startpos, endpos, &newpos); | 387 | | if (!rep) | 388 | | goto error; | 389 | | | 390 | | if (newpos < startpos) { | 391 | | writer->overallocate = 1; | 392 | | p = _PyBytesWriter_Prepare(writer, p, | 393 | | max_char_size * (startpos - newpos)); | 394 | | if (p == NULL) | 395 | | goto error; | 396 | | } | 397 | | else { | 398 | | /* subtract preallocated bytes */ | 399 | | writer->min_size -= max_char_size * (newpos - startpos); | 400 | | /* Only overallocate the buffer if it's not the last write */ | 401 | | writer->overallocate = (newpos < size); | 402 | | } | 403 | | | 404 | | if (PyBytes_Check(rep)) { | 405 | | p = _PyBytesWriter_WriteBytes(writer, p, | 406 | | PyBytes_AS_STRING(rep), | 407 | | PyBytes_GET_SIZE(rep)); | 408 | | } | 409 | | else { | 410 | | /* rep is unicode */ | 411 | | if (!PyUnicode_IS_ASCII(rep)) { | 412 | | raise_encode_exception(&exc, "utf-8", unicode, | 413 | | startpos, endpos, | 414 | | "surrogates not allowed"); | 415 | | goto error; | 416 | | } | 417 | | | 418 | | p = _PyBytesWriter_WriteBytes(writer, p, | 419 | | PyUnicode_DATA(rep), | 420 | | PyUnicode_GET_LENGTH(rep)); | 421 | | } | 422 | | | 423 | | if (p == NULL) | 424 | | goto error; | 425 | | Py_CLEAR(rep); | 426 | | | 427 | | i = newpos; | 428 | | } | 429 | | | 430 | | /* If overallocation was disabled, ensure that it was the last | 431 | | write. 
Otherwise, we missed an optimization */ | 432 | | assert(writer->overallocate || i == size); | 433 | | } | 434 | | else | 435 | | #if STRINGLIB_SIZEOF_CHAR > 2 | 436 | | if (ch < 0x10000) | 437 | | #endif | 438 | | { | 439 | | *p++ = (char)(0xe0 | (ch >> 12)); | 440 | | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 441 | | *p++ = (char)(0x80 | (ch & 0x3f)); | 442 | | } | 443 | | #if STRINGLIB_SIZEOF_CHAR > 2 | 444 | | else /* ch >= 0x10000 */ | 445 | | { | 446 | | assert(ch <= MAX_UNICODE); | 447 | | /* Encode UCS4 Unicode ordinals */ | 448 | | *p++ = (char)(0xf0 | (ch >> 18)); | 449 | | *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); | 450 | | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 451 | | *p++ = (char)(0x80 | (ch & 0x3f)); | 452 | | } | 453 | | #endif /* STRINGLIB_SIZEOF_CHAR > 2 */ | 454 | | #endif /* STRINGLIB_SIZEOF_CHAR > 1 */ | 455 | 1.54M | } | 456 | | | 457 | | #if STRINGLIB_SIZEOF_CHAR > 1 | 458 | | Py_XDECREF(error_handler_obj); | 459 | | Py_XDECREF(exc); | 460 | | #endif | 461 | 4.28k | return p; | 462 | | | 463 | | #if STRINGLIB_SIZEOF_CHAR > 1 | 464 | | error: | 465 | | Py_XDECREF(rep); | 466 | | Py_XDECREF(error_handler_obj); | 467 | | Py_XDECREF(exc); | 468 | | return NULL; | 469 | | #endif | 470 | 4.28k | } |
unicodeobject.c:ucs2lib_utf8_encoder Line | Count | Source | 267 | 5.72k | { | 268 | 5.72k | Py_ssize_t i; /* index into data of next input character */ | 269 | 5.72k | char *p; /* next free byte in output buffer */ | 270 | 5.72k | #if STRINGLIB_SIZEOF_CHAR > 1 | 271 | 5.72k | PyObject *error_handler_obj = NULL; | 272 | 5.72k | PyObject *exc = NULL; | 273 | 5.72k | PyObject *rep = NULL; | 274 | 5.72k | #endif | 275 | | #if STRINGLIB_SIZEOF_CHAR == 1 | 276 | | const Py_ssize_t max_char_size = 2; | 277 | | #elif STRINGLIB_SIZEOF_CHAR == 2 | 278 | | const Py_ssize_t max_char_size = 3; | 279 | | #else /* STRINGLIB_SIZEOF_CHAR == 4 */ | 280 | | const Py_ssize_t max_char_size = 4; | 281 | | #endif | 282 | | | 283 | 5.72k | assert(size >= 0); | 284 | 5.72k | if (size > PY_SSIZE_T_MAX / max_char_size) { | 285 | | /* integer overflow */ | 286 | 0 | PyErr_NoMemory(); | 287 | 0 | return NULL; | 288 | 0 | } | 289 | | | 290 | 5.72k | _PyBytesWriter_Init(writer); | 291 | 5.72k | p = _PyBytesWriter_Alloc(writer, size * max_char_size); | 292 | 5.72k | if (p == NULL) | 293 | 0 | return NULL; | 294 | | | 295 | 10.2M | for (i = 0; i < size;) { | 296 | 10.2M | Py_UCS4 ch = data[i++]; | 297 | | | 298 | 10.2M | if (ch < 0x80) { | 299 | | /* Encode ASCII */ | 300 | 9.37M | *p++ = (char) ch; | 301 | | | 302 | 9.37M | } | 303 | 911k | else | 304 | 911k | #if STRINGLIB_SIZEOF_CHAR > 1 | 305 | 911k | if (ch < 0x0800) | 306 | 271k | #endif | 307 | 271k | { | 308 | | /* Encode Latin-1 */ | 309 | 271k | *p++ = (char)(0xc0 | (ch >> 6)); | 310 | 271k | *p++ = (char)(0x80 | (ch & 0x3f)); | 311 | 271k | } | 312 | 639k | #if STRINGLIB_SIZEOF_CHAR > 1 | 313 | 639k | else if (Py_UNICODE_IS_SURROGATE(ch)) { | 314 | 12 | Py_ssize_t startpos, endpos, newpos; | 315 | 12 | Py_ssize_t k; | 316 | 12 | if (error_handler == _Py_ERROR_UNKNOWN) { | 317 | 9 | error_handler = _Py_GetErrorHandler(errors); | 318 | 9 | } | 319 | | | 320 | 12 | startpos = i-1; | 321 | 12 | endpos = startpos+1; | 322 | | | 323 | 20 | 
while ((endpos < size) && Py_UNICODE_IS_SURROGATE(data[endpos])) | 324 | 8 | endpos++; | 325 | | | 326 | | /* Only overallocate the buffer if it's not the last write */ | 327 | 12 | writer->overallocate = (endpos < size); | 328 | | | 329 | 12 | switch (error_handler) | 330 | 12 | { | 331 | 0 | case _Py_ERROR_REPLACE: | 332 | 0 | memset(p, '?', endpos - startpos); | 333 | 0 | p += (endpos - startpos); | 334 | 0 | _Py_FALLTHROUGH; | 335 | 0 | case _Py_ERROR_IGNORE: | 336 | 0 | i += (endpos - startpos - 1); | 337 | 0 | break; | 338 | | | 339 | 0 | case _Py_ERROR_SURROGATEPASS: | 340 | 0 | for (k=startpos; k<endpos; k++) { | 341 | 0 | ch = data[k]; | 342 | 0 | *p++ = (char)(0xe0 | (ch >> 12)); | 343 | 0 | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 344 | 0 | *p++ = (char)(0x80 | (ch & 0x3f)); | 345 | 0 | } | 346 | 0 | i += (endpos - startpos - 1); | 347 | 0 | break; | 348 | | | 349 | 0 | case _Py_ERROR_BACKSLASHREPLACE: | 350 | | /* subtract preallocated bytes */ | 351 | 0 | writer->min_size -= max_char_size * (endpos - startpos); | 352 | 0 | p = backslashreplace(writer, p, | 353 | 0 | unicode, startpos, endpos); | 354 | 0 | if (p == NULL) | 355 | 0 | goto error; | 356 | 0 | i += (endpos - startpos - 1); | 357 | 0 | break; | 358 | | | 359 | 0 | case _Py_ERROR_XMLCHARREFREPLACE: | 360 | | /* subtract preallocated bytes */ | 361 | 0 | writer->min_size -= max_char_size * (endpos - startpos); | 362 | 0 | p = xmlcharrefreplace(writer, p, | 363 | 0 | unicode, startpos, endpos); | 364 | 0 | if (p == NULL) | 365 | 0 | goto error; | 366 | 0 | i += (endpos - startpos - 1); | 367 | 0 | break; | 368 | | | 369 | 0 | case _Py_ERROR_SURROGATEESCAPE: | 370 | 0 | for (k=startpos; k<endpos; k++) { | 371 | 0 | ch = data[k]; | 372 | 0 | if (!(0xDC80 <= ch && ch <= 0xDCFF)) | 373 | 0 | break; | 374 | 0 | *p++ = (char)(ch & 0xff); | 375 | 0 | } | 376 | 0 | if (k >= endpos) { | 377 | 0 | i += (endpos - startpos - 1); | 378 | 0 | break; | 379 | 0 | } | 380 | 0 | startpos = k; | 381 | 0 | 
assert(startpos < endpos); | 382 | 0 | _Py_FALLTHROUGH; | 383 | 12 | default: | 384 | 12 | rep = unicode_encode_call_errorhandler( | 385 | 12 | errors, &error_handler_obj, "utf-8", "surrogates not allowed", | 386 | 12 | unicode, &exc, startpos, endpos, &newpos); | 387 | 12 | if (!rep) | 388 | 12 | goto error; | 389 | | | 390 | 0 | if (newpos < startpos) { | 391 | 0 | writer->overallocate = 1; | 392 | 0 | p = _PyBytesWriter_Prepare(writer, p, | 393 | 0 | max_char_size * (startpos - newpos)); | 394 | 0 | if (p == NULL) | 395 | 0 | goto error; | 396 | 0 | } | 397 | 0 | else { | 398 | | /* subtract preallocated bytes */ | 399 | 0 | writer->min_size -= max_char_size * (newpos - startpos); | 400 | | /* Only overallocate the buffer if it's not the last write */ | 401 | 0 | writer->overallocate = (newpos < size); | 402 | 0 | } | 403 | | | 404 | 0 | if (PyBytes_Check(rep)) { | 405 | 0 | p = _PyBytesWriter_WriteBytes(writer, p, | 406 | 0 | PyBytes_AS_STRING(rep), | 407 | 0 | PyBytes_GET_SIZE(rep)); | 408 | 0 | } | 409 | 0 | else { | 410 | | /* rep is unicode */ | 411 | 0 | if (!PyUnicode_IS_ASCII(rep)) { | 412 | 0 | raise_encode_exception(&exc, "utf-8", unicode, | 413 | 0 | startpos, endpos, | 414 | 0 | "surrogates not allowed"); | 415 | 0 | goto error; | 416 | 0 | } | 417 | | | 418 | 0 | p = _PyBytesWriter_WriteBytes(writer, p, | 419 | 0 | PyUnicode_DATA(rep), | 420 | 0 | PyUnicode_GET_LENGTH(rep)); | 421 | 0 | } | 422 | | | 423 | 0 | if (p == NULL) | 424 | 0 | goto error; | 425 | 0 | Py_CLEAR(rep); | 426 | |
| 427 | 0 | i = newpos; | 428 | 12 | } | 429 | | | 430 | | /* If overallocation was disabled, ensure that it was the last | 431 | | write. Otherwise, we missed an optimization */ | 432 | 0 | assert(writer->overallocate || i == size); | 433 | 0 | } | 434 | 639k | else | 435 | | #if STRINGLIB_SIZEOF_CHAR > 2 | 436 | | if (ch < 0x10000) | 437 | | #endif | 438 | 639k | { | 439 | 639k | *p++ = (char)(0xe0 | (ch >> 12)); | 440 | 639k | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 441 | 639k | *p++ = (char)(0x80 | (ch & 0x3f)); | 442 | 639k | } | 443 | | #if STRINGLIB_SIZEOF_CHAR > 2 | 444 | | else /* ch >= 0x10000 */ | 445 | | { | 446 | | assert(ch <= MAX_UNICODE); | 447 | | /* Encode UCS4 Unicode ordinals */ | 448 | | *p++ = (char)(0xf0 | (ch >> 18)); | 449 | | *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); | 450 | | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 451 | | *p++ = (char)(0x80 | (ch & 0x3f)); | 452 | | } | 453 | | #endif /* STRINGLIB_SIZEOF_CHAR > 2 */ | 454 | 10.2M | #endif /* STRINGLIB_SIZEOF_CHAR > 1 */ | 455 | 10.2M | } | 456 | | | 457 | 5.70k | #if STRINGLIB_SIZEOF_CHAR > 1 | 458 | 5.70k | Py_XDECREF(error_handler_obj); | 459 | 5.70k | Py_XDECREF(exc); | 460 | 5.70k | #endif | 461 | 5.70k | return p; | 462 | | | 463 | 0 | #if STRINGLIB_SIZEOF_CHAR > 1 | 464 | 12 | error: | 465 | 12 | Py_XDECREF(rep); | 466 | 12 | Py_XDECREF(error_handler_obj); | 467 | 12 | Py_XDECREF(exc); | 468 | 12 | return NULL; | 469 | 5.72k | #endif | 470 | 5.72k | } |
unicodeobject.c:ucs4lib_utf8_encoder Line | Count | Source | 267 | 1.52k | { | 268 | 1.52k | Py_ssize_t i; /* index into data of next input character */ | 269 | 1.52k | char *p; /* next free byte in output buffer */ | 270 | 1.52k | #if STRINGLIB_SIZEOF_CHAR > 1 | 271 | 1.52k | PyObject *error_handler_obj = NULL; | 272 | 1.52k | PyObject *exc = NULL; | 273 | 1.52k | PyObject *rep = NULL; | 274 | 1.52k | #endif | 275 | | #if STRINGLIB_SIZEOF_CHAR == 1 | 276 | | const Py_ssize_t max_char_size = 2; | 277 | | #elif STRINGLIB_SIZEOF_CHAR == 2 | 278 | | const Py_ssize_t max_char_size = 3; | 279 | | #else /* STRINGLIB_SIZEOF_CHAR == 4 */ | 280 | 1.52k | const Py_ssize_t max_char_size = 4; | 281 | 1.52k | #endif | 282 | | | 283 | 1.52k | assert(size >= 0); | 284 | 1.52k | if (size > PY_SSIZE_T_MAX / max_char_size) { | 285 | | /* integer overflow */ | 286 | 0 | PyErr_NoMemory(); | 287 | 0 | return NULL; | 288 | 0 | } | 289 | | | 290 | 1.52k | _PyBytesWriter_Init(writer); | 291 | 1.52k | p = _PyBytesWriter_Alloc(writer, size * max_char_size); | 292 | 1.52k | if (p == NULL) | 293 | 0 | return NULL; | 294 | | | 295 | 6.77M | for (i = 0; i < size;) { | 296 | 6.77M | Py_UCS4 ch = data[i++]; | 297 | | | 298 | 6.77M | if (ch < 0x80) { | 299 | | /* Encode ASCII */ | 300 | 6.69M | *p++ = (char) ch; | 301 | | | 302 | 6.69M | } | 303 | 81.5k | else | 304 | 81.5k | #if STRINGLIB_SIZEOF_CHAR > 1 | 305 | 81.5k | if (ch < 0x0800) | 306 | 21.5k | #endif | 307 | 21.5k | { | 308 | | /* Encode Latin-1 */ | 309 | 21.5k | *p++ = (char)(0xc0 | (ch >> 6)); | 310 | 21.5k | *p++ = (char)(0x80 | (ch & 0x3f)); | 311 | 21.5k | } | 312 | 60.0k | #if STRINGLIB_SIZEOF_CHAR > 1 | 313 | 60.0k | else if (Py_UNICODE_IS_SURROGATE(ch)) { | 314 | 17 | Py_ssize_t startpos, endpos, newpos; | 315 | 17 | Py_ssize_t k; | 316 | 17 | if (error_handler == _Py_ERROR_UNKNOWN) { | 317 | 14 | error_handler = _Py_GetErrorHandler(errors); | 318 | 14 | } | 319 | | | 320 | 17 | startpos = i-1; | 321 | 17 | endpos = startpos+1; 
| 322 | | | 323 | 23 | while ((endpos < size) && Py_UNICODE_IS_SURROGATE(data[endpos])) | 324 | 6 | endpos++; | 325 | | | 326 | | /* Only overallocate the buffer if it's not the last write */ | 327 | 17 | writer->overallocate = (endpos < size); | 328 | | | 329 | 17 | switch (error_handler) | 330 | 17 | { | 331 | 0 | case _Py_ERROR_REPLACE: | 332 | 0 | memset(p, '?', endpos - startpos); | 333 | 0 | p += (endpos - startpos); | 334 | 0 | _Py_FALLTHROUGH; | 335 | 0 | case _Py_ERROR_IGNORE: | 336 | 0 | i += (endpos - startpos - 1); | 337 | 0 | break; | 338 | | | 339 | 0 | case _Py_ERROR_SURROGATEPASS: | 340 | 0 | for (k=startpos; k<endpos; k++) { | 341 | 0 | ch = data[k]; | 342 | 0 | *p++ = (char)(0xe0 | (ch >> 12)); | 343 | 0 | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 344 | 0 | *p++ = (char)(0x80 | (ch & 0x3f)); | 345 | 0 | } | 346 | 0 | i += (endpos - startpos - 1); | 347 | 0 | break; | 348 | | | 349 | 0 | case _Py_ERROR_BACKSLASHREPLACE: | 350 | | /* subtract preallocated bytes */ | 351 | 0 | writer->min_size -= max_char_size * (endpos - startpos); | 352 | 0 | p = backslashreplace(writer, p, | 353 | 0 | unicode, startpos, endpos); | 354 | 0 | if (p == NULL) | 355 | 0 | goto error; | 356 | 0 | i += (endpos - startpos - 1); | 357 | 0 | break; | 358 | | | 359 | 0 | case _Py_ERROR_XMLCHARREFREPLACE: | 360 | | /* subtract preallocated bytes */ | 361 | 0 | writer->min_size -= max_char_size * (endpos - startpos); | 362 | 0 | p = xmlcharrefreplace(writer, p, | 363 | 0 | unicode, startpos, endpos); | 364 | 0 | if (p == NULL) | 365 | 0 | goto error; | 366 | 0 | i += (endpos - startpos - 1); | 367 | 0 | break; | 368 | | | 369 | 0 | case _Py_ERROR_SURROGATEESCAPE: | 370 | 0 | for (k=startpos; k<endpos; k++) { | 371 | 0 | ch = data[k]; | 372 | 0 | if (!(0xDC80 <= ch && ch <= 0xDCFF)) | 373 | 0 | break; | 374 | 0 | *p++ = (char)(ch & 0xff); | 375 | 0 | } | 376 | 0 | if (k >= endpos) { | 377 | 0 | i += (endpos - startpos - 1); | 378 | 0 | break; | 379 | 0 | } | 380 | 0 | 
startpos = k; | 381 | 0 | assert(startpos < endpos); | 382 | 0 | _Py_FALLTHROUGH; | 383 | 17 | default: | 384 | 17 | rep = unicode_encode_call_errorhandler( | 385 | 17 | errors, &error_handler_obj, "utf-8", "surrogates not allowed", | 386 | 17 | unicode, &exc, startpos, endpos, &newpos); | 387 | 17 | if (!rep) | 388 | 17 | goto error; | 389 | | | 390 | 0 | if (newpos < startpos) { | 391 | 0 | writer->overallocate = 1; | 392 | 0 | p = _PyBytesWriter_Prepare(writer, p, | 393 | 0 | max_char_size * (startpos - newpos)); | 394 | 0 | if (p == NULL) | 395 | 0 | goto error; | 396 | 0 | } | 397 | 0 | else { | 398 | | /* subtract preallocated bytes */ | 399 | 0 | writer->min_size -= max_char_size * (newpos - startpos); | 400 | | /* Only overallocate the buffer if it's not the last write */ | 401 | 0 | writer->overallocate = (newpos < size); | 402 | 0 | } | 403 | | | 404 | 0 | if (PyBytes_Check(rep)) { | 405 | 0 | p = _PyBytesWriter_WriteBytes(writer, p, | 406 | 0 | PyBytes_AS_STRING(rep), | 407 | 0 | PyBytes_GET_SIZE(rep)); | 408 | 0 | } | 409 | 0 | else { | 410 | | /* rep is unicode */ | 411 | 0 | if (!PyUnicode_IS_ASCII(rep)) { | 412 | 0 | raise_encode_exception(&exc, "utf-8", unicode, | 413 | 0 | startpos, endpos, | 414 | 0 | "surrogates not allowed"); | 415 | 0 | goto error; | 416 | 0 | } | 417 | | | 418 | 0 | p = _PyBytesWriter_WriteBytes(writer, p, | 419 | 0 | PyUnicode_DATA(rep), | 420 | 0 | PyUnicode_GET_LENGTH(rep)); | 421 | 0 | } | 422 | | | 423 | 0 | if (p == NULL) | 424 | 0 | goto error; | 425 | 0 | Py_CLEAR(rep); | 426 | |
| 427 | 0 | i = newpos; | 428 | 17 | } | 429 | | | 430 | | /* If overallocation was disabled, ensure that it was the last | 431 | | write. Otherwise, we missed an optimization */ | 432 | 0 | assert(writer->overallocate || i == size); | 433 | 0 | } | 434 | 59.9k | else | 435 | 59.9k | #if STRINGLIB_SIZEOF_CHAR > 2 | 436 | 59.9k | if (ch < 0x10000) | 437 | 49.9k | #endif | 438 | 49.9k | { | 439 | 49.9k | *p++ = (char)(0xe0 | (ch >> 12)); | 440 | 49.9k | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 441 | 49.9k | *p++ = (char)(0x80 | (ch & 0x3f)); | 442 | 49.9k | } | 443 | 10.0k | #if STRINGLIB_SIZEOF_CHAR > 2 | 444 | 10.0k | else /* ch >= 0x10000 */ | 445 | 10.0k | { | 446 | 10.0k | assert(ch <= MAX_UNICODE); | 447 | | /* Encode UCS4 Unicode ordinals */ | 448 | 10.0k | *p++ = (char)(0xf0 | (ch >> 18)); | 449 | 10.0k | *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); | 450 | 10.0k | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 451 | 10.0k | *p++ = (char)(0x80 | (ch & 0x3f)); | 452 | 10.0k | } | 453 | 6.77M | #endif /* STRINGLIB_SIZEOF_CHAR > 2 */ | 454 | 6.77M | #endif /* STRINGLIB_SIZEOF_CHAR > 1 */ | 455 | 6.77M | } | 456 | | | 457 | 1.50k | #if STRINGLIB_SIZEOF_CHAR > 1 | 458 | 1.50k | Py_XDECREF(error_handler_obj); | 459 | 1.50k | Py_XDECREF(exc); | 460 | 1.50k | #endif | 461 | 1.50k | return p; | 462 | | | 463 | 0 | #if STRINGLIB_SIZEOF_CHAR > 1 | 464 | 17 | error: | 465 | 17 | Py_XDECREF(rep); | 466 | 17 | Py_XDECREF(error_handler_obj); | 467 | 17 | Py_XDECREF(exc); | 468 | 17 | return NULL; | 469 | 1.52k | #endif | 470 | 1.52k | } |
Unexecuted instantiation: unicodeobject.c:asciilib_utf8_encoder |
471 | | |
472 | | /* The pattern for constructing UCS2-repeated masks. */ |
473 | | #if SIZEOF_LONG == 8 |
474 | 3.53M | # define UCS2_REPEAT_MASK 0x0001000100010001ul |
475 | | #elif SIZEOF_LONG == 4 |
476 | | # define UCS2_REPEAT_MASK 0x00010001ul |
477 | | #else |
478 | | # error C 'long' size should be either 4 or 8! |
479 | | #endif |
480 | | |
481 | | /* The mask for fast checking. */ |
482 | | #if STRINGLIB_SIZEOF_CHAR == 1 |
483 | | /* The mask for fast checking of whether a C 'long' contains a |
484 | | non-ASCII or non-Latin1 UTF16-encoded characters. */ |
485 | 159 | # define FAST_CHAR_MASK (UCS2_REPEAT_MASK * (0xFFFFu & ~STRINGLIB_MAX_CHAR)) |
486 | | #else |
487 | | /* The mask for fast checking of whether a C 'long' may contain |
488 | | UTF16-encoded surrogate characters. This is an efficient heuristic, |
489 | | assuming that non-surrogate characters with a code point >= 0x8000 are |
490 | | rare in most input. |
491 | | */ |
492 | 622k | # define FAST_CHAR_MASK (UCS2_REPEAT_MASK * 0x8000u) |
493 | | #endif |
494 | | /* The mask for fast byte-swapping. */ |
495 | 2.91M | #define STRIPPED_MASK (UCS2_REPEAT_MASK * 0x00FFu) |
496 | | /* Swap bytes. */ |
497 | 1.45M | #define SWAB(value) ((((value) >> 8) & STRIPPED_MASK) | \ |
498 | 1.45M | (((value) & STRIPPED_MASK) << 8)) |
499 | | |
500 | | Py_LOCAL_INLINE(Py_UCS4) |
501 | | STRINGLIB(utf16_decode)(const unsigned char **inptr, const unsigned char *e, |
502 | | STRINGLIB_CHAR *dest, Py_ssize_t *outpos, |
503 | | int native_ordering) |
504 | 31.8k | { |
505 | 31.8k | Py_UCS4 ch; |
506 | 31.8k | const unsigned char *q = *inptr; |
507 | 31.8k | STRINGLIB_CHAR *p = dest + *outpos; |
508 | | /* Offsets from q for retrieving byte pairs in the right order. */ |
509 | 31.8k | #if PY_LITTLE_ENDIAN |
510 | 31.8k | int ihi = !!native_ordering, ilo = !native_ordering; |
511 | | #else |
512 | | int ihi = !native_ordering, ilo = !!native_ordering; |
513 | | #endif |
514 | 31.8k | --e; |
515 | | |
516 | 194k | while (q < e) { |
517 | 193k | Py_UCS4 ch2; |
518 | | /* First check for possible aligned read of a C 'long'. Unaligned |
519 | | reads are more expensive, better to defer to another iteration. */ |
520 | 193k | if (_Py_IS_ALIGNED(q, ALIGNOF_LONG)) { |
521 | | /* Fast path for runs of in-range non-surrogate chars. */ |
522 | 48.5k | const unsigned char *_q = q; |
523 | 1.35M | while (_q + SIZEOF_LONG <= e) { |
524 | 1.35M | unsigned long block = * (const unsigned long *) _q; |
525 | 1.35M | if (native_ordering) { |
526 | | /* Can use buffer directly */ |
527 | 622k | if (block & FAST_CHAR_MASK) |
528 | 42.1k | break; |
529 | 622k | } |
530 | 732k | else { |
531 | | /* Need to byte-swap */ |
532 | 732k | if (block & SWAB(FAST_CHAR_MASK)) |
533 | 6.19k | break; |
534 | | #if STRINGLIB_SIZEOF_CHAR == 1 |
535 | 0 | block >>= 8; |
536 | | #else |
537 | 725k | block = SWAB(block); |
538 | | #endif |
539 | 725k | } |
540 | 1.30M | #if PY_LITTLE_ENDIAN |
541 | | # if SIZEOF_LONG == 4 |
542 | | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); |
543 | | p[1] = (STRINGLIB_CHAR)(block >> 16); |
544 | | # elif SIZEOF_LONG == 8 |
545 | 1.30M | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); |
546 | 1.30M | p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); |
547 | 1.30M | p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); |
548 | 1.30M | p[3] = (STRINGLIB_CHAR)(block >> 48); |
549 | 1.30M | # endif |
550 | | #else |
551 | | # if SIZEOF_LONG == 4 |
552 | | p[0] = (STRINGLIB_CHAR)(block >> 16); |
553 | | p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu); |
554 | | # elif SIZEOF_LONG == 8 |
555 | | p[0] = (STRINGLIB_CHAR)(block >> 48); |
556 | | p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); |
557 | | p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); |
558 | | p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu); |
559 | | # endif |
560 | | #endif |
561 | 1.30M | _q += SIZEOF_LONG; |
562 | 1.30M | p += SIZEOF_LONG / 2; |
563 | 1.30M | } |
564 | 48.5k | q = _q; |
565 | 48.5k | if (q >= e) |
566 | 5 | break; |
567 | 48.5k | } |
568 | | |
569 | 193k | ch = (q[ihi] << 8) | q[ilo]; |
570 | 193k | q += 2; |
571 | 193k | if (!Py_UNICODE_IS_SURROGATE(ch)) { |
572 | | #if STRINGLIB_SIZEOF_CHAR < 2 |
573 | 198 | if (ch > STRINGLIB_MAX_CHAR) |
574 | | /* Out-of-range */ |
575 | 185 | goto Return; |
576 | 13 | #endif |
577 | 13 | *p++ = (STRINGLIB_CHAR)ch; |
578 | 13 | continue; |
579 | 150k | } |
580 | | |
581 | | /* UTF-16 code pair: */ |
582 | 43.6k | if (!Py_UNICODE_IS_HIGH_SURROGATE(ch)) |
583 | 27.8k | goto IllegalEncoding; |
584 | 15.8k | if (q >= e) |
585 | 11 | goto UnexpectedEnd; |
586 | 15.8k | ch2 = (q[ihi] << 8) | q[ilo]; |
587 | 15.8k | q += 2; |
588 | 15.8k | if (!Py_UNICODE_IS_LOW_SURROGATE(ch2)) |
589 | 3.52k | goto IllegalSurrogate; |
590 | 12.3k | ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2); |
591 | | #if STRINGLIB_SIZEOF_CHAR < 4 |
592 | | /* Out-of-range */ |
593 | 125 | goto Return; |
594 | | #else |
595 | | *p++ = (STRINGLIB_CHAR)ch; |
596 | | #endif |
597 | 12.2k | } |
598 | 198 | ch = 0; |
599 | 31.8k | Return: |
600 | 31.8k | *inptr = q; |
601 | 31.8k | *outpos = p - dest; |
602 | 31.8k | return ch; |
603 | 11 | UnexpectedEnd: |
604 | 11 | ch = 1; |
605 | 11 | goto Return; |
606 | 27.8k | IllegalEncoding: |
607 | 27.8k | ch = 2; |
608 | 27.8k | goto Return; |
609 | 3.52k | IllegalSurrogate: |
610 | 3.52k | ch = 3; |
611 | 3.52k | goto Return; |
612 | 198 | } unicodeobject.c:asciilib_utf16_decode Line | Count | Source | 504 | 229 | { | 505 | 229 | Py_UCS4 ch; | 506 | 229 | const unsigned char *q = *inptr; | 507 | 229 | STRINGLIB_CHAR *p = dest + *outpos; | 508 | | /* Offsets from q for retrieving byte pairs in the right order. */ | 509 | 229 | #if PY_LITTLE_ENDIAN | 510 | 229 | int ihi = !!native_ordering, ilo = !native_ordering; | 511 | | #else | 512 | | int ihi = !native_ordering, ilo = !!native_ordering; | 513 | | #endif | 514 | 229 | --e; | 515 | | | 516 | 242 | while (q < e) { | 517 | 242 | Py_UCS4 ch2; | 518 | | /* First check for possible aligned read of a C 'long'. Unaligned | 519 | | reads are more expensive, better to defer to another iteration. */ | 520 | 242 | if (_Py_IS_ALIGNED(q, ALIGNOF_LONG)) { | 521 | | /* Fast path for runs of in-range non-surrogate chars. */ | 522 | 224 | const unsigned char *_q = q; | 523 | 224 | while (_q + SIZEOF_LONG <= e) { | 524 | 224 | unsigned long block = * (const unsigned long *) _q; | 525 | 224 | if (native_ordering) { | 526 | | /* Can use buffer directly */ | 527 | 159 | if (block & FAST_CHAR_MASK) | 528 | 159 | break; | 529 | 159 | } | 530 | 65 | else { | 531 | | /* Need to byte-swap */ | 532 | 65 | if (block & SWAB(FAST_CHAR_MASK)) | 533 | 65 | break; | 534 | 0 | #if STRINGLIB_SIZEOF_CHAR == 1 | 535 | 0 | block >>= 8; | 536 | | #else | 537 | | block = SWAB(block); | 538 | | #endif | 539 | 0 | } | 540 | 0 | #if PY_LITTLE_ENDIAN | 541 | | # if SIZEOF_LONG == 4 | 542 | | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 543 | | p[1] = (STRINGLIB_CHAR)(block >> 16); | 544 | | # elif SIZEOF_LONG == 8 | 545 | 0 | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 546 | 0 | p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); | 547 | 0 | p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); | 548 | 0 | p[3] = (STRINGLIB_CHAR)(block >> 48); | 549 | 0 | # endif | 550 | | #else | 551 | | # if SIZEOF_LONG == 4 | 552 | | p[0] = (STRINGLIB_CHAR)(block >> 16); | 553 | | p[1] = 
(STRINGLIB_CHAR)(block & 0xFFFFu); | 554 | | # elif SIZEOF_LONG == 8 | 555 | | p[0] = (STRINGLIB_CHAR)(block >> 48); | 556 | | p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); | 557 | | p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); | 558 | | p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 559 | | # endif | 560 | | #endif | 561 | 0 | _q += SIZEOF_LONG; | 562 | 0 | p += SIZEOF_LONG / 2; | 563 | 0 | } | 564 | 224 | q = _q; | 565 | 224 | if (q >= e) | 566 | 0 | break; | 567 | 224 | } | 568 | | | 569 | 242 | ch = (q[ihi] << 8) | q[ilo]; | 570 | 242 | q += 2; | 571 | 242 | if (!Py_UNICODE_IS_SURROGATE(ch)) { | 572 | 197 | #if STRINGLIB_SIZEOF_CHAR < 2 | 573 | 197 | if (ch > STRINGLIB_MAX_CHAR) | 574 | | /* Out-of-range */ | 575 | 184 | goto Return; | 576 | 13 | #endif | 577 | 13 | *p++ = (STRINGLIB_CHAR)ch; | 578 | 13 | continue; | 579 | 197 | } | 580 | | | 581 | | /* UTF-16 code pair: */ | 582 | 45 | if (!Py_UNICODE_IS_HIGH_SURROGATE(ch)) | 583 | 5 | goto IllegalEncoding; | 584 | 40 | if (q >= e) | 585 | 0 | goto UnexpectedEnd; | 586 | 40 | ch2 = (q[ihi] << 8) | q[ilo]; | 587 | 40 | q += 2; | 588 | 40 | if (!Py_UNICODE_IS_LOW_SURROGATE(ch2)) | 589 | 4 | goto IllegalSurrogate; | 590 | 36 | ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2); | 591 | 36 | #if STRINGLIB_SIZEOF_CHAR < 4 | 592 | | /* Out-of-range */ | 593 | 36 | goto Return; | 594 | | #else | 595 | | *p++ = (STRINGLIB_CHAR)ch; | 596 | | #endif | 597 | 40 | } | 598 | 0 | ch = 0; | 599 | 229 | Return: | 600 | 229 | *inptr = q; | 601 | 229 | *outpos = p - dest; | 602 | 229 | return ch; | 603 | 0 | UnexpectedEnd: | 604 | 0 | ch = 1; | 605 | 0 | goto Return; | 606 | 5 | IllegalEncoding: | 607 | 5 | ch = 2; | 608 | 5 | goto Return; | 609 | 4 | IllegalSurrogate: | 610 | 4 | ch = 3; | 611 | 4 | goto Return; | 612 | 0 | } |
unicodeobject.c:ucs1lib_utf16_decode Line | Count | Source | 504 | 2 | { | 505 | 2 | Py_UCS4 ch; | 506 | 2 | const unsigned char *q = *inptr; | 507 | 2 | STRINGLIB_CHAR *p = dest + *outpos; | 508 | | /* Offsets from q for retrieving byte pairs in the right order. */ | 509 | 2 | #if PY_LITTLE_ENDIAN | 510 | 2 | int ihi = !!native_ordering, ilo = !native_ordering; | 511 | | #else | 512 | | int ihi = !native_ordering, ilo = !!native_ordering; | 513 | | #endif | 514 | 2 | --e; | 515 | | | 516 | 2 | while (q < e) { | 517 | 2 | Py_UCS4 ch2; | 518 | | /* First check for possible aligned read of a C 'long'. Unaligned | 519 | | reads are more expensive, better to defer to another iteration. */ | 520 | 2 | if (_Py_IS_ALIGNED(q, ALIGNOF_LONG)) { | 521 | | /* Fast path for runs of in-range non-surrogate chars. */ | 522 | 0 | const unsigned char *_q = q; | 523 | 0 | while (_q + SIZEOF_LONG <= e) { | 524 | 0 | unsigned long block = * (const unsigned long *) _q; | 525 | 0 | if (native_ordering) { | 526 | | /* Can use buffer directly */ | 527 | 0 | if (block & FAST_CHAR_MASK) | 528 | 0 | break; | 529 | 0 | } | 530 | 0 | else { | 531 | | /* Need to byte-swap */ | 532 | 0 | if (block & SWAB(FAST_CHAR_MASK)) | 533 | 0 | break; | 534 | 0 | #if STRINGLIB_SIZEOF_CHAR == 1 | 535 | 0 | block >>= 8; | 536 | | #else | 537 | | block = SWAB(block); | 538 | | #endif | 539 | 0 | } | 540 | 0 | #if PY_LITTLE_ENDIAN | 541 | | # if SIZEOF_LONG == 4 | 542 | | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 543 | | p[1] = (STRINGLIB_CHAR)(block >> 16); | 544 | | # elif SIZEOF_LONG == 8 | 545 | 0 | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 546 | 0 | p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); | 547 | 0 | p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); | 548 | 0 | p[3] = (STRINGLIB_CHAR)(block >> 48); | 549 | 0 | # endif | 550 | | #else | 551 | | # if SIZEOF_LONG == 4 | 552 | | p[0] = (STRINGLIB_CHAR)(block >> 16); | 553 | | p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 554 | | # elif SIZEOF_LONG 
== 8 | 555 | | p[0] = (STRINGLIB_CHAR)(block >> 48); | 556 | | p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); | 557 | | p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); | 558 | | p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 559 | | # endif | 560 | | #endif | 561 | 0 | _q += SIZEOF_LONG; | 562 | 0 | p += SIZEOF_LONG / 2; | 563 | 0 | } | 564 | 0 | q = _q; | 565 | 0 | if (q >= e) | 566 | 0 | break; | 567 | 0 | } | 568 | | | 569 | 2 | ch = (q[ihi] << 8) | q[ilo]; | 570 | 2 | q += 2; | 571 | 2 | if (!Py_UNICODE_IS_SURROGATE(ch)) { | 572 | 1 | #if STRINGLIB_SIZEOF_CHAR < 2 | 573 | 1 | if (ch > STRINGLIB_MAX_CHAR) | 574 | | /* Out-of-range */ | 575 | 1 | goto Return; | 576 | 0 | #endif | 577 | 0 | *p++ = (STRINGLIB_CHAR)ch; | 578 | 0 | continue; | 579 | 1 | } | 580 | | | 581 | | /* UTF-16 code pair: */ | 582 | 1 | if (!Py_UNICODE_IS_HIGH_SURROGATE(ch)) | 583 | 0 | goto IllegalEncoding; | 584 | 1 | if (q >= e) | 585 | 0 | goto UnexpectedEnd; | 586 | 1 | ch2 = (q[ihi] << 8) | q[ilo]; | 587 | 1 | q += 2; | 588 | 1 | if (!Py_UNICODE_IS_LOW_SURROGATE(ch2)) | 589 | 1 | goto IllegalSurrogate; | 590 | 0 | ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2); | 591 | 0 | #if STRINGLIB_SIZEOF_CHAR < 4 | 592 | | /* Out-of-range */ | 593 | 0 | goto Return; | 594 | | #else | 595 | | *p++ = (STRINGLIB_CHAR)ch; | 596 | | #endif | 597 | 1 | } | 598 | 0 | ch = 0; | 599 | 2 | Return: | 600 | 2 | *inptr = q; | 601 | 2 | *outpos = p - dest; | 602 | 2 | return ch; | 603 | 0 | UnexpectedEnd: | 604 | 0 | ch = 1; | 605 | 0 | goto Return; | 606 | 0 | IllegalEncoding: | 607 | 0 | ch = 2; | 608 | 0 | goto Return; | 609 | 1 | IllegalSurrogate: | 610 | 1 | ch = 3; | 611 | 1 | goto Return; | 612 | 0 | } |
unicodeobject.c:ucs2lib_utf16_decode Line | Count | Source | 504 | 2.54k | { | 505 | 2.54k | Py_UCS4 ch; | 506 | 2.54k | const unsigned char *q = *inptr; | 507 | 2.54k | STRINGLIB_CHAR *p = dest + *outpos; | 508 | | /* Offsets from q for retrieving byte pairs in the right order. */ | 509 | 2.54k | #if PY_LITTLE_ENDIAN | 510 | 2.54k | int ihi = !!native_ordering, ilo = !native_ordering; | 511 | | #else | 512 | | int ihi = !native_ordering, ilo = !!native_ordering; | 513 | | #endif | 514 | 2.54k | --e; | 515 | | | 516 | 21.9k | while (q < e) { | 517 | 21.8k | Py_UCS4 ch2; | 518 | | /* First check for possible aligned read of a C 'long'. Unaligned | 519 | | reads are more expensive, better to defer to another iteration. */ | 520 | 21.8k | if (_Py_IS_ALIGNED(q, ALIGNOF_LONG)) { | 521 | | /* Fast path for runs of in-range non-surrogate chars. */ | 522 | 5.40k | const unsigned char *_q = q; | 523 | 697k | while (_q + SIZEOF_LONG <= e) { | 524 | 697k | unsigned long block = * (const unsigned long *) _q; | 525 | 697k | if (native_ordering) { | 526 | | /* Can use buffer directly */ | 527 | 302k | if (block & FAST_CHAR_MASK) | 528 | 2.49k | break; | 529 | 302k | } | 530 | 394k | else { | 531 | | /* Need to byte-swap */ | 532 | 394k | if (block & SWAB(FAST_CHAR_MASK)) | 533 | 2.81k | break; | 534 | | #if STRINGLIB_SIZEOF_CHAR == 1 | 535 | | block >>= 8; | 536 | | #else | 537 | 391k | block = SWAB(block); | 538 | 391k | #endif | 539 | 391k | } | 540 | 692k | #if PY_LITTLE_ENDIAN | 541 | | # if SIZEOF_LONG == 4 | 542 | | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 543 | | p[1] = (STRINGLIB_CHAR)(block >> 16); | 544 | | # elif SIZEOF_LONG == 8 | 545 | 692k | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 546 | 692k | p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); | 547 | 692k | p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); | 548 | 692k | p[3] = (STRINGLIB_CHAR)(block >> 48); | 549 | 692k | # endif | 550 | | #else | 551 | | # if SIZEOF_LONG == 4 | 552 | | p[0] = 
(STRINGLIB_CHAR)(block >> 16); | 553 | | p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 554 | | # elif SIZEOF_LONG == 8 | 555 | | p[0] = (STRINGLIB_CHAR)(block >> 48); | 556 | | p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); | 557 | | p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); | 558 | | p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 559 | | # endif | 560 | | #endif | 561 | 692k | _q += SIZEOF_LONG; | 562 | 692k | p += SIZEOF_LONG / 2; | 563 | 692k | } | 564 | 5.40k | q = _q; | 565 | 5.40k | if (q >= e) | 566 | 2 | break; | 567 | 5.40k | } | 568 | | | 569 | 21.8k | ch = (q[ihi] << 8) | q[ilo]; | 570 | 21.8k | q += 2; | 571 | 21.8k | if (!Py_UNICODE_IS_SURROGATE(ch)) { | 572 | | #if STRINGLIB_SIZEOF_CHAR < 2 | 573 | | if (ch > STRINGLIB_MAX_CHAR) | 574 | | /* Out-of-range */ | 575 | | goto Return; | 576 | | #endif | 577 | 19.4k | *p++ = (STRINGLIB_CHAR)ch; | 578 | 19.4k | continue; | 579 | 19.4k | } | 580 | | | 581 | | /* UTF-16 code pair: */ | 582 | 2.45k | if (!Py_UNICODE_IS_HIGH_SURROGATE(ch)) | 583 | 291 | goto IllegalEncoding; | 584 | 2.16k | if (q >= e) | 585 | 6 | goto UnexpectedEnd; | 586 | 2.15k | ch2 = (q[ihi] << 8) | q[ilo]; | 587 | 2.15k | q += 2; | 588 | 2.15k | if (!Py_UNICODE_IS_LOW_SURROGATE(ch2)) | 589 | 2.06k | goto IllegalSurrogate; | 590 | 89 | ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2); | 591 | 89 | #if STRINGLIB_SIZEOF_CHAR < 4 | 592 | | /* Out-of-range */ | 593 | 89 | goto Return; | 594 | | #else | 595 | | *p++ = (STRINGLIB_CHAR)ch; | 596 | | #endif | 597 | 2.15k | } | 598 | 91 | ch = 0; | 599 | 2.54k | Return: | 600 | 2.54k | *inptr = q; | 601 | 2.54k | *outpos = p - dest; | 602 | 2.54k | return ch; | 603 | 6 | UnexpectedEnd: | 604 | 6 | ch = 1; | 605 | 6 | goto Return; | 606 | 291 | IllegalEncoding: | 607 | 291 | ch = 2; | 608 | 291 | goto Return; | 609 | 2.06k | IllegalSurrogate: | 610 | 2.06k | ch = 3; | 611 | 2.06k | goto Return; | 612 | 91 | } |
unicodeobject.c:ucs4lib_utf16_decode Line | Count | Source | 504 | 29.0k | { | 505 | 29.0k | Py_UCS4 ch; | 506 | 29.0k | const unsigned char *q = *inptr; | 507 | 29.0k | STRINGLIB_CHAR *p = dest + *outpos; | 508 | | /* Offsets from q for retrieving byte pairs in the right order. */ | 509 | 29.0k | #if PY_LITTLE_ENDIAN | 510 | 29.0k | int ihi = !!native_ordering, ilo = !native_ordering; | 511 | | #else | 512 | | int ihi = !native_ordering, ilo = !!native_ordering; | 513 | | #endif | 514 | 29.0k | --e; | 515 | | | 516 | 171k | while (q < e) { | 517 | 171k | Py_UCS4 ch2; | 518 | | /* First check for possible aligned read of a C 'long'. Unaligned | 519 | | reads are more expensive, better to defer to another iteration. */ | 520 | 171k | if (_Py_IS_ALIGNED(q, ALIGNOF_LONG)) { | 521 | | /* Fast path for runs of in-range non-surrogate chars. */ | 522 | 42.9k | const unsigned char *_q = q; | 523 | 657k | while (_q + SIZEOF_LONG <= e) { | 524 | 656k | unsigned long block = * (const unsigned long *) _q; | 525 | 656k | if (native_ordering) { | 526 | | /* Can use buffer directly */ | 527 | 319k | if (block & FAST_CHAR_MASK) | 528 | 39.5k | break; | 529 | 319k | } | 530 | 337k | else { | 531 | | /* Need to byte-swap */ | 532 | 337k | if (block & SWAB(FAST_CHAR_MASK)) | 533 | 3.31k | break; | 534 | | #if STRINGLIB_SIZEOF_CHAR == 1 | 535 | | block >>= 8; | 536 | | #else | 537 | 333k | block = SWAB(block); | 538 | 333k | #endif | 539 | 333k | } | 540 | 614k | #if PY_LITTLE_ENDIAN | 541 | | # if SIZEOF_LONG == 4 | 542 | | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 543 | | p[1] = (STRINGLIB_CHAR)(block >> 16); | 544 | | # elif SIZEOF_LONG == 8 | 545 | 614k | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 546 | 614k | p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); | 547 | 614k | p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); | 548 | 614k | p[3] = (STRINGLIB_CHAR)(block >> 48); | 549 | 614k | # endif | 550 | | #else | 551 | | # if SIZEOF_LONG == 4 | 552 | | p[0] = 
(STRINGLIB_CHAR)(block >> 16); | 553 | | p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 554 | | # elif SIZEOF_LONG == 8 | 555 | | p[0] = (STRINGLIB_CHAR)(block >> 48); | 556 | | p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); | 557 | | p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); | 558 | | p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 559 | | # endif | 560 | | #endif | 561 | 614k | _q += SIZEOF_LONG; | 562 | 614k | p += SIZEOF_LONG / 2; | 563 | 614k | } | 564 | 42.9k | q = _q; | 565 | 42.9k | if (q >= e) | 566 | 3 | break; | 567 | 42.9k | } | 568 | | | 569 | 171k | ch = (q[ihi] << 8) | q[ilo]; | 570 | 171k | q += 2; | 571 | 171k | if (!Py_UNICODE_IS_SURROGATE(ch)) { | 572 | | #if STRINGLIB_SIZEOF_CHAR < 2 | 573 | | if (ch > STRINGLIB_MAX_CHAR) | 574 | | /* Out-of-range */ | 575 | | goto Return; | 576 | | #endif | 577 | 130k | *p++ = (STRINGLIB_CHAR)ch; | 578 | 130k | continue; | 579 | 130k | } | 580 | | | 581 | | /* UTF-16 code pair: */ | 582 | 41.1k | if (!Py_UNICODE_IS_HIGH_SURROGATE(ch)) | 583 | 27.5k | goto IllegalEncoding; | 584 | 13.6k | if (q >= e) | 585 | 5 | goto UnexpectedEnd; | 586 | 13.6k | ch2 = (q[ihi] << 8) | q[ilo]; | 587 | 13.6k | q += 2; | 588 | 13.6k | if (!Py_UNICODE_IS_LOW_SURROGATE(ch2)) | 589 | 1.44k | goto IllegalSurrogate; | 590 | 12.2k | ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2); | 591 | | #if STRINGLIB_SIZEOF_CHAR < 4 | 592 | | /* Out-of-range */ | 593 | | goto Return; | 594 | | #else | 595 | 12.2k | *p++ = (STRINGLIB_CHAR)ch; | 596 | 12.2k | #endif | 597 | 12.2k | } | 598 | 107 | ch = 0; | 599 | 29.0k | Return: | 600 | 29.0k | *inptr = q; | 601 | 29.0k | *outpos = p - dest; | 602 | 29.0k | return ch; | 603 | 5 | UnexpectedEnd: | 604 | 5 | ch = 1; | 605 | 5 | goto Return; | 606 | 27.5k | IllegalEncoding: | 607 | 27.5k | ch = 2; | 608 | 27.5k | goto Return; | 609 | 1.44k | IllegalSurrogate: | 610 | 1.44k | ch = 3; | 611 | 1.44k | goto Return; | 612 | 107 | } |
|
/* The helper macros above are private to utf16_decode: drop them again,
   in the reverse order of their definition. */
#undef SWAB
#undef STRIPPED_MASK
#undef FAST_CHAR_MASK
#undef UCS2_REPEAT_MASK
617 | | |
618 | | |
619 | | #if STRINGLIB_MAX_CHAR >= 0x80 |
620 | | Py_LOCAL_INLINE(Py_ssize_t) |
621 | | STRINGLIB(utf16_encode)(const STRINGLIB_CHAR *in, |
622 | | Py_ssize_t len, |
623 | | unsigned short **outptr, |
624 | | int native_ordering) |
625 | 0 | { |
626 | 0 | unsigned short *out = *outptr; |
627 | 0 | const STRINGLIB_CHAR *end = in + len; |
628 | | #if STRINGLIB_SIZEOF_CHAR == 1 |
629 | 0 | if (native_ordering) { |
630 | 0 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); |
631 | 0 | while (in < unrolled_end) { |
632 | 0 | out[0] = in[0]; |
633 | 0 | out[1] = in[1]; |
634 | 0 | out[2] = in[2]; |
635 | 0 | out[3] = in[3]; |
636 | 0 | in += 4; out += 4; |
637 | 0 | } |
638 | 0 | while (in < end) { |
639 | 0 | *out++ = *in++; |
640 | 0 | } |
641 | 0 | } else { |
642 | 0 | # define SWAB2(CH) ((CH) << 8) /* high byte is zero */ |
643 | 0 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); |
644 | 0 | while (in < unrolled_end) { |
645 | 0 | out[0] = SWAB2(in[0]); |
646 | 0 | out[1] = SWAB2(in[1]); |
647 | 0 | out[2] = SWAB2(in[2]); |
648 | 0 | out[3] = SWAB2(in[3]); |
649 | 0 | in += 4; out += 4; |
650 | 0 | } |
651 | 0 | while (in < end) { |
652 | 0 | Py_UCS4 ch = *in++; |
653 | 0 | *out++ = SWAB2((Py_UCS2)ch); |
654 | 0 | } |
655 | 0 | #undef SWAB2 |
656 | 0 | } |
657 | | *outptr = out; |
658 | | return len; |
659 | | #else |
660 | 0 | if (native_ordering) { |
661 | | #if STRINGLIB_MAX_CHAR < 0x10000 |
662 | 0 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); |
663 | 0 | while (in < unrolled_end) { |
664 | | /* check if any character is a surrogate character */ |
665 | 0 | if (((in[0] ^ 0xd800) & |
666 | 0 | (in[1] ^ 0xd800) & |
667 | 0 | (in[2] ^ 0xd800) & |
668 | 0 | (in[3] ^ 0xd800) & 0xf800) == 0) |
669 | 0 | break; |
670 | 0 | out[0] = in[0]; |
671 | 0 | out[1] = in[1]; |
672 | 0 | out[2] = in[2]; |
673 | 0 | out[3] = in[3]; |
674 | 0 | in += 4; out += 4; |
675 | 0 | } |
676 | | #endif |
677 | 0 | while (in < end) { |
678 | 0 | Py_UCS4 ch; |
679 | 0 | ch = *in++; |
680 | 0 | if (ch < 0xd800) |
681 | 0 | *out++ = ch; |
682 | 0 | else if (ch < 0xe000) |
683 | | /* reject surrogate characters (U+D800-U+DFFF) */ |
684 | 0 | goto fail; |
685 | | #if STRINGLIB_MAX_CHAR >= 0x10000 |
686 | 0 | else if (ch >= 0x10000) { |
687 | 0 | out[0] = Py_UNICODE_HIGH_SURROGATE(ch); |
688 | 0 | out[1] = Py_UNICODE_LOW_SURROGATE(ch); |
689 | 0 | out += 2; |
690 | 0 | } |
691 | 0 | #endif |
692 | 0 | else |
693 | 0 | *out++ = ch; |
694 | 0 | } |
695 | 0 | } else { |
696 | 0 | #define SWAB2(CH) (((CH) << 8) | ((CH) >> 8)) |
697 | | #if STRINGLIB_MAX_CHAR < 0x10000 |
698 | 0 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); |
699 | 0 | while (in < unrolled_end) { |
700 | | /* check if any character is a surrogate character */ |
701 | 0 | if (((in[0] ^ 0xd800) & |
702 | 0 | (in[1] ^ 0xd800) & |
703 | 0 | (in[2] ^ 0xd800) & |
704 | 0 | (in[3] ^ 0xd800) & 0xf800) == 0) |
705 | 0 | break; |
706 | 0 | out[0] = SWAB2(in[0]); |
707 | 0 | out[1] = SWAB2(in[1]); |
708 | 0 | out[2] = SWAB2(in[2]); |
709 | 0 | out[3] = SWAB2(in[3]); |
710 | 0 | in += 4; out += 4; |
711 | 0 | } |
712 | | #endif |
713 | 0 | while (in < end) { |
714 | 0 | Py_UCS4 ch = *in++; |
715 | 0 | if (ch < 0xd800) |
716 | 0 | *out++ = SWAB2((Py_UCS2)ch); |
717 | 0 | else if (ch < 0xe000) |
718 | | /* reject surrogate characters (U+D800-U+DFFF) */ |
719 | 0 | goto fail; |
720 | | #if STRINGLIB_MAX_CHAR >= 0x10000 |
721 | 0 | else if (ch >= 0x10000) { |
722 | 0 | Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch); |
723 | 0 | Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch); |
724 | 0 | out[0] = SWAB2(ch1); |
725 | 0 | out[1] = SWAB2(ch2); |
726 | 0 | out += 2; |
727 | 0 | } |
728 | 0 | #endif |
729 | 0 | else |
730 | 0 | *out++ = SWAB2((Py_UCS2)ch); |
731 | 0 | } |
732 | 0 | #undef SWAB2 |
733 | 0 | } |
734 | 0 | *outptr = out; |
735 | 0 | return len; |
736 | 0 | fail: |
737 | 0 | *outptr = out; |
738 | 0 | return len - (end - in + 1); |
739 | | #endif |
740 | 0 | } Unexecuted instantiation: unicodeobject.c:ucs1lib_utf16_encode Unexecuted instantiation: unicodeobject.c:ucs2lib_utf16_encode Unexecuted instantiation: unicodeobject.c:ucs4lib_utf16_encode |
741 | | |
742 | | static inline uint32_t |
743 | | STRINGLIB(SWAB4)(STRINGLIB_CHAR ch) |
744 | 0 | { |
745 | 0 | uint32_t word = ch; |
746 | | #if STRINGLIB_SIZEOF_CHAR == 1 |
747 | | /* high bytes are zero */ |
748 | | return (word << 24); |
749 | | #elif STRINGLIB_SIZEOF_CHAR == 2 |
750 | | /* high bytes are zero */ |
751 | | return ((word & 0x00FFu) << 24) | ((word & 0xFF00u) << 8); |
752 | | #else |
753 | | return _Py_bswap32(word); |
754 | | #endif |
755 | 0 | } Unexecuted instantiation: unicodeobject.c:ucs1lib_SWAB4 Unexecuted instantiation: unicodeobject.c:ucs2lib_SWAB4 Unexecuted instantiation: unicodeobject.c:ucs4lib_SWAB4 |
756 | | |
757 | | Py_LOCAL_INLINE(Py_ssize_t) |
758 | | STRINGLIB(utf32_encode)(const STRINGLIB_CHAR *in, |
759 | | Py_ssize_t len, |
760 | | uint32_t **outptr, |
761 | | int native_ordering) |
762 | 0 | { |
763 | 0 | uint32_t *out = *outptr; |
764 | 0 | const STRINGLIB_CHAR *end = in + len; |
765 | 0 | if (native_ordering) { |
766 | 0 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); |
767 | 0 | while (in < unrolled_end) { |
768 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
769 | | /* check if any character is a surrogate character */ |
770 | 0 | if (((in[0] ^ 0xd800) & |
771 | 0 | (in[1] ^ 0xd800) & |
772 | 0 | (in[2] ^ 0xd800) & |
773 | 0 | (in[3] ^ 0xd800) & 0xf800) == 0) |
774 | 0 | break; |
775 | 0 | #endif |
776 | 0 | out[0] = in[0]; |
777 | 0 | out[1] = in[1]; |
778 | 0 | out[2] = in[2]; |
779 | 0 | out[3] = in[3]; |
780 | 0 | in += 4; out += 4; |
781 | 0 | } |
782 | 0 | while (in < end) { |
783 | 0 | Py_UCS4 ch; |
784 | 0 | ch = *in++; |
785 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
786 | 0 | if (Py_UNICODE_IS_SURROGATE(ch)) { |
787 | | /* reject surrogate characters (U+D800-U+DFFF) */ |
788 | 0 | goto fail; |
789 | 0 | } |
790 | 0 | #endif |
791 | 0 | *out++ = ch; |
792 | 0 | } |
793 | 0 | } else { |
794 | 0 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); |
795 | 0 | while (in < unrolled_end) { |
796 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
797 | | /* check if any character is a surrogate character */ |
798 | 0 | if (((in[0] ^ 0xd800) & |
799 | 0 | (in[1] ^ 0xd800) & |
800 | 0 | (in[2] ^ 0xd800) & |
801 | 0 | (in[3] ^ 0xd800) & 0xf800) == 0) |
802 | 0 | break; |
803 | 0 | #endif |
804 | 0 | out[0] = STRINGLIB(SWAB4)(in[0]); |
805 | 0 | out[1] = STRINGLIB(SWAB4)(in[1]); |
806 | 0 | out[2] = STRINGLIB(SWAB4)(in[2]); |
807 | 0 | out[3] = STRINGLIB(SWAB4)(in[3]); |
808 | 0 | in += 4; out += 4; |
809 | 0 | } |
810 | 0 | while (in < end) { |
811 | 0 | Py_UCS4 ch = *in++; |
812 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
813 | 0 | if (Py_UNICODE_IS_SURROGATE(ch)) { |
814 | | /* reject surrogate characters (U+D800-U+DFFF) */ |
815 | 0 | goto fail; |
816 | 0 | } |
817 | 0 | #endif |
818 | 0 | *out++ = STRINGLIB(SWAB4)(ch); |
819 | 0 | } |
820 | 0 | } |
821 | 0 | *outptr = out; |
822 | 0 | return len; |
823 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
824 | 0 | fail: |
825 | 0 | *outptr = out; |
826 | 0 | return len - (end - in + 1); |
827 | | #endif |
828 | 0 | } Unexecuted instantiation: unicodeobject.c:ucs1lib_utf32_encode Unexecuted instantiation: unicodeobject.c:ucs2lib_utf32_encode Unexecuted instantiation: unicodeobject.c:ucs4lib_utf32_encode |
829 | | |
830 | | #endif |