/src/cpython/Objects/stringlib/codecs.h
Line | Count | Source (jump to first uncovered line) |
1 | | /* stringlib: codec implementations */ |
2 | | |
3 | | #if !STRINGLIB_IS_UNICODE |
4 | | # error "codecs.h is specific to Unicode" |
5 | | #endif |
6 | | |
7 | | #include "pycore_bitutils.h" // _Py_bswap32() |
8 | | |
9 | | /* Mask to quickly check whether a C 'size_t' contains a |
10 | | non-ASCII, UTF8-encoded char. */ |
11 | | #if (SIZEOF_SIZE_T == 8) |
12 | 375M | # define ASCII_CHAR_MASK 0x8080808080808080ULL |
13 | | #elif (SIZEOF_SIZE_T == 4) |
14 | | # define ASCII_CHAR_MASK 0x80808080U |
15 | | #else |
16 | | # error C 'size_t' size should be either 4 or 8! |
17 | | #endif |
18 | | |
19 | | /* 10xxxxxx */ |
20 | 128M | #define IS_CONTINUATION_BYTE(ch) ((ch) >= 0x80 && (ch) < 0xC0) |
21 | | |
22 | | Py_LOCAL_INLINE(Py_UCS4) |
23 | | STRINGLIB(utf8_decode)(const char **inptr, const char *end, |
24 | | STRINGLIB_CHAR *dest, |
25 | | Py_ssize_t *outpos) |
26 | 183M | { |
27 | 183M | Py_UCS4 ch; |
28 | 183M | const char *s = *inptr; |
29 | 183M | STRINGLIB_CHAR *p = dest + *outpos; |
30 | | |
31 | 385M | while (s < end) { |
32 | 385M | ch = (unsigned char)*s; |
33 | | |
34 | 385M | if (ch < 0x80) { |
35 | | /* Fast path for runs of ASCII characters. Given that common UTF-8 |
36 | | input will consist of an overwhelming majority of ASCII |
37 | | characters, we try to optimize for this case by checking |
38 | | as many characters as a C 'size_t' can contain. |
39 | | First, check if we can do an aligned read, as most CPUs have |
40 | | a penalty for unaligned reads. |
41 | | */ |
42 | 156M | if (_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) { |
43 | | /* Help register allocation */ |
44 | 20.2M | const char *_s = s; |
45 | 20.2M | STRINGLIB_CHAR *_p = p; |
46 | 375M | while (_s + SIZEOF_SIZE_T <= end) { |
47 | | /* Read a whole size_t at a time (either 4 or 8 bytes), |
48 | | and do a fast unrolled copy if it only contains ASCII |
49 | | characters. */ |
50 | 375M | size_t value = *(const size_t *) _s; |
51 | 375M | if (value & ASCII_CHAR_MASK) |
52 | 20.0M | break; |
53 | 355M | #if PY_LITTLE_ENDIAN |
54 | 355M | _p[0] = (STRINGLIB_CHAR)(value & 0xFFu); |
55 | 355M | _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); |
56 | 355M | _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); |
57 | 355M | _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); |
58 | 355M | # if SIZEOF_SIZE_T == 8 |
59 | 355M | _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); |
60 | 355M | _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); |
61 | 355M | _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); |
62 | 355M | _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); |
63 | 355M | # endif |
64 | | #else |
65 | | # if SIZEOF_SIZE_T == 8 |
66 | | _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); |
67 | | _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); |
68 | | _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); |
69 | | _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); |
70 | | _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); |
71 | | _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); |
72 | | _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); |
73 | | _p[7] = (STRINGLIB_CHAR)(value & 0xFFu); |
74 | | # else |
75 | | _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); |
76 | | _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); |
77 | | _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); |
78 | | _p[3] = (STRINGLIB_CHAR)(value & 0xFFu); |
79 | | # endif |
80 | | #endif |
81 | 355M | _s += SIZEOF_SIZE_T; |
82 | 355M | _p += SIZEOF_SIZE_T; |
83 | 355M | } |
84 | 20.2M | s = _s; |
85 | 20.2M | p = _p; |
86 | 20.2M | if (s == end) |
87 | 10.4k | break; |
88 | 20.1M | ch = (unsigned char)*s; |
89 | 20.1M | } |
90 | 156M | if (ch < 0x80) { |
91 | 155M | s++; |
92 | 155M | *p++ = ch; |
93 | 155M | continue; |
94 | 155M | } |
95 | 156M | } |
96 | | |
97 | 229M | if (ch < 0xE0) { |
98 | | /* \xC2\x80-\xDF\xBF -- 0080-07FF */ |
99 | 95.3M | Py_UCS4 ch2; |
100 | 95.3M | if (ch < 0xC2) { |
101 | | /* invalid sequence |
102 | | \x80-\xBF -- continuation byte |
103 | | \xC0-\xC1 -- fake 0000-007F */ |
104 | 67.7M | goto InvalidStart; |
105 | 67.7M | } |
106 | 27.6M | if (end - s < 2) { |
107 | | /* unexpected end of data: the caller will decide whether |
108 | | it's an error or not */ |
109 | 8.81k | break; |
110 | 8.81k | } |
111 | 27.6M | ch2 = (unsigned char)s[1]; |
112 | 27.6M | if (!IS_CONTINUATION_BYTE(ch2)) |
113 | | /* invalid continuation byte */ |
114 | 22.3M | goto InvalidContinuation1; |
115 | 5.29M | ch = (ch << 6) + ch2 - |
116 | 5.29M | ((0xC0 << 6) + 0x80); |
117 | 5.29M | assert ((ch > 0x007F) && (ch <= 0x07FF)); |
118 | 5.29M | s += 2; |
119 | 5.29M | if (STRINGLIB_MAX_CHAR <= 0x007F || |
120 | 5.29M | (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR)) |
121 | | /* Out-of-range */ |
122 | 72.7k | goto Return; |
123 | 5.22M | *p++ = ch; |
124 | 5.22M | continue; |
125 | 5.29M | } |
126 | | |
127 | 134M | if (ch < 0xF0) { |
128 | | /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */ |
129 | 52.6M | Py_UCS4 ch2, ch3; |
130 | 52.6M | if (end - s < 3) { |
131 | | /* unexpected end of data: the caller will decide whether |
132 | | it's an error or not */ |
133 | 12.4k | if (end - s < 2) |
134 | 4.26k | break; |
135 | 8.20k | ch2 = (unsigned char)s[1]; |
136 | 8.20k | if (!IS_CONTINUATION_BYTE(ch2) || |
137 | 8.20k | (ch2 < 0xA0 ? ch == 0xE0 : ch == 0xED)) |
138 | | /* for clarification see comments below */ |
139 | 5.56k | goto InvalidContinuation1; |
140 | 2.64k | break; |
141 | 8.20k | } |
142 | 52.5M | ch2 = (unsigned char)s[1]; |
143 | 52.5M | ch3 = (unsigned char)s[2]; |
144 | 52.5M | if (!IS_CONTINUATION_BYTE(ch2)) { |
145 | | /* invalid continuation byte */ |
146 | 10.5M | goto InvalidContinuation1; |
147 | 10.5M | } |
148 | 42.0M | if (ch == 0xE0) { |
149 | 114k | if (ch2 < 0xA0) |
150 | | /* invalid sequence |
151 | | \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */ |
152 | 45.9k | goto InvalidContinuation1; |
153 | 41.8M | } else if (ch == 0xED && ch2 >= 0xA0) { |
154 | | /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF |
155 | | will result in surrogates in range D800-DFFF. Surrogates are |
156 | | not valid UTF-8 so they are rejected. |
157 | | See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf |
158 | | (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ |
159 | 71.7k | goto InvalidContinuation1; |
160 | 71.7k | } |
161 | 41.8M | if (!IS_CONTINUATION_BYTE(ch3)) { |
162 | | /* invalid continuation byte */ |
163 | 927k | goto InvalidContinuation2; |
164 | 927k | } |
165 | 40.9M | ch = (ch << 12) + (ch2 << 6) + ch3 - |
166 | 40.9M | ((0xE0 << 12) + (0x80 << 6) + 0x80); |
167 | 40.9M | assert ((ch > 0x07FF) && (ch <= 0xFFFF)); |
168 | 40.9M | s += 3; |
169 | 40.9M | if (STRINGLIB_MAX_CHAR <= 0x07FF || |
170 | 40.9M | (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR)) |
171 | | /* Out-of-range */ |
172 | 164k | goto Return; |
173 | 40.7M | *p++ = ch; |
174 | 40.7M | continue; |
175 | 40.9M | } |
176 | | |
177 | 81.6M | if (ch < 0xF5) { |
178 | | /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */ |
179 | 4.82M | Py_UCS4 ch2, ch3, ch4; |
180 | 4.82M | if (end - s < 4) { |
181 | | /* unexpected end of data: the caller will decide whether |
182 | | it's an error or not */ |
183 | 19.0k | if (end - s < 2) |
184 | 4.93k | break; |
185 | 14.1k | ch2 = (unsigned char)s[1]; |
186 | 14.1k | if (!IS_CONTINUATION_BYTE(ch2) || |
187 | 14.1k | (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4)) |
188 | | /* for clarification see comments below */ |
189 | 9.21k | goto InvalidContinuation1; |
190 | 4.95k | if (end - s < 3) |
191 | 1.58k | break; |
192 | 3.36k | ch3 = (unsigned char)s[2]; |
193 | 3.36k | if (!IS_CONTINUATION_BYTE(ch3)) |
194 | 1.90k | goto InvalidContinuation2; |
195 | 1.45k | break; |
196 | 3.36k | } |
197 | 4.80M | ch2 = (unsigned char)s[1]; |
198 | 4.80M | ch3 = (unsigned char)s[2]; |
199 | 4.80M | ch4 = (unsigned char)s[3]; |
200 | 4.80M | if (!IS_CONTINUATION_BYTE(ch2)) { |
201 | | /* invalid continuation byte */ |
202 | 3.48M | goto InvalidContinuation1; |
203 | 3.48M | } |
204 | 1.31M | if (ch == 0xF0) { |
205 | 617k | if (ch2 < 0x90) |
206 | | /* invalid sequence |
207 | | \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */ |
208 | 42.4k | goto InvalidContinuation1; |
209 | 695k | } else if (ch == 0xF4 && ch2 >= 0x90) { |
210 | | /* invalid sequence |
211 | | \xF4\x90\x80\x80- -- 110000- overflow */ |
212 | 79.4k | goto InvalidContinuation1; |
213 | 79.4k | } |
214 | 1.19M | if (!IS_CONTINUATION_BYTE(ch3)) { |
215 | | /* invalid continuation byte */ |
216 | 416k | goto InvalidContinuation2; |
217 | 416k | } |
218 | 774k | if (!IS_CONTINUATION_BYTE(ch4)) { |
219 | | /* invalid continuation byte */ |
220 | 130k | goto InvalidContinuation3; |
221 | 130k | } |
222 | 643k | ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 - |
223 | 643k | ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80); |
224 | 643k | assert ((ch > 0xFFFF) && (ch <= 0x10FFFF)); |
225 | 643k | s += 4; |
226 | 643k | if (STRINGLIB_MAX_CHAR <= 0xFFFF || |
227 | 643k | (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR)) |
228 | | /* Out-of-range */ |
229 | 34.9k | goto Return; |
230 | 609k | *p++ = ch; |
231 | 609k | continue; |
232 | 643k | } |
233 | 76.8M | goto InvalidStart; |
234 | 81.6M | } |
235 | 317k | ch = 0; |
236 | 183M | Return: |
237 | 183M | *inptr = s; |
238 | 183M | *outpos = p - dest; |
239 | 183M | return ch; |
240 | 144M | InvalidStart: |
241 | 144M | ch = 1; |
242 | 144M | goto Return; |
243 | 36.6M | InvalidContinuation1: |
244 | 36.6M | ch = 2; |
245 | 36.6M | goto Return; |
246 | 1.34M | InvalidContinuation2: |
247 | 1.34M | ch = 3; |
248 | 1.34M | goto Return; |
249 | 130k | InvalidContinuation3: |
250 | 130k | ch = 4; |
251 | 130k | goto Return; |
252 | 317k | } unicodeobject.c:asciilib_utf8_decode Line | Count | Source | 26 | 285k | { | 27 | 285k | Py_UCS4 ch; | 28 | 285k | const char *s = *inptr; | 29 | 285k | STRINGLIB_CHAR *p = dest + *outpos; | 30 | | | 31 | 285k | while (s < end) { | 32 | 285k | ch = (unsigned char)*s; | 33 | | | 34 | 285k | if (ch < 0x80) { | 35 | | /* Fast path for runs of ASCII characters. Given that common UTF-8 | 36 | | input will consist of an overwhelming majority of ASCII | 37 | | characters, we try to optimize for this case by checking | 38 | | as many characters as a C 'size_t' can contain. | 39 | | First, check if we can do an aligned read, as most CPUs have | 40 | | a penalty for unaligned reads. | 41 | | */ | 42 | 0 | if (_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) { | 43 | | /* Help register allocation */ | 44 | 0 | const char *_s = s; | 45 | 0 | STRINGLIB_CHAR *_p = p; | 46 | 0 | while (_s + SIZEOF_SIZE_T <= end) { | 47 | | /* Read a whole size_t at a time (either 4 or 8 bytes), | 48 | | and do a fast unrolled copy if it only contains ASCII | 49 | | characters. */ | 50 | 0 | size_t value = *(const size_t *) _s; | 51 | 0 | if (value & ASCII_CHAR_MASK) | 52 | 0 | break; | 53 | 0 | #if PY_LITTLE_ENDIAN | 54 | 0 | _p[0] = (STRINGLIB_CHAR)(value & 0xFFu); | 55 | 0 | _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 56 | 0 | _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 57 | 0 | _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 58 | 0 | # if SIZEOF_SIZE_T == 8 | 59 | 0 | _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 60 | 0 | _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 61 | 0 | _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 62 | 0 | _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 63 | 0 | # endif | 64 | | #else | 65 | | # if SIZEOF_SIZE_T == 8 | 66 | | _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 67 | | _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 68 | | _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 69 | | _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 70 | | _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 71 | | _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 72 | | _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 73 | | _p[7] = (STRINGLIB_CHAR)(value & 0xFFu); | 74 | | # else | 75 | | _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 76 | | _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 77 | | _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 78 | | _p[3] = (STRINGLIB_CHAR)(value & 0xFFu); | 79 | | # endif | 80 | | #endif | 81 | 0 | _s += SIZEOF_SIZE_T; | 82 | 0 | _p += SIZEOF_SIZE_T; | 83 | 0 | } | 84 | 0 | s = _s; | 85 | 0 | p = _p; | 86 | 0 | if (s == end) | 87 | 0 | break; | 88 | 0 | ch = (unsigned char)*s; | 89 | 0 | } | 90 | 0 | if (ch < 0x80) { | 91 | 0 | s++; | 92 | 0 | *p++ = ch; | 93 | 0 | continue; | 94 | 0 | } | 95 | 0 | } | 96 | | | 97 | 285k | if (ch < 0xE0) { | 98 | | /* \xC2\x80-\xDF\xBF -- 0080-07FF */ | 99 | 92.0k | Py_UCS4 ch2; | 100 | 92.0k | if (ch < 0xC2) { | 101 | | /* invalid sequence | 102 | | \x80-\xBF -- continuation byte | 103 | | \xC0-\xC1 -- fake 0000-007F */ | 104 | 14.4k | goto InvalidStart; | 105 | 14.4k | } | 106 | 77.5k | if (end - s < 2) { | 107 | | /* unexpected end of data: the caller will decide whether | 108 | | it's an error or not */ | 109 | 1.32k | break; | 110 | 1.32k | } | 111 | 76.2k | ch2 = (unsigned char)s[1]; | 112 | 76.2k | if (!IS_CONTINUATION_BYTE(ch2)) | 113 | | /* invalid continuation byte */ | 114 | 5.25k | goto InvalidContinuation1; | 115 | 71.0k | ch = (ch << 6) + ch2 - | 116 | 71.0k | ((0xC0 << 6) + 0x80); | 117 | 71.0k | assert ((ch > 0x007F) && (ch <= 0x07FF)); | 118 | 71.0k | s += 2; | 119 | 71.0k | if (STRINGLIB_MAX_CHAR <= 0x007F || | 120 | 71.0k | (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR)) | 121 | | /* Out-of-range */ | 122 | 71.0k | goto Return; | 123 | 0 | *p++ = ch; | 124 | 0 | continue; | 125 | 71.0k | } | 126 | | | 127 | 193k | if (ch < 0xF0) { | 128 | | /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */ | 129 | 162k | Py_UCS4 ch2, ch3; | 130 | 162k | if (end - s < 3) { | 131 | | /* unexpected end of data: the caller will decide whether | 132 | | it's an error or not */ | 133 | 2.64k | if (end - s < 2) | 134 | 1.04k | break; | 135 | 1.59k | ch2 = (unsigned char)s[1]; | 136 | 1.59k | if (!IS_CONTINUATION_BYTE(ch2) || | 137 | 1.59k | (ch2 < 0xA0 ? ch == 0xE0 : ch == 0xED)) | 138 | | /* for clarification see comments below */ | 139 | 1.10k | goto InvalidContinuation1; | 140 | 493 | break; | 141 | 1.59k | } | 142 | 159k | ch2 = (unsigned char)s[1]; | 143 | 159k | ch3 = (unsigned char)s[2]; | 144 | 159k | if (!IS_CONTINUATION_BYTE(ch2)) { | 145 | | /* invalid continuation byte */ | 146 | 3.32k | goto InvalidContinuation1; | 147 | 3.32k | } | 148 | 156k | if (ch == 0xE0) { | 149 | 1.16k | if (ch2 < 0xA0) | 150 | | /* invalid sequence | 151 | | \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */ | 152 | 330 | goto InvalidContinuation1; | 153 | 155k | } else if (ch == 0xED && ch2 >= 0xA0) { | 154 | | /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF | 155 | | will result in surrogates in range D800-DFFF. Surrogates are | 156 | | not valid UTF-8 so they are rejected. | 157 | | See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf | 158 | | (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ | 159 | 471 | goto InvalidContinuation1; | 160 | 471 | } | 161 | 155k | if (!IS_CONTINUATION_BYTE(ch3)) { | 162 | | /* invalid continuation byte */ | 163 | 2.43k | goto InvalidContinuation2; | 164 | 2.43k | } | 165 | 152k | ch = (ch << 12) + (ch2 << 6) + ch3 - | 166 | 152k | ((0xE0 << 12) + (0x80 << 6) + 0x80); | 167 | 152k | assert ((ch > 0x07FF) && (ch <= 0xFFFF)); | 168 | 152k | s += 3; | 169 | 152k | if (STRINGLIB_MAX_CHAR <= 0x07FF || | 170 | 152k | (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR)) | 171 | | /* Out-of-range */ | 172 | 152k | goto Return; | 173 | 0 | *p++ = ch; | 174 | 0 | continue; | 175 | 152k | } | 176 | | | 177 | 31.4k | if (ch < 0xF5) { | 178 | | /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */ | 179 | 22.4k | Py_UCS4 ch2, ch3, ch4; | 180 | 22.4k | if (end - s < 4) { | 181 | | /* unexpected end of data: the caller will decide whether | 182 | | it's an error or not */ | 183 | 7.12k | if (end - s < 2) | 184 | 2.41k | break; | 185 | 4.70k | ch2 = (unsigned char)s[1]; | 186 | 4.70k | if (!IS_CONTINUATION_BYTE(ch2) || | 187 | 4.70k | (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4)) | 188 | | /* for clarification see comments below */ | 189 | 3.61k | goto InvalidContinuation1; | 190 | 1.09k | if (end - s < 3) | 191 | 504 | break; | 192 | 586 | ch3 = (unsigned char)s[2]; | 193 | 586 | if (!IS_CONTINUATION_BYTE(ch3)) | 194 | 447 | goto InvalidContinuation2; | 195 | 139 | break; | 196 | 586 | } | 197 | 15.3k | ch2 = (unsigned char)s[1]; | 198 | 15.3k | ch3 = (unsigned char)s[2]; | 199 | 15.3k | ch4 = (unsigned char)s[3]; | 200 | 15.3k | if (!IS_CONTINUATION_BYTE(ch2)) { | 201 | | /* invalid continuation byte */ | 202 | 2.76k | goto InvalidContinuation1; | 203 | 2.76k | } | 204 | 12.5k | if (ch == 0xF0) { | 205 | 2.70k | if (ch2 < 0x90) | 206 | | /* invalid sequence | 207 | | \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */ | 208 | 129 | goto InvalidContinuation1; | 209 | 9.83k | } else if (ch == 0xF4 && ch2 >= 0x90) { | 210 | | /* invalid sequence | 211 | | \xF4\x90\x80\x80- -- 110000- overflow */ | 212 | 506 | goto InvalidContinuation1; | 213 | 506 | } | 214 | 11.9k | if (!IS_CONTINUATION_BYTE(ch3)) { | 215 | | /* invalid continuation byte */ | 216 | 1.27k | goto InvalidContinuation2; | 217 | 1.27k | } | 218 | 10.6k | if (!IS_CONTINUATION_BYTE(ch4)) { | 219 | | /* invalid continuation byte */ | 220 | 432 | goto InvalidContinuation3; | 221 | 432 | } | 222 | 10.1k | ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 - | 223 | 10.1k | ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80); | 224 | 10.1k | assert ((ch > 0xFFFF) && (ch <= 0x10FFFF)); | 225 | 10.1k | s += 4; | 226 | 10.1k | if (STRINGLIB_MAX_CHAR <= 0xFFFF || | 227 | 10.1k | (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR)) | 228 | | /* Out-of-range */ | 229 | 10.1k | goto Return; | 230 | 0 | *p++ = ch; | 231 | 0 | continue; | 232 | 10.1k | } | 233 | 9.05k | goto InvalidStart; | 234 | 31.4k | } | 235 | 5.92k | ch = 0; | 236 | 285k | Return: | 237 | 285k | *inptr = s; | 238 | 285k | *outpos = p - dest; | 239 | 285k | return ch; | 240 | 23.5k | InvalidStart: | 241 | 23.5k | ch = 1; | 242 | 23.5k | goto Return; | 243 | 17.4k | InvalidContinuation1: | 244 | 17.4k | ch = 2; | 245 | 17.4k | goto Return; | 246 | 4.16k | InvalidContinuation2: | 247 | 4.16k | ch = 3; | 248 | 4.16k | goto Return; | 249 | 432 | InvalidContinuation3: | 250 | 432 | ch = 4; | 251 | 432 | goto Return; | 252 | 5.92k | } |
unicodeobject.c:ucs1lib_utf8_decode Line | Count | Source | 26 | 80.9k | { | 27 | 80.9k | Py_UCS4 ch; | 28 | 80.9k | const char *s = *inptr; | 29 | 80.9k | STRINGLIB_CHAR *p = dest + *outpos; | 30 | | | 31 | 1.12M | while (s < end) { | 32 | 1.09M | ch = (unsigned char)*s; | 33 | | | 34 | 1.09M | if (ch < 0x80) { | 35 | | /* Fast path for runs of ASCII characters. Given that common UTF-8 | 36 | | input will consist of an overwhelming majority of ASCII | 37 | | characters, we try to optimize for this case by checking | 38 | | as many characters as a C 'size_t' can contain. | 39 | | First, check if we can do an aligned read, as most CPUs have | 40 | | a penalty for unaligned reads. | 41 | | */ | 42 | 726k | if (_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) { | 43 | | /* Help register allocation */ | 44 | 100k | const char *_s = s; | 45 | 100k | STRINGLIB_CHAR *_p = p; | 46 | 11.6M | while (_s + SIZEOF_SIZE_T <= end) { | 47 | | /* Read a whole size_t at a time (either 4 or 8 bytes), | 48 | | and do a fast unrolled copy if it only contains ASCII | 49 | | characters. */ | 50 | 11.5M | size_t value = *(const size_t *) _s; | 51 | 11.5M | if (value & ASCII_CHAR_MASK) | 52 | 79.5k | break; | 53 | 11.5M | #if PY_LITTLE_ENDIAN | 54 | 11.5M | _p[0] = (STRINGLIB_CHAR)(value & 0xFFu); | 55 | 11.5M | _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 56 | 11.5M | _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 57 | 11.5M | _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 58 | 11.5M | # if SIZEOF_SIZE_T == 8 | 59 | 11.5M | _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 60 | 11.5M | _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 61 | 11.5M | _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 62 | 11.5M | _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 63 | 11.5M | # endif | 64 | | #else | 65 | | # if SIZEOF_SIZE_T == 8 | 66 | | _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 67 | | _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 68 | | _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 69 | | _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 70 | | _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 71 | | _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 72 | | _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 73 | | _p[7] = (STRINGLIB_CHAR)(value & 0xFFu); | 74 | | # else | 75 | | _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 76 | | _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 77 | | _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 78 | | _p[3] = (STRINGLIB_CHAR)(value & 0xFFu); | 79 | | # endif | 80 | | #endif | 81 | 11.5M | _s += SIZEOF_SIZE_T; | 82 | 11.5M | _p += SIZEOF_SIZE_T; | 83 | 11.5M | } | 84 | 100k | s = _s; | 85 | 100k | p = _p; | 86 | 100k | if (s == end) | 87 | 2.40k | break; | 88 | 98.3k | ch = (unsigned char)*s; | 89 | 98.3k | } | 90 | 723k | if (ch < 0x80) { | 91 | 705k | s++; | 92 | 705k | *p++ = ch; | 93 | 705k | continue; | 94 | 705k | } | 95 | 723k | } | 96 | | | 97 | 385k | if (ch < 0xE0) { | 98 | | /* \xC2\x80-\xDF\xBF -- 0080-07FF */ | 99 | 360k | Py_UCS4 ch2; | 100 | 360k | if (ch < 0xC2) { | 101 | | /* invalid sequence | 102 | | \x80-\xBF -- continuation byte | 103 | | \xC0-\xC1 -- fake 0000-007F */ | 104 | 1.78k | goto InvalidStart; | 105 | 1.78k | } | 106 | 358k | if (end - s < 2) { | 107 | | /* unexpected end of data: the caller will decide whether | 108 | | it's an error or not */ | 109 | 706 | break; | 110 | 706 | } | 111 | 357k | ch2 = (unsigned char)s[1]; | 112 | 357k | if (!IS_CONTINUATION_BYTE(ch2)) | 113 | | /* invalid continuation byte */ | 114 | 15.8k | goto InvalidContinuation1; | 115 | 341k | ch = (ch << 6) + ch2 - | 116 | 341k | ((0xC0 << 6) + 0x80); | 117 | 341k | assert ((ch > 0x007F) && (ch <= 0x07FF)); | 118 | 341k | s += 2; | 119 | 341k | if (STRINGLIB_MAX_CHAR <= 0x007F || | 120 | 341k | (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR)) | 121 | | /* Out-of-range */ | 122 | 1.74k | goto Return; | 123 | 340k | *p++ = ch; | 124 | 340k | continue; | 125 | 341k | } | 126 | | | 127 | 25.7k | if (ch < 0xF0) { | 128 | | /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */ | 129 | 15.5k | Py_UCS4 ch2, ch3; | 130 | 15.5k | if (end - s < 3) { | 131 | | /* unexpected end of data: the caller will decide whether | 132 | | it's an error or not */ | 133 | 1.71k | if (end - s < 2) | 134 | 351 | break; | 135 | 1.36k | ch2 = (unsigned char)s[1]; | 136 | 1.36k | if (!IS_CONTINUATION_BYTE(ch2) || | 137 | 1.36k | (ch2 < 0xA0 ? ch == 0xE0 : ch == 0xED)) | 138 | | /* for clarification see comments below */ | 139 | 903 | goto InvalidContinuation1; | 140 | 465 | break; | 141 | 1.36k | } | 142 | 13.8k | ch2 = (unsigned char)s[1]; | 143 | 13.8k | ch3 = (unsigned char)s[2]; | 144 | 13.8k | if (!IS_CONTINUATION_BYTE(ch2)) { | 145 | | /* invalid continuation byte */ | 146 | 950 | goto InvalidContinuation1; | 147 | 950 | } | 148 | 12.9k | if (ch == 0xE0) { | 149 | 558 | if (ch2 < 0xA0) | 150 | | /* invalid sequence | 151 | | \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */ | 152 | 111 | goto InvalidContinuation1; | 153 | 12.3k | } else if (ch == 0xED && ch2 >= 0xA0) { | 154 | | /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF | 155 | | will result in surrogates in range D800-DFFF. Surrogates are | 156 | | not valid UTF-8 so they are rejected. | 157 | | See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf | 158 | | (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ | 159 | 674 | goto InvalidContinuation1; | 160 | 674 | } | 161 | 12.1k | if (!IS_CONTINUATION_BYTE(ch3)) { | 162 | | /* invalid continuation byte */ | 163 | 650 | goto InvalidContinuation2; | 164 | 650 | } | 165 | 11.4k | ch = (ch << 12) + (ch2 << 6) + ch3 - | 166 | 11.4k | ((0xE0 << 12) + (0x80 << 6) + 0x80); | 167 | 11.4k | assert ((ch > 0x07FF) && (ch <= 0xFFFF)); | 168 | 11.4k | s += 3; | 169 | 11.4k | if (STRINGLIB_MAX_CHAR <= 0x07FF || | 170 | 11.4k | (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR)) | 171 | | /* Out-of-range */ | 172 | 11.4k | goto Return; | 173 | 0 | *p++ = ch; | 174 | 0 | continue; | 175 | 11.4k | } | 176 | | | 177 | 10.1k | if (ch < 0xF5) { | 178 | | /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */ | 179 | 9.11k | Py_UCS4 ch2, ch3, ch4; | 180 | 9.11k | if (end - s < 4) { | 181 | | /* unexpected end of data: the caller will decide whether | 182 | | it's an error or not */ | 183 | 1.96k | if (end - s < 2) | 184 | 296 | break; | 185 | 1.67k | ch2 = (unsigned char)s[1]; | 186 | 1.67k | if (!IS_CONTINUATION_BYTE(ch2) || | 187 | 1.67k | (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4)) | 188 | | /* for clarification see comments below */ | 189 | 1.09k | goto InvalidContinuation1; | 190 | 572 | if (end - s < 3) | 191 | 116 | break; | 192 | 456 | ch3 = (unsigned char)s[2]; | 193 | 456 | if (!IS_CONTINUATION_BYTE(ch3)) | 194 | 347 | goto InvalidContinuation2; | 195 | 109 | break; | 196 | 456 | } | 197 | 7.15k | ch2 = (unsigned char)s[1]; | 198 | 7.15k | ch3 = (unsigned char)s[2]; | 199 | 7.15k | ch4 = (unsigned char)s[3]; | 200 | 7.15k | if (!IS_CONTINUATION_BYTE(ch2)) { | 201 | | /* invalid continuation byte */ | 202 | 680 | goto InvalidContinuation1; | 203 | 680 | } | 204 | 6.47k | if (ch == 0xF0) { | 205 | 1.14k | if (ch2 < 0x90) | 206 | | /* invalid sequence | 207 | | \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */ | 208 | 112 | goto InvalidContinuation1; | 209 | 5.33k | } else if (ch == 0xF4 && ch2 >= 0x90) { | 210 | | /* invalid sequence | 211 | | \xF4\x90\x80\x80- -- 110000- overflow */ | 212 | 284 | goto InvalidContinuation1; | 213 | 284 | } | 214 | 6.07k | if (!IS_CONTINUATION_BYTE(ch3)) { | 215 | | /* invalid continuation byte */ | 216 | 1.82k | goto InvalidContinuation2; | 217 | 1.82k | } | 218 | 4.25k | if (!IS_CONTINUATION_BYTE(ch4)) { | 219 | | /* invalid continuation byte */ | 220 | 362 | goto InvalidContinuation3; | 221 | 362 | } | 222 | 3.89k | ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 - | 223 | 3.89k | ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80); | 224 | 3.89k | assert ((ch > 0xFFFF) && (ch <= 0x10FFFF)); | 225 | 3.89k | s += 4; | 226 | 3.89k | if (STRINGLIB_MAX_CHAR <= 0xFFFF || | 227 | 3.89k | (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR)) | 228 | | /* Out-of-range */ | 229 | 3.89k | goto Return; | 230 | 0 | *p++ = ch; | 231 | 0 | continue; | 232 | 3.89k | } | 233 | 1.03k | goto InvalidStart; | 234 | 10.1k | } | 235 | 37.1k | ch = 0; | 236 | 80.9k | Return: | 237 | 80.9k | *inptr = s; | 238 | 80.9k | *outpos = p - dest; | 239 | 80.9k | return ch; | 240 | 2.81k | InvalidStart: | 241 | 2.81k | ch = 1; | 242 | 2.81k | goto Return; | 243 | 20.6k | InvalidContinuation1: | 244 | 20.6k | ch = 2; | 245 | 20.6k | goto Return; | 246 | 2.81k | InvalidContinuation2: | 247 | 2.81k | ch = 3; | 248 | 2.81k | goto Return; | 249 | 362 | InvalidContinuation3: | 250 | 362 | ch = 4; | 251 | 362 | goto Return; | 252 | 37.1k | } |
unicodeobject.c:ucs2lib_utf8_decode Line | Count | Source | 26 | 97.8M | { | 27 | 97.8M | Py_UCS4 ch; | 28 | 97.8M | const char *s = *inptr; | 29 | 97.8M | STRINGLIB_CHAR *p = dest + *outpos; | 30 | | | 31 | 190M | while (s < end) { | 32 | 190M | ch = (unsigned char)*s; | 33 | | | 34 | 190M | if (ch < 0x80) { | 35 | | /* Fast path for runs of ASCII characters. Given that common UTF-8 | 36 | | input will consist of an overwhelming majority of ASCII | 37 | | characters, we try to optimize for this case by checking | 38 | | as many characters as a C 'size_t' can contain. | 39 | | First, check if we can do an aligned read, as most CPUs have | 40 | | a penalty for unaligned reads. | 41 | | */ | 42 | 72.0M | if (_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) { | 43 | | /* Help register allocation */ | 44 | 9.32M | const char *_s = s; | 45 | 9.32M | STRINGLIB_CHAR *_p = p; | 46 | 199M | while (_s + SIZEOF_SIZE_T <= end) { | 47 | | /* Read a whole size_t at a time (either 4 or 8 bytes), | 48 | | and do a fast unrolled copy if it only contains ASCII | 49 | | characters. */ | 50 | 199M | size_t value = *(const size_t *) _s; | 51 | 199M | if (value & ASCII_CHAR_MASK) | 52 | 9.24M | break; | 53 | 190M | #if PY_LITTLE_ENDIAN | 54 | 190M | _p[0] = (STRINGLIB_CHAR)(value & 0xFFu); | 55 | 190M | _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 56 | 190M | _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 57 | 190M | _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 58 | 190M | # if SIZEOF_SIZE_T == 8 | 59 | 190M | _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 60 | 190M | _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 61 | 190M | _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 62 | 190M | _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 63 | 190M | # endif | 64 | | #else | 65 | | # if SIZEOF_SIZE_T == 8 | 66 | | _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 67 | | _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 68 | | _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 69 | | _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 70 | | _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 71 | | _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 72 | | _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 73 | | _p[7] = (STRINGLIB_CHAR)(value & 0xFFu); | 74 | | # else | 75 | | _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 76 | | _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 77 | | _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 78 | | _p[3] = (STRINGLIB_CHAR)(value & 0xFFu); | 79 | | # endif | 80 | | #endif | 81 | 190M | _s += SIZEOF_SIZE_T; | 82 | 190M | _p += SIZEOF_SIZE_T; | 83 | 190M | } | 84 | 9.32M | s = _s; | 85 | 9.32M | p = _p; | 86 | 9.32M | if (s == end) | 87 | 5.20k | break; | 88 | 9.32M | ch = (unsigned char)*s; | 89 | 9.32M | } | 90 | 72.0M | if (ch < 0x80) { | 91 | 71.6M | s++; | 92 | 71.6M | *p++ = ch; | 93 | 71.6M | continue; | 94 | 71.6M | } | 95 | 72.0M | } | 96 | | | 97 | 118M | if (ch < 0xE0) { | 98 | | /* \xC2\x80-\xDF\xBF -- 0080-07FF */ | 99 | 47.1M | Py_UCS4 ch2; | 100 | 47.1M | if (ch < 0xC2) { | 101 | | /* invalid sequence | 102 | | \x80-\xBF -- continuation byte | 103 | | \xC0-\xC1 -- fake 0000-007F */ | 104 | 33.6M | goto InvalidStart; | 105 | 33.6M | } | 106 | 13.5M | if (end - s < 2) { | 107 | | /* unexpected end of data: the caller will decide whether | 108 | | it's an error or not */ | 109 | 5.30k | break; | 110 | 5.30k | } | 111 | 13.5M | ch2 = (unsigned char)s[1]; | 112 | 13.5M | if (!IS_CONTINUATION_BYTE(ch2)) | 113 | | /* invalid continuation byte */ | 114 | 11.9M | goto InvalidContinuation1; | 115 | 1.60M | ch = (ch << 6) + ch2 - | 116 | 1.60M | ((0xC0 << 6) + 0x80); | 117 | 1.60M | assert ((ch > 0x007F) && (ch <= 0x07FF)); | 118 | 1.60M | s += 2; | 119 | 1.60M | if (STRINGLIB_MAX_CHAR <= 0x007F || | 120 | 1.60M | (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR)) | 121 | | /* Out-of-range */ | 122 | 0 | goto Return; | 123 | 1.60M | *p++ = ch; | 124 | 1.60M | continue; | 125 | 1.60M | } | 126 | | | 127 | 71.8M | if (ch < 0xF0) { | 128 | | /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */ | 129 | 26.3M | Py_UCS4 ch2, ch3; | 130 | 26.3M | if (end - s < 3) { | 131 | | /* unexpected end of data: the caller will decide whether | 132 | | it's an error or not */ | 133 | 4.79k | if (end - s < 2) | 134 | 1.99k | break; | 135 | 2.80k | ch2 = (unsigned char)s[1]; | 136 | 2.80k | if (!IS_CONTINUATION_BYTE(ch2) || | 137 | 2.80k | (ch2 < 0xA0 ? ch == 0xE0 : ch == 0xED)) | 138 | | /* for clarification see comments below */ | 139 | 1.97k | goto InvalidContinuation1; | 140 | 828 | break; | 141 | 2.80k | } | 142 | 26.3M | ch2 = (unsigned char)s[1]; | 143 | 26.3M | ch3 = (unsigned char)s[2]; | 144 | 26.3M | if (!IS_CONTINUATION_BYTE(ch2)) { | 145 | | /* invalid continuation byte */ | 146 | 6.32M | goto InvalidContinuation1; | 147 | 6.32M | } | 148 | 19.9M | if (ch == 0xE0) { | 149 | 29.2k | if (ch2 < 0xA0) | 150 | | /* invalid sequence | 151 | | \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */ | 152 | 10.0k | goto InvalidContinuation1; | 153 | 19.9M | } else if (ch == 0xED && ch2 >= 0xA0) { | 154 | | /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF | 155 | | will result in surrogates in range D800-DFFF. Surrogates are | 156 | | not valid UTF-8 so they are rejected. | 157 | | See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf | 158 | | (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ | 159 | 13.6k | goto InvalidContinuation1; | 160 | 13.6k | } | 161 | 19.9M | if (!IS_CONTINUATION_BYTE(ch3)) { | 162 | | /* invalid continuation byte */ | 163 | 196k | goto InvalidContinuation2; | 164 | 196k | } | 165 | 19.7M | ch = (ch << 12) + (ch2 << 6) + ch3 - | 166 | 19.7M | ((0xE0 << 12) + (0x80 << 6) + 0x80); | 167 | 19.7M | assert ((ch > 0x07FF) && (ch <= 0xFFFF)); | 168 | 19.7M | s += 3; | 169 | 19.7M | if (STRINGLIB_MAX_CHAR <= 0x07FF || | 170 | 19.7M | (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR)) | 171 | | /* Out-of-range */ | 172 | 0 | goto Return; | 173 | 19.7M | *p++ = ch; | 174 | 19.7M | continue; | 175 | 19.7M | } | 176 | | | 177 | 45.5M | if (ch < 0xF5) { | 178 | | /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */ | 179 | 1.37M | Py_UCS4 ch2, ch3, ch4; | 180 | 1.37M | if (end - s < 4) { | 181 | | /* unexpected end of data: the caller will decide whether | 182 | | it's an error or not */ | 183 | 5.99k | if (end - s < 2) | 184 | 1.51k | break; | 185 | 4.48k | ch2 = (unsigned char)s[1]; | 186 | 4.48k | if (!IS_CONTINUATION_BYTE(ch2) || | 187 | 4.48k | (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4)) | 188 | | /* for clarification see comments below */ | 189 | 2.84k | goto InvalidContinuation1; | 190 | 1.64k | if (end - s < 3) | 191 | 545 | break; | 192 | 1.09k | ch3 = (unsigned char)s[2]; | 193 | 1.09k | if (!IS_CONTINUATION_BYTE(ch3)) | 194 | 600 | goto InvalidContinuation2; | 195 | 498 | break; | 196 | 1.09k | } | 197 | 1.36M | ch2 = (unsigned char)s[1]; | 198 | 1.36M | ch3 = (unsigned char)s[2]; | 199 | 1.36M | ch4 = (unsigned char)s[3]; | 200 | 1.36M | if (!IS_CONTINUATION_BYTE(ch2)) { | 201 | | /* invalid continuation byte */ | 202 | 1.21M | goto InvalidContinuation1; | 203 | 1.21M | } | 204 | 158k | if (ch == 0xF0) { | 205 | 36.0k | if (ch2 < 0x90) | 206 | | /* invalid sequence | 207 | | \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */ | 208 | 9.05k | goto InvalidContinuation1; | 209 | 122k | } else if (ch == 0xF4 && ch2 >= 0x90) { | 210 | | /* invalid sequence | 211 | | \xF4\x90\x80\x80- -- 110000- overflow */ | 212 | 19.5k | goto InvalidContinuation1; | 213 | 19.5k | } | 214 | 129k | if (!IS_CONTINUATION_BYTE(ch3)) { | 215 | | /* invalid continuation byte */ | 216 | 91.1k | goto InvalidContinuation2; | 217 | 91.1k | } | 218 | 38.5k | if (!IS_CONTINUATION_BYTE(ch4)) { | 219 | | /* invalid continuation byte */ | 220 | 17.7k | goto InvalidContinuation3; | 221 | 17.7k | } | 222 | 20.8k | ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 - | 223 | 20.8k | ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80); | 224 | 20.8k | assert ((ch > 0xFFFF) && (ch <= 0x10FFFF)); | 225 | 20.8k | s += 4; | 226 | 20.8k | if (STRINGLIB_MAX_CHAR <= 0xFFFF || | 227 | 20.8k | (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR)) | 228 | | /* Out-of-range */ | 229 | 20.8k | goto Return; | 230 | 0 | *p++ = ch; | 231 | 0 | continue; | 232 | 20.8k | } | 233 | 44.1M | goto InvalidStart; | 234 | 45.5M | } | 235 | 230k | ch = 0; | 236 | 97.8M | Return: | 237 | 97.8M | *inptr = s; | 238 | 97.8M | *outpos = p - dest; | 239 | 97.8M | return ch; | 240 | 77.7M | InvalidStart: | 241 | 77.7M | ch = 1; | 242 | 77.7M | goto Return; | 243 | 19.5M | InvalidContinuation1: | 244 | 19.5M | ch = 2; | 245 | 19.5M | goto Return; | 246 | 287k | InvalidContinuation2: | 247 | 287k | ch = 3; | 248 | 287k | goto Return; | 249 | 17.7k | InvalidContinuation3: | 250 | 17.7k | ch = 4; | 251 | 17.7k | goto Return; | 252 | 230k | } |
unicodeobject.c:ucs4lib_utf8_decode Line | Count | Source | 26 | 85.0M | { | 27 | 85.0M | Py_UCS4 ch; | 28 | 85.0M | const char *s = *inptr; | 29 | 85.0M | STRINGLIB_CHAR *p = dest + *outpos; | 30 | | | 31 | 193M | while (s < end) { | 32 | 192M | ch = (unsigned char)*s; | 33 | | | 34 | 192M | if (ch < 0x80) { | 35 | | /* Fast path for runs of ASCII characters. Given that common UTF-8 | 36 | | input will consist of an overwhelming majority of ASCII | 37 | | characters, we try to optimize for this case by checking | 38 | | as many characters as a C 'size_t' can contain. | 39 | | First, check if we can do an aligned read, as most CPUs have | 40 | | a penalty for unaligned reads. | 41 | | */ | 42 | 83.4M | if (_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) { | 43 | | /* Help register allocation */ | 44 | 10.7M | const char *_s = s; | 45 | 10.7M | STRINGLIB_CHAR *_p = p; | 46 | 164M | while (_s + SIZEOF_SIZE_T <= end) { | 47 | | /* Read a whole size_t at a time (either 4 or 8 bytes), | 48 | | and do a fast unrolled copy if it only contains ASCII | 49 | | characters. */ | 50 | 164M | size_t value = *(const size_t *) _s; | 51 | 164M | if (value & ASCII_CHAR_MASK) | 52 | 10.7M | break; | 53 | 153M | #if PY_LITTLE_ENDIAN | 54 | 153M | _p[0] = (STRINGLIB_CHAR)(value & 0xFFu); | 55 | 153M | _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 56 | 153M | _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 57 | 153M | _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 58 | 153M | # if SIZEOF_SIZE_T == 8 | 59 | 153M | _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 60 | 153M | _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 61 | 153M | _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 62 | 153M | _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 63 | 153M | # endif | 64 | | #else | 65 | | # if SIZEOF_SIZE_T == 8 | 66 | | _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 67 | | _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 68 | | _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 69 | | _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 70 | | _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 71 | | _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 72 | | _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 73 | | _p[7] = (STRINGLIB_CHAR)(value & 0xFFu); | 74 | | # else | 75 | | _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 76 | | _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 77 | | _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 78 | | _p[3] = (STRINGLIB_CHAR)(value & 0xFFu); | 79 | | # endif | 80 | | #endif | 81 | 153M | _s += SIZEOF_SIZE_T; | 82 | 153M | _p += SIZEOF_SIZE_T; | 83 | 153M | } | 84 | 10.7M | s = _s; | 85 | 10.7M | p = _p; | 86 | 10.7M | if (s == end) | 87 | 2.85k | break; | 88 | 10.7M | ch = (unsigned char)*s; | 89 | 10.7M | } | 90 | 83.4M | if (ch < 0x80) { | 91 | 83.0M | s++; | 92 | 83.0M | *p++ = ch; | 93 | 83.0M | continue; | 94 | 83.0M | } | 95 | 83.4M | } | 96 | | | 97 | 109M | if (ch < 0xE0) { | 98 | | /* \xC2\x80-\xDF\xBF -- 0080-07FF */ | 99 | 47.7M | Py_UCS4 ch2; | 100 | 47.7M | if (ch < 0xC2) { | 101 | | /* invalid sequence | 102 | | \x80-\xBF -- continuation byte | 103 | | \xC0-\xC1 -- fake 0000-007F */ | 104 | 34.1M | goto InvalidStart; | 105 | 34.1M | } | 106 | 13.6M | if (end - s < 2) { | 107 | | /* unexpected end of data: the caller will decide whether | 108 | | it's an error or not */ | 109 | 1.48k | break; | 110 | 1.48k | } | 111 | 13.6M | ch2 = (unsigned char)s[1]; | 112 | 13.6M | if (!IS_CONTINUATION_BYTE(ch2)) | 113 | | /* invalid continuation byte */ | 114 | 10.3M | goto InvalidContinuation1; | 115 | 3.27M | ch = (ch << 6) + ch2 - | 116 | 3.27M | ((0xC0 << 6) + 0x80); | 117 | 3.27M | assert ((ch > 0x007F) && (ch <= 0x07FF)); | 118 | 3.27M | s += 2; | 119 | 3.27M | if (STRINGLIB_MAX_CHAR <= 0x007F || | 120 | 3.27M | (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR)) | 121 | | /* Out-of-range */ | 122 | 0 | goto Return; | 123 | 3.27M | *p++ = ch; | 124 | 3.27M | continue; | 125 | 3.27M | } | 126 | | | 127 | 62.1M | if (ch < 0xF0) { | 128 | | /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */ | 129 | 26.1M | Py_UCS4 ch2, ch3; | 130 | 26.1M | if (end - s < 3) { | 131 | | /* unexpected end of data: the caller will decide whether | 132 | | it's an error or not */ | 133 | 3.31k | if (end - s < 2) | 134 | 875 | break; | 135 | 2.44k | ch2 = (unsigned char)s[1]; | 136 | 2.44k | if (!IS_CONTINUATION_BYTE(ch2) || | 137 | 2.44k | (ch2 < 0xA0 ? ch == 0xE0 : ch == 0xED)) | 138 | | /* for clarification see comments below */ | 139 | 1.58k | goto InvalidContinuation1; | 140 | 859 | break; | 141 | 2.44k | } | 142 | 26.1M | ch2 = (unsigned char)s[1]; | 143 | 26.1M | ch3 = (unsigned char)s[2]; | 144 | 26.1M | if (!IS_CONTINUATION_BYTE(ch2)) { | 145 | | /* invalid continuation byte */ | 146 | 4.26M | goto InvalidContinuation1; | 147 | 4.26M | } | 148 | 21.8M | if (ch == 0xE0) { | 149 | 83.6k | if (ch2 < 0xA0) | 150 | | /* invalid sequence | 151 | | \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */ | 152 | 35.4k | goto InvalidContinuation1; | 153 | 21.7M | } else if (ch == 0xED && ch2 >= 0xA0) { | 154 | | /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF | 155 | | will result in surrogates in range D800-DFFF. Surrogates are | 156 | | not valid UTF-8 so they are rejected. | 157 | | See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf | 158 | | (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ | 159 | 56.9k | goto InvalidContinuation1; | 160 | 56.9k | } | 161 | 21.7M | if (!IS_CONTINUATION_BYTE(ch3)) { | 162 | | /* invalid continuation byte */ | 163 | 728k | goto InvalidContinuation2; | 164 | 728k | } | 165 | 21.0M | ch = (ch << 12) + (ch2 << 6) + ch3 - | 166 | 21.0M | ((0xE0 << 12) + (0x80 << 6) + 0x80); | 167 | 21.0M | assert ((ch > 0x07FF) && (ch <= 0xFFFF)); | 168 | 21.0M | s += 3; | 169 | 21.0M | if (STRINGLIB_MAX_CHAR <= 0x07FF || | 170 | 21.0M | (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR)) | 171 | | /* Out-of-range */ | 172 | 0 | goto Return; | 173 | 21.0M | *p++ = ch; | 174 | 21.0M | continue; | 175 | 21.0M | } | 176 | | | 177 | 36.0M | if (ch < 0xF5) { | 178 | | /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */ | 179 | 3.41M | Py_UCS4 ch2, ch3, ch4; | 180 | 3.41M | if (end - s < 4) { | 181 | | /* unexpected end of data: the caller will decide whether | 182 | | it's an error or not */ | 183 | 4.01k | if (end - s < 2) | 184 | 711 | break; | 185 | 3.30k | ch2 = (unsigned char)s[1]; | 186 | 3.30k | if (!IS_CONTINUATION_BYTE(ch2) || | 187 | 3.30k | (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4)) | 188 | | /* for clarification see comments below */ | 189 | 1.65k | goto InvalidContinuation1; | 190 | 1.64k | if (end - s < 3) | 191 | 424 | break; | 192 | 1.22k | ch3 = (unsigned char)s[2]; | 193 | 1.22k | if (!IS_CONTINUATION_BYTE(ch3)) | 194 | 514 | goto InvalidContinuation2; | 195 | 708 | break; | 196 | 1.22k | } | 197 | 3.41M | ch2 = (unsigned char)s[1]; | 198 | 3.41M | ch3 = (unsigned char)s[2]; | 199 | 3.41M | ch4 = (unsigned char)s[3]; | 200 | 3.41M | if (!IS_CONTINUATION_BYTE(ch2)) { | 201 | | /* invalid continuation byte */ | 202 | 2.27M | goto InvalidContinuation1; | 203 | 2.27M | } | 204 | 1.13M | if (ch == 0xF0) { | 205 | 577k | if (ch2 < 0x90) | 206 | | /* invalid sequence | 207 | | \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */ | 208 | 33.1k | goto InvalidContinuation1; | 209 | 577k | } else if (ch == 0xF4 && ch2 >= 0x90) { | 210 | | /* invalid sequence | 211 | | \xF4\x90\x80\x80- -- 110000- overflow */ | 212 | 59.1k | goto InvalidContinuation1; | 213 | 59.1k | } | 214 | 1.04M | if (!IS_CONTINUATION_BYTE(ch3)) { | 215 | | /* invalid continuation byte */ | 216 | 322k | goto InvalidContinuation2; | 217 | 322k | } | 218 | 721k | if (!IS_CONTINUATION_BYTE(ch4)) { | 219 | | /* invalid continuation byte */ | 220 | 112k | goto InvalidContinuation3; | 221 | 112k | } | 222 | 609k | ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 - | 223 | 609k | ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80); | 224 | 609k | assert ((ch > 0xFFFF) && (ch <= 0x10FFFF)); | 225 | 609k | s += 4; | 226 | 609k | if (STRINGLIB_MAX_CHAR <= 0xFFFF || | 227 | 609k | (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR)) | 228 | | /* Out-of-range */ | 229 | 0 | goto Return; | 230 | 609k | *p++ = ch; | 231 | 609k | continue; | 232 | 609k | } | 233 | 32.6M | goto InvalidStart; | 234 | 36.0M | } | 235 | 43.7k | ch = 0; | 236 | 85.0M | Return: | 237 | 85.0M | *inptr = s; | 238 | 85.0M | *outpos = p - dest; | 239 | 85.0M | return ch; | 240 | 66.7M | InvalidStart: | 241 | 66.7M | ch = 1; | 242 | 66.7M | goto Return; | 243 | 17.1M | InvalidContinuation1: | 244 | 17.1M | ch = 2; | 245 | 17.1M | goto Return; | 246 | 1.05M | InvalidContinuation2: | 247 | 1.05M | ch = 3; | 248 | 1.05M | goto Return; | 249 | 112k | InvalidContinuation3: | 250 | 112k | ch = 4; | 251 | 112k | goto Return; | 252 | 43.7k | } |
|
253 | | |
254 | | #undef ASCII_CHAR_MASK |
255 | | |
256 | | |
257 | | /* UTF-8 encoder specialized for a Unicode kind to avoid the slow |
258 | | PyUnicode_READ() macro. Delete some parts of the code depending on the kind: |
259 | | UCS-1 strings don't need to handle surrogates for example. */ |
260 | | Py_LOCAL_INLINE(char *) |
261 | | STRINGLIB(utf8_encoder)(_PyBytesWriter *writer, |
262 | | PyObject *unicode, |
263 | | const STRINGLIB_CHAR *data, |
264 | | Py_ssize_t size, |
265 | | _Py_error_handler error_handler, |
266 | | const char *errors) |
267 | 5.87M | { |
268 | 5.87M | Py_ssize_t i; /* index into data of next input character */ |
269 | 5.87M | char *p; /* next free byte in output buffer */ |
270 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
271 | | PyObject *error_handler_obj = NULL; |
272 | | PyObject *exc = NULL; |
273 | | PyObject *rep = NULL; |
274 | | #endif |
275 | | #if STRINGLIB_SIZEOF_CHAR == 1 |
276 | | const Py_ssize_t max_char_size = 2; |
277 | | #elif STRINGLIB_SIZEOF_CHAR == 2 |
278 | | const Py_ssize_t max_char_size = 3; |
279 | | #else /* STRINGLIB_SIZEOF_CHAR == 4 */ |
280 | | const Py_ssize_t max_char_size = 4; |
281 | | #endif |
282 | | |
283 | 5.87M | assert(size >= 0); |
284 | 5.87M | if (size > PY_SSIZE_T_MAX / max_char_size) { |
285 | | /* integer overflow */ |
286 | 0 | PyErr_NoMemory(); |
287 | 0 | return NULL; |
288 | 0 | } |
289 | | |
290 | 5.87M | _PyBytesWriter_Init(writer); |
291 | 5.87M | p = _PyBytesWriter_Alloc(writer, size * max_char_size); |
292 | 5.87M | if (p == NULL) |
293 | 0 | return NULL; |
294 | | |
295 | 3.19G | for (i = 0; i < size;) { |
296 | 3.18G | Py_UCS4 ch = data[i++]; |
297 | | |
298 | 3.18G | if (ch < 0x80) { |
299 | | /* Encode ASCII */ |
300 | 3.04G | *p++ = (char) ch; |
301 | | |
302 | 3.04G | } |
303 | 57.4M | else |
304 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
305 | 57.4M | if (ch < 0x0800) |
306 | 1.39M | #endif |
307 | 86.1M | { |
308 | | /* Encode Latin-1 */ |
309 | 86.1M | *p++ = (char)(0xc0 | (ch >> 6)); |
310 | 86.1M | *p++ = (char)(0x80 | (ch & 0x3f)); |
311 | 86.1M | } |
312 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
313 | 56.0M | else if (Py_UNICODE_IS_SURROGATE(ch)) { |
314 | 378k | Py_ssize_t startpos, endpos, newpos; |
315 | 378k | Py_ssize_t k; |
316 | 378k | if (error_handler == _Py_ERROR_UNKNOWN) { |
317 | 205k | error_handler = _Py_GetErrorHandler(errors); |
318 | 205k | } |
319 | | |
320 | 378k | startpos = i-1; |
321 | 378k | endpos = startpos+1; |
322 | | |
323 | 18.9M | while ((endpos < size) && Py_UNICODE_IS_SURROGATE(data[endpos])) |
324 | 18.5M | endpos++; |
325 | | |
326 | | /* Only overallocate the buffer if it's not the last write */ |
327 | 378k | writer->overallocate = (endpos < size); |
328 | | |
329 | 378k | switch (error_handler) |
330 | 378k | { |
331 | 0 | case _Py_ERROR_REPLACE: |
332 | 0 | memset(p, '?', endpos - startpos); |
333 | 0 | p += (endpos - startpos); |
334 | 0 | _Py_FALLTHROUGH; |
335 | 0 | case _Py_ERROR_IGNORE: |
336 | 0 | i += (endpos - startpos - 1); |
337 | 0 | break; |
338 | | |
339 | 0 | case _Py_ERROR_SURROGATEPASS: |
340 | 0 | for (k=startpos; k<endpos; k++) { |
341 | 0 | ch = data[k]; |
342 | 0 | *p++ = (char)(0xe0 | (ch >> 12)); |
343 | 0 | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); |
344 | 0 | *p++ = (char)(0x80 | (ch & 0x3f)); |
345 | 0 | } |
346 | 0 | i += (endpos - startpos - 1); |
347 | 0 | break; |
348 | | |
349 | 0 | case _Py_ERROR_BACKSLASHREPLACE: |
350 | | /* subtract preallocated bytes */ |
351 | 0 | writer->min_size -= max_char_size * (endpos - startpos); |
352 | 0 | p = backslashreplace(writer, p, |
353 | 0 | unicode, startpos, endpos); |
354 | 0 | if (p == NULL) |
355 | 0 | goto error; |
356 | 0 | i += (endpos - startpos - 1); |
357 | 0 | break; |
358 | | |
359 | 0 | case _Py_ERROR_XMLCHARREFREPLACE: |
360 | | /* subtract preallocated bytes */ |
361 | 0 | writer->min_size -= max_char_size * (endpos - startpos); |
362 | 0 | p = xmlcharrefreplace(writer, p, |
363 | 0 | unicode, startpos, endpos); |
364 | 0 | if (p == NULL) |
365 | 0 | goto error; |
366 | 0 | i += (endpos - startpos - 1); |
367 | 0 | break; |
368 | | |
369 | 226k | case _Py_ERROR_SURROGATEESCAPE: |
370 | 13.6M | for (k=startpos; k<endpos; k++) { |
371 | 13.4M | ch = data[k]; |
372 | 13.4M | if (!(0xDC80 <= ch && ch <= 0xDCFF)) |
373 | 25 | break; |
374 | 13.4M | *p++ = (char)(ch & 0xff); |
375 | 13.4M | } |
376 | 226k | if (k >= endpos) { |
377 | 226k | i += (endpos - startpos - 1); |
378 | 226k | break; |
379 | 226k | } |
380 | 25 | startpos = k; |
381 | 25 | assert(startpos < endpos); |
382 | 25 | _Py_FALLTHROUGH; |
383 | 152k | default: |
384 | 152k | rep = unicode_encode_call_errorhandler( |
385 | 152k | errors, &error_handler_obj, "utf-8", "surrogates not allowed", |
386 | 152k | unicode, &exc, startpos, endpos, &newpos); |
387 | 152k | if (!rep) |
388 | 152k | goto error; |
389 | | |
390 | 0 | if (newpos < startpos) { |
391 | 0 | writer->overallocate = 1; |
392 | 0 | p = _PyBytesWriter_Prepare(writer, p, |
393 | 0 | max_char_size * (startpos - newpos)); |
394 | 0 | if (p == NULL) |
395 | 0 | goto error; |
396 | 0 | } |
397 | 0 | else { |
398 | | /* subtract preallocated bytes */ |
399 | 0 | writer->min_size -= max_char_size * (newpos - startpos); |
400 | | /* Only overallocate the buffer if it's not the last write */ |
401 | 0 | writer->overallocate = (newpos < size); |
402 | 0 | } |
403 | | |
404 | 0 | if (PyBytes_Check(rep)) { |
405 | 0 | p = _PyBytesWriter_WriteBytes(writer, p, |
406 | 0 | PyBytes_AS_STRING(rep), |
407 | 0 | PyBytes_GET_SIZE(rep)); |
408 | 0 | } |
409 | 0 | else { |
410 | | /* rep is unicode */ |
411 | 0 | if (!PyUnicode_IS_ASCII(rep)) { |
412 | 0 | raise_encode_exception(&exc, "utf-8", unicode, |
413 | 0 | startpos, endpos, |
414 | 0 | "surrogates not allowed"); |
415 | 0 | goto error; |
416 | 0 | } |
417 | | |
418 | 0 | p = _PyBytesWriter_WriteBytes(writer, p, |
419 | 0 | PyUnicode_DATA(rep), |
420 | 0 | PyUnicode_GET_LENGTH(rep)); |
421 | 0 | } |
422 | | |
423 | 0 | if (p == NULL) |
424 | 0 | goto error; |
425 | 0 | Py_CLEAR(rep); |
426 | |
|
427 | 0 | i = newpos; |
428 | 378k | } |
429 | | |
430 | | /* If overallocation was disabled, ensure that it was the last |
431 | | write. Otherwise, we missed an optimization */ |
432 | 226k | assert(writer->overallocate || i == size); |
433 | 226k | } |
434 | 31.8M | else |
435 | | #if STRINGLIB_SIZEOF_CHAR > 2 |
436 | 31.8M | if (ch < 0x10000) |
437 | 31.6M | #endif |
438 | 55.4M | { |
439 | 55.4M | *p++ = (char)(0xe0 | (ch >> 12)); |
440 | 55.4M | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); |
441 | 55.4M | *p++ = (char)(0x80 | (ch & 0x3f)); |
442 | 55.4M | } |
443 | | #if STRINGLIB_SIZEOF_CHAR > 2 |
444 | | else /* ch >= 0x10000 */ |
445 | 160k | { |
446 | 160k | assert(ch <= MAX_UNICODE); |
447 | | /* Encode UCS4 Unicode ordinals */ |
448 | 160k | *p++ = (char)(0xf0 | (ch >> 18)); |
449 | 160k | *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); |
450 | 160k | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); |
451 | 160k | *p++ = (char)(0x80 | (ch & 0x3f)); |
452 | 160k | } |
453 | | #endif /* STRINGLIB_SIZEOF_CHAR > 2 */ |
454 | | #endif /* STRINGLIB_SIZEOF_CHAR > 1 */ |
455 | 3.18G | } |
456 | | |
457 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
458 | 1.29M | Py_XDECREF(error_handler_obj); |
459 | 1.29M | Py_XDECREF(exc); |
460 | | #endif |
461 | 1.29M | return p; |
462 | | |
463 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
464 | 152k | error: |
465 | 152k | Py_XDECREF(rep); |
466 | 152k | Py_XDECREF(error_handler_obj); |
467 | 152k | Py_XDECREF(exc); |
468 | 152k | return NULL; |
469 | | #endif |
470 | 1.44M | } unicodeobject.c:ucs1lib_utf8_encoder Line | Count | Source | 267 | 4.42M | { | 268 | 4.42M | Py_ssize_t i; /* index into data of next input character */ | 269 | 4.42M | char *p; /* next free byte in output buffer */ | 270 | | #if STRINGLIB_SIZEOF_CHAR > 1 | 271 | | PyObject *error_handler_obj = NULL; | 272 | | PyObject *exc = NULL; | 273 | | PyObject *rep = NULL; | 274 | | #endif | 275 | 4.42M | #if STRINGLIB_SIZEOF_CHAR == 1 | 276 | 4.42M | const Py_ssize_t max_char_size = 2; | 277 | | #elif STRINGLIB_SIZEOF_CHAR == 2 | 278 | | const Py_ssize_t max_char_size = 3; | 279 | | #else /* STRINGLIB_SIZEOF_CHAR == 4 */ | 280 | | const Py_ssize_t max_char_size = 4; | 281 | | #endif | 282 | | | 283 | 4.42M | assert(size >= 0); | 284 | 4.42M | if (size > PY_SSIZE_T_MAX / max_char_size) { | 285 | | /* integer overflow */ | 286 | 0 | PyErr_NoMemory(); | 287 | 0 | return NULL; | 288 | 0 | } | 289 | | | 290 | 4.42M | _PyBytesWriter_Init(writer); | 291 | 4.42M | p = _PyBytesWriter_Alloc(writer, size * max_char_size); | 292 | 4.42M | if (p == NULL) | 293 | 0 | return NULL; | 294 | | | 295 | 605M | for (i = 0; i < size;) { | 296 | 600M | Py_UCS4 ch = data[i++]; | 297 | | | 298 | 600M | if (ch < 0x80) { | 299 | | /* Encode ASCII */ | 300 | 515M | *p++ = (char) ch; | 301 | | | 302 | 515M | } | 303 | 84.7M | else | 304 | | #if STRINGLIB_SIZEOF_CHAR > 1 | 305 | | if (ch < 0x0800) | 306 | | #endif | 307 | 84.7M | { | 308 | | /* Encode Latin-1 */ | 309 | 84.7M | *p++ = (char)(0xc0 | (ch >> 6)); | 310 | 84.7M | *p++ = (char)(0x80 | (ch & 0x3f)); | 311 | 84.7M | } | 312 | | #if STRINGLIB_SIZEOF_CHAR > 1 | 313 | | else if (Py_UNICODE_IS_SURROGATE(ch)) { | 314 | | Py_ssize_t startpos, endpos, newpos; | 315 | | Py_ssize_t k; | 316 | | if (error_handler == _Py_ERROR_UNKNOWN) { | 317 | | error_handler = _Py_GetErrorHandler(errors); | 318 | | } | 319 | | | 320 | | startpos = i-1; | 321 | | endpos = startpos+1; | 322 | | | 323 | | while ((endpos < size) && Py_UNICODE_IS_SURROGATE(data[endpos])) | 324 | | endpos++; | 325 | | | 326 | | /* Only overallocate the buffer if it's not the last write */ | 327 | | writer->overallocate = (endpos < size); | 328 | | | 329 | | switch (error_handler) | 330 | | { | 331 | | case _Py_ERROR_REPLACE: | 332 | | memset(p, '?', endpos - startpos); | 333 | | p += (endpos - startpos); | 334 | | _Py_FALLTHROUGH; | 335 | | case _Py_ERROR_IGNORE: | 336 | | i += (endpos - startpos - 1); | 337 | | break; | 338 | | | 339 | | case _Py_ERROR_SURROGATEPASS: | 340 | | for (k=startpos; k<endpos; k++) { | 341 | | ch = data[k]; | 342 | | *p++ = (char)(0xe0 | (ch >> 12)); | 343 | | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 344 | | *p++ = (char)(0x80 | (ch & 0x3f)); | 345 | | } | 346 | | i += (endpos - startpos - 1); | 347 | | break; | 348 | | | 349 | | case _Py_ERROR_BACKSLASHREPLACE: | 350 | | /* subtract preallocated bytes */ | 351 | | writer->min_size -= max_char_size * (endpos - startpos); | 352 | | p = backslashreplace(writer, p, | 353 | | unicode, startpos, endpos); | 354 | | if (p == NULL) | 355 | | goto error; | 356 | | i += (endpos - startpos - 1); | 357 | | break; | 358 | | | 359 | | case _Py_ERROR_XMLCHARREFREPLACE: | 360 | | /* subtract preallocated bytes */ | 361 | | writer->min_size -= max_char_size * (endpos - startpos); | 362 | | p = xmlcharrefreplace(writer, p, | 363 | | unicode, startpos, endpos); | 364 | | if (p == NULL) | 365 | | goto error; | 366 | | i += (endpos - startpos - 1); | 367 | | break; | 368 | | | 369 | | case _Py_ERROR_SURROGATEESCAPE: | 370 | | for (k=startpos; k<endpos; k++) { | 371 | | ch = data[k]; | 372 | | if (!(0xDC80 <= ch && ch <= 0xDCFF)) | 373 | | break; | 374 | | *p++ = (char)(ch & 0xff); | 375 | | } | 376 | | if (k >= endpos) { | 377 | | i += (endpos - startpos - 1); | 378 | | break; | 379 | | } | 380 | | startpos = k; | 381 | | assert(startpos < endpos); | 382 | | _Py_FALLTHROUGH; | 383 | | default: | 384 | | rep = unicode_encode_call_errorhandler( | 385 | | errors, &error_handler_obj, "utf-8", "surrogates not allowed", | 386 | | unicode, &exc, startpos, endpos, &newpos); | 387 | | if (!rep) | 388 | | goto error; | 389 | | | 390 | | if (newpos < startpos) { | 391 | | writer->overallocate = 1; | 392 | | p = _PyBytesWriter_Prepare(writer, p, | 393 | | max_char_size * (startpos - newpos)); | 394 | | if (p == NULL) | 395 | | goto error; | 396 | | } | 397 | | else { | 398 | | /* subtract preallocated bytes */ | 399 | | writer->min_size -= max_char_size * (newpos - startpos); | 400 | | /* Only overallocate the buffer if it's not the last write */ | 401 | | writer->overallocate = (newpos < size); | 402 | | } | 403 | | | 404 | | if (PyBytes_Check(rep)) { | 405 | | p = _PyBytesWriter_WriteBytes(writer, p, | 406 | | PyBytes_AS_STRING(rep), | 407 | | PyBytes_GET_SIZE(rep)); | 408 | | } | 409 | | else { | 410 | | /* rep is unicode */ | 411 | | if (!PyUnicode_IS_ASCII(rep)) { | 412 | | raise_encode_exception(&exc, "utf-8", unicode, | 413 | | startpos, endpos, | 414 | | "surrogates not allowed"); | 415 | | goto error; | 416 | | } | 417 | | | 418 | | p = _PyBytesWriter_WriteBytes(writer, p, | 419 | | PyUnicode_DATA(rep), | 420 | | PyUnicode_GET_LENGTH(rep)); | 421 | | } | 422 | | | 423 | | if (p == NULL) | 424 | | goto error; | 425 | | Py_CLEAR(rep); | 426 | | | 427 | | i = newpos; | 428 | | } | 429 | | | 430 | | /* If overallocation was disabled, ensure that it was the last | 431 | | write. Otherwise, we missed an optimization */ | 432 | | assert(writer->overallocate || i == size); | 433 | | } | 434 | | else | 435 | | #if STRINGLIB_SIZEOF_CHAR > 2 | 436 | | if (ch < 0x10000) | 437 | | #endif | 438 | | { | 439 | | *p++ = (char)(0xe0 | (ch >> 12)); | 440 | | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 441 | | *p++ = (char)(0x80 | (ch & 0x3f)); | 442 | | } | 443 | | #if STRINGLIB_SIZEOF_CHAR > 2 | 444 | | else /* ch >= 0x10000 */ | 445 | | { | 446 | | assert(ch <= MAX_UNICODE); | 447 | | /* Encode UCS4 Unicode ordinals */ | 448 | | *p++ = (char)(0xf0 | (ch >> 18)); | 449 | | *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); | 450 | | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 451 | | *p++ = (char)(0x80 | (ch & 0x3f)); | 452 | | } | 453 | | #endif /* STRINGLIB_SIZEOF_CHAR > 2 */ | 454 | | #endif /* STRINGLIB_SIZEOF_CHAR > 1 */ | 455 | 600M | } | 456 | | | 457 | | #if STRINGLIB_SIZEOF_CHAR > 1 | 458 | | Py_XDECREF(error_handler_obj); | 459 | | Py_XDECREF(exc); | 460 | | #endif | 461 | 4.42M | return p; | 462 | | | 463 | | #if STRINGLIB_SIZEOF_CHAR > 1 | 464 | | error: | 465 | | Py_XDECREF(rep); | 466 | | Py_XDECREF(error_handler_obj); | 467 | | Py_XDECREF(exc); | 468 | | return NULL; | 469 | | #endif | 470 | 4.42M | } |
unicodeobject.c:ucs2lib_utf8_encoder Line | Count | Source | 267 | 1.37M | { | 268 | 1.37M | Py_ssize_t i; /* index into data of next input character */ | 269 | 1.37M | char *p; /* next free byte in output buffer */ | 270 | 1.37M | #if STRINGLIB_SIZEOF_CHAR > 1 | 271 | 1.37M | PyObject *error_handler_obj = NULL; | 272 | 1.37M | PyObject *exc = NULL; | 273 | 1.37M | PyObject *rep = NULL; | 274 | 1.37M | #endif | 275 | | #if STRINGLIB_SIZEOF_CHAR == 1 | 276 | | const Py_ssize_t max_char_size = 2; | 277 | | #elif STRINGLIB_SIZEOF_CHAR == 2 | 278 | | const Py_ssize_t max_char_size = 3; | 279 | | #else /* STRINGLIB_SIZEOF_CHAR == 4 */ | 280 | | const Py_ssize_t max_char_size = 4; | 281 | | #endif | 282 | | | 283 | 1.37M | assert(size >= 0); | 284 | 1.37M | if (size > PY_SSIZE_T_MAX / max_char_size) { | 285 | | /* integer overflow */ | 286 | 0 | PyErr_NoMemory(); | 287 | 0 | return NULL; | 288 | 0 | } | 289 | | | 290 | 1.37M | _PyBytesWriter_Init(writer); | 291 | 1.37M | p = _PyBytesWriter_Alloc(writer, size * max_char_size); | 292 | 1.37M | if (p == NULL) | 293 | 0 | return NULL; | 294 | | | 295 | 1.06G | for (i = 0; i < size;) { | 296 | 1.06G | Py_UCS4 ch = data[i++]; | 297 | | | 298 | 1.06G | if (ch < 0x80) { | 299 | | /* Encode ASCII */ | 300 | 1.03G | *p++ = (char) ch; | 301 | | | 302 | 1.03G | } | 303 | 24.6M | else | 304 | 24.6M | #if STRINGLIB_SIZEOF_CHAR > 1 | 305 | 24.6M | if (ch < 0x0800) | 306 | 402k | #endif | 307 | 402k | { | 308 | | /* Encode Latin-1 */ | 309 | 402k | *p++ = (char)(0xc0 | (ch >> 6)); | 310 | 402k | *p++ = (char)(0x80 | (ch & 0x3f)); | 311 | 402k | } | 312 | 24.2M | #if STRINGLIB_SIZEOF_CHAR > 1 | 313 | 24.2M | else if (Py_UNICODE_IS_SURROGATE(ch)) { | 314 | 362k | Py_ssize_t startpos, endpos, newpos; | 315 | 362k | Py_ssize_t k; | 316 | 362k | if (error_handler == _Py_ERROR_UNKNOWN) { | 317 | 197k | error_handler = _Py_GetErrorHandler(errors); | 318 | 197k | } | 319 | | | 320 | 362k | startpos = i-1; | 321 | 362k | endpos = startpos+1; | 322 | | | 323 | 18.8M | while ((endpos < size) && Py_UNICODE_IS_SURROGATE(data[endpos])) | 324 | 18.4M | endpos++; | 325 | | | 326 | | /* Only overallocate the buffer if it's not the last write */ | 327 | 362k | writer->overallocate = (endpos < size); | 328 | | | 329 | 362k | switch (error_handler) | 330 | 362k | { | 331 | 0 | case _Py_ERROR_REPLACE: | 332 | 0 | memset(p, '?', endpos - startpos); | 333 | 0 | p += (endpos - startpos); | 334 | 0 | _Py_FALLTHROUGH; | 335 | 0 | case _Py_ERROR_IGNORE: | 336 | 0 | i += (endpos - startpos - 1); | 337 | 0 | break; | 338 | | | 339 | 0 | case _Py_ERROR_SURROGATEPASS: | 340 | 0 | for (k=startpos; k<endpos; k++) { | 341 | 0 | ch = data[k]; | 342 | 0 | *p++ = (char)(0xe0 | (ch >> 12)); | 343 | 0 | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 344 | 0 | *p++ = (char)(0x80 | (ch & 0x3f)); | 345 | 0 | } | 346 | 0 | i += (endpos - startpos - 1); | 347 | 0 | break; | 348 | | | 349 | 0 | case _Py_ERROR_BACKSLASHREPLACE: | 350 | | /* subtract preallocated bytes */ | 351 | 0 | writer->min_size -= max_char_size * (endpos - startpos); | 352 | 0 | p = backslashreplace(writer, p, | 353 | 0 | unicode, startpos, endpos); | 354 | 0 | if (p == NULL) | 355 | 0 | goto error; | 356 | 0 | i += (endpos - startpos - 1); | 357 | 0 | break; | 358 | | | 359 | 0 | case _Py_ERROR_XMLCHARREFREPLACE: | 360 | | /* subtract preallocated bytes */ | 361 | 0 | writer->min_size -= max_char_size * (endpos - startpos); | 362 | 0 | p = xmlcharrefreplace(writer, p, | 363 | 0 | unicode, startpos, endpos); | 364 | 0 | if (p == NULL) | 365 | 0 | goto error; | 366 | 0 | i += (endpos - startpos - 1); | 367 | 0 | break; | 368 | | | 369 | 215k | case _Py_ERROR_SURROGATEESCAPE: | 370 | 13.5M | for (k=startpos; k<endpos; k++) { | 371 | 13.3M | ch = data[k]; | 372 | 13.3M | if (!(0xDC80 <= ch && ch <= 0xDCFF)) | 373 | 17 | break; | 374 | 13.3M | *p++ = (char)(ch & 0xff); | 375 | 13.3M | } | 376 | 215k | if (k >= endpos) { | 377 | 215k | i += (endpos - startpos - 1); | 378 | 215k | break; | 379 | 215k | } | 380 | 17 | startpos = k; | 381 | 17 | assert(startpos < endpos); | 382 | 17 | _Py_FALLTHROUGH; | 383 | 147k | default: | 384 | 147k | rep = unicode_encode_call_errorhandler( | 385 | 147k | errors, &error_handler_obj, "utf-8", "surrogates not allowed", | 386 | 147k | unicode, &exc, startpos, endpos, &newpos); | 387 | 147k | if (!rep) | 388 | 147k | goto error; | 389 | | | 390 | 0 | if (newpos < startpos) { | 391 | 0 | writer->overallocate = 1; | 392 | 0 | p = _PyBytesWriter_Prepare(writer, p, | 393 | 0 | max_char_size * (startpos - newpos)); | 394 | 0 | if (p == NULL) | 395 | 0 | goto error; | 396 | 0 | } | 397 | 0 | else { | 398 | | /* subtract preallocated bytes */ | 399 | 0 | writer->min_size -= max_char_size * (newpos - startpos); | 400 | | /* Only overallocate the buffer if it's not the last write */ | 401 | 0 | writer->overallocate = (newpos < size); | 402 | 0 | } | 403 | | | 404 | 0 | if (PyBytes_Check(rep)) { | 405 | 0 | p = _PyBytesWriter_WriteBytes(writer, p, | 406 | 0 | PyBytes_AS_STRING(rep), | 407 | 0 | PyBytes_GET_SIZE(rep)); | 408 | 0 | } | 409 | 0 | else { | 410 | | /* rep is unicode */ | 411 | 0 | if (!PyUnicode_IS_ASCII(rep)) { | 412 | 0 | raise_encode_exception(&exc, "utf-8", unicode, | 413 | 0 | startpos, endpos, | 414 | 0 | "surrogates not allowed"); | 415 | 0 | goto error; | 416 | 0 | } | 417 | | | 418 | 0 | p = _PyBytesWriter_WriteBytes(writer, p, | 419 | 0 | PyUnicode_DATA(rep), | 420 | 0 | PyUnicode_GET_LENGTH(rep)); | 421 | 0 | } | 422 | | | 423 | 0 | if (p == NULL) | 424 | 0 | goto error; | 425 | 0 | Py_CLEAR(rep); | 426 | |
| 427 | 0 | i = newpos; | 428 | 362k | } | 429 | | | 430 | | /* If overallocation was disabled, ensure that it was the last | 431 | | write. Otherwise, we missed an optimization */ | 432 | 215k | assert(writer->overallocate || i == size); | 433 | 215k | } | 434 | 23.8M | else | 435 | | #if STRINGLIB_SIZEOF_CHAR > 2 | 436 | | if (ch < 0x10000) | 437 | | #endif | 438 | 23.8M | { | 439 | 23.8M | *p++ = (char)(0xe0 | (ch >> 12)); | 440 | 23.8M | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 441 | 23.8M | *p++ = (char)(0x80 | (ch & 0x3f)); | 442 | 23.8M | } | 443 | | #if STRINGLIB_SIZEOF_CHAR > 2 | 444 | | else /* ch >= 0x10000 */ | 445 | | { | 446 | | assert(ch <= MAX_UNICODE); | 447 | | /* Encode UCS4 Unicode ordinals */ | 448 | | *p++ = (char)(0xf0 | (ch >> 18)); | 449 | | *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); | 450 | | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 451 | | *p++ = (char)(0x80 | (ch & 0x3f)); | 452 | | } | 453 | | #endif /* STRINGLIB_SIZEOF_CHAR > 2 */ | 454 | 1.06G | #endif /* STRINGLIB_SIZEOF_CHAR > 1 */ | 455 | 1.06G | } | 456 | | | 457 | 1.22M | #if STRINGLIB_SIZEOF_CHAR > 1 | 458 | 1.22M | Py_XDECREF(error_handler_obj); | 459 | 1.22M | Py_XDECREF(exc); | 460 | 1.22M | #endif | 461 | 1.22M | return p; | 462 | | | 463 | 0 | #if STRINGLIB_SIZEOF_CHAR > 1 | 464 | 147k | error: | 465 | 147k | Py_XDECREF(rep); | 466 | 147k | Py_XDECREF(error_handler_obj); | 467 | 147k | Py_XDECREF(exc); | 468 | 147k | return NULL; | 469 | 1.37M | #endif | 470 | 1.37M | } |
unicodeobject.c:ucs4lib_utf8_encoder Line | Count | Source | 267 | 72.0k | { | 268 | 72.0k | Py_ssize_t i; /* index into data of next input character */ | 269 | 72.0k | char *p; /* next free byte in output buffer */ | 270 | 72.0k | #if STRINGLIB_SIZEOF_CHAR > 1 | 271 | 72.0k | PyObject *error_handler_obj = NULL; | 272 | 72.0k | PyObject *exc = NULL; | 273 | 72.0k | PyObject *rep = NULL; | 274 | 72.0k | #endif | 275 | | #if STRINGLIB_SIZEOF_CHAR == 1 | 276 | | const Py_ssize_t max_char_size = 2; | 277 | | #elif STRINGLIB_SIZEOF_CHAR == 2 | 278 | | const Py_ssize_t max_char_size = 3; | 279 | | #else /* STRINGLIB_SIZEOF_CHAR == 4 */ | 280 | 72.0k | const Py_ssize_t max_char_size = 4; | 281 | 72.0k | #endif | 282 | | | 283 | 72.0k | assert(size >= 0); | 284 | 72.0k | if (size > PY_SSIZE_T_MAX / max_char_size) { | 285 | | /* integer overflow */ | 286 | 0 | PyErr_NoMemory(); | 287 | 0 | return NULL; | 288 | 0 | } | 289 | | | 290 | 72.0k | _PyBytesWriter_Init(writer); | 291 | 72.0k | p = _PyBytesWriter_Alloc(writer, size * max_char_size); | 292 | 72.0k | if (p == NULL) | 293 | 0 | return NULL; | 294 | | | 295 | 1.52G | for (i = 0; i < size;) { | 296 | 1.52G | Py_UCS4 ch = data[i++]; | 297 | | | 298 | 1.52G | if (ch < 0x80) { | 299 | | /* Encode ASCII */ | 300 | 1.48G | *p++ = (char) ch; | 301 | | | 302 | 1.48G | } | 303 | 32.8M | else | 304 | 32.8M | #if STRINGLIB_SIZEOF_CHAR > 1 | 305 | 32.8M | if (ch < 0x0800) | 306 | 991k | #endif | 307 | 991k | { | 308 | | /* Encode Latin-1 */ | 309 | 991k | *p++ = (char)(0xc0 | (ch >> 6)); | 310 | 991k | *p++ = (char)(0x80 | (ch & 0x3f)); | 311 | 991k | } | 312 | 31.8M | #if STRINGLIB_SIZEOF_CHAR > 1 | 313 | 31.8M | else if (Py_UNICODE_IS_SURROGATE(ch)) { | 314 | 16.0k | Py_ssize_t startpos, endpos, newpos; | 315 | 16.0k | Py_ssize_t k; | 316 | 16.0k | if (error_handler == _Py_ERROR_UNKNOWN) { | 317 | 8.10k | error_handler = _Py_GetErrorHandler(errors); | 318 | 8.10k | } | 319 | | | 320 | 16.0k | startpos = i-1; | 321 | 16.0k | endpos = startpos+1; | 322 | | | 323 | 103k | while ((endpos < size) && Py_UNICODE_IS_SURROGATE(data[endpos])) | 324 | 87.8k | endpos++; | 325 | | | 326 | | /* Only overallocate the buffer if it's not the last write */ | 327 | 16.0k | writer->overallocate = (endpos < size); | 328 | | | 329 | 16.0k | switch (error_handler) | 330 | 16.0k | { | 331 | 0 | case _Py_ERROR_REPLACE: | 332 | 0 | memset(p, '?', endpos - startpos); | 333 | 0 | p += (endpos - startpos); | 334 | 0 | _Py_FALLTHROUGH; | 335 | 0 | case _Py_ERROR_IGNORE: | 336 | 0 | i += (endpos - startpos - 1); | 337 | 0 | break; | 338 | | | 339 | 0 | case _Py_ERROR_SURROGATEPASS: | 340 | 0 | for (k=startpos; k<endpos; k++) { | 341 | 0 | ch = data[k]; | 342 | 0 | *p++ = (char)(0xe0 | (ch >> 12)); | 343 | 0 | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 344 | 0 | *p++ = (char)(0x80 | (ch & 0x3f)); | 345 | 0 | } | 346 | 0 | i += (endpos - startpos - 1); | 347 | 0 | break; | 348 | | | 349 | 0 | case _Py_ERROR_BACKSLASHREPLACE: | 350 | | /* subtract preallocated bytes */ | 351 | 0 | writer->min_size -= max_char_size * (endpos - startpos); | 352 | 0 | p = backslashreplace(writer, p, | 353 | 0 | unicode, startpos, endpos); | 354 | 0 | if (p == NULL) | 355 | 0 | goto error; | 356 | 0 | i += (endpos - startpos - 1); | 357 | 0 | break; | 358 | | | 359 | 0 | case _Py_ERROR_XMLCHARREFREPLACE: | 360 | | /* subtract preallocated bytes */ | 361 | 0 | writer->min_size -= max_char_size * (endpos - startpos); | 362 | 0 | p = xmlcharrefreplace(writer, p, | 363 | 0 | unicode, startpos, endpos); | 364 | 0 | if (p == NULL) | 365 | 0 | goto error; | 366 | 0 | i += (endpos - startpos - 1); | 367 | 0 | break; | 368 | | | 369 | 10.9k | case _Py_ERROR_SURROGATEESCAPE: | 370 | 105k | for (k=startpos; k<endpos; k++) { | 371 | 94.2k | ch = data[k]; | 372 | 94.2k | if (!(0xDC80 <= ch && ch <= 0xDCFF)) | 373 | 8 | break; | 374 | 94.2k | *p++ = (char)(ch & 0xff); | 375 | 94.2k | } | 376 | 10.9k | if (k >= endpos) { | 377 | 10.9k | i += (endpos - startpos - 1); | 378 | 10.9k | break; | 379 | 10.9k | } | 380 | 8 | startpos = k; | 381 | 8 | assert(startpos < endpos); | 382 | 8 | _Py_FALLTHROUGH; | 383 | 5.11k | default: | 384 | 5.11k | rep = unicode_encode_call_errorhandler( | 385 | 5.11k | errors, &error_handler_obj, "utf-8", "surrogates not allowed", | 386 | 5.11k | unicode, &exc, startpos, endpos, &newpos); | 387 | 5.11k | if (!rep) | 388 | 5.11k | goto error; | 389 | | | 390 | 0 | if (newpos < startpos) { | 391 | 0 | writer->overallocate = 1; | 392 | 0 | p = _PyBytesWriter_Prepare(writer, p, | 393 | 0 | max_char_size * (startpos - newpos)); | 394 | 0 | if (p == NULL) | 395 | 0 | goto error; | 396 | 0 | } | 397 | 0 | else { | 398 | | /* subtract preallocated bytes */ | 399 | 0 | writer->min_size -= max_char_size * (newpos - startpos); | 400 | | /* Only overallocate the buffer if it's not the last write */ | 401 | 0 | writer->overallocate = (newpos < size); | 402 | 0 | } | 403 | | | 404 | 0 | if (PyBytes_Check(rep)) { | 405 | 0 | p = _PyBytesWriter_WriteBytes(writer, p, | 406 | 0 | PyBytes_AS_STRING(rep), | 407 | 0 | PyBytes_GET_SIZE(rep)); | 408 | 0 | } | 409 | 0 | else { | 410 | | /* rep is unicode */ | 411 | 0 | if (!PyUnicode_IS_ASCII(rep)) { | 412 | 0 | raise_encode_exception(&exc, "utf-8", unicode, | 413 | 0 | startpos, endpos, | 414 | 0 | "surrogates not allowed"); | 415 | 0 | goto error; | 416 | 0 | } | 417 | | | 418 | 0 | p = _PyBytesWriter_WriteBytes(writer, p, | 419 | 0 | PyUnicode_DATA(rep), | 420 | 0 | PyUnicode_GET_LENGTH(rep)); | 421 | 0 | } | 422 | | | 423 | 0 | if (p == NULL) | 424 | 0 | goto error; | 425 | 0 | Py_CLEAR(rep); | 426 | |
| 427 | 0 | i = newpos; | 428 | 16.0k | } | 429 | | | 430 | | /* If overallocation was disabled, ensure that it was the last | 431 | | write. Otherwise, we missed an optimization */ | 432 | 10.9k | assert(writer->overallocate || i == size); | 433 | 10.9k | } | 434 | 31.8M | else | 435 | 31.8M | #if STRINGLIB_SIZEOF_CHAR > 2 | 436 | 31.8M | if (ch < 0x10000) | 437 | 31.6M | #endif | 438 | 31.6M | { | 439 | 31.6M | *p++ = (char)(0xe0 | (ch >> 12)); | 440 | 31.6M | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 441 | 31.6M | *p++ = (char)(0x80 | (ch & 0x3f)); | 442 | 31.6M | } | 443 | 160k | #if STRINGLIB_SIZEOF_CHAR > 2 | 444 | 160k | else /* ch >= 0x10000 */ | 445 | 160k | { | 446 | 160k | assert(ch <= MAX_UNICODE); | 447 | | /* Encode UCS4 Unicode ordinals */ | 448 | 160k | *p++ = (char)(0xf0 | (ch >> 18)); | 449 | 160k | *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); | 450 | 160k | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 451 | 160k | *p++ = (char)(0x80 | (ch & 0x3f)); | 452 | 160k | } | 453 | 1.52G | #endif /* STRINGLIB_SIZEOF_CHAR > 2 */ | 454 | 1.52G | #endif /* STRINGLIB_SIZEOF_CHAR > 1 */ | 455 | 1.52G | } | 456 | | | 457 | 66.9k | #if STRINGLIB_SIZEOF_CHAR > 1 | 458 | 66.9k | Py_XDECREF(error_handler_obj); | 459 | 66.9k | Py_XDECREF(exc); | 460 | 66.9k | #endif | 461 | 66.9k | return p; | 462 | | | 463 | 0 | #if STRINGLIB_SIZEOF_CHAR > 1 | 464 | 5.11k | error: | 465 | 5.11k | Py_XDECREF(rep); | 466 | 5.11k | Py_XDECREF(error_handler_obj); | 467 | 5.11k | Py_XDECREF(exc); | 468 | 5.11k | return NULL; | 469 | 72.0k | #endif | 470 | 72.0k | } |
Unexecuted instantiation: unicodeobject.c:asciilib_utf8_encoder |
471 | | |
472 | | /* The pattern for constructing UCS2-repeated masks. */ |
473 | | #if SIZEOF_LONG == 8 |
474 | 533k | # define UCS2_REPEAT_MASK 0x0001000100010001ul |
475 | | #elif SIZEOF_LONG == 4 |
476 | | # define UCS2_REPEAT_MASK 0x00010001ul |
477 | | #else |
478 | | # error C 'long' size should be either 4 or 8! |
479 | | #endif |
480 | | |
481 | | /* The mask for fast checking. */ |
482 | | #if STRINGLIB_SIZEOF_CHAR == 1 |
483 | | /* The mask for fast checking of whether a C 'long' contains a |
484 | | non-ASCII or non-Latin1 UTF16-encoded characters. */ |
485 | 10.3k | # define FAST_CHAR_MASK (UCS2_REPEAT_MASK * (0xFFFFu & ~STRINGLIB_MAX_CHAR)) |
486 | | #else |
487 | | /* The mask for fast checking of whether a C 'long' may contain |
488 | | UTF16-encoded surrogate characters. This is an efficient heuristic, |
489 | | assuming that non-surrogate characters with a code point >= 0x8000 are |
490 | | rare in most input. |
491 | | */ |
492 | 492k | # define FAST_CHAR_MASK (UCS2_REPEAT_MASK * 0x8000u) |
493 | | #endif |
494 | | /* The mask for fast byte-swapping. */ |
495 | 30.8k | #define STRIPPED_MASK (UCS2_REPEAT_MASK * 0x00FFu) |
496 | | /* Swap bytes. */ |
497 | 15.4k | #define SWAB(value) ((((value) >> 8) & STRIPPED_MASK) | \ |
498 | 15.4k | (((value) & STRIPPED_MASK) << 8)) |
499 | | |
500 | | Py_LOCAL_INLINE(Py_UCS4) |
501 | | STRINGLIB(utf16_decode)(const unsigned char **inptr, const unsigned char *e, |
502 | | STRINGLIB_CHAR *dest, Py_ssize_t *outpos, |
503 | | int native_ordering) |
504 | 45.9k | { |
505 | 45.9k | Py_UCS4 ch; |
506 | 45.9k | const unsigned char *q = *inptr; |
507 | 45.9k | STRINGLIB_CHAR *p = dest + *outpos; |
508 | | /* Offsets from q for retrieving byte pairs in the right order. */ |
509 | 45.9k | #if PY_LITTLE_ENDIAN |
510 | 45.9k | int ihi = !!native_ordering, ilo = !native_ordering; |
511 | | #else |
512 | | int ihi = !native_ordering, ilo = !!native_ordering; |
513 | | #endif |
514 | 45.9k | --e; |
515 | | |
516 | 212k | while (q < e) { |
517 | 206k | Py_UCS4 ch2; |
518 | | /* First check for possible aligned read of a C 'long'. Unaligned |
519 | | reads are more expensive, better to defer to another iteration. */ |
520 | 206k | if (_Py_IS_ALIGNED(q, ALIGNOF_LONG)) { |
521 | | /* Fast path for runs of in-range non-surrogate chars. */ |
522 | 56.4k | const unsigned char *_q = q; |
523 | 525k | while (_q + SIZEOF_LONG <= e) { |
524 | 514k | unsigned long block = * (const unsigned long *) _q; |
525 | 514k | if (native_ordering) { |
526 | | /* Can use buffer directly */ |
527 | 503k | if (block & FAST_CHAR_MASK) |
528 | 38.9k | break; |
529 | 503k | } |
530 | 11.3k | else { |
531 | | /* Need to byte-swap */ |
532 | 11.3k | if (block & SWAB(FAST_CHAR_MASK)) |
533 | 6.22k | break; |
534 | | #if STRINGLIB_SIZEOF_CHAR == 1 |
535 | 1.00k | block >>= 8; |
536 | | #else |
537 | 4.09k | block = SWAB(block); |
538 | | #endif |
539 | 4.09k | } |
540 | 469k | #if PY_LITTLE_ENDIAN |
541 | | # if SIZEOF_LONG == 4 |
542 | | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); |
543 | | p[1] = (STRINGLIB_CHAR)(block >> 16); |
544 | | # elif SIZEOF_LONG == 8 |
545 | 469k | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); |
546 | 469k | p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); |
547 | 469k | p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); |
548 | 469k | p[3] = (STRINGLIB_CHAR)(block >> 48); |
549 | 469k | # endif |
550 | | #else |
551 | | # if SIZEOF_LONG == 4 |
552 | | p[0] = (STRINGLIB_CHAR)(block >> 16); |
553 | | p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu); |
554 | | # elif SIZEOF_LONG == 8 |
555 | | p[0] = (STRINGLIB_CHAR)(block >> 48); |
556 | | p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); |
557 | | p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); |
558 | | p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu); |
559 | | # endif |
560 | | #endif |
561 | 469k | _q += SIZEOF_LONG; |
562 | 469k | p += SIZEOF_LONG / 2; |
563 | 469k | } |
564 | 56.4k | q = _q; |
565 | 56.4k | if (q >= e) |
566 | 944 | break; |
567 | 56.4k | } |
568 | | |
569 | 205k | ch = (q[ihi] << 8) | q[ilo]; |
570 | 205k | q += 2; |
571 | 205k | if (!Py_UNICODE_IS_SURROGATE(ch)) { |
572 | | #if STRINGLIB_SIZEOF_CHAR < 2 |
573 | 29.2k | if (ch > STRINGLIB_MAX_CHAR) |
574 | | /* Out-of-range */ |
575 | 12.9k | goto Return; |
576 | 16.3k | #endif |
577 | 16.3k | *p++ = (STRINGLIB_CHAR)ch; |
578 | 16.3k | continue; |
579 | 178k | } |
580 | | |
581 | | /* UTF-16 code pair: */ |
582 | 27.3k | if (!Py_UNICODE_IS_HIGH_SURROGATE(ch)) |
583 | 14.4k | goto IllegalEncoding; |
584 | 12.9k | if (q >= e) |
585 | 1.66k | goto UnexpectedEnd; |
586 | 11.2k | ch2 = (q[ihi] << 8) | q[ilo]; |
587 | 11.2k | q += 2; |
588 | 11.2k | if (!Py_UNICODE_IS_LOW_SURROGATE(ch2)) |
589 | 6.99k | goto IllegalSurrogate; |
590 | 4.28k | ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2); |
591 | | #if STRINGLIB_SIZEOF_CHAR < 4 |
592 | | /* Out-of-range */ |
593 | 3.48k | goto Return; |
594 | | #else |
595 | | *p++ = (STRINGLIB_CHAR)ch; |
596 | | #endif |
597 | 803 | } |
598 | 6.39k | ch = 0; |
599 | 45.9k | Return: |
600 | 45.9k | *inptr = q; |
601 | 45.9k | *outpos = p - dest; |
602 | 45.9k | return ch; |
603 | 1.66k | UnexpectedEnd: |
604 | 1.66k | ch = 1; |
605 | 1.66k | goto Return; |
606 | 14.4k | IllegalEncoding: |
607 | 14.4k | ch = 2; |
608 | 14.4k | goto Return; |
609 | 6.99k | IllegalSurrogate: |
610 | 6.99k | ch = 3; |
611 | 6.99k | goto Return; |
612 | 6.39k | } unicodeobject.c:asciilib_utf16_decode Line | Count | Source | 504 | 14.0k | { | 505 | 14.0k | Py_UCS4 ch; | 506 | 14.0k | const unsigned char *q = *inptr; | 507 | 14.0k | STRINGLIB_CHAR *p = dest + *outpos; | 508 | | /* Offsets from q for retrieving byte pairs in the right order. */ | 509 | 14.0k | #if PY_LITTLE_ENDIAN | 510 | 14.0k | int ihi = !!native_ordering, ilo = !native_ordering; | 511 | | #else | 512 | | int ihi = !native_ordering, ilo = !!native_ordering; | 513 | | #endif | 514 | 14.0k | --e; | 515 | | | 516 | 24.8k | while (q < e) { | 517 | 24.3k | Py_UCS4 ch2; | 518 | | /* First check for possible aligned read of a C 'long'. Unaligned | 519 | | reads are more expensive, better to defer to another iteration. */ | 520 | 24.3k | if (_Py_IS_ALIGNED(q, ALIGNOF_LONG)) { | 521 | | /* Fast path for runs of in-range non-surrogate chars. */ | 522 | 13.1k | const unsigned char *_q = q; | 523 | 16.0k | while (_q + SIZEOF_LONG <= e) { | 524 | 10.9k | unsigned long block = * (const unsigned long *) _q; | 525 | 10.9k | if (native_ordering) { | 526 | | /* Can use buffer directly */ | 527 | 8.30k | if (block & FAST_CHAR_MASK) | 528 | 6.24k | break; | 529 | 8.30k | } | 530 | 2.63k | else { | 531 | | /* Need to byte-swap */ | 532 | 2.63k | if (block & SWAB(FAST_CHAR_MASK)) | 533 | 1.89k | break; | 534 | 746 | #if STRINGLIB_SIZEOF_CHAR == 1 | 535 | 746 | block >>= 8; | 536 | | #else | 537 | | block = SWAB(block); | 538 | | #endif | 539 | 746 | } | 540 | 2.81k | #if PY_LITTLE_ENDIAN | 541 | | # if SIZEOF_LONG == 4 | 542 | | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 543 | | p[1] = (STRINGLIB_CHAR)(block >> 16); | 544 | | # elif SIZEOF_LONG == 8 | 545 | 2.81k | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 546 | 2.81k | p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); | 547 | 2.81k | p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); | 548 | 2.81k | p[3] = (STRINGLIB_CHAR)(block >> 48); | 549 | 2.81k | # endif | 550 | | #else | 551 | | # if SIZEOF_LONG == 4 | 552 | | p[0] = (STRINGLIB_CHAR)(block >> 16); | 553 | | p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 554 | | # elif SIZEOF_LONG == 8 | 555 | | p[0] = (STRINGLIB_CHAR)(block >> 48); | 556 | | p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); | 557 | | p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); | 558 | | p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 559 | | # endif | 560 | | #endif | 561 | 2.81k | _q += SIZEOF_LONG; | 562 | 2.81k | p += SIZEOF_LONG / 2; | 563 | 2.81k | } | 564 | 13.1k | q = _q; | 565 | 13.1k | if (q >= e) | 566 | 296 | break; | 567 | 13.1k | } | 568 | | | 569 | 24.0k | ch = (q[ihi] << 8) | q[ilo]; | 570 | 24.0k | q += 2; | 571 | 24.0k | if (!Py_UNICODE_IS_SURROGATE(ch)) { | 572 | 22.2k | #if STRINGLIB_SIZEOF_CHAR < 2 | 573 | 22.2k | if (ch > STRINGLIB_MAX_CHAR) | 574 | | /* Out-of-range */ | 575 | 11.3k | goto Return; | 576 | 10.8k | #endif | 577 | 10.8k | *p++ = (STRINGLIB_CHAR)ch; | 578 | 10.8k | continue; | 579 | 22.2k | } | 580 | | | 581 | | /* UTF-16 code pair: */ | 582 | 1.85k | if (!Py_UNICODE_IS_HIGH_SURROGATE(ch)) | 583 | 955 | goto IllegalEncoding; | 584 | 900 | if (q >= e) | 585 | 264 | goto UnexpectedEnd; | 586 | 636 | ch2 = (q[ihi] << 8) | q[ilo]; | 587 | 636 | q += 2; | 588 | 636 | if (!Py_UNICODE_IS_LOW_SURROGATE(ch2)) | 589 | 239 | goto IllegalSurrogate; | 590 | 397 | ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2); | 591 | 397 | #if STRINGLIB_SIZEOF_CHAR < 4 | 592 | | /* Out-of-range */ | 593 | 397 | goto Return; | 594 | | #else | 595 | | *p++ = (STRINGLIB_CHAR)ch; | 596 | | #endif | 597 | 636 | } | 598 | 776 | ch = 0; | 599 | 14.0k | Return: | 600 | 14.0k | *inptr = q; | 601 | 14.0k | *outpos = p - dest; | 602 | 14.0k | return ch; | 603 | 264 | UnexpectedEnd: | 604 | 264 | ch = 1; | 605 | 264 | goto Return; | 606 | 955 | IllegalEncoding: | 607 | 955 | ch = 2; | 608 | 955 | goto Return; | 609 | 239 | IllegalSurrogate: | 610 | 239 | ch = 3; | 611 | 239 | goto Return; | 612 | 776 | } |
unicodeobject.c:ucs1lib_utf16_decode Line | Count | Source | 504 | 3.55k | { | 505 | 3.55k | Py_UCS4 ch; | 506 | 3.55k | const unsigned char *q = *inptr; | 507 | 3.55k | STRINGLIB_CHAR *p = dest + *outpos; | 508 | | /* Offsets from q for retrieving byte pairs in the right order. */ | 509 | 3.55k | #if PY_LITTLE_ENDIAN | 510 | 3.55k | int ihi = !!native_ordering, ilo = !native_ordering; | 511 | | #else | 512 | | int ihi = !native_ordering, ilo = !!native_ordering; | 513 | | #endif | 514 | 3.55k | --e; | 515 | | | 516 | 9.04k | while (q < e) { | 517 | 8.88k | Py_UCS4 ch2; | 518 | | /* First check for possible aligned read of a C 'long'. Unaligned | 519 | | reads are more expensive, better to defer to another iteration. */ | 520 | 8.88k | if (_Py_IS_ALIGNED(q, ALIGNOF_LONG)) { | 521 | | /* Fast path for runs of in-range non-surrogate chars. */ | 522 | 1.50k | const unsigned char *_q = q; | 523 | 2.67k | while (_q + SIZEOF_LONG <= e) { | 524 | 2.46k | unsigned long block = * (const unsigned long *) _q; | 525 | 2.46k | if (native_ordering) { | 526 | | /* Can use buffer directly */ | 527 | 1.99k | if (block & FAST_CHAR_MASK) | 528 | 1.08k | break; | 529 | 1.99k | } | 530 | 464 | else { | 531 | | /* Need to byte-swap */ | 532 | 464 | if (block & SWAB(FAST_CHAR_MASK)) | 533 | 202 | break; | 534 | 262 | #if STRINGLIB_SIZEOF_CHAR == 1 | 535 | 262 | block >>= 8; | 536 | | #else | 537 | | block = SWAB(block); | 538 | | #endif | 539 | 262 | } | 540 | 1.17k | #if PY_LITTLE_ENDIAN | 541 | | # if SIZEOF_LONG == 4 | 542 | | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 543 | | p[1] = (STRINGLIB_CHAR)(block >> 16); | 544 | | # elif SIZEOF_LONG == 8 | 545 | 1.17k | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 546 | 1.17k | p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); | 547 | 1.17k | p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); | 548 | 1.17k | p[3] = (STRINGLIB_CHAR)(block >> 48); | 549 | 1.17k | # endif | 550 | | #else | 551 | | # if SIZEOF_LONG == 4 | 552 | | p[0] = (STRINGLIB_CHAR)(block >> 16); | 553 | | p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 554 | | # elif SIZEOF_LONG == 8 | 555 | | p[0] = (STRINGLIB_CHAR)(block >> 48); | 556 | | p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); | 557 | | p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); | 558 | | p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 559 | | # endif | 560 | | #endif | 561 | 1.17k | _q += SIZEOF_LONG; | 562 | 1.17k | p += SIZEOF_LONG / 2; | 563 | 1.17k | } | 564 | 1.50k | q = _q; | 565 | 1.50k | if (q >= e) | 566 | 122 | break; | 567 | 1.50k | } | 568 | | | 569 | 8.75k | ch = (q[ihi] << 8) | q[ilo]; | 570 | 8.75k | q += 2; | 571 | 8.75k | if (!Py_UNICODE_IS_SURROGATE(ch)) { | 572 | 7.04k | #if STRINGLIB_SIZEOF_CHAR < 2 | 573 | 7.04k | if (ch > STRINGLIB_MAX_CHAR) | 574 | | /* Out-of-range */ | 575 | 1.55k | goto Return; | 576 | 5.48k | #endif | 577 | 5.48k | *p++ = (STRINGLIB_CHAR)ch; | 578 | 5.48k | continue; | 579 | 7.04k | } | 580 | | | 581 | | /* UTF-16 code pair: */ | 582 | 1.71k | if (!Py_UNICODE_IS_HIGH_SURROGATE(ch)) | 583 | 187 | goto IllegalEncoding; | 584 | 1.52k | if (q >= e) | 585 | 79 | goto UnexpectedEnd; | 586 | 1.44k | ch2 = (q[ihi] << 8) | q[ilo]; | 587 | 1.44k | q += 2; | 588 | 1.44k | if (!Py_UNICODE_IS_LOW_SURROGATE(ch2)) | 589 | 1.04k | goto IllegalSurrogate; | 590 | 406 | ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2); | 591 | 406 | #if STRINGLIB_SIZEOF_CHAR < 4 | 592 | | /* Out-of-range */ | 593 | 406 | goto Return; | 594 | | #else | 595 | | *p++ = (STRINGLIB_CHAR)ch; | 596 | | #endif | 597 | 1.44k | } | 598 | 281 | ch = 0; | 599 | 3.55k | Return: | 600 | 3.55k | *inptr = q; | 601 | 3.55k | *outpos = p - dest; | 602 | 3.55k | return ch; | 603 | 79 | UnexpectedEnd: | 604 | 79 | ch = 1; | 605 | 79 | goto Return; | 606 | 187 | IllegalEncoding: | 607 | 187 | ch = 2; | 608 | 187 | goto Return; | 609 | 1.04k | IllegalSurrogate: | 610 | 1.04k | ch = 3; | 611 | 1.04k | goto Return; | 612 | 281 | } |
unicodeobject.c:ucs2lib_utf16_decode Line | Count | Source | 504 | 12.2k | { | 505 | 12.2k | Py_UCS4 ch; | 506 | 12.2k | const unsigned char *q = *inptr; | 507 | 12.2k | STRINGLIB_CHAR *p = dest + *outpos; | 508 | | /* Offsets from q for retrieving byte pairs in the right order. */ | 509 | 12.2k | #if PY_LITTLE_ENDIAN | 510 | 12.2k | int ihi = !!native_ordering, ilo = !native_ordering; | 511 | | #else | 512 | | int ihi = !native_ordering, ilo = !!native_ordering; | 513 | | #endif | 514 | 12.2k | --e; | 515 | | | 516 | 119k | while (q < e) { | 517 | 115k | Py_UCS4 ch2; | 518 | | /* First check for possible aligned read of a C 'long'. Unaligned | 519 | | reads are more expensive, better to defer to another iteration. */ | 520 | 115k | if (_Py_IS_ALIGNED(q, ALIGNOF_LONG)) { | 521 | | /* Fast path for runs of in-range non-surrogate chars. */ | 522 | 27.0k | const unsigned char *_q = q; | 523 | 487k | while (_q + SIZEOF_LONG <= e) { | 524 | 482k | unsigned long block = * (const unsigned long *) _q; | 525 | 482k | if (native_ordering) { | 526 | | /* Can use buffer directly */ | 527 | 477k | if (block & FAST_CHAR_MASK) | 528 | 19.9k | break; | 529 | 477k | } | 530 | 5.99k | else { | 531 | | /* Need to byte-swap */ | 532 | 5.99k | if (block & SWAB(FAST_CHAR_MASK)) | 533 | 2.94k | break; | 534 | | #if STRINGLIB_SIZEOF_CHAR == 1 | 535 | | block >>= 8; | 536 | | #else | 537 | 3.04k | block = SWAB(block); | 538 | 3.04k | #endif | 539 | 3.04k | } | 540 | 460k | #if PY_LITTLE_ENDIAN | 541 | | # if SIZEOF_LONG == 4 | 542 | | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 543 | | p[1] = (STRINGLIB_CHAR)(block >> 16); | 544 | | # elif SIZEOF_LONG == 8 | 545 | 460k | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 546 | 460k | p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); | 547 | 460k | p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); | 548 | 460k | p[3] = (STRINGLIB_CHAR)(block >> 48); | 549 | 460k | # endif | 550 | | #else | 551 | | # if SIZEOF_LONG == 4 | 552 | | p[0] = (STRINGLIB_CHAR)(block >> 16); | 553 | | p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 554 | | # elif SIZEOF_LONG == 8 | 555 | | p[0] = (STRINGLIB_CHAR)(block >> 48); | 556 | | p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); | 557 | | p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); | 558 | | p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 559 | | # endif | 560 | | #endif | 561 | 460k | _q += SIZEOF_LONG; | 562 | 460k | p += SIZEOF_LONG / 2; | 563 | 460k | } | 564 | 27.0k | q = _q; | 565 | 27.0k | if (q >= e) | 566 | 395 | break; | 567 | 27.0k | } | 568 | | | 569 | 115k | ch = (q[ihi] << 8) | q[ilo]; | 570 | 115k | q += 2; | 571 | 115k | if (!Py_UNICODE_IS_SURROGATE(ch)) { | 572 | | #if STRINGLIB_SIZEOF_CHAR < 2 | 573 | | if (ch > STRINGLIB_MAX_CHAR) | 574 | | /* Out-of-range */ | 575 | | goto Return; | 576 | | #endif | 577 | 107k | *p++ = (STRINGLIB_CHAR)ch; | 578 | 107k | continue; | 579 | 107k | } | 580 | | | 581 | | /* UTF-16 code pair: */ | 582 | 7.81k | if (!Py_UNICODE_IS_HIGH_SURROGATE(ch)) | 583 | 2.43k | goto IllegalEncoding; | 584 | 5.38k | if (q >= e) | 585 | 422 | goto UnexpectedEnd; | 586 | 4.96k | ch2 = (q[ihi] << 8) | q[ilo]; | 587 | 4.96k | q += 2; | 588 | 4.96k | if (!Py_UNICODE_IS_LOW_SURROGATE(ch2)) | 589 | 2.27k | goto IllegalSurrogate; | 590 | 2.68k | ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2); | 591 | 2.68k | #if STRINGLIB_SIZEOF_CHAR < 4 | 592 | | /* Out-of-range */ | 593 | 2.68k | goto Return; | 594 | | #else | 595 | | *p++ = (STRINGLIB_CHAR)ch; | 596 | | #endif | 597 | 4.96k | } | 598 | 4.45k | ch = 0; | 599 | 12.2k | Return: | 600 | 12.2k | *inptr = q; | 601 | 12.2k | *outpos = p - dest; | 602 | 12.2k | return ch; | 603 | 422 | UnexpectedEnd: | 604 | 422 | ch = 1; | 605 | 422 | goto Return; | 606 | 2.43k | IllegalEncoding: | 607 | 2.43k | ch = 2; | 608 | 2.43k | goto Return; | 609 | 2.27k | IllegalSurrogate: | 610 | 2.27k | ch = 3; | 611 | 2.27k | goto Return; | 612 | 4.45k | } |
unicodeobject.c:ucs4lib_utf16_decode Line | Count | Source | 504 | 16.0k | { | 505 | 16.0k | Py_UCS4 ch; | 506 | 16.0k | const unsigned char *q = *inptr; | 507 | 16.0k | STRINGLIB_CHAR *p = dest + *outpos; | 508 | | /* Offsets from q for retrieving byte pairs in the right order. */ | 509 | 16.0k | #if PY_LITTLE_ENDIAN | 510 | 16.0k | int ihi = !!native_ordering, ilo = !native_ordering; | 511 | | #else | 512 | | int ihi = !native_ordering, ilo = !!native_ordering; | 513 | | #endif | 514 | 16.0k | --e; | 515 | | | 516 | 58.8k | while (q < e) { | 517 | 58.0k | Py_UCS4 ch2; | 518 | | /* First check for possible aligned read of a C 'long'. Unaligned | 519 | | reads are more expensive, better to defer to another iteration. */ | 520 | 58.0k | if (_Py_IS_ALIGNED(q, ALIGNOF_LONG)) { | 521 | | /* Fast path for runs of in-range non-surrogate chars. */ | 522 | 14.7k | const unsigned char *_q = q; | 523 | 19.9k | while (_q + SIZEOF_LONG <= e) { | 524 | 18.0k | unsigned long block = * (const unsigned long *) _q; | 525 | 18.0k | if (native_ordering) { | 526 | | /* Can use buffer directly */ | 527 | 15.8k | if (block & FAST_CHAR_MASK) | 528 | 11.6k | break; | 529 | 15.8k | } | 530 | 2.23k | else { | 531 | | /* Need to byte-swap */ | 532 | 2.23k | if (block & SWAB(FAST_CHAR_MASK)) | 533 | 1.18k | break; | 534 | | #if STRINGLIB_SIZEOF_CHAR == 1 | 535 | | block >>= 8; | 536 | | #else | 537 | 1.04k | block = SWAB(block); | 538 | 1.04k | #endif | 539 | 1.04k | } | 540 | 5.19k | #if PY_LITTLE_ENDIAN | 541 | | # if SIZEOF_LONG == 4 | 542 | | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 543 | | p[1] = (STRINGLIB_CHAR)(block >> 16); | 544 | | # elif SIZEOF_LONG == 8 | 545 | 5.19k | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 546 | 5.19k | p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); | 547 | 5.19k | p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); | 548 | 5.19k | p[3] = (STRINGLIB_CHAR)(block >> 48); | 549 | 5.19k | # endif | 550 | | #else | 551 | | # if SIZEOF_LONG == 4 | 552 | | p[0] = (STRINGLIB_CHAR)(block >> 16); | 553 | | p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 554 | | # elif SIZEOF_LONG == 8 | 555 | | p[0] = (STRINGLIB_CHAR)(block >> 48); | 556 | | p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); | 557 | | p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); | 558 | | p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 559 | | # endif | 560 | | #endif | 561 | 5.19k | _q += SIZEOF_LONG; | 562 | 5.19k | p += SIZEOF_LONG / 2; | 563 | 5.19k | } | 564 | 14.7k | q = _q; | 565 | 14.7k | if (q >= e) | 566 | 131 | break; | 567 | 14.7k | } | 568 | | | 569 | 57.9k | ch = (q[ihi] << 8) | q[ilo]; | 570 | 57.9k | q += 2; | 571 | 57.9k | if (!Py_UNICODE_IS_SURROGATE(ch)) { | 572 | | #if STRINGLIB_SIZEOF_CHAR < 2 | 573 | | if (ch > STRINGLIB_MAX_CHAR) | 574 | | /* Out-of-range */ | 575 | | goto Return; | 576 | | #endif | 577 | 41.9k | *p++ = (STRINGLIB_CHAR)ch; | 578 | 41.9k | continue; | 579 | 41.9k | } | 580 | | | 581 | | /* UTF-16 code pair: */ | 582 | 15.9k | if (!Py_UNICODE_IS_HIGH_SURROGATE(ch)) | 583 | 10.8k | goto IllegalEncoding; | 584 | 5.13k | if (q >= e) | 585 | 897 | goto UnexpectedEnd; | 586 | 4.23k | ch2 = (q[ihi] << 8) | q[ilo]; | 587 | 4.23k | q += 2; | 588 | 4.23k | if (!Py_UNICODE_IS_LOW_SURROGATE(ch2)) | 589 | 3.43k | goto IllegalSurrogate; | 590 | 803 | ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2); | 591 | | #if STRINGLIB_SIZEOF_CHAR < 4 | 592 | | /* Out-of-range */ | 593 | | goto Return; | 594 | | #else | 595 | 803 | *p++ = (STRINGLIB_CHAR)ch; | 596 | 803 | #endif | 597 | 803 | } | 598 | 888 | ch = 0; | 599 | 16.0k | Return: | 600 | 16.0k | *inptr = q; | 601 | 16.0k | *outpos = p - dest; | 602 | 16.0k | return ch; | 603 | 897 | UnexpectedEnd: | 604 | 897 | ch = 1; | 605 | 897 | goto Return; | 606 | 10.8k | IllegalEncoding: | 607 | 10.8k | ch = 2; | 608 | 10.8k | goto Return; | 609 | 3.43k | IllegalSurrogate: | 610 | 3.43k | ch = 3; | 611 | 3.43k | goto Return; | 612 | 888 | } |
|
613 | | #undef UCS2_REPEAT_MASK |
614 | | #undef FAST_CHAR_MASK |
615 | | #undef STRIPPED_MASK |
616 | | #undef SWAB |
617 | | |
618 | | |
619 | | #if STRINGLIB_MAX_CHAR >= 0x80 |
620 | | Py_LOCAL_INLINE(Py_ssize_t) |
621 | | STRINGLIB(utf16_encode)(const STRINGLIB_CHAR *in, |
622 | | Py_ssize_t len, |
623 | | unsigned short **outptr, |
624 | | int native_ordering) |
625 | 0 | { |
626 | 0 | unsigned short *out = *outptr; |
627 | 0 | const STRINGLIB_CHAR *end = in + len; |
628 | | #if STRINGLIB_SIZEOF_CHAR == 1 |
629 | 0 | if (native_ordering) { |
630 | 0 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); |
631 | 0 | while (in < unrolled_end) { |
632 | 0 | out[0] = in[0]; |
633 | 0 | out[1] = in[1]; |
634 | 0 | out[2] = in[2]; |
635 | 0 | out[3] = in[3]; |
636 | 0 | in += 4; out += 4; |
637 | 0 | } |
638 | 0 | while (in < end) { |
639 | 0 | *out++ = *in++; |
640 | 0 | } |
641 | 0 | } else { |
642 | 0 | # define SWAB2(CH) ((CH) << 8) /* high byte is zero */ |
643 | 0 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); |
644 | 0 | while (in < unrolled_end) { |
645 | 0 | out[0] = SWAB2(in[0]); |
646 | 0 | out[1] = SWAB2(in[1]); |
647 | 0 | out[2] = SWAB2(in[2]); |
648 | 0 | out[3] = SWAB2(in[3]); |
649 | 0 | in += 4; out += 4; |
650 | 0 | } |
651 | 0 | while (in < end) { |
652 | 0 | Py_UCS4 ch = *in++; |
653 | 0 | *out++ = SWAB2((Py_UCS2)ch); |
654 | 0 | } |
655 | 0 | #undef SWAB2 |
656 | 0 | } |
657 | | *outptr = out; |
658 | | return len; |
659 | | #else |
660 | 0 | if (native_ordering) { |
661 | | #if STRINGLIB_MAX_CHAR < 0x10000 |
662 | 0 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); |
663 | 0 | while (in < unrolled_end) { |
664 | | /* check if any character is a surrogate character */ |
665 | 0 | if (((in[0] ^ 0xd800) & |
666 | 0 | (in[1] ^ 0xd800) & |
667 | 0 | (in[2] ^ 0xd800) & |
668 | 0 | (in[3] ^ 0xd800) & 0xf800) == 0) |
669 | 0 | break; |
670 | 0 | out[0] = in[0]; |
671 | 0 | out[1] = in[1]; |
672 | 0 | out[2] = in[2]; |
673 | 0 | out[3] = in[3]; |
674 | 0 | in += 4; out += 4; |
675 | 0 | } |
676 | | #endif |
677 | 0 | while (in < end) { |
678 | 0 | Py_UCS4 ch; |
679 | 0 | ch = *in++; |
680 | 0 | if (ch < 0xd800) |
681 | 0 | *out++ = ch; |
682 | 0 | else if (ch < 0xe000) |
683 | | /* reject surrogate characters (U+D800-U+DFFF) */ |
684 | 0 | goto fail; |
685 | | #if STRINGLIB_MAX_CHAR >= 0x10000 |
686 | 0 | else if (ch >= 0x10000) { |
687 | 0 | out[0] = Py_UNICODE_HIGH_SURROGATE(ch); |
688 | 0 | out[1] = Py_UNICODE_LOW_SURROGATE(ch); |
689 | 0 | out += 2; |
690 | 0 | } |
691 | 0 | #endif |
692 | 0 | else |
693 | 0 | *out++ = ch; |
694 | 0 | } |
695 | 0 | } else { |
696 | 0 | #define SWAB2(CH) (((CH) << 8) | ((CH) >> 8)) |
697 | | #if STRINGLIB_MAX_CHAR < 0x10000 |
698 | 0 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); |
699 | 0 | while (in < unrolled_end) { |
700 | | /* check if any character is a surrogate character */ |
701 | 0 | if (((in[0] ^ 0xd800) & |
702 | 0 | (in[1] ^ 0xd800) & |
703 | 0 | (in[2] ^ 0xd800) & |
704 | 0 | (in[3] ^ 0xd800) & 0xf800) == 0) |
705 | 0 | break; |
706 | 0 | out[0] = SWAB2(in[0]); |
707 | 0 | out[1] = SWAB2(in[1]); |
708 | 0 | out[2] = SWAB2(in[2]); |
709 | 0 | out[3] = SWAB2(in[3]); |
710 | 0 | in += 4; out += 4; |
711 | 0 | } |
712 | | #endif |
713 | 0 | while (in < end) { |
714 | 0 | Py_UCS4 ch = *in++; |
715 | 0 | if (ch < 0xd800) |
716 | 0 | *out++ = SWAB2((Py_UCS2)ch); |
717 | 0 | else if (ch < 0xe000) |
718 | | /* reject surrogate characters (U+D800-U+DFFF) */ |
719 | 0 | goto fail; |
720 | | #if STRINGLIB_MAX_CHAR >= 0x10000 |
721 | 0 | else if (ch >= 0x10000) { |
722 | 0 | Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch); |
723 | 0 | Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch); |
724 | 0 | out[0] = SWAB2(ch1); |
725 | 0 | out[1] = SWAB2(ch2); |
726 | 0 | out += 2; |
727 | 0 | } |
728 | 0 | #endif |
729 | 0 | else |
730 | 0 | *out++ = SWAB2((Py_UCS2)ch); |
731 | 0 | } |
732 | 0 | #undef SWAB2 |
733 | 0 | } |
734 | 0 | *outptr = out; |
735 | 0 | return len; |
736 | 0 | fail: |
737 | 0 | *outptr = out; |
738 | 0 | return len - (end - in + 1); |
739 | | #endif |
740 | 0 | } Unexecuted instantiation: unicodeobject.c:ucs1lib_utf16_encode Unexecuted instantiation: unicodeobject.c:ucs2lib_utf16_encode Unexecuted instantiation: unicodeobject.c:ucs4lib_utf16_encode |
741 | | |
742 | | static inline uint32_t |
743 | | STRINGLIB(SWAB4)(STRINGLIB_CHAR ch) |
744 | 0 | { |
745 | 0 | uint32_t word = ch; |
746 | | #if STRINGLIB_SIZEOF_CHAR == 1 |
747 | | /* high bytes are zero */ |
748 | | return (word << 24); |
749 | | #elif STRINGLIB_SIZEOF_CHAR == 2 |
750 | | /* high bytes are zero */ |
751 | | return ((word & 0x00FFu) << 24) | ((word & 0xFF00u) << 8); |
752 | | #else |
753 | | return _Py_bswap32(word); |
754 | | #endif |
755 | 0 | } Unexecuted instantiation: unicodeobject.c:ucs1lib_SWAB4 Unexecuted instantiation: unicodeobject.c:ucs2lib_SWAB4 Unexecuted instantiation: unicodeobject.c:ucs4lib_SWAB4 |
756 | | |
757 | | Py_LOCAL_INLINE(Py_ssize_t) |
758 | | STRINGLIB(utf32_encode)(const STRINGLIB_CHAR *in, |
759 | | Py_ssize_t len, |
760 | | uint32_t **outptr, |
761 | | int native_ordering) |
762 | 0 | { |
763 | 0 | uint32_t *out = *outptr; |
764 | 0 | const STRINGLIB_CHAR *end = in + len; |
765 | 0 | if (native_ordering) { |
766 | 0 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); |
767 | 0 | while (in < unrolled_end) { |
768 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
769 | | /* check if any character is a surrogate character */ |
770 | 0 | if (((in[0] ^ 0xd800) & |
771 | 0 | (in[1] ^ 0xd800) & |
772 | 0 | (in[2] ^ 0xd800) & |
773 | 0 | (in[3] ^ 0xd800) & 0xf800) == 0) |
774 | 0 | break; |
775 | 0 | #endif |
776 | 0 | out[0] = in[0]; |
777 | 0 | out[1] = in[1]; |
778 | 0 | out[2] = in[2]; |
779 | 0 | out[3] = in[3]; |
780 | 0 | in += 4; out += 4; |
781 | 0 | } |
782 | 0 | while (in < end) { |
783 | 0 | Py_UCS4 ch; |
784 | 0 | ch = *in++; |
785 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
786 | 0 | if (Py_UNICODE_IS_SURROGATE(ch)) { |
787 | | /* reject surrogate characters (U+D800-U+DFFF) */ |
788 | 0 | goto fail; |
789 | 0 | } |
790 | 0 | #endif |
791 | 0 | *out++ = ch; |
792 | 0 | } |
793 | 0 | } else { |
794 | 0 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); |
795 | 0 | while (in < unrolled_end) { |
796 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
797 | | /* check if any character is a surrogate character */ |
798 | 0 | if (((in[0] ^ 0xd800) & |
799 | 0 | (in[1] ^ 0xd800) & |
800 | 0 | (in[2] ^ 0xd800) & |
801 | 0 | (in[3] ^ 0xd800) & 0xf800) == 0) |
802 | 0 | break; |
803 | 0 | #endif |
804 | 0 | out[0] = STRINGLIB(SWAB4)(in[0]); |
805 | 0 | out[1] = STRINGLIB(SWAB4)(in[1]); |
806 | 0 | out[2] = STRINGLIB(SWAB4)(in[2]); |
807 | 0 | out[3] = STRINGLIB(SWAB4)(in[3]); |
808 | 0 | in += 4; out += 4; |
809 | 0 | } |
810 | 0 | while (in < end) { |
811 | 0 | Py_UCS4 ch = *in++; |
812 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
813 | 0 | if (Py_UNICODE_IS_SURROGATE(ch)) { |
814 | | /* reject surrogate characters (U+D800-U+DFFF) */ |
815 | 0 | goto fail; |
816 | 0 | } |
817 | 0 | #endif |
818 | 0 | *out++ = STRINGLIB(SWAB4)(ch); |
819 | 0 | } |
820 | 0 | } |
821 | 0 | *outptr = out; |
822 | 0 | return len; |
823 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
824 | 0 | fail: |
825 | 0 | *outptr = out; |
826 | 0 | return len - (end - in + 1); |
827 | | #endif |
828 | 0 | } Unexecuted instantiation: unicodeobject.c:ucs1lib_utf32_encode Unexecuted instantiation: unicodeobject.c:ucs2lib_utf32_encode Unexecuted instantiation: unicodeobject.c:ucs4lib_utf32_encode |
829 | | |
830 | | #endif |