/src/cpython/Objects/stringlib/codecs.h
Line | Count | Source (jump to first uncovered line) |
1 | | /* stringlib: codec implementations */ |
2 | | |
3 | | #if !STRINGLIB_IS_UNICODE |
4 | | # error "codecs.h is specific to Unicode" |
5 | | #endif |
6 | | |
7 | | #include "pycore_bitutils.h" // _Py_bswap32() |
8 | | |
9 | | /* Mask to quickly check whether a C 'size_t' contains a |
10 | | non-ASCII, UTF8-encoded char. */ |
11 | | #if (SIZEOF_SIZE_T == 8) |
12 | 364M | # define ASCII_CHAR_MASK 0x8080808080808080ULL |
13 | | #elif (SIZEOF_SIZE_T == 4) |
14 | | # define ASCII_CHAR_MASK 0x80808080U |
15 | | #else |
16 | | # error C 'size_t' size should be either 4 or 8! |
17 | | #endif |
18 | | |
19 | | /* 10xxxxxx */ |
20 | 123M | #define IS_CONTINUATION_BYTE(ch) ((ch) >= 0x80 && (ch) < 0xC0) |
21 | | |
22 | | Py_LOCAL_INLINE(Py_UCS4) |
23 | | STRINGLIB(utf8_decode)(const char **inptr, const char *end, |
24 | | STRINGLIB_CHAR *dest, |
25 | | Py_ssize_t *outpos) |
26 | 176M | { |
27 | 176M | Py_UCS4 ch; |
28 | 176M | const char *s = *inptr; |
29 | 176M | STRINGLIB_CHAR *p = dest + *outpos; |
30 | | |
31 | 377M | while (s < end) { |
32 | 377M | ch = (unsigned char)*s; |
33 | | |
34 | 377M | if (ch < 0x80) { |
35 | | /* Fast path for runs of ASCII characters. Given that common UTF-8 |
36 | | input will consist of an overwhelming majority of ASCII |
37 | | characters, we try to optimize for this case by checking |
38 | | as many characters as a C 'size_t' can contain. |
39 | | First, check if we can do an aligned read, as most CPUs have |
40 | | a penalty for unaligned reads. |
41 | | */ |
42 | 159M | if (_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) { |
43 | | /* Help register allocation */ |
44 | 20.3M | const char *_s = s; |
45 | 20.3M | STRINGLIB_CHAR *_p = p; |
46 | 364M | while (_s + SIZEOF_SIZE_T <= end) { |
47 | | /* Read a whole size_t at a time (either 4 or 8 bytes), |
48 | | and do a fast unrolled copy if it only contains ASCII |
49 | | characters. */ |
50 | 364M | size_t value = *(const size_t *) _s; |
51 | 364M | if (value & ASCII_CHAR_MASK) |
52 | 20.2M | break; |
53 | 344M | #if PY_LITTLE_ENDIAN |
54 | 344M | _p[0] = (STRINGLIB_CHAR)(value & 0xFFu); |
55 | 344M | _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); |
56 | 344M | _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); |
57 | 344M | _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); |
58 | 344M | # if SIZEOF_SIZE_T == 8 |
59 | 344M | _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); |
60 | 344M | _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); |
61 | 344M | _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); |
62 | 344M | _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); |
63 | 344M | # endif |
64 | | #else |
65 | | # if SIZEOF_SIZE_T == 8 |
66 | | _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); |
67 | | _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); |
68 | | _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); |
69 | | _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); |
70 | | _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); |
71 | | _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); |
72 | | _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); |
73 | | _p[7] = (STRINGLIB_CHAR)(value & 0xFFu); |
74 | | # else |
75 | | _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); |
76 | | _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); |
77 | | _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); |
78 | | _p[3] = (STRINGLIB_CHAR)(value & 0xFFu); |
79 | | # endif |
80 | | #endif |
81 | 344M | _s += SIZEOF_SIZE_T; |
82 | 344M | _p += SIZEOF_SIZE_T; |
83 | 344M | } |
84 | 20.3M | s = _s; |
85 | 20.3M | p = _p; |
86 | 20.3M | if (s == end) |
87 | 10.5k | break; |
88 | 20.3M | ch = (unsigned char)*s; |
89 | 20.3M | } |
90 | 159M | if (ch < 0x80) { |
91 | 158M | s++; |
92 | 158M | *p++ = ch; |
93 | 158M | continue; |
94 | 158M | } |
95 | 159M | } |
96 | | |
97 | 218M | if (ch < 0xE0) { |
98 | | /* \xC2\x80-\xDF\xBF -- 0080-07FF */ |
99 | 94.1M | Py_UCS4 ch2; |
100 | 94.1M | if (ch < 0xC2) { |
101 | | /* invalid sequence |
102 | | \x80-\xBF -- continuation byte |
103 | | \xC0-\xC1 -- fake 0000-007F */ |
104 | 68.2M | goto InvalidStart; |
105 | 68.2M | } |
106 | 25.9M | if (end - s < 2) { |
107 | | /* unexpected end of data: the caller will decide whether |
108 | | it's an error or not */ |
109 | 11.2k | break; |
110 | 11.2k | } |
111 | 25.9M | ch2 = (unsigned char)s[1]; |
112 | 25.9M | if (!IS_CONTINUATION_BYTE(ch2)) |
113 | | /* invalid continuation byte */ |
114 | 20.9M | goto InvalidContinuation1; |
115 | 4.99M | ch = (ch << 6) + ch2 - |
116 | 4.99M | ((0xC0 << 6) + 0x80); |
117 | 4.99M | assert ((ch > 0x007F) && (ch <= 0x07FF)); |
118 | 4.99M | s += 2; |
119 | 4.99M | if (STRINGLIB_MAX_CHAR <= 0x007F || |
120 | 4.99M | (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR)) |
121 | | /* Out-of-range */ |
122 | 83.5k | goto Return; |
123 | 4.91M | *p++ = ch; |
124 | 4.91M | continue; |
125 | 4.99M | } |
126 | | |
127 | 124M | if (ch < 0xF0) { |
128 | | /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */ |
129 | 51.7M | Py_UCS4 ch2, ch3; |
130 | 51.7M | if (end - s < 3) { |
131 | | /* unexpected end of data: the caller will decide whether |
132 | | it's an error or not */ |
133 | 11.8k | if (end - s < 2) |
134 | 4.18k | break; |
135 | 7.65k | ch2 = (unsigned char)s[1]; |
136 | 7.65k | if (!IS_CONTINUATION_BYTE(ch2) || |
137 | 7.65k | (ch2 < 0xA0 ? ch == 0xE0 : ch == 0xED)) |
138 | | /* for clarification see comments below */ |
139 | 5.32k | goto InvalidContinuation1; |
140 | 2.32k | break; |
141 | 7.65k | } |
142 | 51.7M | ch2 = (unsigned char)s[1]; |
143 | 51.7M | ch3 = (unsigned char)s[2]; |
144 | 51.7M | if (!IS_CONTINUATION_BYTE(ch2)) { |
145 | | /* invalid continuation byte */ |
146 | 13.6M | goto InvalidContinuation1; |
147 | 13.6M | } |
148 | 38.1M | if (ch == 0xE0) { |
149 | 120k | if (ch2 < 0xA0) |
150 | | /* invalid sequence |
151 | | \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */ |
152 | 51.4k | goto InvalidContinuation1; |
153 | 37.9M | } else if (ch == 0xED && ch2 >= 0xA0) { |
154 | | /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF |
155 | | will result in surrogates in range D800-DFFF. Surrogates are |
156 | | not valid UTF-8 so they are rejected. |
157 | | See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf |
158 | | (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ |
159 | 107k | goto InvalidContinuation1; |
160 | 107k | } |
161 | 37.9M | if (!IS_CONTINUATION_BYTE(ch3)) { |
162 | | /* invalid continuation byte */ |
163 | 1.05M | goto InvalidContinuation2; |
164 | 1.05M | } |
165 | 36.9M | ch = (ch << 12) + (ch2 << 6) + ch3 - |
166 | 36.9M | ((0xE0 << 12) + (0x80 << 6) + 0x80); |
167 | 36.9M | assert ((ch > 0x07FF) && (ch <= 0xFFFF)); |
168 | 36.9M | s += 3; |
169 | 36.9M | if (STRINGLIB_MAX_CHAR <= 0x07FF || |
170 | 36.9M | (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR)) |
171 | | /* Out-of-range */ |
172 | 159k | goto Return; |
173 | 36.7M | *p++ = ch; |
174 | 36.7M | continue; |
175 | 36.9M | } |
176 | | |
177 | 72.7M | if (ch < 0xF5) { |
178 | | /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */ |
179 | 6.07M | Py_UCS4 ch2, ch3, ch4; |
180 | 6.07M | if (end - s < 4) { |
181 | | /* unexpected end of data: the caller will decide whether |
182 | | it's an error or not */ |
183 | 20.0k | if (end - s < 2) |
184 | 5.70k | break; |
185 | 14.3k | ch2 = (unsigned char)s[1]; |
186 | 14.3k | if (!IS_CONTINUATION_BYTE(ch2) || |
187 | 14.3k | (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4)) |
188 | | /* for clarification see comments below */ |
189 | 9.82k | goto InvalidContinuation1; |
190 | 4.52k | if (end - s < 3) |
191 | 1.50k | break; |
192 | 3.01k | ch3 = (unsigned char)s[2]; |
193 | 3.01k | if (!IS_CONTINUATION_BYTE(ch3)) |
194 | 1.93k | goto InvalidContinuation2; |
195 | 1.08k | break; |
196 | 3.01k | } |
197 | 6.05M | ch2 = (unsigned char)s[1]; |
198 | 6.05M | ch3 = (unsigned char)s[2]; |
199 | 6.05M | ch4 = (unsigned char)s[3]; |
200 | 6.05M | if (!IS_CONTINUATION_BYTE(ch2)) { |
201 | | /* invalid continuation byte */ |
202 | 4.78M | goto InvalidContinuation1; |
203 | 4.78M | } |
204 | 1.26M | if (ch == 0xF0) { |
205 | 586k | if (ch2 < 0x90) |
206 | | /* invalid sequence |
207 | | \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */ |
208 | 41.6k | goto InvalidContinuation1; |
209 | 682k | } else if (ch == 0xF4 && ch2 >= 0x90) { |
210 | | /* invalid sequence |
211 | | \xF4\x90\x80\x80- -- 110000- overflow */ |
212 | 72.8k | goto InvalidContinuation1; |
213 | 72.8k | } |
214 | 1.15M | if (!IS_CONTINUATION_BYTE(ch3)) { |
215 | | /* invalid continuation byte */ |
216 | 374k | goto InvalidContinuation2; |
217 | 374k | } |
218 | 780k | if (!IS_CONTINUATION_BYTE(ch4)) { |
219 | | /* invalid continuation byte */ |
220 | 123k | goto InvalidContinuation3; |
221 | 123k | } |
222 | 656k | ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 - |
223 | 656k | ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80); |
224 | 656k | assert ((ch > 0xFFFF) && (ch <= 0x10FFFF)); |
225 | 656k | s += 4; |
226 | 656k | if (STRINGLIB_MAX_CHAR <= 0xFFFF || |
227 | 656k | (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR)) |
228 | | /* Out-of-range */ |
229 | 32.1k | goto Return; |
230 | 624k | *p++ = ch; |
231 | 624k | continue; |
232 | 656k | } |
233 | 66.6M | goto InvalidStart; |
234 | 72.7M | } |
235 | 326k | ch = 0; |
236 | 176M | Return: |
237 | 176M | *inptr = s; |
238 | 176M | *outpos = p - dest; |
239 | 176M | return ch; |
240 | 134M | InvalidStart: |
241 | 134M | ch = 1; |
242 | 134M | goto Return; |
243 | 39.6M | InvalidContinuation1: |
244 | 39.6M | ch = 2; |
245 | 39.6M | goto Return; |
246 | 1.43M | InvalidContinuation2: |
247 | 1.43M | ch = 3; |
248 | 1.43M | goto Return; |
249 | 123k | InvalidContinuation3: |
250 | 123k | ch = 4; |
251 | 123k | goto Return; |
252 | 326k | } unicodeobject.c:asciilib_utf8_decode Line | Count | Source | 26 | 286k | { | 27 | 286k | Py_UCS4 ch; | 28 | 286k | const char *s = *inptr; | 29 | 286k | STRINGLIB_CHAR *p = dest + *outpos; | 30 | | | 31 | 286k | while (s < end) { | 32 | 286k | ch = (unsigned char)*s; | 33 | | | 34 | 286k | if (ch < 0x80) { | 35 | | /* Fast path for runs of ASCII characters. Given that common UTF-8 | 36 | | input will consist of an overwhelming majority of ASCII | 37 | | characters, we try to optimize for this case by checking | 38 | | as many characters as a C 'size_t' can contain. | 39 | | First, check if we can do an aligned read, as most CPUs have | 40 | | a penalty for unaligned reads. | 41 | | */ | 42 | 0 | if (_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) { | 43 | | /* Help register allocation */ | 44 | 0 | const char *_s = s; | 45 | 0 | STRINGLIB_CHAR *_p = p; | 46 | 0 | while (_s + SIZEOF_SIZE_T <= end) { | 47 | | /* Read a whole size_t at a time (either 4 or 8 bytes), | 48 | | and do a fast unrolled copy if it only contains ASCII | 49 | | characters. 
*/ | 50 | 0 | size_t value = *(const size_t *) _s; | 51 | 0 | if (value & ASCII_CHAR_MASK) | 52 | 0 | break; | 53 | 0 | #if PY_LITTLE_ENDIAN | 54 | 0 | _p[0] = (STRINGLIB_CHAR)(value & 0xFFu); | 55 | 0 | _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 56 | 0 | _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 57 | 0 | _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 58 | 0 | # if SIZEOF_SIZE_T == 8 | 59 | 0 | _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 60 | 0 | _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 61 | 0 | _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 62 | 0 | _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 63 | 0 | # endif | 64 | | #else | 65 | | # if SIZEOF_SIZE_T == 8 | 66 | | _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 67 | | _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 68 | | _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 69 | | _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 70 | | _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 71 | | _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 72 | | _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 73 | | _p[7] = (STRINGLIB_CHAR)(value & 0xFFu); | 74 | | # else | 75 | | _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 76 | | _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 77 | | _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 78 | | _p[3] = (STRINGLIB_CHAR)(value & 0xFFu); | 79 | | # endif | 80 | | #endif | 81 | 0 | _s += SIZEOF_SIZE_T; | 82 | 0 | _p += SIZEOF_SIZE_T; | 83 | 0 | } | 84 | 0 | s = _s; | 85 | 0 | p = _p; | 86 | 0 | if (s == end) | 87 | 0 | break; | 88 | 0 | ch = (unsigned char)*s; | 89 | 0 | } | 90 | 0 | if (ch < 0x80) { | 91 | 0 | s++; | 92 | 0 | *p++ = ch; | 93 | 0 | continue; | 94 | 0 | } | 95 | 0 | } | 96 | | | 97 | 286k | if (ch < 0xE0) { | 98 | | /* \xC2\x80-\xDF\xBF -- 0080-07FF */ | 99 | 103k | Py_UCS4 ch2; | 100 | 103k | if (ch < 0xC2) { | 101 | | /* invalid sequence | 102 | | \x80-\xBF -- continuation byte | 103 | | \xC0-\xC1 -- 
fake 0000-007F */ | 104 | 15.0k | goto InvalidStart; | 105 | 15.0k | } | 106 | 88.1k | if (end - s < 2) { | 107 | | /* unexpected end of data: the caller will decide whether | 108 | | it's an error or not */ | 109 | 1.28k | break; | 110 | 1.28k | } | 111 | 86.9k | ch2 = (unsigned char)s[1]; | 112 | 86.9k | if (!IS_CONTINUATION_BYTE(ch2)) | 113 | | /* invalid continuation byte */ | 114 | 5.00k | goto InvalidContinuation1; | 115 | 81.8k | ch = (ch << 6) + ch2 - | 116 | 81.8k | ((0xC0 << 6) + 0x80); | 117 | 81.8k | assert ((ch > 0x007F) && (ch <= 0x07FF)); | 118 | 81.8k | s += 2; | 119 | 81.8k | if (STRINGLIB_MAX_CHAR <= 0x007F || | 120 | 81.8k | (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR)) | 121 | | /* Out-of-range */ | 122 | 81.8k | goto Return; | 123 | 0 | *p++ = ch; | 124 | 0 | continue; | 125 | 81.8k | } | 126 | | | 127 | 183k | if (ch < 0xF0) { | 128 | | /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */ | 129 | 150k | Py_UCS4 ch2, ch3; | 130 | 150k | if (end - s < 3) { | 131 | | /* unexpected end of data: the caller will decide whether | 132 | | it's an error or not */ | 133 | 2.62k | if (end - s < 2) | 134 | 1.09k | break; | 135 | 1.53k | ch2 = (unsigned char)s[1]; | 136 | 1.53k | if (!IS_CONTINUATION_BYTE(ch2) || | 137 | 1.53k | (ch2 < 0xA0 ? 
ch == 0xE0 : ch == 0xED)) | 138 | | /* for clarification see comments below */ | 139 | 1.00k | goto InvalidContinuation1; | 140 | 523 | break; | 141 | 1.53k | } | 142 | 147k | ch2 = (unsigned char)s[1]; | 143 | 147k | ch3 = (unsigned char)s[2]; | 144 | 147k | if (!IS_CONTINUATION_BYTE(ch2)) { | 145 | | /* invalid continuation byte */ | 146 | 3.39k | goto InvalidContinuation1; | 147 | 3.39k | } | 148 | 144k | if (ch == 0xE0) { | 149 | 1.07k | if (ch2 < 0xA0) | 150 | | /* invalid sequence | 151 | | \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */ | 152 | 247 | goto InvalidContinuation1; | 153 | 143k | } else if (ch == 0xED && ch2 >= 0xA0) { | 154 | | /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF | 155 | | will result in surrogates in range D800-DFFF. Surrogates are | 156 | | not valid UTF-8 so they are rejected. | 157 | | See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf | 158 | | (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ | 159 | 227 | goto InvalidContinuation1; | 160 | 227 | } | 161 | 144k | if (!IS_CONTINUATION_BYTE(ch3)) { | 162 | | /* invalid continuation byte */ | 163 | 2.02k | goto InvalidContinuation2; | 164 | 2.02k | } | 165 | 141k | ch = (ch << 12) + (ch2 << 6) + ch3 - | 166 | 141k | ((0xE0 << 12) + (0x80 << 6) + 0x80); | 167 | 141k | assert ((ch > 0x07FF) && (ch <= 0xFFFF)); | 168 | 141k | s += 3; | 169 | 141k | if (STRINGLIB_MAX_CHAR <= 0x07FF || | 170 | 141k | (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR)) | 171 | | /* Out-of-range */ | 172 | 141k | goto Return; | 173 | 0 | *p++ = ch; | 174 | 0 | continue; | 175 | 141k | } | 176 | | | 177 | 32.7k | if (ch < 0xF5) { | 178 | | /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */ | 179 | 23.0k | Py_UCS4 ch2, ch3, ch4; | 180 | 23.0k | if (end - s < 4) { | 181 | | /* unexpected end of data: the caller will decide whether | 182 | | it's an error or not */ | 183 | 7.98k | if (end - s < 2) | 184 | 2.81k | break; | 185 | 5.17k | ch2 = (unsigned char)s[1]; | 
186 | 5.17k | if (!IS_CONTINUATION_BYTE(ch2) || | 187 | 5.17k | (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4)) | 188 | | /* for clarification see comments below */ | 189 | 4.14k | goto InvalidContinuation1; | 190 | 1.03k | if (end - s < 3) | 191 | 427 | break; | 192 | 605 | ch3 = (unsigned char)s[2]; | 193 | 605 | if (!IS_CONTINUATION_BYTE(ch3)) | 194 | 475 | goto InvalidContinuation2; | 195 | 130 | break; | 196 | 605 | } | 197 | 15.0k | ch2 = (unsigned char)s[1]; | 198 | 15.0k | ch3 = (unsigned char)s[2]; | 199 | 15.0k | ch4 = (unsigned char)s[3]; | 200 | 15.0k | if (!IS_CONTINUATION_BYTE(ch2)) { | 201 | | /* invalid continuation byte */ | 202 | 2.79k | goto InvalidContinuation1; | 203 | 2.79k | } | 204 | 12.2k | if (ch == 0xF0) { | 205 | 2.57k | if (ch2 < 0x90) | 206 | | /* invalid sequence | 207 | | \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */ | 208 | 100 | goto InvalidContinuation1; | 209 | 9.72k | } else if (ch == 0xF4 && ch2 >= 0x90) { | 210 | | /* invalid sequence | 211 | | \xF4\x90\x80\x80- -- 110000- overflow */ | 212 | 466 | goto InvalidContinuation1; | 213 | 466 | } | 214 | 11.7k | if (!IS_CONTINUATION_BYTE(ch3)) { | 215 | | /* invalid continuation byte */ | 216 | 1.28k | goto InvalidContinuation2; | 217 | 1.28k | } | 218 | 10.4k | if (!IS_CONTINUATION_BYTE(ch4)) { | 219 | | /* invalid continuation byte */ | 220 | 489 | goto InvalidContinuation3; | 221 | 489 | } | 222 | 9.95k | ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 - | 223 | 9.95k | ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80); | 224 | 9.95k | assert ((ch > 0xFFFF) && (ch <= 0x10FFFF)); | 225 | 9.95k | s += 4; | 226 | 9.95k | if (STRINGLIB_MAX_CHAR <= 0xFFFF || | 227 | 9.95k | (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR)) | 228 | | /* Out-of-range */ | 229 | 9.95k | goto Return; | 230 | 0 | *p++ = ch; | 231 | 0 | continue; | 232 | 9.95k | } | 233 | 9.64k | goto InvalidStart; | 234 | 32.7k | } | 235 | 6.27k | ch = 0; | 236 | 286k | Return: | 237 | 286k | *inptr = s; | 238 
| 286k | *outpos = p - dest; | 239 | 286k | return ch; | 240 | 24.6k | InvalidStart: | 241 | 24.6k | ch = 1; | 242 | 24.6k | goto Return; | 243 | 17.3k | InvalidContinuation1: | 244 | 17.3k | ch = 2; | 245 | 17.3k | goto Return; | 246 | 3.78k | InvalidContinuation2: | 247 | 3.78k | ch = 3; | 248 | 3.78k | goto Return; | 249 | 489 | InvalidContinuation3: | 250 | 489 | ch = 4; | 251 | 489 | goto Return; | 252 | 6.27k | } |
unicodeobject.c:ucs1lib_utf8_decode Line | Count | Source | 26 | 94.9k | { | 27 | 94.9k | Py_UCS4 ch; | 28 | 94.9k | const char *s = *inptr; | 29 | 94.9k | STRINGLIB_CHAR *p = dest + *outpos; | 30 | | | 31 | 1.25M | while (s < end) { | 32 | 1.22M | ch = (unsigned char)*s; | 33 | | | 34 | 1.22M | if (ch < 0x80) { | 35 | | /* Fast path for runs of ASCII characters. Given that common UTF-8 | 36 | | input will consist of an overwhelming majority of ASCII | 37 | | characters, we try to optimize for this case by checking | 38 | | as many characters as a C 'size_t' can contain. | 39 | | First, check if we can do an aligned read, as most CPUs have | 40 | | a penalty for unaligned reads. | 41 | | */ | 42 | 836k | if (_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) { | 43 | | /* Help register allocation */ | 44 | 117k | const char *_s = s; | 45 | 117k | STRINGLIB_CHAR *_p = p; | 46 | 10.0M | while (_s + SIZEOF_SIZE_T <= end) { | 47 | | /* Read a whole size_t at a time (either 4 or 8 bytes), | 48 | | and do a fast unrolled copy if it only contains ASCII | 49 | | characters. 
*/ | 50 | 9.99M | size_t value = *(const size_t *) _s; | 51 | 9.99M | if (value & ASCII_CHAR_MASK) | 52 | 95.3k | break; | 53 | 9.89M | #if PY_LITTLE_ENDIAN | 54 | 9.89M | _p[0] = (STRINGLIB_CHAR)(value & 0xFFu); | 55 | 9.89M | _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 56 | 9.89M | _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 57 | 9.89M | _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 58 | 9.89M | # if SIZEOF_SIZE_T == 8 | 59 | 9.89M | _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 60 | 9.89M | _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 61 | 9.89M | _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 62 | 9.89M | _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 63 | 9.89M | # endif | 64 | | #else | 65 | | # if SIZEOF_SIZE_T == 8 | 66 | | _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 67 | | _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 68 | | _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 69 | | _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 70 | | _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 71 | | _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 72 | | _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 73 | | _p[7] = (STRINGLIB_CHAR)(value & 0xFFu); | 74 | | # else | 75 | | _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 76 | | _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 77 | | _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 78 | | _p[3] = (STRINGLIB_CHAR)(value & 0xFFu); | 79 | | # endif | 80 | | #endif | 81 | 9.89M | _s += SIZEOF_SIZE_T; | 82 | 9.89M | _p += SIZEOF_SIZE_T; | 83 | 9.89M | } | 84 | 117k | s = _s; | 85 | 117k | p = _p; | 86 | 117k | if (s == end) | 87 | 2.71k | break; | 88 | 114k | ch = (unsigned char)*s; | 89 | 114k | } | 90 | 833k | if (ch < 0x80) { | 91 | 809k | s++; | 92 | 809k | *p++ = ch; | 93 | 809k | continue; | 94 | 809k | } | 95 | 833k | } | 96 | | | 97 | 407k | if (ch < 0xE0) { | 98 | | /* \xC2\x80-\xDF\xBF -- 0080-07FF */ | 99 | 377k | Py_UCS4 ch2; | 100 | 377k | if (ch 
< 0xC2) { | 101 | | /* invalid sequence | 102 | | \x80-\xBF -- continuation byte | 103 | | \xC0-\xC1 -- fake 0000-007F */ | 104 | 2.95k | goto InvalidStart; | 105 | 2.95k | } | 106 | 374k | if (end - s < 2) { | 107 | | /* unexpected end of data: the caller will decide whether | 108 | | it's an error or not */ | 109 | 818 | break; | 110 | 818 | } | 111 | 373k | ch2 = (unsigned char)s[1]; | 112 | 373k | if (!IS_CONTINUATION_BYTE(ch2)) | 113 | | /* invalid continuation byte */ | 114 | 23.2k | goto InvalidContinuation1; | 115 | 350k | ch = (ch << 6) + ch2 - | 116 | 350k | ((0xC0 << 6) + 0x80); | 117 | 350k | assert ((ch > 0x007F) && (ch <= 0x07FF)); | 118 | 350k | s += 2; | 119 | 350k | if (STRINGLIB_MAX_CHAR <= 0x007F || | 120 | 350k | (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR)) | 121 | | /* Out-of-range */ | 122 | 1.63k | goto Return; | 123 | 348k | *p++ = ch; | 124 | 348k | continue; | 125 | 350k | } | 126 | | | 127 | 30.2k | if (ch < 0xF0) { | 128 | | /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */ | 129 | 21.0k | Py_UCS4 ch2, ch3; | 130 | 21.0k | if (end - s < 3) { | 131 | | /* unexpected end of data: the caller will decide whether | 132 | | it's an error or not */ | 133 | 1.65k | if (end - s < 2) | 134 | 397 | break; | 135 | 1.25k | ch2 = (unsigned char)s[1]; | 136 | 1.25k | if (!IS_CONTINUATION_BYTE(ch2) || | 137 | 1.25k | (ch2 < 0xA0 ? 
ch == 0xE0 : ch == 0xED)) | 138 | | /* for clarification see comments below */ | 139 | 883 | goto InvalidContinuation1; | 140 | 371 | break; | 141 | 1.25k | } | 142 | 19.4k | ch2 = (unsigned char)s[1]; | 143 | 19.4k | ch3 = (unsigned char)s[2]; | 144 | 19.4k | if (!IS_CONTINUATION_BYTE(ch2)) { | 145 | | /* invalid continuation byte */ | 146 | 1.11k | goto InvalidContinuation1; | 147 | 1.11k | } | 148 | 18.2k | if (ch == 0xE0) { | 149 | 795 | if (ch2 < 0xA0) | 150 | | /* invalid sequence | 151 | | \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */ | 152 | 133 | goto InvalidContinuation1; | 153 | 17.4k | } else if (ch == 0xED && ch2 >= 0xA0) { | 154 | | /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF | 155 | | will result in surrogates in range D800-DFFF. Surrogates are | 156 | | not valid UTF-8 so they are rejected. | 157 | | See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf | 158 | | (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ | 159 | 446 | goto InvalidContinuation1; | 160 | 446 | } | 161 | 17.7k | if (!IS_CONTINUATION_BYTE(ch3)) { | 162 | | /* invalid continuation byte */ | 163 | 411 | goto InvalidContinuation2; | 164 | 411 | } | 165 | 17.3k | ch = (ch << 12) + (ch2 << 6) + ch3 - | 166 | 17.3k | ((0xE0 << 12) + (0x80 << 6) + 0x80); | 167 | 17.3k | assert ((ch > 0x07FF) && (ch <= 0xFFFF)); | 168 | 17.3k | s += 3; | 169 | 17.3k | if (STRINGLIB_MAX_CHAR <= 0x07FF || | 170 | 17.3k | (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR)) | 171 | | /* Out-of-range */ | 172 | 17.3k | goto Return; | 173 | 0 | *p++ = ch; | 174 | 0 | continue; | 175 | 17.3k | } | 176 | | | 177 | 9.22k | if (ch < 0xF5) { | 178 | | /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */ | 179 | 8.44k | Py_UCS4 ch2, ch3, ch4; | 180 | 8.44k | if (end - s < 4) { | 181 | | /* unexpected end of data: the caller will decide whether | 182 | | it's an error or not */ | 183 | 1.71k | if (end - s < 2) | 184 | 306 | break; | 185 | 1.40k | ch2 = (unsigned 
char)s[1]; | 186 | 1.40k | if (!IS_CONTINUATION_BYTE(ch2) || | 187 | 1.40k | (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4)) | 188 | | /* for clarification see comments below */ | 189 | 902 | goto InvalidContinuation1; | 190 | 505 | if (end - s < 3) | 191 | 121 | break; | 192 | 384 | ch3 = (unsigned char)s[2]; | 193 | 384 | if (!IS_CONTINUATION_BYTE(ch3)) | 194 | 284 | goto InvalidContinuation2; | 195 | 100 | break; | 196 | 384 | } | 197 | 6.72k | ch2 = (unsigned char)s[1]; | 198 | 6.72k | ch3 = (unsigned char)s[2]; | 199 | 6.72k | ch4 = (unsigned char)s[3]; | 200 | 6.72k | if (!IS_CONTINUATION_BYTE(ch2)) { | 201 | | /* invalid continuation byte */ | 202 | 670 | goto InvalidContinuation1; | 203 | 670 | } | 204 | 6.05k | if (ch == 0xF0) { | 205 | 1.07k | if (ch2 < 0x90) | 206 | | /* invalid sequence | 207 | | \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */ | 208 | 124 | goto InvalidContinuation1; | 209 | 4.98k | } else if (ch == 0xF4 && ch2 >= 0x90) { | 210 | | /* invalid sequence | 211 | | \xF4\x90\x80\x80- -- 110000- overflow */ | 212 | 323 | goto InvalidContinuation1; | 213 | 323 | } | 214 | 5.61k | if (!IS_CONTINUATION_BYTE(ch3)) { | 215 | | /* invalid continuation byte */ | 216 | 2.16k | goto InvalidContinuation2; | 217 | 2.16k | } | 218 | 3.44k | if (!IS_CONTINUATION_BYTE(ch4)) { | 219 | | /* invalid continuation byte */ | 220 | 394 | goto InvalidContinuation3; | 221 | 394 | } | 222 | 3.05k | ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 - | 223 | 3.05k | ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80); | 224 | 3.05k | assert ((ch > 0xFFFF) && (ch <= 0x10FFFF)); | 225 | 3.05k | s += 4; | 226 | 3.05k | if (STRINGLIB_MAX_CHAR <= 0xFFFF || | 227 | 3.05k | (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR)) | 228 | | /* Out-of-range */ | 229 | 3.05k | goto Return; | 230 | 0 | *p++ = ch; | 231 | 0 | continue; | 232 | 3.05k | } | 233 | 780 | goto InvalidStart; | 234 | 9.22k | } | 235 | 38.1k | ch = 0; | 236 | 94.9k | Return: | 237 | 94.9k | *inptr = s; 
| 238 | 94.9k | *outpos = p - dest; | 239 | 94.9k | return ch; | 240 | 3.73k | InvalidStart: | 241 | 3.73k | ch = 1; | 242 | 3.73k | goto Return; | 243 | 27.8k | InvalidContinuation1: | 244 | 27.8k | ch = 2; | 245 | 27.8k | goto Return; | 246 | 2.86k | InvalidContinuation2: | 247 | 2.86k | ch = 3; | 248 | 2.86k | goto Return; | 249 | 394 | InvalidContinuation3: | 250 | 394 | ch = 4; | 251 | 394 | goto Return; | 252 | 38.1k | } |
unicodeobject.c:ucs2lib_utf8_decode Line | Count | Source | 26 | 83.7M | { | 27 | 83.7M | Py_UCS4 ch; | 28 | 83.7M | const char *s = *inptr; | 29 | 83.7M | STRINGLIB_CHAR *p = dest + *outpos; | 30 | | | 31 | 172M | while (s < end) { | 32 | 172M | ch = (unsigned char)*s; | 33 | | | 34 | 172M | if (ch < 0x80) { | 35 | | /* Fast path for runs of ASCII characters. Given that common UTF-8 | 36 | | input will consist of an overwhelming majority of ASCII | 37 | | characters, we try to optimize for this case by checking | 38 | | as many characters as a C 'size_t' can contain. | 39 | | First, check if we can do an aligned read, as most CPUs have | 40 | | a penalty for unaligned reads. | 41 | | */ | 42 | 68.0M | if (_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) { | 43 | | /* Help register allocation */ | 44 | 8.80M | const char *_s = s; | 45 | 8.80M | STRINGLIB_CHAR *_p = p; | 46 | 196M | while (_s + SIZEOF_SIZE_T <= end) { | 47 | | /* Read a whole size_t at a time (either 4 or 8 bytes), | 48 | | and do a fast unrolled copy if it only contains ASCII | 49 | | characters. 
*/ | 50 | 195M | size_t value = *(const size_t *) _s; | 51 | 195M | if (value & ASCII_CHAR_MASK) | 52 | 8.72M | break; | 53 | 187M | #if PY_LITTLE_ENDIAN | 54 | 187M | _p[0] = (STRINGLIB_CHAR)(value & 0xFFu); | 55 | 187M | _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 56 | 187M | _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 57 | 187M | _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 58 | 187M | # if SIZEOF_SIZE_T == 8 | 59 | 187M | _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 60 | 187M | _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 61 | 187M | _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 62 | 187M | _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 63 | 187M | # endif | 64 | | #else | 65 | | # if SIZEOF_SIZE_T == 8 | 66 | | _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 67 | | _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 68 | | _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 69 | | _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 70 | | _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 71 | | _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 72 | | _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 73 | | _p[7] = (STRINGLIB_CHAR)(value & 0xFFu); | 74 | | # else | 75 | | _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 76 | | _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 77 | | _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 78 | | _p[3] = (STRINGLIB_CHAR)(value & 0xFFu); | 79 | | # endif | 80 | | #endif | 81 | 187M | _s += SIZEOF_SIZE_T; | 82 | 187M | _p += SIZEOF_SIZE_T; | 83 | 187M | } | 84 | 8.80M | s = _s; | 85 | 8.80M | p = _p; | 86 | 8.80M | if (s == end) | 87 | 5.62k | break; | 88 | 8.80M | ch = (unsigned char)*s; | 89 | 8.80M | } | 90 | 68.0M | if (ch < 0x80) { | 91 | 67.6M | s++; | 92 | 67.6M | *p++ = ch; | 93 | 67.6M | continue; | 94 | 67.6M | } | 95 | 68.0M | } | 96 | | | 97 | 104M | if (ch < 0xE0) { | 98 | | /* \xC2\x80-\xDF\xBF -- 0080-07FF */ | 99 | 42.0M | Py_UCS4 ch2; | 100 | 42.0M | if (ch < 
0xC2) { | 101 | | /* invalid sequence | 102 | | \x80-\xBF -- continuation byte | 103 | | \xC0-\xC1 -- fake 0000-007F */ | 104 | 31.4M | goto InvalidStart; | 105 | 31.4M | } | 106 | 10.5M | if (end - s < 2) { | 107 | | /* unexpected end of data: the caller will decide whether | 108 | | it's an error or not */ | 109 | 7.70k | break; | 110 | 7.70k | } | 111 | 10.5M | ch2 = (unsigned char)s[1]; | 112 | 10.5M | if (!IS_CONTINUATION_BYTE(ch2)) | 113 | | /* invalid continuation byte */ | 114 | 9.45M | goto InvalidContinuation1; | 115 | 1.11M | ch = (ch << 6) + ch2 - | 116 | 1.11M | ((0xC0 << 6) + 0x80); | 117 | 1.11M | assert ((ch > 0x007F) && (ch <= 0x07FF)); | 118 | 1.11M | s += 2; | 119 | 1.11M | if (STRINGLIB_MAX_CHAR <= 0x007F || | 120 | 1.11M | (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR)) | 121 | | /* Out-of-range */ | 122 | 0 | goto Return; | 123 | 1.11M | *p++ = ch; | 124 | 1.11M | continue; | 125 | 1.11M | } | 126 | | | 127 | 62.6M | if (ch < 0xF0) { | 128 | | /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */ | 129 | 26.7M | Py_UCS4 ch2, ch3; | 130 | 26.7M | if (end - s < 3) { | 131 | | /* unexpected end of data: the caller will decide whether | 132 | | it's an error or not */ | 133 | 4.25k | if (end - s < 2) | 134 | 1.93k | break; | 135 | 2.31k | ch2 = (unsigned char)s[1]; | 136 | 2.31k | if (!IS_CONTINUATION_BYTE(ch2) || | 137 | 2.31k | (ch2 < 0xA0 ? 
ch == 0xE0 : ch == 0xED)) | 138 | | /* for clarification see comments below */ | 139 | 1.77k | goto InvalidContinuation1; | 140 | 545 | break; | 141 | 2.31k | } | 142 | 26.7M | ch2 = (unsigned char)s[1]; | 143 | 26.7M | ch3 = (unsigned char)s[2]; | 144 | 26.7M | if (!IS_CONTINUATION_BYTE(ch2)) { | 145 | | /* invalid continuation byte */ | 146 | 6.61M | goto InvalidContinuation1; | 147 | 6.61M | } | 148 | 20.0M | if (ch == 0xE0) { | 149 | 27.1k | if (ch2 < 0xA0) | 150 | | /* invalid sequence | 151 | | \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */ | 152 | 8.72k | goto InvalidContinuation1; | 153 | 20.0M | } else if (ch == 0xED && ch2 >= 0xA0) { | 154 | | /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF | 155 | | will result in surrogates in range D800-DFFF. Surrogates are | 156 | | not valid UTF-8 so they are rejected. | 157 | | See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf | 158 | | (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ | 159 | 8.11k | goto InvalidContinuation1; | 160 | 8.11k | } | 161 | 20.0M | if (!IS_CONTINUATION_BYTE(ch3)) { | 162 | | /* invalid continuation byte */ | 163 | 132k | goto InvalidContinuation2; | 164 | 132k | } | 165 | 19.9M | ch = (ch << 12) + (ch2 << 6) + ch3 - | 166 | 19.9M | ((0xE0 << 12) + (0x80 << 6) + 0x80); | 167 | 19.9M | assert ((ch > 0x07FF) && (ch <= 0xFFFF)); | 168 | 19.9M | s += 3; | 169 | 19.9M | if (STRINGLIB_MAX_CHAR <= 0x07FF || | 170 | 19.9M | (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR)) | 171 | | /* Out-of-range */ | 172 | 0 | goto Return; | 173 | 19.9M | *p++ = ch; | 174 | 19.9M | continue; | 175 | 19.9M | } | 176 | | | 177 | 35.8M | if (ch < 0xF5) { | 178 | | /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */ | 179 | 2.17M | Py_UCS4 ch2, ch3, ch4; | 180 | 2.17M | if (end - s < 4) { | 181 | | /* unexpected end of data: the caller will decide whether | 182 | | it's an error or not */ | 183 | 6.77k | if (end - s < 2) | 184 | 1.89k | break; | 185 | 4.88k | ch2 = 
(unsigned char)s[1]; | 186 | 4.88k | if (!IS_CONTINUATION_BYTE(ch2) || | 187 | 4.88k | (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4)) | 188 | | /* for clarification see comments below */ | 189 | 3.05k | goto InvalidContinuation1; | 190 | 1.83k | if (end - s < 3) | 191 | 589 | break; | 192 | 1.24k | ch3 = (unsigned char)s[2]; | 193 | 1.24k | if (!IS_CONTINUATION_BYTE(ch3)) | 194 | 575 | goto InvalidContinuation2; | 195 | 667 | break; | 196 | 1.24k | } | 197 | 2.16M | ch2 = (unsigned char)s[1]; | 198 | 2.16M | ch3 = (unsigned char)s[2]; | 199 | 2.16M | ch4 = (unsigned char)s[3]; | 200 | 2.16M | if (!IS_CONTINUATION_BYTE(ch2)) { | 201 | | /* invalid continuation byte */ | 202 | 2.06M | goto InvalidContinuation1; | 203 | 2.06M | } | 204 | 103k | if (ch == 0xF0) { | 205 | 23.7k | if (ch2 < 0x90) | 206 | | /* invalid sequence | 207 | | \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */ | 208 | 6.27k | goto InvalidContinuation1; | 209 | 80.1k | } else if (ch == 0xF4 && ch2 >= 0x90) { | 210 | | /* invalid sequence | 211 | | \xF4\x90\x80\x80- -- 110000- overflow */ | 212 | 8.50k | goto InvalidContinuation1; | 213 | 8.50k | } | 214 | 89.1k | if (!IS_CONTINUATION_BYTE(ch3)) { | 215 | | /* invalid continuation byte */ | 216 | 55.2k | goto InvalidContinuation2; | 217 | 55.2k | } | 218 | 33.9k | if (!IS_CONTINUATION_BYTE(ch4)) { | 219 | | /* invalid continuation byte */ | 220 | 14.7k | goto InvalidContinuation3; | 221 | 14.7k | } | 222 | 19.1k | ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 - | 223 | 19.1k | ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80); | 224 | 19.1k | assert ((ch > 0xFFFF) && (ch <= 0x10FFFF)); | 225 | 19.1k | s += 4; | 226 | 19.1k | if (STRINGLIB_MAX_CHAR <= 0xFFFF || | 227 | 19.1k | (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR)) | 228 | | /* Out-of-range */ | 229 | 19.1k | goto Return; | 230 | 0 | *p++ = ch; | 231 | 0 | continue; | 232 | 19.1k | } | 233 | 33.7M | goto InvalidStart; | 234 | 35.8M | } | 235 | 246k | ch = 0; | 236 | 83.7M | 
Return: | 237 | 83.7M | *inptr = s; | 238 | 83.7M | *outpos = p - dest; | 239 | 83.7M | return ch; | 240 | 65.1M | InvalidStart: | 241 | 65.1M | ch = 1; | 242 | 65.1M | goto Return; | 243 | 18.1M | InvalidContinuation1: | 244 | 18.1M | ch = 2; | 245 | 18.1M | goto Return; | 246 | 188k | InvalidContinuation2: | 247 | 188k | ch = 3; | 248 | 188k | goto Return; | 249 | 14.7k | InvalidContinuation3: | 250 | 14.7k | ch = 4; | 251 | 14.7k | goto Return; | 252 | 246k | } |
unicodeobject.c:ucs4lib_utf8_decode Line | Count | Source | 26 | 92.5M | { | 27 | 92.5M | Py_UCS4 ch; | 28 | 92.5M | const char *s = *inptr; | 29 | 92.5M | STRINGLIB_CHAR *p = dest + *outpos; | 30 | | | 31 | 203M | while (s < end) { | 32 | 203M | ch = (unsigned char)*s; | 33 | | | 34 | 203M | if (ch < 0x80) { | 35 | | /* Fast path for runs of ASCII characters. Given that common UTF-8 | 36 | | input will consist of an overwhelming majority of ASCII | 37 | | characters, we try to optimize for this case by checking | 38 | | as many characters as a C 'size_t' can contain. | 39 | | First, check if we can do an aligned read, as most CPUs have | 40 | | a penalty for unaligned reads. | 41 | | */ | 42 | 90.5M | if (_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) { | 43 | | /* Help register allocation */ | 44 | 11.4M | const char *_s = s; | 45 | 11.4M | STRINGLIB_CHAR *_p = p; | 46 | 158M | while (_s + SIZEOF_SIZE_T <= end) { | 47 | | /* Read a whole size_t at a time (either 4 or 8 bytes), | 48 | | and do a fast unrolled copy if it only contains ASCII | 49 | | characters. 
*/ | 50 | 158M | size_t value = *(const size_t *) _s; | 51 | 158M | if (value & ASCII_CHAR_MASK) | 52 | 11.4M | break; | 53 | 147M | #if PY_LITTLE_ENDIAN | 54 | 147M | _p[0] = (STRINGLIB_CHAR)(value & 0xFFu); | 55 | 147M | _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 56 | 147M | _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 57 | 147M | _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 58 | 147M | # if SIZEOF_SIZE_T == 8 | 59 | 147M | _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 60 | 147M | _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 61 | 147M | _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 62 | 147M | _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 63 | 147M | # endif | 64 | | #else | 65 | | # if SIZEOF_SIZE_T == 8 | 66 | | _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 67 | | _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 68 | | _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 69 | | _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 70 | | _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 71 | | _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 72 | | _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 73 | | _p[7] = (STRINGLIB_CHAR)(value & 0xFFu); | 74 | | # else | 75 | | _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 76 | | _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 77 | | _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 78 | | _p[3] = (STRINGLIB_CHAR)(value & 0xFFu); | 79 | | # endif | 80 | | #endif | 81 | 147M | _s += SIZEOF_SIZE_T; | 82 | 147M | _p += SIZEOF_SIZE_T; | 83 | 147M | } | 84 | 11.4M | s = _s; | 85 | 11.4M | p = _p; | 86 | 11.4M | if (s == end) | 87 | 2.21k | break; | 88 | 11.4M | ch = (unsigned char)*s; | 89 | 11.4M | } | 90 | 90.5M | if (ch < 0x80) { | 91 | 90.1M | s++; | 92 | 90.1M | *p++ = ch; | 93 | 90.1M | continue; | 94 | 90.1M | } | 95 | 90.5M | } | 96 | | | 97 | 113M | if (ch < 0xE0) { | 98 | | /* \xC2\x80-\xDF\xBF -- 0080-07FF */ | 99 | 51.6M | Py_UCS4 ch2; | 100 | 51.6M | if (ch < 
0xC2) { | 101 | | /* invalid sequence | 102 | | \x80-\xBF -- continuation byte | 103 | | \xC0-\xC1 -- fake 0000-007F */ | 104 | 36.8M | goto InvalidStart; | 105 | 36.8M | } | 106 | 14.8M | if (end - s < 2) { | 107 | | /* unexpected end of data: the caller will decide whether | 108 | | it's an error or not */ | 109 | 1.46k | break; | 110 | 1.46k | } | 111 | 14.8M | ch2 = (unsigned char)s[1]; | 112 | 14.8M | if (!IS_CONTINUATION_BYTE(ch2)) | 113 | | /* invalid continuation byte */ | 114 | 11.4M | goto InvalidContinuation1; | 115 | 3.44M | ch = (ch << 6) + ch2 - | 116 | 3.44M | ((0xC0 << 6) + 0x80); | 117 | 3.44M | assert ((ch > 0x007F) && (ch <= 0x07FF)); | 118 | 3.44M | s += 2; | 119 | 3.44M | if (STRINGLIB_MAX_CHAR <= 0x007F || | 120 | 3.44M | (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR)) | 121 | | /* Out-of-range */ | 122 | 0 | goto Return; | 123 | 3.44M | *p++ = ch; | 124 | 3.44M | continue; | 125 | 3.44M | } | 126 | | | 127 | 61.7M | if (ch < 0xF0) { | 128 | | /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */ | 129 | 24.8M | Py_UCS4 ch2, ch3; | 130 | 24.8M | if (end - s < 3) { | 131 | | /* unexpected end of data: the caller will decide whether | 132 | | it's an error or not */ | 133 | 3.30k | if (end - s < 2) | 134 | 753 | break; | 135 | 2.54k | ch2 = (unsigned char)s[1]; | 136 | 2.54k | if (!IS_CONTINUATION_BYTE(ch2) || | 137 | 2.54k | (ch2 < 0xA0 ? 
ch == 0xE0 : ch == 0xED)) | 138 | | /* for clarification see comments below */ | 139 | 1.66k | goto InvalidContinuation1; | 140 | 888 | break; | 141 | 2.54k | } | 142 | 24.8M | ch2 = (unsigned char)s[1]; | 143 | 24.8M | ch3 = (unsigned char)s[2]; | 144 | 24.8M | if (!IS_CONTINUATION_BYTE(ch2)) { | 145 | | /* invalid continuation byte */ | 146 | 7.00M | goto InvalidContinuation1; | 147 | 7.00M | } | 148 | 17.8M | if (ch == 0xE0) { | 149 | 91.3k | if (ch2 < 0xA0) | 150 | | /* invalid sequence | 151 | | \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */ | 152 | 42.3k | goto InvalidContinuation1; | 153 | 17.7M | } else if (ch == 0xED && ch2 >= 0xA0) { | 154 | | /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF | 155 | | will result in surrogates in range D800-DFFF. Surrogates are | 156 | | not valid UTF-8 so they are rejected. | 157 | | See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf | 158 | | (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ | 159 | 98.6k | goto InvalidContinuation1; | 160 | 98.6k | } | 161 | 17.7M | if (!IS_CONTINUATION_BYTE(ch3)) { | 162 | | /* invalid continuation byte */ | 163 | 919k | goto InvalidContinuation2; | 164 | 919k | } | 165 | 16.7M | ch = (ch << 12) + (ch2 << 6) + ch3 - | 166 | 16.7M | ((0xE0 << 12) + (0x80 << 6) + 0x80); | 167 | 16.7M | assert ((ch > 0x07FF) && (ch <= 0xFFFF)); | 168 | 16.7M | s += 3; | 169 | 16.7M | if (STRINGLIB_MAX_CHAR <= 0x07FF || | 170 | 16.7M | (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR)) | 171 | | /* Out-of-range */ | 172 | 0 | goto Return; | 173 | 16.7M | *p++ = ch; | 174 | 16.7M | continue; | 175 | 16.7M | } | 176 | | | 177 | 36.8M | if (ch < 0xF5) { | 178 | | /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */ | 179 | 3.86M | Py_UCS4 ch2, ch3, ch4; | 180 | 3.86M | if (end - s < 4) { | 181 | | /* unexpected end of data: the caller will decide whether | 182 | | it's an error or not */ | 183 | 3.56k | if (end - s < 2) | 184 | 688 | break; | 185 | 2.87k | ch2 = 
(unsigned char)s[1]; | 186 | 2.87k | if (!IS_CONTINUATION_BYTE(ch2) || | 187 | 2.87k | (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4)) | 188 | | /* for clarification see comments below */ | 189 | 1.72k | goto InvalidContinuation1; | 190 | 1.15k | if (end - s < 3) | 191 | 364 | break; | 192 | 788 | ch3 = (unsigned char)s[2]; | 193 | 788 | if (!IS_CONTINUATION_BYTE(ch3)) | 194 | 600 | goto InvalidContinuation2; | 195 | 188 | break; | 196 | 788 | } | 197 | 3.86M | ch2 = (unsigned char)s[1]; | 198 | 3.86M | ch3 = (unsigned char)s[2]; | 199 | 3.86M | ch4 = (unsigned char)s[3]; | 200 | 3.86M | if (!IS_CONTINUATION_BYTE(ch2)) { | 201 | | /* invalid continuation byte */ | 202 | 2.71M | goto InvalidContinuation1; | 203 | 2.71M | } | 204 | 1.14M | if (ch == 0xF0) { | 205 | 559k | if (ch2 < 0x90) | 206 | | /* invalid sequence | 207 | | \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */ | 208 | 35.1k | goto InvalidContinuation1; | 209 | 588k | } else if (ch == 0xF4 && ch2 >= 0x90) { | 210 | | /* invalid sequence | 211 | | \xF4\x90\x80\x80- -- 110000- overflow */ | 212 | 63.5k | goto InvalidContinuation1; | 213 | 63.5k | } | 214 | 1.04M | if (!IS_CONTINUATION_BYTE(ch3)) { | 215 | | /* invalid continuation byte */ | 216 | 316k | goto InvalidContinuation2; | 217 | 316k | } | 218 | 732k | if (!IS_CONTINUATION_BYTE(ch4)) { | 219 | | /* invalid continuation byte */ | 220 | 107k | goto InvalidContinuation3; | 221 | 107k | } | 222 | 624k | ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 - | 223 | 624k | ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80); | 224 | 624k | assert ((ch > 0xFFFF) && (ch <= 0x10FFFF)); | 225 | 624k | s += 4; | 226 | 624k | if (STRINGLIB_MAX_CHAR <= 0xFFFF || | 227 | 624k | (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR)) | 228 | | /* Out-of-range */ | 229 | 0 | goto Return; | 230 | 624k | *p++ = ch; | 231 | 624k | continue; | 232 | 624k | } | 233 | 32.9M | goto InvalidStart; | 234 | 36.8M | } | 235 | 35.1k | ch = 0; | 236 | 92.5M | Return: | 237 | 
92.5M | *inptr = s; | 238 | 92.5M | *outpos = p - dest; | 239 | 92.5M | return ch; | 240 | 69.7M | InvalidStart: | 241 | 69.7M | ch = 1; | 242 | 69.7M | goto Return; | 243 | 21.3M | InvalidContinuation1: | 244 | 21.3M | ch = 2; | 245 | 21.3M | goto Return; | 246 | 1.23M | InvalidContinuation2: | 247 | 1.23M | ch = 3; | 248 | 1.23M | goto Return; | 249 | 107k | InvalidContinuation3: | 250 | 107k | ch = 4; | 251 | 107k | goto Return; | 252 | 35.1k | } |
|
253 | | |
254 | | #undef ASCII_CHAR_MASK |
255 | | |
256 | | |
257 | | /* UTF-8 encoder specialized for one Unicode kind (one STRINGLIB_CHAR width), so that characters are read straight from 'data' instead of through the slow PyUnicode_READ() macro. Branches that cannot be taken for the kind are compiled out: UCS-1 strings don't need to handle surrogates, for example. |
258 | | Encodes data[0..size) into 'writer', which is initialized here and preallocated with size * max_char_size bytes (2, 3 or 4 bytes per character for 1-, 2- and 4-byte kinds). 'unicode' is only handed to the error-handling helpers. 'error_handler' may be _Py_ERROR_UNKNOWN, in which case it is resolved from 'errors' when the first surrogate is met. |
259 | | Returns the position just past the last byte written, or NULL on failure (size overflow, failed allocation, or a failing error handler). */ |
260 | | Py_LOCAL_INLINE(char *) |
261 | | STRINGLIB(utf8_encoder)(_PyBytesWriter *writer, |
262 | | PyObject *unicode, |
263 | | const STRINGLIB_CHAR *data, |
264 | | Py_ssize_t size, |
265 | | _Py_error_handler error_handler, |
266 | | const char *errors) |
267 | 6.71M | { |
268 | 6.71M | Py_ssize_t i; /* index into data of next input character */ |
269 | 6.71M | char *p; /* next free byte in output buffer */ |
270 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
271 | | PyObject *error_handler_obj = NULL; |
272 | | PyObject *exc = NULL; |
273 | | PyObject *rep = NULL; |
274 | | #endif |
275 | | #if STRINGLIB_SIZEOF_CHAR == 1 |
276 | | const Py_ssize_t max_char_size = 2; |
277 | | #elif STRINGLIB_SIZEOF_CHAR == 2 |
278 | | const Py_ssize_t max_char_size = 3; |
279 | | #else /* STRINGLIB_SIZEOF_CHAR == 4 */ |
280 | | const Py_ssize_t max_char_size = 4; |
281 | | #endif |
282 | | |
283 | 6.71M | assert(size >= 0); |
284 | 6.71M | if (size > PY_SSIZE_T_MAX / max_char_size) { |
285 | | /* size * max_char_size would overflow Py_ssize_t */ |
286 | 0 | PyErr_NoMemory(); |
287 | 0 | return NULL; |
288 | 0 | } |
289 | | /* Reserve the worst case up front: max_char_size bytes for every input character */ |
290 | 6.71M | _PyBytesWriter_Init(writer); |
291 | 6.71M | p = _PyBytesWriter_Alloc(writer, size * max_char_size); |
292 | 6.71M | if (p == NULL) |
293 | 0 | return NULL; |
294 | | |
295 | 3.16G | for (i = 0; i < size;) { /* i is advanced inside the body */ |
296 | 3.16G | Py_UCS4 ch = data[i++]; |
297 | | |
298 | 3.16G | if (ch < 0x80) { |
299 | | /* Encode ASCII: one byte, copied unchanged */ |
300 | 3.00G | *p++ = (char) ch; |
301 | | |
302 | 3.00G | } |
303 | 51.9M | else |
304 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
305 | 51.9M | if (ch < 0x0800) |
306 | 950k | #endif |
307 | 105M | { |
308 | | /* Two-byte sequence for U+0080..U+07FF. For 1-byte kinds the range test above is compiled out, so every non-ASCII (Latin-1) character ends up here. */ |
309 | 105M | *p++ = (char)(0xc0 | (ch >> 6)); |
310 | 105M | *p++ = (char)(0x80 | (ch & 0x3f)); |
311 | 105M | } |
312 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
313 | 50.9M | else if (Py_UNICODE_IS_SURROGATE(ch)) { |
314 | 321k | Py_ssize_t startpos, endpos, newpos; |
315 | 321k | Py_ssize_t k; |
316 | 321k | if (error_handler == _Py_ERROR_UNKNOWN) { |
317 | 181k | error_handler = _Py_GetErrorHandler(errors); |
318 | 181k | } |
319 | | /* [startpos, endpos) becomes the run of consecutive surrogates that starts at the character just read */ |
320 | 321k | startpos = i-1; |
321 | 321k | endpos = startpos+1; |
322 | | |
323 | 13.8M | while ((endpos < size) && Py_UNICODE_IS_SURROGATE(data[endpos])) |
324 | 13.5M | endpos++; |
325 | | |
326 | | /* Only overallocate the buffer if it's not the last write */ |
327 | 321k | writer->overallocate = (endpos < size); |
328 | | /* In every case below i == startpos + 1 on entry, so "i += endpos - startpos - 1" leaves i == endpos */ |
329 | 321k | switch (error_handler) |
330 | 321k | { |
331 | 0 | case _Py_ERROR_REPLACE: /* one '?' per surrogate in the run */ |
332 | 0 | memset(p, '?', endpos - startpos); |
333 | 0 | p += (endpos - startpos); |
334 | 0 | _Py_FALLTHROUGH; |
335 | 0 | case _Py_ERROR_IGNORE: /* write nothing, just skip the run */ |
336 | 0 | i += (endpos - startpos - 1); |
337 | 0 | break; |
338 | | |
339 | 0 | case _Py_ERROR_SURROGATEPASS: /* emit each surrogate as an ordinary three-byte sequence */ |
340 | 0 | for (k=startpos; k<endpos; k++) { |
341 | 0 | ch = data[k]; |
342 | 0 | *p++ = (char)(0xe0 | (ch >> 12)); |
343 | 0 | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); |
344 | 0 | *p++ = (char)(0x80 | (ch & 0x3f)); |
345 | 0 | } |
346 | 0 | i += (endpos - startpos - 1); |
347 | 0 | break; |
348 | | |
349 | 0 | case _Py_ERROR_BACKSLASHREPLACE: |
350 | | /* subtract preallocated bytes */ |
351 | 0 | writer->min_size -= max_char_size * (endpos - startpos); |
352 | 0 | p = backslashreplace(writer, p, |
353 | 0 | unicode, startpos, endpos); |
354 | 0 | if (p == NULL) |
355 | 0 | goto error; |
356 | 0 | i += (endpos - startpos - 1); |
357 | 0 | break; |
358 | | |
359 | 0 | case _Py_ERROR_XMLCHARREFREPLACE: |
360 | | /* subtract preallocated bytes */ |
361 | 0 | writer->min_size -= max_char_size * (endpos - startpos); |
362 | 0 | p = xmlcharrefreplace(writer, p, |
363 | 0 | unicode, startpos, endpos); |
364 | 0 | if (p == NULL) |
365 | 0 | goto error; |
366 | 0 | i += (endpos - startpos - 1); |
367 | 0 | break; |
368 | | |
369 | 190k | case _Py_ERROR_SURROGATEESCAPE: /* U+DC80..U+DCFF are written as the single bytes 0x80..0xFF */ |
370 | 9.81M | for (k=startpos; k<endpos; k++) { |
371 | 9.62M | ch = data[k]; |
372 | 9.62M | if (!(0xDC80 <= ch && ch <= 0xDCFF)) |
373 | 21 | break; /* a surrogate outside that range cannot be escaped */ |
374 | 9.62M | *p++ = (char)(ch & 0xff); |
375 | 9.62M | } |
376 | 190k | if (k >= endpos) { /* the whole run was escaped */ |
377 | 190k | i += (endpos - startpos - 1); |
378 | 190k | break; |
379 | 190k | } |
380 | 21 | startpos = k; /* the rest of the run goes through the generic path below */ |
381 | 21 | assert(startpos < endpos); |
382 | 21 | _Py_FALLTHROUGH; |
383 | 131k | default: /* generic path: get a replacement object (rep) and a resume position (newpos) */ |
384 | 131k | rep = unicode_encode_call_errorhandler( |
385 | 131k | errors, &error_handler_obj, "utf-8", "surrogates not allowed", |
386 | 131k | unicode, &exc, startpos, endpos, &newpos); |
387 | 131k | if (!rep) |
388 | 131k | goto error; |
389 | | /* Encoding resumes at newpos. If that lies before startpos, the characters in [newpos, startpos) are visited again, so max_char_size more bytes per such character are requested from the writer; otherwise the reservation for the characters in [startpos, newpos) is given back. */ |
390 | 0 | if (newpos < startpos) { |
391 | 0 | writer->overallocate = 1; |
392 | 0 | p = _PyBytesWriter_Prepare(writer, p, |
393 | 0 | max_char_size * (startpos - newpos)); |
394 | 0 | if (p == NULL) |
395 | 0 | goto error; |
396 | 0 | } |
397 | 0 | else { |
398 | | /* subtract preallocated bytes */ |
399 | 0 | writer->min_size -= max_char_size * (newpos - startpos); |
400 | | /* Only overallocate the buffer if it's not the last write */ |
401 | 0 | writer->overallocate = (newpos < size); |
402 | 0 | } |
403 | | |
404 | 0 | if (PyBytes_Check(rep)) { /* a bytes replacement is copied verbatim */ |
405 | 0 | p = _PyBytesWriter_WriteBytes(writer, p, |
406 | 0 | PyBytes_AS_STRING(rep), |
407 | 0 | PyBytes_GET_SIZE(rep)); |
408 | 0 | } |
409 | 0 | else { |
410 | | /* rep is unicode: only a pure-ASCII replacement is accepted, and its characters are copied as bytes; anything else raises the encode error */ |
411 | 0 | if (!PyUnicode_IS_ASCII(rep)) { |
412 | 0 | raise_encode_exception(&exc, "utf-8", unicode, |
413 | 0 | startpos, endpos, |
414 | 0 | "surrogates not allowed"); |
415 | 0 | goto error; |
416 | 0 | } |
417 | | |
418 | 0 | p = _PyBytesWriter_WriteBytes(writer, p, |
419 | 0 | PyUnicode_DATA(rep), |
420 | 0 | PyUnicode_GET_LENGTH(rep)); |
421 | 0 | } |
422 | | |
423 | 0 | if (p == NULL) |
424 | 0 | goto error; |
425 | 0 | Py_CLEAR(rep); |
426 | |
|
427 | 0 | i = newpos; |
428 | 321k | } |
429 | | |
430 | | /* If overallocation was disabled, ensure that it was the last |
431 | | write. Otherwise, we missed an optimization */ |
432 | 190k | assert(writer->overallocate || i == size); |
433 | 190k | } |
434 | 25.4M | else |
435 | | #if STRINGLIB_SIZEOF_CHAR > 2 |
436 | 25.4M | if (ch < 0x10000) |
437 | 25.3M | #endif |
438 | 50.5M | { /* three-byte sequence: ch >= 0x0800, not a surrogate, and below 0x10000 */ |
439 | 50.5M | *p++ = (char)(0xe0 | (ch >> 12)); |
440 | 50.5M | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); |
441 | 50.5M | *p++ = (char)(0x80 | (ch & 0x3f)); |
442 | 50.5M | } |
443 | | #if STRINGLIB_SIZEOF_CHAR > 2 |
444 | | else /* ch >= 0x10000 */ |
445 | 142k | { |
446 | 142k | assert(ch <= MAX_UNICODE); |
447 | | /* Four-byte sequence for U+10000..MAX_UNICODE (only compiled for 4-byte kinds) */ |
448 | 142k | *p++ = (char)(0xf0 | (ch >> 18)); |
449 | 142k | *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); |
450 | 142k | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); |
451 | 142k | *p++ = (char)(0x80 | (ch & 0x3f)); |
452 | 142k | } |
453 | | #endif /* STRINGLIB_SIZEOF_CHAR > 2 */ |
454 | | #endif /* STRINGLIB_SIZEOF_CHAR > 1 */ |
455 | 3.16G | } |
456 | | /* Success: rep was already cleared after each use, so only the handler object and the exception remain to be released */ |
457 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
458 | 1.28M | Py_XDECREF(error_handler_obj); |
459 | 1.28M | Py_XDECREF(exc); |
460 | | #endif |
461 | 1.28M | return p; |
462 | | |
463 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
464 | 131k | error: /* failure: drop every reference this function may still hold */ |
465 | 131k | Py_XDECREF(rep); |
466 | 131k | Py_XDECREF(error_handler_obj); |
467 | 131k | Py_XDECREF(exc); |
468 | 131k | return NULL; |
469 | | #endif |
470 | 1.41M | } unicodeobject.c:ucs1lib_utf8_encoder Line | Count | Source | 267 | 5.30M | { | 268 | 5.30M | Py_ssize_t i; /* index into data of next input character */ | 269 | 5.30M | char *p; /* next free byte in output buffer */ | 270 | | #if STRINGLIB_SIZEOF_CHAR > 1 | 271 | | PyObject *error_handler_obj = NULL; | 272 | | PyObject *exc = NULL; | 273 | | PyObject *rep = NULL; | 274 | | #endif | 275 | 5.30M | #if STRINGLIB_SIZEOF_CHAR == 1 | 276 | 5.30M | const Py_ssize_t max_char_size = 2; | 277 | | #elif STRINGLIB_SIZEOF_CHAR == 2 | 278 | | const Py_ssize_t max_char_size = 3; | 279 | | #else /* STRINGLIB_SIZEOF_CHAR == 4 */ | 280 | | const Py_ssize_t max_char_size = 4; | 281 | | #endif | 282 | | | 283 | 5.30M | assert(size >= 0); | 284 | 5.30M | if (size > PY_SSIZE_T_MAX / max_char_size) { | 285 | | /* integer overflow */ | 286 | 0 | PyErr_NoMemory(); | 287 | 0 | return NULL; | 288 | 0 | } | 289 | | | 290 | 5.30M | _PyBytesWriter_Init(writer); | 291 | 5.30M | p = _PyBytesWriter_Alloc(writer, size * max_char_size); | 292 | 5.30M | if (p == NULL) | 293 | 0 | return NULL; | 294 | | | 295 | 663M | for (i = 0; i < size;) { | 296 | 658M | Py_UCS4 ch = data[i++]; | 297 | | | 298 | 658M | if (ch < 0x80) { | 299 | | /* Encode ASCII */ | 300 | 553M | *p++ = (char) ch; | 301 | | | 302 | 553M | } | 303 | 104M | else | 304 | | #if STRINGLIB_SIZEOF_CHAR > 1 | 305 | | if (ch < 0x0800) | 306 | | #endif | 307 | 104M | { | 308 | | /* Encode Latin-1 */ | 309 | 104M | *p++ = (char)(0xc0 | (ch >> 6)); | 310 | 104M | *p++ = (char)(0x80 | (ch & 0x3f)); | 311 | 104M | } | 312 | | #if STRINGLIB_SIZEOF_CHAR > 1 | 313 | | else if (Py_UNICODE_IS_SURROGATE(ch)) { | 314 | | Py_ssize_t startpos, endpos, newpos; | 315 | | Py_ssize_t k; | 316 | | if (error_handler == _Py_ERROR_UNKNOWN) { | 317 | | error_handler = _Py_GetErrorHandler(errors); | 318 | | } | 319 | | | 320 | | startpos = i-1; | 321 | | endpos = startpos+1; | 322 | | | 323 | | while ((endpos < size) && 
Py_UNICODE_IS_SURROGATE(data[endpos])) | 324 | | endpos++; | 325 | | | 326 | | /* Only overallocate the buffer if it's not the last write */ | 327 | | writer->overallocate = (endpos < size); | 328 | | | 329 | | switch (error_handler) | 330 | | { | 331 | | case _Py_ERROR_REPLACE: | 332 | | memset(p, '?', endpos - startpos); | 333 | | p += (endpos - startpos); | 334 | | _Py_FALLTHROUGH; | 335 | | case _Py_ERROR_IGNORE: | 336 | | i += (endpos - startpos - 1); | 337 | | break; | 338 | | | 339 | | case _Py_ERROR_SURROGATEPASS: | 340 | | for (k=startpos; k<endpos; k++) { | 341 | | ch = data[k]; | 342 | | *p++ = (char)(0xe0 | (ch >> 12)); | 343 | | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 344 | | *p++ = (char)(0x80 | (ch & 0x3f)); | 345 | | } | 346 | | i += (endpos - startpos - 1); | 347 | | break; | 348 | | | 349 | | case _Py_ERROR_BACKSLASHREPLACE: | 350 | | /* subtract preallocated bytes */ | 351 | | writer->min_size -= max_char_size * (endpos - startpos); | 352 | | p = backslashreplace(writer, p, | 353 | | unicode, startpos, endpos); | 354 | | if (p == NULL) | 355 | | goto error; | 356 | | i += (endpos - startpos - 1); | 357 | | break; | 358 | | | 359 | | case _Py_ERROR_XMLCHARREFREPLACE: | 360 | | /* subtract preallocated bytes */ | 361 | | writer->min_size -= max_char_size * (endpos - startpos); | 362 | | p = xmlcharrefreplace(writer, p, | 363 | | unicode, startpos, endpos); | 364 | | if (p == NULL) | 365 | | goto error; | 366 | | i += (endpos - startpos - 1); | 367 | | break; | 368 | | | 369 | | case _Py_ERROR_SURROGATEESCAPE: | 370 | | for (k=startpos; k<endpos; k++) { | 371 | | ch = data[k]; | 372 | | if (!(0xDC80 <= ch && ch <= 0xDCFF)) | 373 | | break; | 374 | | *p++ = (char)(ch & 0xff); | 375 | | } | 376 | | if (k >= endpos) { | 377 | | i += (endpos - startpos - 1); | 378 | | break; | 379 | | } | 380 | | startpos = k; | 381 | | assert(startpos < endpos); | 382 | | _Py_FALLTHROUGH; | 383 | | default: | 384 | | rep = unicode_encode_call_errorhandler( | 385 | 
| errors, &error_handler_obj, "utf-8", "surrogates not allowed", | 386 | | unicode, &exc, startpos, endpos, &newpos); | 387 | | if (!rep) | 388 | | goto error; | 389 | | | 390 | | if (newpos < startpos) { | 391 | | writer->overallocate = 1; | 392 | | p = _PyBytesWriter_Prepare(writer, p, | 393 | | max_char_size * (startpos - newpos)); | 394 | | if (p == NULL) | 395 | | goto error; | 396 | | } | 397 | | else { | 398 | | /* subtract preallocated bytes */ | 399 | | writer->min_size -= max_char_size * (newpos - startpos); | 400 | | /* Only overallocate the buffer if it's not the last write */ | 401 | | writer->overallocate = (newpos < size); | 402 | | } | 403 | | | 404 | | if (PyBytes_Check(rep)) { | 405 | | p = _PyBytesWriter_WriteBytes(writer, p, | 406 | | PyBytes_AS_STRING(rep), | 407 | | PyBytes_GET_SIZE(rep)); | 408 | | } | 409 | | else { | 410 | | /* rep is unicode */ | 411 | | if (!PyUnicode_IS_ASCII(rep)) { | 412 | | raise_encode_exception(&exc, "utf-8", unicode, | 413 | | startpos, endpos, | 414 | | "surrogates not allowed"); | 415 | | goto error; | 416 | | } | 417 | | | 418 | | p = _PyBytesWriter_WriteBytes(writer, p, | 419 | | PyUnicode_DATA(rep), | 420 | | PyUnicode_GET_LENGTH(rep)); | 421 | | } | 422 | | | 423 | | if (p == NULL) | 424 | | goto error; | 425 | | Py_CLEAR(rep); | 426 | | | 427 | | i = newpos; | 428 | | } | 429 | | | 430 | | /* If overallocation was disabled, ensure that it was the last | 431 | | write. 
Otherwise, we missed an optimization */ | 432 | | assert(writer->overallocate || i == size); | 433 | | } | 434 | | else | 435 | | #if STRINGLIB_SIZEOF_CHAR > 2 | 436 | | if (ch < 0x10000) | 437 | | #endif | 438 | | { | 439 | | *p++ = (char)(0xe0 | (ch >> 12)); | 440 | | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 441 | | *p++ = (char)(0x80 | (ch & 0x3f)); | 442 | | } | 443 | | #if STRINGLIB_SIZEOF_CHAR > 2 | 444 | | else /* ch >= 0x10000 */ | 445 | | { | 446 | | assert(ch <= MAX_UNICODE); | 447 | | /* Encode UCS4 Unicode ordinals */ | 448 | | *p++ = (char)(0xf0 | (ch >> 18)); | 449 | | *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); | 450 | | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 451 | | *p++ = (char)(0x80 | (ch & 0x3f)); | 452 | | } | 453 | | #endif /* STRINGLIB_SIZEOF_CHAR > 2 */ | 454 | | #endif /* STRINGLIB_SIZEOF_CHAR > 1 */ | 455 | 658M | } | 456 | | | 457 | | #if STRINGLIB_SIZEOF_CHAR > 1 | 458 | | Py_XDECREF(error_handler_obj); | 459 | | Py_XDECREF(exc); | 460 | | #endif | 461 | 5.30M | return p; | 462 | | | 463 | | #if STRINGLIB_SIZEOF_CHAR > 1 | 464 | | error: | 465 | | Py_XDECREF(rep); | 466 | | Py_XDECREF(error_handler_obj); | 467 | | Py_XDECREF(exc); | 468 | | return NULL; | 469 | | #endif | 470 | 5.30M | } |
unicodeobject.c:ucs2lib_utf8_encoder Line | Count | Source | 267 | 1.34M | { | 268 | 1.34M | Py_ssize_t i; /* index into data of next input character */ | 269 | 1.34M | char *p; /* next free byte in output buffer */ | 270 | 1.34M | #if STRINGLIB_SIZEOF_CHAR > 1 | 271 | 1.34M | PyObject *error_handler_obj = NULL; | 272 | 1.34M | PyObject *exc = NULL; | 273 | 1.34M | PyObject *rep = NULL; | 274 | 1.34M | #endif | 275 | | #if STRINGLIB_SIZEOF_CHAR == 1 | 276 | | const Py_ssize_t max_char_size = 2; | 277 | | #elif STRINGLIB_SIZEOF_CHAR == 2 | 278 | | const Py_ssize_t max_char_size = 3; | 279 | | #else /* STRINGLIB_SIZEOF_CHAR == 4 */ | 280 | | const Py_ssize_t max_char_size = 4; | 281 | | #endif | 282 | | | 283 | 1.34M | assert(size >= 0); | 284 | 1.34M | if (size > PY_SSIZE_T_MAX / max_char_size) { | 285 | | /* integer overflow */ | 286 | 0 | PyErr_NoMemory(); | 287 | 0 | return NULL; | 288 | 0 | } | 289 | | | 290 | 1.34M | _PyBytesWriter_Init(writer); | 291 | 1.34M | p = _PyBytesWriter_Alloc(writer, size * max_char_size); | 292 | 1.34M | if (p == NULL) | 293 | 0 | return NULL; | 294 | | | 295 | 1.19G | for (i = 0; i < size;) { | 296 | 1.19G | Py_UCS4 ch = data[i++]; | 297 | | | 298 | 1.19G | if (ch < 0x80) { | 299 | | /* Encode ASCII */ | 300 | 1.16G | *p++ = (char) ch; | 301 | | | 302 | 1.16G | } | 303 | 25.7M | else | 304 | 25.7M | #if STRINGLIB_SIZEOF_CHAR > 1 | 305 | 25.7M | if (ch < 0x0800) | 306 | 276k | #endif | 307 | 276k | { | 308 | | /* Encode Latin-1 */ | 309 | 276k | *p++ = (char)(0xc0 | (ch >> 6)); | 310 | 276k | *p++ = (char)(0x80 | (ch & 0x3f)); | 311 | 276k | } | 312 | 25.4M | #if STRINGLIB_SIZEOF_CHAR > 1 | 313 | 25.4M | else if (Py_UNICODE_IS_SURROGATE(ch)) { | 314 | 303k | Py_ssize_t startpos, endpos, newpos; | 315 | 303k | Py_ssize_t k; | 316 | 303k | if (error_handler == _Py_ERROR_UNKNOWN) { | 317 | 176k | error_handler = _Py_GetErrorHandler(errors); | 318 | 176k | } | 319 | | | 320 | 303k | startpos = i-1; | 321 | 303k | endpos = startpos+1; | 
322 | | | 323 | 13.7M | while ((endpos < size) && Py_UNICODE_IS_SURROGATE(data[endpos])) | 324 | 13.3M | endpos++; | 325 | | | 326 | | /* Only overallocate the buffer if it's not the last write */ | 327 | 303k | writer->overallocate = (endpos < size); | 328 | | | 329 | 303k | switch (error_handler) | 330 | 303k | { | 331 | 0 | case _Py_ERROR_REPLACE: | 332 | 0 | memset(p, '?', endpos - startpos); | 333 | 0 | p += (endpos - startpos); | 334 | 0 | _Py_FALLTHROUGH; | 335 | 0 | case _Py_ERROR_IGNORE: | 336 | 0 | i += (endpos - startpos - 1); | 337 | 0 | break; | 338 | | | 339 | 0 | case _Py_ERROR_SURROGATEPASS: | 340 | 0 | for (k=startpos; k<endpos; k++) { | 341 | 0 | ch = data[k]; | 342 | 0 | *p++ = (char)(0xe0 | (ch >> 12)); | 343 | 0 | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 344 | 0 | *p++ = (char)(0x80 | (ch & 0x3f)); | 345 | 0 | } | 346 | 0 | i += (endpos - startpos - 1); | 347 | 0 | break; | 348 | | | 349 | 0 | case _Py_ERROR_BACKSLASHREPLACE: | 350 | | /* subtract preallocated bytes */ | 351 | 0 | writer->min_size -= max_char_size * (endpos - startpos); | 352 | 0 | p = backslashreplace(writer, p, | 353 | 0 | unicode, startpos, endpos); | 354 | 0 | if (p == NULL) | 355 | 0 | goto error; | 356 | 0 | i += (endpos - startpos - 1); | 357 | 0 | break; | 358 | | | 359 | 0 | case _Py_ERROR_XMLCHARREFREPLACE: | 360 | | /* subtract preallocated bytes */ | 361 | 0 | writer->min_size -= max_char_size * (endpos - startpos); | 362 | 0 | p = xmlcharrefreplace(writer, p, | 363 | 0 | unicode, startpos, endpos); | 364 | 0 | if (p == NULL) | 365 | 0 | goto error; | 366 | 0 | i += (endpos - startpos - 1); | 367 | 0 | break; | 368 | | | 369 | 175k | case _Py_ERROR_SURROGATEESCAPE: | 370 | 9.68M | for (k=startpos; k<endpos; k++) { | 371 | 9.51M | ch = data[k]; | 372 | 9.51M | if (!(0xDC80 <= ch && ch <= 0xDCFF)) | 373 | 13 | break; | 374 | 9.51M | *p++ = (char)(ch & 0xff); | 375 | 9.51M | } | 376 | 175k | if (k >= endpos) { | 377 | 175k | i += (endpos - startpos - 1); | 378 | 
175k | break; | 379 | 175k | } | 380 | 13 | startpos = k; | 381 | 13 | assert(startpos < endpos); | 382 | 13 | _Py_FALLTHROUGH; | 383 | 127k | default: | 384 | 127k | rep = unicode_encode_call_errorhandler( | 385 | 127k | errors, &error_handler_obj, "utf-8", "surrogates not allowed", | 386 | 127k | unicode, &exc, startpos, endpos, &newpos); | 387 | 127k | if (!rep) | 388 | 127k | goto error; | 389 | | | 390 | 0 | if (newpos < startpos) { | 391 | 0 | writer->overallocate = 1; | 392 | 0 | p = _PyBytesWriter_Prepare(writer, p, | 393 | 0 | max_char_size * (startpos - newpos)); | 394 | 0 | if (p == NULL) | 395 | 0 | goto error; | 396 | 0 | } | 397 | 0 | else { | 398 | | /* subtract preallocated bytes */ | 399 | 0 | writer->min_size -= max_char_size * (newpos - startpos); | 400 | | /* Only overallocate the buffer if it's not the last write */ | 401 | 0 | writer->overallocate = (newpos < size); | 402 | 0 | } | 403 | | | 404 | 0 | if (PyBytes_Check(rep)) { | 405 | 0 | p = _PyBytesWriter_WriteBytes(writer, p, | 406 | 0 | PyBytes_AS_STRING(rep), | 407 | 0 | PyBytes_GET_SIZE(rep)); | 408 | 0 | } | 409 | 0 | else { | 410 | | /* rep is unicode */ | 411 | 0 | if (!PyUnicode_IS_ASCII(rep)) { | 412 | 0 | raise_encode_exception(&exc, "utf-8", unicode, | 413 | 0 | startpos, endpos, | 414 | 0 | "surrogates not allowed"); | 415 | 0 | goto error; | 416 | 0 | } | 417 | | | 418 | 0 | p = _PyBytesWriter_WriteBytes(writer, p, | 419 | 0 | PyUnicode_DATA(rep), | 420 | 0 | PyUnicode_GET_LENGTH(rep)); | 421 | 0 | } | 422 | | | 423 | 0 | if (p == NULL) | 424 | 0 | goto error; | 425 | 0 | Py_CLEAR(rep); | 426 | |
| 427 | 0 | i = newpos; | 428 | 303k | } | 429 | | | 430 | | /* If overallocation was disabled, ensure that it was the last | 431 | | write. Otherwise, we missed an optimization */ | 432 | 175k | assert(writer->overallocate || i == size); | 433 | 175k | } | 434 | 25.1M | else | 435 | | #if STRINGLIB_SIZEOF_CHAR > 2 | 436 | | if (ch < 0x10000) | 437 | | #endif | 438 | 25.1M | { | 439 | 25.1M | *p++ = (char)(0xe0 | (ch >> 12)); | 440 | 25.1M | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 441 | 25.1M | *p++ = (char)(0x80 | (ch & 0x3f)); | 442 | 25.1M | } | 443 | | #if STRINGLIB_SIZEOF_CHAR > 2 | 444 | | else /* ch >= 0x10000 */ | 445 | | { | 446 | | assert(ch <= MAX_UNICODE); | 447 | | /* Encode UCS4 Unicode ordinals */ | 448 | | *p++ = (char)(0xf0 | (ch >> 18)); | 449 | | *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); | 450 | | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 451 | | *p++ = (char)(0x80 | (ch & 0x3f)); | 452 | | } | 453 | | #endif /* STRINGLIB_SIZEOF_CHAR > 2 */ | 454 | 1.19G | #endif /* STRINGLIB_SIZEOF_CHAR > 1 */ | 455 | 1.19G | } | 456 | | | 457 | 1.21M | #if STRINGLIB_SIZEOF_CHAR > 1 | 458 | 1.21M | Py_XDECREF(error_handler_obj); | 459 | 1.21M | Py_XDECREF(exc); | 460 | 1.21M | #endif | 461 | 1.21M | return p; | 462 | | | 463 | 0 | #if STRINGLIB_SIZEOF_CHAR > 1 | 464 | 127k | error: | 465 | 127k | Py_XDECREF(rep); | 466 | 127k | Py_XDECREF(error_handler_obj); | 467 | 127k | Py_XDECREF(exc); | 468 | 127k | return NULL; | 469 | 1.34M | #endif | 470 | 1.34M | } |
unicodeobject.c:ucs4lib_utf8_encoder Line | Count | Source | 267 | 71.1k | { | 268 | 71.1k | Py_ssize_t i; /* index into data of next input character */ | 269 | 71.1k | char *p; /* next free byte in output buffer */ | 270 | 71.1k | #if STRINGLIB_SIZEOF_CHAR > 1 | 271 | 71.1k | PyObject *error_handler_obj = NULL; | 272 | 71.1k | PyObject *exc = NULL; | 273 | 71.1k | PyObject *rep = NULL; | 274 | 71.1k | #endif | 275 | | #if STRINGLIB_SIZEOF_CHAR == 1 | 276 | | const Py_ssize_t max_char_size = 2; | 277 | | #elif STRINGLIB_SIZEOF_CHAR == 2 | 278 | | const Py_ssize_t max_char_size = 3; | 279 | | #else /* STRINGLIB_SIZEOF_CHAR == 4 */ | 280 | 71.1k | const Py_ssize_t max_char_size = 4; | 281 | 71.1k | #endif | 282 | | | 283 | 71.1k | assert(size >= 0); | 284 | 71.1k | if (size > PY_SSIZE_T_MAX / max_char_size) { | 285 | | /* integer overflow */ | 286 | 0 | PyErr_NoMemory(); | 287 | 0 | return NULL; | 288 | 0 | } | 289 | | | 290 | 71.1k | _PyBytesWriter_Init(writer); | 291 | 71.1k | p = _PyBytesWriter_Alloc(writer, size * max_char_size); | 292 | 71.1k | if (p == NULL) | 293 | 0 | return NULL; | 294 | | | 295 | 1.30G | for (i = 0; i < size;) { | 296 | 1.30G | Py_UCS4 ch = data[i++]; | 297 | | | 298 | 1.30G | if (ch < 0x80) { | 299 | | /* Encode ASCII */ | 300 | 1.28G | *p++ = (char) ch; | 301 | | | 302 | 1.28G | } | 303 | 26.1M | else | 304 | 26.1M | #if STRINGLIB_SIZEOF_CHAR > 1 | 305 | 26.1M | if (ch < 0x0800) | 306 | 674k | #endif | 307 | 674k | { | 308 | | /* Encode Latin-1 */ | 309 | 674k | *p++ = (char)(0xc0 | (ch >> 6)); | 310 | 674k | *p++ = (char)(0x80 | (ch & 0x3f)); | 311 | 674k | } | 312 | 25.5M | #if STRINGLIB_SIZEOF_CHAR > 1 | 313 | 25.5M | else if (Py_UNICODE_IS_SURROGATE(ch)) { | 314 | 18.6k | Py_ssize_t startpos, endpos, newpos; | 315 | 18.6k | Py_ssize_t k; | 316 | 18.6k | if (error_handler == _Py_ERROR_UNKNOWN) { | 317 | 5.74k | error_handler = _Py_GetErrorHandler(errors); | 318 | 5.74k | } | 319 | | | 320 | 18.6k | startpos = i-1; | 321 | 18.6k | 
endpos = startpos+1; | 322 | | | 323 | 123k | while ((endpos < size) && Py_UNICODE_IS_SURROGATE(data[endpos])) | 324 | 104k | endpos++; | 325 | | | 326 | | /* Only overallocate the buffer if it's not the last write */ | 327 | 18.6k | writer->overallocate = (endpos < size); | 328 | | | 329 | 18.6k | switch (error_handler) | 330 | 18.6k | { | 331 | 0 | case _Py_ERROR_REPLACE: | 332 | 0 | memset(p, '?', endpos - startpos); | 333 | 0 | p += (endpos - startpos); | 334 | 0 | _Py_FALLTHROUGH; | 335 | 0 | case _Py_ERROR_IGNORE: | 336 | 0 | i += (endpos - startpos - 1); | 337 | 0 | break; | 338 | | | 339 | 0 | case _Py_ERROR_SURROGATEPASS: | 340 | 0 | for (k=startpos; k<endpos; k++) { | 341 | 0 | ch = data[k]; | 342 | 0 | *p++ = (char)(0xe0 | (ch >> 12)); | 343 | 0 | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 344 | 0 | *p++ = (char)(0x80 | (ch & 0x3f)); | 345 | 0 | } | 346 | 0 | i += (endpos - startpos - 1); | 347 | 0 | break; | 348 | | | 349 | 0 | case _Py_ERROR_BACKSLASHREPLACE: | 350 | | /* subtract preallocated bytes */ | 351 | 0 | writer->min_size -= max_char_size * (endpos - startpos); | 352 | 0 | p = backslashreplace(writer, p, | 353 | 0 | unicode, startpos, endpos); | 354 | 0 | if (p == NULL) | 355 | 0 | goto error; | 356 | 0 | i += (endpos - startpos - 1); | 357 | 0 | break; | 358 | | | 359 | 0 | case _Py_ERROR_XMLCHARREFREPLACE: | 360 | | /* subtract preallocated bytes */ | 361 | 0 | writer->min_size -= max_char_size * (endpos - startpos); | 362 | 0 | p = xmlcharrefreplace(writer, p, | 363 | 0 | unicode, startpos, endpos); | 364 | 0 | if (p == NULL) | 365 | 0 | goto error; | 366 | 0 | i += (endpos - startpos - 1); | 367 | 0 | break; | 368 | | | 369 | 15.0k | case _Py_ERROR_SURROGATEESCAPE: | 370 | 129k | for (k=startpos; k<endpos; k++) { | 371 | 114k | ch = data[k]; | 372 | 114k | if (!(0xDC80 <= ch && ch <= 0xDCFF)) | 373 | 8 | break; | 374 | 114k | *p++ = (char)(ch & 0xff); | 375 | 114k | } | 376 | 15.0k | if (k >= endpos) { | 377 | 15.0k | i += (endpos - 
startpos - 1); | 378 | 15.0k | break; | 379 | 15.0k | } | 380 | 8 | startpos = k; | 381 | 8 | assert(startpos < endpos); | 382 | 8 | _Py_FALLTHROUGH; | 383 | 3.68k | default: | 384 | 3.68k | rep = unicode_encode_call_errorhandler( | 385 | 3.68k | errors, &error_handler_obj, "utf-8", "surrogates not allowed", | 386 | 3.68k | unicode, &exc, startpos, endpos, &newpos); | 387 | 3.68k | if (!rep) | 388 | 3.68k | goto error; | 389 | | | 390 | 0 | if (newpos < startpos) { | 391 | 0 | writer->overallocate = 1; | 392 | 0 | p = _PyBytesWriter_Prepare(writer, p, | 393 | 0 | max_char_size * (startpos - newpos)); | 394 | 0 | if (p == NULL) | 395 | 0 | goto error; | 396 | 0 | } | 397 | 0 | else { | 398 | | /* subtract preallocated bytes */ | 399 | 0 | writer->min_size -= max_char_size * (newpos - startpos); | 400 | | /* Only overallocate the buffer if it's not the last write */ | 401 | 0 | writer->overallocate = (newpos < size); | 402 | 0 | } | 403 | | | 404 | 0 | if (PyBytes_Check(rep)) { | 405 | 0 | p = _PyBytesWriter_WriteBytes(writer, p, | 406 | 0 | PyBytes_AS_STRING(rep), | 407 | 0 | PyBytes_GET_SIZE(rep)); | 408 | 0 | } | 409 | 0 | else { | 410 | | /* rep is unicode */ | 411 | 0 | if (!PyUnicode_IS_ASCII(rep)) { | 412 | 0 | raise_encode_exception(&exc, "utf-8", unicode, | 413 | 0 | startpos, endpos, | 414 | 0 | "surrogates not allowed"); | 415 | 0 | goto error; | 416 | 0 | } | 417 | | | 418 | 0 | p = _PyBytesWriter_WriteBytes(writer, p, | 419 | 0 | PyUnicode_DATA(rep), | 420 | 0 | PyUnicode_GET_LENGTH(rep)); | 421 | 0 | } | 422 | | | 423 | 0 | if (p == NULL) | 424 | 0 | goto error; | 425 | 0 | Py_CLEAR(rep); | 426 | |
| 427 | 0 | i = newpos; | 428 | 18.6k | } | 429 | | | 430 | | /* If overallocation was disabled, ensure that it was the last | 431 | | write. Otherwise, we missed an optimization */ | 432 | 15.0k | assert(writer->overallocate || i == size); | 433 | 15.0k | } | 434 | 25.4M | else | 435 | 25.4M | #if STRINGLIB_SIZEOF_CHAR > 2 | 436 | 25.4M | if (ch < 0x10000) | 437 | 25.3M | #endif | 438 | 25.3M | { | 439 | 25.3M | *p++ = (char)(0xe0 | (ch >> 12)); | 440 | 25.3M | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 441 | 25.3M | *p++ = (char)(0x80 | (ch & 0x3f)); | 442 | 25.3M | } | 443 | 142k | #if STRINGLIB_SIZEOF_CHAR > 2 | 444 | 142k | else /* ch >= 0x10000 */ | 445 | 142k | { | 446 | 142k | assert(ch <= MAX_UNICODE); | 447 | | /* Encode UCS4 Unicode ordinals */ | 448 | 142k | *p++ = (char)(0xf0 | (ch >> 18)); | 449 | 142k | *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); | 450 | 142k | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 451 | 142k | *p++ = (char)(0x80 | (ch & 0x3f)); | 452 | 142k | } | 453 | 1.30G | #endif /* STRINGLIB_SIZEOF_CHAR > 2 */ | 454 | 1.30G | #endif /* STRINGLIB_SIZEOF_CHAR > 1 */ | 455 | 1.30G | } | 456 | | | 457 | 67.4k | #if STRINGLIB_SIZEOF_CHAR > 1 | 458 | 67.4k | Py_XDECREF(error_handler_obj); | 459 | 67.4k | Py_XDECREF(exc); | 460 | 67.4k | #endif | 461 | 67.4k | return p; | 462 | | | 463 | 0 | #if STRINGLIB_SIZEOF_CHAR > 1 | 464 | 3.68k | error: | 465 | 3.68k | Py_XDECREF(rep); | 466 | 3.68k | Py_XDECREF(error_handler_obj); | 467 | 3.68k | Py_XDECREF(exc); | 468 | 3.68k | return NULL; | 469 | 71.1k | #endif | 470 | 71.1k | } |
Unexecuted instantiation: unicodeobject.c:asciilib_utf8_encoder |
471 | | |
472 | | /* The pattern for constructing UCS2-repeated masks. */ |
473 | | #if SIZEOF_LONG == 8 |
474 | 332k | # define UCS2_REPEAT_MASK 0x0001000100010001ul |
475 | | #elif SIZEOF_LONG == 4 |
476 | | # define UCS2_REPEAT_MASK 0x00010001ul |
477 | | #else |
478 | | # error C 'long' size should be either 4 or 8! |
479 | | #endif |
480 | | |
481 | | /* The mask for fast checking. */ |
482 | | #if STRINGLIB_SIZEOF_CHAR == 1 |
483 | | /* The mask for fast checking of whether a C 'long' contains a |
484 | | non-ASCII or non-Latin1 UTF16-encoded characters. */ |
485 | 9.01k | # define FAST_CHAR_MASK (UCS2_REPEAT_MASK * (0xFFFFu & ~STRINGLIB_MAX_CHAR)) |
486 | | #else |
487 | | /* The mask for fast checking of whether a C 'long' may contain |
488 | | UTF16-encoded surrogate characters. This is an efficient heuristic, |
489 | | assuming that non-surrogate characters with a code point >= 0x8000 are |
490 | | rare in most input. |
491 | | */ |
492 | 292k | # define FAST_CHAR_MASK (UCS2_REPEAT_MASK * 0x8000u) |
493 | | #endif |
494 | | /* The mask for fast byte-swapping. */ |
495 | 30.7k | #define STRIPPED_MASK (UCS2_REPEAT_MASK * 0x00FFu) |
496 | | /* Swap bytes. */ |
497 | 15.3k | #define SWAB(value) ((((value) >> 8) & STRIPPED_MASK) | \ |
498 | 15.3k | (((value) & STRIPPED_MASK) << 8)) |
499 | | |
500 | | Py_LOCAL_INLINE(Py_UCS4) |
501 | | STRINGLIB(utf16_decode)(const unsigned char **inptr, const unsigned char *e, |
502 | | STRINGLIB_CHAR *dest, Py_ssize_t *outpos, |
503 | | int native_ordering) |
504 | 39.7k | { |
505 | 39.7k | Py_UCS4 ch; |
506 | 39.7k | const unsigned char *q = *inptr; |
507 | 39.7k | STRINGLIB_CHAR *p = dest + *outpos; |
508 | | /* Offsets from q for retrieving byte pairs in the right order. */ |
509 | 39.7k | #if PY_LITTLE_ENDIAN |
510 | 39.7k | int ihi = !!native_ordering, ilo = !native_ordering; |
511 | | #else |
512 | | int ihi = !native_ordering, ilo = !!native_ordering; |
513 | | #endif |
514 | 39.7k | --e; |
515 | | |
516 | 215k | while (q < e) { |
517 | 210k | Py_UCS4 ch2; |
518 | | /* First check for possible aligned read of a C 'long'. Unaligned |
519 | | reads are more expensive, better to defer to another iteration. */ |
520 | 210k | if (_Py_IS_ALIGNED(q, ALIGNOF_LONG)) { |
521 | | /* Fast path for runs of in-range non-surrogate chars. */ |
522 | 57.4k | const unsigned char *_q = q; |
523 | 324k | while (_q + SIZEOF_LONG <= e) { |
524 | 314k | unsigned long block = * (const unsigned long *) _q; |
525 | 314k | if (native_ordering) { |
526 | | /* Can use buffer directly */ |
527 | 301k | if (block & FAST_CHAR_MASK) |
528 | 38.9k | break; |
529 | 301k | } |
530 | 12.4k | else { |
531 | | /* Need to byte-swap */ |
532 | 12.4k | if (block & SWAB(FAST_CHAR_MASK)) |
533 | 8.42k | break; |
534 | | #if STRINGLIB_SIZEOF_CHAR == 1 |
535 | 1.09k | block >>= 8; |
536 | | #else |
537 | 2.92k | block = SWAB(block); |
538 | | #endif |
539 | 2.92k | } |
540 | 267k | #if PY_LITTLE_ENDIAN |
541 | | # if SIZEOF_LONG == 4 |
542 | | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); |
543 | | p[1] = (STRINGLIB_CHAR)(block >> 16); |
544 | | # elif SIZEOF_LONG == 8 |
545 | 267k | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); |
546 | 267k | p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); |
547 | 267k | p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); |
548 | 267k | p[3] = (STRINGLIB_CHAR)(block >> 48); |
549 | 267k | # endif |
550 | | #else |
551 | | # if SIZEOF_LONG == 4 |
552 | | p[0] = (STRINGLIB_CHAR)(block >> 16); |
553 | | p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu); |
554 | | # elif SIZEOF_LONG == 8 |
555 | | p[0] = (STRINGLIB_CHAR)(block >> 48); |
556 | | p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); |
557 | | p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); |
558 | | p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu); |
559 | | # endif |
560 | | #endif |
561 | 267k | _q += SIZEOF_LONG; |
562 | 267k | p += SIZEOF_LONG / 2; |
563 | 267k | } |
564 | 57.4k | q = _q; |
565 | 57.4k | if (q >= e) |
566 | 957 | break; |
567 | 57.4k | } |
568 | | |
569 | 209k | ch = (q[ihi] << 8) | q[ilo]; |
570 | 209k | q += 2; |
571 | 209k | if (!Py_UNICODE_IS_SURROGATE(ch)) { |
572 | | #if STRINGLIB_SIZEOF_CHAR < 2 |
573 | 22.2k | if (ch > STRINGLIB_MAX_CHAR) |
574 | | /* Out-of-range */ |
575 | 10.7k | goto Return; |
576 | 11.5k | #endif |
577 | 11.5k | *p++ = (STRINGLIB_CHAR)ch; |
578 | 11.5k | continue; |
579 | 185k | } |
580 | | |
581 | | /* UTF-16 code pair: */ |
582 | 24.1k | if (!Py_UNICODE_IS_HIGH_SURROGATE(ch)) |
583 | 11.6k | goto IllegalEncoding; |
584 | 12.5k | if (q >= e) |
585 | 986 | goto UnexpectedEnd; |
586 | 11.5k | ch2 = (q[ihi] << 8) | q[ilo]; |
587 | 11.5k | q += 2; |
588 | 11.5k | if (!Py_UNICODE_IS_LOW_SURROGATE(ch2)) |
589 | 8.27k | goto IllegalSurrogate; |
590 | 3.25k | ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2); |
591 | | #if STRINGLIB_SIZEOF_CHAR < 4 |
592 | | /* Out-of-range */ |
593 | 2.56k | goto Return; |
594 | | #else |
595 | | *p++ = (STRINGLIB_CHAR)ch; |
596 | | #endif |
597 | 687 | } |
598 | 5.62k | ch = 0; |
599 | 39.7k | Return: |
600 | 39.7k | *inptr = q; |
601 | 39.7k | *outpos = p - dest; |
602 | 39.7k | return ch; |
603 | 986 | UnexpectedEnd: |
604 | 986 | ch = 1; |
605 | 986 | goto Return; |
606 | 11.6k | IllegalEncoding: |
607 | 11.6k | ch = 2; |
608 | 11.6k | goto Return; |
609 | 8.27k | IllegalSurrogate: |
610 | 8.27k | ch = 3; |
611 | 8.27k | goto Return; |
612 | 5.62k | } unicodeobject.c:asciilib_utf16_decode Line | Count | Source | 504 | 12.0k | { | 505 | 12.0k | Py_UCS4 ch; | 506 | 12.0k | const unsigned char *q = *inptr; | 507 | 12.0k | STRINGLIB_CHAR *p = dest + *outpos; | 508 | | /* Offsets from q for retrieving byte pairs in the right order. */ | 509 | 12.0k | #if PY_LITTLE_ENDIAN | 510 | 12.0k | int ihi = !!native_ordering, ilo = !native_ordering; | 511 | | #else | 512 | | int ihi = !native_ordering, ilo = !!native_ordering; | 513 | | #endif | 514 | 12.0k | --e; | 515 | | | 516 | 19.3k | while (q < e) { | 517 | 18.8k | Py_UCS4 ch2; | 518 | | /* First check for possible aligned read of a C 'long'. Unaligned | 519 | | reads are more expensive, better to defer to another iteration. */ | 520 | 18.8k | if (_Py_IS_ALIGNED(q, ALIGNOF_LONG)) { | 521 | | /* Fast path for runs of in-range non-surrogate chars. */ | 522 | 11.3k | const unsigned char *_q = q; | 523 | 14.2k | while (_q + SIZEOF_LONG <= e) { | 524 | 9.16k | unsigned long block = * (const unsigned long *) _q; | 525 | 9.16k | if (native_ordering) { | 526 | | /* Can use buffer directly */ | 527 | 7.22k | if (block & FAST_CHAR_MASK) | 528 | 5.05k | break; | 529 | 7.22k | } | 530 | 1.94k | else { | 531 | | /* Need to byte-swap */ | 532 | 1.94k | if (block & SWAB(FAST_CHAR_MASK)) | 533 | 1.17k | break; | 534 | 771 | #if STRINGLIB_SIZEOF_CHAR == 1 | 535 | 771 | block >>= 8; | 536 | | #else | 537 | | block = SWAB(block); | 538 | | #endif | 539 | 771 | } | 540 | 2.93k | #if PY_LITTLE_ENDIAN | 541 | | # if SIZEOF_LONG == 4 | 542 | | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 543 | | p[1] = (STRINGLIB_CHAR)(block >> 16); | 544 | | # elif SIZEOF_LONG == 8 | 545 | 2.93k | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 546 | 2.93k | p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); | 547 | 2.93k | p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); | 548 | 2.93k | p[3] = (STRINGLIB_CHAR)(block >> 48); | 549 | 2.93k | # endif | 550 | | #else | 551 | | # if SIZEOF_LONG == 4 | 
552 | | p[0] = (STRINGLIB_CHAR)(block >> 16); | 553 | | p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 554 | | # elif SIZEOF_LONG == 8 | 555 | | p[0] = (STRINGLIB_CHAR)(block >> 48); | 556 | | p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); | 557 | | p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); | 558 | | p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 559 | | # endif | 560 | | #endif | 561 | 2.93k | _q += SIZEOF_LONG; | 562 | 2.93k | p += SIZEOF_LONG / 2; | 563 | 2.93k | } | 564 | 11.3k | q = _q; | 565 | 11.3k | if (q >= e) | 566 | 308 | break; | 567 | 11.3k | } | 568 | | | 569 | 18.5k | ch = (q[ihi] << 8) | q[ilo]; | 570 | 18.5k | q += 2; | 571 | 18.5k | if (!Py_UNICODE_IS_SURROGATE(ch)) { | 572 | 16.8k | #if STRINGLIB_SIZEOF_CHAR < 2 | 573 | 16.8k | if (ch > STRINGLIB_MAX_CHAR) | 574 | | /* Out-of-range */ | 575 | 9.56k | goto Return; | 576 | 7.29k | #endif | 577 | 7.29k | *p++ = (STRINGLIB_CHAR)ch; | 578 | 7.29k | continue; | 579 | 16.8k | } | 580 | | | 581 | | /* UTF-16 code pair: */ | 582 | 1.68k | if (!Py_UNICODE_IS_HIGH_SURROGATE(ch)) | 583 | 783 | goto IllegalEncoding; | 584 | 897 | if (q >= e) | 585 | 250 | goto UnexpectedEnd; | 586 | 647 | ch2 = (q[ihi] << 8) | q[ilo]; | 587 | 647 | q += 2; | 588 | 647 | if (!Py_UNICODE_IS_LOW_SURROGATE(ch2)) | 589 | 283 | goto IllegalSurrogate; | 590 | 364 | ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2); | 591 | 364 | #if STRINGLIB_SIZEOF_CHAR < 4 | 592 | | /* Out-of-range */ | 593 | 364 | goto Return; | 594 | | #else | 595 | | *p++ = (STRINGLIB_CHAR)ch; | 596 | | #endif | 597 | 647 | } | 598 | 782 | ch = 0; | 599 | 12.0k | Return: | 600 | 12.0k | *inptr = q; | 601 | 12.0k | *outpos = p - dest; | 602 | 12.0k | return ch; | 603 | 250 | UnexpectedEnd: | 604 | 250 | ch = 1; | 605 | 250 | goto Return; | 606 | 783 | IllegalEncoding: | 607 | 783 | ch = 2; | 608 | 783 | goto Return; | 609 | 283 | IllegalSurrogate: | 610 | 283 | ch = 3; | 611 | 283 | goto Return; | 612 | 782 | } |
unicodeobject.c:ucs1lib_utf16_decode Line | Count | Source | 504 | 2.52k | { | 505 | 2.52k | Py_UCS4 ch; | 506 | 2.52k | const unsigned char *q = *inptr; | 507 | 2.52k | STRINGLIB_CHAR *p = dest + *outpos; | 508 | | /* Offsets from q for retrieving byte pairs in the right order. */ | 509 | 2.52k | #if PY_LITTLE_ENDIAN | 510 | 2.52k | int ihi = !!native_ordering, ilo = !native_ordering; | 511 | | #else | 512 | | int ihi = !native_ordering, ilo = !!native_ordering; | 513 | | #endif | 514 | 2.52k | --e; | 515 | | | 516 | 6.79k | while (q < e) { | 517 | 6.63k | Py_UCS4 ch2; | 518 | | /* First check for possible aligned read of a C 'long'. Unaligned | 519 | | reads are more expensive, better to defer to another iteration. */ | 520 | 6.63k | if (_Py_IS_ALIGNED(q, ALIGNOF_LONG)) { | 521 | | /* Fast path for runs of in-range non-surrogate chars. */ | 522 | 1.20k | const unsigned char *_q = q; | 523 | 2.48k | while (_q + SIZEOF_LONG <= e) { | 524 | 2.26k | unsigned long block = * (const unsigned long *) _q; | 525 | 2.26k | if (native_ordering) { | 526 | | /* Can use buffer directly */ | 527 | 1.79k | if (block & FAST_CHAR_MASK) | 528 | 839 | break; | 529 | 1.79k | } | 530 | 474 | else { | 531 | | /* Need to byte-swap */ | 532 | 474 | if (block & SWAB(FAST_CHAR_MASK)) | 533 | 152 | break; | 534 | 322 | #if STRINGLIB_SIZEOF_CHAR == 1 | 535 | 322 | block >>= 8; | 536 | | #else | 537 | | block = SWAB(block); | 538 | | #endif | 539 | 322 | } | 540 | 1.27k | #if PY_LITTLE_ENDIAN | 541 | | # if SIZEOF_LONG == 4 | 542 | | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 543 | | p[1] = (STRINGLIB_CHAR)(block >> 16); | 544 | | # elif SIZEOF_LONG == 8 | 545 | 1.27k | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 546 | 1.27k | p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); | 547 | 1.27k | p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); | 548 | 1.27k | p[3] = (STRINGLIB_CHAR)(block >> 48); | 549 | 1.27k | # endif | 550 | | #else | 551 | | # if SIZEOF_LONG == 4 | 552 | | p[0] = 
(STRINGLIB_CHAR)(block >> 16); | 553 | | p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 554 | | # elif SIZEOF_LONG == 8 | 555 | | p[0] = (STRINGLIB_CHAR)(block >> 48); | 556 | | p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); | 557 | | p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); | 558 | | p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 559 | | # endif | 560 | | #endif | 561 | 1.27k | _q += SIZEOF_LONG; | 562 | 1.27k | p += SIZEOF_LONG / 2; | 563 | 1.27k | } | 564 | 1.20k | q = _q; | 565 | 1.20k | if (q >= e) | 566 | 137 | break; | 567 | 1.20k | } | 568 | | | 569 | 6.49k | ch = (q[ihi] << 8) | q[ilo]; | 570 | 6.49k | q += 2; | 571 | 6.49k | if (!Py_UNICODE_IS_SURROGATE(ch)) { | 572 | 5.40k | #if STRINGLIB_SIZEOF_CHAR < 2 | 573 | 5.40k | if (ch > STRINGLIB_MAX_CHAR) | 574 | | /* Out-of-range */ | 575 | 1.14k | goto Return; | 576 | 4.26k | #endif | 577 | 4.26k | *p++ = (STRINGLIB_CHAR)ch; | 578 | 4.26k | continue; | 579 | 5.40k | } | 580 | | | 581 | | /* UTF-16 code pair: */ | 582 | 1.09k | if (!Py_UNICODE_IS_HIGH_SURROGATE(ch)) | 583 | 133 | goto IllegalEncoding; | 584 | 961 | if (q >= e) | 585 | 77 | goto UnexpectedEnd; | 586 | 884 | ch2 = (q[ihi] << 8) | q[ilo]; | 587 | 884 | q += 2; | 588 | 884 | if (!Py_UNICODE_IS_LOW_SURROGATE(ch2)) | 589 | 632 | goto IllegalSurrogate; | 590 | 252 | ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2); | 591 | 252 | #if STRINGLIB_SIZEOF_CHAR < 4 | 592 | | /* Out-of-range */ | 593 | 252 | goto Return; | 594 | | #else | 595 | | *p++ = (STRINGLIB_CHAR)ch; | 596 | | #endif | 597 | 884 | } | 598 | 291 | ch = 0; | 599 | 2.52k | Return: | 600 | 2.52k | *inptr = q; | 601 | 2.52k | *outpos = p - dest; | 602 | 2.52k | return ch; | 603 | 77 | UnexpectedEnd: | 604 | 77 | ch = 1; | 605 | 77 | goto Return; | 606 | 133 | IllegalEncoding: | 607 | 133 | ch = 2; | 608 | 133 | goto Return; | 609 | 632 | IllegalSurrogate: | 610 | 632 | ch = 3; | 611 | 632 | goto Return; | 612 | 291 | } |
unicodeobject.c:ucs2lib_utf16_decode Line | Count | Source | 504 | 9.80k | { | 505 | 9.80k | Py_UCS4 ch; | 506 | 9.80k | const unsigned char *q = *inptr; | 507 | 9.80k | STRINGLIB_CHAR *p = dest + *outpos; | 508 | | /* Offsets from q for retrieving byte pairs in the right order. */ | 509 | 9.80k | #if PY_LITTLE_ENDIAN | 510 | 9.80k | int ihi = !!native_ordering, ilo = !native_ordering; | 511 | | #else | 512 | | int ihi = !native_ordering, ilo = !!native_ordering; | 513 | | #endif | 514 | 9.80k | --e; | 515 | | | 516 | 127k | while (q < e) { | 517 | 124k | Py_UCS4 ch2; | 518 | | /* First check for possible aligned read of a C 'long'. Unaligned | 519 | | reads are more expensive, better to defer to another iteration. */ | 520 | 124k | if (_Py_IS_ALIGNED(q, ALIGNOF_LONG)) { | 521 | | /* Fast path for runs of in-range non-surrogate chars. */ | 522 | 29.4k | const unsigned char *_q = q; | 523 | 287k | while (_q + SIZEOF_LONG <= e) { | 524 | 284k | unsigned long block = * (const unsigned long *) _q; | 525 | 284k | if (native_ordering) { | 526 | | /* Can use buffer directly */ | 527 | 276k | if (block & FAST_CHAR_MASK) | 528 | 20.1k | break; | 529 | 276k | } | 530 | 7.90k | else { | 531 | | /* Need to byte-swap */ | 532 | 7.90k | if (block & SWAB(FAST_CHAR_MASK)) | 533 | 5.98k | break; | 534 | | #if STRINGLIB_SIZEOF_CHAR == 1 | 535 | | block >>= 8; | 536 | | #else | 537 | 1.92k | block = SWAB(block); | 538 | 1.92k | #endif | 539 | 1.92k | } | 540 | 258k | #if PY_LITTLE_ENDIAN | 541 | | # if SIZEOF_LONG == 4 | 542 | | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 543 | | p[1] = (STRINGLIB_CHAR)(block >> 16); | 544 | | # elif SIZEOF_LONG == 8 | 545 | 258k | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 546 | 258k | p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); | 547 | 258k | p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); | 548 | 258k | p[3] = (STRINGLIB_CHAR)(block >> 48); | 549 | 258k | # endif | 550 | | #else | 551 | | # if SIZEOF_LONG == 4 | 552 | | p[0] = 
(STRINGLIB_CHAR)(block >> 16); | 553 | | p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 554 | | # elif SIZEOF_LONG == 8 | 555 | | p[0] = (STRINGLIB_CHAR)(block >> 48); | 556 | | p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); | 557 | | p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); | 558 | | p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 559 | | # endif | 560 | | #endif | 561 | 258k | _q += SIZEOF_LONG; | 562 | 258k | p += SIZEOF_LONG / 2; | 563 | 258k | } | 564 | 29.4k | q = _q; | 565 | 29.4k | if (q >= e) | 566 | 401 | break; | 567 | 29.4k | } | 568 | | | 569 | 123k | ch = (q[ihi] << 8) | q[ilo]; | 570 | 123k | q += 2; | 571 | 123k | if (!Py_UNICODE_IS_SURROGATE(ch)) { | 572 | | #if STRINGLIB_SIZEOF_CHAR < 2 | 573 | | if (ch > STRINGLIB_MAX_CHAR) | 574 | | /* Out-of-range */ | 575 | | goto Return; | 576 | | #endif | 577 | 117k | *p++ = (STRINGLIB_CHAR)ch; | 578 | 117k | continue; | 579 | 117k | } | 580 | | | 581 | | /* UTF-16 code pair: */ | 582 | 6.17k | if (!Py_UNICODE_IS_HIGH_SURROGATE(ch)) | 583 | 2.01k | goto IllegalEncoding; | 584 | 4.16k | if (q >= e) | 585 | 274 | goto UnexpectedEnd; | 586 | 3.89k | ch2 = (q[ihi] << 8) | q[ilo]; | 587 | 3.89k | q += 2; | 588 | 3.89k | if (!Py_UNICODE_IS_LOW_SURROGATE(ch2)) | 589 | 1.94k | goto IllegalSurrogate; | 590 | 1.94k | ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2); | 591 | 1.94k | #if STRINGLIB_SIZEOF_CHAR < 4 | 592 | | /* Out-of-range */ | 593 | 1.94k | goto Return; | 594 | | #else | 595 | | *p++ = (STRINGLIB_CHAR)ch; | 596 | | #endif | 597 | 3.89k | } | 598 | 3.62k | ch = 0; | 599 | 9.80k | Return: | 600 | 9.80k | *inptr = q; | 601 | 9.80k | *outpos = p - dest; | 602 | 9.80k | return ch; | 603 | 274 | UnexpectedEnd: | 604 | 274 | ch = 1; | 605 | 274 | goto Return; | 606 | 2.01k | IllegalEncoding: | 607 | 2.01k | ch = 2; | 608 | 2.01k | goto Return; | 609 | 1.94k | IllegalSurrogate: | 610 | 1.94k | ch = 3; | 611 | 1.94k | goto Return; | 612 | 3.62k | } |
unicodeobject.c:ucs4lib_utf16_decode Line | Count | Source | 504 | 15.4k | { | 505 | 15.4k | Py_UCS4 ch; | 506 | 15.4k | const unsigned char *q = *inptr; | 507 | 15.4k | STRINGLIB_CHAR *p = dest + *outpos; | 508 | | /* Offsets from q for retrieving byte pairs in the right order. */ | 509 | 15.4k | #if PY_LITTLE_ENDIAN | 510 | 15.4k | int ihi = !!native_ordering, ilo = !native_ordering; | 511 | | #else | 512 | | int ihi = !native_ordering, ilo = !!native_ordering; | 513 | | #endif | 514 | 15.4k | --e; | 515 | | | 516 | 61.6k | while (q < e) { | 517 | 60.8k | Py_UCS4 ch2; | 518 | | /* First check for possible aligned read of a C 'long'. Unaligned | 519 | | reads are more expensive, better to defer to another iteration. */ | 520 | 60.8k | if (_Py_IS_ALIGNED(q, ALIGNOF_LONG)) { | 521 | | /* Fast path for runs of in-range non-surrogate chars. */ | 522 | 15.4k | const unsigned char *_q = q; | 523 | 20.0k | while (_q + SIZEOF_LONG <= e) { | 524 | 18.6k | unsigned long block = * (const unsigned long *) _q; | 525 | 18.6k | if (native_ordering) { | 526 | | /* Can use buffer directly */ | 527 | 16.5k | if (block & FAST_CHAR_MASK) | 528 | 12.9k | break; | 529 | 16.5k | } | 530 | 2.11k | else { | 531 | | /* Need to byte-swap */ | 532 | 2.11k | if (block & SWAB(FAST_CHAR_MASK)) | 533 | 1.11k | break; | 534 | | #if STRINGLIB_SIZEOF_CHAR == 1 | 535 | | block >>= 8; | 536 | | #else | 537 | 994 | block = SWAB(block); | 538 | 994 | #endif | 539 | 994 | } | 540 | 4.57k | #if PY_LITTLE_ENDIAN | 541 | | # if SIZEOF_LONG == 4 | 542 | | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 543 | | p[1] = (STRINGLIB_CHAR)(block >> 16); | 544 | | # elif SIZEOF_LONG == 8 | 545 | 4.57k | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 546 | 4.57k | p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); | 547 | 4.57k | p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); | 548 | 4.57k | p[3] = (STRINGLIB_CHAR)(block >> 48); | 549 | 4.57k | # endif | 550 | | #else | 551 | | # if SIZEOF_LONG == 4 | 552 | | p[0] = 
(STRINGLIB_CHAR)(block >> 16); | 553 | | p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 554 | | # elif SIZEOF_LONG == 8 | 555 | | p[0] = (STRINGLIB_CHAR)(block >> 48); | 556 | | p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); | 557 | | p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); | 558 | | p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 559 | | # endif | 560 | | #endif | 561 | 4.57k | _q += SIZEOF_LONG; | 562 | 4.57k | p += SIZEOF_LONG / 2; | 563 | 4.57k | } | 564 | 15.4k | q = _q; | 565 | 15.4k | if (q >= e) | 566 | 111 | break; | 567 | 15.4k | } | 568 | | | 569 | 60.7k | ch = (q[ihi] << 8) | q[ilo]; | 570 | 60.7k | q += 2; | 571 | 60.7k | if (!Py_UNICODE_IS_SURROGATE(ch)) { | 572 | | #if STRINGLIB_SIZEOF_CHAR < 2 | 573 | | if (ch > STRINGLIB_MAX_CHAR) | 574 | | /* Out-of-range */ | 575 | | goto Return; | 576 | | #endif | 577 | 45.5k | *p++ = (STRINGLIB_CHAR)ch; | 578 | 45.5k | continue; | 579 | 45.5k | } | 580 | | | 581 | | /* UTF-16 code pair: */ | 582 | 15.1k | if (!Py_UNICODE_IS_HIGH_SURROGATE(ch)) | 583 | 8.69k | goto IllegalEncoding; | 584 | 6.49k | if (q >= e) | 585 | 385 | goto UnexpectedEnd; | 586 | 6.10k | ch2 = (q[ihi] << 8) | q[ilo]; | 587 | 6.10k | q += 2; | 588 | 6.10k | if (!Py_UNICODE_IS_LOW_SURROGATE(ch2)) | 589 | 5.41k | goto IllegalSurrogate; | 590 | 687 | ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2); | 591 | | #if STRINGLIB_SIZEOF_CHAR < 4 | 592 | | /* Out-of-range */ | 593 | | goto Return; | 594 | | #else | 595 | 687 | *p++ = (STRINGLIB_CHAR)ch; | 596 | 687 | #endif | 597 | 687 | } | 598 | 923 | ch = 0; | 599 | 15.4k | Return: | 600 | 15.4k | *inptr = q; | 601 | 15.4k | *outpos = p - dest; | 602 | 15.4k | return ch; | 603 | 385 | UnexpectedEnd: | 604 | 385 | ch = 1; | 605 | 385 | goto Return; | 606 | 8.69k | IllegalEncoding: | 607 | 8.69k | ch = 2; | 608 | 8.69k | goto Return; | 609 | 5.41k | IllegalSurrogate: | 610 | 5.41k | ch = 3; | 611 | 5.41k | goto Return; | 612 | 923 | } |
|
613 | | #undef UCS2_REPEAT_MASK |
614 | | #undef FAST_CHAR_MASK |
615 | | #undef STRIPPED_MASK |
616 | | #undef SWAB |
617 | | |
618 | | |
619 | | #if STRINGLIB_MAX_CHAR >= 0x80 |
620 | | Py_LOCAL_INLINE(Py_ssize_t) |
621 | | STRINGLIB(utf16_encode)(const STRINGLIB_CHAR *in, |
622 | | Py_ssize_t len, |
623 | | unsigned short **outptr, |
624 | | int native_ordering) |
625 | 0 | { |
626 | 0 | unsigned short *out = *outptr; |
627 | 0 | const STRINGLIB_CHAR *end = in + len; |
628 | | #if STRINGLIB_SIZEOF_CHAR == 1 |
629 | 0 | if (native_ordering) { |
630 | 0 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); |
631 | 0 | while (in < unrolled_end) { |
632 | 0 | out[0] = in[0]; |
633 | 0 | out[1] = in[1]; |
634 | 0 | out[2] = in[2]; |
635 | 0 | out[3] = in[3]; |
636 | 0 | in += 4; out += 4; |
637 | 0 | } |
638 | 0 | while (in < end) { |
639 | 0 | *out++ = *in++; |
640 | 0 | } |
641 | 0 | } else { |
642 | 0 | # define SWAB2(CH) ((CH) << 8) /* high byte is zero */ |
643 | 0 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); |
644 | 0 | while (in < unrolled_end) { |
645 | 0 | out[0] = SWAB2(in[0]); |
646 | 0 | out[1] = SWAB2(in[1]); |
647 | 0 | out[2] = SWAB2(in[2]); |
648 | 0 | out[3] = SWAB2(in[3]); |
649 | 0 | in += 4; out += 4; |
650 | 0 | } |
651 | 0 | while (in < end) { |
652 | 0 | Py_UCS4 ch = *in++; |
653 | 0 | *out++ = SWAB2((Py_UCS2)ch); |
654 | 0 | } |
655 | 0 | #undef SWAB2 |
656 | 0 | } |
657 | | *outptr = out; |
658 | | return len; |
659 | | #else |
660 | 0 | if (native_ordering) { |
661 | | #if STRINGLIB_MAX_CHAR < 0x10000 |
662 | 0 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); |
663 | 0 | while (in < unrolled_end) { |
664 | | /* check if any character is a surrogate character */ |
665 | 0 | if (((in[0] ^ 0xd800) & |
666 | 0 | (in[1] ^ 0xd800) & |
667 | 0 | (in[2] ^ 0xd800) & |
668 | 0 | (in[3] ^ 0xd800) & 0xf800) == 0) |
669 | 0 | break; |
670 | 0 | out[0] = in[0]; |
671 | 0 | out[1] = in[1]; |
672 | 0 | out[2] = in[2]; |
673 | 0 | out[3] = in[3]; |
674 | 0 | in += 4; out += 4; |
675 | 0 | } |
676 | | #endif |
677 | 0 | while (in < end) { |
678 | 0 | Py_UCS4 ch; |
679 | 0 | ch = *in++; |
680 | 0 | if (ch < 0xd800) |
681 | 0 | *out++ = ch; |
682 | 0 | else if (ch < 0xe000) |
683 | | /* reject surrogate characters (U+D800-U+DFFF) */ |
684 | 0 | goto fail; |
685 | | #if STRINGLIB_MAX_CHAR >= 0x10000 |
686 | 0 | else if (ch >= 0x10000) { |
687 | 0 | out[0] = Py_UNICODE_HIGH_SURROGATE(ch); |
688 | 0 | out[1] = Py_UNICODE_LOW_SURROGATE(ch); |
689 | 0 | out += 2; |
690 | 0 | } |
691 | 0 | #endif |
692 | 0 | else |
693 | 0 | *out++ = ch; |
694 | 0 | } |
695 | 0 | } else { |
696 | 0 | #define SWAB2(CH) (((CH) << 8) | ((CH) >> 8)) |
697 | | #if STRINGLIB_MAX_CHAR < 0x10000 |
698 | 0 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); |
699 | 0 | while (in < unrolled_end) { |
700 | | /* check if any character is a surrogate character */ |
701 | 0 | if (((in[0] ^ 0xd800) & |
702 | 0 | (in[1] ^ 0xd800) & |
703 | 0 | (in[2] ^ 0xd800) & |
704 | 0 | (in[3] ^ 0xd800) & 0xf800) == 0) |
705 | 0 | break; |
706 | 0 | out[0] = SWAB2(in[0]); |
707 | 0 | out[1] = SWAB2(in[1]); |
708 | 0 | out[2] = SWAB2(in[2]); |
709 | 0 | out[3] = SWAB2(in[3]); |
710 | 0 | in += 4; out += 4; |
711 | 0 | } |
712 | | #endif |
713 | 0 | while (in < end) { |
714 | 0 | Py_UCS4 ch = *in++; |
715 | 0 | if (ch < 0xd800) |
716 | 0 | *out++ = SWAB2((Py_UCS2)ch); |
717 | 0 | else if (ch < 0xe000) |
718 | | /* reject surrogate characters (U+D800-U+DFFF) */ |
719 | 0 | goto fail; |
720 | | #if STRINGLIB_MAX_CHAR >= 0x10000 |
721 | 0 | else if (ch >= 0x10000) { |
722 | 0 | Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch); |
723 | 0 | Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch); |
724 | 0 | out[0] = SWAB2(ch1); |
725 | 0 | out[1] = SWAB2(ch2); |
726 | 0 | out += 2; |
727 | 0 | } |
728 | 0 | #endif |
729 | 0 | else |
730 | 0 | *out++ = SWAB2((Py_UCS2)ch); |
731 | 0 | } |
732 | 0 | #undef SWAB2 |
733 | 0 | } |
734 | 0 | *outptr = out; |
735 | 0 | return len; |
736 | 0 | fail: |
737 | 0 | *outptr = out; |
738 | 0 | return len - (end - in + 1); |
739 | | #endif |
740 | 0 | } Unexecuted instantiation: unicodeobject.c:ucs1lib_utf16_encode Unexecuted instantiation: unicodeobject.c:ucs2lib_utf16_encode Unexecuted instantiation: unicodeobject.c:ucs4lib_utf16_encode |
741 | | |
742 | | static inline uint32_t |
743 | | STRINGLIB(SWAB4)(STRINGLIB_CHAR ch) |
744 | 0 | { |
745 | 0 | uint32_t word = ch; |
746 | | #if STRINGLIB_SIZEOF_CHAR == 1 |
747 | | /* high bytes are zero */ |
748 | | return (word << 24); |
749 | | #elif STRINGLIB_SIZEOF_CHAR == 2 |
750 | | /* high bytes are zero */ |
751 | | return ((word & 0x00FFu) << 24) | ((word & 0xFF00u) << 8); |
752 | | #else |
753 | | return _Py_bswap32(word); |
754 | | #endif |
755 | 0 | } Unexecuted instantiation: unicodeobject.c:ucs1lib_SWAB4 Unexecuted instantiation: unicodeobject.c:ucs2lib_SWAB4 Unexecuted instantiation: unicodeobject.c:ucs4lib_SWAB4 |
756 | | |
757 | | Py_LOCAL_INLINE(Py_ssize_t) |
758 | | STRINGLIB(utf32_encode)(const STRINGLIB_CHAR *in, |
759 | | Py_ssize_t len, |
760 | | uint32_t **outptr, |
761 | | int native_ordering) |
762 | 0 | { |
763 | 0 | uint32_t *out = *outptr; |
764 | 0 | const STRINGLIB_CHAR *end = in + len; |
765 | 0 | if (native_ordering) { |
766 | 0 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); |
767 | 0 | while (in < unrolled_end) { |
768 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
769 | | /* check if any character is a surrogate character */ |
770 | 0 | if (((in[0] ^ 0xd800) & |
771 | 0 | (in[1] ^ 0xd800) & |
772 | 0 | (in[2] ^ 0xd800) & |
773 | 0 | (in[3] ^ 0xd800) & 0xf800) == 0) |
774 | 0 | break; |
775 | 0 | #endif |
776 | 0 | out[0] = in[0]; |
777 | 0 | out[1] = in[1]; |
778 | 0 | out[2] = in[2]; |
779 | 0 | out[3] = in[3]; |
780 | 0 | in += 4; out += 4; |
781 | 0 | } |
782 | 0 | while (in < end) { |
783 | 0 | Py_UCS4 ch; |
784 | 0 | ch = *in++; |
785 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
786 | 0 | if (Py_UNICODE_IS_SURROGATE(ch)) { |
787 | | /* reject surrogate characters (U+D800-U+DFFF) */ |
788 | 0 | goto fail; |
789 | 0 | } |
790 | 0 | #endif |
791 | 0 | *out++ = ch; |
792 | 0 | } |
793 | 0 | } else { |
794 | 0 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); |
795 | 0 | while (in < unrolled_end) { |
796 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
797 | | /* check if any character is a surrogate character */ |
798 | 0 | if (((in[0] ^ 0xd800) & |
799 | 0 | (in[1] ^ 0xd800) & |
800 | 0 | (in[2] ^ 0xd800) & |
801 | 0 | (in[3] ^ 0xd800) & 0xf800) == 0) |
802 | 0 | break; |
803 | 0 | #endif |
804 | 0 | out[0] = STRINGLIB(SWAB4)(in[0]); |
805 | 0 | out[1] = STRINGLIB(SWAB4)(in[1]); |
806 | 0 | out[2] = STRINGLIB(SWAB4)(in[2]); |
807 | 0 | out[3] = STRINGLIB(SWAB4)(in[3]); |
808 | 0 | in += 4; out += 4; |
809 | 0 | } |
810 | 0 | while (in < end) { |
811 | 0 | Py_UCS4 ch = *in++; |
812 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
813 | 0 | if (Py_UNICODE_IS_SURROGATE(ch)) { |
814 | | /* reject surrogate characters (U+D800-U+DFFF) */ |
815 | 0 | goto fail; |
816 | 0 | } |
817 | 0 | #endif |
818 | 0 | *out++ = STRINGLIB(SWAB4)(ch); |
819 | 0 | } |
820 | 0 | } |
821 | 0 | *outptr = out; |
822 | 0 | return len; |
823 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
824 | 0 | fail: |
825 | 0 | *outptr = out; |
826 | 0 | return len - (end - in + 1); |
827 | | #endif |
828 | 0 | } Unexecuted instantiation: unicodeobject.c:ucs1lib_utf32_encode Unexecuted instantiation: unicodeobject.c:ucs2lib_utf32_encode Unexecuted instantiation: unicodeobject.c:ucs4lib_utf32_encode |
829 | | |
830 | | #endif |