/src/cpython/Objects/stringlib/codecs.h
Line | Count | Source (jump to first uncovered line) |
1 | | /* stringlib: codec implementations */ |
2 | | |
3 | | #if !STRINGLIB_IS_UNICODE |
4 | | # error "codecs.h is specific to Unicode" |
5 | | #endif |
6 | | |
7 | | #include "pycore_bitutils.h" // _Py_bswap32() |
8 | | |
9 | | /* Mask to quickly check whether a C 'size_t' contains a |
10 | | non-ASCII, UTF8-encoded char. */ |
11 | | #if (SIZEOF_SIZE_T == 8) |
12 | 301M | # define ASCII_CHAR_MASK 0x8080808080808080ULL |
13 | | #elif (SIZEOF_SIZE_T == 4) |
14 | | # define ASCII_CHAR_MASK 0x80808080U |
15 | | #else |
16 | | # error C 'size_t' size should be either 4 or 8! |
17 | | #endif |
18 | | |
/* True iff `ch` is a UTF-8 continuation byte: 10xxxxxx, i.e. 0x80..0xBF.

   The argument is evaluated exactly once, so an expression with side
   effects is safe to pass.  The unsigned subtraction wraps around for
   values below 0x80, which makes the single comparison equivalent to
   `ch >= 0x80 && ch < 0xC0`. */
#define IS_CONTINUATION_BYTE(ch) (((unsigned int)(ch) - 0x80u) < 0x40u)
21 | | |
22 | | Py_LOCAL_INLINE(Py_UCS4) |
23 | | STRINGLIB(utf8_decode)(const char **inptr, const char *end, |
24 | | STRINGLIB_CHAR *dest, |
25 | | Py_ssize_t *outpos) |
26 | 176M | { |
27 | 176M | Py_UCS4 ch; |
28 | 176M | const char *s = *inptr; |
29 | 176M | STRINGLIB_CHAR *p = dest + *outpos; |
30 | | |
31 | 361M | while (s < end) { |
32 | 361M | ch = (unsigned char)*s; |
33 | | |
34 | 361M | if (ch < 0x80) { |
35 | | /* Fast path for runs of ASCII characters. Given that common UTF-8 |
36 | | input will consist of an overwhelming majority of ASCII |
37 | | characters, we try to optimize for this case by checking |
38 | | as many characters as a C 'size_t' can contain. |
39 | | First, check if we can do an aligned read, as most CPUs have |
40 | | a penalty for unaligned reads. |
41 | | */ |
42 | 145M | if (_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) { |
43 | | /* Help register allocation */ |
44 | 18.7M | const char *_s = s; |
45 | 18.7M | STRINGLIB_CHAR *_p = p; |
46 | 301M | while (_s + SIZEOF_SIZE_T <= end) { |
47 | | /* Read a whole size_t at a time (either 4 or 8 bytes), |
48 | | and do a fast unrolled copy if it only contains ASCII |
49 | | characters. */ |
50 | 301M | size_t value = *(const size_t *) _s; |
51 | 301M | if (value & ASCII_CHAR_MASK) |
52 | 18.5M | break; |
53 | 282M | #if PY_LITTLE_ENDIAN |
54 | 282M | _p[0] = (STRINGLIB_CHAR)(value & 0xFFu); |
55 | 282M | _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); |
56 | 282M | _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); |
57 | 282M | _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); |
58 | 282M | # if SIZEOF_SIZE_T == 8 |
59 | 282M | _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); |
60 | 282M | _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); |
61 | 282M | _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); |
62 | 282M | _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); |
63 | 282M | # endif |
64 | | #else |
65 | | # if SIZEOF_SIZE_T == 8 |
66 | | _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); |
67 | | _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); |
68 | | _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); |
69 | | _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); |
70 | | _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); |
71 | | _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); |
72 | | _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); |
73 | | _p[7] = (STRINGLIB_CHAR)(value & 0xFFu); |
74 | | # else |
75 | | _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); |
76 | | _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); |
77 | | _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); |
78 | | _p[3] = (STRINGLIB_CHAR)(value & 0xFFu); |
79 | | # endif |
80 | | #endif |
81 | 282M | _s += SIZEOF_SIZE_T; |
82 | 282M | _p += SIZEOF_SIZE_T; |
83 | 282M | } |
84 | 18.7M | s = _s; |
85 | 18.7M | p = _p; |
86 | 18.7M | if (s == end) |
87 | 11.7k | break; |
88 | 18.7M | ch = (unsigned char)*s; |
89 | 18.7M | } |
90 | 145M | if (ch < 0x80) { |
91 | 144M | s++; |
92 | 144M | *p++ = ch; |
93 | 144M | continue; |
94 | 144M | } |
95 | 145M | } |
96 | | |
97 | 216M | if (ch < 0xE0) { |
98 | | /* \xC2\x80-\xDF\xBF -- 0080-07FF */ |
99 | 93.8M | Py_UCS4 ch2; |
100 | 93.8M | if (ch < 0xC2) { |
101 | | /* invalid sequence |
102 | | \x80-\xBF -- continuation byte |
103 | | \xC0-\xC1 -- fake 0000-007F */ |
104 | 67.6M | goto InvalidStart; |
105 | 67.6M | } |
106 | 26.1M | if (end - s < 2) { |
107 | | /* unexpected end of data: the caller will decide whether |
108 | | it's an error or not */ |
109 | 10.2k | break; |
110 | 10.2k | } |
111 | 26.1M | ch2 = (unsigned char)s[1]; |
112 | 26.1M | if (!IS_CONTINUATION_BYTE(ch2)) |
113 | | /* invalid continuation byte */ |
114 | 21.6M | goto InvalidContinuation1; |
115 | 4.50M | ch = (ch << 6) + ch2 - |
116 | 4.50M | ((0xC0 << 6) + 0x80); |
117 | 4.50M | assert ((ch > 0x007F) && (ch <= 0x07FF)); |
118 | 4.50M | s += 2; |
119 | 4.50M | if (STRINGLIB_MAX_CHAR <= 0x007F || |
120 | 4.50M | (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR)) |
121 | | /* Out-of-range */ |
122 | 78.7k | goto Return; |
123 | 4.42M | *p++ = ch; |
124 | 4.42M | continue; |
125 | 4.50M | } |
126 | | |
127 | 122M | if (ch < 0xF0) { |
128 | | /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */ |
129 | 47.5M | Py_UCS4 ch2, ch3; |
130 | 47.5M | if (end - s < 3) { |
131 | | /* unexpected end of data: the caller will decide whether |
132 | | it's an error or not */ |
133 | 12.2k | if (end - s < 2) |
134 | 4.28k | break; |
135 | 8.00k | ch2 = (unsigned char)s[1]; |
136 | 8.00k | if (!IS_CONTINUATION_BYTE(ch2) || |
137 | 8.00k | (ch2 < 0xA0 ? ch == 0xE0 : ch == 0xED)) |
138 | | /* for clarification see comments below */ |
139 | 5.64k | goto InvalidContinuation1; |
140 | 2.35k | break; |
141 | 8.00k | } |
142 | 47.5M | ch2 = (unsigned char)s[1]; |
143 | 47.5M | ch3 = (unsigned char)s[2]; |
144 | 47.5M | if (!IS_CONTINUATION_BYTE(ch2)) { |
145 | | /* invalid continuation byte */ |
146 | 10.6M | goto InvalidContinuation1; |
147 | 10.6M | } |
148 | 36.8M | if (ch == 0xE0) { |
149 | 106k | if (ch2 < 0xA0) |
150 | | /* invalid sequence |
151 | | \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */ |
152 | 44.6k | goto InvalidContinuation1; |
153 | 36.7M | } else if (ch == 0xED && ch2 >= 0xA0) { |
154 | | /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF |
155 | | will result in surrogates in range D800-DFFF. Surrogates are |
156 | | not valid UTF-8 so they are rejected. |
157 | | See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf |
158 | | (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ |
159 | 63.5k | goto InvalidContinuation1; |
160 | 63.5k | } |
161 | 36.7M | if (!IS_CONTINUATION_BYTE(ch3)) { |
162 | | /* invalid continuation byte */ |
163 | 853k | goto InvalidContinuation2; |
164 | 853k | } |
165 | 35.9M | ch = (ch << 12) + (ch2 << 6) + ch3 - |
166 | 35.9M | ((0xE0 << 12) + (0x80 << 6) + 0x80); |
167 | 35.9M | assert ((ch > 0x07FF) && (ch <= 0xFFFF)); |
168 | 35.9M | s += 3; |
169 | 35.9M | if (STRINGLIB_MAX_CHAR <= 0x07FF || |
170 | 35.9M | (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR)) |
171 | | /* Out-of-range */ |
172 | 164k | goto Return; |
173 | 35.7M | *p++ = ch; |
174 | 35.7M | continue; |
175 | 35.9M | } |
176 | | |
177 | 75.2M | if (ch < 0xF5) { |
178 | | /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */ |
179 | 4.46M | Py_UCS4 ch2, ch3, ch4; |
180 | 4.46M | if (end - s < 4) { |
181 | | /* unexpected end of data: the caller will decide whether |
182 | | it's an error or not */ |
183 | 16.6k | if (end - s < 2) |
184 | 3.69k | break; |
185 | 12.9k | ch2 = (unsigned char)s[1]; |
186 | 12.9k | if (!IS_CONTINUATION_BYTE(ch2) || |
187 | 12.9k | (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4)) |
188 | | /* for clarification see comments below */ |
189 | 8.04k | goto InvalidContinuation1; |
190 | 4.87k | if (end - s < 3) |
191 | 1.57k | break; |
192 | 3.30k | ch3 = (unsigned char)s[2]; |
193 | 3.30k | if (!IS_CONTINUATION_BYTE(ch3)) |
194 | 1.87k | goto InvalidContinuation2; |
195 | 1.43k | break; |
196 | 3.30k | } |
197 | 4.44M | ch2 = (unsigned char)s[1]; |
198 | 4.44M | ch3 = (unsigned char)s[2]; |
199 | 4.44M | ch4 = (unsigned char)s[3]; |
200 | 4.44M | if (!IS_CONTINUATION_BYTE(ch2)) { |
201 | | /* invalid continuation byte */ |
202 | 3.19M | goto InvalidContinuation1; |
203 | 3.19M | } |
204 | 1.25M | if (ch == 0xF0) { |
205 | 598k | if (ch2 < 0x90) |
206 | | /* invalid sequence |
207 | | \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */ |
208 | 39.6k | goto InvalidContinuation1; |
209 | 652k | } else if (ch == 0xF4 && ch2 >= 0x90) { |
210 | | /* invalid sequence |
211 | | \xF4\x90\x80\x80- -- 110000- overflow */ |
212 | 63.7k | goto InvalidContinuation1; |
213 | 63.7k | } |
214 | 1.14M | if (!IS_CONTINUATION_BYTE(ch3)) { |
215 | | /* invalid continuation byte */ |
216 | 370k | goto InvalidContinuation2; |
217 | 370k | } |
218 | 777k | if (!IS_CONTINUATION_BYTE(ch4)) { |
219 | | /* invalid continuation byte */ |
220 | 121k | goto InvalidContinuation3; |
221 | 121k | } |
222 | 655k | ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 - |
223 | 655k | ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80); |
224 | 655k | assert ((ch > 0xFFFF) && (ch <= 0x10FFFF)); |
225 | 655k | s += 4; |
226 | 655k | if (STRINGLIB_MAX_CHAR <= 0xFFFF || |
227 | 655k | (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR)) |
228 | | /* Out-of-range */ |
229 | 33.3k | goto Return; |
230 | 622k | *p++ = ch; |
231 | 622k | continue; |
232 | 655k | } |
233 | 70.8M | goto InvalidStart; |
234 | 75.2M | } |
235 | 314k | ch = 0; |
236 | 176M | Return: |
237 | 176M | *inptr = s; |
238 | 176M | *outpos = p - dest; |
239 | 176M | return ch; |
240 | 138M | InvalidStart: |
241 | 138M | ch = 1; |
242 | 138M | goto Return; |
243 | 35.6M | InvalidContinuation1: |
244 | 35.6M | ch = 2; |
245 | 35.6M | goto Return; |
246 | 1.22M | InvalidContinuation2: |
247 | 1.22M | ch = 3; |
248 | 1.22M | goto Return; |
249 | 121k | InvalidContinuation3: |
250 | 121k | ch = 4; |
251 | 121k | goto Return; |
252 | 314k | } unicodeobject.c:asciilib_utf8_decode Line | Count | Source | 26 | 287k | { | 27 | 287k | Py_UCS4 ch; | 28 | 287k | const char *s = *inptr; | 29 | 287k | STRINGLIB_CHAR *p = dest + *outpos; | 30 | | | 31 | 287k | while (s < end) { | 32 | 287k | ch = (unsigned char)*s; | 33 | | | 34 | 287k | if (ch < 0x80) { | 35 | | /* Fast path for runs of ASCII characters. Given that common UTF-8 | 36 | | input will consist of an overwhelming majority of ASCII | 37 | | characters, we try to optimize for this case by checking | 38 | | as many characters as a C 'size_t' can contain. | 39 | | First, check if we can do an aligned read, as most CPUs have | 40 | | a penalty for unaligned reads. | 41 | | */ | 42 | 0 | if (_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) { | 43 | | /* Help register allocation */ | 44 | 0 | const char *_s = s; | 45 | 0 | STRINGLIB_CHAR *_p = p; | 46 | 0 | while (_s + SIZEOF_SIZE_T <= end) { | 47 | | /* Read a whole size_t at a time (either 4 or 8 bytes), | 48 | | and do a fast unrolled copy if it only contains ASCII | 49 | | characters. 
*/ | 50 | 0 | size_t value = *(const size_t *) _s; | 51 | 0 | if (value & ASCII_CHAR_MASK) | 52 | 0 | break; | 53 | 0 | #if PY_LITTLE_ENDIAN | 54 | 0 | _p[0] = (STRINGLIB_CHAR)(value & 0xFFu); | 55 | 0 | _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 56 | 0 | _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 57 | 0 | _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 58 | 0 | # if SIZEOF_SIZE_T == 8 | 59 | 0 | _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 60 | 0 | _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 61 | 0 | _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 62 | 0 | _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 63 | 0 | # endif | 64 | | #else | 65 | | # if SIZEOF_SIZE_T == 8 | 66 | | _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 67 | | _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 68 | | _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 69 | | _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 70 | | _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 71 | | _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 72 | | _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 73 | | _p[7] = (STRINGLIB_CHAR)(value & 0xFFu); | 74 | | # else | 75 | | _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 76 | | _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 77 | | _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 78 | | _p[3] = (STRINGLIB_CHAR)(value & 0xFFu); | 79 | | # endif | 80 | | #endif | 81 | 0 | _s += SIZEOF_SIZE_T; | 82 | 0 | _p += SIZEOF_SIZE_T; | 83 | 0 | } | 84 | 0 | s = _s; | 85 | 0 | p = _p; | 86 | 0 | if (s == end) | 87 | 0 | break; | 88 | 0 | ch = (unsigned char)*s; | 89 | 0 | } | 90 | 0 | if (ch < 0x80) { | 91 | 0 | s++; | 92 | 0 | *p++ = ch; | 93 | 0 | continue; | 94 | 0 | } | 95 | 0 | } | 96 | | | 97 | 287k | if (ch < 0xE0) { | 98 | | /* \xC2\x80-\xDF\xBF -- 0080-07FF */ | 99 | 98.0k | Py_UCS4 ch2; | 100 | 98.0k | if (ch < 0xC2) { | 101 | | /* invalid sequence | 102 | | \x80-\xBF -- continuation byte | 103 | | \xC0-\xC1 
-- fake 0000-007F */ | 104 | 14.1k | goto InvalidStart; | 105 | 14.1k | } | 106 | 83.8k | if (end - s < 2) { | 107 | | /* unexpected end of data: the caller will decide whether | 108 | | it's an error or not */ | 109 | 1.66k | break; | 110 | 1.66k | } | 111 | 82.1k | ch2 = (unsigned char)s[1]; | 112 | 82.1k | if (!IS_CONTINUATION_BYTE(ch2)) | 113 | | /* invalid continuation byte */ | 114 | 5.23k | goto InvalidContinuation1; | 115 | 76.9k | ch = (ch << 6) + ch2 - | 116 | 76.9k | ((0xC0 << 6) + 0x80); | 117 | 76.9k | assert ((ch > 0x007F) && (ch <= 0x07FF)); | 118 | 76.9k | s += 2; | 119 | 76.9k | if (STRINGLIB_MAX_CHAR <= 0x007F || | 120 | 76.9k | (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR)) | 121 | | /* Out-of-range */ | 122 | 76.9k | goto Return; | 123 | 0 | *p++ = ch; | 124 | 0 | continue; | 125 | 76.9k | } | 126 | | | 127 | 189k | if (ch < 0xF0) { | 128 | | /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */ | 129 | 161k | Py_UCS4 ch2, ch3; | 130 | 161k | if (end - s < 3) { | 131 | | /* unexpected end of data: the caller will decide whether | 132 | | it's an error or not */ | 133 | 2.75k | if (end - s < 2) | 134 | 1.00k | break; | 135 | 1.75k | ch2 = (unsigned char)s[1]; | 136 | 1.75k | if (!IS_CONTINUATION_BYTE(ch2) || | 137 | 1.75k | (ch2 < 0xA0 ? 
ch == 0xE0 : ch == 0xED)) | 138 | | /* for clarification see comments below */ | 139 | 1.27k | goto InvalidContinuation1; | 140 | 480 | break; | 141 | 1.75k | } | 142 | 158k | ch2 = (unsigned char)s[1]; | 143 | 158k | ch3 = (unsigned char)s[2]; | 144 | 158k | if (!IS_CONTINUATION_BYTE(ch2)) { | 145 | | /* invalid continuation byte */ | 146 | 3.54k | goto InvalidContinuation1; | 147 | 3.54k | } | 148 | 155k | if (ch == 0xE0) { | 149 | 1.11k | if (ch2 < 0xA0) | 150 | | /* invalid sequence | 151 | | \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */ | 152 | 301 | goto InvalidContinuation1; | 153 | 153k | } else if (ch == 0xED && ch2 >= 0xA0) { | 154 | | /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF | 155 | | will result in surrogates in range D800-DFFF. Surrogates are | 156 | | not valid UTF-8 so they are rejected. | 157 | | See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf | 158 | | (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ | 159 | 481 | goto InvalidContinuation1; | 160 | 481 | } | 161 | 154k | if (!IS_CONTINUATION_BYTE(ch3)) { | 162 | | /* invalid continuation byte */ | 163 | 2.41k | goto InvalidContinuation2; | 164 | 2.41k | } | 165 | 151k | ch = (ch << 12) + (ch2 << 6) + ch3 - | 166 | 151k | ((0xE0 << 12) + (0x80 << 6) + 0x80); | 167 | 151k | assert ((ch > 0x07FF) && (ch <= 0xFFFF)); | 168 | 151k | s += 3; | 169 | 151k | if (STRINGLIB_MAX_CHAR <= 0x07FF || | 170 | 151k | (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR)) | 171 | | /* Out-of-range */ | 172 | 151k | goto Return; | 173 | 0 | *p++ = ch; | 174 | 0 | continue; | 175 | 151k | } | 176 | | | 177 | 28.5k | if (ch < 0xF5) { | 178 | | /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */ | 179 | 19.7k | Py_UCS4 ch2, ch3, ch4; | 180 | 19.7k | if (end - s < 4) { | 181 | | /* unexpected end of data: the caller will decide whether | 182 | | it's an error or not */ | 183 | 4.77k | if (end - s < 2) | 184 | 1.22k | break; | 185 | 3.55k | ch2 = (unsigned char)s[1]; | 
186 | 3.55k | if (!IS_CONTINUATION_BYTE(ch2) || | 187 | 3.55k | (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4)) | 188 | | /* for clarification see comments below */ | 189 | 2.48k | goto InvalidContinuation1; | 190 | 1.07k | if (end - s < 3) | 191 | 500 | break; | 192 | 573 | ch3 = (unsigned char)s[2]; | 193 | 573 | if (!IS_CONTINUATION_BYTE(ch3)) | 194 | 437 | goto InvalidContinuation2; | 195 | 136 | break; | 196 | 573 | } | 197 | 15.0k | ch2 = (unsigned char)s[1]; | 198 | 15.0k | ch3 = (unsigned char)s[2]; | 199 | 15.0k | ch4 = (unsigned char)s[3]; | 200 | 15.0k | if (!IS_CONTINUATION_BYTE(ch2)) { | 201 | | /* invalid continuation byte */ | 202 | 2.71k | goto InvalidContinuation1; | 203 | 2.71k | } | 204 | 12.2k | if (ch == 0xF0) { | 205 | 2.69k | if (ch2 < 0x90) | 206 | | /* invalid sequence | 207 | | \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */ | 208 | 138 | goto InvalidContinuation1; | 209 | 9.60k | } else if (ch == 0xF4 && ch2 >= 0x90) { | 210 | | /* invalid sequence | 211 | | \xF4\x90\x80\x80- -- 110000- overflow */ | 212 | 514 | goto InvalidContinuation1; | 213 | 514 | } | 214 | 11.6k | if (!IS_CONTINUATION_BYTE(ch3)) { | 215 | | /* invalid continuation byte */ | 216 | 1.25k | goto InvalidContinuation2; | 217 | 1.25k | } | 218 | 10.3k | if (!IS_CONTINUATION_BYTE(ch4)) { | 219 | | /* invalid continuation byte */ | 220 | 429 | goto InvalidContinuation3; | 221 | 429 | } | 222 | 9.95k | ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 - | 223 | 9.95k | ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80); | 224 | 9.95k | assert ((ch > 0xFFFF) && (ch <= 0x10FFFF)); | 225 | 9.95k | s += 4; | 226 | 9.95k | if (STRINGLIB_MAX_CHAR <= 0xFFFF || | 227 | 9.95k | (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR)) | 228 | | /* Out-of-range */ | 229 | 9.95k | goto Return; | 230 | 0 | *p++ = ch; | 231 | 0 | continue; | 232 | 9.95k | } | 233 | 8.76k | goto InvalidStart; | 234 | 28.5k | } | 235 | 4.99k | ch = 0; | 236 | 287k | Return: | 237 | 287k | *inptr = s; | 238 
| 287k | *outpos = p - dest; | 239 | 287k | return ch; | 240 | 22.9k | InvalidStart: | 241 | 22.9k | ch = 1; | 242 | 22.9k | goto Return; | 243 | 16.6k | InvalidContinuation1: | 244 | 16.6k | ch = 2; | 245 | 16.6k | goto Return; | 246 | 4.10k | InvalidContinuation2: | 247 | 4.10k | ch = 3; | 248 | 4.10k | goto Return; | 249 | 429 | InvalidContinuation3: | 250 | 429 | ch = 4; | 251 | 429 | goto Return; | 252 | 4.99k | } |
unicodeobject.c:ucs1lib_utf8_decode Line | Count | Source | 26 | 86.5k | { | 27 | 86.5k | Py_UCS4 ch; | 28 | 86.5k | const char *s = *inptr; | 29 | 86.5k | STRINGLIB_CHAR *p = dest + *outpos; | 30 | | | 31 | 1.17M | while (s < end) { | 32 | 1.14M | ch = (unsigned char)*s; | 33 | | | 34 | 1.14M | if (ch < 0x80) { | 35 | | /* Fast path for runs of ASCII characters. Given that common UTF-8 | 36 | | input will consist of an overwhelming majority of ASCII | 37 | | characters, we try to optimize for this case by checking | 38 | | as many characters as a C 'size_t' can contain. | 39 | | First, check if we can do an aligned read, as most CPUs have | 40 | | a penalty for unaligned reads. | 41 | | */ | 42 | 783k | if (_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) { | 43 | | /* Help register allocation */ | 44 | 108k | const char *_s = s; | 45 | 108k | STRINGLIB_CHAR *_p = p; | 46 | 10.3M | while (_s + SIZEOF_SIZE_T <= end) { | 47 | | /* Read a whole size_t at a time (either 4 or 8 bytes), | 48 | | and do a fast unrolled copy if it only contains ASCII | 49 | | characters. 
*/ | 50 | 10.3M | size_t value = *(const size_t *) _s; | 51 | 10.3M | if (value & ASCII_CHAR_MASK) | 52 | 87.4k | break; | 53 | 10.2M | #if PY_LITTLE_ENDIAN | 54 | 10.2M | _p[0] = (STRINGLIB_CHAR)(value & 0xFFu); | 55 | 10.2M | _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 56 | 10.2M | _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 57 | 10.2M | _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 58 | 10.2M | # if SIZEOF_SIZE_T == 8 | 59 | 10.2M | _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 60 | 10.2M | _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 61 | 10.2M | _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 62 | 10.2M | _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 63 | 10.2M | # endif | 64 | | #else | 65 | | # if SIZEOF_SIZE_T == 8 | 66 | | _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 67 | | _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 68 | | _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 69 | | _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 70 | | _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 71 | | _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 72 | | _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 73 | | _p[7] = (STRINGLIB_CHAR)(value & 0xFFu); | 74 | | # else | 75 | | _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 76 | | _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 77 | | _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 78 | | _p[3] = (STRINGLIB_CHAR)(value & 0xFFu); | 79 | | # endif | 80 | | #endif | 81 | 10.2M | _s += SIZEOF_SIZE_T; | 82 | 10.2M | _p += SIZEOF_SIZE_T; | 83 | 10.2M | } | 84 | 108k | s = _s; | 85 | 108k | p = _p; | 86 | 108k | if (s == end) | 87 | 2.57k | break; | 88 | 106k | ch = (unsigned char)*s; | 89 | 106k | } | 90 | 781k | if (ch < 0x80) { | 91 | 760k | s++; | 92 | 760k | *p++ = ch; | 93 | 760k | continue; | 94 | 760k | } | 95 | 781k | } | 96 | | | 97 | 382k | if (ch < 0xE0) { | 98 | | /* \xC2\x80-\xDF\xBF -- 0080-07FF */ | 99 | 354k | Py_UCS4 ch2; | 100 | 354k | if (ch 
< 0xC2) { | 101 | | /* invalid sequence | 102 | | \x80-\xBF -- continuation byte | 103 | | \xC0-\xC1 -- fake 0000-007F */ | 104 | 1.59k | goto InvalidStart; | 105 | 1.59k | } | 106 | 352k | if (end - s < 2) { | 107 | | /* unexpected end of data: the caller will decide whether | 108 | | it's an error or not */ | 109 | 737 | break; | 110 | 737 | } | 111 | 351k | ch2 = (unsigned char)s[1]; | 112 | 351k | if (!IS_CONTINUATION_BYTE(ch2)) | 113 | | /* invalid continuation byte */ | 114 | 19.5k | goto InvalidContinuation1; | 115 | 332k | ch = (ch << 6) + ch2 - | 116 | 332k | ((0xC0 << 6) + 0x80); | 117 | 332k | assert ((ch > 0x007F) && (ch <= 0x07FF)); | 118 | 332k | s += 2; | 119 | 332k | if (STRINGLIB_MAX_CHAR <= 0x007F || | 120 | 332k | (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR)) | 121 | | /* Out-of-range */ | 122 | 1.75k | goto Return; | 123 | 330k | *p++ = ch; | 124 | 330k | continue; | 125 | 332k | } | 126 | | | 127 | 28.0k | if (ch < 0xF0) { | 128 | | /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */ | 129 | 17.3k | Py_UCS4 ch2, ch3; | 130 | 17.3k | if (end - s < 3) { | 131 | | /* unexpected end of data: the caller will decide whether | 132 | | it's an error or not */ | 133 | 1.53k | if (end - s < 2) | 134 | 346 | break; | 135 | 1.18k | ch2 = (unsigned char)s[1]; | 136 | 1.18k | if (!IS_CONTINUATION_BYTE(ch2) || | 137 | 1.18k | (ch2 < 0xA0 ? 
ch == 0xE0 : ch == 0xED)) | 138 | | /* for clarification see comments below */ | 139 | 742 | goto InvalidContinuation1; | 140 | 444 | break; | 141 | 1.18k | } | 142 | 15.8k | ch2 = (unsigned char)s[1]; | 143 | 15.8k | ch3 = (unsigned char)s[2]; | 144 | 15.8k | if (!IS_CONTINUATION_BYTE(ch2)) { | 145 | | /* invalid continuation byte */ | 146 | 1.51k | goto InvalidContinuation1; | 147 | 1.51k | } | 148 | 14.2k | if (ch == 0xE0) { | 149 | 534 | if (ch2 < 0xA0) | 150 | | /* invalid sequence | 151 | | \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */ | 152 | 118 | goto InvalidContinuation1; | 153 | 13.7k | } else if (ch == 0xED && ch2 >= 0xA0) { | 154 | | /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF | 155 | | will result in surrogates in range D800-DFFF. Surrogates are | 156 | | not valid UTF-8 so they are rejected. | 157 | | See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf | 158 | | (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ | 159 | 676 | goto InvalidContinuation1; | 160 | 676 | } | 161 | 13.4k | if (!IS_CONTINUATION_BYTE(ch3)) { | 162 | | /* invalid continuation byte */ | 163 | 616 | goto InvalidContinuation2; | 164 | 616 | } | 165 | 12.8k | ch = (ch << 12) + (ch2 << 6) + ch3 - | 166 | 12.8k | ((0xE0 << 12) + (0x80 << 6) + 0x80); | 167 | 12.8k | assert ((ch > 0x07FF) && (ch <= 0xFFFF)); | 168 | 12.8k | s += 3; | 169 | 12.8k | if (STRINGLIB_MAX_CHAR <= 0x07FF || | 170 | 12.8k | (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR)) | 171 | | /* Out-of-range */ | 172 | 12.8k | goto Return; | 173 | 0 | *p++ = ch; | 174 | 0 | continue; | 175 | 12.8k | } | 176 | | | 177 | 10.7k | if (ch < 0xF5) { | 178 | | /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */ | 179 | 9.11k | Py_UCS4 ch2, ch3, ch4; | 180 | 9.11k | if (end - s < 4) { | 181 | | /* unexpected end of data: the caller will decide whether | 182 | | it's an error or not */ | 183 | 1.97k | if (end - s < 2) | 184 | 300 | break; | 185 | 1.67k | ch2 = (unsigned 
char)s[1]; | 186 | 1.67k | if (!IS_CONTINUATION_BYTE(ch2) || | 187 | 1.67k | (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4)) | 188 | | /* for clarification see comments below */ | 189 | 1.11k | goto InvalidContinuation1; | 190 | 555 | if (end - s < 3) | 191 | 115 | break; | 192 | 440 | ch3 = (unsigned char)s[2]; | 193 | 440 | if (!IS_CONTINUATION_BYTE(ch3)) | 194 | 339 | goto InvalidContinuation2; | 195 | 101 | break; | 196 | 440 | } | 197 | 7.14k | ch2 = (unsigned char)s[1]; | 198 | 7.14k | ch3 = (unsigned char)s[2]; | 199 | 7.14k | ch4 = (unsigned char)s[3]; | 200 | 7.14k | if (!IS_CONTINUATION_BYTE(ch2)) { | 201 | | /* invalid continuation byte */ | 202 | 689 | goto InvalidContinuation1; | 203 | 689 | } | 204 | 6.45k | if (ch == 0xF0) { | 205 | 1.11k | if (ch2 < 0x90) | 206 | | /* invalid sequence | 207 | | \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */ | 208 | 105 | goto InvalidContinuation1; | 209 | 5.33k | } else if (ch == 0xF4 && ch2 >= 0x90) { | 210 | | /* invalid sequence | 211 | | \xF4\x90\x80\x80- -- 110000- overflow */ | 212 | 299 | goto InvalidContinuation1; | 213 | 299 | } | 214 | 6.04k | if (!IS_CONTINUATION_BYTE(ch3)) { | 215 | | /* invalid continuation byte */ | 216 | 2.18k | goto InvalidContinuation2; | 217 | 2.18k | } | 218 | 3.86k | if (!IS_CONTINUATION_BYTE(ch4)) { | 219 | | /* invalid continuation byte */ | 220 | 448 | goto InvalidContinuation3; | 221 | 448 | } | 222 | 3.41k | ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 - | 223 | 3.41k | ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80); | 224 | 3.41k | assert ((ch > 0xFFFF) && (ch <= 0x10FFFF)); | 225 | 3.41k | s += 4; | 226 | 3.41k | if (STRINGLIB_MAX_CHAR <= 0xFFFF || | 227 | 3.41k | (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR)) | 228 | | /* Out-of-range */ | 229 | 3.41k | goto Return; | 230 | 0 | *p++ = ch; | 231 | 0 | continue; | 232 | 3.41k | } | 233 | 1.62k | goto InvalidStart; | 234 | 10.7k | } | 235 | 36.8k | ch = 0; | 236 | 86.5k | Return: | 237 | 86.5k | *inptr = 
s; | 238 | 86.5k | *outpos = p - dest; | 239 | 86.5k | return ch; | 240 | 3.21k | InvalidStart: | 241 | 3.21k | ch = 1; | 242 | 3.21k | goto Return; | 243 | 24.8k | InvalidContinuation1: | 244 | 24.8k | ch = 2; | 245 | 24.8k | goto Return; | 246 | 3.14k | InvalidContinuation2: | 247 | 3.14k | ch = 3; | 248 | 3.14k | goto Return; | 249 | 448 | InvalidContinuation3: | 250 | 448 | ch = 4; | 251 | 448 | goto Return; | 252 | 36.8k | } |
unicodeobject.c:ucs2lib_utf8_decode Line | Count | Source | 26 | 95.3M | { | 27 | 95.3M | Py_UCS4 ch; | 28 | 95.3M | const char *s = *inptr; | 29 | 95.3M | STRINGLIB_CHAR *p = dest + *outpos; | 30 | | | 31 | 179M | while (s < end) { | 32 | 179M | ch = (unsigned char)*s; | 33 | | | 34 | 179M | if (ch < 0x80) { | 35 | | /* Fast path for runs of ASCII characters. Given that common UTF-8 | 36 | | input will consist of an overwhelming majority of ASCII | 37 | | characters, we try to optimize for this case by checking | 38 | | as many characters as a C 'size_t' can contain. | 39 | | First, check if we can do an aligned read, as most CPUs have | 40 | | a penalty for unaligned reads. | 41 | | */ | 42 | 65.7M | if (_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) { | 43 | | /* Help register allocation */ | 44 | 8.56M | const char *_s = s; | 45 | 8.56M | STRINGLIB_CHAR *_p = p; | 46 | 165M | while (_s + SIZEOF_SIZE_T <= end) { | 47 | | /* Read a whole size_t at a time (either 4 or 8 bytes), | 48 | | and do a fast unrolled copy if it only contains ASCII | 49 | | characters. 
*/ | 50 | 165M | size_t value = *(const size_t *) _s; | 51 | 165M | if (value & ASCII_CHAR_MASK) | 52 | 8.47M | break; | 53 | 157M | #if PY_LITTLE_ENDIAN | 54 | 157M | _p[0] = (STRINGLIB_CHAR)(value & 0xFFu); | 55 | 157M | _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 56 | 157M | _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 57 | 157M | _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 58 | 157M | # if SIZEOF_SIZE_T == 8 | 59 | 157M | _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 60 | 157M | _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 61 | 157M | _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 62 | 157M | _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 63 | 157M | # endif | 64 | | #else | 65 | | # if SIZEOF_SIZE_T == 8 | 66 | | _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 67 | | _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 68 | | _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 69 | | _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 70 | | _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 71 | | _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 72 | | _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 73 | | _p[7] = (STRINGLIB_CHAR)(value & 0xFFu); | 74 | | # else | 75 | | _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 76 | | _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 77 | | _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 78 | | _p[3] = (STRINGLIB_CHAR)(value & 0xFFu); | 79 | | # endif | 80 | | #endif | 81 | 157M | _s += SIZEOF_SIZE_T; | 82 | 157M | _p += SIZEOF_SIZE_T; | 83 | 157M | } | 84 | 8.56M | s = _s; | 85 | 8.56M | p = _p; | 86 | 8.56M | if (s == end) | 87 | 6.46k | break; | 88 | 8.56M | ch = (unsigned char)*s; | 89 | 8.56M | } | 90 | 65.7M | if (ch < 0x80) { | 91 | 65.4M | s++; | 92 | 65.4M | *p++ = ch; | 93 | 65.4M | continue; | 94 | 65.4M | } | 95 | 65.7M | } | 96 | | | 97 | 114M | if (ch < 0xE0) { | 98 | | /* \xC2\x80-\xDF\xBF -- 0080-07FF */ | 99 | 47.2M | Py_UCS4 ch2; | 100 | 47.2M | if (ch < 
0xC2) { | 101 | | /* invalid sequence | 102 | | \x80-\xBF -- continuation byte | 103 | | \xC0-\xC1 -- fake 0000-007F */ | 104 | 34.3M | goto InvalidStart; | 105 | 34.3M | } | 106 | 12.9M | if (end - s < 2) { | 107 | | /* unexpected end of data: the caller will decide whether | 108 | | it's an error or not */ | 109 | 6.32k | break; | 110 | 6.32k | } | 111 | 12.9M | ch2 = (unsigned char)s[1]; | 112 | 12.9M | if (!IS_CONTINUATION_BYTE(ch2)) | 113 | | /* invalid continuation byte */ | 114 | 11.7M | goto InvalidContinuation1; | 115 | 1.24M | ch = (ch << 6) + ch2 - | 116 | 1.24M | ((0xC0 << 6) + 0x80); | 117 | 1.24M | assert ((ch > 0x007F) && (ch <= 0x07FF)); | 118 | 1.24M | s += 2; | 119 | 1.24M | if (STRINGLIB_MAX_CHAR <= 0x007F || | 120 | 1.24M | (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR)) | 121 | | /* Out-of-range */ | 122 | 0 | goto Return; | 123 | 1.24M | *p++ = ch; | 124 | 1.24M | continue; | 125 | 1.24M | } | 126 | | | 127 | 67.0M | if (ch < 0xF0) { | 128 | | /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */ | 129 | 24.5M | Py_UCS4 ch2, ch3; | 130 | 24.5M | if (end - s < 3) { | 131 | | /* unexpected end of data: the caller will decide whether | 132 | | it's an error or not */ | 133 | 4.76k | if (end - s < 2) | 134 | 2.00k | break; | 135 | 2.76k | ch2 = (unsigned char)s[1]; | 136 | 2.76k | if (!IS_CONTINUATION_BYTE(ch2) || | 137 | 2.76k | (ch2 < 0xA0 ? 
ch == 0xE0 : ch == 0xED)) | 138 | | /* for clarification see comments below */ | 139 | 1.95k | goto InvalidContinuation1; | 140 | 814 | break; | 141 | 2.76k | } | 142 | 24.5M | ch2 = (unsigned char)s[1]; | 143 | 24.5M | ch3 = (unsigned char)s[2]; | 144 | 24.5M | if (!IS_CONTINUATION_BYTE(ch2)) { | 145 | | /* invalid continuation byte */ | 146 | 6.42M | goto InvalidContinuation1; | 147 | 6.42M | } | 148 | 18.0M | if (ch == 0xE0) { | 149 | 28.6k | if (ch2 < 0xA0) | 150 | | /* invalid sequence | 151 | | \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */ | 152 | 9.87k | goto InvalidContinuation1; | 153 | 18.0M | } else if (ch == 0xED && ch2 >= 0xA0) { | 154 | | /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF | 155 | | will result in surrogates in range D800-DFFF. Surrogates are | 156 | | not valid UTF-8 so they are rejected. | 157 | | See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf | 158 | | (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ | 159 | 10.4k | goto InvalidContinuation1; | 160 | 10.4k | } | 161 | 18.0M | if (!IS_CONTINUATION_BYTE(ch3)) { | 162 | | /* invalid continuation byte */ | 163 | 160k | goto InvalidContinuation2; | 164 | 160k | } | 165 | 17.9M | ch = (ch << 12) + (ch2 << 6) + ch3 - | 166 | 17.9M | ((0xE0 << 12) + (0x80 << 6) + 0x80); | 167 | 17.9M | assert ((ch > 0x07FF) && (ch <= 0xFFFF)); | 168 | 17.9M | s += 3; | 169 | 17.9M | if (STRINGLIB_MAX_CHAR <= 0x07FF || | 170 | 17.9M | (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR)) | 171 | | /* Out-of-range */ | 172 | 0 | goto Return; | 173 | 17.9M | *p++ = ch; | 174 | 17.9M | continue; | 175 | 17.9M | } | 176 | | | 177 | 42.4M | if (ch < 0xF5) { | 178 | | /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */ | 179 | 857k | Py_UCS4 ch2, ch3, ch4; | 180 | 857k | if (end - s < 4) { | 181 | | /* unexpected end of data: the caller will decide whether | 182 | | it's an error or not */ | 183 | 5.88k | if (end - s < 2) | 184 | 1.48k | break; | 185 | 4.40k | ch2 = 
(unsigned char)s[1]; | 186 | 4.40k | if (!IS_CONTINUATION_BYTE(ch2) || | 187 | 4.40k | (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4)) | 188 | | /* for clarification see comments below */ | 189 | 2.79k | goto InvalidContinuation1; | 190 | 1.60k | if (end - s < 3) | 191 | 538 | break; | 192 | 1.07k | ch3 = (unsigned char)s[2]; | 193 | 1.07k | if (!IS_CONTINUATION_BYTE(ch3)) | 194 | 583 | goto InvalidContinuation2; | 195 | 487 | break; | 196 | 1.07k | } | 197 | 852k | ch2 = (unsigned char)s[1]; | 198 | 852k | ch3 = (unsigned char)s[2]; | 199 | 852k | ch4 = (unsigned char)s[3]; | 200 | 852k | if (!IS_CONTINUATION_BYTE(ch2)) { | 201 | | /* invalid continuation byte */ | 202 | 722k | goto InvalidContinuation1; | 203 | 722k | } | 204 | 129k | if (ch == 0xF0) { | 205 | 26.7k | if (ch2 < 0x90) | 206 | | /* invalid sequence | 207 | | \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */ | 208 | 6.76k | goto InvalidContinuation1; | 209 | 102k | } else if (ch == 0xF4 && ch2 >= 0x90) { | 210 | | /* invalid sequence | 211 | | \xF4\x90\x80\x80- -- 110000- overflow */ | 212 | 12.8k | goto InvalidContinuation1; | 213 | 12.8k | } | 214 | 110k | if (!IS_CONTINUATION_BYTE(ch3)) { | 215 | | /* invalid continuation byte */ | 216 | 74.1k | goto InvalidContinuation2; | 217 | 74.1k | } | 218 | 35.8k | if (!IS_CONTINUATION_BYTE(ch4)) { | 219 | | /* invalid continuation byte */ | 220 | 15.8k | goto InvalidContinuation3; | 221 | 15.8k | } | 222 | 20.0k | ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 - | 223 | 20.0k | ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80); | 224 | 20.0k | assert ((ch > 0xFFFF) && (ch <= 0x10FFFF)); | 225 | 20.0k | s += 4; | 226 | 20.0k | if (STRINGLIB_MAX_CHAR <= 0xFFFF || | 227 | 20.0k | (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR)) | 228 | | /* Out-of-range */ | 229 | 20.0k | goto Return; | 230 | 0 | *p++ = ch; | 231 | 0 | continue; | 232 | 20.0k | } | 233 | 41.6M | goto InvalidStart; | 234 | 42.4M | } | 235 | 230k | ch = 0; | 236 | 95.3M | Return: | 
237 | 95.3M | *inptr = s; | 238 | 95.3M | *outpos = p - dest; | 239 | 95.3M | return ch; | 240 | 75.9M | InvalidStart: | 241 | 75.9M | ch = 1; | 242 | 75.9M | goto Return; | 243 | 18.9M | InvalidContinuation1: | 244 | 18.9M | ch = 2; | 245 | 18.9M | goto Return; | 246 | 235k | InvalidContinuation2: | 247 | 235k | ch = 3; | 248 | 235k | goto Return; | 249 | 15.8k | InvalidContinuation3: | 250 | 15.8k | ch = 4; | 251 | 15.8k | goto Return; | 252 | 230k | } |
unicodeobject.c:ucs4lib_utf8_decode Line | Count | Source | 26 | 80.3M | { | 27 | 80.3M | Py_UCS4 ch; | 28 | 80.3M | const char *s = *inptr; | 29 | 80.3M | STRINGLIB_CHAR *p = dest + *outpos; | 30 | | | 31 | 180M | while (s < end) { | 32 | 180M | ch = (unsigned char)*s; | 33 | | | 34 | 180M | if (ch < 0x80) { | 35 | | /* Fast path for runs of ASCII characters. Given that common UTF-8 | 36 | | input will consist of an overwhelming majority of ASCII | 37 | | characters, we try to optimize for this case by checking | 38 | | as many characters as a C 'size_t' can contain. | 39 | | First, check if we can do an aligned read, as most CPUs have | 40 | | a penalty for unaligned reads. | 41 | | */ | 42 | 78.7M | if (_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) { | 43 | | /* Help register allocation */ | 44 | 10.0M | const char *_s = s; | 45 | 10.0M | STRINGLIB_CHAR *_p = p; | 46 | 125M | while (_s + SIZEOF_SIZE_T <= end) { | 47 | | /* Read a whole size_t at a time (either 4 or 8 bytes), | 48 | | and do a fast unrolled copy if it only contains ASCII | 49 | | characters. 
*/ | 50 | 125M | size_t value = *(const size_t *) _s; | 51 | 125M | if (value & ASCII_CHAR_MASK) | 52 | 10.0M | break; | 53 | 115M | #if PY_LITTLE_ENDIAN | 54 | 115M | _p[0] = (STRINGLIB_CHAR)(value & 0xFFu); | 55 | 115M | _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 56 | 115M | _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 57 | 115M | _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 58 | 115M | # if SIZEOF_SIZE_T == 8 | 59 | 115M | _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 60 | 115M | _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 61 | 115M | _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 62 | 115M | _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 63 | 115M | # endif | 64 | | #else | 65 | | # if SIZEOF_SIZE_T == 8 | 66 | | _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 67 | | _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 68 | | _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 69 | | _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 70 | | _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 71 | | _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 72 | | _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 73 | | _p[7] = (STRINGLIB_CHAR)(value & 0xFFu); | 74 | | # else | 75 | | _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 76 | | _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 77 | | _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 78 | | _p[3] = (STRINGLIB_CHAR)(value & 0xFFu); | 79 | | # endif | 80 | | #endif | 81 | 115M | _s += SIZEOF_SIZE_T; | 82 | 115M | _p += SIZEOF_SIZE_T; | 83 | 115M | } | 84 | 10.0M | s = _s; | 85 | 10.0M | p = _p; | 86 | 10.0M | if (s == end) | 87 | 2.73k | break; | 88 | 10.0M | ch = (unsigned char)*s; | 89 | 10.0M | } | 90 | 78.7M | if (ch < 0x80) { | 91 | 78.3M | s++; | 92 | 78.3M | *p++ = ch; | 93 | 78.3M | continue; | 94 | 78.3M | } | 95 | 78.7M | } | 96 | | | 97 | 101M | if (ch < 0xE0) { | 98 | | /* \xC2\x80-\xDF\xBF -- 0080-07FF */ | 99 | 46.0M | Py_UCS4 ch2; | 100 | 46.0M | if (ch < 
0xC2) { | 101 | | /* invalid sequence | 102 | | \x80-\xBF -- continuation byte | 103 | | \xC0-\xC1 -- fake 0000-007F */ | 104 | 33.3M | goto InvalidStart; | 105 | 33.3M | } | 106 | 12.7M | if (end - s < 2) { | 107 | | /* unexpected end of data: the caller will decide whether | 108 | | it's an error or not */ | 109 | 1.54k | break; | 110 | 1.54k | } | 111 | 12.7M | ch2 = (unsigned char)s[1]; | 112 | 12.7M | if (!IS_CONTINUATION_BYTE(ch2)) | 113 | | /* invalid continuation byte */ | 114 | 9.87M | goto InvalidContinuation1; | 115 | 2.85M | ch = (ch << 6) + ch2 - | 116 | 2.85M | ((0xC0 << 6) + 0x80); | 117 | 2.85M | assert ((ch > 0x007F) && (ch <= 0x07FF)); | 118 | 2.85M | s += 2; | 119 | 2.85M | if (STRINGLIB_MAX_CHAR <= 0x007F || | 120 | 2.85M | (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR)) | 121 | | /* Out-of-range */ | 122 | 0 | goto Return; | 123 | 2.85M | *p++ = ch; | 124 | 2.85M | continue; | 125 | 2.85M | } | 126 | | | 127 | 55.5M | if (ch < 0xF0) { | 128 | | /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */ | 129 | 22.8M | Py_UCS4 ch2, ch3; | 130 | 22.8M | if (end - s < 3) { | 131 | | /* unexpected end of data: the caller will decide whether | 132 | | it's an error or not */ | 133 | 3.24k | if (end - s < 2) | 134 | 939 | break; | 135 | 2.30k | ch2 = (unsigned char)s[1]; | 136 | 2.30k | if (!IS_CONTINUATION_BYTE(ch2) || | 137 | 2.30k | (ch2 < 0xA0 ? 
ch == 0xE0 : ch == 0xED)) | 138 | | /* for clarification see comments below */ | 139 | 1.68k | goto InvalidContinuation1; | 140 | 617 | break; | 141 | 2.30k | } | 142 | 22.8M | ch2 = (unsigned char)s[1]; | 143 | 22.8M | ch3 = (unsigned char)s[2]; | 144 | 22.8M | if (!IS_CONTINUATION_BYTE(ch2)) { | 145 | | /* invalid continuation byte */ | 146 | 4.19M | goto InvalidContinuation1; | 147 | 4.19M | } | 148 | 18.6M | if (ch == 0xE0) { | 149 | 76.6k | if (ch2 < 0xA0) | 150 | | /* invalid sequence | 151 | | \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */ | 152 | 34.3k | goto InvalidContinuation1; | 153 | 18.5M | } else if (ch == 0xED && ch2 >= 0xA0) { | 154 | | /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF | 155 | | will result in surrogates in range D800-DFFF. Surrogates are | 156 | | not valid UTF-8 so they are rejected. | 157 | | See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf | 158 | | (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ | 159 | 51.9k | goto InvalidContinuation1; | 160 | 51.9k | } | 161 | 18.5M | if (!IS_CONTINUATION_BYTE(ch3)) { | 162 | | /* invalid continuation byte */ | 163 | 690k | goto InvalidContinuation2; | 164 | 690k | } | 165 | 17.8M | ch = (ch << 12) + (ch2 << 6) + ch3 - | 166 | 17.8M | ((0xE0 << 12) + (0x80 << 6) + 0x80); | 167 | 17.8M | assert ((ch > 0x07FF) && (ch <= 0xFFFF)); | 168 | 17.8M | s += 3; | 169 | 17.8M | if (STRINGLIB_MAX_CHAR <= 0x07FF || | 170 | 17.8M | (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR)) | 171 | | /* Out-of-range */ | 172 | 0 | goto Return; | 173 | 17.8M | *p++ = ch; | 174 | 17.8M | continue; | 175 | 17.8M | } | 176 | | | 177 | 32.7M | if (ch < 0xF5) { | 178 | | /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */ | 179 | 3.57M | Py_UCS4 ch2, ch3, ch4; | 180 | 3.57M | if (end - s < 4) { | 181 | | /* unexpected end of data: the caller will decide whether | 182 | | it's an error or not */ | 183 | 3.99k | if (end - s < 2) | 184 | 696 | break; | 185 | 3.29k | ch2 = 
(unsigned char)s[1]; | 186 | 3.29k | if (!IS_CONTINUATION_BYTE(ch2) || | 187 | 3.29k | (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4)) | 188 | | /* for clarification see comments below */ | 189 | 1.65k | goto InvalidContinuation1; | 190 | 1.64k | if (end - s < 3) | 191 | 422 | break; | 192 | 1.22k | ch3 = (unsigned char)s[2]; | 193 | 1.22k | if (!IS_CONTINUATION_BYTE(ch3)) | 194 | 512 | goto InvalidContinuation2; | 195 | 709 | break; | 196 | 1.22k | } | 197 | 3.57M | ch2 = (unsigned char)s[1]; | 198 | 3.57M | ch3 = (unsigned char)s[2]; | 199 | 3.57M | ch4 = (unsigned char)s[3]; | 200 | 3.57M | if (!IS_CONTINUATION_BYTE(ch2)) { | 201 | | /* invalid continuation byte */ | 202 | 2.46M | goto InvalidContinuation1; | 203 | 2.46M | } | 204 | 1.10M | if (ch == 0xF0) { | 205 | 568k | if (ch2 < 0x90) | 206 | | /* invalid sequence | 207 | | \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */ | 208 | 32.6k | goto InvalidContinuation1; | 209 | 568k | } else if (ch == 0xF4 && ch2 >= 0x90) { | 210 | | /* invalid sequence | 211 | | \xF4\x90\x80\x80- -- 110000- overflow */ | 212 | 50.0k | goto InvalidContinuation1; | 213 | 50.0k | } | 214 | 1.02M | if (!IS_CONTINUATION_BYTE(ch3)) { | 215 | | /* invalid continuation byte */ | 216 | 293k | goto InvalidContinuation2; | 217 | 293k | } | 218 | 727k | if (!IS_CONTINUATION_BYTE(ch4)) { | 219 | | /* invalid continuation byte */ | 220 | 104k | goto InvalidContinuation3; | 221 | 104k | } | 222 | 622k | ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 - | 223 | 622k | ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80); | 224 | 622k | assert ((ch > 0xFFFF) && (ch <= 0x10FFFF)); | 225 | 622k | s += 4; | 226 | 622k | if (STRINGLIB_MAX_CHAR <= 0xFFFF || | 227 | 622k | (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR)) | 228 | | /* Out-of-range */ | 229 | 0 | goto Return; | 230 | 622k | *p++ = ch; | 231 | 622k | continue; | 232 | 622k | } | 233 | 29.1M | goto InvalidStart; | 234 | 32.7M | } | 235 | 42.5k | ch = 0; | 236 | 80.3M | Return: | 
237 | 80.3M | *inptr = s; | 238 | 80.3M | *outpos = p - dest; | 239 | 80.3M | return ch; | 240 | 62.5M | InvalidStart: | 241 | 62.5M | ch = 1; | 242 | 62.5M | goto Return; | 243 | 16.7M | InvalidContinuation1: | 244 | 16.7M | ch = 2; | 245 | 16.7M | goto Return; | 246 | 983k | InvalidContinuation2: | 247 | 983k | ch = 3; | 248 | 983k | goto Return; | 249 | 104k | InvalidContinuation3: | 250 | 104k | ch = 4; | 251 | 104k | goto Return; | 252 | 42.5k | } |
|
253 | | |
 254 | | #undef ASCII_CHAR_MASK /* only utf8_decode()'s word-at-a-time ASCII check above uses this mask */ |
255 | | |
256 | | |
 257 | | /* UTF-8 encoder specialized for one Unicode kind, so that data[] can be indexed directly instead of going through the slow |
 258 | | PyUnicode_READ() macro. Branches that cannot occur for the kind are compiled out through STRINGLIB_SIZEOF_CHAR: |
 259 | | UCS-1 strings don't need the surrogate or 3-byte branches, and only UCS-4 strings need the 4-byte branch. Initializes writer, encodes data[0..size) into it and returns the position just past the last byte written, or NULL on failure. */ |
 260 | | Py_LOCAL_INLINE(char *) |
 261 | | STRINGLIB(utf8_encoder)(_PyBytesWriter *writer, |
 262 | | PyObject *unicode, /* string object handed to the error-handling helpers */ |
 263 | | const STRINGLIB_CHAR *data, /* characters to encode */ |
 264 | | Py_ssize_t size, /* number of characters in data */ |
 265 | | _Py_error_handler error_handler, /* _Py_ERROR_UNKNOWN: resolved from errors at the first surrogate */ |
 266 | | const char *errors) /* error handler name, passed to _Py_GetErrorHandler() and the generic handler call */ |
 267 | 5.86M | { |
 268 | 5.86M | Py_ssize_t i; /* index into data of next input character */ |
 269 | 5.86M | char *p; /* next free byte in output buffer */ |
 270 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
 271 | | PyObject *error_handler_obj = NULL; |
 272 | | PyObject *exc = NULL; |
 273 | | PyObject *rep = NULL; |
 274 | | #endif |
 275 | | #if STRINGLIB_SIZEOF_CHAR == 1 |
 276 | | const Py_ssize_t max_char_size = 2; /* worst-case output bytes per character: only the 1- and 2-byte branches exist */ |
 277 | | #elif STRINGLIB_SIZEOF_CHAR == 2 |
 278 | | const Py_ssize_t max_char_size = 3; /* worst case is the 3-byte branch */ |
 279 | | #else /* STRINGLIB_SIZEOF_CHAR == 4 */ |
 280 | | const Py_ssize_t max_char_size = 4; /* worst case is the 4-byte branch */ |
 281 | | #endif |
 282 | | |
 283 | 5.86M | assert(size >= 0); |
 284 | 5.86M | if (size > PY_SSIZE_T_MAX / max_char_size) { |
 285 | | /* size * max_char_size below would overflow Py_ssize_t */ |
 286 | 0 | PyErr_NoMemory(); |
 287 | 0 | return NULL; |
 288 | 0 | } |
 289 | | |
 290 | 5.86M | _PyBytesWriter_Init(writer); |
 291 | 5.86M | p = _PyBytesWriter_Alloc(writer, size * max_char_size); |
 292 | 5.86M | if (p == NULL) |
 293 | 0 | return NULL; |
 294 | | |
 295 | 2.62G | for (i = 0; i < size;) { |
 296 | 2.61G | Py_UCS4 ch = data[i++]; |
 297 | | |
 298 | 2.61G | if (ch < 0x80) { |
 299 | | /* ASCII: a single byte, copied unchanged */ |
 300 | 2.48G | *p++ = (char) ch; |
 301 | | |
 302 | 2.48G | } |
 303 | 50.4M | else |
 304 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
 305 | 50.4M | if (ch < 0x0800) |
 306 | 1.16M | #endif |
 307 | 86.1M | { |
 308 | | /* 2-byte sequence: U+0080..U+07FF (for UCS-1 data this is every non-ASCII character) */ |
 309 | 86.1M | *p++ = (char)(0xc0 | (ch >> 6)); |
 310 | 86.1M | *p++ = (char)(0x80 | (ch & 0x3f)); |
 311 | 86.1M | } |
 312 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
 313 | 49.2M | else if (Py_UNICODE_IS_SURROGATE(ch)) { |
 314 | 388k | Py_ssize_t startpos, endpos, newpos; |
 315 | 388k | Py_ssize_t k; |
 316 | 388k | if (error_handler == _Py_ERROR_UNKNOWN) { |
 317 | 213k | error_handler = _Py_GetErrorHandler(errors); |
 318 | 213k | } |
 319 | | |
 320 | 388k | startpos = i-1; /* index of ch; i already points past it */ |
 321 | 388k | endpos = startpos+1; |
 322 | | |
 323 | 19.0M | while ((endpos < size) && Py_UNICODE_IS_SURROGATE(data[endpos])) /* extend [startpos, endpos) over the whole run of consecutive surrogates */ |
 324 | 18.7M | endpos++; |
 325 | | |
 326 | | /* Only overallocate the buffer if it's not the last write, i.e. if input remains after the surrogate run */ |
 327 | 388k | writer->overallocate = (endpos < size); |
 328 | | |
 329 | 388k | switch (error_handler) |
 330 | 388k | { |
 331 | 0 | case _Py_ERROR_REPLACE: /* one '?' per surrogate in the run */ |
 332 | 0 | memset(p, '?', endpos - startpos); |
 333 | 0 | p += (endpos - startpos); |
 334 | 0 | _Py_FALLTHROUGH; |
 335 | 0 | case _Py_ERROR_IGNORE: |
 336 | 0 | i += (endpos - startpos - 1); /* skip the rest of the run; its first character was consumed by data[i++] */ |
 337 | 0 | break; |
 338 | | |
 339 | 0 | case _Py_ERROR_SURROGATEPASS: /* encode each surrogate as an ordinary 3-byte sequence */ |
 340 | 0 | for (k=startpos; k<endpos; k++) { |
 341 | 0 | ch = data[k]; |
 342 | 0 | *p++ = (char)(0xe0 | (ch >> 12)); |
 343 | 0 | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); |
 344 | 0 | *p++ = (char)(0x80 | (ch & 0x3f)); |
 345 | 0 | } |
 346 | 0 | i += (endpos - startpos - 1); |
 347 | 0 | break; |
 348 | | |
 349 | 0 | case _Py_ERROR_BACKSLASHREPLACE: |
 350 | | /* subtract the bytes that were preallocated for the surrogate run */ |
 351 | 0 | writer->min_size -= max_char_size * (endpos - startpos); |
 352 | 0 | p = backslashreplace(writer, p, |
 353 | 0 | unicode, startpos, endpos); |
 354 | 0 | if (p == NULL) |
 355 | 0 | goto error; |
 356 | 0 | i += (endpos - startpos - 1); |
 357 | 0 | break; |
 358 | | |
 359 | 0 | case _Py_ERROR_XMLCHARREFREPLACE: |
 360 | | /* subtract the bytes that were preallocated for the surrogate run */ |
 361 | 0 | writer->min_size -= max_char_size * (endpos - startpos); |
 362 | 0 | p = xmlcharrefreplace(writer, p, |
 363 | 0 | unicode, startpos, endpos); |
 364 | 0 | if (p == NULL) |
 365 | 0 | goto error; |
 366 | 0 | i += (endpos - startpos - 1); |
 367 | 0 | break; |
 368 | | |
 369 | 228k | case _Py_ERROR_SURROGATEESCAPE: /* U+DC80..U+DCFF are written as the single bytes 0x80..0xFF */ |
 370 | 13.7M | for (k=startpos; k<endpos; k++) { |
 371 | 13.5M | ch = data[k]; |
 372 | 13.5M | if (!(0xDC80 <= ch && ch <= 0xDCFF)) |
 373 | 23 | break; |
 374 | 13.5M | *p++ = (char)(ch & 0xff); |
 375 | 13.5M | } |
 376 | 228k | if (k >= endpos) { /* the whole run was escaped */ |
 377 | 228k | i += (endpos - startpos - 1); |
 378 | 228k | break; |
 379 | 228k | } |
 380 | 23 | startpos = k; /* first surrogate outside U+DC80..U+DCFF: the rest of the run goes to the generic handler below */ |
 381 | 23 | assert(startpos < endpos); |
 382 | 23 | _Py_FALLTHROUGH; |
 383 | 159k | default: |
 384 | 159k | rep = unicode_encode_call_errorhandler( |
 385 | 159k | errors, &error_handler_obj, "utf-8", "surrogates not allowed", |
 386 | 159k | unicode, &exc, startpos, endpos, &newpos); |
 387 | 159k | if (!rep) |
 388 | 159k | goto error; |
 389 | | |
 390 | 0 | if (newpos < startpos) { /* handler moved backwards: data[newpos..startpos) will be encoded again, so reserve room for it */ |
 391 | 0 | writer->overallocate = 1; |
 392 | 0 | p = _PyBytesWriter_Prepare(writer, p, |
 393 | 0 | max_char_size * (startpos - newpos)); |
 394 | 0 | if (p == NULL) |
 395 | 0 | goto error; |
 396 | 0 | } |
 397 | 0 | else { |
 398 | | /* subtract the bytes preallocated for the characters in [startpos, newpos), which are replaced by rep */ |
 399 | 0 | writer->min_size -= max_char_size * (newpos - startpos); |
 400 | | /* Only overallocate the buffer if it's not the last write */ |
 401 | 0 | writer->overallocate = (newpos < size); |
 402 | 0 | } |
 403 | | |
 404 | 0 | if (PyBytes_Check(rep)) { |
 405 | 0 | p = _PyBytesWriter_WriteBytes(writer, p, |
 406 | 0 | PyBytes_AS_STRING(rep), |
 407 | 0 | PyBytes_GET_SIZE(rep)); |
 408 | 0 | } |
 409 | 0 | else { |
 410 | | /* rep is unicode: only an ASCII replacement is copied to the output, anything else raises the encode error */ |
 411 | 0 | if (!PyUnicode_IS_ASCII(rep)) { |
 412 | 0 | raise_encode_exception(&exc, "utf-8", unicode, |
 413 | 0 | startpos, endpos, |
 414 | 0 | "surrogates not allowed"); |
 415 | 0 | goto error; |
 416 | 0 | } |
 417 | | |
 418 | 0 | p = _PyBytesWriter_WriteBytes(writer, p, |
 419 | 0 | PyUnicode_DATA(rep), |
 420 | 0 | PyUnicode_GET_LENGTH(rep)); |
 421 | 0 | } |
 422 | | |
 423 | 0 | if (p == NULL) |
 424 | 0 | goto error; |
 425 | 0 | Py_CLEAR(rep); |
 426 | |
 |
 427 | 0 | i = newpos; /* resume at the position returned by the error handler */ |
 428 | 388k | } |
 429 | | |
 430 | | /* If overallocation was disabled, ensure that it was the last |
 431 | | write. Otherwise, we missed an optimization */ |
 432 | 228k | assert(writer->overallocate || i == size); |
 433 | 228k | } |
 434 | 25.4M | else |
 435 | | #if STRINGLIB_SIZEOF_CHAR > 2 |
 436 | 25.4M | if (ch < 0x10000) |
 437 | 25.3M | #endif |
 438 | 48.7M | { /* 3-byte sequence: U+0800..U+FFFF, surrogates excluded by the branch above */ |
 439 | 48.7M | *p++ = (char)(0xe0 | (ch >> 12)); |
 440 | 48.7M | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); |
 441 | 48.7M | *p++ = (char)(0x80 | (ch & 0x3f)); |
 442 | 48.7M | } |
 443 | | #if STRINGLIB_SIZEOF_CHAR > 2 |
 444 | | else /* ch >= 0x10000 */ |
 445 | 148k | { |
 446 | 148k | assert(ch <= MAX_UNICODE); |
 447 | | /* 4-byte sequence: U+10000..MAX_UNICODE (UCS-4 data only) */ |
 448 | 148k | *p++ = (char)(0xf0 | (ch >> 18)); |
 449 | 148k | *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); |
 450 | 148k | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); |
 451 | 148k | *p++ = (char)(0x80 | (ch & 0x3f)); |
 452 | 148k | } |
 453 | | #endif /* STRINGLIB_SIZEOF_CHAR > 2 */ |
 454 | | #endif /* STRINGLIB_SIZEOF_CHAR > 1 */ |
 455 | 2.61G | } |
 456 | | |
 457 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
 458 | 1.27M | Py_XDECREF(error_handler_obj); |
 459 | 1.27M | Py_XDECREF(exc); |
 460 | | #endif |
 461 | 1.27M | return p; |
 462 | | |
 463 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
 464 | 159k | error: /* NOTE(review): only the handler state is released here; presumably the caller deallocates writer - confirm */ |
 465 | 159k | Py_XDECREF(rep); |
 466 | 159k | Py_XDECREF(error_handler_obj); |
 467 | 159k | Py_XDECREF(exc); |
 468 | 159k | return NULL; |
 469 | | #endif |
 470 | 1.43M | } unicodeobject.c:ucs1lib_utf8_encoder Line | Count | Source | 267 | 4.43M | { | 268 | 4.43M | Py_ssize_t i; /* index into data of next input character */ | 269 | 4.43M | char *p; /* next free byte in output buffer */ | 270 | | #if STRINGLIB_SIZEOF_CHAR > 1 | 271 | | PyObject *error_handler_obj = NULL; | 272 | | PyObject *exc = NULL; | 273 | | PyObject *rep = NULL; | 274 | | #endif | 275 | 4.43M | #if STRINGLIB_SIZEOF_CHAR == 1 | 276 | 4.43M | const Py_ssize_t max_char_size = 2; | 277 | | #elif STRINGLIB_SIZEOF_CHAR == 2 | 278 | | const Py_ssize_t max_char_size = 3; | 279 | | #else /* STRINGLIB_SIZEOF_CHAR == 4 */ | 280 | | const Py_ssize_t max_char_size = 4; | 281 | | #endif | 282 | | | 283 | 4.43M | assert(size >= 0); | 284 | 4.43M | if (size > PY_SSIZE_T_MAX / max_char_size) { | 285 | | /* integer overflow */ | 286 | 0 | PyErr_NoMemory(); | 287 | 0 | return NULL; | 288 | 0 | } | 289 | | | 290 | 4.43M | _PyBytesWriter_Init(writer); | 291 | 4.43M | p = _PyBytesWriter_Alloc(writer, size * max_char_size); | 292 | 4.43M | if (p == NULL) | 293 | 0 | return NULL; | 294 | | | 295 | 593M | for (i = 0; i < size;) { | 296 | 588M | Py_UCS4 ch = data[i++]; | 297 | | | 298 | 588M | if (ch < 0x80) { | 299 | | /* Encode ASCII */ | 300 | 503M | *p++ = (char) ch; | 301 | | | 302 | 503M | } | 303 | 85.0M | else | 304 | | #if STRINGLIB_SIZEOF_CHAR > 1 | 305 | | if (ch < 0x0800) | 306 | | #endif | 307 | 85.0M | { | 308 | | /* Encode Latin-1 */ | 309 | 85.0M | *p++ = (char)(0xc0 | (ch >> 6)); | 310 | 85.0M | *p++ = (char)(0x80 | (ch & 0x3f)); | 311 | 85.0M | } | 312 | | #if STRINGLIB_SIZEOF_CHAR > 1 | 313 | | else if (Py_UNICODE_IS_SURROGATE(ch)) { | 314 | | Py_ssize_t startpos, endpos, newpos; | 315 | | Py_ssize_t k; | 316 | | if (error_handler == _Py_ERROR_UNKNOWN) { | 317 | | error_handler = _Py_GetErrorHandler(errors); | 318 | | } | 319 | | | 320 | | startpos = i-1; | 321 | | endpos = startpos+1; | 322 | | | 323 | | while ((endpos < size) && 
Py_UNICODE_IS_SURROGATE(data[endpos])) | 324 | | endpos++; | 325 | | | 326 | | /* Only overallocate the buffer if it's not the last write */ | 327 | | writer->overallocate = (endpos < size); | 328 | | | 329 | | switch (error_handler) | 330 | | { | 331 | | case _Py_ERROR_REPLACE: | 332 | | memset(p, '?', endpos - startpos); | 333 | | p += (endpos - startpos); | 334 | | _Py_FALLTHROUGH; | 335 | | case _Py_ERROR_IGNORE: | 336 | | i += (endpos - startpos - 1); | 337 | | break; | 338 | | | 339 | | case _Py_ERROR_SURROGATEPASS: | 340 | | for (k=startpos; k<endpos; k++) { | 341 | | ch = data[k]; | 342 | | *p++ = (char)(0xe0 | (ch >> 12)); | 343 | | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 344 | | *p++ = (char)(0x80 | (ch & 0x3f)); | 345 | | } | 346 | | i += (endpos - startpos - 1); | 347 | | break; | 348 | | | 349 | | case _Py_ERROR_BACKSLASHREPLACE: | 350 | | /* subtract preallocated bytes */ | 351 | | writer->min_size -= max_char_size * (endpos - startpos); | 352 | | p = backslashreplace(writer, p, | 353 | | unicode, startpos, endpos); | 354 | | if (p == NULL) | 355 | | goto error; | 356 | | i += (endpos - startpos - 1); | 357 | | break; | 358 | | | 359 | | case _Py_ERROR_XMLCHARREFREPLACE: | 360 | | /* subtract preallocated bytes */ | 361 | | writer->min_size -= max_char_size * (endpos - startpos); | 362 | | p = xmlcharrefreplace(writer, p, | 363 | | unicode, startpos, endpos); | 364 | | if (p == NULL) | 365 | | goto error; | 366 | | i += (endpos - startpos - 1); | 367 | | break; | 368 | | | 369 | | case _Py_ERROR_SURROGATEESCAPE: | 370 | | for (k=startpos; k<endpos; k++) { | 371 | | ch = data[k]; | 372 | | if (!(0xDC80 <= ch && ch <= 0xDCFF)) | 373 | | break; | 374 | | *p++ = (char)(ch & 0xff); | 375 | | } | 376 | | if (k >= endpos) { | 377 | | i += (endpos - startpos - 1); | 378 | | break; | 379 | | } | 380 | | startpos = k; | 381 | | assert(startpos < endpos); | 382 | | _Py_FALLTHROUGH; | 383 | | default: | 384 | | rep = unicode_encode_call_errorhandler( | 385 | 
| errors, &error_handler_obj, "utf-8", "surrogates not allowed", | 386 | | unicode, &exc, startpos, endpos, &newpos); | 387 | | if (!rep) | 388 | | goto error; | 389 | | | 390 | | if (newpos < startpos) { | 391 | | writer->overallocate = 1; | 392 | | p = _PyBytesWriter_Prepare(writer, p, | 393 | | max_char_size * (startpos - newpos)); | 394 | | if (p == NULL) | 395 | | goto error; | 396 | | } | 397 | | else { | 398 | | /* subtract preallocated bytes */ | 399 | | writer->min_size -= max_char_size * (newpos - startpos); | 400 | | /* Only overallocate the buffer if it's not the last write */ | 401 | | writer->overallocate = (newpos < size); | 402 | | } | 403 | | | 404 | | if (PyBytes_Check(rep)) { | 405 | | p = _PyBytesWriter_WriteBytes(writer, p, | 406 | | PyBytes_AS_STRING(rep), | 407 | | PyBytes_GET_SIZE(rep)); | 408 | | } | 409 | | else { | 410 | | /* rep is unicode */ | 411 | | if (!PyUnicode_IS_ASCII(rep)) { | 412 | | raise_encode_exception(&exc, "utf-8", unicode, | 413 | | startpos, endpos, | 414 | | "surrogates not allowed"); | 415 | | goto error; | 416 | | } | 417 | | | 418 | | p = _PyBytesWriter_WriteBytes(writer, p, | 419 | | PyUnicode_DATA(rep), | 420 | | PyUnicode_GET_LENGTH(rep)); | 421 | | } | 422 | | | 423 | | if (p == NULL) | 424 | | goto error; | 425 | | Py_CLEAR(rep); | 426 | | | 427 | | i = newpos; | 428 | | } | 429 | | | 430 | | /* If overallocation was disabled, ensure that it was the last | 431 | | write. 
Otherwise, we missed an optimization */ | 432 | | assert(writer->overallocate || i == size); | 433 | | } | 434 | | else | 435 | | #if STRINGLIB_SIZEOF_CHAR > 2 | 436 | | if (ch < 0x10000) | 437 | | #endif | 438 | | { | 439 | | *p++ = (char)(0xe0 | (ch >> 12)); | 440 | | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 441 | | *p++ = (char)(0x80 | (ch & 0x3f)); | 442 | | } | 443 | | #if STRINGLIB_SIZEOF_CHAR > 2 | 444 | | else /* ch >= 0x10000 */ | 445 | | { | 446 | | assert(ch <= MAX_UNICODE); | 447 | | /* Encode UCS4 Unicode ordinals */ | 448 | | *p++ = (char)(0xf0 | (ch >> 18)); | 449 | | *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); | 450 | | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 451 | | *p++ = (char)(0x80 | (ch & 0x3f)); | 452 | | } | 453 | | #endif /* STRINGLIB_SIZEOF_CHAR > 2 */ | 454 | | #endif /* STRINGLIB_SIZEOF_CHAR > 1 */ | 455 | 588M | } | 456 | | | 457 | | #if STRINGLIB_SIZEOF_CHAR > 1 | 458 | | Py_XDECREF(error_handler_obj); | 459 | | Py_XDECREF(exc); | 460 | | #endif | 461 | 4.43M | return p; | 462 | | | 463 | | #if STRINGLIB_SIZEOF_CHAR > 1 | 464 | | error: | 465 | | Py_XDECREF(rep); | 466 | | Py_XDECREF(error_handler_obj); | 467 | | Py_XDECREF(exc); | 468 | | return NULL; | 469 | | #endif | 470 | 4.43M | } |
unicodeobject.c:ucs2lib_utf8_encoder Line | Count | Source | 267 | 1.36M | { | 268 | 1.36M | Py_ssize_t i; /* index into data of next input character */ | 269 | 1.36M | char *p; /* next free byte in output buffer */ | 270 | 1.36M | #if STRINGLIB_SIZEOF_CHAR > 1 | 271 | 1.36M | PyObject *error_handler_obj = NULL; | 272 | 1.36M | PyObject *exc = NULL; | 273 | 1.36M | PyObject *rep = NULL; | 274 | 1.36M | #endif | 275 | | #if STRINGLIB_SIZEOF_CHAR == 1 | 276 | | const Py_ssize_t max_char_size = 2; | 277 | | #elif STRINGLIB_SIZEOF_CHAR == 2 | 278 | | const Py_ssize_t max_char_size = 3; | 279 | | #else /* STRINGLIB_SIZEOF_CHAR == 4 */ | 280 | | const Py_ssize_t max_char_size = 4; | 281 | | #endif | 282 | | | 283 | 1.36M | assert(size >= 0); | 284 | 1.36M | if (size > PY_SSIZE_T_MAX / max_char_size) { | 285 | | /* integer overflow */ | 286 | 0 | PyErr_NoMemory(); | 287 | 0 | return NULL; | 288 | 0 | } | 289 | | | 290 | 1.36M | _PyBytesWriter_Init(writer); | 291 | 1.36M | p = _PyBytesWriter_Alloc(writer, size * max_char_size); | 292 | 1.36M | if (p == NULL) | 293 | 0 | return NULL; | 294 | | | 295 | 967M | for (i = 0; i < size;) { | 296 | 966M | Py_UCS4 ch = data[i++]; | 297 | | | 298 | 966M | if (ch < 0x80) { | 299 | | /* Encode ASCII */ | 300 | 942M | *p++ = (char) ch; | 301 | | | 302 | 942M | } | 303 | 24.1M | else | 304 | 24.1M | #if STRINGLIB_SIZEOF_CHAR > 1 | 305 | 24.1M | if (ch < 0x0800) | 306 | 421k | #endif | 307 | 421k | { | 308 | | /* Encode Latin-1 */ | 309 | 421k | *p++ = (char)(0xc0 | (ch >> 6)); | 310 | 421k | *p++ = (char)(0x80 | (ch & 0x3f)); | 311 | 421k | } | 312 | 23.7M | #if STRINGLIB_SIZEOF_CHAR > 1 | 313 | 23.7M | else if (Py_UNICODE_IS_SURROGATE(ch)) { | 314 | 371k | Py_ssize_t startpos, endpos, newpos; | 315 | 371k | Py_ssize_t k; | 316 | 371k | if (error_handler == _Py_ERROR_UNKNOWN) { | 317 | 205k | error_handler = _Py_GetErrorHandler(errors); | 318 | 205k | } | 319 | | | 320 | 371k | startpos = i-1; | 321 | 371k | endpos = startpos+1; | 322 | 
| | 323 | 18.9M | while ((endpos < size) && Py_UNICODE_IS_SURROGATE(data[endpos])) | 324 | 18.6M | endpos++; | 325 | | | 326 | | /* Only overallocate the buffer if it's not the last write */ | 327 | 371k | writer->overallocate = (endpos < size); | 328 | | | 329 | 371k | switch (error_handler) | 330 | 371k | { | 331 | 0 | case _Py_ERROR_REPLACE: | 332 | 0 | memset(p, '?', endpos - startpos); | 333 | 0 | p += (endpos - startpos); | 334 | 0 | _Py_FALLTHROUGH; | 335 | 0 | case _Py_ERROR_IGNORE: | 336 | 0 | i += (endpos - startpos - 1); | 337 | 0 | break; | 338 | | | 339 | 0 | case _Py_ERROR_SURROGATEPASS: | 340 | 0 | for (k=startpos; k<endpos; k++) { | 341 | 0 | ch = data[k]; | 342 | 0 | *p++ = (char)(0xe0 | (ch >> 12)); | 343 | 0 | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 344 | 0 | *p++ = (char)(0x80 | (ch & 0x3f)); | 345 | 0 | } | 346 | 0 | i += (endpos - startpos - 1); | 347 | 0 | break; | 348 | | | 349 | 0 | case _Py_ERROR_BACKSLASHREPLACE: | 350 | | /* subtract preallocated bytes */ | 351 | 0 | writer->min_size -= max_char_size * (endpos - startpos); | 352 | 0 | p = backslashreplace(writer, p, | 353 | 0 | unicode, startpos, endpos); | 354 | 0 | if (p == NULL) | 355 | 0 | goto error; | 356 | 0 | i += (endpos - startpos - 1); | 357 | 0 | break; | 358 | | | 359 | 0 | case _Py_ERROR_XMLCHARREFREPLACE: | 360 | | /* subtract preallocated bytes */ | 361 | 0 | writer->min_size -= max_char_size * (endpos - startpos); | 362 | 0 | p = xmlcharrefreplace(writer, p, | 363 | 0 | unicode, startpos, endpos); | 364 | 0 | if (p == NULL) | 365 | 0 | goto error; | 366 | 0 | i += (endpos - startpos - 1); | 367 | 0 | break; | 368 | | | 369 | 217k | case _Py_ERROR_SURROGATEESCAPE: | 370 | 13.6M | for (k=startpos; k<endpos; k++) { | 371 | 13.4M | ch = data[k]; | 372 | 13.4M | if (!(0xDC80 <= ch && ch <= 0xDCFF)) | 373 | 15 | break; | 374 | 13.4M | *p++ = (char)(ch & 0xff); | 375 | 13.4M | } | 376 | 217k | if (k >= endpos) { | 377 | 217k | i += (endpos - startpos - 1); | 378 | 217k | 
break; | 379 | 217k | } | 380 | 15 | startpos = k; | 381 | 15 | assert(startpos < endpos); | 382 | 15 | _Py_FALLTHROUGH; | 383 | 154k | default: | 384 | 154k | rep = unicode_encode_call_errorhandler( | 385 | 154k | errors, &error_handler_obj, "utf-8", "surrogates not allowed", | 386 | 154k | unicode, &exc, startpos, endpos, &newpos); | 387 | 154k | if (!rep) | 388 | 154k | goto error; | 389 | | | 390 | 0 | if (newpos < startpos) { | 391 | 0 | writer->overallocate = 1; | 392 | 0 | p = _PyBytesWriter_Prepare(writer, p, | 393 | 0 | max_char_size * (startpos - newpos)); | 394 | 0 | if (p == NULL) | 395 | 0 | goto error; | 396 | 0 | } | 397 | 0 | else { | 398 | | /* subtract preallocated bytes */ | 399 | 0 | writer->min_size -= max_char_size * (newpos - startpos); | 400 | | /* Only overallocate the buffer if it's not the last write */ | 401 | 0 | writer->overallocate = (newpos < size); | 402 | 0 | } | 403 | | | 404 | 0 | if (PyBytes_Check(rep)) { | 405 | 0 | p = _PyBytesWriter_WriteBytes(writer, p, | 406 | 0 | PyBytes_AS_STRING(rep), | 407 | 0 | PyBytes_GET_SIZE(rep)); | 408 | 0 | } | 409 | 0 | else { | 410 | | /* rep is unicode */ | 411 | 0 | if (!PyUnicode_IS_ASCII(rep)) { | 412 | 0 | raise_encode_exception(&exc, "utf-8", unicode, | 413 | 0 | startpos, endpos, | 414 | 0 | "surrogates not allowed"); | 415 | 0 | goto error; | 416 | 0 | } | 417 | | | 418 | 0 | p = _PyBytesWriter_WriteBytes(writer, p, | 419 | 0 | PyUnicode_DATA(rep), | 420 | 0 | PyUnicode_GET_LENGTH(rep)); | 421 | 0 | } | 422 | | | 423 | 0 | if (p == NULL) | 424 | 0 | goto error; | 425 | 0 | Py_CLEAR(rep); | 426 | |
| 427 | 0 | i = newpos; | 428 | 371k | } | 429 | | | 430 | | /* If overallocation was disabled, ensure that it was the last | 431 | | write. Otherwise, we missed an optimization */ | 432 | 217k | assert(writer->overallocate || i == size); | 433 | 217k | } | 434 | 23.4M | else | 435 | | #if STRINGLIB_SIZEOF_CHAR > 2 | 436 | | if (ch < 0x10000) | 437 | | #endif | 438 | 23.4M | { | 439 | 23.4M | *p++ = (char)(0xe0 | (ch >> 12)); | 440 | 23.4M | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 441 | 23.4M | *p++ = (char)(0x80 | (ch & 0x3f)); | 442 | 23.4M | } | 443 | | #if STRINGLIB_SIZEOF_CHAR > 2 | 444 | | else /* ch >= 0x10000 */ | 445 | | { | 446 | | assert(ch <= MAX_UNICODE); | 447 | | /* Encode UCS4 Unicode ordinals */ | 448 | | *p++ = (char)(0xf0 | (ch >> 18)); | 449 | | *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); | 450 | | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 451 | | *p++ = (char)(0x80 | (ch & 0x3f)); | 452 | | } | 453 | | #endif /* STRINGLIB_SIZEOF_CHAR > 2 */ | 454 | 966M | #endif /* STRINGLIB_SIZEOF_CHAR > 1 */ | 455 | 966M | } | 456 | | | 457 | 1.21M | #if STRINGLIB_SIZEOF_CHAR > 1 | 458 | 1.21M | Py_XDECREF(error_handler_obj); | 459 | 1.21M | Py_XDECREF(exc); | 460 | 1.21M | #endif | 461 | 1.21M | return p; | 462 | | | 463 | 0 | #if STRINGLIB_SIZEOF_CHAR > 1 | 464 | 154k | error: | 465 | 154k | Py_XDECREF(rep); | 466 | 154k | Py_XDECREF(error_handler_obj); | 467 | 154k | Py_XDECREF(exc); | 468 | 154k | return NULL; | 469 | 1.36M | #endif | 470 | 1.36M | } |
unicodeobject.c:ucs4lib_utf8_encoder Line | Count | Source | 267 | 66.3k | { | 268 | 66.3k | Py_ssize_t i; /* index into data of next input character */ | 269 | 66.3k | char *p; /* next free byte in output buffer */ | 270 | 66.3k | #if STRINGLIB_SIZEOF_CHAR > 1 | 271 | 66.3k | PyObject *error_handler_obj = NULL; | 272 | 66.3k | PyObject *exc = NULL; | 273 | 66.3k | PyObject *rep = NULL; | 274 | 66.3k | #endif | 275 | | #if STRINGLIB_SIZEOF_CHAR == 1 | 276 | | const Py_ssize_t max_char_size = 2; | 277 | | #elif STRINGLIB_SIZEOF_CHAR == 2 | 278 | | const Py_ssize_t max_char_size = 3; | 279 | | #else /* STRINGLIB_SIZEOF_CHAR == 4 */ | 280 | 66.3k | const Py_ssize_t max_char_size = 4; | 281 | 66.3k | #endif | 282 | | | 283 | 66.3k | assert(size >= 0); | 284 | 66.3k | if (size > PY_SSIZE_T_MAX / max_char_size) { | 285 | | /* integer overflow */ | 286 | 0 | PyErr_NoMemory(); | 287 | 0 | return NULL; | 288 | 0 | } | 289 | | | 290 | 66.3k | _PyBytesWriter_Init(writer); | 291 | 66.3k | p = _PyBytesWriter_Alloc(writer, size * max_char_size); | 292 | 66.3k | if (p == NULL) | 293 | 0 | return NULL; | 294 | | | 295 | 1.06G | for (i = 0; i < size;) { | 296 | 1.06G | Py_UCS4 ch = data[i++]; | 297 | | | 298 | 1.06G | if (ch < 0x80) { | 299 | | /* Encode ASCII */ | 300 | 1.03G | *p++ = (char) ch; | 301 | | | 302 | 1.03G | } | 303 | 26.2M | else | 304 | 26.2M | #if STRINGLIB_SIZEOF_CHAR > 1 | 305 | 26.2M | if (ch < 0x0800) | 306 | 741k | #endif | 307 | 741k | { | 308 | | /* Encode Latin-1 */ | 309 | 741k | *p++ = (char)(0xc0 | (ch >> 6)); | 310 | 741k | *p++ = (char)(0x80 | (ch & 0x3f)); | 311 | 741k | } | 312 | 25.4M | #if STRINGLIB_SIZEOF_CHAR > 1 | 313 | 25.4M | else if (Py_UNICODE_IS_SURROGATE(ch)) { | 314 | 16.3k | Py_ssize_t startpos, endpos, newpos; | 315 | 16.3k | Py_ssize_t k; | 316 | 16.3k | if (error_handler == _Py_ERROR_UNKNOWN) { | 317 | 8.45k | error_handler = _Py_GetErrorHandler(errors); | 318 | 8.45k | } | 319 | | | 320 | 16.3k | startpos = i-1; | 321 | 16.3k | 
endpos = startpos+1; | 322 | | | 323 | 102k | while ((endpos < size) && Py_UNICODE_IS_SURROGATE(data[endpos])) | 324 | 85.6k | endpos++; | 325 | | | 326 | | /* Only overallocate the buffer if it's not the last write */ | 327 | 16.3k | writer->overallocate = (endpos < size); | 328 | | | 329 | 16.3k | switch (error_handler) | 330 | 16.3k | { | 331 | 0 | case _Py_ERROR_REPLACE: | 332 | 0 | memset(p, '?', endpos - startpos); | 333 | 0 | p += (endpos - startpos); | 334 | 0 | _Py_FALLTHROUGH; | 335 | 0 | case _Py_ERROR_IGNORE: | 336 | 0 | i += (endpos - startpos - 1); | 337 | 0 | break; | 338 | | | 339 | 0 | case _Py_ERROR_SURROGATEPASS: | 340 | 0 | for (k=startpos; k<endpos; k++) { | 341 | 0 | ch = data[k]; | 342 | 0 | *p++ = (char)(0xe0 | (ch >> 12)); | 343 | 0 | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 344 | 0 | *p++ = (char)(0x80 | (ch & 0x3f)); | 345 | 0 | } | 346 | 0 | i += (endpos - startpos - 1); | 347 | 0 | break; | 348 | | | 349 | 0 | case _Py_ERROR_BACKSLASHREPLACE: | 350 | | /* subtract preallocated bytes */ | 351 | 0 | writer->min_size -= max_char_size * (endpos - startpos); | 352 | 0 | p = backslashreplace(writer, p, | 353 | 0 | unicode, startpos, endpos); | 354 | 0 | if (p == NULL) | 355 | 0 | goto error; | 356 | 0 | i += (endpos - startpos - 1); | 357 | 0 | break; | 358 | | | 359 | 0 | case _Py_ERROR_XMLCHARREFREPLACE: | 360 | | /* subtract preallocated bytes */ | 361 | 0 | writer->min_size -= max_char_size * (endpos - startpos); | 362 | 0 | p = xmlcharrefreplace(writer, p, | 363 | 0 | unicode, startpos, endpos); | 364 | 0 | if (p == NULL) | 365 | 0 | goto error; | 366 | 0 | i += (endpos - startpos - 1); | 367 | 0 | break; | 368 | | | 369 | 11.0k | case _Py_ERROR_SURROGATEESCAPE: | 370 | 103k | for (k=startpos; k<endpos; k++) { | 371 | 92.5k | ch = data[k]; | 372 | 92.5k | if (!(0xDC80 <= ch && ch <= 0xDCFF)) | 373 | 8 | break; | 374 | 92.5k | *p++ = (char)(ch & 0xff); | 375 | 92.5k | } | 376 | 11.0k | if (k >= endpos) { | 377 | 11.0k | i += (endpos - 
startpos - 1); | 378 | 11.0k | break; | 379 | 11.0k | } | 380 | 8 | startpos = k; | 381 | 8 | assert(startpos < endpos); | 382 | 8 | _Py_FALLTHROUGH; | 383 | 5.30k | default: | 384 | 5.30k | rep = unicode_encode_call_errorhandler( | 385 | 5.30k | errors, &error_handler_obj, "utf-8", "surrogates not allowed", | 386 | 5.30k | unicode, &exc, startpos, endpos, &newpos); | 387 | 5.30k | if (!rep) | 388 | 5.30k | goto error; | 389 | | | 390 | 0 | if (newpos < startpos) { | 391 | 0 | writer->overallocate = 1; | 392 | 0 | p = _PyBytesWriter_Prepare(writer, p, | 393 | 0 | max_char_size * (startpos - newpos)); | 394 | 0 | if (p == NULL) | 395 | 0 | goto error; | 396 | 0 | } | 397 | 0 | else { | 398 | | /* subtract preallocated bytes */ | 399 | 0 | writer->min_size -= max_char_size * (newpos - startpos); | 400 | | /* Only overallocate the buffer if it's not the last write */ | 401 | 0 | writer->overallocate = (newpos < size); | 402 | 0 | } | 403 | | | 404 | 0 | if (PyBytes_Check(rep)) { | 405 | 0 | p = _PyBytesWriter_WriteBytes(writer, p, | 406 | 0 | PyBytes_AS_STRING(rep), | 407 | 0 | PyBytes_GET_SIZE(rep)); | 408 | 0 | } | 409 | 0 | else { | 410 | | /* rep is unicode */ | 411 | 0 | if (!PyUnicode_IS_ASCII(rep)) { | 412 | 0 | raise_encode_exception(&exc, "utf-8", unicode, | 413 | 0 | startpos, endpos, | 414 | 0 | "surrogates not allowed"); | 415 | 0 | goto error; | 416 | 0 | } | 417 | | | 418 | 0 | p = _PyBytesWriter_WriteBytes(writer, p, | 419 | 0 | PyUnicode_DATA(rep), | 420 | 0 | PyUnicode_GET_LENGTH(rep)); | 421 | 0 | } | 422 | | | 423 | 0 | if (p == NULL) | 424 | 0 | goto error; | 425 | 0 | Py_CLEAR(rep); | 426 | |
| 427 | 0 | i = newpos; | 428 | 16.3k | } | 429 | | | 430 | | /* If overallocation was disabled, ensure that it was the last | 431 | | write. Otherwise, we missed an optimization */ | 432 | 11.0k | assert(writer->overallocate || i == size); | 433 | 11.0k | } | 434 | 25.4M | else | 435 | 25.4M | #if STRINGLIB_SIZEOF_CHAR > 2 | 436 | 25.4M | if (ch < 0x10000) | 437 | 25.3M | #endif | 438 | 25.3M | { | 439 | 25.3M | *p++ = (char)(0xe0 | (ch >> 12)); | 440 | 25.3M | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 441 | 25.3M | *p++ = (char)(0x80 | (ch & 0x3f)); | 442 | 25.3M | } | 443 | 148k | #if STRINGLIB_SIZEOF_CHAR > 2 | 444 | 148k | else /* ch >= 0x10000 */ | 445 | 148k | { | 446 | 148k | assert(ch <= MAX_UNICODE); | 447 | | /* Encode UCS4 Unicode ordinals */ | 448 | 148k | *p++ = (char)(0xf0 | (ch >> 18)); | 449 | 148k | *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); | 450 | 148k | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 451 | 148k | *p++ = (char)(0x80 | (ch & 0x3f)); | 452 | 148k | } | 453 | 1.06G | #endif /* STRINGLIB_SIZEOF_CHAR > 2 */ | 454 | 1.06G | #endif /* STRINGLIB_SIZEOF_CHAR > 1 */ | 455 | 1.06G | } | 456 | | | 457 | 61.0k | #if STRINGLIB_SIZEOF_CHAR > 1 | 458 | 61.0k | Py_XDECREF(error_handler_obj); | 459 | 61.0k | Py_XDECREF(exc); | 460 | 61.0k | #endif | 461 | 61.0k | return p; | 462 | | | 463 | 0 | #if STRINGLIB_SIZEOF_CHAR > 1 | 464 | 5.30k | error: | 465 | 5.30k | Py_XDECREF(rep); | 466 | 5.30k | Py_XDECREF(error_handler_obj); | 467 | 5.30k | Py_XDECREF(exc); | 468 | 5.30k | return NULL; | 469 | 66.3k | #endif | 470 | 66.3k | } |
Unexecuted instantiation: unicodeobject.c:asciilib_utf8_encoder |
471 | | |
472 | | /* The pattern for constructing UCS2-repeated masks: a C 'long' holding 1 in every 16-bit lane, so multiplying it by a 16-bit constant copies that constant into each lane. */ |
473 | | #if SIZEOF_LONG == 8 |
474 | 534k | # define UCS2_REPEAT_MASK 0x0001000100010001ul |
475 | | #elif SIZEOF_LONG == 4 |
476 | | # define UCS2_REPEAT_MASK 0x00010001ul |
477 | | #else |
478 | | # error C 'long' size should be either 4 or 8! |
479 | | #endif |
480 | | |
481 | | /* The mask for fast checking; which bits it covers depends on the character width being instantiated. */ |
482 | | #if STRINGLIB_SIZEOF_CHAR == 1 |
483 | | /* The mask for fast checking of whether a C 'long' contains any |
484 | | non-ASCII or non-Latin1 UTF16-encoded characters: in every 16-bit lane it selects the bits that are clear in STRINGLIB_MAX_CHAR. */ |
485 | 10.0k | # define FAST_CHAR_MASK (UCS2_REPEAT_MASK * (0xFFFFu & ~STRINGLIB_MAX_CHAR)) |
486 | | #else |
487 | | /* The mask for fast checking of whether a C 'long' may contain |
488 | | UTF16-encoded surrogate characters. This is an efficient heuristic, |
489 | | assuming that non-surrogate characters with a code point >= 0x8000 are |
490 | | rare in most input: only the top bit of each 16-bit lane is tested. |
491 | | */ |
492 | 493k | # define FAST_CHAR_MASK (UCS2_REPEAT_MASK * 0x8000u) |
493 | | #endif |
494 | | /* The mask for fast byte-swapping: the low byte of every 16-bit lane. */ |
495 | 31.5k | #define STRIPPED_MASK (UCS2_REPEAT_MASK * 0x00FFu) |
496 | | /* Swap bytes: exchange the two bytes inside every 16-bit lane of 'value'. */ |
497 | 15.7k | #define SWAB(value) ((((value) >> 8) & STRIPPED_MASK) | \ |
498 | 15.7k | (((value) & STRIPPED_MASK) << 8)) |
499 | | /* Decode UTF-16 code units from *inptr (up to, not including, e) into dest starting at index *outpos. On return *inptr and *outpos are updated to the positions reached. Returns 0 when fewer than two input bytes remain, 1 for a high surrogate with no complete unit after it, 2 for a surrogate that is not a high surrogate, 3 for a high surrogate followed by a unit that is not a low surrogate; any other value is a decoded code point that this instantiation does not store (above STRINGLIB_MAX_CHAR, or any surrogate pair when STRINGLIB_SIZEOF_CHAR < 4). native_ordering selects whether units are read in host byte order (non-zero) or byte-swapped (zero). */ |
500 | | Py_LOCAL_INLINE(Py_UCS4) |
501 | | STRINGLIB(utf16_decode)(const unsigned char **inptr, const unsigned char *e, |
502 | | STRINGLIB_CHAR *dest, Py_ssize_t *outpos, |
503 | | int native_ordering) |
504 | 43.6k | { |
505 | 43.6k | Py_UCS4 ch; |
506 | 43.6k | const unsigned char *q = *inptr; |
507 | 43.6k | STRINGLIB_CHAR *p = dest + *outpos; |
508 | | /* Offsets from q for retrieving byte pairs in the right order. */ |
509 | 43.6k | #if PY_LITTLE_ENDIAN |
510 | 43.6k | int ihi = !!native_ordering, ilo = !native_ordering; |
511 | | #else |
512 | | int ihi = !native_ordering, ilo = !!native_ordering; |
513 | | #endif |
514 | 43.6k | --e; /* from here on, q < e guarantees both bytes of a unit are readable at q */ |
515 | | |
516 | 204k | while (q < e) { |
517 | 199k | Py_UCS4 ch2; |
518 | | /* First check for possible aligned read of a C 'long'. Unaligned |
519 | | reads are more expensive, better to defer to another iteration. */ |
520 | 199k | if (_Py_IS_ALIGNED(q, ALIGNOF_LONG)) { |
521 | | /* Fast path for runs of in-range non-surrogate chars. */ |
522 | 54.1k | const unsigned char *_q = q; |
523 | 525k | while (_q + SIZEOF_LONG <= e) { |
524 | 514k | unsigned long block = * (const unsigned long *) _q; /* SIZEOF_LONG / 2 code units at once */ |
525 | 514k | if (native_ordering) { |
526 | | /* Can use buffer directly */ |
527 | 503k | if (block & FAST_CHAR_MASK) |
528 | 37.6k | break; |
529 | 503k | } |
530 | 11.5k | else { |
531 | | /* Need to byte-swap */ |
532 | 11.5k | if (block & SWAB(FAST_CHAR_MASK)) |
533 | 6.24k | break; |
534 | | #if STRINGLIB_SIZEOF_CHAR == 1 |
535 | 1.00k | block >>= 8; /* the mask test above found the low byte of every lane clear (given STRINGLIB_MAX_CHAR <= 0xFF), so shifting the high byte down is all the swap that is needed */ |
536 | | #else |
537 | 4.26k | block = SWAB(block); |
538 | | #endif |
539 | 4.26k | } |
540 | 470k | #if PY_LITTLE_ENDIAN |
541 | | # if SIZEOF_LONG == 4 |
542 | | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); |
543 | | p[1] = (STRINGLIB_CHAR)(block >> 16); |
544 | | # elif SIZEOF_LONG == 8 |
545 | 470k | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); |
546 | 470k | p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); |
547 | 470k | p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); |
548 | 470k | p[3] = (STRINGLIB_CHAR)(block >> 48); |
549 | 470k | # endif |
550 | | #else |
551 | | # if SIZEOF_LONG == 4 |
552 | | p[0] = (STRINGLIB_CHAR)(block >> 16); |
553 | | p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu); |
554 | | # elif SIZEOF_LONG == 8 |
555 | | p[0] = (STRINGLIB_CHAR)(block >> 48); |
556 | | p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); |
557 | | p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); |
558 | | p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu); |
559 | | # endif |
560 | | #endif |
561 | 470k | _q += SIZEOF_LONG; |
562 | 470k | p += SIZEOF_LONG / 2; |
563 | 470k | } |
564 | 54.1k | q = _q; |
565 | 54.1k | if (q >= e) |
566 | 932 | break; |
567 | 54.1k | } |
568 | | |
569 | 198k | ch = (q[ihi] << 8) | q[ilo]; |
570 | 198k | q += 2; |
571 | 198k | if (!Py_UNICODE_IS_SURROGATE(ch)) { |
572 | | #if STRINGLIB_SIZEOF_CHAR < 2 |
573 | 26.7k | if (ch > STRINGLIB_MAX_CHAR) |
574 | | /* Out-of-range: return the code point to the caller instead of storing it */ |
575 | 11.7k | goto Return; |
576 | 14.9k | #endif |
577 | 14.9k | *p++ = (STRINGLIB_CHAR)ch; |
578 | 14.9k | continue; |
579 | 171k | } |
580 | | |
581 | | /* UTF-16 code pair: */ |
582 | 26.7k | if (!Py_UNICODE_IS_HIGH_SURROGATE(ch)) |
583 | 14.1k | goto IllegalEncoding; |
584 | 12.5k | if (q >= e) |
585 | 1.79k | goto UnexpectedEnd; |
586 | 10.7k | ch2 = (q[ihi] << 8) | q[ilo]; |
587 | 10.7k | q += 2; |
588 | 10.7k | if (!Py_UNICODE_IS_LOW_SURROGATE(ch2)) |
589 | 6.56k | goto IllegalSurrogate; |
590 | 4.20k | ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2); |
591 | | #if STRINGLIB_SIZEOF_CHAR < 4 |
592 | | /* Out-of-range: the joined code point is returned, not stored */ |
593 | 3.54k | goto Return; |
594 | | #else |
595 | | *p++ = (STRINGLIB_CHAR)ch; |
596 | | #endif |
597 | 663 | } |
598 | 5.86k | ch = 0; /* fewer than two bytes left to decode */ |
599 | 43.6k | Return: |
600 | 43.6k | *inptr = q; |
601 | 43.6k | *outpos = p - dest; |
602 | 43.6k | return ch; |
603 | 1.79k | UnexpectedEnd: |
604 | 1.79k | ch = 1; /* high surrogate with no complete unit after it */ |
605 | 1.79k | goto Return; |
606 | 14.1k | IllegalEncoding: |
607 | 14.1k | ch = 2; /* surrogate that is not a high surrogate */ |
608 | 14.1k | goto Return; |
609 | 6.56k | IllegalSurrogate: |
610 | 6.56k | ch = 3; /* high surrogate followed by a unit that is not a low surrogate */ |
611 | 6.56k | goto Return; |
612 | 5.86k | } unicodeobject.c:asciilib_utf16_decode Line | Count | Source | 504 | 12.8k | { | 505 | 12.8k | Py_UCS4 ch; | 506 | 12.8k | const unsigned char *q = *inptr; | 507 | 12.8k | STRINGLIB_CHAR *p = dest + *outpos; | 508 | | /* Offsets from q for retrieving byte pairs in the right order. */ | 509 | 12.8k | #if PY_LITTLE_ENDIAN | 510 | 12.8k | int ihi = !!native_ordering, ilo = !native_ordering; | 511 | | #else | 512 | | int ihi = !native_ordering, ilo = !!native_ordering; | 513 | | #endif | 514 | 12.8k | --e; | 515 | | | 516 | 22.4k | while (q < e) { | 517 | 21.9k | Py_UCS4 ch2; | 518 | | /* First check for possible aligned read of a C 'long'. Unaligned | 519 | | reads are more expensive, better to defer to another iteration. */ | 520 | 21.9k | if (_Py_IS_ALIGNED(q, ALIGNOF_LONG)) { | 521 | | /* Fast path for runs of in-range non-surrogate chars. */ | 522 | 12.0k | const unsigned char *_q = q; | 523 | 14.8k | while (_q + SIZEOF_LONG <= e) { | 524 | 10.5k | unsigned long block = * (const unsigned long *) _q; | 525 | 10.5k | if (native_ordering) { | 526 | | /* Can use buffer directly */ | 527 | 8.12k | if (block & FAST_CHAR_MASK) | 528 | 6.08k | break; | 529 | 8.12k | } | 530 | 2.39k | else { | 531 | | /* Need to byte-swap */ | 532 | 2.39k | if (block & SWAB(FAST_CHAR_MASK)) | 533 | 1.64k | break; | 534 | 746 | #if STRINGLIB_SIZEOF_CHAR == 1 | 535 | 746 | block >>= 8; | 536 | | #else | 537 | | block = SWAB(block); | 538 | | #endif | 539 | 746 | } | 540 | 2.79k | #if PY_LITTLE_ENDIAN | 541 | | # if SIZEOF_LONG == 4 | 542 | | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 543 | | p[1] = (STRINGLIB_CHAR)(block >> 16); | 544 | | # elif SIZEOF_LONG == 8 | 545 | 2.79k | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 546 | 2.79k | p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); | 547 | 2.79k | p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); | 548 | 2.79k | p[3] = (STRINGLIB_CHAR)(block >> 48); | 549 | 2.79k | # endif | 550 | | #else | 551 | | # if SIZEOF_LONG == 4 | 
552 | | p[0] = (STRINGLIB_CHAR)(block >> 16); | 553 | | p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 554 | | # elif SIZEOF_LONG == 8 | 555 | | p[0] = (STRINGLIB_CHAR)(block >> 48); | 556 | | p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); | 557 | | p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); | 558 | | p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 559 | | # endif | 560 | | #endif | 561 | 2.79k | _q += SIZEOF_LONG; | 562 | 2.79k | p += SIZEOF_LONG / 2; | 563 | 2.79k | } | 564 | 12.0k | q = _q; | 565 | 12.0k | if (q >= e) | 566 | 297 | break; | 567 | 12.0k | } | 568 | | | 569 | 21.6k | ch = (q[ihi] << 8) | q[ilo]; | 570 | 21.6k | q += 2; | 571 | 21.6k | if (!Py_UNICODE_IS_SURROGATE(ch)) { | 572 | 19.8k | #if STRINGLIB_SIZEOF_CHAR < 2 | 573 | 19.8k | if (ch > STRINGLIB_MAX_CHAR) | 574 | | /* Out-of-range */ | 575 | 10.1k | goto Return; | 576 | 9.62k | #endif | 577 | 9.62k | *p++ = (STRINGLIB_CHAR)ch; | 578 | 9.62k | continue; | 579 | 19.8k | } | 580 | | | 581 | | /* UTF-16 code pair: */ | 582 | 1.84k | if (!Py_UNICODE_IS_HIGH_SURROGATE(ch)) | 583 | 972 | goto IllegalEncoding; | 584 | 872 | if (q >= e) | 585 | 245 | goto UnexpectedEnd; | 586 | 627 | ch2 = (q[ihi] << 8) | q[ilo]; | 587 | 627 | q += 2; | 588 | 627 | if (!Py_UNICODE_IS_LOW_SURROGATE(ch2)) | 589 | 240 | goto IllegalSurrogate; | 590 | 387 | ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2); | 591 | 387 | #if STRINGLIB_SIZEOF_CHAR < 4 | 592 | | /* Out-of-range */ | 593 | 387 | goto Return; | 594 | | #else | 595 | | *p++ = (STRINGLIB_CHAR)ch; | 596 | | #endif | 597 | 627 | } | 598 | 763 | ch = 0; | 599 | 12.8k | Return: | 600 | 12.8k | *inptr = q; | 601 | 12.8k | *outpos = p - dest; | 602 | 12.8k | return ch; | 603 | 245 | UnexpectedEnd: | 604 | 245 | ch = 1; | 605 | 245 | goto Return; | 606 | 972 | IllegalEncoding: | 607 | 972 | ch = 2; | 608 | 972 | goto Return; | 609 | 240 | IllegalSurrogate: | 610 | 240 | ch = 3; | 611 | 240 | goto Return; | 612 | 763 | } |
unicodeobject.c:ucs1lib_utf16_decode Line | Count | Source | 504 | 3.18k | { | 505 | 3.18k | Py_UCS4 ch; | 506 | 3.18k | const unsigned char *q = *inptr; | 507 | 3.18k | STRINGLIB_CHAR *p = dest + *outpos; | 508 | | /* Offsets from q for retrieving byte pairs in the right order. */ | 509 | 3.18k | #if PY_LITTLE_ENDIAN | 510 | 3.18k | int ihi = !!native_ordering, ilo = !native_ordering; | 511 | | #else | 512 | | int ihi = !native_ordering, ilo = !!native_ordering; | 513 | | #endif | 514 | 3.18k | --e; | 515 | | | 516 | 8.56k | while (q < e) { | 517 | 8.40k | Py_UCS4 ch2; | 518 | | /* First check for possible aligned read of a C 'long'. Unaligned | 519 | | reads are more expensive, better to defer to another iteration. */ | 520 | 8.40k | if (_Py_IS_ALIGNED(q, ALIGNOF_LONG)) { | 521 | | /* Fast path for runs of in-range non-surrogate chars. */ | 522 | 1.46k | const unsigned char *_q = q; | 523 | 2.61k | while (_q + SIZEOF_LONG <= e) { | 524 | 2.40k | unsigned long block = * (const unsigned long *) _q; | 525 | 2.40k | if (native_ordering) { | 526 | | /* Can use buffer directly */ | 527 | 1.94k | if (block & FAST_CHAR_MASK) | 528 | 1.06k | break; | 529 | 1.94k | } | 530 | 457 | else { | 531 | | /* Need to byte-swap */ | 532 | 457 | if (block & SWAB(FAST_CHAR_MASK)) | 533 | 196 | break; | 534 | 261 | #if STRINGLIB_SIZEOF_CHAR == 1 | 535 | 261 | block >>= 8; | 536 | | #else | 537 | | block = SWAB(block); | 538 | | #endif | 539 | 261 | } | 540 | 1.15k | #if PY_LITTLE_ENDIAN | 541 | | # if SIZEOF_LONG == 4 | 542 | | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 543 | | p[1] = (STRINGLIB_CHAR)(block >> 16); | 544 | | # elif SIZEOF_LONG == 8 | 545 | 1.15k | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 546 | 1.15k | p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); | 547 | 1.15k | p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); | 548 | 1.15k | p[3] = (STRINGLIB_CHAR)(block >> 48); | 549 | 1.15k | # endif | 550 | | #else | 551 | | # if SIZEOF_LONG == 4 | 552 | | p[0] = 
(STRINGLIB_CHAR)(block >> 16); | 553 | | p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 554 | | # elif SIZEOF_LONG == 8 | 555 | | p[0] = (STRINGLIB_CHAR)(block >> 48); | 556 | | p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); | 557 | | p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); | 558 | | p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 559 | | # endif | 560 | | #endif | 561 | 1.15k | _q += SIZEOF_LONG; | 562 | 1.15k | p += SIZEOF_LONG / 2; | 563 | 1.15k | } | 564 | 1.46k | q = _q; | 565 | 1.46k | if (q >= e) | 566 | 116 | break; | 567 | 1.46k | } | 568 | | | 569 | 8.28k | ch = (q[ihi] << 8) | q[ilo]; | 570 | 8.28k | q += 2; | 571 | 8.28k | if (!Py_UNICODE_IS_SURROGATE(ch)) { | 572 | 6.89k | #if STRINGLIB_SIZEOF_CHAR < 2 | 573 | 6.89k | if (ch > STRINGLIB_MAX_CHAR) | 574 | | /* Out-of-range */ | 575 | 1.52k | goto Return; | 576 | 5.37k | #endif | 577 | 5.37k | *p++ = (STRINGLIB_CHAR)ch; | 578 | 5.37k | continue; | 579 | 6.89k | } | 580 | | | 581 | | /* UTF-16 code pair: */ | 582 | 1.38k | if (!Py_UNICODE_IS_HIGH_SURROGATE(ch)) | 583 | 182 | goto IllegalEncoding; | 584 | 1.20k | if (q >= e) | 585 | 79 | goto UnexpectedEnd; | 586 | 1.12k | ch2 = (q[ihi] << 8) | q[ilo]; | 587 | 1.12k | q += 2; | 588 | 1.12k | if (!Py_UNICODE_IS_LOW_SURROGATE(ch2)) | 589 | 640 | goto IllegalSurrogate; | 590 | 488 | ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2); | 591 | 488 | #if STRINGLIB_SIZEOF_CHAR < 4 | 592 | | /* Out-of-range */ | 593 | 488 | goto Return; | 594 | | #else | 595 | | *p++ = (STRINGLIB_CHAR)ch; | 596 | | #endif | 597 | 1.12k | } | 598 | 274 | ch = 0; | 599 | 3.18k | Return: | 600 | 3.18k | *inptr = q; | 601 | 3.18k | *outpos = p - dest; | 602 | 3.18k | return ch; | 603 | 79 | UnexpectedEnd: | 604 | 79 | ch = 1; | 605 | 79 | goto Return; | 606 | 182 | IllegalEncoding: | 607 | 182 | ch = 2; | 608 | 182 | goto Return; | 609 | 640 | IllegalSurrogate: | 610 | 640 | ch = 3; | 611 | 640 | goto Return; | 612 | 274 | } |
unicodeobject.c:ucs2lib_utf16_decode Line | Count | Source | 504 | 11.4k | { | 505 | 11.4k | Py_UCS4 ch; | 506 | 11.4k | const unsigned char *q = *inptr; | 507 | 11.4k | STRINGLIB_CHAR *p = dest + *outpos; | 508 | | /* Offsets from q for retrieving byte pairs in the right order. */ | 509 | 11.4k | #if PY_LITTLE_ENDIAN | 510 | 11.4k | int ihi = !!native_ordering, ilo = !native_ordering; | 511 | | #else | 512 | | int ihi = !native_ordering, ilo = !!native_ordering; | 513 | | #endif | 514 | 11.4k | --e; | 515 | | | 516 | 114k | while (q < e) { | 517 | 111k | Py_UCS4 ch2; | 518 | | /* First check for possible aligned read of a C 'long'. Unaligned | 519 | | reads are more expensive, better to defer to another iteration. */ | 520 | 111k | if (_Py_IS_ALIGNED(q, ALIGNOF_LONG)) { | 521 | | /* Fast path for runs of in-range non-surrogate chars. */ | 522 | 25.9k | const unsigned char *_q = q; | 523 | 489k | while (_q + SIZEOF_LONG <= e) { | 524 | 485k | unsigned long block = * (const unsigned long *) _q; | 525 | 485k | if (native_ordering) { | 526 | | /* Can use buffer directly */ | 527 | 478k | if (block & FAST_CHAR_MASK) | 528 | 18.8k | break; | 529 | 478k | } | 530 | 6.52k | else { | 531 | | /* Need to byte-swap */ | 532 | 6.52k | if (block & SWAB(FAST_CHAR_MASK)) | 533 | 3.27k | break; | 534 | | #if STRINGLIB_SIZEOF_CHAR == 1 | 535 | | block >>= 8; | 536 | | #else | 537 | 3.24k | block = SWAB(block); | 538 | 3.24k | #endif | 539 | 3.24k | } | 540 | 463k | #if PY_LITTLE_ENDIAN | 541 | | # if SIZEOF_LONG == 4 | 542 | | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 543 | | p[1] = (STRINGLIB_CHAR)(block >> 16); | 544 | | # elif SIZEOF_LONG == 8 | 545 | 463k | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 546 | 463k | p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); | 547 | 463k | p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); | 548 | 463k | p[3] = (STRINGLIB_CHAR)(block >> 48); | 549 | 463k | # endif | 550 | | #else | 551 | | # if SIZEOF_LONG == 4 | 552 | | p[0] = 
(STRINGLIB_CHAR)(block >> 16); | 553 | | p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 554 | | # elif SIZEOF_LONG == 8 | 555 | | p[0] = (STRINGLIB_CHAR)(block >> 48); | 556 | | p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); | 557 | | p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); | 558 | | p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 559 | | # endif | 560 | | #endif | 561 | 463k | _q += SIZEOF_LONG; | 562 | 463k | p += SIZEOF_LONG / 2; | 563 | 463k | } | 564 | 25.9k | q = _q; | 565 | 25.9k | if (q >= e) | 566 | 389 | break; | 567 | 25.9k | } | 568 | | | 569 | 110k | ch = (q[ihi] << 8) | q[ilo]; | 570 | 110k | q += 2; | 571 | 110k | if (!Py_UNICODE_IS_SURROGATE(ch)) { | 572 | | #if STRINGLIB_SIZEOF_CHAR < 2 | 573 | | if (ch > STRINGLIB_MAX_CHAR) | 574 | | /* Out-of-range */ | 575 | | goto Return; | 576 | | #endif | 577 | 103k | *p++ = (STRINGLIB_CHAR)ch; | 578 | 103k | continue; | 579 | 103k | } | 580 | | | 581 | | /* UTF-16 code pair: */ | 582 | 7.53k | if (!Py_UNICODE_IS_HIGH_SURROGATE(ch)) | 583 | 2.23k | goto IllegalEncoding; | 584 | 5.30k | if (q >= e) | 585 | 493 | goto UnexpectedEnd; | 586 | 4.81k | ch2 = (q[ihi] << 8) | q[ilo]; | 587 | 4.81k | q += 2; | 588 | 4.81k | if (!Py_UNICODE_IS_LOW_SURROGATE(ch2)) | 589 | 2.14k | goto IllegalSurrogate; | 590 | 2.66k | ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2); | 591 | 2.66k | #if STRINGLIB_SIZEOF_CHAR < 4 | 592 | | /* Out-of-range */ | 593 | 2.66k | goto Return; | 594 | | #else | 595 | | *p++ = (STRINGLIB_CHAR)ch; | 596 | | #endif | 597 | 4.81k | } | 598 | 3.94k | ch = 0; | 599 | 11.4k | Return: | 600 | 11.4k | *inptr = q; | 601 | 11.4k | *outpos = p - dest; | 602 | 11.4k | return ch; | 603 | 493 | UnexpectedEnd: | 604 | 493 | ch = 1; | 605 | 493 | goto Return; | 606 | 2.23k | IllegalEncoding: | 607 | 2.23k | ch = 2; | 608 | 2.23k | goto Return; | 609 | 2.14k | IllegalSurrogate: | 610 | 2.14k | ch = 3; | 611 | 2.14k | goto Return; | 612 | 3.94k | } |
unicodeobject.c:ucs4lib_utf16_decode Line | Count | Source | 504 | 16.1k | { | 505 | 16.1k | Py_UCS4 ch; | 506 | 16.1k | const unsigned char *q = *inptr; | 507 | 16.1k | STRINGLIB_CHAR *p = dest + *outpos; | 508 | | /* Offsets from q for retrieving byte pairs in the right order. */ | 509 | 16.1k | #if PY_LITTLE_ENDIAN | 510 | 16.1k | int ihi = !!native_ordering, ilo = !native_ordering; | 511 | | #else | 512 | | int ihi = !native_ordering, ilo = !!native_ordering; | 513 | | #endif | 514 | 16.1k | --e; | 515 | | | 516 | 58.6k | while (q < e) { | 517 | 57.8k | Py_UCS4 ch2; | 518 | | /* First check for possible aligned read of a C 'long'. Unaligned | 519 | | reads are more expensive, better to defer to another iteration. */ | 520 | 57.8k | if (_Py_IS_ALIGNED(q, ALIGNOF_LONG)) { | 521 | | /* Fast path for runs of in-range non-surrogate chars. */ | 522 | 14.7k | const unsigned char *_q = q; | 523 | 18.4k | while (_q + SIZEOF_LONG <= e) { | 524 | 16.5k | unsigned long block = * (const unsigned long *) _q; | 525 | 16.5k | if (native_ordering) { | 526 | | /* Can use buffer directly */ | 527 | 14.3k | if (block & FAST_CHAR_MASK) | 528 | 11.6k | break; | 529 | 14.3k | } | 530 | 2.14k | else { | 531 | | /* Need to byte-swap */ | 532 | 2.14k | if (block & SWAB(FAST_CHAR_MASK)) | 533 | 1.12k | break; | 534 | | #if STRINGLIB_SIZEOF_CHAR == 1 | 535 | | block >>= 8; | 536 | | #else | 537 | 1.01k | block = SWAB(block); | 538 | 1.01k | #endif | 539 | 1.01k | } | 540 | 3.79k | #if PY_LITTLE_ENDIAN | 541 | | # if SIZEOF_LONG == 4 | 542 | | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 543 | | p[1] = (STRINGLIB_CHAR)(block >> 16); | 544 | | # elif SIZEOF_LONG == 8 | 545 | 3.79k | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 546 | 3.79k | p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); | 547 | 3.79k | p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); | 548 | 3.79k | p[3] = (STRINGLIB_CHAR)(block >> 48); | 549 | 3.79k | # endif | 550 | | #else | 551 | | # if SIZEOF_LONG == 4 | 552 | | p[0] 
= (STRINGLIB_CHAR)(block >> 16); | 553 | | p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 554 | | # elif SIZEOF_LONG == 8 | 555 | | p[0] = (STRINGLIB_CHAR)(block >> 48); | 556 | | p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); | 557 | | p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); | 558 | | p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 559 | | # endif | 560 | | #endif | 561 | 3.79k | _q += SIZEOF_LONG; | 562 | 3.79k | p += SIZEOF_LONG / 2; | 563 | 3.79k | } | 564 | 14.7k | q = _q; | 565 | 14.7k | if (q >= e) | 566 | 130 | break; | 567 | 14.7k | } | 568 | | | 569 | 57.7k | ch = (q[ihi] << 8) | q[ilo]; | 570 | 57.7k | q += 2; | 571 | 57.7k | if (!Py_UNICODE_IS_SURROGATE(ch)) { | 572 | | #if STRINGLIB_SIZEOF_CHAR < 2 | 573 | | if (ch > STRINGLIB_MAX_CHAR) | 574 | | /* Out-of-range */ | 575 | | goto Return; | 576 | | #endif | 577 | 41.7k | *p++ = (STRINGLIB_CHAR)ch; | 578 | 41.7k | continue; | 579 | 41.7k | } | 580 | | | 581 | | /* UTF-16 code pair: */ | 582 | 15.9k | if (!Py_UNICODE_IS_HIGH_SURROGATE(ch)) | 583 | 10.7k | goto IllegalEncoding; | 584 | 5.17k | if (q >= e) | 585 | 980 | goto UnexpectedEnd; | 586 | 4.19k | ch2 = (q[ihi] << 8) | q[ilo]; | 587 | 4.19k | q += 2; | 588 | 4.19k | if (!Py_UNICODE_IS_LOW_SURROGATE(ch2)) | 589 | 3.53k | goto IllegalSurrogate; | 590 | 663 | ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2); | 591 | | #if STRINGLIB_SIZEOF_CHAR < 4 | 592 | | /* Out-of-range */ | 593 | | goto Return; | 594 | | #else | 595 | 663 | *p++ = (STRINGLIB_CHAR)ch; | 596 | 663 | #endif | 597 | 663 | } | 598 | 882 | ch = 0; | 599 | 16.1k | Return: | 600 | 16.1k | *inptr = q; | 601 | 16.1k | *outpos = p - dest; | 602 | 16.1k | return ch; | 603 | 980 | UnexpectedEnd: | 604 | 980 | ch = 1; | 605 | 980 | goto Return; | 606 | 10.7k | IllegalEncoding: | 607 | 10.7k | ch = 2; | 608 | 10.7k | goto Return; | 609 | 3.53k | IllegalSurrogate: | 610 | 3.53k | ch = 3; | 611 | 3.53k | goto Return; | 612 | 882 | } |
|
613 | | #undef UCS2_REPEAT_MASK /* drop the decoder-only helper macros; FAST_CHAR_MASK depends on STRINGLIB_SIZEOF_CHAR, so presumably this keeps one instantiation of the header from leaking its definition into the next */ |
614 | | #undef FAST_CHAR_MASK |
615 | | #undef STRIPPED_MASK |
616 | | #undef SWAB |
617 | | |
618 | | /* Encode len characters from in as 16-bit units written through *outptr, byte-swapping every unit when native_ordering is zero. *outptr is advanced past the units written; no bounds check on the output buffer is made here. Returns len on success. With STRINGLIB_SIZEOF_CHAR == 1 the function cannot fail. Otherwise a surrogate character (U+D800-U+DFFF) stops the encoding and the index of that character is returned instead, and (when STRINGLIB_MAX_CHAR >= 0x10000) characters >= 0x10000 are written as a high/low surrogate pair. */ |
619 | | #if STRINGLIB_MAX_CHAR >= 0x80 |
620 | | Py_LOCAL_INLINE(Py_ssize_t) |
621 | | STRINGLIB(utf16_encode)(const STRINGLIB_CHAR *in, |
622 | | Py_ssize_t len, |
623 | | unsigned short **outptr, |
624 | | int native_ordering) |
625 | 0 | { |
626 | 0 | unsigned short *out = *outptr; |
627 | 0 | const STRINGLIB_CHAR *end = in + len; |
628 | | #if STRINGLIB_SIZEOF_CHAR == 1 |
629 | 0 | if (native_ordering) { |
630 | 0 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); |
631 | 0 | while (in < unrolled_end) { |
632 | 0 | out[0] = in[0]; |
633 | 0 | out[1] = in[1]; |
634 | 0 | out[2] = in[2]; |
635 | 0 | out[3] = in[3]; |
636 | 0 | in += 4; out += 4; |
637 | 0 | } |
638 | 0 | while (in < end) { |
639 | 0 | *out++ = *in++; |
640 | 0 | } |
641 | 0 | } else { |
642 | 0 | # define SWAB2(CH) ((CH) << 8) /* high byte is zero, so moving the low byte up is the whole swap */ |
643 | 0 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); |
644 | 0 | while (in < unrolled_end) { |
645 | 0 | out[0] = SWAB2(in[0]); |
646 | 0 | out[1] = SWAB2(in[1]); |
647 | 0 | out[2] = SWAB2(in[2]); |
648 | 0 | out[3] = SWAB2(in[3]); |
649 | 0 | in += 4; out += 4; |
650 | 0 | } |
651 | 0 | while (in < end) { |
652 | 0 | Py_UCS4 ch = *in++; |
653 | 0 | *out++ = SWAB2((Py_UCS2)ch); |
654 | 0 | } |
655 | 0 | #undef SWAB2 |
656 | 0 | } |
657 | | *outptr = out; |
658 | | return len; |
659 | | #else |
660 | 0 | if (native_ordering) { |
661 | | #if STRINGLIB_MAX_CHAR < 0x10000 |
662 | 0 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); |
663 | 0 | while (in < unrolled_end) { |
664 | | /* leave the unrolled loop as soon as the group may contain a surrogate: a surrogate always makes the AND below zero, but a zero result does not prove one is present, so the loop after this one rechecks each character */ |
665 | 0 | if (((in[0] ^ 0xd800) & |
666 | 0 | (in[1] ^ 0xd800) & |
667 | 0 | (in[2] ^ 0xd800) & |
668 | 0 | (in[3] ^ 0xd800) & 0xf800) == 0) |
669 | 0 | break; |
670 | 0 | out[0] = in[0]; |
671 | 0 | out[1] = in[1]; |
672 | 0 | out[2] = in[2]; |
673 | 0 | out[3] = in[3]; |
674 | 0 | in += 4; out += 4; |
675 | 0 | } |
676 | | #endif |
677 | 0 | while (in < end) { |
678 | 0 | Py_UCS4 ch; |
679 | 0 | ch = *in++; |
680 | 0 | if (ch < 0xd800) |
681 | 0 | *out++ = ch; |
682 | 0 | else if (ch < 0xe000) |
683 | | /* reject surrogate characters (U+D800-U+DFFF) */ |
684 | 0 | goto fail; |
685 | | #if STRINGLIB_MAX_CHAR >= 0x10000 |
686 | 0 | else if (ch >= 0x10000) { |
687 | 0 | out[0] = Py_UNICODE_HIGH_SURROGATE(ch); |
688 | 0 | out[1] = Py_UNICODE_LOW_SURROGATE(ch); |
689 | 0 | out += 2; |
690 | 0 | } |
691 | 0 | #endif |
692 | 0 | else |
693 | 0 | *out++ = ch; |
694 | 0 | } |
695 | 0 | } else { |
696 | 0 | #define SWAB2(CH) (((CH) << 8) | ((CH) >> 8)) /* the excess high bits are cut off when the value is stored into an unsigned short */ |
697 | | #if STRINGLIB_MAX_CHAR < 0x10000 |
698 | 0 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); |
699 | 0 | while (in < unrolled_end) { |
700 | | /* leave the unrolled loop as soon as the group may contain a surrogate: a surrogate always makes the AND below zero, but a zero result does not prove one is present, so the loop after this one rechecks each character */ |
701 | 0 | if (((in[0] ^ 0xd800) & |
702 | 0 | (in[1] ^ 0xd800) & |
703 | 0 | (in[2] ^ 0xd800) & |
704 | 0 | (in[3] ^ 0xd800) & 0xf800) == 0) |
705 | 0 | break; |
706 | 0 | out[0] = SWAB2(in[0]); |
707 | 0 | out[1] = SWAB2(in[1]); |
708 | 0 | out[2] = SWAB2(in[2]); |
709 | 0 | out[3] = SWAB2(in[3]); |
710 | 0 | in += 4; out += 4; |
711 | 0 | } |
712 | | #endif |
713 | 0 | while (in < end) { |
714 | 0 | Py_UCS4 ch = *in++; |
715 | 0 | if (ch < 0xd800) |
716 | 0 | *out++ = SWAB2((Py_UCS2)ch); |
717 | 0 | else if (ch < 0xe000) |
718 | | /* reject surrogate characters (U+D800-U+DFFF) */ |
719 | 0 | goto fail; |
720 | | #if STRINGLIB_MAX_CHAR >= 0x10000 |
721 | 0 | else if (ch >= 0x10000) { |
722 | 0 | Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch); |
723 | 0 | Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch); |
724 | 0 | out[0] = SWAB2(ch1); |
725 | 0 | out[1] = SWAB2(ch2); |
726 | 0 | out += 2; |
727 | 0 | } |
728 | 0 | #endif |
729 | 0 | else |
730 | 0 | *out++ = SWAB2((Py_UCS2)ch); |
731 | 0 | } |
732 | 0 | #undef SWAB2 |
733 | 0 | } |
734 | 0 | *outptr = out; |
735 | 0 | return len; |
736 | 0 | fail: |
737 | 0 | *outptr = out; |
738 | 0 | return len - (end - in + 1); /* index of the rejected surrogate: in was already advanced past it */ |
739 | | #endif |
740 | 0 | } Unexecuted instantiation: unicodeobject.c:ucs1lib_utf16_encode Unexecuted instantiation: unicodeobject.c:ucs2lib_utf16_encode Unexecuted instantiation: unicodeobject.c:ucs4lib_utf16_encode |
741 | | |
742 | | static inline uint32_t |
743 | | STRINGLIB(SWAB4)(STRINGLIB_CHAR ch) |
744 | 0 | { |
745 | 0 | uint32_t word = ch; |
746 | | #if STRINGLIB_SIZEOF_CHAR == 1 |
747 | | /* high bytes are zero */ |
748 | | return (word << 24); |
749 | | #elif STRINGLIB_SIZEOF_CHAR == 2 |
750 | | /* high bytes are zero */ |
751 | | return ((word & 0x00FFu) << 24) | ((word & 0xFF00u) << 8); |
752 | | #else |
753 | | return _Py_bswap32(word); |
754 | | #endif |
755 | 0 | } Unexecuted instantiation: unicodeobject.c:ucs1lib_SWAB4 Unexecuted instantiation: unicodeobject.c:ucs2lib_SWAB4 Unexecuted instantiation: unicodeobject.c:ucs4lib_SWAB4 |
756 | | |
757 | | Py_LOCAL_INLINE(Py_ssize_t) |
758 | | STRINGLIB(utf32_encode)(const STRINGLIB_CHAR *in, |
759 | | Py_ssize_t len, |
760 | | uint32_t **outptr, |
761 | | int native_ordering) |
762 | 0 | { |
763 | 0 | uint32_t *out = *outptr; |
764 | 0 | const STRINGLIB_CHAR *end = in + len; |
765 | 0 | if (native_ordering) { |
766 | 0 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); |
767 | 0 | while (in < unrolled_end) { |
768 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
769 | | /* check if any character is a surrogate character */ |
770 | 0 | if (((in[0] ^ 0xd800) & |
771 | 0 | (in[1] ^ 0xd800) & |
772 | 0 | (in[2] ^ 0xd800) & |
773 | 0 | (in[3] ^ 0xd800) & 0xf800) == 0) |
774 | 0 | break; |
775 | 0 | #endif |
776 | 0 | out[0] = in[0]; |
777 | 0 | out[1] = in[1]; |
778 | 0 | out[2] = in[2]; |
779 | 0 | out[3] = in[3]; |
780 | 0 | in += 4; out += 4; |
781 | 0 | } |
782 | 0 | while (in < end) { |
783 | 0 | Py_UCS4 ch; |
784 | 0 | ch = *in++; |
785 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
786 | 0 | if (Py_UNICODE_IS_SURROGATE(ch)) { |
787 | | /* reject surrogate characters (U+D800-U+DFFF) */ |
788 | 0 | goto fail; |
789 | 0 | } |
790 | 0 | #endif |
791 | 0 | *out++ = ch; |
792 | 0 | } |
793 | 0 | } else { |
794 | 0 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); |
795 | 0 | while (in < unrolled_end) { |
796 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
797 | | /* check if any character is a surrogate character */ |
798 | 0 | if (((in[0] ^ 0xd800) & |
799 | 0 | (in[1] ^ 0xd800) & |
800 | 0 | (in[2] ^ 0xd800) & |
801 | 0 | (in[3] ^ 0xd800) & 0xf800) == 0) |
802 | 0 | break; |
803 | 0 | #endif |
804 | 0 | out[0] = STRINGLIB(SWAB4)(in[0]); |
805 | 0 | out[1] = STRINGLIB(SWAB4)(in[1]); |
806 | 0 | out[2] = STRINGLIB(SWAB4)(in[2]); |
807 | 0 | out[3] = STRINGLIB(SWAB4)(in[3]); |
808 | 0 | in += 4; out += 4; |
809 | 0 | } |
810 | 0 | while (in < end) { |
811 | 0 | Py_UCS4 ch = *in++; |
812 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
813 | 0 | if (Py_UNICODE_IS_SURROGATE(ch)) { |
814 | | /* reject surrogate characters (U+D800-U+DFFF) */ |
815 | 0 | goto fail; |
816 | 0 | } |
817 | 0 | #endif |
818 | 0 | *out++ = STRINGLIB(SWAB4)(ch); |
819 | 0 | } |
820 | 0 | } |
821 | 0 | *outptr = out; |
822 | 0 | return len; |
823 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
824 | 0 | fail: |
825 | 0 | *outptr = out; |
826 | 0 | return len - (end - in + 1); |
827 | | #endif |
828 | 0 | } Unexecuted instantiation: unicodeobject.c:ucs1lib_utf32_encode Unexecuted instantiation: unicodeobject.c:ucs2lib_utf32_encode Unexecuted instantiation: unicodeobject.c:ucs4lib_utf32_encode |
829 | | |
830 | | #endif |