/src/cpython/Objects/stringlib/codecs.h
Line | Count | Source |
1 | | /* stringlib: codec implementations */ |
2 | | |
3 | | #if !STRINGLIB_IS_UNICODE |
4 | | # error "codecs.h is specific to Unicode" |
5 | | #endif |
6 | | |
7 | | #include "pycore_bitutils.h" // _Py_bswap32() |
8 | | |
9 | | /* Mask to quickly check whether a C 'size_t' contains a |
10 | | non-ASCII, UTF8-encoded char. */ |
11 | | #if (SIZEOF_SIZE_T == 8) |
12 | 372M | # define ASCII_CHAR_MASK 0x8080808080808080ULL |
13 | | #elif (SIZEOF_SIZE_T == 4) |
14 | | # define ASCII_CHAR_MASK 0x80808080U |
15 | | #else |
16 | | # error C 'size_t' size should be either 4 or 8! |
17 | | #endif |
18 | | |
19 | | /* 10xxxxxx */ |
20 | 189M | #define IS_CONTINUATION_BYTE(ch) ((ch) >= 0x80 && (ch) < 0xC0) |
21 | | |
22 | | Py_LOCAL_INLINE(Py_UCS4) |
23 | | STRINGLIB(utf8_decode)(const char **inptr, const char *end, |
24 | | STRINGLIB_CHAR *dest, |
25 | | Py_ssize_t *outpos) |
26 | 193M | { |
27 | 193M | Py_UCS4 ch; |
28 | 193M | const char *s = *inptr; |
29 | 193M | STRINGLIB_CHAR *p = dest + *outpos; |
30 | | |
31 | 452M | while (s < end) { |
32 | 451M | ch = (unsigned char)*s; |
33 | | |
34 | 451M | if (ch < 0x80) { |
35 | | /* Fast path for runs of ASCII characters. Given that common UTF-8 |
36 | | input will consist of an overwhelming majority of ASCII |
37 | | characters, we try to optimize for this case by checking |
38 | | as many characters as a C 'size_t' can contain. |
39 | | First, check if we can do an aligned read, as most CPUs have |
40 | | a penalty for unaligned reads. |
41 | | */ |
42 | 181M | if (_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) { |
43 | | /* Help register allocation */ |
44 | 23.7M | const char *_s = s; |
45 | 23.7M | STRINGLIB_CHAR *_p = p; |
46 | 372M | while (_s + SIZEOF_SIZE_T <= end) { |
47 | | /* Read a whole size_t at a time (either 4 or 8 bytes), |
48 | | and do a fast unrolled copy if it only contains ASCII |
49 | | characters. */ |
50 | 372M | size_t value = *(const size_t *) _s; |
51 | 372M | if (value & ASCII_CHAR_MASK) |
52 | 23.3M | break; |
53 | 348M | #if PY_LITTLE_ENDIAN |
54 | 348M | _p[0] = (STRINGLIB_CHAR)(value & 0xFFu); |
55 | 348M | _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); |
56 | 348M | _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); |
57 | 348M | _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); |
58 | 348M | # if SIZEOF_SIZE_T == 8 |
59 | 348M | _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); |
60 | 348M | _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); |
61 | 348M | _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); |
62 | 348M | _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); |
63 | 348M | # endif |
64 | | #else |
65 | | # if SIZEOF_SIZE_T == 8 |
66 | | _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); |
67 | | _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); |
68 | | _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); |
69 | | _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); |
70 | | _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); |
71 | | _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); |
72 | | _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); |
73 | | _p[7] = (STRINGLIB_CHAR)(value & 0xFFu); |
74 | | # else |
75 | | _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); |
76 | | _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); |
77 | | _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); |
78 | | _p[3] = (STRINGLIB_CHAR)(value & 0xFFu); |
79 | | # endif |
80 | | #endif |
81 | 348M | _s += SIZEOF_SIZE_T; |
82 | 348M | _p += SIZEOF_SIZE_T; |
83 | 348M | } |
84 | 23.7M | s = _s; |
85 | 23.7M | p = _p; |
86 | 23.7M | if (s == end) |
87 | 69.2k | break; |
88 | 23.6M | ch = (unsigned char)*s; |
89 | 23.6M | } |
90 | 181M | if (ch < 0x80) { |
91 | 180M | s++; |
92 | 180M | *p++ = ch; |
93 | 180M | continue; |
94 | 180M | } |
95 | 181M | } |
96 | | |
97 | 270M | if (ch < 0xE0) { |
98 | | /* \xC2\x80-\xDF\xBF -- 0080-07FF */ |
99 | 105M | Py_UCS4 ch2; |
100 | 105M | if (ch < 0xC2) { |
101 | | /* invalid sequence |
102 | | \x80-\xBF -- continuation byte |
103 | | \xC0-\xC1 -- fake 0000-007F */ |
104 | 59.8M | goto InvalidStart; |
105 | 59.8M | } |
106 | 45.5M | if (end - s < 2) { |
107 | | /* unexpected end of data: the caller will decide whether |
108 | | it's an error or not */ |
109 | 11.9k | break; |
110 | 11.9k | } |
111 | 45.5M | ch2 = (unsigned char)s[1]; |
112 | 45.5M | if (!IS_CONTINUATION_BYTE(ch2)) |
113 | | /* invalid continuation byte */ |
114 | 25.6M | goto InvalidContinuation1; |
115 | 19.9M | ch = (ch << 6) + ch2 - |
116 | 19.9M | ((0xC0 << 6) + 0x80); |
117 | 19.9M | assert ((ch > 0x007F) && (ch <= 0x07FF)); |
118 | 19.9M | s += 2; |
119 | 19.9M | if (STRINGLIB_MAX_CHAR <= 0x007F || |
120 | 3.87M | (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR)) |
121 | | /* Out-of-range */ |
122 | 160k | goto Return; |
123 | 19.7M | *p++ = ch; |
124 | 19.7M | continue; |
125 | 19.9M | } |
126 | | |
127 | 165M | if (ch < 0xF0) { |
128 | | /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */ |
129 | 70.2M | Py_UCS4 ch2, ch3; |
130 | 70.2M | if (end - s < 3) { |
131 | | /* unexpected end of data: the caller will decide whether |
132 | | it's an error or not */ |
133 | 13.8k | if (end - s < 2) |
134 | 4.57k | break; |
135 | 9.32k | ch2 = (unsigned char)s[1]; |
136 | 9.32k | if (!IS_CONTINUATION_BYTE(ch2) || |
137 | 4.88k | (ch2 < 0xA0 ? ch == 0xE0 : ch == 0xED)) |
138 | | /* for clarification see comments below */ |
139 | 6.87k | goto InvalidContinuation1; |
140 | 2.44k | break; |
141 | 9.32k | } |
142 | 70.2M | ch2 = (unsigned char)s[1]; |
143 | 70.2M | ch3 = (unsigned char)s[2]; |
144 | 70.2M | if (!IS_CONTINUATION_BYTE(ch2)) { |
145 | | /* invalid continuation byte */ |
146 | 12.1M | goto InvalidContinuation1; |
147 | 12.1M | } |
148 | 58.0M | if (ch == 0xE0) { |
149 | 438k | if (ch2 < 0xA0) |
150 | | /* invalid sequence |
151 | | \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */ |
152 | 71.3k | goto InvalidContinuation1; |
153 | 57.6M | } else if (ch == 0xED && ch2 >= 0xA0) { |
154 | | /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF |
155 | | will result in surrogates in range D800-DFFF. Surrogates are |
156 | | not valid UTF-8 so they are rejected. |
157 | | See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf |
158 | | (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ |
159 | 115k | goto InvalidContinuation1; |
160 | 115k | } |
161 | 57.8M | if (!IS_CONTINUATION_BYTE(ch3)) { |
162 | | /* invalid continuation byte */ |
163 | 1.46M | goto InvalidContinuation2; |
164 | 1.46M | } |
165 | 56.4M | ch = (ch << 12) + (ch2 << 6) + ch3 - |
166 | 56.4M | ((0xE0 << 12) + (0x80 << 6) + 0x80); |
167 | 56.4M | assert ((ch > 0x07FF) && (ch <= 0xFFFF)); |
168 | 56.4M | s += 3; |
169 | 56.4M | if (STRINGLIB_MAX_CHAR <= 0x07FF || |
170 | 0 | (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR)) |
171 | | /* Out-of-range */ |
172 | 246k | goto Return; |
173 | 56.1M | *p++ = ch; |
174 | 56.1M | continue; |
175 | 56.4M | } |
176 | | |
177 | 95.2M | if (ch < 0xF5) { |
178 | | /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */ |
179 | 10.1M | Py_UCS4 ch2, ch3, ch4; |
180 | 10.1M | if (end - s < 4) { |
181 | | /* unexpected end of data: the caller will decide whether |
182 | | it's an error or not */ |
183 | 20.6k | if (end - s < 2) |
184 | 4.86k | break; |
185 | 15.7k | ch2 = (unsigned char)s[1]; |
186 | 15.7k | if (!IS_CONTINUATION_BYTE(ch2) || |
187 | 9.04k | (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4)) |
188 | | /* for clarification see comments below */ |
189 | 9.54k | goto InvalidContinuation1; |
190 | 6.20k | if (end - s < 3) |
191 | 1.76k | break; |
192 | 4.43k | ch3 = (unsigned char)s[2]; |
193 | 4.43k | if (!IS_CONTINUATION_BYTE(ch3)) |
194 | 2.08k | goto InvalidContinuation2; |
195 | 2.35k | break; |
196 | 4.43k | } |
197 | 10.1M | ch2 = (unsigned char)s[1]; |
198 | 10.1M | ch3 = (unsigned char)s[2]; |
199 | 10.1M | ch4 = (unsigned char)s[3]; |
200 | 10.1M | if (!IS_CONTINUATION_BYTE(ch2)) { |
201 | | /* invalid continuation byte */ |
202 | 7.10M | goto InvalidContinuation1; |
203 | 7.10M | } |
204 | 3.06M | if (ch == 0xF0) { |
205 | 465k | if (ch2 < 0x90) |
206 | | /* invalid sequence |
207 | | \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */ |
208 | 47.5k | goto InvalidContinuation1; |
209 | 2.60M | } else if (ch == 0xF4 && ch2 >= 0x90) { |
210 | | /* invalid sequence |
211 | | \xF4\x90\x80\x80- -- 110000- overflow */ |
212 | 118k | goto InvalidContinuation1; |
213 | 118k | } |
214 | 2.89M | if (!IS_CONTINUATION_BYTE(ch3)) { |
215 | | /* invalid continuation byte */ |
216 | 508k | goto InvalidContinuation2; |
217 | 508k | } |
218 | 2.39M | if (!IS_CONTINUATION_BYTE(ch4)) { |
219 | | /* invalid continuation byte */ |
220 | 275k | goto InvalidContinuation3; |
221 | 275k | } |
222 | 2.11M | ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 - |
223 | 2.11M | ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80); |
224 | 2.11M | assert ((ch > 0xFFFF) && (ch <= 0x10FFFF)); |
225 | 2.11M | s += 4; |
226 | 2.11M | if (STRINGLIB_MAX_CHAR <= 0xFFFF || |
227 | 0 | (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR)) |
228 | | /* Out-of-range */ |
229 | 80.6k | goto Return; |
230 | 2.03M | *p++ = ch; |
231 | 2.03M | continue; |
232 | 2.11M | } |
233 | 85.0M | goto InvalidStart; |
234 | 95.2M | } |
235 | 750k | ch = 0; |
236 | 193M | Return: |
237 | 193M | *inptr = s; |
238 | 193M | *outpos = p - dest; |
239 | 193M | return ch; |
240 | 144M | InvalidStart: |
241 | 144M | ch = 1; |
242 | 144M | goto Return; |
243 | 45.2M | InvalidContinuation1: |
244 | 45.2M | ch = 2; |
245 | 45.2M | goto Return; |
246 | 1.97M | InvalidContinuation2: |
247 | 1.97M | ch = 3; |
248 | 1.97M | goto Return; |
249 | 275k | InvalidContinuation3: |
250 | 275k | ch = 4; |
251 | 275k | goto Return; |
252 | 750k | } unicodeobject.c:asciilib_utf8_decode Line | Count | Source | 26 | 535k | { | 27 | 535k | Py_UCS4 ch; | 28 | 535k | const char *s = *inptr; | 29 | 535k | STRINGLIB_CHAR *p = dest + *outpos; | 30 | | | 31 | 535k | while (s < end) { | 32 | 535k | ch = (unsigned char)*s; | 33 | | | 34 | 535k | if (ch < 0x80) { | 35 | | /* Fast path for runs of ASCII characters. Given that common UTF-8 | 36 | | input will consist of an overwhelming majority of ASCII | 37 | | characters, we try to optimize for this case by checking | 38 | | as many characters as a C 'size_t' can contain. | 39 | | First, check if we can do an aligned read, as most CPUs have | 40 | | a penalty for unaligned reads. | 41 | | */ | 42 | 0 | if (_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) { | 43 | | /* Help register allocation */ | 44 | 0 | const char *_s = s; | 45 | 0 | STRINGLIB_CHAR *_p = p; | 46 | 0 | while (_s + SIZEOF_SIZE_T <= end) { | 47 | | /* Read a whole size_t at a time (either 4 or 8 bytes), | 48 | | and do a fast unrolled copy if it only contains ASCII | 49 | | characters. 
*/ | 50 | 0 | size_t value = *(const size_t *) _s; | 51 | 0 | if (value & ASCII_CHAR_MASK) | 52 | 0 | break; | 53 | 0 | #if PY_LITTLE_ENDIAN | 54 | 0 | _p[0] = (STRINGLIB_CHAR)(value & 0xFFu); | 55 | 0 | _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 56 | 0 | _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 57 | 0 | _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 58 | 0 | # if SIZEOF_SIZE_T == 8 | 59 | 0 | _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 60 | 0 | _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 61 | 0 | _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 62 | 0 | _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 63 | 0 | # endif | 64 | | #else | 65 | | # if SIZEOF_SIZE_T == 8 | 66 | | _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 67 | | _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 68 | | _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 69 | | _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 70 | | _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 71 | | _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 72 | | _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 73 | | _p[7] = (STRINGLIB_CHAR)(value & 0xFFu); | 74 | | # else | 75 | | _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 76 | | _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 77 | | _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 78 | | _p[3] = (STRINGLIB_CHAR)(value & 0xFFu); | 79 | | # endif | 80 | | #endif | 81 | 0 | _s += SIZEOF_SIZE_T; | 82 | 0 | _p += SIZEOF_SIZE_T; | 83 | 0 | } | 84 | 0 | s = _s; | 85 | 0 | p = _p; | 86 | 0 | if (s == end) | 87 | 0 | break; | 88 | 0 | ch = (unsigned char)*s; | 89 | 0 | } | 90 | 0 | if (ch < 0x80) { | 91 | 0 | s++; | 92 | 0 | *p++ = ch; | 93 | 0 | continue; | 94 | 0 | } | 95 | 0 | } | 96 | | | 97 | 535k | if (ch < 0xE0) { | 98 | | /* \xC2\x80-\xDF\xBF -- 0080-07FF */ | 99 | 187k | Py_UCS4 ch2; | 100 | 187k | if (ch < 0xC2) { | 101 | | /* invalid sequence | 102 | | \x80-\xBF -- continuation byte | 103 | | \xC0-\xC1 -- 
fake 0000-007F */ | 104 | 19.7k | goto InvalidStart; | 105 | 19.7k | } | 106 | 167k | if (end - s < 2) { | 107 | | /* unexpected end of data: the caller will decide whether | 108 | | it's an error or not */ | 109 | 1.61k | break; | 110 | 1.61k | } | 111 | 165k | ch2 = (unsigned char)s[1]; | 112 | 165k | if (!IS_CONTINUATION_BYTE(ch2)) | 113 | | /* invalid continuation byte */ | 114 | 7.50k | goto InvalidContinuation1; | 115 | 158k | ch = (ch << 6) + ch2 - | 116 | 158k | ((0xC0 << 6) + 0x80); | 117 | 158k | assert ((ch > 0x007F) && (ch <= 0x07FF)); | 118 | 158k | s += 2; | 119 | 158k | if (STRINGLIB_MAX_CHAR <= 0x007F || | 120 | 0 | (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR)) | 121 | | /* Out-of-range */ | 122 | 158k | goto Return; | 123 | 0 | *p++ = ch; | 124 | 0 | continue; | 125 | 158k | } | 126 | | | 127 | 348k | if (ch < 0xF0) { | 128 | | /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */ | 129 | 234k | Py_UCS4 ch2, ch3; | 130 | 234k | if (end - s < 3) { | 131 | | /* unexpected end of data: the caller will decide whether | 132 | | it's an error or not */ | 133 | 2.88k | if (end - s < 2) | 134 | 688 | break; | 135 | 2.19k | ch2 = (unsigned char)s[1]; | 136 | 2.19k | if (!IS_CONTINUATION_BYTE(ch2) || | 137 | 976 | (ch2 < 0xA0 ? 
ch == 0xE0 : ch == 0xED)) | 138 | | /* for clarification see comments below */ | 139 | 1.70k | goto InvalidContinuation1; | 140 | 490 | break; | 141 | 2.19k | } | 142 | 231k | ch2 = (unsigned char)s[1]; | 143 | 231k | ch3 = (unsigned char)s[2]; | 144 | 231k | if (!IS_CONTINUATION_BYTE(ch2)) { | 145 | | /* invalid continuation byte */ | 146 | 4.56k | goto InvalidContinuation1; | 147 | 4.56k | } | 148 | 227k | if (ch == 0xE0) { | 149 | 5.73k | if (ch2 < 0xA0) | 150 | | /* invalid sequence | 151 | | \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */ | 152 | 210 | goto InvalidContinuation1; | 153 | 221k | } else if (ch == 0xED && ch2 >= 0xA0) { | 154 | | /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF | 155 | | will result in surrogates in range D800-DFFF. Surrogates are | 156 | | not valid UTF-8 so they are rejected. | 157 | | See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf | 158 | | (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ | 159 | 634 | goto InvalidContinuation1; | 160 | 634 | } | 161 | 226k | if (!IS_CONTINUATION_BYTE(ch3)) { | 162 | | /* invalid continuation byte */ | 163 | 4.79k | goto InvalidContinuation2; | 164 | 4.79k | } | 165 | 221k | ch = (ch << 12) + (ch2 << 6) + ch3 - | 166 | 221k | ((0xE0 << 12) + (0x80 << 6) + 0x80); | 167 | 221k | assert ((ch > 0x07FF) && (ch <= 0xFFFF)); | 168 | 221k | s += 3; | 169 | 221k | if (STRINGLIB_MAX_CHAR <= 0x07FF || | 170 | 0 | (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR)) | 171 | | /* Out-of-range */ | 172 | 221k | goto Return; | 173 | 0 | *p++ = ch; | 174 | 0 | continue; | 175 | 221k | } | 176 | | | 177 | 114k | if (ch < 0xF5) { | 178 | | /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */ | 179 | 49.6k | Py_UCS4 ch2, ch3, ch4; | 180 | 49.6k | if (end - s < 4) { | 181 | | /* unexpected end of data: the caller will decide whether | 182 | | it's an error or not */ | 183 | 5.10k | if (end - s < 2) | 184 | 1.49k | break; | 185 | 3.60k | ch2 = (unsigned char)s[1]; | 186 
| 3.60k | if (!IS_CONTINUATION_BYTE(ch2) || | 187 | 1.90k | (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4)) | 188 | | /* for clarification see comments below */ | 189 | 2.38k | goto InvalidContinuation1; | 190 | 1.21k | if (end - s < 3) | 191 | 567 | break; | 192 | 649 | ch3 = (unsigned char)s[2]; | 193 | 649 | if (!IS_CONTINUATION_BYTE(ch3)) | 194 | 414 | goto InvalidContinuation2; | 195 | 235 | break; | 196 | 649 | } | 197 | 44.5k | ch2 = (unsigned char)s[1]; | 198 | 44.5k | ch3 = (unsigned char)s[2]; | 199 | 44.5k | ch4 = (unsigned char)s[3]; | 200 | 44.5k | if (!IS_CONTINUATION_BYTE(ch2)) { | 201 | | /* invalid continuation byte */ | 202 | 3.68k | goto InvalidContinuation1; | 203 | 3.68k | } | 204 | 40.8k | if (ch == 0xF0) { | 205 | 4.59k | if (ch2 < 0x90) | 206 | | /* invalid sequence | 207 | | \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */ | 208 | 140 | goto InvalidContinuation1; | 209 | 36.2k | } else if (ch == 0xF4 && ch2 >= 0x90) { | 210 | | /* invalid sequence | 211 | | \xF4\x90\x80\x80- -- 110000- overflow */ | 212 | 824 | goto InvalidContinuation1; | 213 | 824 | } | 214 | 39.8k | if (!IS_CONTINUATION_BYTE(ch3)) { | 215 | | /* invalid continuation byte */ | 216 | 1.49k | goto InvalidContinuation2; | 217 | 1.49k | } | 218 | 38.3k | if (!IS_CONTINUATION_BYTE(ch4)) { | 219 | | /* invalid continuation byte */ | 220 | 472 | goto InvalidContinuation3; | 221 | 472 | } | 222 | 37.8k | ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 - | 223 | 37.8k | ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80); | 224 | 37.8k | assert ((ch > 0xFFFF) && (ch <= 0x10FFFF)); | 225 | 37.8k | s += 4; | 226 | 37.8k | if (STRINGLIB_MAX_CHAR <= 0xFFFF || | 227 | 0 | (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR)) | 228 | | /* Out-of-range */ | 229 | 37.8k | goto Return; | 230 | 0 | *p++ = ch; | 231 | 0 | continue; | 232 | 37.8k | } | 233 | 64.5k | goto InvalidStart; | 234 | 114k | } | 235 | 5.09k | ch = 0; | 236 | 535k | Return: | 237 | 535k | *inptr = s; | 238 | 535k | 
*outpos = p - dest; | 239 | 535k | return ch; | 240 | 84.2k | InvalidStart: | 241 | 84.2k | ch = 1; | 242 | 84.2k | goto Return; | 243 | 21.6k | InvalidContinuation1: | 244 | 21.6k | ch = 2; | 245 | 21.6k | goto Return; | 246 | 6.70k | InvalidContinuation2: | 247 | 6.70k | ch = 3; | 248 | 6.70k | goto Return; | 249 | 472 | InvalidContinuation3: | 250 | 472 | ch = 4; | 251 | 472 | goto Return; | 252 | 5.09k | } |
unicodeobject.c:ucs1lib_utf8_decode Line | Count | Source | 26 | 372k | { | 27 | 372k | Py_UCS4 ch; | 28 | 372k | const char *s = *inptr; | 29 | 372k | STRINGLIB_CHAR *p = dest + *outpos; | 30 | | | 31 | 7.05M | while (s < end) { | 32 | 6.79M | ch = (unsigned char)*s; | 33 | | | 34 | 6.79M | if (ch < 0x80) { | 35 | | /* Fast path for runs of ASCII characters. Given that common UTF-8 | 36 | | input will consist of an overwhelming majority of ASCII | 37 | | characters, we try to optimize for this case by checking | 38 | | as many characters as a C 'size_t' can contain. | 39 | | First, check if we can do an aligned read, as most CPUs have | 40 | | a penalty for unaligned reads. | 41 | | */ | 42 | 2.89M | if (_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) { | 43 | | /* Help register allocation */ | 44 | 440k | const char *_s = s; | 45 | 440k | STRINGLIB_CHAR *_p = p; | 46 | 11.0M | while (_s + SIZEOF_SIZE_T <= end) { | 47 | | /* Read a whole size_t at a time (either 4 or 8 bytes), | 48 | | and do a fast unrolled copy if it only contains ASCII | 49 | | characters. 
*/ | 50 | 10.7M | size_t value = *(const size_t *) _s; | 51 | 10.7M | if (value & ASCII_CHAR_MASK) | 52 | 178k | break; | 53 | 10.5M | #if PY_LITTLE_ENDIAN | 54 | 10.5M | _p[0] = (STRINGLIB_CHAR)(value & 0xFFu); | 55 | 10.5M | _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 56 | 10.5M | _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 57 | 10.5M | _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 58 | 10.5M | # if SIZEOF_SIZE_T == 8 | 59 | 10.5M | _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 60 | 10.5M | _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 61 | 10.5M | _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 62 | 10.5M | _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 63 | 10.5M | # endif | 64 | | #else | 65 | | # if SIZEOF_SIZE_T == 8 | 66 | | _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 67 | | _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 68 | | _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 69 | | _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 70 | | _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 71 | | _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 72 | | _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 73 | | _p[7] = (STRINGLIB_CHAR)(value & 0xFFu); | 74 | | # else | 75 | | _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 76 | | _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 77 | | _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 78 | | _p[3] = (STRINGLIB_CHAR)(value & 0xFFu); | 79 | | # endif | 80 | | #endif | 81 | 10.5M | _s += SIZEOF_SIZE_T; | 82 | 10.5M | _p += SIZEOF_SIZE_T; | 83 | 10.5M | } | 84 | 440k | s = _s; | 85 | 440k | p = _p; | 86 | 440k | if (s == end) | 87 | 51.8k | break; | 88 | 388k | ch = (unsigned char)*s; | 89 | 388k | } | 90 | 2.83M | if (ch < 0x80) { | 91 | 2.81M | s++; | 92 | 2.81M | *p++ = ch; | 93 | 2.81M | continue; | 94 | 2.81M | } | 95 | 2.83M | } | 96 | | | 97 | 3.92M | if (ch < 0xE0) { | 98 | | /* \xC2\x80-\xDF\xBF -- 0080-07FF */ | 99 | 3.88M | Py_UCS4 ch2; | 100 | 3.88M 
| if (ch < 0xC2) { | 101 | | /* invalid sequence | 102 | | \x80-\xBF -- continuation byte | 103 | | \xC0-\xC1 -- fake 0000-007F */ | 104 | 1.76k | goto InvalidStart; | 105 | 1.76k | } | 106 | 3.88M | if (end - s < 2) { | 107 | | /* unexpected end of data: the caller will decide whether | 108 | | it's an error or not */ | 109 | 1.05k | break; | 110 | 1.05k | } | 111 | 3.88M | ch2 = (unsigned char)s[1]; | 112 | 3.88M | if (!IS_CONTINUATION_BYTE(ch2)) | 113 | | /* invalid continuation byte */ | 114 | 14.4k | goto InvalidContinuation1; | 115 | 3.87M | ch = (ch << 6) + ch2 - | 116 | 3.87M | ((0xC0 << 6) + 0x80); | 117 | 3.87M | assert ((ch > 0x007F) && (ch <= 0x07FF)); | 118 | 3.87M | s += 2; | 119 | 3.87M | if (STRINGLIB_MAX_CHAR <= 0x007F || | 120 | 3.87M | (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR)) | 121 | | /* Out-of-range */ | 122 | 2.29k | goto Return; | 123 | 3.86M | *p++ = ch; | 124 | 3.86M | continue; | 125 | 3.87M | } | 126 | | | 127 | 41.0k | if (ch < 0xF0) { | 128 | | /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */ | 129 | 29.5k | Py_UCS4 ch2, ch3; | 130 | 29.5k | if (end - s < 3) { | 131 | | /* unexpected end of data: the caller will decide whether | 132 | | it's an error or not */ | 133 | 1.59k | if (end - s < 2) | 134 | 391 | break; | 135 | 1.20k | ch2 = (unsigned char)s[1]; | 136 | 1.20k | if (!IS_CONTINUATION_BYTE(ch2) || | 137 | 859 | (ch2 < 0xA0 ? 
ch == 0xE0 : ch == 0xED)) | 138 | | /* for clarification see comments below */ | 139 | 765 | goto InvalidContinuation1; | 140 | 435 | break; | 141 | 1.20k | } | 142 | 27.9k | ch2 = (unsigned char)s[1]; | 143 | 27.9k | ch3 = (unsigned char)s[2]; | 144 | 27.9k | if (!IS_CONTINUATION_BYTE(ch2)) { | 145 | | /* invalid continuation byte */ | 146 | 1.54k | goto InvalidContinuation1; | 147 | 1.54k | } | 148 | 26.3k | if (ch == 0xE0) { | 149 | 618 | if (ch2 < 0xA0) | 150 | | /* invalid sequence | 151 | | \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */ | 152 | 130 | goto InvalidContinuation1; | 153 | 25.7k | } else if (ch == 0xED && ch2 >= 0xA0) { | 154 | | /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF | 155 | | will result in surrogates in range D800-DFFF. Surrogates are | 156 | | not valid UTF-8 so they are rejected. | 157 | | See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf | 158 | | (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ | 159 | 697 | goto InvalidContinuation1; | 160 | 697 | } | 161 | 25.5k | if (!IS_CONTINUATION_BYTE(ch3)) { | 162 | | /* invalid continuation byte */ | 163 | 869 | goto InvalidContinuation2; | 164 | 869 | } | 165 | 24.6k | ch = (ch << 12) + (ch2 << 6) + ch3 - | 166 | 24.6k | ((0xE0 << 12) + (0x80 << 6) + 0x80); | 167 | 24.6k | assert ((ch > 0x07FF) && (ch <= 0xFFFF)); | 168 | 24.6k | s += 3; | 169 | 24.6k | if (STRINGLIB_MAX_CHAR <= 0x07FF || | 170 | 0 | (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR)) | 171 | | /* Out-of-range */ | 172 | 24.6k | goto Return; | 173 | 0 | *p++ = ch; | 174 | 0 | continue; | 175 | 24.6k | } | 176 | | | 177 | 11.5k | if (ch < 0xF5) { | 178 | | /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */ | 179 | 9.19k | Py_UCS4 ch2, ch3, ch4; | 180 | 9.19k | if (end - s < 4) { | 181 | | /* unexpected end of data: the caller will decide whether | 182 | | it's an error or not */ | 183 | 1.82k | if (end - s < 2) | 184 | 310 | break; | 185 | 1.51k | ch2 = (unsigned char)s[1]; | 
186 | 1.51k | if (!IS_CONTINUATION_BYTE(ch2) || | 187 | 1.04k | (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4)) | 188 | | /* for clarification see comments below */ | 189 | 869 | goto InvalidContinuation1; | 190 | 645 | if (end - s < 3) | 191 | 132 | break; | 192 | 513 | ch3 = (unsigned char)s[2]; | 193 | 513 | if (!IS_CONTINUATION_BYTE(ch3)) | 194 | 399 | goto InvalidContinuation2; | 195 | 114 | break; | 196 | 513 | } | 197 | 7.36k | ch2 = (unsigned char)s[1]; | 198 | 7.36k | ch3 = (unsigned char)s[2]; | 199 | 7.36k | ch4 = (unsigned char)s[3]; | 200 | 7.36k | if (!IS_CONTINUATION_BYTE(ch2)) { | 201 | | /* invalid continuation byte */ | 202 | 616 | goto InvalidContinuation1; | 203 | 616 | } | 204 | 6.75k | if (ch == 0xF0) { | 205 | 1.13k | if (ch2 < 0x90) | 206 | | /* invalid sequence | 207 | | \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */ | 208 | 113 | goto InvalidContinuation1; | 209 | 5.61k | } else if (ch == 0xF4 && ch2 >= 0x90) { | 210 | | /* invalid sequence | 211 | | \xF4\x90\x80\x80- -- 110000- overflow */ | 212 | 329 | goto InvalidContinuation1; | 213 | 329 | } | 214 | 6.30k | if (!IS_CONTINUATION_BYTE(ch3)) { | 215 | | /* invalid continuation byte */ | 216 | 1.64k | goto InvalidContinuation2; | 217 | 1.64k | } | 218 | 4.66k | if (!IS_CONTINUATION_BYTE(ch4)) { | 219 | | /* invalid continuation byte */ | 220 | 313 | goto InvalidContinuation3; | 221 | 313 | } | 222 | 4.34k | ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 - | 223 | 4.34k | ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80); | 224 | 4.34k | assert ((ch > 0xFFFF) && (ch <= 0x10FFFF)); | 225 | 4.34k | s += 4; | 226 | 4.34k | if (STRINGLIB_MAX_CHAR <= 0xFFFF || | 227 | 0 | (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR)) | 228 | | /* Out-of-range */ | 229 | 4.34k | goto Return; | 230 | 0 | *p++ = ch; | 231 | 0 | continue; | 232 | 4.34k | } | 233 | 2.38k | goto InvalidStart; | 234 | 11.5k | } | 235 | 314k | ch = 0; | 236 | 372k | Return: | 237 | 372k | *inptr = s; | 238 | 372k | 
*outpos = p - dest; | 239 | 372k | return ch; | 240 | 4.15k | InvalidStart: | 241 | 4.15k | ch = 1; | 242 | 4.15k | goto Return; | 243 | 19.4k | InvalidContinuation1: | 244 | 19.4k | ch = 2; | 245 | 19.4k | goto Return; | 246 | 2.91k | InvalidContinuation2: | 247 | 2.91k | ch = 3; | 248 | 2.91k | goto Return; | 249 | 313 | InvalidContinuation3: | 250 | 313 | ch = 4; | 251 | 313 | goto Return; | 252 | 314k | } |
unicodeobject.c:ucs2lib_utf8_decode Line | Count | Source | 26 | 91.8M | { | 27 | 91.8M | Py_UCS4 ch; | 28 | 91.8M | const char *s = *inptr; | 29 | 91.8M | STRINGLIB_CHAR *p = dest + *outpos; | 30 | | | 31 | 195M | while (s < end) { | 32 | 195M | ch = (unsigned char)*s; | 33 | | | 34 | 195M | if (ch < 0x80) { | 35 | | /* Fast path for runs of ASCII characters. Given that common UTF-8 | 36 | | input will consist of an overwhelming majority of ASCII | 37 | | characters, we try to optimize for this case by checking | 38 | | as many characters as a C 'size_t' can contain. | 39 | | First, check if we can do an aligned read, as most CPUs have | 40 | | a penalty for unaligned reads. | 41 | | */ | 42 | 65.9M | if (_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) { | 43 | | /* Help register allocation */ | 44 | 8.82M | const char *_s = s; | 45 | 8.82M | STRINGLIB_CHAR *_p = p; | 46 | 193M | while (_s + SIZEOF_SIZE_T <= end) { | 47 | | /* Read a whole size_t at a time (either 4 or 8 bytes), | 48 | | and do a fast unrolled copy if it only contains ASCII | 49 | | characters. 
*/ | 50 | 193M | size_t value = *(const size_t *) _s; | 51 | 193M | if (value & ASCII_CHAR_MASK) | 52 | 8.68M | break; | 53 | 185M | #if PY_LITTLE_ENDIAN | 54 | 185M | _p[0] = (STRINGLIB_CHAR)(value & 0xFFu); | 55 | 185M | _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 56 | 185M | _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 57 | 185M | _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 58 | 185M | # if SIZEOF_SIZE_T == 8 | 59 | 185M | _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 60 | 185M | _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 61 | 185M | _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 62 | 185M | _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 63 | 185M | # endif | 64 | | #else | 65 | | # if SIZEOF_SIZE_T == 8 | 66 | | _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 67 | | _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 68 | | _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 69 | | _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 70 | | _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 71 | | _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 72 | | _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 73 | | _p[7] = (STRINGLIB_CHAR)(value & 0xFFu); | 74 | | # else | 75 | | _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 76 | | _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 77 | | _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 78 | | _p[3] = (STRINGLIB_CHAR)(value & 0xFFu); | 79 | | # endif | 80 | | #endif | 81 | 185M | _s += SIZEOF_SIZE_T; | 82 | 185M | _p += SIZEOF_SIZE_T; | 83 | 185M | } | 84 | 8.82M | s = _s; | 85 | 8.82M | p = _p; | 86 | 8.82M | if (s == end) | 87 | 10.4k | break; | 88 | 8.81M | ch = (unsigned char)*s; | 89 | 8.81M | } | 90 | 65.9M | if (ch < 0x80) { | 91 | 65.4M | s++; | 92 | 65.4M | *p++ = ch; | 93 | 65.4M | continue; | 94 | 65.4M | } | 95 | 65.9M | } | 96 | | | 97 | 130M | if (ch < 0xE0) { | 98 | | /* \xC2\x80-\xDF\xBF -- 0080-07FF */ | 99 | 38.1M | Py_UCS4 ch2; | 100 | 38.1M | if (ch < 
0xC2) { | 101 | | /* invalid sequence | 102 | | \x80-\xBF -- continuation byte | 103 | | \xC0-\xC1 -- fake 0000-007F */ | 104 | 19.1M | goto InvalidStart; | 105 | 19.1M | } | 106 | 19.0M | if (end - s < 2) { | 107 | | /* unexpected end of data: the caller will decide whether | 108 | | it's an error or not */ | 109 | 7.48k | break; | 110 | 7.48k | } | 111 | 19.0M | ch2 = (unsigned char)s[1]; | 112 | 19.0M | if (!IS_CONTINUATION_BYTE(ch2)) | 113 | | /* invalid continuation byte */ | 114 | 8.17M | goto InvalidContinuation1; | 115 | 10.8M | ch = (ch << 6) + ch2 - | 116 | 10.8M | ((0xC0 << 6) + 0x80); | 117 | 10.8M | assert ((ch > 0x007F) && (ch <= 0x07FF)); | 118 | 10.8M | s += 2; | 119 | 10.8M | if (STRINGLIB_MAX_CHAR <= 0x007F || | 120 | 0 | (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR)) | 121 | | /* Out-of-range */ | 122 | 0 | goto Return; | 123 | 10.8M | *p++ = ch; | 124 | 10.8M | continue; | 125 | 10.8M | } | 126 | | | 127 | 91.8M | if (ch < 0xF0) { | 128 | | /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */ | 129 | 32.7M | Py_UCS4 ch2, ch3; | 130 | 32.7M | if (end - s < 3) { | 131 | | /* unexpected end of data: the caller will decide whether | 132 | | it's an error or not */ | 133 | 5.08k | if (end - s < 2) | 134 | 1.96k | break; | 135 | 3.12k | ch2 = (unsigned char)s[1]; | 136 | 3.12k | if (!IS_CONTINUATION_BYTE(ch2) || | 137 | 1.63k | (ch2 < 0xA0 ? 
ch == 0xE0 : ch == 0xED)) | 138 | | /* for clarification see comments below */ | 139 | 2.25k | goto InvalidContinuation1; | 140 | 864 | break; | 141 | 3.12k | } | 142 | 32.7M | ch2 = (unsigned char)s[1]; | 143 | 32.7M | ch3 = (unsigned char)s[2]; | 144 | 32.7M | if (!IS_CONTINUATION_BYTE(ch2)) { | 145 | | /* invalid continuation byte */ | 146 | 4.86M | goto InvalidContinuation1; | 147 | 4.86M | } | 148 | 27.8M | if (ch == 0xE0) { | 149 | 238k | if (ch2 < 0xA0) | 150 | | /* invalid sequence | 151 | | \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */ | 152 | 13.8k | goto InvalidContinuation1; | 153 | 27.6M | } else if (ch == 0xED && ch2 >= 0xA0) { | 154 | | /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF | 155 | | will result in surrogates in range D800-DFFF. Surrogates are | 156 | | not valid UTF-8 so they are rejected. | 157 | | See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf | 158 | | (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ | 159 | 19.8k | goto InvalidContinuation1; | 160 | 19.8k | } | 161 | 27.8M | if (!IS_CONTINUATION_BYTE(ch3)) { | 162 | | /* invalid continuation byte */ | 163 | 249k | goto InvalidContinuation2; | 164 | 249k | } | 165 | 27.6M | ch = (ch << 12) + (ch2 << 6) + ch3 - | 166 | 27.6M | ((0xE0 << 12) + (0x80 << 6) + 0x80); | 167 | 27.6M | assert ((ch > 0x07FF) && (ch <= 0xFFFF)); | 168 | 27.6M | s += 3; | 169 | 27.6M | if (STRINGLIB_MAX_CHAR <= 0x07FF || | 170 | 0 | (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR)) | 171 | | /* Out-of-range */ | 172 | 0 | goto Return; | 173 | 27.6M | *p++ = ch; | 174 | 27.6M | continue; | 175 | 27.6M | } | 176 | | | 177 | 59.1M | if (ch < 0xF5) { | 178 | | /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */ | 179 | 3.54M | Py_UCS4 ch2, ch3, ch4; | 180 | 3.54M | if (end - s < 4) { | 181 | | /* unexpected end of data: the caller will decide whether | 182 | | it's an error or not */ | 183 | 9.11k | if (end - s < 2) | 184 | 2.19k | break; | 185 | 6.91k | ch2 = 
(unsigned char)s[1]; | 186 | 6.91k | if (!IS_CONTINUATION_BYTE(ch2) || | 187 | 3.63k | (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4)) | 188 | | /* for clarification see comments below */ | 189 | 4.33k | goto InvalidContinuation1; | 190 | 2.58k | if (end - s < 3) | 191 | 557 | break; | 192 | 2.02k | ch3 = (unsigned char)s[2]; | 193 | 2.02k | if (!IS_CONTINUATION_BYTE(ch3)) | 194 | 653 | goto InvalidContinuation2; | 195 | 1.37k | break; | 196 | 2.02k | } | 197 | 3.53M | ch2 = (unsigned char)s[1]; | 198 | 3.53M | ch3 = (unsigned char)s[2]; | 199 | 3.53M | ch4 = (unsigned char)s[3]; | 200 | 3.53M | if (!IS_CONTINUATION_BYTE(ch2)) { | 201 | | /* invalid continuation byte */ | 202 | 3.34M | goto InvalidContinuation1; | 203 | 3.34M | } | 204 | 185k | if (ch == 0xF0) { | 205 | 40.0k | if (ch2 < 0x90) | 206 | | /* invalid sequence | 207 | | \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */ | 208 | 9.32k | goto InvalidContinuation1; | 209 | 145k | } else if (ch == 0xF4 && ch2 >= 0x90) { | 210 | | /* invalid sequence | 211 | | \xF4\x90\x80\x80- -- 110000- overflow */ | 212 | 33.4k | goto InvalidContinuation1; | 213 | 33.4k | } | 214 | 142k | if (!IS_CONTINUATION_BYTE(ch3)) { | 215 | | /* invalid continuation byte */ | 216 | 86.2k | goto InvalidContinuation2; | 217 | 86.2k | } | 218 | 56.6k | if (!IS_CONTINUATION_BYTE(ch4)) { | 219 | | /* invalid continuation byte */ | 220 | 18.2k | goto InvalidContinuation3; | 221 | 18.2k | } | 222 | 38.4k | ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 - | 223 | 38.4k | ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80); | 224 | 38.4k | assert ((ch > 0xFFFF) && (ch <= 0x10FFFF)); | 225 | 38.4k | s += 4; | 226 | 38.4k | if (STRINGLIB_MAX_CHAR <= 0xFFFF || | 227 | 0 | (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR)) | 228 | | /* Out-of-range */ | 229 | 38.4k | goto Return; | 230 | 0 | *p++ = ch; | 231 | 0 | continue; | 232 | 38.4k | } | 233 | 55.5M | goto InvalidStart; | 234 | 59.1M | } | 235 | 341k | ch = 0; | 236 | 91.8M | 
Return: | 237 | 91.8M | *inptr = s; | 238 | 91.8M | *outpos = p - dest; | 239 | 91.8M | return ch; | 240 | 74.6M | InvalidStart: | 241 | 74.6M | ch = 1; | 242 | 74.6M | goto Return; | 243 | 16.4M | InvalidContinuation1: | 244 | 16.4M | ch = 2; | 245 | 16.4M | goto Return; | 246 | 336k | InvalidContinuation2: | 247 | 336k | ch = 3; | 248 | 336k | goto Return; | 249 | 18.2k | InvalidContinuation3: | 250 | 18.2k | ch = 4; | 251 | 18.2k | goto Return; | 252 | 341k | } |
unicodeobject.c:ucs4lib_utf8_decode Line | Count | Source | 26 | 100M | { | 27 | 100M | Py_UCS4 ch; | 28 | 100M | const char *s = *inptr; | 29 | 100M | STRINGLIB_CHAR *p = dest + *outpos; | 30 | | | 31 | 249M | while (s < end) { | 32 | 248M | ch = (unsigned char)*s; | 33 | | | 34 | 248M | if (ch < 0x80) { | 35 | | /* Fast path for runs of ASCII characters. Given that common UTF-8 | 36 | | input will consist of an overwhelming majority of ASCII | 37 | | characters, we try to optimize for this case by checking | 38 | | as many characters as a C 'size_t' can contain. | 39 | | First, check if we can do an aligned read, as most CPUs have | 40 | | a penalty for unaligned reads. | 41 | | */ | 42 | 113M | if (_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) { | 43 | | /* Help register allocation */ | 44 | 14.4M | const char *_s = s; | 45 | 14.4M | STRINGLIB_CHAR *_p = p; | 46 | 167M | while (_s + SIZEOF_SIZE_T <= end) { | 47 | | /* Read a whole size_t at a time (either 4 or 8 bytes), | 48 | | and do a fast unrolled copy if it only contains ASCII | 49 | | characters. 
*/ | 50 | 167M | size_t value = *(const size_t *) _s; | 51 | 167M | if (value & ASCII_CHAR_MASK) | 52 | 14.4M | break; | 53 | 153M | #if PY_LITTLE_ENDIAN | 54 | 153M | _p[0] = (STRINGLIB_CHAR)(value & 0xFFu); | 55 | 153M | _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 56 | 153M | _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 57 | 153M | _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 58 | 153M | # if SIZEOF_SIZE_T == 8 | 59 | 153M | _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 60 | 153M | _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 61 | 153M | _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 62 | 153M | _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 63 | 153M | # endif | 64 | | #else | 65 | | # if SIZEOF_SIZE_T == 8 | 66 | | _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 67 | | _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 68 | | _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 69 | | _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 70 | | _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 71 | | _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 72 | | _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 73 | | _p[7] = (STRINGLIB_CHAR)(value & 0xFFu); | 74 | | # else | 75 | | _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 76 | | _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 77 | | _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 78 | | _p[3] = (STRINGLIB_CHAR)(value & 0xFFu); | 79 | | # endif | 80 | | #endif | 81 | 153M | _s += SIZEOF_SIZE_T; | 82 | 153M | _p += SIZEOF_SIZE_T; | 83 | 153M | } | 84 | 14.4M | s = _s; | 85 | 14.4M | p = _p; | 86 | 14.4M | if (s == end) | 87 | 6.85k | break; | 88 | 14.4M | ch = (unsigned char)*s; | 89 | 14.4M | } | 90 | 113M | if (ch < 0x80) { | 91 | 112M | s++; | 92 | 112M | *p++ = ch; | 93 | 112M | continue; | 94 | 112M | } | 95 | 113M | } | 96 | | | 97 | 136M | if (ch < 0xE0) { | 98 | | /* \xC2\x80-\xDF\xBF -- 0080-07FF */ | 99 | 63.1M | Py_UCS4 ch2; | 100 | 63.1M | if (ch < 0xC2) 
{ | 101 | | /* invalid sequence | 102 | | \x80-\xBF -- continuation byte | 103 | | \xC0-\xC1 -- fake 0000-007F */ | 104 | 40.7M | goto InvalidStart; | 105 | 40.7M | } | 106 | 22.4M | if (end - s < 2) { | 107 | | /* unexpected end of data: the caller will decide whether | 108 | | it's an error or not */ | 109 | 1.79k | break; | 110 | 1.79k | } | 111 | 22.4M | ch2 = (unsigned char)s[1]; | 112 | 22.4M | if (!IS_CONTINUATION_BYTE(ch2)) | 113 | | /* invalid continuation byte */ | 114 | 17.4M | goto InvalidContinuation1; | 115 | 5.06M | ch = (ch << 6) + ch2 - | 116 | 5.06M | ((0xC0 << 6) + 0x80); | 117 | 5.06M | assert ((ch > 0x007F) && (ch <= 0x07FF)); | 118 | 5.06M | s += 2; | 119 | 5.06M | if (STRINGLIB_MAX_CHAR <= 0x007F || | 120 | 0 | (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR)) | 121 | | /* Out-of-range */ | 122 | 0 | goto Return; | 123 | 5.06M | *p++ = ch; | 124 | 5.06M | continue; | 125 | 5.06M | } | 126 | | | 127 | 73.2M | if (ch < 0xF0) { | 128 | | /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */ | 129 | 37.2M | Py_UCS4 ch2, ch3; | 130 | 37.2M | if (end - s < 3) { | 131 | | /* unexpected end of data: the caller will decide whether | 132 | | it's an error or not */ | 133 | 4.33k | if (end - s < 2) | 134 | 1.53k | break; | 135 | 2.80k | ch2 = (unsigned char)s[1]; | 136 | 2.80k | if (!IS_CONTINUATION_BYTE(ch2) || | 137 | 1.41k | (ch2 < 0xA0 ? 
ch == 0xE0 : ch == 0xED)) | 138 | | /* for clarification see comments below */ | 139 | 2.14k | goto InvalidContinuation1; | 140 | 657 | break; | 141 | 2.80k | } | 142 | 37.2M | ch2 = (unsigned char)s[1]; | 143 | 37.2M | ch3 = (unsigned char)s[2]; | 144 | 37.2M | if (!IS_CONTINUATION_BYTE(ch2)) { | 145 | | /* invalid continuation byte */ | 146 | 7.29M | goto InvalidContinuation1; | 147 | 7.29M | } | 148 | 29.9M | if (ch == 0xE0) { | 149 | 193k | if (ch2 < 0xA0) | 150 | | /* invalid sequence | 151 | | \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */ | 152 | 57.1k | goto InvalidContinuation1; | 153 | 29.7M | } else if (ch == 0xED && ch2 >= 0xA0) { | 154 | | /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF | 155 | | will result in surrogates in range D800-DFFF. Surrogates are | 156 | | not valid UTF-8 so they are rejected. | 157 | | See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf | 158 | | (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ | 159 | 94.2k | goto InvalidContinuation1; | 160 | 94.2k | } | 161 | 29.7M | if (!IS_CONTINUATION_BYTE(ch3)) { | 162 | | /* invalid continuation byte */ | 163 | 1.21M | goto InvalidContinuation2; | 164 | 1.21M | } | 165 | 28.5M | ch = (ch << 12) + (ch2 << 6) + ch3 - | 166 | 28.5M | ((0xE0 << 12) + (0x80 << 6) + 0x80); | 167 | 28.5M | assert ((ch > 0x07FF) && (ch <= 0xFFFF)); | 168 | 28.5M | s += 3; | 169 | 28.5M | if (STRINGLIB_MAX_CHAR <= 0x07FF || | 170 | 0 | (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR)) | 171 | | /* Out-of-range */ | 172 | 0 | goto Return; | 173 | 28.5M | *p++ = ch; | 174 | 28.5M | continue; | 175 | 28.5M | } | 176 | | | 177 | 35.9M | if (ch < 0xF5) { | 178 | | /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */ | 179 | 6.59M | Py_UCS4 ch2, ch3, ch4; | 180 | 6.59M | if (end - s < 4) { | 181 | | /* unexpected end of data: the caller will decide whether | 182 | | it's an error or not */ | 183 | 4.58k | if (end - s < 2) | 184 | 868 | break; | 185 | 3.71k | ch2 = 
(unsigned char)s[1]; | 186 | 3.71k | if (!IS_CONTINUATION_BYTE(ch2) || | 187 | 2.46k | (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4)) | 188 | | /* for clarification see comments below */ | 189 | 1.95k | goto InvalidContinuation1; | 190 | 1.76k | if (end - s < 3) | 191 | 513 | break; | 192 | 1.25k | ch3 = (unsigned char)s[2]; | 193 | 1.25k | if (!IS_CONTINUATION_BYTE(ch3)) | 194 | 622 | goto InvalidContinuation2; | 195 | 628 | break; | 196 | 1.25k | } | 197 | 6.58M | ch2 = (unsigned char)s[1]; | 198 | 6.58M | ch3 = (unsigned char)s[2]; | 199 | 6.58M | ch4 = (unsigned char)s[3]; | 200 | 6.58M | if (!IS_CONTINUATION_BYTE(ch2)) { | 201 | | /* invalid continuation byte */ | 202 | 3.75M | goto InvalidContinuation1; | 203 | 3.75M | } | 204 | 2.83M | if (ch == 0xF0) { | 205 | 419k | if (ch2 < 0x90) | 206 | | /* invalid sequence | 207 | | \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */ | 208 | 37.9k | goto InvalidContinuation1; | 209 | 2.41M | } else if (ch == 0xF4 && ch2 >= 0x90) { | 210 | | /* invalid sequence | 211 | | \xF4\x90\x80\x80- -- 110000- overflow */ | 212 | 84.0k | goto InvalidContinuation1; | 213 | 84.0k | } | 214 | 2.71M | if (!IS_CONTINUATION_BYTE(ch3)) { | 215 | | /* invalid continuation byte */ | 216 | 418k | goto InvalidContinuation2; | 217 | 418k | } | 218 | 2.29M | if (!IS_CONTINUATION_BYTE(ch4)) { | 219 | | /* invalid continuation byte */ | 220 | 256k | goto InvalidContinuation3; | 221 | 256k | } | 222 | 2.03M | ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 - | 223 | 2.03M | ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80); | 224 | 2.03M | assert ((ch > 0xFFFF) && (ch <= 0x10FFFF)); | 225 | 2.03M | s += 4; | 226 | 2.03M | if (STRINGLIB_MAX_CHAR <= 0xFFFF || | 227 | 0 | (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR)) | 228 | | /* Out-of-range */ | 229 | 0 | goto Return; | 230 | 2.03M | *p++ = ch; | 231 | 2.03M | continue; | 232 | 2.03M | } | 233 | 29.3M | goto InvalidStart; | 234 | 35.9M | } | 235 | 89.6k | ch = 0; | 236 | 100M | 
Return: | 237 | 100M | *inptr = s; | 238 | 100M | *outpos = p - dest; | 239 | 100M | return ch; | 240 | 70.1M | InvalidStart: | 241 | 70.1M | ch = 1; | 242 | 70.1M | goto Return; | 243 | 28.7M | InvalidContinuation1: | 244 | 28.7M | ch = 2; | 245 | 28.7M | goto Return; | 246 | 1.63M | InvalidContinuation2: | 247 | 1.63M | ch = 3; | 248 | 1.63M | goto Return; | 249 | 256k | InvalidContinuation3: | 250 | 256k | ch = 4; | 251 | 256k | goto Return; | 252 | 89.6k | } |
|
253 | | |
254 | | #undef ASCII_CHAR_MASK |
255 | | |
256 | | |
257 | | /* UTF-8 encoder specialized for a Unicode kind to avoid the slow |
258 | | PyUnicode_READ() macro. Delete some parts of the code depending on the kind: |
259 | | UCS-1 strings don't need to handle surrogates for example. */ |
260 | | Py_LOCAL_INLINE(PyBytesWriter*) |
261 | | STRINGLIB(utf8_encoder)(PyObject *unicode, |
262 | | const STRINGLIB_CHAR *data, |
263 | | Py_ssize_t size, |
264 | | _Py_error_handler error_handler, |
265 | | const char *errors, |
266 | | char **end) |
267 | 8.58M | { |
268 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
269 | | PyObject *error_handler_obj = NULL; |
270 | | PyObject *exc = NULL; |
271 | | PyObject *rep = NULL; |
272 | | #endif |
273 | | #if STRINGLIB_SIZEOF_CHAR == 1 |
274 | | const Py_ssize_t max_char_size = 2; |
275 | | #elif STRINGLIB_SIZEOF_CHAR == 2 |
276 | | const Py_ssize_t max_char_size = 3; |
277 | | #else /* STRINGLIB_SIZEOF_CHAR == 4 */ |
278 | | const Py_ssize_t max_char_size = 4; |
279 | | #endif |
280 | | |
281 | 8.58M | assert(size >= 0); |
282 | 8.58M | if (size > PY_SSIZE_T_MAX / max_char_size) { |
283 | | /* integer overflow */ |
284 | 0 | PyErr_NoMemory(); |
285 | 0 | *end = NULL; |
286 | 0 | return NULL; |
287 | 0 | } |
288 | | |
289 | 8.58M | PyBytesWriter *writer = PyBytesWriter_Create(size * max_char_size); |
290 | 8.58M | if (writer == NULL) { |
291 | 0 | *end = NULL; |
292 | 0 | return NULL; |
293 | 0 | } |
294 | | /* next free byte in output buffer */ |
295 | 8.58M | char *p = PyBytesWriter_GetData(writer); |
296 | | |
297 | 6.57M | Py_ssize_t i; /* index into data of next input character */ |
298 | 3.79G | for (i = 0; i < size;) { |
299 | 3.78G | Py_UCS4 ch = data[i++]; |
300 | | |
301 | 3.78G | if (ch < 0x80) { |
302 | | /* Encode ASCII */ |
303 | 3.56G | *p++ = (char) ch; |
304 | | |
305 | 3.56G | } |
306 | 120M | else |
307 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
308 | 120M | if (ch < 0x0800) |
309 | 27.0M | #endif |
310 | 133M | { |
311 | | /* Encode Latin-1 */ |
312 | 133M | *p++ = (char)(0xc0 | (ch >> 6)); |
313 | 133M | *p++ = (char)(0x80 | (ch & 0x3f)); |
314 | 133M | } |
315 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
316 | 93.6M | else if (Py_UNICODE_IS_SURROGATE(ch)) { |
317 | 411k | Py_ssize_t startpos, endpos, newpos; |
318 | 411k | Py_ssize_t k; |
319 | 411k | if (error_handler == _Py_ERROR_UNKNOWN) { |
320 | 238k | error_handler = _Py_GetErrorHandler(errors); |
321 | 238k | } |
322 | | |
323 | 411k | startpos = i-1; |
324 | 411k | endpos = startpos+1; |
325 | | |
326 | 14.6M | while ((endpos < size) && Py_UNICODE_IS_SURROGATE(data[endpos])) |
327 | 14.2M | endpos++; |
328 | | |
329 | | /* Only overallocate the buffer if it's not the last write */ |
330 | 411k | writer->overallocate = (endpos < size); |
331 | | |
332 | 411k | switch (error_handler) |
333 | 411k | { |
334 | 0 | case _Py_ERROR_REPLACE: |
335 | 0 | memset(p, '?', endpos - startpos); |
336 | 0 | p += (endpos - startpos); |
337 | 0 | _Py_FALLTHROUGH; |
338 | 0 | case _Py_ERROR_IGNORE: |
339 | 0 | i += (endpos - startpos - 1); |
340 | 0 | break; |
341 | | |
342 | 0 | case _Py_ERROR_SURROGATEPASS: |
343 | 0 | for (k=startpos; k<endpos; k++) { |
344 | 0 | ch = data[k]; |
345 | 0 | *p++ = (char)(0xe0 | (ch >> 12)); |
346 | 0 | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); |
347 | 0 | *p++ = (char)(0x80 | (ch & 0x3f)); |
348 | 0 | } |
349 | 0 | i += (endpos - startpos - 1); |
350 | 0 | break; |
351 | | |
352 | 0 | case _Py_ERROR_BACKSLASHREPLACE: |
353 | | /* subtract preallocated bytes */ |
354 | 0 | writer->size -= max_char_size * (endpos - startpos); |
355 | 0 | p = backslashreplace(writer, p, |
356 | 0 | unicode, startpos, endpos); |
357 | 0 | if (p == NULL) |
358 | 0 | goto error; |
359 | 0 | i += (endpos - startpos - 1); |
360 | 0 | break; |
361 | | |
362 | 0 | case _Py_ERROR_XMLCHARREFREPLACE: |
363 | | /* subtract preallocated bytes */ |
364 | 0 | writer->size -= max_char_size * (endpos - startpos); |
365 | 0 | p = xmlcharrefreplace(writer, p, |
366 | 0 | unicode, startpos, endpos); |
367 | 0 | if (p == NULL) |
368 | 0 | goto error; |
369 | 0 | i += (endpos - startpos - 1); |
370 | 0 | break; |
371 | | |
372 | 236k | case _Py_ERROR_SURROGATEESCAPE: |
373 | 10.7M | for (k=startpos; k<endpos; k++) { |
374 | 10.4M | ch = data[k]; |
375 | 10.4M | if (!(0xDC80 <= ch && ch <= 0xDCFF)) |
376 | 20 | break; |
377 | 10.4M | *p++ = (char)(ch & 0xff); |
378 | 10.4M | } |
379 | 236k | if (k >= endpos) { |
380 | 236k | i += (endpos - startpos - 1); |
381 | 236k | break; |
382 | 236k | } |
383 | 20 | startpos = k; |
384 | 20 | assert(startpos < endpos); |
385 | 20 | _Py_FALLTHROUGH; |
386 | 175k | default: |
387 | 175k | rep = unicode_encode_call_errorhandler( |
388 | 175k | errors, &error_handler_obj, "utf-8", "surrogates not allowed", |
389 | 175k | unicode, &exc, startpos, endpos, &newpos); |
390 | 175k | if (!rep) |
391 | 175k | goto error; |
392 | | |
393 | 0 | if (newpos < startpos) { |
394 | 0 | writer->overallocate = 1; |
395 | 0 | p = PyBytesWriter_GrowAndUpdatePointer(writer, |
396 | 0 | max_char_size * (startpos - newpos), |
397 | 0 | p); |
398 | 0 | if (p == NULL) { |
399 | 0 | goto error; |
400 | 0 | } |
401 | 0 | } |
402 | 0 | else { |
403 | | /* subtract preallocated bytes */ |
404 | 0 | writer->size -= max_char_size * (newpos - startpos); |
405 | | /* Only overallocate the buffer if it's not the last write */ |
406 | 0 | writer->overallocate = (newpos < size); |
407 | 0 | } |
408 | | |
409 | 0 | char *rep_str; |
410 | 0 | Py_ssize_t rep_len; |
411 | 0 | if (PyBytes_Check(rep)) { |
412 | 0 | rep_str = PyBytes_AS_STRING(rep); |
413 | 0 | rep_len = PyBytes_GET_SIZE(rep); |
414 | 0 | } |
415 | 0 | else { |
416 | | /* rep is unicode */ |
417 | 0 | if (!PyUnicode_IS_ASCII(rep)) { |
418 | 0 | raise_encode_exception(&exc, "utf-8", unicode, |
419 | 0 | startpos, endpos, |
420 | 0 | "surrogates not allowed"); |
421 | 0 | goto error; |
422 | 0 | } |
423 | | |
424 | 0 | rep_str = PyUnicode_DATA(rep); |
425 | 0 | rep_len = PyUnicode_GET_LENGTH(rep); |
426 | 0 | } |
427 | | |
428 | 0 | p = PyBytesWriter_GrowAndUpdatePointer(writer, rep_len, p); |
429 | 0 | if (p == NULL) { |
430 | 0 | goto error; |
431 | 0 | } |
432 | 0 | memcpy(p, rep_str, rep_len); |
433 | 0 | p += rep_len; |
434 | 0 | Py_CLEAR(rep); |
435 | |
|
436 | 0 | i = newpos; |
437 | 411k | } |
438 | | |
439 | | /* If overallocation was disabled, ensure that it was the last |
440 | | write. Otherwise, we missed an optimization */ |
441 | 411k | assert(writer->overallocate || i == size); |
442 | 236k | } |
443 | 38.7M | else |
444 | | #if STRINGLIB_SIZEOF_CHAR > 2 |
445 | 38.7M | if (ch < 0x10000) |
446 | 38.6M | #endif |
447 | 93.1M | { |
448 | 93.1M | *p++ = (char)(0xe0 | (ch >> 12)); |
449 | 93.1M | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); |
450 | 93.1M | *p++ = (char)(0x80 | (ch & 0x3f)); |
451 | 93.1M | } |
452 | | #if STRINGLIB_SIZEOF_CHAR > 2 |
453 | | else /* ch >= 0x10000 */ |
454 | 123k | { |
455 | 123k | assert(ch <= MAX_UNICODE); |
456 | | /* Encode UCS4 Unicode ordinals */ |
457 | 123k | *p++ = (char)(0xf0 | (ch >> 18)); |
458 | 123k | *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); |
459 | 123k | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); |
460 | 123k | *p++ = (char)(0x80 | (ch & 0x3f)); |
461 | 123k | } |
462 | | #endif /* STRINGLIB_SIZEOF_CHAR > 2 */ |
463 | | #endif /* STRINGLIB_SIZEOF_CHAR > 1 */ |
464 | 3.78G | } |
465 | | |
466 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
467 | 1.82M | Py_XDECREF(error_handler_obj); |
468 | 1.82M | Py_XDECREF(exc); |
469 | | #endif |
470 | 1.82M | *end = p; |
471 | 1.82M | return writer; |
472 | | |
473 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
474 | 175k | error: |
475 | 175k | PyBytesWriter_Discard(writer); |
476 | 175k | Py_XDECREF(rep); |
477 | 175k | Py_XDECREF(error_handler_obj); |
478 | 175k | Py_XDECREF(exc); |
479 | 175k | *end = NULL; |
480 | 175k | return NULL; |
481 | | #endif |
482 | 2.00M | } unicodeobject.c:ucs1lib_utf8_encoder Line | Count | Source | 267 | 6.57M | { | 268 | | #if STRINGLIB_SIZEOF_CHAR > 1 | 269 | | PyObject *error_handler_obj = NULL; | 270 | | PyObject *exc = NULL; | 271 | | PyObject *rep = NULL; | 272 | | #endif | 273 | 6.57M | #if STRINGLIB_SIZEOF_CHAR == 1 | 274 | 6.57M | const Py_ssize_t max_char_size = 2; | 275 | | #elif STRINGLIB_SIZEOF_CHAR == 2 | 276 | | const Py_ssize_t max_char_size = 3; | 277 | | #else /* STRINGLIB_SIZEOF_CHAR == 4 */ | 278 | | const Py_ssize_t max_char_size = 4; | 279 | | #endif | 280 | | | 281 | 6.57M | assert(size >= 0); | 282 | 6.57M | if (size > PY_SSIZE_T_MAX / max_char_size) { | 283 | | /* integer overflow */ | 284 | 0 | PyErr_NoMemory(); | 285 | 0 | *end = NULL; | 286 | 0 | return NULL; | 287 | 0 | } | 288 | | | 289 | 6.57M | PyBytesWriter *writer = PyBytesWriter_Create(size * max_char_size); | 290 | 6.57M | if (writer == NULL) { | 291 | 0 | *end = NULL; | 292 | 0 | return NULL; | 293 | 0 | } | 294 | | /* next free byte in output buffer */ | 295 | 6.57M | char *p = PyBytesWriter_GetData(writer); | 296 | | | 297 | 6.57M | Py_ssize_t i; /* index into data of next input character */ | 298 | 840M | for (i = 0; i < size;) { | 299 | 833M | Py_UCS4 ch = data[i++]; | 300 | | | 301 | 833M | if (ch < 0x80) { | 302 | | /* Encode ASCII */ | 303 | 727M | *p++ = (char) ch; | 304 | | | 305 | 727M | } | 306 | 106M | else | 307 | | #if STRINGLIB_SIZEOF_CHAR > 1 | 308 | | if (ch < 0x0800) | 309 | | #endif | 310 | 106M | { | 311 | | /* Encode Latin-1 */ | 312 | 106M | *p++ = (char)(0xc0 | (ch >> 6)); | 313 | 106M | *p++ = (char)(0x80 | (ch & 0x3f)); | 314 | 106M | } | 315 | | #if STRINGLIB_SIZEOF_CHAR > 1 | 316 | | else if (Py_UNICODE_IS_SURROGATE(ch)) { | 317 | | Py_ssize_t startpos, endpos, newpos; | 318 | | Py_ssize_t k; | 319 | | if (error_handler == _Py_ERROR_UNKNOWN) { | 320 | | error_handler = _Py_GetErrorHandler(errors); | 321 | | } | 322 | | | 323 | | startpos = i-1; | 324 | | endpos = 
startpos+1; | 325 | | | 326 | | while ((endpos < size) && Py_UNICODE_IS_SURROGATE(data[endpos])) | 327 | | endpos++; | 328 | | | 329 | | /* Only overallocate the buffer if it's not the last write */ | 330 | | writer->overallocate = (endpos < size); | 331 | | | 332 | | switch (error_handler) | 333 | | { | 334 | | case _Py_ERROR_REPLACE: | 335 | | memset(p, '?', endpos - startpos); | 336 | | p += (endpos - startpos); | 337 | | _Py_FALLTHROUGH; | 338 | | case _Py_ERROR_IGNORE: | 339 | | i += (endpos - startpos - 1); | 340 | | break; | 341 | | | 342 | | case _Py_ERROR_SURROGATEPASS: | 343 | | for (k=startpos; k<endpos; k++) { | 344 | | ch = data[k]; | 345 | | *p++ = (char)(0xe0 | (ch >> 12)); | 346 | | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 347 | | *p++ = (char)(0x80 | (ch & 0x3f)); | 348 | | } | 349 | | i += (endpos - startpos - 1); | 350 | | break; | 351 | | | 352 | | case _Py_ERROR_BACKSLASHREPLACE: | 353 | | /* subtract preallocated bytes */ | 354 | | writer->size -= max_char_size * (endpos - startpos); | 355 | | p = backslashreplace(writer, p, | 356 | | unicode, startpos, endpos); | 357 | | if (p == NULL) | 358 | | goto error; | 359 | | i += (endpos - startpos - 1); | 360 | | break; | 361 | | | 362 | | case _Py_ERROR_XMLCHARREFREPLACE: | 363 | | /* subtract preallocated bytes */ | 364 | | writer->size -= max_char_size * (endpos - startpos); | 365 | | p = xmlcharrefreplace(writer, p, | 366 | | unicode, startpos, endpos); | 367 | | if (p == NULL) | 368 | | goto error; | 369 | | i += (endpos - startpos - 1); | 370 | | break; | 371 | | | 372 | | case _Py_ERROR_SURROGATEESCAPE: | 373 | | for (k=startpos; k<endpos; k++) { | 374 | | ch = data[k]; | 375 | | if (!(0xDC80 <= ch && ch <= 0xDCFF)) | 376 | | break; | 377 | | *p++ = (char)(ch & 0xff); | 378 | | } | 379 | | if (k >= endpos) { | 380 | | i += (endpos - startpos - 1); | 381 | | break; | 382 | | } | 383 | | startpos = k; | 384 | | assert(startpos < endpos); | 385 | | _Py_FALLTHROUGH; | 386 | | default: | 387 | 
| rep = unicode_encode_call_errorhandler( | 388 | | errors, &error_handler_obj, "utf-8", "surrogates not allowed", | 389 | | unicode, &exc, startpos, endpos, &newpos); | 390 | | if (!rep) | 391 | | goto error; | 392 | | | 393 | | if (newpos < startpos) { | 394 | | writer->overallocate = 1; | 395 | | p = PyBytesWriter_GrowAndUpdatePointer(writer, | 396 | | max_char_size * (startpos - newpos), | 397 | | p); | 398 | | if (p == NULL) { | 399 | | goto error; | 400 | | } | 401 | | } | 402 | | else { | 403 | | /* subtract preallocated bytes */ | 404 | | writer->size -= max_char_size * (newpos - startpos); | 405 | | /* Only overallocate the buffer if it's not the last write */ | 406 | | writer->overallocate = (newpos < size); | 407 | | } | 408 | | | 409 | | char *rep_str; | 410 | | Py_ssize_t rep_len; | 411 | | if (PyBytes_Check(rep)) { | 412 | | rep_str = PyBytes_AS_STRING(rep); | 413 | | rep_len = PyBytes_GET_SIZE(rep); | 414 | | } | 415 | | else { | 416 | | /* rep is unicode */ | 417 | | if (!PyUnicode_IS_ASCII(rep)) { | 418 | | raise_encode_exception(&exc, "utf-8", unicode, | 419 | | startpos, endpos, | 420 | | "surrogates not allowed"); | 421 | | goto error; | 422 | | } | 423 | | | 424 | | rep_str = PyUnicode_DATA(rep); | 425 | | rep_len = PyUnicode_GET_LENGTH(rep); | 426 | | } | 427 | | | 428 | | p = PyBytesWriter_GrowAndUpdatePointer(writer, rep_len, p); | 429 | | if (p == NULL) { | 430 | | goto error; | 431 | | } | 432 | | memcpy(p, rep_str, rep_len); | 433 | | p += rep_len; | 434 | | Py_CLEAR(rep); | 435 | | | 436 | | i = newpos; | 437 | | } | 438 | | | 439 | | /* If overallocation was disabled, ensure that it was the last | 440 | | write. 
Otherwise, we missed an optimization */ | 441 | | assert(writer->overallocate || i == size); | 442 | | } | 443 | | else | 444 | | #if STRINGLIB_SIZEOF_CHAR > 2 | 445 | | if (ch < 0x10000) | 446 | | #endif | 447 | | { | 448 | | *p++ = (char)(0xe0 | (ch >> 12)); | 449 | | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 450 | | *p++ = (char)(0x80 | (ch & 0x3f)); | 451 | | } | 452 | | #if STRINGLIB_SIZEOF_CHAR > 2 | 453 | | else /* ch >= 0x10000 */ | 454 | | { | 455 | | assert(ch <= MAX_UNICODE); | 456 | | /* Encode UCS4 Unicode ordinals */ | 457 | | *p++ = (char)(0xf0 | (ch >> 18)); | 458 | | *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); | 459 | | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 460 | | *p++ = (char)(0x80 | (ch & 0x3f)); | 461 | | } | 462 | | #endif /* STRINGLIB_SIZEOF_CHAR > 2 */ | 463 | | #endif /* STRINGLIB_SIZEOF_CHAR > 1 */ | 464 | 833M | } | 465 | | | 466 | | #if STRINGLIB_SIZEOF_CHAR > 1 | 467 | | Py_XDECREF(error_handler_obj); | 468 | | Py_XDECREF(exc); | 469 | | #endif | 470 | 6.57M | *end = p; | 471 | 6.57M | return writer; | 472 | | | 473 | | #if STRINGLIB_SIZEOF_CHAR > 1 | 474 | | error: | 475 | | PyBytesWriter_Discard(writer); | 476 | | Py_XDECREF(rep); | 477 | | Py_XDECREF(error_handler_obj); | 478 | | Py_XDECREF(exc); | 479 | | *end = NULL; | 480 | | return NULL; | 481 | | #endif | 482 | 6.57M | } |
unicodeobject.c:ucs2lib_utf8_encoder Line | Count | Source | 267 | 1.93M | { | 268 | 1.93M | #if STRINGLIB_SIZEOF_CHAR > 1 | 269 | 1.93M | PyObject *error_handler_obj = NULL; | 270 | 1.93M | PyObject *exc = NULL; | 271 | 1.93M | PyObject *rep = NULL; | 272 | 1.93M | #endif | 273 | | #if STRINGLIB_SIZEOF_CHAR == 1 | 274 | | const Py_ssize_t max_char_size = 2; | 275 | | #elif STRINGLIB_SIZEOF_CHAR == 2 | 276 | | const Py_ssize_t max_char_size = 3; | 277 | | #else /* STRINGLIB_SIZEOF_CHAR == 4 */ | 278 | | const Py_ssize_t max_char_size = 4; | 279 | | #endif | 280 | | | 281 | 1.93M | assert(size >= 0); | 282 | 1.93M | if (size > PY_SSIZE_T_MAX / max_char_size) { | 283 | | /* integer overflow */ | 284 | 0 | PyErr_NoMemory(); | 285 | 0 | *end = NULL; | 286 | 0 | return NULL; | 287 | 0 | } | 288 | | | 289 | 1.93M | PyBytesWriter *writer = PyBytesWriter_Create(size * max_char_size); | 290 | 1.93M | if (writer == NULL) { | 291 | 0 | *end = NULL; | 292 | 0 | return NULL; | 293 | 0 | } | 294 | | /* next free byte in output buffer */ | 295 | 1.93M | char *p = PyBytesWriter_GetData(writer); | 296 | | | 297 | 1.93M | Py_ssize_t i; /* index into data of next input character */ | 298 | 1.62G | for (i = 0; i < size;) { | 299 | 1.62G | Py_UCS4 ch = data[i++]; | 300 | | | 301 | 1.62G | if (ch < 0x80) { | 302 | | /* Encode ASCII */ | 303 | 1.53G | *p++ = (char) ch; | 304 | | | 305 | 1.53G | } | 306 | 81.0M | else | 307 | 81.0M | #if STRINGLIB_SIZEOF_CHAR > 1 | 308 | 81.0M | if (ch < 0x0800) | 309 | 26.1M | #endif | 310 | 26.1M | { | 311 | | /* Encode Latin-1 */ | 312 | 26.1M | *p++ = (char)(0xc0 | (ch >> 6)); | 313 | 26.1M | *p++ = (char)(0x80 | (ch & 0x3f)); | 314 | 26.1M | } | 315 | 54.8M | #if STRINGLIB_SIZEOF_CHAR > 1 | 316 | 54.8M | else if (Py_UNICODE_IS_SURROGATE(ch)) { | 317 | 393k | Py_ssize_t startpos, endpos, newpos; | 318 | 393k | Py_ssize_t k; | 319 | 393k | if (error_handler == _Py_ERROR_UNKNOWN) { | 320 | 229k | error_handler = _Py_GetErrorHandler(errors); | 321 | 229k 
| } | 322 | | | 323 | 393k | startpos = i-1; | 324 | 393k | endpos = startpos+1; | 325 | | | 326 | 14.5M | while ((endpos < size) && Py_UNICODE_IS_SURROGATE(data[endpos])) | 327 | 14.1M | endpos++; | 328 | | | 329 | | /* Only overallocate the buffer if it's not the last write */ | 330 | 393k | writer->overallocate = (endpos < size); | 331 | | | 332 | 393k | switch (error_handler) | 333 | 393k | { | 334 | 0 | case _Py_ERROR_REPLACE: | 335 | 0 | memset(p, '?', endpos - startpos); | 336 | 0 | p += (endpos - startpos); | 337 | 0 | _Py_FALLTHROUGH; | 338 | 0 | case _Py_ERROR_IGNORE: | 339 | 0 | i += (endpos - startpos - 1); | 340 | 0 | break; | 341 | | | 342 | 0 | case _Py_ERROR_SURROGATEPASS: | 343 | 0 | for (k=startpos; k<endpos; k++) { | 344 | 0 | ch = data[k]; | 345 | 0 | *p++ = (char)(0xe0 | (ch >> 12)); | 346 | 0 | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 347 | 0 | *p++ = (char)(0x80 | (ch & 0x3f)); | 348 | 0 | } | 349 | 0 | i += (endpos - startpos - 1); | 350 | 0 | break; | 351 | | | 352 | 0 | case _Py_ERROR_BACKSLASHREPLACE: | 353 | | /* subtract preallocated bytes */ | 354 | 0 | writer->size -= max_char_size * (endpos - startpos); | 355 | 0 | p = backslashreplace(writer, p, | 356 | 0 | unicode, startpos, endpos); | 357 | 0 | if (p == NULL) | 358 | 0 | goto error; | 359 | 0 | i += (endpos - startpos - 1); | 360 | 0 | break; | 361 | | | 362 | 0 | case _Py_ERROR_XMLCHARREFREPLACE: | 363 | | /* subtract preallocated bytes */ | 364 | 0 | writer->size -= max_char_size * (endpos - startpos); | 365 | 0 | p = xmlcharrefreplace(writer, p, | 366 | 0 | unicode, startpos, endpos); | 367 | 0 | if (p == NULL) | 368 | 0 | goto error; | 369 | 0 | i += (endpos - startpos - 1); | 370 | 0 | break; | 371 | | | 372 | 225k | case _Py_ERROR_SURROGATEESCAPE: | 373 | 10.6M | for (k=startpos; k<endpos; k++) { | 374 | 10.3M | ch = data[k]; | 375 | 10.3M | if (!(0xDC80 <= ch && ch <= 0xDCFF)) | 376 | 13 | break; | 377 | 10.3M | *p++ = (char)(ch & 0xff); | 378 | 10.3M | } | 379 | 225k | 
if (k >= endpos) { | 380 | 225k | i += (endpos - startpos - 1); | 381 | 225k | break; | 382 | 225k | } | 383 | 13 | startpos = k; | 384 | 13 | assert(startpos < endpos); | 385 | 13 | _Py_FALLTHROUGH; | 386 | 168k | default: | 387 | 168k | rep = unicode_encode_call_errorhandler( | 388 | 168k | errors, &error_handler_obj, "utf-8", "surrogates not allowed", | 389 | 168k | unicode, &exc, startpos, endpos, &newpos); | 390 | 168k | if (!rep) | 391 | 168k | goto error; | 392 | | | 393 | 0 | if (newpos < startpos) { | 394 | 0 | writer->overallocate = 1; | 395 | 0 | p = PyBytesWriter_GrowAndUpdatePointer(writer, | 396 | 0 | max_char_size * (startpos - newpos), | 397 | 0 | p); | 398 | 0 | if (p == NULL) { | 399 | 0 | goto error; | 400 | 0 | } | 401 | 0 | } | 402 | 0 | else { | 403 | | /* subtract preallocated bytes */ | 404 | 0 | writer->size -= max_char_size * (newpos - startpos); | 405 | | /* Only overallocate the buffer if it's not the last write */ | 406 | 0 | writer->overallocate = (newpos < size); | 407 | 0 | } | 408 | | | 409 | 0 | char *rep_str; | 410 | 0 | Py_ssize_t rep_len; | 411 | 0 | if (PyBytes_Check(rep)) { | 412 | 0 | rep_str = PyBytes_AS_STRING(rep); | 413 | 0 | rep_len = PyBytes_GET_SIZE(rep); | 414 | 0 | } | 415 | 0 | else { | 416 | | /* rep is unicode */ | 417 | 0 | if (!PyUnicode_IS_ASCII(rep)) { | 418 | 0 | raise_encode_exception(&exc, "utf-8", unicode, | 419 | 0 | startpos, endpos, | 420 | 0 | "surrogates not allowed"); | 421 | 0 | goto error; | 422 | 0 | } | 423 | | | 424 | 0 | rep_str = PyUnicode_DATA(rep); | 425 | 0 | rep_len = PyUnicode_GET_LENGTH(rep); | 426 | 0 | } | 427 | | | 428 | 0 | p = PyBytesWriter_GrowAndUpdatePointer(writer, rep_len, p); | 429 | 0 | if (p == NULL) { | 430 | 0 | goto error; | 431 | 0 | } | 432 | 0 | memcpy(p, rep_str, rep_len); | 433 | 0 | p += rep_len; | 434 | 0 | Py_CLEAR(rep); | 435 | |
| 436 | 0 | i = newpos; | 437 | 393k | } | 438 | | | 439 | | /* If overallocation was disabled, ensure that it was the last | 440 | | write. Otherwise, we missed an optimization */ | 441 | 393k | assert(writer->overallocate || i == size); | 442 | 225k | } | 443 | 54.4M | else | 444 | | #if STRINGLIB_SIZEOF_CHAR > 2 | 445 | | if (ch < 0x10000) | 446 | | #endif | 447 | 54.4M | { | 448 | 54.4M | *p++ = (char)(0xe0 | (ch >> 12)); | 449 | 54.4M | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 450 | 54.4M | *p++ = (char)(0x80 | (ch & 0x3f)); | 451 | 54.4M | } | 452 | | #if STRINGLIB_SIZEOF_CHAR > 2 | 453 | | else /* ch >= 0x10000 */ | 454 | | { | 455 | | assert(ch <= MAX_UNICODE); | 456 | | /* Encode UCS4 Unicode ordinals */ | 457 | | *p++ = (char)(0xf0 | (ch >> 18)); | 458 | | *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); | 459 | | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 460 | | *p++ = (char)(0x80 | (ch & 0x3f)); | 461 | | } | 462 | | #endif /* STRINGLIB_SIZEOF_CHAR > 2 */ | 463 | 1.62G | #endif /* STRINGLIB_SIZEOF_CHAR > 1 */ | 464 | 1.62G | } | 465 | | | 466 | 1.76M | #if STRINGLIB_SIZEOF_CHAR > 1 | 467 | 1.76M | Py_XDECREF(error_handler_obj); | 468 | 1.76M | Py_XDECREF(exc); | 469 | 1.76M | #endif | 470 | 1.76M | *end = p; | 471 | 1.76M | return writer; | 472 | | | 473 | 0 | #if STRINGLIB_SIZEOF_CHAR > 1 | 474 | 168k | error: | 475 | 168k | PyBytesWriter_Discard(writer); | 476 | 168k | Py_XDECREF(rep); | 477 | 168k | Py_XDECREF(error_handler_obj); | 478 | 168k | Py_XDECREF(exc); | 479 | 168k | *end = NULL; | 480 | | return NULL; | 481 | 1.93M | #endif | 482 | 1.93M | } |
unicodeobject.c:ucs4lib_utf8_encoder Line | Count | Source | 267 | 66.3k | { | 268 | 66.3k | #if STRINGLIB_SIZEOF_CHAR > 1 | 269 | 66.3k | PyObject *error_handler_obj = NULL; | 270 | 66.3k | PyObject *exc = NULL; | 271 | 66.3k | PyObject *rep = NULL; | 272 | 66.3k | #endif | 273 | | #if STRINGLIB_SIZEOF_CHAR == 1 | 274 | | const Py_ssize_t max_char_size = 2; | 275 | | #elif STRINGLIB_SIZEOF_CHAR == 2 | 276 | | const Py_ssize_t max_char_size = 3; | 277 | | #else /* STRINGLIB_SIZEOF_CHAR == 4 */ | 278 | 66.3k | const Py_ssize_t max_char_size = 4; | 279 | 66.3k | #endif | 280 | | | 281 | 66.3k | assert(size >= 0); | 282 | 66.3k | if (size > PY_SSIZE_T_MAX / max_char_size) { | 283 | | /* integer overflow */ | 284 | 0 | PyErr_NoMemory(); | 285 | 0 | *end = NULL; | 286 | 0 | return NULL; | 287 | 0 | } | 288 | | | 289 | 66.3k | PyBytesWriter *writer = PyBytesWriter_Create(size * max_char_size); | 290 | 66.3k | if (writer == NULL) { | 291 | 0 | *end = NULL; | 292 | 0 | return NULL; | 293 | 0 | } | 294 | | /* next free byte in output buffer */ | 295 | 66.3k | char *p = PyBytesWriter_GetData(writer); | 296 | | | 297 | 66.3k | Py_ssize_t i; /* index into data of next input character */ | 298 | 1.33G | for (i = 0; i < size;) { | 299 | 1.33G | Py_UCS4 ch = data[i++]; | 300 | | | 301 | 1.33G | if (ch < 0x80) { | 302 | | /* Encode ASCII */ | 303 | 1.29G | *p++ = (char) ch; | 304 | | | 305 | 1.29G | } | 306 | 39.7M | else | 307 | 39.7M | #if STRINGLIB_SIZEOF_CHAR > 1 | 308 | 39.7M | if (ch < 0x0800) | 309 | 919k | #endif | 310 | 919k | { | 311 | | /* Encode Latin-1 */ | 312 | 919k | *p++ = (char)(0xc0 | (ch >> 6)); | 313 | 919k | *p++ = (char)(0x80 | (ch & 0x3f)); | 314 | 919k | } | 315 | 38.8M | #if STRINGLIB_SIZEOF_CHAR > 1 | 316 | 38.8M | else if (Py_UNICODE_IS_SURROGATE(ch)) { | 317 | 18.0k | Py_ssize_t startpos, endpos, newpos; | 318 | 18.0k | Py_ssize_t k; | 319 | 18.0k | if (error_handler == _Py_ERROR_UNKNOWN) { | 320 | 9.65k | error_handler = _Py_GetErrorHandler(errors); | 
321 | 9.65k | } | 322 | | | 323 | 18.0k | startpos = i-1; | 324 | 18.0k | endpos = startpos+1; | 325 | | | 326 | 97.0k | while ((endpos < size) && Py_UNICODE_IS_SURROGATE(data[endpos])) | 327 | 79.0k | endpos++; | 328 | | | 329 | | /* Only overallocate the buffer if it's not the last write */ | 330 | 18.0k | writer->overallocate = (endpos < size); | 331 | | | 332 | 18.0k | switch (error_handler) | 333 | 18.0k | { | 334 | 0 | case _Py_ERROR_REPLACE: | 335 | 0 | memset(p, '?', endpos - startpos); | 336 | 0 | p += (endpos - startpos); | 337 | 0 | _Py_FALLTHROUGH; | 338 | 0 | case _Py_ERROR_IGNORE: | 339 | 0 | i += (endpos - startpos - 1); | 340 | 0 | break; | 341 | | | 342 | 0 | case _Py_ERROR_SURROGATEPASS: | 343 | 0 | for (k=startpos; k<endpos; k++) { | 344 | 0 | ch = data[k]; | 345 | 0 | *p++ = (char)(0xe0 | (ch >> 12)); | 346 | 0 | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 347 | 0 | *p++ = (char)(0x80 | (ch & 0x3f)); | 348 | 0 | } | 349 | 0 | i += (endpos - startpos - 1); | 350 | 0 | break; | 351 | | | 352 | 0 | case _Py_ERROR_BACKSLASHREPLACE: | 353 | | /* subtract preallocated bytes */ | 354 | 0 | writer->size -= max_char_size * (endpos - startpos); | 355 | 0 | p = backslashreplace(writer, p, | 356 | 0 | unicode, startpos, endpos); | 357 | 0 | if (p == NULL) | 358 | 0 | goto error; | 359 | 0 | i += (endpos - startpos - 1); | 360 | 0 | break; | 361 | | | 362 | 0 | case _Py_ERROR_XMLCHARREFREPLACE: | 363 | | /* subtract preallocated bytes */ | 364 | 0 | writer->size -= max_char_size * (endpos - startpos); | 365 | 0 | p = xmlcharrefreplace(writer, p, | 366 | 0 | unicode, startpos, endpos); | 367 | 0 | if (p == NULL) | 368 | 0 | goto error; | 369 | 0 | i += (endpos - startpos - 1); | 370 | 0 | break; | 371 | | | 372 | 11.3k | case _Py_ERROR_SURROGATEESCAPE: | 373 | 95.6k | for (k=startpos; k<endpos; k++) { | 374 | 84.2k | ch = data[k]; | 375 | 84.2k | if (!(0xDC80 <= ch && ch <= 0xDCFF)) | 376 | 7 | break; | 377 | 84.2k | *p++ = (char)(ch & 0xff); | 378 | 84.2k | 
} | 379 | 11.3k | if (k >= endpos) { | 380 | 11.3k | i += (endpos - startpos - 1); | 381 | 11.3k | break; | 382 | 11.3k | } | 383 | 7 | startpos = k; | 384 | 7 | assert(startpos < endpos); | 385 | 7 | _Py_FALLTHROUGH; | 386 | 6.64k | default: | 387 | 6.64k | rep = unicode_encode_call_errorhandler( | 388 | 6.64k | errors, &error_handler_obj, "utf-8", "surrogates not allowed", | 389 | 6.64k | unicode, &exc, startpos, endpos, &newpos); | 390 | 6.64k | if (!rep) | 391 | 6.64k | goto error; | 392 | | | 393 | 0 | if (newpos < startpos) { | 394 | 0 | writer->overallocate = 1; | 395 | 0 | p = PyBytesWriter_GrowAndUpdatePointer(writer, | 396 | 0 | max_char_size * (startpos - newpos), | 397 | 0 | p); | 398 | 0 | if (p == NULL) { | 399 | 0 | goto error; | 400 | 0 | } | 401 | 0 | } | 402 | 0 | else { | 403 | | /* subtract preallocated bytes */ | 404 | 0 | writer->size -= max_char_size * (newpos - startpos); | 405 | | /* Only overallocate the buffer if it's not the last write */ | 406 | 0 | writer->overallocate = (newpos < size); | 407 | 0 | } | 408 | | | 409 | 0 | char *rep_str; | 410 | 0 | Py_ssize_t rep_len; | 411 | 0 | if (PyBytes_Check(rep)) { | 412 | 0 | rep_str = PyBytes_AS_STRING(rep); | 413 | 0 | rep_len = PyBytes_GET_SIZE(rep); | 414 | 0 | } | 415 | 0 | else { | 416 | | /* rep is unicode */ | 417 | 0 | if (!PyUnicode_IS_ASCII(rep)) { | 418 | 0 | raise_encode_exception(&exc, "utf-8", unicode, | 419 | 0 | startpos, endpos, | 420 | 0 | "surrogates not allowed"); | 421 | 0 | goto error; | 422 | 0 | } | 423 | | | 424 | 0 | rep_str = PyUnicode_DATA(rep); | 425 | 0 | rep_len = PyUnicode_GET_LENGTH(rep); | 426 | 0 | } | 427 | | | 428 | 0 | p = PyBytesWriter_GrowAndUpdatePointer(writer, rep_len, p); | 429 | 0 | if (p == NULL) { | 430 | 0 | goto error; | 431 | 0 | } | 432 | 0 | memcpy(p, rep_str, rep_len); | 433 | 0 | p += rep_len; | 434 | 0 | Py_CLEAR(rep); | 435 | |
| 436 | 0 | i = newpos; | 437 | 18.0k | } | 438 | | | 439 | | /* If overallocation was disabled, ensure that it was the last | 440 | | write. Otherwise, we missed an optimization */ | 441 | 18.0k | assert(writer->overallocate || i == size); | 442 | 11.3k | } | 443 | 38.7M | else | 444 | 38.7M | #if STRINGLIB_SIZEOF_CHAR > 2 | 445 | 38.7M | if (ch < 0x10000) | 446 | 38.6M | #endif | 447 | 38.6M | { | 448 | 38.6M | *p++ = (char)(0xe0 | (ch >> 12)); | 449 | 38.6M | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 450 | 38.6M | *p++ = (char)(0x80 | (ch & 0x3f)); | 451 | 38.6M | } | 452 | 123k | #if STRINGLIB_SIZEOF_CHAR > 2 | 453 | 123k | else /* ch >= 0x10000 */ | 454 | 123k | { | 455 | 123k | assert(ch <= MAX_UNICODE); | 456 | | /* Encode UCS4 Unicode ordinals */ | 457 | 123k | *p++ = (char)(0xf0 | (ch >> 18)); | 458 | 123k | *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); | 459 | 123k | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); | 460 | 123k | *p++ = (char)(0x80 | (ch & 0x3f)); | 461 | 123k | } | 462 | 1.33G | #endif /* STRINGLIB_SIZEOF_CHAR > 2 */ | 463 | 1.33G | #endif /* STRINGLIB_SIZEOF_CHAR > 1 */ | 464 | 1.33G | } | 465 | | | 466 | 59.7k | #if STRINGLIB_SIZEOF_CHAR > 1 | 467 | 59.7k | Py_XDECREF(error_handler_obj); | 468 | 59.7k | Py_XDECREF(exc); | 469 | 59.7k | #endif | 470 | 59.7k | *end = p; | 471 | 59.7k | return writer; | 472 | | | 473 | 0 | #if STRINGLIB_SIZEOF_CHAR > 1 | 474 | 6.64k | error: | 475 | 6.64k | PyBytesWriter_Discard(writer); | 476 | 6.64k | Py_XDECREF(rep); | 477 | 6.64k | Py_XDECREF(error_handler_obj); | 478 | 6.64k | Py_XDECREF(exc); | 479 | 6.64k | *end = NULL; | 480 | | return NULL; | 481 | 66.3k | #endif | 482 | 66.3k | } |
Unexecuted instantiation: unicodeobject.c:asciilib_utf8_encoder |
483 | | |
484 | | /* The pattern for constructing UCS2-repeated masks: the value 1 in every 16-bit unit of a C 'long', so that multiplying it by a 16-bit constant copies that constant into each unit. */
485 | | #if SIZEOF_LONG == 8
486 | 155k | # define UCS2_REPEAT_MASK 0x0001000100010001ul
487 | | #elif SIZEOF_LONG == 4
488 | | # define UCS2_REPEAT_MASK 0x00010001ul
489 | | #else
490 | | # error C 'long' size should be either 4 or 8!
491 | | #endif
492 | | |
493 | | /* The mask for fast checking: utf16_decode ANDs a whole C 'long' of input with it and leaves its fast path when any masked bit is set. */
494 | | #if STRINGLIB_SIZEOF_CHAR == 1
495 | | /* The mask for fast checking of whether a C 'long' contains any UTF16-encoded
496 | | character with a bit set outside STRINGLIB_MAX_CHAR (a non-ASCII or non-Latin1 character). */
497 | 11.8k | # define FAST_CHAR_MASK (UCS2_REPEAT_MASK * (0xFFFFu & ~STRINGLIB_MAX_CHAR))
498 | | #else
499 | | /* The mask for fast checking of whether a C 'long' may contain
500 | | UTF16-encoded surrogate characters. This is an efficient heuristic,
501 | | assuming that non-surrogate characters with a code point >= 0x8000 are
502 | | rare in most input.
503 | | Only the top bit of each 16-bit unit is tested, so every unit >= 0x8000 (surrogate or not) fails the check. */
504 | 104k | # define FAST_CHAR_MASK (UCS2_REPEAT_MASK * 0x8000u)
505 | | #endif
506 | | /* The mask for fast byte-swapping: selects the low byte of every 16-bit unit. */
507 | 38.5k | #define STRIPPED_MASK (UCS2_REPEAT_MASK * 0x00FFu)
508 | | /* Swap the two bytes of every 16-bit unit in 'value'. */
509 | 19.2k | #define SWAB(value) ((((value) >> 8) & STRIPPED_MASK) | \
510 | 19.2k | (((value) & STRIPPED_MASK) << 8))
511 | | /* Decode UTF-16 code units from *inptr (bounded by e) into dest, starting at index *outpos.
    | | A nonzero native_ordering means the input bytes are already in the machine's byte order.
    | | On every exit *inptr is set just past the last code unit read and *outpos just past the last
    | | char stored; dest is written without any bounds check in this function.
    | | Returns 0 when the main loop runs out of input, 1 (UnexpectedEnd) for a high surrogate with no
    | | code unit after it, 2 (IllegalEncoding) for a surrogate that is not a high surrogate,
    | | 3 (IllegalSurrogate) for a high surrogate followed by a non-low-surrogate; otherwise the code
    | | point that does not fit STRINGLIB_CHAR, which has been consumed but not stored. */
512 | | Py_LOCAL_INLINE(Py_UCS4)
513 | | STRINGLIB(utf16_decode)(const unsigned char **inptr, const unsigned char *e,
514 | | STRINGLIB_CHAR *dest, Py_ssize_t *outpos,
515 | | int native_ordering)
516 | 52.7k | {
517 | 52.7k | Py_UCS4 ch;
518 | 52.7k | const unsigned char *q = *inptr;
519 | 52.7k | STRINGLIB_CHAR *p = dest + *outpos;
520 | | /* Offsets from q for retrieving byte pairs in the right order: q[ihi] is the high byte of a code unit, q[ilo] the low byte. */
521 | 52.7k | #if PY_LITTLE_ENDIAN
522 | 52.7k | int ihi = !!native_ordering, ilo = !native_ordering;
523 | | #else
524 | | int ihi = !native_ordering, ilo = !!native_ordering;
525 | | #endif
526 | 52.7k | --e; /* after this, q < e leaves room for the two-byte read q[0], q[1] (presumably e came in as one past the last input byte -- confirm with callers) */
527 | | 
528 | 280k | while (q < e) {
529 | 274k | Py_UCS4 ch2;
530 | | /* First check for possible aligned read of a C 'long'. Unaligned
531 | | reads are more expensive, better to defer to another iteration. */
532 | 274k | if (_Py_IS_ALIGNED(q, ALIGNOF_LONG)) {
533 | | /* Fast path for runs of in-range non-surrogate chars: each iteration consumes SIZEOF_LONG input bytes and stores SIZEOF_LONG / 2 chars. */
534 | 73.6k | const unsigned char *_q = q;
535 | 142k | while (_q + SIZEOF_LONG <= e) {
536 | 131k | unsigned long block = * (const unsigned long *) _q;
537 | 131k | if (native_ordering) {
538 | | /* Can use buffer directly */
539 | 116k | if (block & FAST_CHAR_MASK)
540 | 53.4k | break;
541 | 116k | }
542 | 14.3k | else {
543 | | /* Need to byte-swap: test against the byte-swapped mask first, then bring the block into native order */
544 | 14.3k | if (block & SWAB(FAST_CHAR_MASK))
545 | 8.57k | break;
546 | | #if STRINGLIB_SIZEOF_CHAR == 1
547 | 921 | block >>= 8; /* a plain shift replaces the full swap here; presumably safe because the test above rejected any block with bits in the bytes that get shifted across units -- holds when STRINGLIB_MAX_CHAR <= 0xFF */
548 | | #else
549 | 4.89k | block = SWAB(block);
550 | | #endif
551 | 4.89k | }
552 | 69.0k | #if PY_LITTLE_ENDIAN
553 | | # if SIZEOF_LONG == 4
554 | | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
555 | | p[1] = (STRINGLIB_CHAR)(block >> 16);
556 | | # elif SIZEOF_LONG == 8
557 | 69.0k | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
558 | 69.0k | p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
559 | 69.0k | p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
560 | 69.0k | p[3] = (STRINGLIB_CHAR)(block >> 48);
561 | 69.0k | # endif
562 | | #else
563 | | # if SIZEOF_LONG == 4
564 | | p[0] = (STRINGLIB_CHAR)(block >> 16);
565 | | p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu);
566 | | # elif SIZEOF_LONG == 8
567 | | p[0] = (STRINGLIB_CHAR)(block >> 48);
568 | | p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
569 | | p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
570 | | p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu);
571 | | # endif
572 | | #endif
573 | 69.0k | _q += SIZEOF_LONG;
574 | 69.0k | p += SIZEOF_LONG / 2;
575 | 69.0k | }
576 | 73.6k | q = _q;
577 | 73.6k | if (q >= e)
578 | 919 | break;
579 | 73.6k | }
580 | | 
581 | 273k | ch = (q[ihi] << 8) | q[ilo];
582 | 273k | q += 2;
583 | 273k | if (!Py_UNICODE_IS_SURROGATE(ch)) {
584 | | #if STRINGLIB_SIZEOF_CHAR < 2
585 | 30.5k | if (ch > STRINGLIB_MAX_CHAR)
586 | | /* Out-of-range for this char width: return the code unit itself; q already points past it and nothing was stored */
587 | 13.5k | goto Return;
588 | 17.0k | #endif
589 | 17.0k | *p++ = (STRINGLIB_CHAR)ch;
590 | 17.0k | continue;
591 | 240k | }
592 | | 
593 | | /* UTF-16 code pair: ch must be a high surrogate and the next code unit a low surrogate */
594 | 32.6k | if (!Py_UNICODE_IS_HIGH_SURROGATE(ch))
595 | 18.4k | goto IllegalEncoding;
596 | 14.1k | if (q >= e)
597 | 1.90k | goto UnexpectedEnd;
598 | 12.2k | ch2 = (q[ihi] << 8) | q[ilo];
599 | 12.2k | q += 2;
600 | 12.2k | if (!Py_UNICODE_IS_LOW_SURROGATE(ch2))
601 | 7.02k | goto IllegalSurrogate;
602 | 5.22k | ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2);
603 | | #if STRINGLIB_SIZEOF_CHAR < 4
604 | | /* Out-of-range for chars narrower than 4 bytes: return the joined code point; both code units are consumed, nothing is stored */
605 | 4.50k | goto Return;
606 | | #else
607 | | *p++ = (STRINGLIB_CHAR)ch;
608 | | #endif
609 | 728 | }
610 | 7.32k | ch = 0; /* loop ended without error or out-of-range char */
611 | 52.7k | Return:
612 | 52.7k | *inptr = q;
613 | 52.7k | *outpos = p - dest;
614 | 52.7k | return ch;
615 | 1.90k | UnexpectedEnd:
616 | 1.90k | ch = 1;
617 | 1.90k | goto Return;
618 | 18.4k | IllegalEncoding:
619 | 18.4k | ch = 2;
620 | 18.4k | goto Return;
621 | 7.02k | IllegalSurrogate:
622 | 7.02k | ch = 3;
623 | 7.02k | goto Return;
624 | 7.32k | } unicodeobject.c:asciilib_utf16_decode Line | Count | Source | 516 | 14.6k | { | 517 | 14.6k | Py_UCS4 ch; | 518 | 14.6k | const unsigned char *q = *inptr; | 519 | 14.6k | STRINGLIB_CHAR *p = dest + *outpos; | 520 | | /* Offsets from q for retrieving byte pairs in the right order. */ | 521 | 14.6k | #if PY_LITTLE_ENDIAN | 522 | 14.6k | int ihi = !!native_ordering, ilo = !native_ordering; | 523 | | #else | 524 | | int ihi = !native_ordering, ilo = !!native_ordering; | 525 | | #endif | 526 | 14.6k | --e; | 527 | | | 528 | 26.4k | while (q < e) { | 529 | 25.7k | Py_UCS4 ch2; | 530 | | /* First check for possible aligned read of a C 'long'. Unaligned | 531 | | reads are more expensive, better to defer to another iteration. */ | 532 | 25.7k | if (_Py_IS_ALIGNED(q, ALIGNOF_LONG)) { | 533 | | /* Fast path for runs of in-range non-surrogate chars. */ | 534 | 13.9k | const unsigned char *_q = q; | 535 | 17.1k | while (_q + SIZEOF_LONG <= e) { | 536 | 12.9k | unsigned long block = * (const unsigned long *) _q; | 537 | 12.9k | if (native_ordering) { | 538 | | /* Can use buffer directly */ | 539 | 10.1k | if (block & FAST_CHAR_MASK) | 540 | 7.53k | break; | 541 | 10.1k | } | 542 | 2.78k | else { | 543 | | /* Need to byte-swap */ | 544 | 2.78k | if (block & SWAB(FAST_CHAR_MASK)) | 545 | 2.12k | break; | 546 | 660 | #if STRINGLIB_SIZEOF_CHAR == 1 | 547 | 660 | block >>= 8; | 548 | | #else | 549 | | block = SWAB(block); | 550 | | #endif | 551 | 660 | } | 552 | 3.29k | #if PY_LITTLE_ENDIAN | 553 | | # if SIZEOF_LONG == 4 | 554 | | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 555 | | p[1] = (STRINGLIB_CHAR)(block >> 16); | 556 | | # elif SIZEOF_LONG == 8 | 557 | 3.29k | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 558 | 3.29k | p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); | 559 | 3.29k | p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); | 560 | 3.29k | p[3] = (STRINGLIB_CHAR)(block >> 48); | 561 | 3.29k | # endif | 562 | | #else | 563 | | # if SIZEOF_LONG == 4 | 
564 | | p[0] = (STRINGLIB_CHAR)(block >> 16); | 565 | | p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 566 | | # elif SIZEOF_LONG == 8 | 567 | | p[0] = (STRINGLIB_CHAR)(block >> 48); | 568 | | p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); | 569 | | p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); | 570 | | p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 571 | | # endif | 572 | | #endif | 573 | 3.29k | _q += SIZEOF_LONG; | 574 | 3.29k | p += SIZEOF_LONG / 2; | 575 | 3.29k | } | 576 | 13.9k | q = _q; | 577 | 13.9k | if (q >= e) | 578 | 274 | break; | 579 | 13.9k | } | 580 | | | 581 | 25.4k | ch = (q[ihi] << 8) | q[ilo]; | 582 | 25.4k | q += 2; | 583 | 25.4k | if (!Py_UNICODE_IS_SURROGATE(ch)) { | 584 | 23.7k | #if STRINGLIB_SIZEOF_CHAR < 2 | 585 | 23.7k | if (ch > STRINGLIB_MAX_CHAR) | 586 | | /* Out-of-range */ | 587 | 12.0k | goto Return; | 588 | 11.7k | #endif | 589 | 11.7k | *p++ = (STRINGLIB_CHAR)ch; | 590 | 11.7k | continue; | 591 | 23.7k | } | 592 | | | 593 | | /* UTF-16 code pair: */ | 594 | 1.68k | if (!Py_UNICODE_IS_HIGH_SURROGATE(ch)) | 595 | 821 | goto IllegalEncoding; | 596 | 861 | if (q >= e) | 597 | 266 | goto UnexpectedEnd; | 598 | 595 | ch2 = (q[ihi] << 8) | q[ilo]; | 599 | 595 | q += 2; | 600 | 595 | if (!Py_UNICODE_IS_LOW_SURROGATE(ch2)) | 601 | 263 | goto IllegalSurrogate; | 602 | 332 | ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2); | 603 | 332 | #if STRINGLIB_SIZEOF_CHAR < 4 | 604 | | /* Out-of-range */ | 605 | 332 | goto Return; | 606 | | #else | 607 | | *p++ = (STRINGLIB_CHAR)ch; | 608 | | #endif | 609 | 595 | } | 610 | 976 | ch = 0; | 611 | 14.6k | Return: | 612 | 14.6k | *inptr = q; | 613 | 14.6k | *outpos = p - dest; | 614 | 14.6k | return ch; | 615 | 266 | UnexpectedEnd: | 616 | 266 | ch = 1; | 617 | 266 | goto Return; | 618 | 821 | IllegalEncoding: | 619 | 821 | ch = 2; | 620 | 821 | goto Return; | 621 | 263 | IllegalSurrogate: | 622 | 263 | ch = 3; | 623 | 263 | goto Return; | 624 | 976 | } |
unicodeobject.c:ucs1lib_utf16_decode Line | Count | Source | 516 | 3.64k | { | 517 | 3.64k | Py_UCS4 ch; | 518 | 3.64k | const unsigned char *q = *inptr; | 519 | 3.64k | STRINGLIB_CHAR *p = dest + *outpos; | 520 | | /* Offsets from q for retrieving byte pairs in the right order. */ | 521 | 3.64k | #if PY_LITTLE_ENDIAN | 522 | 3.64k | int ihi = !!native_ordering, ilo = !native_ordering; | 523 | | #else | 524 | | int ihi = !native_ordering, ilo = !!native_ordering; | 525 | | #endif | 526 | 3.64k | --e; | 527 | | | 528 | 8.97k | while (q < e) { | 529 | 8.73k | Py_UCS4 ch2; | 530 | | /* First check for possible aligned read of a C 'long'. Unaligned | 531 | | reads are more expensive, better to defer to another iteration. */ | 532 | 8.73k | if (_Py_IS_ALIGNED(q, ALIGNOF_LONG)) { | 533 | | /* Fast path for runs of in-range non-surrogate chars. */ | 534 | 1.33k | const unsigned char *_q = q; | 535 | 2.39k | while (_q + SIZEOF_LONG <= e) { | 536 | 2.11k | unsigned long block = * (const unsigned long *) _q; | 537 | 2.11k | if (native_ordering) { | 538 | | /* Can use buffer directly */ | 539 | 1.71k | if (block & FAST_CHAR_MASK) | 540 | 922 | break; | 541 | 1.71k | } | 542 | 393 | else { | 543 | | /* Need to byte-swap */ | 544 | 393 | if (block & SWAB(FAST_CHAR_MASK)) | 545 | 132 | break; | 546 | 261 | #if STRINGLIB_SIZEOF_CHAR == 1 | 547 | 261 | block >>= 8; | 548 | | #else | 549 | | block = SWAB(block); | 550 | | #endif | 551 | 261 | } | 552 | 1.05k | #if PY_LITTLE_ENDIAN | 553 | | # if SIZEOF_LONG == 4 | 554 | | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 555 | | p[1] = (STRINGLIB_CHAR)(block >> 16); | 556 | | # elif SIZEOF_LONG == 8 | 557 | 1.05k | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 558 | 1.05k | p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); | 559 | 1.05k | p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); | 560 | 1.05k | p[3] = (STRINGLIB_CHAR)(block >> 48); | 561 | 1.05k | # endif | 562 | | #else | 563 | | # if SIZEOF_LONG == 4 | 564 | | p[0] = 
(STRINGLIB_CHAR)(block >> 16); | 565 | | p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 566 | | # elif SIZEOF_LONG == 8 | 567 | | p[0] = (STRINGLIB_CHAR)(block >> 48); | 568 | | p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); | 569 | | p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); | 570 | | p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 571 | | # endif | 572 | | #endif | 573 | 1.05k | _q += SIZEOF_LONG; | 574 | 1.05k | p += SIZEOF_LONG / 2; | 575 | 1.05k | } | 576 | 1.33k | q = _q; | 577 | 1.33k | if (q >= e) | 578 | 113 | break; | 579 | 1.33k | } | 580 | | | 581 | 8.62k | ch = (q[ihi] << 8) | q[ilo]; | 582 | 8.62k | q += 2; | 583 | 8.62k | if (!Py_UNICODE_IS_SURROGATE(ch)) { | 584 | 6.81k | #if STRINGLIB_SIZEOF_CHAR < 2 | 585 | 6.81k | if (ch > STRINGLIB_MAX_CHAR) | 586 | | /* Out-of-range */ | 587 | 1.48k | goto Return; | 588 | 5.32k | #endif | 589 | 5.32k | *p++ = (STRINGLIB_CHAR)ch; | 590 | 5.32k | continue; | 591 | 6.81k | } | 592 | | | 593 | | /* UTF-16 code pair: */ | 594 | 1.81k | if (!Py_UNICODE_IS_HIGH_SURROGATE(ch)) | 595 | 119 | goto IllegalEncoding; | 596 | 1.69k | if (q >= e) | 597 | 71 | goto UnexpectedEnd; | 598 | 1.62k | ch2 = (q[ihi] << 8) | q[ilo]; | 599 | 1.62k | q += 2; | 600 | 1.62k | if (!Py_UNICODE_IS_LOW_SURROGATE(ch2)) | 601 | 1.14k | goto IllegalSurrogate; | 602 | 475 | ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2); | 603 | 475 | #if STRINGLIB_SIZEOF_CHAR < 4 | 604 | | /* Out-of-range */ | 605 | 475 | goto Return; | 606 | | #else | 607 | | *p++ = (STRINGLIB_CHAR)ch; | 608 | | #endif | 609 | 1.62k | } | 610 | 350 | ch = 0; | 611 | 3.64k | Return: | 612 | 3.64k | *inptr = q; | 613 | 3.64k | *outpos = p - dest; | 614 | 3.64k | return ch; | 615 | 71 | UnexpectedEnd: | 616 | 71 | ch = 1; | 617 | 71 | goto Return; | 618 | 119 | IllegalEncoding: | 619 | 119 | ch = 2; | 620 | 119 | goto Return; | 621 | 1.14k | IllegalSurrogate: | 622 | 1.14k | ch = 3; | 623 | 1.14k | goto Return; | 624 | 350 | } |
unicodeobject.c:ucs2lib_utf16_decode Line | Count | Source | 516 | 13.4k | { | 517 | 13.4k | Py_UCS4 ch; | 518 | 13.4k | const unsigned char *q = *inptr; | 519 | 13.4k | STRINGLIB_CHAR *p = dest + *outpos; | 520 | | /* Offsets from q for retrieving byte pairs in the right order. */ | 521 | 13.4k | #if PY_LITTLE_ENDIAN | 522 | 13.4k | int ihi = !!native_ordering, ilo = !native_ordering; | 523 | | #else | 524 | | int ihi = !native_ordering, ilo = !!native_ordering; | 525 | | #endif | 526 | 13.4k | --e; | 527 | | | 528 | 144k | while (q < e) { | 529 | 140k | Py_UCS4 ch2; | 530 | | /* First check for possible aligned read of a C 'long'. Unaligned | 531 | | reads are more expensive, better to defer to another iteration. */ | 532 | 140k | if (_Py_IS_ALIGNED(q, ALIGNOF_LONG)) { | 533 | | /* Fast path for runs of in-range non-surrogate chars. */ | 534 | 32.9k | const unsigned char *_q = q; | 535 | 95.3k | while (_q + SIZEOF_LONG <= e) { | 536 | 90.6k | unsigned long block = * (const unsigned long *) _q; | 537 | 90.6k | if (native_ordering) { | 538 | | /* Can use buffer directly */ | 539 | 81.5k | if (block & FAST_CHAR_MASK) | 540 | 23.1k | break; | 541 | 81.5k | } | 542 | 9.17k | else { | 543 | | /* Need to byte-swap */ | 544 | 9.17k | if (block & SWAB(FAST_CHAR_MASK)) | 545 | 5.21k | break; | 546 | | #if STRINGLIB_SIZEOF_CHAR == 1 | 547 | | block >>= 8; | 548 | | #else | 549 | 3.95k | block = SWAB(block); | 550 | 3.95k | #endif | 551 | 3.95k | } | 552 | 62.3k | #if PY_LITTLE_ENDIAN | 553 | | # if SIZEOF_LONG == 4 | 554 | | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 555 | | p[1] = (STRINGLIB_CHAR)(block >> 16); | 556 | | # elif SIZEOF_LONG == 8 | 557 | 62.3k | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 558 | 62.3k | p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); | 559 | 62.3k | p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); | 560 | 62.3k | p[3] = (STRINGLIB_CHAR)(block >> 48); | 561 | 62.3k | # endif | 562 | | #else | 563 | | # if SIZEOF_LONG == 4 | 564 | | p[0] = 
(STRINGLIB_CHAR)(block >> 16); | 565 | | p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 566 | | # elif SIZEOF_LONG == 8 | 567 | | p[0] = (STRINGLIB_CHAR)(block >> 48); | 568 | | p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); | 569 | | p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); | 570 | | p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 571 | | # endif | 572 | | #endif | 573 | 62.3k | _q += SIZEOF_LONG; | 574 | 62.3k | p += SIZEOF_LONG / 2; | 575 | 62.3k | } | 576 | 32.9k | q = _q; | 577 | 32.9k | if (q >= e) | 578 | 427 | break; | 579 | 32.9k | } | 580 | | | 581 | 140k | ch = (q[ihi] << 8) | q[ilo]; | 582 | 140k | q += 2; | 583 | 140k | if (!Py_UNICODE_IS_SURROGATE(ch)) { | 584 | | #if STRINGLIB_SIZEOF_CHAR < 2 | 585 | | if (ch > STRINGLIB_MAX_CHAR) | 586 | | /* Out-of-range */ | 587 | | goto Return; | 588 | | #endif | 589 | 131k | *p++ = (STRINGLIB_CHAR)ch; | 590 | 131k | continue; | 591 | 131k | } | 592 | | | 593 | | /* UTF-16 code pair: */ | 594 | 8.96k | if (!Py_UNICODE_IS_HIGH_SURROGATE(ch)) | 595 | 2.22k | goto IllegalEncoding; | 596 | 6.73k | if (q >= e) | 597 | 773 | goto UnexpectedEnd; | 598 | 5.96k | ch2 = (q[ihi] << 8) | q[ilo]; | 599 | 5.96k | q += 2; | 600 | 5.96k | if (!Py_UNICODE_IS_LOW_SURROGATE(ch2)) | 601 | 2.26k | goto IllegalSurrogate; | 602 | 3.69k | ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2); | 603 | 3.69k | #if STRINGLIB_SIZEOF_CHAR < 4 | 604 | | /* Out-of-range */ | 605 | 3.69k | goto Return; | 606 | | #else | 607 | | *p++ = (STRINGLIB_CHAR)ch; | 608 | | #endif | 609 | 5.96k | } | 610 | 4.45k | ch = 0; | 611 | 13.4k | Return: | 612 | 13.4k | *inptr = q; | 613 | 13.4k | *outpos = p - dest; | 614 | 13.4k | return ch; | 615 | 773 | UnexpectedEnd: | 616 | 773 | ch = 1; | 617 | 773 | goto Return; | 618 | 2.22k | IllegalEncoding: | 619 | 2.22k | ch = 2; | 620 | 2.22k | goto Return; | 621 | 2.26k | IllegalSurrogate: | 622 | 2.26k | ch = 3; | 623 | 2.26k | goto Return; | 624 | 4.45k | } |
unicodeobject.c:ucs4lib_utf16_decode Line | Count | Source | 516 | 20.9k | { | 517 | 20.9k | Py_UCS4 ch; | 518 | 20.9k | const unsigned char *q = *inptr; | 519 | 20.9k | STRINGLIB_CHAR *p = dest + *outpos; | 520 | | /* Offsets from q for retrieving byte pairs in the right order. */ | 521 | 20.9k | #if PY_LITTLE_ENDIAN | 522 | 20.9k | int ihi = !!native_ordering, ilo = !native_ordering; | 523 | | #else | 524 | | int ihi = !native_ordering, ilo = !!native_ordering; | 525 | | #endif | 526 | 20.9k | --e; | 527 | | | 528 | 100k | while (q < e) { | 529 | 98.9k | Py_UCS4 ch2; | 530 | | /* First check for possible aligned read of a C 'long'. Unaligned | 531 | | reads are more expensive, better to defer to another iteration. */ | 532 | 98.9k | if (_Py_IS_ALIGNED(q, ALIGNOF_LONG)) { | 533 | | /* Fast path for runs of in-range non-surrogate chars. */ | 534 | 25.4k | const unsigned char *_q = q; | 535 | 27.7k | while (_q + SIZEOF_LONG <= e) { | 536 | 25.3k | unsigned long block = * (const unsigned long *) _q; | 537 | 25.3k | if (native_ordering) { | 538 | | /* Can use buffer directly */ | 539 | 23.2k | if (block & FAST_CHAR_MASK) | 540 | 21.9k | break; | 541 | 23.2k | } | 542 | 2.03k | else { | 543 | | /* Need to byte-swap */ | 544 | 2.03k | if (block & SWAB(FAST_CHAR_MASK)) | 545 | 1.09k | break; | 546 | | #if STRINGLIB_SIZEOF_CHAR == 1 | 547 | | block >>= 8; | 548 | | #else | 549 | 939 | block = SWAB(block); | 550 | 939 | #endif | 551 | 939 | } | 552 | 2.29k | #if PY_LITTLE_ENDIAN | 553 | | # if SIZEOF_LONG == 4 | 554 | | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 555 | | p[1] = (STRINGLIB_CHAR)(block >> 16); | 556 | | # elif SIZEOF_LONG == 8 | 557 | 2.29k | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 558 | 2.29k | p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); | 559 | 2.29k | p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); | 560 | 2.29k | p[3] = (STRINGLIB_CHAR)(block >> 48); | 561 | 2.29k | # endif | 562 | | #else | 563 | | # if SIZEOF_LONG == 4 | 564 | | p[0] = 
(STRINGLIB_CHAR)(block >> 16); | 565 | | p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 566 | | # elif SIZEOF_LONG == 8 | 567 | | p[0] = (STRINGLIB_CHAR)(block >> 48); | 568 | | p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); | 569 | | p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); | 570 | | p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu); | 571 | | # endif | 572 | | #endif | 573 | 2.29k | _q += SIZEOF_LONG; | 574 | 2.29k | p += SIZEOF_LONG / 2; | 575 | 2.29k | } | 576 | 25.4k | q = _q; | 577 | 25.4k | if (q >= e) | 578 | 105 | break; | 579 | 25.4k | } | 580 | | | 581 | 98.8k | ch = (q[ihi] << 8) | q[ilo]; | 582 | 98.8k | q += 2; | 583 | 98.8k | if (!Py_UNICODE_IS_SURROGATE(ch)) { | 584 | | #if STRINGLIB_SIZEOF_CHAR < 2 | 585 | | if (ch > STRINGLIB_MAX_CHAR) | 586 | | /* Out-of-range */ | 587 | | goto Return; | 588 | | #endif | 589 | 78.7k | *p++ = (STRINGLIB_CHAR)ch; | 590 | 78.7k | continue; | 591 | 78.7k | } | 592 | | | 593 | | /* UTF-16 code pair: */ | 594 | 20.1k | if (!Py_UNICODE_IS_HIGH_SURROGATE(ch)) | 595 | 15.2k | goto IllegalEncoding; | 596 | 4.87k | if (q >= e) | 597 | 790 | goto UnexpectedEnd; | 598 | 4.08k | ch2 = (q[ihi] << 8) | q[ilo]; | 599 | 4.08k | q += 2; | 600 | 4.08k | if (!Py_UNICODE_IS_LOW_SURROGATE(ch2)) | 601 | 3.35k | goto IllegalSurrogate; | 602 | 728 | ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2); | 603 | | #if STRINGLIB_SIZEOF_CHAR < 4 | 604 | | /* Out-of-range */ | 605 | | goto Return; | 606 | | #else | 607 | 728 | *p++ = (STRINGLIB_CHAR)ch; | 608 | 728 | #endif | 609 | 728 | } | 610 | 1.54k | ch = 0; | 611 | 20.9k | Return: | 612 | 20.9k | *inptr = q; | 613 | 20.9k | *outpos = p - dest; | 614 | 20.9k | return ch; | 615 | 790 | UnexpectedEnd: | 616 | 790 | ch = 1; | 617 | 790 | goto Return; | 618 | 15.2k | IllegalEncoding: | 619 | 15.2k | ch = 2; | 620 | 15.2k | goto Return; | 621 | 3.35k | IllegalSurrogate: | 622 | 3.35k | ch = 3; | 623 | 3.35k | goto Return; | 624 | 1.54k | } |
|
625 | | #undef UCS2_REPEAT_MASK /* these four helpers are used only by utf16_decode above; presumably undefined so that each inclusion of this header can redefine them for its own STRINGLIB_SIZEOF_CHAR */
626 | | #undef FAST_CHAR_MASK
627 | | #undef STRIPPED_MASK
628 | | #undef SWAB
629 | | |
630 | | |
631 | | #if STRINGLIB_MAX_CHAR >= 0x80 |
632 | | Py_LOCAL_INLINE(Py_ssize_t) |
633 | | STRINGLIB(utf16_encode)(const STRINGLIB_CHAR *in, |
634 | | Py_ssize_t len, |
635 | | unsigned short **outptr, |
636 | | int native_ordering) |
637 | 5.55k | { |
638 | 5.55k | unsigned short *out = *outptr; |
639 | 5.55k | const STRINGLIB_CHAR *end = in + len; |
640 | | #if STRINGLIB_SIZEOF_CHAR == 1 |
641 | 5.49k | if (native_ordering) { |
642 | 2.59k | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); |
643 | 5.18k | while (in < unrolled_end) { |
644 | 2.59k | out[0] = in[0]; |
645 | 2.59k | out[1] = in[1]; |
646 | 2.59k | out[2] = in[2]; |
647 | 2.59k | out[3] = in[3]; |
648 | 2.59k | in += 4; out += 4; |
649 | 2.59k | } |
650 | 6.31k | while (in < end) { |
651 | 3.72k | *out++ = *in++; |
652 | 3.72k | } |
653 | 2.90k | } else { |
654 | 15.7k | # define SWAB2(CH) ((CH) << 8) /* high byte is zero */ |
655 | 2.90k | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); |
656 | 5.80k | while (in < unrolled_end) { |
657 | 2.90k | out[0] = SWAB2(in[0]); |
658 | 2.90k | out[1] = SWAB2(in[1]); |
659 | 2.90k | out[2] = SWAB2(in[2]); |
660 | 2.90k | out[3] = SWAB2(in[3]); |
661 | 2.90k | in += 4; out += 4; |
662 | 2.90k | } |
663 | 7.03k | while (in < end) { |
664 | 4.13k | Py_UCS4 ch = *in++; |
665 | 4.13k | *out++ = SWAB2((Py_UCS2)ch); |
666 | 4.13k | } |
667 | 2.90k | #undef SWAB2 |
668 | 2.90k | } |
669 | | *outptr = out; |
670 | | return len; |
671 | | #else |
672 | 61 | if (native_ordering) { |
673 | | #if STRINGLIB_MAX_CHAR < 0x10000 |
674 | 0 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); |
675 | 0 | while (in < unrolled_end) { |
676 | | /* check if any character is a surrogate character */ |
677 | 0 | if (((in[0] ^ 0xd800) & |
678 | 0 | (in[1] ^ 0xd800) & |
679 | 0 | (in[2] ^ 0xd800) & |
680 | 0 | (in[3] ^ 0xd800) & 0xf800) == 0) |
681 | 0 | break; |
682 | 0 | out[0] = in[0]; |
683 | 0 | out[1] = in[1]; |
684 | 0 | out[2] = in[2]; |
685 | 0 | out[3] = in[3]; |
686 | 0 | in += 4; out += 4; |
687 | 0 | } |
688 | | #endif |
689 | 0 | while (in < end) { |
690 | 0 | Py_UCS4 ch; |
691 | 0 | ch = *in++; |
692 | 0 | if (ch < 0xd800) |
693 | 0 | *out++ = ch; |
694 | 0 | else if (ch < 0xe000) |
695 | | /* reject surrogate characters (U+D800-U+DFFF) */ |
696 | 0 | goto fail; |
697 | | #if STRINGLIB_MAX_CHAR >= 0x10000 |
698 | 0 | else if (ch >= 0x10000) { |
699 | 0 | out[0] = Py_UNICODE_HIGH_SURROGATE(ch); |
700 | 0 | out[1] = Py_UNICODE_LOW_SURROGATE(ch); |
701 | 0 | out += 2; |
702 | 0 | } |
703 | 0 | #endif |
704 | 0 | else |
705 | 0 | *out++ = ch; |
706 | 0 | } |
707 | 61 | } else { |
708 | 494 | #define SWAB2(CH) (((CH) << 8) | ((CH) >> 8)) |
709 | | #if STRINGLIB_MAX_CHAR < 0x10000 |
710 | 61 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); |
711 | 126 | while (in < unrolled_end) { |
712 | | /* check if any character is a surrogate character */ |
713 | 87 | if (((in[0] ^ 0xd800) & |
714 | 87 | (in[1] ^ 0xd800) & |
715 | 87 | (in[2] ^ 0xd800) & |
716 | 87 | (in[3] ^ 0xd800) & 0xf800) == 0) |
717 | 22 | break; |
718 | 65 | out[0] = SWAB2(in[0]); |
719 | 65 | out[1] = SWAB2(in[1]); |
720 | 65 | out[2] = SWAB2(in[2]); |
721 | 65 | out[3] = SWAB2(in[3]); |
722 | 65 | in += 4; out += 4; |
723 | 65 | } |
724 | | #endif |
725 | 295 | while (in < end) { |
726 | 234 | Py_UCS4 ch = *in++; |
727 | 234 | if (ch < 0xd800) |
728 | 195 | *out++ = SWAB2((Py_UCS2)ch); |
729 | 39 | else if (ch < 0xe000) |
730 | | /* reject surrogate characters (U+D800-U+DFFF) */ |
731 | 0 | goto fail; |
732 | | #if STRINGLIB_MAX_CHAR >= 0x10000 |
733 | 0 | else if (ch >= 0x10000) { |
734 | 0 | Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch); |
735 | 0 | Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch); |
736 | 0 | out[0] = SWAB2(ch1); |
737 | 0 | out[1] = SWAB2(ch2); |
738 | 0 | out += 2; |
739 | 0 | } |
740 | 0 | #endif |
741 | 0 | else |
742 | 39 | *out++ = SWAB2((Py_UCS2)ch); |
743 | 234 | } |
744 | 61 | #undef SWAB2 |
745 | 61 | } |
746 | 61 | *outptr = out; |
747 | 61 | return len; |
748 | 0 | fail: |
749 | 0 | *outptr = out; |
750 | 0 | return len - (end - in + 1); |
751 | | #endif |
752 | 5.55k | } unicodeobject.c:ucs1lib_utf16_encode Line | Count | Source | 637 | 5.49k | { | 638 | 5.49k | unsigned short *out = *outptr; | 639 | 5.49k | const STRINGLIB_CHAR *end = in + len; | 640 | 5.49k | #if STRINGLIB_SIZEOF_CHAR == 1 | 641 | 5.49k | if (native_ordering) { | 642 | 2.59k | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); | 643 | 5.18k | while (in < unrolled_end) { | 644 | 2.59k | out[0] = in[0]; | 645 | 2.59k | out[1] = in[1]; | 646 | 2.59k | out[2] = in[2]; | 647 | 2.59k | out[3] = in[3]; | 648 | 2.59k | in += 4; out += 4; | 649 | 2.59k | } | 650 | 6.31k | while (in < end) { | 651 | 3.72k | *out++ = *in++; | 652 | 3.72k | } | 653 | 2.90k | } else { | 654 | 2.90k | # define SWAB2(CH) ((CH) << 8) /* high byte is zero */ | 655 | 2.90k | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); | 656 | 5.80k | while (in < unrolled_end) { | 657 | 2.90k | out[0] = SWAB2(in[0]); | 658 | 2.90k | out[1] = SWAB2(in[1]); | 659 | 2.90k | out[2] = SWAB2(in[2]); | 660 | 2.90k | out[3] = SWAB2(in[3]); | 661 | 2.90k | in += 4; out += 4; | 662 | 2.90k | } | 663 | 7.03k | while (in < end) { | 664 | 4.13k | Py_UCS4 ch = *in++; | 665 | 4.13k | *out++ = SWAB2((Py_UCS2)ch); | 666 | 4.13k | } | 667 | 2.90k | #undef SWAB2 | 668 | 2.90k | } | 669 | 5.49k | *outptr = out; | 670 | 5.49k | return len; | 671 | | #else | 672 | | if (native_ordering) { | 673 | | #if STRINGLIB_MAX_CHAR < 0x10000 | 674 | | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); | 675 | | while (in < unrolled_end) { | 676 | | /* check if any character is a surrogate character */ | 677 | | if (((in[0] ^ 0xd800) & | 678 | | (in[1] ^ 0xd800) & | 679 | | (in[2] ^ 0xd800) & | 680 | | (in[3] ^ 0xd800) & 0xf800) == 0) | 681 | | break; | 682 | | out[0] = in[0]; | 683 | | out[1] = in[1]; | 684 | | out[2] = in[2]; | 685 | | out[3] = in[3]; | 686 | | in += 4; out += 4; | 687 | | } | 688 | | #endif | 689 | | while (in < end) { | 690 | | Py_UCS4 ch; | 691 
| | ch = *in++; | 692 | | if (ch < 0xd800) | 693 | | *out++ = ch; | 694 | | else if (ch < 0xe000) | 695 | | /* reject surrogate characters (U+D800-U+DFFF) */ | 696 | | goto fail; | 697 | | #if STRINGLIB_MAX_CHAR >= 0x10000 | 698 | | else if (ch >= 0x10000) { | 699 | | out[0] = Py_UNICODE_HIGH_SURROGATE(ch); | 700 | | out[1] = Py_UNICODE_LOW_SURROGATE(ch); | 701 | | out += 2; | 702 | | } | 703 | | #endif | 704 | | else | 705 | | *out++ = ch; | 706 | | } | 707 | | } else { | 708 | | #define SWAB2(CH) (((CH) << 8) | ((CH) >> 8)) | 709 | | #if STRINGLIB_MAX_CHAR < 0x10000 | 710 | | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); | 711 | | while (in < unrolled_end) { | 712 | | /* check if any character is a surrogate character */ | 713 | | if (((in[0] ^ 0xd800) & | 714 | | (in[1] ^ 0xd800) & | 715 | | (in[2] ^ 0xd800) & | 716 | | (in[3] ^ 0xd800) & 0xf800) == 0) | 717 | | break; | 718 | | out[0] = SWAB2(in[0]); | 719 | | out[1] = SWAB2(in[1]); | 720 | | out[2] = SWAB2(in[2]); | 721 | | out[3] = SWAB2(in[3]); | 722 | | in += 4; out += 4; | 723 | | } | 724 | | #endif | 725 | | while (in < end) { | 726 | | Py_UCS4 ch = *in++; | 727 | | if (ch < 0xd800) | 728 | | *out++ = SWAB2((Py_UCS2)ch); | 729 | | else if (ch < 0xe000) | 730 | | /* reject surrogate characters (U+D800-U+DFFF) */ | 731 | | goto fail; | 732 | | #if STRINGLIB_MAX_CHAR >= 0x10000 | 733 | | else if (ch >= 0x10000) { | 734 | | Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch); | 735 | | Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch); | 736 | | out[0] = SWAB2(ch1); | 737 | | out[1] = SWAB2(ch2); | 738 | | out += 2; | 739 | | } | 740 | | #endif | 741 | | else | 742 | | *out++ = SWAB2((Py_UCS2)ch); | 743 | | } | 744 | | #undef SWAB2 | 745 | | } | 746 | | *outptr = out; | 747 | | return len; | 748 | | fail: | 749 | | *outptr = out; | 750 | | return len - (end - in + 1); | 751 | | #endif | 752 | 5.49k | } |
unicodeobject.c:ucs2lib_utf16_encode Line | Count | Source | 637 | 61 | { | 638 | 61 | unsigned short *out = *outptr; | 639 | 61 | const STRINGLIB_CHAR *end = in + len; | 640 | | #if STRINGLIB_SIZEOF_CHAR == 1 | 641 | | if (native_ordering) { | 642 | | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); | 643 | | while (in < unrolled_end) { | 644 | | out[0] = in[0]; | 645 | | out[1] = in[1]; | 646 | | out[2] = in[2]; | 647 | | out[3] = in[3]; | 648 | | in += 4; out += 4; | 649 | | } | 650 | | while (in < end) { | 651 | | *out++ = *in++; | 652 | | } | 653 | | } else { | 654 | | # define SWAB2(CH) ((CH) << 8) /* high byte is zero */ | 655 | | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); | 656 | | while (in < unrolled_end) { | 657 | | out[0] = SWAB2(in[0]); | 658 | | out[1] = SWAB2(in[1]); | 659 | | out[2] = SWAB2(in[2]); | 660 | | out[3] = SWAB2(in[3]); | 661 | | in += 4; out += 4; | 662 | | } | 663 | | while (in < end) { | 664 | | Py_UCS4 ch = *in++; | 665 | | *out++ = SWAB2((Py_UCS2)ch); | 666 | | } | 667 | | #undef SWAB2 | 668 | | } | 669 | | *outptr = out; | 670 | | return len; | 671 | | #else | 672 | 61 | if (native_ordering) { | 673 | 0 | #if STRINGLIB_MAX_CHAR < 0x10000 | 674 | 0 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); | 675 | 0 | while (in < unrolled_end) { | 676 | | /* check if any character is a surrogate character */ | 677 | 0 | if (((in[0] ^ 0xd800) & | 678 | 0 | (in[1] ^ 0xd800) & | 679 | 0 | (in[2] ^ 0xd800) & | 680 | 0 | (in[3] ^ 0xd800) & 0xf800) == 0) | 681 | 0 | break; | 682 | 0 | out[0] = in[0]; | 683 | 0 | out[1] = in[1]; | 684 | 0 | out[2] = in[2]; | 685 | 0 | out[3] = in[3]; | 686 | 0 | in += 4; out += 4; | 687 | 0 | } | 688 | 0 | #endif | 689 | 0 | while (in < end) { | 690 | 0 | Py_UCS4 ch; | 691 | 0 | ch = *in++; | 692 | 0 | if (ch < 0xd800) | 693 | 0 | *out++ = ch; | 694 | 0 | else if (ch < 0xe000) | 695 | | /* reject surrogate characters (U+D800-U+DFFF) */ | 696 | 
0 | goto fail; | 697 | | #if STRINGLIB_MAX_CHAR >= 0x10000 | 698 | | else if (ch >= 0x10000) { | 699 | | out[0] = Py_UNICODE_HIGH_SURROGATE(ch); | 700 | | out[1] = Py_UNICODE_LOW_SURROGATE(ch); | 701 | | out += 2; | 702 | | } | 703 | | #endif | 704 | 0 | else | 705 | 0 | *out++ = ch; | 706 | 0 | } | 707 | 61 | } else { | 708 | 61 | #define SWAB2(CH) (((CH) << 8) | ((CH) >> 8)) | 709 | 61 | #if STRINGLIB_MAX_CHAR < 0x10000 | 710 | 61 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); | 711 | 126 | while (in < unrolled_end) { | 712 | | /* check if any character is a surrogate character */ | 713 | 87 | if (((in[0] ^ 0xd800) & | 714 | 87 | (in[1] ^ 0xd800) & | 715 | 87 | (in[2] ^ 0xd800) & | 716 | 87 | (in[3] ^ 0xd800) & 0xf800) == 0) | 717 | 22 | break; | 718 | 65 | out[0] = SWAB2(in[0]); | 719 | 65 | out[1] = SWAB2(in[1]); | 720 | 65 | out[2] = SWAB2(in[2]); | 721 | 65 | out[3] = SWAB2(in[3]); | 722 | 65 | in += 4; out += 4; | 723 | 65 | } | 724 | 61 | #endif | 725 | 295 | while (in < end) { | 726 | 234 | Py_UCS4 ch = *in++; | 727 | 234 | if (ch < 0xd800) | 728 | 195 | *out++ = SWAB2((Py_UCS2)ch); | 729 | 39 | else if (ch < 0xe000) | 730 | | /* reject surrogate characters (U+D800-U+DFFF) */ | 731 | 0 | goto fail; | 732 | | #if STRINGLIB_MAX_CHAR >= 0x10000 | 733 | | else if (ch >= 0x10000) { | 734 | | Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch); | 735 | | Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch); | 736 | | out[0] = SWAB2(ch1); | 737 | | out[1] = SWAB2(ch2); | 738 | | out += 2; | 739 | | } | 740 | | #endif | 741 | 39 | else | 742 | 39 | *out++ = SWAB2((Py_UCS2)ch); | 743 | 234 | } | 744 | 61 | #undef SWAB2 | 745 | 61 | } | 746 | 61 | *outptr = out; | 747 | 61 | return len; | 748 | 0 | fail: | 749 | 0 | *outptr = out; | 750 | 0 | return len - (end - in + 1); | 751 | 61 | #endif | 752 | 61 | } |
Unexecuted instantiation: unicodeobject.c:ucs4lib_utf16_encode |
753 | | |
754 | | static inline uint32_t |
755 | | STRINGLIB(SWAB4)(STRINGLIB_CHAR ch) |
756 | 0 | { |
757 | 0 | uint32_t word = ch; |
758 | | #if STRINGLIB_SIZEOF_CHAR == 1 |
759 | | /* high bytes are zero */ |
760 | | return (word << 24); |
761 | | #elif STRINGLIB_SIZEOF_CHAR == 2 |
762 | | /* high bytes are zero */ |
763 | | return ((word & 0x00FFu) << 24) | ((word & 0xFF00u) << 8); |
764 | | #else |
765 | | return _Py_bswap32(word); |
766 | | #endif |
767 | 0 | } Unexecuted instantiation: unicodeobject.c:ucs1lib_SWAB4 Unexecuted instantiation: unicodeobject.c:ucs2lib_SWAB4 Unexecuted instantiation: unicodeobject.c:ucs4lib_SWAB4 |
768 | | |
769 | | Py_LOCAL_INLINE(Py_ssize_t) |
770 | | STRINGLIB(utf32_encode)(const STRINGLIB_CHAR *in, |
771 | | Py_ssize_t len, |
772 | | uint32_t **outptr, |
773 | | int native_ordering) |
774 | 0 | { |
775 | 0 | uint32_t *out = *outptr; |
776 | 0 | const STRINGLIB_CHAR *end = in + len; |
777 | 0 | if (native_ordering) { |
778 | 0 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); |
779 | 0 | while (in < unrolled_end) { |
780 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
781 | | /* check if any character is a surrogate character */ |
782 | 0 | if (((in[0] ^ 0xd800) & |
783 | 0 | (in[1] ^ 0xd800) & |
784 | 0 | (in[2] ^ 0xd800) & |
785 | 0 | (in[3] ^ 0xd800) & 0xf800) == 0) |
786 | 0 | break; |
787 | 0 | #endif |
788 | 0 | out[0] = in[0]; |
789 | 0 | out[1] = in[1]; |
790 | 0 | out[2] = in[2]; |
791 | 0 | out[3] = in[3]; |
792 | 0 | in += 4; out += 4; |
793 | 0 | } |
794 | 0 | while (in < end) { |
795 | 0 | Py_UCS4 ch; |
796 | 0 | ch = *in++; |
797 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
798 | 0 | if (Py_UNICODE_IS_SURROGATE(ch)) { |
799 | | /* reject surrogate characters (U+D800-U+DFFF) */ |
800 | 0 | goto fail; |
801 | 0 | } |
802 | 0 | #endif |
803 | 0 | *out++ = ch; |
804 | 0 | } |
805 | 0 | } else { |
806 | 0 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); |
807 | 0 | while (in < unrolled_end) { |
808 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
809 | | /* check if any character is a surrogate character */ |
810 | 0 | if (((in[0] ^ 0xd800) & |
811 | 0 | (in[1] ^ 0xd800) & |
812 | 0 | (in[2] ^ 0xd800) & |
813 | 0 | (in[3] ^ 0xd800) & 0xf800) == 0) |
814 | 0 | break; |
815 | 0 | #endif |
816 | 0 | out[0] = STRINGLIB(SWAB4)(in[0]); |
817 | 0 | out[1] = STRINGLIB(SWAB4)(in[1]); |
818 | 0 | out[2] = STRINGLIB(SWAB4)(in[2]); |
819 | 0 | out[3] = STRINGLIB(SWAB4)(in[3]); |
820 | 0 | in += 4; out += 4; |
821 | 0 | } |
822 | 0 | while (in < end) { |
823 | 0 | Py_UCS4 ch = *in++; |
824 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
825 | 0 | if (Py_UNICODE_IS_SURROGATE(ch)) { |
826 | | /* reject surrogate characters (U+D800-U+DFFF) */ |
827 | 0 | goto fail; |
828 | 0 | } |
829 | 0 | #endif |
830 | 0 | *out++ = STRINGLIB(SWAB4)(ch); |
831 | 0 | } |
832 | 0 | } |
833 | 0 | *outptr = out; |
834 | 0 | return len; |
835 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
836 | 0 | fail: |
837 | 0 | *outptr = out; |
838 | 0 | return len - (end - in + 1); |
839 | | #endif |
840 | 0 | } Unexecuted instantiation: unicodeobject.c:ucs1lib_utf32_encode Unexecuted instantiation: unicodeobject.c:ucs2lib_utf32_encode Unexecuted instantiation: unicodeobject.c:ucs4lib_utf32_encode |
841 | | |
842 | | #endif |