/src/Python-3.8.3/Objects/stringlib/codecs.h
Line | Count | Source (jump to first uncovered line) |
1 | | /* stringlib: codec implementations */ |
2 | | |
3 | | #if !STRINGLIB_IS_UNICODE |
4 | | # error "codecs.h is specific to Unicode" |
5 | | #endif |
6 | | |
7 | | /* Mask to quickly check whether a C 'long' contains a |
8 | | non-ASCII, UTF8-encoded char. */ |
9 | | #if (SIZEOF_LONG == 8) |
10 | 585 | # define ASCII_CHAR_MASK 0x8080808080808080UL |
11 | | #elif (SIZEOF_LONG == 4) |
12 | | # define ASCII_CHAR_MASK 0x80808080UL |
13 | | #else |
14 | | # error C 'long' size should be either 4 or 8! |
15 | | #endif |
16 | | |
17 | | /* 10xxxxxx */ |
18 | 2.66k | #define IS_CONTINUATION_BYTE(ch) ((ch) >= 0x80 && (ch) < 0xC0) |
19 | | |
20 | | Py_LOCAL_INLINE(Py_UCS4) |
21 | | STRINGLIB(utf8_decode)(const char **inptr, const char *end, |
22 | | STRINGLIB_CHAR *dest, |
23 | | Py_ssize_t *outpos) |
24 | 44 | { |
25 | 44 | Py_UCS4 ch; |
26 | 44 | const char *s = *inptr; |
27 | 44 | const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG); |
28 | 44 | STRINGLIB_CHAR *p = dest + *outpos; |
29 | | |
30 | 1.81k | while (s < end) { |
31 | 1.79k | ch = (unsigned char)*s; |
32 | | |
33 | 1.79k | if (ch < 0x80) { |
34 | | /* Fast path for runs of ASCII characters. Given that common UTF-8 |
35 | | input will consist of an overwhelming majority of ASCII |
36 | | characters, we try to optimize for this case by checking |
37 | | as many characters as a C 'long' can contain. |
38 | | First, check if we can do an aligned read, as most CPUs have |
39 | | a penalty for unaligned reads. |
40 | | */ |
41 | 4 | if (_Py_IS_ALIGNED(s, SIZEOF_LONG)) { |
42 | | /* Help register allocation */ |
43 | 1 | const char *_s = s; |
44 | 1 | STRINGLIB_CHAR *_p = p; |
45 | 586 | while (_s < aligned_end) { |
46 | | /* Read a whole long at a time (either 4 or 8 bytes), |
47 | | and do a fast unrolled copy if it only contains ASCII |
48 | | characters. */ |
49 | 585 | unsigned long value = *(const unsigned long *) _s; |
50 | 585 | if (value & ASCII_CHAR_MASK) |
51 | 0 | break; |
52 | 585 | #if PY_LITTLE_ENDIAN |
53 | 585 | _p[0] = (STRINGLIB_CHAR)(value & 0xFFu); |
54 | 585 | _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); |
55 | 585 | _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); |
56 | 585 | _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); |
57 | 585 | # if SIZEOF_LONG == 8 |
58 | 585 | _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); |
59 | 585 | _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); |
60 | 585 | _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); |
61 | 585 | _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); |
62 | 585 | # endif |
63 | | #else |
64 | | # if SIZEOF_LONG == 8 |
65 | | _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); |
66 | | _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); |
67 | | _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); |
68 | | _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); |
69 | | _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); |
70 | | _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); |
71 | | _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); |
72 | | _p[7] = (STRINGLIB_CHAR)(value & 0xFFu); |
73 | | # else |
74 | | _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); |
75 | | _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); |
76 | | _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); |
77 | | _p[3] = (STRINGLIB_CHAR)(value & 0xFFu); |
78 | | # endif |
79 | | #endif |
80 | 585 | _s += SIZEOF_LONG; |
81 | 585 | _p += SIZEOF_LONG; |
82 | 585 | } |
83 | 1 | s = _s; |
84 | 1 | p = _p; |
85 | 1 | if (s == end) |
86 | 0 | break; |
87 | 1 | ch = (unsigned char)*s; |
88 | 1 | } |
89 | 4 | if (ch < 0x80) { |
90 | 4 | s++; |
91 | 4 | *p++ = ch; |
92 | 4 | continue; |
93 | 4 | } |
94 | 4 | } |
95 | | |
96 | 1.79k | if (ch < 0xE0) { |
97 | | /* \xC2\x80-\xDF\xBF -- 0080-07FF */ |
98 | 925 | Py_UCS4 ch2; |
99 | 925 | if (ch < 0xC2) { |
100 | | /* invalid sequence |
101 | | \x80-\xBF -- continuation byte |
102 | | \xC0-\xC1 -- fake 0000-007F */ |
103 | 0 | goto InvalidStart; |
104 | 0 | } |
105 | 925 | if (end - s < 2) { |
106 | | /* unexpected end of data: the caller will decide whether |
107 | | it's an error or not */ |
108 | 0 | break; |
109 | 0 | } |
110 | 925 | ch2 = (unsigned char)s[1]; |
111 | 925 | if (!IS_CONTINUATION_BYTE(ch2)) |
112 | | /* invalid continuation byte */ |
113 | 0 | goto InvalidContinuation1; |
114 | 925 | ch = (ch << 6) + ch2 - |
115 | 925 | ((0xC0 << 6) + 0x80); |
116 | 925 | assert ((ch > 0x007F) && (ch <= 0x07FF)); |
117 | 925 | s += 2; |
118 | 925 | if (STRINGLIB_MAX_CHAR <= 0x007F || |
119 | 925 | (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR)) |
120 | | /* Out-of-range */ |
121 | 15 | goto Return; |
122 | 910 | *p++ = ch; |
123 | 910 | continue; |
124 | 925 | } |
125 | | |
126 | 868 | if (ch < 0xF0) { |
127 | | /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */ |
128 | 868 | Py_UCS4 ch2, ch3; |
129 | 868 | if (end - s < 3) { |
130 | | /* unexpected end of data: the caller will decide whether |
131 | | it's an error or not */ |
132 | 0 | if (end - s < 2) |
133 | 0 | break; |
134 | 0 | ch2 = (unsigned char)s[1]; |
135 | 0 | if (!IS_CONTINUATION_BYTE(ch2) || |
136 | 0 | (ch2 < 0xA0 ? ch == 0xE0 : ch == 0xED)) |
137 | | /* for clarification see comments below */ |
138 | 0 | goto InvalidContinuation1; |
139 | 0 | break; |
140 | 0 | } |
141 | 868 | ch2 = (unsigned char)s[1]; |
142 | 868 | ch3 = (unsigned char)s[2]; |
143 | 868 | if (!IS_CONTINUATION_BYTE(ch2)) { |
144 | | /* invalid continuation byte */ |
145 | 0 | goto InvalidContinuation1; |
146 | 0 | } |
147 | 868 | if (ch == 0xE0) { |
148 | 0 | if (ch2 < 0xA0) |
149 | | /* invalid sequence |
150 | | \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */ |
151 | 0 | goto InvalidContinuation1; |
152 | 868 | } else if (ch == 0xED && ch2 >= 0xA0) { |
153 | | /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF |
154 | | will result in surrogates in range D800-DFFF. Surrogates are |
155 | | not valid UTF-8 so they are rejected. |
156 | | See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf |
157 | | (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ |
158 | 0 | goto InvalidContinuation1; |
159 | 0 | } |
160 | 868 | if (!IS_CONTINUATION_BYTE(ch3)) { |
161 | | /* invalid continuation byte */ |
162 | 0 | goto InvalidContinuation2; |
163 | 0 | } |
164 | 868 | ch = (ch << 12) + (ch2 << 6) + ch3 - |
165 | 868 | ((0xE0 << 12) + (0x80 << 6) + 0x80); |
166 | 868 | assert ((ch > 0x07FF) && (ch <= 0xFFFF)); |
167 | 868 | s += 3; |
168 | 868 | if (STRINGLIB_MAX_CHAR <= 0x07FF || |
169 | 868 | (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR)) |
170 | | /* Out-of-range */ |
171 | 14 | goto Return; |
172 | 854 | *p++ = ch; |
173 | 854 | continue; |
174 | 868 | } |
175 | | |
176 | 0 | if (ch < 0xF5) { |
177 | | /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */ |
178 | 0 | Py_UCS4 ch2, ch3, ch4; |
179 | 0 | if (end - s < 4) { |
180 | | /* unexpected end of data: the caller will decide whether |
181 | | it's an error or not */ |
182 | 0 | if (end - s < 2) |
183 | 0 | break; |
184 | 0 | ch2 = (unsigned char)s[1]; |
185 | 0 | if (!IS_CONTINUATION_BYTE(ch2) || |
186 | 0 | (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4)) |
187 | | /* for clarification see comments below */ |
188 | 0 | goto InvalidContinuation1; |
189 | 0 | if (end - s < 3) |
190 | 0 | break; |
191 | 0 | ch3 = (unsigned char)s[2]; |
192 | 0 | if (!IS_CONTINUATION_BYTE(ch3)) |
193 | 0 | goto InvalidContinuation2; |
194 | 0 | break; |
195 | 0 | } |
196 | 0 | ch2 = (unsigned char)s[1]; |
197 | 0 | ch3 = (unsigned char)s[2]; |
198 | 0 | ch4 = (unsigned char)s[3]; |
199 | 0 | if (!IS_CONTINUATION_BYTE(ch2)) { |
200 | | /* invalid continuation byte */ |
201 | 0 | goto InvalidContinuation1; |
202 | 0 | } |
203 | 0 | if (ch == 0xF0) { |
204 | 0 | if (ch2 < 0x90) |
205 | | /* invalid sequence |
206 | | \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */ |
207 | 0 | goto InvalidContinuation1; |
208 | 0 | } else if (ch == 0xF4 && ch2 >= 0x90) { |
209 | | /* invalid sequence |
210 | | \xF4\x90\x80\x80- -- 110000- overflow */ |
211 | 0 | goto InvalidContinuation1; |
212 | 0 | } |
213 | 0 | if (!IS_CONTINUATION_BYTE(ch3)) { |
214 | | /* invalid continuation byte */ |
215 | 0 | goto InvalidContinuation2; |
216 | 0 | } |
217 | 0 | if (!IS_CONTINUATION_BYTE(ch4)) { |
218 | | /* invalid continuation byte */ |
219 | 0 | goto InvalidContinuation3; |
220 | 0 | } |
221 | 0 | ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 - |
222 | 0 | ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80); |
223 | 0 | assert ((ch > 0xFFFF) && (ch <= 0x10FFFF)); |
224 | 0 | s += 4; |
225 | 0 | if (STRINGLIB_MAX_CHAR <= 0xFFFF || |
226 | 0 | (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR)) |
227 | | /* Out-of-range */ |
228 | 0 | goto Return; |
229 | 0 | *p++ = ch; |
230 | 0 | continue; |
231 | 0 | } |
232 | 0 | goto InvalidStart; |
233 | 0 | } |
234 | 15 | ch = 0; |
235 | 44 | Return: |
236 | 44 | *inptr = s; |
237 | 44 | *outpos = p - dest; |
238 | 44 | return ch; |
239 | 0 | InvalidStart: |
240 | 0 | ch = 1; |
241 | 0 | goto Return; |
242 | 0 | InvalidContinuation1: |
243 | 0 | ch = 2; |
244 | 0 | goto Return; |
245 | 0 | InvalidContinuation2: |
246 | 0 | ch = 3; |
247 | 0 | goto Return; |
248 | 0 | InvalidContinuation3: |
249 | 0 | ch = 4; |
250 | 0 | goto Return; |
251 | 15 | } unicodeobject.c:asciilib_utf8_decode Line | Count | Source | 24 | 15 | { | 25 | 15 | Py_UCS4 ch; | 26 | 15 | const char *s = *inptr; | 27 | 15 | const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG); | 28 | 15 | STRINGLIB_CHAR *p = dest + *outpos; | 29 | | | 30 | 15 | while (s < end) { | 31 | 15 | ch = (unsigned char)*s; | 32 | | | 33 | 15 | if (ch < 0x80) { | 34 | | /* Fast path for runs of ASCII characters. Given that common UTF-8 | 35 | | input will consist of an overwhelming majority of ASCII | 36 | | characters, we try to optimize for this case by checking | 37 | | as many characters as a C 'long' can contain. | 38 | | First, check if we can do an aligned read, as most CPUs have | 39 | | a penalty for unaligned reads. | 40 | | */ | 41 | 0 | if (_Py_IS_ALIGNED(s, SIZEOF_LONG)) { | 42 | | /* Help register allocation */ | 43 | 0 | const char *_s = s; | 44 | 0 | STRINGLIB_CHAR *_p = p; | 45 | 0 | while (_s < aligned_end) { | 46 | | /* Read a whole long at a time (either 4 or 8 bytes), | 47 | | and do a fast unrolled copy if it only contains ASCII | 48 | | characters. 
*/ | 49 | 0 | unsigned long value = *(const unsigned long *) _s; | 50 | 0 | if (value & ASCII_CHAR_MASK) | 51 | 0 | break; | 52 | 0 | #if PY_LITTLE_ENDIAN | 53 | 0 | _p[0] = (STRINGLIB_CHAR)(value & 0xFFu); | 54 | 0 | _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 55 | 0 | _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 56 | 0 | _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 57 | 0 | # if SIZEOF_LONG == 8 | 58 | 0 | _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 59 | 0 | _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 60 | 0 | _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 61 | 0 | _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 62 | 0 | # endif | 63 | | #else | 64 | | # if SIZEOF_LONG == 8 | 65 | | _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 66 | | _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 67 | | _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 68 | | _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 69 | | _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 70 | | _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 71 | | _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 72 | | _p[7] = (STRINGLIB_CHAR)(value & 0xFFu); | 73 | | # else | 74 | | _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 75 | | _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 76 | | _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 77 | | _p[3] = (STRINGLIB_CHAR)(value & 0xFFu); | 78 | | # endif | 79 | | #endif | 80 | 0 | _s += SIZEOF_LONG; | 81 | 0 | _p += SIZEOF_LONG; | 82 | 0 | } | 83 | 0 | s = _s; | 84 | 0 | p = _p; | 85 | 0 | if (s == end) | 86 | 0 | break; | 87 | 0 | ch = (unsigned char)*s; | 88 | 0 | } | 89 | 0 | if (ch < 0x80) { | 90 | 0 | s++; | 91 | 0 | *p++ = ch; | 92 | 0 | continue; | 93 | 0 | } | 94 | 0 | } | 95 | | | 96 | 15 | if (ch < 0xE0) { | 97 | | /* \xC2\x80-\xDF\xBF -- 0080-07FF */ | 98 | 15 | Py_UCS4 ch2; | 99 | 15 | if (ch < 0xC2) { | 100 | | /* invalid sequence | 101 | | \x80-\xBF -- continuation byte | 102 | | \xC0-\xC1 -- 
fake 0000-007F */ | 103 | 0 | goto InvalidStart; | 104 | 0 | } | 105 | 15 | if (end - s < 2) { | 106 | | /* unexpected end of data: the caller will decide whether | 107 | | it's an error or not */ | 108 | 0 | break; | 109 | 0 | } | 110 | 15 | ch2 = (unsigned char)s[1]; | 111 | 15 | if (!IS_CONTINUATION_BYTE(ch2)) | 112 | | /* invalid continuation byte */ | 113 | 0 | goto InvalidContinuation1; | 114 | 15 | ch = (ch << 6) + ch2 - | 115 | 15 | ((0xC0 << 6) + 0x80); | 116 | 15 | assert ((ch > 0x007F) && (ch <= 0x07FF)); | 117 | 15 | s += 2; | 118 | 15 | if (STRINGLIB_MAX_CHAR <= 0x007F || | 119 | 15 | (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR)) | 120 | | /* Out-of-range */ | 121 | 15 | goto Return; | 122 | 0 | *p++ = ch; | 123 | 0 | continue; | 124 | 15 | } | 125 | | | 126 | 0 | if (ch < 0xF0) { | 127 | | /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */ | 128 | 0 | Py_UCS4 ch2, ch3; | 129 | 0 | if (end - s < 3) { | 130 | | /* unexpected end of data: the caller will decide whether | 131 | | it's an error or not */ | 132 | 0 | if (end - s < 2) | 133 | 0 | break; | 134 | 0 | ch2 = (unsigned char)s[1]; | 135 | 0 | if (!IS_CONTINUATION_BYTE(ch2) || | 136 | 0 | (ch2 < 0xA0 ? ch == 0xE0 : ch == 0xED)) | 137 | | /* for clarification see comments below */ | 138 | 0 | goto InvalidContinuation1; | 139 | 0 | break; | 140 | 0 | } | 141 | 0 | ch2 = (unsigned char)s[1]; | 142 | 0 | ch3 = (unsigned char)s[2]; | 143 | 0 | if (!IS_CONTINUATION_BYTE(ch2)) { | 144 | | /* invalid continuation byte */ | 145 | 0 | goto InvalidContinuation1; | 146 | 0 | } | 147 | 0 | if (ch == 0xE0) { | 148 | 0 | if (ch2 < 0xA0) | 149 | | /* invalid sequence | 150 | | \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */ | 151 | 0 | goto InvalidContinuation1; | 152 | 0 | } else if (ch == 0xED && ch2 >= 0xA0) { | 153 | | /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF | 154 | | will result in surrogates in range D800-DFFF. Surrogates are | 155 | | not valid UTF-8 so they are rejected. 
| 156 | | See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf | 157 | | (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ | 158 | 0 | goto InvalidContinuation1; | 159 | 0 | } | 160 | 0 | if (!IS_CONTINUATION_BYTE(ch3)) { | 161 | | /* invalid continuation byte */ | 162 | 0 | goto InvalidContinuation2; | 163 | 0 | } | 164 | 0 | ch = (ch << 12) + (ch2 << 6) + ch3 - | 165 | 0 | ((0xE0 << 12) + (0x80 << 6) + 0x80); | 166 | 0 | assert ((ch > 0x07FF) && (ch <= 0xFFFF)); | 167 | 0 | s += 3; | 168 | 0 | if (STRINGLIB_MAX_CHAR <= 0x07FF || | 169 | 0 | (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR)) | 170 | | /* Out-of-range */ | 171 | 0 | goto Return; | 172 | 0 | *p++ = ch; | 173 | 0 | continue; | 174 | 0 | } | 175 | | | 176 | 0 | if (ch < 0xF5) { | 177 | | /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */ | 178 | 0 | Py_UCS4 ch2, ch3, ch4; | 179 | 0 | if (end - s < 4) { | 180 | | /* unexpected end of data: the caller will decide whether | 181 | | it's an error or not */ | 182 | 0 | if (end - s < 2) | 183 | 0 | break; | 184 | 0 | ch2 = (unsigned char)s[1]; | 185 | 0 | if (!IS_CONTINUATION_BYTE(ch2) || | 186 | 0 | (ch2 < 0x90 ? 
ch == 0xF0 : ch == 0xF4)) | 187 | | /* for clarification see comments below */ | 188 | 0 | goto InvalidContinuation1; | 189 | 0 | if (end - s < 3) | 190 | 0 | break; | 191 | 0 | ch3 = (unsigned char)s[2]; | 192 | 0 | if (!IS_CONTINUATION_BYTE(ch3)) | 193 | 0 | goto InvalidContinuation2; | 194 | 0 | break; | 195 | 0 | } | 196 | 0 | ch2 = (unsigned char)s[1]; | 197 | 0 | ch3 = (unsigned char)s[2]; | 198 | 0 | ch4 = (unsigned char)s[3]; | 199 | 0 | if (!IS_CONTINUATION_BYTE(ch2)) { | 200 | | /* invalid continuation byte */ | 201 | 0 | goto InvalidContinuation1; | 202 | 0 | } | 203 | 0 | if (ch == 0xF0) { | 204 | 0 | if (ch2 < 0x90) | 205 | | /* invalid sequence | 206 | | \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */ | 207 | 0 | goto InvalidContinuation1; | 208 | 0 | } else if (ch == 0xF4 && ch2 >= 0x90) { | 209 | | /* invalid sequence | 210 | | \xF4\x90\x80\x80- -- 110000- overflow */ | 211 | 0 | goto InvalidContinuation1; | 212 | 0 | } | 213 | 0 | if (!IS_CONTINUATION_BYTE(ch3)) { | 214 | | /* invalid continuation byte */ | 215 | 0 | goto InvalidContinuation2; | 216 | 0 | } | 217 | 0 | if (!IS_CONTINUATION_BYTE(ch4)) { | 218 | | /* invalid continuation byte */ | 219 | 0 | goto InvalidContinuation3; | 220 | 0 | } | 221 | 0 | ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 - | 222 | 0 | ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80); | 223 | 0 | assert ((ch > 0xFFFF) && (ch <= 0x10FFFF)); | 224 | 0 | s += 4; | 225 | 0 | if (STRINGLIB_MAX_CHAR <= 0xFFFF || | 226 | 0 | (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR)) | 227 | | /* Out-of-range */ | 228 | 0 | goto Return; | 229 | 0 | *p++ = ch; | 230 | 0 | continue; | 231 | 0 | } | 232 | 0 | goto InvalidStart; | 233 | 0 | } | 234 | 0 | ch = 0; | 235 | 15 | Return: | 236 | 15 | *inptr = s; | 237 | 15 | *outpos = p - dest; | 238 | 15 | return ch; | 239 | 0 | InvalidStart: | 240 | 0 | ch = 1; | 241 | 0 | goto Return; | 242 | 0 | InvalidContinuation1: | 243 | 0 | ch = 2; | 244 | 0 | goto Return; | 245 
| 0 | InvalidContinuation2: | 246 | 0 | ch = 3; | 247 | 0 | goto Return; | 248 | 0 | InvalidContinuation3: | 249 | 0 | ch = 4; | 250 | 0 | goto Return; | 251 | 0 | } |
unicodeobject.c:ucs1lib_utf8_decode Line | Count | Source | 24 | 15 | { | 25 | 15 | Py_UCS4 ch; | 26 | 15 | const char *s = *inptr; | 27 | 15 | const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG); | 28 | 15 | STRINGLIB_CHAR *p = dest + *outpos; | 29 | | | 30 | 425 | while (s < end) { | 31 | 424 | ch = (unsigned char)*s; | 32 | | | 33 | 424 | if (ch < 0x80) { | 34 | | /* Fast path for runs of ASCII characters. Given that common UTF-8 | 35 | | input will consist of an overwhelming majority of ASCII | 36 | | characters, we try to optimize for this case by checking | 37 | | as many characters as a C 'long' can contain. | 38 | | First, check if we can do an aligned read, as most CPUs have | 39 | | a penalty for unaligned reads. | 40 | | */ | 41 | 4 | if (_Py_IS_ALIGNED(s, SIZEOF_LONG)) { | 42 | | /* Help register allocation */ | 43 | 1 | const char *_s = s; | 44 | 1 | STRINGLIB_CHAR *_p = p; | 45 | 586 | while (_s < aligned_end) { | 46 | | /* Read a whole long at a time (either 4 or 8 bytes), | 47 | | and do a fast unrolled copy if it only contains ASCII | 48 | | characters. 
*/ | 49 | 585 | unsigned long value = *(const unsigned long *) _s; | 50 | 585 | if (value & ASCII_CHAR_MASK) | 51 | 0 | break; | 52 | 585 | #if PY_LITTLE_ENDIAN | 53 | 585 | _p[0] = (STRINGLIB_CHAR)(value & 0xFFu); | 54 | 585 | _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 55 | 585 | _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 56 | 585 | _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 57 | 585 | # if SIZEOF_LONG == 8 | 58 | 585 | _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 59 | 585 | _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 60 | 585 | _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 61 | 585 | _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 62 | 585 | # endif | 63 | | #else | 64 | | # if SIZEOF_LONG == 8 | 65 | | _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 66 | | _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 67 | | _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 68 | | _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 69 | | _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 70 | | _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 71 | | _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 72 | | _p[7] = (STRINGLIB_CHAR)(value & 0xFFu); | 73 | | # else | 74 | | _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 75 | | _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 76 | | _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 77 | | _p[3] = (STRINGLIB_CHAR)(value & 0xFFu); | 78 | | # endif | 79 | | #endif | 80 | 585 | _s += SIZEOF_LONG; | 81 | 585 | _p += SIZEOF_LONG; | 82 | 585 | } | 83 | 1 | s = _s; | 84 | 1 | p = _p; | 85 | 1 | if (s == end) | 86 | 0 | break; | 87 | 1 | ch = (unsigned char)*s; | 88 | 1 | } | 89 | 4 | if (ch < 0x80) { | 90 | 4 | s++; | 91 | 4 | *p++ = ch; | 92 | 4 | continue; | 93 | 4 | } | 94 | 4 | } | 95 | | | 96 | 420 | if (ch < 0xE0) { | 97 | | /* \xC2\x80-\xDF\xBF -- 0080-07FF */ | 98 | 406 | Py_UCS4 ch2; | 99 | 406 | if (ch < 0xC2) { | 100 | | /* invalid sequence | 101 | | \x80-\xBF -- 
continuation byte | 102 | | \xC0-\xC1 -- fake 0000-007F */ | 103 | 0 | goto InvalidStart; | 104 | 0 | } | 105 | 406 | if (end - s < 2) { | 106 | | /* unexpected end of data: the caller will decide whether | 107 | | it's an error or not */ | 108 | 0 | break; | 109 | 0 | } | 110 | 406 | ch2 = (unsigned char)s[1]; | 111 | 406 | if (!IS_CONTINUATION_BYTE(ch2)) | 112 | | /* invalid continuation byte */ | 113 | 0 | goto InvalidContinuation1; | 114 | 406 | ch = (ch << 6) + ch2 - | 115 | 406 | ((0xC0 << 6) + 0x80); | 116 | 406 | assert ((ch > 0x007F) && (ch <= 0x07FF)); | 117 | 406 | s += 2; | 118 | 406 | if (STRINGLIB_MAX_CHAR <= 0x007F || | 119 | 406 | (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR)) | 120 | | /* Out-of-range */ | 121 | 0 | goto Return; | 122 | 406 | *p++ = ch; | 123 | 406 | continue; | 124 | 406 | } | 125 | | | 126 | 14 | if (ch < 0xF0) { | 127 | | /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */ | 128 | 14 | Py_UCS4 ch2, ch3; | 129 | 14 | if (end - s < 3) { | 130 | | /* unexpected end of data: the caller will decide whether | 131 | | it's an error or not */ | 132 | 0 | if (end - s < 2) | 133 | 0 | break; | 134 | 0 | ch2 = (unsigned char)s[1]; | 135 | 0 | if (!IS_CONTINUATION_BYTE(ch2) || | 136 | 0 | (ch2 < 0xA0 ? ch == 0xE0 : ch == 0xED)) | 137 | | /* for clarification see comments below */ | 138 | 0 | goto InvalidContinuation1; | 139 | 0 | break; | 140 | 0 | } | 141 | 14 | ch2 = (unsigned char)s[1]; | 142 | 14 | ch3 = (unsigned char)s[2]; | 143 | 14 | if (!IS_CONTINUATION_BYTE(ch2)) { | 144 | | /* invalid continuation byte */ | 145 | 0 | goto InvalidContinuation1; | 146 | 0 | } | 147 | 14 | if (ch == 0xE0) { | 148 | 0 | if (ch2 < 0xA0) | 149 | | /* invalid sequence | 150 | | \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */ | 151 | 0 | goto InvalidContinuation1; | 152 | 14 | } else if (ch == 0xED && ch2 >= 0xA0) { | 153 | | /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF | 154 | | will result in surrogates in range D800-DFFF. 
Surrogates are | 155 | | not valid UTF-8 so they are rejected. | 156 | | See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf | 157 | | (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ | 158 | 0 | goto InvalidContinuation1; | 159 | 0 | } | 160 | 14 | if (!IS_CONTINUATION_BYTE(ch3)) { | 161 | | /* invalid continuation byte */ | 162 | 0 | goto InvalidContinuation2; | 163 | 0 | } | 164 | 14 | ch = (ch << 12) + (ch2 << 6) + ch3 - | 165 | 14 | ((0xE0 << 12) + (0x80 << 6) + 0x80); | 166 | 14 | assert ((ch > 0x07FF) && (ch <= 0xFFFF)); | 167 | 14 | s += 3; | 168 | 14 | if (STRINGLIB_MAX_CHAR <= 0x07FF || | 169 | 14 | (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR)) | 170 | | /* Out-of-range */ | 171 | 14 | goto Return; | 172 | 0 | *p++ = ch; | 173 | 0 | continue; | 174 | 14 | } | 175 | | | 176 | 0 | if (ch < 0xF5) { | 177 | | /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */ | 178 | 0 | Py_UCS4 ch2, ch3, ch4; | 179 | 0 | if (end - s < 4) { | 180 | | /* unexpected end of data: the caller will decide whether | 181 | | it's an error or not */ | 182 | 0 | if (end - s < 2) | 183 | 0 | break; | 184 | 0 | ch2 = (unsigned char)s[1]; | 185 | 0 | if (!IS_CONTINUATION_BYTE(ch2) || | 186 | 0 | (ch2 < 0x90 ? 
ch == 0xF0 : ch == 0xF4)) | 187 | | /* for clarification see comments below */ | 188 | 0 | goto InvalidContinuation1; | 189 | 0 | if (end - s < 3) | 190 | 0 | break; | 191 | 0 | ch3 = (unsigned char)s[2]; | 192 | 0 | if (!IS_CONTINUATION_BYTE(ch3)) | 193 | 0 | goto InvalidContinuation2; | 194 | 0 | break; | 195 | 0 | } | 196 | 0 | ch2 = (unsigned char)s[1]; | 197 | 0 | ch3 = (unsigned char)s[2]; | 198 | 0 | ch4 = (unsigned char)s[3]; | 199 | 0 | if (!IS_CONTINUATION_BYTE(ch2)) { | 200 | | /* invalid continuation byte */ | 201 | 0 | goto InvalidContinuation1; | 202 | 0 | } | 203 | 0 | if (ch == 0xF0) { | 204 | 0 | if (ch2 < 0x90) | 205 | | /* invalid sequence | 206 | | \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */ | 207 | 0 | goto InvalidContinuation1; | 208 | 0 | } else if (ch == 0xF4 && ch2 >= 0x90) { | 209 | | /* invalid sequence | 210 | | \xF4\x90\x80\x80- -- 110000- overflow */ | 211 | 0 | goto InvalidContinuation1; | 212 | 0 | } | 213 | 0 | if (!IS_CONTINUATION_BYTE(ch3)) { | 214 | | /* invalid continuation byte */ | 215 | 0 | goto InvalidContinuation2; | 216 | 0 | } | 217 | 0 | if (!IS_CONTINUATION_BYTE(ch4)) { | 218 | | /* invalid continuation byte */ | 219 | 0 | goto InvalidContinuation3; | 220 | 0 | } | 221 | 0 | ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 - | 222 | 0 | ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80); | 223 | 0 | assert ((ch > 0xFFFF) && (ch <= 0x10FFFF)); | 224 | 0 | s += 4; | 225 | 0 | if (STRINGLIB_MAX_CHAR <= 0xFFFF || | 226 | 0 | (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR)) | 227 | | /* Out-of-range */ | 228 | 0 | goto Return; | 229 | 0 | *p++ = ch; | 230 | 0 | continue; | 231 | 0 | } | 232 | 0 | goto InvalidStart; | 233 | 0 | } | 234 | 1 | ch = 0; | 235 | 15 | Return: | 236 | 15 | *inptr = s; | 237 | 15 | *outpos = p - dest; | 238 | 15 | return ch; | 239 | 0 | InvalidStart: | 240 | 0 | ch = 1; | 241 | 0 | goto Return; | 242 | 0 | InvalidContinuation1: | 243 | 0 | ch = 2; | 244 | 0 | goto Return; | 245 
| 0 | InvalidContinuation2: | 246 | 0 | ch = 3; | 247 | 0 | goto Return; | 248 | 0 | InvalidContinuation3: | 249 | 0 | ch = 4; | 250 | 0 | goto Return; | 251 | 1 | } |
unicodeobject.c:ucs2lib_utf8_decode Line | Count | Source | 24 | 14 | { | 25 | 14 | Py_UCS4 ch; | 26 | 14 | const char *s = *inptr; | 27 | 14 | const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG); | 28 | 14 | STRINGLIB_CHAR *p = dest + *outpos; | 29 | | | 30 | 1.37k | while (s < end) { | 31 | 1.35k | ch = (unsigned char)*s; | 32 | | | 33 | 1.35k | if (ch < 0x80) { | 34 | | /* Fast path for runs of ASCII characters. Given that common UTF-8 | 35 | | input will consist of an overwhelming majority of ASCII | 36 | | characters, we try to optimize for this case by checking | 37 | | as many characters as a C 'long' can contain. | 38 | | First, check if we can do an aligned read, as most CPUs have | 39 | | a penalty for unaligned reads. | 40 | | */ | 41 | 0 | if (_Py_IS_ALIGNED(s, SIZEOF_LONG)) { | 42 | | /* Help register allocation */ | 43 | 0 | const char *_s = s; | 44 | 0 | STRINGLIB_CHAR *_p = p; | 45 | 0 | while (_s < aligned_end) { | 46 | | /* Read a whole long at a time (either 4 or 8 bytes), | 47 | | and do a fast unrolled copy if it only contains ASCII | 48 | | characters. 
*/ | 49 | 0 | unsigned long value = *(const unsigned long *) _s; | 50 | 0 | if (value & ASCII_CHAR_MASK) | 51 | 0 | break; | 52 | 0 | #if PY_LITTLE_ENDIAN | 53 | 0 | _p[0] = (STRINGLIB_CHAR)(value & 0xFFu); | 54 | 0 | _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 55 | 0 | _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 56 | 0 | _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 57 | 0 | # if SIZEOF_LONG == 8 | 58 | 0 | _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 59 | 0 | _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 60 | 0 | _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 61 | 0 | _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 62 | 0 | # endif | 63 | | #else | 64 | | # if SIZEOF_LONG == 8 | 65 | | _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); | 66 | | _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); | 67 | | _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); | 68 | | _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); | 69 | | _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 70 | | _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 71 | | _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 72 | | _p[7] = (STRINGLIB_CHAR)(value & 0xFFu); | 73 | | # else | 74 | | _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); | 75 | | _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); | 76 | | _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); | 77 | | _p[3] = (STRINGLIB_CHAR)(value & 0xFFu); | 78 | | # endif | 79 | | #endif | 80 | 0 | _s += SIZEOF_LONG; | 81 | 0 | _p += SIZEOF_LONG; | 82 | 0 | } | 83 | 0 | s = _s; | 84 | 0 | p = _p; | 85 | 0 | if (s == end) | 86 | 0 | break; | 87 | 0 | ch = (unsigned char)*s; | 88 | 0 | } | 89 | 0 | if (ch < 0x80) { | 90 | 0 | s++; | 91 | 0 | *p++ = ch; | 92 | 0 | continue; | 93 | 0 | } | 94 | 0 | } | 95 | | | 96 | 1.35k | if (ch < 0xE0) { | 97 | | /* \xC2\x80-\xDF\xBF -- 0080-07FF */ | 98 | 504 | Py_UCS4 ch2; | 99 | 504 | if (ch < 0xC2) { | 100 | | /* invalid sequence | 101 | | \x80-\xBF -- continuation byte | 102 | | \xC0-\xC1 
-- fake 0000-007F */ | 103 | 0 | goto InvalidStart; | 104 | 0 | } | 105 | 504 | if (end - s < 2) { | 106 | | /* unexpected end of data: the caller will decide whether | 107 | | it's an error or not */ | 108 | 0 | break; | 109 | 0 | } | 110 | 504 | ch2 = (unsigned char)s[1]; | 111 | 504 | if (!IS_CONTINUATION_BYTE(ch2)) | 112 | | /* invalid continuation byte */ | 113 | 0 | goto InvalidContinuation1; | 114 | 504 | ch = (ch << 6) + ch2 - | 115 | 504 | ((0xC0 << 6) + 0x80); | 116 | 504 | assert ((ch > 0x007F) && (ch <= 0x07FF)); | 117 | 504 | s += 2; | 118 | 504 | if (STRINGLIB_MAX_CHAR <= 0x007F || | 119 | 504 | (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR)) | 120 | | /* Out-of-range */ | 121 | 0 | goto Return; | 122 | 504 | *p++ = ch; | 123 | 504 | continue; | 124 | 504 | } | 125 | | | 126 | 854 | if (ch < 0xF0) { | 127 | | /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */ | 128 | 854 | Py_UCS4 ch2, ch3; | 129 | 854 | if (end - s < 3) { | 130 | | /* unexpected end of data: the caller will decide whether | 131 | | it's an error or not */ | 132 | 0 | if (end - s < 2) | 133 | 0 | break; | 134 | 0 | ch2 = (unsigned char)s[1]; | 135 | 0 | if (!IS_CONTINUATION_BYTE(ch2) || | 136 | 0 | (ch2 < 0xA0 ? ch == 0xE0 : ch == 0xED)) | 137 | | /* for clarification see comments below */ | 138 | 0 | goto InvalidContinuation1; | 139 | 0 | break; | 140 | 0 | } | 141 | 854 | ch2 = (unsigned char)s[1]; | 142 | 854 | ch3 = (unsigned char)s[2]; | 143 | 854 | if (!IS_CONTINUATION_BYTE(ch2)) { | 144 | | /* invalid continuation byte */ | 145 | 0 | goto InvalidContinuation1; | 146 | 0 | } | 147 | 854 | if (ch == 0xE0) { | 148 | 0 | if (ch2 < 0xA0) | 149 | | /* invalid sequence | 150 | | \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */ | 151 | 0 | goto InvalidContinuation1; | 152 | 854 | } else if (ch == 0xED && ch2 >= 0xA0) { | 153 | | /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF | 154 | | will result in surrogates in range D800-DFFF. 
Surrogates are | 155 | | not valid UTF-8 so they are rejected. | 156 | | See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf | 157 | | (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ | 158 | 0 | goto InvalidContinuation1; | 159 | 0 | } | 160 | 854 | if (!IS_CONTINUATION_BYTE(ch3)) { | 161 | | /* invalid continuation byte */ | 162 | 0 | goto InvalidContinuation2; | 163 | 0 | } | 164 | 854 | ch = (ch << 12) + (ch2 << 6) + ch3 - | 165 | 854 | ((0xE0 << 12) + (0x80 << 6) + 0x80); | 166 | 854 | assert ((ch > 0x07FF) && (ch <= 0xFFFF)); | 167 | 854 | s += 3; | 168 | 854 | if (STRINGLIB_MAX_CHAR <= 0x07FF || | 169 | 854 | (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR)) | 170 | | /* Out-of-range */ | 171 | 0 | goto Return; | 172 | 854 | *p++ = ch; | 173 | 854 | continue; | 174 | 854 | } | 175 | | | 176 | 0 | if (ch < 0xF5) { | 177 | | /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */ | 178 | 0 | Py_UCS4 ch2, ch3, ch4; | 179 | 0 | if (end - s < 4) { | 180 | | /* unexpected end of data: the caller will decide whether | 181 | | it's an error or not */ | 182 | 0 | if (end - s < 2) | 183 | 0 | break; | 184 | 0 | ch2 = (unsigned char)s[1]; | 185 | 0 | if (!IS_CONTINUATION_BYTE(ch2) || | 186 | 0 | (ch2 < 0x90 ? 
ch == 0xF0 : ch == 0xF4)) | 187 | | /* for clarification see comments below */ | 188 | 0 | goto InvalidContinuation1; | 189 | 0 | if (end - s < 3) | 190 | 0 | break; | 191 | 0 | ch3 = (unsigned char)s[2]; | 192 | 0 | if (!IS_CONTINUATION_BYTE(ch3)) | 193 | 0 | goto InvalidContinuation2; | 194 | 0 | break; | 195 | 0 | } | 196 | 0 | ch2 = (unsigned char)s[1]; | 197 | 0 | ch3 = (unsigned char)s[2]; | 198 | 0 | ch4 = (unsigned char)s[3]; | 199 | 0 | if (!IS_CONTINUATION_BYTE(ch2)) { | 200 | | /* invalid continuation byte */ | 201 | 0 | goto InvalidContinuation1; | 202 | 0 | } | 203 | 0 | if (ch == 0xF0) { | 204 | 0 | if (ch2 < 0x90) | 205 | | /* invalid sequence | 206 | | \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */ | 207 | 0 | goto InvalidContinuation1; | 208 | 0 | } else if (ch == 0xF4 && ch2 >= 0x90) { | 209 | | /* invalid sequence | 210 | | \xF4\x90\x80\x80- -- 110000- overflow */ | 211 | 0 | goto InvalidContinuation1; | 212 | 0 | } | 213 | 0 | if (!IS_CONTINUATION_BYTE(ch3)) { | 214 | | /* invalid continuation byte */ | 215 | 0 | goto InvalidContinuation2; | 216 | 0 | } | 217 | 0 | if (!IS_CONTINUATION_BYTE(ch4)) { | 218 | | /* invalid continuation byte */ | 219 | 0 | goto InvalidContinuation3; | 220 | 0 | } | 221 | 0 | ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 - | 222 | 0 | ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80); | 223 | 0 | assert ((ch > 0xFFFF) && (ch <= 0x10FFFF)); | 224 | 0 | s += 4; | 225 | 0 | if (STRINGLIB_MAX_CHAR <= 0xFFFF || | 226 | 0 | (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR)) | 227 | | /* Out-of-range */ | 228 | 0 | goto Return; | 229 | 0 | *p++ = ch; | 230 | 0 | continue; | 231 | 0 | } | 232 | 0 | goto InvalidStart; | 233 | 0 | } | 234 | 14 | ch = 0; | 235 | 14 | Return: | 236 | 14 | *inptr = s; | 237 | 14 | *outpos = p - dest; | 238 | 14 | return ch; | 239 | 0 | InvalidStart: | 240 | 0 | ch = 1; | 241 | 0 | goto Return; | 242 | 0 | InvalidContinuation1: | 243 | 0 | ch = 2; | 244 | 0 | goto Return; | 245 
| 0 | InvalidContinuation2: | 246 | 0 | ch = 3; | 247 | 0 | goto Return; | 248 | 0 | InvalidContinuation3: | 249 | 0 | ch = 4; | 250 | 0 | goto Return; | 251 | 14 | } |
Unexecuted instantiation: unicodeobject.c:ucs4lib_utf8_decode |
252 | | |
253 | | #undef ASCII_CHAR_MASK |
254 | | |
255 | | |
256 | | /* UTF-8 encoder specialized for a Unicode kind to avoid the slow |
257 | | PyUnicode_READ() macro. Delete some parts of the code depending on the kind: |
258 | | UCS-1 strings don't need to handle surrogates for example. */ |
259 | | Py_LOCAL_INLINE(PyObject *) |
260 | | STRINGLIB(utf8_encoder)(PyObject *unicode, |
261 | | STRINGLIB_CHAR *data, |
262 | | Py_ssize_t size, |
263 | | _Py_error_handler error_handler, |
264 | | const char *errors) |
265 | 0 | { |
266 | 0 | Py_ssize_t i; /* index into data of next input character */ |
267 | 0 | char *p; /* next free byte in output buffer */ |
268 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
269 | | PyObject *error_handler_obj = NULL; |
270 | | PyObject *exc = NULL; |
271 | | PyObject *rep = NULL; |
272 | | #endif |
273 | | #if STRINGLIB_SIZEOF_CHAR == 1 |
274 | | const Py_ssize_t max_char_size = 2; |
275 | | #elif STRINGLIB_SIZEOF_CHAR == 2 |
276 | | const Py_ssize_t max_char_size = 3; |
277 | | #else /* STRINGLIB_SIZEOF_CHAR == 4 */ |
278 | | const Py_ssize_t max_char_size = 4; |
279 | | #endif |
280 | 0 | _PyBytesWriter writer; |
281 | |
|
282 | 0 | assert(size >= 0); |
283 | 0 | _PyBytesWriter_Init(&writer); |
284 | |
|
285 | 0 | if (size > PY_SSIZE_T_MAX / max_char_size) { |
286 | | /* integer overflow */ |
287 | 0 | return PyErr_NoMemory(); |
288 | 0 | } |
289 | | |
290 | 0 | p = _PyBytesWriter_Alloc(&writer, size * max_char_size); |
291 | 0 | if (p == NULL) |
292 | 0 | return NULL; |
293 | | |
294 | 0 | for (i = 0; i < size;) { |
295 | 0 | Py_UCS4 ch = data[i++]; |
296 | |
|
297 | 0 | if (ch < 0x80) { |
298 | | /* Encode ASCII */ |
299 | 0 | *p++ = (char) ch; |
300 | |
|
301 | 0 | } |
302 | 0 | else |
303 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
304 | 0 | if (ch < 0x0800) |
305 | 0 | #endif |
306 | 0 | { |
307 | | /* Encode Latin-1 */ |
308 | 0 | *p++ = (char)(0xc0 | (ch >> 6)); |
309 | 0 | *p++ = (char)(0x80 | (ch & 0x3f)); |
310 | 0 | } |
311 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
312 | 0 | else if (Py_UNICODE_IS_SURROGATE(ch)) { |
313 | 0 | Py_ssize_t startpos, endpos, newpos; |
314 | 0 | Py_ssize_t k; |
315 | 0 | if (error_handler == _Py_ERROR_UNKNOWN) { |
316 | 0 | error_handler = _Py_GetErrorHandler(errors); |
317 | 0 | } |
318 | |
|
319 | 0 | startpos = i-1; |
320 | 0 | endpos = startpos+1; |
321 | |
|
322 | 0 | while ((endpos < size) && Py_UNICODE_IS_SURROGATE(data[endpos])) |
323 | 0 | endpos++; |
324 | | |
325 | | /* Only overallocate the buffer if it's not the last write */ |
326 | 0 | writer.overallocate = (endpos < size); |
327 | |
|
328 | 0 | switch (error_handler) |
329 | 0 | { |
330 | 0 | case _Py_ERROR_REPLACE: |
331 | 0 | memset(p, '?', endpos - startpos); |
332 | 0 | p += (endpos - startpos); |
333 | | /* fall through */ |
334 | 0 | case _Py_ERROR_IGNORE: |
335 | 0 | i += (endpos - startpos - 1); |
336 | 0 | break; |
337 | | |
338 | 0 | case _Py_ERROR_SURROGATEPASS: |
339 | 0 | for (k=startpos; k<endpos; k++) { |
340 | 0 | ch = data[k]; |
341 | 0 | *p++ = (char)(0xe0 | (ch >> 12)); |
342 | 0 | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); |
343 | 0 | *p++ = (char)(0x80 | (ch & 0x3f)); |
344 | 0 | } |
345 | 0 | i += (endpos - startpos - 1); |
346 | 0 | break; |
347 | | |
348 | 0 | case _Py_ERROR_BACKSLASHREPLACE: |
349 | | /* subtract preallocated bytes */ |
350 | 0 | writer.min_size -= max_char_size * (endpos - startpos); |
351 | 0 | p = backslashreplace(&writer, p, |
352 | 0 | unicode, startpos, endpos); |
353 | 0 | if (p == NULL) |
354 | 0 | goto error; |
355 | 0 | i += (endpos - startpos - 1); |
356 | 0 | break; |
357 | | |
358 | 0 | case _Py_ERROR_XMLCHARREFREPLACE: |
359 | | /* subtract preallocated bytes */ |
360 | 0 | writer.min_size -= max_char_size * (endpos - startpos); |
361 | 0 | p = xmlcharrefreplace(&writer, p, |
362 | 0 | unicode, startpos, endpos); |
363 | 0 | if (p == NULL) |
364 | 0 | goto error; |
365 | 0 | i += (endpos - startpos - 1); |
366 | 0 | break; |
367 | | |
368 | 0 | case _Py_ERROR_SURROGATEESCAPE: |
369 | 0 | for (k=startpos; k<endpos; k++) { |
370 | 0 | ch = data[k]; |
371 | 0 | if (!(0xDC80 <= ch && ch <= 0xDCFF)) |
372 | 0 | break; |
373 | 0 | *p++ = (char)(ch & 0xff); |
374 | 0 | } |
375 | 0 | if (k >= endpos) { |
376 | 0 | i += (endpos - startpos - 1); |
377 | 0 | break; |
378 | 0 | } |
379 | 0 | startpos = k; |
380 | 0 | assert(startpos < endpos); |
381 | | /* fall through */ |
382 | 0 | default: |
383 | 0 | rep = unicode_encode_call_errorhandler( |
384 | 0 | errors, &error_handler_obj, "utf-8", "surrogates not allowed", |
385 | 0 | unicode, &exc, startpos, endpos, &newpos); |
386 | 0 | if (!rep) |
387 | 0 | goto error; |
388 | | |
389 | | /* subtract preallocated bytes */ |
390 | 0 | writer.min_size -= max_char_size * (newpos - startpos); |
391 | |
|
392 | 0 | if (PyBytes_Check(rep)) { |
393 | 0 | p = _PyBytesWriter_WriteBytes(&writer, p, |
394 | 0 | PyBytes_AS_STRING(rep), |
395 | 0 | PyBytes_GET_SIZE(rep)); |
396 | 0 | } |
397 | 0 | else { |
398 | | /* rep is unicode */ |
399 | 0 | if (PyUnicode_READY(rep) < 0) |
400 | 0 | goto error; |
401 | | |
402 | 0 | if (!PyUnicode_IS_ASCII(rep)) { |
403 | 0 | raise_encode_exception(&exc, "utf-8", unicode, |
404 | 0 | startpos, endpos, |
405 | 0 | "surrogates not allowed"); |
406 | 0 | goto error; |
407 | 0 | } |
408 | | |
409 | 0 | p = _PyBytesWriter_WriteBytes(&writer, p, |
410 | 0 | PyUnicode_DATA(rep), |
411 | 0 | PyUnicode_GET_LENGTH(rep)); |
412 | 0 | } |
413 | | |
414 | 0 | if (p == NULL) |
415 | 0 | goto error; |
416 | 0 | Py_CLEAR(rep); |
417 | |
|
418 | 0 | i = newpos; |
419 | 0 | } |
420 | | |
421 | | /* If overallocation was disabled, ensure that it was the last |
422 | | write. Otherwise, we missed an optimization */ |
423 | 0 | assert(writer.overallocate || i == size); |
424 | 0 | } |
425 | 0 | else |
426 | | #if STRINGLIB_SIZEOF_CHAR > 2 |
427 | 0 | if (ch < 0x10000) |
428 | 0 | #endif |
429 | 0 | { |
430 | 0 | *p++ = (char)(0xe0 | (ch >> 12)); |
431 | 0 | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); |
432 | 0 | *p++ = (char)(0x80 | (ch & 0x3f)); |
433 | 0 | } |
434 | | #if STRINGLIB_SIZEOF_CHAR > 2 |
435 | | else /* ch >= 0x10000 */ |
436 | 0 | { |
437 | 0 | assert(ch <= MAX_UNICODE); |
438 | | /* Encode UCS4 Unicode ordinals */ |
439 | 0 | *p++ = (char)(0xf0 | (ch >> 18)); |
440 | 0 | *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); |
441 | 0 | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); |
442 | 0 | *p++ = (char)(0x80 | (ch & 0x3f)); |
443 | 0 | } |
444 | | #endif /* STRINGLIB_SIZEOF_CHAR > 2 */ |
445 | | #endif /* STRINGLIB_SIZEOF_CHAR > 1 */ |
446 | 0 | } |
447 | | |
448 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
449 | 0 | Py_XDECREF(error_handler_obj); |
450 | 0 | Py_XDECREF(exc); |
451 | | #endif |
452 | 0 | return _PyBytesWriter_Finish(&writer, p); |
453 | | |
454 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
455 | 0 | error: |
456 | 0 | Py_XDECREF(rep); |
457 | 0 | Py_XDECREF(error_handler_obj); |
458 | 0 | Py_XDECREF(exc); |
459 | 0 | _PyBytesWriter_Dealloc(&writer); |
460 | 0 | return NULL; |
461 | | #endif |
462 | 0 | } Unexecuted instantiation: unicodeobject.c:ucs1lib_utf8_encoder Unexecuted instantiation: unicodeobject.c:ucs2lib_utf8_encoder Unexecuted instantiation: unicodeobject.c:ucs4lib_utf8_encoder Unexecuted instantiation: unicodeobject.c:asciilib_utf8_encoder |
463 | | |
464 | | /* Multiplying a 16-bit value by this constant repeats it in every 16-bit lane of a C 'long'. */ |
465 | | #if SIZEOF_LONG == 8 |
466 | 0 | # define UCS2_REPEAT_MASK 0x0001000100010001ul |
467 | | #elif SIZEOF_LONG == 4 |
468 | | # define UCS2_REPEAT_MASK 0x00010001ul |
469 | | #else |
470 | | # error C 'long' size should be either 4 or 8! |
471 | | #endif |
472 | | |
473 | | /* The mask for fast checking; its definition depends on the output character size. */ |
474 | | #if STRINGLIB_SIZEOF_CHAR == 1 |
475 | | /* The mask for fast checking of whether a C 'long' contains any |
476 | | non-ASCII or non-Latin1 UTF16-encoded characters. */ |
477 | 0 | # define FAST_CHAR_MASK (UCS2_REPEAT_MASK * (0xFFFFu & ~STRINGLIB_MAX_CHAR)) |
478 | | #else |
479 | | /* The mask for fast checking of whether a C 'long' may contain |
480 | | UTF16-encoded surrogate characters. This is an efficient heuristic, |
481 | | assuming that non-surrogate characters with a code point >= 0x8000 are |
482 | | rare in most input. |
483 | | */ |
484 | 0 | # define FAST_CHAR_MASK (UCS2_REPEAT_MASK * 0x8000u) |
485 | | #endif |
486 | | /* The mask for fast byte-swapping: selects the low byte of every 16-bit lane. */ |
487 | 0 | #define STRIPPED_MASK (UCS2_REPEAT_MASK * 0x00FFu) |
488 | | /* Swap the two bytes inside every 16-bit lane of value. */ |
489 | 0 | #define SWAB(value) ((((value) >> 8) & STRIPPED_MASK) | \ |
490 | 0 | (((value) & STRIPPED_MASK) << 8)) |
491 | | |
/* Decode UTF-16 byte pairs from *inptr (up to, but not including, e) into
   STRINGLIB_CHAR units stored at dest + *outpos.

   native_ordering selects which byte of each pair is the high one (see ihi
   and ilo below).  On return *inptr and *outpos are updated to the positions
   reached.  The return value is:
     0  the loop ran out of complete byte pairs,
     1  a high surrogate was the last complete pair (UnexpectedEnd),
     2  a surrogate that is not a high surrogate was found where a character
        starts (IllegalEncoding),
     3  a high surrogate was not followed by a low surrogate
        (IllegalSurrogate),
   or otherwise the decoded character that does not fit into STRINGLIB_CHAR;
   in that case *inptr has already been advanced past that character and
   nothing was stored for it. */
492 | | Py_LOCAL_INLINE(Py_UCS4) |
493 | | STRINGLIB(utf16_decode)(const unsigned char **inptr, const unsigned char *e, |
494 | | STRINGLIB_CHAR *dest, Py_ssize_t *outpos, |
495 | | int native_ordering) |
496 | 0 | { |
497 | 0 | Py_UCS4 ch; |
498 | 0 | const unsigned char *aligned_end = |
499 | 0 | (const unsigned char *) _Py_ALIGN_DOWN(e, SIZEOF_LONG); |
500 | 0 | const unsigned char *q = *inptr; |
501 | 0 | STRINGLIB_CHAR *p = dest + *outpos; |
502 | | /* Offsets from q for retrieving byte pairs in the right order. */ |
503 | 0 | #if PY_LITTLE_ENDIAN |
504 | 0 | int ihi = !!native_ordering, ilo = !native_ordering; |
505 | | #else |
506 | | int ihi = !native_ordering, ilo = !!native_ordering; |
507 | | #endif |
/* Back e up by one byte so that `q < e` means a complete byte pair is
   available at q[0] and q[1]. */
508 | 0 | --e; |
509 | |

510 | 0 | while (q < e) { |
511 | 0 | Py_UCS4 ch2; |
512 | | /* First check for possible aligned read of a C 'long'. Unaligned |
513 | | reads are more expensive, better to defer to another iteration. */ |
514 | 0 | if (_Py_IS_ALIGNED(q, SIZEOF_LONG)) { |
515 | | /* Fast path for runs of in-range non-surrogate chars. */ |
516 | 0 | const unsigned char *_q = q; |
517 | 0 | while (_q < aligned_end) { |
518 | 0 | unsigned long block = * (const unsigned long *) _q; |
519 | 0 | if (native_ordering) { |
520 | | /* Can use buffer directly */ |
521 | 0 | if (block & FAST_CHAR_MASK) |
522 | 0 | break; |
523 | 0 | } |
524 | 0 | else { |
525 | | /* Need to byte-swap */ |
526 | 0 | if (block & SWAB(FAST_CHAR_MASK)) |
527 | 0 | break; |
/* With 1-byte output characters only the low byte of each swapped unit
   survives the STRINGLIB_CHAR casts below, so a plain shift replaces the
   full SWAB(). */
528 | | #if STRINGLIB_SIZEOF_CHAR == 1 |
529 | 0 | block >>= 8; |
530 | | #else |
531 | 0 | block = SWAB(block); |
532 | | #endif |
533 | 0 | } |
/* Unpack the 16-bit units of the word in memory order. */
534 | 0 | #if PY_LITTLE_ENDIAN |
535 | | # if SIZEOF_LONG == 4 |
536 | | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); |
537 | | p[1] = (STRINGLIB_CHAR)(block >> 16); |
538 | | # elif SIZEOF_LONG == 8 |
539 | 0 | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); |
540 | 0 | p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); |
541 | 0 | p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); |
542 | 0 | p[3] = (STRINGLIB_CHAR)(block >> 48); |
543 | 0 | # endif |
544 | | #else |
545 | | # if SIZEOF_LONG == 4 |
546 | | p[0] = (STRINGLIB_CHAR)(block >> 16); |
547 | | p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu); |
548 | | # elif SIZEOF_LONG == 8 |
549 | | p[0] = (STRINGLIB_CHAR)(block >> 48); |
550 | | p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); |
551 | | p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); |
552 | | p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu); |
553 | | # endif |
554 | | #endif |
555 | 0 | _q += SIZEOF_LONG; |
556 | 0 | p += SIZEOF_LONG / 2; |
557 | 0 | } |
558 | 0 | q = _q; |
559 | 0 | if (q >= e) |
560 | 0 | break; |
561 | 0 | } |
562 | | |
/* Slow path: decode a single 16-bit unit. */
563 | 0 | ch = (q[ihi] << 8) | q[ilo]; |
564 | 0 | q += 2; |
565 | 0 | if (!Py_UNICODE_IS_SURROGATE(ch)) { |
566 | | #if STRINGLIB_SIZEOF_CHAR < 2 |
567 | 0 | if (ch > STRINGLIB_MAX_CHAR) |
568 | | /* Out-of-range */ |
569 | 0 | goto Return; |
570 | 0 | #endif |
571 | 0 | *p++ = (STRINGLIB_CHAR)ch; |
572 | 0 | continue; |
573 | 0 | } |
574 | | |
575 | | /* UTF-16 code pair: */ |
576 | 0 | if (!Py_UNICODE_IS_HIGH_SURROGATE(ch)) |
577 | 0 | goto IllegalEncoding; |
578 | 0 | if (q >= e) |
579 | 0 | goto UnexpectedEnd; |
580 | 0 | ch2 = (q[ihi] << 8) | q[ilo]; |
581 | 0 | q += 2; |
582 | 0 | if (!Py_UNICODE_IS_LOW_SURROGATE(ch2)) |
583 | 0 | goto IllegalSurrogate; |
584 | 0 | ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2); |
585 | | #if STRINGLIB_SIZEOF_CHAR < 4 |
586 | | /* Out-of-range */ |
587 | 0 | goto Return; |
588 | | #else |
589 | | *p++ = (STRINGLIB_CHAR)ch; |
590 | | #endif |
591 | 0 | } |
592 | 0 | ch = 0; |
/* Common exit: publish the input and output positions reached so far. */
593 | 0 | Return: |
594 | 0 | *inptr = q; |
595 | 0 | *outpos = p - dest; |
596 | 0 | return ch; |
597 | 0 | UnexpectedEnd: |
598 | 0 | ch = 1; |
599 | 0 | goto Return; |
600 | 0 | IllegalEncoding: |
601 | 0 | ch = 2; |
602 | 0 | goto Return; |
603 | 0 | IllegalSurrogate: |
604 | 0 | ch = 3; |
605 | 0 | goto Return; |
606 | 0 | } Unexecuted instantiation: unicodeobject.c:asciilib_utf16_decode Unexecuted instantiation: unicodeobject.c:ucs1lib_utf16_decode Unexecuted instantiation: unicodeobject.c:ucs2lib_utf16_decode Unexecuted instantiation: unicodeobject.c:ucs4lib_utf16_decode |
607 | | #undef UCS2_REPEAT_MASK |
608 | | #undef FAST_CHAR_MASK |
609 | | #undef STRIPPED_MASK |
610 | | #undef SWAB |
611 | | |
612 | | |
613 | | #if STRINGLIB_MAX_CHAR >= 0x80 |
614 | | Py_LOCAL_INLINE(Py_ssize_t) |
615 | | STRINGLIB(utf16_encode)(const STRINGLIB_CHAR *in, |
616 | | Py_ssize_t len, |
617 | | unsigned short **outptr, |
618 | | int native_ordering) |
619 | 0 | { |
620 | 0 | unsigned short *out = *outptr; |
621 | 0 | const STRINGLIB_CHAR *end = in + len; |
622 | | #if STRINGLIB_SIZEOF_CHAR == 1 |
623 | 0 | if (native_ordering) { |
624 | 0 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); |
625 | 0 | while (in < unrolled_end) { |
626 | 0 | out[0] = in[0]; |
627 | 0 | out[1] = in[1]; |
628 | 0 | out[2] = in[2]; |
629 | 0 | out[3] = in[3]; |
630 | 0 | in += 4; out += 4; |
631 | 0 | } |
632 | 0 | while (in < end) { |
633 | 0 | *out++ = *in++; |
634 | 0 | } |
635 | 0 | } else { |
636 | 0 | # define SWAB2(CH) ((CH) << 8) /* high byte is zero */ |
637 | 0 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); |
638 | 0 | while (in < unrolled_end) { |
639 | 0 | out[0] = SWAB2(in[0]); |
640 | 0 | out[1] = SWAB2(in[1]); |
641 | 0 | out[2] = SWAB2(in[2]); |
642 | 0 | out[3] = SWAB2(in[3]); |
643 | 0 | in += 4; out += 4; |
644 | 0 | } |
645 | 0 | while (in < end) { |
646 | 0 | Py_UCS4 ch = *in++; |
647 | 0 | *out++ = SWAB2((Py_UCS2)ch); |
648 | 0 | } |
649 | 0 | #undef SWAB2 |
650 | 0 | } |
651 | | *outptr = out; |
652 | | return len; |
653 | | #else |
654 | 0 | if (native_ordering) { |
655 | | #if STRINGLIB_MAX_CHAR < 0x10000 |
656 | 0 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); |
657 | 0 | while (in < unrolled_end) { |
658 | | /* check if any character is a surrogate character */ |
659 | 0 | if (((in[0] ^ 0xd800) & |
660 | 0 | (in[1] ^ 0xd800) & |
661 | 0 | (in[2] ^ 0xd800) & |
662 | 0 | (in[3] ^ 0xd800) & 0xf800) == 0) |
663 | 0 | break; |
664 | 0 | out[0] = in[0]; |
665 | 0 | out[1] = in[1]; |
666 | 0 | out[2] = in[2]; |
667 | 0 | out[3] = in[3]; |
668 | 0 | in += 4; out += 4; |
669 | 0 | } |
670 | | #endif |
671 | 0 | while (in < end) { |
672 | 0 | Py_UCS4 ch; |
673 | 0 | ch = *in++; |
674 | 0 | if (ch < 0xd800) |
675 | 0 | *out++ = ch; |
676 | 0 | else if (ch < 0xe000) |
677 | | /* reject surrogate characters (U+D800-U+DFFF) */ |
678 | 0 | goto fail; |
679 | | #if STRINGLIB_MAX_CHAR >= 0x10000 |
680 | 0 | else if (ch >= 0x10000) { |
681 | 0 | out[0] = Py_UNICODE_HIGH_SURROGATE(ch); |
682 | 0 | out[1] = Py_UNICODE_LOW_SURROGATE(ch); |
683 | 0 | out += 2; |
684 | 0 | } |
685 | 0 | #endif |
686 | 0 | else |
687 | 0 | *out++ = ch; |
688 | 0 | } |
689 | 0 | } else { |
690 | 0 | #define SWAB2(CH) (((CH) << 8) | ((CH) >> 8)) |
691 | | #if STRINGLIB_MAX_CHAR < 0x10000 |
692 | 0 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); |
693 | 0 | while (in < unrolled_end) { |
694 | | /* check if any character is a surrogate character */ |
695 | 0 | if (((in[0] ^ 0xd800) & |
696 | 0 | (in[1] ^ 0xd800) & |
697 | 0 | (in[2] ^ 0xd800) & |
698 | 0 | (in[3] ^ 0xd800) & 0xf800) == 0) |
699 | 0 | break; |
700 | 0 | out[0] = SWAB2(in[0]); |
701 | 0 | out[1] = SWAB2(in[1]); |
702 | 0 | out[2] = SWAB2(in[2]); |
703 | 0 | out[3] = SWAB2(in[3]); |
704 | 0 | in += 4; out += 4; |
705 | 0 | } |
706 | | #endif |
707 | 0 | while (in < end) { |
708 | 0 | Py_UCS4 ch = *in++; |
709 | 0 | if (ch < 0xd800) |
710 | 0 | *out++ = SWAB2((Py_UCS2)ch); |
711 | 0 | else if (ch < 0xe000) |
712 | | /* reject surrogate characters (U+D800-U+DFFF) */ |
713 | 0 | goto fail; |
714 | | #if STRINGLIB_MAX_CHAR >= 0x10000 |
715 | 0 | else if (ch >= 0x10000) { |
716 | 0 | Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch); |
717 | 0 | Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch); |
718 | 0 | out[0] = SWAB2(ch1); |
719 | 0 | out[1] = SWAB2(ch2); |
720 | 0 | out += 2; |
721 | 0 | } |
722 | 0 | #endif |
723 | 0 | else |
724 | 0 | *out++ = SWAB2((Py_UCS2)ch); |
725 | 0 | } |
726 | 0 | #undef SWAB2 |
727 | 0 | } |
728 | 0 | *outptr = out; |
729 | 0 | return len; |
730 | 0 | fail: |
731 | 0 | *outptr = out; |
732 | 0 | return len - (end - in + 1); |
733 | | #endif |
734 | 0 | } Unexecuted instantiation: unicodeobject.c:ucs1lib_utf16_encode Unexecuted instantiation: unicodeobject.c:ucs2lib_utf16_encode Unexecuted instantiation: unicodeobject.c:ucs4lib_utf16_encode |
735 | | |
/* SWAB4(CH, tmp): the value CH with its four bytes in reversed order, viewed
   as a 32-bit quantity.  The 2- and 4-byte variants use tmp as a scratch
   variable that they assign to; the 1-byte variant ignores it.
   NOTE(review): in the 1-byte variant, if CH promotes to int and is >= 0x80,
   the shift by 24 reaches the sign bit; consider casting CH to an unsigned
   32-bit type first -- confirm. */
736 | | #if STRINGLIB_SIZEOF_CHAR == 1 |
737 | 0 | # define SWAB4(CH, tmp) ((CH) << 24) /* high bytes are zero */ |
738 | | #elif STRINGLIB_SIZEOF_CHAR == 2 |
739 | 0 | # define SWAB4(CH, tmp) (tmp = (CH), \ |
740 | 0 | ((tmp & 0x00FFu) << 24) + ((tmp & 0xFF00u) << 8)) |
741 | | /* high bytes are zero */ |
742 | | #else |
/* Swap the bytes inside each 16-bit half first, then swap the two halves. */
743 | 0 | # define SWAB4(CH, tmp) (tmp = (CH), \ |
744 | 0 | tmp = ((tmp & 0x00FF00FFu) << 8) + ((tmp >> 8) & 0x00FF00FFu), \ |
745 | 0 | ((tmp & 0x0000FFFFu) << 16) + ((tmp >> 16) & 0x0000FFFFu)) |
746 | | #endif |
747 | | Py_LOCAL_INLINE(Py_ssize_t) |
748 | | STRINGLIB(utf32_encode)(const STRINGLIB_CHAR *in, |
749 | | Py_ssize_t len, |
750 | | PY_UINT32_T **outptr, |
751 | | int native_ordering) |
752 | 0 | { |
753 | 0 | PY_UINT32_T *out = *outptr; |
754 | 0 | const STRINGLIB_CHAR *end = in + len; |
755 | 0 | if (native_ordering) { |
756 | 0 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); |
757 | 0 | while (in < unrolled_end) { |
758 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
759 | | /* check if any character is a surrogate character */ |
760 | 0 | if (((in[0] ^ 0xd800) & |
761 | 0 | (in[1] ^ 0xd800) & |
762 | 0 | (in[2] ^ 0xd800) & |
763 | 0 | (in[3] ^ 0xd800) & 0xf800) == 0) |
764 | 0 | break; |
765 | 0 | #endif |
766 | 0 | out[0] = in[0]; |
767 | 0 | out[1] = in[1]; |
768 | 0 | out[2] = in[2]; |
769 | 0 | out[3] = in[3]; |
770 | 0 | in += 4; out += 4; |
771 | 0 | } |
772 | 0 | while (in < end) { |
773 | 0 | Py_UCS4 ch; |
774 | 0 | ch = *in++; |
775 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
776 | 0 | if (Py_UNICODE_IS_SURROGATE(ch)) { |
777 | | /* reject surrogate characters (U+D800-U+DFFF) */ |
778 | 0 | goto fail; |
779 | 0 | } |
780 | 0 | #endif |
781 | 0 | *out++ = ch; |
782 | 0 | } |
783 | 0 | } else { |
784 | 0 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); |
785 | 0 | while (in < unrolled_end) { |
786 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
787 | | Py_UCS4 ch1, ch2, ch3, ch4; |
788 | | /* check if any character is a surrogate character */ |
789 | 0 | if (((in[0] ^ 0xd800) & |
790 | 0 | (in[1] ^ 0xd800) & |
791 | 0 | (in[2] ^ 0xd800) & |
792 | 0 | (in[3] ^ 0xd800) & 0xf800) == 0) |
793 | 0 | break; |
794 | 0 | #endif |
795 | 0 | out[0] = SWAB4(in[0], ch1); |
796 | 0 | out[1] = SWAB4(in[1], ch2); |
797 | 0 | out[2] = SWAB4(in[2], ch3); |
798 | 0 | out[3] = SWAB4(in[3], ch4); |
799 | 0 | in += 4; out += 4; |
800 | 0 | } |
801 | 0 | while (in < end) { |
802 | 0 | Py_UCS4 ch = *in++; |
803 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
804 | 0 | if (Py_UNICODE_IS_SURROGATE(ch)) { |
805 | | /* reject surrogate characters (U+D800-U+DFFF) */ |
806 | 0 | goto fail; |
807 | 0 | } |
808 | 0 | #endif |
809 | 0 | *out++ = SWAB4(ch, ch); |
810 | 0 | } |
811 | 0 | } |
812 | 0 | *outptr = out; |
813 | 0 | return len; |
814 | | #if STRINGLIB_SIZEOF_CHAR > 1 |
815 | 0 | fail: |
816 | 0 | *outptr = out; |
817 | 0 | return len - (end - in + 1); |
818 | | #endif |
819 | 0 | } Unexecuted instantiation: unicodeobject.c:ucs1lib_utf32_encode Unexecuted instantiation: unicodeobject.c:ucs2lib_utf32_encode Unexecuted instantiation: unicodeobject.c:ucs4lib_utf32_encode |
820 | | #undef SWAB4 |
821 | | |
822 | | #endif |