/src/Python-3.8.3/Objects/stringlib/codecs.h
Line  | Count  | Source  | 
1  |  | /* stringlib: codec implementations */  | 
2  |  |  | 
3  |  | #if !STRINGLIB_IS_UNICODE  | 
4  |  | # error "codecs.h is specific to Unicode"  | 
5  |  | #endif  | 
6  |  |  | 
7  |  | /* Mask with the top bit of every byte of a C 'long' set.  ANDing it with  | 
8  |  |    a 'long' loaded from the input gives a non-zero result exactly when  | 
   |  |    at least one of those bytes is >= 0x80, i.e. is not an ASCII  | 
   |  |    character.  The constant is selected to match SIZEOF_LONG. */  | 
9  |  | #if (SIZEOF_LONG == 8)  | 
10  | 585  | # define ASCII_CHAR_MASK 0x8080808080808080UL  | 
11  |  | #elif (SIZEOF_LONG == 4)  | 
12  |  | # define ASCII_CHAR_MASK 0x80808080UL  | 
13  |  | #else  | 
14  |  | # error C 'long' size should be either 4 or 8!  | 
15  |  | #endif  | 
16  |  |  | 
17  |  | /* True when ch is a UTF-8 continuation byte: bit pattern 10xxxxxx,  | 
    |  |    i.e. 0x80 <= ch <= 0xBF.  Note that ch is evaluated twice. */  | 
18  | 2.66k  | #define IS_CONTINUATION_BYTE(ch) ((ch) >= 0x80 && (ch) < 0xC0)  | 
19  |  | /* Decode UTF-8 bytes from *inptr up to (not including) end, storing one  | 
    |  |    STRINGLIB_CHAR per decoded character into dest, starting at index  | 
    |  |    *outpos.  On return, *inptr points at the first byte that was not  | 
    |  |    consumed and *outpos is the index just past the last character stored.  | 
    |  |    No bounds check is done on dest; presumably the caller sizes it for  | 
    |  |    the worst case -- TODO confirm against the callers.  | 
    |  |  | 
    |  |    Return value:  | 
    |  |      0     the loop ended: all input was consumed, or the input stops in  | 
    |  |            the middle of a sequence (*inptr is left at its start byte and  | 
    |  |            the caller decides whether that is an error)  | 
    |  |      1     the byte at *inptr cannot start a sequence  | 
    |  |      2-4   the 1st, 2nd or 3rd continuation byte of the sequence starting  | 
    |  |            at *inptr is invalid (overlong forms, surrogates and values  | 
    |  |            above 10FFFF are all reported as 2)  | 
    |  |      else  a decoded character that is larger than STRINGLIB_MAX_CHAR; its  | 
    |  |            bytes have been consumed but it was NOT stored into dest.  | 
    |  |  */  | 
20  |  | Py_LOCAL_INLINE(Py_UCS4)  | 
21  |  | STRINGLIB(utf8_decode)(const char **inptr, const char *end,  | 
22  |  |                        STRINGLIB_CHAR *dest,  | 
23  |  |                        Py_ssize_t *outpos)  | 
24  | 44  | { | 
25  | 44  |     Py_UCS4 ch;  | 
26  | 44  |     const char *s = *inptr;  | 
27  | 44  |     const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);  | 
28  | 44  |     STRINGLIB_CHAR *p = dest + *outpos;  | 
29  |  |  | 
30  | 1.81k  |     while (s < end) { | 
31  | 1.79k  |         ch = (unsigned char)*s;  | 
32  |  |  | 
33  | 1.79k  |         if (ch < 0x80) { | 
34  |  |             /* Fast path for runs of ASCII characters. Given that common UTF-8  | 
35  |  |                input will consist of an overwhelming majority of ASCII  | 
36  |  |                characters, we try to optimize for this case by checking  | 
37  |  |                as many characters as a C 'long' can contain.  | 
38  |  |                First, check if we can do an aligned read, as most CPUs have  | 
39  |  |                a penalty for unaligned reads.  | 
40  |  |             */  | 
41  | 4  |             if (_Py_IS_ALIGNED(s, SIZEOF_LONG)) { | 
42  |  |                 /* Work on local copies of s and p inside the hot loop (helps register allocation); they are copied back right after it */  | 
43  | 1  |                 const char *_s = s;  | 
44  | 1  |                 STRINGLIB_CHAR *_p = p;  | 
45  | 586  |                 while (_s < aligned_end) { | 
46  |  |                     /* Read a whole long at a time (either 4 or 8 bytes),  | 
47  |  |                        and do a fast unrolled copy if it only contains ASCII  | 
48  |  |                        characters.  If any byte is non-ASCII, leave the loop  | 
    |  |                        without consuming the long, so that its bytes are  | 
    |  |                        decoded one at a time by the code below.  | 
    |  |                        NOTE(review): the char buffer is read through an  | 
    |  |                        'unsigned long' lvalue.  Alignment is covered by the  | 
    |  |                        _Py_IS_ALIGNED() test above; whether the build may  | 
    |  |                        assume strict aliasing here should be confirmed. */  | 
49  | 585  |                     unsigned long value = *(const unsigned long *) _s;  | 
50  | 585  |                     if (value & ASCII_CHAR_MASK)  | 
51  | 0  |                         break;  | 
52  | 585  | #if PY_LITTLE_ENDIAN  /* the first input byte is the least significant one */  | 
53  | 585  |                     _p[0] = (STRINGLIB_CHAR)(value & 0xFFu);  | 
54  | 585  |                     _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);  | 
55  | 585  |                     _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);  | 
56  | 585  |                     _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);  | 
57  | 585  | # if SIZEOF_LONG == 8  | 
58  | 585  |                     _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);  | 
59  | 585  |                     _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);  | 
60  | 585  |                     _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);  | 
61  | 585  |                     _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);  | 
62  | 585  | # endif  | 
63  |  | #else  /* big endian: the first input byte is the most significant one */  | 
64  |  | # if SIZEOF_LONG == 8  | 
65  |  |                     _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);  | 
66  |  |                     _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);  | 
67  |  |                     _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);  | 
68  |  |                     _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);  | 
69  |  |                     _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);  | 
70  |  |                     _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);  | 
71  |  |                     _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);  | 
72  |  |                     _p[7] = (STRINGLIB_CHAR)(value & 0xFFu);  | 
73  |  | # else  | 
74  |  |                     _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);  | 
75  |  |                     _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);  | 
76  |  |                     _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);  | 
77  |  |                     _p[3] = (STRINGLIB_CHAR)(value & 0xFFu);  | 
78  |  | # endif  | 
79  |  | #endif  | 
80  | 585  |                     _s += SIZEOF_LONG;  | 
81  | 585  |                     _p += SIZEOF_LONG;  | 
82  | 585  |                 }  | 
83  | 1  |                 s = _s;  | 
84  | 1  |                 p = _p;  | 
85  | 1  |                 if (s == end)  | 
86  | 0  |                     break;  | 
87  | 1  |                 ch = (unsigned char)*s;  /* reload: s may have moved past the byte read at the loop top */  | 
88  | 1  |             }  | 
89  | 4  |             if (ch < 0x80) {  /* re-tested because ch may have been reloaded just above */ | 
90  | 4  |                 s++;  | 
91  | 4  |                 *p++ = ch;  | 
92  | 4  |                 continue;  | 
93  | 4  |             }  | 
94  | 4  |         }  | 
95  |  |  | 
96  | 1.79k  |         if (ch < 0xE0) { | 
97  |  |             /* 2-byte sequence: \xC2\x80-\xDF\xBF -- 0080-07FF */  | 
98  | 925  |             Py_UCS4 ch2;  | 
99  | 925  |             if (ch < 0xC2) { | 
100  |  |                 /* invalid start byte:  | 
101  |  |                 \x80-\xBF -- continuation byte  | 
102  |  |                 \xC0-\xC1 -- would be an overlong ("fake") encoding of 0000-007F */  | 
103  | 0  |                 goto InvalidStart;  | 
104  | 0  |             }  | 
105  | 925  |             if (end - s < 2) { | 
106  |  |                 /* unexpected end of data: the caller will decide whether  | 
107  |  |                    it's an error or not */  | 
108  | 0  |                 break;  | 
109  | 0  |             }  | 
110  | 925  |             ch2 = (unsigned char)s[1];  | 
111  | 925  |             if (!IS_CONTINUATION_BYTE(ch2))  | 
112  |  |                 /* invalid continuation byte */  | 
113  | 0  |                 goto InvalidContinuation1;  | 
114  | 925  |             ch = (ch << 6) + ch2 -  | 
115  | 925  |                  ((0xC0 << 6) + 0x80);  /* the subtraction removes the 110xxxxx and 10xxxxxx marker bits; the longer forms below work the same way */  | 
116  | 925  |             assert ((ch > 0x007F) && (ch <= 0x07FF));  | 
117  | 925  |             s += 2;  | 
118  | 925  |             if (STRINGLIB_MAX_CHAR <= 0x007F ||  | 
119  | 406  |                 (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR))  | 
120  |  |                 /* Out-of-range for STRINGLIB_CHAR: hand the character back to the caller; it is already consumed (s was advanced) but not stored */  | 
121  | 15  |                 goto Return;  | 
122  | 910  |             *p++ = ch;  | 
123  | 910  |             continue;  | 
124  | 925  |         }  | 
125  |  |  | 
126  | 868  |         if (ch < 0xF0) { | 
127  |  |             /* 3-byte sequence: \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */  | 
128  | 868  |             Py_UCS4 ch2, ch3;  | 
129  | 868  |             if (end - s < 3) { | 
130  |  |                 /* unexpected end of data: the caller will decide whether  | 
131  |  |                    it's an error or not.  A 2nd byte that is already present  | 
    |  |                    is still validated first. */  | 
132  | 0  |                 if (end - s < 2)  | 
133  | 0  |                     break;  | 
134  | 0  |                 ch2 = (unsigned char)s[1];  | 
135  | 0  |                 if (!IS_CONTINUATION_BYTE(ch2) ||  | 
136  | 0  |                     (ch2 < 0xA0 ? ch == 0xE0 : ch == 0xED))  | 
137  |  |                     /* same checks on ch2 as in the complete-sequence case  | 
    |  |                        below: not a continuation byte, an overlong form  | 
    |  |                        (\xE0 with ch2 < \xA0) or a surrogate  | 
    |  |                        (\xED with ch2 >= \xA0) */  | 
138  | 0  |                     goto InvalidContinuation1;  | 
139  | 0  |                 break;  | 
140  | 0  |             }  | 
141  | 868  |             ch2 = (unsigned char)s[1];  | 
142  | 868  |             ch3 = (unsigned char)s[2];  | 
143  | 868  |             if (!IS_CONTINUATION_BYTE(ch2)) { | 
144  |  |                 /* invalid continuation byte */  | 
145  | 0  |                 goto InvalidContinuation1;  | 
146  | 0  |             }  | 
147  | 868  |             if (ch == 0xE0) { | 
148  | 0  |                 if (ch2 < 0xA0)  | 
149  |  |                     /* invalid sequence  | 
150  |  |                        \xE0\x80\x80-\xE0\x9F\xBF -- overlong ("fake") encoding of 0000-07FF */  | 
151  | 0  |                     goto InvalidContinuation1;  | 
152  | 868  |             } else if (ch == 0xED && ch2 >= 0xA0) { | 
153  |  |                 /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF  | 
154  |  |                    will result in surrogates in range D800-DFFF. Surrogates are  | 
155  |  |                    not valid UTF-8 so they are rejected.  | 
156  |  |                    See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf  | 
157  |  |                    (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */  | 
158  | 0  |                 goto InvalidContinuation1;  | 
159  | 0  |             }  | 
160  | 868  |             if (!IS_CONTINUATION_BYTE(ch3)) { | 
161  |  |                 /* invalid continuation byte */  | 
162  | 0  |                 goto InvalidContinuation2;  | 
163  | 0  |             }  | 
164  | 868  |             ch = (ch << 12) + (ch2 << 6) + ch3 -  | 
165  | 868  |                  ((0xE0 << 12) + (0x80 << 6) + 0x80);  | 
166  | 868  |             assert ((ch > 0x07FF) && (ch <= 0xFFFF));  | 
167  | 868  |             s += 3;  | 
168  | 868  |             if (STRINGLIB_MAX_CHAR <= 0x07FF ||  | 
169  | 0  |                 (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR))  | 
170  |  |                 /* Out-of-range for STRINGLIB_CHAR: consumed but not stored */  | 
171  | 14  |                 goto Return;  | 
172  | 854  |             *p++ = ch;  | 
173  | 854  |             continue;  | 
174  | 868  |         }  | 
175  |  |  | 
176  | 0  |         if (ch < 0xF5) { | 
177  |  |             /* 4-byte sequence: \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */  | 
178  | 0  |             Py_UCS4 ch2, ch3, ch4;  | 
179  | 0  |             if (end - s < 4) { | 
180  |  |                 /* unexpected end of data: the caller will decide whether  | 
181  |  |                    it's an error or not.  The continuation bytes that are  | 
    |  |                    already present are still validated first. */  | 
182  | 0  |                 if (end - s < 2)  | 
183  | 0  |                     break;  | 
184  | 0  |                 ch2 = (unsigned char)s[1];  | 
185  | 0  |                 if (!IS_CONTINUATION_BYTE(ch2) ||  | 
186  | 0  |                     (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4))  | 
187  |  |                     /* same checks on ch2 as in the complete-sequence case  | 
    |  |                        below: not a continuation byte, an overlong form  | 
    |  |                        (\xF0 with ch2 < \x90) or a value above 10FFFF  | 
    |  |                        (\xF4 with ch2 >= \x90) */  | 
188  | 0  |                     goto InvalidContinuation1;  | 
189  | 0  |                 if (end - s < 3)  | 
190  | 0  |                     break;  | 
191  | 0  |                 ch3 = (unsigned char)s[2];  | 
192  | 0  |                 if (!IS_CONTINUATION_BYTE(ch3))  | 
193  | 0  |                     goto InvalidContinuation2;  | 
194  | 0  |                 break;  | 
195  | 0  |             }  | 
196  | 0  |             ch2 = (unsigned char)s[1];  | 
197  | 0  |             ch3 = (unsigned char)s[2];  | 
198  | 0  |             ch4 = (unsigned char)s[3];  | 
199  | 0  |             if (!IS_CONTINUATION_BYTE(ch2)) { | 
200  |  |                 /* invalid continuation byte */  | 
201  | 0  |                 goto InvalidContinuation1;  | 
202  | 0  |             }  | 
203  | 0  |             if (ch == 0xF0) { | 
204  | 0  |                 if (ch2 < 0x90)  | 
205  |  |                     /* invalid sequence  | 
206  |  |                        \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- overlong ("fake") encoding of 0000-FFFF */  | 
207  | 0  |                     goto InvalidContinuation1;  | 
208  | 0  |             } else if (ch == 0xF4 && ch2 >= 0x90) { | 
209  |  |                 /* invalid sequence  | 
210  |  |                    \xF4\x90\x80\x80- -- 110000- overflow (above the last code point 10FFFF) */  | 
211  | 0  |                 goto InvalidContinuation1;  | 
212  | 0  |             }  | 
213  | 0  |             if (!IS_CONTINUATION_BYTE(ch3)) { | 
214  |  |                 /* invalid continuation byte */  | 
215  | 0  |                 goto InvalidContinuation2;  | 
216  | 0  |             }  | 
217  | 0  |             if (!IS_CONTINUATION_BYTE(ch4)) { | 
218  |  |                 /* invalid continuation byte */  | 
219  | 0  |                 goto InvalidContinuation3;  | 
220  | 0  |             }  | 
221  | 0  |             ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 -  | 
222  | 0  |                  ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80);  | 
223  | 0  |             assert ((ch > 0xFFFF) && (ch <= 0x10FFFF));  | 
224  | 0  |             s += 4;  | 
225  | 0  |             if (STRINGLIB_MAX_CHAR <= 0xFFFF ||  | 
226  | 0  |                 (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR))  | 
227  |  |                 /* Out-of-range for STRINGLIB_CHAR: consumed but not stored */  | 
228  | 0  |                 goto Return;  | 
229  | 0  |             *p++ = ch;  | 
230  | 0  |             continue;  | 
231  | 0  |         }  | 
232  | 0  |         goto InvalidStart;  /* \xF5-\xFF never start a sequence */  | 
233  | 0  |     }  | 
234  | 15  |     ch = 0;  /* loop left normally or via break: no error to report */  | 
235  | 44  | Return:  | 
236  | 44  |     *inptr = s;  | 
237  | 44  |     *outpos = p - dest;  | 
238  | 44  |     return ch;  | 
239  | 0  | InvalidStart:  /* s still points at the offending start byte */  | 
240  | 0  |     ch = 1;  | 
241  | 0  |     goto Return;  | 
242  | 0  | InvalidContinuation1:  /* bad 1st continuation byte; s is still at the start byte */  | 
243  | 0  |     ch = 2;  | 
244  | 0  |     goto Return;  | 
245  | 0  | InvalidContinuation2:  /* bad 2nd continuation byte */  | 
246  | 0  |     ch = 3;  | 
247  | 0  |     goto Return;  | 
248  | 0  | InvalidContinuation3:  /* bad 3rd continuation byte */  | 
249  | 0  |     ch = 4;  | 
250  | 0  |     goto Return;  | 
251  | 15  | } unicodeobject.c:asciilib_utf8_decode Line  | Count  | Source  |  24  | 15  | { |  25  | 15  |     Py_UCS4 ch;  |  26  | 15  |     const char *s = *inptr;  |  27  | 15  |     const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);  |  28  | 15  |     STRINGLIB_CHAR *p = dest + *outpos;  |  29  |  |  |  30  | 15  |     while (s < end) { |  31  | 15  |         ch = (unsigned char)*s;  |  32  |  |  |  33  | 15  |         if (ch < 0x80) { |  34  |  |             /* Fast path for runs of ASCII characters. Given that common UTF-8  |  35  |  |                input will consist of an overwhelming majority of ASCII  |  36  |  |                characters, we try to optimize for this case by checking  |  37  |  |                as many characters as a C 'long' can contain.  |  38  |  |                First, check if we can do an aligned read, as most CPUs have  |  39  |  |                a penalty for unaligned reads.  |  40  |  |             */  |  41  | 0  |             if (_Py_IS_ALIGNED(s, SIZEOF_LONG)) { |  42  |  |                 /* Help register allocation */  |  43  | 0  |                 const char *_s = s;  |  44  | 0  |                 STRINGLIB_CHAR *_p = p;  |  45  | 0  |                 while (_s < aligned_end) { |  46  |  |                     /* Read a whole long at a time (either 4 or 8 bytes),  |  47  |  |                        and do a fast unrolled copy if it only contains ASCII  |  48  |  |                        characters. 
*/  |  49  | 0  |                     unsigned long value = *(const unsigned long *) _s;  |  50  | 0  |                     if (value & ASCII_CHAR_MASK)  |  51  | 0  |                         break;  |  52  | 0  | #if PY_LITTLE_ENDIAN  |  53  | 0  |                     _p[0] = (STRINGLIB_CHAR)(value & 0xFFu);  |  54  | 0  |                     _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);  |  55  | 0  |                     _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);  |  56  | 0  |                     _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);  |  57  | 0  | # if SIZEOF_LONG == 8  |  58  | 0  |                     _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);  |  59  | 0  |                     _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);  |  60  | 0  |                     _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);  |  61  | 0  |                     _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);  |  62  | 0  | # endif  |  63  |  | #else  |  64  |  | # if SIZEOF_LONG == 8  |  65  |  |                     _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);  |  66  |  |                     _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);  |  67  |  |                     _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);  |  68  |  |                     _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);  |  69  |  |                     _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);  |  70  |  |                     _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);  |  71  |  |                     _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);  |  72  |  |                     _p[7] = (STRINGLIB_CHAR)(value & 0xFFu);  |  73  |  | # else  |  74  |  |                     _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);  |  75  |  |                     _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);  |  76  |  |                     _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);  |  77  |  |                     _p[3] = (STRINGLIB_CHAR)(value & 0xFFu);  
|  78  |  | # endif  |  79  |  | #endif  |  80  | 0  |                     _s += SIZEOF_LONG;  |  81  | 0  |                     _p += SIZEOF_LONG;  |  82  | 0  |                 }  |  83  | 0  |                 s = _s;  |  84  | 0  |                 p = _p;  |  85  | 0  |                 if (s == end)  |  86  | 0  |                     break;  |  87  | 0  |                 ch = (unsigned char)*s;  |  88  | 0  |             }  |  89  | 0  |             if (ch < 0x80) { |  90  | 0  |                 s++;  |  91  | 0  |                 *p++ = ch;  |  92  | 0  |                 continue;  |  93  | 0  |             }  |  94  | 0  |         }  |  95  |  |  |  96  | 15  |         if (ch < 0xE0) { |  97  |  |             /* \xC2\x80-\xDF\xBF -- 0080-07FF */  |  98  | 15  |             Py_UCS4 ch2;  |  99  | 15  |             if (ch < 0xC2) { |  100  |  |                 /* invalid sequence  |  101  |  |                 \x80-\xBF -- continuation byte  |  102  |  |                 \xC0-\xC1 -- fake 0000-007F */  |  103  | 0  |                 goto InvalidStart;  |  104  | 0  |             }  |  105  | 15  |             if (end - s < 2) { |  106  |  |                 /* unexpected end of data: the caller will decide whether  |  107  |  |                    it's an error or not */  |  108  | 0  |                 break;  |  109  | 0  |             }  |  110  | 15  |             ch2 = (unsigned char)s[1];  |  111  | 15  |             if (!IS_CONTINUATION_BYTE(ch2))  |  112  |  |                 /* invalid continuation byte */  |  113  | 0  |                 goto InvalidContinuation1;  |  114  | 15  |             ch = (ch << 6) + ch2 -  |  115  | 15  |                  ((0xC0 << 6) + 0x80);  |  116  | 15  |             assert ((ch > 0x007F) && (ch <= 0x07FF));  |  117  | 15  |             s += 2;  |  118  | 15  |             if (STRINGLIB_MAX_CHAR <= 0x007F ||  |  119  | 0  |                 (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR))  |  120  |  |                 
/* Out-of-range */  |  121  | 15  |                 goto Return;  |  122  | 0  |             *p++ = ch;  |  123  | 0  |             continue;  |  124  | 15  |         }  |  125  |  |  |  126  | 0  |         if (ch < 0xF0) { |  127  |  |             /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */  |  128  | 0  |             Py_UCS4 ch2, ch3;  |  129  | 0  |             if (end - s < 3) { |  130  |  |                 /* unexpected end of data: the caller will decide whether  |  131  |  |                    it's an error or not */  |  132  | 0  |                 if (end - s < 2)  |  133  | 0  |                     break;  |  134  | 0  |                 ch2 = (unsigned char)s[1];  |  135  | 0  |                 if (!IS_CONTINUATION_BYTE(ch2) ||  |  136  | 0  |                     (ch2 < 0xA0 ? ch == 0xE0 : ch == 0xED))  |  137  |  |                     /* for clarification see comments below */  |  138  | 0  |                     goto InvalidContinuation1;  |  139  | 0  |                 break;  |  140  | 0  |             }  |  141  | 0  |             ch2 = (unsigned char)s[1];  |  142  | 0  |             ch3 = (unsigned char)s[2];  |  143  | 0  |             if (!IS_CONTINUATION_BYTE(ch2)) { |  144  |  |                 /* invalid continuation byte */  |  145  | 0  |                 goto InvalidContinuation1;  |  146  | 0  |             }  |  147  | 0  |             if (ch == 0xE0) { |  148  | 0  |                 if (ch2 < 0xA0)  |  149  |  |                     /* invalid sequence  |  150  |  |                        \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */  |  151  | 0  |                     goto InvalidContinuation1;  |  152  | 0  |             } else if (ch == 0xED && ch2 >= 0xA0) { |  153  |  |                 /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF  |  154  |  |                    will result in surrogates in range D800-DFFF. Surrogates are  |  155  |  |                    not valid UTF-8 so they are rejected.  
|  156  |  |                    See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf  |  157  |  |                    (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */  |  158  | 0  |                 goto InvalidContinuation1;  |  159  | 0  |             }  |  160  | 0  |             if (!IS_CONTINUATION_BYTE(ch3)) { |  161  |  |                 /* invalid continuation byte */  |  162  | 0  |                 goto InvalidContinuation2;  |  163  | 0  |             }  |  164  | 0  |             ch = (ch << 12) + (ch2 << 6) + ch3 -  |  165  | 0  |                  ((0xE0 << 12) + (0x80 << 6) + 0x80);  |  166  | 0  |             assert ((ch > 0x07FF) && (ch <= 0xFFFF));  |  167  | 0  |             s += 3;  |  168  | 0  |             if (STRINGLIB_MAX_CHAR <= 0x07FF ||  |  169  | 0  |                 (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR))  |  170  |  |                 /* Out-of-range */  |  171  | 0  |                 goto Return;  |  172  | 0  |             *p++ = ch;  |  173  | 0  |             continue;  |  174  | 0  |         }  |  175  |  |  |  176  | 0  |         if (ch < 0xF5) { |  177  |  |             /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */  |  178  | 0  |             Py_UCS4 ch2, ch3, ch4;  |  179  | 0  |             if (end - s < 4) { |  180  |  |                 /* unexpected end of data: the caller will decide whether  |  181  |  |                    it's an error or not */  |  182  | 0  |                 if (end - s < 2)  |  183  | 0  |                     break;  |  184  | 0  |                 ch2 = (unsigned char)s[1];  |  185  | 0  |                 if (!IS_CONTINUATION_BYTE(ch2) ||  |  186  | 0  |                     (ch2 < 0x90 ? 
ch == 0xF0 : ch == 0xF4))  |  187  |  |                     /* for clarification see comments below */  |  188  | 0  |                     goto InvalidContinuation1;  |  189  | 0  |                 if (end - s < 3)  |  190  | 0  |                     break;  |  191  | 0  |                 ch3 = (unsigned char)s[2];  |  192  | 0  |                 if (!IS_CONTINUATION_BYTE(ch3))  |  193  | 0  |                     goto InvalidContinuation2;  |  194  | 0  |                 break;  |  195  | 0  |             }  |  196  | 0  |             ch2 = (unsigned char)s[1];  |  197  | 0  |             ch3 = (unsigned char)s[2];  |  198  | 0  |             ch4 = (unsigned char)s[3];  |  199  | 0  |             if (!IS_CONTINUATION_BYTE(ch2)) { |  200  |  |                 /* invalid continuation byte */  |  201  | 0  |                 goto InvalidContinuation1;  |  202  | 0  |             }  |  203  | 0  |             if (ch == 0xF0) { |  204  | 0  |                 if (ch2 < 0x90)  |  205  |  |                     /* invalid sequence  |  206  |  |                        \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */  |  207  | 0  |                     goto InvalidContinuation1;  |  208  | 0  |             } else if (ch == 0xF4 && ch2 >= 0x90) { |  209  |  |                 /* invalid sequence  |  210  |  |                    \xF4\x90\x80\x80- -- 110000- overflow */  |  211  | 0  |                 goto InvalidContinuation1;  |  212  | 0  |             }  |  213  | 0  |             if (!IS_CONTINUATION_BYTE(ch3)) { |  214  |  |                 /* invalid continuation byte */  |  215  | 0  |                 goto InvalidContinuation2;  |  216  | 0  |             }  |  217  | 0  |             if (!IS_CONTINUATION_BYTE(ch4)) { |  218  |  |                 /* invalid continuation byte */  |  219  | 0  |                 goto InvalidContinuation3;  |  220  | 0  |             }  |  221  | 0  |             ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 -  |  222  | 0  |          
        ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80);  |  223  | 0  |             assert ((ch > 0xFFFF) && (ch <= 0x10FFFF));  |  224  | 0  |             s += 4;  |  225  | 0  |             if (STRINGLIB_MAX_CHAR <= 0xFFFF ||  |  226  | 0  |                 (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR))  |  227  |  |                 /* Out-of-range */  |  228  | 0  |                 goto Return;  |  229  | 0  |             *p++ = ch;  |  230  | 0  |             continue;  |  231  | 0  |         }  |  232  | 0  |         goto InvalidStart;  |  233  | 0  |     }  |  234  | 0  |     ch = 0;  |  235  | 15  | Return:  |  236  | 15  |     *inptr = s;  |  237  | 15  |     *outpos = p - dest;  |  238  | 15  |     return ch;  |  239  | 0  | InvalidStart:  |  240  | 0  |     ch = 1;  |  241  | 0  |     goto Return;  |  242  | 0  | InvalidContinuation1:  |  243  | 0  |     ch = 2;  |  244  | 0  |     goto Return;  |  245  | 0  | InvalidContinuation2:  |  246  | 0  |     ch = 3;  |  247  | 0  |     goto Return;  |  248  | 0  | InvalidContinuation3:  |  249  | 0  |     ch = 4;  |  250  | 0  |     goto Return;  |  251  | 0  | }  |  
 unicodeobject.c:ucs1lib_utf8_decode Line  | Count  | Source  |  24  | 15  | { |  25  | 15  |     Py_UCS4 ch;  |  26  | 15  |     const char *s = *inptr;  |  27  | 15  |     const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);  |  28  | 15  |     STRINGLIB_CHAR *p = dest + *outpos;  |  29  |  |  |  30  | 425  |     while (s < end) { |  31  | 424  |         ch = (unsigned char)*s;  |  32  |  |  |  33  | 424  |         if (ch < 0x80) { |  34  |  |             /* Fast path for runs of ASCII characters. Given that common UTF-8  |  35  |  |                input will consist of an overwhelming majority of ASCII  |  36  |  |                characters, we try to optimize for this case by checking  |  37  |  |                as many characters as a C 'long' can contain.  |  38  |  |                First, check if we can do an aligned read, as most CPUs have  |  39  |  |                a penalty for unaligned reads.  |  40  |  |             */  |  41  | 4  |             if (_Py_IS_ALIGNED(s, SIZEOF_LONG)) { |  42  |  |                 /* Help register allocation */  |  43  | 1  |                 const char *_s = s;  |  44  | 1  |                 STRINGLIB_CHAR *_p = p;  |  45  | 586  |                 while (_s < aligned_end) { |  46  |  |                     /* Read a whole long at a time (either 4 or 8 bytes),  |  47  |  |                        and do a fast unrolled copy if it only contains ASCII  |  48  |  |                        characters. 
*/  |  49  | 585  |                     unsigned long value = *(const unsigned long *) _s;  |  50  | 585  |                     if (value & ASCII_CHAR_MASK)  |  51  | 0  |                         break;  |  52  | 585  | #if PY_LITTLE_ENDIAN  |  53  | 585  |                     _p[0] = (STRINGLIB_CHAR)(value & 0xFFu);  |  54  | 585  |                     _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);  |  55  | 585  |                     _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);  |  56  | 585  |                     _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);  |  57  | 585  | # if SIZEOF_LONG == 8  |  58  | 585  |                     _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);  |  59  | 585  |                     _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);  |  60  | 585  |                     _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);  |  61  | 585  |                     _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);  |  62  | 585  | # endif  |  63  |  | #else  |  64  |  | # if SIZEOF_LONG == 8  |  65  |  |                     _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);  |  66  |  |                     _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);  |  67  |  |                     _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);  |  68  |  |                     _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);  |  69  |  |                     _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);  |  70  |  |                     _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);  |  71  |  |                     _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);  |  72  |  |                     _p[7] = (STRINGLIB_CHAR)(value & 0xFFu);  |  73  |  | # else  |  74  |  |                     _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);  |  75  |  |                     _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);  |  76  |  |                     _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);  |  77  |  |                     _p[3] = 
(STRINGLIB_CHAR)(value & 0xFFu);  |  78  |  | # endif  |  79  |  | #endif  |  80  | 585  |                     _s += SIZEOF_LONG;  |  81  | 585  |                     _p += SIZEOF_LONG;  |  82  | 585  |                 }  |  83  | 1  |                 s = _s;  |  84  | 1  |                 p = _p;  |  85  | 1  |                 if (s == end)  |  86  | 0  |                     break;  |  87  | 1  |                 ch = (unsigned char)*s;  |  88  | 1  |             }  |  89  | 4  |             if (ch < 0x80) { |  90  | 4  |                 s++;  |  91  | 4  |                 *p++ = ch;  |  92  | 4  |                 continue;  |  93  | 4  |             }  |  94  | 4  |         }  |  95  |  |  |  96  | 420  |         if (ch < 0xE0) { |  97  |  |             /* \xC2\x80-\xDF\xBF -- 0080-07FF */  |  98  | 406  |             Py_UCS4 ch2;  |  99  | 406  |             if (ch < 0xC2) { |  100  |  |                 /* invalid sequence  |  101  |  |                 \x80-\xBF -- continuation byte  |  102  |  |                 \xC0-\xC1 -- fake 0000-007F */  |  103  | 0  |                 goto InvalidStart;  |  104  | 0  |             }  |  105  | 406  |             if (end - s < 2) { |  106  |  |                 /* unexpected end of data: the caller will decide whether  |  107  |  |                    it's an error or not */  |  108  | 0  |                 break;  |  109  | 0  |             }  |  110  | 406  |             ch2 = (unsigned char)s[1];  |  111  | 406  |             if (!IS_CONTINUATION_BYTE(ch2))  |  112  |  |                 /* invalid continuation byte */  |  113  | 0  |                 goto InvalidContinuation1;  |  114  | 406  |             ch = (ch << 6) + ch2 -  |  115  | 406  |                  ((0xC0 << 6) + 0x80);  |  116  | 406  |             assert ((ch > 0x007F) && (ch <= 0x07FF));  |  117  | 406  |             s += 2;  |  118  | 406  |             if (STRINGLIB_MAX_CHAR <= 0x007F ||  |  119  | 406  |                 (STRINGLIB_MAX_CHAR < 0x07FF && ch 
> STRINGLIB_MAX_CHAR))  |  120  |  |                 /* Out-of-range */  |  121  | 0  |                 goto Return;  |  122  | 406  |             *p++ = ch;  |  123  | 406  |             continue;  |  124  | 406  |         }  |  125  |  |  |  126  | 14  |         if (ch < 0xF0) { |  127  |  |             /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */  |  128  | 14  |             Py_UCS4 ch2, ch3;  |  129  | 14  |             if (end - s < 3) { |  130  |  |                 /* unexpected end of data: the caller will decide whether  |  131  |  |                    it's an error or not */  |  132  | 0  |                 if (end - s < 2)  |  133  | 0  |                     break;  |  134  | 0  |                 ch2 = (unsigned char)s[1];  |  135  | 0  |                 if (!IS_CONTINUATION_BYTE(ch2) ||  |  136  | 0  |                     (ch2 < 0xA0 ? ch == 0xE0 : ch == 0xED))  |  137  |  |                     /* for clarification see comments below */  |  138  | 0  |                     goto InvalidContinuation1;  |  139  | 0  |                 break;  |  140  | 0  |             }  |  141  | 14  |             ch2 = (unsigned char)s[1];  |  142  | 14  |             ch3 = (unsigned char)s[2];  |  143  | 14  |             if (!IS_CONTINUATION_BYTE(ch2)) { |  144  |  |                 /* invalid continuation byte */  |  145  | 0  |                 goto InvalidContinuation1;  |  146  | 0  |             }  |  147  | 14  |             if (ch == 0xE0) { |  148  | 0  |                 if (ch2 < 0xA0)  |  149  |  |                     /* invalid sequence  |  150  |  |                        \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */  |  151  | 0  |                     goto InvalidContinuation1;  |  152  | 14  |             } else if (ch == 0xED && ch2 >= 0xA0) { |  153  |  |                 /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF  |  154  |  |                    will result in surrogates in range D800-DFFF. 
Surrogates are  |  155  |  |                    not valid UTF-8 so they are rejected.  |  156  |  |                    See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf  |  157  |  |                    (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */  |  158  | 0  |                 goto InvalidContinuation1;  |  159  | 0  |             }  |  160  | 14  |             if (!IS_CONTINUATION_BYTE(ch3)) { |  161  |  |                 /* invalid continuation byte */  |  162  | 0  |                 goto InvalidContinuation2;  |  163  | 0  |             }  |  164  | 14  |             ch = (ch << 12) + (ch2 << 6) + ch3 -  |  165  | 14  |                  ((0xE0 << 12) + (0x80 << 6) + 0x80);  |  166  | 14  |             assert ((ch > 0x07FF) && (ch <= 0xFFFF));  |  167  | 14  |             s += 3;  |  168  | 14  |             if (STRINGLIB_MAX_CHAR <= 0x07FF ||  |  169  | 0  |                 (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR))  |  170  |  |                 /* Out-of-range */  |  171  | 14  |                 goto Return;  |  172  | 0  |             *p++ = ch;  |  173  | 0  |             continue;  |  174  | 14  |         }  |  175  |  |  |  176  | 0  |         if (ch < 0xF5) { |  177  |  |             /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */  |  178  | 0  |             Py_UCS4 ch2, ch3, ch4;  |  179  | 0  |             if (end - s < 4) { |  180  |  |                 /* unexpected end of data: the caller will decide whether  |  181  |  |                    it's an error or not */  |  182  | 0  |                 if (end - s < 2)  |  183  | 0  |                     break;  |  184  | 0  |                 ch2 = (unsigned char)s[1];  |  185  | 0  |                 if (!IS_CONTINUATION_BYTE(ch2) ||  |  186  | 0  |                     (ch2 < 0x90 ? 
ch == 0xF0 : ch == 0xF4))  |  187  |  |                     /* for clarification see comments below */  |  188  | 0  |                     goto InvalidContinuation1;  |  189  | 0  |                 if (end - s < 3)  |  190  | 0  |                     break;  |  191  | 0  |                 ch3 = (unsigned char)s[2];  |  192  | 0  |                 if (!IS_CONTINUATION_BYTE(ch3))  |  193  | 0  |                     goto InvalidContinuation2;  |  194  | 0  |                 break;  |  195  | 0  |             }  |  196  | 0  |             ch2 = (unsigned char)s[1];  |  197  | 0  |             ch3 = (unsigned char)s[2];  |  198  | 0  |             ch4 = (unsigned char)s[3];  |  199  | 0  |             if (!IS_CONTINUATION_BYTE(ch2)) { |  200  |  |                 /* invalid continuation byte */  |  201  | 0  |                 goto InvalidContinuation1;  |  202  | 0  |             }  |  203  | 0  |             if (ch == 0xF0) { |  204  | 0  |                 if (ch2 < 0x90)  |  205  |  |                     /* invalid sequence  |  206  |  |                        \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */  |  207  | 0  |                     goto InvalidContinuation1;  |  208  | 0  |             } else if (ch == 0xF4 && ch2 >= 0x90) { |  209  |  |                 /* invalid sequence  |  210  |  |                    \xF4\x90\x80\x80- -- 110000- overflow */  |  211  | 0  |                 goto InvalidContinuation1;  |  212  | 0  |             }  |  213  | 0  |             if (!IS_CONTINUATION_BYTE(ch3)) { |  214  |  |                 /* invalid continuation byte */  |  215  | 0  |                 goto InvalidContinuation2;  |  216  | 0  |             }  |  217  | 0  |             if (!IS_CONTINUATION_BYTE(ch4)) { |  218  |  |                 /* invalid continuation byte */  |  219  | 0  |                 goto InvalidContinuation3;  |  220  | 0  |             }  |  221  | 0  |             ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 -  |  222  | 0  |          
        ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80);  |  223  | 0  |             assert ((ch > 0xFFFF) && (ch <= 0x10FFFF));  |  224  | 0  |             s += 4;  |  225  | 0  |             if (STRINGLIB_MAX_CHAR <= 0xFFFF ||  |  226  | 0  |                 (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR))  |  227  |  |                 /* Out-of-range */  |  228  | 0  |                 goto Return;  |  229  | 0  |             *p++ = ch;  |  230  | 0  |             continue;  |  231  | 0  |         }  |  232  | 0  |         goto InvalidStart;  |  233  | 0  |     }  |  234  | 1  |     ch = 0;  |  235  | 15  | Return:  |  236  | 15  |     *inptr = s;  |  237  | 15  |     *outpos = p - dest;  |  238  | 15  |     return ch;  |  239  | 0  | InvalidStart:  |  240  | 0  |     ch = 1;  |  241  | 0  |     goto Return;  |  242  | 0  | InvalidContinuation1:  |  243  | 0  |     ch = 2;  |  244  | 0  |     goto Return;  |  245  | 0  | InvalidContinuation2:  |  246  | 0  |     ch = 3;  |  247  | 0  |     goto Return;  |  248  | 0  | InvalidContinuation3:  |  249  | 0  |     ch = 4;  |  250  | 0  |     goto Return;  |  251  | 1  | }  |  
 unicodeobject.c:ucs2lib_utf8_decode Line  | Count  | Source  |  24  | 14  | { |  25  | 14  |     Py_UCS4 ch;  |  26  | 14  |     const char *s = *inptr;  |  27  | 14  |     const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);  |  28  | 14  |     STRINGLIB_CHAR *p = dest + *outpos;  |  29  |  |  |  30  | 1.37k  |     while (s < end) { |  31  | 1.35k  |         ch = (unsigned char)*s;  |  32  |  |  |  33  | 1.35k  |         if (ch < 0x80) { |  34  |  |             /* Fast path for runs of ASCII characters. Given that common UTF-8  |  35  |  |                input will consist of an overwhelming majority of ASCII  |  36  |  |                characters, we try to optimize for this case by checking  |  37  |  |                as many characters as a C 'long' can contain.  |  38  |  |                First, check if we can do an aligned read, as most CPUs have  |  39  |  |                a penalty for unaligned reads.  |  40  |  |             */  |  41  | 0  |             if (_Py_IS_ALIGNED(s, SIZEOF_LONG)) { |  42  |  |                 /* Help register allocation */  |  43  | 0  |                 const char *_s = s;  |  44  | 0  |                 STRINGLIB_CHAR *_p = p;  |  45  | 0  |                 while (_s < aligned_end) { |  46  |  |                     /* Read a whole long at a time (either 4 or 8 bytes),  |  47  |  |                        and do a fast unrolled copy if it only contains ASCII  |  48  |  |                        characters. 
*/  |  49  | 0  |                     unsigned long value = *(const unsigned long *) _s;  |  50  | 0  |                     if (value & ASCII_CHAR_MASK)  |  51  | 0  |                         break;  |  52  | 0  | #if PY_LITTLE_ENDIAN  |  53  | 0  |                     _p[0] = (STRINGLIB_CHAR)(value & 0xFFu);  |  54  | 0  |                     _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);  |  55  | 0  |                     _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);  |  56  | 0  |                     _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);  |  57  | 0  | # if SIZEOF_LONG == 8  |  58  | 0  |                     _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);  |  59  | 0  |                     _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);  |  60  | 0  |                     _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);  |  61  | 0  |                     _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);  |  62  | 0  | # endif  |  63  |  | #else  |  64  |  | # if SIZEOF_LONG == 8  |  65  |  |                     _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);  |  66  |  |                     _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);  |  67  |  |                     _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);  |  68  |  |                     _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);  |  69  |  |                     _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);  |  70  |  |                     _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);  |  71  |  |                     _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);  |  72  |  |                     _p[7] = (STRINGLIB_CHAR)(value & 0xFFu);  |  73  |  | # else  |  74  |  |                     _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);  |  75  |  |                     _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);  |  76  |  |                     _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);  |  77  |  |                     _p[3] = (STRINGLIB_CHAR)(value & 0xFFu);  
|  78  |  | # endif  |  79  |  | #endif  |  80  | 0  |                     _s += SIZEOF_LONG;  |  81  | 0  |                     _p += SIZEOF_LONG;  |  82  | 0  |                 }  |  83  | 0  |                 s = _s;  |  84  | 0  |                 p = _p;  |  85  | 0  |                 if (s == end)  |  86  | 0  |                     break;  |  87  | 0  |                 ch = (unsigned char)*s;  |  88  | 0  |             }  |  89  | 0  |             if (ch < 0x80) { |  90  | 0  |                 s++;  |  91  | 0  |                 *p++ = ch;  |  92  | 0  |                 continue;  |  93  | 0  |             }  |  94  | 0  |         }  |  95  |  |  |  96  | 1.35k  |         if (ch < 0xE0) { |  97  |  |             /* \xC2\x80-\xDF\xBF -- 0080-07FF */  |  98  | 504  |             Py_UCS4 ch2;  |  99  | 504  |             if (ch < 0xC2) { |  100  |  |                 /* invalid sequence  |  101  |  |                 \x80-\xBF -- continuation byte  |  102  |  |                 \xC0-\xC1 -- fake 0000-007F */  |  103  | 0  |                 goto InvalidStart;  |  104  | 0  |             }  |  105  | 504  |             if (end - s < 2) { |  106  |  |                 /* unexpected end of data: the caller will decide whether  |  107  |  |                    it's an error or not */  |  108  | 0  |                 break;  |  109  | 0  |             }  |  110  | 504  |             ch2 = (unsigned char)s[1];  |  111  | 504  |             if (!IS_CONTINUATION_BYTE(ch2))  |  112  |  |                 /* invalid continuation byte */  |  113  | 0  |                 goto InvalidContinuation1;  |  114  | 504  |             ch = (ch << 6) + ch2 -  |  115  | 504  |                  ((0xC0 << 6) + 0x80);  |  116  | 504  |             assert ((ch > 0x007F) && (ch <= 0x07FF));  |  117  | 504  |             s += 2;  |  118  | 504  |             if (STRINGLIB_MAX_CHAR <= 0x007F ||  |  119  | 0  |                 (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR))  |  120  |  |     
            /* Out-of-range */  |  121  | 0  |                 goto Return;  |  122  | 504  |             *p++ = ch;  |  123  | 504  |             continue;  |  124  | 504  |         }  |  125  |  |  |  126  | 854  |         if (ch < 0xF0) { |  127  |  |             /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */  |  128  | 854  |             Py_UCS4 ch2, ch3;  |  129  | 854  |             if (end - s < 3) { |  130  |  |                 /* unexpected end of data: the caller will decide whether  |  131  |  |                    it's an error or not */  |  132  | 0  |                 if (end - s < 2)  |  133  | 0  |                     break;  |  134  | 0  |                 ch2 = (unsigned char)s[1];  |  135  | 0  |                 if (!IS_CONTINUATION_BYTE(ch2) ||  |  136  | 0  |                     (ch2 < 0xA0 ? ch == 0xE0 : ch == 0xED))  |  137  |  |                     /* for clarification see comments below */  |  138  | 0  |                     goto InvalidContinuation1;  |  139  | 0  |                 break;  |  140  | 0  |             }  |  141  | 854  |             ch2 = (unsigned char)s[1];  |  142  | 854  |             ch3 = (unsigned char)s[2];  |  143  | 854  |             if (!IS_CONTINUATION_BYTE(ch2)) { |  144  |  |                 /* invalid continuation byte */  |  145  | 0  |                 goto InvalidContinuation1;  |  146  | 0  |             }  |  147  | 854  |             if (ch == 0xE0) { |  148  | 0  |                 if (ch2 < 0xA0)  |  149  |  |                     /* invalid sequence  |  150  |  |                        \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */  |  151  | 0  |                     goto InvalidContinuation1;  |  152  | 854  |             } else if (ch == 0xED && ch2 >= 0xA0) { |  153  |  |                 /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF  |  154  |  |                    will result in surrogates in range D800-DFFF. 
Surrogates are  |  155  |  |                    not valid UTF-8 so they are rejected.  |  156  |  |                    See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf  |  157  |  |                    (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */  |  158  | 0  |                 goto InvalidContinuation1;  |  159  | 0  |             }  |  160  | 854  |             if (!IS_CONTINUATION_BYTE(ch3)) { |  161  |  |                 /* invalid continuation byte */  |  162  | 0  |                 goto InvalidContinuation2;  |  163  | 0  |             }  |  164  | 854  |             ch = (ch << 12) + (ch2 << 6) + ch3 -  |  165  | 854  |                  ((0xE0 << 12) + (0x80 << 6) + 0x80);  |  166  | 854  |             assert ((ch > 0x07FF) && (ch <= 0xFFFF));  |  167  | 854  |             s += 3;  |  168  | 854  |             if (STRINGLIB_MAX_CHAR <= 0x07FF ||  |  169  | 0  |                 (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR))  |  170  |  |                 /* Out-of-range */  |  171  | 0  |                 goto Return;  |  172  | 854  |             *p++ = ch;  |  173  | 854  |             continue;  |  174  | 854  |         }  |  175  |  |  |  176  | 0  |         if (ch < 0xF5) { |  177  |  |             /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */  |  178  | 0  |             Py_UCS4 ch2, ch3, ch4;  |  179  | 0  |             if (end - s < 4) { |  180  |  |                 /* unexpected end of data: the caller will decide whether  |  181  |  |                    it's an error or not */  |  182  | 0  |                 if (end - s < 2)  |  183  | 0  |                     break;  |  184  | 0  |                 ch2 = (unsigned char)s[1];  |  185  | 0  |                 if (!IS_CONTINUATION_BYTE(ch2) ||  |  186  | 0  |                     (ch2 < 0x90 ? 
ch == 0xF0 : ch == 0xF4))  |  187  |  |                     /* for clarification see comments below */  |  188  | 0  |                     goto InvalidContinuation1;  |  189  | 0  |                 if (end - s < 3)  |  190  | 0  |                     break;  |  191  | 0  |                 ch3 = (unsigned char)s[2];  |  192  | 0  |                 if (!IS_CONTINUATION_BYTE(ch3))  |  193  | 0  |                     goto InvalidContinuation2;  |  194  | 0  |                 break;  |  195  | 0  |             }  |  196  | 0  |             ch2 = (unsigned char)s[1];  |  197  | 0  |             ch3 = (unsigned char)s[2];  |  198  | 0  |             ch4 = (unsigned char)s[3];  |  199  | 0  |             if (!IS_CONTINUATION_BYTE(ch2)) { |  200  |  |                 /* invalid continuation byte */  |  201  | 0  |                 goto InvalidContinuation1;  |  202  | 0  |             }  |  203  | 0  |             if (ch == 0xF0) { |  204  | 0  |                 if (ch2 < 0x90)  |  205  |  |                     /* invalid sequence  |  206  |  |                        \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */  |  207  | 0  |                     goto InvalidContinuation1;  |  208  | 0  |             } else if (ch == 0xF4 && ch2 >= 0x90) { |  209  |  |                 /* invalid sequence  |  210  |  |                    \xF4\x90\x80\x80- -- 110000- overflow */  |  211  | 0  |                 goto InvalidContinuation1;  |  212  | 0  |             }  |  213  | 0  |             if (!IS_CONTINUATION_BYTE(ch3)) { |  214  |  |                 /* invalid continuation byte */  |  215  | 0  |                 goto InvalidContinuation2;  |  216  | 0  |             }  |  217  | 0  |             if (!IS_CONTINUATION_BYTE(ch4)) { |  218  |  |                 /* invalid continuation byte */  |  219  | 0  |                 goto InvalidContinuation3;  |  220  | 0  |             }  |  221  | 0  |             ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 -  |  222  | 0  |          
        ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80);  |  223  | 0  |             assert ((ch > 0xFFFF) && (ch <= 0x10FFFF));  |  224  | 0  |             s += 4;  |  225  | 0  |             if (STRINGLIB_MAX_CHAR <= 0xFFFF ||  |  226  | 0  |                 (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR))  |  227  |  |                 /* Out-of-range */  |  228  | 0  |                 goto Return;  |  229  | 0  |             *p++ = ch;  |  230  | 0  |             continue;  |  231  | 0  |         }  |  232  | 0  |         goto InvalidStart;  |  233  | 0  |     }  |  234  | 14  |     ch = 0;  |  235  | 14  | Return:  |  236  | 14  |     *inptr = s;  |  237  | 14  |     *outpos = p - dest;  |  238  | 14  |     return ch;  |  239  | 0  | InvalidStart:  |  240  | 0  |     ch = 1;  |  241  | 0  |     goto Return;  |  242  | 0  | InvalidContinuation1:  |  243  | 0  |     ch = 2;  |  244  | 0  |     goto Return;  |  245  | 0  | InvalidContinuation2:  |  246  | 0  |     ch = 3;  |  247  | 0  |     goto Return;  |  248  | 0  | InvalidContinuation3:  |  249  | 0  |     ch = 4;  |  250  | 0  |     goto Return;  |  251  | 14  | }  |  
 Unexecuted instantiation: unicodeobject.c:ucs4lib_utf8_decode  | 
252  |  |  | 
253  |  | #undef ASCII_CHAR_MASK  | 
254  |  |  | 
255  |  |  | 
256  |  | /* UTF-8 encoder specialized for a Unicode kind to avoid the slow  | 
257  |  |    PyUnicode_READ() macro. Delete some parts of the code depending on the kind:  | 
258  |  |    UCS-1 strings don't need to handle surrogates for example. */  | 
259  |  | Py_LOCAL_INLINE(PyObject *)  | 
260  |  | STRINGLIB(utf8_encoder)(PyObject *unicode,  | 
261  |  |                         STRINGLIB_CHAR *data,  | 
262  |  |                         Py_ssize_t size,  | 
263  |  |                         _Py_error_handler error_handler,  | 
264  |  |                         const char *errors)  | 
265  | 0  | { | 
266  | 0  |     Py_ssize_t i;                /* index into data of next input character */  | 
267  | 0  |     char *p;                     /* next free byte in output buffer */  | 
268  |  | #if STRINGLIB_SIZEOF_CHAR > 1  | 
269  |  |     PyObject *error_handler_obj = NULL;  | 
270  |  |     PyObject *exc = NULL;  | 
271  |  |     PyObject *rep = NULL;  | 
272  |  | #endif  | 
273  |  | #if STRINGLIB_SIZEOF_CHAR == 1  | 
274  |  |     const Py_ssize_t max_char_size = 2;  | 
275  |  | #elif STRINGLIB_SIZEOF_CHAR == 2  | 
276  |  |     const Py_ssize_t max_char_size = 3;  | 
277  |  | #else /*  STRINGLIB_SIZEOF_CHAR == 4 */  | 
278  |  |     const Py_ssize_t max_char_size = 4;  | 
279  |  | #endif  | 
280  | 0  |     _PyBytesWriter writer;  | 
281  |  | 
  | 
282  | 0  |     assert(size >= 0);  | 
283  | 0  |     _PyBytesWriter_Init(&writer);  | 
284  |  | 
  | 
285  | 0  |     if (size > PY_SSIZE_T_MAX / max_char_size) { | 
286  |  |         /* integer overflow */  | 
287  | 0  |         return PyErr_NoMemory();  | 
288  | 0  |     }  | 
289  |  |  | 
290  | 0  |     p = _PyBytesWriter_Alloc(&writer, size * max_char_size);  | 
291  | 0  |     if (p == NULL)  | 
292  | 0  |         return NULL;  | 
293  |  |  | 
294  | 0  |     for (i = 0; i < size;) { | 
295  | 0  |         Py_UCS4 ch = data[i++];  | 
296  |  | 
  | 
297  | 0  |         if (ch < 0x80) { | 
298  |  |             /* Encode ASCII */  | 
299  | 0  |             *p++ = (char) ch;  | 
300  |  | 
  | 
301  | 0  |         }  | 
302  | 0  |         else  | 
303  |  | #if STRINGLIB_SIZEOF_CHAR > 1  | 
304  | 0  |         if (ch < 0x0800)  | 
305  | 0  | #endif  | 
306  | 0  |         { | 
307  |  |             /* Encode Latin-1 */  | 
308  | 0  |             *p++ = (char)(0xc0 | (ch >> 6));  | 
309  | 0  |             *p++ = (char)(0x80 | (ch & 0x3f));  | 
310  | 0  |         }  | 
311  |  | #if STRINGLIB_SIZEOF_CHAR > 1  | 
312  | 0  |         else if (Py_UNICODE_IS_SURROGATE(ch)) { | 
313  | 0  |             Py_ssize_t startpos, endpos, newpos;  | 
314  | 0  |             Py_ssize_t k;  | 
315  | 0  |             if (error_handler == _Py_ERROR_UNKNOWN) { | 
316  | 0  |                 error_handler = _Py_GetErrorHandler(errors);  | 
317  | 0  |             }  | 
318  |  | 
  | 
319  | 0  |             startpos = i-1;  | 
320  | 0  |             endpos = startpos+1;  | 
321  |  | 
  | 
322  | 0  |             while ((endpos < size) && Py_UNICODE_IS_SURROGATE(data[endpos]))  | 
323  | 0  |                 endpos++;  | 
324  |  |  | 
325  |  |             /* Only overallocate the buffer if it's not the last write */  | 
326  | 0  |             writer.overallocate = (endpos < size);  | 
327  |  | 
  | 
328  | 0  |             switch (error_handler)  | 
329  | 0  |             { | 
330  | 0  |             case _Py_ERROR_REPLACE:  | 
331  | 0  |                 memset(p, '?', endpos - startpos);  | 
332  | 0  |                 p += (endpos - startpos);  | 
333  |  |                 /* fall through */  | 
334  | 0  |             case _Py_ERROR_IGNORE:  | 
335  | 0  |                 i += (endpos - startpos - 1);  | 
336  | 0  |                 break;  | 
337  |  |  | 
338  | 0  |             case _Py_ERROR_SURROGATEPASS:  | 
339  | 0  |                 for (k=startpos; k<endpos; k++) { | 
340  | 0  |                     ch = data[k];  | 
341  | 0  |                     *p++ = (char)(0xe0 | (ch >> 12));  | 
342  | 0  |                     *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));  | 
343  | 0  |                     *p++ = (char)(0x80 | (ch & 0x3f));  | 
344  | 0  |                 }  | 
345  | 0  |                 i += (endpos - startpos - 1);  | 
346  | 0  |                 break;  | 
347  |  |  | 
348  | 0  |             case _Py_ERROR_BACKSLASHREPLACE:  | 
349  |  |                 /* subtract preallocated bytes */  | 
350  | 0  |                 writer.min_size -= max_char_size * (endpos - startpos);  | 
351  | 0  |                 p = backslashreplace(&writer, p,  | 
352  | 0  |                                      unicode, startpos, endpos);  | 
353  | 0  |                 if (p == NULL)  | 
354  | 0  |                     goto error;  | 
355  | 0  |                 i += (endpos - startpos - 1);  | 
356  | 0  |                 break;  | 
357  |  |  | 
358  | 0  |             case _Py_ERROR_XMLCHARREFREPLACE:  | 
359  |  |                 /* subtract preallocated bytes */  | 
360  | 0  |                 writer.min_size -= max_char_size * (endpos - startpos);  | 
361  | 0  |                 p = xmlcharrefreplace(&writer, p,  | 
362  | 0  |                                       unicode, startpos, endpos);  | 
363  | 0  |                 if (p == NULL)  | 
364  | 0  |                     goto error;  | 
365  | 0  |                 i += (endpos - startpos - 1);  | 
366  | 0  |                 break;  | 
367  |  |  | 
368  | 0  |             case _Py_ERROR_SURROGATEESCAPE:  | 
369  | 0  |                 for (k=startpos; k<endpos; k++) { | 
370  | 0  |                     ch = data[k];  | 
371  | 0  |                     if (!(0xDC80 <= ch && ch <= 0xDCFF))  | 
372  | 0  |                         break;  | 
373  | 0  |                     *p++ = (char)(ch & 0xff);  | 
374  | 0  |                 }  | 
375  | 0  |                 if (k >= endpos) { | 
376  | 0  |                     i += (endpos - startpos - 1);  | 
377  | 0  |                     break;  | 
378  | 0  |                 }  | 
379  | 0  |                 startpos = k;  | 
380  | 0  |                 assert(startpos < endpos);  | 
381  |  |                 /* fall through */  | 
382  | 0  |             default:  | 
383  | 0  |                 rep = unicode_encode_call_errorhandler(  | 
384  | 0  |                       errors, &error_handler_obj, "utf-8", "surrogates not allowed",  | 
385  | 0  |                       unicode, &exc, startpos, endpos, &newpos);  | 
386  | 0  |                 if (!rep)  | 
387  | 0  |                     goto error;  | 
388  |  |  | 
389  |  |                 /* subtract preallocated bytes */  | 
390  | 0  |                 writer.min_size -= max_char_size * (newpos - startpos);  | 
391  |  | 
  | 
392  | 0  |                 if (PyBytes_Check(rep)) { | 
393  | 0  |                     p = _PyBytesWriter_WriteBytes(&writer, p,  | 
394  | 0  |                                                   PyBytes_AS_STRING(rep),  | 
395  | 0  |                                                   PyBytes_GET_SIZE(rep));  | 
396  | 0  |                 }  | 
397  | 0  |                 else { | 
398  |  |                     /* rep is unicode */  | 
399  | 0  |                     if (PyUnicode_READY(rep) < 0)  | 
400  | 0  |                         goto error;  | 
401  |  |  | 
402  | 0  |                     if (!PyUnicode_IS_ASCII(rep)) { | 
403  | 0  |                         raise_encode_exception(&exc, "utf-8", unicode,  | 
404  | 0  |                                                startpos, endpos,  | 
405  | 0  |                                                "surrogates not allowed");  | 
406  | 0  |                         goto error;  | 
407  | 0  |                     }  | 
408  |  |  | 
409  | 0  |                     p = _PyBytesWriter_WriteBytes(&writer, p,  | 
410  | 0  |                                                   PyUnicode_DATA(rep),  | 
411  | 0  |                                                   PyUnicode_GET_LENGTH(rep));  | 
412  | 0  |                 }  | 
413  |  |  | 
414  | 0  |                 if (p == NULL)  | 
415  | 0  |                     goto error;  | 
416  | 0  |                 Py_CLEAR(rep);  | 
417  |  | 
  | 
418  | 0  |                 i = newpos;  | 
419  | 0  |             }  | 
420  |  |  | 
421  |  |             /* If overallocation was disabled, ensure that it was the last  | 
422  |  |                write. Otherwise, we missed an optimization */  | 
423  | 0  |             assert(writer.overallocate || i == size);  | 
424  | 0  |         }  | 
425  | 0  |         else  | 
426  |  | #if STRINGLIB_SIZEOF_CHAR > 2  | 
427  | 0  |         if (ch < 0x10000)  | 
428  | 0  | #endif  | 
429  | 0  |         { | 
430  | 0  |             *p++ = (char)(0xe0 | (ch >> 12));  | 
431  | 0  |             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));  | 
432  | 0  |             *p++ = (char)(0x80 | (ch & 0x3f));  | 
433  | 0  |         }  | 
434  |  | #if STRINGLIB_SIZEOF_CHAR > 2  | 
435  |  |         else /* ch >= 0x10000 */  | 
436  | 0  |         { | 
437  | 0  |             assert(ch <= MAX_UNICODE);  | 
438  |  |             /* Encode UCS4 Unicode ordinals */  | 
439  | 0  |             *p++ = (char)(0xf0 | (ch >> 18));  | 
440  | 0  |             *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));  | 
441  | 0  |             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));  | 
442  | 0  |             *p++ = (char)(0x80 | (ch & 0x3f));  | 
443  | 0  |         }  | 
444  |  | #endif /* STRINGLIB_SIZEOF_CHAR > 2 */  | 
445  |  | #endif /* STRINGLIB_SIZEOF_CHAR > 1 */  | 
446  | 0  |     }  | 
447  |  |  | 
448  |  | #if STRINGLIB_SIZEOF_CHAR > 1  | 
449  | 0  |     Py_XDECREF(error_handler_obj);  | 
450  | 0  |     Py_XDECREF(exc);  | 
451  |  | #endif  | 
452  | 0  |     return _PyBytesWriter_Finish(&writer, p);  | 
453  |  |  | 
454  |  | #if STRINGLIB_SIZEOF_CHAR > 1  | 
455  | 0  |  error:  | 
456  | 0  |     Py_XDECREF(rep);  | 
457  | 0  |     Py_XDECREF(error_handler_obj);  | 
458  | 0  |     Py_XDECREF(exc);  | 
459  | 0  |     _PyBytesWriter_Dealloc(&writer);  | 
460  | 0  |     return NULL;  | 
461  |  | #endif  | 
462  | 0  | } Unexecuted instantiation: unicodeobject.c:ucs1lib_utf8_encoder Unexecuted instantiation: unicodeobject.c:ucs2lib_utf8_encoder Unexecuted instantiation: unicodeobject.c:ucs4lib_utf8_encoder Unexecuted instantiation: unicodeobject.c:asciilib_utf8_encoder  | 
463  |  |  | 
464  |  | /* The pattern for constructing UCS2-repeated masks: 0x0001 in every 16-bit lane of a C 'long'. Multiplying it by a 16-bit constant copies that constant into each lane. */  | 
465  |  | #if SIZEOF_LONG == 8  | 
466  | 0  | # define UCS2_REPEAT_MASK 0x0001000100010001ul  | 
467  |  | #elif SIZEOF_LONG == 4  | 
468  |  | # define UCS2_REPEAT_MASK 0x00010001ul  | 
469  |  | #else  | 
470  |  | # error C 'long' size should be either 4 or 8!  | 
471  |  | #endif  | 
472  |  |  | 
473  |  | /* The mask for fast checking: a block of UTF-16 code units read as one C 'long' is tested with (block & FAST_CHAR_MASK). */  | 
474  |  | #if STRINGLIB_SIZEOF_CHAR == 1  | 
475  |  | /* The mask for fast checking of whether a C 'long' contains any  | 
476  |  |    UTF16-encoded character above STRINGLIB_MAX_CHAR (non-ASCII or non-Latin1). */  | 
477  | 0  | # define FAST_CHAR_MASK         (UCS2_REPEAT_MASK * (0xFFFFu & ~STRINGLIB_MAX_CHAR))  | 
478  |  | #else  | 
479  |  | /* The mask for fast checking of whether a C 'long' may contain  | 
480  |  |    UTF16-encoded surrogate characters. This is an efficient heuristic:  | 
481  |  |    it tests only bit 15 of each code unit, assuming that non-surrogate  | 
482  |  |    characters with a code point >= 0x8000 are rare in most input.  | 
483  |  | */  | 
484  | 0  | # define FAST_CHAR_MASK         (UCS2_REPEAT_MASK * 0x8000u)  | 
485  |  | #endif  | 
486  |  | /* The mask for fast byte-swapping: selects the low byte of every 16-bit lane. */  | 
487  | 0  | #define STRIPPED_MASK           (UCS2_REPEAT_MASK * 0x00FFu)  | 
488  |  | /* Swap the two bytes inside every 16-bit lane of 'value'. */  | 
489  | 0  | #define SWAB(value)             ((((value) >> 8) & STRIPPED_MASK) | \  | 
490  | 0  |                                  (((value) & STRIPPED_MASK) << 8))  | 
491  |  |  | 
492  |  | Py_LOCAL_INLINE(Py_UCS4)  | 
493  |  | STRINGLIB(utf16_decode)(const unsigned char **inptr, const unsigned char *e,  | 
494  |  |                         STRINGLIB_CHAR *dest, Py_ssize_t *outpos,  | 
495  |  |                         int native_ordering)  | 
496  | 0  | { | 
497  | 0  |     Py_UCS4 ch;  | 
498  | 0  |     const unsigned char *aligned_end =  | 
499  | 0  |             (const unsigned char *) _Py_ALIGN_DOWN(e, SIZEOF_LONG);  | 
500  | 0  |     const unsigned char *q = *inptr;  | 
501  | 0  |     STRINGLIB_CHAR *p = dest + *outpos;  | 
502  |  |     /* Offsets from q for retrieving byte pairs in the right order. */  | 
503  | 0  | #if PY_LITTLE_ENDIAN  | 
504  | 0  |     int ihi = !!native_ordering, ilo = !native_ordering;  | 
505  |  | #else  | 
506  |  |     int ihi = !native_ordering, ilo = !!native_ordering;  | 
507  |  | #endif  | 
508  | 0  |     --e;  | 
509  |  | 
  | 
510  | 0  |     while (q < e) { | 
511  | 0  |         Py_UCS4 ch2;  | 
512  |  |         /* First check for possible aligned read of a C 'long'. Unaligned  | 
513  |  |            reads are more expensive, better to defer to another iteration. */  | 
514  | 0  |         if (_Py_IS_ALIGNED(q, SIZEOF_LONG)) { | 
515  |  |             /* Fast path for runs of in-range non-surrogate chars. */  | 
516  | 0  |             const unsigned char *_q = q;  | 
517  | 0  |             while (_q < aligned_end) { | 
518  | 0  |                 unsigned long block = * (const unsigned long *) _q;  | 
519  | 0  |                 if (native_ordering) { | 
520  |  |                     /* Can use buffer directly */  | 
521  | 0  |                     if (block & FAST_CHAR_MASK)  | 
522  | 0  |                         break;  | 
523  | 0  |                 }  | 
524  | 0  |                 else { | 
525  |  |                     /* Need to byte-swap */  | 
526  | 0  |                     if (block & SWAB(FAST_CHAR_MASK))  | 
527  | 0  |                         break;  | 
528  |  | #if STRINGLIB_SIZEOF_CHAR == 1  | 
529  | 0  |                     block >>= 8;  | 
530  |  | #else  | 
531  | 0  |                     block = SWAB(block);  | 
532  |  | #endif  | 
533  | 0  |                 }  | 
534  | 0  | #if PY_LITTLE_ENDIAN  | 
535  |  | # if SIZEOF_LONG == 4  | 
536  |  |                 p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);  | 
537  |  |                 p[1] = (STRINGLIB_CHAR)(block >> 16);  | 
538  |  | # elif SIZEOF_LONG == 8  | 
539  | 0  |                 p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);  | 
540  | 0  |                 p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);  | 
541  | 0  |                 p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);  | 
542  | 0  |                 p[3] = (STRINGLIB_CHAR)(block >> 48);  | 
543  | 0  | # endif  | 
544  |  | #else  | 
545  |  | # if SIZEOF_LONG == 4  | 
546  |  |                 p[0] = (STRINGLIB_CHAR)(block >> 16);  | 
547  |  |                 p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu);  | 
548  |  | # elif SIZEOF_LONG == 8  | 
549  |  |                 p[0] = (STRINGLIB_CHAR)(block >> 48);  | 
550  |  |                 p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);  | 
551  |  |                 p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);  | 
552  |  |                 p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu);  | 
553  |  | # endif  | 
554  |  | #endif  | 
555  | 0  |                 _q += SIZEOF_LONG;  | 
556  | 0  |                 p += SIZEOF_LONG / 2;  | 
557  | 0  |             }  | 
558  | 0  |             q = _q;  | 
559  | 0  |             if (q >= e)  | 
560  | 0  |                 break;  | 
561  | 0  |         }  | 
562  |  |  | 
563  | 0  |         ch = (q[ihi] << 8) | q[ilo];  | 
564  | 0  |         q += 2;  | 
565  | 0  |         if (!Py_UNICODE_IS_SURROGATE(ch)) { | 
566  |  | #if STRINGLIB_SIZEOF_CHAR < 2  | 
567  | 0  |             if (ch > STRINGLIB_MAX_CHAR)  | 
568  |  |                 /* Out-of-range */  | 
569  | 0  |                 goto Return;  | 
570  | 0  | #endif  | 
571  | 0  |             *p++ = (STRINGLIB_CHAR)ch;  | 
572  | 0  |             continue;  | 
573  | 0  |         }  | 
574  |  |  | 
575  |  |         /* UTF-16 code pair: */  | 
576  | 0  |         if (!Py_UNICODE_IS_HIGH_SURROGATE(ch))  | 
577  | 0  |             goto IllegalEncoding;  | 
578  | 0  |         if (q >= e)  | 
579  | 0  |             goto UnexpectedEnd;  | 
580  | 0  |         ch2 = (q[ihi] << 8) | q[ilo];  | 
581  | 0  |         q += 2;  | 
582  | 0  |         if (!Py_UNICODE_IS_LOW_SURROGATE(ch2))  | 
583  | 0  |             goto IllegalSurrogate;  | 
584  | 0  |         ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2);  | 
585  |  | #if STRINGLIB_SIZEOF_CHAR < 4  | 
586  |  |         /* Out-of-range */  | 
587  | 0  |         goto Return;  | 
588  |  | #else  | 
589  |  |         *p++ = (STRINGLIB_CHAR)ch;  | 
590  |  | #endif  | 
591  | 0  |     }  | 
592  | 0  |     ch = 0;  | 
593  | 0  | Return:  | 
594  | 0  |     *inptr = q;  | 
595  | 0  |     *outpos = p - dest;  | 
596  | 0  |     return ch;  | 
597  | 0  | UnexpectedEnd:  | 
598  | 0  |     ch = 1;  | 
599  | 0  |     goto Return;  | 
600  | 0  | IllegalEncoding:  | 
601  | 0  |     ch = 2;  | 
602  | 0  |     goto Return;  | 
603  | 0  | IllegalSurrogate:  | 
604  | 0  |     ch = 3;  | 
605  | 0  |     goto Return;  | 
606  | 0  | } Unexecuted instantiation: unicodeobject.c:asciilib_utf16_decode Unexecuted instantiation: unicodeobject.c:ucs1lib_utf16_decode Unexecuted instantiation: unicodeobject.c:ucs2lib_utf16_decode Unexecuted instantiation: unicodeobject.c:ucs4lib_utf16_decode  | 
607  |  | #undef UCS2_REPEAT_MASK  | 
608  |  | #undef FAST_CHAR_MASK  | 
609  |  | #undef STRIPPED_MASK  | 
610  |  | #undef SWAB  | 
611  |  |  | 
612  |  |  | 
613  |  | #if STRINGLIB_MAX_CHAR >= 0x80  | 
614  |  | Py_LOCAL_INLINE(Py_ssize_t)  | 
615  |  | STRINGLIB(utf16_encode)(const STRINGLIB_CHAR *in,  | 
616  |  |                         Py_ssize_t len,  | 
617  |  |                         unsigned short **outptr,  | 
618  |  |                         int native_ordering)  | 
619  | 0  | { | 
620  | 0  |     unsigned short *out = *outptr;  | 
621  | 0  |     const STRINGLIB_CHAR *end = in + len;  | 
622  |  | #if STRINGLIB_SIZEOF_CHAR == 1  | 
623  | 0  |     if (native_ordering) { | 
624  | 0  |         const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);  | 
625  | 0  |         while (in < unrolled_end) { | 
626  | 0  |             out[0] = in[0];  | 
627  | 0  |             out[1] = in[1];  | 
628  | 0  |             out[2] = in[2];  | 
629  | 0  |             out[3] = in[3];  | 
630  | 0  |             in += 4; out += 4;  | 
631  | 0  |         }  | 
632  | 0  |         while (in < end) { | 
633  | 0  |             *out++ = *in++;  | 
634  | 0  |         }  | 
635  | 0  |     } else { | 
636  | 0  | # define SWAB2(CH)  ((CH) << 8) /* high byte is zero */  | 
637  | 0  |         const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);  | 
638  | 0  |         while (in < unrolled_end) { | 
639  | 0  |             out[0] = SWAB2(in[0]);  | 
640  | 0  |             out[1] = SWAB2(in[1]);  | 
641  | 0  |             out[2] = SWAB2(in[2]);  | 
642  | 0  |             out[3] = SWAB2(in[3]);  | 
643  | 0  |             in += 4; out += 4;  | 
644  | 0  |         }  | 
645  | 0  |         while (in < end) { | 
646  | 0  |             Py_UCS4 ch = *in++;  | 
647  | 0  |             *out++ = SWAB2((Py_UCS2)ch);  | 
648  | 0  |         }  | 
649  | 0  | #undef SWAB2  | 
650  | 0  |     }  | 
651  |  |     *outptr = out;  | 
652  |  |     return len;  | 
653  |  | #else  | 
654  | 0  |     if (native_ordering) { | 
655  |  | #if STRINGLIB_MAX_CHAR < 0x10000  | 
656  | 0  |         const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);  | 
657  | 0  |         while (in < unrolled_end) { | 
658  |  |             /* check if any character is a surrogate character */  | 
659  | 0  |             if (((in[0] ^ 0xd800) &  | 
660  | 0  |                  (in[1] ^ 0xd800) &  | 
661  | 0  |                  (in[2] ^ 0xd800) &  | 
662  | 0  |                  (in[3] ^ 0xd800) & 0xf800) == 0)  | 
663  | 0  |                 break;  | 
664  | 0  |             out[0] = in[0];  | 
665  | 0  |             out[1] = in[1];  | 
666  | 0  |             out[2] = in[2];  | 
667  | 0  |             out[3] = in[3];  | 
668  | 0  |             in += 4; out += 4;  | 
669  | 0  |         }  | 
670  |  | #endif  | 
671  | 0  |         while (in < end) { | 
672  | 0  |             Py_UCS4 ch;  | 
673  | 0  |             ch = *in++;  | 
674  | 0  |             if (ch < 0xd800)  | 
675  | 0  |                 *out++ = ch;  | 
676  | 0  |             else if (ch < 0xe000)  | 
677  |  |                 /* reject surrogate characters (U+D800-U+DFFF) */  | 
678  | 0  |                 goto fail;  | 
679  |  | #if STRINGLIB_MAX_CHAR >= 0x10000  | 
680  | 0  |             else if (ch >= 0x10000) { | 
681  | 0  |                 out[0] = Py_UNICODE_HIGH_SURROGATE(ch);  | 
682  | 0  |                 out[1] = Py_UNICODE_LOW_SURROGATE(ch);  | 
683  | 0  |                 out += 2;  | 
684  | 0  |             }  | 
685  | 0  | #endif  | 
686  | 0  |             else  | 
687  | 0  |                 *out++ = ch;  | 
688  | 0  |         }  | 
689  | 0  |     } else { | 
690  | 0  | #define SWAB2(CH)  (((CH) << 8) | ((CH) >> 8))  | 
691  |  | #if STRINGLIB_MAX_CHAR < 0x10000  | 
692  | 0  |         const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);  | 
693  | 0  |         while (in < unrolled_end) { | 
694  |  |             /* check if any character is a surrogate character */  | 
695  | 0  |             if (((in[0] ^ 0xd800) &  | 
696  | 0  |                  (in[1] ^ 0xd800) &  | 
697  | 0  |                  (in[2] ^ 0xd800) &  | 
698  | 0  |                  (in[3] ^ 0xd800) & 0xf800) == 0)  | 
699  | 0  |                 break;  | 
700  | 0  |             out[0] = SWAB2(in[0]);  | 
701  | 0  |             out[1] = SWAB2(in[1]);  | 
702  | 0  |             out[2] = SWAB2(in[2]);  | 
703  | 0  |             out[3] = SWAB2(in[3]);  | 
704  | 0  |             in += 4; out += 4;  | 
705  | 0  |         }  | 
706  |  | #endif  | 
707  | 0  |         while (in < end) { | 
708  | 0  |             Py_UCS4 ch = *in++;  | 
709  | 0  |             if (ch < 0xd800)  | 
710  | 0  |                 *out++ = SWAB2((Py_UCS2)ch);  | 
711  | 0  |             else if (ch < 0xe000)  | 
712  |  |                 /* reject surrogate characters (U+D800-U+DFFF) */  | 
713  | 0  |                 goto fail;  | 
714  |  | #if STRINGLIB_MAX_CHAR >= 0x10000  | 
715  | 0  |             else if (ch >= 0x10000) { | 
716  | 0  |                 Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch);  | 
717  | 0  |                 Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch);  | 
718  | 0  |                 out[0] = SWAB2(ch1);  | 
719  | 0  |                 out[1] = SWAB2(ch2);  | 
720  | 0  |                 out += 2;  | 
721  | 0  |             }  | 
722  | 0  | #endif  | 
723  | 0  |             else  | 
724  | 0  |                 *out++ = SWAB2((Py_UCS2)ch);  | 
725  | 0  |         }  | 
726  | 0  | #undef SWAB2  | 
727  | 0  |     }  | 
728  | 0  |     *outptr = out;  | 
729  | 0  |     return len;  | 
730  | 0  |   fail:  | 
731  | 0  |     *outptr = out;  | 
732  | 0  |     return len - (end - in + 1);  | 
733  |  | #endif  | 
734  | 0  | } Unexecuted instantiation: unicodeobject.c:ucs1lib_utf16_encode Unexecuted instantiation: unicodeobject.c:ucs2lib_utf16_encode Unexecuted instantiation: unicodeobject.c:ucs4lib_utf16_encode  | 
735  |  |  | 
/* SWAB4(CH, tmp): the code unit CH widened to 32 bits with its four bytes
   in reverse order.  `tmp` is a scratch lvalue used by the wider variants
   to avoid evaluating CH more than once (the callers in this file pass a
   Py_UCS4 variable); the one-byte variant ignores it. */
#if STRINGLIB_SIZEOF_CHAR == 1
/* Convert to Py_UCS4 before shifting: a one-byte unit is promoted to signed
   int, and shifting a value >= 0x80 left by 24 bits is not representable
   there (undefined behaviour for a signed left shift). */
# define SWAB4(CH, tmp)  ((Py_UCS4)(CH) << 24) /* high bytes are zero */
#elif STRINGLIB_SIZEOF_CHAR == 2
# define SWAB4(CH, tmp)  (tmp = (CH), \
            ((tmp & 0x00FFu) << 24) + ((tmp & 0xFF00u) << 8))
            /* high bytes are zero */
#else
/* Swap the bytes inside each 16-bit half, then swap the two halves. */
# define SWAB4(CH, tmp)  (tmp = (CH), \
            tmp = ((tmp & 0x00FF00FFu) << 8) + ((tmp >> 8) & 0x00FF00FFu), \
            ((tmp & 0x0000FFFFu) << 16) + ((tmp >> 16) & 0x0000FFFFu))
#endif
747  |  | Py_LOCAL_INLINE(Py_ssize_t)  | 
748  |  | STRINGLIB(utf32_encode)(const STRINGLIB_CHAR *in,  | 
749  |  |                         Py_ssize_t len,  | 
750  |  |                         PY_UINT32_T **outptr,  | 
751  |  |                         int native_ordering)  | 
752  | 0  | { | 
753  | 0  |     PY_UINT32_T *out = *outptr;  | 
754  | 0  |     const STRINGLIB_CHAR *end = in + len;  | 
755  | 0  |     if (native_ordering) { | 
756  | 0  |         const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);  | 
757  | 0  |         while (in < unrolled_end) { | 
758  |  | #if STRINGLIB_SIZEOF_CHAR > 1  | 
759  |  |             /* check if any character is a surrogate character */  | 
760  | 0  |             if (((in[0] ^ 0xd800) &  | 
761  | 0  |                  (in[1] ^ 0xd800) &  | 
762  | 0  |                  (in[2] ^ 0xd800) &  | 
763  | 0  |                  (in[3] ^ 0xd800) & 0xf800) == 0)  | 
764  | 0  |                 break;  | 
765  | 0  | #endif  | 
766  | 0  |             out[0] = in[0];  | 
767  | 0  |             out[1] = in[1];  | 
768  | 0  |             out[2] = in[2];  | 
769  | 0  |             out[3] = in[3];  | 
770  | 0  |             in += 4; out += 4;  | 
771  | 0  |         }  | 
772  | 0  |         while (in < end) { | 
773  | 0  |             Py_UCS4 ch;  | 
774  | 0  |             ch = *in++;  | 
775  |  | #if STRINGLIB_SIZEOF_CHAR > 1  | 
776  | 0  |             if (Py_UNICODE_IS_SURROGATE(ch)) { | 
777  |  |                 /* reject surrogate characters (U+D800-U+DFFF) */  | 
778  | 0  |                 goto fail;  | 
779  | 0  |             }  | 
780  | 0  | #endif  | 
781  | 0  |             *out++ = ch;  | 
782  | 0  |         }  | 
783  | 0  |     } else { | 
784  | 0  |         const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);  | 
785  | 0  |         while (in < unrolled_end) { | 
786  |  | #if STRINGLIB_SIZEOF_CHAR > 1  | 
787  |  |             Py_UCS4 ch1, ch2, ch3, ch4;  | 
788  |  |             /* check if any character is a surrogate character */  | 
789  | 0  |             if (((in[0] ^ 0xd800) &  | 
790  | 0  |                  (in[1] ^ 0xd800) &  | 
791  | 0  |                  (in[2] ^ 0xd800) &  | 
792  | 0  |                  (in[3] ^ 0xd800) & 0xf800) == 0)  | 
793  | 0  |                 break;  | 
794  | 0  | #endif  | 
795  | 0  |             out[0] = SWAB4(in[0], ch1);  | 
796  | 0  |             out[1] = SWAB4(in[1], ch2);  | 
797  | 0  |             out[2] = SWAB4(in[2], ch3);  | 
798  | 0  |             out[3] = SWAB4(in[3], ch4);  | 
799  | 0  |             in += 4; out += 4;  | 
800  | 0  |         }  | 
801  | 0  |         while (in < end) { | 
802  | 0  |             Py_UCS4 ch = *in++;  | 
803  |  | #if STRINGLIB_SIZEOF_CHAR > 1  | 
804  | 0  |             if (Py_UNICODE_IS_SURROGATE(ch)) { | 
805  |  |                 /* reject surrogate characters (U+D800-U+DFFF) */  | 
806  | 0  |                 goto fail;  | 
807  | 0  |             }  | 
808  | 0  | #endif  | 
809  | 0  |             *out++ = SWAB4(ch, ch);  | 
810  | 0  |         }  | 
811  | 0  |     }  | 
812  | 0  |     *outptr = out;  | 
813  | 0  |     return len;  | 
814  |  | #if STRINGLIB_SIZEOF_CHAR > 1  | 
815  | 0  |   fail:  | 
816  | 0  |     *outptr = out;  | 
817  | 0  |     return len - (end - in + 1);  | 
818  |  | #endif  | 
819  | 0  | } Unexecuted instantiation: unicodeobject.c:ucs1lib_utf32_encode Unexecuted instantiation: unicodeobject.c:ucs2lib_utf32_encode Unexecuted instantiation: unicodeobject.c:ucs4lib_utf32_encode  | 
820  |  | #undef SWAB4  | 
821  |  |  | 
822  |  | #endif  |