/src/Python-3.8.3/Objects/unicodeobject.c
Line  | Count  | Source  | 
1  |  | /*  | 
2  |  |  | 
3  |  | Unicode implementation based on original code by Fredrik Lundh,  | 
4  |  | modified by Marc-Andre Lemburg <mal@lemburg.com>.  | 
5  |  |  | 
6  |  | Major speed upgrades to the method implementations at the Reykjavik  | 
7  |  | NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.  | 
8  |  |  | 
9  |  | Copyright (c) Corporation for National Research Initiatives.  | 
10  |  |  | 
11  |  | --------------------------------------------------------------------  | 
12  |  | The original string type implementation is:  | 
13  |  |  | 
14  |  |   Copyright (c) 1999 by Secret Labs AB  | 
15  |  |   Copyright (c) 1999 by Fredrik Lundh  | 
16  |  |  | 
17  |  | By obtaining, using, and/or copying this software and/or its  | 
18  |  | associated documentation, you agree that you have read, understood,  | 
19  |  | and will comply with the following terms and conditions:  | 
20  |  |  | 
21  |  | Permission to use, copy, modify, and distribute this software and its  | 
22  |  | associated documentation for any purpose and without fee is hereby  | 
23  |  | granted, provided that the above copyright notice appears in all  | 
24  |  | copies, and that both that copyright notice and this permission notice  | 
25  |  | appear in supporting documentation, and that the name of Secret Labs  | 
26  |  | AB or the author not be used in advertising or publicity pertaining to  | 
27  |  | distribution of the software without specific, written prior  | 
28  |  | permission.  | 
29  |  |  | 
30  |  | SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO  | 
31  |  | THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND  | 
32  |  | FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR  | 
33  |  | ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES  | 
34  |  | WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN  | 
35  |  | ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT  | 
36  |  | OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.  | 
37  |  | --------------------------------------------------------------------  | 
38  |  |  | 
39  |  | */  | 
40  |  |  | 
41  |  | #define PY_SSIZE_T_CLEAN  | 
42  |  | #include "Python.h"  | 
43  |  | #include "pycore_initconfig.h"  | 
44  |  | #include "pycore_fileutils.h"  | 
45  |  | #include "pycore_object.h"  | 
46  |  | #include "pycore_pylifecycle.h"  | 
47  |  | #include "pycore_pystate.h"  | 
48  |  | #include "ucnhash.h"  | 
49  |  | #include "bytes_methods.h"  | 
50  |  | #include "stringlib/eq.h"  | 
51  |  |  | 
52  |  | #ifdef MS_WINDOWS  | 
53  |  | #include <windows.h>  | 
54  |  | #endif  | 
55  |  |  | 
56  |  | /* Uncomment to display statistics on interned strings at exit when  | 
57  |  |    using Valgrind or Insure++. */  | 
58  |  | /* #define INTERNED_STATS 1 */  | 
59  |  |  | 
60  |  |  | 
61  |  | /*[clinic input]  | 
62  |  | class str "PyObject *" "&PyUnicode_Type"  | 
63  |  | [clinic start generated code]*/  | 
64  |  | /*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/  | 
65  |  |  | 
66  |  | /*[python input]  | 
67  |  | class Py_UCS4_converter(CConverter):  | 
68  |  |     type = 'Py_UCS4'  | 
69  |  |     converter = 'convert_uc'  | 
70  |  |  | 
71  |  |     def converter_init(self):  | 
72  |  |         if self.default is not unspecified:  | 
73  |  |             self.c_default = ascii(self.default)  | 
74  |  |             if len(self.c_default) > 4 or self.c_default[0] != "'":  | 
75  |  |                 self.c_default = hex(ord(self.default))  | 
76  |  |  | 
77  |  | [python start generated code]*/  | 
78  |  | /*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/  | 
79  |  |  | 
80  |  | /* --- Globals ------------------------------------------------------------  | 
81  |  |  | 
82  |  | NOTE: In the interpreter's initialization phase, some globals are currently  | 
83  |  |       initialized dynamically as needed. In the process Unicode objects may  | 
84  |  |       be created before the Unicode type is ready.  | 
85  |  |  | 
86  |  | */  | 
87  |  |  | 
88  |  |  | 
89  |  | #ifdef __cplusplus  | 
90  |  | extern "C" { | 
91  |  | #endif  | 
92  |  |  | 
93  |  | /* Largest valid Unicode code point (unchanged since Unicode 6.0): 0x10ffff (1,114,111) */  | 
94  | 16.7k  | #define MAX_UNICODE 0x10ffff  | 
95  |  | /* _PyUnicode_CHECK(op): sanity check used inside the assertions of this file.  With Py_DEBUG it runs _PyUnicode_CheckConsistency(op, 0), i.e. with check_content == 0 so the O(n) content scan is skipped; otherwise it is a plain PyUnicode_Check(op). */  | 
96  |  | #ifdef Py_DEBUG  | 
97  |  | #  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)  | 
98  |  | #else  | 
99  |  | #  define _PyUnicode_CHECK(op) PyUnicode_Check(op)  | 
100  |  | #endif  | 
101  |  |  | 
/* Raw field accessors.

   The underscore-prefixed macros below are plain casts followed by a member
   access: they expand to lvalues and perform no checks on `op` (only
   _PyUnicode_KIND and _PyUnicode_GET_LENGTH assert _PyUnicode_CHECK(op)). */
#define _PyUnicode_UTF8(op)         (((PyCompactUnicodeObject *)(op))->utf8)
#define _PyUnicode_UTF8_LENGTH(op)  (((PyCompactUnicodeObject *)(op))->utf8_length)
#define _PyUnicode_WSTR(op)         (((PyASCIIObject *)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)  (((PyCompactUnicodeObject *)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)       (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)        (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)         (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_DATA_ANY(op)     (((PyUnicodeObject *)(op))->data.any)

#define _PyUnicode_KIND(op) \
    (assert(_PyUnicode_CHECK(op)), ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), ((PyASCIIObject *)(op))->length)

/* Checked UTF-8 accessors for ready strings.

   For a compact ASCII string the UTF-8 bytes are the bytes that start right
   after the PyASCIIObject header and the UTF-8 length is the `length` field;
   for every other string the utf8 / utf8_length fields are read. */
#define PyUnicode_UTF8(op)                                  \
    (assert(_PyUnicode_CHECK(op)),                          \
     assert(PyUnicode_IS_READY(op)),                        \
     PyUnicode_IS_COMPACT_ASCII(op)                         \
         ? ((char *)((PyASCIIObject *)(op) + 1))            \
         : _PyUnicode_UTF8(op))
#define PyUnicode_UTF8_LENGTH(op)                           \
    (assert(_PyUnicode_CHECK(op)),                          \
     assert(PyUnicode_IS_READY(op)),                        \
     PyUnicode_IS_COMPACT_ASCII(op)                         \
         ? ((PyASCIIObject *)(op))->length                  \
         : _PyUnicode_UTF8_LENGTH(op))
136  |  |  | 
/* File-local replacement for PyUnicode_READY(): evaluates to 0 when `op` is
   already ready, otherwise to the result of _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                                     \
    (assert(_PyUnicode_CHECK(op)),                              \
     (!PyUnicode_IS_READY(op) ? _PyUnicode_Ready(op) : 0))

/* True if the utf8 pointer of `op` is the same address as its character
   data.  Must not be used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                               \
    (assert(_PyUnicode_CHECK(op)),                              \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),                   \
     (PyUnicode_DATA(op) == _PyUnicode_UTF8(op)))
/* True if the wstr pointer of `op` is the same address as its character
   data, i.e. the wchar_t representation aliases the canonical buffer.

   Every reference in the expansion goes through the macro argument `op`;
   the macro must not rely on the caller having a local variable with any
   particular name. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
151  |  |  | 
/* True if the Unicode object owns a UTF-8 buffer of its own: it is not a
   compact ASCII string, its utf8 pointer is set, and that pointer differs
   from the character data (so it is not shared with other data). */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                          \
    (!PyUnicode_IS_COMPACT_ASCII(op)                            \
     && _PyUnicode_UTF8(op) != NULL                             \
     && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))

/* True if the Unicode object owns a wstr buffer of its own: its wstr
   pointer is set and either the string is not ready yet or wstr differs
   from the character data (so it is not shared with other data). */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                          \
    (_PyUnicode_WSTR(op) != NULL                                \
     && (!PyUnicode_IS_READY(op)                                \
         || _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))
165  |  |  | 
/* Generic helper macro to copy characters from one integer width to another.

   from_type and to_type must be valid type names.  begin and end delimit
   the source characters and are cast to "const from_type *"; to is cast to
   "to_type *" and must point to a buffer with room for (end - begin)
   elements.  Every element is converted with a plain cast to to_type.

   The first loop is unrolled by hand and copies four elements per
   iteration; the second loop copies the remaining 0..3 elements. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                        \
        to_type *_dst = (to_type *)(to);                        \
        const from_type *_src = (const from_type *)(begin);     \
        const from_type *_stop = (const from_type *)(end);      \
        Py_ssize_t _count = _stop - _src;                       \
        const from_type *_stop4 =                               \
            _src + _Py_SIZE_ROUND_DOWN(_count, 4);              \
        for (; _src < _stop4; _src += 4, _dst += 4) {           \
            _dst[0] = (to_type) _src[0];                        \
            _dst[1] = (to_type) _src[1];                        \
            _dst[2] = (to_type) _src[2];                        \
            _dst[3] = (to_type) _src[3];                        \
        }                                                       \
        for (; _src < _stop; ++_src, ++_dst) {                  \
            *_dst = (to_type) *_src;                            \
        }                                                       \
    } while (0)
189  |  |  | 
190  |  | #ifdef MS_WINDOWS  | 
191  |  |    /* On Windows, overallocating by 50% works best.  NOTE(review): the factor is presumably a divisor of the requested size (1/2 = 50%) -- confirm at the use site. */  | 
192  |  | #  define OVERALLOCATE_FACTOR 2  | 
193  |  | #else  | 
194  |  |    /* On Linux, overallocating by 25% works best (presumably 1/4 of the requested size -- confirm at the use site). */  | 
195  | 12.6k  | #  define OVERALLOCATE_FACTOR 4  | 
196  |  | #endif  | 
197  |  |  | 
198  |  | /* This dictionary holds all interned unicode strings.  Note that references  | 
199  |  |    to strings in this dictionary are *not* counted in the string's ob_refcnt.  | 
200  |  |    When the interned string reaches a refcnt of 0 the string deallocation  | 
201  |  |    function will delete the reference from this dictionary.  | 
202  |  |  | 
203  |  |    Another way to look at this is to say that the actual reference  | 
204  |  |    count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)  | 
205  |  | */  | 
206  |  | static PyObject *interned = NULL;  | 
207  |  |  | 
208  |  | /* The empty Unicode object is shared to improve performance.  It starts out NULL and is created on first use by _Py_INCREF_UNICODE_EMPTY(). */  | 
209  |  | static PyObject *unicode_empty = NULL;  | 
210  |  |  | 
/* Take a new reference to the shared empty string.

   If the singleton does not exist yet it is created with PyUnicode_New(0, 0)
   and, on success, one extra reference is taken for the caller.  If the
   creation fails, unicode_empty is left NULL and no reference is taken, so
   callers must be prepared for unicode_empty == NULL afterwards. */
#define _Py_INCREF_UNICODE_EMPTY()                                      \
    do {                                                                \
        if (unicode_empty == NULL) {                                    \
            unicode_empty = PyUnicode_New(0, 0);                        \
            if (unicode_empty != NULL) {                                \
                Py_INCREF(unicode_empty);                               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1));  \
            }                                                           \
        }                                                               \
        else {                                                          \
            Py_INCREF(unicode_empty);                                   \
        }                                                               \
    } while (0)

/* Return a new reference to the shared empty string from the enclosing
   function (the returned pointer is NULL if the singleton could not be
   created). */
#define _Py_RETURN_UNICODE_EMPTY()                                      \
    do {                                                                \
        _Py_INCREF_UNICODE_EMPTY();                                     \
        return unicode_empty;                                           \
    } while (0)
229  |  |  | 
230  |  | static inline void  | 
231  |  | unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,  | 
232  |  |              Py_ssize_t start, Py_ssize_t length)  | 
233  | 0  | { | 
234  | 0  |     assert(0 <= start);  | 
235  | 0  |     assert(kind != PyUnicode_WCHAR_KIND);  | 
236  | 0  |     switch (kind) { | 
237  | 0  |     case PyUnicode_1BYTE_KIND: { | 
238  | 0  |         assert(value <= 0xff);  | 
239  | 0  |         Py_UCS1 ch = (unsigned char)value;  | 
240  | 0  |         Py_UCS1 *to = (Py_UCS1 *)data + start;  | 
241  | 0  |         memset(to, ch, length);  | 
242  | 0  |         break;  | 
243  | 0  |     }  | 
244  | 0  |     case PyUnicode_2BYTE_KIND: { | 
245  | 0  |         assert(value <= 0xffff);  | 
246  | 0  |         Py_UCS2 ch = (Py_UCS2)value;  | 
247  | 0  |         Py_UCS2 *to = (Py_UCS2 *)data + start;  | 
248  | 0  |         const Py_UCS2 *end = to + length;  | 
249  | 0  |         for (; to < end; ++to) *to = ch;  | 
250  | 0  |         break;  | 
251  | 0  |     }  | 
252  | 0  |     case PyUnicode_4BYTE_KIND: { | 
253  | 0  |         assert(value <= MAX_UNICODE);  | 
254  | 0  |         Py_UCS4 ch = value;  | 
255  | 0  |         Py_UCS4 * to = (Py_UCS4 *)data + start;  | 
256  | 0  |         const Py_UCS4 *end = to + length;  | 
257  | 0  |         for (; to < end; ++to) *to = ch;  | 
258  | 0  |         break;  | 
259  | 0  |     }  | 
260  | 0  |     default: Py_UNREACHABLE();  | 
261  | 0  |     }  | 
262  | 0  | }  | 
263  |  |  | 
264  |  |  | 
265  |  | /* Forward declaration */  | 
266  |  | static inline int  | 
267  |  | _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);  | 
268  |  | static PyObject *  | 
269  |  | unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,  | 
270  |  |                     const char *errors);  | 
271  |  | static PyObject *  | 
272  |  | unicode_decode_utf8(const char *s, Py_ssize_t size,  | 
273  |  |                     _Py_error_handler error_handler, const char *errors,  | 
274  |  |                     Py_ssize_t *consumed);  | 
275  |  |  | 
276  |  | /* List of static strings (NOTE(review): presumably the head of a chain of _Py_Identifier records -- confirm where it is updated). */  | 
277  |  | static _Py_Identifier *static_strings = NULL;  | 
278  |  |  | 
279  |  | /* Single character Unicode strings in the Latin-1 range are being  | 
280  |  |    shared as well.  Slot ch caches the one-character string for code point ch; slots start out NULL and are filled on demand (see unicode_result_ready). */  | 
281  |  | static PyObject *unicode_latin1[256] = {NULL}; | 
282  |  |  | 
/* Fast detection of the most frequent whitespace characters.

   The table has one entry per 7-bit code point (128 entries); an entry is
   1 for the whitespace characters listed below and 0 for everything else. */
const unsigned char _Py_ascii_whitespace[128] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1,     /* SPACE */
};
313  |  |  | 
314  |  | /* forward */  | 
315  |  | static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);  | 
316  |  | static PyObject* get_latin1_char(unsigned char ch);  | 
317  |  | static int unicode_modifiable(PyObject *unicode);  | 
318  |  |  | 
319  |  |  | 
320  |  | static PyObject *  | 
321  |  | _PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);  | 
322  |  | static PyObject *  | 
323  |  | _PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);  | 
324  |  | static PyObject *  | 
325  |  | _PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);  | 
326  |  |  | 
327  |  | static PyObject *  | 
328  |  | unicode_encode_call_errorhandler(const char *errors,  | 
329  |  |        PyObject **errorHandler,const char *encoding, const char *reason,  | 
330  |  |        PyObject *unicode, PyObject **exceptionObject,  | 
331  |  |        Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);  | 
332  |  |  | 
333  |  | static void  | 
334  |  | raise_encode_exception(PyObject **exceptionObject,  | 
335  |  |                        const char *encoding,  | 
336  |  |                        PyObject *unicode,  | 
337  |  |                        Py_ssize_t startpos, Py_ssize_t endpos,  | 
338  |  |                        const char *reason);  | 
339  |  |  | 
/* Fast detection of the line break characters in the 7-bit range.

   The table has one entry per 7-bit code point (128 entries); an entry is
   1 for the line break characters listed below and 0 for everything else. */
static const unsigned char ascii_linebreak[128] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
};
367  |  |  | 
368  |  | static int convert_uc(PyObject *obj, void *addr);  | 
369  |  |  | 
370  |  | #include "clinic/unicodeobject.c.h"  | 
371  |  |  | 
372  |  | _Py_error_handler  | 
373  |  | _Py_GetErrorHandler(const char *errors)  | 
374  | 230  | { | 
375  | 230  |     if (errors == NULL || strcmp(errors, "strict") == 0) { | 
376  | 0  |         return _Py_ERROR_STRICT;  | 
377  | 0  |     }  | 
378  | 230  |     if (strcmp(errors, "surrogateescape") == 0) { | 
379  | 230  |         return _Py_ERROR_SURROGATEESCAPE;  | 
380  | 230  |     }  | 
381  | 0  |     if (strcmp(errors, "replace") == 0) { | 
382  | 0  |         return _Py_ERROR_REPLACE;  | 
383  | 0  |     }  | 
384  | 0  |     if (strcmp(errors, "ignore") == 0) { | 
385  | 0  |         return _Py_ERROR_IGNORE;  | 
386  | 0  |     }  | 
387  | 0  |     if (strcmp(errors, "backslashreplace") == 0) { | 
388  | 0  |         return _Py_ERROR_BACKSLASHREPLACE;  | 
389  | 0  |     }  | 
390  | 0  |     if (strcmp(errors, "surrogatepass") == 0) { | 
391  | 0  |         return _Py_ERROR_SURROGATEPASS;  | 
392  | 0  |     }  | 
393  | 0  |     if (strcmp(errors, "xmlcharrefreplace") == 0) { | 
394  | 0  |         return _Py_ERROR_XMLCHARREFREPLACE;  | 
395  | 0  |     }  | 
396  | 0  |     return _Py_ERROR_OTHER;  | 
397  | 0  | }  | 
398  |  |  | 
399  |  |  | 
400  |  | static _Py_error_handler  | 
401  |  | get_error_handler_wide(const wchar_t *errors)  | 
402  | 4.99k  | { | 
403  | 4.99k  |     if (errors == NULL || wcscmp(errors, L"strict") == 0) { | 
404  | 0  |         return _Py_ERROR_STRICT;  | 
405  | 0  |     }  | 
406  | 4.99k  |     if (wcscmp(errors, L"surrogateescape") == 0) { | 
407  | 4.99k  |         return _Py_ERROR_SURROGATEESCAPE;  | 
408  | 4.99k  |     }  | 
409  | 0  |     if (wcscmp(errors, L"replace") == 0) { | 
410  | 0  |         return _Py_ERROR_REPLACE;  | 
411  | 0  |     }  | 
412  | 0  |     if (wcscmp(errors, L"ignore") == 0) { | 
413  | 0  |         return _Py_ERROR_IGNORE;  | 
414  | 0  |     }  | 
415  | 0  |     if (wcscmp(errors, L"backslashreplace") == 0) { | 
416  | 0  |         return _Py_ERROR_BACKSLASHREPLACE;  | 
417  | 0  |     }  | 
418  | 0  |     if (wcscmp(errors, L"surrogatepass") == 0) { | 
419  | 0  |         return _Py_ERROR_SURROGATEPASS;  | 
420  | 0  |     }  | 
421  | 0  |     if (wcscmp(errors, L"xmlcharrefreplace") == 0) { | 
422  | 0  |         return _Py_ERROR_XMLCHARREFREPLACE;  | 
423  | 0  |     }  | 
424  | 0  |     return _Py_ERROR_OTHER;  | 
425  | 0  | }  | 
426  |  |  | 
427  |  |  | 
428  |  | /* The max unicode value is always 0x10FFFF while using the PEP-393 API.  | 
429  |  |    This function is kept for backward compatibility with the old API. */  | 
430  |  | Py_UNICODE  | 
431  |  | PyUnicode_GetMax(void)  | 
432  | 0  | { | 
433  | 0  | #ifdef Py_UNICODE_WIDE  | 
434  | 0  |     return 0x10FFFF;  | 
435  |  | #else  | 
436  |  |     /* This is actually an illegal character, so it should  | 
437  |  |        not be passed to unichr. */  | 
438  |  |     return 0xFFFF;  | 
439  |  | #endif  | 
440  | 0  | }  | 
441  |  |  | 
442  |  | int  | 
443  |  | _PyUnicode_CheckConsistency(PyObject *op, int check_content)  | 
444  | 0  | { | 
445  | 0  | #define CHECK(expr) \  | 
446  | 0  |     do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0) | 
447  |  | 
  | 
448  | 0  |     PyASCIIObject *ascii;  | 
449  | 0  |     unsigned int kind;  | 
450  |  | 
  | 
451  | 0  |     assert(op != NULL);  | 
452  | 0  |     CHECK(PyUnicode_Check(op));  | 
453  |  | 
  | 
454  | 0  |     ascii = (PyASCIIObject *)op;  | 
455  | 0  |     kind = ascii->state.kind;  | 
456  |  | 
  | 
457  | 0  |     if (ascii->state.ascii == 1 && ascii->state.compact == 1) { | 
458  | 0  |         CHECK(kind == PyUnicode_1BYTE_KIND);  | 
459  | 0  |         CHECK(ascii->state.ready == 1);  | 
460  | 0  |     }  | 
461  | 0  |     else { | 
462  | 0  |         PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;  | 
463  | 0  |         void *data;  | 
464  |  | 
  | 
465  | 0  |         if (ascii->state.compact == 1) { | 
466  | 0  |             data = compact + 1;  | 
467  | 0  |             CHECK(kind == PyUnicode_1BYTE_KIND  | 
468  | 0  |                                  || kind == PyUnicode_2BYTE_KIND  | 
469  | 0  |                                  || kind == PyUnicode_4BYTE_KIND);  | 
470  | 0  |             CHECK(ascii->state.ascii == 0);  | 
471  | 0  |             CHECK(ascii->state.ready == 1);  | 
472  | 0  |             CHECK(compact->utf8 != data);  | 
473  | 0  |         }  | 
474  | 0  |         else { | 
475  | 0  |             PyUnicodeObject *unicode = (PyUnicodeObject *)op;  | 
476  |  | 
  | 
477  | 0  |             data = unicode->data.any;  | 
478  | 0  |             if (kind == PyUnicode_WCHAR_KIND) { | 
479  | 0  |                 CHECK(ascii->length == 0);  | 
480  | 0  |                 CHECK(ascii->hash == -1);  | 
481  | 0  |                 CHECK(ascii->state.compact == 0);  | 
482  | 0  |                 CHECK(ascii->state.ascii == 0);  | 
483  | 0  |                 CHECK(ascii->state.ready == 0);  | 
484  | 0  |                 CHECK(ascii->state.interned == SSTATE_NOT_INTERNED);  | 
485  | 0  |                 CHECK(ascii->wstr != NULL);  | 
486  | 0  |                 CHECK(data == NULL);  | 
487  | 0  |                 CHECK(compact->utf8 == NULL);  | 
488  | 0  |             }  | 
489  | 0  |             else { | 
490  | 0  |                 CHECK(kind == PyUnicode_1BYTE_KIND  | 
491  | 0  |                                      || kind == PyUnicode_2BYTE_KIND  | 
492  | 0  |                                      || kind == PyUnicode_4BYTE_KIND);  | 
493  | 0  |                 CHECK(ascii->state.compact == 0);  | 
494  | 0  |                 CHECK(ascii->state.ready == 1);  | 
495  | 0  |                 CHECK(data != NULL);  | 
496  | 0  |                 if (ascii->state.ascii) { | 
497  | 0  |                     CHECK(compact->utf8 == data);  | 
498  | 0  |                     CHECK(compact->utf8_length == ascii->length);  | 
499  | 0  |                 }  | 
500  | 0  |                 else  | 
501  | 0  |                     CHECK(compact->utf8 != data);  | 
502  | 0  |             }  | 
503  | 0  |         }  | 
504  | 0  |         if (kind != PyUnicode_WCHAR_KIND) { | 
505  | 0  |             if (  | 
506  |  | #if SIZEOF_WCHAR_T == 2  | 
507  |  |                 kind == PyUnicode_2BYTE_KIND  | 
508  |  | #else  | 
509  | 0  |                 kind == PyUnicode_4BYTE_KIND  | 
510  | 0  | #endif  | 
511  | 0  |                )  | 
512  | 0  |             { | 
513  | 0  |                 CHECK(ascii->wstr == data);  | 
514  | 0  |                 CHECK(compact->wstr_length == ascii->length);  | 
515  | 0  |             } else  | 
516  | 0  |                 CHECK(ascii->wstr != data);  | 
517  | 0  |         }  | 
518  |  | 
  | 
519  | 0  |         if (compact->utf8 == NULL)  | 
520  | 0  |             CHECK(compact->utf8_length == 0);  | 
521  | 0  |         if (ascii->wstr == NULL)  | 
522  | 0  |             CHECK(compact->wstr_length == 0);  | 
523  | 0  |     }  | 
524  |  |  | 
525  |  |     /* check that the best kind is used: O(n) operation */  | 
526  | 0  |     if (check_content && kind != PyUnicode_WCHAR_KIND) { | 
527  | 0  |         Py_ssize_t i;  | 
528  | 0  |         Py_UCS4 maxchar = 0;  | 
529  | 0  |         void *data;  | 
530  | 0  |         Py_UCS4 ch;  | 
531  |  | 
  | 
532  | 0  |         data = PyUnicode_DATA(ascii);  | 
533  | 0  |         for (i=0; i < ascii->length; i++)  | 
534  | 0  |         { | 
535  | 0  |             ch = PyUnicode_READ(kind, data, i);  | 
536  | 0  |             if (ch > maxchar)  | 
537  | 0  |                 maxchar = ch;  | 
538  | 0  |         }  | 
539  | 0  |         if (kind == PyUnicode_1BYTE_KIND) { | 
540  | 0  |             if (ascii->state.ascii == 0) { | 
541  | 0  |                 CHECK(maxchar >= 128);  | 
542  | 0  |                 CHECK(maxchar <= 255);  | 
543  | 0  |             }  | 
544  | 0  |             else  | 
545  | 0  |                 CHECK(maxchar < 128);  | 
546  | 0  |         }  | 
547  | 0  |         else if (kind == PyUnicode_2BYTE_KIND) { | 
548  | 0  |             CHECK(maxchar >= 0x100);  | 
549  | 0  |             CHECK(maxchar <= 0xFFFF);  | 
550  | 0  |         }  | 
551  | 0  |         else { | 
552  | 0  |             CHECK(maxchar >= 0x10000);  | 
553  | 0  |             CHECK(maxchar <= MAX_UNICODE);  | 
554  | 0  |         }  | 
555  | 0  |         CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);  | 
556  | 0  |     }  | 
557  | 0  |     return 1;  | 
558  |  | 
  | 
559  | 0  | #undef CHECK  | 
560  | 0  | }  | 
561  |  |  | 
562  |  |  | 
563  |  | static PyObject*  | 
564  |  | unicode_result_wchar(PyObject *unicode)  | 
565  | 0  | { | 
566  | 0  | #ifndef Py_DEBUG  | 
567  | 0  |     Py_ssize_t len;  | 
568  |  | 
  | 
569  | 0  |     len = _PyUnicode_WSTR_LENGTH(unicode);  | 
570  | 0  |     if (len == 0) { | 
571  | 0  |         Py_DECREF(unicode);  | 
572  | 0  |         _Py_RETURN_UNICODE_EMPTY();  | 
573  | 0  |     }  | 
574  |  |  | 
575  | 0  |     if (len == 1) { | 
576  | 0  |         wchar_t ch = _PyUnicode_WSTR(unicode)[0];  | 
577  | 0  |         if ((Py_UCS4)ch < 256) { | 
578  | 0  |             PyObject *latin1_char = get_latin1_char((unsigned char)ch);  | 
579  | 0  |             Py_DECREF(unicode);  | 
580  | 0  |             return latin1_char;  | 
581  | 0  |         }  | 
582  | 0  |     }  | 
583  |  |  | 
584  | 0  |     if (_PyUnicode_Ready(unicode) < 0) { | 
585  | 0  |         Py_DECREF(unicode);  | 
586  | 0  |         return NULL;  | 
587  | 0  |     }  | 
588  |  | #else  | 
589  |  |     assert(Py_REFCNT(unicode) == 1);  | 
590  |  |  | 
591  |  |     /* don't make the result ready in debug mode to ensure that the caller  | 
592  |  |        makes the string ready before using it */  | 
593  |  |     assert(_PyUnicode_CheckConsistency(unicode, 1));  | 
594  |  | #endif  | 
595  | 0  |     return unicode;  | 
596  | 0  | }  | 
597  |  |  | 
598  |  | static PyObject*  | 
599  |  | unicode_result_ready(PyObject *unicode)  | 
600  | 75.7k  | { | 
601  | 75.7k  |     Py_ssize_t length;  | 
602  |  |  | 
603  | 75.7k  |     length = PyUnicode_GET_LENGTH(unicode);  | 
604  | 75.7k  |     if (length == 0) { | 
605  | 0  |         if (unicode != unicode_empty) { | 
606  | 0  |             Py_DECREF(unicode);  | 
607  | 0  |             _Py_RETURN_UNICODE_EMPTY();  | 
608  | 0  |         }  | 
609  | 0  |         return unicode_empty;  | 
610  | 0  |     }  | 
611  |  |  | 
612  | 75.7k  |     if (length == 1) { | 
613  | 13  |         void *data = PyUnicode_DATA(unicode);  | 
614  | 13  |         int kind = PyUnicode_KIND(unicode);  | 
615  | 13  |         Py_UCS4 ch = PyUnicode_READ(kind, data, 0);  | 
616  | 13  |         if (ch < 256) { | 
617  | 13  |             PyObject *latin1_char = unicode_latin1[ch];  | 
618  | 13  |             if (latin1_char != NULL) { | 
619  | 11  |                 if (unicode != latin1_char) { | 
620  | 11  |                     Py_INCREF(latin1_char);  | 
621  | 11  |                     Py_DECREF(unicode);  | 
622  | 11  |                 }  | 
623  | 11  |                 return latin1_char;  | 
624  | 11  |             }  | 
625  | 2  |             else { | 
626  | 2  |                 assert(_PyUnicode_CheckConsistency(unicode, 1));  | 
627  | 2  |                 Py_INCREF(unicode);  | 
628  | 2  |                 unicode_latin1[ch] = unicode;  | 
629  | 2  |                 return unicode;  | 
630  | 2  |             }  | 
631  | 13  |         }  | 
632  | 13  |     }  | 
633  |  |  | 
634  | 75.7k  |     assert(_PyUnicode_CheckConsistency(unicode, 1));  | 
635  | 75.7k  |     return unicode;  | 
636  | 75.7k  | }  | 
637  |  |  | 
638  |  | static PyObject*  | 
639  |  | unicode_result(PyObject *unicode)  | 
640  | 5.14k  | { | 
641  | 5.14k  |     assert(_PyUnicode_CHECK(unicode));  | 
642  | 5.14k  |     if (PyUnicode_IS_READY(unicode))  | 
643  | 5.14k  |         return unicode_result_ready(unicode);  | 
644  | 0  |     else  | 
645  | 0  |         return unicode_result_wchar(unicode);  | 
646  | 5.14k  | }  | 
647  |  |  | 
648  |  | static PyObject*  | 
649  |  | unicode_result_unchanged(PyObject *unicode)  | 
650  | 3.80k  | { | 
651  | 3.80k  |     if (PyUnicode_CheckExact(unicode)) { | 
652  | 3.80k  |         if (PyUnicode_READY(unicode) == -1)  | 
653  | 0  |             return NULL;  | 
654  | 3.80k  |         Py_INCREF(unicode);  | 
655  | 3.80k  |         return unicode;  | 
656  | 3.80k  |     }  | 
657  | 0  |     else  | 
658  |  |         /* Subtype -- return genuine unicode string with the same value. */  | 
659  | 0  |         return _PyUnicode_Copy(unicode);  | 
660  | 3.80k  | }  | 
661  |  |  | 
662  |  | /* Implementation of the "backslashreplace" error handler for 8-bit encodings:  | 
663  |  |    ASCII, Latin1, UTF-8, etc. */  | 
664  |  | static char*  | 
665  |  | backslashreplace(_PyBytesWriter *writer, char *str,  | 
666  |  |                  PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)  | 
667  | 0  | { | 
668  | 0  |     Py_ssize_t size, i;  | 
669  | 0  |     Py_UCS4 ch;  | 
670  | 0  |     enum PyUnicode_Kind kind;  | 
671  | 0  |     void *data;  | 
672  |  | 
  | 
673  | 0  |     assert(PyUnicode_IS_READY(unicode));  | 
674  | 0  |     kind = PyUnicode_KIND(unicode);  | 
675  | 0  |     data = PyUnicode_DATA(unicode);  | 
676  |  | 
  | 
677  | 0  |     size = 0;  | 
678  |  |     /* determine replacement size */  | 
679  | 0  |     for (i = collstart; i < collend; ++i) { | 
680  | 0  |         Py_ssize_t incr;  | 
681  |  | 
  | 
682  | 0  |         ch = PyUnicode_READ(kind, data, i);  | 
683  | 0  |         if (ch < 0x100)  | 
684  | 0  |             incr = 2+2;  | 
685  | 0  |         else if (ch < 0x10000)  | 
686  | 0  |             incr = 2+4;  | 
687  | 0  |         else { | 
688  | 0  |             assert(ch <= MAX_UNICODE);  | 
689  | 0  |             incr = 2+8;  | 
690  | 0  |         }  | 
691  | 0  |         if (size > PY_SSIZE_T_MAX - incr) { | 
692  | 0  |             PyErr_SetString(PyExc_OverflowError,  | 
693  | 0  |                             "encoded result is too long for a Python string");  | 
694  | 0  |             return NULL;  | 
695  | 0  |         }  | 
696  | 0  |         size += incr;  | 
697  | 0  |     }  | 
698  |  |  | 
699  | 0  |     str = _PyBytesWriter_Prepare(writer, str, size);  | 
700  | 0  |     if (str == NULL)  | 
701  | 0  |         return NULL;  | 
702  |  |  | 
703  |  |     /* generate replacement */  | 
704  | 0  |     for (i = collstart; i < collend; ++i) { | 
705  | 0  |         ch = PyUnicode_READ(kind, data, i);  | 
706  | 0  |         *str++ = '\\';  | 
707  | 0  |         if (ch >= 0x00010000) { | 
708  | 0  |             *str++ = 'U';  | 
709  | 0  |             *str++ = Py_hexdigits[(ch>>28)&0xf];  | 
710  | 0  |             *str++ = Py_hexdigits[(ch>>24)&0xf];  | 
711  | 0  |             *str++ = Py_hexdigits[(ch>>20)&0xf];  | 
712  | 0  |             *str++ = Py_hexdigits[(ch>>16)&0xf];  | 
713  | 0  |             *str++ = Py_hexdigits[(ch>>12)&0xf];  | 
714  | 0  |             *str++ = Py_hexdigits[(ch>>8)&0xf];  | 
715  | 0  |         }  | 
716  | 0  |         else if (ch >= 0x100) { | 
717  | 0  |             *str++ = 'u';  | 
718  | 0  |             *str++ = Py_hexdigits[(ch>>12)&0xf];  | 
719  | 0  |             *str++ = Py_hexdigits[(ch>>8)&0xf];  | 
720  | 0  |         }  | 
721  | 0  |         else  | 
722  | 0  |             *str++ = 'x';  | 
723  | 0  |         *str++ = Py_hexdigits[(ch>>4)&0xf];  | 
724  | 0  |         *str++ = Py_hexdigits[ch&0xf];  | 
725  | 0  |     }  | 
726  | 0  |     return str;  | 
727  | 0  | }  | 
728  |  |  | 
729  |  | /* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:  | 
730  |  |    ASCII, Latin1, UTF-8, etc. */  | 
731  |  | static char*  | 
732  |  | xmlcharrefreplace(_PyBytesWriter *writer, char *str,  | 
733  |  |                   PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)  | 
734  | 0  | { | 
735  | 0  |     Py_ssize_t size, i;  | 
736  | 0  |     Py_UCS4 ch;  | 
737  | 0  |     enum PyUnicode_Kind kind;  | 
738  | 0  |     void *data;  | 
739  |  | 
  | 
740  | 0  |     assert(PyUnicode_IS_READY(unicode));  | 
741  | 0  |     kind = PyUnicode_KIND(unicode);  | 
742  | 0  |     data = PyUnicode_DATA(unicode);  | 
743  |  | 
  | 
744  | 0  |     size = 0;  | 
745  |  |     /* determine replacement size */  | 
746  | 0  |     for (i = collstart; i < collend; ++i) { | 
747  | 0  |         Py_ssize_t incr;  | 
748  |  | 
  | 
749  | 0  |         ch = PyUnicode_READ(kind, data, i);  | 
750  | 0  |         if (ch < 10)  | 
751  | 0  |             incr = 2+1+1;  | 
752  | 0  |         else if (ch < 100)  | 
753  | 0  |             incr = 2+2+1;  | 
754  | 0  |         else if (ch < 1000)  | 
755  | 0  |             incr = 2+3+1;  | 
756  | 0  |         else if (ch < 10000)  | 
757  | 0  |             incr = 2+4+1;  | 
758  | 0  |         else if (ch < 100000)  | 
759  | 0  |             incr = 2+5+1;  | 
760  | 0  |         else if (ch < 1000000)  | 
761  | 0  |             incr = 2+6+1;  | 
762  | 0  |         else { | 
763  | 0  |             assert(ch <= MAX_UNICODE);  | 
764  | 0  |             incr = 2+7+1;  | 
765  | 0  |         }  | 
766  | 0  |         if (size > PY_SSIZE_T_MAX - incr) { | 
767  | 0  |             PyErr_SetString(PyExc_OverflowError,  | 
768  | 0  |                             "encoded result is too long for a Python string");  | 
769  | 0  |             return NULL;  | 
770  | 0  |         }  | 
771  | 0  |         size += incr;  | 
772  | 0  |     }  | 
773  |  |  | 
774  | 0  |     str = _PyBytesWriter_Prepare(writer, str, size);  | 
775  | 0  |     if (str == NULL)  | 
776  | 0  |         return NULL;  | 
777  |  |  | 
778  |  |     /* generate replacement */  | 
779  | 0  |     for (i = collstart; i < collend; ++i) { | 
780  | 0  |         str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));  | 
781  | 0  |     }  | 
782  | 0  |     return str;  | 
783  | 0  | }  | 
784  |  |  | 
785  |  | /* --- Bloom Filters ----------------------------------------------------- */  | 
786  |  |  | 
787  |  | /* stuff to implement simple "bloom filters" for Unicode characters.  | 
788  |  |    to keep things simple, we use a single bitmask, using the least 5  | 
789  |  |    bits from each unicode characters as the bit index. */  | 
790  |  |  | 
791  |  | /* the linebreak mask is set up by Unicode_Init below */  | 
792  |  |  | 
793  |  | #if LONG_BIT >= 128  | 
794  |  | #define BLOOM_WIDTH 128  | 
795  |  | #elif LONG_BIT >= 64  | 
796  | 7.58k  | #define BLOOM_WIDTH 64  | 
797  |  | #elif LONG_BIT >= 32  | 
798  |  | #define BLOOM_WIDTH 32  | 
799  |  | #else  | 
800  |  | #error "LONG_BIT is smaller than 32"  | 
801  |  | #endif  | 
802  |  |  | 
803  | 7.44k  | #define BLOOM_MASK unsigned long  | 
804  |  |  | 
805  |  | static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;  | 
806  |  |  | 
807  | 3.75k  | #define BLOOM(mask, ch)     ((mask &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))  | 
808  |  |  | 
809  |  | #define BLOOM_LINEBREAK(ch)                                             \  | 
810  | 0  |     ((ch) < 128U ? ascii_linebreak[(ch)] :                              \  | 
811  | 0  |      (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))  | 
812  |  |  | 
813  |  | static inline BLOOM_MASK  | 
814  |  | make_bloom_mask(int kind, void* ptr, Py_ssize_t len)  | 
815  | 3.72k  | { | 
816  | 3.72k  | #define BLOOM_UPDATE(TYPE, MASK, PTR, LEN)             \  | 
817  | 3.72k  |     do {                                               \ | 
818  | 3.72k  |         TYPE *data = (TYPE *)PTR;                      \  | 
819  | 3.72k  |         TYPE *end = data + LEN;                        \  | 
820  | 3.72k  |         Py_UCS4 ch;                                    \  | 
821  | 7.55k  |         for (; data != end; data++) {                  \ | 
822  | 3.82k  |             ch = *data;                                \  | 
823  | 3.82k  |             MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \  | 
824  | 3.82k  |         }                                              \  | 
825  | 3.72k  |         break;                                         \  | 
826  | 3.72k  |     } while (0)  | 
827  |  |  | 
828  |  |     /* calculate simple bloom-style bitmask for a given unicode string */  | 
829  |  |  | 
830  | 3.72k  |     BLOOM_MASK mask;  | 
831  |  |  | 
832  | 3.72k  |     mask = 0;  | 
833  | 3.72k  |     switch (kind) { | 
834  | 3.71k  |     case PyUnicode_1BYTE_KIND:  | 
835  | 3.71k  |         BLOOM_UPDATE(Py_UCS1, mask, ptr, len);  | 
836  | 3.71k  |         break;  | 
837  | 14  |     case PyUnicode_2BYTE_KIND:  | 
838  | 14  |         BLOOM_UPDATE(Py_UCS2, mask, ptr, len);  | 
839  | 14  |         break;  | 
840  | 0  |     case PyUnicode_4BYTE_KIND:  | 
841  | 0  |         BLOOM_UPDATE(Py_UCS4, mask, ptr, len);  | 
842  | 0  |         break;  | 
843  | 0  |     default:  | 
844  | 0  |         Py_UNREACHABLE();  | 
845  | 3.72k  |     }  | 
846  | 3.72k  |     return mask;  | 
847  |  |  | 
848  | 3.72k  | #undef BLOOM_UPDATE  | 
849  | 3.72k  | }  | 
850  |  |  | 
851  |  | static int  | 
852  |  | ensure_unicode(PyObject *obj)  | 
853  | 5.00k  | { | 
854  | 5.00k  |     if (!PyUnicode_Check(obj)) { | 
855  | 0  |         PyErr_Format(PyExc_TypeError,  | 
856  | 0  |                      "must be str, not %.100s",  | 
857  | 0  |                      Py_TYPE(obj)->tp_name);  | 
858  | 0  |         return -1;  | 
859  | 0  |     }  | 
860  | 5.00k  |     return PyUnicode_READY(obj);  | 
861  | 5.00k  | }  | 
862  |  |  | 
863  |  | /* Compilation of templated routines */  | 
864  |  |  | 
865  |  | #include "stringlib/asciilib.h"  | 
866  |  | #include "stringlib/fastsearch.h"  | 
867  |  | #include "stringlib/partition.h"  | 
868  |  | #include "stringlib/split.h"  | 
869  |  | #include "stringlib/count.h"  | 
870  |  | #include "stringlib/find.h"  | 
871  |  | #include "stringlib/find_max_char.h"  | 
872  |  | #include "stringlib/undef.h"  | 
873  |  |  | 
874  |  | #include "stringlib/ucs1lib.h"  | 
875  |  | #include "stringlib/fastsearch.h"  | 
876  |  | #include "stringlib/partition.h"  | 
877  |  | #include "stringlib/split.h"  | 
878  |  | #include "stringlib/count.h"  | 
879  |  | #include "stringlib/find.h"  | 
880  |  | #include "stringlib/replace.h"  | 
881  |  | #include "stringlib/find_max_char.h"  | 
882  |  | #include "stringlib/undef.h"  | 
883  |  |  | 
884  |  | #include "stringlib/ucs2lib.h"  | 
885  |  | #include "stringlib/fastsearch.h"  | 
886  |  | #include "stringlib/partition.h"  | 
887  |  | #include "stringlib/split.h"  | 
888  |  | #include "stringlib/count.h"  | 
889  |  | #include "stringlib/find.h"  | 
890  |  | #include "stringlib/replace.h"  | 
891  |  | #include "stringlib/find_max_char.h"  | 
892  |  | #include "stringlib/undef.h"  | 
893  |  |  | 
894  |  | #include "stringlib/ucs4lib.h"  | 
895  |  | #include "stringlib/fastsearch.h"  | 
896  |  | #include "stringlib/partition.h"  | 
897  |  | #include "stringlib/split.h"  | 
898  |  | #include "stringlib/count.h"  | 
899  |  | #include "stringlib/find.h"  | 
900  |  | #include "stringlib/replace.h"  | 
901  |  | #include "stringlib/find_max_char.h"  | 
902  |  | #include "stringlib/undef.h"  | 
903  |  |  | 
904  |  | #include "stringlib/unicodedefs.h"  | 
905  |  | #include "stringlib/fastsearch.h"  | 
906  |  | #include "stringlib/count.h"  | 
907  |  | #include "stringlib/find.h"  | 
908  |  | #include "stringlib/undef.h"  | 
909  |  |  | 
910  |  | /* --- Unicode Object ----------------------------------------------------- */  | 
911  |  |  | 
912  |  | static inline Py_ssize_t  | 
913  |  | findchar(const void *s, int kind,  | 
914  |  |          Py_ssize_t size, Py_UCS4 ch,  | 
915  |  |          int direction)  | 
916  | 2.67k  | { | 
917  | 2.67k  |     switch (kind) { | 
918  | 2.67k  |     case PyUnicode_1BYTE_KIND:  | 
919  | 2.67k  |         if ((Py_UCS1) ch != ch)  | 
920  | 0  |             return -1;  | 
921  | 2.67k  |         if (direction > 0)  | 
922  | 2.58k  |             return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);  | 
923  | 84  |         else  | 
924  | 84  |             return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);  | 
925  | 0  |     case PyUnicode_2BYTE_KIND:  | 
926  | 0  |         if ((Py_UCS2) ch != ch)  | 
927  | 0  |             return -1;  | 
928  | 0  |         if (direction > 0)  | 
929  | 0  |             return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);  | 
930  | 0  |         else  | 
931  | 0  |             return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);  | 
932  | 0  |     case PyUnicode_4BYTE_KIND:  | 
933  | 0  |         if (direction > 0)  | 
934  | 0  |             return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);  | 
935  | 0  |         else  | 
936  | 0  |             return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);  | 
937  | 0  |     default:  | 
938  | 0  |         Py_UNREACHABLE();  | 
939  | 2.67k  |     }  | 
940  | 2.67k  | }  | 
941  |  |  | 
#ifdef Py_DEBUG
/* Debug aid: after a string has grown from `old_length` characters to its
   current length, overwrite the added characters with 0xff bytes so that
   bugs are detected earlier.  Does nothing if the string did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least
   for ASCII and UCS-4 strings: U+00FF is invalid in ASCII and U+FFFFFFFF is
   an invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    const int kind = PyUnicode_KIND(unicode);
    Py_UCS1 *base = PyUnicode_1BYTE_DATA(unicode);
    const Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        /* `kind` is used as the number of bytes per character here */
        memset(base + old_length * kind, 0xff,
               (new_length - old_length) * kind);
    }
}
#endif
960  |  |  | 
961  |  | static PyObject*  | 
962  |  | resize_compact(PyObject *unicode, Py_ssize_t length)  | 
963  | 6.42k  | { | 
964  | 6.42k  |     Py_ssize_t char_size;  | 
965  | 6.42k  |     Py_ssize_t struct_size;  | 
966  | 6.42k  |     Py_ssize_t new_size;  | 
967  | 6.42k  |     int share_wstr;  | 
968  | 6.42k  |     PyObject *new_unicode;  | 
969  |  | #ifdef Py_DEBUG  | 
970  |  |     Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);  | 
971  |  | #endif  | 
972  |  |  | 
973  | 6.42k  |     assert(unicode_modifiable(unicode));  | 
974  | 6.42k  |     assert(PyUnicode_IS_READY(unicode));  | 
975  | 6.42k  |     assert(PyUnicode_IS_COMPACT(unicode));  | 
976  |  |  | 
977  | 6.42k  |     char_size = PyUnicode_KIND(unicode);  | 
978  | 6.42k  |     if (PyUnicode_IS_ASCII(unicode))  | 
979  | 6.41k  |         struct_size = sizeof(PyASCIIObject);  | 
980  | 15  |     else  | 
981  | 15  |         struct_size = sizeof(PyCompactUnicodeObject);  | 
982  | 6.42k  |     share_wstr = _PyUnicode_SHARE_WSTR(unicode);  | 
983  |  |  | 
984  | 6.42k  |     if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) { | 
985  | 0  |         PyErr_NoMemory();  | 
986  | 0  |         return NULL;  | 
987  | 0  |     }  | 
988  | 6.42k  |     new_size = (struct_size + (length + 1) * char_size);  | 
989  |  |  | 
990  | 6.42k  |     if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) { | 
991  | 0  |         PyObject_DEL(_PyUnicode_UTF8(unicode));  | 
992  | 0  |         _PyUnicode_UTF8(unicode) = NULL;  | 
993  | 0  |         _PyUnicode_UTF8_LENGTH(unicode) = 0;  | 
994  | 0  |     }  | 
995  | 6.42k  |     _Py_DEC_REFTOTAL;  | 
996  | 6.42k  |     _Py_ForgetReference(unicode);  | 
997  |  |  | 
998  | 6.42k  |     new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);  | 
999  | 6.42k  |     if (new_unicode == NULL) { | 
1000  | 0  |         _Py_NewReference(unicode);  | 
1001  | 0  |         PyErr_NoMemory();  | 
1002  | 0  |         return NULL;  | 
1003  | 0  |     }  | 
1004  | 6.42k  |     unicode = new_unicode;  | 
1005  | 6.42k  |     _Py_NewReference(unicode);  | 
1006  |  |  | 
1007  | 6.42k  |     _PyUnicode_LENGTH(unicode) = length;  | 
1008  | 6.42k  |     if (share_wstr) { | 
1009  | 0  |         _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);  | 
1010  | 0  |         if (!PyUnicode_IS_ASCII(unicode))  | 
1011  | 0  |             _PyUnicode_WSTR_LENGTH(unicode) = length;  | 
1012  | 0  |     }  | 
1013  | 6.42k  |     else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) { | 
1014  | 0  |         PyObject_DEL(_PyUnicode_WSTR(unicode));  | 
1015  | 0  |         _PyUnicode_WSTR(unicode) = NULL;  | 
1016  | 0  |         if (!PyUnicode_IS_ASCII(unicode))  | 
1017  | 0  |             _PyUnicode_WSTR_LENGTH(unicode) = 0;  | 
1018  | 0  |     }  | 
1019  |  | #ifdef Py_DEBUG  | 
1020  |  |     unicode_fill_invalid(unicode, old_length);  | 
1021  |  | #endif  | 
1022  | 6.42k  |     PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),  | 
1023  | 6.42k  |                     length, 0);  | 
1024  | 6.42k  |     assert(_PyUnicode_CheckConsistency(unicode, 0));  | 
1025  | 6.42k  |     return unicode;  | 
1026  | 6.42k  | }  | 
1027  |  |  | 
/* Resize the non-compact string `unicode` to `length` characters without
   replacing the object itself; only its separately allocated buffers are
   reallocated.  The caller must hold the only reference.

   Returns 0 on success.  Returns -1 with MemoryError set if the requested
   size is too large or a reallocation fails. */
static int
resize_inplace(PyObject *unicode, Py_ssize_t length)
{
    wchar_t *wstr;
    Py_ssize_t new_size;
    assert(!PyUnicode_IS_COMPACT(unicode));
    assert(Py_REFCNT(unicode) == 1);

    if (PyUnicode_IS_READY(unicode)) {
        Py_ssize_t char_size;
        int share_wstr, share_utf8;
        void *data;
#ifdef Py_DEBUG
        Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
#endif

        /* Sample the data pointer and the sharing flags before the data
           buffer is reallocated below. */
        data = _PyUnicode_DATA_ANY(unicode);
        char_size = PyUnicode_KIND(unicode);
        share_wstr = _PyUnicode_SHARE_WSTR(unicode);
        share_utf8 = _PyUnicode_SHARE_UTF8(unicode);

        /* (length + 1) * char_size must not exceed PY_SSIZE_T_MAX */
        if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
            PyErr_NoMemory();
            return -1;
        }
        new_size = (length + 1) * char_size;

        /* A UTF-8 buffer that is not shared with the data is freed here. */
        if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
        {
            PyObject_DEL(_PyUnicode_UTF8(unicode));
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }

        data = (PyObject *)PyObject_REALLOC(data, new_size);
        if (data == NULL) {
            PyErr_NoMemory();
            return -1;
        }
        /* Point every field that aliased the old buffer at the new one. */
        _PyUnicode_DATA_ANY(unicode) = data;
        if (share_wstr) {
            _PyUnicode_WSTR(unicode) = data;
            _PyUnicode_WSTR_LENGTH(unicode) = length;
        }
        if (share_utf8) {
            _PyUnicode_UTF8(unicode) = data;
            _PyUnicode_UTF8_LENGTH(unicode) = length;
        }
        _PyUnicode_LENGTH(unicode) = length;
        /* terminating null character */
        PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
#ifdef Py_DEBUG
        unicode_fill_invalid(unicode, old_length);
#endif
        /* Done unless there is a separate wstr buffer; in that case fall
           through and resize it as well. */
        if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
            assert(_PyUnicode_CheckConsistency(unicode, 0));
            return 0;
        }
    }
    /* Reached for strings that are not ready, and for ready strings that
       have a wstr buffer which is not shared with the data. */
    assert(_PyUnicode_WSTR(unicode) != NULL);

    /* check for integer overflow */
    if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
        PyErr_NoMemory();
        return -1;
    }
    new_size = sizeof(wchar_t) * (length + 1);
    wstr =  _PyUnicode_WSTR(unicode);
    wstr = PyObject_REALLOC(wstr, new_size);
    if (!wstr) {
        PyErr_NoMemory();
        return -1;
    }
    _PyUnicode_WSTR(unicode) = wstr;
    /* terminating null character */
    _PyUnicode_WSTR(unicode)[length] = 0;
    _PyUnicode_WSTR_LENGTH(unicode) = length;
    assert(_PyUnicode_CheckConsistency(unicode, 0));
    return 0;
}
1106  |  |  | 
1107  |  | static PyObject*  | 
1108  |  | resize_copy(PyObject *unicode, Py_ssize_t length)  | 
1109  | 0  | { | 
1110  | 0  |     Py_ssize_t copy_length;  | 
1111  | 0  |     if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) { | 
1112  | 0  |         PyObject *copy;  | 
1113  |  | 
  | 
1114  | 0  |         assert(PyUnicode_IS_READY(unicode));  | 
1115  |  | 
  | 
1116  | 0  |         copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));  | 
1117  | 0  |         if (copy == NULL)  | 
1118  | 0  |             return NULL;  | 
1119  |  |  | 
1120  | 0  |         copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));  | 
1121  | 0  |         _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);  | 
1122  | 0  |         return copy;  | 
1123  | 0  |     }  | 
1124  | 0  |     else { | 
1125  | 0  |         PyObject *w;  | 
1126  |  | 
  | 
1127  | 0  |         w = (PyObject*)_PyUnicode_New(length);  | 
1128  | 0  |         if (w == NULL)  | 
1129  | 0  |             return NULL;  | 
1130  | 0  |         copy_length = _PyUnicode_WSTR_LENGTH(unicode);  | 
1131  | 0  |         copy_length = Py_MIN(copy_length, length);  | 
1132  | 0  |         memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),  | 
1133  | 0  |                   copy_length * sizeof(wchar_t));  | 
1134  | 0  |         return w;  | 
1135  | 0  |     }  | 
1136  | 0  | }  | 
1137  |  |  | 
1138  |  | /* We allocate one more byte to make sure the string is  | 
1139  |  |    Ux0000 terminated; some code (e.g. new_identifier)  | 
1140  |  |    relies on that.  | 
1141  |  |  | 
1142  |  |    XXX This allocator could further be enhanced by assuring that the  | 
1143  |  |    free list never reduces its size below 1.  | 
1144  |  |  | 
1145  |  | */  | 
1146  |  |  | 
1147  |  | static PyUnicodeObject *  | 
1148  |  | _PyUnicode_New(Py_ssize_t length)  | 
1149  | 14  | { | 
1150  | 14  |     PyUnicodeObject *unicode;  | 
1151  | 14  |     size_t new_size;  | 
1152  |  |  | 
1153  |  |     /* Optimization for empty strings */  | 
1154  | 14  |     if (length == 0 && unicode_empty != NULL) { | 
1155  | 14  |         Py_INCREF(unicode_empty);  | 
1156  | 14  |         return (PyUnicodeObject*)unicode_empty;  | 
1157  | 14  |     }  | 
1158  |  |  | 
1159  |  |     /* Ensure we won't overflow the size. */  | 
1160  | 0  |     if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) { | 
1161  | 0  |         return (PyUnicodeObject *)PyErr_NoMemory();  | 
1162  | 0  |     }  | 
1163  | 0  |     if (length < 0) { | 
1164  | 0  |         PyErr_SetString(PyExc_SystemError,  | 
1165  | 0  |                         "Negative size passed to _PyUnicode_New");  | 
1166  | 0  |         return NULL;  | 
1167  | 0  |     }  | 
1168  |  |  | 
1169  | 0  |     unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);  | 
1170  | 0  |     if (unicode == NULL)  | 
1171  | 0  |         return NULL;  | 
1172  | 0  |     new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);  | 
1173  |  | 
  | 
1174  | 0  |     _PyUnicode_WSTR_LENGTH(unicode) = length;  | 
1175  | 0  |     _PyUnicode_HASH(unicode) = -1;  | 
1176  | 0  |     _PyUnicode_STATE(unicode).interned = 0;  | 
1177  | 0  |     _PyUnicode_STATE(unicode).kind = 0;  | 
1178  | 0  |     _PyUnicode_STATE(unicode).compact = 0;  | 
1179  | 0  |     _PyUnicode_STATE(unicode).ready = 0;  | 
1180  | 0  |     _PyUnicode_STATE(unicode).ascii = 0;  | 
1181  | 0  |     _PyUnicode_DATA_ANY(unicode) = NULL;  | 
1182  | 0  |     _PyUnicode_LENGTH(unicode) = 0;  | 
1183  | 0  |     _PyUnicode_UTF8(unicode) = NULL;  | 
1184  | 0  |     _PyUnicode_UTF8_LENGTH(unicode) = 0;  | 
1185  |  | 
  | 
1186  | 0  |     _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);  | 
1187  | 0  |     if (!_PyUnicode_WSTR(unicode)) { | 
1188  | 0  |         Py_DECREF(unicode);  | 
1189  | 0  |         PyErr_NoMemory();  | 
1190  | 0  |         return NULL;  | 
1191  | 0  |     }  | 
1192  |  |  | 
1193  |  |     /* Initialize the first element to guard against cases where  | 
1194  |  |      * the caller fails before initializing str -- unicode_resize()  | 
1195  |  |      * reads str[0], and the Keep-Alive optimization can keep memory  | 
1196  |  |      * allocated for str alive across a call to unicode_dealloc(unicode).  | 
1197  |  |      * We don't want unicode_resize to read uninitialized memory in  | 
1198  |  |      * that case.  | 
1199  |  |      */  | 
1200  | 0  |     _PyUnicode_WSTR(unicode)[0] = 0;  | 
1201  | 0  |     _PyUnicode_WSTR(unicode)[length] = 0;  | 
1202  |  | 
  | 
1203  | 0  |     assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));  | 
1204  | 0  |     return unicode;  | 
1205  | 0  | }  | 
1206  |  |  | 
1207  |  | static const char*  | 
1208  |  | unicode_kind_name(PyObject *unicode)  | 
1209  | 0  | { | 
1210  |  |     /* don't check consistency: unicode_kind_name() is called from  | 
1211  |  |        _PyUnicode_Dump() */  | 
1212  | 0  |     if (!PyUnicode_IS_COMPACT(unicode))  | 
1213  | 0  |     { | 
1214  | 0  |         if (!PyUnicode_IS_READY(unicode))  | 
1215  | 0  |             return "wstr";  | 
1216  | 0  |         switch (PyUnicode_KIND(unicode))  | 
1217  | 0  |         { | 
1218  | 0  |         case PyUnicode_1BYTE_KIND:  | 
1219  | 0  |             if (PyUnicode_IS_ASCII(unicode))  | 
1220  | 0  |                 return "legacy ascii";  | 
1221  | 0  |             else  | 
1222  | 0  |                 return "legacy latin1";  | 
1223  | 0  |         case PyUnicode_2BYTE_KIND:  | 
1224  | 0  |             return "legacy UCS2";  | 
1225  | 0  |         case PyUnicode_4BYTE_KIND:  | 
1226  | 0  |             return "legacy UCS4";  | 
1227  | 0  |         default:  | 
1228  | 0  |             return "<legacy invalid kind>";  | 
1229  | 0  |         }  | 
1230  | 0  |     }  | 
1231  | 0  |     assert(PyUnicode_IS_READY(unicode));  | 
1232  | 0  |     switch (PyUnicode_KIND(unicode)) { | 
1233  | 0  |     case PyUnicode_1BYTE_KIND:  | 
1234  | 0  |         if (PyUnicode_IS_ASCII(unicode))  | 
1235  | 0  |             return "ascii";  | 
1236  | 0  |         else  | 
1237  | 0  |             return "latin1";  | 
1238  | 0  |     case PyUnicode_2BYTE_KIND:  | 
1239  | 0  |         return "UCS2";  | 
1240  | 0  |     case PyUnicode_4BYTE_KIND:  | 
1241  | 0  |         return "UCS4";  | 
1242  | 0  |     default:  | 
1243  | 0  |         return "<invalid compact kind>";  | 
1244  | 0  |     }  | 
1245  | 0  | }  | 
1246  |  |  | 
1247  |  | #ifdef Py_DEBUG  | 
/* Functions wrapping macros for use in debugger */

/* Function form of the PyUnicode_UTF8() macro; `unicode_raw` is cast to
   PyObject* first. */
char *
_PyUnicode_utf8(void *unicode_raw)
{
    return PyUnicode_UTF8(_PyObject_CAST(unicode_raw));
}
1253  |  |  | 
/* Function form of the _PyUnicode_COMPACT_DATA() macro; `unicode_raw` is
   cast to PyObject* first. */
void *
_PyUnicode_compact_data(void *unicode_raw)
{
    return _PyUnicode_COMPACT_DATA(_PyObject_CAST(unicode_raw));
}
1258  |  | void *_PyUnicode_data(void *unicode_raw) { | 
1259  |  |     PyObject *unicode = _PyObject_CAST(unicode_raw);  | 
1260  |  |     printf("obj %p\n", (void*)unicode); | 
1261  |  |     printf("compact %d\n", PyUnicode_IS_COMPACT(unicode)); | 
1262  |  |     printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode)); | 
1263  |  |     printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1))); | 
1264  |  |     printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1))); | 
1265  |  |     printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode)); | 
1266  |  |     return PyUnicode_DATA(unicode);  | 
1267  |  | }  | 
1268  |  |  | 
1269  |  | void  | 
1270  |  | _PyUnicode_Dump(PyObject *op)  | 
1271  |  | { | 
1272  |  |     PyASCIIObject *ascii = (PyASCIIObject *)op;  | 
1273  |  |     PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;  | 
1274  |  |     PyUnicodeObject *unicode = (PyUnicodeObject *)op;  | 
1275  |  |     void *data;  | 
1276  |  |  | 
1277  |  |     if (ascii->state.compact)  | 
1278  |  |     { | 
1279  |  |         if (ascii->state.ascii)  | 
1280  |  |             data = (ascii + 1);  | 
1281  |  |         else  | 
1282  |  |             data = (compact + 1);  | 
1283  |  |     }  | 
1284  |  |     else  | 
1285  |  |         data = unicode->data.any;  | 
1286  |  |     printf("%s: len=%" PY_FORMAT_SIZE_T "u, ", | 
1287  |  |            unicode_kind_name(op), ascii->length);  | 
1288  |  |  | 
1289  |  |     if (ascii->wstr == data)  | 
1290  |  |         printf("shared "); | 
1291  |  |     printf("wstr=%p", (void *)ascii->wstr); | 
1292  |  |  | 
1293  |  |     if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) { | 
1294  |  |         printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length); | 
1295  |  |         if (!ascii->state.compact && compact->utf8 == unicode->data.any)  | 
1296  |  |             printf("shared "); | 
1297  |  |         printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)", | 
1298  |  |                (void *)compact->utf8, compact->utf8_length);  | 
1299  |  |     }  | 
1300  |  |     printf(", data=%p\n", data); | 
1301  |  | }  | 
1302  |  | #endif  | 
1303  |  |  | 
1304  |  | PyObject *  | 
1305  |  | PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)  | 
1306  | 144k  | { | 
1307  | 144k  |     PyObject *obj;  | 
1308  | 144k  |     PyCompactUnicodeObject *unicode;  | 
1309  | 144k  |     void *data;  | 
1310  | 144k  |     enum PyUnicode_Kind kind;  | 
1311  | 144k  |     int is_sharing, is_ascii;  | 
1312  | 144k  |     Py_ssize_t char_size;  | 
1313  | 144k  |     Py_ssize_t struct_size;  | 
1314  |  |  | 
1315  |  |     /* Optimization for empty strings */  | 
1316  | 144k  |     if (size == 0 && unicode_empty != NULL) { | 
1317  | 140  |         Py_INCREF(unicode_empty);  | 
1318  | 140  |         return unicode_empty;  | 
1319  | 140  |     }  | 
1320  |  |  | 
1321  | 144k  |     is_ascii = 0;  | 
1322  | 144k  |     is_sharing = 0;  | 
1323  | 144k  |     struct_size = sizeof(PyCompactUnicodeObject);  | 
1324  | 144k  |     if (maxchar < 128) { | 
1325  | 144k  |         kind = PyUnicode_1BYTE_KIND;  | 
1326  | 144k  |         char_size = 1;  | 
1327  | 144k  |         is_ascii = 1;  | 
1328  | 144k  |         struct_size = sizeof(PyASCIIObject);  | 
1329  | 144k  |     }  | 
1330  | 29  |     else if (maxchar < 256) { | 
1331  | 15  |         kind = PyUnicode_1BYTE_KIND;  | 
1332  | 15  |         char_size = 1;  | 
1333  | 15  |     }  | 
1334  | 14  |     else if (maxchar < 65536) { | 
1335  | 14  |         kind = PyUnicode_2BYTE_KIND;  | 
1336  | 14  |         char_size = 2;  | 
1337  | 14  |         if (sizeof(wchar_t) == 2)  | 
1338  | 0  |             is_sharing = 1;  | 
1339  | 14  |     }  | 
1340  | 0  |     else { | 
1341  | 0  |         if (maxchar > MAX_UNICODE) { | 
1342  | 0  |             PyErr_SetString(PyExc_SystemError,  | 
1343  | 0  |                             "invalid maximum character passed to PyUnicode_New");  | 
1344  | 0  |             return NULL;  | 
1345  | 0  |         }  | 
1346  | 0  |         kind = PyUnicode_4BYTE_KIND;  | 
1347  | 0  |         char_size = 4;  | 
1348  | 0  |         if (sizeof(wchar_t) == 4)  | 
1349  | 0  |             is_sharing = 1;  | 
1350  | 0  |     }  | 
1351  |  |  | 
1352  |  |     /* Ensure we won't overflow the size. */  | 
1353  | 144k  |     if (size < 0) { | 
1354  | 0  |         PyErr_SetString(PyExc_SystemError,  | 
1355  | 0  |                         "Negative size passed to PyUnicode_New");  | 
1356  | 0  |         return NULL;  | 
1357  | 0  |     }  | 
1358  | 144k  |     if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))  | 
1359  | 0  |         return PyErr_NoMemory();  | 
1360  |  |  | 
1361  |  |     /* Duplicated allocation code from _PyObject_New() instead of a call to  | 
1362  |  |      * PyObject_New() so we are able to allocate space for the object and  | 
1363  |  |      * it's data buffer.  | 
1364  |  |      */  | 
1365  | 144k  |     obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);  | 
1366  | 144k  |     if (obj == NULL)  | 
1367  | 0  |         return PyErr_NoMemory();  | 
1368  | 144k  |     obj = PyObject_INIT(obj, &PyUnicode_Type);  | 
1369  | 144k  |     if (obj == NULL)  | 
1370  | 0  |         return NULL;  | 
1371  |  |  | 
1372  | 144k  |     unicode = (PyCompactUnicodeObject *)obj;  | 
1373  | 144k  |     if (is_ascii)  | 
1374  | 144k  |         data = ((PyASCIIObject*)obj) + 1;  | 
1375  | 29  |     else  | 
1376  | 29  |         data = unicode + 1;  | 
1377  | 144k  |     _PyUnicode_LENGTH(unicode) = size;  | 
1378  | 144k  |     _PyUnicode_HASH(unicode) = -1;  | 
1379  | 144k  |     _PyUnicode_STATE(unicode).interned = 0;  | 
1380  | 144k  |     _PyUnicode_STATE(unicode).kind = kind;  | 
1381  | 144k  |     _PyUnicode_STATE(unicode).compact = 1;  | 
1382  | 144k  |     _PyUnicode_STATE(unicode).ready = 1;  | 
1383  | 144k  |     _PyUnicode_STATE(unicode).ascii = is_ascii;  | 
1384  | 144k  |     if (is_ascii) { | 
1385  | 144k  |         ((char*)data)[size] = 0;  | 
1386  | 144k  |         _PyUnicode_WSTR(unicode) = NULL;  | 
1387  | 144k  |     }  | 
1388  | 29  |     else if (kind == PyUnicode_1BYTE_KIND) { | 
1389  | 15  |         ((char*)data)[size] = 0;  | 
1390  | 15  |         _PyUnicode_WSTR(unicode) = NULL;  | 
1391  | 15  |         _PyUnicode_WSTR_LENGTH(unicode) = 0;  | 
1392  | 15  |         unicode->utf8 = NULL;  | 
1393  | 15  |         unicode->utf8_length = 0;  | 
1394  | 15  |     }  | 
1395  | 14  |     else { | 
1396  | 14  |         unicode->utf8 = NULL;  | 
1397  | 14  |         unicode->utf8_length = 0;  | 
1398  | 14  |         if (kind == PyUnicode_2BYTE_KIND)  | 
1399  | 14  |             ((Py_UCS2*)data)[size] = 0;  | 
1400  | 0  |         else /* kind == PyUnicode_4BYTE_KIND */  | 
1401  | 0  |             ((Py_UCS4*)data)[size] = 0;  | 
1402  | 14  |         if (is_sharing) { | 
1403  | 0  |             _PyUnicode_WSTR_LENGTH(unicode) = size;  | 
1404  | 0  |             _PyUnicode_WSTR(unicode) = (wchar_t *)data;  | 
1405  | 0  |         }  | 
1406  | 14  |         else { | 
1407  | 14  |             _PyUnicode_WSTR_LENGTH(unicode) = 0;  | 
1408  | 14  |             _PyUnicode_WSTR(unicode) = NULL;  | 
1409  | 14  |         }  | 
1410  | 14  |     }  | 
1411  |  | #ifdef Py_DEBUG  | 
1412  |  |     unicode_fill_invalid((PyObject*)unicode, 0);  | 
1413  |  | #endif  | 
1414  | 144k  |     assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));  | 
1415  | 144k  |     return obj;  | 
1416  | 144k  | }  | 
1417  |  |  | 
1418  |  | #if SIZEOF_WCHAR_T == 2  | 
1419  |  | /* Helper function to convert a 16-bits wchar_t representation to UCS4, this  | 
1420  |  |    will decode surrogate pairs, the other conversions are implemented as macros  | 
1421  |  |    for efficiency.  | 
1422  |  |  | 
1423  |  |    This function assumes that unicode can hold one more code point than wstr  | 
1424  |  |    characters for a terminating null character. */  | 
1425  |  | static void  | 
1426  |  | unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,  | 
1427  |  |                               PyObject *unicode)  | 
1428  |  | { | 
1429  |  |     const wchar_t *iter;  | 
1430  |  |     Py_UCS4 *ucs4_out;  | 
1431  |  |  | 
1432  |  |     assert(unicode != NULL);  | 
1433  |  |     assert(_PyUnicode_CHECK(unicode));  | 
1434  |  |     assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);  | 
1435  |  |     ucs4_out = PyUnicode_4BYTE_DATA(unicode);  | 
1436  |  |  | 
1437  |  |     for (iter = begin; iter < end; ) { | 
1438  |  |         assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +  | 
1439  |  |                            _PyUnicode_GET_LENGTH(unicode)));  | 
1440  |  |         if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])  | 
1441  |  |             && (iter+1) < end  | 
1442  |  |             && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))  | 
1443  |  |         { | 
1444  |  |             *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);  | 
1445  |  |             iter += 2;  | 
1446  |  |         }  | 
1447  |  |         else { | 
1448  |  |             *ucs4_out++ = *iter;  | 
1449  |  |             iter++;  | 
1450  |  |         }  | 
1451  |  |     }  | 
1452  |  |     assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +  | 
1453  |  |                         _PyUnicode_GET_LENGTH(unicode)));  | 
1454  |  |  | 
1455  |  | }  | 
1456  |  | #endif  | 
1457  |  |  | 
1458  |  | static int  | 
1459  |  | unicode_check_modifiable(PyObject *unicode)  | 
1460  | 0  | { | 
1461  | 0  |     if (!unicode_modifiable(unicode)) { | 
1462  | 0  |         PyErr_SetString(PyExc_SystemError,  | 
1463  | 0  |                         "Cannot modify a string currently used");  | 
1464  | 0  |         return -1;  | 
1465  | 0  |     }  | 
1466  | 0  |     return 0;  | 
1467  | 0  | }  | 
1468  |  |  | 
1469  |  | static int  | 
1470  |  | _copy_characters(PyObject *to, Py_ssize_t to_start,  | 
1471  |  |                  PyObject *from, Py_ssize_t from_start,  | 
1472  |  |                  Py_ssize_t how_many, int check_maxchar)  | 
1473  | 19.2k  | { | 
1474  | 19.2k  |     unsigned int from_kind, to_kind;  | 
1475  | 19.2k  |     void *from_data, *to_data;  | 
1476  |  |  | 
1477  | 19.2k  |     assert(0 <= how_many);  | 
1478  | 19.2k  |     assert(0 <= from_start);  | 
1479  | 19.2k  |     assert(0 <= to_start);  | 
1480  | 19.2k  |     assert(PyUnicode_Check(from));  | 
1481  | 19.2k  |     assert(PyUnicode_IS_READY(from));  | 
1482  | 19.2k  |     assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));  | 
1483  |  |  | 
1484  | 19.2k  |     assert(PyUnicode_Check(to));  | 
1485  | 19.2k  |     assert(PyUnicode_IS_READY(to));  | 
1486  | 19.2k  |     assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));  | 
1487  |  |  | 
1488  | 19.2k  |     if (how_many == 0)  | 
1489  | 0  |         return 0;  | 
1490  |  |  | 
1491  | 19.2k  |     from_kind = PyUnicode_KIND(from);  | 
1492  | 19.2k  |     from_data = PyUnicode_DATA(from);  | 
1493  | 19.2k  |     to_kind = PyUnicode_KIND(to);  | 
1494  | 19.2k  |     to_data = PyUnicode_DATA(to);  | 
1495  |  |  | 
1496  |  | #ifdef Py_DEBUG  | 
1497  |  |     if (!check_maxchar  | 
1498  |  |         && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))  | 
1499  |  |     { | 
1500  |  |         const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);  | 
1501  |  |         Py_UCS4 ch;  | 
1502  |  |         Py_ssize_t i;  | 
1503  |  |         for (i=0; i < how_many; i++) { | 
1504  |  |             ch = PyUnicode_READ(from_kind, from_data, from_start + i);  | 
1505  |  |             assert(ch <= to_maxchar);  | 
1506  |  |         }  | 
1507  |  |     }  | 
1508  |  | #endif  | 
1509  |  |  | 
1510  | 19.2k  |     if (from_kind == to_kind) { | 
1511  | 19.2k  |         if (check_maxchar  | 
1512  | 0  |             && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))  | 
1513  | 0  |         { | 
1514  |  |             /* Writing Latin-1 characters into an ASCII string requires to  | 
1515  |  |                check that all written characters are pure ASCII */  | 
1516  | 0  |             Py_UCS4 max_char;  | 
1517  | 0  |             max_char = ucs1lib_find_max_char(from_data,  | 
1518  | 0  |                                              (Py_UCS1*)from_data + how_many);  | 
1519  | 0  |             if (max_char >= 128)  | 
1520  | 0  |                 return -1;  | 
1521  | 0  |         }  | 
1522  | 19.2k  |         memcpy((char*)to_data + to_kind * to_start,  | 
1523  | 19.2k  |                   (char*)from_data + from_kind * from_start,  | 
1524  | 19.2k  |                   to_kind * how_many);  | 
1525  | 19.2k  |     }  | 
1526  | 14  |     else if (from_kind == PyUnicode_1BYTE_KIND  | 
1527  | 14  |              && to_kind == PyUnicode_2BYTE_KIND)  | 
1528  | 14  |     { | 
1529  | 14  |         _PyUnicode_CONVERT_BYTES(  | 
1530  | 14  |             Py_UCS1, Py_UCS2,  | 
1531  | 14  |             PyUnicode_1BYTE_DATA(from) + from_start,  | 
1532  | 14  |             PyUnicode_1BYTE_DATA(from) + from_start + how_many,  | 
1533  | 14  |             PyUnicode_2BYTE_DATA(to) + to_start  | 
1534  | 14  |             );  | 
1535  | 14  |     }  | 
1536  | 0  |     else if (from_kind == PyUnicode_1BYTE_KIND  | 
1537  | 0  |              && to_kind == PyUnicode_4BYTE_KIND)  | 
1538  | 0  |     { | 
1539  | 0  |         _PyUnicode_CONVERT_BYTES(  | 
1540  | 0  |             Py_UCS1, Py_UCS4,  | 
1541  | 0  |             PyUnicode_1BYTE_DATA(from) + from_start,  | 
1542  | 0  |             PyUnicode_1BYTE_DATA(from) + from_start + how_many,  | 
1543  | 0  |             PyUnicode_4BYTE_DATA(to) + to_start  | 
1544  | 0  |             );  | 
1545  | 0  |     }  | 
1546  | 0  |     else if (from_kind == PyUnicode_2BYTE_KIND  | 
1547  | 0  |              && to_kind == PyUnicode_4BYTE_KIND)  | 
1548  | 0  |     { | 
1549  | 0  |         _PyUnicode_CONVERT_BYTES(  | 
1550  | 0  |             Py_UCS2, Py_UCS4,  | 
1551  | 0  |             PyUnicode_2BYTE_DATA(from) + from_start,  | 
1552  | 0  |             PyUnicode_2BYTE_DATA(from) + from_start + how_many,  | 
1553  | 0  |             PyUnicode_4BYTE_DATA(to) + to_start  | 
1554  | 0  |             );  | 
1555  | 0  |     }  | 
1556  | 0  |     else { | 
1557  | 0  |         assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));  | 
1558  |  | 
  | 
1559  | 0  |         if (!check_maxchar) { | 
1560  | 0  |             if (from_kind == PyUnicode_2BYTE_KIND  | 
1561  | 0  |                 && to_kind == PyUnicode_1BYTE_KIND)  | 
1562  | 0  |             { | 
1563  | 0  |                 _PyUnicode_CONVERT_BYTES(  | 
1564  | 0  |                     Py_UCS2, Py_UCS1,  | 
1565  | 0  |                     PyUnicode_2BYTE_DATA(from) + from_start,  | 
1566  | 0  |                     PyUnicode_2BYTE_DATA(from) + from_start + how_many,  | 
1567  | 0  |                     PyUnicode_1BYTE_DATA(to) + to_start  | 
1568  | 0  |                     );  | 
1569  | 0  |             }  | 
1570  | 0  |             else if (from_kind == PyUnicode_4BYTE_KIND  | 
1571  | 0  |                      && to_kind == PyUnicode_1BYTE_KIND)  | 
1572  | 0  |             { | 
1573  | 0  |                 _PyUnicode_CONVERT_BYTES(  | 
1574  | 0  |                     Py_UCS4, Py_UCS1,  | 
1575  | 0  |                     PyUnicode_4BYTE_DATA(from) + from_start,  | 
1576  | 0  |                     PyUnicode_4BYTE_DATA(from) + from_start + how_many,  | 
1577  | 0  |                     PyUnicode_1BYTE_DATA(to) + to_start  | 
1578  | 0  |                     );  | 
1579  | 0  |             }  | 
1580  | 0  |             else if (from_kind == PyUnicode_4BYTE_KIND  | 
1581  | 0  |                      && to_kind == PyUnicode_2BYTE_KIND)  | 
1582  | 0  |             { | 
1583  | 0  |                 _PyUnicode_CONVERT_BYTES(  | 
1584  | 0  |                     Py_UCS4, Py_UCS2,  | 
1585  | 0  |                     PyUnicode_4BYTE_DATA(from) + from_start,  | 
1586  | 0  |                     PyUnicode_4BYTE_DATA(from) + from_start + how_many,  | 
1587  | 0  |                     PyUnicode_2BYTE_DATA(to) + to_start  | 
1588  | 0  |                     );  | 
1589  | 0  |             }  | 
1590  | 0  |             else { | 
1591  | 0  |                 Py_UNREACHABLE();  | 
1592  | 0  |             }  | 
1593  | 0  |         }  | 
1594  | 0  |         else { | 
1595  | 0  |             const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);  | 
1596  | 0  |             Py_UCS4 ch;  | 
1597  | 0  |             Py_ssize_t i;  | 
1598  |  | 
  | 
1599  | 0  |             for (i=0; i < how_many; i++) { | 
1600  | 0  |                 ch = PyUnicode_READ(from_kind, from_data, from_start + i);  | 
1601  | 0  |                 if (ch > to_maxchar)  | 
1602  | 0  |                     return -1;  | 
1603  | 0  |                 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);  | 
1604  | 0  |             }  | 
1605  | 0  |         }  | 
1606  | 0  |     }  | 
1607  | 19.2k  |     return 0;  | 
1608  | 19.2k  | }  | 
1609  |  |  | 
1610  |  | void  | 
1611  |  | _PyUnicode_FastCopyCharacters(  | 
1612  |  |     PyObject *to, Py_ssize_t to_start,  | 
1613  |  |     PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)  | 
1614  | 19.2k  | { | 
1615  | 19.2k  |     (void)_copy_characters(to, to_start, from, from_start, how_many, 0);  | 
1616  | 19.2k  | }  | 
1617  |  |  | 
1618  |  | Py_ssize_t  | 
1619  |  | PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,  | 
1620  |  |                          PyObject *from, Py_ssize_t from_start,  | 
1621  |  |                          Py_ssize_t how_many)  | 
1622  | 0  | { | 
1623  | 0  |     int err;  | 
1624  |  | 
  | 
1625  | 0  |     if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) { | 
1626  | 0  |         PyErr_BadInternalCall();  | 
1627  | 0  |         return -1;  | 
1628  | 0  |     }  | 
1629  |  |  | 
1630  | 0  |     if (PyUnicode_READY(from) == -1)  | 
1631  | 0  |         return -1;  | 
1632  | 0  |     if (PyUnicode_READY(to) == -1)  | 
1633  | 0  |         return -1;  | 
1634  |  |  | 
1635  | 0  |     if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) { | 
1636  | 0  |         PyErr_SetString(PyExc_IndexError, "string index out of range");  | 
1637  | 0  |         return -1;  | 
1638  | 0  |     }  | 
1639  | 0  |     if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) { | 
1640  | 0  |         PyErr_SetString(PyExc_IndexError, "string index out of range");  | 
1641  | 0  |         return -1;  | 
1642  | 0  |     }  | 
1643  | 0  |     if (how_many < 0) { | 
1644  | 0  |         PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");  | 
1645  | 0  |         return -1;  | 
1646  | 0  |     }  | 
1647  | 0  |     how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);  | 
1648  | 0  |     if (to_start + how_many > PyUnicode_GET_LENGTH(to)) { | 
1649  | 0  |         PyErr_Format(PyExc_SystemError,  | 
1650  | 0  |                      "Cannot write %zi characters at %zi "  | 
1651  | 0  |                      "in a string of %zi characters",  | 
1652  | 0  |                      how_many, to_start, PyUnicode_GET_LENGTH(to));  | 
1653  | 0  |         return -1;  | 
1654  | 0  |     }  | 
1655  |  |  | 
1656  | 0  |     if (how_many == 0)  | 
1657  | 0  |         return 0;  | 
1658  |  |  | 
1659  | 0  |     if (unicode_check_modifiable(to))  | 
1660  | 0  |         return -1;  | 
1661  |  |  | 
1662  | 0  |     err = _copy_characters(to, to_start, from, from_start, how_many, 1);  | 
1663  | 0  |     if (err) { | 
1664  | 0  |         PyErr_Format(PyExc_SystemError,  | 
1665  | 0  |                      "Cannot copy %s characters "  | 
1666  | 0  |                      "into a string of %s characters",  | 
1667  | 0  |                      unicode_kind_name(from),  | 
1668  | 0  |                      unicode_kind_name(to));  | 
1669  | 0  |         return -1;  | 
1670  | 0  |     }  | 
1671  | 0  |     return how_many;  | 
1672  | 0  | }  | 
1673  |  |  | 
1674  |  | /* Find the maximum code point and count the number of surrogate pairs so a  | 
1675  |  |    correct string length can be computed before converting a string to UCS4.  | 
1676  |  |    This function counts single surrogates as a character and not as a pair.  | 
1677  |  |  | 
1678  |  |    Return 0 on success, or -1 on error. */  | 
1679  |  | static int  | 
1680  |  | find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,  | 
1681  |  |                         Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)  | 
1682  | 5.14k  | { | 
1683  | 5.14k  |     const wchar_t *iter;  | 
1684  | 5.14k  |     Py_UCS4 ch;  | 
1685  |  |  | 
1686  | 5.14k  |     assert(num_surrogates != NULL && maxchar != NULL);  | 
1687  | 5.14k  |     *num_surrogates = 0;  | 
1688  | 5.14k  |     *maxchar = 0;  | 
1689  |  |  | 
1690  | 67.8k  |     for (iter = begin; iter < end; ) { | 
1691  |  | #if SIZEOF_WCHAR_T == 2  | 
1692  |  |         if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])  | 
1693  |  |             && (iter+1) < end  | 
1694  |  |             && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))  | 
1695  |  |         { | 
1696  |  |             ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);  | 
1697  |  |             ++(*num_surrogates);  | 
1698  |  |             iter += 2;  | 
1699  |  |         }  | 
1700  |  |         else  | 
1701  |  | #endif  | 
1702  | 62.6k  |         { | 
1703  | 62.6k  |             ch = *iter;  | 
1704  | 62.6k  |             iter++;  | 
1705  | 62.6k  |         }  | 
1706  | 62.6k  |         if (ch > *maxchar) { | 
1707  | 16.2k  |             *maxchar = ch;  | 
1708  | 16.2k  |             if (*maxchar > MAX_UNICODE) { | 
1709  | 0  |                 PyErr_Format(PyExc_ValueError,  | 
1710  | 0  |                              "character U+%x is not in range [U+0000; U+10ffff]",  | 
1711  | 0  |                              ch);  | 
1712  | 0  |                 return -1;  | 
1713  | 0  |             }  | 
1714  | 16.2k  |         }  | 
1715  | 62.6k  |     }  | 
1716  | 5.14k  |     return 0;  | 
1717  | 5.14k  | }  | 
1718  |  |  | 
1719  |  | int  | 
1720  |  | _PyUnicode_Ready(PyObject *unicode)  | 
1721  | 0  | { | 
1722  | 0  |     wchar_t *end;  | 
1723  | 0  |     Py_UCS4 maxchar = 0;  | 
1724  | 0  |     Py_ssize_t num_surrogates;  | 
1725  |  | #if SIZEOF_WCHAR_T == 2  | 
1726  |  |     Py_ssize_t length_wo_surrogates;  | 
1727  |  | #endif  | 
1728  |  |  | 
1729  |  |     /* _PyUnicode_Ready() is only intended for old-style API usage where  | 
1730  |  |        strings were created using _PyObject_New() and where no canonical  | 
1731  |  |        representation (the str field) has been set yet aka strings  | 
1732  |  |        which are not yet ready. */  | 
1733  | 0  |     assert(_PyUnicode_CHECK(unicode));  | 
1734  | 0  |     assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);  | 
1735  | 0  |     assert(_PyUnicode_WSTR(unicode) != NULL);  | 
1736  | 0  |     assert(_PyUnicode_DATA_ANY(unicode) == NULL);  | 
1737  | 0  |     assert(_PyUnicode_UTF8(unicode) == NULL);  | 
1738  |  |     /* Actually, it should neither be interned nor be anything else: */  | 
1739  | 0  |     assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);  | 
1740  |  | 
  | 
1741  | 0  |     end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);  | 
1742  | 0  |     if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,  | 
1743  | 0  |                                 &maxchar, &num_surrogates) == -1)  | 
1744  | 0  |         return -1;  | 
1745  |  |  | 
1746  | 0  |     if (maxchar < 256) { | 
1747  | 0  |         _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);  | 
1748  | 0  |         if (!_PyUnicode_DATA_ANY(unicode)) { | 
1749  | 0  |             PyErr_NoMemory();  | 
1750  | 0  |             return -1;  | 
1751  | 0  |         }  | 
1752  | 0  |         _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,  | 
1753  | 0  |                                 _PyUnicode_WSTR(unicode), end,  | 
1754  | 0  |                                 PyUnicode_1BYTE_DATA(unicode));  | 
1755  | 0  |         PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';  | 
1756  | 0  |         _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);  | 
1757  | 0  |         _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;  | 
1758  | 0  |         if (maxchar < 128) { | 
1759  | 0  |             _PyUnicode_STATE(unicode).ascii = 1;  | 
1760  | 0  |             _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);  | 
1761  | 0  |             _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);  | 
1762  | 0  |         }  | 
1763  | 0  |         else { | 
1764  | 0  |             _PyUnicode_STATE(unicode).ascii = 0;  | 
1765  | 0  |             _PyUnicode_UTF8(unicode) = NULL;  | 
1766  | 0  |             _PyUnicode_UTF8_LENGTH(unicode) = 0;  | 
1767  | 0  |         }  | 
1768  | 0  |         PyObject_FREE(_PyUnicode_WSTR(unicode));  | 
1769  | 0  |         _PyUnicode_WSTR(unicode) = NULL;  | 
1770  | 0  |         _PyUnicode_WSTR_LENGTH(unicode) = 0;  | 
1771  | 0  |     }  | 
1772  |  |     /* In this case we might have to convert down from 4-byte native  | 
1773  |  |        wchar_t to 2-byte unicode. */  | 
1774  | 0  |     else if (maxchar < 65536) { | 
1775  | 0  |         assert(num_surrogates == 0 &&  | 
1776  | 0  |                "FindMaxCharAndNumSurrogatePairs() messed up");  | 
1777  |  | 
  | 
1778  |  | #if SIZEOF_WCHAR_T == 2  | 
1779  |  |         /* We can share representations and are done. */  | 
1780  |  |         _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);  | 
1781  |  |         PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';  | 
1782  |  |         _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);  | 
1783  |  |         _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;  | 
1784  |  |         _PyUnicode_UTF8(unicode) = NULL;  | 
1785  |  |         _PyUnicode_UTF8_LENGTH(unicode) = 0;  | 
1786  |  | #else  | 
1787  |  |         /* sizeof(wchar_t) == 4 */  | 
1788  | 0  |         _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(  | 
1789  | 0  |             2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));  | 
1790  | 0  |         if (!_PyUnicode_DATA_ANY(unicode)) { | 
1791  | 0  |             PyErr_NoMemory();  | 
1792  | 0  |             return -1;  | 
1793  | 0  |         }  | 
1794  | 0  |         _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,  | 
1795  | 0  |                                 _PyUnicode_WSTR(unicode), end,  | 
1796  | 0  |                                 PyUnicode_2BYTE_DATA(unicode));  | 
1797  | 0  |         PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';  | 
1798  | 0  |         _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);  | 
1799  | 0  |         _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;  | 
1800  | 0  |         _PyUnicode_UTF8(unicode) = NULL;  | 
1801  | 0  |         _PyUnicode_UTF8_LENGTH(unicode) = 0;  | 
1802  | 0  |         PyObject_FREE(_PyUnicode_WSTR(unicode));  | 
1803  | 0  |         _PyUnicode_WSTR(unicode) = NULL;  | 
1804  | 0  |         _PyUnicode_WSTR_LENGTH(unicode) = 0;  | 
1805  | 0  | #endif  | 
1806  | 0  |     }  | 
1807  |  |     /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */  | 
1808  | 0  |     else { | 
1809  |  | #if SIZEOF_WCHAR_T == 2  | 
1810  |  |         /* in case the native representation is 2-bytes, we need to allocate a  | 
1811  |  |            new normalized 4-byte version. */  | 
1812  |  |         length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;  | 
1813  |  |         if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) { | 
1814  |  |             PyErr_NoMemory();  | 
1815  |  |             return -1;  | 
1816  |  |         }  | 
1817  |  |         _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));  | 
1818  |  |         if (!_PyUnicode_DATA_ANY(unicode)) { | 
1819  |  |             PyErr_NoMemory();  | 
1820  |  |             return -1;  | 
1821  |  |         }  | 
1822  |  |         _PyUnicode_LENGTH(unicode) = length_wo_surrogates;  | 
1823  |  |         _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;  | 
1824  |  |         _PyUnicode_UTF8(unicode) = NULL;  | 
1825  |  |         _PyUnicode_UTF8_LENGTH(unicode) = 0;  | 
1826  |  |         /* unicode_convert_wchar_to_ucs4() requires a ready string */  | 
1827  |  |         _PyUnicode_STATE(unicode).ready = 1;  | 
1828  |  |         unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);  | 
1829  |  |         PyObject_FREE(_PyUnicode_WSTR(unicode));  | 
1830  |  |         _PyUnicode_WSTR(unicode) = NULL;  | 
1831  |  |         _PyUnicode_WSTR_LENGTH(unicode) = 0;  | 
1832  |  | #else  | 
1833  | 0  |         assert(num_surrogates == 0);  | 
1834  |  | 
  | 
1835  | 0  |         _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);  | 
1836  | 0  |         _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);  | 
1837  | 0  |         _PyUnicode_UTF8(unicode) = NULL;  | 
1838  | 0  |         _PyUnicode_UTF8_LENGTH(unicode) = 0;  | 
1839  | 0  |         _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;  | 
1840  | 0  | #endif  | 
1841  | 0  |         PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';  | 
1842  | 0  |     }  | 
1843  | 0  |     _PyUnicode_STATE(unicode).ready = 1;  | 
1844  | 0  |     assert(_PyUnicode_CheckConsistency(unicode, 1));  | 
1845  | 0  |     return 0;  | 
1846  | 0  | }  | 
1847  |  |  | 
1848  |  | static void  | 
1849  |  | unicode_dealloc(PyObject *unicode)  | 
1850  | 76.9k  | { | 
1851  | 76.9k  |     switch (PyUnicode_CHECK_INTERNED(unicode)) { | 
1852  | 75.7k  |     case SSTATE_NOT_INTERNED:  | 
1853  | 75.7k  |         break;  | 
1854  |  |  | 
1855  | 1.16k  |     case SSTATE_INTERNED_MORTAL:  | 
1856  |  |         /* revive dead object temporarily for DelItem */  | 
1857  | 1.16k  |         Py_REFCNT(unicode) = 3;  | 
1858  | 1.16k  |         if (PyDict_DelItem(interned, unicode) != 0)  | 
1859  | 0  |             Py_FatalError(  | 
1860  | 0  |                 "deletion of interned string failed");  | 
1861  | 1.16k  |         break;  | 
1862  |  |  | 
1863  | 1.16k  |     case SSTATE_INTERNED_IMMORTAL:  | 
1864  | 0  |         Py_FatalError("Immortal interned string died."); | 
1865  |  |         /* fall through */  | 
1866  |  |  | 
1867  | 0  |     default:  | 
1868  | 0  |         Py_FatalError("Inconsistent interned string state."); | 
1869  | 76.9k  |     }  | 
1870  |  |  | 
1871  | 76.9k  |     if (_PyUnicode_HAS_WSTR_MEMORY(unicode))  | 
1872  | 0  |         PyObject_DEL(_PyUnicode_WSTR(unicode));  | 
1873  | 76.9k  |     if (_PyUnicode_HAS_UTF8_MEMORY(unicode))  | 
1874  | 0  |         PyObject_DEL(_PyUnicode_UTF8(unicode));  | 
1875  | 76.9k  |     if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))  | 
1876  | 0  |         PyObject_DEL(_PyUnicode_DATA_ANY(unicode));  | 
1877  |  |  | 
1878  | 76.9k  |     Py_TYPE(unicode)->tp_free(unicode);  | 
1879  | 76.9k  | }  | 
1880  |  |  | 
1881  |  | #ifdef Py_DEBUG  | 
1882  |  | static int  | 
1883  |  | unicode_is_singleton(PyObject *unicode)  | 
1884  |  | { | 
1885  |  |     PyASCIIObject *ascii = (PyASCIIObject *)unicode;  | 
1886  |  |     if (unicode == unicode_empty)  | 
1887  |  |         return 1;  | 
1888  |  |     if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)  | 
1889  |  |     { | 
1890  |  |         Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);  | 
1891  |  |         if (ch < 256 && unicode_latin1[ch] == unicode)  | 
1892  |  |             return 1;  | 
1893  |  |     }  | 
1894  |  |     return 0;  | 
1895  |  | }  | 
1896  |  | #endif  | 
1897  |  |  | 
1898  |  | static int  | 
1899  |  | unicode_modifiable(PyObject *unicode)  | 
1900  | 3.60k  | { | 
1901  | 3.60k  |     assert(_PyUnicode_CHECK(unicode));  | 
1902  | 3.60k  |     if (Py_REFCNT(unicode) != 1)  | 
1903  | 3.40k  |         return 0;  | 
1904  | 208  |     if (_PyUnicode_HASH(unicode) != -1)  | 
1905  | 0  |         return 0;  | 
1906  | 208  |     if (PyUnicode_CHECK_INTERNED(unicode))  | 
1907  | 0  |         return 0;  | 
1908  | 208  |     if (!PyUnicode_CheckExact(unicode))  | 
1909  | 0  |         return 0;  | 
1910  |  | #ifdef Py_DEBUG  | 
1911  |  |     /* singleton refcount is greater than 1 */  | 
1912  |  |     assert(!unicode_is_singleton(unicode));  | 
1913  |  | #endif  | 
1914  | 208  |     return 1;  | 
1915  | 208  | }  | 
1916  |  |  | 
1917  |  | static int  | 
1918  |  | unicode_resize(PyObject **p_unicode, Py_ssize_t length)  | 
1919  | 104  | { | 
1920  | 104  |     PyObject *unicode;  | 
1921  | 104  |     Py_ssize_t old_length;  | 
1922  |  |  | 
1923  | 104  |     assert(p_unicode != NULL);  | 
1924  | 104  |     unicode = *p_unicode;  | 
1925  |  |  | 
1926  | 104  |     assert(unicode != NULL);  | 
1927  | 104  |     assert(PyUnicode_Check(unicode));  | 
1928  | 104  |     assert(0 <= length);  | 
1929  |  |  | 
1930  | 104  |     if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)  | 
1931  | 0  |         old_length = PyUnicode_WSTR_LENGTH(unicode);  | 
1932  | 104  |     else  | 
1933  | 104  |         old_length = PyUnicode_GET_LENGTH(unicode);  | 
1934  | 104  |     if (old_length == length)  | 
1935  | 0  |         return 0;  | 
1936  |  |  | 
1937  | 104  |     if (length == 0) { | 
1938  | 0  |         _Py_INCREF_UNICODE_EMPTY();  | 
1939  | 0  |         if (!unicode_empty)  | 
1940  | 0  |             return -1;  | 
1941  | 0  |         Py_SETREF(*p_unicode, unicode_empty);  | 
1942  | 0  |         return 0;  | 
1943  | 0  |     }  | 
1944  |  |  | 
1945  | 104  |     if (!unicode_modifiable(unicode)) { | 
1946  | 0  |         PyObject *copy = resize_copy(unicode, length);  | 
1947  | 0  |         if (copy == NULL)  | 
1948  | 0  |             return -1;  | 
1949  | 0  |         Py_SETREF(*p_unicode, copy);  | 
1950  | 0  |         return 0;  | 
1951  | 0  |     }  | 
1952  |  |  | 
1953  | 104  |     if (PyUnicode_IS_COMPACT(unicode)) { | 
1954  | 104  |         PyObject *new_unicode = resize_compact(unicode, length);  | 
1955  | 104  |         if (new_unicode == NULL)  | 
1956  | 0  |             return -1;  | 
1957  | 104  |         *p_unicode = new_unicode;  | 
1958  | 104  |         return 0;  | 
1959  | 104  |     }  | 
1960  | 0  |     return resize_inplace(unicode, length);  | 
1961  | 104  | }  | 
1962  |  |  | 
1963  |  | int  | 
1964  |  | PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)  | 
1965  | 0  | { | 
1966  | 0  |     PyObject *unicode;  | 
1967  | 0  |     if (p_unicode == NULL) { | 
1968  | 0  |         PyErr_BadInternalCall();  | 
1969  | 0  |         return -1;  | 
1970  | 0  |     }  | 
1971  | 0  |     unicode = *p_unicode;  | 
1972  | 0  |     if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)  | 
1973  | 0  |     { | 
1974  | 0  |         PyErr_BadInternalCall();  | 
1975  | 0  |         return -1;  | 
1976  | 0  |     }  | 
1977  | 0  |     return unicode_resize(p_unicode, length);  | 
1978  | 0  | }  | 
1979  |  |  | 
1980  |  | /* Copy an ASCII or latin1 char* string into a Python Unicode string.  | 
1981  |  |  | 
1982  |  |    WARNING: The function doesn't copy the terminating null character and  | 
1983  |  |    doesn't check the maximum character (may write a latin1 character in an  | 
1984  |  |    ASCII string). */  | 
1985  |  | static void  | 
1986  |  | unicode_write_cstr(PyObject *unicode, Py_ssize_t index,  | 
1987  |  |                    const char *str, Py_ssize_t len)  | 
1988  | 0  | { | 
1989  | 0  |     enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);  | 
1990  | 0  |     void *data = PyUnicode_DATA(unicode);  | 
1991  | 0  |     const char *end = str + len;  | 
1992  |  | 
  | 
1993  | 0  |     switch (kind) { | 
1994  | 0  |     case PyUnicode_1BYTE_KIND: { | 
1995  | 0  |         assert(index + len <= PyUnicode_GET_LENGTH(unicode));  | 
1996  |  | #ifdef Py_DEBUG  | 
1997  |  |         if (PyUnicode_IS_ASCII(unicode)) { | 
1998  |  |             Py_UCS4 maxchar = ucs1lib_find_max_char(  | 
1999  |  |                 (const Py_UCS1*)str,  | 
2000  |  |                 (const Py_UCS1*)str + len);  | 
2001  |  |             assert(maxchar < 128);  | 
2002  |  |         }  | 
2003  |  | #endif  | 
2004  | 0  |         memcpy((char *) data + index, str, len);  | 
2005  | 0  |         break;  | 
2006  | 0  |     }  | 
2007  | 0  |     case PyUnicode_2BYTE_KIND: { | 
2008  | 0  |         Py_UCS2 *start = (Py_UCS2 *)data + index;  | 
2009  | 0  |         Py_UCS2 *ucs2 = start;  | 
2010  | 0  |         assert(index <= PyUnicode_GET_LENGTH(unicode));  | 
2011  |  | 
  | 
2012  | 0  |         for (; str < end; ++ucs2, ++str)  | 
2013  | 0  |             *ucs2 = (Py_UCS2)*str;  | 
2014  |  | 
  | 
2015  | 0  |         assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));  | 
2016  | 0  |         break;  | 
2017  | 0  |     }  | 
2018  | 0  |     default: { | 
2019  | 0  |         Py_UCS4 *start = (Py_UCS4 *)data + index;  | 
2020  | 0  |         Py_UCS4 *ucs4 = start;  | 
2021  | 0  |         assert(kind == PyUnicode_4BYTE_KIND);  | 
2022  | 0  |         assert(index <= PyUnicode_GET_LENGTH(unicode));  | 
2023  |  | 
  | 
2024  | 0  |         for (; str < end; ++ucs4, ++str)  | 
2025  | 0  |             *ucs4 = (Py_UCS4)*str;  | 
2026  |  | 
  | 
2027  | 0  |         assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));  | 
2028  | 0  |     }  | 
2029  | 0  |     }  | 
2030  | 0  | }  | 
2031  |  |  | 
2032  |  | static PyObject*  | 
2033  |  | get_latin1_char(unsigned char ch)  | 
2034  | 8.09k  | { | 
2035  | 8.09k  |     PyObject *unicode = unicode_latin1[ch];  | 
2036  | 8.09k  |     if (!unicode) { | 
2037  | 932  |         unicode = PyUnicode_New(1, ch);  | 
2038  | 932  |         if (!unicode)  | 
2039  | 0  |             return NULL;  | 
2040  | 932  |         PyUnicode_1BYTE_DATA(unicode)[0] = ch;  | 
2041  | 932  |         assert(_PyUnicode_CheckConsistency(unicode, 1));  | 
2042  | 932  |         unicode_latin1[ch] = unicode;  | 
2043  | 932  |     }  | 
2044  | 8.09k  |     Py_INCREF(unicode);  | 
2045  | 8.09k  |     return unicode;  | 
2046  | 8.09k  | }  | 
2047  |  |  | 
2048  |  | static PyObject*  | 
2049  |  | unicode_char(Py_UCS4 ch)  | 
2050  | 5.56k  | { | 
2051  | 5.56k  |     PyObject *unicode;  | 
2052  |  |  | 
2053  | 5.56k  |     assert(ch <= MAX_UNICODE);  | 
2054  |  |  | 
2055  | 5.56k  |     if (ch < 256)  | 
2056  | 5.56k  |         return get_latin1_char(ch);  | 
2057  |  |  | 
2058  | 0  |     unicode = PyUnicode_New(1, ch);  | 
2059  | 0  |     if (unicode == NULL)  | 
2060  | 0  |         return NULL;  | 
2061  |  |  | 
2062  | 0  |     assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);  | 
2063  | 0  |     if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) { | 
2064  | 0  |         PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;  | 
2065  | 0  |     } else { | 
2066  | 0  |         assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);  | 
2067  | 0  |         PyUnicode_4BYTE_DATA(unicode)[0] = ch;  | 
2068  | 0  |     }  | 
2069  | 0  |     assert(_PyUnicode_CheckConsistency(unicode, 1));  | 
2070  | 0  |     return unicode;  | 
2071  | 0  | }  | 
2072  |  |  | 
2073  |  | PyObject *  | 
2074  |  | PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)  | 
2075  | 0  | { | 
2076  | 0  |     if (u == NULL)  | 
2077  | 0  |         return (PyObject*)_PyUnicode_New(size);  | 
2078  |  |  | 
2079  | 0  |     if (size < 0) { | 
2080  | 0  |         PyErr_BadInternalCall();  | 
2081  | 0  |         return NULL;  | 
2082  | 0  |     }  | 
2083  |  |  | 
2084  | 0  |     return PyUnicode_FromWideChar(u, size);  | 
2085  | 0  | }  | 
2086  |  |  | 
2087  |  | PyObject *  | 
2088  |  | PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)  | 
2089  | 5.15k  | { | 
2090  | 5.15k  |     PyObject *unicode;  | 
2091  | 5.15k  |     Py_UCS4 maxchar = 0;  | 
2092  | 5.15k  |     Py_ssize_t num_surrogates;  | 
2093  |  |  | 
2094  | 5.15k  |     if (u == NULL && size != 0) { | 
2095  | 0  |         PyErr_BadInternalCall();  | 
2096  | 0  |         return NULL;  | 
2097  | 0  |     }  | 
2098  |  |  | 
2099  | 5.15k  |     if (size == -1) { | 
2100  | 294  |         size = wcslen(u);  | 
2101  | 294  |     }  | 
2102  |  |  | 
2103  |  |     /* If the Unicode data is known at construction time, we can apply  | 
2104  |  |        some optimizations which share commonly used objects. */  | 
2105  |  |  | 
2106  |  |     /* Optimization for empty strings */  | 
2107  | 5.15k  |     if (size == 0)  | 
2108  | 14  |         _Py_RETURN_UNICODE_EMPTY();  | 
2109  |  |  | 
2110  |  |     /* Single character Unicode objects in the Latin-1 range are  | 
2111  |  |        shared when using this constructor */  | 
2112  | 5.14k  |     if (size == 1 && (Py_UCS4)*u < 256)  | 
2113  | 0  |         return get_latin1_char((unsigned char)*u);  | 
2114  |  |  | 
2115  |  |     /* If not empty and not single character, copy the Unicode data  | 
2116  |  |        into the new object */  | 
2117  | 5.14k  |     if (find_maxchar_surrogates(u, u + size,  | 
2118  | 5.14k  |                                 &maxchar, &num_surrogates) == -1)  | 
2119  | 0  |         return NULL;  | 
2120  |  |  | 
2121  | 5.14k  |     unicode = PyUnicode_New(size - num_surrogates, maxchar);  | 
2122  | 5.14k  |     if (!unicode)  | 
2123  | 0  |         return NULL;  | 
2124  |  |  | 
2125  | 5.14k  |     switch (PyUnicode_KIND(unicode)) { | 
2126  | 5.14k  |     case PyUnicode_1BYTE_KIND:  | 
2127  | 5.14k  |         _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,  | 
2128  | 5.14k  |                                 u, u + size, PyUnicode_1BYTE_DATA(unicode));  | 
2129  | 5.14k  |         break;  | 
2130  | 0  |     case PyUnicode_2BYTE_KIND:  | 
2131  |  | #if Py_UNICODE_SIZE == 2  | 
2132  |  |         memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);  | 
2133  |  | #else  | 
2134  | 0  |         _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,  | 
2135  | 0  |                                 u, u + size, PyUnicode_2BYTE_DATA(unicode));  | 
2136  | 0  | #endif  | 
2137  | 0  |         break;  | 
2138  | 0  |     case PyUnicode_4BYTE_KIND:  | 
2139  |  | #if SIZEOF_WCHAR_T == 2  | 
2140  |  |         /* This is the only case which has to process surrogates, thus  | 
2141  |  |            a simple copy loop is not enough and we need a function. */  | 
2142  |  |         unicode_convert_wchar_to_ucs4(u, u + size, unicode);  | 
2143  |  | #else  | 
2144  | 0  |         assert(num_surrogates == 0);  | 
2145  | 0  |         memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);  | 
2146  | 0  | #endif  | 
2147  | 0  |         break;  | 
2148  | 0  |     default:  | 
2149  | 0  |         Py_UNREACHABLE();  | 
2150  | 5.14k  |     }  | 
2151  |  |  | 
2152  | 5.14k  |     return unicode_result(unicode);  | 
2153  | 5.14k  | }  | 
2154  |  |  | 
2155  |  | PyObject *  | 
2156  |  | PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)  | 
2157  | 2.20k  | { | 
2158  | 2.20k  |     if (size < 0) { | 
2159  | 0  |         PyErr_SetString(PyExc_SystemError,  | 
2160  | 0  |                         "Negative size passed to PyUnicode_FromStringAndSize");  | 
2161  | 0  |         return NULL;  | 
2162  | 0  |     }  | 
2163  | 2.20k  |     if (u != NULL)  | 
2164  | 2.18k  |         return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);  | 
2165  | 14  |     else  | 
2166  | 14  |         return (PyObject *)_PyUnicode_New(size);  | 
2167  | 2.20k  | }  | 
2168  |  |  | 
2169  |  | PyObject *  | 
2170  |  | PyUnicode_FromString(const char *u)  | 
2171  | 55.8k  | { | 
2172  | 55.8k  |     size_t size = strlen(u);  | 
2173  | 55.8k  |     if (size > PY_SSIZE_T_MAX) { | 
2174  | 0  |         PyErr_SetString(PyExc_OverflowError, "input too long");  | 
2175  | 0  |         return NULL;  | 
2176  | 0  |     }  | 
2177  | 55.8k  |     return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);  | 
2178  | 55.8k  | }  | 
2179  |  |  | 
2180  |  | PyObject *  | 
2181  |  | _PyUnicode_FromId(_Py_Identifier *id)  | 
2182  | 101k  | { | 
2183  | 101k  |     if (!id->object) { | 
2184  | 1.41k  |         id->object = PyUnicode_DecodeUTF8Stateful(id->string,  | 
2185  | 1.41k  |                                                   strlen(id->string),  | 
2186  | 1.41k  |                                                   NULL, NULL);  | 
2187  | 1.41k  |         if (!id->object)  | 
2188  | 0  |             return NULL;  | 
2189  | 1.41k  |         PyUnicode_InternInPlace(&id->object);  | 
2190  | 1.41k  |         assert(!id->next);  | 
2191  | 1.41k  |         id->next = static_strings;  | 
2192  | 1.41k  |         static_strings = id;  | 
2193  | 1.41k  |     }  | 
2194  | 101k  |     return id->object;  | 
2195  | 101k  | }  | 
2196  |  |  | 
2197  |  | void  | 
2198  |  | _PyUnicode_ClearStaticStrings()  | 
2199  | 0  | { | 
2200  | 0  |     _Py_Identifier *tmp, *s = static_strings;  | 
2201  | 0  |     while (s) { | 
2202  | 0  |         Py_CLEAR(s->object);  | 
2203  | 0  |         tmp = s->next;  | 
2204  | 0  |         s->next = NULL;  | 
2205  | 0  |         s = tmp;  | 
2206  | 0  |     }  | 
2207  | 0  |     static_strings = NULL;  | 
2208  | 0  | }  | 
2209  |  |  | 
2210  |  | /* Internal function, doesn't check maximum character */  | 
2211  |  |  | 
2212  |  | PyObject*  | 
2213  |  | _PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)  | 
2214  | 3.97k  | { | 
2215  | 3.97k  |     const unsigned char *s = (const unsigned char *)buffer;  | 
2216  | 3.97k  |     PyObject *unicode;  | 
2217  | 3.97k  |     if (size == 1) { | 
2218  |  | #ifdef Py_DEBUG  | 
2219  |  |         assert((unsigned char)s[0] < 128);  | 
2220  |  | #endif  | 
2221  | 196  |         return get_latin1_char(s[0]);  | 
2222  | 196  |     }  | 
2223  | 3.78k  |     unicode = PyUnicode_New(size, 127);  | 
2224  | 3.78k  |     if (!unicode)  | 
2225  | 0  |         return NULL;  | 
2226  | 3.78k  |     memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);  | 
2227  | 3.78k  |     assert(_PyUnicode_CheckConsistency(unicode, 1));  | 
2228  | 3.78k  |     return unicode;  | 
2229  | 3.78k  | }  | 
2230  |  |  | 
2231  |  | static Py_UCS4  | 
2232  |  | kind_maxchar_limit(unsigned int kind)  | 
2233  | 0  | { | 
2234  | 0  |     switch (kind) { | 
2235  | 0  |     case PyUnicode_1BYTE_KIND:  | 
2236  | 0  |         return 0x80;  | 
2237  | 0  |     case PyUnicode_2BYTE_KIND:  | 
2238  | 0  |         return 0x100;  | 
2239  | 0  |     case PyUnicode_4BYTE_KIND:  | 
2240  | 0  |         return 0x10000;  | 
2241  | 0  |     default:  | 
2242  | 0  |         Py_UNREACHABLE();  | 
2243  | 0  |     }  | 
2244  | 0  | }  | 
2245  |  |  | 
2246  |  | static PyObject*  | 
2247  |  | _PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)  | 
2248  | 59.8k  | { | 
2249  | 59.8k  |     PyObject *res;  | 
2250  | 59.8k  |     unsigned char max_char;  | 
2251  |  |  | 
2252  | 59.8k  |     if (size == 0)  | 
2253  | 166  |         _Py_RETURN_UNICODE_EMPTY();  | 
2254  | 59.8k  |     assert(size > 0);  | 
2255  | 59.7k  |     if (size == 1)  | 
2256  | 1.76k  |         return get_latin1_char(u[0]);  | 
2257  |  |  | 
2258  | 57.9k  |     max_char = ucs1lib_find_max_char(u, u + size);  | 
2259  | 57.9k  |     res = PyUnicode_New(size, max_char);  | 
2260  | 57.9k  |     if (!res)  | 
2261  | 0  |         return NULL;  | 
2262  | 57.9k  |     memcpy(PyUnicode_1BYTE_DATA(res), u, size);  | 
2263  | 57.9k  |     assert(_PyUnicode_CheckConsistency(res, 1));  | 
2264  | 57.9k  |     return res;  | 
2265  | 57.9k  | }  | 
2266  |  |  | 
2267  |  | static PyObject*  | 
2268  |  | _PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)  | 
2269  | 0  | { | 
2270  | 0  |     PyObject *res;  | 
2271  | 0  |     Py_UCS2 max_char;  | 
2272  |  | 
  | 
2273  | 0  |     if (size == 0)  | 
2274  | 0  |         _Py_RETURN_UNICODE_EMPTY();  | 
2275  | 0  |     assert(size > 0);  | 
2276  | 0  |     if (size == 1)  | 
2277  | 0  |         return unicode_char(u[0]);  | 
2278  |  |  | 
2279  | 0  |     max_char = ucs2lib_find_max_char(u, u + size);  | 
2280  | 0  |     res = PyUnicode_New(size, max_char);  | 
2281  | 0  |     if (!res)  | 
2282  | 0  |         return NULL;  | 
2283  | 0  |     if (max_char >= 256)  | 
2284  | 0  |         memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);  | 
2285  | 0  |     else { | 
2286  | 0  |         _PyUnicode_CONVERT_BYTES(  | 
2287  | 0  |             Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));  | 
2288  | 0  |     }  | 
2289  | 0  |     assert(_PyUnicode_CheckConsistency(res, 1));  | 
2290  | 0  |     return res;  | 
2291  | 0  | }  | 
2292  |  |  | 
2293  |  | static PyObject*  | 
2294  |  | _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)  | 
2295  | 0  | { | 
2296  | 0  |     PyObject *res;  | 
2297  | 0  |     Py_UCS4 max_char;  | 
2298  |  | 
  | 
2299  | 0  |     if (size == 0)  | 
2300  | 0  |         _Py_RETURN_UNICODE_EMPTY();  | 
2301  | 0  |     assert(size > 0);  | 
2302  | 0  |     if (size == 1)  | 
2303  | 0  |         return unicode_char(u[0]);  | 
2304  |  |  | 
2305  | 0  |     max_char = ucs4lib_find_max_char(u, u + size);  | 
2306  | 0  |     res = PyUnicode_New(size, max_char);  | 
2307  | 0  |     if (!res)  | 
2308  | 0  |         return NULL;  | 
2309  | 0  |     if (max_char < 256)  | 
2310  | 0  |         _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,  | 
2311  | 0  |                                  PyUnicode_1BYTE_DATA(res));  | 
2312  | 0  |     else if (max_char < 0x10000)  | 
2313  | 0  |         _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,  | 
2314  | 0  |                                  PyUnicode_2BYTE_DATA(res));  | 
2315  | 0  |     else  | 
2316  | 0  |         memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);  | 
2317  | 0  |     assert(_PyUnicode_CheckConsistency(res, 1));  | 
2318  | 0  |     return res;  | 
2319  | 0  | }  | 
2320  |  |  | 
2321  |  | PyObject*  | 
2322  |  | PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)  | 
2323  | 59.8k  | { | 
2324  | 59.8k  |     if (size < 0) { | 
2325  | 0  |         PyErr_SetString(PyExc_ValueError, "size must be positive");  | 
2326  | 0  |         return NULL;  | 
2327  | 0  |     }  | 
2328  | 59.8k  |     switch (kind) { | 
2329  | 59.8k  |     case PyUnicode_1BYTE_KIND:  | 
2330  | 59.8k  |         return _PyUnicode_FromUCS1(buffer, size);  | 
2331  | 0  |     case PyUnicode_2BYTE_KIND:  | 
2332  | 0  |         return _PyUnicode_FromUCS2(buffer, size);  | 
2333  | 0  |     case PyUnicode_4BYTE_KIND:  | 
2334  | 0  |         return _PyUnicode_FromUCS4(buffer, size);  | 
2335  | 0  |     default:  | 
2336  | 0  |         PyErr_SetString(PyExc_SystemError, "invalid kind");  | 
2337  | 0  |         return NULL;  | 
2338  | 59.8k  |     }  | 
2339  | 59.8k  | }  | 
2340  |  |  | 
2341  |  | Py_UCS4  | 
2342  |  | _PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)  | 
2343  | 120  | { | 
2344  | 120  |     enum PyUnicode_Kind kind;  | 
2345  | 120  |     void *startptr, *endptr;  | 
2346  |  |  | 
2347  | 120  |     assert(PyUnicode_IS_READY(unicode));  | 
2348  | 120  |     assert(0 <= start);  | 
2349  | 120  |     assert(end <= PyUnicode_GET_LENGTH(unicode));  | 
2350  | 120  |     assert(start <= end);  | 
2351  |  |  | 
2352  | 120  |     if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))  | 
2353  | 0  |         return PyUnicode_MAX_CHAR_VALUE(unicode);  | 
2354  |  |  | 
2355  | 120  |     if (start == end)  | 
2356  | 0  |         return 127;  | 
2357  |  |  | 
2358  | 120  |     if (PyUnicode_IS_ASCII(unicode))  | 
2359  | 120  |         return 127;  | 
2360  |  |  | 
2361  | 0  |     kind = PyUnicode_KIND(unicode);  | 
2362  | 0  |     startptr = PyUnicode_DATA(unicode);  | 
2363  | 0  |     endptr = (char *)startptr + end * kind;  | 
2364  | 0  |     startptr = (char *)startptr + start * kind;  | 
2365  | 0  |     switch(kind) { | 
2366  | 0  |     case PyUnicode_1BYTE_KIND:  | 
2367  | 0  |         return ucs1lib_find_max_char(startptr, endptr);  | 
2368  | 0  |     case PyUnicode_2BYTE_KIND:  | 
2369  | 0  |         return ucs2lib_find_max_char(startptr, endptr);  | 
2370  | 0  |     case PyUnicode_4BYTE_KIND:  | 
2371  | 0  |         return ucs4lib_find_max_char(startptr, endptr);  | 
2372  | 0  |     default:  | 
2373  | 0  |         Py_UNREACHABLE();  | 
2374  | 0  |     }  | 
2375  | 0  | }  | 
2376  |  |  | 
2377  |  | /* Ensure that a string uses the most efficient storage, if it is not the  | 
2378  |  |    case: create a new string with of the right kind. Write NULL into *p_unicode  | 
2379  |  |    on error. */  | 
2380  |  | static void  | 
2381  |  | unicode_adjust_maxchar(PyObject **p_unicode)  | 
2382  | 0  | { | 
2383  | 0  |     PyObject *unicode, *copy;  | 
2384  | 0  |     Py_UCS4 max_char;  | 
2385  | 0  |     Py_ssize_t len;  | 
2386  | 0  |     unsigned int kind;  | 
2387  |  | 
  | 
2388  | 0  |     assert(p_unicode != NULL);  | 
2389  | 0  |     unicode = *p_unicode;  | 
2390  | 0  |     assert(PyUnicode_IS_READY(unicode));  | 
2391  | 0  |     if (PyUnicode_IS_ASCII(unicode))  | 
2392  | 0  |         return;  | 
2393  |  |  | 
2394  | 0  |     len = PyUnicode_GET_LENGTH(unicode);  | 
2395  | 0  |     kind = PyUnicode_KIND(unicode);  | 
2396  | 0  |     if (kind == PyUnicode_1BYTE_KIND) { | 
2397  | 0  |         const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);  | 
2398  | 0  |         max_char = ucs1lib_find_max_char(u, u + len);  | 
2399  | 0  |         if (max_char >= 128)  | 
2400  | 0  |             return;  | 
2401  | 0  |     }  | 
2402  | 0  |     else if (kind == PyUnicode_2BYTE_KIND) { | 
2403  | 0  |         const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);  | 
2404  | 0  |         max_char = ucs2lib_find_max_char(u, u + len);  | 
2405  | 0  |         if (max_char >= 256)  | 
2406  | 0  |             return;  | 
2407  | 0  |     }  | 
2408  | 0  |     else { | 
2409  | 0  |         const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);  | 
2410  | 0  |         assert(kind == PyUnicode_4BYTE_KIND);  | 
2411  | 0  |         max_char = ucs4lib_find_max_char(u, u + len);  | 
2412  | 0  |         if (max_char >= 0x10000)  | 
2413  | 0  |             return;  | 
2414  | 0  |     }  | 
2415  | 0  |     copy = PyUnicode_New(len, max_char);  | 
2416  | 0  |     if (copy != NULL)  | 
2417  | 0  |         _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);  | 
2418  | 0  |     Py_DECREF(unicode);  | 
2419  | 0  |     *p_unicode = copy;  | 
2420  | 0  | }  | 
2421  |  |  | 
2422  |  | PyObject*  | 
2423  |  | _PyUnicode_Copy(PyObject *unicode)  | 
2424  | 0  | { | 
2425  | 0  |     Py_ssize_t length;  | 
2426  | 0  |     PyObject *copy;  | 
2427  |  | 
  | 
2428  | 0  |     if (!PyUnicode_Check(unicode)) { | 
2429  | 0  |         PyErr_BadInternalCall();  | 
2430  | 0  |         return NULL;  | 
2431  | 0  |     }  | 
2432  | 0  |     if (PyUnicode_READY(unicode) == -1)  | 
2433  | 0  |         return NULL;  | 
2434  |  |  | 
2435  | 0  |     length = PyUnicode_GET_LENGTH(unicode);  | 
2436  | 0  |     copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));  | 
2437  | 0  |     if (!copy)  | 
2438  | 0  |         return NULL;  | 
2439  | 0  |     assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));  | 
2440  |  | 
  | 
2441  | 0  |     memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),  | 
2442  | 0  |               length * PyUnicode_KIND(unicode));  | 
2443  | 0  |     assert(_PyUnicode_CheckConsistency(copy, 1));  | 
2444  | 0  |     return copy;  | 
2445  | 0  | }  | 
2446  |  |  | 
2447  |  |  | 
2448  |  | /* Widen Unicode objects to larger buffers. Don't write terminating null  | 
2449  |  |    character. Return NULL on error. */  | 
2450  |  |  | 
2451  |  | void*  | 
2452  |  | _PyUnicode_AsKind(PyObject *s, unsigned int kind)  | 
2453  | 0  | { | 
2454  | 0  |     Py_ssize_t len;  | 
2455  | 0  |     void *result;  | 
2456  | 0  |     unsigned int skind;  | 
2457  |  | 
  | 
2458  | 0  |     if (PyUnicode_READY(s) == -1)  | 
2459  | 0  |         return NULL;  | 
2460  |  |  | 
2461  | 0  |     len = PyUnicode_GET_LENGTH(s);  | 
2462  | 0  |     skind = PyUnicode_KIND(s);  | 
2463  | 0  |     if (skind >= kind) { | 
2464  | 0  |         PyErr_SetString(PyExc_SystemError, "invalid widening attempt");  | 
2465  | 0  |         return NULL;  | 
2466  | 0  |     }  | 
2467  | 0  |     switch (kind) { | 
2468  | 0  |     case PyUnicode_2BYTE_KIND:  | 
2469  | 0  |         result = PyMem_New(Py_UCS2, len);  | 
2470  | 0  |         if (!result)  | 
2471  | 0  |             return PyErr_NoMemory();  | 
2472  | 0  |         assert(skind == PyUnicode_1BYTE_KIND);  | 
2473  | 0  |         _PyUnicode_CONVERT_BYTES(  | 
2474  | 0  |             Py_UCS1, Py_UCS2,  | 
2475  | 0  |             PyUnicode_1BYTE_DATA(s),  | 
2476  | 0  |             PyUnicode_1BYTE_DATA(s) + len,  | 
2477  | 0  |             result);  | 
2478  | 0  |         return result;  | 
2479  | 0  |     case PyUnicode_4BYTE_KIND:  | 
2480  | 0  |         result = PyMem_New(Py_UCS4, len);  | 
2481  | 0  |         if (!result)  | 
2482  | 0  |             return PyErr_NoMemory();  | 
2483  | 0  |         if (skind == PyUnicode_2BYTE_KIND) { | 
2484  | 0  |             _PyUnicode_CONVERT_BYTES(  | 
2485  | 0  |                 Py_UCS2, Py_UCS4,  | 
2486  | 0  |                 PyUnicode_2BYTE_DATA(s),  | 
2487  | 0  |                 PyUnicode_2BYTE_DATA(s) + len,  | 
2488  | 0  |                 result);  | 
2489  | 0  |         }  | 
2490  | 0  |         else { | 
2491  | 0  |             assert(skind == PyUnicode_1BYTE_KIND);  | 
2492  | 0  |             _PyUnicode_CONVERT_BYTES(  | 
2493  | 0  |                 Py_UCS1, Py_UCS4,  | 
2494  | 0  |                 PyUnicode_1BYTE_DATA(s),  | 
2495  | 0  |                 PyUnicode_1BYTE_DATA(s) + len,  | 
2496  | 0  |                 result);  | 
2497  | 0  |         }  | 
2498  | 0  |         return result;  | 
2499  | 0  |     default:  | 
2500  | 0  |         break;  | 
2501  | 0  |     }  | 
2502  | 0  |     PyErr_SetString(PyExc_SystemError, "invalid kind");  | 
2503  | 0  |     return NULL;  | 
2504  | 0  | }  | 
2505  |  |  | 
2506  |  | static Py_UCS4*  | 
2507  |  | as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,  | 
2508  |  |         int copy_null)  | 
2509  | 0  | { | 
2510  | 0  |     int kind;  | 
2511  | 0  |     void *data;  | 
2512  | 0  |     Py_ssize_t len, targetlen;  | 
2513  | 0  |     if (PyUnicode_READY(string) == -1)  | 
2514  | 0  |         return NULL;  | 
2515  | 0  |     kind = PyUnicode_KIND(string);  | 
2516  | 0  |     data = PyUnicode_DATA(string);  | 
2517  | 0  |     len = PyUnicode_GET_LENGTH(string);  | 
2518  | 0  |     targetlen = len;  | 
2519  | 0  |     if (copy_null)  | 
2520  | 0  |         targetlen++;  | 
2521  | 0  |     if (!target) { | 
2522  | 0  |         target = PyMem_New(Py_UCS4, targetlen);  | 
2523  | 0  |         if (!target) { | 
2524  | 0  |             PyErr_NoMemory();  | 
2525  | 0  |             return NULL;  | 
2526  | 0  |         }  | 
2527  | 0  |     }  | 
2528  | 0  |     else { | 
2529  | 0  |         if (targetsize < targetlen) { | 
2530  | 0  |             PyErr_Format(PyExc_SystemError,  | 
2531  | 0  |                          "string is longer than the buffer");  | 
2532  | 0  |             if (copy_null && 0 < targetsize)  | 
2533  | 0  |                 target[0] = 0;  | 
2534  | 0  |             return NULL;  | 
2535  | 0  |         }  | 
2536  | 0  |     }  | 
2537  | 0  |     if (kind == PyUnicode_1BYTE_KIND) { | 
2538  | 0  |         Py_UCS1 *start = (Py_UCS1 *) data;  | 
2539  | 0  |         _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);  | 
2540  | 0  |     }  | 
2541  | 0  |     else if (kind == PyUnicode_2BYTE_KIND) { | 
2542  | 0  |         Py_UCS2 *start = (Py_UCS2 *) data;  | 
2543  | 0  |         _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);  | 
2544  | 0  |     }  | 
2545  | 0  |     else { | 
2546  | 0  |         assert(kind == PyUnicode_4BYTE_KIND);  | 
2547  | 0  |         memcpy(target, data, len * sizeof(Py_UCS4));  | 
2548  | 0  |     }  | 
2549  | 0  |     if (copy_null)  | 
2550  | 0  |         target[len] = 0;  | 
2551  | 0  |     return target;  | 
2552  | 0  | }  | 
2553  |  |  | 
2554  |  | Py_UCS4*  | 
2555  |  | PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,  | 
2556  |  |                  int copy_null)  | 
2557  | 0  | { | 
2558  | 0  |     if (target == NULL || targetsize < 0) { | 
2559  | 0  |         PyErr_BadInternalCall();  | 
2560  | 0  |         return NULL;  | 
2561  | 0  |     }  | 
2562  | 0  |     return as_ucs4(string, target, targetsize, copy_null);  | 
2563  | 0  | }  | 
2564  |  |  | 
2565  |  | Py_UCS4*  | 
2566  |  | PyUnicode_AsUCS4Copy(PyObject *string)  | 
2567  | 0  | { | 
2568  | 0  |     return as_ucs4(string, NULL, 0, 1);  | 
2569  | 0  | }  | 
2570  |  |  | 
2571  |  | /* maximum number of characters required for output of %lld or %p.  | 
2572  |  |    We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,  | 
2573  |  |    plus 1 for the sign.  53/22 is an upper bound for log10(256). */  | 
2574  |  | #define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)  | 
2575  |  |  | 
2576  |  | static int  | 
2577  |  | unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,  | 
2578  |  |                              Py_ssize_t width, Py_ssize_t precision)  | 
2579  | 11.8k  | { | 
2580  | 11.8k  |     Py_ssize_t length, fill, arglen;  | 
2581  | 11.8k  |     Py_UCS4 maxchar;  | 
2582  |  |  | 
2583  | 11.8k  |     if (PyUnicode_READY(str) == -1)  | 
2584  | 0  |         return -1;  | 
2585  |  |  | 
2586  | 11.8k  |     length = PyUnicode_GET_LENGTH(str);  | 
2587  | 11.8k  |     if ((precision == -1 || precision >= length)  | 
2588  | 11.8k  |         && width <= length)  | 
2589  | 11.8k  |         return _PyUnicodeWriter_WriteStr(writer, str);  | 
2590  |  |  | 
2591  | 0  |     if (precision != -1)  | 
2592  | 0  |         length = Py_MIN(precision, length);  | 
2593  |  | 
  | 
2594  | 0  |     arglen = Py_MAX(length, width);  | 
2595  | 0  |     if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)  | 
2596  | 0  |         maxchar = _PyUnicode_FindMaxChar(str, 0, length);  | 
2597  | 0  |     else  | 
2598  | 0  |         maxchar = writer->maxchar;  | 
2599  |  | 
  | 
2600  | 0  |     if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)  | 
2601  | 0  |         return -1;  | 
2602  |  |  | 
2603  | 0  |     if (width > length) { | 
2604  | 0  |         fill = width - length;  | 
2605  | 0  |         if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)  | 
2606  | 0  |             return -1;  | 
2607  | 0  |         writer->pos += fill;  | 
2608  | 0  |     }  | 
2609  |  |  | 
2610  | 0  |     _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,  | 
2611  | 0  |                                   str, 0, length);  | 
2612  | 0  |     writer->pos += length;  | 
2613  | 0  |     return 0;  | 
2614  | 0  | }  | 
2615  |  |  | 
2616  |  | static int  | 
2617  |  | unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,  | 
2618  |  |                               Py_ssize_t width, Py_ssize_t precision)  | 
2619  | 4.95k  | { | 
2620  |  |     /* UTF-8 */  | 
2621  | 4.95k  |     Py_ssize_t length;  | 
2622  | 4.95k  |     PyObject *unicode;  | 
2623  | 4.95k  |     int res;  | 
2624  |  |  | 
2625  | 4.95k  |     if (precision == -1) { | 
2626  | 610  |         length = strlen(str);  | 
2627  | 610  |     }  | 
2628  | 4.34k  |     else { | 
2629  | 4.34k  |         length = 0;  | 
2630  | 38.4k  |         while (length < precision && str[length]) { | 
2631  | 34.1k  |             length++;  | 
2632  | 34.1k  |         }  | 
2633  | 4.34k  |     }  | 
2634  | 4.95k  |     unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);  | 
2635  | 4.95k  |     if (unicode == NULL)  | 
2636  | 0  |         return -1;  | 
2637  |  |  | 
2638  | 4.95k  |     res = unicode_fromformat_write_str(writer, unicode, width, -1);  | 
2639  | 4.95k  |     Py_DECREF(unicode);  | 
2640  | 4.95k  |     return res;  | 
2641  | 4.95k  | }  | 
2642  |  |  | 
2643  |  | static const char*  | 
2644  |  | unicode_fromformat_arg(_PyUnicodeWriter *writer,  | 
2645  |  |                        const char *f, va_list *vargs)  | 
2646  | 12.3k  | { | 
2647  | 12.3k  |     const char *p;  | 
2648  | 12.3k  |     Py_ssize_t len;  | 
2649  | 12.3k  |     int zeropad;  | 
2650  | 12.3k  |     Py_ssize_t width;  | 
2651  | 12.3k  |     Py_ssize_t precision;  | 
2652  | 12.3k  |     int longflag;  | 
2653  | 12.3k  |     int longlongflag;  | 
2654  | 12.3k  |     int size_tflag;  | 
2655  | 12.3k  |     Py_ssize_t fill;  | 
2656  |  |  | 
2657  | 12.3k  |     p = f;  | 
2658  | 12.3k  |     f++;  | 
2659  | 12.3k  |     zeropad = 0;  | 
2660  | 12.3k  |     if (*f == '0') { | 
2661  | 0  |         zeropad = 1;  | 
2662  | 0  |         f++;  | 
2663  | 0  |     }  | 
2664  |  |  | 
2665  |  |     /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */  | 
2666  | 12.3k  |     width = -1;  | 
2667  | 12.3k  |     if (Py_ISDIGIT((unsigned)*f)) { | 
2668  | 0  |         width = *f - '0';  | 
2669  | 0  |         f++;  | 
2670  | 0  |         while (Py_ISDIGIT((unsigned)*f)) { | 
2671  | 0  |             if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) { | 
2672  | 0  |                 PyErr_SetString(PyExc_ValueError,  | 
2673  | 0  |                                 "width too big");  | 
2674  | 0  |                 return NULL;  | 
2675  | 0  |             }  | 
2676  | 0  |             width = (width * 10) + (*f - '0');  | 
2677  | 0  |             f++;  | 
2678  | 0  |         }  | 
2679  | 0  |     }  | 
2680  | 12.3k  |     precision = -1;  | 
2681  | 12.3k  |     if (*f == '.') { | 
2682  | 4.34k  |         f++;  | 
2683  | 4.34k  |         if (Py_ISDIGIT((unsigned)*f)) { | 
2684  | 4.34k  |             precision = (*f - '0');  | 
2685  | 4.34k  |             f++;  | 
2686  | 9.56k  |             while (Py_ISDIGIT((unsigned)*f)) { | 
2687  | 5.22k  |                 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) { | 
2688  | 0  |                     PyErr_SetString(PyExc_ValueError,  | 
2689  | 0  |                                     "precision too big");  | 
2690  | 0  |                     return NULL;  | 
2691  | 0  |                 }  | 
2692  | 5.22k  |                 precision = (precision * 10) + (*f - '0');  | 
2693  | 5.22k  |                 f++;  | 
2694  | 5.22k  |             }  | 
2695  | 4.34k  |         }  | 
2696  | 4.34k  |         if (*f == '%') { | 
2697  |  |             /* "%.3%s" => f points to "3" */  | 
2698  | 0  |             f--;  | 
2699  | 0  |         }  | 
2700  | 4.34k  |     }  | 
2701  | 12.3k  |     if (*f == '\0') { | 
2702  |  |         /* bogus format "%.123" => go backward, f points to "3" */  | 
2703  | 0  |         f--;  | 
2704  | 0  |     }  | 
2705  |  |  | 
2706  |  |     /* Handle %ld, %lu, %lld and %llu. */  | 
2707  | 12.3k  |     longflag = 0;  | 
2708  | 12.3k  |     longlongflag = 0;  | 
2709  | 12.3k  |     size_tflag = 0;  | 
2710  | 12.3k  |     if (*f == 'l') { | 
2711  | 0  |         if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') { | 
2712  | 0  |             longflag = 1;  | 
2713  | 0  |             ++f;  | 
2714  | 0  |         }  | 
2715  | 0  |         else if (f[1] == 'l' &&  | 
2716  | 0  |                  (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) { | 
2717  | 0  |             longlongflag = 1;  | 
2718  | 0  |             f += 2;  | 
2719  | 0  |         }  | 
2720  | 0  |     }  | 
2721  |  |     /* handle the size_t flag. */  | 
2722  | 12.3k  |     else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) { | 
2723  | 86  |         size_tflag = 1;  | 
2724  | 86  |         ++f;  | 
2725  | 86  |     }  | 
2726  |  |  | 
2727  | 12.3k  |     if (f[1] == '\0')  | 
2728  | 14  |         writer->overallocate = 0;  | 
2729  |  |  | 
2730  | 12.3k  |     switch (*f) { | 
2731  | 68  |     case 'c':  | 
2732  | 68  |     { | 
2733  | 68  |         int ordinal = va_arg(*vargs, int);  | 
2734  | 68  |         if (ordinal < 0 || ordinal > MAX_UNICODE) { | 
2735  | 32  |             PyErr_SetString(PyExc_OverflowError,  | 
2736  | 32  |                             "character argument not in range(0x110000)");  | 
2737  | 32  |             return NULL;  | 
2738  | 32  |         }  | 
2739  | 36  |         if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)  | 
2740  | 0  |             return NULL;  | 
2741  | 36  |         break;  | 
2742  | 36  |     }  | 
2743  |  |  | 
2744  | 36  |     case 'i':  | 
2745  | 456  |     case 'd':  | 
2746  | 456  |     case 'u':  | 
2747  | 456  |     case 'x':  | 
2748  | 456  |     { | 
2749  |  |         /* used by sprintf */  | 
2750  | 456  |         char buffer[MAX_LONG_LONG_CHARS];  | 
2751  | 456  |         Py_ssize_t arglen;  | 
2752  |  |  | 
2753  | 456  |         if (*f == 'u') { | 
2754  | 0  |             if (longflag)  | 
2755  | 0  |                 len = sprintf(buffer, "%lu",  | 
2756  | 0  |                         va_arg(*vargs, unsigned long));  | 
2757  | 0  |             else if (longlongflag)  | 
2758  | 0  |                 len = sprintf(buffer, "%llu",  | 
2759  | 0  |                         va_arg(*vargs, unsigned long long));  | 
2760  | 0  |             else if (size_tflag)  | 
2761  | 0  |                 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",  | 
2762  | 0  |                         va_arg(*vargs, size_t));  | 
2763  | 0  |             else  | 
2764  | 0  |                 len = sprintf(buffer, "%u",  | 
2765  | 0  |                         va_arg(*vargs, unsigned int));  | 
2766  | 0  |         }  | 
2767  | 456  |         else if (*f == 'x') { | 
2768  | 0  |             len = sprintf(buffer, "%x", va_arg(*vargs, int));  | 
2769  | 0  |         }  | 
2770  | 456  |         else { | 
2771  | 456  |             if (longflag)  | 
2772  | 0  |                 len = sprintf(buffer, "%li",  | 
2773  | 0  |                         va_arg(*vargs, long));  | 
2774  | 456  |             else if (longlongflag)  | 
2775  | 0  |                 len = sprintf(buffer, "%lli",  | 
2776  | 0  |                         va_arg(*vargs, long long));  | 
2777  | 456  |             else if (size_tflag)  | 
2778  | 86  |                 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",  | 
2779  | 86  |                         va_arg(*vargs, Py_ssize_t));  | 
2780  | 370  |             else  | 
2781  | 370  |                 len = sprintf(buffer, "%i",  | 
2782  | 370  |                         va_arg(*vargs, int));  | 
2783  | 456  |         }  | 
2784  | 456  |         assert(len >= 0);  | 
2785  |  |  | 
2786  | 456  |         if (precision < len)  | 
2787  | 456  |             precision = len;  | 
2788  |  |  | 
2789  | 456  |         arglen = Py_MAX(precision, width);  | 
2790  | 456  |         if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)  | 
2791  | 0  |             return NULL;  | 
2792  |  |  | 
2793  | 456  |         if (width > precision) { | 
2794  | 0  |             Py_UCS4 fillchar;  | 
2795  | 0  |             fill = width - precision;  | 
2796  | 0  |             fillchar = zeropad?'0':' ';  | 
2797  | 0  |             if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)  | 
2798  | 0  |                 return NULL;  | 
2799  | 0  |             writer->pos += fill;  | 
2800  | 0  |         }  | 
2801  | 456  |         if (precision > len) { | 
2802  | 0  |             fill = precision - len;  | 
2803  | 0  |             if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)  | 
2804  | 0  |                 return NULL;  | 
2805  | 0  |             writer->pos += fill;  | 
2806  | 0  |         }  | 
2807  |  |  | 
2808  | 456  |         if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)  | 
2809  | 0  |             return NULL;  | 
2810  | 456  |         break;  | 
2811  | 456  |     }  | 
2812  |  |  | 
2813  | 456  |     case 'p':  | 
2814  | 0  |     { | 
2815  | 0  |         char number[MAX_LONG_LONG_CHARS];  | 
2816  |  | 
  | 
2817  | 0  |         len = sprintf(number, "%p", va_arg(*vargs, void*));  | 
2818  | 0  |         assert(len >= 0);  | 
2819  |  |  | 
2820  |  |         /* %p is ill-defined:  ensure leading 0x. */  | 
2821  | 0  |         if (number[1] == 'X')  | 
2822  | 0  |             number[1] = 'x';  | 
2823  | 0  |         else if (number[1] != 'x') { | 
2824  | 0  |             memmove(number + 2, number,  | 
2825  | 0  |                     strlen(number) + 1);  | 
2826  | 0  |             number[0] = '0';  | 
2827  | 0  |             number[1] = 'x';  | 
2828  | 0  |             len += 2;  | 
2829  | 0  |         }  | 
2830  |  | 
  | 
2831  | 0  |         if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)  | 
2832  | 0  |             return NULL;  | 
2833  | 0  |         break;  | 
2834  | 0  |     }  | 
2835  |  |  | 
2836  | 4.95k  |     case 's':  | 
2837  | 4.95k  |     { | 
2838  |  |         /* UTF-8 */  | 
2839  | 4.95k  |         const char *s = va_arg(*vargs, const char*);  | 
2840  | 4.95k  |         if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)  | 
2841  | 0  |             return NULL;  | 
2842  | 4.95k  |         break;  | 
2843  | 4.95k  |     }  | 
2844  |  |  | 
2845  | 6.88k  |     case 'U':  | 
2846  | 6.88k  |     { | 
2847  | 6.88k  |         PyObject *obj = va_arg(*vargs, PyObject *);  | 
2848  | 6.88k  |         assert(obj && _PyUnicode_CHECK(obj));  | 
2849  |  |  | 
2850  | 6.88k  |         if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)  | 
2851  | 0  |             return NULL;  | 
2852  | 6.88k  |         break;  | 
2853  | 6.88k  |     }  | 
2854  |  |  | 
2855  | 6.88k  |     case 'V':  | 
2856  | 0  |     { | 
2857  | 0  |         PyObject *obj = va_arg(*vargs, PyObject *);  | 
2858  | 0  |         const char *str = va_arg(*vargs, const char *);  | 
2859  | 0  |         if (obj) { | 
2860  | 0  |             assert(_PyUnicode_CHECK(obj));  | 
2861  | 0  |             if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)  | 
2862  | 0  |                 return NULL;  | 
2863  | 0  |         }  | 
2864  | 0  |         else { | 
2865  | 0  |             assert(str != NULL);  | 
2866  | 0  |             if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)  | 
2867  | 0  |                 return NULL;  | 
2868  | 0  |         }  | 
2869  | 0  |         break;  | 
2870  | 0  |     }  | 
2871  |  |  | 
2872  | 0  |     case 'S':  | 
2873  | 0  |     { | 
2874  | 0  |         PyObject *obj = va_arg(*vargs, PyObject *);  | 
2875  | 0  |         PyObject *str;  | 
2876  | 0  |         assert(obj);  | 
2877  | 0  |         str = PyObject_Str(obj);  | 
2878  | 0  |         if (!str)  | 
2879  | 0  |             return NULL;  | 
2880  | 0  |         if (unicode_fromformat_write_str(writer, str, width, precision) == -1) { | 
2881  | 0  |             Py_DECREF(str);  | 
2882  | 0  |             return NULL;  | 
2883  | 0  |         }  | 
2884  | 0  |         Py_DECREF(str);  | 
2885  | 0  |         break;  | 
2886  | 0  |     }  | 
2887  |  |  | 
2888  | 28  |     case 'R':  | 
2889  | 28  |     { | 
2890  | 28  |         PyObject *obj = va_arg(*vargs, PyObject *);  | 
2891  | 28  |         PyObject *repr;  | 
2892  | 28  |         assert(obj);  | 
2893  | 28  |         repr = PyObject_Repr(obj);  | 
2894  | 28  |         if (!repr)  | 
2895  | 0  |             return NULL;  | 
2896  | 28  |         if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) { | 
2897  | 0  |             Py_DECREF(repr);  | 
2898  | 0  |             return NULL;  | 
2899  | 0  |         }  | 
2900  | 28  |         Py_DECREF(repr);  | 
2901  | 28  |         break;  | 
2902  | 28  |     }  | 
2903  |  |  | 
2904  | 0  |     case 'A':  | 
2905  | 0  |     { | 
2906  | 0  |         PyObject *obj = va_arg(*vargs, PyObject *);  | 
2907  | 0  |         PyObject *ascii;  | 
2908  | 0  |         assert(obj);  | 
2909  | 0  |         ascii = PyObject_ASCII(obj);  | 
2910  | 0  |         if (!ascii)  | 
2911  | 0  |             return NULL;  | 
2912  | 0  |         if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) { | 
2913  | 0  |             Py_DECREF(ascii);  | 
2914  | 0  |             return NULL;  | 
2915  | 0  |         }  | 
2916  | 0  |         Py_DECREF(ascii);  | 
2917  | 0  |         break;  | 
2918  | 0  |     }  | 
2919  |  |  | 
2920  | 0  |     case '%':  | 
2921  | 0  |         if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)  | 
2922  | 0  |             return NULL;  | 
2923  | 0  |         break;  | 
2924  |  |  | 
2925  | 0  |     default:  | 
2926  |  |         /* if we stumble upon an unknown formatting code, copy the rest  | 
2927  |  |            of the format string to the output string. (we cannot just  | 
2928  |  |            skip the code, since there's no way to know what's in the  | 
2929  |  |            argument list) */  | 
2930  | 0  |         len = strlen(p);  | 
2931  | 0  |         if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)  | 
2932  | 0  |             return NULL;  | 
2933  | 0  |         f = p+len;  | 
2934  | 0  |         return f;  | 
2935  | 12.3k  |     }  | 
2936  |  |  | 
2937  | 12.3k  |     f++;  | 
2938  | 12.3k  |     return f;  | 
2939  | 12.3k  | }  | 
2940  |  |  | 
2941  |  | PyObject *  | 
2942  |  | PyUnicode_FromFormatV(const char *format, va_list vargs)  | 
2943  | 6.13k  | { | 
2944  | 6.13k  |     va_list vargs2;  | 
2945  | 6.13k  |     const char *f;  | 
2946  | 6.13k  |     _PyUnicodeWriter writer;  | 
2947  |  |  | 
2948  | 6.13k  |     _PyUnicodeWriter_Init(&writer);  | 
2949  | 6.13k  |     writer.min_length = strlen(format) + 100;  | 
2950  | 6.13k  |     writer.overallocate = 1;  | 
2951  |  |  | 
2952  |  |     // Copy varags to be able to pass a reference to a subfunction.  | 
2953  | 6.13k  |     va_copy(vargs2, vargs);  | 
2954  |  |  | 
2955  | 36.4k  |     for (f = format; *f; ) { | 
2956  | 30.3k  |         if (*f == '%') { | 
2957  | 12.3k  |             f = unicode_fromformat_arg(&writer, f, &vargs2);  | 
2958  | 12.3k  |             if (f == NULL)  | 
2959  | 32  |                 goto fail;  | 
2960  | 12.3k  |         }  | 
2961  | 17.9k  |         else { | 
2962  | 17.9k  |             const char *p;  | 
2963  | 17.9k  |             Py_ssize_t len;  | 
2964  |  |  | 
2965  | 17.9k  |             p = f;  | 
2966  | 17.9k  |             do  | 
2967  | 181k  |             { | 
2968  | 181k  |                 if ((unsigned char)*p > 127) { | 
2969  | 0  |                     PyErr_Format(PyExc_ValueError,  | 
2970  | 0  |                         "PyUnicode_FromFormatV() expects an ASCII-encoded format "  | 
2971  | 0  |                         "string, got a non-ASCII byte: 0x%02x",  | 
2972  | 0  |                         (unsigned char)*p);  | 
2973  | 0  |                     goto fail;  | 
2974  | 0  |                 }  | 
2975  | 181k  |                 p++;  | 
2976  | 181k  |             }  | 
2977  | 181k  |             while (*p != '\0' && *p != '%');  | 
2978  | 17.9k  |             len = p - f;  | 
2979  |  |  | 
2980  | 17.9k  |             if (*p == '\0')  | 
2981  | 6.09k  |                 writer.overallocate = 0;  | 
2982  |  |  | 
2983  | 17.9k  |             if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)  | 
2984  | 0  |                 goto fail;  | 
2985  |  |  | 
2986  | 17.9k  |             f = p;  | 
2987  | 17.9k  |         }  | 
2988  | 30.3k  |     }  | 
2989  | 6.13k  |     va_end(vargs2);  | 
2990  | 6.10k  |     return _PyUnicodeWriter_Finish(&writer);  | 
2991  |  |  | 
2992  | 32  |   fail:  | 
2993  | 32  |     va_end(vargs2);  | 
2994  | 32  |     _PyUnicodeWriter_Dealloc(&writer);  | 
2995  | 32  |     return NULL;  | 
2996  | 6.13k  | }  | 
2997  |  |  | 
2998  |  | PyObject *  | 
2999  |  | PyUnicode_FromFormat(const char *format, ...)  | 
3000  | 28  | { | 
3001  | 28  |     PyObject* ret;  | 
3002  | 28  |     va_list vargs;  | 
3003  |  |  | 
3004  | 28  | #ifdef HAVE_STDARG_PROTOTYPES  | 
3005  | 28  |     va_start(vargs, format);  | 
3006  |  | #else  | 
3007  |  |     va_start(vargs);  | 
3008  |  | #endif  | 
3009  | 28  |     ret = PyUnicode_FromFormatV(format, vargs);  | 
3010  | 28  |     va_end(vargs);  | 
3011  | 28  |     return ret;  | 
3012  | 28  | }  | 
3013  |  |  | 
3014  |  | static Py_ssize_t  | 
3015  |  | unicode_get_widechar_size(PyObject *unicode)  | 
3016  | 378  | { | 
3017  | 378  |     Py_ssize_t res;  | 
3018  |  |  | 
3019  | 378  |     assert(unicode != NULL);  | 
3020  | 378  |     assert(_PyUnicode_CHECK(unicode));  | 
3021  |  |  | 
3022  | 378  |     if (_PyUnicode_WSTR(unicode) != NULL) { | 
3023  | 0  |         return PyUnicode_WSTR_LENGTH(unicode);  | 
3024  | 0  |     }  | 
3025  | 378  |     assert(PyUnicode_IS_READY(unicode));  | 
3026  |  |  | 
3027  | 378  |     res = _PyUnicode_LENGTH(unicode);  | 
3028  |  | #if SIZEOF_WCHAR_T == 2  | 
3029  |  |     if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) { | 
3030  |  |         const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);  | 
3031  |  |         const Py_UCS4 *end = s + res;  | 
3032  |  |         for (; s < end; ++s) { | 
3033  |  |             if (*s > 0xFFFF) { | 
3034  |  |                 ++res;  | 
3035  |  |             }  | 
3036  |  |         }  | 
3037  |  |     }  | 
3038  |  | #endif  | 
3039  | 378  |     return res;  | 
3040  | 378  | }  | 
3041  |  |  | 
3042  |  | static void  | 
3043  |  | unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)  | 
3044  | 378  | { | 
3045  | 378  |     const wchar_t *wstr;  | 
3046  |  |  | 
3047  | 378  |     assert(unicode != NULL);  | 
3048  | 378  |     assert(_PyUnicode_CHECK(unicode));  | 
3049  |  |  | 
3050  | 378  |     wstr = _PyUnicode_WSTR(unicode);  | 
3051  | 378  |     if (wstr != NULL) { | 
3052  | 0  |         memcpy(w, wstr, size * sizeof(wchar_t));  | 
3053  | 0  |         return;  | 
3054  | 0  |     }  | 
3055  | 378  |     assert(PyUnicode_IS_READY(unicode));  | 
3056  |  |  | 
3057  | 378  |     if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) { | 
3058  | 378  |         const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);  | 
3059  | 28.3k  |         for (; size--; ++s, ++w) { | 
3060  | 27.9k  |             *w = *s;  | 
3061  | 27.9k  |         }  | 
3062  | 378  |     }  | 
3063  | 0  |     else { | 
3064  | 0  | #if SIZEOF_WCHAR_T == 4  | 
3065  | 0  |         assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);  | 
3066  | 0  |         const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);  | 
3067  | 0  |         for (; size--; ++s, ++w) { | 
3068  | 0  |             *w = *s;  | 
3069  | 0  |         }  | 
3070  |  | #else  | 
3071  |  |         assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);  | 
3072  |  |         const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);  | 
3073  |  |         for (; size--; ++s, ++w) { | 
3074  |  |             Py_UCS4 ch = *s;  | 
3075  |  |             if (ch > 0xFFFF) { | 
3076  |  |                 assert(ch <= MAX_UNICODE);  | 
3077  |  |                 /* encode surrogate pair in this case */  | 
3078  |  |                 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);  | 
3079  |  |                 if (!size--)  | 
3080  |  |                     break;  | 
3081  |  |                 *w = Py_UNICODE_LOW_SURROGATE(ch);  | 
3082  |  |             }  | 
3083  |  |             else { | 
3084  |  |                 *w = ch;  | 
3085  |  |             }  | 
3086  |  |         }  | 
3087  |  | #endif  | 
3088  | 0  |     }  | 
3089  | 378  | }  | 
3090  |  |  | 
3091  |  | #ifdef HAVE_WCHAR_H  | 
3092  |  |  | 
3093  |  | /* Convert a Unicode object to a wide character string.  | 
3094  |  |  | 
3095  |  |    - If w is NULL: return the number of wide characters (including the null  | 
3096  |  |      character) required to convert the unicode object. Ignore size argument.  | 
3097  |  |  | 
3098  |  |    - Otherwise: return the number of wide characters (excluding the null  | 
3099  |  |      character) written into w. Write at most size wide characters (including  | 
3100  |  |      the null character). */  | 
3101  |  | Py_ssize_t  | 
3102  |  | PyUnicode_AsWideChar(PyObject *unicode,  | 
3103  |  |                      wchar_t *w,  | 
3104  |  |                      Py_ssize_t size)  | 
3105  | 0  | { | 
3106  | 0  |     Py_ssize_t res;  | 
3107  |  | 
  | 
3108  | 0  |     if (unicode == NULL) { | 
3109  | 0  |         PyErr_BadInternalCall();  | 
3110  | 0  |         return -1;  | 
3111  | 0  |     }  | 
3112  | 0  |     if (!PyUnicode_Check(unicode)) { | 
3113  | 0  |         PyErr_BadArgument();  | 
3114  | 0  |         return -1;  | 
3115  | 0  |     }  | 
3116  |  |  | 
3117  | 0  |     res = unicode_get_widechar_size(unicode);  | 
3118  | 0  |     if (w == NULL) { | 
3119  | 0  |         return res + 1;  | 
3120  | 0  |     }  | 
3121  |  |  | 
3122  | 0  |     if (size > res) { | 
3123  | 0  |         size = res + 1;  | 
3124  | 0  |     }  | 
3125  | 0  |     else { | 
3126  | 0  |         res = size;  | 
3127  | 0  |     }  | 
3128  | 0  |     unicode_copy_as_widechar(unicode, w, size);  | 
3129  | 0  |     return res;  | 
3130  | 0  | }  | 
3131  |  |  | 
3132  |  | wchar_t*  | 
3133  |  | PyUnicode_AsWideCharString(PyObject *unicode,  | 
3134  |  |                            Py_ssize_t *size)  | 
3135  | 378  | { | 
3136  | 378  |     wchar_t *buffer;  | 
3137  | 378  |     Py_ssize_t buflen;  | 
3138  |  |  | 
3139  | 378  |     if (unicode == NULL) { | 
3140  | 0  |         PyErr_BadInternalCall();  | 
3141  | 0  |         return NULL;  | 
3142  | 0  |     }  | 
3143  | 378  |     if (!PyUnicode_Check(unicode)) { | 
3144  | 0  |         PyErr_BadArgument();  | 
3145  | 0  |         return NULL;  | 
3146  | 0  |     }  | 
3147  |  |  | 
3148  | 378  |     buflen = unicode_get_widechar_size(unicode);  | 
3149  | 378  |     buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));  | 
3150  | 378  |     if (buffer == NULL) { | 
3151  | 0  |         PyErr_NoMemory();  | 
3152  | 0  |         return NULL;  | 
3153  | 0  |     }  | 
3154  | 378  |     unicode_copy_as_widechar(unicode, buffer, buflen + 1);  | 
3155  | 378  |     if (size != NULL) { | 
3156  | 350  |         *size = buflen;  | 
3157  | 350  |     }  | 
3158  | 28  |     else if (wcslen(buffer) != (size_t)buflen) { | 
3159  | 0  |         PyMem_FREE(buffer);  | 
3160  | 0  |         PyErr_SetString(PyExc_ValueError,  | 
3161  | 0  |                         "embedded null character");  | 
3162  | 0  |         return NULL;  | 
3163  | 0  |     }  | 
3164  | 378  |     return buffer;  | 
3165  | 378  | }  | 
3166  |  |  | 
3167  |  | #endif /* HAVE_WCHAR_H */  | 
3168  |  |  | 
3169  |  | PyObject *  | 
3170  |  | PyUnicode_FromOrdinal(int ordinal)  | 
3171  | 417  | { | 
3172  | 417  |     if (ordinal < 0 || ordinal > MAX_UNICODE) { | 
3173  | 0  |         PyErr_SetString(PyExc_ValueError,  | 
3174  | 0  |                         "chr() arg not in range(0x110000)");  | 
3175  | 0  |         return NULL;  | 
3176  | 0  |     }  | 
3177  |  |  | 
3178  | 417  |     return unicode_char((Py_UCS4)ordinal);  | 
3179  | 417  | }  | 
3180  |  |  | 
3181  |  | PyObject *  | 
3182  |  | PyUnicode_FromObject(PyObject *obj)  | 
3183  | 0  | { | 
3184  |  |     /* XXX Perhaps we should make this API an alias of  | 
3185  |  |        PyObject_Str() instead ?! */  | 
3186  | 0  |     if (PyUnicode_CheckExact(obj)) { | 
3187  | 0  |         if (PyUnicode_READY(obj) == -1)  | 
3188  | 0  |             return NULL;  | 
3189  | 0  |         Py_INCREF(obj);  | 
3190  | 0  |         return obj;  | 
3191  | 0  |     }  | 
3192  | 0  |     if (PyUnicode_Check(obj)) { | 
3193  |  |         /* For a Unicode subtype that's not a Unicode object,  | 
3194  |  |            return a true Unicode object with the same data. */  | 
3195  | 0  |         return _PyUnicode_Copy(obj);  | 
3196  | 0  |     }  | 
3197  | 0  |     PyErr_Format(PyExc_TypeError,  | 
3198  | 0  |                  "Can't convert '%.100s' object to str implicitly",  | 
3199  | 0  |                  Py_TYPE(obj)->tp_name);  | 
3200  | 0  |     return NULL;  | 
3201  | 0  | }  | 
3202  |  |  | 
3203  |  | PyObject *  | 
3204  |  | PyUnicode_FromEncodedObject(PyObject *obj,  | 
3205  |  |                             const char *encoding,  | 
3206  |  |                             const char *errors)  | 
3207  | 17  | { | 
3208  | 17  |     Py_buffer buffer;  | 
3209  | 17  |     PyObject *v;  | 
3210  |  |  | 
3211  | 17  |     if (obj == NULL) { | 
3212  | 0  |         PyErr_BadInternalCall();  | 
3213  | 0  |         return NULL;  | 
3214  | 0  |     }  | 
3215  |  |  | 
3216  |  |     /* Decoding bytes objects is the most common case and should be fast */  | 
3217  | 17  |     if (PyBytes_Check(obj)) { | 
3218  | 17  |         if (PyBytes_GET_SIZE(obj) == 0)  | 
3219  | 0  |             _Py_RETURN_UNICODE_EMPTY();  | 
3220  | 17  |         v = PyUnicode_Decode(  | 
3221  | 17  |                 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),  | 
3222  | 17  |                 encoding, errors);  | 
3223  | 17  |         return v;  | 
3224  | 17  |     }  | 
3225  |  |  | 
3226  | 0  |     if (PyUnicode_Check(obj)) { | 
3227  | 0  |         PyErr_SetString(PyExc_TypeError,  | 
3228  | 0  |                         "decoding str is not supported");  | 
3229  | 0  |         return NULL;  | 
3230  | 0  |     }  | 
3231  |  |  | 
3232  |  |     /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */  | 
3233  | 0  |     if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) { | 
3234  | 0  |         PyErr_Format(PyExc_TypeError,  | 
3235  | 0  |                      "decoding to str: need a bytes-like object, %.80s found",  | 
3236  | 0  |                      Py_TYPE(obj)->tp_name);  | 
3237  | 0  |         return NULL;  | 
3238  | 0  |     }  | 
3239  |  |  | 
3240  | 0  |     if (buffer.len == 0) { | 
3241  | 0  |         PyBuffer_Release(&buffer);  | 
3242  | 0  |         _Py_RETURN_UNICODE_EMPTY();  | 
3243  | 0  |     }  | 
3244  |  |  | 
3245  | 0  |     v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);  | 
3246  | 0  |     PyBuffer_Release(&buffer);  | 
3247  | 0  |     return v;  | 
3248  | 0  | }  | 
3249  |  |  | 
/* Normalize an encoding name into `lower`: similar to
   encodings.normalize_encoding(), but the result is also lowercased.

   ASCII alphanumerics and '.' are copied (lowercased); every run of other
   characters that sits between two copied characters becomes a single '_',
   while leading and trailing runs are dropped.  The output is always
   null-terminated on success.

   Return 1 on success, or 0 if the normalized name does not fit in
   lower_len-1 characters (the contents of `lower` are then unspecified). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *in;
    char *out;
    char *limit;
    int pending_sep;

    assert(encoding != NULL);

    out = lower;
    limit = &lower[lower_len - 1];   /* slot reserved for the final NUL */
    pending_sep = 0;

    for (in = encoding; *in != 0; in++) {
        char c = *in;

        if (!(Py_ISALNUM(c) || c == '.')) {
            /* Remember the separator; it is only emitted if more
               name characters follow. */
            pending_sep = 1;
            continue;
        }

        if (pending_sep && out != lower) {
            if (out == limit) {
                return 0;
            }
            *out++ = '_';
        }
        pending_sep = 0;

        if (out == limit) {
            return 0;
        }
        *out++ = Py_TOLOWER(c);
    }

    *out = '\0';
    return 1;
}
3298  |  |  | 
3299  |  | PyObject *  | 
3300  |  | PyUnicode_Decode(const char *s,  | 
3301  |  |                  Py_ssize_t size,  | 
3302  |  |                  const char *encoding,  | 
3303  |  |                  const char *errors)  | 
3304  | 34  | { | 
3305  | 34  |     PyObject *buffer = NULL, *unicode;  | 
3306  | 34  |     Py_buffer info;  | 
3307  | 34  |     char buflower[11];   /* strlen("iso-8859-1\0") == 11, longest shortcut */ | 
3308  |  |  | 
3309  | 34  |     if (encoding == NULL) { | 
3310  | 0  |         return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);  | 
3311  | 0  |     }  | 
3312  |  |  | 
3313  |  |     /* Shortcuts for common default encodings */  | 
3314  | 34  |     if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) { | 
3315  | 34  |         char *lower = buflower;  | 
3316  |  |  | 
3317  |  |         /* Fast paths */  | 
3318  | 34  |         if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') { | 
3319  | 2  |             lower += 3;  | 
3320  | 2  |             if (*lower == '_') { | 
3321  |  |                 /* Match "utf8" and "utf_8" */  | 
3322  | 2  |                 lower++;  | 
3323  | 2  |             }  | 
3324  |  |  | 
3325  | 2  |             if (lower[0] == '8' && lower[1] == 0) { | 
3326  | 2  |                 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);  | 
3327  | 2  |             }  | 
3328  | 0  |             else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) { | 
3329  | 0  |                 return PyUnicode_DecodeUTF16(s, size, errors, 0);  | 
3330  | 0  |             }  | 
3331  | 0  |             else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) { | 
3332  | 0  |                 return PyUnicode_DecodeUTF32(s, size, errors, 0);  | 
3333  | 0  |             }  | 
3334  | 2  |         }  | 
3335  | 32  |         else { | 
3336  | 32  |             if (strcmp(lower, "ascii") == 0  | 
3337  | 31  |                 || strcmp(lower, "us_ascii") == 0) { | 
3338  | 31  |                 return PyUnicode_DecodeASCII(s, size, errors);  | 
3339  | 31  |             }  | 
3340  |  |     #ifdef MS_WINDOWS  | 
3341  |  |             else if (strcmp(lower, "mbcs") == 0) { | 
3342  |  |                 return PyUnicode_DecodeMBCS(s, size, errors);  | 
3343  |  |             }  | 
3344  |  |     #endif  | 
3345  | 1  |             else if (strcmp(lower, "latin1") == 0  | 
3346  | 0  |                      || strcmp(lower, "latin_1") == 0  | 
3347  | 0  |                      || strcmp(lower, "iso_8859_1") == 0  | 
3348  | 1  |                      || strcmp(lower, "iso8859_1") == 0) { | 
3349  | 1  |                 return PyUnicode_DecodeLatin1(s, size, errors);  | 
3350  | 1  |             }  | 
3351  | 32  |         }  | 
3352  | 34  |     }  | 
3353  |  |  | 
3354  |  |     /* Decode via the codec registry */  | 
3355  | 0  |     buffer = NULL;  | 
3356  | 0  |     if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)  | 
3357  | 0  |         goto onError;  | 
3358  | 0  |     buffer = PyMemoryView_FromBuffer(&info);  | 
3359  | 0  |     if (buffer == NULL)  | 
3360  | 0  |         goto onError;  | 
3361  | 0  |     unicode = _PyCodec_DecodeText(buffer, encoding, errors);  | 
3362  | 0  |     if (unicode == NULL)  | 
3363  | 0  |         goto onError;  | 
3364  | 0  |     if (!PyUnicode_Check(unicode)) { | 
3365  | 0  |         PyErr_Format(PyExc_TypeError,  | 
3366  | 0  |                      "'%.400s' decoder returned '%.400s' instead of 'str'; "  | 
3367  | 0  |                      "use codecs.decode() to decode to arbitrary types",  | 
3368  | 0  |                      encoding,  | 
3369  | 0  |                      Py_TYPE(unicode)->tp_name);  | 
3370  | 0  |         Py_DECREF(unicode);  | 
3371  | 0  |         goto onError;  | 
3372  | 0  |     }  | 
3373  | 0  |     Py_DECREF(buffer);  | 
3374  | 0  |     return unicode_result(unicode);  | 
3375  |  |  | 
3376  | 0  |   onError:  | 
3377  | 0  |     Py_XDECREF(buffer);  | 
3378  | 0  |     return NULL;  | 
3379  | 0  | }  | 
3380  |  |  | 
3381  |  | PyObject *  | 
3382  |  | PyUnicode_AsDecodedObject(PyObject *unicode,  | 
3383  |  |                           const char *encoding,  | 
3384  |  |                           const char *errors)  | 
3385  | 0  | { | 
3386  | 0  |     if (!PyUnicode_Check(unicode)) { | 
3387  | 0  |         PyErr_BadArgument();  | 
3388  | 0  |         return NULL;  | 
3389  | 0  |     }  | 
3390  |  |  | 
3391  | 0  |     if (PyErr_WarnEx(PyExc_DeprecationWarning,  | 
3392  | 0  |                      "PyUnicode_AsDecodedObject() is deprecated; "  | 
3393  | 0  |                      "use PyCodec_Decode() to decode from str", 1) < 0)  | 
3394  | 0  |         return NULL;  | 
3395  |  |  | 
3396  | 0  |     if (encoding == NULL)  | 
3397  | 0  |         encoding = PyUnicode_GetDefaultEncoding();  | 
3398  |  |  | 
3399  |  |     /* Decode via the codec registry */  | 
3400  | 0  |     return PyCodec_Decode(unicode, encoding, errors);  | 
3401  | 0  | }  | 
3402  |  |  | 
3403  |  | PyObject *  | 
3404  |  | PyUnicode_AsDecodedUnicode(PyObject *unicode,  | 
3405  |  |                            const char *encoding,  | 
3406  |  |                            const char *errors)  | 
3407  | 0  | { | 
3408  | 0  |     PyObject *v;  | 
3409  |  | 
  | 
3410  | 0  |     if (!PyUnicode_Check(unicode)) { | 
3411  | 0  |         PyErr_BadArgument();  | 
3412  | 0  |         goto onError;  | 
3413  | 0  |     }  | 
3414  |  |  | 
3415  | 0  |     if (PyErr_WarnEx(PyExc_DeprecationWarning,  | 
3416  | 0  |                      "PyUnicode_AsDecodedUnicode() is deprecated; "  | 
3417  | 0  |                      "use PyCodec_Decode() to decode from str to str", 1) < 0)  | 
3418  | 0  |         return NULL;  | 
3419  |  |  | 
3420  | 0  |     if (encoding == NULL)  | 
3421  | 0  |         encoding = PyUnicode_GetDefaultEncoding();  | 
3422  |  |  | 
3423  |  |     /* Decode via the codec registry */  | 
3424  | 0  |     v = PyCodec_Decode(unicode, encoding, errors);  | 
3425  | 0  |     if (v == NULL)  | 
3426  | 0  |         goto onError;  | 
3427  | 0  |     if (!PyUnicode_Check(v)) { | 
3428  | 0  |         PyErr_Format(PyExc_TypeError,  | 
3429  | 0  |                      "'%.400s' decoder returned '%.400s' instead of 'str'; "  | 
3430  | 0  |                      "use codecs.decode() to decode to arbitrary types",  | 
3431  | 0  |                      encoding,  | 
3432  | 0  |                      Py_TYPE(unicode)->tp_name);  | 
3433  | 0  |         Py_DECREF(v);  | 
3434  | 0  |         goto onError;  | 
3435  | 0  |     }  | 
3436  | 0  |     return unicode_result(v);  | 
3437  |  |  | 
3438  | 0  |   onError:  | 
3439  | 0  |     return NULL;  | 
3440  | 0  | }  | 
3441  |  |  | 
3442  |  | PyObject *  | 
3443  |  | PyUnicode_Encode(const Py_UNICODE *s,  | 
3444  |  |                  Py_ssize_t size,  | 
3445  |  |                  const char *encoding,  | 
3446  |  |                  const char *errors)  | 
3447  | 0  | { | 
3448  | 0  |     PyObject *v, *unicode;  | 
3449  |  | 
  | 
3450  | 0  |     unicode = PyUnicode_FromWideChar(s, size);  | 
3451  | 0  |     if (unicode == NULL)  | 
3452  | 0  |         return NULL;  | 
3453  | 0  |     v = PyUnicode_AsEncodedString(unicode, encoding, errors);  | 
3454  | 0  |     Py_DECREF(unicode);  | 
3455  | 0  |     return v;  | 
3456  | 0  | }  | 
3457  |  |  | 
3458  |  | PyObject *  | 
3459  |  | PyUnicode_AsEncodedObject(PyObject *unicode,  | 
3460  |  |                           const char *encoding,  | 
3461  |  |                           const char *errors)  | 
3462  | 0  | { | 
3463  | 0  |     PyObject *v;  | 
3464  |  | 
  | 
3465  | 0  |     if (!PyUnicode_Check(unicode)) { | 
3466  | 0  |         PyErr_BadArgument();  | 
3467  | 0  |         goto onError;  | 
3468  | 0  |     }  | 
3469  |  |  | 
3470  | 0  |     if (PyErr_WarnEx(PyExc_DeprecationWarning,  | 
3471  | 0  |                      "PyUnicode_AsEncodedObject() is deprecated; "  | 
3472  | 0  |                      "use PyUnicode_AsEncodedString() to encode from str to bytes "  | 
3473  | 0  |                      "or PyCodec_Encode() for generic encoding", 1) < 0)  | 
3474  | 0  |         return NULL;  | 
3475  |  |  | 
3476  | 0  |     if (encoding == NULL)  | 
3477  | 0  |         encoding = PyUnicode_GetDefaultEncoding();  | 
3478  |  |  | 
3479  |  |     /* Encode via the codec registry */  | 
3480  | 0  |     v = PyCodec_Encode(unicode, encoding, errors);  | 
3481  | 0  |     if (v == NULL)  | 
3482  | 0  |         goto onError;  | 
3483  | 0  |     return v;  | 
3484  |  |  | 
3485  | 0  |   onError:  | 
3486  | 0  |     return NULL;  | 
3487  | 0  | }  | 
3488  |  |  | 
3489  |  |  | 
3490  |  | static PyObject *  | 
3491  |  | unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,  | 
3492  |  |                       int current_locale)  | 
3493  | 350  | { | 
3494  | 350  |     Py_ssize_t wlen;  | 
3495  | 350  |     wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);  | 
3496  | 350  |     if (wstr == NULL) { | 
3497  | 0  |         return NULL;  | 
3498  | 0  |     }  | 
3499  |  |  | 
3500  | 350  |     if ((size_t)wlen != wcslen(wstr)) { | 
3501  | 0  |         PyErr_SetString(PyExc_ValueError, "embedded null character");  | 
3502  | 0  |         PyMem_Free(wstr);  | 
3503  | 0  |         return NULL;  | 
3504  | 0  |     }  | 
3505  |  |  | 
3506  | 350  |     char *str;  | 
3507  | 350  |     size_t error_pos;  | 
3508  | 350  |     const char *reason;  | 
3509  | 350  |     int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,  | 
3510  | 350  |                                  current_locale, error_handler);  | 
3511  | 350  |     PyMem_Free(wstr);  | 
3512  |  |  | 
3513  | 350  |     if (res != 0) { | 
3514  | 0  |         if (res == -2) { | 
3515  | 0  |             PyObject *exc;  | 
3516  | 0  |             exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",  | 
3517  | 0  |                     "locale", unicode,  | 
3518  | 0  |                     (Py_ssize_t)error_pos,  | 
3519  | 0  |                     (Py_ssize_t)(error_pos+1),  | 
3520  | 0  |                     reason);  | 
3521  | 0  |             if (exc != NULL) { | 
3522  | 0  |                 PyCodec_StrictErrors(exc);  | 
3523  | 0  |                 Py_DECREF(exc);  | 
3524  | 0  |             }  | 
3525  | 0  |         }  | 
3526  | 0  |         else if (res == -3) { | 
3527  | 0  |             PyErr_SetString(PyExc_ValueError, "unsupported error handler");  | 
3528  | 0  |         }  | 
3529  | 0  |         else { | 
3530  | 0  |             PyErr_NoMemory();  | 
3531  | 0  |         }  | 
3532  | 0  |         return NULL;  | 
3533  | 0  |     }  | 
3534  |  |  | 
3535  | 350  |     PyObject *bytes = PyBytes_FromString(str);  | 
3536  | 350  |     PyMem_RawFree(str);  | 
3537  | 350  |     return bytes;  | 
3538  | 350  | }  | 
3539  |  |  | 
3540  |  | PyObject *  | 
3541  |  | PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)  | 
3542  | 0  | { | 
3543  | 0  |     _Py_error_handler error_handler = _Py_GetErrorHandler(errors);  | 
3544  | 0  |     return unicode_encode_locale(unicode, error_handler, 1);  | 
3545  | 0  | }  | 
3546  |  |  | 
3547  |  | PyObject *  | 
3548  |  | PyUnicode_EncodeFSDefault(PyObject *unicode)  | 
3549  | 1.34k  | { | 
3550  | 1.34k  |     PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();  | 
3551  |  | #ifdef _Py_FORCE_UTF8_FS_ENCODING  | 
3552  |  |     if (interp->fs_codec.encoding) { | 
3553  |  |         return unicode_encode_utf8(unicode,  | 
3554  |  |                                    interp->fs_codec.error_handler,  | 
3555  |  |                                    interp->fs_codec.errors);  | 
3556  |  |     }  | 
3557  |  |     else { | 
3558  |  |         const wchar_t *filesystem_errors = interp->config.filesystem_errors;  | 
3559  |  |         _Py_error_handler errors;  | 
3560  |  |         errors = get_error_handler_wide(filesystem_errors);  | 
3561  |  |         assert(errors != _Py_ERROR_UNKNOWN);  | 
3562  |  |         return unicode_encode_utf8(unicode, errors, NULL);  | 
3563  |  |     }  | 
3564  |  | #else  | 
3565  |  |     /* Bootstrap check: if the filesystem codec is implemented in Python, we  | 
3566  |  |        cannot use it to encode and decode filenames before it is loaded. Load  | 
3567  |  |        the Python codec requires to encode at least its own filename. Use the C  | 
3568  |  |        implementation of the locale codec until the codec registry is  | 
3569  |  |        initialized and the Python codec is loaded. See initfsencoding(). */  | 
3570  | 1.34k  |     if (interp->fs_codec.encoding) { | 
3571  | 999  |         return PyUnicode_AsEncodedString(unicode,  | 
3572  | 999  |                                          interp->fs_codec.encoding,  | 
3573  | 999  |                                          interp->fs_codec.errors);  | 
3574  | 999  |     }  | 
3575  | 350  |     else { | 
3576  | 350  |         const wchar_t *filesystem_errors = interp->config.filesystem_errors;  | 
3577  | 350  |         _Py_error_handler errors;  | 
3578  | 350  |         errors = get_error_handler_wide(filesystem_errors);  | 
3579  | 350  |         assert(errors != _Py_ERROR_UNKNOWN);  | 
3580  | 350  |         return unicode_encode_locale(unicode, errors, 0);  | 
3581  | 350  |     }  | 
3582  | 1.34k  | #endif  | 
3583  | 1.34k  | }  | 
3584  |  |  | 
3585  |  | PyObject *  | 
3586  |  | PyUnicode_AsEncodedString(PyObject *unicode,  | 
3587  |  |                           const char *encoding,  | 
3588  |  |                           const char *errors)  | 
3589  | 1.01k  | { | 
3590  | 1.01k  |     PyObject *v;  | 
3591  | 1.01k  |     char buflower[11];   /* strlen("iso_8859_1\0") == 11, longest shortcut */ | 
3592  |  |  | 
3593  | 1.01k  |     if (!PyUnicode_Check(unicode)) { | 
3594  | 0  |         PyErr_BadArgument();  | 
3595  | 0  |         return NULL;  | 
3596  | 0  |     }  | 
3597  |  |  | 
3598  | 1.01k  |     if (encoding == NULL) { | 
3599  | 0  |         return _PyUnicode_AsUTF8String(unicode, errors);  | 
3600  | 0  |     }  | 
3601  |  |  | 
3602  |  |     /* Shortcuts for common default encodings */  | 
3603  | 1.01k  |     if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) { | 
3604  | 1.01k  |         char *lower = buflower;  | 
3605  |  |  | 
3606  |  |         /* Fast paths */  | 
3607  | 1.01k  |         if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') { | 
3608  | 0  |             lower += 3;  | 
3609  | 0  |             if (*lower == '_') { | 
3610  |  |                 /* Match "utf8" and "utf_8" */  | 
3611  | 0  |                 lower++;  | 
3612  | 0  |             }  | 
3613  |  | 
  | 
3614  | 0  |             if (lower[0] == '8' && lower[1] == 0) { | 
3615  | 0  |                 return _PyUnicode_AsUTF8String(unicode, errors);  | 
3616  | 0  |             }  | 
3617  | 0  |             else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) { | 
3618  | 0  |                 return _PyUnicode_EncodeUTF16(unicode, errors, 0);  | 
3619  | 0  |             }  | 
3620  | 0  |             else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) { | 
3621  | 0  |                 return _PyUnicode_EncodeUTF32(unicode, errors, 0);  | 
3622  | 0  |             }  | 
3623  | 0  |         }  | 
3624  | 1.01k  |         else { | 
3625  | 1.01k  |             if (strcmp(lower, "ascii") == 0  | 
3626  | 1.01k  |                 || strcmp(lower, "us_ascii") == 0) { | 
3627  | 1.01k  |                 return _PyUnicode_AsASCIIString(unicode, errors);  | 
3628  | 1.01k  |             }  | 
3629  |  | #ifdef MS_WINDOWS  | 
3630  |  |             else if (strcmp(lower, "mbcs") == 0) { | 
3631  |  |                 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);  | 
3632  |  |             }  | 
3633  |  | #endif  | 
3634  | 0  |             else if (strcmp(lower, "latin1") == 0 ||  | 
3635  | 0  |                      strcmp(lower, "latin_1") == 0 ||  | 
3636  | 0  |                      strcmp(lower, "iso_8859_1") == 0 ||  | 
3637  | 0  |                      strcmp(lower, "iso8859_1") == 0) { | 
3638  | 0  |                 return _PyUnicode_AsLatin1String(unicode, errors);  | 
3639  | 0  |             }  | 
3640  | 1.01k  |         }  | 
3641  | 1.01k  |     }  | 
3642  |  |  | 
3643  |  |     /* Encode via the codec registry */  | 
3644  | 0  |     v = _PyCodec_EncodeText(unicode, encoding, errors);  | 
3645  | 0  |     if (v == NULL)  | 
3646  | 0  |         return NULL;  | 
3647  |  |  | 
3648  |  |     /* The normal path */  | 
3649  | 0  |     if (PyBytes_Check(v))  | 
3650  | 0  |         return v;  | 
3651  |  |  | 
3652  |  |     /* If the codec returns a buffer, raise a warning and convert to bytes */  | 
3653  | 0  |     if (PyByteArray_Check(v)) { | 
3654  | 0  |         int error;  | 
3655  | 0  |         PyObject *b;  | 
3656  |  | 
  | 
3657  | 0  |         error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,  | 
3658  | 0  |             "encoder %s returned bytearray instead of bytes; "  | 
3659  | 0  |             "use codecs.encode() to encode to arbitrary types",  | 
3660  | 0  |             encoding);  | 
3661  | 0  |         if (error) { | 
3662  | 0  |             Py_DECREF(v);  | 
3663  | 0  |             return NULL;  | 
3664  | 0  |         }  | 
3665  |  |  | 
3666  | 0  |         b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),  | 
3667  | 0  |                                       PyByteArray_GET_SIZE(v));  | 
3668  | 0  |         Py_DECREF(v);  | 
3669  | 0  |         return b;  | 
3670  | 0  |     }  | 
3671  |  |  | 
3672  | 0  |     PyErr_Format(PyExc_TypeError,  | 
3673  | 0  |                  "'%.400s' encoder returned '%.400s' instead of 'bytes'; "  | 
3674  | 0  |                  "use codecs.encode() to encode to arbitrary types",  | 
3675  | 0  |                  encoding,  | 
3676  | 0  |                  Py_TYPE(v)->tp_name);  | 
3677  | 0  |     Py_DECREF(v);  | 
3678  | 0  |     return NULL;  | 
3679  | 0  | }  | 
3680  |  |  | 
3681  |  | PyObject *  | 
3682  |  | PyUnicode_AsEncodedUnicode(PyObject *unicode,  | 
3683  |  |                            const char *encoding,  | 
3684  |  |                            const char *errors)  | 
3685  | 0  | { | 
3686  | 0  |     PyObject *v;  | 
3687  |  | 
  | 
3688  | 0  |     if (!PyUnicode_Check(unicode)) { | 
3689  | 0  |         PyErr_BadArgument();  | 
3690  | 0  |         goto onError;  | 
3691  | 0  |     }  | 
3692  |  |  | 
3693  | 0  |     if (PyErr_WarnEx(PyExc_DeprecationWarning,  | 
3694  | 0  |                      "PyUnicode_AsEncodedUnicode() is deprecated; "  | 
3695  | 0  |                      "use PyCodec_Encode() to encode from str to str", 1) < 0)  | 
3696  | 0  |         return NULL;  | 
3697  |  |  | 
3698  | 0  |     if (encoding == NULL)  | 
3699  | 0  |         encoding = PyUnicode_GetDefaultEncoding();  | 
3700  |  |  | 
3701  |  |     /* Encode via the codec registry */  | 
3702  | 0  |     v = PyCodec_Encode(unicode, encoding, errors);  | 
3703  | 0  |     if (v == NULL)  | 
3704  | 0  |         goto onError;  | 
3705  | 0  |     if (!PyUnicode_Check(v)) { | 
3706  | 0  |         PyErr_Format(PyExc_TypeError,  | 
3707  | 0  |                      "'%.400s' encoder returned '%.400s' instead of 'str'; "  | 
3708  | 0  |                      "use codecs.encode() to encode to arbitrary types",  | 
3709  | 0  |                      encoding,  | 
3710  | 0  |                      Py_TYPE(v)->tp_name);  | 
3711  | 0  |         Py_DECREF(v);  | 
3712  | 0  |         goto onError;  | 
3713  | 0  |     }  | 
3714  | 0  |     return v;  | 
3715  |  |  | 
3716  | 0  |   onError:  | 
3717  | 0  |     return NULL;  | 
3718  | 0  | }  | 
3719  |  |  | 
3720  |  | static PyObject*  | 
3721  |  | unicode_decode_locale(const char *str, Py_ssize_t len,  | 
3722  |  |                       _Py_error_handler errors, int current_locale)  | 
3723  | 4.86k  | { | 
3724  | 4.86k  |     if (str[len] != '\0' || (size_t)len != strlen(str))  { | 
3725  | 0  |         PyErr_SetString(PyExc_ValueError, "embedded null byte");  | 
3726  | 0  |         return NULL;  | 
3727  | 0  |     }  | 
3728  |  |  | 
3729  | 4.86k  |     wchar_t *wstr;  | 
3730  | 4.86k  |     size_t wlen;  | 
3731  | 4.86k  |     const char *reason;  | 
3732  | 4.86k  |     int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,  | 
3733  | 4.86k  |                                  current_locale, errors);  | 
3734  | 4.86k  |     if (res != 0) { | 
3735  | 0  |         if (res == -2) { | 
3736  | 0  |             PyObject *exc;  | 
3737  | 0  |             exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",  | 
3738  | 0  |                                         "locale", str, len,  | 
3739  | 0  |                                         (Py_ssize_t)wlen,  | 
3740  | 0  |                                         (Py_ssize_t)(wlen + 1),  | 
3741  | 0  |                                         reason);  | 
3742  | 0  |             if (exc != NULL) { | 
3743  | 0  |                 PyCodec_StrictErrors(exc);  | 
3744  | 0  |                 Py_DECREF(exc);  | 
3745  | 0  |             }  | 
3746  | 0  |         }  | 
3747  | 0  |         else if (res == -3) { | 
3748  | 0  |             PyErr_SetString(PyExc_ValueError, "unsupported error handler");  | 
3749  | 0  |         }  | 
3750  | 0  |         else { | 
3751  | 0  |             PyErr_NoMemory();  | 
3752  | 0  |         }  | 
3753  | 0  |         return NULL;  | 
3754  | 0  |     }  | 
3755  |  |  | 
3756  | 4.86k  |     PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);  | 
3757  | 4.86k  |     PyMem_RawFree(wstr);  | 
3758  | 4.86k  |     return unicode;  | 
3759  | 4.86k  | }  | 
3760  |  |  | 
3761  |  | PyObject*  | 
3762  |  | PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,  | 
3763  |  |                               const char *errors)  | 
3764  | 0  | { | 
3765  | 0  |     _Py_error_handler error_handler = _Py_GetErrorHandler(errors);  | 
3766  | 0  |     return unicode_decode_locale(str, len, error_handler, 1);  | 
3767  | 0  | }  | 
3768  |  |  | 
3769  |  | PyObject*  | 
3770  |  | PyUnicode_DecodeLocale(const char *str, const char *errors)  | 
3771  | 230  | { | 
3772  | 230  |     Py_ssize_t size = (Py_ssize_t)strlen(str);  | 
3773  | 230  |     _Py_error_handler error_handler = _Py_GetErrorHandler(errors);  | 
3774  | 230  |     return unicode_decode_locale(str, size, error_handler, 1);  | 
3775  | 230  | }  | 
3776  |  |  | 
3777  |  |  | 
3778  |  | PyObject*  | 
3779  | 14  | PyUnicode_DecodeFSDefault(const char *s) { | 
3780  | 14  |     Py_ssize_t size = (Py_ssize_t)strlen(s);  | 
3781  | 14  |     return PyUnicode_DecodeFSDefaultAndSize(s, size);  | 
3782  | 14  | }  | 
3783  |  |  | 
3784  |  | PyObject*  | 
3785  |  | PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)  | 
3786  | 4.65k  | { | 
3787  | 4.65k  |     PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();  | 
3788  |  | #ifdef _Py_FORCE_UTF8_FS_ENCODING  | 
3789  |  |     if (interp->fs_codec.encoding) { | 
3790  |  |         return unicode_decode_utf8(s, size,  | 
3791  |  |                                    interp->fs_codec.error_handler,  | 
3792  |  |                                    interp->fs_codec.errors,  | 
3793  |  |                                    NULL);  | 
3794  |  |     }  | 
3795  |  |     else { | 
3796  |  |         const wchar_t *filesystem_errors = interp->config.filesystem_errors;  | 
3797  |  |         _Py_error_handler errors;  | 
3798  |  |         errors = get_error_handler_wide(filesystem_errors);  | 
3799  |  |         assert(errors != _Py_ERROR_UNKNOWN);  | 
3800  |  |         return unicode_decode_utf8(s, size, errors, NULL, NULL);  | 
3801  |  |     }  | 
3802  |  | #else  | 
3803  |  |     /* Bootstrap check: if the filesystem codec is implemented in Python, we  | 
3804  |  |        cannot use it to encode and decode filenames before it is loaded. Load  | 
3805  |  |        the Python codec requires to encode at least its own filename. Use the C  | 
3806  |  |        implementation of the locale codec until the codec registry is  | 
3807  |  |        initialized and the Python codec is loaded. See initfsencoding(). */  | 
3808  | 4.65k  |     if (interp->fs_codec.encoding) { | 
3809  | 17  |         return PyUnicode_Decode(s, size,  | 
3810  | 17  |                                 interp->fs_codec.encoding,  | 
3811  | 17  |                                 interp->fs_codec.errors);  | 
3812  | 17  |     }  | 
3813  | 4.63k  |     else { | 
3814  | 4.63k  |         const wchar_t *filesystem_errors = interp->config.filesystem_errors;  | 
3815  | 4.63k  |         _Py_error_handler errors;  | 
3816  | 4.63k  |         errors = get_error_handler_wide(filesystem_errors);  | 
3817  | 4.63k  |         return unicode_decode_locale(s, size, errors, 0);  | 
3818  | 4.63k  |     }  | 
3819  | 4.65k  | #endif  | 
3820  | 4.65k  | }  | 
3821  |  |  | 
3822  |  |  | 
3823  |  | int  | 
3824  |  | PyUnicode_FSConverter(PyObject* arg, void* addr)  | 
3825  | 1.34k  | { | 
3826  | 1.34k  |     PyObject *path = NULL;  | 
3827  | 1.34k  |     PyObject *output = NULL;  | 
3828  | 1.34k  |     Py_ssize_t size;  | 
3829  | 1.34k  |     void *data;  | 
3830  | 1.34k  |     if (arg == NULL) { | 
3831  | 0  |         Py_DECREF(*(PyObject**)addr);  | 
3832  | 0  |         *(PyObject**)addr = NULL;  | 
3833  | 0  |         return 1;  | 
3834  | 0  |     }  | 
3835  | 1.34k  |     path = PyOS_FSPath(arg);  | 
3836  | 1.34k  |     if (path == NULL) { | 
3837  | 0  |         return 0;  | 
3838  | 0  |     }  | 
3839  | 1.34k  |     if (PyBytes_Check(path)) { | 
3840  | 0  |         output = path;  | 
3841  | 0  |     }  | 
3842  | 1.34k  |     else {  // PyOS_FSPath() guarantees its returned value is bytes or str. | 
3843  | 1.34k  |         output = PyUnicode_EncodeFSDefault(path);  | 
3844  | 1.34k  |         Py_DECREF(path);  | 
3845  | 1.34k  |         if (!output) { | 
3846  | 0  |             return 0;  | 
3847  | 0  |         }  | 
3848  | 1.34k  |         assert(PyBytes_Check(output));  | 
3849  | 1.34k  |     }  | 
3850  |  |  | 
3851  | 1.34k  |     size = PyBytes_GET_SIZE(output);  | 
3852  | 1.34k  |     data = PyBytes_AS_STRING(output);  | 
3853  | 1.34k  |     if ((size_t)size != strlen(data)) { | 
3854  | 0  |         PyErr_SetString(PyExc_ValueError, "embedded null byte");  | 
3855  | 0  |         Py_DECREF(output);  | 
3856  | 0  |         return 0;  | 
3857  | 0  |     }  | 
3858  | 1.34k  |     *(PyObject**)addr = output;  | 
3859  | 1.34k  |     return Py_CLEANUP_SUPPORTED;  | 
3860  | 1.34k  | }  | 
3861  |  |  | 
3862  |  |  | 
3863  |  | int  | 
3864  |  | PyUnicode_FSDecoder(PyObject* arg, void* addr)  | 
3865  | 0  | { | 
3866  | 0  |     int is_buffer = 0;  | 
3867  | 0  |     PyObject *path = NULL;  | 
3868  | 0  |     PyObject *output = NULL;  | 
3869  | 0  |     if (arg == NULL) { | 
3870  | 0  |         Py_DECREF(*(PyObject**)addr);  | 
3871  | 0  |         *(PyObject**)addr = NULL;  | 
3872  | 0  |         return 1;  | 
3873  | 0  |     }  | 
3874  |  |  | 
3875  | 0  |     is_buffer = PyObject_CheckBuffer(arg);  | 
3876  | 0  |     if (!is_buffer) { | 
3877  | 0  |         path = PyOS_FSPath(arg);  | 
3878  | 0  |         if (path == NULL) { | 
3879  | 0  |             return 0;  | 
3880  | 0  |         }  | 
3881  | 0  |     }  | 
3882  | 0  |     else { | 
3883  | 0  |         path = arg;  | 
3884  | 0  |         Py_INCREF(arg);  | 
3885  | 0  |     }  | 
3886  |  |  | 
3887  | 0  |     if (PyUnicode_Check(path)) { | 
3888  | 0  |         output = path;  | 
3889  | 0  |     }  | 
3890  | 0  |     else if (PyBytes_Check(path) || is_buffer) { | 
3891  | 0  |         PyObject *path_bytes = NULL;  | 
3892  |  | 
  | 
3893  | 0  |         if (!PyBytes_Check(path) &&  | 
3894  | 0  |             PyErr_WarnFormat(PyExc_DeprecationWarning, 1,  | 
3895  | 0  |             "path should be string, bytes, or os.PathLike, not %.200s",  | 
3896  | 0  |             Py_TYPE(arg)->tp_name)) { | 
3897  | 0  |                 Py_DECREF(path);  | 
3898  | 0  |             return 0;  | 
3899  | 0  |         }  | 
3900  | 0  |         path_bytes = PyBytes_FromObject(path);  | 
3901  | 0  |         Py_DECREF(path);  | 
3902  | 0  |         if (!path_bytes) { | 
3903  | 0  |             return 0;  | 
3904  | 0  |         }  | 
3905  | 0  |         output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),  | 
3906  | 0  |                                                   PyBytes_GET_SIZE(path_bytes));  | 
3907  | 0  |         Py_DECREF(path_bytes);  | 
3908  | 0  |         if (!output) { | 
3909  | 0  |             return 0;  | 
3910  | 0  |         }  | 
3911  | 0  |     }  | 
3912  | 0  |     else { | 
3913  | 0  |         PyErr_Format(PyExc_TypeError,  | 
3914  | 0  |                      "path should be string, bytes, or os.PathLike, not %.200s",  | 
3915  | 0  |                      Py_TYPE(arg)->tp_name);  | 
3916  | 0  |         Py_DECREF(path);  | 
3917  | 0  |         return 0;  | 
3918  | 0  |     }  | 
3919  | 0  |     if (PyUnicode_READY(output) == -1) { | 
3920  | 0  |         Py_DECREF(output);  | 
3921  | 0  |         return 0;  | 
3922  | 0  |     }  | 
3923  | 0  |     if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),  | 
3924  | 0  |                  PyUnicode_GET_LENGTH(output), 0, 1) >= 0) { | 
3925  | 0  |         PyErr_SetString(PyExc_ValueError, "embedded null character");  | 
3926  | 0  |         Py_DECREF(output);  | 
3927  | 0  |         return 0;  | 
3928  | 0  |     }  | 
3929  | 0  |     *(PyObject**)addr = output;  | 
3930  | 0  |     return Py_CLEANUP_SUPPORTED;  | 
3931  | 0  | }  | 
3932  |  |  | 
3933  |  |  | 
3934  |  | const char *  | 
3935  |  | PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)  | 
3936  | 3.44k  | { | 
3937  | 3.44k  |     PyObject *bytes;  | 
3938  |  |  | 
3939  | 3.44k  |     if (!PyUnicode_Check(unicode)) { | 
3940  | 0  |         PyErr_BadArgument();  | 
3941  | 0  |         return NULL;  | 
3942  | 0  |     }  | 
3943  | 3.44k  |     if (PyUnicode_READY(unicode) == -1)  | 
3944  | 0  |         return NULL;  | 
3945  |  |  | 
3946  | 3.44k  |     if (PyUnicode_UTF8(unicode) == NULL) { | 
3947  | 0  |         assert(!PyUnicode_IS_COMPACT_ASCII(unicode));  | 
3948  | 0  |         bytes = _PyUnicode_AsUTF8String(unicode, NULL);  | 
3949  | 0  |         if (bytes == NULL)  | 
3950  | 0  |             return NULL;  | 
3951  | 0  |         _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);  | 
3952  | 0  |         if (_PyUnicode_UTF8(unicode) == NULL) { | 
3953  | 0  |             PyErr_NoMemory();  | 
3954  | 0  |             Py_DECREF(bytes);  | 
3955  | 0  |             return NULL;  | 
3956  | 0  |         }  | 
3957  | 0  |         _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);  | 
3958  | 0  |         memcpy(_PyUnicode_UTF8(unicode),  | 
3959  | 0  |                   PyBytes_AS_STRING(bytes),  | 
3960  | 0  |                   _PyUnicode_UTF8_LENGTH(unicode) + 1);  | 
3961  | 0  |         Py_DECREF(bytes);  | 
3962  | 0  |     }  | 
3963  |  |  | 
3964  | 3.44k  |     if (psize)  | 
3965  | 2.13k  |         *psize = PyUnicode_UTF8_LENGTH(unicode);  | 
3966  | 3.44k  |     return PyUnicode_UTF8(unicode);  | 
3967  | 3.44k  | }  | 
3968  |  |  | 
3969  |  | const char *  | 
3970  |  | PyUnicode_AsUTF8(PyObject *unicode)  | 
3971  | 1.30k  | { | 
3972  | 1.30k  |     return PyUnicode_AsUTF8AndSize(unicode, NULL);  | 
3973  | 1.30k  | }  | 
3974  |  |  | 
3975  |  | Py_UNICODE *  | 
3976  |  | PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)  | 
3977  | 0  | { | 
3978  | 0  |     if (!PyUnicode_Check(unicode)) { | 
3979  | 0  |         PyErr_BadArgument();  | 
3980  | 0  |         return NULL;  | 
3981  | 0  |     }  | 
3982  | 0  |     Py_UNICODE *w = _PyUnicode_WSTR(unicode);  | 
3983  | 0  |     if (w == NULL) { | 
3984  |  |         /* Non-ASCII compact unicode object */  | 
3985  | 0  |         assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);  | 
3986  | 0  |         assert(PyUnicode_IS_READY(unicode));  | 
3987  |  | 
  | 
3988  | 0  |         Py_ssize_t wlen = unicode_get_widechar_size(unicode);  | 
3989  | 0  |         if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) { | 
3990  | 0  |             PyErr_NoMemory();  | 
3991  | 0  |             return NULL;  | 
3992  | 0  |         }  | 
3993  | 0  |         w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));  | 
3994  | 0  |         if (w == NULL) { | 
3995  | 0  |             PyErr_NoMemory();  | 
3996  | 0  |             return NULL;  | 
3997  | 0  |         }  | 
3998  | 0  |         unicode_copy_as_widechar(unicode, w, wlen + 1);  | 
3999  | 0  |         _PyUnicode_WSTR(unicode) = w;  | 
4000  | 0  |         if (!PyUnicode_IS_COMPACT_ASCII(unicode)) { | 
4001  | 0  |             _PyUnicode_WSTR_LENGTH(unicode) = wlen;  | 
4002  | 0  |         }  | 
4003  | 0  |     }  | 
4004  | 0  |     if (size != NULL)  | 
4005  | 0  |         *size = PyUnicode_WSTR_LENGTH(unicode);  | 
4006  | 0  |     return w;  | 
4007  | 0  | }  | 
4008  |  |  | 
4009  |  | Py_UNICODE *  | 
4010  |  | PyUnicode_AsUnicode(PyObject *unicode)  | 
4011  | 0  | { | 
4012  | 0  |     return PyUnicode_AsUnicodeAndSize(unicode, NULL);  | 
4013  | 0  | }  | 
4014  |  |  | 
4015  |  | const Py_UNICODE *  | 
4016  |  | _PyUnicode_AsUnicode(PyObject *unicode)  | 
4017  | 0  | { | 
4018  | 0  |     Py_ssize_t size;  | 
4019  | 0  |     const Py_UNICODE *wstr;  | 
4020  |  | 
  | 
4021  | 0  |     wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);  | 
4022  | 0  |     if (wstr && wcslen(wstr) != (size_t)size) { | 
4023  | 0  |         PyErr_SetString(PyExc_ValueError, "embedded null character");  | 
4024  | 0  |         return NULL;  | 
4025  | 0  |     }  | 
4026  | 0  |     return wstr;  | 
4027  | 0  | }  | 
4028  |  |  | 
4029  |  |  | 
4030  |  | Py_ssize_t  | 
4031  |  | PyUnicode_GetSize(PyObject *unicode)  | 
4032  | 0  | { | 
4033  | 0  |     if (!PyUnicode_Check(unicode)) { | 
4034  | 0  |         PyErr_BadArgument();  | 
4035  | 0  |         goto onError;  | 
4036  | 0  |     }  | 
4037  | 0  |     if (_PyUnicode_WSTR(unicode) == NULL) { | 
4038  | 0  |         if (PyUnicode_AsUnicode(unicode) == NULL)  | 
4039  | 0  |             goto onError;  | 
4040  | 0  |     }  | 
4041  | 0  |     return PyUnicode_WSTR_LENGTH(unicode);  | 
4042  |  |  | 
4043  | 0  |   onError:  | 
4044  | 0  |     return -1;  | 
4045  | 0  | }  | 
4046  |  |  | 
4047  |  | Py_ssize_t  | 
4048  |  | PyUnicode_GetLength(PyObject *unicode)  | 
4049  | 0  | { | 
4050  | 0  |     if (!PyUnicode_Check(unicode)) { | 
4051  | 0  |         PyErr_BadArgument();  | 
4052  | 0  |         return -1;  | 
4053  | 0  |     }  | 
4054  | 0  |     if (PyUnicode_READY(unicode) == -1)  | 
4055  | 0  |         return -1;  | 
4056  | 0  |     return PyUnicode_GET_LENGTH(unicode);  | 
4057  | 0  | }  | 
4058  |  |  | 
4059  |  | Py_UCS4  | 
4060  |  | PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)  | 
4061  | 0  | { | 
4062  | 0  |     void *data;  | 
4063  | 0  |     int kind;  | 
4064  |  | 
  | 
4065  | 0  |     if (!PyUnicode_Check(unicode)) { | 
4066  | 0  |         PyErr_BadArgument();  | 
4067  | 0  |         return (Py_UCS4)-1;  | 
4068  | 0  |     }  | 
4069  | 0  |     if (PyUnicode_READY(unicode) == -1) { | 
4070  | 0  |         return (Py_UCS4)-1;  | 
4071  | 0  |     }  | 
4072  | 0  |     if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) { | 
4073  | 0  |         PyErr_SetString(PyExc_IndexError, "string index out of range");  | 
4074  | 0  |         return (Py_UCS4)-1;  | 
4075  | 0  |     }  | 
4076  | 0  |     data = PyUnicode_DATA(unicode);  | 
4077  | 0  |     kind = PyUnicode_KIND(unicode);  | 
4078  | 0  |     return PyUnicode_READ(kind, data, index);  | 
4079  | 0  | }  | 
4080  |  |  | 
4081  |  | int  | 
4082  |  | PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)  | 
4083  | 0  | { | 
4084  | 0  |     if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) { | 
4085  | 0  |         PyErr_BadArgument();  | 
4086  | 0  |         return -1;  | 
4087  | 0  |     }  | 
4088  | 0  |     assert(PyUnicode_IS_READY(unicode));  | 
4089  | 0  |     if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) { | 
4090  | 0  |         PyErr_SetString(PyExc_IndexError, "string index out of range");  | 
4091  | 0  |         return -1;  | 
4092  | 0  |     }  | 
4093  | 0  |     if (unicode_check_modifiable(unicode))  | 
4094  | 0  |         return -1;  | 
4095  | 0  |     if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) { | 
4096  | 0  |         PyErr_SetString(PyExc_ValueError, "character out of range");  | 
4097  | 0  |         return -1;  | 
4098  | 0  |     }  | 
4099  | 0  |     PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),  | 
4100  | 0  |                     index, ch);  | 
4101  | 0  |     return 0;  | 
4102  | 0  | }  | 
4103  |  |  | 
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";
    return default_encoding;
}
4109  |  |  | 
4110  |  | /* create or adjust a UnicodeDecodeError */  | 
4111  |  | static void  | 
4112  |  | make_decode_exception(PyObject **exceptionObject,  | 
4113  |  |                       const char *encoding,  | 
4114  |  |                       const char *input, Py_ssize_t length,  | 
4115  |  |                       Py_ssize_t startpos, Py_ssize_t endpos,  | 
4116  |  |                       const char *reason)  | 
4117  | 0  | { | 
4118  | 0  |     if (*exceptionObject == NULL) { | 
4119  | 0  |         *exceptionObject = PyUnicodeDecodeError_Create(  | 
4120  | 0  |             encoding, input, length, startpos, endpos, reason);  | 
4121  | 0  |     }  | 
4122  | 0  |     else { | 
4123  | 0  |         if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))  | 
4124  | 0  |             goto onError;  | 
4125  | 0  |         if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))  | 
4126  | 0  |             goto onError;  | 
4127  | 0  |         if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))  | 
4128  | 0  |             goto onError;  | 
4129  | 0  |     }  | 
4130  | 0  |     return;  | 
4131  |  |  | 
4132  | 0  | onError:  | 
4133  | 0  |     Py_CLEAR(*exceptionObject);  | 
4134  | 0  | }  | 
4135  |  |  | 
4136  |  | #ifdef MS_WINDOWS  | 
4137  |  | static int  | 
4138  |  | widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)  | 
4139  |  | { | 
4140  |  |     if (newsize > *size) { | 
4141  |  |         wchar_t *newbuf = *buf;  | 
4142  |  |         if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) { | 
4143  |  |             PyErr_NoMemory();  | 
4144  |  |             return -1;  | 
4145  |  |         }  | 
4146  |  |         *buf = newbuf;  | 
4147  |  |     }  | 
4148  |  |     *size = newsize;  | 
4149  |  |     return 0;  | 
4150  |  | }  | 
4151  |  |  | 
4152  |  | /* Error handling callback helper (wchar_t output buffer variant):  | 
4153  |  |    build the exception, call the error handler and check its (str, int) result;  | 
4154  |  |    if no exception occurred, copy the replacement into *buf at *outpos (growing the buffer if needed)  | 
4155  |  |    and update *input, *inend, *endinpos, *inptr and *outpos.  | 
4156  |  |    Return 0 on success, -1 on error.  | 
4157  |  | */  | 
4158  |  |  | 
4159  |  | static int  | 
4160  |  | unicode_decode_call_errorhandler_wchar(  | 
4161  |  |     const char *errors, PyObject **errorHandler,  | 
4162  |  |     const char *encoding, const char *reason,  | 
4163  |  |     const char **input, const char **inend, Py_ssize_t *startinpos,  | 
4164  |  |     Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,  | 
4165  |  |     wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)  | 
4166  |  | { | 
4167  |  |     static const char *argparse = "Un;decoding error handler must return (str, int) tuple"; /* the text after "Un;" doubles as the TypeError message via &argparse[3] */  | 
4168  |  |  | 
4169  |  |     PyObject *restuple = NULL;  | 
4170  |  |     PyObject *repunicode = NULL;  | 
4171  |  |     Py_ssize_t outsize;  | 
4172  |  |     Py_ssize_t insize;  | 
4173  |  |     Py_ssize_t requiredsize;  | 
4174  |  |     Py_ssize_t newpos;  | 
4175  |  |     PyObject *inputobj = NULL;  | 
4176  |  |     wchar_t *repwstr;  | 
4177  |  |     Py_ssize_t repwlen;  | 
4178  |  |  | 
4179  |  |     if (*errorHandler == NULL) { /* look the handler up once; it stays cached in *errorHandler for later calls */ | 
4180  |  |         *errorHandler = PyCodec_LookupError(errors);  | 
4181  |  |         if (*errorHandler == NULL)  | 
4182  |  |             goto onError;  | 
4183  |  |     }  | 
4184  |  |  | 
4185  |  |     make_decode_exception(exceptionObject,  | 
4186  |  |         encoding,  | 
4187  |  |         *input, *inend - *input,  | 
4188  |  |         *startinpos, *endinpos,  | 
4189  |  |         reason);  | 
4190  |  |     if (*exceptionObject == NULL)  | 
4191  |  |         goto onError;  | 
4192  |  |  | 
4193  |  |     restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);  | 
4194  |  |     if (restuple == NULL)  | 
4195  |  |         goto onError;  | 
4196  |  |     if (!PyTuple_Check(restuple)) { | 
4197  |  |         PyErr_SetString(PyExc_TypeError, &argparse[3]);  | 
4198  |  |         goto onError;  | 
4199  |  |     }  | 
4200  |  |     if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))  | 
4201  |  |         goto onError;  | 
4202  |  |  | 
4203  |  |     /* Copy back the bytes variables, which might have been modified by the  | 
4204  |  |        callback */  | 
4205  |  |     inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);  | 
4206  |  |     if (!inputobj)  | 
4207  |  |         goto onError;  | 
4208  |  |     *input = PyBytes_AS_STRING(inputobj);  | 
4209  |  |     insize = PyBytes_GET_SIZE(inputobj);  | 
4210  |  |     *inend = *input + insize;  | 
4211  |  |     /* we can DECREF safely, as the exception has another reference,  | 
4212  |  |        so the object won't go away. */  | 
4213  |  |     Py_DECREF(inputobj);  | 
4214  |  |  | 
4215  |  |     if (newpos<0) /* a negative position counts back from the end of the input */  | 
4216  |  |         newpos = insize+newpos;  | 
4217  |  |     if (newpos<0 || newpos>insize) { | 
4218  |  |         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);  | 
4219  |  |         goto onError;  | 
4220  |  |     }  | 
4221  |  |  | 
4222  |  |     repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);  | 
4223  |  |     if (repwstr == NULL)  | 
4224  |  |         goto onError;  | 
4225  |  |     /* need more space? (at least enough for what we  | 
4226  |  |        have+the replacement+the rest of the string (starting  | 
4227  |  |        at the new input position), so we won't have to check space  | 
4228  |  |        when there are no errors in the rest of the string) */  | 
4229  |  |     requiredsize = *outpos;  | 
4230  |  |     if (requiredsize > PY_SSIZE_T_MAX - repwlen)  | 
4231  |  |         goto overflow;  | 
4232  |  |     requiredsize += repwlen;  | 
4233  |  |     if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))  | 
4234  |  |         goto overflow;  | 
4235  |  |     requiredsize += insize - newpos;  | 
4236  |  |     outsize = *bufsize;  | 
4237  |  |     if (requiredsize > outsize) { | 
4238  |  |         if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize) /* at least double the buffer when 2*outsize cannot overflow */  | 
4239  |  |             requiredsize = 2*outsize;  | 
4240  |  |         if (widechar_resize(buf, bufsize, requiredsize) < 0) { | 
4241  |  |             goto onError;  | 
4242  |  |         }  | 
4243  |  |     }  | 
4244  |  |     wcsncpy(*buf + *outpos, repwstr, repwlen);  | 
4245  |  |     *outpos += repwlen;  | 
4246  |  |     *endinpos = newpos;  | 
4247  |  |     *inptr = *input + newpos; /* decoding resumes at the position chosen by the handler */  | 
4248  |  |  | 
4249  |  |     /* we made it! */  | 
4250  |  |     Py_DECREF(restuple);  | 
4251  |  |     return 0;  | 
4252  |  |  | 
4253  |  |   overflow:  | 
4254  |  |     PyErr_SetString(PyExc_OverflowError,  | 
4255  |  |                     "decoded result is too long for a Python string"); /* then falls through to onError */  | 
4256  |  |  | 
4257  |  |   onError:  | 
4258  |  |     Py_XDECREF(restuple); /* repunicode is never released separately (presumably borrowed from restuple -- confirm) */  | 
4259  |  |     return -1;  | 
4260  |  | }  | 
4261  |  | #endif   /* MS_WINDOWS */  | 
4262  |  | /* Error handling callback helper (_PyUnicodeWriter variant): call the error handler, write its replacement string to writer and update *input, *inend, *endinpos and *inptr.  Return 0 on success, -1 on error. */  | 
4263  |  | static int  | 
4264  |  | unicode_decode_call_errorhandler_writer(  | 
4265  |  |     const char *errors, PyObject **errorHandler,  | 
4266  |  |     const char *encoding, const char *reason,  | 
4267  |  |     const char **input, const char **inend, Py_ssize_t *startinpos,  | 
4268  |  |     Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,  | 
4269  |  |     _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)  | 
4270  | 0  | { | 
4271  | 0  |     static const char *argparse = "Un;decoding error handler must return (str, int) tuple"; /* the text after "Un;" doubles as the TypeError message via &argparse[3] */  | 
4272  |  | 
  | 
4273  | 0  |     PyObject *restuple = NULL;  | 
4274  | 0  |     PyObject *repunicode = NULL;  | 
4275  | 0  |     Py_ssize_t insize;  | 
4276  | 0  |     Py_ssize_t newpos;  | 
4277  | 0  |     Py_ssize_t replen;  | 
4278  | 0  |     Py_ssize_t remain;  | 
4279  | 0  |     PyObject *inputobj = NULL;  | 
4280  | 0  |     int need_to_grow = 0;  | 
4281  | 0  |     const char *new_inptr;  | 
4282  |  | 
  | 
4283  | 0  |     if (*errorHandler == NULL) { /* look the handler up once; it stays cached in *errorHandler for later calls */ | 
4284  | 0  |         *errorHandler = PyCodec_LookupError(errors);  | 
4285  | 0  |         if (*errorHandler == NULL)  | 
4286  | 0  |             goto onError;  | 
4287  | 0  |     }  | 
4288  |  |  | 
4289  | 0  |     make_decode_exception(exceptionObject,  | 
4290  | 0  |         encoding,  | 
4291  | 0  |         *input, *inend - *input,  | 
4292  | 0  |         *startinpos, *endinpos,  | 
4293  | 0  |         reason);  | 
4294  | 0  |     if (*exceptionObject == NULL)  | 
4295  | 0  |         goto onError;  | 
4296  |  |  | 
4297  | 0  |     restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);  | 
4298  | 0  |     if (restuple == NULL)  | 
4299  | 0  |         goto onError;  | 
4300  | 0  |     if (!PyTuple_Check(restuple)) { | 
4301  | 0  |         PyErr_SetString(PyExc_TypeError, &argparse[3]);  | 
4302  | 0  |         goto onError;  | 
4303  | 0  |     }  | 
4304  | 0  |     if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))  | 
4305  | 0  |         goto onError;  | 
4306  |  |  | 
4307  |  |     /* Copy back the bytes variables, which might have been modified by the  | 
4308  |  |        callback */  | 
4309  | 0  |     inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);  | 
4310  | 0  |     if (!inputobj)  | 
4311  | 0  |         goto onError;  | 
4312  | 0  |     remain = *inend - *input - *endinpos; /* bytes that followed the error span in the input as it was before the callback */  | 
4313  | 0  |     *input = PyBytes_AS_STRING(inputobj);  | 
4314  | 0  |     insize = PyBytes_GET_SIZE(inputobj);  | 
4315  | 0  |     *inend = *input + insize;  | 
4316  |  |     /* we can DECREF safely, as the exception has another reference,  | 
4317  |  |        so the object won't go away. */  | 
4318  | 0  |     Py_DECREF(inputobj);  | 
4319  |  | 
  | 
4320  | 0  |     if (newpos<0) /* a negative position counts back from the end of the input */  | 
4321  | 0  |         newpos = insize+newpos;  | 
4322  | 0  |     if (newpos<0 || newpos>insize) { | 
4323  | 0  |         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);  | 
4324  | 0  |         goto onError;  | 
4325  | 0  |     }  | 
4326  |  |  | 
4327  | 0  |     replen = PyUnicode_GET_LENGTH(repunicode);  | 
4328  | 0  |     if (replen > 1) { /* a replacement longer than one character needs replen - 1 extra output slots */ | 
4329  | 0  |         writer->min_length += replen - 1;  | 
4330  | 0  |         need_to_grow = 1;  | 
4331  | 0  |     }  | 
4332  | 0  |     new_inptr = *input + newpos;  | 
4333  | 0  |     if (*inend - new_inptr > remain) { | 
4334  |  |         /* We don't know the decoding algorithm here so we make the worst  | 
4335  |  |            assumption that one byte decodes to one unicode character.  | 
4336  |  |            If unfortunately one byte could decode to more unicode characters,  | 
4337  |  |            the decoder may write out-of-bound then.  Is it possible for the  | 
4338  |  |            algorithms using this function? */  | 
4339  | 0  |         writer->min_length += *inend - new_inptr - remain;  | 
4340  | 0  |         need_to_grow = 1;  | 
4341  | 0  |     }  | 
4342  | 0  |     if (need_to_grow) { | 
4343  | 0  |         writer->overallocate = 1;  | 
4344  | 0  |         if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,  | 
4345  | 0  |                             PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)  | 
4346  | 0  |             goto onError;  | 
4347  | 0  |     }  | 
4348  | 0  |     if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)  | 
4349  | 0  |         goto onError;  | 
4350  |  |  | 
4351  | 0  |     *endinpos = newpos;  | 
4352  | 0  |     *inptr = new_inptr; /* decoding resumes at the position chosen by the handler */  | 
4353  |  |  | 
4354  |  |     /* we made it! */  | 
4355  | 0  |     Py_DECREF(restuple);  | 
4356  | 0  |     return 0;  | 
4357  |  |  | 
4358  | 0  |   onError:  | 
4359  | 0  |     Py_XDECREF(restuple); /* repunicode is never released separately (presumably borrowed from restuple -- confirm) */  | 
4360  | 0  |     return -1;  | 
4361  | 0  | }  | 
4362  |  |  | 
4363  |  | /* --- UTF-7 Codec -------------------------------------------------------- */  | 
4364  |  |  | 
4365  |  | /* See RFC2152 for details.  We encode conservatively and decode liberally. */  | 
4366  |  |  | 
4367  |  | /* Three simple macros defining base-64. */  | 
4368  |  |  | 
4369  |  | /* Is c a base-64 character (A-Z, a-z, 0-9, '+' or '/')?  Note that c is evaluated more than once. */  | 
4370  |  |  | 
4371  |  | #define IS_BASE64(c) \  | 
4372  | 0  |     (((c) >= 'A' && (c) <= 'Z') ||     \  | 
4373  | 0  |      ((c) >= 'a' && (c) <= 'z') ||     \  | 
4374  | 0  |      ((c) >= '0' && (c) <= '9') ||     \  | 
4375  | 0  |      (c) == '+' || (c) == '/')  | 
4376  |  |  | 
4377  |  | /* Given that c is a base-64 character, what is its base-64 value (0..63)?  Any c not matched by the tests below yields 63, so check IS_BASE64 first. */  | 
4378  |  |  | 
4379  |  | #define FROM_BASE64(c)                                                  \  | 
4380  | 0  |     (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \  | 
4381  | 0  |      ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \  | 
4382  | 0  |      ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \  | 
4383  | 0  |      (c) == '+' ? 62 : 63)  | 
4384  |  |  | 
4385  |  | /* What is the base-64 character of the bottom 6 bits of n?  Higher bits of n are masked off with 0x3f. */  | 
4386  |  |  | 
4387  |  | #define TO_BASE64(n)  \  | 
4388  | 0  |     ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f]) | 
4389  |  |  | 
4390  |  | /* DECODE_DIRECT: this byte encountered in a UTF-7 string should be  | 
4391  |  |  * decoded as itself.  We are permissive on decoding; the only ASCII  | 
4392  |  |  * byte not decoding to itself is the + which begins a base64  | 
4393  |  |  * string.  Values above 127 are never direct. */  | 
4394  |  |  | 
4395  |  | #define DECODE_DIRECT(c)                                \  | 
4396  | 0  |     ((c) <= 127 && (c) != '+')  | 
4397  |  |  | 
4398  |  | /* The UTF-7 encoder treats ASCII characters differently according to  | 
4399  |  |  * whether they are Set D, Set O, Whitespace, or special (i.e. none of  | 
4400  |  |  * the above).  See RFC2152.  This array identifies these different  | 
4401  |  |  * sets:  | 
4402  |  |  * 0 : "Set D"  | 
4403  |  |  *     alphanumeric and '(),-./:?  | 
4404  |  |  * 1 : "Set O"  | 
4405  |  |  *     !"#$%&*;<=>@[]^_`{|} | 
4406  |  |  * 2 : "whitespace"  | 
4407  |  |  *     ht nl cr sp  | 
4408  |  |  * 3 : special (must be base64 encoded)  | 
4409  |  |  *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)  | 
4410  |  |  */  | 
4411  |  |  | 
4412  |  | static  | 
4413  |  | char utf7_category[128] = { /* indexed by ASCII code 0..127; each comment row below names the 16 codes of the following data row */ | 
4414  |  | /* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */  | 
4415  |  |     3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,  | 
4416  |  | /* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */  | 
4417  |  |     3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  | 
4418  |  | /* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */  | 
4419  |  |     2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,  | 
4420  |  | /*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */  | 
4421  |  |     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,  | 
4422  |  | /*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */  | 
4423  |  |     1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  | 
4424  |  | /*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */  | 
4425  |  |     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,  | 
4426  |  | /*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */  | 
4427  |  |     1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  | 
4428  |  | /*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */ | 
4429  |  |     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,  | 
4430  |  | };  | 
4431  |  |  | 
4432  |  | /* ENCODE_DIRECT: this character should be encoded as itself.  The  | 
4433  |  |  * answer depends on whether we are encoding set O as itself, and also  | 
4434  |  |  * on whether we are encoding whitespace as itself.  RFC2152 makes it  | 
4435  |  |  * clear that the answers to these questions vary between  | 
4436  |  |  * applications, so this code needs to be flexible.  Only 0 < c < 128 can be direct; Set D (category 0) always is.  */  | 
4437  |  |  | 
4438  |  | #define ENCODE_DIRECT(c, directO, directWS)             \  | 
4439  | 0  |     ((c) < 128 && (c) > 0 &&                            \  | 
4440  | 0  |      ((utf7_category[(c)] == 0) ||                      \  | 
4441  | 0  |       (directWS && (utf7_category[(c)] == 2)) ||        \  | 
4442  | 0  |       (directO && (utf7_category[(c)] == 1))))  | 
4443  |  | /* Decode size bytes of UTF-7 starting at s: forwards to PyUnicode_DecodeUTF7Stateful with consumed == NULL. */  | 
4444  |  | PyObject *  | 
4445  |  | PyUnicode_DecodeUTF7(const char *s,  | 
4446  |  |                      Py_ssize_t size,  | 
4447  |  |                      const char *errors)  | 
4448  | 0  | { | 
4449  | 0  |     return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);  | 
4450  | 0  | }  | 
4451  |  |  | 
4452  |  | /* The decoder.  The only state we preserve is our read position,  | 
4453  |  |  * i.e. how many characters we have consumed.  So if we end in the  | 
4454  |  |  * middle of a shift sequence we have to back off the read position  | 
4455  |  |  * and the output to the beginning of the sequence, otherwise we lose  | 
4456  |  |  * all the shift state (seen bits, number of bits seen, high  | 
4457  |  |  * surrogate).  When consumed is NULL, input that ends inside an  | 
  |  |  * inconsistent shift sequence is reported to the error handler instead.  | 
  |  |  * Returns the decoded str object, or NULL on error. */  | 
4458  |  |  | 
4459  |  | PyObject *  | 
4460  |  | PyUnicode_DecodeUTF7Stateful(const char *s,  | 
4461  |  |                              Py_ssize_t size,  | 
4462  |  |                              const char *errors,  | 
4463  |  |                              Py_ssize_t *consumed)  | 
4464  | 0  | { | 
4465  | 0  |     const char *starts = s;  | 
4466  | 0  |     Py_ssize_t startinpos;  | 
4467  | 0  |     Py_ssize_t endinpos;  | 
4468  | 0  |     const char *e;  | 
4469  | 0  |     _PyUnicodeWriter writer;  | 
4470  | 0  |     const char *errmsg = "";  | 
4471  | 0  |     int inShift = 0;  | 
4472  | 0  |     Py_ssize_t shiftOutStart;  | 
4473  | 0  |     unsigned int base64bits = 0;  | 
4474  | 0  |     unsigned long base64buffer = 0;  | 
4475  | 0  |     Py_UCS4 surrogate = 0;  | 
4476  | 0  |     PyObject *errorHandler = NULL;  | 
4477  | 0  |     PyObject *exc = NULL;  | 
4478  |  | 
  | 
4479  | 0  |     if (size == 0) { | 
4480  | 0  |         if (consumed)  | 
4481  | 0  |             *consumed = 0;  | 
4482  | 0  |         _Py_RETURN_UNICODE_EMPTY();  | 
4483  | 0  |     }  | 
4484  |  |  | 
4485  |  |     /* Start off assuming it's all ASCII. Widen later as necessary. */  | 
4486  | 0  |     _PyUnicodeWriter_Init(&writer);  | 
4487  | 0  |     writer.min_length = size;  | 
4488  |  | 
  | 
4489  | 0  |     shiftOutStart = 0;  | 
4490  | 0  |     e = s + size;  | 
4491  |  | 
  | 
4492  | 0  |     while (s < e) { | 
4493  | 0  |         Py_UCS4 ch;  | 
4494  | 0  |       restart: /* also entered by goto from the end-of-input error path when the handler left s before e */  | 
4495  | 0  |         ch = (unsigned char) *s;  | 
4496  |  | 
  | 
4497  | 0  |         if (inShift) { /* in a base-64 section */ | 
4498  | 0  |             if (IS_BASE64(ch)) { /* consume a base-64 character */ | 
4499  | 0  |                 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);  | 
4500  | 0  |                 base64bits += 6;  | 
4501  | 0  |                 s++;  | 
4502  | 0  |                 if (base64bits >= 16) { | 
4503  |  |                     /* we have enough bits for a UTF-16 value */  | 
4504  | 0  |                     Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));  | 
4505  | 0  |                     base64bits -= 16;  | 
4506  | 0  |                     base64buffer &= (1 << base64bits) - 1; /* clear high bits */  | 
4507  | 0  |                     assert(outCh <= 0xffff);  | 
4508  | 0  |                     if (surrogate) { | 
4509  |  |                         /* expecting a second surrogate */  | 
4510  | 0  |                         if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) { | 
4511  | 0  |                             Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);  | 
4512  | 0  |                             if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)  | 
4513  | 0  |                                 goto onError;  | 
4514  | 0  |                             surrogate = 0;  | 
4515  | 0  |                             continue;  | 
4516  | 0  |                         }  | 
4517  | 0  |                         else { /* no low surrogate followed: write the pending high surrogate on its own */ | 
4518  | 0  |                             if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)  | 
4519  | 0  |                                 goto onError;  | 
4520  | 0  |                             surrogate = 0;  | 
4521  | 0  |                         }  | 
4522  | 0  |                     }  | 
4523  | 0  |                     if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) { | 
4524  |  |                         /* first surrogate */  | 
4525  | 0  |                         surrogate = outCh;  | 
4526  | 0  |                     }  | 
4527  | 0  |                     else { | 
4528  | 0  |                         if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)  | 
4529  | 0  |                             goto onError;  | 
4530  | 0  |                     }  | 
4531  | 0  |                 }  | 
4532  | 0  |             }  | 
4533  | 0  |             else { /* now leaving a base-64 section */ | 
4534  | 0  |                 inShift = 0;  | 
4535  | 0  |                 if (base64bits > 0) { /* left-over bits */ | 
4536  | 0  |                     if (base64bits >= 6) { | 
4537  |  |                         /* We've seen at least one base-64 character */  | 
4538  | 0  |                         s++;  | 
4539  | 0  |                         errmsg = "partial character in shift sequence";  | 
4540  | 0  |                         goto utf7Error;  | 
4541  | 0  |                     }  | 
4542  | 0  |                     else { | 
4543  |  |                         /* Some bits remain; they should be zero */  | 
4544  | 0  |                         if (base64buffer != 0) { | 
4545  | 0  |                             s++;  | 
4546  | 0  |                             errmsg = "non-zero padding bits in shift sequence";  | 
4547  | 0  |                             goto utf7Error;  | 
4548  | 0  |                         }  | 
4549  | 0  |                     }  | 
4550  | 0  |                 }  | 
4551  | 0  |                 if (surrogate && DECODE_DIRECT(ch)) { /* a pending high surrogate is written only if the terminating byte decodes directly; otherwise it is dropped below */ | 
4552  | 0  |                     if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)  | 
4553  | 0  |                         goto onError;  | 
4554  | 0  |                 }  | 
4555  | 0  |                 surrogate = 0;  | 
4556  | 0  |                 if (ch == '-') { | 
4557  |  |                     /* '-' is absorbed; other terminating  | 
4558  |  |                        characters are preserved */  | 
4559  | 0  |                     s++;  | 
4560  | 0  |                 }  | 
4561  | 0  |             }  | 
4562  | 0  |         }  | 
4563  | 0  |         else if ( ch == '+' ) { | 
4564  | 0  |             startinpos = s-starts; /* offset of the '+': used for error reporting and, at the end, for *consumed */  | 
4565  | 0  |             s++; /* consume '+' */  | 
4566  | 0  |             if (s < e && *s == '-') { /* '+-' encodes '+' */ | 
4567  | 0  |                 s++;  | 
4568  | 0  |                 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)  | 
4569  | 0  |                     goto onError;  | 
4570  | 0  |             }  | 
4571  | 0  |             else if (s < e && !IS_BASE64(*s)) { | 
4572  | 0  |                 s++;  | 
4573  | 0  |                 errmsg = "ill-formed sequence";  | 
4574  | 0  |                 goto utf7Error;  | 
4575  | 0  |             }  | 
4576  | 0  |             else { /* begin base64-encoded section */ | 
4577  | 0  |                 inShift = 1;  | 
4578  | 0  |                 surrogate = 0;  | 
4579  | 0  |                 shiftOutStart = writer.pos;  | 
4580  | 0  |                 base64bits = 0;  | 
4581  | 0  |                 base64buffer = 0;  | 
4582  | 0  |             }  | 
4583  | 0  |         }  | 
4584  | 0  |         else if (DECODE_DIRECT(ch)) { /* character decodes as itself */ | 
4585  | 0  |             s++;  | 
4586  | 0  |             if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)  | 
4587  | 0  |                 goto onError;  | 
4588  | 0  |         }  | 
4589  | 0  |         else { | 
4590  | 0  |             startinpos = s-starts;  | 
4591  | 0  |             s++;  | 
4592  | 0  |             errmsg = "unexpected special character";  | 
4593  | 0  |             goto utf7Error;  | 
4594  | 0  |         }  | 
4595  | 0  |         continue;  | 
4596  | 0  | utf7Error:  | 
4597  | 0  |         endinpos = s-starts; /* every error path above advanced s past the offending byte first */  | 
4598  | 0  |         if (unicode_decode_call_errorhandler_writer(  | 
4599  | 0  |                 errors, &errorHandler,  | 
4600  | 0  |                 "utf7", errmsg,  | 
4601  | 0  |                 &starts, &e, &startinpos, &endinpos, &exc, &s,  | 
4602  | 0  |                 &writer))  | 
4603  | 0  |             goto onError;  | 
4604  | 0  |     }  | 
4605  |  |  | 
4606  |  |     /* end of string */  | 
4607  |  |  | 
4608  | 0  |     if (inShift && !consumed) { /* in shift sequence, no more to follow */ | 
4609  |  |         /* if we're in an inconsistent state, that's an error */  | 
4610  | 0  |         inShift = 0;  | 
4611  | 0  |         if (surrogate ||  | 
4612  | 0  |                 (base64bits >= 6) ||  | 
4613  | 0  |                 (base64bits > 0 && base64buffer != 0)) { | 
4614  | 0  |             endinpos = size;  | 
4615  | 0  |             if (unicode_decode_call_errorhandler_writer(  | 
4616  | 0  |                     errors, &errorHandler,  | 
4617  | 0  |                     "utf7", "unterminated shift sequence",  | 
4618  | 0  |                     &starts, &e, &startinpos, &endinpos, &exc, &s,  | 
4619  | 0  |                     &writer))  | 
4620  | 0  |                 goto onError;  | 
4621  | 0  |             if (s < e) /* the handler moved the read position back into the input: resume the main loop */  | 
4622  | 0  |                 goto restart;  | 
4623  | 0  |         }  | 
4624  | 0  |     }  | 
4625  |  |  | 
4626  |  |     /* return state */  | 
4627  | 0  |     if (consumed) { | 
4628  | 0  |         if (inShift) { | 
4629  | 0  |             *consumed = startinpos; /* unfinished shift sequence: report only the bytes before its '+' as consumed */  | 
4630  | 0  |             if (writer.pos != shiftOutStart && writer.maxchar > 127) { /* something was written since the '+' and the writer holds non-ASCII: build the result from the first shiftOutStart characters */ | 
4631  | 0  |                 PyObject *result = PyUnicode_FromKindAndData(  | 
4632  | 0  |                         writer.kind, writer.data, shiftOutStart);  | 
4633  | 0  |                 Py_XDECREF(errorHandler);  | 
4634  | 0  |                 Py_XDECREF(exc);  | 
4635  | 0  |                 _PyUnicodeWriter_Dealloc(&writer);  | 
4636  | 0  |                 return result;  | 
4637  | 0  |             }  | 
4638  | 0  |             writer.pos = shiftOutStart; /* back off output */  | 
4639  | 0  |         }  | 
4640  | 0  |         else { | 
4641  | 0  |             *consumed = s-starts;  | 
4642  | 0  |         }  | 
4643  | 0  |     }  | 
4644  |  |  | 
4645  | 0  |     Py_XDECREF(errorHandler);  | 
4646  | 0  |     Py_XDECREF(exc);  | 
4647  | 0  |     return _PyUnicodeWriter_Finish(&writer);  | 
4648  |  |  | 
4649  | 0  |   onError:  | 
4650  | 0  |     Py_XDECREF(errorHandler);  | 
4651  | 0  |     Py_XDECREF(exc);  | 
4652  | 0  |     _PyUnicodeWriter_Dealloc(&writer);  | 
4653  | 0  |     return NULL;  | 
4654  | 0  | }  | 
4655  |  |  | 
4656  |  | /* Encode str as UTF-7 into a new bytes object.  A non-zero base64SetO / base64WhiteSpace forces Set O / whitespace characters into base64.  The errors argument is not used in this function.  Returns NULL on failure. */  | 
4657  |  | PyObject *  | 
4658  |  | _PyUnicode_EncodeUTF7(PyObject *str,  | 
4659  |  |                       int base64SetO,  | 
4660  |  |                       int base64WhiteSpace,  | 
4661  |  |                       const char *errors)  | 
4662  | 0  | { | 
4663  | 0  |     int kind;  | 
4664  | 0  |     void *data;  | 
4665  | 0  |     Py_ssize_t len;  | 
4666  | 0  |     PyObject *v;  | 
4667  | 0  |     int inShift = 0;  | 
4668  | 0  |     Py_ssize_t i;  | 
4669  | 0  |     unsigned int base64bits = 0;  | 
4670  | 0  |     unsigned long base64buffer = 0;  | 
4671  | 0  |     char * out;  | 
4672  | 0  |     char * start;  | 
4673  |  | 
  | 
4674  | 0  |     if (PyUnicode_READY(str) == -1)  | 
4675  | 0  |         return NULL;  | 
4676  | 0  |     kind = PyUnicode_KIND(str);  | 
4677  | 0  |     data = PyUnicode_DATA(str);  | 
4678  | 0  |     len = PyUnicode_GET_LENGTH(str);  | 
4679  |  | 
  | 
4680  | 0  |     if (len == 0)  | 
4681  | 0  |         return PyBytes_FromStringAndSize(NULL, 0);  | 
4682  |  |  | 
4683  |  |     /* It might be possible to tighten this worst case */  | 
4684  | 0  |     if (len > PY_SSIZE_T_MAX / 8) /* keeps the len * 8 allocation size below from overflowing */  | 
4685  | 0  |         return PyErr_NoMemory();  | 
4686  | 0  |     v = PyBytes_FromStringAndSize(NULL, len * 8);  | 
4687  | 0  |     if (v == NULL)  | 
4688  | 0  |         return NULL;  | 
4689  |  |  | 
4690  | 0  |     start = out = PyBytes_AS_STRING(v);  | 
4691  | 0  |     for (i = 0; i < len; ++i) { | 
4692  | 0  |         Py_UCS4 ch = PyUnicode_READ(kind, data, i);  | 
4693  |  | 
  | 
4694  | 0  |         if (inShift) { | 
4695  | 0  |             if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { | 
4696  |  |                 /* shifting out */  | 
4697  | 0  |                 if (base64bits) { /* output remaining bits */ | 
4698  | 0  |                     *out++ = TO_BASE64(base64buffer << (6-base64bits));  | 
4699  | 0  |                     base64buffer = 0;  | 
4700  | 0  |                     base64bits = 0;  | 
4701  | 0  |                 }  | 
4702  | 0  |                 inShift = 0;  | 
4703  |  |                 /* Characters not in the BASE64 set implicitly unshift the sequence  | 
4704  |  |                    so no '-' is required, except if the character is itself a '-' */  | 
4705  | 0  |                 if (IS_BASE64(ch) || ch == '-') { | 
4706  | 0  |                     *out++ = '-';  | 
4707  | 0  |                 }  | 
4708  | 0  |                 *out++ = (char) ch;  | 
4709  | 0  |             }  | 
4710  | 0  |             else { | 
4711  | 0  |                 goto encode_char;  | 
4712  | 0  |             }  | 
4713  | 0  |         }  | 
4714  | 0  |         else { /* not in a shift sequence */ | 
4715  | 0  |             if (ch == '+') { /* a literal '+' is written as the two bytes "+-" */ | 
4716  | 0  |                 *out++ = '+';  | 
4717  | 0  |                         *out++ = '-';  | 
4718  | 0  |             }  | 
4719  | 0  |             else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { | 
4720  | 0  |                 *out++ = (char) ch;  | 
4721  | 0  |             }  | 
4722  | 0  |             else { /* open a base64 section with '+' and encode this character into it */ | 
4723  | 0  |                 *out++ = '+';  | 
4724  | 0  |                 inShift = 1;  | 
4725  | 0  |                 goto encode_char;  | 
4726  | 0  |             }  | 
4727  | 0  |         }  | 
4728  | 0  |         continue;  | 
4729  | 0  | encode_char:  | 
4730  | 0  |         if (ch >= 0x10000) { /* outside the BMP: emit it as a UTF-16 surrogate pair, 16 bits at a time */ | 
4731  | 0  |             assert(ch <= MAX_UNICODE);  | 
4732  |  |  | 
4733  |  |             /* code first surrogate */  | 
4734  | 0  |             base64bits += 16;  | 
4735  | 0  |             base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);  | 
4736  | 0  |             while (base64bits >= 6) { | 
4737  | 0  |                 *out++ = TO_BASE64(base64buffer >> (base64bits-6));  | 
4738  | 0  |                 base64bits -= 6;  | 
4739  | 0  |             }  | 
4740  |  |             /* prepare second surrogate */  | 
4741  | 0  |             ch = Py_UNICODE_LOW_SURROGATE(ch);  | 
4742  | 0  |         }  | 
4743  | 0  |         base64bits += 16;  | 
4744  | 0  |         base64buffer = (base64buffer << 16) | ch;  | 
4745  | 0  |         while (base64bits >= 6) { /* flush every complete 6-bit group; fewer than 6 bits stay buffered */ | 
4746  | 0  |             *out++ = TO_BASE64(base64buffer >> (base64bits-6));  | 
4747  | 0  |             base64bits -= 6;  | 
4748  | 0  |         }  | 
4749  | 0  |     }  | 
4750  | 0  |     if (base64bits) /* pad the left-over bits with zeros up to a full base-64 character */  | 
4751  | 0  |         *out++= TO_BASE64(base64buffer << (6-base64bits) );  | 
4752  | 0  |     if (inShift) /* the string ended inside a base64 section: close it explicitly */  | 
4753  | 0  |         *out++ = '-';  | 
4754  | 0  |     if (_PyBytes_Resize(&v, out - start) < 0) /* shrink the worst-case buffer to the bytes actually written */  | 
4755  | 0  |         return NULL;  | 
4756  | 0  |     return v;  | 
4757  | 0  | }  | 
4758  |  | PyObject *  | 
4759  |  | PyUnicode_EncodeUTF7(const Py_UNICODE *s,  | 
4760  |  |                      Py_ssize_t size,  | 
4761  |  |                      int base64SetO,  | 
4762  |  |                      int base64WhiteSpace,  | 
4763  |  |                      const char *errors)  | 
4764  | 0  | { /* Py_UNICODE buffer wrapper: builds a temporary str object from s and size, then delegates to _PyUnicode_EncodeUTF7. */ | 
4765  | 0  |     PyObject *result;  | 
4766  | 0  |     PyObject *tmp = PyUnicode_FromWideChar(s, size);  | 
4767  | 0  |     if (tmp == NULL)  | 
4768  | 0  |         return NULL;  | 
4769  | 0  |     result = _PyUnicode_EncodeUTF7(tmp, base64SetO,  | 
4770  | 0  |                                    base64WhiteSpace, errors);  | 
4771  | 0  |     Py_DECREF(tmp); /* the temporary is released whether or not encoding succeeded */  | 
4772  | 0  |     return result;  | 
4773  | 0  | }  | 
4774  |  | /* The UTF-7 helper macros defined above are not needed past this point. */  | 
4775  |  | #undef IS_BASE64  | 
4776  |  | #undef FROM_BASE64  | 
4777  |  | #undef TO_BASE64  | 
4778  |  | #undef DECODE_DIRECT  | 
4779  |  | #undef ENCODE_DIRECT  | 
4780  |  |  | 
4781  |  | /* --- UTF-8 Codec -------------------------------------------------------- */  | 
4782  |  | /* Decode size bytes of UTF-8 starting at s: forwards to PyUnicode_DecodeUTF8Stateful with consumed == NULL. */  | 
4783  |  | PyObject *  | 
4784  |  | PyUnicode_DecodeUTF8(const char *s,  | 
4785  |  |                      Py_ssize_t size,  | 
4786  |  |                      const char *errors)  | 
4787  | 405  | { | 
4788  | 405  |     return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);  | 
4789  | 405  | }  | 
4790  |  |  | 
4791  |  | #include "stringlib/asciilib.h"  | 
4792  |  | #include "stringlib/codecs.h"  | 
4793  |  | #include "stringlib/undef.h"  | 
4794  |  |  | 
4795  |  | #include "stringlib/ucs1lib.h"  | 
4796  |  | #include "stringlib/codecs.h"  | 
4797  |  | #include "stringlib/undef.h"  | 
4798  |  |  | 
4799  |  | #include "stringlib/ucs2lib.h"  | 
4800  |  | #include "stringlib/codecs.h"  | 
4801  |  | #include "stringlib/undef.h"  | 
4802  |  |  | 
4803  |  | #include "stringlib/ucs4lib.h"  | 
4804  |  | #include "stringlib/codecs.h"  | 
4805  |  | #include "stringlib/undef.h"  | 
4806  |  |  | 
4807  |  | /* Mask to quickly check whether a C 'long' contains a  | 
4808  |  |    non-ASCII, UTF8-encoded char. */  | 
4809  |  | #if (SIZEOF_LONG == 8)  | 
4810  | 78.7k  | # define ASCII_CHAR_MASK 0x8080808080808080UL  | 
4811  |  | #elif (SIZEOF_LONG == 4)  | 
4812  |  | # define ASCII_CHAR_MASK 0x80808080UL  | 
4813  |  | #else  | 
4814  |  | # error C 'long' size should be either 4 or 8!  | 
4815  |  | #endif  | 
4816  |  |  | 
4817  |  | static Py_ssize_t  | 
4818  |  | ascii_decode(const char *start, const char *end, Py_UCS1 *dest)  | 
4819  | 64.3k  | { | 
4820  | 64.3k  |     const char *p = start;  | 
4821  | 64.3k  |     const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);  | 
4822  |  |  | 
4823  |  |     /*  | 
4824  |  |      * Issue #17237: m68k is a bit different from most architectures in  | 
4825  |  |      * that objects do not use "natural alignment" - for example, int and  | 
4826  |  |      * long are only aligned at 2-byte boundaries.  Therefore the assert()  | 
4827  |  |      * won't work; also, tests have shown that skipping the "optimised  | 
4828  |  |      * version" will even speed up m68k.  | 
4829  |  |      */  | 
4830  | 64.3k  | #if !defined(__m68k__)  | 
4831  | 64.3k  | #if SIZEOF_LONG <= SIZEOF_VOID_P  | 
4832  | 64.3k  |     assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));  | 
4833  | 64.3k  |     if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) { | 
4834  |  |         /* Fast path, see in STRINGLIB(utf8_decode) for  | 
4835  |  |            an explanation. */  | 
4836  |  |         /* Help allocation */  | 
4837  | 14.7k  |         const char *_p = p;  | 
4838  | 14.7k  |         Py_UCS1 * q = dest;  | 
4839  | 66.6k  |         while (_p < aligned_end) { | 
4840  | 51.8k  |             unsigned long value = *(const unsigned long *) _p;  | 
4841  | 51.8k  |             if (value & ASCII_CHAR_MASK)  | 
4842  | 0  |                 break;  | 
4843  | 51.8k  |             *((unsigned long *)q) = value;  | 
4844  | 51.8k  |             _p += SIZEOF_LONG;  | 
4845  | 51.8k  |             q += SIZEOF_LONG;  | 
4846  | 51.8k  |         }  | 
4847  | 14.7k  |         p = _p;  | 
4848  | 62.2k  |         while (p < end) { | 
4849  | 47.5k  |             if ((unsigned char)*p & 0x80)  | 
4850  | 0  |                 break;  | 
4851  | 47.5k  |             *q++ = *p++;  | 
4852  | 47.5k  |         }  | 
4853  | 14.7k  |         return p - start;  | 
4854  | 14.7k  |     }  | 
4855  | 49.5k  | #endif  | 
4856  | 49.5k  | #endif  | 
4857  | 397k  |     while (p < end) { | 
4858  |  |         /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h  | 
4859  |  |            for an explanation. */  | 
4860  | 350k  |         if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) { | 
4861  |  |             /* Help allocation */  | 
4862  | 43.7k  |             const char *_p = p;  | 
4863  | 70.6k  |             while (_p < aligned_end) { | 
4864  | 26.8k  |                 unsigned long value = *(const unsigned long *) _p;  | 
4865  | 26.8k  |                 if (value & ASCII_CHAR_MASK)  | 
4866  | 15  |                     break;  | 
4867  | 26.8k  |                 _p += SIZEOF_LONG;  | 
4868  | 26.8k  |             }  | 
4869  | 43.7k  |             p = _p;  | 
4870  | 43.7k  |             if (_p == end)  | 
4871  | 2.62k  |                 break;  | 
4872  | 43.7k  |         }  | 
4873  | 348k  |         if ((unsigned char)*p & 0x80)  | 
4874  | 15  |             break;  | 
4875  | 348k  |         ++p;  | 
4876  | 348k  |     }  | 
4877  | 49.5k  |     memcpy(dest, start, p - start);  | 
4878  | 49.5k  |     return p - start;  | 
4879  | 64.3k  | }  | 
4880  |  |  | 
4881  |  | static PyObject *  | 
4882  |  | unicode_decode_utf8(const char *s, Py_ssize_t size,  | 
4883  |  |                     _Py_error_handler error_handler, const char *errors,  | 
4884  |  |                     Py_ssize_t *consumed)  | 
4885  | 64.7k  | { | 
4886  | 64.7k  |     _PyUnicodeWriter writer;  | 
4887  | 64.7k  |     const char *starts = s;  | 
4888  | 64.7k  |     const char *end = s + size;  | 
4889  |  |  | 
4890  | 64.7k  |     Py_ssize_t startinpos;  | 
4891  | 64.7k  |     Py_ssize_t endinpos;  | 
4892  | 64.7k  |     const char *errmsg = "";  | 
4893  | 64.7k  |     PyObject *error_handler_obj = NULL;  | 
4894  | 64.7k  |     PyObject *exc = NULL;  | 
4895  |  |  | 
4896  | 64.7k  |     if (size == 0) { | 
4897  | 333  |         if (consumed)  | 
4898  | 0  |             *consumed = 0;  | 
4899  | 333  |         _Py_RETURN_UNICODE_EMPTY();  | 
4900  | 333  |     }  | 
4901  |  |  | 
4902  |  |     /* ASCII is equivalent to the first 128 ordinals in Unicode. */  | 
4903  | 64.4k  |     if (size == 1 && (unsigned char)s[0] < 128) { | 
4904  | 566  |         if (consumed)  | 
4905  | 0  |             *consumed = 1;  | 
4906  | 566  |         return get_latin1_char((unsigned char)s[0]);  | 
4907  | 566  |     }  | 
4908  |  |  | 
4909  | 63.8k  |     _PyUnicodeWriter_Init(&writer);  | 
4910  | 63.8k  |     writer.min_length = size;  | 
4911  | 63.8k  |     if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)  | 
4912  | 0  |         goto onError;  | 
4913  |  |  | 
4914  | 63.8k  |     writer.pos = ascii_decode(s, end, writer.data);  | 
4915  | 63.8k  |     s += writer.pos;  | 
4916  | 63.9k  |     while (s < end) { | 
4917  | 44  |         Py_UCS4 ch;  | 
4918  | 44  |         int kind = writer.kind;  | 
4919  |  |  | 
4920  | 44  |         if (kind == PyUnicode_1BYTE_KIND) { | 
4921  | 30  |             if (PyUnicode_IS_ASCII(writer.buffer))  | 
4922  | 15  |                 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);  | 
4923  | 15  |             else  | 
4924  | 15  |                 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);  | 
4925  | 30  |         } else if (kind == PyUnicode_2BYTE_KIND) { | 
4926  | 14  |             ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);  | 
4927  | 14  |         } else { | 
4928  | 0  |             assert(kind == PyUnicode_4BYTE_KIND);  | 
4929  | 0  |             ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);  | 
4930  | 0  |         }  | 
4931  |  |  | 
4932  | 44  |         switch (ch) { | 
4933  | 15  |         case 0:  | 
4934  | 15  |             if (s == end || consumed)  | 
4935  | 15  |                 goto End;  | 
4936  | 0  |             errmsg = "unexpected end of data";  | 
4937  | 0  |             startinpos = s - starts;  | 
4938  | 0  |             endinpos = end - starts;  | 
4939  | 0  |             break;  | 
4940  | 0  |         case 1:  | 
4941  | 0  |             errmsg = "invalid start byte";  | 
4942  | 0  |             startinpos = s - starts;  | 
4943  | 0  |             endinpos = startinpos + 1;  | 
4944  | 0  |             break;  | 
4945  | 0  |         case 2:  | 
4946  | 0  |             if (consumed && (unsigned char)s[0] == 0xED && end - s == 2  | 
4947  | 0  |                 && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)  | 
4948  | 0  |             { | 
4949  |  |                 /* Truncated surrogate code in range D800-DFFF */  | 
4950  | 0  |                 goto End;  | 
4951  | 0  |             }  | 
4952  |  |             /* fall through */  | 
4953  | 0  |         case 3:  | 
4954  | 0  |         case 4:  | 
4955  | 0  |             errmsg = "invalid continuation byte";  | 
4956  | 0  |             startinpos = s - starts;  | 
4957  | 0  |             endinpos = startinpos + ch - 1;  | 
4958  | 0  |             break;  | 
4959  | 29  |         default:  | 
4960  | 29  |             if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)  | 
4961  | 0  |                 goto onError;  | 
4962  | 29  |             continue;  | 
4963  | 44  |         }  | 
4964  |  |  | 
4965  | 0  |         if (error_handler == _Py_ERROR_UNKNOWN)  | 
4966  | 0  |             error_handler = _Py_GetErrorHandler(errors);  | 
4967  |  | 
  | 
4968  | 0  |         switch (error_handler) { | 
4969  | 0  |         case _Py_ERROR_IGNORE:  | 
4970  | 0  |             s += (endinpos - startinpos);  | 
4971  | 0  |             break;  | 
4972  |  |  | 
4973  | 0  |         case _Py_ERROR_REPLACE:  | 
4974  | 0  |             if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)  | 
4975  | 0  |                 goto onError;  | 
4976  | 0  |             s += (endinpos - startinpos);  | 
4977  | 0  |             break;  | 
4978  |  |  | 
4979  | 0  |         case _Py_ERROR_SURROGATEESCAPE:  | 
4980  | 0  |         { | 
4981  | 0  |             Py_ssize_t i;  | 
4982  |  | 
  | 
4983  | 0  |             if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)  | 
4984  | 0  |                 goto onError;  | 
4985  | 0  |             for (i=startinpos; i<endinpos; i++) { | 
4986  | 0  |                 ch = (Py_UCS4)(unsigned char)(starts[i]);  | 
4987  | 0  |                 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,  | 
4988  | 0  |                                 ch + 0xdc00);  | 
4989  | 0  |                 writer.pos++;  | 
4990  | 0  |             }  | 
4991  | 0  |             s += (endinpos - startinpos);  | 
4992  | 0  |             break;  | 
4993  | 0  |         }  | 
4994  |  |  | 
4995  | 0  |         default:  | 
4996  | 0  |             if (unicode_decode_call_errorhandler_writer(  | 
4997  | 0  |                     errors, &error_handler_obj,  | 
4998  | 0  |                     "utf-8", errmsg,  | 
4999  | 0  |                     &starts, &end, &startinpos, &endinpos, &exc, &s,  | 
5000  | 0  |                     &writer))  | 
5001  | 0  |                 goto onError;  | 
5002  | 0  |         }  | 
5003  | 0  |     }  | 
5004  |  |  | 
5005  | 63.8k  | End:  | 
5006  | 63.8k  |     if (consumed)  | 
5007  | 2  |         *consumed = s - starts;  | 
5008  |  |  | 
5009  | 63.8k  |     Py_XDECREF(error_handler_obj);  | 
5010  | 63.8k  |     Py_XDECREF(exc);  | 
5011  | 63.8k  |     return _PyUnicodeWriter_Finish(&writer);  | 
5012  |  |  | 
5013  | 0  | onError:  | 
5014  | 0  |     Py_XDECREF(error_handler_obj);  | 
5015  | 0  |     Py_XDECREF(exc);  | 
5016  | 0  |     _PyUnicodeWriter_Dealloc(&writer);  | 
5017  | 0  |     return NULL;  | 
5018  | 63.8k  | }  | 
5019  |  |  | 
5020  |  |  | 
5021  |  | PyObject *  | 
5022  |  | PyUnicode_DecodeUTF8Stateful(const char *s,  | 
5023  |  |                              Py_ssize_t size,  | 
5024  |  |                              const char *errors,  | 
5025  |  |                              Py_ssize_t *consumed)  | 
5026  | 64.7k  | { | 
5027  | 64.7k  |     return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);  | 
5028  | 64.7k  | }  | 
5029  |  |  | 
5030  |  |  | 
5031  |  | /* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is  | 
5032  |  |    non-zero, use strict error handler otherwise.  | 
5033  |  |  | 
5034  |  |    On success, write a pointer to a newly allocated wide character string into  | 
5035  |  |    *wstr (use PyMem_RawFree() to free the memory) and write the output length  | 
5036  |  |    (in number of wchar_t units) into *wlen (if wlen is set).  | 
5037  |  |  | 
5038  |  |    On memory allocation failure, return -1.  | 
5039  |  |  | 
5040  |  |    On decoding error (if surrogateescape is zero), return -2. If wlen is  | 
5041  |  |    non-NULL, write the start of the illegal byte sequence into *wlen. If reason  | 
5042  |  |    is not NULL, write the decoding error message into *reason. */  | 
5043  |  | int  | 
5044  |  | _Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,  | 
5045  |  |                  const char **reason, _Py_error_handler errors)  | 
5046  | 0  | { | 
5047  | 0  |     const char *orig_s = s;  | 
5048  | 0  |     const char *e;  | 
5049  | 0  |     wchar_t *unicode;  | 
5050  | 0  |     Py_ssize_t outpos;  | 
5051  |  | 
  | 
5052  | 0  |     int surrogateescape = 0;  | 
5053  | 0  |     int surrogatepass = 0;  | 
5054  | 0  |     switch (errors)  | 
5055  | 0  |     { | 
5056  | 0  |     case _Py_ERROR_STRICT:  | 
5057  | 0  |         break;  | 
5058  | 0  |     case _Py_ERROR_SURROGATEESCAPE:  | 
5059  | 0  |         surrogateescape = 1;  | 
5060  | 0  |         break;  | 
5061  | 0  |     case _Py_ERROR_SURROGATEPASS:  | 
5062  | 0  |         surrogatepass = 1;  | 
5063  | 0  |         break;  | 
5064  | 0  |     default:  | 
5065  | 0  |         return -3;  | 
5066  | 0  |     }  | 
5067  |  |  | 
5068  |  |     /* Note: size will always be longer than the resulting Unicode  | 
5069  |  |        character count */  | 
5070  | 0  |     if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) { | 
5071  | 0  |         return -1;  | 
5072  | 0  |     }  | 
5073  |  |  | 
5074  | 0  |     unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));  | 
5075  | 0  |     if (!unicode) { | 
5076  | 0  |         return -1;  | 
5077  | 0  |     }  | 
5078  |  |  | 
5079  |  |     /* Unpack UTF-8 encoded data */  | 
5080  | 0  |     e = s + size;  | 
5081  | 0  |     outpos = 0;  | 
5082  | 0  |     while (s < e) { | 
5083  | 0  |         Py_UCS4 ch;  | 
5084  | 0  | #if SIZEOF_WCHAR_T == 4  | 
5085  | 0  |         ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);  | 
5086  |  | #else  | 
5087  |  |         ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);  | 
5088  |  | #endif  | 
5089  | 0  |         if (ch > 0xFF) { | 
5090  | 0  | #if SIZEOF_WCHAR_T == 4  | 
5091  | 0  |             Py_UNREACHABLE();  | 
5092  |  | #else  | 
5093  |  |             assert(ch > 0xFFFF && ch <= MAX_UNICODE);  | 
5094  |  |             /* write a surrogate pair */  | 
5095  |  |             unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);  | 
5096  |  |             unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);  | 
5097  |  | #endif  | 
5098  | 0  |         }  | 
5099  | 0  |         else { | 
5100  | 0  |             if (!ch && s == e) { | 
5101  | 0  |                 break;  | 
5102  | 0  |             }  | 
5103  |  |  | 
5104  | 0  |             if (surrogateescape) { | 
5105  | 0  |                 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;  | 
5106  | 0  |             }  | 
5107  | 0  |             else { | 
5108  |  |                 /* Is it a valid three-byte code? */  | 
5109  | 0  |                 if (surrogatepass  | 
5110  | 0  |                     && (e - s) >= 3  | 
5111  | 0  |                     && (s[0] & 0xf0) == 0xe0  | 
5112  | 0  |                     && (s[1] & 0xc0) == 0x80  | 
5113  | 0  |                     && (s[2] & 0xc0) == 0x80)  | 
5114  | 0  |                 { | 
5115  | 0  |                     ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);  | 
5116  | 0  |                     s += 3;  | 
5117  | 0  |                     unicode[outpos++] = ch;  | 
5118  | 0  |                 }  | 
5119  | 0  |                 else { | 
5120  | 0  |                     PyMem_RawFree(unicode );  | 
5121  | 0  |                     if (reason != NULL) { | 
5122  | 0  |                         switch (ch) { | 
5123  | 0  |                         case 0:  | 
5124  | 0  |                             *reason = "unexpected end of data";  | 
5125  | 0  |                             break;  | 
5126  | 0  |                         case 1:  | 
5127  | 0  |                             *reason = "invalid start byte";  | 
5128  | 0  |                             break;  | 
5129  |  |                         /* 2, 3, 4 */  | 
5130  | 0  |                         default:  | 
5131  | 0  |                             *reason = "invalid continuation byte";  | 
5132  | 0  |                             break;  | 
5133  | 0  |                         }  | 
5134  | 0  |                     }  | 
5135  | 0  |                     if (wlen != NULL) { | 
5136  | 0  |                         *wlen = s - orig_s;  | 
5137  | 0  |                     }  | 
5138  | 0  |                     return -2;  | 
5139  | 0  |                 }  | 
5140  | 0  |             }  | 
5141  | 0  |         }  | 
5142  | 0  |     }  | 
5143  | 0  |     unicode[outpos] = L'\0';  | 
5144  | 0  |     if (wlen) { | 
5145  | 0  |         *wlen = outpos;  | 
5146  | 0  |     }  | 
5147  | 0  |     *wstr = unicode;  | 
5148  | 0  |     return 0;  | 
5149  | 0  | }  | 
5150  |  |  | 
5151  |  |  | 
5152  |  | wchar_t*  | 
5153  |  | _Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,  | 
5154  |  |                                size_t *wlen)  | 
5155  | 0  | { | 
5156  | 0  |     wchar_t *wstr;  | 
5157  | 0  |     int res = _Py_DecodeUTF8Ex(arg, arglen,  | 
5158  | 0  |                                &wstr, wlen,  | 
5159  | 0  |                                NULL, _Py_ERROR_SURROGATEESCAPE);  | 
5160  | 0  |     if (res != 0) { | 
5161  |  |         /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */  | 
5162  | 0  |         assert(res != -3);  | 
5163  | 0  |         if (wlen) { | 
5164  | 0  |             *wlen = (size_t)res;  | 
5165  | 0  |         }  | 
5166  | 0  |         return NULL;  | 
5167  | 0  |     }  | 
5168  | 0  |     return wstr;  | 
5169  | 0  | }  | 
5170  |  |  | 
5171  |  |  | 
5172  |  | /* UTF-8 encoder using the surrogateescape error handler .  | 
5173  |  |  | 
5174  |  |    On success, return 0 and write the newly allocated character string (use  | 
5175  |  |    PyMem_Free() to free the memory) into *str.  | 
5176  |  |  | 
5177  |  |    On encoding failure, return -2 and write the position of the invalid  | 
5178  |  |    surrogate character into *error_pos (if error_pos is set) and the decoding  | 
5179  |  |    error message into *reason (if reason is set).  | 
5180  |  |  | 
5181  |  |    On memory allocation failure, return -1. */  | 
5182  |  | int  | 
5183  |  | _Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,  | 
5184  |  |                  const char **reason, int raw_malloc, _Py_error_handler errors)  | 
5185  | 56  | { | 
5186  | 56  |     const Py_ssize_t max_char_size = 4;  | 
5187  | 56  |     Py_ssize_t len = wcslen(text);  | 
5188  |  |  | 
5189  | 56  |     assert(len >= 0);  | 
5190  |  |  | 
5191  | 56  |     int surrogateescape = 0;  | 
5192  | 56  |     int surrogatepass = 0;  | 
5193  | 56  |     switch (errors)  | 
5194  | 56  |     { | 
5195  | 56  |     case _Py_ERROR_STRICT:  | 
5196  | 56  |         break;  | 
5197  | 0  |     case _Py_ERROR_SURROGATEESCAPE:  | 
5198  | 0  |         surrogateescape = 1;  | 
5199  | 0  |         break;  | 
5200  | 0  |     case _Py_ERROR_SURROGATEPASS:  | 
5201  | 0  |         surrogatepass = 1;  | 
5202  | 0  |         break;  | 
5203  | 0  |     default:  | 
5204  | 0  |         return -3;  | 
5205  | 56  |     }  | 
5206  |  |  | 
5207  | 56  |     if (len > PY_SSIZE_T_MAX / max_char_size - 1) { | 
5208  | 0  |         return -1;  | 
5209  | 0  |     }  | 
5210  | 56  |     char *bytes;  | 
5211  | 56  |     if (raw_malloc) { | 
5212  | 56  |         bytes = PyMem_RawMalloc((len + 1) * max_char_size);  | 
5213  | 56  |     }  | 
5214  | 0  |     else { | 
5215  | 0  |         bytes = PyMem_Malloc((len + 1) * max_char_size);  | 
5216  | 0  |     }  | 
5217  | 56  |     if (bytes == NULL) { | 
5218  | 0  |         return -1;  | 
5219  | 0  |     }  | 
5220  |  |  | 
5221  | 56  |     char *p = bytes;  | 
5222  | 56  |     Py_ssize_t i;  | 
5223  | 728  |     for (i = 0; i < len; ) { | 
5224  | 672  |         Py_ssize_t ch_pos = i;  | 
5225  | 672  |         Py_UCS4 ch = text[i];  | 
5226  | 672  |         i++;  | 
5227  |  | #if Py_UNICODE_SIZE == 2  | 
5228  |  |         if (Py_UNICODE_IS_HIGH_SURROGATE(ch)  | 
5229  |  |             && i < len  | 
5230  |  |             && Py_UNICODE_IS_LOW_SURROGATE(text[i]))  | 
5231  |  |         { | 
5232  |  |             ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);  | 
5233  |  |             i++;  | 
5234  |  |         }  | 
5235  |  | #endif  | 
5236  |  |  | 
5237  | 672  |         if (ch < 0x80) { | 
5238  |  |             /* Encode ASCII */  | 
5239  | 672  |             *p++ = (char) ch;  | 
5240  |  |  | 
5241  | 672  |         }  | 
5242  | 0  |         else if (ch < 0x0800) { | 
5243  |  |             /* Encode Latin-1 */  | 
5244  | 0  |             *p++ = (char)(0xc0 | (ch >> 6));  | 
5245  | 0  |             *p++ = (char)(0x80 | (ch & 0x3f));  | 
5246  | 0  |         }  | 
5247  | 0  |         else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) { | 
5248  |  |             /* surrogateescape error handler */  | 
5249  | 0  |             if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) { | 
5250  | 0  |                 if (error_pos != NULL) { | 
5251  | 0  |                     *error_pos = (size_t)ch_pos;  | 
5252  | 0  |                 }  | 
5253  | 0  |                 if (reason != NULL) { | 
5254  | 0  |                     *reason = "encoding error";  | 
5255  | 0  |                 }  | 
5256  | 0  |                 if (raw_malloc) { | 
5257  | 0  |                     PyMem_RawFree(bytes);  | 
5258  | 0  |                 }  | 
5259  | 0  |                 else { | 
5260  | 0  |                     PyMem_Free(bytes);  | 
5261  | 0  |                 }  | 
5262  | 0  |                 return -2;  | 
5263  | 0  |             }  | 
5264  | 0  |             *p++ = (char)(ch & 0xff);  | 
5265  | 0  |         }  | 
5266  | 0  |         else if (ch < 0x10000) { | 
5267  | 0  |             *p++ = (char)(0xe0 | (ch >> 12));  | 
5268  | 0  |             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));  | 
5269  | 0  |             *p++ = (char)(0x80 | (ch & 0x3f));  | 
5270  | 0  |         }  | 
5271  | 0  |         else {  /* ch >= 0x10000 */ | 
5272  | 0  |             assert(ch <= MAX_UNICODE);  | 
5273  |  |             /* Encode UCS4 Unicode ordinals */  | 
5274  | 0  |             *p++ = (char)(0xf0 | (ch >> 18));  | 
5275  | 0  |             *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));  | 
5276  | 0  |             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));  | 
5277  | 0  |             *p++ = (char)(0x80 | (ch & 0x3f));  | 
5278  | 0  |         }  | 
5279  | 672  |     }  | 
5280  | 56  |     *p++ = '\0';  | 
5281  |  |  | 
5282  | 56  |     size_t final_size = (p - bytes);  | 
5283  | 56  |     char *bytes2;  | 
5284  | 56  |     if (raw_malloc) { | 
5285  | 56  |         bytes2 = PyMem_RawRealloc(bytes, final_size);  | 
5286  | 56  |     }  | 
5287  | 0  |     else { | 
5288  | 0  |         bytes2 = PyMem_Realloc(bytes, final_size);  | 
5289  | 0  |     }  | 
5290  | 56  |     if (bytes2 == NULL) { | 
5291  | 0  |         if (error_pos != NULL) { | 
5292  | 0  |             *error_pos = (size_t)-1;  | 
5293  | 0  |         }  | 
5294  | 0  |         if (raw_malloc) { | 
5295  | 0  |             PyMem_RawFree(bytes);  | 
5296  | 0  |         }  | 
5297  | 0  |         else { | 
5298  | 0  |             PyMem_Free(bytes);  | 
5299  | 0  |         }  | 
5300  | 0  |         return -1;  | 
5301  | 0  |     }  | 
5302  | 56  |     *str = bytes2;  | 
5303  | 56  |     return 0;  | 
5304  | 56  | }  | 
5305  |  |  | 
5306  |  |  | 
5307  |  | /* Primary internal function which creates utf8 encoded bytes objects.  | 
5308  |  |  | 
5309  |  |    Allocation strategy:  if the string is short, convert into a stack buffer  | 
5310  |  |    and allocate exactly as much space needed at the end.  Else allocate the  | 
5311  |  |    maximum possible needed (4 result bytes per Unicode character), and return  | 
5312  |  |    the excess memory at the end.  | 
5313  |  | */  | 
5314  |  | static PyObject *  | 
5315  |  | unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,  | 
5316  |  |                     const char *errors)  | 
5317  | 0  | { | 
5318  | 0  |     enum PyUnicode_Kind kind;  | 
5319  | 0  |     void *data;  | 
5320  | 0  |     Py_ssize_t size;  | 
5321  |  | 
  | 
5322  | 0  |     if (!PyUnicode_Check(unicode)) { | 
5323  | 0  |         PyErr_BadArgument();  | 
5324  | 0  |         return NULL;  | 
5325  | 0  |     }  | 
5326  |  |  | 
5327  | 0  |     if (PyUnicode_READY(unicode) == -1)  | 
5328  | 0  |         return NULL;  | 
5329  |  |  | 
5330  | 0  |     if (PyUnicode_UTF8(unicode))  | 
5331  | 0  |         return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),  | 
5332  | 0  |                                          PyUnicode_UTF8_LENGTH(unicode));  | 
5333  |  |  | 
5334  | 0  |     kind = PyUnicode_KIND(unicode);  | 
5335  | 0  |     data = PyUnicode_DATA(unicode);  | 
5336  | 0  |     size = PyUnicode_GET_LENGTH(unicode);  | 
5337  |  | 
  | 
5338  | 0  |     switch (kind) { | 
5339  | 0  |     default:  | 
5340  | 0  |         Py_UNREACHABLE();  | 
5341  | 0  |     case PyUnicode_1BYTE_KIND:  | 
5342  |  |         /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */  | 
5343  | 0  |         assert(!PyUnicode_IS_ASCII(unicode));  | 
5344  | 0  |         return ucs1lib_utf8_encoder(unicode, data, size, error_handler, errors);  | 
5345  | 0  |     case PyUnicode_2BYTE_KIND:  | 
5346  | 0  |         return ucs2lib_utf8_encoder(unicode, data, size, error_handler, errors);  | 
5347  | 0  |     case PyUnicode_4BYTE_KIND:  | 
5348  | 0  |         return ucs4lib_utf8_encoder(unicode, data, size, error_handler, errors);  | 
5349  | 0  |     }  | 
5350  | 0  | }  | 
5351  |  |  | 
5352  |  | PyObject *  | 
5353  |  | _PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)  | 
5354  | 0  | { | 
5355  | 0  |     return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);  | 
5356  | 0  | }  | 
5357  |  |  | 
5358  |  |  | 
5359  |  | PyObject *  | 
5360  |  | PyUnicode_EncodeUTF8(const Py_UNICODE *s,  | 
5361  |  |                      Py_ssize_t size,  | 
5362  |  |                      const char *errors)  | 
5363  | 0  | { | 
5364  | 0  |     PyObject *v, *unicode;  | 
5365  |  | 
  | 
5366  | 0  |     unicode = PyUnicode_FromWideChar(s, size);  | 
5367  | 0  |     if (unicode == NULL)  | 
5368  | 0  |         return NULL;  | 
5369  | 0  |     v = _PyUnicode_AsUTF8String(unicode, errors);  | 
5370  | 0  |     Py_DECREF(unicode);  | 
5371  | 0  |     return v;  | 
5372  | 0  | }  | 
5373  |  |  | 
5374  |  | PyObject *  | 
5375  |  | PyUnicode_AsUTF8String(PyObject *unicode)  | 
5376  | 0  | { | 
5377  | 0  |     return _PyUnicode_AsUTF8String(unicode, NULL);  | 
5378  | 0  | }  | 
5379  |  |  | 
5380  |  | /* --- UTF-32 Codec ------------------------------------------------------- */  | 
5381  |  |  | 
5382  |  | PyObject *  | 
5383  |  | PyUnicode_DecodeUTF32(const char *s,  | 
5384  |  |                       Py_ssize_t size,  | 
5385  |  |                       const char *errors,  | 
5386  |  |                       int *byteorder)  | 
5387  | 0  | { | 
5388  | 0  |     return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);  | 
5389  | 0  | }  | 
5390  |  |  | 
5391  |  | PyObject *  | 
5392  |  | PyUnicode_DecodeUTF32Stateful(const char *s,  | 
5393  |  |                               Py_ssize_t size,  | 
5394  |  |                               const char *errors,  | 
5395  |  |                               int *byteorder,  | 
5396  |  |                               Py_ssize_t *consumed)  | 
5397  | 0  | { | 
5398  | 0  |     const char *starts = s;  | 
5399  | 0  |     Py_ssize_t startinpos;  | 
5400  | 0  |     Py_ssize_t endinpos;  | 
5401  | 0  |     _PyUnicodeWriter writer;  | 
5402  | 0  |     const unsigned char *q, *e;  | 
5403  | 0  |     int le, bo = 0;       /* assume native ordering by default */  | 
5404  | 0  |     const char *encoding;  | 
5405  | 0  |     const char *errmsg = "";  | 
5406  | 0  |     PyObject *errorHandler = NULL;  | 
5407  | 0  |     PyObject *exc = NULL;  | 
5408  |  | 
  | 
5409  | 0  |     q = (const unsigned char *)s;  | 
5410  | 0  |     e = q + size;  | 
5411  |  | 
  | 
5412  | 0  |     if (byteorder)  | 
5413  | 0  |         bo = *byteorder;  | 
5414  |  |  | 
5415  |  |     /* Check for BOM marks (U+FEFF) in the input and adjust current  | 
5416  |  |        byte order setting accordingly. In native mode, the leading BOM  | 
5417  |  |        mark is skipped, in all other modes, it is copied to the output  | 
5418  |  |        stream as-is (giving a ZWNBSP character). */  | 
5419  | 0  |     if (bo == 0 && size >= 4) { | 
5420  | 0  |         Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];  | 
5421  | 0  |         if (bom == 0x0000FEFF) { | 
5422  | 0  |             bo = -1;  | 
5423  | 0  |             q += 4;  | 
5424  | 0  |         }  | 
5425  | 0  |         else if (bom == 0xFFFE0000) { | 
5426  | 0  |             bo = 1;  | 
5427  | 0  |             q += 4;  | 
5428  | 0  |         }  | 
5429  | 0  |         if (byteorder)  | 
5430  | 0  |             *byteorder = bo;  | 
5431  | 0  |     }  | 
5432  |  | 
  | 
5433  | 0  |     if (q == e) { | 
5434  | 0  |         if (consumed)  | 
5435  | 0  |             *consumed = size;  | 
5436  | 0  |         _Py_RETURN_UNICODE_EMPTY();  | 
5437  | 0  |     }  | 
5438  |  |  | 
5439  |  | #ifdef WORDS_BIGENDIAN  | 
5440  |  |     le = bo < 0;  | 
5441  |  | #else  | 
5442  | 0  |     le = bo <= 0;  | 
5443  | 0  | #endif  | 
5444  | 0  |     encoding = le ? "utf-32-le" : "utf-32-be";  | 
5445  |  | 
  | 
5446  | 0  |     _PyUnicodeWriter_Init(&writer);  | 
5447  | 0  |     writer.min_length = (e - q + 3) / 4;  | 
5448  | 0  |     if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)  | 
5449  | 0  |         goto onError;  | 
5450  |  |  | 
5451  | 0  |     while (1) { | 
5452  | 0  |         Py_UCS4 ch = 0;  | 
5453  | 0  |         Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);  | 
5454  |  | 
  | 
5455  | 0  |         if (e - q >= 4) { | 
5456  | 0  |             enum PyUnicode_Kind kind = writer.kind;  | 
5457  | 0  |             void *data = writer.data;  | 
5458  | 0  |             const unsigned char *last = e - 4;  | 
5459  | 0  |             Py_ssize_t pos = writer.pos;  | 
5460  | 0  |             if (le) { | 
5461  | 0  |                 do { | 
5462  | 0  |                     ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];  | 
5463  | 0  |                     if (ch > maxch)  | 
5464  | 0  |                         break;  | 
5465  | 0  |                     if (kind != PyUnicode_1BYTE_KIND &&  | 
5466  | 0  |                         Py_UNICODE_IS_SURROGATE(ch))  | 
5467  | 0  |                         break;  | 
5468  | 0  |                     PyUnicode_WRITE(kind, data, pos++, ch);  | 
5469  | 0  |                     q += 4;  | 
5470  | 0  |                 } while (q <= last);  | 
5471  | 0  |             }  | 
5472  | 0  |             else { | 
5473  | 0  |                 do { | 
5474  | 0  |                     ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];  | 
5475  | 0  |                     if (ch > maxch)  | 
5476  | 0  |                         break;  | 
5477  | 0  |                     if (kind != PyUnicode_1BYTE_KIND &&  | 
5478  | 0  |                         Py_UNICODE_IS_SURROGATE(ch))  | 
5479  | 0  |                         break;  | 
5480  | 0  |                     PyUnicode_WRITE(kind, data, pos++, ch);  | 
5481  | 0  |                     q += 4;  | 
5482  | 0  |                 } while (q <= last);  | 
5483  | 0  |             }  | 
5484  | 0  |             writer.pos = pos;  | 
5485  | 0  |         }  | 
5486  |  |  | 
5487  | 0  |         if (Py_UNICODE_IS_SURROGATE(ch)) { | 
5488  | 0  |             errmsg = "code point in surrogate code point range(0xd800, 0xe000)";  | 
5489  | 0  |             startinpos = ((const char *)q) - starts;  | 
5490  | 0  |             endinpos = startinpos + 4;  | 
5491  | 0  |         }  | 
5492  | 0  |         else if (ch <= maxch) { | 
5493  | 0  |             if (q == e || consumed)  | 
5494  | 0  |                 break;  | 
5495  |  |             /* remaining bytes at the end? (size should be divisible by 4) */  | 
5496  | 0  |             errmsg = "truncated data";  | 
5497  | 0  |             startinpos = ((const char *)q) - starts;  | 
5498  | 0  |             endinpos = ((const char *)e) - starts;  | 
5499  | 0  |         }  | 
5500  | 0  |         else { | 
5501  | 0  |             if (ch < 0x110000) { | 
5502  | 0  |                 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)  | 
5503  | 0  |                     goto onError;  | 
5504  | 0  |                 q += 4;  | 
5505  | 0  |                 continue;  | 
5506  | 0  |             }  | 
5507  | 0  |             errmsg = "code point not in range(0x110000)";  | 
5508  | 0  |             startinpos = ((const char *)q) - starts;  | 
5509  | 0  |             endinpos = startinpos + 4;  | 
5510  | 0  |         }  | 
5511  |  |  | 
5512  |  |         /* The remaining input chars are ignored if the callback  | 
5513  |  |            chooses to skip the input */  | 
5514  | 0  |         if (unicode_decode_call_errorhandler_writer(  | 
5515  | 0  |                 errors, &errorHandler,  | 
5516  | 0  |                 encoding, errmsg,  | 
5517  | 0  |                 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,  | 
5518  | 0  |                 &writer))  | 
5519  | 0  |             goto onError;  | 
5520  | 0  |     }  | 
5521  |  |  | 
5522  | 0  |     if (consumed)  | 
5523  | 0  |         *consumed = (const char *)q-starts;  | 
5524  |  | 
  | 
5525  | 0  |     Py_XDECREF(errorHandler);  | 
5526  | 0  |     Py_XDECREF(exc);  | 
5527  | 0  |     return _PyUnicodeWriter_Finish(&writer);  | 
5528  |  |  | 
5529  | 0  |   onError:  | 
5530  | 0  |     _PyUnicodeWriter_Dealloc(&writer);  | 
5531  | 0  |     Py_XDECREF(errorHandler);  | 
5532  | 0  |     Py_XDECREF(exc);  | 
5533  | 0  |     return NULL;  | 
5534  | 0  | }  | 
5535  |  |  | 
5536  |  | PyObject *  | 
5537  |  | _PyUnicode_EncodeUTF32(PyObject *str,  | 
5538  |  |                        const char *errors,  | 
5539  |  |                        int byteorder)  | 
5540  | 0  | { | 
5541  | 0  |     enum PyUnicode_Kind kind;  | 
5542  | 0  |     const void *data;  | 
5543  | 0  |     Py_ssize_t len;  | 
5544  | 0  |     PyObject *v;  | 
5545  | 0  |     uint32_t *out;  | 
5546  | 0  | #if PY_LITTLE_ENDIAN  | 
5547  | 0  |     int native_ordering = byteorder <= 0;  | 
5548  |  | #else  | 
5549  |  |     int native_ordering = byteorder >= 0;  | 
5550  |  | #endif  | 
5551  | 0  |     const char *encoding;  | 
5552  | 0  |     Py_ssize_t nsize, pos;  | 
5553  | 0  |     PyObject *errorHandler = NULL;  | 
5554  | 0  |     PyObject *exc = NULL;  | 
5555  | 0  |     PyObject *rep = NULL;  | 
5556  |  | 
  | 
5557  | 0  |     if (!PyUnicode_Check(str)) { | 
5558  | 0  |         PyErr_BadArgument();  | 
5559  | 0  |         return NULL;  | 
5560  | 0  |     }  | 
5561  | 0  |     if (PyUnicode_READY(str) == -1)  | 
5562  | 0  |         return NULL;  | 
5563  | 0  |     kind = PyUnicode_KIND(str);  | 
5564  | 0  |     data = PyUnicode_DATA(str);  | 
5565  | 0  |     len = PyUnicode_GET_LENGTH(str);  | 
5566  |  | 
  | 
5567  | 0  |     if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))  | 
5568  | 0  |         return PyErr_NoMemory();  | 
5569  | 0  |     nsize = len + (byteorder == 0);  | 
5570  | 0  |     v = PyBytes_FromStringAndSize(NULL, nsize * 4);  | 
5571  | 0  |     if (v == NULL)  | 
5572  | 0  |         return NULL;  | 
5573  |  |  | 
5574  |  |     /* output buffer is 4-bytes aligned */  | 
5575  | 0  |     assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));  | 
5576  | 0  |     out = (uint32_t *)PyBytes_AS_STRING(v);  | 
5577  | 0  |     if (byteorder == 0)  | 
5578  | 0  |         *out++ = 0xFEFF;  | 
5579  | 0  |     if (len == 0)  | 
5580  | 0  |         goto done;  | 
5581  |  |  | 
5582  | 0  |     if (byteorder == -1)  | 
5583  | 0  |         encoding = "utf-32-le";  | 
5584  | 0  |     else if (byteorder == 1)  | 
5585  | 0  |         encoding = "utf-32-be";  | 
5586  | 0  |     else  | 
5587  | 0  |         encoding = "utf-32";  | 
5588  |  | 
  | 
5589  | 0  |     if (kind == PyUnicode_1BYTE_KIND) { | 
5590  | 0  |         ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);  | 
5591  | 0  |         goto done;  | 
5592  | 0  |     }  | 
5593  |  |  | 
5594  | 0  |     pos = 0;  | 
5595  | 0  |     while (pos < len) { | 
5596  | 0  |         Py_ssize_t repsize, moreunits;  | 
5597  |  | 
  | 
5598  | 0  |         if (kind == PyUnicode_2BYTE_KIND) { | 
5599  | 0  |             pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,  | 
5600  | 0  |                                         &out, native_ordering);  | 
5601  | 0  |         }  | 
5602  | 0  |         else { | 
5603  | 0  |             assert(kind == PyUnicode_4BYTE_KIND);  | 
5604  | 0  |             pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,  | 
5605  | 0  |                                         &out, native_ordering);  | 
5606  | 0  |         }  | 
5607  | 0  |         if (pos == len)  | 
5608  | 0  |             break;  | 
5609  |  |  | 
5610  | 0  |         rep = unicode_encode_call_errorhandler(  | 
5611  | 0  |                 errors, &errorHandler,  | 
5612  | 0  |                 encoding, "surrogates not allowed",  | 
5613  | 0  |                 str, &exc, pos, pos + 1, &pos);  | 
5614  | 0  |         if (!rep)  | 
5615  | 0  |             goto error;  | 
5616  |  |  | 
5617  | 0  |         if (PyBytes_Check(rep)) { | 
5618  | 0  |             repsize = PyBytes_GET_SIZE(rep);  | 
5619  | 0  |             if (repsize & 3) { | 
5620  | 0  |                 raise_encode_exception(&exc, encoding,  | 
5621  | 0  |                                        str, pos - 1, pos,  | 
5622  | 0  |                                        "surrogates not allowed");  | 
5623  | 0  |                 goto error;  | 
5624  | 0  |             }  | 
5625  | 0  |             moreunits = repsize / 4;  | 
5626  | 0  |         }  | 
5627  | 0  |         else { | 
5628  | 0  |             assert(PyUnicode_Check(rep));  | 
5629  | 0  |             if (PyUnicode_READY(rep) < 0)  | 
5630  | 0  |                 goto error;  | 
5631  | 0  |             moreunits = repsize = PyUnicode_GET_LENGTH(rep);  | 
5632  | 0  |             if (!PyUnicode_IS_ASCII(rep)) { | 
5633  | 0  |                 raise_encode_exception(&exc, encoding,  | 
5634  | 0  |                                        str, pos - 1, pos,  | 
5635  | 0  |                                        "surrogates not allowed");  | 
5636  | 0  |                 goto error;  | 
5637  | 0  |             }  | 
5638  | 0  |         }  | 
5639  |  |  | 
5640  |  |         /* four bytes are reserved for each surrogate */  | 
5641  | 0  |         if (moreunits > 1) { | 
5642  | 0  |             Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);  | 
5643  | 0  |             if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) { | 
5644  |  |                 /* integer overflow */  | 
5645  | 0  |                 PyErr_NoMemory();  | 
5646  | 0  |                 goto error;  | 
5647  | 0  |             }  | 
5648  | 0  |             if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)  | 
5649  | 0  |                 goto error;  | 
5650  | 0  |             out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;  | 
5651  | 0  |         }  | 
5652  |  |  | 
5653  | 0  |         if (PyBytes_Check(rep)) { | 
5654  | 0  |             memcpy(out, PyBytes_AS_STRING(rep), repsize);  | 
5655  | 0  |             out += moreunits;  | 
5656  | 0  |         } else /* rep is unicode */ { | 
5657  | 0  |             assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);  | 
5658  | 0  |             ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,  | 
5659  | 0  |                                  &out, native_ordering);  | 
5660  | 0  |         }  | 
5661  |  | 
  | 
5662  | 0  |         Py_CLEAR(rep);  | 
5663  | 0  |     }  | 
5664  |  |  | 
5665  |  |     /* Cut back to size actually needed. This is necessary for, for example,  | 
5666  |  |        encoding of a string containing isolated surrogates and the 'ignore'  | 
5667  |  |        handler is used. */  | 
5668  | 0  |     nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);  | 
5669  | 0  |     if (nsize != PyBytes_GET_SIZE(v))  | 
5670  | 0  |       _PyBytes_Resize(&v, nsize);  | 
5671  | 0  |     Py_XDECREF(errorHandler);  | 
5672  | 0  |     Py_XDECREF(exc);  | 
5673  | 0  |   done:  | 
5674  | 0  |     return v;  | 
5675  | 0  |   error:  | 
5676  | 0  |     Py_XDECREF(rep);  | 
5677  | 0  |     Py_XDECREF(errorHandler);  | 
5678  | 0  |     Py_XDECREF(exc);  | 
5679  | 0  |     Py_XDECREF(v);  | 
5680  | 0  |     return NULL;  | 
5681  | 0  | }  | 
5682  |  |  | 
5683  |  | PyObject *  | 
5684  |  | PyUnicode_EncodeUTF32(const Py_UNICODE *s,  | 
5685  |  |                       Py_ssize_t size,  | 
5686  |  |                       const char *errors,  | 
5687  |  |                       int byteorder)  | 
5688  | 0  | { | 
5689  | 0  |     PyObject *result;  | 
5690  | 0  |     PyObject *tmp = PyUnicode_FromWideChar(s, size);  | 
5691  | 0  |     if (tmp == NULL)  | 
5692  | 0  |         return NULL;  | 
5693  | 0  |     result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);  | 
5694  | 0  |     Py_DECREF(tmp);  | 
5695  | 0  |     return result;  | 
5696  | 0  | }  | 
5697  |  |  | 
5698  |  | PyObject *  | 
5699  |  | PyUnicode_AsUTF32String(PyObject *unicode)  | 
5700  | 0  | { | 
5701  | 0  |     return _PyUnicode_EncodeUTF32(unicode, NULL, 0);  | 
5702  | 0  | }  | 
5703  |  |  | 
5704  |  | /* --- UTF-16 Codec ------------------------------------------------------- */  | 
5705  |  |  | 
5706  |  | PyObject *  | 
5707  |  | PyUnicode_DecodeUTF16(const char *s,  | 
5708  |  |                       Py_ssize_t size,  | 
5709  |  |                       const char *errors,  | 
5710  |  |                       int *byteorder)  | 
5711  | 0  | { | 
5712  | 0  |     return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);  | 
5713  | 0  | }  | 
5714  |  |  | 
5715  |  | PyObject *  | 
5716  |  | PyUnicode_DecodeUTF16Stateful(const char *s,  | 
5717  |  |                               Py_ssize_t size,  | 
5718  |  |                               const char *errors,  | 
5719  |  |                               int *byteorder,  | 
5720  |  |                               Py_ssize_t *consumed)  | 
5721  | 0  | { | 
5722  | 0  |     const char *starts = s;  | 
5723  | 0  |     Py_ssize_t startinpos;  | 
5724  | 0  |     Py_ssize_t endinpos;  | 
5725  | 0  |     _PyUnicodeWriter writer;  | 
5726  | 0  |     const unsigned char *q, *e;  | 
5727  | 0  |     int bo = 0;       /* assume native ordering by default */  | 
5728  | 0  |     int native_ordering;  | 
5729  | 0  |     const char *errmsg = "";  | 
5730  | 0  |     PyObject *errorHandler = NULL;  | 
5731  | 0  |     PyObject *exc = NULL;  | 
5732  | 0  |     const char *encoding;  | 
5733  |  | 
  | 
5734  | 0  |     q = (const unsigned char *)s;  | 
5735  | 0  |     e = q + size;  | 
5736  |  | 
  | 
5737  | 0  |     if (byteorder)  | 
5738  | 0  |         bo = *byteorder;  | 
5739  |  |  | 
5740  |  |     /* Check for BOM marks (U+FEFF) in the input and adjust current  | 
5741  |  |        byte order setting accordingly. In native mode, the leading BOM  | 
5742  |  |        mark is skipped, in all other modes, it is copied to the output  | 
5743  |  |        stream as-is (giving a ZWNBSP character). */  | 
5744  | 0  |     if (bo == 0 && size >= 2) { | 
5745  | 0  |         const Py_UCS4 bom = (q[1] << 8) | q[0];  | 
5746  | 0  |         if (bom == 0xFEFF) { | 
5747  | 0  |             q += 2;  | 
5748  | 0  |             bo = -1;  | 
5749  | 0  |         }  | 
5750  | 0  |         else if (bom == 0xFFFE) { | 
5751  | 0  |             q += 2;  | 
5752  | 0  |             bo = 1;  | 
5753  | 0  |         }  | 
5754  | 0  |         if (byteorder)  | 
5755  | 0  |             *byteorder = bo;  | 
5756  | 0  |     }  | 
5757  |  | 
  | 
5758  | 0  |     if (q == e) { | 
5759  | 0  |         if (consumed)  | 
5760  | 0  |             *consumed = size;  | 
5761  | 0  |         _Py_RETURN_UNICODE_EMPTY();  | 
5762  | 0  |     }  | 
5763  |  |  | 
5764  | 0  | #if PY_LITTLE_ENDIAN  | 
5765  | 0  |     native_ordering = bo <= 0;  | 
5766  | 0  |     encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";  | 
5767  |  | #else  | 
5768  |  |     native_ordering = bo >= 0;  | 
5769  |  |     encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";  | 
5770  |  | #endif  | 
5771  |  |  | 
5772  |  |     /* Note: size will always be longer than the resulting Unicode  | 
5773  |  |        character count normally.  Error handler will take care of  | 
5774  |  |        resizing when needed. */  | 
5775  | 0  |     _PyUnicodeWriter_Init(&writer);  | 
5776  | 0  |     writer.min_length = (e - q + 1) / 2;  | 
5777  | 0  |     if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)  | 
5778  | 0  |         goto onError;  | 
5779  |  |  | 
5780  | 0  |     while (1) { | 
5781  | 0  |         Py_UCS4 ch = 0;  | 
5782  | 0  |         if (e - q >= 2) { | 
5783  | 0  |             int kind = writer.kind;  | 
5784  | 0  |             if (kind == PyUnicode_1BYTE_KIND) { | 
5785  | 0  |                 if (PyUnicode_IS_ASCII(writer.buffer))  | 
5786  | 0  |                     ch = asciilib_utf16_decode(&q, e,  | 
5787  | 0  |                             (Py_UCS1*)writer.data, &writer.pos,  | 
5788  | 0  |                             native_ordering);  | 
5789  | 0  |                 else  | 
5790  | 0  |                     ch = ucs1lib_utf16_decode(&q, e,  | 
5791  | 0  |                             (Py_UCS1*)writer.data, &writer.pos,  | 
5792  | 0  |                             native_ordering);  | 
5793  | 0  |             } else if (kind == PyUnicode_2BYTE_KIND) { | 
5794  | 0  |                 ch = ucs2lib_utf16_decode(&q, e,  | 
5795  | 0  |                         (Py_UCS2*)writer.data, &writer.pos,  | 
5796  | 0  |                         native_ordering);  | 
5797  | 0  |             } else { | 
5798  | 0  |                 assert(kind == PyUnicode_4BYTE_KIND);  | 
5799  | 0  |                 ch = ucs4lib_utf16_decode(&q, e,  | 
5800  | 0  |                         (Py_UCS4*)writer.data, &writer.pos,  | 
5801  | 0  |                         native_ordering);  | 
5802  | 0  |             }  | 
5803  | 0  |         }  | 
5804  |  | 
  | 
5805  | 0  |         switch (ch)  | 
5806  | 0  |         { | 
5807  | 0  |         case 0:  | 
5808  |  |             /* remaining byte at the end? (size should be even) */  | 
5809  | 0  |             if (q == e || consumed)  | 
5810  | 0  |                 goto End;  | 
5811  | 0  |             errmsg = "truncated data";  | 
5812  | 0  |             startinpos = ((const char *)q) - starts;  | 
5813  | 0  |             endinpos = ((const char *)e) - starts;  | 
5814  | 0  |             break;  | 
5815  |  |             /* The remaining input chars are ignored if the callback  | 
5816  |  |                chooses to skip the input */  | 
5817  | 0  |         case 1:  | 
5818  | 0  |             q -= 2;  | 
5819  | 0  |             if (consumed)  | 
5820  | 0  |                 goto End;  | 
5821  | 0  |             errmsg = "unexpected end of data";  | 
5822  | 0  |             startinpos = ((const char *)q) - starts;  | 
5823  | 0  |             endinpos = ((const char *)e) - starts;  | 
5824  | 0  |             break;  | 
5825  | 0  |         case 2:  | 
5826  | 0  |             errmsg = "illegal encoding";  | 
5827  | 0  |             startinpos = ((const char *)q) - 2 - starts;  | 
5828  | 0  |             endinpos = startinpos + 2;  | 
5829  | 0  |             break;  | 
5830  | 0  |         case 3:  | 
5831  | 0  |             errmsg = "illegal UTF-16 surrogate";  | 
5832  | 0  |             startinpos = ((const char *)q) - 4 - starts;  | 
5833  | 0  |             endinpos = startinpos + 2;  | 
5834  | 0  |             break;  | 
5835  | 0  |         default:  | 
5836  | 0  |             if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)  | 
5837  | 0  |                 goto onError;  | 
5838  | 0  |             continue;  | 
5839  | 0  |         }  | 
5840  |  |  | 
5841  | 0  |         if (unicode_decode_call_errorhandler_writer(  | 
5842  | 0  |                 errors,  | 
5843  | 0  |                 &errorHandler,  | 
5844  | 0  |                 encoding, errmsg,  | 
5845  | 0  |                 &starts,  | 
5846  | 0  |                 (const char **)&e,  | 
5847  | 0  |                 &startinpos,  | 
5848  | 0  |                 &endinpos,  | 
5849  | 0  |                 &exc,  | 
5850  | 0  |                 (const char **)&q,  | 
5851  | 0  |                 &writer))  | 
5852  | 0  |             goto onError;  | 
5853  | 0  |     }  | 
5854  |  |  | 
5855  | 0  | End:  | 
5856  | 0  |     if (consumed)  | 
5857  | 0  |         *consumed = (const char *)q-starts;  | 
5858  |  | 
  | 
5859  | 0  |     Py_XDECREF(errorHandler);  | 
5860  | 0  |     Py_XDECREF(exc);  | 
5861  | 0  |     return _PyUnicodeWriter_Finish(&writer);  | 
5862  |  |  | 
5863  | 0  |   onError:  | 
5864  | 0  |     _PyUnicodeWriter_Dealloc(&writer);  | 
5865  | 0  |     Py_XDECREF(errorHandler);  | 
5866  | 0  |     Py_XDECREF(exc);  | 
5867  | 0  |     return NULL;  | 
5868  | 0  | }  | 
5869  |  |  | 
5870  |  | PyObject *  | 
5871  |  | _PyUnicode_EncodeUTF16(PyObject *str,  | 
5872  |  |                        const char *errors,  | 
5873  |  |                        int byteorder)  | 
5874  | 0  | { | 
5875  | 0  |     enum PyUnicode_Kind kind;  | 
5876  | 0  |     const void *data;  | 
5877  | 0  |     Py_ssize_t len;  | 
5878  | 0  |     PyObject *v;  | 
5879  | 0  |     unsigned short *out;  | 
5880  | 0  |     Py_ssize_t pairs;  | 
5881  |  | #if PY_BIG_ENDIAN  | 
5882  |  |     int native_ordering = byteorder >= 0;  | 
5883  |  | #else  | 
5884  | 0  |     int native_ordering = byteorder <= 0;  | 
5885  | 0  | #endif  | 
5886  | 0  |     const char *encoding;  | 
5887  | 0  |     Py_ssize_t nsize, pos;  | 
5888  | 0  |     PyObject *errorHandler = NULL;  | 
5889  | 0  |     PyObject *exc = NULL;  | 
5890  | 0  |     PyObject *rep = NULL;  | 
5891  |  | 
  | 
5892  | 0  |     if (!PyUnicode_Check(str)) { | 
5893  | 0  |         PyErr_BadArgument();  | 
5894  | 0  |         return NULL;  | 
5895  | 0  |     }  | 
5896  | 0  |     if (PyUnicode_READY(str) == -1)  | 
5897  | 0  |         return NULL;  | 
5898  | 0  |     kind = PyUnicode_KIND(str);  | 
5899  | 0  |     data = PyUnicode_DATA(str);  | 
5900  | 0  |     len = PyUnicode_GET_LENGTH(str);  | 
5901  |  | 
  | 
5902  | 0  |     pairs = 0;  | 
5903  | 0  |     if (kind == PyUnicode_4BYTE_KIND) { | 
5904  | 0  |         const Py_UCS4 *in = (const Py_UCS4 *)data;  | 
5905  | 0  |         const Py_UCS4 *end = in + len;  | 
5906  | 0  |         while (in < end) { | 
5907  | 0  |             if (*in++ >= 0x10000) { | 
5908  | 0  |                 pairs++;  | 
5909  | 0  |             }  | 
5910  | 0  |         }  | 
5911  | 0  |     }  | 
5912  | 0  |     if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) { | 
5913  | 0  |         return PyErr_NoMemory();  | 
5914  | 0  |     }  | 
5915  | 0  |     nsize = len + pairs + (byteorder == 0);  | 
5916  | 0  |     v = PyBytes_FromStringAndSize(NULL, nsize * 2);  | 
5917  | 0  |     if (v == NULL) { | 
5918  | 0  |         return NULL;  | 
5919  | 0  |     }  | 
5920  |  |  | 
5921  |  |     /* output buffer is 2-bytes aligned */  | 
5922  | 0  |     assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));  | 
5923  | 0  |     out = (unsigned short *)PyBytes_AS_STRING(v);  | 
5924  | 0  |     if (byteorder == 0) { | 
5925  | 0  |         *out++ = 0xFEFF;  | 
5926  | 0  |     }  | 
5927  | 0  |     if (len == 0) { | 
5928  | 0  |         goto done;  | 
5929  | 0  |     }  | 
5930  |  |  | 
5931  | 0  |     if (kind == PyUnicode_1BYTE_KIND) { | 
5932  | 0  |         ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);  | 
5933  | 0  |         goto done;  | 
5934  | 0  |     }  | 
5935  |  |  | 
5936  | 0  |     if (byteorder < 0) { | 
5937  | 0  |         encoding = "utf-16-le";  | 
5938  | 0  |     }  | 
5939  | 0  |     else if (byteorder > 0) { | 
5940  | 0  |         encoding = "utf-16-be";  | 
5941  | 0  |     }  | 
5942  | 0  |     else { | 
5943  | 0  |         encoding = "utf-16";  | 
5944  | 0  |     }  | 
5945  |  | 
  | 
5946  | 0  |     pos = 0;  | 
5947  | 0  |     while (pos < len) { | 
5948  | 0  |         Py_ssize_t repsize, moreunits;  | 
5949  |  | 
  | 
5950  | 0  |         if (kind == PyUnicode_2BYTE_KIND) { | 
5951  | 0  |             pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,  | 
5952  | 0  |                                         &out, native_ordering);  | 
5953  | 0  |         }  | 
5954  | 0  |         else { | 
5955  | 0  |             assert(kind == PyUnicode_4BYTE_KIND);  | 
5956  | 0  |             pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,  | 
5957  | 0  |                                         &out, native_ordering);  | 
5958  | 0  |         }  | 
5959  | 0  |         if (pos == len)  | 
5960  | 0  |             break;  | 
5961  |  |  | 
5962  | 0  |         rep = unicode_encode_call_errorhandler(  | 
5963  | 0  |                 errors, &errorHandler,  | 
5964  | 0  |                 encoding, "surrogates not allowed",  | 
5965  | 0  |                 str, &exc, pos, pos + 1, &pos);  | 
5966  | 0  |         if (!rep)  | 
5967  | 0  |             goto error;  | 
5968  |  |  | 
5969  | 0  |         if (PyBytes_Check(rep)) { | 
5970  | 0  |             repsize = PyBytes_GET_SIZE(rep);  | 
5971  | 0  |             if (repsize & 1) { | 
5972  | 0  |                 raise_encode_exception(&exc, encoding,  | 
5973  | 0  |                                        str, pos - 1, pos,  | 
5974  | 0  |                                        "surrogates not allowed");  | 
5975  | 0  |                 goto error;  | 
5976  | 0  |             }  | 
5977  | 0  |             moreunits = repsize / 2;  | 
5978  | 0  |         }  | 
5979  | 0  |         else { | 
5980  | 0  |             assert(PyUnicode_Check(rep));  | 
5981  | 0  |             if (PyUnicode_READY(rep) < 0)  | 
5982  | 0  |                 goto error;  | 
5983  | 0  |             moreunits = repsize = PyUnicode_GET_LENGTH(rep);  | 
5984  | 0  |             if (!PyUnicode_IS_ASCII(rep)) { | 
5985  | 0  |                 raise_encode_exception(&exc, encoding,  | 
5986  | 0  |                                        str, pos - 1, pos,  | 
5987  | 0  |                                        "surrogates not allowed");  | 
5988  | 0  |                 goto error;  | 
5989  | 0  |             }  | 
5990  | 0  |         }  | 
5991  |  |  | 
5992  |  |         /* two bytes are reserved for each surrogate */  | 
5993  | 0  |         if (moreunits > 1) { | 
5994  | 0  |             Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);  | 
5995  | 0  |             if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) { | 
5996  |  |                 /* integer overflow */  | 
5997  | 0  |                 PyErr_NoMemory();  | 
5998  | 0  |                 goto error;  | 
5999  | 0  |             }  | 
6000  | 0  |             if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)  | 
6001  | 0  |                 goto error;  | 
6002  | 0  |             out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;  | 
6003  | 0  |         }  | 
6004  |  |  | 
6005  | 0  |         if (PyBytes_Check(rep)) { | 
6006  | 0  |             memcpy(out, PyBytes_AS_STRING(rep), repsize);  | 
6007  | 0  |             out += moreunits;  | 
6008  | 0  |         } else /* rep is unicode */ { | 
6009  | 0  |             assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);  | 
6010  | 0  |             ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,  | 
6011  | 0  |                                  &out, native_ordering);  | 
6012  | 0  |         }  | 
6013  |  | 
  | 
6014  | 0  |         Py_CLEAR(rep);  | 
6015  | 0  |     }  | 
6016  |  |  | 
6017  |  |     /* Cut back to size actually needed. This is necessary for, for example,  | 
6018  |  |     encoding of a string containing isolated surrogates and the 'ignore' handler  | 
6019  |  |     is used. */  | 
6020  | 0  |     nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);  | 
6021  | 0  |     if (nsize != PyBytes_GET_SIZE(v))  | 
6022  | 0  |       _PyBytes_Resize(&v, nsize);  | 
6023  | 0  |     Py_XDECREF(errorHandler);  | 
6024  | 0  |     Py_XDECREF(exc);  | 
6025  | 0  |   done:  | 
6026  | 0  |     return v;  | 
6027  | 0  |   error:  | 
6028  | 0  |     Py_XDECREF(rep);  | 
6029  | 0  |     Py_XDECREF(errorHandler);  | 
6030  | 0  |     Py_XDECREF(exc);  | 
6031  | 0  |     Py_XDECREF(v);  | 
6032  | 0  |     return NULL;  | 
6033  | 0  | #undef STORECHAR  | 
6034  | 0  | }  | 
6035  |  |  | 
6036  |  | PyObject *  | 
6037  |  | PyUnicode_EncodeUTF16(const Py_UNICODE *s,  | 
6038  |  |                       Py_ssize_t size,  | 
6039  |  |                       const char *errors,  | 
6040  |  |                       int byteorder)  | 
6041  | 0  | { | 
6042  | 0  |     PyObject *result;  | 
6043  | 0  |     PyObject *tmp = PyUnicode_FromWideChar(s, size);  | 
6044  | 0  |     if (tmp == NULL)  | 
6045  | 0  |         return NULL;  | 
6046  | 0  |     result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);  | 
6047  | 0  |     Py_DECREF(tmp);  | 
6048  | 0  |     return result;  | 
6049  | 0  | }  | 
6050  |  |  | 
6051  |  | PyObject *  | 
6052  |  | PyUnicode_AsUTF16String(PyObject *unicode)  | 
6053  | 0  | { | 
6054  | 0  |     return _PyUnicode_EncodeUTF16(unicode, NULL, 0);  | 
6055  | 0  | }  | 
6056  |  |  | 
6057  |  | /* --- Unicode Escape Codec ----------------------------------------------- */  | 
6058  |  |  | 
/* File-local pointer to a _PyUnicode_Name_CAPI structure; starts out NULL.
   NOTE(review): presumably filled in on first use by the escape decoder for
   character-name lookups -- the assignment is not visible in this part of
   the file, confirm. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
6060  |  |  | 
6061  |  | PyObject *  | 
6062  |  | _PyUnicode_DecodeUnicodeEscape(const char *s,  | 
6063  |  |                                Py_ssize_t size,  | 
6064  |  |                                const char *errors,  | 
6065  |  |                                const char **first_invalid_escape)  | 
6066  | 4  | { | 
6067  | 4  |     const char *starts = s;  | 
6068  | 4  |     _PyUnicodeWriter writer;  | 
6069  | 4  |     const char *end;  | 
6070  | 4  |     PyObject *errorHandler = NULL;  | 
6071  | 4  |     PyObject *exc = NULL;  | 
6072  |  |  | 
6073  |  |     // so we can remember if we've seen an invalid escape char or not  | 
6074  | 4  |     *first_invalid_escape = NULL;  | 
6075  |  |  | 
6076  | 4  |     if (size == 0) { | 
6077  | 0  |         _Py_RETURN_UNICODE_EMPTY();  | 
6078  | 0  |     }  | 
6079  |  |     /* Escaped strings will always be longer than the resulting  | 
6080  |  |        Unicode string, so we start with size here and then reduce the  | 
6081  |  |        length after conversion to the true value.  | 
6082  |  |        (but if the error callback returns a long replacement string  | 
6083  |  |        we'll have to allocate more space) */  | 
6084  | 4  |     _PyUnicodeWriter_Init(&writer);  | 
6085  | 4  |     writer.min_length = size;  | 
6086  | 4  |     if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) { | 
6087  | 0  |         goto onError;  | 
6088  | 0  |     }  | 
6089  |  |  | 
6090  | 4  |     end = s + size;  | 
6091  | 8  |     while (s < end) { | 
6092  | 4  |         unsigned char c = (unsigned char) *s++;  | 
6093  | 4  |         Py_UCS4 ch;  | 
6094  | 4  |         int count;  | 
6095  | 4  |         Py_ssize_t startinpos;  | 
6096  | 4  |         Py_ssize_t endinpos;  | 
6097  | 4  |         const char *message;  | 
6098  |  |  | 
6099  | 4  | #define WRITE_ASCII_CHAR(ch)                                                  \  | 
6100  | 4  |             do {                                                              \ | 
6101  | 2  |                 assert(ch <= 127);                                            \  | 
6102  | 2  |                 assert(writer.pos < writer.size);                             \  | 
6103  | 2  |                 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch);  \  | 
6104  | 2  |             } while(0)  | 
6105  |  |  | 
6106  | 4  | #define WRITE_CHAR(ch)                                                        \  | 
6107  | 4  |             do {                                                              \ | 
6108  | 2  |                 if (ch <= writer.maxchar) {                                   \ | 
6109  | 2  |                     assert(writer.pos < writer.size);                         \  | 
6110  | 2  |                     PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \  | 
6111  | 2  |                 }                                                             \  | 
6112  | 2  |                 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \ | 
6113  | 0  |                     goto onError;                                             \  | 
6114  | 0  |                 }                                                             \  | 
6115  | 2  |             } while(0)  | 
6116  |  |  | 
6117  |  |         /* Non-escape characters are interpreted as Unicode ordinals */  | 
6118  | 4  |         if (c != '\\') { | 
6119  | 0  |             WRITE_CHAR(c);  | 
6120  | 0  |             continue;  | 
6121  | 0  |         }  | 
6122  |  |  | 
6123  | 4  |         startinpos = s - starts - 1;  | 
6124  |  |         /* \ - Escapes */  | 
6125  | 4  |         if (s >= end) { | 
6126  | 0  |             message = "\\ at end of string";  | 
6127  | 0  |             goto error;  | 
6128  | 0  |         }  | 
6129  | 4  |         c = (unsigned char) *s++;  | 
6130  |  |  | 
6131  | 4  |         assert(writer.pos < writer.size);  | 
6132  | 4  |         switch (c) { | 
6133  |  |  | 
6134  |  |             /* \x escapes */  | 
6135  | 0  |         case '\n': continue;  | 
6136  | 0  |         case '\\': WRITE_ASCII_CHAR('\\'); continue; | 
6137  | 0  |         case '\'': WRITE_ASCII_CHAR('\''); continue; | 
6138  | 0  |         case '\"': WRITE_ASCII_CHAR('\"'); continue; | 
6139  | 0  |         case 'b': WRITE_ASCII_CHAR('\b'); continue; | 
6140  |  |         /* FF */  | 
6141  | 0  |         case 'f': WRITE_ASCII_CHAR('\014'); continue; | 
6142  | 0  |         case 't': WRITE_ASCII_CHAR('\t'); continue; | 
6143  | 2  |         case 'n': WRITE_ASCII_CHAR('\n'); continue; | 
6144  | 2  |         case 'r': WRITE_ASCII_CHAR('\r'); continue; | 
6145  |  |         /* VT */  | 
6146  | 0  |         case 'v': WRITE_ASCII_CHAR('\013'); continue; | 
6147  |  |         /* BEL, not classic C */  | 
6148  | 0  |         case 'a': WRITE_ASCII_CHAR('\007'); continue; | 
6149  |  |  | 
6150  |  |             /* \OOO (octal) escapes */  | 
6151  | 2  |         case '0': case '1': case '2': case '3':  | 
6152  | 2  |         case '4': case '5': case '6': case '7':  | 
6153  | 2  |             ch = c - '0';  | 
6154  | 2  |             if (s < end && '0' <= *s && *s <= '7') { | 
6155  | 0  |                 ch = (ch<<3) + *s++ - '0';  | 
6156  | 0  |                 if (s < end && '0' <= *s && *s <= '7') { | 
6157  | 0  |                     ch = (ch<<3) + *s++ - '0';  | 
6158  | 0  |                 }  | 
6159  | 0  |             }  | 
6160  | 2  |             WRITE_CHAR(ch);  | 
6161  | 2  |             continue;  | 
6162  |  |  | 
6163  |  |             /* hex escapes */  | 
6164  |  |             /* \xXX */  | 
6165  | 2  |         case 'x':  | 
6166  | 0  |             count = 2;  | 
6167  | 0  |             message = "truncated \\xXX escape";  | 
6168  | 0  |             goto hexescape;  | 
6169  |  |  | 
6170  |  |             /* \uXXXX */  | 
6171  | 0  |         case 'u':  | 
6172  | 0  |             count = 4;  | 
6173  | 0  |             message = "truncated \\uXXXX escape";  | 
6174  | 0  |             goto hexescape;  | 
6175  |  |  | 
6176  |  |             /* \UXXXXXXXX */  | 
6177  | 0  |         case 'U':  | 
6178  | 0  |             count = 8;  | 
6179  | 0  |             message = "truncated \\UXXXXXXXX escape";  | 
6180  | 0  |         hexescape:  | 
6181  | 0  |             for (ch = 0; count && s < end; ++s, --count) { | 
6182  | 0  |                 c = (unsigned char)*s;  | 
6183  | 0  |                 ch <<= 4;  | 
6184  | 0  |                 if (c >= '0' && c <= '9') { | 
6185  | 0  |                     ch += c - '0';  | 
6186  | 0  |                 }  | 
6187  | 0  |                 else if (c >= 'a' && c <= 'f') { | 
6188  | 0  |                     ch += c - ('a' - 10); | 
6189  | 0  |                 }  | 
6190  | 0  |                 else if (c >= 'A' && c <= 'F') { | 
6191  | 0  |                     ch += c - ('A' - 10); | 
6192  | 0  |                 }  | 
6193  | 0  |                 else { | 
6194  | 0  |                     break;  | 
6195  | 0  |                 }  | 
6196  | 0  |             }  | 
6197  | 0  |             if (count) { | 
6198  | 0  |                 goto error;  | 
6199  | 0  |             }  | 
6200  |  |  | 
6201  |  |             /* when we get here, ch is a 32-bit unicode character */  | 
6202  | 0  |             if (ch > MAX_UNICODE) { | 
6203  | 0  |                 message = "illegal Unicode character";  | 
6204  | 0  |                 goto error;  | 
6205  | 0  |             }  | 
6206  |  |  | 
6207  | 0  |             WRITE_CHAR(ch);  | 
6208  | 0  |             continue;  | 
6209  |  |  | 
6210  |  |             /* \N{name} */ | 
6211  | 0  |         case 'N':  | 
6212  | 0  |             if (ucnhash_CAPI == NULL) { | 
6213  |  |                 /* load the unicode data module */  | 
6214  | 0  |                 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(  | 
6215  | 0  |                                                 PyUnicodeData_CAPSULE_NAME, 1);  | 
6216  | 0  |                 if (ucnhash_CAPI == NULL) { | 
6217  | 0  |                     PyErr_SetString(  | 
6218  | 0  |                         PyExc_UnicodeError,  | 
6219  | 0  |                         "\\N escapes not supported (can't load unicodedata module)"  | 
6220  | 0  |                         );  | 
6221  | 0  |                     goto onError;  | 
6222  | 0  |                 }  | 
6223  | 0  |             }  | 
6224  |  |  | 
6225  | 0  |             message = "malformed \\N character escape";  | 
6226  | 0  |             if (s < end && *s == '{') { | 
6227  | 0  |                 const char *start = ++s;  | 
6228  | 0  |                 size_t namelen;  | 
6229  |  |                 /* look for the closing brace */  | 
6230  | 0  |                 while (s < end && *s != '}')  | 
6231  | 0  |                     s++;  | 
6232  | 0  |                 namelen = s - start;  | 
6233  | 0  |                 if (namelen && s < end) { | 
6234  |  |                     /* found a name.  look it up in the unicode database */  | 
6235  | 0  |                     s++;  | 
6236  | 0  |                     ch = 0xffffffff; /* in case 'getcode' messes up */  | 
6237  | 0  |                     if (namelen <= INT_MAX &&  | 
6238  | 0  |                         ucnhash_CAPI->getcode(NULL, start, (int)namelen,  | 
6239  | 0  |                                               &ch, 0)) { | 
6240  | 0  |                         assert(ch <= MAX_UNICODE);  | 
6241  | 0  |                         WRITE_CHAR(ch);  | 
6242  | 0  |                         continue;  | 
6243  | 0  |                     }  | 
6244  | 0  |                     message = "unknown Unicode character name";  | 
6245  | 0  |                 }  | 
6246  | 0  |             }  | 
6247  | 0  |             goto error;  | 
6248  |  |  | 
6249  | 0  |         default:  | 
6250  | 0  |             if (*first_invalid_escape == NULL) { | 
6251  | 0  |                 *first_invalid_escape = s-1; /* Back up one char, since we've  | 
6252  |  |                                                 already incremented s. */  | 
6253  | 0  |             }  | 
6254  | 0  |             WRITE_ASCII_CHAR('\\'); | 
6255  | 0  |             WRITE_CHAR(c);  | 
6256  | 0  |             continue;  | 
6257  | 4  |         }  | 
6258  |  |  | 
6259  | 0  |       error:  | 
6260  | 0  |         endinpos = s-starts;  | 
6261  | 0  |         writer.min_length = end - s + writer.pos;  | 
6262  | 0  |         if (unicode_decode_call_errorhandler_writer(  | 
6263  | 0  |                 errors, &errorHandler,  | 
6264  | 0  |                 "unicodeescape", message,  | 
6265  | 0  |                 &starts, &end, &startinpos, &endinpos, &exc, &s,  | 
6266  | 0  |                 &writer)) { | 
6267  | 0  |             goto onError;  | 
6268  | 0  |         }  | 
6269  | 0  |         assert(end - s <= writer.size - writer.pos);  | 
6270  |  | 
  | 
6271  | 0  | #undef WRITE_ASCII_CHAR  | 
6272  | 0  | #undef WRITE_CHAR  | 
6273  | 0  |     }  | 
6274  |  |  | 
6275  | 4  |     Py_XDECREF(errorHandler);  | 
6276  | 4  |     Py_XDECREF(exc);  | 
6277  | 4  |     return _PyUnicodeWriter_Finish(&writer);  | 
6278  |  |  | 
6279  | 0  |   onError:  | 
6280  | 0  |     _PyUnicodeWriter_Dealloc(&writer);  | 
6281  | 0  |     Py_XDECREF(errorHandler);  | 
6282  | 0  |     Py_XDECREF(exc);  | 
6283  | 0  |     return NULL;  | 
6284  | 4  | }  | 
6285  |  |  | 
6286  |  | PyObject *  | 
6287  |  | PyUnicode_DecodeUnicodeEscape(const char *s,  | 
6288  |  |                               Py_ssize_t size,  | 
6289  |  |                               const char *errors)  | 
6290  | 0  | { | 
6291  | 0  |     const char *first_invalid_escape;  | 
6292  | 0  |     PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,  | 
6293  | 0  |                                                       &first_invalid_escape);  | 
6294  | 0  |     if (result == NULL)  | 
6295  | 0  |         return NULL;  | 
6296  | 0  |     if (first_invalid_escape != NULL) { | 
6297  | 0  |         if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,  | 
6298  | 0  |                              "invalid escape sequence '\\%c'",  | 
6299  | 0  |                              (unsigned char)*first_invalid_escape) < 0) { | 
6300  | 0  |             Py_DECREF(result);  | 
6301  | 0  |             return NULL;  | 
6302  | 0  |         }  | 
6303  | 0  |     }  | 
6304  | 0  |     return result;  | 
6305  | 0  | }  | 
6306  |  |  | 
6307  |  | /* Return a Unicode-Escape string version of the Unicode object. */  | 
6308  |  |  | 
6309  |  | PyObject *  | 
6310  |  | PyUnicode_AsUnicodeEscapeString(PyObject *unicode)  | 
6311  | 0  | { | 
6312  | 0  |     Py_ssize_t i, len;  | 
6313  | 0  |     PyObject *repr;  | 
6314  | 0  |     char *p;  | 
6315  | 0  |     enum PyUnicode_Kind kind;  | 
6316  | 0  |     void *data;  | 
6317  | 0  |     Py_ssize_t expandsize;  | 
6318  |  |  | 
6319  |  |     /* Initial allocation is based on the longest-possible character  | 
6320  |  |        escape.  | 
6321  |  |  | 
6322  |  |        For UCS1 strings it's '\xxx', 4 bytes per source character.  | 
6323  |  |        For UCS2 strings it's '\uxxxx', 6 bytes per source character.  | 
6324  |  |        For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.  | 
6325  |  |     */  | 
6326  |  | 
  | 
6327  | 0  |     if (!PyUnicode_Check(unicode)) { | 
6328  | 0  |         PyErr_BadArgument();  | 
6329  | 0  |         return NULL;  | 
6330  | 0  |     }  | 
6331  | 0  |     if (PyUnicode_READY(unicode) == -1) { | 
6332  | 0  |         return NULL;  | 
6333  | 0  |     }  | 
6334  |  |  | 
6335  | 0  |     len = PyUnicode_GET_LENGTH(unicode);  | 
6336  | 0  |     if (len == 0) { | 
6337  | 0  |         return PyBytes_FromStringAndSize(NULL, 0);  | 
6338  | 0  |     }  | 
6339  |  |  | 
6340  | 0  |     kind = PyUnicode_KIND(unicode);  | 
6341  | 0  |     data = PyUnicode_DATA(unicode);  | 
6342  |  |     /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6  | 
6343  |  |        bytes, and 1 byte characters 4. */  | 
6344  | 0  |     expandsize = kind * 2 + 2;  | 
6345  | 0  |     if (len > PY_SSIZE_T_MAX / expandsize) { | 
6346  | 0  |         return PyErr_NoMemory();  | 
6347  | 0  |     }  | 
6348  | 0  |     repr = PyBytes_FromStringAndSize(NULL, expandsize * len);  | 
6349  | 0  |     if (repr == NULL) { | 
6350  | 0  |         return NULL;  | 
6351  | 0  |     }  | 
6352  |  |  | 
6353  | 0  |     p = PyBytes_AS_STRING(repr);  | 
6354  | 0  |     for (i = 0; i < len; i++) { | 
6355  | 0  |         Py_UCS4 ch = PyUnicode_READ(kind, data, i);  | 
6356  |  |  | 
6357  |  |         /* U+0000-U+00ff range */  | 
6358  | 0  |         if (ch < 0x100) { | 
6359  | 0  |             if (ch >= ' ' && ch < 127) { | 
6360  | 0  |                 if (ch != '\\') { | 
6361  |  |                     /* Copy printable US ASCII as-is */  | 
6362  | 0  |                     *p++ = (char) ch;  | 
6363  | 0  |                 }  | 
6364  |  |                 /* Escape backslashes */  | 
6365  | 0  |                 else { | 
6366  | 0  |                     *p++ = '\\';  | 
6367  | 0  |                     *p++ = '\\';  | 
6368  | 0  |                 }  | 
6369  | 0  |             }  | 
6370  |  |  | 
6371  |  |             /* Map special whitespace to '\t', \n', '\r' */  | 
6372  | 0  |             else if (ch == '\t') { | 
6373  | 0  |                 *p++ = '\\';  | 
6374  | 0  |                 *p++ = 't';  | 
6375  | 0  |             }  | 
6376  | 0  |             else if (ch == '\n') { | 
6377  | 0  |                 *p++ = '\\';  | 
6378  | 0  |                 *p++ = 'n';  | 
6379  | 0  |             }  | 
6380  | 0  |             else if (ch == '\r') { | 
6381  | 0  |                 *p++ = '\\';  | 
6382  | 0  |                 *p++ = 'r';  | 
6383  | 0  |             }  | 
6384  |  |  | 
6385  |  |             /* Map non-printable US ASCII and 8-bit characters to '\xHH' */  | 
6386  | 0  |             else { | 
6387  | 0  |                 *p++ = '\\';  | 
6388  | 0  |                 *p++ = 'x';  | 
6389  | 0  |                 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];  | 
6390  | 0  |                 *p++ = Py_hexdigits[ch & 0x000F];  | 
6391  | 0  |             }  | 
6392  | 0  |         }  | 
6393  |  |         /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */  | 
6394  | 0  |         else if (ch < 0x10000) { | 
6395  | 0  |             *p++ = '\\';  | 
6396  | 0  |             *p++ = 'u';  | 
6397  | 0  |             *p++ = Py_hexdigits[(ch >> 12) & 0x000F];  | 
6398  | 0  |             *p++ = Py_hexdigits[(ch >> 8) & 0x000F];  | 
6399  | 0  |             *p++ = Py_hexdigits[(ch >> 4) & 0x000F];  | 
6400  | 0  |             *p++ = Py_hexdigits[ch & 0x000F];  | 
6401  | 0  |         }  | 
6402  |  |         /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */  | 
6403  | 0  |         else { | 
6404  |  |  | 
6405  |  |             /* Make sure that the first two digits are zero */  | 
6406  | 0  |             assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);  | 
6407  | 0  |             *p++ = '\\';  | 
6408  | 0  |             *p++ = 'U';  | 
6409  | 0  |             *p++ = '0';  | 
6410  | 0  |             *p++ = '0';  | 
6411  | 0  |             *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];  | 
6412  | 0  |             *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];  | 
6413  | 0  |             *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];  | 
6414  | 0  |             *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];  | 
6415  | 0  |             *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];  | 
6416  | 0  |             *p++ = Py_hexdigits[ch & 0x0000000F];  | 
6417  | 0  |         }  | 
6418  | 0  |     }  | 
6419  |  | 
  | 
6420  | 0  |     assert(p - PyBytes_AS_STRING(repr) > 0);  | 
6421  | 0  |     if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) { | 
6422  | 0  |         return NULL;  | 
6423  | 0  |     }  | 
6424  | 0  |     return repr;  | 
6425  | 0  | }  | 
6426  |  |  | 
6427  |  | PyObject *  | 
6428  |  | PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,  | 
6429  |  |                               Py_ssize_t size)  | 
6430  | 0  | { | 
6431  | 0  |     PyObject *result;  | 
6432  | 0  |     PyObject *tmp = PyUnicode_FromWideChar(s, size);  | 
6433  | 0  |     if (tmp == NULL) { | 
6434  | 0  |         return NULL;  | 
6435  | 0  |     }  | 
6436  |  |  | 
6437  | 0  |     result = PyUnicode_AsUnicodeEscapeString(tmp);  | 
6438  | 0  |     Py_DECREF(tmp);  | 
6439  | 0  |     return result;  | 
6440  | 0  | }  | 
6441  |  |  | 
6442  |  | /* --- Raw Unicode Escape Codec ------------------------------------------- */  | 
6443  |  |  | 
6444  |  | PyObject *  | 
6445  |  | PyUnicode_DecodeRawUnicodeEscape(const char *s,  | 
6446  |  |                                  Py_ssize_t size,  | 
6447  |  |                                  const char *errors)  | 
6448  | 0  | { | 
6449  | 0  |     const char *starts = s;  | 
6450  | 0  |     _PyUnicodeWriter writer;  | 
6451  | 0  |     const char *end;  | 
6452  | 0  |     PyObject *errorHandler = NULL;  | 
6453  | 0  |     PyObject *exc = NULL;  | 
6454  |  | 
  | 
6455  | 0  |     if (size == 0) { | 
6456  | 0  |         _Py_RETURN_UNICODE_EMPTY();  | 
6457  | 0  |     }  | 
6458  |  |  | 
6459  |  |     /* Escaped strings will always be longer than the resulting  | 
6460  |  |        Unicode string, so we start with size here and then reduce the  | 
6461  |  |        length after conversion to the true value. (But decoding error  | 
6462  |  |        handler might have to resize the string) */  | 
6463  | 0  |     _PyUnicodeWriter_Init(&writer);  | 
6464  | 0  |      writer.min_length = size;  | 
6465  | 0  |     if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) { | 
6466  | 0  |         goto onError;  | 
6467  | 0  |     }  | 
6468  |  |  | 
6469  | 0  |     end = s + size;  | 
6470  | 0  |     while (s < end) { | 
6471  | 0  |         unsigned char c = (unsigned char) *s++;  | 
6472  | 0  |         Py_UCS4 ch;  | 
6473  | 0  |         int count;  | 
6474  | 0  |         Py_ssize_t startinpos;  | 
6475  | 0  |         Py_ssize_t endinpos;  | 
6476  | 0  |         const char *message;  | 
6477  |  | 
  | 
6478  | 0  | #define WRITE_CHAR(ch)                                                        \  | 
6479  | 0  |             do {                                                              \ | 
6480  | 0  |                 if (ch <= writer.maxchar) {                                   \ | 
6481  | 0  |                     assert(writer.pos < writer.size);                         \  | 
6482  | 0  |                     PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \  | 
6483  | 0  |                 }                                                             \  | 
6484  | 0  |                 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \ | 
6485  | 0  |                     goto onError;                                             \  | 
6486  | 0  |                 }                                                             \  | 
6487  | 0  |             } while(0)  | 
6488  |  |  | 
6489  |  |         /* Non-escape characters are interpreted as Unicode ordinals */  | 
6490  | 0  |         if (c != '\\' || s >= end) { | 
6491  | 0  |             WRITE_CHAR(c);  | 
6492  | 0  |             continue;  | 
6493  | 0  |         }  | 
6494  |  |  | 
6495  | 0  |         c = (unsigned char) *s++;  | 
6496  | 0  |         if (c == 'u') { | 
6497  | 0  |             count = 4;  | 
6498  | 0  |             message = "truncated \\uXXXX escape";  | 
6499  | 0  |         }  | 
6500  | 0  |         else if (c == 'U') { | 
6501  | 0  |             count = 8;  | 
6502  | 0  |             message = "truncated \\UXXXXXXXX escape";  | 
6503  | 0  |         }  | 
6504  | 0  |         else { | 
6505  | 0  |             assert(writer.pos < writer.size);  | 
6506  | 0  |             PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');  | 
6507  | 0  |             WRITE_CHAR(c);  | 
6508  | 0  |             continue;  | 
6509  | 0  |         }  | 
6510  | 0  |         startinpos = s - starts - 2;  | 
6511  |  |  | 
6512  |  |         /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */  | 
6513  | 0  |         for (ch = 0; count && s < end; ++s, --count) { | 
6514  | 0  |             c = (unsigned char)*s;  | 
6515  | 0  |             ch <<= 4;  | 
6516  | 0  |             if (c >= '0' && c <= '9') { | 
6517  | 0  |                 ch += c - '0';  | 
6518  | 0  |             }  | 
6519  | 0  |             else if (c >= 'a' && c <= 'f') { | 
6520  | 0  |                 ch += c - ('a' - 10); | 
6521  | 0  |             }  | 
6522  | 0  |             else if (c >= 'A' && c <= 'F') { | 
6523  | 0  |                 ch += c - ('A' - 10); | 
6524  | 0  |             }  | 
6525  | 0  |             else { | 
6526  | 0  |                 break;  | 
6527  | 0  |             }  | 
6528  | 0  |         }  | 
6529  | 0  |         if (!count) { | 
6530  | 0  |             if (ch <= MAX_UNICODE) { | 
6531  | 0  |                 WRITE_CHAR(ch);  | 
6532  | 0  |                 continue;  | 
6533  | 0  |             }  | 
6534  | 0  |             message = "\\Uxxxxxxxx out of range";  | 
6535  | 0  |         }  | 
6536  |  |  | 
6537  | 0  |         endinpos = s-starts;  | 
6538  | 0  |         writer.min_length = end - s + writer.pos;  | 
6539  | 0  |         if (unicode_decode_call_errorhandler_writer(  | 
6540  | 0  |                 errors, &errorHandler,  | 
6541  | 0  |                 "rawunicodeescape", message,  | 
6542  | 0  |                 &starts, &end, &startinpos, &endinpos, &exc, &s,  | 
6543  | 0  |                 &writer)) { | 
6544  | 0  |             goto onError;  | 
6545  | 0  |         }  | 
6546  | 0  |         assert(end - s <= writer.size - writer.pos);  | 
6547  |  | 
  | 
6548  | 0  | #undef WRITE_CHAR  | 
6549  | 0  |     }  | 
6550  | 0  |     Py_XDECREF(errorHandler);  | 
6551  | 0  |     Py_XDECREF(exc);  | 
6552  | 0  |     return _PyUnicodeWriter_Finish(&writer);  | 
6553  |  |  | 
6554  | 0  |   onError:  | 
6555  | 0  |     _PyUnicodeWriter_Dealloc(&writer);  | 
6556  | 0  |     Py_XDECREF(errorHandler);  | 
6557  | 0  |     Py_XDECREF(exc);  | 
6558  | 0  |     return NULL;  | 
6559  |  | 
  | 
6560  | 0  | }  | 
6561  |  |  | 
6562  |  |  | 
6563  |  | PyObject *  | 
6564  |  | PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)  | 
6565  | 0  | { | 
6566  | 0  |     PyObject *repr;  | 
6567  | 0  |     char *p;  | 
6568  | 0  |     Py_ssize_t expandsize, pos;  | 
6569  | 0  |     int kind;  | 
6570  | 0  |     void *data;  | 
6571  | 0  |     Py_ssize_t len;  | 
6572  |  | 
  | 
6573  | 0  |     if (!PyUnicode_Check(unicode)) { | 
6574  | 0  |         PyErr_BadArgument();  | 
6575  | 0  |         return NULL;  | 
6576  | 0  |     }  | 
6577  | 0  |     if (PyUnicode_READY(unicode) == -1) { | 
6578  | 0  |         return NULL;  | 
6579  | 0  |     }  | 
6580  | 0  |     kind = PyUnicode_KIND(unicode);  | 
6581  | 0  |     data = PyUnicode_DATA(unicode);  | 
6582  | 0  |     len = PyUnicode_GET_LENGTH(unicode);  | 
6583  | 0  |     if (kind == PyUnicode_1BYTE_KIND) { | 
6584  | 0  |         return PyBytes_FromStringAndSize(data, len);  | 
6585  | 0  |     }  | 
6586  |  |  | 
6587  |  |     /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6  | 
6588  |  |        bytes, and 1 byte characters 4. */  | 
6589  | 0  |     expandsize = kind * 2 + 2;  | 
6590  |  | 
  | 
6591  | 0  |     if (len > PY_SSIZE_T_MAX / expandsize) { | 
6592  | 0  |         return PyErr_NoMemory();  | 
6593  | 0  |     }  | 
6594  | 0  |     repr = PyBytes_FromStringAndSize(NULL, expandsize * len);  | 
6595  | 0  |     if (repr == NULL) { | 
6596  | 0  |         return NULL;  | 
6597  | 0  |     }  | 
6598  | 0  |     if (len == 0) { | 
6599  | 0  |         return repr;  | 
6600  | 0  |     }  | 
6601  |  |  | 
6602  | 0  |     p = PyBytes_AS_STRING(repr);  | 
6603  | 0  |     for (pos = 0; pos < len; pos++) { | 
6604  | 0  |         Py_UCS4 ch = PyUnicode_READ(kind, data, pos);  | 
6605  |  |  | 
6606  |  |         /* U+0000-U+00ff range: Copy 8-bit characters as-is */  | 
6607  | 0  |         if (ch < 0x100) { | 
6608  | 0  |             *p++ = (char) ch;  | 
6609  | 0  |         }  | 
6610  |  |         /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */  | 
6611  | 0  |         else if (ch < 0x10000) { | 
6612  | 0  |             *p++ = '\\';  | 
6613  | 0  |             *p++ = 'u';  | 
6614  | 0  |             *p++ = Py_hexdigits[(ch >> 12) & 0xf];  | 
6615  | 0  |             *p++ = Py_hexdigits[(ch >> 8) & 0xf];  | 
6616  | 0  |             *p++ = Py_hexdigits[(ch >> 4) & 0xf];  | 
6617  | 0  |             *p++ = Py_hexdigits[ch & 15];  | 
6618  | 0  |         }  | 
6619  |  |         /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */  | 
6620  | 0  |         else { | 
6621  | 0  |             assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);  | 
6622  | 0  |             *p++ = '\\';  | 
6623  | 0  |             *p++ = 'U';  | 
6624  | 0  |             *p++ = '0';  | 
6625  | 0  |             *p++ = '0';  | 
6626  | 0  |             *p++ = Py_hexdigits[(ch >> 20) & 0xf];  | 
6627  | 0  |             *p++ = Py_hexdigits[(ch >> 16) & 0xf];  | 
6628  | 0  |             *p++ = Py_hexdigits[(ch >> 12) & 0xf];  | 
6629  | 0  |             *p++ = Py_hexdigits[(ch >> 8) & 0xf];  | 
6630  | 0  |             *p++ = Py_hexdigits[(ch >> 4) & 0xf];  | 
6631  | 0  |             *p++ = Py_hexdigits[ch & 15];  | 
6632  | 0  |         }  | 
6633  | 0  |     }  | 
6634  |  | 
  | 
6635  | 0  |     assert(p > PyBytes_AS_STRING(repr));  | 
6636  | 0  |     if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) { | 
6637  | 0  |         return NULL;  | 
6638  | 0  |     }  | 
6639  | 0  |     return repr;  | 
6640  | 0  | }  | 
6641  |  |  | 
6642  |  | PyObject *  | 
6643  |  | PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,  | 
6644  |  |                                  Py_ssize_t size)  | 
6645  | 0  | { | 
6646  | 0  |     PyObject *result;  | 
6647  | 0  |     PyObject *tmp = PyUnicode_FromWideChar(s, size);  | 
6648  | 0  |     if (tmp == NULL)  | 
6649  | 0  |         return NULL;  | 
6650  | 0  |     result = PyUnicode_AsRawUnicodeEscapeString(tmp);  | 
6651  | 0  |     Py_DECREF(tmp);  | 
6652  | 0  |     return result;  | 
6653  | 0  | }  | 
6654  |  |  | 
6655  |  | /* --- Latin-1 Codec ------------------------------------------------------ */  | 
6656  |  |  | 
6657  |  | PyObject *  | 
6658  |  | PyUnicode_DecodeLatin1(const char *s,  | 
6659  |  |                        Py_ssize_t size,  | 
6660  |  |                        const char *errors)  | 
6661  | 1  | { | 
6662  |  |     /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */  | 
6663  | 1  |     return _PyUnicode_FromUCS1((const unsigned char*)s, size);  | 
6664  | 1  | }  | 
6665  |  |  | 
6666  |  | /* create or adjust a UnicodeEncodeError */  | 
6667  |  | static void  | 
6668  |  | make_encode_exception(PyObject **exceptionObject,  | 
6669  |  |                       const char *encoding,  | 
6670  |  |                       PyObject *unicode,  | 
6671  |  |                       Py_ssize_t startpos, Py_ssize_t endpos,  | 
6672  |  |                       const char *reason)  | 
6673  | 0  | { | 
6674  | 0  |     if (*exceptionObject == NULL) { | 
6675  | 0  |         *exceptionObject = PyObject_CallFunction(  | 
6676  | 0  |             PyExc_UnicodeEncodeError, "sOnns",  | 
6677  | 0  |             encoding, unicode, startpos, endpos, reason);  | 
6678  | 0  |     }  | 
6679  | 0  |     else { | 
6680  | 0  |         if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))  | 
6681  | 0  |             goto onError;  | 
6682  | 0  |         if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))  | 
6683  | 0  |             goto onError;  | 
6684  | 0  |         if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))  | 
6685  | 0  |             goto onError;  | 
6686  | 0  |         return;  | 
6687  | 0  |       onError:  | 
6688  | 0  |         Py_CLEAR(*exceptionObject);  | 
6689  | 0  |     }  | 
6690  | 0  | }  | 
6691  |  |  | 
6692  |  | /* raises a UnicodeEncodeError */  | 
6693  |  | static void  | 
6694  |  | raise_encode_exception(PyObject **exceptionObject,  | 
6695  |  |                        const char *encoding,  | 
6696  |  |                        PyObject *unicode,  | 
6697  |  |                        Py_ssize_t startpos, Py_ssize_t endpos,  | 
6698  |  |                        const char *reason)  | 
6699  | 0  | { | 
6700  | 0  |     make_encode_exception(exceptionObject,  | 
6701  | 0  |                           encoding, unicode, startpos, endpos, reason);  | 
6702  | 0  |     if (*exceptionObject != NULL)  | 
6703  | 0  |         PyCodec_StrictErrors(*exceptionObject);  | 
6704  | 0  | }  | 
6705  |  |  | 
6706  |  | /* error handling callback helper:  | 
6707  |  |    build arguments, call the callback and check the arguments,  | 
6708  |  |    put the result into newpos and return the replacement string, which  | 
6709  |  |    has to be freed by the caller */  | 
6710  |  | static PyObject *  | 
6711  |  | unicode_encode_call_errorhandler(const char *errors,  | 
6712  |  |                                  PyObject **errorHandler,  | 
6713  |  |                                  const char *encoding, const char *reason,  | 
6714  |  |                                  PyObject *unicode, PyObject **exceptionObject,  | 
6715  |  |                                  Py_ssize_t startpos, Py_ssize_t endpos,  | 
6716  |  |                                  Py_ssize_t *newpos)  | 
6717  | 0  | { | 
6718  | 0  |     static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";  | 
6719  | 0  |     Py_ssize_t len;  | 
6720  | 0  |     PyObject *restuple;  | 
6721  | 0  |     PyObject *resunicode;  | 
6722  |  | 
  | 
6723  | 0  |     if (*errorHandler == NULL) { | 
6724  | 0  |         *errorHandler = PyCodec_LookupError(errors);  | 
6725  | 0  |         if (*errorHandler == NULL)  | 
6726  | 0  |             return NULL;  | 
6727  | 0  |     }  | 
6728  |  |  | 
6729  | 0  |     if (PyUnicode_READY(unicode) == -1)  | 
6730  | 0  |         return NULL;  | 
6731  | 0  |     len = PyUnicode_GET_LENGTH(unicode);  | 
6732  |  | 
  | 
6733  | 0  |     make_encode_exception(exceptionObject,  | 
6734  | 0  |                           encoding, unicode, startpos, endpos, reason);  | 
6735  | 0  |     if (*exceptionObject == NULL)  | 
6736  | 0  |         return NULL;  | 
6737  |  |  | 
6738  | 0  |     restuple = PyObject_CallFunctionObjArgs(  | 
6739  | 0  |         *errorHandler, *exceptionObject, NULL);  | 
6740  | 0  |     if (restuple == NULL)  | 
6741  | 0  |         return NULL;  | 
6742  | 0  |     if (!PyTuple_Check(restuple)) { | 
6743  | 0  |         PyErr_SetString(PyExc_TypeError, &argparse[3]);  | 
6744  | 0  |         Py_DECREF(restuple);  | 
6745  | 0  |         return NULL;  | 
6746  | 0  |     }  | 
6747  | 0  |     if (!PyArg_ParseTuple(restuple, argparse,  | 
6748  | 0  |                           &resunicode, newpos)) { | 
6749  | 0  |         Py_DECREF(restuple);  | 
6750  | 0  |         return NULL;  | 
6751  | 0  |     }  | 
6752  | 0  |     if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) { | 
6753  | 0  |         PyErr_SetString(PyExc_TypeError, &argparse[3]);  | 
6754  | 0  |         Py_DECREF(restuple);  | 
6755  | 0  |         return NULL;  | 
6756  | 0  |     }  | 
6757  | 0  |     if (*newpos<0)  | 
6758  | 0  |         *newpos = len + *newpos;  | 
6759  | 0  |     if (*newpos<0 || *newpos>len) { | 
6760  | 0  |         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);  | 
6761  | 0  |         Py_DECREF(restuple);  | 
6762  | 0  |         return NULL;  | 
6763  | 0  |     }  | 
6764  | 0  |     Py_INCREF(resunicode);  | 
6765  | 0  |     Py_DECREF(restuple);  | 
6766  | 0  |     return resunicode;  | 
6767  | 0  | }  | 
6768  |  |  | 
6769  |  | static PyObject *  | 
6770  |  | unicode_encode_ucs1(PyObject *unicode,  | 
6771  |  |                     const char *errors,  | 
6772  |  |                     const Py_UCS4 limit)  | 
6773  | 0  | { | 
6774  |  |     /* input state */  | 
6775  | 0  |     Py_ssize_t pos=0, size;  | 
6776  | 0  |     int kind;  | 
6777  | 0  |     void *data;  | 
6778  |  |     /* pointer into the output */  | 
6779  | 0  |     char *str;  | 
6780  | 0  |     const char *encoding = (limit == 256) ? "latin-1" : "ascii";  | 
6781  | 0  |     const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";  | 
6782  | 0  |     PyObject *error_handler_obj = NULL;  | 
6783  | 0  |     PyObject *exc = NULL;  | 
6784  | 0  |     _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;  | 
6785  | 0  |     PyObject *rep = NULL;  | 
6786  |  |     /* output object */  | 
6787  | 0  |     _PyBytesWriter writer;  | 
6788  |  | 
  | 
6789  | 0  |     if (PyUnicode_READY(unicode) == -1)  | 
6790  | 0  |         return NULL;  | 
6791  | 0  |     size = PyUnicode_GET_LENGTH(unicode);  | 
6792  | 0  |     kind = PyUnicode_KIND(unicode);  | 
6793  | 0  |     data = PyUnicode_DATA(unicode);  | 
6794  |  |     /* allocate enough for a simple encoding without  | 
6795  |  |        replacements, if we need more, we'll resize */  | 
6796  | 0  |     if (size == 0)  | 
6797  | 0  |         return PyBytes_FromStringAndSize(NULL, 0);  | 
6798  |  |  | 
6799  | 0  |     _PyBytesWriter_Init(&writer);  | 
6800  | 0  |     str = _PyBytesWriter_Alloc(&writer, size);  | 
6801  | 0  |     if (str == NULL)  | 
6802  | 0  |         return NULL;  | 
6803  |  |  | 
6804  | 0  |     while (pos < size) { | 
6805  | 0  |         Py_UCS4 ch = PyUnicode_READ(kind, data, pos);  | 
6806  |  |  | 
6807  |  |         /* can we encode this? */  | 
6808  | 0  |         if (ch < limit) { | 
6809  |  |             /* no overflow check, because we know that the space is enough */  | 
6810  | 0  |             *str++ = (char)ch;  | 
6811  | 0  |             ++pos;  | 
6812  | 0  |         }  | 
6813  | 0  |         else { | 
6814  | 0  |             Py_ssize_t newpos, i;  | 
6815  |  |             /* startpos for collecting unencodable chars */  | 
6816  | 0  |             Py_ssize_t collstart = pos;  | 
6817  | 0  |             Py_ssize_t collend = collstart + 1;  | 
6818  |  |             /* find all unencodable characters */  | 
6819  |  | 
  | 
6820  | 0  |             while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))  | 
6821  | 0  |                 ++collend;  | 
6822  |  |  | 
6823  |  |             /* Only overallocate the buffer if it's not the last write */  | 
6824  | 0  |             writer.overallocate = (collend < size);  | 
6825  |  |  | 
6826  |  |             /* cache callback name lookup (if not done yet, i.e. it's the first error) */  | 
6827  | 0  |             if (error_handler == _Py_ERROR_UNKNOWN)  | 
6828  | 0  |                 error_handler = _Py_GetErrorHandler(errors);  | 
6829  |  | 
  | 
6830  | 0  |             switch (error_handler) { | 
6831  | 0  |             case _Py_ERROR_STRICT:  | 
6832  | 0  |                 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);  | 
6833  | 0  |                 goto onError;  | 
6834  |  |  | 
6835  | 0  |             case _Py_ERROR_REPLACE:  | 
6836  | 0  |                 memset(str, '?', collend - collstart);  | 
6837  | 0  |                 str += (collend - collstart);  | 
6838  |  |                 /* fall through */  | 
6839  | 0  |             case _Py_ERROR_IGNORE:  | 
6840  | 0  |                 pos = collend;  | 
6841  | 0  |                 break;  | 
6842  |  |  | 
6843  | 0  |             case _Py_ERROR_BACKSLASHREPLACE:  | 
6844  |  |                 /* subtract preallocated bytes */  | 
6845  | 0  |                 writer.min_size -= (collend - collstart);  | 
6846  | 0  |                 str = backslashreplace(&writer, str,  | 
6847  | 0  |                                        unicode, collstart, collend);  | 
6848  | 0  |                 if (str == NULL)  | 
6849  | 0  |                     goto onError;  | 
6850  | 0  |                 pos = collend;  | 
6851  | 0  |                 break;  | 
6852  |  |  | 
6853  | 0  |             case _Py_ERROR_XMLCHARREFREPLACE:  | 
6854  |  |                 /* subtract preallocated bytes */  | 
6855  | 0  |                 writer.min_size -= (collend - collstart);  | 
6856  | 0  |                 str = xmlcharrefreplace(&writer, str,  | 
6857  | 0  |                                         unicode, collstart, collend);  | 
6858  | 0  |                 if (str == NULL)  | 
6859  | 0  |                     goto onError;  | 
6860  | 0  |                 pos = collend;  | 
6861  | 0  |                 break;  | 
6862  |  |  | 
6863  | 0  |             case _Py_ERROR_SURROGATEESCAPE:  | 
6864  | 0  |                 for (i = collstart; i < collend; ++i) { | 
6865  | 0  |                     ch = PyUnicode_READ(kind, data, i);  | 
6866  | 0  |                     if (ch < 0xdc80 || 0xdcff < ch) { | 
6867  |  |                         /* Not a UTF-8b surrogate */  | 
6868  | 0  |                         break;  | 
6869  | 0  |                     }  | 
6870  | 0  |                     *str++ = (char)(ch - 0xdc00);  | 
6871  | 0  |                     ++pos;  | 
6872  | 0  |                 }  | 
6873  | 0  |                 if (i >= collend)  | 
6874  | 0  |                     break;  | 
6875  | 0  |                 collstart = pos;  | 
6876  | 0  |                 assert(collstart != collend);  | 
6877  |  |                 /* fall through */  | 
6878  |  | 
  | 
6879  | 0  |             default:  | 
6880  | 0  |                 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,  | 
6881  | 0  |                                                        encoding, reason, unicode, &exc,  | 
6882  | 0  |                                                        collstart, collend, &newpos);  | 
6883  | 0  |                 if (rep == NULL)  | 
6884  | 0  |                     goto onError;  | 
6885  |  |  | 
6886  |  |                 /* subtract preallocated bytes */  | 
6887  | 0  |                 writer.min_size -= newpos - collstart;  | 
6888  |  | 
  | 
6889  | 0  |                 if (PyBytes_Check(rep)) { | 
6890  |  |                     /* Directly copy bytes result to output. */  | 
6891  | 0  |                     str = _PyBytesWriter_WriteBytes(&writer, str,  | 
6892  | 0  |                                                     PyBytes_AS_STRING(rep),  | 
6893  | 0  |                                                     PyBytes_GET_SIZE(rep));  | 
6894  | 0  |                 }  | 
6895  | 0  |                 else { | 
6896  | 0  |                     assert(PyUnicode_Check(rep));  | 
6897  |  | 
  | 
6898  | 0  |                     if (PyUnicode_READY(rep) < 0)  | 
6899  | 0  |                         goto onError;  | 
6900  |  |  | 
6901  | 0  |                     if (limit == 256 ?  | 
6902  | 0  |                         PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :  | 
6903  | 0  |                         !PyUnicode_IS_ASCII(rep))  | 
6904  | 0  |                     { | 
6905  |  |                         /* Not all characters are smaller than limit */  | 
6906  | 0  |                         raise_encode_exception(&exc, encoding, unicode,  | 
6907  | 0  |                                                collstart, collend, reason);  | 
6908  | 0  |                         goto onError;  | 
6909  | 0  |                     }  | 
6910  | 0  |                     assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);  | 
6911  | 0  |                     str = _PyBytesWriter_WriteBytes(&writer, str,  | 
6912  | 0  |                                                     PyUnicode_DATA(rep),  | 
6913  | 0  |                                                     PyUnicode_GET_LENGTH(rep));  | 
6914  | 0  |                 }  | 
6915  | 0  |                 if (str == NULL)  | 
6916  | 0  |                     goto onError;  | 
6917  |  |  | 
6918  | 0  |                 pos = newpos;  | 
6919  | 0  |                 Py_CLEAR(rep);  | 
6920  | 0  |             }  | 
6921  |  |  | 
6922  |  |             /* If overallocation was disabled, ensure that it was the last  | 
6923  |  |                write. Otherwise, we missed an optimization */  | 
6924  | 0  |             assert(writer.overallocate || pos == size);  | 
6925  | 0  |         }  | 
6926  | 0  |     }  | 
6927  |  |  | 
6928  | 0  |     Py_XDECREF(error_handler_obj);  | 
6929  | 0  |     Py_XDECREF(exc);  | 
6930  | 0  |     return _PyBytesWriter_Finish(&writer, str);  | 
6931  |  |  | 
6932  | 0  |   onError:  | 
6933  | 0  |     Py_XDECREF(rep);  | 
6934  | 0  |     _PyBytesWriter_Dealloc(&writer);  | 
6935  | 0  |     Py_XDECREF(error_handler_obj);  | 
6936  | 0  |     Py_XDECREF(exc);  | 
6937  | 0  |     return NULL;  | 
6938  | 0  | }  | 
6939  |  |  | 
6940  |  | /* Deprecated */  | 
6941  |  | PyObject *  | 
6942  |  | PyUnicode_EncodeLatin1(const Py_UNICODE *p,  | 
6943  |  |                        Py_ssize_t size,  | 
6944  |  |                        const char *errors)  | 
6945  | 0  | { | 
6946  | 0  |     PyObject *result;  | 
6947  | 0  |     PyObject *unicode = PyUnicode_FromWideChar(p, size);  | 
6948  | 0  |     if (unicode == NULL)  | 
6949  | 0  |         return NULL;  | 
6950  | 0  |     result = unicode_encode_ucs1(unicode, errors, 256);  | 
6951  | 0  |     Py_DECREF(unicode);  | 
6952  | 0  |     return result;  | 
6953  | 0  | }  | 
6954  |  |  | 
6955  |  | PyObject *  | 
6956  |  | _PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)  | 
6957  | 0  | { | 
6958  | 0  |     if (!PyUnicode_Check(unicode)) { | 
6959  | 0  |         PyErr_BadArgument();  | 
6960  | 0  |         return NULL;  | 
6961  | 0  |     }  | 
6962  | 0  |     if (PyUnicode_READY(unicode) == -1)  | 
6963  | 0  |         return NULL;  | 
6964  |  |     /* Fast path: if it is a one-byte string, construct  | 
6965  |  |        bytes object directly. */  | 
6966  | 0  |     if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)  | 
6967  | 0  |         return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),  | 
6968  | 0  |                                          PyUnicode_GET_LENGTH(unicode));  | 
6969  |  |     /* Non-Latin-1 characters present. Defer to above function to  | 
6970  |  |        raise the exception. */  | 
6971  | 0  |     return unicode_encode_ucs1(unicode, errors, 256);  | 
6972  | 0  | }  | 
6973  |  |  | 
6974  |  | PyObject*  | 
6975  |  | PyUnicode_AsLatin1String(PyObject *unicode)  | 
6976  | 0  | { | 
6977  | 0  |     return _PyUnicode_AsLatin1String(unicode, NULL);  | 
6978  | 0  | }  | 
6979  |  |  | 
6980  |  | /* --- 7-bit ASCII Codec -------------------------------------------------- */  | 
6981  |  |  | 
6982  |  | PyObject *  | 
6983  |  | PyUnicode_DecodeASCII(const char *s,  | 
6984  |  |                       Py_ssize_t size,  | 
6985  |  |                       const char *errors)  | 
6986  | 437  | { | 
6987  | 437  |     const char *starts = s;  | 
6988  | 437  |     _PyUnicodeWriter writer;  | 
6989  | 437  |     int kind;  | 
6990  | 437  |     void *data;  | 
6991  | 437  |     Py_ssize_t startinpos;  | 
6992  | 437  |     Py_ssize_t endinpos;  | 
6993  | 437  |     Py_ssize_t outpos;  | 
6994  | 437  |     const char *e;  | 
6995  | 437  |     PyObject *error_handler_obj = NULL;  | 
6996  | 437  |     PyObject *exc = NULL;  | 
6997  | 437  |     _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;  | 
6998  |  |  | 
6999  | 437  |     if (size == 0)  | 
7000  | 0  |         _Py_RETURN_UNICODE_EMPTY();  | 
7001  |  |  | 
7002  |  |     /* ASCII is equivalent to the first 128 ordinals in Unicode. */  | 
7003  | 437  |     if (size == 1 && (unsigned char)s[0] < 128)  | 
7004  | 0  |         return get_latin1_char((unsigned char)s[0]);  | 
7005  |  |  | 
7006  | 437  |     _PyUnicodeWriter_Init(&writer);  | 
7007  | 437  |     writer.min_length = size;  | 
7008  | 437  |     if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)  | 
7009  | 0  |         return NULL;  | 
7010  |  |  | 
7011  | 437  |     e = s + size;  | 
7012  | 437  |     data = writer.data;  | 
7013  | 437  |     outpos = ascii_decode(s, e, (Py_UCS1 *)data);  | 
7014  | 437  |     writer.pos = outpos;  | 
7015  | 437  |     if (writer.pos == size)  | 
7016  | 437  |         return _PyUnicodeWriter_Finish(&writer);  | 
7017  |  |  | 
7018  | 0  |     s += writer.pos;  | 
7019  | 0  |     kind = writer.kind;  | 
7020  | 0  |     while (s < e) { | 
7021  | 0  |         unsigned char c = (unsigned char)*s;  | 
7022  | 0  |         if (c < 128) { | 
7023  | 0  |             PyUnicode_WRITE(kind, data, writer.pos, c);  | 
7024  | 0  |             writer.pos++;  | 
7025  | 0  |             ++s;  | 
7026  | 0  |             continue;  | 
7027  | 0  |         }  | 
7028  |  |  | 
7029  |  |         /* byte outside range 0x00..0x7f: call the error handler */  | 
7030  |  |  | 
7031  | 0  |         if (error_handler == _Py_ERROR_UNKNOWN)  | 
7032  | 0  |             error_handler = _Py_GetErrorHandler(errors);  | 
7033  |  | 
  | 
7034  | 0  |         switch (error_handler)  | 
7035  | 0  |         { | 
7036  | 0  |         case _Py_ERROR_REPLACE:  | 
7037  | 0  |         case _Py_ERROR_SURROGATEESCAPE:  | 
7038  |  |             /* Fast-path: the error handler only writes one character,  | 
7039  |  |                but we may switch to UCS2 at the first write */  | 
7040  | 0  |             if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)  | 
7041  | 0  |                 goto onError;  | 
7042  | 0  |             kind = writer.kind;  | 
7043  | 0  |             data = writer.data;  | 
7044  |  | 
  | 
7045  | 0  |             if (error_handler == _Py_ERROR_REPLACE)  | 
7046  | 0  |                 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);  | 
7047  | 0  |             else  | 
7048  | 0  |                 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);  | 
7049  | 0  |             writer.pos++;  | 
7050  | 0  |             ++s;  | 
7051  | 0  |             break;  | 
7052  |  |  | 
7053  | 0  |         case _Py_ERROR_IGNORE:  | 
7054  | 0  |             ++s;  | 
7055  | 0  |             break;  | 
7056  |  |  | 
7057  | 0  |         default:  | 
7058  | 0  |             startinpos = s-starts;  | 
7059  | 0  |             endinpos = startinpos + 1;  | 
7060  | 0  |             if (unicode_decode_call_errorhandler_writer(  | 
7061  | 0  |                     errors, &error_handler_obj,  | 
7062  | 0  |                     "ascii", "ordinal not in range(128)",  | 
7063  | 0  |                     &starts, &e, &startinpos, &endinpos, &exc, &s,  | 
7064  | 0  |                     &writer))  | 
7065  | 0  |                 goto onError;  | 
7066  | 0  |             kind = writer.kind;  | 
7067  | 0  |             data = writer.data;  | 
7068  | 0  |         }  | 
7069  | 0  |     }  | 
7070  | 0  |     Py_XDECREF(error_handler_obj);  | 
7071  | 0  |     Py_XDECREF(exc);  | 
7072  | 0  |     return _PyUnicodeWriter_Finish(&writer);  | 
7073  |  |  | 
7074  | 0  |   onError:  | 
7075  | 0  |     _PyUnicodeWriter_Dealloc(&writer);  | 
7076  | 0  |     Py_XDECREF(error_handler_obj);  | 
7077  | 0  |     Py_XDECREF(exc);  | 
7078  | 0  |     return NULL;  | 
7079  | 0  | }  | 
7080  |  |  | 
7081  |  | /* Deprecated */  | 
7082  |  | PyObject *  | 
7083  |  | PyUnicode_EncodeASCII(const Py_UNICODE *p,  | 
7084  |  |                       Py_ssize_t size,  | 
7085  |  |                       const char *errors)  | 
7086  | 0  | { | 
7087  | 0  |     PyObject *result;  | 
7088  | 0  |     PyObject *unicode = PyUnicode_FromWideChar(p, size);  | 
7089  | 0  |     if (unicode == NULL)  | 
7090  | 0  |         return NULL;  | 
7091  | 0  |     result = unicode_encode_ucs1(unicode, errors, 128);  | 
7092  | 0  |     Py_DECREF(unicode);  | 
7093  | 0  |     return result;  | 
7094  | 0  | }  | 
7095  |  |  | 
7096  |  | PyObject *  | 
7097  |  | _PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)  | 
7098  | 1.01k  | { | 
7099  | 1.01k  |     if (!PyUnicode_Check(unicode)) { | 
7100  | 0  |         PyErr_BadArgument();  | 
7101  | 0  |         return NULL;  | 
7102  | 0  |     }  | 
7103  | 1.01k  |     if (PyUnicode_READY(unicode) == -1)  | 
7104  | 0  |         return NULL;  | 
7105  |  |     /* Fast path: if it is an ASCII-only string, construct bytes object  | 
7106  |  |        directly. Else defer to above function to raise the exception. */  | 
7107  | 1.01k  |     if (PyUnicode_IS_ASCII(unicode))  | 
7108  | 1.01k  |         return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),  | 
7109  | 1.01k  |                                          PyUnicode_GET_LENGTH(unicode));  | 
7110  | 0  |     return unicode_encode_ucs1(unicode, errors, 128);  | 
7111  | 1.01k  | }  | 
7112  |  |  | 
7113  |  | PyObject *  | 
7114  |  | PyUnicode_AsASCIIString(PyObject *unicode)  | 
7115  | 2  | { | 
7116  | 2  |     return _PyUnicode_AsASCIIString(unicode, NULL);  | 
7117  | 2  | }  | 
7118  |  |  | 
7119  |  | #ifdef MS_WINDOWS  | 
7120  |  |  | 
7121  |  | /* --- MBCS codecs for Windows -------------------------------------------- */  | 
7122  |  |  | 
7123  |  | #if SIZEOF_INT < SIZEOF_SIZE_T  | 
7124  |  | #define NEED_RETRY  | 
7125  |  | #endif  | 
7126  |  |  | 
7127  |  | /* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when  | 
7128  |  |    transcoding from UTF-16), but INT_MAX / 4 performs better in  | 
7129  |  |    both cases and also avoids partial characters overrunning the  | 
7130  |  |    length limit in MultiByteToWideChar on Windows */  | 
7131  |  | #define DECODING_CHUNK_SIZE (INT_MAX/4)  | 
7132  |  |  | 
7133  |  | #ifndef WC_ERR_INVALID_CHARS  | 
7134  |  | #  define WC_ERR_INVALID_CHARS 0x0080  | 
7135  |  | #endif  | 
7136  |  |  | 
7137  |  | static const char*  | 
7138  |  | code_page_name(UINT code_page, PyObject **obj)  | 
7139  |  | { | 
7140  |  |     *obj = NULL;  | 
7141  |  |     if (code_page == CP_ACP)  | 
7142  |  |         return "mbcs";  | 
7143  |  |     if (code_page == CP_UTF7)  | 
7144  |  |         return "CP_UTF7";  | 
7145  |  |     if (code_page == CP_UTF8)  | 
7146  |  |         return "CP_UTF8";  | 
7147  |  |  | 
7148  |  |     *obj = PyBytes_FromFormat("cp%u", code_page); | 
7149  |  |     if (*obj == NULL)  | 
7150  |  |         return NULL;  | 
7151  |  |     return PyBytes_AS_STRING(*obj);  | 
7152  |  | }  | 
7153  |  |  | 
7154  |  | static DWORD  | 
7155  |  | decode_code_page_flags(UINT code_page)  | 
7156  |  | { | 
7157  |  |     if (code_page == CP_UTF7) { | 
7158  |  |         /* The CP_UTF7 decoder only supports flags=0 */  | 
7159  |  |         return 0;  | 
7160  |  |     }  | 
7161  |  |     else  | 
7162  |  |         return MB_ERR_INVALID_CHARS;  | 
7163  |  | }  | 
7164  |  |  | 
7165  |  | /*  | 
7166  |  |  * Decode a byte string from a Windows code page into unicode object in strict  | 
7167  |  |  * mode.  | 
7168  |  |  *  | 
7169  |  |  * Returns the consumed size on success, returns -2 on a decode error, or  | 
7170  |  |  * raises an OSError and returns -1 on any other error.  | 
7171  |  |  */  | 
7172  |  | static int  | 
7173  |  | decode_code_page_strict(UINT code_page,  | 
7174  |  |                         wchar_t **buf,  | 
7175  |  |                         Py_ssize_t *bufsize,  | 
7176  |  |                         const char *in,  | 
7177  |  |                         int insize)  | 
7178  |  | { | 
7179  |  |     DWORD flags = MB_ERR_INVALID_CHARS;  | 
7180  |  |     wchar_t *out;  | 
7181  |  |     DWORD outsize;  | 
7182  |  |  | 
7183  |  |     /* First get the size of the result */  | 
7184  |  |     assert(insize > 0);  | 
7185  |  |     while ((outsize = MultiByteToWideChar(code_page, flags,  | 
7186  |  |                                           in, insize, NULL, 0)) <= 0)  | 
7187  |  |     { | 
7188  |  |         if (!flags || GetLastError() != ERROR_INVALID_FLAGS) { | 
7189  |  |             goto error;  | 
7190  |  |         }  | 
7191  |  |         /* For some code pages (e.g. UTF-7) flags must be set to 0. */  | 
7192  |  |         flags = 0;  | 
7193  |  |     }  | 
7194  |  |  | 
7195  |  |     /* Extend a wchar_t* buffer */  | 
7196  |  |     Py_ssize_t n = *bufsize;   /* Get the current length */  | 
7197  |  |     if (widechar_resize(buf, bufsize, n + outsize) < 0) { | 
7198  |  |         return -1;  | 
7199  |  |     }  | 
7200  |  |     out = *buf + n;  | 
7201  |  |  | 
7202  |  |     /* Do the conversion */  | 
7203  |  |     outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);  | 
7204  |  |     if (outsize <= 0)  | 
7205  |  |         goto error;  | 
7206  |  |     return insize;  | 
7207  |  |  | 
7208  |  | error:  | 
7209  |  |     if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)  | 
7210  |  |         return -2;  | 
7211  |  |     PyErr_SetFromWindowsErr(0);  | 
7212  |  |     return -1;  | 
7213  |  | }  | 
7214  |  |  | 
7215  |  | /*  | 
7216  |  |  * Decode a byte string from a code page into unicode object with an error  | 
7217  |  |  * handler.  | 
7218  |  |  *  | 
7219  |  |  * Returns the consumed size on success, or raises an OSError or  | 
7220  |  |  * UnicodeDecodeError exception and returns -1 on error.  | 
7221  |  |  */  | 
7222  |  | static int  | 
7223  |  | decode_code_page_errors(UINT code_page,  | 
7224  |  |                         wchar_t **buf,  | 
7225  |  |                         Py_ssize_t *bufsize,  | 
7226  |  |                         const char *in, const int size,  | 
7227  |  |                         const char *errors, int final)  | 
7228  |  | { | 
7229  |  |     const char *startin = in;  | 
7230  |  |     const char *endin = in + size;  | 
7231  |  |     DWORD flags = MB_ERR_INVALID_CHARS;  | 
7232  |  |     /* Ideally, we should get reason from FormatMessage. This is the Windows  | 
7233  |  |        2000 English version of the message. */  | 
7234  |  |     const char *reason = "No mapping for the Unicode character exists "  | 
7235  |  |                          "in the target code page.";  | 
7236  |  |     /* each step cannot decode more than 1 character, but a character can be  | 
7237  |  |        represented as a surrogate pair */  | 
7238  |  |     wchar_t buffer[2], *out;  | 
7239  |  |     int insize;  | 
7240  |  |     Py_ssize_t outsize;  | 
7241  |  |     PyObject *errorHandler = NULL;  | 
7242  |  |     PyObject *exc = NULL;  | 
7243  |  |     PyObject *encoding_obj = NULL;  | 
7244  |  |     const char *encoding;  | 
7245  |  |     DWORD err;  | 
7246  |  |     int ret = -1;  | 
7247  |  |  | 
7248  |  |     assert(size > 0);  | 
7249  |  |  | 
7250  |  |     encoding = code_page_name(code_page, &encoding_obj);  | 
7251  |  |     if (encoding == NULL)  | 
7252  |  |         return -1;  | 
7253  |  |  | 
7254  |  |     if ((errors == NULL || strcmp(errors, "strict") == 0) && final) { | 
7255  |  |         /* The last error was ERROR_NO_UNICODE_TRANSLATION, so we raise a  | 
7256  |  |            UnicodeDecodeError. */  | 
7257  |  |         make_decode_exception(&exc, encoding, in, size, 0, 0, reason);  | 
7258  |  |         if (exc != NULL) { | 
7259  |  |             PyCodec_StrictErrors(exc);  | 
7260  |  |             Py_CLEAR(exc);  | 
7261  |  |         }  | 
7262  |  |         goto error;  | 
7263  |  |     }  | 
7264  |  |  | 
7265  |  |     /* Extend a wchar_t* buffer */  | 
7266  |  |     Py_ssize_t n = *bufsize;   /* Get the current length */  | 
7267  |  |     if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) { | 
7268  |  |         PyErr_NoMemory();  | 
7269  |  |         goto error;  | 
7270  |  |     }  | 
7271  |  |     if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) { | 
7272  |  |         goto error;  | 
7273  |  |     }  | 
7274  |  |     out = *buf + n;  | 
7275  |  |  | 
7276  |  |     /* Decode the byte string character per character */  | 
7277  |  |     while (in < endin)  | 
7278  |  |     { | 
7279  |  |         /* Decode a character */  | 
7280  |  |         insize = 1;  | 
7281  |  |         do  | 
7282  |  |         { | 
7283  |  |             outsize = MultiByteToWideChar(code_page, flags,  | 
7284  |  |                                           in, insize,  | 
7285  |  |                                           buffer, Py_ARRAY_LENGTH(buffer));  | 
7286  |  |             if (outsize > 0)  | 
7287  |  |                 break;  | 
7288  |  |             err = GetLastError();  | 
7289  |  |             if (err == ERROR_INVALID_FLAGS && flags) { | 
7290  |  |                 /* For some code pages (e.g. UTF-7) flags must be set to 0. */  | 
7291  |  |                 flags = 0;  | 
7292  |  |                 continue;  | 
7293  |  |             }  | 
7294  |  |             if (err != ERROR_NO_UNICODE_TRANSLATION  | 
7295  |  |                 && err != ERROR_INSUFFICIENT_BUFFER)  | 
7296  |  |             { | 
7297  |  |                 PyErr_SetFromWindowsErr(0);  | 
7298  |  |                 goto error;  | 
7299  |  |             }  | 
7300  |  |             insize++;  | 
7301  |  |         }  | 
7302  |  |         /* 4=maximum length of a UTF-8 sequence */  | 
7303  |  |         while (insize <= 4 && (in + insize) <= endin);  | 
7304  |  |  | 
7305  |  |         if (outsize <= 0) { | 
7306  |  |             Py_ssize_t startinpos, endinpos, outpos;  | 
7307  |  |  | 
7308  |  |             /* last character in partial decode? */  | 
7309  |  |             if (in + insize >= endin && !final)  | 
7310  |  |                 break;  | 
7311  |  |  | 
7312  |  |             startinpos = in - startin;  | 
7313  |  |             endinpos = startinpos + 1;  | 
7314  |  |             outpos = out - *buf;  | 
7315  |  |             if (unicode_decode_call_errorhandler_wchar(  | 
7316  |  |                     errors, &errorHandler,  | 
7317  |  |                     encoding, reason,  | 
7318  |  |                     &startin, &endin, &startinpos, &endinpos, &exc, &in,  | 
7319  |  |                     buf, bufsize, &outpos))  | 
7320  |  |             { | 
7321  |  |                 goto error;  | 
7322  |  |             }  | 
7323  |  |             out = *buf + outpos;  | 
7324  |  |         }  | 
7325  |  |         else { | 
7326  |  |             in += insize;  | 
7327  |  |             memcpy(out, buffer, outsize * sizeof(wchar_t));  | 
7328  |  |             out += outsize;  | 
7329  |  |         }  | 
7330  |  |     }  | 
7331  |  |  | 
7332  |  |     /* Shrink the buffer */  | 
7333  |  |     assert(out - *buf <= *bufsize);  | 
7334  |  |     *bufsize = out - *buf;  | 
7335  |  |     /* (in - startin) <= size and size is an int */  | 
7336  |  |     ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);  | 
7337  |  |  | 
7338  |  | error:  | 
7339  |  |     Py_XDECREF(encoding_obj);  | 
7340  |  |     Py_XDECREF(errorHandler);  | 
7341  |  |     Py_XDECREF(exc);  | 
7342  |  |     return ret;  | 
7343  |  | }  | 
7344  |  |  | 
7345  |  | static PyObject *  | 
7346  |  | decode_code_page_stateful(int code_page,  | 
7347  |  |                           const char *s, Py_ssize_t size,  | 
7348  |  |                           const char *errors, Py_ssize_t *consumed)  | 
7349  |  | { | 
7350  |  |     wchar_t *buf = NULL;  | 
7351  |  |     Py_ssize_t bufsize = 0;  | 
7352  |  |     int chunk_size, final, converted, done;  | 
7353  |  |  | 
7354  |  |     if (code_page < 0) { | 
7355  |  |         PyErr_SetString(PyExc_ValueError, "invalid code page number");  | 
7356  |  |         return NULL;  | 
7357  |  |     }  | 
7358  |  |     if (size < 0) { | 
7359  |  |         PyErr_BadInternalCall();  | 
7360  |  |         return NULL;  | 
7361  |  |     }  | 
7362  |  |  | 
7363  |  |     if (consumed)  | 
7364  |  |         *consumed = 0;  | 
7365  |  |  | 
7366  |  |     do  | 
7367  |  |     { | 
7368  |  | #ifdef NEED_RETRY  | 
7369  |  |         if (size > DECODING_CHUNK_SIZE) { | 
7370  |  |             chunk_size = DECODING_CHUNK_SIZE;  | 
7371  |  |             final = 0;  | 
7372  |  |             done = 0;  | 
7373  |  |         }  | 
7374  |  |         else  | 
7375  |  | #endif  | 
7376  |  |         { | 
7377  |  |             chunk_size = (int)size;  | 
7378  |  |             final = (consumed == NULL);  | 
7379  |  |             done = 1;  | 
7380  |  |         }  | 
7381  |  |  | 
7382  |  |         if (chunk_size == 0 && done) { | 
7383  |  |             if (buf != NULL)  | 
7384  |  |                 break;  | 
7385  |  |             _Py_RETURN_UNICODE_EMPTY();  | 
7386  |  |         }  | 
7387  |  |  | 
7388  |  |         converted = decode_code_page_strict(code_page, &buf, &bufsize,  | 
7389  |  |                                             s, chunk_size);  | 
7390  |  |         if (converted == -2)  | 
7391  |  |             converted = decode_code_page_errors(code_page, &buf, &bufsize,  | 
7392  |  |                                                 s, chunk_size,  | 
7393  |  |                                                 errors, final);  | 
7394  |  |         assert(converted != 0 || done);  | 
7395  |  |  | 
7396  |  |         if (converted < 0) { | 
7397  |  |             PyMem_Free(buf);  | 
7398  |  |             return NULL;  | 
7399  |  |         }  | 
7400  |  |  | 
7401  |  |         if (consumed)  | 
7402  |  |             *consumed += converted;  | 
7403  |  |  | 
7404  |  |         s += converted;  | 
7405  |  |         size -= converted;  | 
7406  |  |     } while (!done);  | 
7407  |  |  | 
7408  |  |     PyObject *v = PyUnicode_FromWideChar(buf, bufsize);  | 
7409  |  |     PyMem_Free(buf);  | 
7410  |  |     return v;  | 
7411  |  | }  | 
7412  |  |  | 
7413  |  | PyObject *  | 
7414  |  | PyUnicode_DecodeCodePageStateful(int code_page,  | 
7415  |  |                                  const char *s,  | 
7416  |  |                                  Py_ssize_t size,  | 
7417  |  |                                  const char *errors,  | 
7418  |  |                                  Py_ssize_t *consumed)  | 
7419  |  | { | 
7420  |  |     return decode_code_page_stateful(code_page, s, size, errors, consumed);  | 
7421  |  | }  | 
7422  |  |  | 
7423  |  | PyObject *  | 
7424  |  | PyUnicode_DecodeMBCSStateful(const char *s,  | 
7425  |  |                              Py_ssize_t size,  | 
7426  |  |                              const char *errors,  | 
7427  |  |                              Py_ssize_t *consumed)  | 
7428  |  | { | 
7429  |  |     return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);  | 
7430  |  | }  | 
7431  |  |  | 
7432  |  | PyObject *  | 
7433  |  | PyUnicode_DecodeMBCS(const char *s,  | 
7434  |  |                      Py_ssize_t size,  | 
7435  |  |                      const char *errors)  | 
7436  |  | { | 
7437  |  |     return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);  | 
7438  |  | }  | 
7439  |  |  | 
7440  |  | static DWORD  | 
7441  |  | encode_code_page_flags(UINT code_page, const char *errors)  | 
7442  |  | { | 
7443  |  |     if (code_page == CP_UTF8) { | 
7444  |  |         return WC_ERR_INVALID_CHARS;  | 
7445  |  |     }  | 
7446  |  |     else if (code_page == CP_UTF7) { | 
7447  |  |         /* CP_UTF7 only supports flags=0 */  | 
7448  |  |         return 0;  | 
7449  |  |     }  | 
7450  |  |     else { | 
7451  |  |         if (errors != NULL && strcmp(errors, "replace") == 0)  | 
7452  |  |             return 0;  | 
7453  |  |         else  | 
7454  |  |             return WC_NO_BEST_FIT_CHARS;  | 
7455  |  |     }  | 
7456  |  | }  | 
7457  |  |  | 
7458  |  | /*  | 
7459  |  |  * Encode a Unicode string to a Windows code page into a byte string in strict  | 
7460  |  |  * mode.  | 
7461  |  |  *  | 
7462  |  |  * Returns 0 on success, returns -2 on an encode error, or raises  | 
7463  |  |  * an OSError and returns -1 on any other error.  | 
7464  |  |  */  | 
7465  |  | static int  | 
7466  |  | encode_code_page_strict(UINT code_page, PyObject **outbytes,  | 
7467  |  |                         PyObject *unicode, Py_ssize_t offset, int len,  | 
7468  |  |                         const char* errors)  | 
7469  |  | { | 
7470  |  |     BOOL usedDefaultChar = FALSE;  | 
7471  |  |     BOOL *pusedDefaultChar = &usedDefaultChar;  | 
7472  |  |     int outsize;  | 
7473  |  |     wchar_t *p;  | 
7474  |  |     Py_ssize_t size;  | 
7475  |  |     const DWORD flags = encode_code_page_flags(code_page, NULL);  | 
7476  |  |     char *out;  | 
7477  |  |     /* Create a substring so that we can get the UTF-16 representation  | 
7478  |  |        of just the slice under consideration. */  | 
7479  |  |     PyObject *substring;  | 
7480  |  |  | 
7481  |  |     assert(len > 0);  | 
7482  |  |  | 
7483  |  |     if (code_page != CP_UTF8 && code_page != CP_UTF7)  | 
7484  |  |         pusedDefaultChar = &usedDefaultChar;  | 
7485  |  |     else  | 
7486  |  |         pusedDefaultChar = NULL;  | 
7487  |  |  | 
7488  |  |     substring = PyUnicode_Substring(unicode, offset, offset+len);  | 
7489  |  |     if (substring == NULL)  | 
7490  |  |         return -1;  | 
7491  |  |     p = PyUnicode_AsUnicodeAndSize(substring, &size);  | 
7492  |  |     if (p == NULL) { | 
7493  |  |         Py_DECREF(substring);  | 
7494  |  |         return -1;  | 
7495  |  |     }  | 
7496  |  |     assert(size <= INT_MAX);  | 
7497  |  |  | 
7498  |  |     /* First get the size of the result */  | 
7499  |  |     outsize = WideCharToMultiByte(code_page, flags,  | 
7500  |  |                                   p, (int)size,  | 
7501  |  |                                   NULL, 0,  | 
7502  |  |                                   NULL, pusedDefaultChar);  | 
7503  |  |     if (outsize <= 0)  | 
7504  |  |         goto error;  | 
7505  |  |     /* If we used a default char, then we failed! */  | 
7506  |  |     if (pusedDefaultChar && *pusedDefaultChar) { | 
7507  |  |         Py_DECREF(substring);  | 
7508  |  |         return -2;  | 
7509  |  |     }  | 
7510  |  |  | 
7511  |  |     if (*outbytes == NULL) { | 
7512  |  |         /* Create string object */  | 
7513  |  |         *outbytes = PyBytes_FromStringAndSize(NULL, outsize);  | 
7514  |  |         if (*outbytes == NULL) { | 
7515  |  |             Py_DECREF(substring);  | 
7516  |  |             return -1;  | 
7517  |  |         }  | 
7518  |  |         out = PyBytes_AS_STRING(*outbytes);  | 
7519  |  |     }  | 
7520  |  |     else { | 
7521  |  |         /* Extend string object */  | 
7522  |  |         const Py_ssize_t n = PyBytes_Size(*outbytes);  | 
7523  |  |         if (outsize > PY_SSIZE_T_MAX - n) { | 
7524  |  |             PyErr_NoMemory();  | 
7525  |  |             Py_DECREF(substring);  | 
7526  |  |             return -1;  | 
7527  |  |         }  | 
7528  |  |         if (_PyBytes_Resize(outbytes, n + outsize) < 0) { | 
7529  |  |             Py_DECREF(substring);  | 
7530  |  |             return -1;  | 
7531  |  |         }  | 
7532  |  |         out = PyBytes_AS_STRING(*outbytes) + n;  | 
7533  |  |     }  | 
7534  |  |  | 
7535  |  |     /* Do the conversion */  | 
7536  |  |     outsize = WideCharToMultiByte(code_page, flags,  | 
7537  |  |                                   p, (int)size,  | 
7538  |  |                                   out, outsize,  | 
7539  |  |                                   NULL, pusedDefaultChar);  | 
7540  |  |     Py_CLEAR(substring);  | 
7541  |  |     if (outsize <= 0)  | 
7542  |  |         goto error;  | 
7543  |  |     if (pusedDefaultChar && *pusedDefaultChar)  | 
7544  |  |         return -2;  | 
7545  |  |     return 0;  | 
7546  |  |  | 
7547  |  | error:  | 
7548  |  |     Py_XDECREF(substring);  | 
7549  |  |     if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)  | 
7550  |  |         return -2;  | 
7551  |  |     PyErr_SetFromWindowsErr(0);  | 
7552  |  |     return -1;  | 
7553  |  | }  | 
7554  |  |  | 
7555  |  | /*  | 
7556  |  |  * Encode a Unicode string to a Windows code page into a byte string using an  | 
7557  |  |  * error handler.  | 
7558  |  |  *  | 
7559  |  |  * Returns consumed characters if succeed, or raise an OSError and returns  | 
7560  |  |  * -1 on other error.  | 
7561  |  |  */  | 
7562  |  | static int  | 
7563  |  | encode_code_page_errors(UINT code_page, PyObject **outbytes,  | 
7564  |  |                         PyObject *unicode, Py_ssize_t unicode_offset,  | 
7565  |  |                         Py_ssize_t insize, const char* errors)  | 
7566  |  | { | 
7567  |  |     const DWORD flags = encode_code_page_flags(code_page, errors);  | 
7568  |  |     Py_ssize_t pos = unicode_offset;  | 
7569  |  |     Py_ssize_t endin = unicode_offset + insize;  | 
7570  |  |     /* Ideally, we should get reason from FormatMessage. This is the Windows  | 
7571  |  |        2000 English version of the message. */  | 
7572  |  |     const char *reason = "invalid character";  | 
7573  |  |     /* 4=maximum length of a UTF-8 sequence */  | 
7574  |  |     char buffer[4];  | 
7575  |  |     BOOL usedDefaultChar = FALSE, *pusedDefaultChar;  | 
7576  |  |     Py_ssize_t outsize;  | 
7577  |  |     char *out;  | 
7578  |  |     PyObject *errorHandler = NULL;  | 
7579  |  |     PyObject *exc = NULL;  | 
7580  |  |     PyObject *encoding_obj = NULL;  | 
7581  |  |     const char *encoding;  | 
7582  |  |     Py_ssize_t newpos, newoutsize;  | 
7583  |  |     PyObject *rep;  | 
7584  |  |     int ret = -1;  | 
7585  |  |  | 
7586  |  |     assert(insize > 0);  | 
7587  |  |  | 
7588  |  |     encoding = code_page_name(code_page, &encoding_obj);  | 
7589  |  |     if (encoding == NULL)  | 
7590  |  |         return -1;  | 
7591  |  |  | 
7592  |  |     if (errors == NULL || strcmp(errors, "strict") == 0) { | 
7593  |  |         /* The last error was ERROR_NO_UNICODE_TRANSLATION,  | 
7594  |  |            then we raise a UnicodeEncodeError. */  | 
7595  |  |         make_encode_exception(&exc, encoding, unicode, 0, 0, reason);  | 
7596  |  |         if (exc != NULL) { | 
7597  |  |             PyCodec_StrictErrors(exc);  | 
7598  |  |             Py_DECREF(exc);  | 
7599  |  |         }  | 
7600  |  |         Py_XDECREF(encoding_obj);  | 
7601  |  |         return -1;  | 
7602  |  |     }  | 
7603  |  |  | 
7604  |  |     if (code_page != CP_UTF8 && code_page != CP_UTF7)  | 
7605  |  |         pusedDefaultChar = &usedDefaultChar;  | 
7606  |  |     else  | 
7607  |  |         pusedDefaultChar = NULL;  | 
7608  |  |  | 
7609  |  |     if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) { | 
7610  |  |         PyErr_NoMemory();  | 
7611  |  |         goto error;  | 
7612  |  |     }  | 
7613  |  |     outsize = insize * Py_ARRAY_LENGTH(buffer);  | 
7614  |  |  | 
7615  |  |     if (*outbytes == NULL) { | 
7616  |  |         /* Create string object */  | 
7617  |  |         *outbytes = PyBytes_FromStringAndSize(NULL, outsize);  | 
7618  |  |         if (*outbytes == NULL)  | 
7619  |  |             goto error;  | 
7620  |  |         out = PyBytes_AS_STRING(*outbytes);  | 
7621  |  |     }  | 
7622  |  |     else { | 
7623  |  |         /* Extend string object */  | 
7624  |  |         Py_ssize_t n = PyBytes_Size(*outbytes);  | 
7625  |  |         if (n > PY_SSIZE_T_MAX - outsize) { | 
7626  |  |             PyErr_NoMemory();  | 
7627  |  |             goto error;  | 
7628  |  |         }  | 
7629  |  |         if (_PyBytes_Resize(outbytes, n + outsize) < 0)  | 
7630  |  |             goto error;  | 
7631  |  |         out = PyBytes_AS_STRING(*outbytes) + n;  | 
7632  |  |     }  | 
7633  |  |  | 
7634  |  |     /* Encode the string character per character */  | 
7635  |  |     while (pos < endin)  | 
7636  |  |     { | 
7637  |  |         Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);  | 
7638  |  |         wchar_t chars[2];  | 
7639  |  |         int charsize;  | 
7640  |  |         if (ch < 0x10000) { | 
7641  |  |             chars[0] = (wchar_t)ch;  | 
7642  |  |             charsize = 1;  | 
7643  |  |         }  | 
7644  |  |         else { | 
7645  |  |             chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);  | 
7646  |  |             chars[1] = Py_UNICODE_LOW_SURROGATE(ch);  | 
7647  |  |             charsize = 2;  | 
7648  |  |         }  | 
7649  |  |  | 
7650  |  |         outsize = WideCharToMultiByte(code_page, flags,  | 
7651  |  |                                       chars, charsize,  | 
7652  |  |                                       buffer, Py_ARRAY_LENGTH(buffer),  | 
7653  |  |                                       NULL, pusedDefaultChar);  | 
7654  |  |         if (outsize > 0) { | 
7655  |  |             if (pusedDefaultChar == NULL || !(*pusedDefaultChar))  | 
7656  |  |             { | 
7657  |  |                 pos++;  | 
7658  |  |                 memcpy(out, buffer, outsize);  | 
7659  |  |                 out += outsize;  | 
7660  |  |                 continue;  | 
7661  |  |             }  | 
7662  |  |         }  | 
7663  |  |         else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) { | 
7664  |  |             PyErr_SetFromWindowsErr(0);  | 
7665  |  |             goto error;  | 
7666  |  |         }  | 
7667  |  |  | 
7668  |  |         rep = unicode_encode_call_errorhandler(  | 
7669  |  |                   errors, &errorHandler, encoding, reason,  | 
7670  |  |                   unicode, &exc,  | 
7671  |  |                   pos, pos + 1, &newpos);  | 
7672  |  |         if (rep == NULL)  | 
7673  |  |             goto error;  | 
7674  |  |         pos = newpos;  | 
7675  |  |  | 
7676  |  |         if (PyBytes_Check(rep)) { | 
7677  |  |             outsize = PyBytes_GET_SIZE(rep);  | 
7678  |  |             if (outsize != 1) { | 
7679  |  |                 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);  | 
7680  |  |                 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);  | 
7681  |  |                 if (_PyBytes_Resize(outbytes, newoutsize) < 0) { | 
7682  |  |                     Py_DECREF(rep);  | 
7683  |  |                     goto error;  | 
7684  |  |                 }  | 
7685  |  |                 out = PyBytes_AS_STRING(*outbytes) + offset;  | 
7686  |  |             }  | 
7687  |  |             memcpy(out, PyBytes_AS_STRING(rep), outsize);  | 
7688  |  |             out += outsize;  | 
7689  |  |         }  | 
7690  |  |         else { | 
7691  |  |             Py_ssize_t i;  | 
7692  |  |             enum PyUnicode_Kind kind;  | 
7693  |  |             void *data;  | 
7694  |  |  | 
7695  |  |             if (PyUnicode_READY(rep) == -1) { | 
7696  |  |                 Py_DECREF(rep);  | 
7697  |  |                 goto error;  | 
7698  |  |             }  | 
7699  |  |  | 
7700  |  |             outsize = PyUnicode_GET_LENGTH(rep);  | 
7701  |  |             if (outsize != 1) { | 
7702  |  |                 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);  | 
7703  |  |                 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);  | 
7704  |  |                 if (_PyBytes_Resize(outbytes, newoutsize) < 0) { | 
7705  |  |                     Py_DECREF(rep);  | 
7706  |  |                     goto error;  | 
7707  |  |                 }  | 
7708  |  |                 out = PyBytes_AS_STRING(*outbytes) + offset;  | 
7709  |  |             }  | 
7710  |  |             kind = PyUnicode_KIND(rep);  | 
7711  |  |             data = PyUnicode_DATA(rep);  | 
7712  |  |             for (i=0; i < outsize; i++) { | 
7713  |  |                 Py_UCS4 ch = PyUnicode_READ(kind, data, i);  | 
7714  |  |                 if (ch > 127) { | 
7715  |  |                     raise_encode_exception(&exc,  | 
7716  |  |                         encoding, unicode,  | 
7717  |  |                         pos, pos + 1,  | 
7718  |  |                         "unable to encode error handler result to ASCII");  | 
7719  |  |                     Py_DECREF(rep);  | 
7720  |  |                     goto error;  | 
7721  |  |                 }  | 
7722  |  |                 *out = (unsigned char)ch;  | 
7723  |  |                 out++;  | 
7724  |  |             }  | 
7725  |  |         }  | 
7726  |  |         Py_DECREF(rep);  | 
7727  |  |     }  | 
7728  |  |     /* write a NUL byte */  | 
7729  |  |     *out = 0;  | 
7730  |  |     outsize = out - PyBytes_AS_STRING(*outbytes);  | 
7731  |  |     assert(outsize <= PyBytes_GET_SIZE(*outbytes));  | 
7732  |  |     if (_PyBytes_Resize(outbytes, outsize) < 0)  | 
7733  |  |         goto error;  | 
7734  |  |     ret = 0;  | 
7735  |  |  | 
7736  |  | error:  | 
7737  |  |     Py_XDECREF(encoding_obj);  | 
7738  |  |     Py_XDECREF(errorHandler);  | 
7739  |  |     Py_XDECREF(exc);  | 
7740  |  |     return ret;  | 
7741  |  | }  | 
7742  |  |  | 
7743  |  | static PyObject *  | 
7744  |  | encode_code_page(int code_page,  | 
7745  |  |                  PyObject *unicode,  | 
7746  |  |                  const char *errors)  | 
7747  |  | { | 
7748  |  |     Py_ssize_t len;  | 
7749  |  |     PyObject *outbytes = NULL;  | 
7750  |  |     Py_ssize_t offset;  | 
7751  |  |     int chunk_len, ret, done;  | 
7752  |  |  | 
7753  |  |     if (!PyUnicode_Check(unicode)) { | 
7754  |  |         PyErr_BadArgument();  | 
7755  |  |         return NULL;  | 
7756  |  |     }  | 
7757  |  |  | 
7758  |  |     if (PyUnicode_READY(unicode) == -1)  | 
7759  |  |         return NULL;  | 
7760  |  |     len = PyUnicode_GET_LENGTH(unicode);  | 
7761  |  |  | 
7762  |  |     if (code_page < 0) { | 
7763  |  |         PyErr_SetString(PyExc_ValueError, "invalid code page number");  | 
7764  |  |         return NULL;  | 
7765  |  |     }  | 
7766  |  |  | 
7767  |  |     if (len == 0)  | 
7768  |  |         return PyBytes_FromStringAndSize(NULL, 0);  | 
7769  |  |  | 
7770  |  |     offset = 0;  | 
7771  |  |     do  | 
7772  |  |     { | 
7773  |  | #ifdef NEED_RETRY  | 
7774  |  |         if (len > DECODING_CHUNK_SIZE) { | 
7775  |  |             chunk_len = DECODING_CHUNK_SIZE;  | 
7776  |  |             done = 0;  | 
7777  |  |         }  | 
7778  |  |         else  | 
7779  |  | #endif  | 
7780  |  |         { | 
7781  |  |             chunk_len = (int)len;  | 
7782  |  |             done = 1;  | 
7783  |  |         }  | 
7784  |  |  | 
7785  |  |         ret = encode_code_page_strict(code_page, &outbytes,  | 
7786  |  |                                       unicode, offset, chunk_len,  | 
7787  |  |                                       errors);  | 
7788  |  |         if (ret == -2)  | 
7789  |  |             ret = encode_code_page_errors(code_page, &outbytes,  | 
7790  |  |                                           unicode, offset,  | 
7791  |  |                                           chunk_len, errors);  | 
7792  |  |         if (ret < 0) { | 
7793  |  |             Py_XDECREF(outbytes);  | 
7794  |  |             return NULL;  | 
7795  |  |         }  | 
7796  |  |  | 
7797  |  |         offset += chunk_len;  | 
7798  |  |         len -= chunk_len;  | 
7799  |  |     } while (!done);  | 
7800  |  |  | 
7801  |  |     return outbytes;  | 
7802  |  | }  | 
7803  |  |  | 
7804  |  | PyObject *  | 
7805  |  | PyUnicode_EncodeMBCS(const Py_UNICODE *p,  | 
7806  |  |                      Py_ssize_t size,  | 
7807  |  |                      const char *errors)  | 
7808  |  | { | 
7809  |  |     PyObject *unicode, *res;  | 
7810  |  |     unicode = PyUnicode_FromWideChar(p, size);  | 
7811  |  |     if (unicode == NULL)  | 
7812  |  |         return NULL;  | 
7813  |  |     res = encode_code_page(CP_ACP, unicode, errors);  | 
7814  |  |     Py_DECREF(unicode);  | 
7815  |  |     return res;  | 
7816  |  | }  | 
7817  |  |  | 
7818  |  | PyObject *  | 
7819  |  | PyUnicode_EncodeCodePage(int code_page,  | 
7820  |  |                          PyObject *unicode,  | 
7821  |  |                          const char *errors)  | 
7822  |  | { | 
7823  |  |     return encode_code_page(code_page, unicode, errors);  | 
7824  |  | }  | 
7825  |  |  | 
7826  |  | PyObject *  | 
7827  |  | PyUnicode_AsMBCSString(PyObject *unicode)  | 
7828  |  | { | 
7829  |  |     return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);  | 
7830  |  | }  | 
7831  |  |  | 
7832  |  | #undef NEED_RETRY  | 
7833  |  |  | 
7834  |  | #endif /* MS_WINDOWS */  | 
7835  |  |  | 
7836  |  | /* --- Character Mapping Codec -------------------------------------------- */  | 
7837  |  |  | 
7838  |  | static int  | 
7839  |  | charmap_decode_string(const char *s,  | 
7840  |  |                       Py_ssize_t size,  | 
7841  |  |                       PyObject *mapping,  | 
7842  |  |                       const char *errors,  | 
7843  |  |                       _PyUnicodeWriter *writer)  | 
7844  | 0  | { | 
7845  | 0  |     const char *starts = s;  | 
7846  | 0  |     const char *e;  | 
7847  | 0  |     Py_ssize_t startinpos, endinpos;  | 
7848  | 0  |     PyObject *errorHandler = NULL, *exc = NULL;  | 
7849  | 0  |     Py_ssize_t maplen;  | 
7850  | 0  |     enum PyUnicode_Kind mapkind;  | 
7851  | 0  |     void *mapdata;  | 
7852  | 0  |     Py_UCS4 x;  | 
7853  | 0  |     unsigned char ch;  | 
7854  |  | 
  | 
7855  | 0  |     if (PyUnicode_READY(mapping) == -1)  | 
7856  | 0  |         return -1;  | 
7857  |  |  | 
7858  | 0  |     maplen = PyUnicode_GET_LENGTH(mapping);  | 
7859  | 0  |     mapdata = PyUnicode_DATA(mapping);  | 
7860  | 0  |     mapkind = PyUnicode_KIND(mapping);  | 
7861  |  | 
  | 
7862  | 0  |     e = s + size;  | 
7863  |  | 
  | 
7864  | 0  |     if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) { | 
7865  |  |         /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1  | 
7866  |  |          * is disabled in encoding aliases, latin1 is preferred because  | 
7867  |  |          * its implementation is faster. */  | 
7868  | 0  |         Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;  | 
7869  | 0  |         Py_UCS1 *outdata = (Py_UCS1 *)writer->data;  | 
7870  | 0  |         Py_UCS4 maxchar = writer->maxchar;  | 
7871  |  | 
  | 
7872  | 0  |         assert (writer->kind == PyUnicode_1BYTE_KIND);  | 
7873  | 0  |         while (s < e) { | 
7874  | 0  |             ch = *s;  | 
7875  | 0  |             x = mapdata_ucs1[ch];  | 
7876  | 0  |             if (x > maxchar) { | 
7877  | 0  |                 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)  | 
7878  | 0  |                     goto onError;  | 
7879  | 0  |                 maxchar = writer->maxchar;  | 
7880  | 0  |                 outdata = (Py_UCS1 *)writer->data;  | 
7881  | 0  |             }  | 
7882  | 0  |             outdata[writer->pos] = x;  | 
7883  | 0  |             writer->pos++;  | 
7884  | 0  |             ++s;  | 
7885  | 0  |         }  | 
7886  | 0  |         return 0;  | 
7887  | 0  |     }  | 
7888  |  |  | 
7889  | 0  |     while (s < e) { | 
7890  | 0  |         if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) { | 
7891  | 0  |             enum PyUnicode_Kind outkind = writer->kind;  | 
7892  | 0  |             Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;  | 
7893  | 0  |             if (outkind == PyUnicode_1BYTE_KIND) { | 
7894  | 0  |                 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;  | 
7895  | 0  |                 Py_UCS4 maxchar = writer->maxchar;  | 
7896  | 0  |                 while (s < e) { | 
7897  | 0  |                     ch = *s;  | 
7898  | 0  |                     x = mapdata_ucs2[ch];  | 
7899  | 0  |                     if (x > maxchar)  | 
7900  | 0  |                         goto Error;  | 
7901  | 0  |                     outdata[writer->pos] = x;  | 
7902  | 0  |                     writer->pos++;  | 
7903  | 0  |                     ++s;  | 
7904  | 0  |                 }  | 
7905  | 0  |                 break;  | 
7906  | 0  |             }  | 
7907  | 0  |             else if (outkind == PyUnicode_2BYTE_KIND) { | 
7908  | 0  |                 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;  | 
7909  | 0  |                 while (s < e) { | 
7910  | 0  |                     ch = *s;  | 
7911  | 0  |                     x = mapdata_ucs2[ch];  | 
7912  | 0  |                     if (x == 0xFFFE)  | 
7913  | 0  |                         goto Error;  | 
7914  | 0  |                     outdata[writer->pos] = x;  | 
7915  | 0  |                     writer->pos++;  | 
7916  | 0  |                     ++s;  | 
7917  | 0  |                 }  | 
7918  | 0  |                 break;  | 
7919  | 0  |             }  | 
7920  | 0  |         }  | 
7921  | 0  |         ch = *s;  | 
7922  |  | 
  | 
7923  | 0  |         if (ch < maplen)  | 
7924  | 0  |             x = PyUnicode_READ(mapkind, mapdata, ch);  | 
7925  | 0  |         else  | 
7926  | 0  |             x = 0xfffe; /* invalid value */  | 
7927  | 0  | Error:  | 
7928  | 0  |         if (x == 0xfffe)  | 
7929  | 0  |         { | 
7930  |  |             /* undefined mapping */  | 
7931  | 0  |             startinpos = s-starts;  | 
7932  | 0  |             endinpos = startinpos+1;  | 
7933  | 0  |             if (unicode_decode_call_errorhandler_writer(  | 
7934  | 0  |                     errors, &errorHandler,  | 
7935  | 0  |                     "charmap", "character maps to <undefined>",  | 
7936  | 0  |                     &starts, &e, &startinpos, &endinpos, &exc, &s,  | 
7937  | 0  |                     writer)) { | 
7938  | 0  |                 goto onError;  | 
7939  | 0  |             }  | 
7940  | 0  |             continue;  | 
7941  | 0  |         }  | 
7942  |  |  | 
7943  | 0  |         if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)  | 
7944  | 0  |             goto onError;  | 
7945  | 0  |         ++s;  | 
7946  | 0  |     }  | 
7947  | 0  |     Py_XDECREF(errorHandler);  | 
7948  | 0  |     Py_XDECREF(exc);  | 
7949  | 0  |     return 0;  | 
7950  |  |  | 
7951  | 0  | onError:  | 
7952  | 0  |     Py_XDECREF(errorHandler);  | 
7953  | 0  |     Py_XDECREF(exc);  | 
7954  | 0  |     return -1;  | 
7955  | 0  | }  | 
7956  |  |  | 
7957  |  | static int  | 
7958  |  | charmap_decode_mapping(const char *s,  | 
7959  |  |                        Py_ssize_t size,  | 
7960  |  |                        PyObject *mapping,  | 
7961  |  |                        const char *errors,  | 
7962  |  |                        _PyUnicodeWriter *writer)  | 
7963  | 0  | { | 
7964  | 0  |     const char *starts = s;  | 
7965  | 0  |     const char *e;  | 
7966  | 0  |     Py_ssize_t startinpos, endinpos;  | 
7967  | 0  |     PyObject *errorHandler = NULL, *exc = NULL;  | 
7968  | 0  |     unsigned char ch;  | 
7969  | 0  |     PyObject *key, *item = NULL;  | 
7970  |  | 
  | 
7971  | 0  |     e = s + size;  | 
7972  |  | 
  | 
7973  | 0  |     while (s < e) { | 
7974  | 0  |         ch = *s;  | 
7975  |  |  | 
7976  |  |         /* Get mapping (char ordinal -> integer, Unicode char or None) */  | 
7977  | 0  |         key = PyLong_FromLong((long)ch);  | 
7978  | 0  |         if (key == NULL)  | 
7979  | 0  |             goto onError;  | 
7980  |  |  | 
7981  | 0  |         item = PyObject_GetItem(mapping, key);  | 
7982  | 0  |         Py_DECREF(key);  | 
7983  | 0  |         if (item == NULL) { | 
7984  | 0  |             if (PyErr_ExceptionMatches(PyExc_LookupError)) { | 
7985  |  |                 /* No mapping found means: mapping is undefined. */  | 
7986  | 0  |                 PyErr_Clear();  | 
7987  | 0  |                 goto Undefined;  | 
7988  | 0  |             } else  | 
7989  | 0  |                 goto onError;  | 
7990  | 0  |         }  | 
7991  |  |  | 
7992  |  |         /* Apply mapping */  | 
7993  | 0  |         if (item == Py_None)  | 
7994  | 0  |             goto Undefined;  | 
7995  | 0  |         if (PyLong_Check(item)) { | 
7996  | 0  |             long value = PyLong_AS_LONG(item);  | 
7997  | 0  |             if (value == 0xFFFE)  | 
7998  | 0  |                 goto Undefined;  | 
7999  | 0  |             if (value < 0 || value > MAX_UNICODE) { | 
8000  | 0  |                 PyErr_Format(PyExc_TypeError,  | 
8001  | 0  |                              "character mapping must be in range(0x%lx)",  | 
8002  | 0  |                              (unsigned long)MAX_UNICODE + 1);  | 
8003  | 0  |                 goto onError;  | 
8004  | 0  |             }  | 
8005  |  |  | 
8006  | 0  |             if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)  | 
8007  | 0  |                 goto onError;  | 
8008  | 0  |         }  | 
8009  | 0  |         else if (PyUnicode_Check(item)) { | 
8010  | 0  |             if (PyUnicode_READY(item) == -1)  | 
8011  | 0  |                 goto onError;  | 
8012  | 0  |             if (PyUnicode_GET_LENGTH(item) == 1) { | 
8013  | 0  |                 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);  | 
8014  | 0  |                 if (value == 0xFFFE)  | 
8015  | 0  |                     goto Undefined;  | 
8016  | 0  |                 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)  | 
8017  | 0  |                     goto onError;  | 
8018  | 0  |             }  | 
8019  | 0  |             else { | 
8020  | 0  |                 writer->overallocate = 1;  | 
8021  | 0  |                 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)  | 
8022  | 0  |                     goto onError;  | 
8023  | 0  |             }  | 
8024  | 0  |         }  | 
8025  | 0  |         else { | 
8026  |  |             /* wrong return value */  | 
8027  | 0  |             PyErr_SetString(PyExc_TypeError,  | 
8028  | 0  |                             "character mapping must return integer, None or str");  | 
8029  | 0  |             goto onError;  | 
8030  | 0  |         }  | 
8031  | 0  |         Py_CLEAR(item);  | 
8032  | 0  |         ++s;  | 
8033  | 0  |         continue;  | 
8034  |  |  | 
8035  | 0  | Undefined:  | 
8036  |  |         /* undefined mapping */  | 
8037  | 0  |         Py_CLEAR(item);  | 
8038  | 0  |         startinpos = s-starts;  | 
8039  | 0  |         endinpos = startinpos+1;  | 
8040  | 0  |         if (unicode_decode_call_errorhandler_writer(  | 
8041  | 0  |                 errors, &errorHandler,  | 
8042  | 0  |                 "charmap", "character maps to <undefined>",  | 
8043  | 0  |                 &starts, &e, &startinpos, &endinpos, &exc, &s,  | 
8044  | 0  |                 writer)) { | 
8045  | 0  |             goto onError;  | 
8046  | 0  |         }  | 
8047  | 0  |     }  | 
8048  | 0  |     Py_XDECREF(errorHandler);  | 
8049  | 0  |     Py_XDECREF(exc);  | 
8050  | 0  |     return 0;  | 
8051  |  |  | 
8052  | 0  | onError:  | 
8053  | 0  |     Py_XDECREF(item);  | 
8054  | 0  |     Py_XDECREF(errorHandler);  | 
8055  | 0  |     Py_XDECREF(exc);  | 
8056  | 0  |     return -1;  | 
8057  | 0  | }  | 
8058  |  |  | 
8059  |  | PyObject *  | 
8060  |  | PyUnicode_DecodeCharmap(const char *s,  | 
8061  |  |                         Py_ssize_t size,  | 
8062  |  |                         PyObject *mapping,  | 
8063  |  |                         const char *errors)  | 
8064  | 0  | { | 
8065  | 0  |     _PyUnicodeWriter writer;  | 
8066  |  |  | 
8067  |  |     /* Default to Latin-1 */  | 
8068  | 0  |     if (mapping == NULL)  | 
8069  | 0  |         return PyUnicode_DecodeLatin1(s, size, errors);  | 
8070  |  |  | 
8071  | 0  |     if (size == 0)  | 
8072  | 0  |         _Py_RETURN_UNICODE_EMPTY();  | 
8073  | 0  |     _PyUnicodeWriter_Init(&writer);  | 
8074  | 0  |     writer.min_length = size;  | 
8075  | 0  |     if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)  | 
8076  | 0  |         goto onError;  | 
8077  |  |  | 
8078  | 0  |     if (PyUnicode_CheckExact(mapping)) { | 
8079  | 0  |         if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)  | 
8080  | 0  |             goto onError;  | 
8081  | 0  |     }  | 
8082  | 0  |     else { | 
8083  | 0  |         if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)  | 
8084  | 0  |             goto onError;  | 
8085  | 0  |     }  | 
8086  | 0  |     return _PyUnicodeWriter_Finish(&writer);  | 
8087  |  |  | 
8088  | 0  |   onError:  | 
8089  | 0  |     _PyUnicodeWriter_Dealloc(&writer);  | 
8090  | 0  |     return NULL;  | 
8091  | 0  | }  | 
8092  |  |  | 
8093  |  | /* Charmap encoding: the lookup table */  | 
8094  |  |  | 
8095  |  | struct encoding_map { | 
8096  |  |     PyObject_HEAD  | 
8097  |  |     unsigned char level1[32];  | 
8098  |  |     int count2, count3;  | 
8099  |  |     unsigned char level23[1];  | 
8100  |  | };  | 
8101  |  |  | 
8102  |  | static PyObject*  | 
8103  |  | encoding_map_size(PyObject *obj, PyObject* args)  | 
8104  | 0  | { | 
8105  | 0  |     struct encoding_map *map = (struct encoding_map*)obj;  | 
8106  | 0  |     return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +  | 
8107  | 0  |                            128*map->count3);  | 
8108  | 0  | }  | 
8109  |  |  | 
8110  |  | static PyMethodDef encoding_map_methods[] = { | 
8111  |  |     {"size", encoding_map_size, METH_NOARGS, | 
8112  |  |      PyDoc_STR("Return the size (in bytes) of this object") }, | 
8113  |  |     { 0 } | 
8114  |  | };  | 
8115  |  |  | 
8116  |  | static PyTypeObject EncodingMapType = { | 
8117  |  |     PyVarObject_HEAD_INIT(NULL, 0)  | 
8118  |  |     "EncodingMap",          /*tp_name*/  | 
8119  |  |     sizeof(struct encoding_map),   /*tp_basicsize*/  | 
8120  |  |     0,                      /*tp_itemsize*/  | 
8121  |  |     /* methods */  | 
8122  |  |     0,                      /*tp_dealloc*/  | 
8123  |  |     0,                      /*tp_vectorcall_offset*/  | 
8124  |  |     0,                      /*tp_getattr*/  | 
8125  |  |     0,                      /*tp_setattr*/  | 
8126  |  |     0,                      /*tp_as_async*/  | 
8127  |  |     0,                      /*tp_repr*/  | 
8128  |  |     0,                      /*tp_as_number*/  | 
8129  |  |     0,                      /*tp_as_sequence*/  | 
8130  |  |     0,                      /*tp_as_mapping*/  | 
8131  |  |     0,                      /*tp_hash*/  | 
8132  |  |     0,                      /*tp_call*/  | 
8133  |  |     0,                      /*tp_str*/  | 
8134  |  |     0,                      /*tp_getattro*/  | 
8135  |  |     0,                      /*tp_setattro*/  | 
8136  |  |     0,                      /*tp_as_buffer*/  | 
8137  |  |     Py_TPFLAGS_DEFAULT,     /*tp_flags*/  | 
8138  |  |     0,                      /*tp_doc*/  | 
8139  |  |     0,                      /*tp_traverse*/  | 
8140  |  |     0,                      /*tp_clear*/  | 
8141  |  |     0,                      /*tp_richcompare*/  | 
8142  |  |     0,                      /*tp_weaklistoffset*/  | 
8143  |  |     0,                      /*tp_iter*/  | 
8144  |  |     0,                      /*tp_iternext*/  | 
8145  |  |     encoding_map_methods,   /*tp_methods*/  | 
8146  |  |     0,                      /*tp_members*/  | 
8147  |  |     0,                      /*tp_getset*/  | 
8148  |  |     0,                      /*tp_base*/  | 
8149  |  |     0,                      /*tp_dict*/  | 
8150  |  |     0,                      /*tp_descr_get*/  | 
8151  |  |     0,                      /*tp_descr_set*/  | 
8152  |  |     0,                      /*tp_dictoffset*/  | 
8153  |  |     0,                      /*tp_init*/  | 
8154  |  |     0,                      /*tp_alloc*/  | 
8155  |  |     0,                      /*tp_new*/  | 
8156  |  |     0,                      /*tp_free*/  | 
8157  |  |     0,                      /*tp_is_gc*/  | 
8158  |  | };  | 
8159  |  |  | 
8160  |  | PyObject*  | 
8161  |  | PyUnicode_BuildEncodingMap(PyObject* string)  | 
8162  | 0  | { | 
8163  | 0  |     PyObject *result;  | 
8164  | 0  |     struct encoding_map *mresult;  | 
8165  | 0  |     int i;  | 
8166  | 0  |     int need_dict = 0;  | 
8167  | 0  |     unsigned char level1[32];  | 
8168  | 0  |     unsigned char level2[512];  | 
8169  | 0  |     unsigned char *mlevel1, *mlevel2, *mlevel3;  | 
8170  | 0  |     int count2 = 0, count3 = 0;  | 
8171  | 0  |     int kind;  | 
8172  | 0  |     void *data;  | 
8173  | 0  |     Py_ssize_t length;  | 
8174  | 0  |     Py_UCS4 ch;  | 
8175  |  | 
  | 
8176  | 0  |     if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) { | 
8177  | 0  |         PyErr_BadArgument();  | 
8178  | 0  |         return NULL;  | 
8179  | 0  |     }  | 
8180  | 0  |     kind = PyUnicode_KIND(string);  | 
8181  | 0  |     data = PyUnicode_DATA(string);  | 
8182  | 0  |     length = PyUnicode_GET_LENGTH(string);  | 
8183  | 0  |     length = Py_MIN(length, 256);  | 
8184  | 0  |     memset(level1, 0xFF, sizeof level1);  | 
8185  | 0  |     memset(level2, 0xFF, sizeof level2);  | 
8186  |  |  | 
8187  |  |     /* If there isn't a one-to-one mapping of NULL to \0,  | 
8188  |  |        or if there are non-BMP characters, we need to use  | 
8189  |  |        a mapping dictionary. */  | 
8190  | 0  |     if (PyUnicode_READ(kind, data, 0) != 0)  | 
8191  | 0  |         need_dict = 1;  | 
8192  | 0  |     for (i = 1; i < length; i++) { | 
8193  | 0  |         int l1, l2;  | 
8194  | 0  |         ch = PyUnicode_READ(kind, data, i);  | 
8195  | 0  |         if (ch == 0 || ch > 0xFFFF) { | 
8196  | 0  |             need_dict = 1;  | 
8197  | 0  |             break;  | 
8198  | 0  |         }  | 
8199  | 0  |         if (ch == 0xFFFE)  | 
8200  |  |             /* unmapped character */  | 
8201  | 0  |             continue;  | 
8202  | 0  |         l1 = ch >> 11;  | 
8203  | 0  |         l2 = ch >> 7;  | 
8204  | 0  |         if (level1[l1] == 0xFF)  | 
8205  | 0  |             level1[l1] = count2++;  | 
8206  | 0  |         if (level2[l2] == 0xFF)  | 
8207  | 0  |             level2[l2] = count3++;  | 
8208  | 0  |     }  | 
8209  |  | 
  | 
8210  | 0  |     if (count2 >= 0xFF || count3 >= 0xFF)  | 
8211  | 0  |         need_dict = 1;  | 
8212  |  | 
  | 
8213  | 0  |     if (need_dict) { | 
8214  | 0  |         PyObject *result = PyDict_New();  | 
8215  | 0  |         PyObject *key, *value;  | 
8216  | 0  |         if (!result)  | 
8217  | 0  |             return NULL;  | 
8218  | 0  |         for (i = 0; i < length; i++) { | 
8219  | 0  |             key = PyLong_FromLong(PyUnicode_READ(kind, data, i));  | 
8220  | 0  |             value = PyLong_FromLong(i);  | 
8221  | 0  |             if (!key || !value)  | 
8222  | 0  |                 goto failed1;  | 
8223  | 0  |             if (PyDict_SetItem(result, key, value) == -1)  | 
8224  | 0  |                 goto failed1;  | 
8225  | 0  |             Py_DECREF(key);  | 
8226  | 0  |             Py_DECREF(value);  | 
8227  | 0  |         }  | 
8228  | 0  |         return result;  | 
8229  | 0  |       failed1:  | 
8230  | 0  |         Py_XDECREF(key);  | 
8231  | 0  |         Py_XDECREF(value);  | 
8232  | 0  |         Py_DECREF(result);  | 
8233  | 0  |         return NULL;  | 
8234  | 0  |     }  | 
8235  |  |  | 
8236  |  |     /* Create a three-level trie */  | 
8237  | 0  |     result = PyObject_MALLOC(sizeof(struct encoding_map) +  | 
8238  | 0  |                              16*count2 + 128*count3 - 1);  | 
8239  | 0  |     if (!result)  | 
8240  | 0  |         return PyErr_NoMemory();  | 
8241  | 0  |     PyObject_Init(result, &EncodingMapType);  | 
8242  | 0  |     mresult = (struct encoding_map*)result;  | 
8243  | 0  |     mresult->count2 = count2;  | 
8244  | 0  |     mresult->count3 = count3;  | 
8245  | 0  |     mlevel1 = mresult->level1;  | 
8246  | 0  |     mlevel2 = mresult->level23;  | 
8247  | 0  |     mlevel3 = mresult->level23 + 16*count2;  | 
8248  | 0  |     memcpy(mlevel1, level1, 32);  | 
8249  | 0  |     memset(mlevel2, 0xFF, 16*count2);  | 
8250  | 0  |     memset(mlevel3, 0, 128*count3);  | 
8251  | 0  |     count3 = 0;  | 
8252  | 0  |     for (i = 1; i < length; i++) { | 
8253  | 0  |         int o1, o2, o3, i2, i3;  | 
8254  | 0  |         Py_UCS4 ch = PyUnicode_READ(kind, data, i);  | 
8255  | 0  |         if (ch == 0xFFFE)  | 
8256  |  |             /* unmapped character */  | 
8257  | 0  |             continue;  | 
8258  | 0  |         o1 = ch>>11;  | 
8259  | 0  |         o2 = (ch>>7) & 0xF;  | 
8260  | 0  |         i2 = 16*mlevel1[o1] + o2;  | 
8261  | 0  |         if (mlevel2[i2] == 0xFF)  | 
8262  | 0  |             mlevel2[i2] = count3++;  | 
8263  | 0  |         o3 = ch & 0x7F;  | 
8264  | 0  |         i3 = 128*mlevel2[i2] + o3;  | 
8265  | 0  |         mlevel3[i3] = i;  | 
8266  | 0  |     }  | 
8267  | 0  |     return result;  | 
8268  | 0  | }  | 
8269  |  |  | 
8270  |  | static int  | 
8271  |  | encoding_map_lookup(Py_UCS4 c, PyObject *mapping)  | 
8272  | 0  | { | 
8273  | 0  |     struct encoding_map *map = (struct encoding_map*)mapping;  | 
8274  | 0  |     int l1 = c>>11;  | 
8275  | 0  |     int l2 = (c>>7) & 0xF;  | 
8276  | 0  |     int l3 = c & 0x7F;  | 
8277  | 0  |     int i;  | 
8278  |  | 
  | 
8279  | 0  |     if (c > 0xFFFF)  | 
8280  | 0  |         return -1;  | 
8281  | 0  |     if (c == 0)  | 
8282  | 0  |         return 0;  | 
8283  |  |     /* level 1*/  | 
8284  | 0  |     i = map->level1[l1];  | 
8285  | 0  |     if (i == 0xFF) { | 
8286  | 0  |         return -1;  | 
8287  | 0  |     }  | 
8288  |  |     /* level 2*/  | 
8289  | 0  |     i = map->level23[16*i+l2];  | 
8290  | 0  |     if (i == 0xFF) { | 
8291  | 0  |         return -1;  | 
8292  | 0  |     }  | 
8293  |  |     /* level 3 */  | 
8294  | 0  |     i = map->level23[16*map->count2 + 128*i + l3];  | 
8295  | 0  |     if (i == 0) { | 
8296  | 0  |         return -1;  | 
8297  | 0  |     }  | 
8298  | 0  |     return i;  | 
8299  | 0  | }  | 
8300  |  |  | 
8301  |  | /* Lookup the character ch in the mapping. If the character  | 
8302  |  |    can't be found, Py_None is returned (or NULL, if another  | 
8303  |  |    error occurred). */  | 
8304  |  | static PyObject *  | 
8305  |  | charmapencode_lookup(Py_UCS4 c, PyObject *mapping)  | 
8306  | 0  | { | 
8307  | 0  |     PyObject *w = PyLong_FromLong((long)c);  | 
8308  | 0  |     PyObject *x;  | 
8309  |  | 
  | 
8310  | 0  |     if (w == NULL)  | 
8311  | 0  |         return NULL;  | 
8312  | 0  |     x = PyObject_GetItem(mapping, w);  | 
8313  | 0  |     Py_DECREF(w);  | 
8314  | 0  |     if (x == NULL) { | 
8315  | 0  |         if (PyErr_ExceptionMatches(PyExc_LookupError)) { | 
8316  |  |             /* No mapping found means: mapping is undefined. */  | 
8317  | 0  |             PyErr_Clear();  | 
8318  | 0  |             Py_RETURN_NONE;  | 
8319  | 0  |         } else  | 
8320  | 0  |             return NULL;  | 
8321  | 0  |     }  | 
8322  | 0  |     else if (x == Py_None)  | 
8323  | 0  |         return x;  | 
8324  | 0  |     else if (PyLong_Check(x)) { | 
8325  | 0  |         long value = PyLong_AS_LONG(x);  | 
8326  | 0  |         if (value < 0 || value > 255) { | 
8327  | 0  |             PyErr_SetString(PyExc_TypeError,  | 
8328  | 0  |                             "character mapping must be in range(256)");  | 
8329  | 0  |             Py_DECREF(x);  | 
8330  | 0  |             return NULL;  | 
8331  | 0  |         }  | 
8332  | 0  |         return x;  | 
8333  | 0  |     }  | 
8334  | 0  |     else if (PyBytes_Check(x))  | 
8335  | 0  |         return x;  | 
8336  | 0  |     else { | 
8337  |  |         /* wrong return value */  | 
8338  | 0  |         PyErr_Format(PyExc_TypeError,  | 
8339  | 0  |                      "character mapping must return integer, bytes or None, not %.400s",  | 
8340  | 0  |                      x->ob_type->tp_name);  | 
8341  | 0  |         Py_DECREF(x);  | 
8342  | 0  |         return NULL;  | 
8343  | 0  |     }  | 
8344  | 0  | }  | 
8345  |  |  | 
8346  |  | static int  | 
8347  |  | charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)  | 
8348  | 0  | { | 
8349  | 0  |     Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);  | 
8350  |  |     /* exponentially overallocate to minimize reallocations */  | 
8351  | 0  |     if (requiredsize < 2*outsize)  | 
8352  | 0  |         requiredsize = 2*outsize;  | 
8353  | 0  |     if (_PyBytes_Resize(outobj, requiredsize))  | 
8354  | 0  |         return -1;  | 
8355  | 0  |     return 0;  | 
8356  | 0  | }  | 
8357  |  |  | 
/* Outcome of trying to write one character through a character map. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
8361  |  | /* lookup the character, put the result in the output string and adjust  | 
8362  |  |    various state variables. Resize the output bytes object if not enough  | 
8363  |  |    space is available. Return a new reference to the object that  | 
8364  |  |    was put in the output buffer, or Py_None, if the mapping was undefined  | 
8365  |  |    (in which case no character was written) or NULL, if a  | 
8366  |  |    reallocation error occurred. The caller must decref the result */  | 
8367  |  | static charmapencode_result  | 
8368  |  | charmapencode_output(Py_UCS4 c, PyObject *mapping,  | 
8369  |  |                      PyObject **outobj, Py_ssize_t *outpos)  | 
8370  | 0  | { | 
8371  | 0  |     PyObject *rep;  | 
8372  | 0  |     char *outstart;  | 
8373  | 0  |     Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);  | 
8374  |  | 
  | 
8375  | 0  |     if (Py_TYPE(mapping) == &EncodingMapType) { | 
8376  | 0  |         int res = encoding_map_lookup(c, mapping);  | 
8377  | 0  |         Py_ssize_t requiredsize = *outpos+1;  | 
8378  | 0  |         if (res == -1)  | 
8379  | 0  |             return enc_FAILED;  | 
8380  | 0  |         if (outsize<requiredsize)  | 
8381  | 0  |             if (charmapencode_resize(outobj, outpos, requiredsize))  | 
8382  | 0  |                 return enc_EXCEPTION;  | 
8383  | 0  |         outstart = PyBytes_AS_STRING(*outobj);  | 
8384  | 0  |         outstart[(*outpos)++] = (char)res;  | 
8385  | 0  |         return enc_SUCCESS;  | 
8386  | 0  |     }  | 
8387  |  |  | 
8388  | 0  |     rep = charmapencode_lookup(c, mapping);  | 
8389  | 0  |     if (rep==NULL)  | 
8390  | 0  |         return enc_EXCEPTION;  | 
8391  | 0  |     else if (rep==Py_None) { | 
8392  | 0  |         Py_DECREF(rep);  | 
8393  | 0  |         return enc_FAILED;  | 
8394  | 0  |     } else { | 
8395  | 0  |         if (PyLong_Check(rep)) { | 
8396  | 0  |             Py_ssize_t requiredsize = *outpos+1;  | 
8397  | 0  |             if (outsize<requiredsize)  | 
8398  | 0  |                 if (charmapencode_resize(outobj, outpos, requiredsize)) { | 
8399  | 0  |                     Py_DECREF(rep);  | 
8400  | 0  |                     return enc_EXCEPTION;  | 
8401  | 0  |                 }  | 
8402  | 0  |             outstart = PyBytes_AS_STRING(*outobj);  | 
8403  | 0  |             outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);  | 
8404  | 0  |         }  | 
8405  | 0  |         else { | 
8406  | 0  |             const char *repchars = PyBytes_AS_STRING(rep);  | 
8407  | 0  |             Py_ssize_t repsize = PyBytes_GET_SIZE(rep);  | 
8408  | 0  |             Py_ssize_t requiredsize = *outpos+repsize;  | 
8409  | 0  |             if (outsize<requiredsize)  | 
8410  | 0  |                 if (charmapencode_resize(outobj, outpos, requiredsize)) { | 
8411  | 0  |                     Py_DECREF(rep);  | 
8412  | 0  |                     return enc_EXCEPTION;  | 
8413  | 0  |                 }  | 
8414  | 0  |             outstart = PyBytes_AS_STRING(*outobj);  | 
8415  | 0  |             memcpy(outstart + *outpos, repchars, repsize);  | 
8416  | 0  |             *outpos += repsize;  | 
8417  | 0  |         }  | 
8418  | 0  |     }  | 
8419  | 0  |     Py_DECREF(rep);  | 
8420  | 0  |     return enc_SUCCESS;  | 
8421  | 0  | }  | 
8422  |  |  | 
/* Handle an unencodable character in _PyUnicode_EncodeCharmap().

   unicode            the string being encoded
   inpos              in: index of the first unencodable character;
                      out: index at which encoding should resume
   mapping            the character map in use
   exceptionObject    exception object handed to raise_encode_exception()
                      and to the error handler callback
   error_handler      cached handler kind; resolved from errors on the
                      first call (while it is _Py_ERROR_UNKNOWN)
   error_handler_obj  passed through to
                      unicode_encode_call_errorhandler()
   errors             name of the error handling scheme
   res, respos        output bytes object and current write offset;
                      both are updated when replacement output is written

   Return 0 on success, -1 on error. */
static int
charmap_encoding_error(
    PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
    PyObject **exceptionObject,
    _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
    PyObject **res, Py_ssize_t *respos)
{
    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
    Py_ssize_t size, repsize;
    Py_ssize_t newpos;
    enum PyUnicode_Kind kind;
    void *data;
    Py_ssize_t index;
    /* startpos for collecting unencodable chars */
    Py_ssize_t collstartpos = *inpos;
    Py_ssize_t collendpos = *inpos+1;
    Py_ssize_t collpos;
    const char *encoding = "charmap";
    const char *reason = "character maps to <undefined>";
    charmapencode_result x;
    Py_UCS4 ch;
    int val;

    if (PyUnicode_READY(unicode) == -1)
        return -1;
    size = PyUnicode_GET_LENGTH(unicode);
    /* find all unencodable characters: extend collendpos over the run of
       consecutive characters that the mapping cannot encode, so that the
       whole run [collstartpos, collendpos) is handled at once */
    while (collendpos < size) {
        PyObject *rep;
        if (Py_TYPE(mapping) == &EncodingMapType) {
            /* EncodingMap: -1 means the character is unmapped */
            ch = PyUnicode_READ_CHAR(unicode, collendpos);
            val = encoding_map_lookup(ch, mapping);
            if (val != -1)
                break;
            ++collendpos;
            continue;
        }

        /* generic mapping: Py_None means the character is unmapped */
        ch = PyUnicode_READ_CHAR(unicode, collendpos);
        rep = charmapencode_lookup(ch, mapping);
        if (rep==NULL)
            return -1;
        else if (rep!=Py_None) {
            Py_DECREF(rep);
            break;
        }
        Py_DECREF(rep);
        ++collendpos;
    }
    /* cache callback name lookup
     * (if not done yet, i.e. it's the first error) */
    if (*error_handler == _Py_ERROR_UNKNOWN)
        *error_handler = _Py_GetErrorHandler(errors);

    switch (*error_handler) {
    case _Py_ERROR_STRICT:
        /* report the whole unencodable run */
        raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
        return -1;

    case _Py_ERROR_REPLACE:
        /* emit one '?' per unencodable character; the '?' itself is
           encoded through the mapping, so it can fail as well */
        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
            x = charmapencode_output('?', mapping, res, respos);
            if (x==enc_EXCEPTION) {
                return -1;
            }
            else if (x==enc_FAILED) {
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                return -1;
            }
        }
        /* fall through */
    case _Py_ERROR_IGNORE:
        /* skip the unencodable run */
        *inpos = collendpos;
        break;

    case _Py_ERROR_XMLCHARREFREPLACE:
        /* generate an "&#NNN;" reference for each unencodable character
           and encode its characters one by one through the mapping */
        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
            /* room for "&#", the decimal digits, ";" and the NUL */
            char buffer[2+29+1+1];
            char *cp;
            sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
            for (cp = buffer; *cp; ++cp) {
                x = charmapencode_output(*cp, mapping, res, respos);
                if (x==enc_EXCEPTION)
                    return -1;
                else if (x==enc_FAILED) {
                    raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                    return -1;
                }
            }
        }
        *inpos = collendpos;
        break;

    default:
        /* any other scheme: call the error handler callback, which yields
           a replacement object and the position to resume at (newpos) */
        repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
                                                      encoding, reason, unicode, exceptionObject,
                                                      collstartpos, collendpos, &newpos);
        if (repunicode == NULL)
            return -1;
        if (PyBytes_Check(repunicode)) {
            /* Directly copy bytes result to output. */
            Py_ssize_t outsize = PyBytes_Size(*res);
            Py_ssize_t requiredsize;
            repsize = PyBytes_Size(repunicode);
            requiredsize = *respos + repsize;
            if (requiredsize > outsize)
                /* Make room for all additional bytes. */
                if (charmapencode_resize(res, respos, requiredsize)) {
                    Py_DECREF(repunicode);
                    return -1;
                }
            memcpy(PyBytes_AsString(*res) + *respos,
                   PyBytes_AsString(repunicode),  repsize);
            *respos += repsize;
            *inpos = newpos;
            Py_DECREF(repunicode);
            break;
        }
        /* generate replacement: the handler returned a str, so each of
           its characters is encoded through the mapping in turn */
        if (PyUnicode_READY(repunicode) == -1) {
            Py_DECREF(repunicode);
            return -1;
        }
        repsize = PyUnicode_GET_LENGTH(repunicode);
        data = PyUnicode_DATA(repunicode);
        kind = PyUnicode_KIND(repunicode);
        for (index = 0; index < repsize; index++) {
            Py_UCS4 repch = PyUnicode_READ(kind, data, index);
            x = charmapencode_output(repch, mapping, res, respos);
            if (x==enc_EXCEPTION) {
                Py_DECREF(repunicode);
                return -1;
            }
            else if (x==enc_FAILED) {
                /* the replacement text is itself unencodable */
                Py_DECREF(repunicode);
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                return -1;
            }
        }
        *inpos = newpos;
        Py_DECREF(repunicode);
    }
    return 0;
}
8570  |  |  | 
8571  |  | PyObject *  | 
8572  |  | _PyUnicode_EncodeCharmap(PyObject *unicode,  | 
8573  |  |                          PyObject *mapping,  | 
8574  |  |                          const char *errors)  | 
8575  | 0  | { | 
8576  |  |     /* output object */  | 
8577  | 0  |     PyObject *res = NULL;  | 
8578  |  |     /* current input position */  | 
8579  | 0  |     Py_ssize_t inpos = 0;  | 
8580  | 0  |     Py_ssize_t size;  | 
8581  |  |     /* current output position */  | 
8582  | 0  |     Py_ssize_t respos = 0;  | 
8583  | 0  |     PyObject *error_handler_obj = NULL;  | 
8584  | 0  |     PyObject *exc = NULL;  | 
8585  | 0  |     _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;  | 
8586  | 0  |     void *data;  | 
8587  | 0  |     int kind;  | 
8588  |  | 
  | 
8589  | 0  |     if (PyUnicode_READY(unicode) == -1)  | 
8590  | 0  |         return NULL;  | 
8591  | 0  |     size = PyUnicode_GET_LENGTH(unicode);  | 
8592  | 0  |     data = PyUnicode_DATA(unicode);  | 
8593  | 0  |     kind = PyUnicode_KIND(unicode);  | 
8594  |  |  | 
8595  |  |     /* Default to Latin-1 */  | 
8596  | 0  |     if (mapping == NULL)  | 
8597  | 0  |         return unicode_encode_ucs1(unicode, errors, 256);  | 
8598  |  |  | 
8599  |  |     /* allocate enough for a simple encoding without  | 
8600  |  |        replacements, if we need more, we'll resize */  | 
8601  | 0  |     res = PyBytes_FromStringAndSize(NULL, size);  | 
8602  | 0  |     if (res == NULL)  | 
8603  | 0  |         goto onError;  | 
8604  | 0  |     if (size == 0)  | 
8605  | 0  |         return res;  | 
8606  |  |  | 
8607  | 0  |     while (inpos<size) { | 
8608  | 0  |         Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);  | 
8609  |  |         /* try to encode it */  | 
8610  | 0  |         charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);  | 
8611  | 0  |         if (x==enc_EXCEPTION) /* error */  | 
8612  | 0  |             goto onError;  | 
8613  | 0  |         if (x==enc_FAILED) { /* unencodable character */ | 
8614  | 0  |             if (charmap_encoding_error(unicode, &inpos, mapping,  | 
8615  | 0  |                                        &exc,  | 
8616  | 0  |                                        &error_handler, &error_handler_obj, errors,  | 
8617  | 0  |                                        &res, &respos)) { | 
8618  | 0  |                 goto onError;  | 
8619  | 0  |             }  | 
8620  | 0  |         }  | 
8621  | 0  |         else  | 
8622  |  |             /* done with this character => adjust input position */  | 
8623  | 0  |             ++inpos;  | 
8624  | 0  |     }  | 
8625  |  |  | 
8626  |  |     /* Resize if we allocated to much */  | 
8627  | 0  |     if (respos<PyBytes_GET_SIZE(res))  | 
8628  | 0  |         if (_PyBytes_Resize(&res, respos) < 0)  | 
8629  | 0  |             goto onError;  | 
8630  |  |  | 
8631  | 0  |     Py_XDECREF(exc);  | 
8632  | 0  |     Py_XDECREF(error_handler_obj);  | 
8633  | 0  |     return res;  | 
8634  |  |  | 
8635  | 0  |   onError:  | 
8636  | 0  |     Py_XDECREF(res);  | 
8637  | 0  |     Py_XDECREF(exc);  | 
8638  | 0  |     Py_XDECREF(error_handler_obj);  | 
8639  | 0  |     return NULL;  | 
8640  | 0  | }  | 
8641  |  |  | 
8642  |  | /* Deprecated */  | 
8643  |  | PyObject *  | 
8644  |  | PyUnicode_EncodeCharmap(const Py_UNICODE *p,  | 
8645  |  |                         Py_ssize_t size,  | 
8646  |  |                         PyObject *mapping,  | 
8647  |  |                         const char *errors)  | 
8648  | 0  | { | 
8649  | 0  |     PyObject *result;  | 
8650  | 0  |     PyObject *unicode = PyUnicode_FromWideChar(p, size);  | 
8651  | 0  |     if (unicode == NULL)  | 
8652  | 0  |         return NULL;  | 
8653  | 0  |     result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);  | 
8654  | 0  |     Py_DECREF(unicode);  | 
8655  | 0  |     return result;  | 
8656  | 0  | }  | 
8657  |  |  | 
8658  |  | PyObject *  | 
8659  |  | PyUnicode_AsCharmapString(PyObject *unicode,  | 
8660  |  |                           PyObject *mapping)  | 
8661  | 0  | { | 
8662  | 0  |     if (!PyUnicode_Check(unicode) || mapping == NULL) { | 
8663  | 0  |         PyErr_BadArgument();  | 
8664  | 0  |         return NULL;  | 
8665  | 0  |     }  | 
8666  | 0  |     return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);  | 
8667  | 0  | }  | 
8668  |  |  | 
8669  |  | /* create or adjust a UnicodeTranslateError */  | 
8670  |  | static void  | 
8671  |  | make_translate_exception(PyObject **exceptionObject,  | 
8672  |  |                          PyObject *unicode,  | 
8673  |  |                          Py_ssize_t startpos, Py_ssize_t endpos,  | 
8674  |  |                          const char *reason)  | 
8675  | 0  | { | 
8676  | 0  |     if (*exceptionObject == NULL) { | 
8677  | 0  |         *exceptionObject = _PyUnicodeTranslateError_Create(  | 
8678  | 0  |             unicode, startpos, endpos, reason);  | 
8679  | 0  |     }  | 
8680  | 0  |     else { | 
8681  | 0  |         if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))  | 
8682  | 0  |             goto onError;  | 
8683  | 0  |         if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))  | 
8684  | 0  |             goto onError;  | 
8685  | 0  |         if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))  | 
8686  | 0  |             goto onError;  | 
8687  | 0  |         return;  | 
8688  | 0  |       onError:  | 
8689  | 0  |         Py_CLEAR(*exceptionObject);  | 
8690  | 0  |     }  | 
8691  | 0  | }  | 
8692  |  |  | 
8693  |  | /* error handling callback helper:  | 
8694  |  |    build arguments, call the callback and check the arguments,  | 
8695  |  |    put the result into newpos and return the replacement string, which  | 
8696  |  |    has to be freed by the caller */  | 
8697  |  | static PyObject *  | 
8698  |  | unicode_translate_call_errorhandler(const char *errors,  | 
8699  |  |                                     PyObject **errorHandler,  | 
8700  |  |                                     const char *reason,  | 
8701  |  |                                     PyObject *unicode, PyObject **exceptionObject,  | 
8702  |  |                                     Py_ssize_t startpos, Py_ssize_t endpos,  | 
8703  |  |                                     Py_ssize_t *newpos)  | 
8704  | 0  | { | 
8705  | 0  |     static const char *argparse = "Un;translating error handler must return (str, int) tuple";  | 
8706  |  | 
  | 
8707  | 0  |     Py_ssize_t i_newpos;  | 
8708  | 0  |     PyObject *restuple;  | 
8709  | 0  |     PyObject *resunicode;  | 
8710  |  | 
  | 
8711  | 0  |     if (*errorHandler == NULL) { | 
8712  | 0  |         *errorHandler = PyCodec_LookupError(errors);  | 
8713  | 0  |         if (*errorHandler == NULL)  | 
8714  | 0  |             return NULL;  | 
8715  | 0  |     }  | 
8716  |  |  | 
8717  | 0  |     make_translate_exception(exceptionObject,  | 
8718  | 0  |                              unicode, startpos, endpos, reason);  | 
8719  | 0  |     if (*exceptionObject == NULL)  | 
8720  | 0  |         return NULL;  | 
8721  |  |  | 
8722  | 0  |     restuple = PyObject_CallFunctionObjArgs(  | 
8723  | 0  |         *errorHandler, *exceptionObject, NULL);  | 
8724  | 0  |     if (restuple == NULL)  | 
8725  | 0  |         return NULL;  | 
8726  | 0  |     if (!PyTuple_Check(restuple)) { | 
8727  | 0  |         PyErr_SetString(PyExc_TypeError, &argparse[3]);  | 
8728  | 0  |         Py_DECREF(restuple);  | 
8729  | 0  |         return NULL;  | 
8730  | 0  |     }  | 
8731  | 0  |     if (!PyArg_ParseTuple(restuple, argparse,  | 
8732  | 0  |                           &resunicode, &i_newpos)) { | 
8733  | 0  |         Py_DECREF(restuple);  | 
8734  | 0  |         return NULL;  | 
8735  | 0  |     }  | 
8736  | 0  |     if (i_newpos<0)  | 
8737  | 0  |         *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;  | 
8738  | 0  |     else  | 
8739  | 0  |         *newpos = i_newpos;  | 
8740  | 0  |     if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) { | 
8741  | 0  |         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);  | 
8742  | 0  |         Py_DECREF(restuple);  | 
8743  | 0  |         return NULL;  | 
8744  | 0  |     }  | 
8745  | 0  |     Py_INCREF(resunicode);  | 
8746  | 0  |     Py_DECREF(restuple);  | 
8747  | 0  |     return resunicode;  | 
8748  | 0  | }  | 
8749  |  |  | 
8750  |  | /* Lookup the character ch in the mapping and put the result in result,  | 
8751  |  |    which must be decrefed by the caller.  | 
8752  |  |    Return 0 on success, -1 on error */  | 
8753  |  | static int  | 
8754  |  | charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)  | 
8755  | 95  | { | 
8756  | 95  |     PyObject *w = PyLong_FromLong((long)c);  | 
8757  | 95  |     PyObject *x;  | 
8758  |  |  | 
8759  | 95  |     if (w == NULL)  | 
8760  | 0  |         return -1;  | 
8761  | 95  |     x = PyObject_GetItem(mapping, w);  | 
8762  | 95  |     Py_DECREF(w);  | 
8763  | 95  |     if (x == NULL) { | 
8764  | 41  |         if (PyErr_ExceptionMatches(PyExc_LookupError)) { | 
8765  |  |             /* No mapping found means: use 1:1 mapping. */  | 
8766  | 41  |             PyErr_Clear();  | 
8767  | 41  |             *result = NULL;  | 
8768  | 41  |             return 0;  | 
8769  | 41  |         } else  | 
8770  | 0  |             return -1;  | 
8771  | 41  |     }  | 
8772  | 54  |     else if (x == Py_None) { | 
8773  | 0  |         *result = x;  | 
8774  | 0  |         return 0;  | 
8775  | 0  |     }  | 
8776  | 54  |     else if (PyLong_Check(x)) { | 
8777  | 0  |         long value = PyLong_AS_LONG(x);  | 
8778  | 0  |         if (value < 0 || value > MAX_UNICODE) { | 
8779  | 0  |             PyErr_Format(PyExc_ValueError,  | 
8780  | 0  |                          "character mapping must be in range(0x%x)",  | 
8781  | 0  |                          MAX_UNICODE+1);  | 
8782  | 0  |             Py_DECREF(x);  | 
8783  | 0  |             return -1;  | 
8784  | 0  |         }  | 
8785  | 0  |         *result = x;  | 
8786  | 0  |         return 0;  | 
8787  | 0  |     }  | 
8788  | 54  |     else if (PyUnicode_Check(x)) { | 
8789  | 54  |         *result = x;  | 
8790  | 54  |         return 0;  | 
8791  | 54  |     }  | 
8792  | 0  |     else { | 
8793  |  |         /* wrong return value */  | 
8794  | 0  |         PyErr_SetString(PyExc_TypeError,  | 
8795  | 0  |                         "character mapping must return integer, None or str");  | 
8796  | 0  |         Py_DECREF(x);  | 
8797  | 0  |         return -1;  | 
8798  | 0  |     }  | 
8799  | 95  | }  | 
8800  |  |  | 
8801  |  | /* lookup the character, write the result into the writer.  | 
8802  |  |    Return 1 if the result was written into the writer, return 0 if the mapping  | 
8803  |  |    was undefined, raise an exception return -1 on error. */  | 
8804  |  | static int  | 
8805  |  | charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,  | 
8806  |  |                         _PyUnicodeWriter *writer)  | 
8807  | 37  | { | 
8808  | 37  |     PyObject *item;  | 
8809  |  |  | 
8810  | 37  |     if (charmaptranslate_lookup(ch, mapping, &item))  | 
8811  | 0  |         return -1;  | 
8812  |  |  | 
8813  | 37  |     if (item == NULL) { | 
8814  |  |         /* not found => default to 1:1 mapping */  | 
8815  | 8  |         if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) { | 
8816  | 0  |             return -1;  | 
8817  | 0  |         }  | 
8818  | 8  |         return 1;  | 
8819  | 8  |     }  | 
8820  |  |  | 
8821  | 29  |     if (item == Py_None) { | 
8822  | 0  |         Py_DECREF(item);  | 
8823  | 0  |         return 0;  | 
8824  | 0  |     }  | 
8825  |  |  | 
8826  | 29  |     if (PyLong_Check(item)) { | 
8827  | 0  |         long ch = (Py_UCS4)PyLong_AS_LONG(item);  | 
8828  |  |         /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already  | 
8829  |  |            used it */  | 
8830  | 0  |         if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) { | 
8831  | 0  |             Py_DECREF(item);  | 
8832  | 0  |             return -1;  | 
8833  | 0  |         }  | 
8834  | 0  |         Py_DECREF(item);  | 
8835  | 0  |         return 1;  | 
8836  | 0  |     }  | 
8837  |  |  | 
8838  | 29  |     if (!PyUnicode_Check(item)) { | 
8839  | 0  |         Py_DECREF(item);  | 
8840  | 0  |         return -1;  | 
8841  | 0  |     }  | 
8842  |  |  | 
8843  | 29  |     if (_PyUnicodeWriter_WriteStr(writer, item) < 0) { | 
8844  | 0  |         Py_DECREF(item);  | 
8845  | 0  |         return -1;  | 
8846  | 0  |     }  | 
8847  |  |  | 
8848  | 29  |     Py_DECREF(item);  | 
8849  | 29  |     return 1;  | 
8850  | 29  | }  | 
8851  |  |  | 
8852  |  | static int  | 
8853  |  | unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,  | 
8854  |  |                               Py_UCS1 *translate)  | 
8855  | 58  | { | 
8856  | 58  |     PyObject *item = NULL;  | 
8857  | 58  |     int ret = 0;  | 
8858  |  |  | 
8859  | 58  |     if (charmaptranslate_lookup(ch, mapping, &item)) { | 
8860  | 0  |         return -1;  | 
8861  | 0  |     }  | 
8862  |  |  | 
8863  | 58  |     if (item == Py_None) { | 
8864  |  |         /* deletion */  | 
8865  | 0  |         translate[ch] = 0xfe;  | 
8866  | 0  |     }  | 
8867  | 58  |     else if (item == NULL) { | 
8868  |  |         /* not found => default to 1:1 mapping */  | 
8869  | 33  |         translate[ch] = ch;  | 
8870  | 33  |         return 1;  | 
8871  | 33  |     }  | 
8872  | 25  |     else if (PyLong_Check(item)) { | 
8873  | 0  |         long replace = PyLong_AS_LONG(item);  | 
8874  |  |         /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already  | 
8875  |  |            used it */  | 
8876  | 0  |         if (127 < replace) { | 
8877  |  |             /* invalid character or character outside ASCII:  | 
8878  |  |                skip the fast translate */  | 
8879  | 0  |             goto exit;  | 
8880  | 0  |         }  | 
8881  | 0  |         translate[ch] = (Py_UCS1)replace;  | 
8882  | 0  |     }  | 
8883  | 25  |     else if (PyUnicode_Check(item)) { | 
8884  | 25  |         Py_UCS4 replace;  | 
8885  |  |  | 
8886  | 25  |         if (PyUnicode_READY(item) == -1) { | 
8887  | 0  |             Py_DECREF(item);  | 
8888  | 0  |             return -1;  | 
8889  | 0  |         }  | 
8890  | 25  |         if (PyUnicode_GET_LENGTH(item) != 1)  | 
8891  | 25  |             goto exit;  | 
8892  |  |  | 
8893  | 0  |         replace = PyUnicode_READ_CHAR(item, 0);  | 
8894  | 0  |         if (replace > 127)  | 
8895  | 0  |             goto exit;  | 
8896  | 0  |         translate[ch] = (Py_UCS1)replace;  | 
8897  | 0  |     }  | 
8898  | 0  |     else { | 
8899  |  |         /* not None, NULL, long or unicode */  | 
8900  | 0  |         goto exit;  | 
8901  | 0  |     }  | 
8902  | 0  |     ret = 1;  | 
8903  |  | 
  | 
8904  | 25  |   exit:  | 
8905  | 25  |     Py_DECREF(item);  | 
8906  | 25  |     return ret;  | 
8907  | 0  | }  | 
8908  |  |  | 
8909  |  | /* Fast path for ascii => ascii translation. Return 1 if the whole string  | 
8910  |  |    was translated into writer, return 0 if the input string was partially  | 
8911  |  |    translated into writer, raise an exception and return -1 on error. */  | 
8912  |  | static int  | 
8913  |  | unicode_fast_translate(PyObject *input, PyObject *mapping,  | 
8914  |  |                        _PyUnicodeWriter *writer, int ignore,  | 
8915  |  |                        Py_ssize_t *input_pos)  | 
8916  | 48  | { | 
8917  | 48  |     Py_UCS1 ascii_table[128], ch, ch2;  | 
8918  | 48  |     Py_ssize_t len;  | 
8919  | 48  |     Py_UCS1 *in, *end, *out;  | 
8920  | 48  |     int res = 0;  | 
8921  |  |  | 
8922  | 48  |     len = PyUnicode_GET_LENGTH(input);  | 
8923  |  |     /* 0xff marks a table slot whose mapping has not been looked up yet (tested in the loop below) */  | 
8924  | 48  |     memset(ascii_table, 0xff, 128);  | 
8925  |  |  | 
8926  | 48  |     in = PyUnicode_1BYTE_DATA(input);  | 
8927  | 48  |     end = in + len;  | 
8928  |  |     /* the caller must have prepared an ASCII writer buffer of exactly len characters; output is stored straight into it */  | 
8929  | 48  |     assert(PyUnicode_IS_ASCII(writer->buffer));  | 
8930  | 48  |     assert(PyUnicode_GET_LENGTH(writer->buffer) == len);  | 
8931  | 48  |     out = PyUnicode_1BYTE_DATA(writer->buffer);  | 
8932  |  |  | 
8933  | 88  |     for (; in < end; in++) { | 
8934  | 65  |         ch = *in;  | 
8935  | 65  |         ch2 = ascii_table[ch];  | 
8936  | 65  |         if (ch2 == 0xff) { /* first time ch is seen: ask the mapping; the lookup records its answer in ascii_table */ | 
8937  | 58  |             int translate = unicode_fast_translate_lookup(mapping, ch,  | 
8938  | 58  |                                                           ascii_table);  | 
8939  | 58  |             if (translate < 0)  | 
8940  | 0  |                 return -1;  | 
8941  | 58  |             if (translate == 0)  /* lookup declined this mapping for the fast path: stop here with res == 0 */  | 
8942  | 25  |                 goto exit;  | 
8943  | 33  |             ch2 = ascii_table[ch];  | 
8944  | 33  |         }  | 
8945  | 40  |         if (ch2 == 0xfe) { /* presumably the marker the lookup stores for characters with no output - confirm in unicode_fast_translate_lookup(); skipped only when ignore is set */ | 
8946  | 0  |             if (ignore)  | 
8947  | 0  |                 continue;  | 
8948  | 0  |             goto exit;  | 
8949  | 0  |         }  | 
8950  | 40  |         assert(ch2 < 128);  | 
8951  | 40  |         *out = ch2;  | 
8952  | 40  |         out++;  | 
8953  | 40  |     }  | 
8954  | 23  |     res = 1;  | 
8955  |  |     /* both the 0 and the 1 return pass through here and publish how far output and input advanced; the -1 return above does not */  | 
8956  | 48  | exit:  | 
8957  | 48  |     writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);  | 
8958  | 48  |     *input_pos = in - PyUnicode_1BYTE_DATA(input);  | 
8959  | 48  |     return res;  | 
8960  | 23  | }  | 
8961  |  | /* Translate input through mapping. Returns NULL for a NULL mapping (after PyErr_BadArgument) or on failure; for empty input returns PyUnicode_FromObject(input); otherwise returns what _PyUnicodeWriter_Finish() builds. errors selects the handler for untranslatable runs; "ignore" drops them. */  | 
8962  |  | static PyObject *  | 
8963  |  | _PyUnicode_TranslateCharmap(PyObject *input,  | 
8964  |  |                             PyObject *mapping,  | 
8965  |  |                             const char *errors)  | 
8966  | 48  | { | 
8967  |  |     /* input object */  | 
8968  | 48  |     char *data;  | 
8969  | 48  |     Py_ssize_t size, i;  | 
8970  | 48  |     int kind;  | 
8971  |  |     /* output buffer */  | 
8972  | 48  |     _PyUnicodeWriter writer;  | 
8973  |  |     /* error handler */  | 
8974  | 48  |     const char *reason = "character maps to <undefined>";  | 
8975  | 48  |     PyObject *errorHandler = NULL;  | 
8976  | 48  |     PyObject *exc = NULL;  | 
8977  | 48  |     int ignore;  | 
8978  | 48  |     int res;  | 
8979  |  |  | 
8980  | 48  |     if (mapping == NULL) { | 
8981  | 0  |         PyErr_BadArgument();  | 
8982  | 0  |         return NULL;  | 
8983  | 0  |     }  | 
8984  |  |  | 
8985  | 48  |     if (PyUnicode_READY(input) == -1)  | 
8986  | 0  |         return NULL;  | 
8987  | 48  |     data = (char*)PyUnicode_DATA(input);  | 
8988  | 48  |     kind = PyUnicode_KIND(input);  | 
8989  | 48  |     size = PyUnicode_GET_LENGTH(input);  | 
8990  |  |  | 
8991  | 48  |     if (size == 0)  | 
8992  | 0  |         return PyUnicode_FromObject(input);  | 
8993  |  |  | 
8994  |  |     /* allocate enough for a simple 1:1 translation without  | 
8995  |  |        replacements, if we need more, we'll resize */  | 
8996  | 48  |     _PyUnicodeWriter_Init(&writer);  | 
8997  | 48  |     if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)  | 
8998  | 0  |         goto onError;  | 
8999  |  |  | 
9000  | 48  |     ignore = (errors != NULL && strcmp(errors, "ignore") == 0);  | 
9001  |  |     /* NOTE(review): input was already readied above, so this second check looks redundant; if it could fail, returning here would skip _PyUnicodeWriter_Dealloc(&writer) */  | 
9002  | 48  |     if (PyUnicode_READY(input) == -1)  | 
9003  | 0  |         return NULL;  | 
9004  | 48  |     if (PyUnicode_IS_ASCII(input)) { | 
9005  | 48  |         res = unicode_fast_translate(input, mapping, &writer, ignore, &i);  | 
9006  | 48  |         if (res < 0) { | 
9007  | 0  |             _PyUnicodeWriter_Dealloc(&writer);  | 
9008  | 0  |             return NULL;  | 
9009  | 0  |         }  | 
9010  | 48  |         if (res == 1)  | 
9011  | 23  |             return _PyUnicodeWriter_Finish(&writer);  | 
9012  | 48  |     }  | 
9013  | 0  |     else { | 
9014  | 0  |         i = 0;  | 
9015  | 0  |     }  | 
9016  |  |     /* general path: resume at index i, i.e. where the fast path stopped, or 0 for non-ASCII input */  | 
9017  | 62  |     while (i<size) { | 
9018  |  |         /* try to encode it */  | 
9019  | 37  |         int translate;  | 
9020  | 37  |         PyObject *repunicode = NULL; /* initialize to prevent gcc warning */  | 
9021  | 37  |         Py_ssize_t newpos;  | 
9022  |  |         /* startpos for collecting untranslatable chars */  | 
9023  | 37  |         Py_ssize_t collstart;  | 
9024  | 37  |         Py_ssize_t collend;  | 
9025  | 37  |         Py_UCS4 ch;  | 
9026  |  |  | 
9027  | 37  |         ch = PyUnicode_READ(kind, data, i);  | 
9028  | 37  |         translate = charmaptranslate_output(ch, mapping, &writer);  | 
9029  | 37  |         if (translate < 0)  | 
9030  | 0  |             goto onError;  | 
9031  |  |  | 
9032  | 37  |         if (translate != 0) { | 
9033  |  |             /* it worked => adjust input pointer */  | 
9034  | 37  |             ++i;  | 
9035  | 37  |             continue;  | 
9036  | 37  |         }  | 
9037  |  |  | 
9038  |  |         /* untranslatable character */  | 
9039  | 0  |         collstart = i;  | 
9040  | 0  |         collend = i+1;  | 
9041  |  |  | 
9042  |  |         /* find all untranslatable characters: extend collend while the lookup result is Py_None */  | 
9043  | 0  |         while (collend < size) { | 
9044  | 0  |             PyObject *x;  | 
9045  | 0  |             ch = PyUnicode_READ(kind, data, collend);  | 
9046  | 0  |             if (charmaptranslate_lookup(ch, mapping, &x))  | 
9047  | 0  |                 goto onError;  | 
9048  | 0  |             Py_XDECREF(x);  | 
9049  | 0  |             if (x != Py_None)  /* pointer comparison only; x is not dereferenced after the DECREF */  | 
9050  | 0  |                 break;  | 
9051  | 0  |             ++collend;  | 
9052  | 0  |         }  | 
9053  |  |         /* "ignore" drops the whole run; otherwise the error handler supplies replacement text and the index to resume at */  | 
9054  | 0  |         if (ignore) { | 
9055  | 0  |             i = collend;  | 
9056  | 0  |         }  | 
9057  | 0  |         else { | 
9058  | 0  |             repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,  | 
9059  | 0  |                                                              reason, input, &exc,  | 
9060  | 0  |                                                              collstart, collend, &newpos);  | 
9061  | 0  |             if (repunicode == NULL)  | 
9062  | 0  |                 goto onError;  | 
9063  | 0  |             if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) { | 
9064  | 0  |                 Py_DECREF(repunicode);  | 
9065  | 0  |                 goto onError;  | 
9066  | 0  |             }  | 
9067  | 0  |             Py_DECREF(repunicode);  | 
9068  | 0  |             i = newpos;  | 
9069  | 0  |         }  | 
9070  | 0  |     }  | 
9071  | 25  |     Py_XDECREF(exc);  | 
9072  | 25  |     Py_XDECREF(errorHandler);  | 
9073  | 25  |     return _PyUnicodeWriter_Finish(&writer);  | 
9074  |  |     /* failure exit: release the writer and any error-handler state acquired so far */  | 
9075  | 0  |   onError:  | 
9076  | 0  |     _PyUnicodeWriter_Dealloc(&writer);  | 
9077  | 0  |     Py_XDECREF(exc);  | 
9078  | 0  |     Py_XDECREF(errorHandler);  | 
9079  | 0  |     return NULL;  | 
9080  | 25  | }  | 
9081  |  |  | 
9082  |  | /* Deprecated. Use PyUnicode_Translate instead. Builds a temporary object from (p, size) with PyUnicode_FromWideChar(), translates it with _PyUnicode_TranslateCharmap() and releases the temporary; returns NULL if the temporary cannot be created. */  | 
9083  |  | PyObject *  | 
9084  |  | PyUnicode_TranslateCharmap(const Py_UNICODE *p,  | 
9085  |  |                            Py_ssize_t size,  | 
9086  |  |                            PyObject *mapping,  | 
9087  |  |                            const char *errors)  | 
9088  | 0  | { | 
9089  | 0  |     PyObject *result;  | 
9090  | 0  |     PyObject *unicode = PyUnicode_FromWideChar(p, size);  | 
9091  | 0  |     if (!unicode)  | 
9092  | 0  |         return NULL;  | 
9093  | 0  |     result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);  | 
9094  | 0  |     Py_DECREF(unicode);  | 
9095  | 0  |     return result;  | 
9096  | 0  | }  | 
9097  |  | /* Translate str through mapping: returns NULL if ensure_unicode(str) fails, otherwise the result of _PyUnicode_TranslateCharmap(str, mapping, errors). */  | 
9098  |  | PyObject *  | 
9099  |  | PyUnicode_Translate(PyObject *str,  | 
9100  |  |                     PyObject *mapping,  | 
9101  |  |                     const char *errors)  | 
9102  | 0  | { | 
9103  | 0  |     if (ensure_unicode(str) < 0)  | 
9104  | 0  |         return NULL;  | 
9105  | 0  |     return _PyUnicode_TranslateCharmap(str, mapping, errors);  | 
9106  | 0  | }  | 
9107  |  | /* Return an ASCII version of unicode: characters below 127 are copied, whitespace becomes ' ', decimal digits become '0'-'9'. An already-ASCII argument is returned itself with a new reference. At the first character that is none of these, '?' is stored and the result is truncated there. Returns NULL on a non-str argument or failure. */  | 
9108  |  | PyObject *  | 
9109  |  | _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)  | 
9110  | 0  | { | 
9111  | 0  |     if (!PyUnicode_Check(unicode)) { | 
9112  | 0  |         PyErr_BadInternalCall();  | 
9113  | 0  |         return NULL;  | 
9114  | 0  |     }  | 
9115  | 0  |     if (PyUnicode_READY(unicode) == -1)  | 
9116  | 0  |         return NULL;  | 
9117  | 0  |     if (PyUnicode_IS_ASCII(unicode)) { | 
9118  |  |         /* If the string is already ASCII, just return the same string */  | 
9119  | 0  |         Py_INCREF(unicode);  | 
9120  | 0  |         return unicode;  | 
9121  | 0  |     }  | 
9122  |  |     /* the result is allocated with the input's length and maxchar 127, then filled index by index */  | 
9123  | 0  |     Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);  | 
9124  | 0  |     PyObject *result = PyUnicode_New(len, 127);  | 
9125  | 0  |     if (result == NULL) { | 
9126  | 0  |         return NULL;  | 
9127  | 0  |     }  | 
9128  |  |  | 
9129  | 0  |     Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);  | 
9130  | 0  |     int kind = PyUnicode_KIND(unicode);  | 
9131  | 0  |     const void *data = PyUnicode_DATA(unicode);  | 
9132  | 0  |     Py_ssize_t i;  | 
9133  | 0  |     for (i = 0; i < len; ++i) { | 
9134  | 0  |         Py_UCS4 ch = PyUnicode_READ(kind, data, i);  | 
9135  | 0  |         if (ch < 127) { /* strict '<': code point 127 itself falls through to the checks below */ | 
9136  | 0  |             out[i] = ch;  | 
9137  | 0  |         }  | 
9138  | 0  |         else if (Py_UNICODE_ISSPACE(ch)) { | 
9139  | 0  |             out[i] = ' ';  | 
9140  | 0  |         }  | 
9141  | 0  |         else { | 
9142  | 0  |             int decimal = Py_UNICODE_TODECIMAL(ch);  | 
9143  | 0  |             if (decimal < 0) { /* not a decimal digit either: store '?', terminate, shrink the recorded length to i + 1 and stop */ | 
9144  | 0  |                 out[i] = '?';  | 
9145  | 0  |                 out[i+1] = '\0';  | 
9146  | 0  |                 _PyUnicode_LENGTH(result) = i + 1;  | 
9147  | 0  |                 break;  | 
9148  | 0  |             }  | 
9149  | 0  |             out[i] = '0' + decimal;  | 
9150  | 0  |         }  | 
9151  | 0  |     }  | 
9152  |  | 
  | 
9153  | 0  |     assert(_PyUnicode_CheckConsistency(result, 1));  | 
9154  | 0  |     return result;  | 
9155  | 0  | }  | 
9156  |  | /* Build a new string from the length units at s in which every character above 127 that has a decimal value is replaced by '0' + that value; all other characters are copied unchanged. Returns NULL if PyUnicode_New() fails, otherwise the value of unicode_result() on the new string. */  | 
9157  |  | PyObject *  | 
9158  |  | PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,  | 
9159  |  |                                   Py_ssize_t length)  | 
9160  | 0  | { | 
9161  | 0  |     PyObject *decimal;  | 
9162  | 0  |     Py_ssize_t i;  | 
9163  | 0  |     Py_UCS4 maxchar;  | 
9164  | 0  |     enum PyUnicode_Kind kind;  | 
9165  | 0  |     void *data;  | 
9166  |  | 
  | 
9167  | 0  |     maxchar = 127;  /* first pass: find the largest character the output will contain after substitution */  | 
9168  | 0  |     for (i = 0; i < length; i++) { | 
9169  | 0  |         Py_UCS4 ch = s[i];  | 
9170  | 0  |         if (ch > 127) { | 
9171  | 0  |             int decimal = Py_UNICODE_TODECIMAL(ch);  /* note: shadows the outer PyObject *decimal inside this block */  | 
9172  | 0  |             if (decimal >= 0)  | 
9173  | 0  |                 ch = '0' + decimal;  | 
9174  | 0  |             maxchar = Py_MAX(maxchar, ch);  | 
9175  | 0  |         }  | 
9176  | 0  |     }  | 
9177  |  |  | 
9178  |  |     /* Copy to a new string */  | 
9179  | 0  |     decimal = PyUnicode_New(length, maxchar);  | 
9180  | 0  |     if (decimal == NULL)  | 
9181  | 0  |         return decimal;  | 
9182  | 0  |     kind = PyUnicode_KIND(decimal);  | 
9183  | 0  |     data = PyUnicode_DATA(decimal);  | 
9184  |  |     /* Iterate over code points; second pass repeats the same substitution while writing */  | 
9185  | 0  |     for (i = 0; i < length; i++) { | 
9186  | 0  |         Py_UCS4 ch = s[i];  | 
9187  | 0  |         if (ch > 127) { | 
9188  | 0  |             int decimal = Py_UNICODE_TODECIMAL(ch);  | 
9189  | 0  |             if (decimal >= 0)  | 
9190  | 0  |                 ch = '0' + decimal;  | 
9191  | 0  |         }  | 
9192  | 0  |         PyUnicode_WRITE(kind, data, i, ch);  | 
9193  | 0  |     }  | 
9194  | 0  |     return unicode_result(decimal);  | 
9195  | 0  | }  | 
9196  |  | /* --- Decimal Encoder ---------------------------------------------------- */  | 
9197  |  | /* Encode the length units at s into the caller-supplied output buffer: whitespace becomes ' ', decimal digits become '0'-'9', any other non-zero character below 256 is stored as a single char; a terminating '\0' is appended. Returns 0 on success and -1 on failure. errors is not referenced in the body. NOTE(review): no output size is passed in, so the caller is assumed to supply at least length + 1 bytes - confirm against callers. */  | 
9198  |  | int  | 
9199  |  | PyUnicode_EncodeDecimal(Py_UNICODE *s,  | 
9200  |  |                         Py_ssize_t length,  | 
9201  |  |                         char *output,  | 
9202  |  |                         const char *errors)  | 
9203  | 0  | { | 
9204  | 0  |     PyObject *unicode;  | 
9205  | 0  |     Py_ssize_t i;  | 
9206  | 0  |     enum PyUnicode_Kind kind;  | 
9207  | 0  |     void *data;  | 
9208  |  | 
  | 
9209  | 0  |     if (output == NULL) { | 
9210  | 0  |         PyErr_BadArgument();  | 
9211  | 0  |         return -1;  | 
9212  | 0  |     }  | 
9213  |  |  | 
9214  | 0  |     unicode = PyUnicode_FromWideChar(s, length);  | 
9215  | 0  |     if (unicode == NULL)  | 
9216  | 0  |         return -1;  | 
9217  |  |  | 
9218  | 0  |     kind = PyUnicode_KIND(unicode);  | 
9219  | 0  |     data = PyUnicode_DATA(unicode);  | 
9220  |  | 
  | 
9221  | 0  |     for (i=0; i < length; ) { | 
9222  | 0  |         PyObject *exc;  | 
9223  | 0  |         Py_UCS4 ch;  | 
9224  | 0  |         int decimal;  | 
9225  | 0  |         Py_ssize_t startpos;  | 
9226  |  | 
  | 
9227  | 0  |         ch = PyUnicode_READ(kind, data, i);  | 
9228  |  | 
  | 
9229  | 0  |         if (Py_UNICODE_ISSPACE(ch)) { | 
9230  | 0  |             *output++ = ' ';  | 
9231  | 0  |             i++;  | 
9232  | 0  |             continue;  | 
9233  | 0  |         }  | 
9234  | 0  |         decimal = Py_UNICODE_TODECIMAL(ch);  | 
9235  | 0  |         if (decimal >= 0) { | 
9236  | 0  |             *output++ = '0' + decimal;  | 
9237  | 0  |             i++;  | 
9238  | 0  |             continue;  | 
9239  | 0  |         }  | 
9240  | 0  |         if (0 < ch && ch < 256) { | 
9241  | 0  |             *output++ = (char)ch;  | 
9242  | 0  |             i++;  | 
9243  | 0  |             continue;  | 
9244  | 0  |         }  | 
9245  |  |         /* anything else (a NUL, or a non-space, non-decimal character >= 256) is reported through raise_encode_exception() and aborts the encoding */  | 
9246  | 0  |         startpos = i;  | 
9247  | 0  |         exc = NULL;  | 
9248  | 0  |         raise_encode_exception(&exc, "decimal", unicode,  | 
9249  | 0  |                                startpos, startpos+1,  | 
9250  | 0  |                                "invalid decimal Unicode string");  | 
9251  | 0  |         Py_XDECREF(exc);  | 
9252  | 0  |         Py_DECREF(unicode);  | 
9253  | 0  |         return -1;  | 
9254  | 0  |     }  | 
9255  |  |     /* 0-terminate the output string */  | 
9256  | 0  |     *output++ = '\0';  | 
9257  | 0  |     Py_DECREF(unicode);  | 
9258  | 0  |     return 0;  | 
9259  | 0  | }  | 
9260  |  |  | 
9261  |  | /* --- Helpers ------------------------------------------------------------ */  | 
9262  |  |  | 
9263  |  | /* helper macro to fixup start/end slice values: end is clamped to len; a negative end or start has len added and is then clamped at 0 (start is not clamped to len). Modifies start and end in place and expands to bare if-statements, not a do/while(0) wrapper. */  | 
9264  |  | #define ADJUST_INDICES(start, end, len)         \  | 
9265  | 3.27k  |     if (end > len)                              \  | 
9266  | 3.27k  |         end = len;                              \  | 
9267  | 3.27k  |     else if (end < 0) {                         \ | 
9268  | 0  |         end += len;                             \  | 
9269  | 0  |         if (end < 0)                            \  | 
9270  | 0  |             end = 0;                            \  | 
9271  | 0  |     }                                           \  | 
9272  | 3.27k  |     if (start < 0) {                            \ | 
9273  | 0  |         start += len;                           \  | 
9274  | 0  |         if (start < 0)                          \  | 
9275  | 0  |             start = 0;                          \  | 
9276  | 0  |     }  | 
9277  |  | /* Search s1[start:end] for s2, forwards when direction > 0 and backwards otherwise. Returns -1 when s2's kind is wider than s1's, when s2 is longer than the adjusted slice, or when nothing is found; -2 when converting s2 to s1's kind fails; otherwise the index reported by findchar() (plus start) or by the stringlib find helpers. KIND/DATA are read without a PyUnicode_READY() call here. */  | 
9278  |  | static Py_ssize_t  | 
9279  |  | any_find_slice(PyObject* s1, PyObject* s2,  | 
9280  |  |                Py_ssize_t start,  | 
9281  |  |                Py_ssize_t end,  | 
9282  |  |                int direction)  | 
9283  | 98  | { | 
9284  | 98  |     int kind1, kind2;  | 
9285  | 98  |     void *buf1, *buf2;  | 
9286  | 98  |     Py_ssize_t len1, len2, result;  | 
9287  |  |  | 
9288  | 98  |     kind1 = PyUnicode_KIND(s1);  | 
9289  | 98  |     kind2 = PyUnicode_KIND(s2);  | 
9290  | 98  |     if (kind1 < kind2)  | 
9291  | 0  |         return -1;  | 
9292  |  |  | 
9293  | 98  |     len1 = PyUnicode_GET_LENGTH(s1);  | 
9294  | 98  |     len2 = PyUnicode_GET_LENGTH(s2);  | 
9295  | 98  |     ADJUST_INDICES(start, end, len1);  | 
9296  | 98  |     if (end - start < len2)  | 
9297  | 0  |         return -1;  | 
9298  |  |     /* single-character needle: handled by findchar() on the slice, so no kind conversion of s2 is made */  | 
9299  | 98  |     buf1 = PyUnicode_DATA(s1);  | 
9300  | 98  |     buf2 = PyUnicode_DATA(s2);  | 
9301  | 98  |     if (len2 == 1) { | 
9302  | 84  |         Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);  | 
9303  | 84  |         result = findchar((const char *)buf1 + kind1*start,  | 
9304  | 84  |                           kind1, end - start, ch, direction);  | 
9305  | 84  |         if (result == -1)  | 
9306  | 14  |             return -1;  | 
9307  | 70  |         else  | 
9308  | 70  |             return start + result;  | 
9309  | 84  |     }  | 
9310  |  |     /* the helpers below get both buffers in kind1, so a narrower s2 is first copied into a temporary kind1 buffer (freed after the search) */  | 
9311  | 14  |     if (kind2 != kind1) { | 
9312  | 0  |         buf2 = _PyUnicode_AsKind(s2, kind1);  | 
9313  | 0  |         if (!buf2)  | 
9314  | 0  |             return -2;  | 
9315  | 0  |     }  | 
9316  |  |  | 
9317  | 14  |     if (direction > 0) { | 
9318  | 14  |         switch (kind1) { | 
9319  | 14  |         case PyUnicode_1BYTE_KIND:  | 
9320  | 14  |             if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))  | 
9321  | 14  |                 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);  | 
9322  | 0  |             else  | 
9323  | 0  |                 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);  | 
9324  | 14  |             break;  | 
9325  | 0  |         case PyUnicode_2BYTE_KIND:  | 
9326  | 0  |             result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);  | 
9327  | 0  |             break;  | 
9328  | 0  |         case PyUnicode_4BYTE_KIND:  | 
9329  | 0  |             result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);  | 
9330  | 0  |             break;  | 
9331  | 0  |         default:  | 
9332  | 0  |             Py_UNREACHABLE();  | 
9333  | 14  |         }  | 
9334  | 14  |     }  | 
9335  | 0  |     else { | 
9336  | 0  |         switch (kind1) { | 
9337  | 0  |         case PyUnicode_1BYTE_KIND:  | 
9338  | 0  |             if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))  | 
9339  | 0  |                 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);  | 
9340  | 0  |             else  | 
9341  | 0  |                 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);  | 
9342  | 0  |             break;  | 
9343  | 0  |         case PyUnicode_2BYTE_KIND:  | 
9344  | 0  |             result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);  | 
9345  | 0  |             break;  | 
9346  | 0  |         case PyUnicode_4BYTE_KIND:  | 
9347  | 0  |             result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);  | 
9348  | 0  |             break;  | 
9349  | 0  |         default:  | 
9350  | 0  |             Py_UNREACHABLE();  | 
9351  | 0  |         }  | 
9352  | 0  |     }  | 
9353  |  |  | 
9354  | 14  |     if (kind2 != kind1)  | 
9355  | 0  |         PyMem_Free(buf2);  | 
9356  |  |  | 
9357  | 14  |     return result;  | 
9358  | 14  | }  | 
9359  |  |  | 
9360  |  | /* _PyUnicode_InsertThousandsGrouping() helper functions */  | 
9361  |  | #include "stringlib/localeutil.h"  | 
9362  |  |  | 
9363  |  | /**  | 
9364  |  |  * InsertThousandsGrouping:  | 
9365  |  |  * @writer: Unicode writer.  | 
9366  |  |  * @n_buffer: Offset added to @writer->pos to get the initial buffer position (used as the position itself in counting mode).  | 
9367  |  |  * @digits: Digits we're reading from. Must be NULL in counting mode (@writer is NULL) and non-NULL in filling mode.  | 
9368  |  |  * @d_pos: Start of digits string.  | 
9369  |  |  * @n_digits: The number of digits in the string, in which we want  | 
9370  |  |  *            to put the grouping chars.  | 
9371  |  |  * @min_width: The minimum width of the digits in the output string.  | 
9372  |  |  *             Output will be zero-padded on the left to fill.  | 
9373  |  |  * @grouping: see definition in localeconv().  | 
9374  |  |  * @thousands_sep: see definition in localeconv().  | 
9375  |  |  * @maxchar: must be non-NULL in counting mode (initialised to 127 here and passed on to InsertThousandsGrouping_fill()) and NULL in filling mode.  | 
9376  |  |  * There are 2 modes: counting and filling. If @writer is NULL,  | 
9377  |  |  *  we are in counting mode, else filling mode.  | 
9378  |  |  * If counting, the required buffer size is returned.  | 
9379  |  |  * If filling, we know the buffer will be large enough, so we don't  | 
9380  |  |  *  need to pass in the buffer size.  | 
9381  |  |  * Inserts thousand grouping characters (as defined by grouping and  | 
9382  |  |  *  thousands_sep) into @writer.  | 
9383  |  |  *  | 
9384  |  |  * Return value: -1 on error, number of characters otherwise.  | 
9385  |  |  **/  | 
9386  |  | Py_ssize_t  | 
9387  |  | _PyUnicode_InsertThousandsGrouping(  | 
9388  |  |     _PyUnicodeWriter *writer,  | 
9389  |  |     Py_ssize_t n_buffer,  | 
9390  |  |     PyObject *digits,  | 
9391  |  |     Py_ssize_t d_pos,  | 
9392  |  |     Py_ssize_t n_digits,  | 
9393  |  |     Py_ssize_t min_width,  | 
9394  |  |     const char *grouping,  | 
9395  |  |     PyObject *thousands_sep,  | 
9396  |  |     Py_UCS4 *maxchar)  | 
9397  | 0  | { | 
9398  | 0  |     min_width = Py_MAX(0, min_width);  | 
9399  | 0  |     if (writer) { | 
9400  | 0  |         assert(digits != NULL);  | 
9401  | 0  |         assert(maxchar == NULL);  | 
9402  | 0  |     }  | 
9403  | 0  |     else { | 
9404  | 0  |         assert(digits == NULL);  | 
9405  | 0  |         assert(maxchar != NULL);  | 
9406  | 0  |     }  | 
9407  | 0  |     assert(0 <= d_pos);  | 
9408  | 0  |     assert(0 <= n_digits);  | 
9409  | 0  |     assert(grouping != NULL);  | 
9410  |  | 
  | 
9411  | 0  |     if (digits != NULL) { | 
9412  | 0  |         if (PyUnicode_READY(digits) == -1) { | 
9413  | 0  |             return -1;  | 
9414  | 0  |         }  | 
9415  | 0  |     }  | 
9416  | 0  |     if (PyUnicode_READY(thousands_sep) == -1) { | 
9417  | 0  |         return -1;  | 
9418  | 0  |     }  | 
9419  |  |  | 
9420  | 0  |     Py_ssize_t count = 0;  | 
9421  | 0  |     Py_ssize_t n_zeros;  | 
9422  | 0  |     int loop_broken = 0;  | 
9423  | 0  |     int use_separator = 0; /* First time through, don't append the  | 
9424  |  |                               separator. They only go between  | 
9425  |  |                               groups. */  | 
9426  | 0  |     Py_ssize_t buffer_pos;  | 
9427  | 0  |     Py_ssize_t digits_pos;  | 
9428  | 0  |     Py_ssize_t len;  | 
9429  | 0  |     Py_ssize_t n_chars;  | 
9430  | 0  |     Py_ssize_t remaining = n_digits; /* Number of chars remaining to  | 
9431  |  |                                         be looked at */  | 
9432  |  |     /* A generator that returns all of the grouping widths, until it  | 
9433  |  |        returns 0. */  | 
9434  | 0  |     GroupGenerator groupgen;  | 
9435  | 0  |     GroupGenerator_init(&groupgen, grouping);  | 
9436  | 0  |     const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);  | 
9437  |  |  | 
9438  |  |     /* if digits are not grouped, thousands separator  | 
9439  |  |        should be an empty string */  | 
9440  | 0  |     assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));  | 
9441  |  | 
  | 
9442  | 0  |     digits_pos = d_pos + n_digits;  /* both positions start at the END of their ranges (d_pos + n_digits, pos + n_buffer) */  | 
9443  | 0  |     if (writer) { | 
9444  | 0  |         buffer_pos = writer->pos + n_buffer;  | 
9445  | 0  |         assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));  | 
9446  | 0  |         assert(digits_pos <= PyUnicode_GET_LENGTH(digits));  | 
9447  | 0  |     }  | 
9448  | 0  |     else { | 
9449  | 0  |         buffer_pos = n_buffer;  | 
9450  | 0  |     }  | 
9451  |  | 
  | 
9452  | 0  |     if (!writer) { | 
9453  | 0  |         *maxchar = 127;  | 
9454  | 0  |     }  | 
9455  |  | 
  | 
9456  | 0  |     while ((len = GroupGenerator_next(&groupgen)) > 0) { | 
9457  | 0  |         len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));  | 
9458  | 0  |         n_zeros = Py_MAX(0, len - remaining);  | 
9459  | 0  |         n_chars = Py_MAX(0, Py_MIN(remaining, len));  | 
9460  |  |  | 
9461  |  |         /* Use n_zero zero's and n_chars chars */  | 
9462  |  |         /* count is accumulated in both modes; the fill call below is also made in both modes and receives writer, which is NULL when counting */  | 
9463  |  |         /* Count only, don't do anything. */  | 
9464  | 0  |         count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;  | 
9465  |  |  | 
9466  |  |         /* Copy into the writer. */  | 
9467  | 0  |         InsertThousandsGrouping_fill(writer, &buffer_pos,  | 
9468  | 0  |                                      digits, &digits_pos,  | 
9469  | 0  |                                      n_chars, n_zeros,  | 
9470  | 0  |                                      use_separator ? thousands_sep : NULL,  | 
9471  | 0  |                                      thousands_sep_len, maxchar);  | 
9472  |  |  | 
9473  |  |         /* Use a separator next time. */  | 
9474  | 0  |         use_separator = 1;  | 
9475  |  | 
  | 
9476  | 0  |         remaining -= n_chars;  | 
9477  | 0  |         min_width -= len;  | 
9478  |  | 
  | 
9479  | 0  |         if (remaining <= 0 && min_width <= 0) { | 
9480  | 0  |             loop_broken = 1;  | 
9481  | 0  |             break;  | 
9482  | 0  |         }  | 
9483  | 0  |         min_width -= thousands_sep_len;  | 
9484  | 0  |     }  | 
9485  | 0  |     if (!loop_broken) { | 
9486  |  |         /* We left the loop without using a break statement. */  | 
9487  |  | 
  | 
9488  | 0  |         len = Py_MAX(Py_MAX(remaining, min_width), 1);  | 
9489  | 0  |         n_zeros = Py_MAX(0, len - remaining);  | 
9490  | 0  |         n_chars = Py_MAX(0, Py_MIN(remaining, len));  | 
9491  |  |  | 
9492  |  |         /* Use n_zero zero's and n_chars chars */  | 
9493  | 0  |         count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;  | 
9494  |  |  | 
9495  |  |         /* Copy into the writer. */  | 
9496  | 0  |         InsertThousandsGrouping_fill(writer, &buffer_pos,  | 
9497  | 0  |                                      digits, &digits_pos,  | 
9498  | 0  |                                      n_chars, n_zeros,  | 
9499  | 0  |                                      use_separator ? thousands_sep : NULL,  | 
9500  | 0  |                                      thousands_sep_len, maxchar);  | 
9501  | 0  |     }  | 
9502  | 0  |     return count;  | 
9503  | 0  | }  | 
9504  |  |  | 
9505  |  | /* Count occurrences of substr in str[start:end] with the stringlib *_count helpers. Returns 0 when substr's kind is wider than str's or substr is longer than the adjusted slice, and -1 when either argument fails ensure_unicode() or substr cannot be converted to str's kind. */  | 
9506  |  | Py_ssize_t  | 
9507  |  | PyUnicode_Count(PyObject *str,  | 
9508  |  |                 PyObject *substr,  | 
9509  |  |                 Py_ssize_t start,  | 
9510  |  |                 Py_ssize_t end)  | 
9511  | 0  | { | 
9512  | 0  |     Py_ssize_t result;  | 
9513  | 0  |     int kind1, kind2;  | 
9514  | 0  |     void *buf1 = NULL, *buf2 = NULL;  | 
9515  | 0  |     Py_ssize_t len1, len2;  | 
9516  |  | 
  | 
9517  | 0  |     if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)  | 
9518  | 0  |         return -1;  | 
9519  |  |  | 
9520  | 0  |     kind1 = PyUnicode_KIND(str);  | 
9521  | 0  |     kind2 = PyUnicode_KIND(substr);  | 
9522  | 0  |     if (kind1 < kind2)  | 
9523  | 0  |         return 0;  | 
9524  |  |  | 
9525  | 0  |     len1 = PyUnicode_GET_LENGTH(str);  | 
9526  | 0  |     len2 = PyUnicode_GET_LENGTH(substr);  | 
9527  | 0  |     ADJUST_INDICES(start, end, len1);  | 
9528  | 0  |     if (end - start < len2)  | 
9529  | 0  |         return 0;  | 
9530  |  |  | 
9531  | 0  |     buf1 = PyUnicode_DATA(str);  | 
9532  | 0  |     buf2 = PyUnicode_DATA(substr);  | 
9533  | 0  |     if (kind2 != kind1) { | 
9534  | 0  |         buf2 = _PyUnicode_AsKind(substr, kind1);  | 
9535  | 0  |         if (!buf2)  | 
9536  | 0  |             goto onError;  | 
9537  | 0  |     }  | 
9538  |  |     /* each helper is handed the slice start as a typed pointer plus its length end - start; PY_SSIZE_T_MAX is passed as the last argument */  | 
9539  | 0  |     switch (kind1) { | 
9540  | 0  |     case PyUnicode_1BYTE_KIND:  | 
9541  | 0  |         if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))  | 
9542  | 0  |             result = asciilib_count(  | 
9543  | 0  |                 ((Py_UCS1*)buf1) + start, end - start,  | 
9544  | 0  |                 buf2, len2, PY_SSIZE_T_MAX  | 
9545  | 0  |                 );  | 
9546  | 0  |         else  | 
9547  | 0  |             result = ucs1lib_count(  | 
9548  | 0  |                 ((Py_UCS1*)buf1) + start, end - start,  | 
9549  | 0  |                 buf2, len2, PY_SSIZE_T_MAX  | 
9550  | 0  |                 );  | 
9551  | 0  |         break;  | 
9552  | 0  |     case PyUnicode_2BYTE_KIND:  | 
9553  | 0  |         result = ucs2lib_count(  | 
9554  | 0  |             ((Py_UCS2*)buf1) + start, end - start,  | 
9555  | 0  |             buf2, len2, PY_SSIZE_T_MAX  | 
9556  | 0  |             );  | 
9557  | 0  |         break;  | 
9558  | 0  |     case PyUnicode_4BYTE_KIND:  | 
9559  | 0  |         result = ucs4lib_count(  | 
9560  | 0  |             ((Py_UCS4*)buf1) + start, end - start,  | 
9561  | 0  |             buf2, len2, PY_SSIZE_T_MAX  | 
9562  | 0  |             );  | 
9563  | 0  |         break;  | 
9564  | 0  |     default:  | 
9565  | 0  |         Py_UNREACHABLE();  | 
9566  | 0  |     }  | 
9567  |  |  | 
9568  | 0  |     if (kind2 != kind1)  | 
9569  | 0  |         PyMem_Free(buf2);  | 
9570  |  | 
  | 
9571  | 0  |     return result;  | 
9572  | 0  |   onError:  | 
9573  | 0  |     if (kind2 != kind1 && buf2)  /* NOTE(review): the only goto onError is taken when buf2 is NULL, so this free is never executed */  | 
9574  | 0  |         PyMem_Free(buf2);  | 
9575  | 0  |     return -1;  | 
9576  | 0  | }  | 
9577  |  | /* Public wrapper around any_find_slice(): returns -2 if either argument fails ensure_unicode(), otherwise whatever any_find_slice(str, substr, start, end, direction) returns. */  | 
9578  |  | Py_ssize_t  | 
9579  |  | PyUnicode_Find(PyObject *str,  | 
9580  |  |                PyObject *substr,  | 
9581  |  |                Py_ssize_t start,  | 
9582  |  |                Py_ssize_t end,  | 
9583  |  |                int direction)  | 
9584  | 0  | { | 
9585  | 0  |     if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)  | 
9586  | 0  |         return -2;  | 
9587  |  |  | 
9588  | 0  |     return any_find_slice(str, substr, start, end, direction);  | 
9589  | 0  | }  | 
9590  |  | /* Find code point ch in str[start:end] via findchar(). Returns start plus the offset findchar() reports, -1 when findchar() returns -1 or the adjusted slice is empty, and -2 when PyUnicode_READY(str) fails. */  | 
9591  |  | Py_ssize_t  | 
9592  |  | PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,  | 
9593  |  |                    Py_ssize_t start, Py_ssize_t end,  | 
9594  |  |                    int direction)  | 
9595  | 2.15k  | { | 
9596  | 2.15k  |     int kind;  | 
9597  | 2.15k  |     Py_ssize_t len, result;  | 
9598  | 2.15k  |     if (PyUnicode_READY(str) == -1)  | 
9599  | 0  |         return -2;  | 
9600  | 2.15k  |     len = PyUnicode_GET_LENGTH(str);  | 
9601  | 2.15k  |     ADJUST_INDICES(start, end, len);  | 
9602  | 2.15k  |     if (end - start < 1)  | 
9603  | 0  |         return -1;  | 
9604  | 2.15k  |     kind = PyUnicode_KIND(str);  | 
9605  | 2.15k  |     result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,  /* kind doubles as the per-character byte width for this offset */  | 
9606  | 2.15k  |                       kind, end-start, ch, direction);  | 
9607  | 2.15k  |     if (result == -1)  | 
9608  | 2.07k  |         return -1;  | 
9609  | 85  |     else  | 
9610  | 85  |         return start + result;  | 
9611  | 2.15k  | }  | 
9612  |  |  | 
9613  |  | static int  | 
9614  |  | tailmatch(PyObject *self,  | 
9615  |  |           PyObject *substring,  | 
9616  |  |           Py_ssize_t start,  | 
9617  |  |           Py_ssize_t end,  | 
9618  |  |           int direction)  | 
9619  | 1.01k  | { | 
9620  | 1.01k  |     int kind_self;  | 
9621  | 1.01k  |     int kind_sub;  | 
9622  | 1.01k  |     void *data_self;  | 
9623  | 1.01k  |     void *data_sub;  | 
9624  | 1.01k  |     Py_ssize_t offset;  | 
9625  | 1.01k  |     Py_ssize_t i;  | 
9626  | 1.01k  |     Py_ssize_t end_sub;  | 
9627  |  |  | 
9628  | 1.01k  |     if (PyUnicode_READY(self) == -1 ||  | 
9629  | 1.01k  |         PyUnicode_READY(substring) == -1)  | 
9630  | 0  |         return -1;  | 
9631  |  |  | 
9632  | 1.01k  |     ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));  | 
9633  | 1.01k  |     end -= PyUnicode_GET_LENGTH(substring);  | 
9634  | 1.01k  |     if (end < start)  | 
9635  | 86  |         return 0;  | 
9636  |  |  | 
9637  | 930  |     if (PyUnicode_GET_LENGTH(substring) == 0)  | 
9638  | 0  |         return 1;  | 
9639  |  |  | 
9640  | 930  |     kind_self = PyUnicode_KIND(self);  | 
9641  | 930  |     data_self = PyUnicode_DATA(self);  | 
9642  | 930  |     kind_sub = PyUnicode_KIND(substring);  | 
9643  | 930  |     data_sub = PyUnicode_DATA(substring);  | 
9644  | 930  |     end_sub = PyUnicode_GET_LENGTH(substring) - 1;  | 
9645  |  |  | 
9646  | 930  |     if (direction > 0)  | 
9647  | 404  |         offset = end;  | 
9648  | 526  |     else  | 
9649  | 526  |         offset = start;  | 
9650  |  |  | 
9651  | 930  |     if (PyUnicode_READ(kind_self, data_self, offset) ==  | 
9652  | 930  |         PyUnicode_READ(kind_sub, data_sub, 0) &&  | 
9653  | 446  |         PyUnicode_READ(kind_self, data_self, offset + end_sub) ==  | 
9654  | 446  |         PyUnicode_READ(kind_sub, data_sub, end_sub)) { | 
9655  |  |         /* If both are of the same kind, memcmp is sufficient */  | 
9656  | 376  |         if (kind_self == kind_sub) { | 
9657  | 376  |             return ! memcmp((char *)data_self +  | 
9658  | 376  |                                 (offset * PyUnicode_KIND(substring)),  | 
9659  | 376  |                             data_sub,  | 
9660  | 376  |                             PyUnicode_GET_LENGTH(substring) *  | 
9661  | 376  |                                 PyUnicode_KIND(substring));  | 
9662  | 376  |         }  | 
9663  |  |         /* otherwise we have to compare each character by first accessing it */  | 
9664  | 0  |         else { | 
9665  |  |             /* We do not need to compare 0 and len(substring)-1 because  | 
9666  |  |                the if statement above ensured already that they are equal  | 
9667  |  |                when we end up here. */  | 
9668  | 0  |             for (i = 1; i < end_sub; ++i) { | 
9669  | 0  |                 if (PyUnicode_READ(kind_self, data_self, offset + i) !=  | 
9670  | 0  |                     PyUnicode_READ(kind_sub, data_sub, i))  | 
9671  | 0  |                     return 0;  | 
9672  | 0  |             }  | 
9673  | 0  |             return 1;  | 
9674  | 0  |         }  | 
9675  | 376  |     }  | 
9676  |  |  | 
9677  | 554  |     return 0;  | 
9678  | 930  | }  | 
9679  |  |  | 
9680  |  | Py_ssize_t  | 
9681  |  | PyUnicode_Tailmatch(PyObject *str,  | 
9682  |  |                     PyObject *substr,  | 
9683  |  |                     Py_ssize_t start,  | 
9684  |  |                     Py_ssize_t end,  | 
9685  |  |                     int direction)  | 
9686  | 0  | { | 
9687  | 0  |     if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)  | 
9688  | 0  |         return -1;  | 
9689  |  |  | 
9690  | 0  |     return tailmatch(str, substr, start, end, direction);  | 
9691  | 0  | }  | 
9692  |  |  | 
9693  |  | static PyObject *  | 
9694  |  | ascii_upper_or_lower(PyObject *self, int lower)  | 
9695  | 36  | { | 
9696  | 36  |     Py_ssize_t len = PyUnicode_GET_LENGTH(self);  | 
9697  | 36  |     char *resdata, *data = PyUnicode_DATA(self);  | 
9698  | 36  |     PyObject *res;  | 
9699  |  |  | 
9700  | 36  |     res = PyUnicode_New(len, 127);  | 
9701  | 36  |     if (res == NULL)  | 
9702  | 0  |         return NULL;  | 
9703  | 36  |     resdata = PyUnicode_DATA(res);  | 
9704  | 36  |     if (lower)  | 
9705  | 0  |         _Py_bytes_lower(resdata, data, len);  | 
9706  | 36  |     else  | 
9707  | 36  |         _Py_bytes_upper(resdata, data, len);  | 
9708  | 36  |     return res;  | 
9709  | 36  | }  | 
9710  |  |  | 
9711  |  | static Py_UCS4  | 
9712  |  | handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)  | 
9713  | 0  | { | 
9714  | 0  |     Py_ssize_t j;  | 
9715  | 0  |     int final_sigma;  | 
9716  | 0  |     Py_UCS4 c = 0;   /* initialize to prevent gcc warning */  | 
9717  |  |     /* U+03A3 is in the Final_Sigma context when, it is found like this:  | 
9718  |  |  | 
9719  |  |      \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased}) | 
9720  |  |  | 
9721  |  |     where ! is a negation and \p{xxx} is a character with property xxx. | 
9722  |  |     */  | 
9723  | 0  |     for (j = i - 1; j >= 0; j--) { | 
9724  | 0  |         c = PyUnicode_READ(kind, data, j);  | 
9725  | 0  |         if (!_PyUnicode_IsCaseIgnorable(c))  | 
9726  | 0  |             break;  | 
9727  | 0  |     }  | 
9728  | 0  |     final_sigma = j >= 0 && _PyUnicode_IsCased(c);  | 
9729  | 0  |     if (final_sigma) { | 
9730  | 0  |         for (j = i + 1; j < length; j++) { | 
9731  | 0  |             c = PyUnicode_READ(kind, data, j);  | 
9732  | 0  |             if (!_PyUnicode_IsCaseIgnorable(c))  | 
9733  | 0  |                 break;  | 
9734  | 0  |         }  | 
9735  | 0  |         final_sigma = j == length || !_PyUnicode_IsCased(c);  | 
9736  | 0  |     }  | 
9737  | 0  |     return (final_sigma) ? 0x3C2 : 0x3C3;  | 
9738  | 0  | }  | 
9739  |  |  | 
9740  |  | static int  | 
9741  |  | lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,  | 
9742  |  |            Py_UCS4 c, Py_UCS4 *mapped)  | 
9743  | 0  | { | 
9744  |  |     /* Obscure special case. */  | 
9745  | 0  |     if (c == 0x3A3) { | 
9746  | 0  |         mapped[0] = handle_capital_sigma(kind, data, length, i);  | 
9747  | 0  |         return 1;  | 
9748  | 0  |     }  | 
9749  | 0  |     return _PyUnicode_ToLowerFull(c, mapped);  | 
9750  | 0  | }  | 
9751  |  |  | 
9752  |  | static Py_ssize_t  | 
9753  |  | do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)  | 
9754  | 0  | { | 
9755  | 0  |     Py_ssize_t i, k = 0;  | 
9756  | 0  |     int n_res, j;  | 
9757  | 0  |     Py_UCS4 c, mapped[3];  | 
9758  |  | 
  | 
9759  | 0  |     c = PyUnicode_READ(kind, data, 0);  | 
9760  | 0  |     n_res = _PyUnicode_ToTitleFull(c, mapped);  | 
9761  | 0  |     for (j = 0; j < n_res; j++) { | 
9762  | 0  |         *maxchar = Py_MAX(*maxchar, mapped[j]);  | 
9763  | 0  |         res[k++] = mapped[j];  | 
9764  | 0  |     }  | 
9765  | 0  |     for (i = 1; i < length; i++) { | 
9766  | 0  |         c = PyUnicode_READ(kind, data, i);  | 
9767  | 0  |         n_res = lower_ucs4(kind, data, length, i, c, mapped);  | 
9768  | 0  |         for (j = 0; j < n_res; j++) { | 
9769  | 0  |             *maxchar = Py_MAX(*maxchar, mapped[j]);  | 
9770  | 0  |             res[k++] = mapped[j];  | 
9771  | 0  |         }  | 
9772  | 0  |     }  | 
9773  | 0  |     return k;  | 
9774  | 0  | }  | 
9775  |  |  | 
9776  |  | static Py_ssize_t  | 
9777  | 0  | do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) { | 
9778  | 0  |     Py_ssize_t i, k = 0;  | 
9779  |  | 
  | 
9780  | 0  |     for (i = 0; i < length; i++) { | 
9781  | 0  |         Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];  | 
9782  | 0  |         int n_res, j;  | 
9783  | 0  |         if (Py_UNICODE_ISUPPER(c)) { | 
9784  | 0  |             n_res = lower_ucs4(kind, data, length, i, c, mapped);  | 
9785  | 0  |         }  | 
9786  | 0  |         else if (Py_UNICODE_ISLOWER(c)) { | 
9787  | 0  |             n_res = _PyUnicode_ToUpperFull(c, mapped);  | 
9788  | 0  |         }  | 
9789  | 0  |         else { | 
9790  | 0  |             n_res = 1;  | 
9791  | 0  |             mapped[0] = c;  | 
9792  | 0  |         }  | 
9793  | 0  |         for (j = 0; j < n_res; j++) { | 
9794  | 0  |             *maxchar = Py_MAX(*maxchar, mapped[j]);  | 
9795  | 0  |             res[k++] = mapped[j];  | 
9796  | 0  |         }  | 
9797  | 0  |     }  | 
9798  | 0  |     return k;  | 
9799  | 0  | }  | 
9800  |  |  | 
9801  |  | static Py_ssize_t  | 
9802  |  | do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,  | 
9803  |  |                   Py_UCS4 *maxchar, int lower)  | 
9804  | 0  | { | 
9805  | 0  |     Py_ssize_t i, k = 0;  | 
9806  |  | 
  | 
9807  | 0  |     for (i = 0; i < length; i++) { | 
9808  | 0  |         Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];  | 
9809  | 0  |         int n_res, j;  | 
9810  | 0  |         if (lower)  | 
9811  | 0  |             n_res = lower_ucs4(kind, data, length, i, c, mapped);  | 
9812  | 0  |         else  | 
9813  | 0  |             n_res = _PyUnicode_ToUpperFull(c, mapped);  | 
9814  | 0  |         for (j = 0; j < n_res; j++) { | 
9815  | 0  |             *maxchar = Py_MAX(*maxchar, mapped[j]);  | 
9816  | 0  |             res[k++] = mapped[j];  | 
9817  | 0  |         }  | 
9818  | 0  |     }  | 
9819  | 0  |     return k;  | 
9820  | 0  | }  | 
9821  |  |  | 
9822  |  | static Py_ssize_t  | 
9823  |  | do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)  | 
9824  | 0  | { | 
9825  | 0  |     return do_upper_or_lower(kind, data, length, res, maxchar, 0);  | 
9826  | 0  | }  | 
9827  |  |  | 
9828  |  | static Py_ssize_t  | 
9829  |  | do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)  | 
9830  | 0  | { | 
9831  | 0  |     return do_upper_or_lower(kind, data, length, res, maxchar, 1);  | 
9832  | 0  | }  | 
9833  |  |  | 
9834  |  | static Py_ssize_t  | 
9835  |  | do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)  | 
9836  | 0  | { | 
9837  | 0  |     Py_ssize_t i, k = 0;  | 
9838  |  | 
  | 
9839  | 0  |     for (i = 0; i < length; i++) { | 
9840  | 0  |         Py_UCS4 c = PyUnicode_READ(kind, data, i);  | 
9841  | 0  |         Py_UCS4 mapped[3];  | 
9842  | 0  |         int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);  | 
9843  | 0  |         for (j = 0; j < n_res; j++) { | 
9844  | 0  |             *maxchar = Py_MAX(*maxchar, mapped[j]);  | 
9845  | 0  |             res[k++] = mapped[j];  | 
9846  | 0  |         }  | 
9847  | 0  |     }  | 
9848  | 0  |     return k;  | 
9849  | 0  | }  | 
9850  |  |  | 
9851  |  | static Py_ssize_t  | 
9852  |  | do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)  | 
9853  | 0  | { | 
9854  | 0  |     Py_ssize_t i, k = 0;  | 
9855  | 0  |     int previous_is_cased;  | 
9856  |  | 
  | 
9857  | 0  |     previous_is_cased = 0;  | 
9858  | 0  |     for (i = 0; i < length; i++) { | 
9859  | 0  |         const Py_UCS4 c = PyUnicode_READ(kind, data, i);  | 
9860  | 0  |         Py_UCS4 mapped[3];  | 
9861  | 0  |         int n_res, j;  | 
9862  |  | 
  | 
9863  | 0  |         if (previous_is_cased)  | 
9864  | 0  |             n_res = lower_ucs4(kind, data, length, i, c, mapped);  | 
9865  | 0  |         else  | 
9866  | 0  |             n_res = _PyUnicode_ToTitleFull(c, mapped);  | 
9867  |  | 
  | 
9868  | 0  |         for (j = 0; j < n_res; j++) { | 
9869  | 0  |             *maxchar = Py_MAX(*maxchar, mapped[j]);  | 
9870  | 0  |             res[k++] = mapped[j];  | 
9871  | 0  |         }  | 
9872  |  | 
  | 
9873  | 0  |         previous_is_cased = _PyUnicode_IsCased(c);  | 
9874  | 0  |     }  | 
9875  | 0  |     return k;  | 
9876  | 0  | }  | 
9877  |  |  | 
9878  |  | static PyObject *  | 
9879  |  | case_operation(PyObject *self,  | 
9880  |  |                Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))  | 
9881  | 0  | { | 
9882  | 0  |     PyObject *res = NULL;  | 
9883  | 0  |     Py_ssize_t length, newlength = 0;  | 
9884  | 0  |     int kind, outkind;  | 
9885  | 0  |     void *data, *outdata;  | 
9886  | 0  |     Py_UCS4 maxchar = 0, *tmp, *tmpend;  | 
9887  |  | 
  | 
9888  | 0  |     assert(PyUnicode_IS_READY(self));  | 
9889  |  | 
  | 
9890  | 0  |     kind = PyUnicode_KIND(self);  | 
9891  | 0  |     data = PyUnicode_DATA(self);  | 
9892  | 0  |     length = PyUnicode_GET_LENGTH(self);  | 
9893  | 0  |     if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) { | 
9894  | 0  |         PyErr_SetString(PyExc_OverflowError, "string is too long");  | 
9895  | 0  |         return NULL;  | 
9896  | 0  |     }  | 
9897  | 0  |     tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);  | 
9898  | 0  |     if (tmp == NULL)  | 
9899  | 0  |         return PyErr_NoMemory();  | 
9900  | 0  |     newlength = perform(kind, data, length, tmp, &maxchar);  | 
9901  | 0  |     res = PyUnicode_New(newlength, maxchar);  | 
9902  | 0  |     if (res == NULL)  | 
9903  | 0  |         goto leave;  | 
9904  | 0  |     tmpend = tmp + newlength;  | 
9905  | 0  |     outdata = PyUnicode_DATA(res);  | 
9906  | 0  |     outkind = PyUnicode_KIND(res);  | 
9907  | 0  |     switch (outkind) { | 
9908  | 0  |     case PyUnicode_1BYTE_KIND:  | 
9909  | 0  |         _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);  | 
9910  | 0  |         break;  | 
9911  | 0  |     case PyUnicode_2BYTE_KIND:  | 
9912  | 0  |         _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);  | 
9913  | 0  |         break;  | 
9914  | 0  |     case PyUnicode_4BYTE_KIND:  | 
9915  | 0  |         memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);  | 
9916  | 0  |         break;  | 
9917  | 0  |     default:  | 
9918  | 0  |         Py_UNREACHABLE();  | 
9919  | 0  |     }  | 
9920  | 0  |   leave:  | 
9921  | 0  |     PyMem_FREE(tmp);  | 
9922  | 0  |     return res;  | 
9923  | 0  | }  | 
9924  |  |  | 
9925  |  | PyObject *  | 
9926  |  | PyUnicode_Join(PyObject *separator, PyObject *seq)  | 
9927  | 2.32k  | { | 
9928  | 2.32k  |     PyObject *res;  | 
9929  | 2.32k  |     PyObject *fseq;  | 
9930  | 2.32k  |     Py_ssize_t seqlen;  | 
9931  | 2.32k  |     PyObject **items;  | 
9932  |  |  | 
9933  | 2.32k  |     fseq = PySequence_Fast(seq, "can only join an iterable");  | 
9934  | 2.32k  |     if (fseq == NULL) { | 
9935  | 0  |         return NULL;  | 
9936  | 0  |     }  | 
9937  |  |  | 
9938  |  |     /* NOTE: the following code can't call back into Python code,  | 
9939  |  |      * so we are sure that fseq won't be mutated.  | 
9940  |  |      */  | 
9941  |  |  | 
9942  | 2.32k  |     items = PySequence_Fast_ITEMS(fseq);  | 
9943  | 2.32k  |     seqlen = PySequence_Fast_GET_SIZE(fseq);  | 
9944  | 2.32k  |     res = _PyUnicode_JoinArray(separator, items, seqlen);  | 
9945  | 2.32k  |     Py_DECREF(fseq);  | 
9946  | 2.32k  |     return res;  | 
9947  | 2.32k  | }  | 
9948  |  |  | 
9949  |  | PyObject *  | 
9950  |  | _PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)  | 
9951  | 2.39k  | { | 
9952  | 2.39k  |     PyObject *res = NULL; /* the result */  | 
9953  | 2.39k  |     PyObject *sep = NULL;  | 
9954  | 2.39k  |     Py_ssize_t seplen;  | 
9955  | 2.39k  |     PyObject *item;  | 
9956  | 2.39k  |     Py_ssize_t sz, i, res_offset;  | 
9957  | 2.39k  |     Py_UCS4 maxchar;  | 
9958  | 2.39k  |     Py_UCS4 item_maxchar;  | 
9959  | 2.39k  |     int use_memcpy;  | 
9960  | 2.39k  |     unsigned char *res_data = NULL, *sep_data = NULL;  | 
9961  | 2.39k  |     PyObject *last_obj;  | 
9962  | 2.39k  |     unsigned int kind = 0;  | 
9963  |  |  | 
9964  |  |     /* If empty sequence, return u"". */  | 
9965  | 2.39k  |     if (seqlen == 0) { | 
9966  | 0  |         _Py_RETURN_UNICODE_EMPTY();  | 
9967  | 0  |     }  | 
9968  |  |  | 
9969  |  |     /* If singleton sequence with an exact Unicode, return that. */  | 
9970  | 2.39k  |     last_obj = NULL;  | 
9971  | 2.39k  |     if (seqlen == 1) { | 
9972  | 41  |         if (PyUnicode_CheckExact(items[0])) { | 
9973  | 41  |             res = items[0];  | 
9974  | 41  |             Py_INCREF(res);  | 
9975  | 41  |             return res;  | 
9976  | 41  |         }  | 
9977  | 0  |         seplen = 0;  | 
9978  | 0  |         maxchar = 0;  | 
9979  | 0  |     }  | 
9980  | 2.34k  |     else { | 
9981  |  |         /* Set up sep and seplen */  | 
9982  | 2.34k  |         if (separator == NULL) { | 
9983  |  |             /* fall back to a blank space separator */  | 
9984  | 0  |             sep = PyUnicode_FromOrdinal(' '); | 
9985  | 0  |             if (!sep)  | 
9986  | 0  |                 goto onError;  | 
9987  | 0  |             seplen = 1;  | 
9988  | 0  |             maxchar = 32;  | 
9989  | 0  |         }  | 
9990  | 2.34k  |         else { | 
9991  | 2.34k  |             if (!PyUnicode_Check(separator)) { | 
9992  | 0  |                 PyErr_Format(PyExc_TypeError,  | 
9993  | 0  |                              "separator: expected str instance,"  | 
9994  | 0  |                              " %.80s found",  | 
9995  | 0  |                              Py_TYPE(separator)->tp_name);  | 
9996  | 0  |                 goto onError;  | 
9997  | 0  |             }  | 
9998  | 2.34k  |             if (PyUnicode_READY(separator))  | 
9999  | 0  |                 goto onError;  | 
10000  | 2.34k  |             sep = separator;  | 
10001  | 2.34k  |             seplen = PyUnicode_GET_LENGTH(separator);  | 
10002  | 2.34k  |             maxchar = PyUnicode_MAX_CHAR_VALUE(separator);  | 
10003  |  |             /* inc refcount to keep this code path symmetric with the  | 
10004  |  |                above case of a blank separator */  | 
10005  | 2.34k  |             Py_INCREF(sep);  | 
10006  | 2.34k  |         }  | 
10007  | 2.34k  |         last_obj = sep;  | 
10008  | 2.34k  |     }  | 
10009  |  |  | 
10010  |  |     /* There are at least two things to join, or else we have a subclass  | 
10011  |  |      * of str in the sequence.  | 
10012  |  |      * Do a pre-pass to figure out the total amount of space we'll  | 
10013  |  |      * need (sz), and see whether all argument are strings.  | 
10014  |  |      */  | 
10015  | 2.34k  |     sz = 0;  | 
10016  |  | #ifdef Py_DEBUG  | 
10017  |  |     use_memcpy = 0;  | 
10018  |  | #else  | 
10019  | 2.34k  |     use_memcpy = 1;  | 
10020  | 2.34k  | #endif  | 
10021  | 8.74k  |     for (i = 0; i < seqlen; i++) { | 
10022  | 6.39k  |         size_t add_sz;  | 
10023  | 6.39k  |         item = items[i];  | 
10024  | 6.39k  |         if (!PyUnicode_Check(item)) { | 
10025  | 0  |             PyErr_Format(PyExc_TypeError,  | 
10026  | 0  |                          "sequence item %zd: expected str instance,"  | 
10027  | 0  |                          " %.80s found",  | 
10028  | 0  |                          i, Py_TYPE(item)->tp_name);  | 
10029  | 0  |             goto onError;  | 
10030  | 0  |         }  | 
10031  | 6.39k  |         if (PyUnicode_READY(item) == -1)  | 
10032  | 0  |             goto onError;  | 
10033  | 6.39k  |         add_sz = PyUnicode_GET_LENGTH(item);  | 
10034  | 6.39k  |         item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);  | 
10035  | 6.39k  |         maxchar = Py_MAX(maxchar, item_maxchar);  | 
10036  | 6.39k  |         if (i != 0) { | 
10037  | 4.05k  |             add_sz += seplen;  | 
10038  | 4.05k  |         }  | 
10039  | 6.39k  |         if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) { | 
10040  | 0  |             PyErr_SetString(PyExc_OverflowError,  | 
10041  | 0  |                             "join() result is too long for a Python string");  | 
10042  | 0  |             goto onError;  | 
10043  | 0  |         }  | 
10044  | 6.39k  |         sz += add_sz;  | 
10045  | 6.39k  |         if (use_memcpy && last_obj != NULL) { | 
10046  | 6.39k  |             if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))  | 
10047  | 0  |                 use_memcpy = 0;  | 
10048  | 6.39k  |         }  | 
10049  | 6.39k  |         last_obj = item;  | 
10050  | 6.39k  |     }  | 
10051  |  |  | 
10052  | 2.34k  |     res = PyUnicode_New(sz, maxchar);  | 
10053  | 2.34k  |     if (res == NULL)  | 
10054  | 0  |         goto onError;  | 
10055  |  |  | 
10056  |  |     /* Catenate everything. */  | 
10057  |  | #ifdef Py_DEBUG  | 
10058  |  |     use_memcpy = 0;  | 
10059  |  | #else  | 
10060  | 2.34k  |     if (use_memcpy) { | 
10061  | 2.34k  |         res_data = PyUnicode_1BYTE_DATA(res);  | 
10062  | 2.34k  |         kind = PyUnicode_KIND(res);  | 
10063  | 2.34k  |         if (seplen != 0)  | 
10064  | 1.68k  |             sep_data = PyUnicode_1BYTE_DATA(sep);  | 
10065  | 2.34k  |     }  | 
10066  | 2.34k  | #endif  | 
10067  | 2.34k  |     if (use_memcpy) { | 
10068  | 8.74k  |         for (i = 0; i < seqlen; ++i) { | 
10069  | 6.39k  |             Py_ssize_t itemlen;  | 
10070  | 6.39k  |             item = items[i];  | 
10071  |  |  | 
10072  |  |             /* Copy item, and maybe the separator. */  | 
10073  | 6.39k  |             if (i && seplen != 0) { | 
10074  | 2.59k  |                 memcpy(res_data,  | 
10075  | 2.59k  |                           sep_data,  | 
10076  | 2.59k  |                           kind * seplen);  | 
10077  | 2.59k  |                 res_data += kind * seplen;  | 
10078  | 2.59k  |             }  | 
10079  |  |  | 
10080  | 6.39k  |             itemlen = PyUnicode_GET_LENGTH(item);  | 
10081  | 6.39k  |             if (itemlen != 0) { | 
10082  | 6.39k  |                 memcpy(res_data,  | 
10083  | 6.39k  |                           PyUnicode_DATA(item),  | 
10084  | 6.39k  |                           kind * itemlen);  | 
10085  | 6.39k  |                 res_data += kind * itemlen;  | 
10086  | 6.39k  |             }  | 
10087  | 6.39k  |         }  | 
10088  | 2.34k  |         assert(res_data == PyUnicode_1BYTE_DATA(res)  | 
10089  | 2.34k  |                            + kind * PyUnicode_GET_LENGTH(res));  | 
10090  | 2.34k  |     }  | 
10091  | 0  |     else { | 
10092  | 0  |         for (i = 0, res_offset = 0; i < seqlen; ++i) { | 
10093  | 0  |             Py_ssize_t itemlen;  | 
10094  | 0  |             item = items[i];  | 
10095  |  |  | 
10096  |  |             /* Copy item, and maybe the separator. */  | 
10097  | 0  |             if (i && seplen != 0) { | 
10098  | 0  |                 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);  | 
10099  | 0  |                 res_offset += seplen;  | 
10100  | 0  |             }  | 
10101  |  | 
  | 
10102  | 0  |             itemlen = PyUnicode_GET_LENGTH(item);  | 
10103  | 0  |             if (itemlen != 0) { | 
10104  | 0  |                 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);  | 
10105  | 0  |                 res_offset += itemlen;  | 
10106  | 0  |             }  | 
10107  | 0  |         }  | 
10108  | 0  |         assert(res_offset == PyUnicode_GET_LENGTH(res));  | 
10109  | 0  |     }  | 
10110  |  |  | 
10111  | 2.34k  |     Py_XDECREF(sep);  | 
10112  | 2.34k  |     assert(_PyUnicode_CheckConsistency(res, 1));  | 
10113  | 2.34k  |     return res;  | 
10114  |  |  | 
10115  | 0  |   onError:  | 
10116  | 0  |     Py_XDECREF(sep);  | 
10117  | 0  |     Py_XDECREF(res);  | 
10118  | 0  |     return NULL;  | 
10119  | 2.34k  | }  | 
10120  |  |  | 
10121  |  | void  | 
10122  |  | _PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,  | 
10123  |  |                     Py_UCS4 fill_char)  | 
10124  | 0  | { | 
10125  | 0  |     const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);  | 
10126  | 0  |     void *data = PyUnicode_DATA(unicode);  | 
10127  | 0  |     assert(PyUnicode_IS_READY(unicode));  | 
10128  | 0  |     assert(unicode_modifiable(unicode));  | 
10129  | 0  |     assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));  | 
10130  | 0  |     assert(start >= 0);  | 
10131  | 0  |     assert(start + length <= PyUnicode_GET_LENGTH(unicode));  | 
10132  | 0  |     unicode_fill(kind, data, fill_char, start, length);  | 
10133  | 0  | }  | 
10134  |  |  | 
10135  |  | Py_ssize_t  | 
10136  |  | PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,  | 
10137  |  |                Py_UCS4 fill_char)  | 
10138  | 0  | { | 
10139  | 0  |     Py_ssize_t maxlen;  | 
10140  |  | 
  | 
10141  | 0  |     if (!PyUnicode_Check(unicode)) { | 
10142  | 0  |         PyErr_BadInternalCall();  | 
10143  | 0  |         return -1;  | 
10144  | 0  |     }  | 
10145  | 0  |     if (PyUnicode_READY(unicode) == -1)  | 
10146  | 0  |         return -1;  | 
10147  | 0  |     if (unicode_check_modifiable(unicode))  | 
10148  | 0  |         return -1;  | 
10149  |  |  | 
10150  | 0  |     if (start < 0) { | 
10151  | 0  |         PyErr_SetString(PyExc_IndexError, "string index out of range");  | 
10152  | 0  |         return -1;  | 
10153  | 0  |     }  | 
10154  | 0  |     if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) { | 
10155  | 0  |         PyErr_SetString(PyExc_ValueError,  | 
10156  | 0  |                          "fill character is bigger than "  | 
10157  | 0  |                          "the string maximum character");  | 
10158  | 0  |         return -1;  | 
10159  | 0  |     }  | 
10160  |  |  | 
10161  | 0  |     maxlen = PyUnicode_GET_LENGTH(unicode) - start;  | 
10162  | 0  |     length = Py_MIN(maxlen, length);  | 
10163  | 0  |     if (length <= 0)  | 
10164  | 0  |         return 0;  | 
10165  |  |  | 
10166  | 0  |     _PyUnicode_FastFill(unicode, start, length, fill_char);  | 
10167  | 0  |     return length;  | 
10168  | 0  | }  | 
10169  |  |  | 
10170  |  | static PyObject *  | 
10171  |  | pad(PyObject *self,  | 
10172  |  |     Py_ssize_t left,  | 
10173  |  |     Py_ssize_t right,  | 
10174  |  |     Py_UCS4 fill)  | 
10175  | 0  | { | 
10176  | 0  |     PyObject *u;  | 
10177  | 0  |     Py_UCS4 maxchar;  | 
10178  | 0  |     int kind;  | 
10179  | 0  |     void *data;  | 
10180  |  | 
  | 
10181  | 0  |     if (left < 0)  | 
10182  | 0  |         left = 0;  | 
10183  | 0  |     if (right < 0)  | 
10184  | 0  |         right = 0;  | 
10185  |  | 
  | 
10186  | 0  |     if (left == 0 && right == 0)  | 
10187  | 0  |         return unicode_result_unchanged(self);  | 
10188  |  |  | 
10189  | 0  |     if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||  | 
10190  | 0  |         right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) { | 
10191  | 0  |         PyErr_SetString(PyExc_OverflowError, "padded string is too long");  | 
10192  | 0  |         return NULL;  | 
10193  | 0  |     }  | 
10194  | 0  |     maxchar = PyUnicode_MAX_CHAR_VALUE(self);  | 
10195  | 0  |     maxchar = Py_MAX(maxchar, fill);  | 
10196  | 0  |     u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);  | 
10197  | 0  |     if (!u)  | 
10198  | 0  |         return NULL;  | 
10199  |  |  | 
10200  | 0  |     kind = PyUnicode_KIND(u);  | 
10201  | 0  |     data = PyUnicode_DATA(u);  | 
10202  | 0  |     if (left)  | 
10203  | 0  |         unicode_fill(kind, data, fill, 0, left);  | 
10204  | 0  |     if (right)  | 
10205  | 0  |         unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);  | 
10206  | 0  |     _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));  | 
10207  | 0  |     assert(_PyUnicode_CheckConsistency(u, 1));  | 
10208  | 0  |     return u;  | 
10209  | 0  | }  | 
10210  |  |  | 
10211  |  | PyObject *  | 
10212  |  | PyUnicode_Splitlines(PyObject *string, int keepends)  | 
10213  | 0  | { | 
10214  | 0  |     PyObject *list;  | 
10215  |  | 
  | 
10216  | 0  |     if (ensure_unicode(string) < 0)  | 
10217  | 0  |         return NULL;  | 
10218  |  |  | 
10219  | 0  |     switch (PyUnicode_KIND(string)) { | 
10220  | 0  |     case PyUnicode_1BYTE_KIND:  | 
10221  | 0  |         if (PyUnicode_IS_ASCII(string))  | 
10222  | 0  |             list = asciilib_splitlines(  | 
10223  | 0  |                 string, PyUnicode_1BYTE_DATA(string),  | 
10224  | 0  |                 PyUnicode_GET_LENGTH(string), keepends);  | 
10225  | 0  |         else  | 
10226  | 0  |             list = ucs1lib_splitlines(  | 
10227  | 0  |                 string, PyUnicode_1BYTE_DATA(string),  | 
10228  | 0  |                 PyUnicode_GET_LENGTH(string), keepends);  | 
10229  | 0  |         break;  | 
10230  | 0  |     case PyUnicode_2BYTE_KIND:  | 
10231  | 0  |         list = ucs2lib_splitlines(  | 
10232  | 0  |             string, PyUnicode_2BYTE_DATA(string),  | 
10233  | 0  |             PyUnicode_GET_LENGTH(string), keepends);  | 
10234  | 0  |         break;  | 
10235  | 0  |     case PyUnicode_4BYTE_KIND:  | 
10236  | 0  |         list = ucs4lib_splitlines(  | 
10237  | 0  |             string, PyUnicode_4BYTE_DATA(string),  | 
10238  | 0  |             PyUnicode_GET_LENGTH(string), keepends);  | 
10239  | 0  |         break;  | 
10240  | 0  |     default:  | 
10241  | 0  |         Py_UNREACHABLE();  | 
10242  | 0  |     }  | 
10243  | 0  |     return list;  | 
10244  | 0  | }  | 
10245  |  |  | 
10246  |  | static PyObject *  | 
10247  |  | split(PyObject *self,  | 
10248  |  |       PyObject *substring,  | 
10249  |  |       Py_ssize_t maxcount)  | 
10250  | 74  | { | 
10251  | 74  |     int kind1, kind2;  | 
10252  | 74  |     void *buf1, *buf2;  | 
10253  | 74  |     Py_ssize_t len1, len2;  | 
10254  | 74  |     PyObject* out;  | 
10255  |  |  | 
10256  | 74  |     if (maxcount < 0)  | 
10257  | 74  |         maxcount = PY_SSIZE_T_MAX;  | 
10258  |  |  | 
10259  | 74  |     if (PyUnicode_READY(self) == -1)  | 
10260  | 0  |         return NULL;  | 
10261  |  |  | 
10262  | 74  |     if (substring == NULL)  | 
10263  | 4  |         switch (PyUnicode_KIND(self)) { | 
10264  | 4  |         case PyUnicode_1BYTE_KIND:  | 
10265  | 4  |             if (PyUnicode_IS_ASCII(self))  | 
10266  | 4  |                 return asciilib_split_whitespace(  | 
10267  | 4  |                     self,  PyUnicode_1BYTE_DATA(self),  | 
10268  | 4  |                     PyUnicode_GET_LENGTH(self), maxcount  | 
10269  | 4  |                     );  | 
10270  | 0  |             else  | 
10271  | 0  |                 return ucs1lib_split_whitespace(  | 
10272  | 0  |                     self,  PyUnicode_1BYTE_DATA(self),  | 
10273  | 0  |                     PyUnicode_GET_LENGTH(self), maxcount  | 
10274  | 0  |                     );  | 
10275  | 0  |         case PyUnicode_2BYTE_KIND:  | 
10276  | 0  |             return ucs2lib_split_whitespace(  | 
10277  | 0  |                 self,  PyUnicode_2BYTE_DATA(self),  | 
10278  | 0  |                 PyUnicode_GET_LENGTH(self), maxcount  | 
10279  | 0  |                 );  | 
10280  | 0  |         case PyUnicode_4BYTE_KIND:  | 
10281  | 0  |             return ucs4lib_split_whitespace(  | 
10282  | 0  |                 self,  PyUnicode_4BYTE_DATA(self),  | 
10283  | 0  |                 PyUnicode_GET_LENGTH(self), maxcount  | 
10284  | 0  |                 );  | 
10285  | 0  |         default:  | 
10286  | 0  |             Py_UNREACHABLE();  | 
10287  | 4  |         }  | 
10288  |  |  | 
10289  | 70  |     if (PyUnicode_READY(substring) == -1)  | 
10290  | 0  |         return NULL;  | 
10291  |  |  | 
10292  | 70  |     kind1 = PyUnicode_KIND(self);  | 
10293  | 70  |     kind2 = PyUnicode_KIND(substring);  | 
10294  | 70  |     len1 = PyUnicode_GET_LENGTH(self);  | 
10295  | 70  |     len2 = PyUnicode_GET_LENGTH(substring);  | 
10296  | 70  |     if (kind1 < kind2 || len1 < len2) { | 
10297  | 0  |         out = PyList_New(1);  | 
10298  | 0  |         if (out == NULL)  | 
10299  | 0  |             return NULL;  | 
10300  | 0  |         Py_INCREF(self);  | 
10301  | 0  |         PyList_SET_ITEM(out, 0, self);  | 
10302  | 0  |         return out;  | 
10303  | 0  |     }  | 
10304  | 70  |     buf1 = PyUnicode_DATA(self);  | 
10305  | 70  |     buf2 = PyUnicode_DATA(substring);  | 
10306  | 70  |     if (kind2 != kind1) { | 
10307  | 0  |         buf2 = _PyUnicode_AsKind(substring, kind1);  | 
10308  | 0  |         if (!buf2)  | 
10309  | 0  |             return NULL;  | 
10310  | 0  |     }  | 
10311  |  |  | 
10312  | 70  |     switch (kind1) { | 
10313  | 70  |     case PyUnicode_1BYTE_KIND:  | 
10314  | 70  |         if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))  | 
10315  | 70  |             out = asciilib_split(  | 
10316  | 70  |                 self,  buf1, len1, buf2, len2, maxcount);  | 
10317  | 0  |         else  | 
10318  | 0  |             out = ucs1lib_split(  | 
10319  | 0  |                 self,  buf1, len1, buf2, len2, maxcount);  | 
10320  | 70  |         break;  | 
10321  | 0  |     case PyUnicode_2BYTE_KIND:  | 
10322  | 0  |         out = ucs2lib_split(  | 
10323  | 0  |             self,  buf1, len1, buf2, len2, maxcount);  | 
10324  | 0  |         break;  | 
10325  | 0  |     case PyUnicode_4BYTE_KIND:  | 
10326  | 0  |         out = ucs4lib_split(  | 
10327  | 0  |             self,  buf1, len1, buf2, len2, maxcount);  | 
10328  | 0  |         break;  | 
10329  | 0  |     default:  | 
10330  | 0  |         out = NULL;  | 
10331  | 70  |     }  | 
10332  | 70  |     if (kind2 != kind1)  | 
10333  | 0  |         PyMem_Free(buf2);  | 
10334  | 70  |     return out;  | 
10335  | 70  | }  | 
10336  |  |  | 
10337  |  | static PyObject *  | 
10338  |  | rsplit(PyObject *self,  | 
10339  |  |        PyObject *substring,  | 
10340  |  |        Py_ssize_t maxcount)  | 
10341  | 0  | { | 
10342  | 0  |     int kind1, kind2;  | 
10343  | 0  |     void *buf1, *buf2;  | 
10344  | 0  |     Py_ssize_t len1, len2;  | 
10345  | 0  |     PyObject* out;  | 
10346  |  | 
  | 
10347  | 0  |     if (maxcount < 0)  | 
10348  | 0  |         maxcount = PY_SSIZE_T_MAX;  | 
10349  |  | 
  | 
10350  | 0  |     if (PyUnicode_READY(self) == -1)  | 
10351  | 0  |         return NULL;  | 
10352  |  |  | 
10353  | 0  |     if (substring == NULL)  | 
10354  | 0  |         switch (PyUnicode_KIND(self)) { | 
10355  | 0  |         case PyUnicode_1BYTE_KIND:  | 
10356  | 0  |             if (PyUnicode_IS_ASCII(self))  | 
10357  | 0  |                 return asciilib_rsplit_whitespace(  | 
10358  | 0  |                     self,  PyUnicode_1BYTE_DATA(self),  | 
10359  | 0  |                     PyUnicode_GET_LENGTH(self), maxcount  | 
10360  | 0  |                     );  | 
10361  | 0  |             else  | 
10362  | 0  |                 return ucs1lib_rsplit_whitespace(  | 
10363  | 0  |                     self,  PyUnicode_1BYTE_DATA(self),  | 
10364  | 0  |                     PyUnicode_GET_LENGTH(self), maxcount  | 
10365  | 0  |                     );  | 
10366  | 0  |         case PyUnicode_2BYTE_KIND:  | 
10367  | 0  |             return ucs2lib_rsplit_whitespace(  | 
10368  | 0  |                 self,  PyUnicode_2BYTE_DATA(self),  | 
10369  | 0  |                 PyUnicode_GET_LENGTH(self), maxcount  | 
10370  | 0  |                 );  | 
10371  | 0  |         case PyUnicode_4BYTE_KIND:  | 
10372  | 0  |             return ucs4lib_rsplit_whitespace(  | 
10373  | 0  |                 self,  PyUnicode_4BYTE_DATA(self),  | 
10374  | 0  |                 PyUnicode_GET_LENGTH(self), maxcount  | 
10375  | 0  |                 );  | 
10376  | 0  |         default:  | 
10377  | 0  |             Py_UNREACHABLE();  | 
10378  | 0  |         }  | 
10379  |  |  | 
10380  | 0  |     if (PyUnicode_READY(substring) == -1)  | 
10381  | 0  |         return NULL;  | 
10382  |  |  | 
10383  | 0  |     kind1 = PyUnicode_KIND(self);  | 
10384  | 0  |     kind2 = PyUnicode_KIND(substring);  | 
10385  | 0  |     len1 = PyUnicode_GET_LENGTH(self);  | 
10386  | 0  |     len2 = PyUnicode_GET_LENGTH(substring);  | 
10387  | 0  |     if (kind1 < kind2 || len1 < len2) { | 
10388  | 0  |         out = PyList_New(1);  | 
10389  | 0  |         if (out == NULL)  | 
10390  | 0  |             return NULL;  | 
10391  | 0  |         Py_INCREF(self);  | 
10392  | 0  |         PyList_SET_ITEM(out, 0, self);  | 
10393  | 0  |         return out;  | 
10394  | 0  |     }  | 
10395  | 0  |     buf1 = PyUnicode_DATA(self);  | 
10396  | 0  |     buf2 = PyUnicode_DATA(substring);  | 
10397  | 0  |     if (kind2 != kind1) { | 
10398  | 0  |         buf2 = _PyUnicode_AsKind(substring, kind1);  | 
10399  | 0  |         if (!buf2)  | 
10400  | 0  |             return NULL;  | 
10401  | 0  |     }  | 
10402  |  |  | 
10403  | 0  |     switch (kind1) { | 
10404  | 0  |     case PyUnicode_1BYTE_KIND:  | 
10405  | 0  |         if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))  | 
10406  | 0  |             out = asciilib_rsplit(  | 
10407  | 0  |                 self,  buf1, len1, buf2, len2, maxcount);  | 
10408  | 0  |         else  | 
10409  | 0  |             out = ucs1lib_rsplit(  | 
10410  | 0  |                 self,  buf1, len1, buf2, len2, maxcount);  | 
10411  | 0  |         break;  | 
10412  | 0  |     case PyUnicode_2BYTE_KIND:  | 
10413  | 0  |         out = ucs2lib_rsplit(  | 
10414  | 0  |             self,  buf1, len1, buf2, len2, maxcount);  | 
10415  | 0  |         break;  | 
10416  | 0  |     case PyUnicode_4BYTE_KIND:  | 
10417  | 0  |         out = ucs4lib_rsplit(  | 
10418  | 0  |             self,  buf1, len1, buf2, len2, maxcount);  | 
10419  | 0  |         break;  | 
10420  | 0  |     default:  | 
10421  | 0  |         out = NULL;  | 
10422  | 0  |     }  | 
10423  | 0  |     if (kind2 != kind1)  | 
10424  | 0  |         PyMem_Free(buf2);  | 
10425  | 0  |     return out;  | 
10426  | 0  | }  | 
10427  |  |  | 
10428  |  | static Py_ssize_t  | 
10429  |  | anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,  | 
10430  |  |             PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)  | 
10431  | 18  | { | 
10432  | 18  |     switch (kind) { | 
10433  | 18  |     case PyUnicode_1BYTE_KIND:  | 
10434  | 18  |         if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))  | 
10435  | 18  |             return asciilib_find(buf1, len1, buf2, len2, offset);  | 
10436  | 0  |         else  | 
10437  | 0  |             return ucs1lib_find(buf1, len1, buf2, len2, offset);  | 
10438  | 0  |     case PyUnicode_2BYTE_KIND:  | 
10439  | 0  |         return ucs2lib_find(buf1, len1, buf2, len2, offset);  | 
10440  | 0  |     case PyUnicode_4BYTE_KIND:  | 
10441  | 0  |         return ucs4lib_find(buf1, len1, buf2, len2, offset);  | 
10442  | 18  |     }  | 
10443  | 18  |     Py_UNREACHABLE();  | 
10444  | 18  | }  | 
10445  |  |  | 
10446  |  | static Py_ssize_t  | 
10447  |  | anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,  | 
10448  |  |              PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)  | 
10449  | 2  | { | 
10450  | 2  |     switch (kind) { | 
10451  | 2  |     case PyUnicode_1BYTE_KIND:  | 
10452  | 2  |         if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))  | 
10453  | 2  |             return asciilib_count(sbuf, slen, buf1, len1, maxcount);  | 
10454  | 0  |         else  | 
10455  | 0  |             return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);  | 
10456  | 0  |     case PyUnicode_2BYTE_KIND:  | 
10457  | 0  |         return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);  | 
10458  | 0  |     case PyUnicode_4BYTE_KIND:  | 
10459  | 0  |         return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);  | 
10460  | 2  |     }  | 
10461  | 2  |     Py_UNREACHABLE();  | 
10462  | 2  | }  | 
10463  |  |  | 
10464  |  | static void  | 
10465  |  | replace_1char_inplace(PyObject *u, Py_ssize_t pos,  | 
10466  |  |                       Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)  | 
10467  | 0  | { | 
10468  | 0  |     int kind = PyUnicode_KIND(u);  | 
10469  | 0  |     void *data = PyUnicode_DATA(u);  | 
10470  | 0  |     Py_ssize_t len = PyUnicode_GET_LENGTH(u);  | 
10471  | 0  |     if (kind == PyUnicode_1BYTE_KIND) { | 
10472  | 0  |         ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,  | 
10473  | 0  |                                       (Py_UCS1 *)data + len,  | 
10474  | 0  |                                       u1, u2, maxcount);  | 
10475  | 0  |     }  | 
10476  | 0  |     else if (kind == PyUnicode_2BYTE_KIND) { | 
10477  | 0  |         ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,  | 
10478  | 0  |                                       (Py_UCS2 *)data + len,  | 
10479  | 0  |                                       u1, u2, maxcount);  | 
10480  | 0  |     }  | 
10481  | 0  |     else { | 
10482  | 0  |         assert(kind == PyUnicode_4BYTE_KIND);  | 
10483  | 0  |         ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,  | 
10484  | 0  |                                       (Py_UCS4 *)data + len,  | 
10485  | 0  |                                       u1, u2, maxcount);  | 
10486  | 0  |     }  | 
10487  | 0  | }  | 
10488  |  |  | 
/* Return a string in which occurrences of str1 inside self have been
   replaced by str2, or NULL on error.

   maxcount limits the number of replacements; a negative value is mapped
   to PY_SSIZE_T_MAX.  When there is nothing to replace the function jumps
   to `nothing` and returns unicode_result_unchanged(self).

   The kind, data pointer and length of all three strings are read up front
   without any PyUnicode_READY() call in this function --
   NOTE(review): callers are presumably required to pass ready strings;
   confirm.

   sbuf, buf1 and buf2 initially point at the strings' own storage.  When
   one of them is replaced by a converted copy from _PyUnicode_AsKind(),
   the matching srelease / release1 / release2 flag is set so that every
   exit path frees the copy. */
10489  |  | static PyObject *  | 
10490  |  | replace(PyObject *self, PyObject *str1,  | 
10491  |  |         PyObject *str2, Py_ssize_t maxcount)  | 
10492  | 18  | { | 
10493  | 18  |     PyObject *u;  | 
10494  | 18  |     char *sbuf = PyUnicode_DATA(self);  | 
10495  | 18  |     char *buf1 = PyUnicode_DATA(str1);  | 
10496  | 18  |     char *buf2 = PyUnicode_DATA(str2);  | 
10497  | 18  |     int srelease = 0, release1 = 0, release2 = 0;  | 
10498  | 18  |     int skind = PyUnicode_KIND(self);  | 
10499  | 18  |     int kind1 = PyUnicode_KIND(str1);  | 
10500  | 18  |     int kind2 = PyUnicode_KIND(str2);  | 
10501  | 18  |     Py_ssize_t slen = PyUnicode_GET_LENGTH(self);  | 
10502  | 18  |     Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);  | 
10503  | 18  |     Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);  | 
10504  | 18  |     int mayshrink;  | 
10505  | 18  |     Py_UCS4 maxchar, maxchar_str1, maxchar_str2;  | 
10506  |  |  | 
10507  | 18  |     if (maxcount < 0)  | 
10508  | 18  |         maxcount = PY_SSIZE_T_MAX;  | 
10509  | 0  |     else if (maxcount == 0 || slen == 0)  | 
10510  | 0  |         goto nothing;  | 
10511  |  |  | 
    /* Pattern and replacement are the very same object: no change. */
10512  | 18  |     if (str1 == str2)  | 
10513  | 0  |         goto nothing;  | 
10514  |  |  | 
10515  | 18  |     maxchar = PyUnicode_MAX_CHAR_VALUE(self);  | 
10516  | 18  |     maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);  | 
10517  | 18  |     if (maxchar < maxchar_str1)  | 
10518  |  |         /* substring too wide to be present */  | 
10519  | 0  |         goto nothing;  | 
10520  | 18  |     maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);  | 
10521  |  |     /* Replacing str1 with str2 may cause a maxchar reduction in the  | 
10522  |  |        result string. */  | 
10523  | 18  |     mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);  | 
    /* From here on maxchar is the value used to allocate the result. */
10524  | 18  |     maxchar = Py_MAX(maxchar, maxchar_str2);  | 
10525  |  |  | 
10526  | 18  |     if (len1 == len2) { | 
10527  |  |         /* same length */  | 
10528  | 16  |         if (len1 == 0)  | 
10529  | 0  |             goto nothing;  | 
10530  | 16  |         if (len1 == 1) { | 
10531  |  |             /* replace characters */  | 
10532  | 16  |             Py_UCS4 u1, u2;  | 
10533  | 16  |             Py_ssize_t pos;  | 
10534  |  |  | 
10535  | 16  |             u1 = PyUnicode_READ(kind1, buf1, 0);  | 
10536  | 16  |             pos = findchar(sbuf, skind, slen, u1, 1);  | 
10537  | 16  |             if (pos < 0)  | 
10538  | 16  |                 goto nothing;  | 
10539  | 0  |             u2 = PyUnicode_READ(kind2, buf2, 0);  | 
10540  | 0  |             u = PyUnicode_New(slen, maxchar);  | 
10541  | 0  |             if (!u)  | 
10542  | 0  |                 goto error;  | 
10543  |  |  | 
            /* Copy all of self into the result, then let the helper work
               on the result starting at the first hit. */
10544  | 0  |             _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);  | 
10545  | 0  |             replace_1char_inplace(u, pos, u1, u2, maxcount);  | 
10546  | 0  |         }  | 
10547  | 0  |         else { | 
            /* rkind is the kind all three buffers are brought to; it starts
               as the kind of self and is raised to kind2 if str2 is wider. */
10548  | 0  |             int rkind = skind;  | 
10549  | 0  |             char *res;  | 
10550  | 0  |             Py_ssize_t i;  | 
10551  |  |
  | 
10552  | 0  |             if (kind1 < rkind) { | 
10553  |  |                 /* widen substring */  | 
10554  | 0  |                 buf1 = _PyUnicode_AsKind(str1, rkind);  | 
10555  | 0  |                 if (!buf1) goto error;  | 
10556  | 0  |                 release1 = 1;  | 
10557  | 0  |             }  | 
10558  | 0  |             i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);  | 
10559  | 0  |             if (i < 0)  | 
10560  | 0  |                 goto nothing;  | 
10561  | 0  |             if (rkind > kind2) { | 
10562  |  |                 /* widen replacement */  | 
10563  | 0  |                 buf2 = _PyUnicode_AsKind(str2, rkind);  | 
10564  | 0  |                 if (!buf2) goto error;  | 
10565  | 0  |                 release2 = 1;  | 
10566  | 0  |             }  | 
10567  | 0  |             else if (rkind < kind2) { | 
10568  |  |                 /* widen self and buf1 */  | 
10569  | 0  |                 rkind = kind2;  | 
                /* Drop the earlier widened copy of str1 (if any) and clear
                   its flag before anything below can jump to error. */
10570  | 0  |                 if (release1) PyMem_Free(buf1);  | 
10571  | 0  |                 release1 = 0;  | 
10572  | 0  |                 sbuf = _PyUnicode_AsKind(self, rkind);  | 
10573  | 0  |                 if (!sbuf) goto error;  | 
10574  | 0  |                 srelease = 1;  | 
10575  | 0  |                 buf1 = _PyUnicode_AsKind(str1, rkind);  | 
10576  | 0  |                 if (!buf1) goto error;  | 
10577  | 0  |                 release1 = 1;  | 
10578  | 0  |             }  | 
10579  | 0  |             u = PyUnicode_New(slen, maxchar);  | 
10580  | 0  |             if (!u)  | 
10581  | 0  |                 goto error;  | 
10582  | 0  |             assert(PyUnicode_KIND(u) == rkind);  | 
10583  | 0  |             res = PyUnicode_DATA(u);  | 
10584  |  |
  | 
            /* Pattern and replacement have the same length, so the result
               is a copy of self with each match overwritten. */
10585  | 0  |             memcpy(res, sbuf, rkind * slen);  | 
10586  |  |             /* change everything in-place, starting with this one */  | 
10587  | 0  |             memcpy(res + rkind * i,  | 
10588  | 0  |                    buf2,  | 
10589  | 0  |                    rkind * len2);  | 
10590  | 0  |             i += len1;  | 
10591  |  |
  | 
            /* The first match was handled above, hence the pre-decrement.
               NOTE(review): the search gets the tail of sbuf plus offset i
               and its result is used as an index into the whole string, so
               anylib_find() presumably adds the offset back -- confirm. */
10592  | 0  |             while ( --maxcount > 0) { | 
10593  | 0  |                 i = anylib_find(rkind, self,  | 
10594  | 0  |                                 sbuf+rkind*i, slen-i,  | 
10595  | 0  |                                 str1, buf1, len1, i);  | 
10596  | 0  |                 if (i == -1)  | 
10597  | 0  |                     break;  | 
10598  | 0  |                 memcpy(res + rkind * i,  | 
10599  | 0  |                        buf2,  | 
10600  | 0  |                        rkind * len2);  | 
10601  | 0  |                 i += len1;  | 
10602  | 0  |             }  | 
10603  | 0  |         }  | 
10604  | 16  |     }  | 
10605  | 2  |     else { | 
        /* Lengths differ: count the matches first so the result can be
           allocated at its final size, then assemble it piece by piece.
           n = number of matches, i = read index into sbuf, j = index of
           the current match, ires = write index into res. */
10606  | 2  |         Py_ssize_t n, i, j, ires;  | 
10607  | 2  |         Py_ssize_t new_size;  | 
10608  | 2  |         int rkind = skind;  | 
10609  | 2  |         char *res;  | 
10610  |  |  | 
10611  | 2  |         if (kind1 < rkind) { | 
10612  |  |             /* widen substring */  | 
10613  | 0  |             buf1 = _PyUnicode_AsKind(str1, rkind);  | 
10614  | 0  |             if (!buf1) goto error;  | 
10615  | 0  |             release1 = 1;  | 
10616  | 0  |         }  | 
10617  | 2  |         n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);  | 
10618  | 2  |         if (n == 0)  | 
10619  | 0  |             goto nothing;  | 
10620  | 2  |         if (kind2 < rkind) { | 
10621  |  |             /* widen replacement */  | 
10622  | 0  |             buf2 = _PyUnicode_AsKind(str2, rkind);  | 
10623  | 0  |             if (!buf2) goto error;  | 
10624  | 0  |             release2 = 1;  | 
10625  | 0  |         }  | 
10626  | 2  |         else if (kind2 > rkind) { | 
10627  |  |             /* widen self and buf1 */  | 
10628  | 0  |             rkind = kind2;  | 
10629  | 0  |             sbuf = _PyUnicode_AsKind(self, rkind);  | 
10630  | 0  |             if (!sbuf) goto error;  | 
10631  | 0  |             srelease = 1;  | 
10632  | 0  |             if (release1) PyMem_Free(buf1);  | 
10633  | 0  |             release1 = 0;  | 
10634  | 0  |             buf1 = _PyUnicode_AsKind(str1, rkind);  | 
10635  | 0  |             if (!buf1) goto error;  | 
10636  | 0  |             release1 = 1;  | 
10637  | 0  |         }  | 
10638  |  |         /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -  | 
10639  |  |            PyUnicode_GET_LENGTH(str1))); */  | 
        /* Reject growth for which slen + n * (len2 - len1) would exceed
           PY_SSIZE_T_MAX; n is non-zero here because n == 0 left above. */
10640  | 2  |         if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) { | 
10641  | 0  |                 PyErr_SetString(PyExc_OverflowError,  | 
10642  | 0  |                                 "replace string is too long");  | 
10643  | 0  |                 goto error;  | 
10644  | 0  |         }  | 
10645  | 2  |         new_size = slen + n * (len2 - len1);  | 
10646  | 2  |         if (new_size == 0) { | 
10647  | 0  |             _Py_INCREF_UNICODE_EMPTY();  | 
10648  | 0  |             if (!unicode_empty)  | 
10649  | 0  |                 goto error;  | 
10650  | 0  |             u = unicode_empty;  | 
10651  | 0  |             goto done;  | 
10652  | 0  |         }  | 
        /* Reject sizes whose byte count rkind * new_size would exceed
           PY_SSIZE_T_MAX. */
10653  | 2  |         if (new_size > (PY_SSIZE_T_MAX / rkind)) { | 
10654  | 0  |             PyErr_SetString(PyExc_OverflowError,  | 
10655  | 0  |                             "replace string is too long");  | 
10656  | 0  |             goto error;  | 
10657  | 0  |         }  | 
10658  | 2  |         u = PyUnicode_New(new_size, maxchar);  | 
10659  | 2  |         if (!u)  | 
10660  | 0  |             goto error;  | 
10661  | 2  |         assert(PyUnicode_KIND(u) == rkind);  | 
10662  | 2  |         res = PyUnicode_DATA(u);  | 
10663  | 2  |         ires = i = 0;  | 
10664  | 2  |         if (len1 > 0) { | 
10665  | 20  |             while (n-- > 0) { | 
10666  |  |                 /* look for next match */  | 
10667  | 18  |                 j = anylib_find(rkind, self,  | 
10668  | 18  |                                 sbuf + rkind * i, slen-i,  | 
10669  | 18  |                                 str1, buf1, len1, i);  | 
10670  | 18  |                 if (j == -1)  | 
10671  | 0  |                     break;  | 
10672  | 18  |                 else if (j > i) { | 
10673  |  |                     /* copy unchanged part [i:j] */  | 
10674  | 18  |                     memcpy(res + rkind * ires,  | 
10675  | 18  |                            sbuf + rkind * i,  | 
10676  | 18  |                            rkind * (j-i));  | 
10677  | 18  |                     ires += j - i;  | 
10678  | 18  |                 }  | 
10679  |  |                 /* copy substitution string */  | 
10680  | 18  |                 if (len2 > 0) { | 
10681  | 0  |                     memcpy(res + rkind * ires,  | 
10682  | 0  |                            buf2,  | 
10683  | 0  |                            rkind * len2);  | 
10684  | 0  |                     ires += len2;  | 
10685  | 0  |                 }  | 
10686  | 18  |                 i = j + len1;  | 
10687  | 18  |             }  | 
10688  | 2  |             if (i < slen)  | 
10689  |  |                 /* copy tail [i:] */  | 
10690  | 2  |                 memcpy(res + rkind * ires,  | 
10691  | 2  |                        sbuf + rkind * i,  | 
10692  | 2  |                        rkind * (slen-i));  | 
10693  | 2  |         }  | 
10694  | 0  |         else { | 
            /* Empty pattern (len1 == 0): write str2, then one character of
               self, alternating until n copies of str2 have been written;
               whatever is left of self is appended afterwards. */
10695  |  |             /* interleave */  | 
10696  | 0  |             while (n > 0) { | 
10697  | 0  |                 memcpy(res + rkind * ires,  | 
10698  | 0  |                        buf2,  | 
10699  | 0  |                        rkind * len2);  | 
10700  | 0  |                 ires += len2;  | 
10701  | 0  |                 if (--n <= 0)  | 
10702  | 0  |                     break;  | 
10703  | 0  |                 memcpy(res + rkind * ires,  | 
10704  | 0  |                        sbuf + rkind * i,  | 
10705  | 0  |                        rkind);  | 
10706  | 0  |                 ires++;  | 
10707  | 0  |                 i++;  | 
10708  | 0  |             }  | 
10709  | 0  |             memcpy(res + rkind * ires,  | 
10710  | 0  |                    sbuf + rkind * i,  | 
10711  | 0  |                    rkind * (slen-i));  | 
10712  | 0  |         }  | 
10713  | 2  |     }  | 
10714  |  |  | 
    /* The result was allocated for the larger maxchar; if the widest
       characters may have been replaced away, let unicode_adjust_maxchar()
       look at the result again (it may set u to NULL). */
10715  | 2  |     if (mayshrink) { | 
10716  | 0  |         unicode_adjust_maxchar(&u);  | 
10717  | 0  |         if (u == NULL)  | 
10718  | 0  |             goto error;  | 
10719  | 0  |     }  | 
10720  |  |  | 
    /* Success: free any converted copies and return the new string. */
10721  | 2  |   done:  | 
10722  | 2  |     if (srelease)  | 
10723  | 0  |         PyMem_FREE(sbuf);  | 
10724  | 2  |     if (release1)  | 
10725  | 0  |         PyMem_FREE(buf1);  | 
10726  | 2  |     if (release2)  | 
10727  | 0  |         PyMem_FREE(buf2);  | 
10728  | 2  |     assert(_PyUnicode_CheckConsistency(u, 1));  | 
10729  | 2  |     return u;  | 
10730  |  |  | 
10731  | 16  |   nothing:  | 
10732  |  |     /* nothing to replace; return original string (when possible) */  | 
10733  | 16  |     if (srelease)  | 
10734  | 0  |         PyMem_FREE(sbuf);  | 
10735  | 16  |     if (release1)  | 
10736  | 0  |         PyMem_FREE(buf1);  | 
10737  | 16  |     if (release2)  | 
10738  | 0  |         PyMem_FREE(buf2);  | 
10739  | 16  |     return unicode_result_unchanged(self);  | 
10740  |  |  | 
    /* Failure: free any converted copies and return NULL.  A buffer
       pointer can be NULL here while its flag is still set only for sbuf
       in the second "widen self and buf1" path, hence the extra checks. */
10741  | 0  |   error:  | 
10742  | 0  |     if (srelease && sbuf)  | 
10743  | 0  |         PyMem_FREE(sbuf);  | 
10744  | 0  |     if (release1 && buf1)  | 
10745  | 0  |         PyMem_FREE(buf1);  | 
10746  | 0  |     if (release2 && buf2)  | 
10747  | 0  |         PyMem_FREE(buf2);  | 
10748  | 0  |     return NULL;  | 
10749  | 2  | }  | 
10750  |  |  | 
10751  |  | /* --- Unicode Object Methods --------------------------------------------- */  | 
10752  |  |  | 
10753  |  | /*[clinic input]  | 
10754  |  | str.title as unicode_title  | 
10755  |  |  | 
10756  |  | Return a version of the string where each word is titlecased.  | 
10757  |  |  | 
10758  |  | More specifically, words start with uppercased characters and all remaining  | 
10759  |  | cased characters have lower case.  | 
10760  |  | [clinic start generated code]*/  | 
10761  |  |  | 
10762  |  | static PyObject *  | 
10763  |  | unicode_title_impl(PyObject *self)  | 
10764  |  | /*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/  | 
10765  | 0  | { | 
10766  | 0  |     if (PyUnicode_READY(self) == -1)  | 
10767  | 0  |         return NULL;  | 
10768  | 0  |     return case_operation(self, do_title);  | 
10769  | 0  | }  | 
10770  |  |  | 
10771  |  | /*[clinic input]  | 
10772  |  | str.capitalize as unicode_capitalize  | 
10773  |  |  | 
10774  |  | Return a capitalized version of the string.  | 
10775  |  |  | 
10776  |  | More specifically, make the first character have upper case and the rest lower  | 
10777  |  | case.  | 
10778  |  | [clinic start generated code]*/  | 
10779  |  |  | 
10780  |  | static PyObject *  | 
10781  |  | unicode_capitalize_impl(PyObject *self)  | 
10782  |  | /*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/  | 
10783  | 0  | { | 
10784  | 0  |     if (PyUnicode_READY(self) == -1)  | 
10785  | 0  |         return NULL;  | 
10786  | 0  |     if (PyUnicode_GET_LENGTH(self) == 0)  | 
10787  | 0  |         return unicode_result_unchanged(self);  | 
10788  | 0  |     return case_operation(self, do_capitalize);  | 
10789  | 0  | }  | 
10790  |  |  | 
10791  |  | /*[clinic input]  | 
10792  |  | str.casefold as unicode_casefold  | 
10793  |  |  | 
10794  |  | Return a version of the string suitable for caseless comparisons.  | 
10795  |  | [clinic start generated code]*/  | 
10796  |  |  | 
10797  |  | static PyObject *  | 
10798  |  | unicode_casefold_impl(PyObject *self)  | 
10799  |  | /*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/  | 
10800  | 0  | { | 
10801  | 0  |     if (PyUnicode_READY(self) == -1)  | 
10802  | 0  |         return NULL;  | 
10803  | 0  |     if (PyUnicode_IS_ASCII(self))  | 
10804  | 0  |         return ascii_upper_or_lower(self, 1);  | 
10805  | 0  |     return case_operation(self, do_casefold);  | 
10806  | 0  | }  | 
10807  |  |  | 
10808  |  |  | 
10809  |  | /* Argument converter. Accepts a single Unicode character. */  | 
10810  |  |  | 
10811  |  | static int  | 
10812  |  | convert_uc(PyObject *obj, void *addr)  | 
10813  | 0  | { | 
10814  | 0  |     Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;  | 
10815  |  | 
  | 
10816  | 0  |     if (!PyUnicode_Check(obj)) { | 
10817  | 0  |         PyErr_Format(PyExc_TypeError,  | 
10818  | 0  |                      "The fill character must be a unicode character, "  | 
10819  | 0  |                      "not %.100s", Py_TYPE(obj)->tp_name);  | 
10820  | 0  |         return 0;  | 
10821  | 0  |     }  | 
10822  | 0  |     if (PyUnicode_READY(obj) < 0)  | 
10823  | 0  |         return 0;  | 
10824  | 0  |     if (PyUnicode_GET_LENGTH(obj) != 1) { | 
10825  | 0  |         PyErr_SetString(PyExc_TypeError,  | 
10826  | 0  |                         "The fill character must be exactly one character long");  | 
10827  | 0  |         return 0;  | 
10828  | 0  |     }  | 
10829  | 0  |     *fillcharloc = PyUnicode_READ_CHAR(obj, 0);  | 
10830  | 0  |     return 1;  | 
10831  | 0  | }  | 
10832  |  |  | 
10833  |  | /*[clinic input]  | 
10834  |  | str.center as unicode_center  | 
10835  |  |  | 
10836  |  |     width: Py_ssize_t  | 
10837  |  |     fillchar: Py_UCS4 = ' '  | 
10838  |  |     /  | 
10839  |  |  | 
10840  |  | Return a centered string of length width.  | 
10841  |  |  | 
10842  |  | Padding is done using the specified fill character (default is a space).  | 
10843  |  | [clinic start generated code]*/  | 
10844  |  |  | 
10845  |  | static PyObject *  | 
10846  |  | unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)  | 
10847  |  | /*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/  | 
10848  | 0  | { | 
10849  | 0  |     Py_ssize_t marg, left;  | 
10850  |  | 
  | 
10851  | 0  |     if (PyUnicode_READY(self) == -1)  | 
10852  | 0  |         return NULL;  | 
10853  |  |  | 
10854  | 0  |     if (PyUnicode_GET_LENGTH(self) >= width)  | 
10855  | 0  |         return unicode_result_unchanged(self);  | 
10856  |  |  | 
10857  | 0  |     marg = width - PyUnicode_GET_LENGTH(self);  | 
10858  | 0  |     left = marg / 2 + (marg & width & 1);  | 
10859  |  | 
  | 
10860  | 0  |     return pad(self, left, marg - left, fillchar);  | 
10861  | 0  | }  | 
10862  |  |  | 
10863  |  | /* This function assumes that str1 and str2 are readied by the caller. */  | 
10864  |  |  | 
10865  |  | static int  | 
10866  |  | unicode_compare(PyObject *str1, PyObject *str2)  | 
10867  | 690  | { | 
10868  | 690  | #define COMPARE(TYPE1, TYPE2) \  | 
10869  | 690  |     do { \ | 
10870  | 0  |         TYPE1* p1 = (TYPE1 *)data1; \  | 
10871  | 0  |         TYPE2* p2 = (TYPE2 *)data2; \  | 
10872  | 0  |         TYPE1* end = p1 + len; \  | 
10873  | 0  |         Py_UCS4 c1, c2; \  | 
10874  | 0  |         for (; p1 != end; p1++, p2++) { \ | 
10875  | 0  |             c1 = *p1; \  | 
10876  | 0  |             c2 = *p2; \  | 
10877  | 0  |             if (c1 != c2) \  | 
10878  | 0  |                 return (c1 < c2) ? -1 : 1; \  | 
10879  | 0  |         } \  | 
10880  | 0  |     } \  | 
10881  | 0  |     while (0)  | 
10882  |  |  | 
10883  | 690  |     int kind1, kind2;  | 
10884  | 690  |     void *data1, *data2;  | 
10885  | 690  |     Py_ssize_t len1, len2, len;  | 
10886  |  |  | 
10887  | 690  |     kind1 = PyUnicode_KIND(str1);  | 
10888  | 690  |     kind2 = PyUnicode_KIND(str2);  | 
10889  | 690  |     data1 = PyUnicode_DATA(str1);  | 
10890  | 690  |     data2 = PyUnicode_DATA(str2);  | 
10891  | 690  |     len1 = PyUnicode_GET_LENGTH(str1);  | 
10892  | 690  |     len2 = PyUnicode_GET_LENGTH(str2);  | 
10893  | 690  |     len = Py_MIN(len1, len2);  | 
10894  |  |  | 
10895  | 690  |     switch(kind1) { | 
10896  | 690  |     case PyUnicode_1BYTE_KIND:  | 
10897  | 690  |     { | 
10898  | 690  |         switch(kind2) { | 
10899  | 690  |         case PyUnicode_1BYTE_KIND:  | 
10900  | 690  |         { | 
10901  | 690  |             int cmp = memcmp(data1, data2, len);  | 
10902  |  |             /* normalize result of memcmp() into the range [-1; 1] */  | 
10903  | 690  |             if (cmp < 0)  | 
10904  | 254  |                 return -1;  | 
10905  | 436  |             if (cmp > 0)  | 
10906  | 182  |                 return 1;  | 
10907  | 254  |             break;  | 
10908  | 436  |         }  | 
10909  | 254  |         case PyUnicode_2BYTE_KIND:  | 
10910  | 0  |             COMPARE(Py_UCS1, Py_UCS2);  | 
10911  | 0  |             break;  | 
10912  | 0  |         case PyUnicode_4BYTE_KIND:  | 
10913  | 0  |             COMPARE(Py_UCS1, Py_UCS4);  | 
10914  | 0  |             break;  | 
10915  | 0  |         default:  | 
10916  | 0  |             Py_UNREACHABLE();  | 
10917  | 690  |         }  | 
10918  | 254  |         break;  | 
10919  | 690  |     }  | 
10920  | 254  |     case PyUnicode_2BYTE_KIND:  | 
10921  | 0  |     { | 
10922  | 0  |         switch(kind2) { | 
10923  | 0  |         case PyUnicode_1BYTE_KIND:  | 
10924  | 0  |             COMPARE(Py_UCS2, Py_UCS1);  | 
10925  | 0  |             break;  | 
10926  | 0  |         case PyUnicode_2BYTE_KIND:  | 
10927  | 0  |         { | 
10928  | 0  |             COMPARE(Py_UCS2, Py_UCS2);  | 
10929  | 0  |             break;  | 
10930  | 0  |         }  | 
10931  | 0  |         case PyUnicode_4BYTE_KIND:  | 
10932  | 0  |             COMPARE(Py_UCS2, Py_UCS4);  | 
10933  | 0  |             break;  | 
10934  | 0  |         default:  | 
10935  | 0  |             Py_UNREACHABLE();  | 
10936  | 0  |         }  | 
10937  | 0  |         break;  | 
10938  | 0  |     }  | 
10939  | 0  |     case PyUnicode_4BYTE_KIND:  | 
10940  | 0  |     { | 
10941  | 0  |         switch(kind2) { | 
10942  | 0  |         case PyUnicode_1BYTE_KIND:  | 
10943  | 0  |             COMPARE(Py_UCS4, Py_UCS1);  | 
10944  | 0  |             break;  | 
10945  | 0  |         case PyUnicode_2BYTE_KIND:  | 
10946  | 0  |             COMPARE(Py_UCS4, Py_UCS2);  | 
10947  | 0  |             break;  | 
10948  | 0  |         case PyUnicode_4BYTE_KIND:  | 
10949  | 0  |         { | 
10950  | 0  | #if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4  | 
10951  | 0  |             int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);  | 
10952  |  |             /* normalize result of wmemcmp() into the range [-1; 1] */  | 
10953  | 0  |             if (cmp < 0)  | 
10954  | 0  |                 return -1;  | 
10955  | 0  |             if (cmp > 0)  | 
10956  | 0  |                 return 1;  | 
10957  |  | #else  | 
10958  |  |             COMPARE(Py_UCS4, Py_UCS4);  | 
10959  |  | #endif  | 
10960  | 0  |             break;  | 
10961  | 0  |         }  | 
10962  | 0  |         default:  | 
10963  | 0  |             Py_UNREACHABLE();  | 
10964  | 0  |         }  | 
10965  | 0  |         break;  | 
10966  | 0  |     }  | 
10967  | 0  |     default:  | 
10968  | 0  |         Py_UNREACHABLE();  | 
10969  | 690  |     }  | 
10970  |  |  | 
10971  | 254  |     if (len1 == len2)  | 
10972  | 235  |         return 0;  | 
10973  | 19  |     if (len1 < len2)  | 
10974  | 17  |         return -1;  | 
10975  | 2  |     else  | 
10976  | 2  |         return 1;  | 
10977  |  |  | 
10978  | 19  | #undef COMPARE  | 
10979  | 19  | }  | 
10980  |  |  | 
10981  |  | static int  | 
10982  |  | unicode_compare_eq(PyObject *str1, PyObject *str2)  | 
10983  | 30.9k  | { | 
10984  | 30.9k  |     int kind;  | 
10985  | 30.9k  |     void *data1, *data2;  | 
10986  | 30.9k  |     Py_ssize_t len;  | 
10987  | 30.9k  |     int cmp;  | 
10988  |  |  | 
10989  | 30.9k  |     len = PyUnicode_GET_LENGTH(str1);  | 
10990  | 30.9k  |     if (PyUnicode_GET_LENGTH(str2) != len)  | 
10991  | 21.2k  |         return 0;  | 
10992  | 9.70k  |     kind = PyUnicode_KIND(str1);  | 
10993  | 9.70k  |     if (PyUnicode_KIND(str2) != kind)  | 
10994  | 0  |         return 0;  | 
10995  | 9.70k  |     data1 = PyUnicode_DATA(str1);  | 
10996  | 9.70k  |     data2 = PyUnicode_DATA(str2);  | 
10997  |  |  | 
10998  | 9.70k  |     cmp = memcmp(data1, data2, len * kind);  | 
10999  | 9.70k  |     return (cmp == 0);  | 
11000  | 9.70k  | }  | 
11001  |  |  | 
11002  |  |  | 
11003  |  | int  | 
11004  |  | PyUnicode_Compare(PyObject *left, PyObject *right)  | 
11005  | 892  | { | 
11006  | 892  |     if (PyUnicode_Check(left) && PyUnicode_Check(right)) { | 
11007  | 892  |         if (PyUnicode_READY(left) == -1 ||  | 
11008  | 892  |             PyUnicode_READY(right) == -1)  | 
11009  | 0  |             return -1;  | 
11010  |  |  | 
11011  |  |         /* a string is equal to itself */  | 
11012  | 892  |         if (left == right)  | 
11013  | 202  |             return 0;  | 
11014  |  |  | 
11015  | 690  |         return unicode_compare(left, right);  | 
11016  | 892  |     }  | 
11017  | 0  |     PyErr_Format(PyExc_TypeError,  | 
11018  | 0  |                  "Can't compare %.100s and %.100s",  | 
11019  | 0  |                  left->ob_type->tp_name,  | 
11020  | 0  |                  right->ob_type->tp_name);  | 
11021  | 0  |     return -1;  | 
11022  | 892  | }  | 
11023  |  |  | 
11024  |  | int  | 
11025  |  | PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)  | 
11026  | 0  | { | 
11027  | 0  |     Py_ssize_t i;  | 
11028  | 0  |     int kind;  | 
11029  | 0  |     Py_UCS4 chr;  | 
11030  | 0  |     const unsigned char *ustr = (const unsigned char *)str;  | 
11031  |  | 
  | 
11032  | 0  |     assert(_PyUnicode_CHECK(uni));  | 
11033  | 0  |     if (!PyUnicode_IS_READY(uni)) { | 
11034  | 0  |         const wchar_t *ws = _PyUnicode_WSTR(uni);  | 
11035  |  |         /* Compare Unicode string and source character set string */  | 
11036  | 0  |         for (i = 0; (chr = ws[i]) && ustr[i]; i++) { | 
11037  | 0  |             if (chr != ustr[i])  | 
11038  | 0  |                 return (chr < ustr[i]) ? -1 : 1;  | 
11039  | 0  |         }  | 
11040  |  |         /* This check keeps Python strings that end in '\0' from comparing equal  | 
11041  |  |          to C strings identical up to that point. */  | 
11042  | 0  |         if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)  | 
11043  | 0  |             return 1; /* uni is longer */  | 
11044  | 0  |         if (ustr[i])  | 
11045  | 0  |             return -1; /* str is longer */  | 
11046  | 0  |         return 0;  | 
11047  | 0  |     }  | 
11048  | 0  |     kind = PyUnicode_KIND(uni);  | 
11049  | 0  |     if (kind == PyUnicode_1BYTE_KIND) { | 
11050  | 0  |         const void *data = PyUnicode_1BYTE_DATA(uni);  | 
11051  | 0  |         size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);  | 
11052  | 0  |         size_t len, len2 = strlen(str);  | 
11053  | 0  |         int cmp;  | 
11054  |  | 
  | 
11055  | 0  |         len = Py_MIN(len1, len2);  | 
11056  | 0  |         cmp = memcmp(data, str, len);  | 
11057  | 0  |         if (cmp != 0) { | 
11058  | 0  |             if (cmp < 0)  | 
11059  | 0  |                 return -1;  | 
11060  | 0  |             else  | 
11061  | 0  |                 return 1;  | 
11062  | 0  |         }  | 
11063  | 0  |         if (len1 > len2)  | 
11064  | 0  |             return 1; /* uni is longer */  | 
11065  | 0  |         if (len1 < len2)  | 
11066  | 0  |             return -1; /* str is longer */  | 
11067  | 0  |         return 0;  | 
11068  | 0  |     }  | 
11069  | 0  |     else { | 
11070  | 0  |         void *data = PyUnicode_DATA(uni);  | 
11071  |  |         /* Compare Unicode string and source character set string */  | 
11072  | 0  |         for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)  | 
11073  | 0  |             if (chr != (unsigned char)str[i])  | 
11074  | 0  |                 return (chr < (unsigned char)(str[i])) ? -1 : 1;  | 
11075  |  |         /* This check keeps Python strings that end in '\0' from comparing equal  | 
11076  |  |          to C strings identical up to that point. */  | 
11077  | 0  |         if (PyUnicode_GET_LENGTH(uni) != i || chr)  | 
11078  | 0  |             return 1; /* uni is longer */  | 
11079  | 0  |         if (str[i])  | 
11080  | 0  |             return -1; /* str is longer */  | 
11081  | 0  |         return 0;  | 
11082  | 0  |     }  | 
11083  | 0  | }  | 
11084  |  |  | 
11085  |  | static int  | 
11086  |  | non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)  | 
11087  | 0  | { | 
11088  | 0  |     size_t i, len;  | 
11089  | 0  |     const wchar_t *p;  | 
11090  | 0  |     len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);  | 
11091  | 0  |     if (strlen(str) != len)  | 
11092  | 0  |         return 0;  | 
11093  | 0  |     p = _PyUnicode_WSTR(unicode);  | 
11094  | 0  |     assert(p);  | 
11095  | 0  |     for (i = 0; i < len; i++) { | 
11096  | 0  |         unsigned char c = (unsigned char)str[i];  | 
11097  | 0  |         if (c >= 128 || p[i] != (wchar_t)c)  | 
11098  | 0  |             return 0;  | 
11099  | 0  |     }  | 
11100  | 0  |     return 1;  | 
11101  | 0  | }  | 
11102  |  |  | 
11103  |  | int  | 
11104  |  | _PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)  | 
11105  | 14.1k  | { | 
11106  | 14.1k  |     size_t len;  | 
11107  | 14.1k  |     assert(_PyUnicode_CHECK(unicode));  | 
11108  | 14.1k  |     assert(str);  | 
11109  |  | #ifndef NDEBUG  | 
11110  |  |     for (const char *p = str; *p; p++) { | 
11111  |  |         assert((unsigned char)*p < 128);  | 
11112  |  |     }  | 
11113  |  | #endif  | 
11114  | 14.1k  |     if (PyUnicode_READY(unicode) == -1) { | 
11115  |  |         /* Memory error or bad data */  | 
11116  | 0  |         PyErr_Clear();  | 
11117  | 0  |         return non_ready_unicode_equal_to_ascii_string(unicode, str);  | 
11118  | 0  |     }  | 
11119  | 14.1k  |     if (!PyUnicode_IS_ASCII(unicode))  | 
11120  | 0  |         return 0;  | 
11121  | 14.1k  |     len = (size_t)PyUnicode_GET_LENGTH(unicode);  | 
11122  | 14.1k  |     return strlen(str) == len &&  | 
11123  | 1.81k  |            memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;  | 
11124  | 14.1k  | }  | 
11125  |  |  | 
11126  |  | int  | 
11127  |  | _PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)  | 
11128  | 1.36k  | { | 
11129  | 1.36k  |     PyObject *right_uni;  | 
11130  | 1.36k  |     Py_hash_t hash;  | 
11131  |  |  | 
11132  | 1.36k  |     assert(_PyUnicode_CHECK(left));  | 
11133  | 1.36k  |     assert(right->string);  | 
11134  |  | #ifndef NDEBUG  | 
11135  |  |     for (const char *p = right->string; *p; p++) { | 
11136  |  |         assert((unsigned char)*p < 128);  | 
11137  |  |     }  | 
11138  |  | #endif  | 
11139  |  |  | 
11140  | 1.36k  |     if (PyUnicode_READY(left) == -1) { | 
11141  |  |         /* memory error or bad data */  | 
11142  | 0  |         PyErr_Clear();  | 
11143  | 0  |         return non_ready_unicode_equal_to_ascii_string(left, right->string);  | 
11144  | 0  |     }  | 
11145  |  |  | 
11146  | 1.36k  |     if (!PyUnicode_IS_ASCII(left))  | 
11147  | 0  |         return 0;  | 
11148  |  |  | 
11149  | 1.36k  |     right_uni = _PyUnicode_FromId(right);       /* borrowed */  | 
11150  | 1.36k  |     if (right_uni == NULL) { | 
11151  |  |         /* memory error or bad data */  | 
11152  | 0  |         PyErr_Clear();  | 
11153  | 0  |         return _PyUnicode_EqualToASCIIString(left, right->string);  | 
11154  | 0  |     }  | 
11155  |  |  | 
11156  | 1.36k  |     if (left == right_uni)  | 
11157  | 1.31k  |         return 1;  | 
11158  |  |  | 
11159  | 50  |     if (PyUnicode_CHECK_INTERNED(left))  | 
11160  | 50  |         return 0;  | 
11161  |  |  | 
11162  | 50  |     assert(_PyUnicode_HASH(right_uni) != -1);  | 
11163  | 0  |     hash = _PyUnicode_HASH(left);  | 
11164  | 0  |     if (hash != -1 && hash != _PyUnicode_HASH(right_uni))  | 
11165  | 0  |         return 0;  | 
11166  |  |  | 
11167  | 0  |     return unicode_compare_eq(left, right_uni);  | 
11168  | 0  | }  | 
11169  |  |  | 
11170  |  | PyObject *  | 
11171  |  | PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)  | 
11172  | 32.2k  | { | 
11173  | 32.2k  |     int result;  | 
11174  |  |  | 
11175  | 32.2k  |     if (!PyUnicode_Check(left) || !PyUnicode_Check(right))  | 
11176  | 8  |         Py_RETURN_NOTIMPLEMENTED;  | 
11177  |  |  | 
11178  | 32.2k  |     if (PyUnicode_READY(left) == -1 ||  | 
11179  | 32.2k  |         PyUnicode_READY(right) == -1)  | 
11180  | 0  |         return NULL;  | 
11181  |  |  | 
11182  | 32.2k  |     if (left == right) { | 
11183  | 1.28k  |         switch (op) { | 
11184  | 416  |         case Py_EQ:  | 
11185  | 416  |         case Py_LE:  | 
11186  | 416  |         case Py_GE:  | 
11187  |  |             /* a string is equal to itself */  | 
11188  | 416  |             Py_RETURN_TRUE;  | 
11189  | 867  |         case Py_NE:  | 
11190  | 867  |         case Py_LT:  | 
11191  | 867  |         case Py_GT:  | 
11192  | 867  |             Py_RETURN_FALSE;  | 
11193  | 0  |         default:  | 
11194  | 0  |             PyErr_BadArgument();  | 
11195  | 0  |             return NULL;  | 
11196  | 1.28k  |         }  | 
11197  | 1.28k  |     }  | 
11198  | 30.9k  |     else if (op == Py_EQ || op == Py_NE) { | 
11199  | 30.9k  |         result = unicode_compare_eq(left, right);  | 
11200  | 30.9k  |         result ^= (op == Py_NE);  | 
11201  | 30.9k  |         return PyBool_FromLong(result);  | 
11202  | 30.9k  |     }  | 
11203  | 0  |     else { | 
11204  | 0  |         result = unicode_compare(left, right);  | 
11205  | 0  |         Py_RETURN_RICHCOMPARE(result, 0, op);  | 
11206  | 0  |     }  | 
11207  | 32.2k  | }  | 
11208  |  |  | 
11209  |  | int  | 
11210  |  | _PyUnicode_EQ(PyObject *aa, PyObject *bb)  | 
11211  | 424  | { | 
11212  | 424  |     return unicode_eq(aa, bb);  | 
11213  | 424  | }  | 
11214  |  |  | 
11215  |  | int  | 
11216  |  | PyUnicode_Contains(PyObject *str, PyObject *substr)  | 
11217  | 438  | { | 
11218  | 438  |     int kind1, kind2;  | 
11219  | 438  |     void *buf1, *buf2;  | 
11220  | 438  |     Py_ssize_t len1, len2;  | 
11221  | 438  |     int result;  | 
11222  |  |  | 
11223  | 438  |     if (!PyUnicode_Check(substr)) { | 
11224  | 0  |         PyErr_Format(PyExc_TypeError,  | 
11225  | 0  |                      "'in <string>' requires string as left operand, not %.100s",  | 
11226  | 0  |                      Py_TYPE(substr)->tp_name);  | 
11227  | 0  |         return -1;  | 
11228  | 0  |     }  | 
11229  | 438  |     if (PyUnicode_READY(substr) == -1)  | 
11230  | 0  |         return -1;  | 
11231  | 438  |     if (ensure_unicode(str) < 0)  | 
11232  | 0  |         return -1;  | 
11233  |  |  | 
11234  | 438  |     kind1 = PyUnicode_KIND(str);  | 
11235  | 438  |     kind2 = PyUnicode_KIND(substr);  | 
11236  | 438  |     if (kind1 < kind2)  | 
11237  | 0  |         return 0;  | 
11238  | 438  |     len1 = PyUnicode_GET_LENGTH(str);  | 
11239  | 438  |     len2 = PyUnicode_GET_LENGTH(substr);  | 
11240  | 438  |     if (len1 < len2)  | 
11241  | 0  |         return 0;  | 
11242  | 438  |     buf1 = PyUnicode_DATA(str);  | 
11243  | 438  |     buf2 = PyUnicode_DATA(substr);  | 
11244  | 438  |     if (len2 == 1) { | 
11245  | 417  |         Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);  | 
11246  | 417  |         result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;  | 
11247  | 417  |         return result;  | 
11248  | 417  |     }  | 
11249  | 21  |     if (kind2 != kind1) { | 
11250  | 0  |         buf2 = _PyUnicode_AsKind(substr, kind1);  | 
11251  | 0  |         if (!buf2)  | 
11252  | 0  |             return -1;  | 
11253  | 0  |     }  | 
11254  |  |  | 
11255  | 21  |     switch (kind1) { | 
11256  | 21  |     case PyUnicode_1BYTE_KIND:  | 
11257  | 21  |         result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;  | 
11258  | 21  |         break;  | 
11259  | 0  |     case PyUnicode_2BYTE_KIND:  | 
11260  | 0  |         result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;  | 
11261  | 0  |         break;  | 
11262  | 0  |     case PyUnicode_4BYTE_KIND:  | 
11263  | 0  |         result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;  | 
11264  | 0  |         break;  | 
11265  | 0  |     default:  | 
11266  | 0  |         Py_UNREACHABLE();  | 
11267  | 21  |     }  | 
11268  |  |  | 
11269  | 21  |     if (kind2 != kind1)  | 
11270  | 0  |         PyMem_Free(buf2);  | 
11271  |  |  | 
11272  | 21  |     return result;  | 
11273  | 21  | }  | 
11274  |  |  | 
11275  |  | /* Concat to string or Unicode object giving a new Unicode object. */  | 
11276  |  |  | 
11277  |  | PyObject *  | 
11278  |  | PyUnicode_Concat(PyObject *left, PyObject *right)  | 
11279  | 0  | { | 
11280  | 0  |     PyObject *result;  | 
11281  | 0  |     Py_UCS4 maxchar, maxchar2;  | 
11282  | 0  |     Py_ssize_t left_len, right_len, new_len;  | 
11283  |  | 
  | 
11284  | 0  |     if (ensure_unicode(left) < 0)  | 
11285  | 0  |         return NULL;  | 
11286  |  |  | 
11287  | 0  |     if (!PyUnicode_Check(right)) { | 
11288  | 0  |         PyErr_Format(PyExc_TypeError,  | 
11289  | 0  |                      "can only concatenate str (not \"%.200s\") to str",  | 
11290  | 0  |                      right->ob_type->tp_name);  | 
11291  | 0  |         return NULL;  | 
11292  | 0  |     }  | 
11293  | 0  |     if (PyUnicode_READY(right) < 0)  | 
11294  | 0  |         return NULL;  | 
11295  |  |  | 
11296  |  |     /* Shortcuts */  | 
11297  | 0  |     if (left == unicode_empty)  | 
11298  | 0  |         return PyUnicode_FromObject(right);  | 
11299  | 0  |     if (right == unicode_empty)  | 
11300  | 0  |         return PyUnicode_FromObject(left);  | 
11301  |  |  | 
11302  | 0  |     left_len = PyUnicode_GET_LENGTH(left);  | 
11303  | 0  |     right_len = PyUnicode_GET_LENGTH(right);  | 
11304  | 0  |     if (left_len > PY_SSIZE_T_MAX - right_len) { | 
11305  | 0  |         PyErr_SetString(PyExc_OverflowError,  | 
11306  | 0  |                         "strings are too large to concat");  | 
11307  | 0  |         return NULL;  | 
11308  | 0  |     }  | 
11309  | 0  |     new_len = left_len + right_len;  | 
11310  |  | 
  | 
11311  | 0  |     maxchar = PyUnicode_MAX_CHAR_VALUE(left);  | 
11312  | 0  |     maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);  | 
11313  | 0  |     maxchar = Py_MAX(maxchar, maxchar2);  | 
11314  |  |  | 
11315  |  |     /* Concat the two Unicode strings */  | 
11316  | 0  |     result = PyUnicode_New(new_len, maxchar);  | 
11317  | 0  |     if (result == NULL)  | 
11318  | 0  |         return NULL;  | 
11319  | 0  |     _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);  | 
11320  | 0  |     _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);  | 
11321  | 0  |     assert(_PyUnicode_CheckConsistency(result, 1));  | 
11322  | 0  |     return result;  | 
11323  | 0  | }  | 
11324  |  |  | 
11325  |  | void  | 
11326  |  | PyUnicode_Append(PyObject **p_left, PyObject *right)  | 
11327  | 3.57k  | { | 
11328  | 3.57k  |     PyObject *left, *res;  | 
11329  | 3.57k  |     Py_UCS4 maxchar, maxchar2;  | 
11330  | 3.57k  |     Py_ssize_t left_len, right_len, new_len;  | 
11331  |  |  | 
11332  | 3.57k  |     if (p_left == NULL) { | 
11333  | 0  |         if (!PyErr_Occurred())  | 
11334  | 0  |             PyErr_BadInternalCall();  | 
11335  | 0  |         return;  | 
11336  | 0  |     }  | 
11337  | 3.57k  |     left = *p_left;  | 
11338  | 3.57k  |     if (right == NULL || left == NULL  | 
11339  | 3.57k  |         || !PyUnicode_Check(left) || !PyUnicode_Check(right)) { | 
11340  | 0  |         if (!PyErr_Occurred())  | 
11341  | 0  |             PyErr_BadInternalCall();  | 
11342  | 0  |         goto error;  | 
11343  | 0  |     }  | 
11344  |  |  | 
11345  | 3.57k  |     if (PyUnicode_READY(left) == -1)  | 
11346  | 0  |         goto error;  | 
11347  | 3.57k  |     if (PyUnicode_READY(right) == -1)  | 
11348  | 0  |         goto error;  | 
11349  |  |  | 
11350  |  |     /* Shortcuts */  | 
11351  | 3.57k  |     if (left == unicode_empty) { | 
11352  | 68  |         Py_DECREF(left);  | 
11353  | 68  |         Py_INCREF(right);  | 
11354  | 68  |         *p_left = right;  | 
11355  | 68  |         return;  | 
11356  | 68  |     }  | 
11357  | 3.50k  |     if (right == unicode_empty)  | 
11358  | 0  |         return;  | 
11359  |  |  | 
11360  | 3.50k  |     left_len = PyUnicode_GET_LENGTH(left);  | 
11361  | 3.50k  |     right_len = PyUnicode_GET_LENGTH(right);  | 
11362  | 3.50k  |     if (left_len > PY_SSIZE_T_MAX - right_len) { | 
11363  | 0  |         PyErr_SetString(PyExc_OverflowError,  | 
11364  | 0  |                         "strings are too large to concat");  | 
11365  | 0  |         goto error;  | 
11366  | 0  |     }  | 
11367  | 3.50k  |     new_len = left_len + right_len;  | 
11368  |  |  | 
11369  | 3.50k  |     if (unicode_modifiable(left)  | 
11370  | 104  |         && PyUnicode_CheckExact(right)  | 
11371  | 104  |         && PyUnicode_KIND(right) <= PyUnicode_KIND(left)  | 
11372  |  |         /* Don't resize for ascii += latin1. Convert ascii to latin1 requires  | 
11373  |  |            to change the structure size, but characters are stored just after  | 
11374  |  |            the structure, and so it requires to move all characters which is  | 
11375  |  |            not so different than duplicating the string. */  | 
11376  | 104  |         && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))  | 
11377  | 104  |     { | 
11378  |  |         /* append inplace */  | 
11379  | 104  |         if (unicode_resize(p_left, new_len) != 0)  | 
11380  | 0  |             goto error;  | 
11381  |  |  | 
11382  |  |         /* copy 'right' into the newly allocated area of 'left' */  | 
11383  | 104  |         _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);  | 
11384  | 104  |     }  | 
11385  | 3.40k  |     else { | 
11386  | 3.40k  |         maxchar = PyUnicode_MAX_CHAR_VALUE(left);  | 
11387  | 3.40k  |         maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);  | 
11388  | 3.40k  |         maxchar = Py_MAX(maxchar, maxchar2);  | 
11389  |  |  | 
11390  |  |         /* Concat the two Unicode strings */  | 
11391  | 3.40k  |         res = PyUnicode_New(new_len, maxchar);  | 
11392  | 3.40k  |         if (res == NULL)  | 
11393  | 0  |             goto error;  | 
11394  | 3.40k  |         _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);  | 
11395  | 3.40k  |         _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);  | 
11396  | 3.40k  |         Py_DECREF(left);  | 
11397  | 3.40k  |         *p_left = res;  | 
11398  | 3.40k  |     }  | 
11399  | 3.50k  |     assert(_PyUnicode_CheckConsistency(*p_left, 1));  | 
11400  | 3.50k  |     return;  | 
11401  |  |  | 
11402  | 0  | error:  | 
11403  | 0  |     Py_CLEAR(*p_left);  | 
11404  | 0  | }  | 
11405  |  |  | 
11406  |  | void  | 
11407  |  | PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)  | 
11408  | 0  | { | 
11409  | 0  |     PyUnicode_Append(pleft, right);  | 
11410  | 0  |     Py_XDECREF(right);  | 
11411  | 0  | }  | 
11412  |  |  | 
11413  |  | /*  | 
11414  |  | Wraps stringlib_parse_args_finds() and additionally ensures that the  | 
11415  |  | first argument is a unicode object.  | 
11416  |  | */  | 
11417  |  |  | 
11418  |  | static inline int  | 
11419  |  | parse_args_finds_unicode(const char * function_name, PyObject *args,  | 
11420  |  |                          PyObject **substring,  | 
11421  |  |                          Py_ssize_t *start, Py_ssize_t *end)  | 
11422  | 98  | { | 
11423  | 98  |     if(stringlib_parse_args_finds(function_name, args, substring,  | 
11424  | 98  |                                   start, end)) { | 
11425  | 98  |         if (ensure_unicode(*substring) < 0)  | 
11426  | 0  |             return 0;  | 
11427  | 98  |         return 1;  | 
11428  | 98  |     }  | 
11429  | 0  |     return 0;  | 
11430  | 98  | }  | 
11431  |  |  | 
11432  |  | PyDoc_STRVAR(count__doc__,  | 
11433  |  |              "S.count(sub[, start[, end]]) -> int\n\  | 
11434  |  | \n\  | 
11435  |  | Return the number of non-overlapping occurrences of substring sub in\n\  | 
11436  |  | string S[start:end].  Optional arguments start and end are\n\  | 
11437  |  | interpreted as in slice notation.");  | 
11438  |  |  | 
11439  |  | static PyObject *  | 
11440  |  | unicode_count(PyObject *self, PyObject *args)  | 
11441  | 0  | { | 
11442  | 0  |     PyObject *substring = NULL;   /* initialize to fix a compiler warning */  | 
11443  | 0  |     Py_ssize_t start = 0;  | 
11444  | 0  |     Py_ssize_t end = PY_SSIZE_T_MAX;  | 
11445  | 0  |     PyObject *result;  | 
11446  | 0  |     int kind1, kind2;  | 
11447  | 0  |     void *buf1, *buf2;  | 
11448  | 0  |     Py_ssize_t len1, len2, iresult;  | 
11449  |  | 
  | 
11450  | 0  |     if (!parse_args_finds_unicode("count", args, &substring, &start, &end)) | 
11451  | 0  |         return NULL;  | 
11452  |  |  | 
11453  | 0  |     kind1 = PyUnicode_KIND(self);  | 
11454  | 0  |     kind2 = PyUnicode_KIND(substring);  | 
11455  | 0  |     if (kind1 < kind2)  | 
11456  | 0  |         return PyLong_FromLong(0);  | 
11457  |  |  | 
11458  | 0  |     len1 = PyUnicode_GET_LENGTH(self);  | 
11459  | 0  |     len2 = PyUnicode_GET_LENGTH(substring);  | 
11460  | 0  |     ADJUST_INDICES(start, end, len1);  | 
11461  | 0  |     if (end - start < len2)  | 
11462  | 0  |         return PyLong_FromLong(0);  | 
11463  |  |  | 
11464  | 0  |     buf1 = PyUnicode_DATA(self);  | 
11465  | 0  |     buf2 = PyUnicode_DATA(substring);  | 
11466  | 0  |     if (kind2 != kind1) { | 
11467  | 0  |         buf2 = _PyUnicode_AsKind(substring, kind1);  | 
11468  | 0  |         if (!buf2)  | 
11469  | 0  |             return NULL;  | 
11470  | 0  |     }  | 
11471  | 0  |     switch (kind1) { | 
11472  | 0  |     case PyUnicode_1BYTE_KIND:  | 
11473  | 0  |         iresult = ucs1lib_count(  | 
11474  | 0  |             ((Py_UCS1*)buf1) + start, end - start,  | 
11475  | 0  |             buf2, len2, PY_SSIZE_T_MAX  | 
11476  | 0  |             );  | 
11477  | 0  |         break;  | 
11478  | 0  |     case PyUnicode_2BYTE_KIND:  | 
11479  | 0  |         iresult = ucs2lib_count(  | 
11480  | 0  |             ((Py_UCS2*)buf1) + start, end - start,  | 
11481  | 0  |             buf2, len2, PY_SSIZE_T_MAX  | 
11482  | 0  |             );  | 
11483  | 0  |         break;  | 
11484  | 0  |     case PyUnicode_4BYTE_KIND:  | 
11485  | 0  |         iresult = ucs4lib_count(  | 
11486  | 0  |             ((Py_UCS4*)buf1) + start, end - start,  | 
11487  | 0  |             buf2, len2, PY_SSIZE_T_MAX  | 
11488  | 0  |             );  | 
11489  | 0  |         break;  | 
11490  | 0  |     default:  | 
11491  | 0  |         Py_UNREACHABLE();  | 
11492  | 0  |     }  | 
11493  |  |  | 
11494  | 0  |     result = PyLong_FromSsize_t(iresult);  | 
11495  |  | 
  | 
11496  | 0  |     if (kind2 != kind1)  | 
11497  | 0  |         PyMem_Free(buf2);  | 
11498  |  | 
  | 
11499  | 0  |     return result;  | 
11500  | 0  | }  | 
11501  |  |  | 
11502  |  | /*[clinic input]  | 
11503  |  | str.encode as unicode_encode  | 
11504  |  |  | 
11505  |  |     encoding: str(c_default="NULL") = 'utf-8'  | 
11506  |  |         The encoding in which to encode the string.  | 
11507  |  |     errors: str(c_default="NULL") = 'strict'  | 
11508  |  |         The error handling scheme to use for encoding errors.  | 
11509  |  |         The default is 'strict' meaning that encoding errors raise a  | 
11510  |  |         UnicodeEncodeError.  Other possible values are 'ignore', 'replace' and  | 
11511  |  |         'xmlcharrefreplace' as well as any other name registered with  | 
11512  |  |         codecs.register_error that can handle UnicodeEncodeErrors.  | 
11513  |  |  | 
11514  |  | Encode the string using the codec registered for encoding.  | 
11515  |  | [clinic start generated code]*/  | 
11516  |  |  | 
11517  |  | static PyObject *  | 
11518  |  | unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)  | 
11519  |  | /*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/  | 
11520  | 14  | { | 
11521  | 14  |     return PyUnicode_AsEncodedString(self, encoding, errors);  | 
11522  | 14  | }  | 
11523  |  |  | 
11524  |  | /*[clinic input]  | 
11525  |  | str.expandtabs as unicode_expandtabs  | 
11526  |  |  | 
11527  |  |     tabsize: int = 8  | 
11528  |  |  | 
11529  |  | Return a copy where all tab characters are expanded using spaces.  | 
11530  |  |  | 
11531  |  | If tabsize is not given, a tab size of 8 characters is assumed.  | 
11532  |  | [clinic start generated code]*/  | 
11533  |  |  | 
11534  |  | static PyObject *  | 
11535  |  | unicode_expandtabs_impl(PyObject *self, int tabsize)  | 
11536  |  | /*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/  | 
11537  | 0  | { | 
11538  | 0  |     Py_ssize_t i, j, line_pos, src_len, incr;  | 
11539  | 0  |     Py_UCS4 ch;  | 
11540  | 0  |     PyObject *u;  | 
11541  | 0  |     void *src_data, *dest_data;  | 
11542  | 0  |     int kind;  | 
11543  | 0  |     int found;  | 
11544  |  | 
  | 
11545  | 0  |     if (PyUnicode_READY(self) == -1)  | 
11546  | 0  |         return NULL;  | 
11547  |  |  | 
11548  |  |     /* First pass: determine size of output string */  | 
11549  | 0  |     src_len = PyUnicode_GET_LENGTH(self);  | 
11550  | 0  |     i = j = line_pos = 0;  | 
11551  | 0  |     kind = PyUnicode_KIND(self);  | 
11552  | 0  |     src_data = PyUnicode_DATA(self);  | 
11553  | 0  |     found = 0;  | 
11554  | 0  |     for (; i < src_len; i++) { | 
11555  | 0  |         ch = PyUnicode_READ(kind, src_data, i);  | 
11556  | 0  |         if (ch == '\t') { | 
11557  | 0  |             found = 1;  | 
11558  | 0  |             if (tabsize > 0) { | 
11559  | 0  |                 incr = tabsize - (line_pos % tabsize); /* cannot overflow */  | 
11560  | 0  |                 if (j > PY_SSIZE_T_MAX - incr)  | 
11561  | 0  |                     goto overflow;  | 
11562  | 0  |                 line_pos += incr;  | 
11563  | 0  |                 j += incr;  | 
11564  | 0  |             }  | 
11565  | 0  |         }  | 
11566  | 0  |         else { | 
11567  | 0  |             if (j > PY_SSIZE_T_MAX - 1)  | 
11568  | 0  |                 goto overflow;  | 
11569  | 0  |             line_pos++;  | 
11570  | 0  |             j++;  | 
11571  | 0  |             if (ch == '\n' || ch == '\r')  | 
11572  | 0  |                 line_pos = 0;  | 
11573  | 0  |         }  | 
11574  | 0  |     }  | 
11575  | 0  |     if (!found)  | 
11576  | 0  |         return unicode_result_unchanged(self);  | 
11577  |  |  | 
11578  |  |     /* Second pass: create output string and fill it */  | 
11579  | 0  |     u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));  | 
11580  | 0  |     if (!u)  | 
11581  | 0  |         return NULL;  | 
11582  | 0  |     dest_data = PyUnicode_DATA(u);  | 
11583  |  | 
  | 
11584  | 0  |     i = j = line_pos = 0;  | 
11585  |  | 
  | 
11586  | 0  |     for (; i < src_len; i++) { | 
11587  | 0  |         ch = PyUnicode_READ(kind, src_data, i);  | 
11588  | 0  |         if (ch == '\t') { | 
11589  | 0  |             if (tabsize > 0) { | 
11590  | 0  |                 incr = tabsize - (line_pos % tabsize);  | 
11591  | 0  |                 line_pos += incr;  | 
11592  | 0  |                 unicode_fill(kind, dest_data, ' ', j, incr);  | 
11593  | 0  |                 j += incr;  | 
11594  | 0  |             }  | 
11595  | 0  |         }  | 
11596  | 0  |         else { | 
11597  | 0  |             line_pos++;  | 
11598  | 0  |             PyUnicode_WRITE(kind, dest_data, j, ch);  | 
11599  | 0  |             j++;  | 
11600  | 0  |             if (ch == '\n' || ch == '\r')  | 
11601  | 0  |                 line_pos = 0;  | 
11602  | 0  |         }  | 
11603  | 0  |     }  | 
11604  | 0  |     assert (j == PyUnicode_GET_LENGTH(u));  | 
11605  | 0  |     return unicode_result(u);  | 
11606  |  |  | 
11607  | 0  |   overflow:  | 
11608  | 0  |     PyErr_SetString(PyExc_OverflowError, "new string is too long");  | 
11609  | 0  |     return NULL;  | 
11610  | 0  | }  | 
11611  |  |  | 
11612  |  | PyDoc_STRVAR(find__doc__,  | 
11613  |  |              "S.find(sub[, start[, end]]) -> int\n\  | 
11614  |  | \n\  | 
11615  |  | Return the lowest index in S where substring sub is found,\n\  | 
11616  |  | such that sub is contained within S[start:end].  Optional\n\  | 
11617  |  | arguments start and end are interpreted as in slice notation.\n\  | 
11618  |  | \n\  | 
11619  |  | Return -1 on failure.");  | 
11620  |  |  | 
11621  |  | static PyObject *  | 
11622  |  | unicode_find(PyObject *self, PyObject *args)  | 
11623  | 14  | { | 
11624  |  |     /* initialize variables to prevent gcc warning */  | 
11625  | 14  |     PyObject *substring = NULL;  | 
11626  | 14  |     Py_ssize_t start = 0;  | 
11627  | 14  |     Py_ssize_t end = 0;  | 
11628  | 14  |     Py_ssize_t result;  | 
11629  |  |  | 
11630  | 14  |     if (!parse_args_finds_unicode("find", args, &substring, &start, &end)) | 
11631  | 0  |         return NULL;  | 
11632  |  |  | 
11633  | 14  |     if (PyUnicode_READY(self) == -1)  | 
11634  | 0  |         return NULL;  | 
11635  |  |  | 
11636  | 14  |     result = any_find_slice(self, substring, start, end, 1);  | 
11637  |  |  | 
11638  | 14  |     if (result == -2)  | 
11639  | 0  |         return NULL;  | 
11640  |  |  | 
11641  | 14  |     return PyLong_FromSsize_t(result);  | 
11642  | 14  | }  | 
11643  |  |  | 
11644  |  | static PyObject *  | 
11645  |  | unicode_getitem(PyObject *self, Py_ssize_t index)  | 
11646  | 5.15k  | { | 
11647  | 5.15k  |     void *data;  | 
11648  | 5.15k  |     enum PyUnicode_Kind kind;  | 
11649  | 5.15k  |     Py_UCS4 ch;  | 
11650  |  |  | 
11651  | 5.15k  |     if (!PyUnicode_Check(self)) { | 
11652  | 0  |         PyErr_BadArgument();  | 
11653  | 0  |         return NULL;  | 
11654  | 0  |     }  | 
11655  | 5.15k  |     if (PyUnicode_READY(self) == -1) { | 
11656  | 0  |         return NULL;  | 
11657  | 0  |     }  | 
11658  | 5.15k  |     if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) { | 
11659  | 8  |         PyErr_SetString(PyExc_IndexError, "string index out of range");  | 
11660  | 8  |         return NULL;  | 
11661  | 8  |     }  | 
11662  | 5.14k  |     kind = PyUnicode_KIND(self);  | 
11663  | 5.14k  |     data = PyUnicode_DATA(self);  | 
11664  | 5.14k  |     ch = PyUnicode_READ(kind, data, index);  | 
11665  | 5.14k  |     return unicode_char(ch);  | 
11666  | 5.15k  | }  | 
11667  |  |  | 
11668  |  | /* Believe it or not, this produces the same value for ASCII strings  | 
11669  |  |    as bytes_hash(). */  | 
11670  |  | static Py_hash_t  | 
11671  |  | unicode_hash(PyObject *self)  | 
11672  | 104k  | { | 
11673  | 104k  |     Py_uhash_t x;  /* Unsigned for defined overflow behavior. */  | 
11674  |  |  | 
11675  |  | #ifdef Py_DEBUG  | 
11676  |  |     assert(_Py_HashSecret_Initialized);  | 
11677  |  | #endif  | 
11678  | 104k  |     if (_PyUnicode_HASH(self) != -1)  | 
11679  | 1.28k  |         return _PyUnicode_HASH(self);  | 
11680  | 102k  |     if (PyUnicode_READY(self) == -1)  | 
11681  | 0  |         return -1;  | 
11682  |  |  | 
11683  | 102k  |     x = _Py_HashBytes(PyUnicode_DATA(self),  | 
11684  | 102k  |                       PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));  | 
11685  | 102k  |     _PyUnicode_HASH(self) = x;  | 
11686  | 102k  |     return x;  | 
11687  | 102k  | }  | 
11688  |  |  | 
11689  |  | PyDoc_STRVAR(index__doc__,  | 
11690  |  |              "S.index(sub[, start[, end]]) -> int\n\  | 
11691  |  | \n\  | 
11692  |  | Return the lowest index in S where substring sub is found,\n\  | 
11693  |  | such that sub is contained within S[start:end].  Optional\n\  | 
11694  |  | arguments start and end are interpreted as in slice notation.\n\  | 
11695  |  | \n\  | 
11696  |  | Raises ValueError when the substring is not found.");  | 
11697  |  |  | 
11698  |  | static PyObject *  | 
11699  |  | unicode_index(PyObject *self, PyObject *args)  | 
11700  | 0  | { | 
11701  |  |     /* initialize variables to prevent gcc warning */  | 
11702  | 0  |     Py_ssize_t result;  | 
11703  | 0  |     PyObject *substring = NULL;  | 
11704  | 0  |     Py_ssize_t start = 0;  | 
11705  | 0  |     Py_ssize_t end = 0;  | 
11706  |  | 
  | 
11707  | 0  |     if (!parse_args_finds_unicode("index", args, &substring, &start, &end)) | 
11708  | 0  |         return NULL;  | 
11709  |  |  | 
11710  | 0  |     if (PyUnicode_READY(self) == -1)  | 
11711  | 0  |         return NULL;  | 
11712  |  |  | 
11713  | 0  |     result = any_find_slice(self, substring, start, end, 1);  | 
11714  |  | 
  | 
11715  | 0  |     if (result == -2)  | 
11716  | 0  |         return NULL;  | 
11717  |  |  | 
11718  | 0  |     if (result < 0) { | 
11719  | 0  |         PyErr_SetString(PyExc_ValueError, "substring not found");  | 
11720  | 0  |         return NULL;  | 
11721  | 0  |     }  | 
11722  |  |  | 
11723  | 0  |     return PyLong_FromSsize_t(result);  | 
11724  | 0  | }  | 
11725  |  |  | 
11726  |  | /*[clinic input]  | 
11727  |  | str.isascii as unicode_isascii  | 
11728  |  |  | 
11729  |  | Return True if all characters in the string are ASCII, False otherwise.  | 
11730  |  |  | 
11731  |  | ASCII characters have code points in the range U+0000-U+007F.  | 
11732  |  | Empty string is ASCII too.  | 
11733  |  | [clinic start generated code]*/  | 
11734  |  |  | 
11735  |  | static PyObject *  | 
11736  |  | unicode_isascii_impl(PyObject *self)  | 
11737  |  | /*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/  | 
11738  | 0  | { | 
11739  | 0  |     if (PyUnicode_READY(self) == -1) { | 
11740  | 0  |         return NULL;  | 
11741  | 0  |     }  | 
11742  | 0  |     return PyBool_FromLong(PyUnicode_IS_ASCII(self));  | 
11743  | 0  | }  | 
11744  |  |  | 
11745  |  | /*[clinic input]  | 
11746  |  | str.islower as unicode_islower  | 
11747  |  |  | 
11748  |  | Return True if the string is a lowercase string, False otherwise.  | 
11749  |  |  | 
11750  |  | A string is lowercase if all cased characters in the string are lowercase and  | 
11751  |  | there is at least one cased character in the string.  | 
11752  |  | [clinic start generated code]*/  | 
11753  |  |  | 
11754  |  | static PyObject *  | 
11755  |  | unicode_islower_impl(PyObject *self)  | 
11756  |  | /*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/  | 
11757  | 0  | { | 
11758  | 0  |     Py_ssize_t i, length;  | 
11759  | 0  |     int kind;  | 
11760  | 0  |     void *data;  | 
11761  | 0  |     int cased;  | 
11762  |  | 
  | 
11763  | 0  |     if (PyUnicode_READY(self) == -1)  | 
11764  | 0  |         return NULL;  | 
11765  | 0  |     length = PyUnicode_GET_LENGTH(self);  | 
11766  | 0  |     kind = PyUnicode_KIND(self);  | 
11767  | 0  |     data = PyUnicode_DATA(self);  | 
11768  |  |  | 
11769  |  |     /* Shortcut for single character strings */  | 
11770  | 0  |     if (length == 1)  | 
11771  | 0  |         return PyBool_FromLong(  | 
11772  | 0  |             Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));  | 
11773  |  |  | 
11774  |  |     /* Special case for empty strings */  | 
11775  | 0  |     if (length == 0)  | 
11776  | 0  |         Py_RETURN_FALSE;  | 
11777  |  |  | 
11778  | 0  |     cased = 0;  | 
11779  | 0  |     for (i = 0; i < length; i++) { | 
11780  | 0  |         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);  | 
11781  |  | 
  | 
11782  | 0  |         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))  | 
11783  | 0  |             Py_RETURN_FALSE;  | 
11784  | 0  |         else if (!cased && Py_UNICODE_ISLOWER(ch))  | 
11785  | 0  |             cased = 1;  | 
11786  | 0  |     }  | 
11787  | 0  |     return PyBool_FromLong(cased);  | 
11788  | 0  | }  | 
11789  |  |  | 
11790  |  | /*[clinic input]  | 
11791  |  | str.isupper as unicode_isupper  | 
11792  |  |  | 
11793  |  | Return True if the string is an uppercase string, False otherwise.  | 
11794  |  |  | 
11795  |  | A string is uppercase if all cased characters in the string are uppercase and  | 
11796  |  | there is at least one cased character in the string.  | 
11797  |  | [clinic start generated code]*/  | 
11798  |  |  | 
11799  |  | static PyObject *  | 
11800  |  | unicode_isupper_impl(PyObject *self)  | 
11801  |  | /*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/  | 
11802  | 0  | { | 
11803  | 0  |     Py_ssize_t i, length;  | 
11804  | 0  |     int kind;  | 
11805  | 0  |     void *data;  | 
11806  | 0  |     int cased;  | 
11807  |  | 
  | 
11808  | 0  |     if (PyUnicode_READY(self) == -1)  | 
11809  | 0  |         return NULL;  | 
11810  | 0  |     length = PyUnicode_GET_LENGTH(self);  | 
11811  | 0  |     kind = PyUnicode_KIND(self);  | 
11812  | 0  |     data = PyUnicode_DATA(self);  | 
11813  |  |  | 
11814  |  |     /* Shortcut for single character strings */  | 
11815  | 0  |     if (length == 1)  | 
11816  | 0  |         return PyBool_FromLong(  | 
11817  | 0  |             Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);  | 
11818  |  |  | 
11819  |  |     /* Special case for empty strings */  | 
11820  | 0  |     if (length == 0)  | 
11821  | 0  |         Py_RETURN_FALSE;  | 
11822  |  |  | 
11823  | 0  |     cased = 0;  | 
11824  | 0  |     for (i = 0; i < length; i++) { | 
11825  | 0  |         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);  | 
11826  |  | 
  | 
11827  | 0  |         if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))  | 
11828  | 0  |             Py_RETURN_FALSE;  | 
11829  | 0  |         else if (!cased && Py_UNICODE_ISUPPER(ch))  | 
11830  | 0  |             cased = 1;  | 
11831  | 0  |     }  | 
11832  | 0  |     return PyBool_FromLong(cased);  | 
11833  | 0  | }  | 
11834  |  |  | 
11835  |  | /*[clinic input]  | 
11836  |  | str.istitle as unicode_istitle  | 
11837  |  |  | 
11838  |  | Return True if the string is a title-cased string, False otherwise.  | 
11839  |  |  | 
11840  |  | In a title-cased string, upper- and title-case characters may only  | 
11841  |  | follow uncased characters and lowercase characters only cased ones.  | 
11842  |  | [clinic start generated code]*/  | 
11843  |  |  | 
11844  |  | static PyObject *  | 
11845  |  | unicode_istitle_impl(PyObject *self)  | 
11846  |  | /*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/  | 
11847  | 0  | { | 
11848  | 0  |     Py_ssize_t i, length;  | 
11849  | 0  |     int kind;  | 
11850  | 0  |     void *data;  | 
11851  | 0  |     int cased, previous_is_cased;  | 
11852  |  | 
  | 
11853  | 0  |     if (PyUnicode_READY(self) == -1)  | 
11854  | 0  |         return NULL;  | 
11855  | 0  |     length = PyUnicode_GET_LENGTH(self);  | 
11856  | 0  |     kind = PyUnicode_KIND(self);  | 
11857  | 0  |     data = PyUnicode_DATA(self);  | 
11858  |  |  | 
11859  |  |     /* Shortcut for single character strings */  | 
11860  | 0  |     if (length == 1) { | 
11861  | 0  |         Py_UCS4 ch = PyUnicode_READ(kind, data, 0);  | 
11862  | 0  |         return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||  | 
11863  | 0  |                                (Py_UNICODE_ISUPPER(ch) != 0));  | 
11864  | 0  |     }  | 
11865  |  |  | 
11866  |  |     /* Special case for empty strings */  | 
11867  | 0  |     if (length == 0)  | 
11868  | 0  |         Py_RETURN_FALSE;  | 
11869  |  |  | 
11870  | 0  |     cased = 0;  | 
11871  | 0  |     previous_is_cased = 0;  | 
11872  | 0  |     for (i = 0; i < length; i++) { | 
11873  | 0  |         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);  | 
11874  |  | 
  | 
11875  | 0  |         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { | 
11876  | 0  |             if (previous_is_cased)  | 
11877  | 0  |                 Py_RETURN_FALSE;  | 
11878  | 0  |             previous_is_cased = 1;  | 
11879  | 0  |             cased = 1;  | 
11880  | 0  |         }  | 
11881  | 0  |         else if (Py_UNICODE_ISLOWER(ch)) { | 
11882  | 0  |             if (!previous_is_cased)  | 
11883  | 0  |                 Py_RETURN_FALSE;  | 
11884  | 0  |             previous_is_cased = 1;  | 
11885  | 0  |             cased = 1;  | 
11886  | 0  |         }  | 
11887  | 0  |         else  | 
11888  | 0  |             previous_is_cased = 0;  | 
11889  | 0  |     }  | 
11890  | 0  |     return PyBool_FromLong(cased);  | 
11891  | 0  | }  | 
11892  |  |  | 
11893  |  | /*[clinic input]  | 
11894  |  | str.isspace as unicode_isspace  | 
11895  |  |  | 
11896  |  | Return True if the string is a whitespace string, False otherwise.  | 
11897  |  |  | 
11898  |  | A string is whitespace if all characters in the string are whitespace and there  | 
11899  |  | is at least one character in the string.  | 
11900  |  | [clinic start generated code]*/  | 
11901  |  |  | 
11902  |  | static PyObject *  | 
11903  |  | unicode_isspace_impl(PyObject *self)  | 
11904  |  | /*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/  | 
11905  | 0  | { | 
11906  | 0  |     Py_ssize_t i, length;  | 
11907  | 0  |     int kind;  | 
11908  | 0  |     void *data;  | 
11909  |  | 
  | 
11910  | 0  |     if (PyUnicode_READY(self) == -1)  | 
11911  | 0  |         return NULL;  | 
11912  | 0  |     length = PyUnicode_GET_LENGTH(self);  | 
11913  | 0  |     kind = PyUnicode_KIND(self);  | 
11914  | 0  |     data = PyUnicode_DATA(self);  | 
11915  |  |  | 
11916  |  |     /* Shortcut for single character strings */  | 
11917  | 0  |     if (length == 1)  | 
11918  | 0  |         return PyBool_FromLong(  | 
11919  | 0  |             Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));  | 
11920  |  |  | 
11921  |  |     /* Special case for empty strings */  | 
11922  | 0  |     if (length == 0)  | 
11923  | 0  |         Py_RETURN_FALSE;  | 
11924  |  |  | 
11925  | 0  |     for (i = 0; i < length; i++) { | 
11926  | 0  |         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);  | 
11927  | 0  |         if (!Py_UNICODE_ISSPACE(ch))  | 
11928  | 0  |             Py_RETURN_FALSE;  | 
11929  | 0  |     }  | 
11930  | 0  |     Py_RETURN_TRUE;  | 
11931  | 0  | }  | 
11932  |  |  | 
11933  |  | /*[clinic input]  | 
11934  |  | str.isalpha as unicode_isalpha  | 
11935  |  |  | 
11936  |  | Return True if the string is an alphabetic string, False otherwise.  | 
11937  |  |  | 
11938  |  | A string is alphabetic if all characters in the string are alphabetic and there  | 
11939  |  | is at least one character in the string.  | 
11940  |  | [clinic start generated code]*/  | 
11941  |  |  | 
11942  |  | static PyObject *  | 
11943  |  | unicode_isalpha_impl(PyObject *self)  | 
11944  |  | /*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/  | 
11945  | 0  | { | 
11946  | 0  |     Py_ssize_t i, length;  | 
11947  | 0  |     int kind;  | 
11948  | 0  |     void *data;  | 
11949  |  | 
  | 
11950  | 0  |     if (PyUnicode_READY(self) == -1)  | 
11951  | 0  |         return NULL;  | 
11952  | 0  |     length = PyUnicode_GET_LENGTH(self);  | 
11953  | 0  |     kind = PyUnicode_KIND(self);  | 
11954  | 0  |     data = PyUnicode_DATA(self);  | 
11955  |  |  | 
11956  |  |     /* Shortcut for single character strings */  | 
11957  | 0  |     if (length == 1)  | 
11958  | 0  |         return PyBool_FromLong(  | 
11959  | 0  |             Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));  | 
11960  |  |  | 
11961  |  |     /* Special case for empty strings */  | 
11962  | 0  |     if (length == 0)  | 
11963  | 0  |         Py_RETURN_FALSE;  | 
11964  |  |  | 
11965  | 0  |     for (i = 0; i < length; i++) { | 
11966  | 0  |         if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))  | 
11967  | 0  |             Py_RETURN_FALSE;  | 
11968  | 0  |     }  | 
11969  | 0  |     Py_RETURN_TRUE;  | 
11970  | 0  | }  | 
11971  |  |  | 
11972  |  | /*[clinic input]  | 
11973  |  | str.isalnum as unicode_isalnum  | 
11974  |  |  | 
11975  |  | Return True if the string is an alpha-numeric string, False otherwise.  | 
11976  |  |  | 
11977  |  | A string is alpha-numeric if all characters in the string are alpha-numeric and  | 
11978  |  | there is at least one character in the string.  | 
11979  |  | [clinic start generated code]*/  | 
11980  |  |  | 
11981  |  | static PyObject *  | 
11982  |  | unicode_isalnum_impl(PyObject *self)  | 
11983  |  | /*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/  | 
11984  | 271  | { | 
11985  | 271  |     int kind;  | 
11986  | 271  |     void *data;  | 
11987  | 271  |     Py_ssize_t len, i;  | 
11988  |  |  | 
11989  | 271  |     if (PyUnicode_READY(self) == -1)  | 
11990  | 0  |         return NULL;  | 
11991  |  |  | 
11992  | 271  |     kind = PyUnicode_KIND(self);  | 
11993  | 271  |     data = PyUnicode_DATA(self);  | 
11994  | 271  |     len = PyUnicode_GET_LENGTH(self);  | 
11995  |  |  | 
11996  |  |     /* Shortcut for single character strings */  | 
11997  | 271  |     if (len == 1) { | 
11998  | 271  |         const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);  | 
11999  | 271  |         return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));  | 
12000  | 271  |     }  | 
12001  |  |  | 
12002  |  |     /* Special case for empty strings */  | 
12003  | 0  |     if (len == 0)  | 
12004  | 0  |         Py_RETURN_FALSE;  | 
12005  |  |  | 
12006  | 0  |     for (i = 0; i < len; i++) { | 
12007  | 0  |         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);  | 
12008  | 0  |         if (!Py_UNICODE_ISALNUM(ch))  | 
12009  | 0  |             Py_RETURN_FALSE;  | 
12010  | 0  |     }  | 
12011  | 0  |     Py_RETURN_TRUE;  | 
12012  | 0  | }  | 
12013  |  |  | 
12014  |  | /*[clinic input]  | 
12015  |  | str.isdecimal as unicode_isdecimal  | 
12016  |  |  | 
12017  |  | Return True if the string is a decimal string, False otherwise.  | 
12018  |  |  | 
12019  |  | A string is a decimal string if all characters in the string are decimal and  | 
12020  |  | there is at least one character in the string.  | 
12021  |  | [clinic start generated code]*/  | 
12022  |  |  | 
12023  |  | static PyObject *  | 
12024  |  | unicode_isdecimal_impl(PyObject *self)  | 
12025  |  | /*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/  | 
12026  | 0  | { | 
12027  | 0  |     Py_ssize_t i, length;  | 
12028  | 0  |     int kind;  | 
12029  | 0  |     void *data;  | 
12030  |  | 
  | 
12031  | 0  |     if (PyUnicode_READY(self) == -1)  | 
12032  | 0  |         return NULL;  | 
12033  | 0  |     length = PyUnicode_GET_LENGTH(self);  | 
12034  | 0  |     kind = PyUnicode_KIND(self);  | 
12035  | 0  |     data = PyUnicode_DATA(self);  | 
12036  |  |  | 
12037  |  |     /* Shortcut for single character strings */  | 
12038  | 0  |     if (length == 1)  | 
12039  | 0  |         return PyBool_FromLong(  | 
12040  | 0  |             Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));  | 
12041  |  |  | 
12042  |  |     /* Special case for empty strings */  | 
12043  | 0  |     if (length == 0)  | 
12044  | 0  |         Py_RETURN_FALSE;  | 
12045  |  |  | 
12046  | 0  |     for (i = 0; i < length; i++) { | 
12047  | 0  |         if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))  | 
12048  | 0  |             Py_RETURN_FALSE;  | 
12049  | 0  |     }  | 
12050  | 0  |     Py_RETURN_TRUE;  | 
12051  | 0  | }  | 
12052  |  |  | 
12053  |  | /*[clinic input]  | 
12054  |  | str.isdigit as unicode_isdigit  | 
12055  |  |  | 
12056  |  | Return True if the string is a digit string, False otherwise.  | 
12057  |  |  | 
12058  |  | A string is a digit string if all characters in the string are digits and there  | 
12059  |  | is at least one character in the string.  | 
12060  |  | [clinic start generated code]*/  | 
12061  |  |  | 
12062  |  | static PyObject *  | 
12063  |  | unicode_isdigit_impl(PyObject *self)  | 
12064  |  | /*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/  | 
12065  | 0  | { | 
12066  | 0  |     Py_ssize_t i, length;  | 
12067  | 0  |     int kind;  | 
12068  | 0  |     void *data;  | 
12069  |  | 
  | 
12070  | 0  |     if (PyUnicode_READY(self) == -1)  | 
12071  | 0  |         return NULL;  | 
12072  | 0  |     length = PyUnicode_GET_LENGTH(self);  | 
12073  | 0  |     kind = PyUnicode_KIND(self);  | 
12074  | 0  |     data = PyUnicode_DATA(self);  | 
12075  |  |  | 
12076  |  |     /* Shortcut for single character strings */  | 
12077  | 0  |     if (length == 1) { | 
12078  | 0  |         const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);  | 
12079  | 0  |         return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));  | 
12080  | 0  |     }  | 
12081  |  |  | 
12082  |  |     /* Special case for empty strings */  | 
12083  | 0  |     if (length == 0)  | 
12084  | 0  |         Py_RETURN_FALSE;  | 
12085  |  |  | 
12086  | 0  |     for (i = 0; i < length; i++) { | 
12087  | 0  |         if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))  | 
12088  | 0  |             Py_RETURN_FALSE;  | 
12089  | 0  |     }  | 
12090  | 0  |     Py_RETURN_TRUE;  | 
12091  | 0  | }  | 
12092  |  |  | 
12093  |  | /*[clinic input]  | 
12094  |  | str.isnumeric as unicode_isnumeric  | 
12095  |  |  | 
12096  |  | Return True if the string is a numeric string, False otherwise.  | 
12097  |  |  | 
12098  |  | A string is numeric if all characters in the string are numeric and there is at  | 
12099  |  | least one character in the string.  | 
12100  |  | [clinic start generated code]*/  | 
12101  |  |  | 
12102  |  | static PyObject *  | 
12103  |  | unicode_isnumeric_impl(PyObject *self)  | 
12104  |  | /*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/  | 
12105  | 0  | { | 
12106  | 0  |     Py_ssize_t i, length;  | 
12107  | 0  |     int kind;  | 
12108  | 0  |     void *data;  | 
12109  |  | 
  | 
12110  | 0  |     if (PyUnicode_READY(self) == -1)  | 
12111  | 0  |         return NULL;  | 
12112  | 0  |     length = PyUnicode_GET_LENGTH(self);  | 
12113  | 0  |     kind = PyUnicode_KIND(self);  | 
12114  | 0  |     data = PyUnicode_DATA(self);  | 
12115  |  |  | 
12116  |  |     /* Shortcut for single character strings */  | 
12117  | 0  |     if (length == 1)  | 
12118  | 0  |         return PyBool_FromLong(  | 
12119  | 0  |             Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));  | 
12120  |  |  | 
12121  |  |     /* Special case for empty strings */  | 
12122  | 0  |     if (length == 0)  | 
12123  | 0  |         Py_RETURN_FALSE;  | 
12124  |  |  | 
12125  | 0  |     for (i = 0; i < length; i++) { | 
12126  | 0  |         if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))  | 
12127  | 0  |             Py_RETURN_FALSE;  | 
12128  | 0  |     }  | 
12129  | 0  |     Py_RETURN_TRUE;  | 
12130  | 0  | }  | 
12131  |  |  | 
12132  |  | int  | 
12133  |  | PyUnicode_IsIdentifier(PyObject *self)  | 
12134  | 62  | { | 
12135  | 62  |     int kind;  | 
12136  | 62  |     void *data;  | 
12137  | 62  |     Py_ssize_t i;  | 
12138  | 62  |     Py_UCS4 first;  | 
12139  |  |  | 
12140  | 62  |     if (PyUnicode_READY(self) == -1) { | 
12141  | 0  |         Py_FatalError("identifier not ready"); | 
12142  | 0  |         return 0;  | 
12143  | 0  |     }  | 
12144  |  |  | 
12145  |  |     /* Special case for empty strings */  | 
12146  | 62  |     if (PyUnicode_GET_LENGTH(self) == 0)  | 
12147  | 0  |         return 0;  | 
12148  | 62  |     kind = PyUnicode_KIND(self);  | 
12149  | 62  |     data = PyUnicode_DATA(self);  | 
12150  |  |  | 
12151  |  |     /* PEP 3131 says that the first character must be in  | 
12152  |  |        XID_Start and subsequent characters in XID_Continue,  | 
12153  |  |        and for the ASCII range, the 2.x rules apply (i.e  | 
12154  |  |        start with letters and underscore, continue with  | 
12155  |  |        letters, digits, underscore). However, given the current  | 
12156  |  |        definition of XID_Start and XID_Continue, it is sufficient  | 
12157  |  |        to check just for these, except that _ must be allowed  | 
12158  |  |        as starting an identifier.  */  | 
12159  | 62  |     first = PyUnicode_READ(kind, data, 0);  | 
12160  | 62  |     if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)  | 
12161  | 0  |         return 0;  | 
12162  |  |  | 
12163  | 407  |     for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)  | 
12164  | 345  |         if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))  | 
12165  | 0  |             return 0;  | 
12166  | 62  |     return 1;  | 
12167  | 62  | }  | 
12168  |  |  | 
12169  |  | /*[clinic input]  | 
12170  |  | str.isidentifier as unicode_isidentifier  | 
12171  |  |  | 
12172  |  | Return True if the string is a valid Python identifier, False otherwise.  | 
12173  |  |  | 
12174  |  | Call keyword.iskeyword(s) to test whether string s is a reserved identifier,  | 
12175  |  | such as "def" or "class".  | 
12176  |  | [clinic start generated code]*/  | 
12177  |  |  | 
12178  |  | static PyObject *  | 
12179  |  | unicode_isidentifier_impl(PyObject *self)  | 
12180  |  | /*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/  | 
12181  | 15  | { | 
12182  | 15  |     return PyBool_FromLong(PyUnicode_IsIdentifier(self));  | 
12183  | 15  | }  | 
12184  |  |  | 
12185  |  | /*[clinic input]  | 
12186  |  | str.isprintable as unicode_isprintable  | 
12187  |  |  | 
12188  |  | Return True if the string is printable, False otherwise.  | 
12189  |  |  | 
12190  |  | A string is printable if all of its characters are considered printable in  | 
12191  |  | repr() or if it is empty.  | 
12192  |  | [clinic start generated code]*/  | 
12193  |  |  | 
12194  |  | static PyObject *  | 
12195  |  | unicode_isprintable_impl(PyObject *self)  | 
12196  |  | /*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/  | 
12197  | 0  | { | 
12198  | 0  |     Py_ssize_t i, length;  | 
12199  | 0  |     int kind;  | 
12200  | 0  |     void *data;  | 
12201  |  | 
  | 
12202  | 0  |     if (PyUnicode_READY(self) == -1)  | 
12203  | 0  |         return NULL;  | 
12204  | 0  |     length = PyUnicode_GET_LENGTH(self);  | 
12205  | 0  |     kind = PyUnicode_KIND(self);  | 
12206  | 0  |     data = PyUnicode_DATA(self);  | 
12207  |  |  | 
12208  |  |     /* Shortcut for single character strings */  | 
12209  | 0  |     if (length == 1)  | 
12210  | 0  |         return PyBool_FromLong(  | 
12211  | 0  |             Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));  | 
12212  |  |  | 
12213  | 0  |     for (i = 0; i < length; i++) { | 
12214  | 0  |         if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) { | 
12215  | 0  |             Py_RETURN_FALSE;  | 
12216  | 0  |         }  | 
12217  | 0  |     }  | 
12218  | 0  |     Py_RETURN_TRUE;  | 
12219  | 0  | }  | 
12220  |  |  | 
12221  |  | /*[clinic input]  | 
12222  |  | str.join as unicode_join  | 
12223  |  |  | 
12224  |  |     iterable: object  | 
12225  |  |     /  | 
12226  |  |  | 
12227  |  | Concatenate any number of strings.  | 
12228  |  |  | 
12229  |  | The string whose method is called is inserted in between each given string.  | 
12230  |  | The result is returned as a new string.  | 
12231  |  |  | 
12232  |  | Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'  | 
12233  |  | [clinic start generated code]*/  | 
12234  |  |  | 
12235  |  | static PyObject *  | 
12236  |  | unicode_join(PyObject *self, PyObject *iterable)  | 
12237  |  | /*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/  | 
12238  | 2.30k  | { | 
12239  | 2.30k  |     return PyUnicode_Join(self, iterable);  | 
12240  | 2.30k  | }  | 
12241  |  |  | 
12242  |  | static Py_ssize_t  | 
12243  |  | unicode_length(PyObject *self)  | 
12244  | 6.78k  | { | 
12245  | 6.78k  |     if (PyUnicode_READY(self) == -1)  | 
12246  | 0  |         return -1;  | 
12247  | 6.78k  |     return PyUnicode_GET_LENGTH(self);  | 
12248  | 6.78k  | }  | 
12249  |  |  | 
12250  |  | /*[clinic input]  | 
12251  |  | str.ljust as unicode_ljust  | 
12252  |  |  | 
12253  |  |     width: Py_ssize_t  | 
12254  |  |     fillchar: Py_UCS4 = ' '  | 
12255  |  |     /  | 
12256  |  |  | 
12257  |  | Return a left-justified string of length width.  | 
12258  |  |  | 
12259  |  | Padding is done using the specified fill character (default is a space).  | 
12260  |  | [clinic start generated code]*/  | 
12261  |  |  | 
12262  |  | static PyObject *  | 
12263  |  | unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)  | 
12264  |  | /*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/  | 
12265  | 0  | { | 
12266  | 0  |     if (PyUnicode_READY(self) == -1)  | 
12267  | 0  |         return NULL;  | 
12268  |  |  | 
12269  | 0  |     if (PyUnicode_GET_LENGTH(self) >= width)  | 
12270  | 0  |         return unicode_result_unchanged(self);  | 
12271  |  |  | 
12272  | 0  |     return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);  | 
12273  | 0  | }  | 
12274  |  |  | 
12275  |  | /*[clinic input]  | 
12276  |  | str.lower as unicode_lower  | 
12277  |  |  | 
12278  |  | Return a copy of the string converted to lowercase.  | 
12279  |  | [clinic start generated code]*/  | 
12280  |  |  | 
12281  |  | static PyObject *  | 
12282  |  | unicode_lower_impl(PyObject *self)  | 
12283  |  | /*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/  | 
12284  | 0  | { | 
12285  | 0  |     if (PyUnicode_READY(self) == -1)  | 
12286  | 0  |         return NULL;  | 
12287  | 0  |     if (PyUnicode_IS_ASCII(self))  | 
12288  | 0  |         return ascii_upper_or_lower(self, 1);  | 
12289  | 0  |     return case_operation(self, do_lower);  | 
12290  | 0  | }  | 
12291  |  |  | 
/* Strip modes understood by the strip helpers.  Each value doubles as an
   index into stripfuncnames[] below, so the two must stay in sync. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names indexed by the strip modes above (used when building error
   messages).  Both the pointers and the strings they point to are constant,
   so the table can never be modified at run time. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Name of the str method that corresponds to strip mode i. */
#define STRIPNAME(i) (stripfuncnames[i])
12300  |  |  | 
12301  |  | /* externally visible for str.strip(unicode) */  | 
12302  |  | PyObject *  | 
12303  |  | _PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)  | 
12304  | 3.71k  | { | 
12305  | 3.71k  |     void *data;  | 
12306  | 3.71k  |     int kind;  | 
12307  | 3.71k  |     Py_ssize_t i, j, len;  | 
12308  | 3.71k  |     BLOOM_MASK sepmask;  | 
12309  | 3.71k  |     Py_ssize_t seplen;  | 
12310  |  |  | 
12311  | 3.71k  |     if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)  | 
12312  | 0  |         return NULL;  | 
12313  |  |  | 
12314  | 3.71k  |     kind = PyUnicode_KIND(self);  | 
12315  | 3.71k  |     data = PyUnicode_DATA(self);  | 
12316  | 3.71k  |     len = PyUnicode_GET_LENGTH(self);  | 
12317  | 3.71k  |     seplen = PyUnicode_GET_LENGTH(sepobj);  | 
12318  | 3.71k  |     sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),  | 
12319  | 3.71k  |                               PyUnicode_DATA(sepobj),  | 
12320  | 3.71k  |                               seplen);  | 
12321  |  |  | 
12322  | 3.71k  |     i = 0;  | 
12323  | 3.71k  |     if (striptype != RIGHTSTRIP) { | 
12324  | 0  |         while (i < len) { | 
12325  | 0  |             Py_UCS4 ch = PyUnicode_READ(kind, data, i);  | 
12326  | 0  |             if (!BLOOM(sepmask, ch))  | 
12327  | 0  |                 break;  | 
12328  | 0  |             if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)  | 
12329  | 0  |                 break;  | 
12330  | 0  |             i++;  | 
12331  | 0  |         }  | 
12332  | 0  |     }  | 
12333  |  |  | 
12334  | 3.71k  |     j = len;  | 
12335  | 3.71k  |     if (striptype != LEFTSTRIP) { | 
12336  | 3.71k  |         j--;  | 
12337  | 3.75k  |         while (j >= i) { | 
12338  | 3.75k  |             Py_UCS4 ch = PyUnicode_READ(kind, data, j);  | 
12339  | 3.75k  |             if (!BLOOM(sepmask, ch))  | 
12340  | 2.90k  |                 break;  | 
12341  | 852  |             if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)  | 
12342  | 810  |                 break;  | 
12343  | 42  |             j--;  | 
12344  | 42  |         }  | 
12345  |  |  | 
12346  | 3.71k  |         j++;  | 
12347  | 3.71k  |     }  | 
12348  |  |  | 
12349  | 3.71k  |     return PyUnicode_Substring(self, i, j);  | 
12350  | 3.71k  | }  | 
12351  |  |  | 
12352  |  | PyObject*  | 
12353  |  | PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)  | 
12354  | 4.52k  | { | 
12355  | 4.52k  |     unsigned char *data;  | 
12356  | 4.52k  |     int kind;  | 
12357  | 4.52k  |     Py_ssize_t length;  | 
12358  |  |  | 
12359  | 4.52k  |     if (PyUnicode_READY(self) == -1)  | 
12360  | 0  |         return NULL;  | 
12361  |  |  | 
12362  | 4.52k  |     length = PyUnicode_GET_LENGTH(self);  | 
12363  | 4.52k  |     end = Py_MIN(end, length);  | 
12364  |  |  | 
12365  | 4.52k  |     if (start == 0 && end == length)  | 
12366  | 3.71k  |         return unicode_result_unchanged(self);  | 
12367  |  |  | 
12368  | 807  |     if (start < 0 || end < 0) { | 
12369  | 0  |         PyErr_SetString(PyExc_IndexError, "string index out of range");  | 
12370  | 0  |         return NULL;  | 
12371  | 0  |     }  | 
12372  | 807  |     if (start >= length || end < start)  | 
12373  | 0  |         _Py_RETURN_UNICODE_EMPTY();  | 
12374  |  |  | 
12375  | 807  |     length = end - start;  | 
12376  | 807  |     if (PyUnicode_IS_ASCII(self)) { | 
12377  | 807  |         data = PyUnicode_1BYTE_DATA(self);  | 
12378  | 807  |         return _PyUnicode_FromASCII((char*)(data + start), length);  | 
12379  | 807  |     }  | 
12380  | 0  |     else { | 
12381  | 0  |         kind = PyUnicode_KIND(self);  | 
12382  | 0  |         data = PyUnicode_1BYTE_DATA(self);  | 
12383  | 0  |         return PyUnicode_FromKindAndData(kind,  | 
12384  | 0  |                                          data + kind * start,  | 
12385  | 0  |                                          length);  | 
12386  | 0  |     }  | 
12387  | 807  | }  | 
12388  |  |  | 
12389  |  | static PyObject *  | 
12390  |  | do_strip(PyObject *self, int striptype)  | 
12391  | 59  | { | 
12392  | 59  |     Py_ssize_t len, i, j;  | 
12393  |  |  | 
12394  | 59  |     if (PyUnicode_READY(self) == -1)  | 
12395  | 0  |         return NULL;  | 
12396  |  |  | 
12397  | 59  |     len = PyUnicode_GET_LENGTH(self);  | 
12398  |  |  | 
12399  | 59  |     if (PyUnicode_IS_ASCII(self)) { | 
12400  | 59  |         Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);  | 
12401  |  |  | 
12402  | 59  |         i = 0;  | 
12403  | 59  |         if (striptype != RIGHTSTRIP) { | 
12404  | 242  |             while (i < len) { | 
12405  | 242  |                 Py_UCS1 ch = data[i];  | 
12406  | 242  |                 if (!_Py_ascii_whitespace[ch])  | 
12407  | 59  |                     break;  | 
12408  | 183  |                 i++;  | 
12409  | 183  |             }  | 
12410  | 59  |         }  | 
12411  |  |  | 
12412  | 59  |         j = len;  | 
12413  | 59  |         if (striptype != LEFTSTRIP) { | 
12414  | 59  |             j--;  | 
12415  | 90  |             while (j >= i) { | 
12416  | 90  |                 Py_UCS1 ch = data[j];  | 
12417  | 90  |                 if (!_Py_ascii_whitespace[ch])  | 
12418  | 59  |                     break;  | 
12419  | 31  |                 j--;  | 
12420  | 31  |             }  | 
12421  | 59  |             j++;  | 
12422  | 59  |         }  | 
12423  | 59  |     }  | 
12424  | 0  |     else { | 
12425  | 0  |         int kind = PyUnicode_KIND(self);  | 
12426  | 0  |         void *data = PyUnicode_DATA(self);  | 
12427  |  | 
  | 
12428  | 0  |         i = 0;  | 
12429  | 0  |         if (striptype != RIGHTSTRIP) { | 
12430  | 0  |             while (i < len) { | 
12431  | 0  |                 Py_UCS4 ch = PyUnicode_READ(kind, data, i);  | 
12432  | 0  |                 if (!Py_UNICODE_ISSPACE(ch))  | 
12433  | 0  |                     break;  | 
12434  | 0  |                 i++;  | 
12435  | 0  |             }  | 
12436  | 0  |         }  | 
12437  |  | 
  | 
12438  | 0  |         j = len;  | 
12439  | 0  |         if (striptype != LEFTSTRIP) { | 
12440  | 0  |             j--;  | 
12441  | 0  |             while (j >= i) { | 
12442  | 0  |                 Py_UCS4 ch = PyUnicode_READ(kind, data, j);  | 
12443  | 0  |                 if (!Py_UNICODE_ISSPACE(ch))  | 
12444  | 0  |                     break;  | 
12445  | 0  |                 j--;  | 
12446  | 0  |             }  | 
12447  | 0  |             j++;  | 
12448  | 0  |         }  | 
12449  | 0  |     }  | 
12450  |  |  | 
12451  | 59  |     return PyUnicode_Substring(self, i, j);  | 
12452  | 59  | }  | 
12453  |  |  | 
12454  |  |  | 
12455  |  | static PyObject *  | 
12456  |  | do_argstrip(PyObject *self, int striptype, PyObject *sep)  | 
12457  | 3.77k  | { | 
12458  | 3.77k  |     if (sep != Py_None) { | 
12459  | 3.71k  |         if (PyUnicode_Check(sep))  | 
12460  | 3.71k  |             return _PyUnicode_XStrip(self, striptype, sep);  | 
12461  | 0  |         else { | 
12462  | 0  |             PyErr_Format(PyExc_TypeError,  | 
12463  | 0  |                          "%s arg must be None or str",  | 
12464  | 0  |                          STRIPNAME(striptype));  | 
12465  | 0  |             return NULL;  | 
12466  | 0  |         }  | 
12467  | 3.71k  |     }  | 
12468  |  |  | 
12469  | 59  |     return do_strip(self, striptype);  | 
12470  | 3.77k  | }  | 
12471  |  |  | 
12472  |  |  | 
12473  |  | /*[clinic input]  | 
12474  |  | str.strip as unicode_strip  | 
12475  |  |  | 
12476  |  |     chars: object = None  | 
12477  |  |     /  | 
12478  |  |  | 
12479  |  | Return a copy of the string with leading and trailing whitespace removed.  | 
12480  |  |  | 
12481  |  | If chars is given and not None, remove characters in chars instead.  | 
12482  |  | [clinic start generated code]*/  | 
12483  |  |  | 
12484  |  | static PyObject *  | 
12485  |  | unicode_strip_impl(PyObject *self, PyObject *chars)  | 
12486  |  | /*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/  | 
12487  | 59  | { | 
12488  | 59  |     return do_argstrip(self, BOTHSTRIP, chars);  | 
12489  | 59  | }  | 
12490  |  |  | 
12491  |  |  | 
12492  |  | /*[clinic input]  | 
12493  |  | str.lstrip as unicode_lstrip  | 
12494  |  |  | 
12495  |  |     chars: object = None  | 
12496  |  |     /  | 
12497  |  |  | 
12498  |  | Return a copy of the string with leading whitespace removed.  | 
12499  |  |  | 
12500  |  | If chars is given and not None, remove characters in chars instead.  | 
12501  |  | [clinic start generated code]*/  | 
12502  |  |  | 
12503  |  | static PyObject *  | 
12504  |  | unicode_lstrip_impl(PyObject *self, PyObject *chars)  | 
12505  |  | /*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/  | 
12506  | 0  | { | 
12507  | 0  |     return do_argstrip(self, LEFTSTRIP, chars);  | 
12508  | 0  | }  | 
12509  |  |  | 
12510  |  |  | 
12511  |  | /*[clinic input]  | 
12512  |  | str.rstrip as unicode_rstrip  | 
12513  |  |  | 
12514  |  |     chars: object = None  | 
12515  |  |     /  | 
12516  |  |  | 
12517  |  | Return a copy of the string with trailing whitespace removed.  | 
12518  |  |  | 
12519  |  | If chars is given and not None, remove characters in chars instead.  | 
12520  |  | [clinic start generated code]*/  | 
12521  |  |  | 
12522  |  | static PyObject *  | 
12523  |  | unicode_rstrip_impl(PyObject *self, PyObject *chars)  | 
12524  |  | /*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/  | 
12525  | 3.71k  | { | 
12526  | 3.71k  |     return do_argstrip(self, RIGHTSTRIP, chars);  | 
12527  | 3.71k  | }  | 
12528  |  |  | 
12529  |  |  | 
12530  |  | static PyObject*  | 
12531  |  | unicode_repeat(PyObject *str, Py_ssize_t len)  | 
12532  | 182  | { | 
12533  | 182  |     PyObject *u;  | 
12534  | 182  |     Py_ssize_t nchars, n;  | 
12535  |  |  | 
12536  | 182  |     if (len < 1)  | 
12537  | 0  |         _Py_RETURN_UNICODE_EMPTY();  | 
12538  |  |  | 
12539  |  |     /* no repeat, return original string */  | 
12540  | 182  |     if (len == 1)  | 
12541  | 70  |         return unicode_result_unchanged(str);  | 
12542  |  |  | 
12543  | 112  |     if (PyUnicode_READY(str) == -1)  | 
12544  | 0  |         return NULL;  | 
12545  |  |  | 
12546  | 112  |     if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) { | 
12547  | 0  |         PyErr_SetString(PyExc_OverflowError,  | 
12548  | 0  |                         "repeated string is too long");  | 
12549  | 0  |         return NULL;  | 
12550  | 0  |     }  | 
12551  | 112  |     nchars = len * PyUnicode_GET_LENGTH(str);  | 
12552  |  |  | 
12553  | 112  |     u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));  | 
12554  | 112  |     if (!u)  | 
12555  | 0  |         return NULL;  | 
12556  | 112  |     assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));  | 
12557  |  |  | 
12558  | 112  |     if (PyUnicode_GET_LENGTH(str) == 1) { | 
12559  | 112  |         const int kind = PyUnicode_KIND(str);  | 
12560  | 112  |         const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);  | 
12561  | 112  |         if (kind == PyUnicode_1BYTE_KIND) { | 
12562  | 112  |             void *to = PyUnicode_DATA(u);  | 
12563  | 112  |             memset(to, (unsigned char)fill_char, len);  | 
12564  | 112  |         }  | 
12565  | 0  |         else if (kind == PyUnicode_2BYTE_KIND) { | 
12566  | 0  |             Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);  | 
12567  | 0  |             for (n = 0; n < len; ++n)  | 
12568  | 0  |                 ucs2[n] = fill_char;  | 
12569  | 0  |         } else { | 
12570  | 0  |             Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);  | 
12571  | 0  |             assert(kind == PyUnicode_4BYTE_KIND);  | 
12572  | 0  |             for (n = 0; n < len; ++n)  | 
12573  | 0  |                 ucs4[n] = fill_char;  | 
12574  | 0  |         }  | 
12575  | 112  |     }  | 
12576  | 0  |     else { | 
12577  |  |         /* number of characters copied this far */  | 
12578  | 0  |         Py_ssize_t done = PyUnicode_GET_LENGTH(str);  | 
12579  | 0  |         const Py_ssize_t char_size = PyUnicode_KIND(str);  | 
12580  | 0  |         char *to = (char *) PyUnicode_DATA(u);  | 
12581  | 0  |         memcpy(to, PyUnicode_DATA(str),  | 
12582  | 0  |                   PyUnicode_GET_LENGTH(str) * char_size);  | 
12583  | 0  |         while (done < nchars) { | 
12584  | 0  |             n = (done <= nchars-done) ? done : nchars-done;  | 
12585  | 0  |             memcpy(to + (done * char_size), to, n * char_size);  | 
12586  | 0  |             done += n;  | 
12587  | 0  |         }  | 
12588  | 0  |     }  | 
12589  |  |  | 
12590  | 112  |     assert(_PyUnicode_CheckConsistency(u, 1));  | 
12591  | 112  |     return u;  | 
12592  | 112  | }  | 
12593  |  |  | 
12594  |  | PyObject *  | 
12595  |  | PyUnicode_Replace(PyObject *str,  | 
12596  |  |                   PyObject *substr,  | 
12597  |  |                   PyObject *replstr,  | 
12598  |  |                   Py_ssize_t maxcount)  | 
12599  | 0  | { | 
12600  | 0  |     if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||  | 
12601  | 0  |             ensure_unicode(replstr) < 0)  | 
12602  | 0  |         return NULL;  | 
12603  | 0  |     return replace(str, substr, replstr, maxcount);  | 
12604  | 0  | }  | 
12605  |  |  | 
12606  |  | /*[clinic input]  | 
12607  |  | str.replace as unicode_replace  | 
12608  |  |  | 
12609  |  |     old: unicode  | 
12610  |  |     new: unicode  | 
12611  |  |     count: Py_ssize_t = -1  | 
12612  |  |         Maximum number of occurrences to replace.  | 
12613  |  |         -1 (the default value) means replace all occurrences.  | 
12614  |  |     /  | 
12615  |  |  | 
12616  |  | Return a copy with all occurrences of substring old replaced by new.  | 
12617  |  |  | 
12618  |  | If the optional argument count is given, only the first count occurrences are  | 
12619  |  | replaced.  | 
12620  |  | [clinic start generated code]*/  | 
12621  |  |  | 
12622  |  | static PyObject *  | 
12623  |  | unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,  | 
12624  |  |                      Py_ssize_t count)  | 
12625  |  | /*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/  | 
12626  | 18  | { | 
12627  | 18  |     if (PyUnicode_READY(self) == -1)  | 
12628  | 0  |         return NULL;  | 
12629  | 18  |     return replace(self, old, new, count);  | 
12630  | 18  | }  | 
12631  |  |  | 
12632  |  | static PyObject *  | 
12633  |  | unicode_repr(PyObject *unicode)  | 
12634  | 72  | { | 
12635  | 72  |     PyObject *repr;  | 
12636  | 72  |     Py_ssize_t isize;  | 
12637  | 72  |     Py_ssize_t osize, squote, dquote, i, o;  | 
12638  | 72  |     Py_UCS4 max, quote;  | 
12639  | 72  |     int ikind, okind, unchanged;  | 
12640  | 72  |     void *idata, *odata;  | 
12641  |  |  | 
12642  | 72  |     if (PyUnicode_READY(unicode) == -1)  | 
12643  | 0  |         return NULL;  | 
12644  |  |  | 
12645  | 72  |     isize = PyUnicode_GET_LENGTH(unicode);  | 
12646  | 72  |     idata = PyUnicode_DATA(unicode);  | 
12647  |  |  | 
12648  |  |     /* Compute length of output, quote characters, and  | 
12649  |  |        maximum character */  | 
12650  | 72  |     osize = 0;  | 
12651  | 72  |     max = 127;  | 
12652  | 72  |     squote = dquote = 0;  | 
12653  | 72  |     ikind = PyUnicode_KIND(unicode);  | 
12654  | 696  |     for (i = 0; i < isize; i++) { | 
12655  | 624  |         Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);  | 
12656  | 624  |         Py_ssize_t incr = 1;  | 
12657  | 624  |         switch (ch) { | 
12658  | 0  |         case '\'': squote++; break;  | 
12659  | 0  |         case '"':  dquote++; break;  | 
12660  | 0  |         case '\\': case '\t': case '\r': case '\n':  | 
12661  | 0  |             incr = 2;  | 
12662  | 0  |             break;  | 
12663  | 624  |         default:  | 
12664  |  |             /* Fast-path ASCII */  | 
12665  | 624  |             if (ch < ' ' || ch == 0x7f)  | 
12666  | 0  |                 incr = 4; /* \xHH */  | 
12667  | 624  |             else if (ch < 0x7f)  | 
12668  | 624  |                 ;  | 
12669  | 0  |             else if (Py_UNICODE_ISPRINTABLE(ch))  | 
12670  | 0  |                 max = ch > max ? ch : max;  | 
12671  | 0  |             else if (ch < 0x100)  | 
12672  | 0  |                 incr = 4; /* \xHH */  | 
12673  | 0  |             else if (ch < 0x10000)  | 
12674  | 0  |                 incr = 6; /* \uHHHH */  | 
12675  | 0  |             else  | 
12676  | 0  |                 incr = 10; /* \uHHHHHHHH */  | 
12677  | 624  |         }  | 
12678  | 624  |         if (osize > PY_SSIZE_T_MAX - incr) { | 
12679  | 0  |             PyErr_SetString(PyExc_OverflowError,  | 
12680  | 0  |                             "string is too long to generate repr");  | 
12681  | 0  |             return NULL;  | 
12682  | 0  |         }  | 
12683  | 624  |         osize += incr;  | 
12684  | 624  |     }  | 
12685  |  |  | 
12686  | 72  |     quote = '\'';  | 
12687  | 72  |     unchanged = (osize == isize);  | 
12688  | 72  |     if (squote) { | 
12689  | 0  |         unchanged = 0;  | 
12690  | 0  |         if (dquote)  | 
12691  |  |             /* Both squote and dquote present. Use squote,  | 
12692  |  |                and escape them */  | 
12693  | 0  |             osize += squote;  | 
12694  | 0  |         else  | 
12695  | 0  |             quote = '"';  | 
12696  | 0  |     }  | 
12697  | 72  |     osize += 2;   /* quotes */  | 
12698  |  |  | 
12699  | 72  |     repr = PyUnicode_New(osize, max);  | 
12700  | 72  |     if (repr == NULL)  | 
12701  | 0  |         return NULL;  | 
12702  | 72  |     okind = PyUnicode_KIND(repr);  | 
12703  | 72  |     odata = PyUnicode_DATA(repr);  | 
12704  |  |  | 
12705  | 72  |     PyUnicode_WRITE(okind, odata, 0, quote);  | 
12706  | 72  |     PyUnicode_WRITE(okind, odata, osize-1, quote);  | 
12707  | 72  |     if (unchanged) { | 
12708  | 72  |         _PyUnicode_FastCopyCharacters(repr, 1,  | 
12709  | 72  |                                       unicode, 0,  | 
12710  | 72  |                                       isize);  | 
12711  | 72  |     }  | 
12712  | 0  |     else { | 
12713  | 0  |         for (i = 0, o = 1; i < isize; i++) { | 
12714  | 0  |             Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);  | 
12715  |  |  | 
12716  |  |             /* Escape quotes and backslashes */  | 
12717  | 0  |             if ((ch == quote) || (ch == '\\')) { | 
12718  | 0  |                 PyUnicode_WRITE(okind, odata, o++, '\\');  | 
12719  | 0  |                 PyUnicode_WRITE(okind, odata, o++, ch);  | 
12720  | 0  |                 continue;  | 
12721  | 0  |             }  | 
12722  |  |  | 
12723  |  |             /* Map special whitespace to '\t', \n', '\r' */  | 
12724  | 0  |             if (ch == '\t') { | 
12725  | 0  |                 PyUnicode_WRITE(okind, odata, o++, '\\');  | 
12726  | 0  |                 PyUnicode_WRITE(okind, odata, o++, 't');  | 
12727  | 0  |             }  | 
12728  | 0  |             else if (ch == '\n') { | 
12729  | 0  |                 PyUnicode_WRITE(okind, odata, o++, '\\');  | 
12730  | 0  |                 PyUnicode_WRITE(okind, odata, o++, 'n');  | 
12731  | 0  |             }  | 
12732  | 0  |             else if (ch == '\r') { | 
12733  | 0  |                 PyUnicode_WRITE(okind, odata, o++, '\\');  | 
12734  | 0  |                 PyUnicode_WRITE(okind, odata, o++, 'r');  | 
12735  | 0  |             }  | 
12736  |  |  | 
12737  |  |             /* Map non-printable US ASCII to '\xhh' */  | 
12738  | 0  |             else if (ch < ' ' || ch == 0x7F) { | 
12739  | 0  |                 PyUnicode_WRITE(okind, odata, o++, '\\');  | 
12740  | 0  |                 PyUnicode_WRITE(okind, odata, o++, 'x');  | 
12741  | 0  |                 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);  | 
12742  | 0  |                 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);  | 
12743  | 0  |             }  | 
12744  |  |  | 
12745  |  |             /* Copy ASCII characters as-is */  | 
12746  | 0  |             else if (ch < 0x7F) { | 
12747  | 0  |                 PyUnicode_WRITE(okind, odata, o++, ch);  | 
12748  | 0  |             }  | 
12749  |  |  | 
12750  |  |             /* Non-ASCII characters */  | 
12751  | 0  |             else { | 
12752  |  |                 /* Map Unicode whitespace and control characters  | 
12753  |  |                    (categories Z* and C* except ASCII space)  | 
12754  |  |                 */  | 
12755  | 0  |                 if (!Py_UNICODE_ISPRINTABLE(ch)) { | 
12756  | 0  |                     PyUnicode_WRITE(okind, odata, o++, '\\');  | 
12757  |  |                     /* Map 8-bit characters to '\xhh' */  | 
12758  | 0  |                     if (ch <= 0xff) { | 
12759  | 0  |                         PyUnicode_WRITE(okind, odata, o++, 'x');  | 
12760  | 0  |                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);  | 
12761  | 0  |                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);  | 
12762  | 0  |                     }  | 
12763  |  |                     /* Map 16-bit characters to '\uxxxx' */  | 
12764  | 0  |                     else if (ch <= 0xffff) { | 
12765  | 0  |                         PyUnicode_WRITE(okind, odata, o++, 'u');  | 
12766  | 0  |                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);  | 
12767  | 0  |                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);  | 
12768  | 0  |                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);  | 
12769  | 0  |                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);  | 
12770  | 0  |                     }  | 
12771  |  |                     /* Map 21-bit characters to '\U00xxxxxx' */  | 
12772  | 0  |                     else { | 
12773  | 0  |                         PyUnicode_WRITE(okind, odata, o++, 'U');  | 
12774  | 0  |                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);  | 
12775  | 0  |                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);  | 
12776  | 0  |                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);  | 
12777  | 0  |                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);  | 
12778  | 0  |                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);  | 
12779  | 0  |                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);  | 
12780  | 0  |                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);  | 
12781  | 0  |                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);  | 
12782  | 0  |                     }  | 
12783  | 0  |                 }  | 
12784  |  |                 /* Copy characters as-is */  | 
12785  | 0  |                 else { | 
12786  | 0  |                     PyUnicode_WRITE(okind, odata, o++, ch);  | 
12787  | 0  |                 }  | 
12788  | 0  |             }  | 
12789  | 0  |         }  | 
12790  | 0  |     }  | 
12791  |  |     /* Closing quote already added at the beginning */  | 
12792  | 72  |     assert(_PyUnicode_CheckConsistency(repr, 1));  | 
12793  | 72  |     return repr;  | 
12794  | 72  | }  | 
12795  |  |  | 
12796  |  | PyDoc_STRVAR(rfind__doc__,  | 
12797  |  |              "S.rfind(sub[, start[, end]]) -> int\n\  | 
12798  |  | \n\  | 
12799  |  | Return the highest index in S where substring sub is found,\n\  | 
12800  |  | such that sub is contained within S[start:end].  Optional\n\  | 
12801  |  | arguments start and end are interpreted as in slice notation.\n\  | 
12802  |  | \n\  | 
12803  |  | Return -1 on failure.");  | 
12804  |  |  | 
12805  |  | static PyObject *  | 
12806  |  | unicode_rfind(PyObject *self, PyObject *args)  | 
12807  | 84  | { | 
12808  |  |     /* initialize variables to prevent gcc warning */  | 
12809  | 84  |     PyObject *substring = NULL;  | 
12810  | 84  |     Py_ssize_t start = 0;  | 
12811  | 84  |     Py_ssize_t end = 0;  | 
12812  | 84  |     Py_ssize_t result;  | 
12813  |  |  | 
12814  | 84  |     if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end)) | 
12815  | 0  |         return NULL;  | 
12816  |  |  | 
12817  | 84  |     if (PyUnicode_READY(self) == -1)  | 
12818  | 0  |         return NULL;  | 
12819  |  |  | 
12820  | 84  |     result = any_find_slice(self, substring, start, end, -1);  | 
12821  |  |  | 
12822  | 84  |     if (result == -2)  | 
12823  | 0  |         return NULL;  | 
12824  |  |  | 
12825  | 84  |     return PyLong_FromSsize_t(result);  | 
12826  | 84  | }  | 
12827  |  |  | 
12828  |  | PyDoc_STRVAR(rindex__doc__,  | 
12829  |  |              "S.rindex(sub[, start[, end]]) -> int\n\  | 
12830  |  | \n\  | 
12831  |  | Return the highest index in S where substring sub is found,\n\  | 
12832  |  | such that sub is contained within S[start:end].  Optional\n\  | 
12833  |  | arguments start and end are interpreted as in slice notation.\n\  | 
12834  |  | \n\  | 
12835  |  | Raises ValueError when the substring is not found.");  | 
12836  |  |  | 
12837  |  | static PyObject *  | 
12838  |  | unicode_rindex(PyObject *self, PyObject *args)  | 
12839  | 0  | { | 
12840  |  |     /* initialize variables to prevent gcc warning */  | 
12841  | 0  |     PyObject *substring = NULL;  | 
12842  | 0  |     Py_ssize_t start = 0;  | 
12843  | 0  |     Py_ssize_t end = 0;  | 
12844  | 0  |     Py_ssize_t result;  | 
12845  |  | 
  | 
12846  | 0  |     if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end)) | 
12847  | 0  |         return NULL;  | 
12848  |  |  | 
12849  | 0  |     if (PyUnicode_READY(self) == -1)  | 
12850  | 0  |         return NULL;  | 
12851  |  |  | 
12852  | 0  |     result = any_find_slice(self, substring, start, end, -1);  | 
12853  |  | 
  | 
12854  | 0  |     if (result == -2)  | 
12855  | 0  |         return NULL;  | 
12856  |  |  | 
12857  | 0  |     if (result < 0) { | 
12858  | 0  |         PyErr_SetString(PyExc_ValueError, "substring not found");  | 
12859  | 0  |         return NULL;  | 
12860  | 0  |     }  | 
12861  |  |  | 
12862  | 0  |     return PyLong_FromSsize_t(result);  | 
12863  | 0  | }  | 
12864  |  |  | 
12865  |  | /*[clinic input]  | 
12866  |  | str.rjust as unicode_rjust  | 
12867  |  |  | 
12868  |  |     width: Py_ssize_t  | 
12869  |  |     fillchar: Py_UCS4 = ' '  | 
12870  |  |     /  | 
12871  |  |  | 
12872  |  | Return a right-justified string of length width.  | 
12873  |  |  | 
12874  |  | Padding is done using the specified fill character (default is a space).  | 
12875  |  | [clinic start generated code]*/  | 
12876  |  |  | 
12877  |  | static PyObject *  | 
12878  |  | unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)  | 
12879  |  | /*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/  | 
12880  | 0  | { | 
12881  | 0  |     if (PyUnicode_READY(self) == -1)  | 
12882  | 0  |         return NULL;  | 
12883  |  |  | 
12884  | 0  |     if (PyUnicode_GET_LENGTH(self) >= width)  | 
12885  | 0  |         return unicode_result_unchanged(self);  | 
12886  |  |  | 
12887  | 0  |     return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);  | 
12888  | 0  | }  | 
12889  |  |  | 
12890  |  | PyObject *  | 
12891  |  | PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)  | 
12892  | 0  | { | 
12893  | 0  |     if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))  | 
12894  | 0  |         return NULL;  | 
12895  |  |  | 
12896  | 0  |     return split(s, sep, maxsplit);  | 
12897  | 0  | }  | 
12898  |  |  | 
12899  |  | /*[clinic input]  | 
12900  |  | str.split as unicode_split  | 
12901  |  |  | 
12902  |  |     sep: object = None  | 
12903  |  |         The delimiter according which to split the string.  | 
12904  |  |         None (the default value) means split according to any whitespace,  | 
12905  |  |         and discard empty strings from the result.  | 
12906  |  |     maxsplit: Py_ssize_t = -1  | 
12907  |  |         Maximum number of splits to do.  | 
12908  |  |         -1 (the default value) means no limit.  | 
12909  |  |  | 
12910  |  | Return a list of the words in the string, using sep as the delimiter string.  | 
12911  |  | [clinic start generated code]*/  | 
12912  |  |  | 
12913  |  | static PyObject *  | 
12914  |  | unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)  | 
12915  |  | /*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/  | 
12916  | 74  | { | 
12917  | 74  |     if (sep == Py_None)  | 
12918  | 4  |         return split(self, NULL, maxsplit);  | 
12919  | 70  |     if (PyUnicode_Check(sep))  | 
12920  | 70  |         return split(self, sep, maxsplit);  | 
12921  |  |  | 
12922  | 0  |     PyErr_Format(PyExc_TypeError,  | 
12923  | 0  |                  "must be str or None, not %.100s",  | 
12924  | 0  |                  Py_TYPE(sep)->tp_name);  | 
12925  | 0  |     return NULL;  | 
12926  | 70  | }  | 
12927  |  |  | 
12928  |  | PyObject *  | 
12929  |  | PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)  | 
12930  | 0  | { | 
12931  | 0  |     PyObject* out;  | 
12932  | 0  |     int kind1, kind2;  | 
12933  | 0  |     void *buf1, *buf2;  | 
12934  | 0  |     Py_ssize_t len1, len2;  | 
12935  |  | 
  | 
12936  | 0  |     if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)  | 
12937  | 0  |         return NULL;  | 
12938  |  |  | 
12939  | 0  |     kind1 = PyUnicode_KIND(str_obj);  | 
12940  | 0  |     kind2 = PyUnicode_KIND(sep_obj);  | 
12941  | 0  |     len1 = PyUnicode_GET_LENGTH(str_obj);  | 
12942  | 0  |     len2 = PyUnicode_GET_LENGTH(sep_obj);  | 
12943  | 0  |     if (kind1 < kind2 || len1 < len2) { | 
12944  | 0  |         _Py_INCREF_UNICODE_EMPTY();  | 
12945  | 0  |         if (!unicode_empty)  | 
12946  | 0  |             out = NULL;  | 
12947  | 0  |         else { | 
12948  | 0  |             out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);  | 
12949  | 0  |             Py_DECREF(unicode_empty);  | 
12950  | 0  |         }  | 
12951  | 0  |         return out;  | 
12952  | 0  |     }  | 
12953  | 0  |     buf1 = PyUnicode_DATA(str_obj);  | 
12954  | 0  |     buf2 = PyUnicode_DATA(sep_obj);  | 
12955  | 0  |     if (kind2 != kind1) { | 
12956  | 0  |         buf2 = _PyUnicode_AsKind(sep_obj, kind1);  | 
12957  | 0  |         if (!buf2)  | 
12958  | 0  |             return NULL;  | 
12959  | 0  |     }  | 
12960  |  |  | 
12961  | 0  |     switch (kind1) { | 
12962  | 0  |     case PyUnicode_1BYTE_KIND:  | 
12963  | 0  |         if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))  | 
12964  | 0  |             out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);  | 
12965  | 0  |         else  | 
12966  | 0  |             out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);  | 
12967  | 0  |         break;  | 
12968  | 0  |     case PyUnicode_2BYTE_KIND:  | 
12969  | 0  |         out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);  | 
12970  | 0  |         break;  | 
12971  | 0  |     case PyUnicode_4BYTE_KIND:  | 
12972  | 0  |         out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);  | 
12973  | 0  |         break;  | 
12974  | 0  |     default:  | 
12975  | 0  |         Py_UNREACHABLE();  | 
12976  | 0  |     }  | 
12977  |  |  | 
12978  | 0  |     if (kind2 != kind1)  | 
12979  | 0  |         PyMem_Free(buf2);  | 
12980  |  | 
  | 
12981  | 0  |     return out;  | 
12982  | 0  | }  | 
12983  |  |  | 
12984  |  |  | 
12985  |  | PyObject *  | 
12986  |  | PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)  | 
12987  | 2.20k  | { | 
12988  | 2.20k  |     PyObject* out;  | 
12989  | 2.20k  |     int kind1, kind2;  | 
12990  | 2.20k  |     void *buf1, *buf2;  | 
12991  | 2.20k  |     Py_ssize_t len1, len2;  | 
12992  |  |  | 
12993  | 2.20k  |     if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)  | 
12994  | 0  |         return NULL;  | 
12995  |  |  | 
12996  | 2.20k  |     kind1 = PyUnicode_KIND(str_obj);  | 
12997  | 2.20k  |     kind2 = PyUnicode_KIND(sep_obj);  | 
12998  | 2.20k  |     len1 = PyUnicode_GET_LENGTH(str_obj);  | 
12999  | 2.20k  |     len2 = PyUnicode_GET_LENGTH(sep_obj);  | 
13000  | 2.20k  |     if (kind1 < kind2 || len1 < len2) { | 
13001  | 0  |         _Py_INCREF_UNICODE_EMPTY();  | 
13002  | 0  |         if (!unicode_empty)  | 
13003  | 0  |             out = NULL;  | 
13004  | 0  |         else { | 
13005  | 0  |             out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);  | 
13006  | 0  |             Py_DECREF(unicode_empty);  | 
13007  | 0  |         }  | 
13008  | 0  |         return out;  | 
13009  | 0  |     }  | 
13010  | 2.20k  |     buf1 = PyUnicode_DATA(str_obj);  | 
13011  | 2.20k  |     buf2 = PyUnicode_DATA(sep_obj);  | 
13012  | 2.20k  |     if (kind2 != kind1) { | 
13013  | 0  |         buf2 = _PyUnicode_AsKind(sep_obj, kind1);  | 
13014  | 0  |         if (!buf2)  | 
13015  | 0  |             return NULL;  | 
13016  | 0  |     }  | 
13017  |  |  | 
13018  | 2.20k  |     switch (kind1) { | 
13019  | 2.20k  |     case PyUnicode_1BYTE_KIND:  | 
13020  | 2.20k  |         if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))  | 
13021  | 2.20k  |             out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);  | 
13022  | 0  |         else  | 
13023  | 0  |             out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);  | 
13024  | 2.20k  |         break;  | 
13025  | 0  |     case PyUnicode_2BYTE_KIND:  | 
13026  | 0  |         out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);  | 
13027  | 0  |         break;  | 
13028  | 0  |     case PyUnicode_4BYTE_KIND:  | 
13029  | 0  |         out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);  | 
13030  | 0  |         break;  | 
13031  | 0  |     default:  | 
13032  | 0  |         Py_UNREACHABLE();  | 
13033  | 2.20k  |     }  | 
13034  |  |  | 
13035  | 2.20k  |     if (kind2 != kind1)  | 
13036  | 0  |         PyMem_Free(buf2);  | 
13037  |  |  | 
13038  | 2.20k  |     return out;  | 
13039  | 2.20k  | }  | 
13040  |  |  | 
13041  |  | /*[clinic input]  | 
13042  |  | str.partition as unicode_partition  | 
13043  |  |  | 
13044  |  |     sep: object  | 
13045  |  |     /  | 
13046  |  |  | 
13047  |  | Partition the string into three parts using the given separator.  | 
13048  |  |  | 
13049  |  | This will search for the separator in the string.  If the separator is found,  | 
13050  |  | returns a 3-tuple containing the part before the separator, the separator  | 
13051  |  | itself, and the part after it.  | 
13052  |  |  | 
13053  |  | If the separator is not found, returns a 3-tuple containing the original string  | 
13054  |  | and two empty strings.  | 
13055  |  | [clinic start generated code]*/  | 
13056  |  |  | 
13057  |  | static PyObject *  | 
13058  |  | unicode_partition(PyObject *self, PyObject *sep)  | 
13059  |  | /*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/  | 
13060  | 0  | { | 
13061  | 0  |     return PyUnicode_Partition(self, sep);  | 
13062  | 0  | }  | 
13063  |  |  | 
13064  |  | /*[clinic input]  | 
13065  |  | str.rpartition as unicode_rpartition = str.partition  | 
13066  |  |  | 
13067  |  | Partition the string into three parts using the given separator.  | 
13068  |  |  | 
13069  |  | This will search for the separator in the string, starting at the end. If  | 
13070  |  | the separator is found, returns a 3-tuple containing the part before the  | 
13071  |  | separator, the separator itself, and the part after it.  | 
13072  |  |  | 
13073  |  | If the separator is not found, returns a 3-tuple containing two empty strings  | 
13074  |  | and the original string.  | 
13075  |  | [clinic start generated code]*/  | 
13076  |  |  | 
13077  |  | static PyObject *  | 
13078  |  | unicode_rpartition(PyObject *self, PyObject *sep)  | 
13079  |  | /*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/  | 
13080  | 2.20k  | { | 
13081  | 2.20k  |     return PyUnicode_RPartition(self, sep);  | 
13082  | 2.20k  | }  | 
13083  |  |  | 
13084  |  | PyObject *  | 
13085  |  | PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)  | 
13086  | 0  | { | 
13087  | 0  |     if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))  | 
13088  | 0  |         return NULL;  | 
13089  |  |  | 
13090  | 0  |     return rsplit(s, sep, maxsplit);  | 
13091  | 0  | }  | 
13092  |  |  | 
13093  |  | /*[clinic input]  | 
13094  |  | str.rsplit as unicode_rsplit = str.split  | 
13095  |  |  | 
13096  |  | Return a list of the words in the string, using sep as the delimiter string.  | 
13097  |  |  | 
13098  |  | Splits are done starting at the end of the string and working to the front.  | 
13099  |  | [clinic start generated code]*/  | 
13100  |  |  | 
13101  |  | static PyObject *  | 
13102  |  | unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)  | 
13103  |  | /*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/  | 
13104  | 0  | { | 
13105  | 0  |     if (sep == Py_None)  | 
13106  | 0  |         return rsplit(self, NULL, maxsplit);  | 
13107  | 0  |     if (PyUnicode_Check(sep))  | 
13108  | 0  |         return rsplit(self, sep, maxsplit);  | 
13109  |  |  | 
13110  | 0  |     PyErr_Format(PyExc_TypeError,  | 
13111  | 0  |                  "must be str or None, not %.100s",  | 
13112  | 0  |                  Py_TYPE(sep)->tp_name);  | 
13113  | 0  |     return NULL;  | 
13114  | 0  | }  | 
13115  |  |  | 
13116  |  | /*[clinic input]  | 
13117  |  | str.splitlines as unicode_splitlines  | 
13118  |  |  | 
13119  |  |     keepends: bool(accept={int}) = False | 
13120  |  |  | 
13121  |  | Return a list of the lines in the string, breaking at line boundaries.  | 
13122  |  |  | 
13123  |  | Line breaks are not included in the resulting list unless keepends is given and  | 
13124  |  | true.  | 
13125  |  | [clinic start generated code]*/  | 
13126  |  |  | 
13127  |  | static PyObject *  | 
13128  |  | unicode_splitlines_impl(PyObject *self, int keepends)  | 
13129  |  | /*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/  | 
13130  | 0  | { | 
13131  | 0  |     return PyUnicode_Splitlines(self, keepends);  | 
13132  | 0  | }  | 
13133  |  |  | 
13134  |  | static  | 
13135  |  | PyObject *unicode_str(PyObject *self)  | 
13136  | 0  | { | 
13137  | 0  |     return unicode_result_unchanged(self);  | 
13138  | 0  | }  | 
13139  |  |  | 
13140  |  | /*[clinic input]  | 
13141  |  | str.swapcase as unicode_swapcase  | 
13142  |  |  | 
13143  |  | Convert uppercase characters to lowercase and lowercase characters to uppercase.  | 
13144  |  | [clinic start generated code]*/  | 
13145  |  |  | 
13146  |  | static PyObject *  | 
13147  |  | unicode_swapcase_impl(PyObject *self)  | 
13148  |  | /*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/  | 
13149  | 0  | { | 
13150  | 0  |     if (PyUnicode_READY(self) == -1)  | 
13151  | 0  |         return NULL;  | 
13152  | 0  |     return case_operation(self, do_swapcase);  | 
13153  | 0  | }  | 
13154  |  |  | 
13155  |  | /*[clinic input]  | 
13156  |  |  | 
13157  |  | @staticmethod  | 
13158  |  | str.maketrans as unicode_maketrans  | 
13159  |  |  | 
13160  |  |   x: object  | 
13161  |  |  | 
13162  |  |   y: unicode=NULL  | 
13163  |  |  | 
13164  |  |   z: unicode=NULL  | 
13165  |  |  | 
13166  |  |   /  | 
13167  |  |  | 
13168  |  | Return a translation table usable for str.translate().  | 
13169  |  |  | 
13170  |  | If there is only one argument, it must be a dictionary mapping Unicode  | 
13171  |  | ordinals (integers) or characters to Unicode ordinals, strings or None.  | 
13172  |  | Character keys will be then converted to ordinals.  | 
13173  |  | If there are two arguments, they must be strings of equal length, and  | 
13174  |  | in the resulting dictionary, each character in x will be mapped to the  | 
13175  |  | character at the same position in y. If there is a third argument, it  | 
13176  |  | must be a string, whose characters will be mapped to None in the result.  | 
13177  |  | [clinic start generated code]*/  | 
13178  |  |  | 
13179  |  | static PyObject *  | 
13180  |  | unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)  | 
13181  |  | /*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/  | 
13182  | 0  | { | 
13183  | 0  |     PyObject *new = NULL, *key, *value;  | 
13184  | 0  |     Py_ssize_t i = 0;  | 
13185  | 0  |     int res;  | 
13186  |  | 
  | 
13187  | 0  |     new = PyDict_New();  | 
13188  | 0  |     if (!new)  | 
13189  | 0  |         return NULL;  | 
13190  | 0  |     if (y != NULL) { | 
13191  | 0  |         int x_kind, y_kind, z_kind;  | 
13192  | 0  |         void *x_data, *y_data, *z_data;  | 
13193  |  |  | 
13194  |  |         /* x must be a string too, of equal length */  | 
13195  | 0  |         if (!PyUnicode_Check(x)) { | 
13196  | 0  |             PyErr_SetString(PyExc_TypeError, "first maketrans argument must "  | 
13197  | 0  |                             "be a string if there is a second argument");  | 
13198  | 0  |             goto err;  | 
13199  | 0  |         }  | 
13200  | 0  |         if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) { | 
13201  | 0  |             PyErr_SetString(PyExc_ValueError, "the first two maketrans "  | 
13202  | 0  |                             "arguments must have equal length");  | 
13203  | 0  |             goto err;  | 
13204  | 0  |         }  | 
13205  |  |         /* create entries for translating chars in x to those in y */  | 
13206  | 0  |         x_kind = PyUnicode_KIND(x);  | 
13207  | 0  |         y_kind = PyUnicode_KIND(y);  | 
13208  | 0  |         x_data = PyUnicode_DATA(x);  | 
13209  | 0  |         y_data = PyUnicode_DATA(y);  | 
13210  | 0  |         for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) { | 
13211  | 0  |             key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));  | 
13212  | 0  |             if (!key)  | 
13213  | 0  |                 goto err;  | 
13214  | 0  |             value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));  | 
13215  | 0  |             if (!value) { | 
13216  | 0  |                 Py_DECREF(key);  | 
13217  | 0  |                 goto err;  | 
13218  | 0  |             }  | 
13219  | 0  |             res = PyDict_SetItem(new, key, value);  | 
13220  | 0  |             Py_DECREF(key);  | 
13221  | 0  |             Py_DECREF(value);  | 
13222  | 0  |             if (res < 0)  | 
13223  | 0  |                 goto err;  | 
13224  | 0  |         }  | 
13225  |  |         /* create entries for deleting chars in z */  | 
13226  | 0  |         if (z != NULL) { | 
13227  | 0  |             z_kind = PyUnicode_KIND(z);  | 
13228  | 0  |             z_data = PyUnicode_DATA(z);  | 
13229  | 0  |             for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) { | 
13230  | 0  |                 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));  | 
13231  | 0  |                 if (!key)  | 
13232  | 0  |                     goto err;  | 
13233  | 0  |                 res = PyDict_SetItem(new, key, Py_None);  | 
13234  | 0  |                 Py_DECREF(key);  | 
13235  | 0  |                 if (res < 0)  | 
13236  | 0  |                     goto err;  | 
13237  | 0  |             }  | 
13238  | 0  |         }  | 
13239  | 0  |     } else { | 
13240  | 0  |         int kind;  | 
13241  | 0  |         void *data;  | 
13242  |  |  | 
13243  |  |         /* x must be a dict */  | 
13244  | 0  |         if (!PyDict_CheckExact(x)) { | 
13245  | 0  |             PyErr_SetString(PyExc_TypeError, "if you give only one argument "  | 
13246  | 0  |                             "to maketrans it must be a dict");  | 
13247  | 0  |             goto err;  | 
13248  | 0  |         }  | 
13249  |  |         /* copy entries into the new dict, converting string keys to int keys */  | 
13250  | 0  |         while (PyDict_Next(x, &i, &key, &value)) { | 
13251  | 0  |             if (PyUnicode_Check(key)) { | 
13252  |  |                 /* convert string keys to integer keys */  | 
13253  | 0  |                 PyObject *newkey;  | 
13254  | 0  |                 if (PyUnicode_GET_LENGTH(key) != 1) { | 
13255  | 0  |                     PyErr_SetString(PyExc_ValueError, "string keys in translate "  | 
13256  | 0  |                                     "table must be of length 1");  | 
13257  | 0  |                     goto err;  | 
13258  | 0  |                 }  | 
13259  | 0  |                 kind = PyUnicode_KIND(key);  | 
13260  | 0  |                 data = PyUnicode_DATA(key);  | 
13261  | 0  |                 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));  | 
13262  | 0  |                 if (!newkey)  | 
13263  | 0  |                     goto err;  | 
13264  | 0  |                 res = PyDict_SetItem(new, newkey, value);  | 
13265  | 0  |                 Py_DECREF(newkey);  | 
13266  | 0  |                 if (res < 0)  | 
13267  | 0  |                     goto err;  | 
13268  | 0  |             } else if (PyLong_Check(key)) { | 
13269  |  |                 /* just keep integer keys */  | 
13270  | 0  |                 if (PyDict_SetItem(new, key, value) < 0)  | 
13271  | 0  |                     goto err;  | 
13272  | 0  |             } else { | 
13273  | 0  |                 PyErr_SetString(PyExc_TypeError, "keys in translate table must "  | 
13274  | 0  |                                 "be strings or integers");  | 
13275  | 0  |                 goto err;  | 
13276  | 0  |             }  | 
13277  | 0  |         }  | 
13278  | 0  |     }  | 
13279  | 0  |     return new;  | 
13280  | 0  |   err:  | 
13281  | 0  |     Py_DECREF(new);  | 
13282  | 0  |     return NULL;  | 
13283  | 0  | }  | 
13284  |  |  | 
13285  |  | /*[clinic input]  | 
13286  |  | str.translate as unicode_translate  | 
13287  |  |  | 
13288  |  |     table: object  | 
13289  |  |         Translation table, which must be a mapping of Unicode ordinals to  | 
13290  |  |         Unicode ordinals, strings, or None.  | 
13291  |  |     /  | 
13292  |  |  | 
13293  |  | Replace each character in the string using the given translation table.  | 
13294  |  |  | 
13295  |  | The table must implement lookup/indexing via __getitem__, for instance a  | 
13296  |  | dictionary or list.  If this operation raises LookupError, the character is  | 
13297  |  | left untouched.  Characters mapped to None are deleted.  | 
13298  |  | [clinic start generated code]*/  | 
13299  |  |  | 
13300  |  | static PyObject *  | 
13301  |  | unicode_translate(PyObject *self, PyObject *table)  | 
13302  |  | /*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/  | 
13303  | 48  | { | 
13304  | 48  |     return _PyUnicode_TranslateCharmap(self, table, "ignore");  | 
13305  | 48  | }  | 
13306  |  |  | 
13307  |  | /*[clinic input]  | 
13308  |  | str.upper as unicode_upper  | 
13309  |  |  | 
13310  |  | Return a copy of the string converted to uppercase.  | 
13311  |  | [clinic start generated code]*/  | 
13312  |  |  | 
13313  |  | static PyObject *  | 
13314  |  | unicode_upper_impl(PyObject *self)  | 
13315  |  | /*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/  | 
13316  | 36  | { | 
13317  | 36  |     if (PyUnicode_READY(self) == -1)  | 
13318  | 0  |         return NULL;  | 
13319  | 36  |     if (PyUnicode_IS_ASCII(self))  | 
13320  | 36  |         return ascii_upper_or_lower(self, 0);  | 
13321  | 0  |     return case_operation(self, do_upper);  | 
13322  | 36  | }  | 
13323  |  |  | 
13324  |  | /*[clinic input]  | 
13325  |  | str.zfill as unicode_zfill  | 
13326  |  |  | 
13327  |  |     width: Py_ssize_t  | 
13328  |  |     /  | 
13329  |  |  | 
13330  |  | Pad a numeric string with zeros on the left, to fill a field of the given width.  | 
13331  |  |  | 
13332  |  | The string is never truncated.  | 
13333  |  | [clinic start generated code]*/  | 
13334  |  |  | 
13335  |  | static PyObject *  | 
13336  |  | unicode_zfill_impl(PyObject *self, Py_ssize_t width)  | 
13337  |  | /*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/  | 
13338  | 0  | { | 
13339  | 0  |     Py_ssize_t fill;  | 
13340  | 0  |     PyObject *u;  | 
13341  | 0  |     int kind;  | 
13342  | 0  |     void *data;  | 
13343  | 0  |     Py_UCS4 chr;  | 
13344  |  | 
  | 
13345  | 0  |     if (PyUnicode_READY(self) == -1)  | 
13346  | 0  |         return NULL;  | 
13347  |  |  | 
13348  | 0  |     if (PyUnicode_GET_LENGTH(self) >= width)  | 
13349  | 0  |         return unicode_result_unchanged(self);  | 
13350  |  |  | 
13351  | 0  |     fill = width - PyUnicode_GET_LENGTH(self);  | 
13352  |  | 
  | 
13353  | 0  |     u = pad(self, fill, 0, '0');  | 
13354  |  | 
  | 
13355  | 0  |     if (u == NULL)  | 
13356  | 0  |         return NULL;  | 
13357  |  |  | 
13358  | 0  |     kind = PyUnicode_KIND(u);  | 
13359  | 0  |     data = PyUnicode_DATA(u);  | 
13360  | 0  |     chr = PyUnicode_READ(kind, data, fill);  | 
13361  |  | 
  | 
13362  | 0  |     if (chr == '+' || chr == '-') { | 
13363  |  |         /* move sign to beginning of string */  | 
13364  | 0  |         PyUnicode_WRITE(kind, data, 0, chr);  | 
13365  | 0  |         PyUnicode_WRITE(kind, data, fill, '0');  | 
13366  | 0  |     }  | 
13367  |  |  | 
13368  | 0  |     assert(_PyUnicode_CheckConsistency(u, 1));  | 
13369  | 0  |     return u;  | 
13370  | 0  | }  | 
13371  |  |  | 
#if 0
/* Compiled out by the surrounding #if 0.  Wrapper that forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13379  |  |  | 
13380  |  | PyDoc_STRVAR(startswith__doc__,  | 
13381  |  |              "S.startswith(prefix[, start[, end]]) -> bool\n\  | 
13382  |  | \n\  | 
13383  |  | Return True if S starts with the specified prefix, False otherwise.\n\  | 
13384  |  | With optional start, test S beginning at that position.\n\  | 
13385  |  | With optional end, stop comparing S at that position.\n\  | 
13386  |  | prefix can also be a tuple of strings to try.");  | 
13387  |  |  | 
13388  |  | static PyObject *  | 
13389  |  | unicode_startswith(PyObject *self,  | 
13390  |  |                    PyObject *args)  | 
13391  | 526  | { | 
13392  | 526  |     PyObject *subobj;  | 
13393  | 526  |     PyObject *substring;  | 
13394  | 526  |     Py_ssize_t start = 0;  | 
13395  | 526  |     Py_ssize_t end = PY_SSIZE_T_MAX;  | 
13396  | 526  |     int result;  | 
13397  |  |  | 
13398  | 526  |     if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end)) | 
13399  | 0  |         return NULL;  | 
13400  | 526  |     if (PyTuple_Check(subobj)) { | 
13401  | 43  |         Py_ssize_t i;  | 
13402  | 172  |         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { | 
13403  | 129  |             substring = PyTuple_GET_ITEM(subobj, i);  | 
13404  | 129  |             if (!PyUnicode_Check(substring)) { | 
13405  | 0  |                 PyErr_Format(PyExc_TypeError,  | 
13406  | 0  |                              "tuple for startswith must only contain str, "  | 
13407  | 0  |                              "not %.100s",  | 
13408  | 0  |                              Py_TYPE(substring)->tp_name);  | 
13409  | 0  |                 return NULL;  | 
13410  | 0  |             }  | 
13411  | 129  |             result = tailmatch(self, substring, start, end, -1);  | 
13412  | 129  |             if (result == -1)  | 
13413  | 0  |                 return NULL;  | 
13414  | 129  |             if (result) { | 
13415  | 0  |                 Py_RETURN_TRUE;  | 
13416  | 0  |             }  | 
13417  | 129  |         }  | 
13418  |  |         /* nothing matched */  | 
13419  | 43  |         Py_RETURN_FALSE;  | 
13420  | 43  |     }  | 
13421  | 483  |     if (!PyUnicode_Check(subobj)) { | 
13422  | 0  |         PyErr_Format(PyExc_TypeError,  | 
13423  | 0  |                      "startswith first arg must be str or "  | 
13424  | 0  |                      "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);  | 
13425  | 0  |         return NULL;  | 
13426  | 0  |     }  | 
13427  | 483  |     result = tailmatch(self, subobj, start, end, -1);  | 
13428  | 483  |     if (result == -1)  | 
13429  | 0  |         return NULL;  | 
13430  | 483  |     return PyBool_FromLong(result);  | 
13431  | 483  | }  | 
13432  |  |  | 
13433  |  |  | 
13434  |  | PyDoc_STRVAR(endswith__doc__,  | 
13435  |  |              "S.endswith(suffix[, start[, end]]) -> bool\n\  | 
13436  |  | \n\  | 
13437  |  | Return True if S ends with the specified suffix, False otherwise.\n\  | 
13438  |  | With optional start, test S beginning at that position.\n\  | 
13439  |  | With optional end, stop comparing S at that position.\n\  | 
13440  |  | suffix can also be a tuple of strings to try.");  | 
13441  |  |  | 
13442  |  | static PyObject *  | 
13443  |  | unicode_endswith(PyObject *self,  | 
13444  |  |                  PyObject *args)  | 
13445  | 404  | { | 
13446  | 404  |     PyObject *subobj;  | 
13447  | 404  |     PyObject *substring;  | 
13448  | 404  |     Py_ssize_t start = 0;  | 
13449  | 404  |     Py_ssize_t end = PY_SSIZE_T_MAX;  | 
13450  | 404  |     int result;  | 
13451  |  |  | 
13452  | 404  |     if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end)) | 
13453  | 0  |         return NULL;  | 
13454  | 404  |     if (PyTuple_Check(subobj)) { | 
13455  | 235  |         Py_ssize_t i;  | 
13456  | 235  |         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { | 
13457  | 235  |             substring = PyTuple_GET_ITEM(subobj, i);  | 
13458  | 235  |             if (!PyUnicode_Check(substring)) { | 
13459  | 0  |                 PyErr_Format(PyExc_TypeError,  | 
13460  | 0  |                              "tuple for endswith must only contain str, "  | 
13461  | 0  |                              "not %.100s",  | 
13462  | 0  |                              Py_TYPE(substring)->tp_name);  | 
13463  | 0  |                 return NULL;  | 
13464  | 0  |             }  | 
13465  | 235  |             result = tailmatch(self, substring, start, end, +1);  | 
13466  | 235  |             if (result == -1)  | 
13467  | 0  |                 return NULL;  | 
13468  | 235  |             if (result) { | 
13469  | 235  |                 Py_RETURN_TRUE;  | 
13470  | 235  |             }  | 
13471  | 235  |         }  | 
13472  | 235  |         Py_RETURN_FALSE;  | 
13473  | 235  |     }  | 
13474  | 169  |     if (!PyUnicode_Check(subobj)) { | 
13475  | 0  |         PyErr_Format(PyExc_TypeError,  | 
13476  | 0  |                      "endswith first arg must be str or "  | 
13477  | 0  |                      "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);  | 
13478  | 0  |         return NULL;  | 
13479  | 0  |     }  | 
13480  | 169  |     result = tailmatch(self, subobj, start, end, +1);  | 
13481  | 169  |     if (result == -1)  | 
13482  | 0  |         return NULL;  | 
13483  | 169  |     return PyBool_FromLong(result);  | 
13484  | 169  | }  | 
13485  |  |  | 
13486  |  | static inline void  | 
13487  |  | _PyUnicodeWriter_Update(_PyUnicodeWriter *writer)  | 
13488  | 70.7k  | { | 
13489  | 70.7k  |     writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);  | 
13490  | 70.7k  |     writer->data = PyUnicode_DATA(writer->buffer);  | 
13491  |  |  | 
13492  | 70.7k  |     if (!writer->readonly) { | 
13493  | 70.7k  |         writer->kind = PyUnicode_KIND(writer->buffer);  | 
13494  | 70.7k  |         writer->size = PyUnicode_GET_LENGTH(writer->buffer);  | 
13495  | 70.7k  |     }  | 
13496  | 0  |     else { | 
13497  |  |         /* use a value smaller than PyUnicode_1BYTE_KIND() so  | 
13498  |  |            _PyUnicodeWriter_PrepareKind() will copy the buffer. */  | 
13499  | 0  |         writer->kind = PyUnicode_WCHAR_KIND;  | 
13500  | 0  |         assert(writer->kind <= PyUnicode_1BYTE_KIND);  | 
13501  |  |  | 
13502  |  |         /* Copy-on-write mode: set buffer size to 0 so  | 
13503  |  |          * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on  | 
13504  |  |          * next write. */  | 
13505  | 0  |         writer->size = 0;  | 
13506  | 0  |     }  | 
13507  | 70.7k  | }  | 
13508  |  |  | 
13509  |  | void  | 
13510  |  | _PyUnicodeWriter_Init(_PyUnicodeWriter *writer)  | 
13511  | 70.6k  | { | 
13512  | 70.6k  |     memset(writer, 0, sizeof(*writer));  | 
13513  |  |  | 
13514  |  |     /* ASCII is the bare minimum */  | 
13515  | 70.6k  |     writer->min_char = 127;  | 
13516  |  |  | 
13517  |  |     /* use a value smaller than PyUnicode_1BYTE_KIND() so  | 
13518  |  |        _PyUnicodeWriter_PrepareKind() will copy the buffer. */  | 
13519  | 70.6k  |     writer->kind = PyUnicode_WCHAR_KIND;  | 
13520  | 70.6k  |     assert(writer->kind <= PyUnicode_1BYTE_KIND);  | 
13521  | 70.6k  | }  | 
13522  |  |  | 
13523  |  | int  | 
13524  |  | _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,  | 
13525  |  |                                  Py_ssize_t length, Py_UCS4 maxchar)  | 
13526  | 70.7k  | { | 
13527  | 70.7k  |     Py_ssize_t newlen;  | 
13528  | 70.7k  |     PyObject *newbuffer;  | 
13529  |  |  | 
13530  | 70.7k  |     assert(maxchar <= MAX_UNICODE);  | 
13531  |  |  | 
13532  |  |     /* ensure that the _PyUnicodeWriter_Prepare macro was used */  | 
13533  | 70.7k  |     assert((maxchar > writer->maxchar && length >= 0)  | 
13534  | 70.7k  |            || length > 0);  | 
13535  |  |  | 
13536  | 70.7k  |     if (length > PY_SSIZE_T_MAX - writer->pos) { | 
13537  | 0  |         PyErr_NoMemory();  | 
13538  | 0  |         return -1;  | 
13539  | 0  |     }  | 
13540  | 70.7k  |     newlen = writer->pos + length;  | 
13541  |  |  | 
13542  | 70.7k  |     maxchar = Py_MAX(maxchar, writer->min_char);  | 
13543  |  |  | 
13544  | 70.7k  |     if (writer->buffer == NULL) { | 
13545  | 70.6k  |         assert(!writer->readonly);  | 
13546  | 70.6k  |         if (writer->overallocate  | 
13547  | 6.30k  |             && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) { | 
13548  |  |             /* overallocate to limit the number of realloc() */  | 
13549  | 6.30k  |             newlen += newlen / OVERALLOCATE_FACTOR;  | 
13550  | 6.30k  |         }  | 
13551  | 70.6k  |         if (newlen < writer->min_length)  | 
13552  | 6.30k  |             newlen = writer->min_length;  | 
13553  |  |  | 
13554  | 70.6k  |         writer->buffer = PyUnicode_New(newlen, maxchar);  | 
13555  | 70.6k  |         if (writer->buffer == NULL)  | 
13556  | 0  |             return -1;  | 
13557  | 70.6k  |     }  | 
13558  | 63  |     else if (newlen > writer->size) { | 
13559  | 34  |         if (writer->overallocate  | 
13560  | 6  |             && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) { | 
13561  |  |             /* overallocate to limit the number of realloc() */  | 
13562  | 6  |             newlen += newlen / OVERALLOCATE_FACTOR;  | 
13563  | 6  |         }  | 
13564  | 34  |         if (newlen < writer->min_length)  | 
13565  | 0  |             newlen = writer->min_length;  | 
13566  |  |  | 
13567  | 34  |         if (maxchar > writer->maxchar || writer->readonly) { | 
13568  |  |             /* resize + widen */  | 
13569  | 0  |             maxchar = Py_MAX(maxchar, writer->maxchar);  | 
13570  | 0  |             newbuffer = PyUnicode_New(newlen, maxchar);  | 
13571  | 0  |             if (newbuffer == NULL)  | 
13572  | 0  |                 return -1;  | 
13573  | 0  |             _PyUnicode_FastCopyCharacters(newbuffer, 0,  | 
13574  | 0  |                                           writer->buffer, 0, writer->pos);  | 
13575  | 0  |             Py_DECREF(writer->buffer);  | 
13576  | 0  |             writer->readonly = 0;  | 
13577  | 0  |         }  | 
13578  | 34  |         else { | 
13579  | 34  |             newbuffer = resize_compact(writer->buffer, newlen);  | 
13580  | 34  |             if (newbuffer == NULL)  | 
13581  | 0  |                 return -1;  | 
13582  | 34  |         }  | 
13583  | 34  |         writer->buffer = newbuffer;  | 
13584  | 34  |     }  | 
13585  | 29  |     else if (maxchar > writer->maxchar) { | 
13586  | 29  |         assert(!writer->readonly);  | 
13587  | 29  |         newbuffer = PyUnicode_New(writer->size, maxchar);  | 
13588  | 29  |         if (newbuffer == NULL)  | 
13589  | 0  |             return -1;  | 
13590  | 29  |         _PyUnicode_FastCopyCharacters(newbuffer, 0,  | 
13591  | 29  |                                       writer->buffer, 0, writer->pos);  | 
13592  | 29  |         Py_SETREF(writer->buffer, newbuffer);  | 
13593  | 29  |     }  | 
13594  | 70.7k  |     _PyUnicodeWriter_Update(writer);  | 
13595  | 70.7k  |     return 0;  | 
13596  |  |  | 
13597  | 70.7k  | #undef OVERALLOCATE_FACTOR  | 
13598  | 70.7k  | }  | 
13599  |  |  | 
13600  |  | int  | 
13601  |  | _PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,  | 
13602  |  |                                      enum PyUnicode_Kind kind)  | 
13603  | 0  | { | 
13604  | 0  |     Py_UCS4 maxchar;  | 
13605  |  |  | 
13606  |  |     /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */  | 
13607  | 0  |     assert(writer->kind < kind);  | 
13608  |  | 
  | 
13609  | 0  |     switch (kind)  | 
13610  | 0  |     { | 
13611  | 0  |     case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;  | 
13612  | 0  |     case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;  | 
13613  | 0  |     case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;  | 
13614  | 0  |     default:  | 
13615  | 0  |         Py_UNREACHABLE();  | 
13616  | 0  |     }  | 
13617  |  |  | 
13618  | 0  |     return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);  | 
13619  | 0  | }  | 
13620  |  |  | 
13621  |  | static inline int  | 
13622  |  | _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)  | 
13623  | 77  | { | 
13624  | 77  |     assert(ch <= MAX_UNICODE);  | 
13625  | 77  |     if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)  | 
13626  | 0  |         return -1;  | 
13627  | 77  |     PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);  | 
13628  | 77  |     writer->pos++;  | 
13629  | 77  |     return 0;  | 
13630  | 77  | }  | 
13631  |  |  | 
13632  |  | int  | 
13633  |  | _PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)  | 
13634  | 4  | { | 
13635  | 4  |     return _PyUnicodeWriter_WriteCharInline(writer, ch);  | 
13636  | 4  | }  | 
13637  |  |  | 
13638  |  | int  | 
13639  |  | _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)  | 
13640  | 12.1k  | { | 
13641  | 12.1k  |     Py_UCS4 maxchar;  | 
13642  | 12.1k  |     Py_ssize_t len;  | 
13643  |  |  | 
13644  | 12.1k  |     if (PyUnicode_READY(str) == -1)  | 
13645  | 0  |         return -1;  | 
13646  | 12.1k  |     len = PyUnicode_GET_LENGTH(str);  | 
13647  | 12.1k  |     if (len == 0)  | 
13648  | 262  |         return 0;  | 
13649  | 11.8k  |     maxchar = PyUnicode_MAX_CHAR_VALUE(str);  | 
13650  | 11.8k  |     if (maxchar > writer->maxchar || len > writer->size - writer->pos) { | 
13651  | 343  |         if (writer->buffer == NULL && !writer->overallocate) { | 
13652  | 0  |             assert(_PyUnicode_CheckConsistency(str, 1));  | 
13653  | 0  |             writer->readonly = 1;  | 
13654  | 0  |             Py_INCREF(str);  | 
13655  | 0  |             writer->buffer = str;  | 
13656  | 0  |             _PyUnicodeWriter_Update(writer);  | 
13657  | 0  |             writer->pos += len;  | 
13658  | 0  |             return 0;  | 
13659  | 0  |         }  | 
13660  | 343  |         if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)  | 
13661  | 0  |             return -1;  | 
13662  | 343  |     }  | 
13663  | 11.8k  |     _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,  | 
13664  | 11.8k  |                                   str, 0, len);  | 
13665  | 11.8k  |     writer->pos += len;  | 
13666  | 11.8k  |     return 0;  | 
13667  | 11.8k  | }  | 
13668  |  |  | 
13669  |  | int  | 
13670  |  | _PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,  | 
13671  |  |                                 Py_ssize_t start, Py_ssize_t end)  | 
13672  | 334  | { | 
13673  | 334  |     Py_UCS4 maxchar;  | 
13674  | 334  |     Py_ssize_t len;  | 
13675  |  |  | 
13676  | 334  |     if (PyUnicode_READY(str) == -1)  | 
13677  | 0  |         return -1;  | 
13678  |  |  | 
13679  | 334  |     assert(0 <= start);  | 
13680  | 334  |     assert(end <= PyUnicode_GET_LENGTH(str));  | 
13681  | 334  |     assert(start <= end);  | 
13682  |  |  | 
13683  | 334  |     if (end == 0)  | 
13684  | 0  |         return 0;  | 
13685  |  |  | 
13686  | 334  |     if (start == 0 && end == PyUnicode_GET_LENGTH(str))  | 
13687  | 0  |         return _PyUnicodeWriter_WriteStr(writer, str);  | 
13688  |  |  | 
13689  | 334  |     if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)  | 
13690  | 120  |         maxchar = _PyUnicode_FindMaxChar(str, start, end);  | 
13691  | 214  |     else  | 
13692  | 214  |         maxchar = writer->maxchar;  | 
13693  | 334  |     len = end - start;  | 
13694  |  |  | 
13695  | 334  |     if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)  | 
13696  | 0  |         return -1;  | 
13697  |  |  | 
13698  | 334  |     _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,  | 
13699  | 334  |                                   str, start, len);  | 
13700  | 334  |     writer->pos += len;  | 
13701  | 334  |     return 0;  | 
13702  | 334  | }  | 
13703  |  |  | 
13704  |  | int  | 
13705  |  | _PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,  | 
13706  |  |                                   const char *ascii, Py_ssize_t len)  | 
13707  | 18.4k  | { | 
13708  | 18.4k  |     if (len == -1)  | 
13709  | 0  |         len = strlen(ascii);  | 
13710  |  |  | 
13711  | 18.4k  |     assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);  | 
13712  |  |  | 
13713  | 18.4k  |     if (writer->buffer == NULL && !writer->overallocate) { | 
13714  | 0  |         PyObject *str;  | 
13715  |  | 
  | 
13716  | 0  |         str = _PyUnicode_FromASCII(ascii, len);  | 
13717  | 0  |         if (str == NULL)  | 
13718  | 0  |             return -1;  | 
13719  |  |  | 
13720  | 0  |         writer->readonly = 1;  | 
13721  | 0  |         writer->buffer = str;  | 
13722  | 0  |         _PyUnicodeWriter_Update(writer);  | 
13723  | 0  |         writer->pos += len;  | 
13724  | 0  |         return 0;  | 
13725  | 0  |     }  | 
13726  |  |  | 
13727  | 18.4k  |     if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)  | 
13728  | 0  |         return -1;  | 
13729  |  |  | 
13730  | 18.4k  |     switch (writer->kind)  | 
13731  | 18.4k  |     { | 
13732  | 18.4k  |     case PyUnicode_1BYTE_KIND:  | 
13733  | 18.4k  |     { | 
13734  | 18.4k  |         const Py_UCS1 *str = (const Py_UCS1 *)ascii;  | 
13735  | 18.4k  |         Py_UCS1 *data = writer->data;  | 
13736  |  |  | 
13737  | 18.4k  |         memcpy(data + writer->pos, str, len);  | 
13738  | 18.4k  |         break;  | 
13739  | 0  |     }  | 
13740  | 0  |     case PyUnicode_2BYTE_KIND:  | 
13741  | 0  |     { | 
13742  | 0  |         _PyUnicode_CONVERT_BYTES(  | 
13743  | 0  |             Py_UCS1, Py_UCS2,  | 
13744  | 0  |             ascii, ascii + len,  | 
13745  | 0  |             (Py_UCS2 *)writer->data + writer->pos);  | 
13746  | 0  |         break;  | 
13747  | 0  |     }  | 
13748  | 0  |     case PyUnicode_4BYTE_KIND:  | 
13749  | 0  |     { | 
13750  | 0  |         _PyUnicode_CONVERT_BYTES(  | 
13751  | 0  |             Py_UCS1, Py_UCS4,  | 
13752  | 0  |             ascii, ascii + len,  | 
13753  | 0  |             (Py_UCS4 *)writer->data + writer->pos);  | 
13754  | 0  |         break;  | 
13755  | 0  |     }  | 
13756  | 0  |     default:  | 
13757  | 0  |         Py_UNREACHABLE();  | 
13758  | 18.4k  |     }  | 
13759  |  |  | 
13760  | 18.4k  |     writer->pos += len;  | 
13761  | 18.4k  |     return 0;  | 
13762  | 18.4k  | }  | 
13763  |  |  | 
13764  |  | int  | 
13765  |  | _PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,  | 
13766  |  |                                    const char *str, Py_ssize_t len)  | 
13767  | 0  | { | 
13768  | 0  |     Py_UCS4 maxchar;  | 
13769  |  | 
  | 
13770  | 0  |     maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);  | 
13771  | 0  |     if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)  | 
13772  | 0  |         return -1;  | 
13773  | 0  |     unicode_write_cstr(writer->buffer, writer->pos, str, len);  | 
13774  | 0  |     writer->pos += len;  | 
13775  | 0  |     return 0;  | 
13776  | 0  | }  | 
13777  |  |  | 
13778  |  | PyObject *  | 
13779  |  | _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)  | 
13780  | 70.6k  | { | 
13781  | 70.6k  |     PyObject *str;  | 
13782  |  |  | 
13783  | 70.6k  |     if (writer->pos == 0) { | 
13784  | 0  |         Py_CLEAR(writer->buffer);  | 
13785  | 0  |         _Py_RETURN_UNICODE_EMPTY();  | 
13786  | 0  |     }  | 
13787  |  |  | 
13788  | 70.6k  |     str = writer->buffer;  | 
13789  | 70.6k  |     writer->buffer = NULL;  | 
13790  |  |  | 
13791  | 70.6k  |     if (writer->readonly) { | 
13792  | 0  |         assert(PyUnicode_GET_LENGTH(str) == writer->pos);  | 
13793  | 0  |         return str;  | 
13794  | 0  |     }  | 
13795  |  |  | 
13796  | 70.6k  |     if (PyUnicode_GET_LENGTH(str) != writer->pos) { | 
13797  | 6.28k  |         PyObject *str2;  | 
13798  | 6.28k  |         str2 = resize_compact(str, writer->pos);  | 
13799  | 6.28k  |         if (str2 == NULL) { | 
13800  | 0  |             Py_DECREF(str);  | 
13801  | 0  |             return NULL;  | 
13802  | 0  |         }  | 
13803  | 6.28k  |         str = str2;  | 
13804  | 6.28k  |     }  | 
13805  |  |  | 
13806  | 70.6k  |     assert(_PyUnicode_CheckConsistency(str, 1));  | 
13807  | 70.6k  |     return unicode_result_ready(str);  | 
13808  | 70.6k  | }  | 
13809  |  |  | 
13810  |  | void  | 
13811  |  | _PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)  | 
13812  | 32  | { | 
13813  | 32  |     Py_CLEAR(writer->buffer);  | 
13814  | 32  | }  | 
13815  |  |  | 
13816  |  | #include "stringlib/unicode_format.h"  | 
13817  |  |  | 
13818  |  | PyDoc_STRVAR(format__doc__,  | 
13819  |  |              "S.format(*args, **kwargs) -> str\n\  | 
13820  |  | \n\  | 
13821  |  | Return a formatted version of S, using substitutions from args and kwargs.\n\  | 
13822  |  | The substitutions are identified by braces ('{' and '}')."); | 
13823  |  |  | 
13824  |  | PyDoc_STRVAR(format_map__doc__,  | 
13825  |  |              "S.format_map(mapping) -> str\n\  | 
13826  |  | \n\  | 
13827  |  | Return a formatted version of S, using substitutions from mapping.\n\  | 
13828  |  | The substitutions are identified by braces ('{' and '}')."); | 
13829  |  |  | 
13830  |  | /*[clinic input]  | 
13831  |  | str.__format__ as unicode___format__  | 
13832  |  |  | 
13833  |  |     format_spec: unicode  | 
13834  |  |     /  | 
13835  |  |  | 
13836  |  | Return a formatted version of the string as described by format_spec.  | 
13837  |  | [clinic start generated code]*/  | 
13838  |  |  | 
13839  |  | static PyObject *  | 
13840  |  | unicode___format___impl(PyObject *self, PyObject *format_spec)  | 
13841  |  | /*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/  | 
13842  | 0  | { | 
13843  | 0  |     _PyUnicodeWriter writer;  | 
13844  | 0  |     int ret;  | 
13845  |  | 
  | 
13846  | 0  |     if (PyUnicode_READY(self) == -1)  | 
13847  | 0  |         return NULL;  | 
13848  | 0  |     _PyUnicodeWriter_Init(&writer);  | 
13849  | 0  |     ret = _PyUnicode_FormatAdvancedWriter(&writer,  | 
13850  | 0  |                                           self, format_spec, 0,  | 
13851  | 0  |                                           PyUnicode_GET_LENGTH(format_spec));  | 
13852  | 0  |     if (ret == -1) { | 
13853  | 0  |         _PyUnicodeWriter_Dealloc(&writer);  | 
13854  | 0  |         return NULL;  | 
13855  | 0  |     }  | 
13856  | 0  |     return _PyUnicodeWriter_Finish(&writer);  | 
13857  | 0  | }  | 
13858  |  |  | 
13859  |  | /*[clinic input]  | 
13860  |  | str.__sizeof__ as unicode_sizeof  | 
13861  |  |  | 
13862  |  | Return the size of the string in memory, in bytes.  | 
13863  |  | [clinic start generated code]*/  | 
13864  |  |  | 
13865  |  | static PyObject *  | 
13866  |  | unicode_sizeof_impl(PyObject *self)  | 
13867  |  | /*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/  | 
13868  | 0  | { | 
13869  | 0  |     Py_ssize_t size;  | 
13870  |  |  | 
13871  |  |     /* If it's a compact object, account for base structure +  | 
13872  |  |        character data. */  | 
13873  | 0  |     if (PyUnicode_IS_COMPACT_ASCII(self))  | 
13874  | 0  |         size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;  | 
13875  | 0  |     else if (PyUnicode_IS_COMPACT(self))  | 
13876  | 0  |         size = sizeof(PyCompactUnicodeObject) +  | 
13877  | 0  |             (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);  | 
13878  | 0  |     else { | 
13879  |  |         /* If it is a two-block object, account for base object, and  | 
13880  |  |            for character block if present. */  | 
13881  | 0  |         size = sizeof(PyUnicodeObject);  | 
13882  | 0  |         if (_PyUnicode_DATA_ANY(self))  | 
13883  | 0  |             size += (PyUnicode_GET_LENGTH(self) + 1) *  | 
13884  | 0  |                 PyUnicode_KIND(self);  | 
13885  | 0  |     }  | 
13886  |  |     /* If the wstr pointer is present, account for it unless it is shared  | 
13887  |  |        with the data pointer. Check if the data is not shared. */  | 
13888  | 0  |     if (_PyUnicode_HAS_WSTR_MEMORY(self))  | 
13889  | 0  |         size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);  | 
13890  | 0  |     if (_PyUnicode_HAS_UTF8_MEMORY(self))  | 
13891  | 0  |         size += PyUnicode_UTF8_LENGTH(self) + 1;  | 
13892  |  | 
  | 
13893  | 0  |     return PyLong_FromSsize_t(size);  | 
13894  | 0  | }  | 
13895  |  |  | 
13896  |  | static PyObject *  | 
13897  |  | unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))  | 
13898  | 0  | { | 
13899  | 0  |     PyObject *copy = _PyUnicode_Copy(v);  | 
13900  | 0  |     if (!copy)  | 
13901  | 0  |         return NULL;  | 
13902  | 0  |     return Py_BuildValue("(N)", copy); | 
13903  | 0  | }  | 
13904  |  |  | 
13905  |  | static PyMethodDef unicode_methods[] = { | 
13906  |  |     UNICODE_ENCODE_METHODDEF  | 
13907  |  |     UNICODE_REPLACE_METHODDEF  | 
13908  |  |     UNICODE_SPLIT_METHODDEF  | 
13909  |  |     UNICODE_RSPLIT_METHODDEF  | 
13910  |  |     UNICODE_JOIN_METHODDEF  | 
13911  |  |     UNICODE_CAPITALIZE_METHODDEF  | 
13912  |  |     UNICODE_CASEFOLD_METHODDEF  | 
13913  |  |     UNICODE_TITLE_METHODDEF  | 
13914  |  |     UNICODE_CENTER_METHODDEF  | 
13915  |  |     {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__}, | 
13916  |  |     UNICODE_EXPANDTABS_METHODDEF  | 
13917  |  |     {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__}, | 
13918  |  |     UNICODE_PARTITION_METHODDEF  | 
13919  |  |     {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__}, | 
13920  |  |     UNICODE_LJUST_METHODDEF  | 
13921  |  |     UNICODE_LOWER_METHODDEF  | 
13922  |  |     UNICODE_LSTRIP_METHODDEF  | 
13923  |  |     {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__}, | 
13924  |  |     {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__}, | 
13925  |  |     UNICODE_RJUST_METHODDEF  | 
13926  |  |     UNICODE_RSTRIP_METHODDEF  | 
13927  |  |     UNICODE_RPARTITION_METHODDEF  | 
13928  |  |     UNICODE_SPLITLINES_METHODDEF  | 
13929  |  |     UNICODE_STRIP_METHODDEF  | 
13930  |  |     UNICODE_SWAPCASE_METHODDEF  | 
13931  |  |     UNICODE_TRANSLATE_METHODDEF  | 
13932  |  |     UNICODE_UPPER_METHODDEF  | 
13933  |  |     {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__}, | 
13934  |  |     {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__}, | 
13935  |  |     UNICODE_ISASCII_METHODDEF  | 
13936  |  |     UNICODE_ISLOWER_METHODDEF  | 
13937  |  |     UNICODE_ISUPPER_METHODDEF  | 
13938  |  |     UNICODE_ISTITLE_METHODDEF  | 
13939  |  |     UNICODE_ISSPACE_METHODDEF  | 
13940  |  |     UNICODE_ISDECIMAL_METHODDEF  | 
13941  |  |     UNICODE_ISDIGIT_METHODDEF  | 
13942  |  |     UNICODE_ISNUMERIC_METHODDEF  | 
13943  |  |     UNICODE_ISALPHA_METHODDEF  | 
13944  |  |     UNICODE_ISALNUM_METHODDEF  | 
13945  |  |     UNICODE_ISIDENTIFIER_METHODDEF  | 
13946  |  |     UNICODE_ISPRINTABLE_METHODDEF  | 
13947  |  |     UNICODE_ZFILL_METHODDEF  | 
13948  |  |     {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__}, | 
13949  |  |     {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__}, | 
13950  |  |     UNICODE___FORMAT___METHODDEF  | 
13951  |  |     UNICODE_MAKETRANS_METHODDEF  | 
13952  |  |     UNICODE_SIZEOF_METHODDEF  | 
13953  |  | #if 0  | 
13954  |  |     /* These methods are just used for debugging the implementation. */  | 
13955  |  |     {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS}, | 
13956  |  | #endif  | 
13957  |  |  | 
13958  |  |     {"__getnewargs__",  unicode_getnewargs, METH_NOARGS}, | 
13959  |  |     {NULL, NULL} | 
13960  |  | };  | 
13961  |  |  | 
13962  |  | static PyObject *  | 
13963  |  | unicode_mod(PyObject *v, PyObject *w)  | 
13964  | 0  | { | 
13965  | 0  |     if (!PyUnicode_Check(v))  | 
13966  | 0  |         Py_RETURN_NOTIMPLEMENTED;  | 
13967  | 0  |     return PyUnicode_Format(v, w);  | 
13968  | 0  | }  | 
13969  |  |  | 
13970  |  | static PyNumberMethods unicode_as_number = { | 
13971  |  |     0,              /*nb_add*/  | 
13972  |  |     0,              /*nb_subtract*/  | 
13973  |  |     0,              /*nb_multiply*/  | 
13974  |  |     unicode_mod,            /*nb_remainder*/  | 
13975  |  | };  | 
13976  |  |  | 
13977  |  | static PySequenceMethods unicode_as_sequence = { | 
13978  |  |     (lenfunc) unicode_length,       /* sq_length */  | 
13979  |  |     PyUnicode_Concat,           /* sq_concat */  | 
13980  |  |     (ssizeargfunc) unicode_repeat,  /* sq_repeat */  | 
13981  |  |     (ssizeargfunc) unicode_getitem,     /* sq_item */  | 
13982  |  |     0,                  /* sq_slice */  | 
13983  |  |     0,                  /* sq_ass_item */  | 
13984  |  |     0,                  /* sq_ass_slice */  | 
13985  |  |     PyUnicode_Contains,         /* sq_contains */  | 
13986  |  | };  | 
13987  |  |  | 
13988  |  | static PyObject*  | 
13989  |  | unicode_subscript(PyObject* self, PyObject* item)  | 
13990  | 5.61k  | { | 
13991  | 5.61k  |     if (PyUnicode_READY(self) == -1)  | 
13992  | 0  |         return NULL;  | 
13993  |  |  | 
13994  | 5.61k  |     if (PyIndex_Check(item)) { | 
13995  | 5.15k  |         Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);  | 
13996  | 5.15k  |         if (i == -1 && PyErr_Occurred())  | 
13997  | 0  |             return NULL;  | 
13998  | 5.15k  |         if (i < 0)  | 
13999  | 104  |             i += PyUnicode_GET_LENGTH(self);  | 
14000  | 5.15k  |         return unicode_getitem(self, i);  | 
14001  | 5.15k  |     } else if (PySlice_Check(item)) { | 
14002  | 455  |         Py_ssize_t start, stop, step, slicelength, i;  | 
14003  | 455  |         size_t cur;  | 
14004  | 455  |         PyObject *result;  | 
14005  | 455  |         void *src_data, *dest_data;  | 
14006  | 455  |         int src_kind, dest_kind;  | 
14007  | 455  |         Py_UCS4 ch, max_char, kind_limit;  | 
14008  |  |  | 
14009  | 455  |         if (PySlice_Unpack(item, &start, &stop, &step) < 0) { | 
14010  | 0  |             return NULL;  | 
14011  | 0  |         }  | 
14012  | 455  |         slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),  | 
14013  | 455  |                                             &start, &stop, step);  | 
14014  |  |  | 
14015  | 455  |         if (slicelength <= 0) { | 
14016  | 70  |             _Py_RETURN_UNICODE_EMPTY();  | 
14017  | 385  |         } else if (start == 0 && step == 1 &&  | 
14018  | 188  |                    slicelength == PyUnicode_GET_LENGTH(self)) { | 
14019  | 0  |             return unicode_result_unchanged(self);  | 
14020  | 385  |         } else if (step == 1) { | 
14021  | 385  |             return PyUnicode_Substring(self,  | 
14022  | 385  |                                        start, start + slicelength);  | 
14023  | 385  |         }  | 
14024  |  |         /* General case */  | 
14025  | 0  |         src_kind = PyUnicode_KIND(self);  | 
14026  | 0  |         src_data = PyUnicode_DATA(self);  | 
14027  | 0  |         if (!PyUnicode_IS_ASCII(self)) { | 
14028  | 0  |             kind_limit = kind_maxchar_limit(src_kind);  | 
14029  | 0  |             max_char = 0;  | 
14030  | 0  |             for (cur = start, i = 0; i < slicelength; cur += step, i++) { | 
14031  | 0  |                 ch = PyUnicode_READ(src_kind, src_data, cur);  | 
14032  | 0  |                 if (ch > max_char) { | 
14033  | 0  |                     max_char = ch;  | 
14034  | 0  |                     if (max_char >= kind_limit)  | 
14035  | 0  |                         break;  | 
14036  | 0  |                 }  | 
14037  | 0  |             }  | 
14038  | 0  |         }  | 
14039  | 0  |         else  | 
14040  | 0  |             max_char = 127;  | 
14041  | 0  |         result = PyUnicode_New(slicelength, max_char);  | 
14042  | 0  |         if (result == NULL)  | 
14043  | 0  |             return NULL;  | 
14044  | 0  |         dest_kind = PyUnicode_KIND(result);  | 
14045  | 0  |         dest_data = PyUnicode_DATA(result);  | 
14046  |  | 
  | 
14047  | 0  |         for (cur = start, i = 0; i < slicelength; cur += step, i++) { | 
14048  | 0  |             Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);  | 
14049  | 0  |             PyUnicode_WRITE(dest_kind, dest_data, i, ch);  | 
14050  | 0  |         }  | 
14051  | 0  |         assert(_PyUnicode_CheckConsistency(result, 1));  | 
14052  | 0  |         return result;  | 
14053  | 0  |     } else { | 
14054  | 0  |         PyErr_SetString(PyExc_TypeError, "string indices must be integers");  | 
14055  | 0  |         return NULL;  | 
14056  | 0  |     }  | 
14057  | 5.61k  | }  | 
14058  |  |  | 
14059  |  | static PyMappingMethods unicode_as_mapping = { | 
14060  |  |     (lenfunc)unicode_length,        /* mp_length */  | 
14061  |  |     (binaryfunc)unicode_subscript,  /* mp_subscript */  | 
14062  |  |     (objobjargproc)0,           /* mp_ass_subscript */  | 
14063  |  | };  | 
14064  |  |  | 
14065  |  |  | 
14066  |  | /* Helpers for PyUnicode_Format() */  | 
14067  |  |  | 
14068  |  | struct unicode_formatter_t { | 
14069  |  |     PyObject *args;  | 
14070  |  |     int args_owned;  | 
14071  |  |     Py_ssize_t arglen, argidx;  | 
14072  |  |     PyObject *dict;  | 
14073  |  |  | 
14074  |  |     enum PyUnicode_Kind fmtkind;  | 
14075  |  |     Py_ssize_t fmtcnt, fmtpos;  | 
14076  |  |     void *fmtdata;  | 
14077  |  |     PyObject *fmtstr;  | 
14078  |  |  | 
14079  |  |     _PyUnicodeWriter writer;  | 
14080  |  | };  | 
14081  |  |  | 
14082  |  | struct unicode_format_arg_t { | 
14083  |  |     Py_UCS4 ch;  | 
14084  |  |     int flags;  | 
14085  |  |     Py_ssize_t width;  | 
14086  |  |     int prec;  | 
14087  |  |     int sign;  | 
14088  |  | };  | 
14089  |  |  | 
14090  |  | static PyObject *  | 
14091  |  | unicode_format_getnextarg(struct unicode_formatter_t *ctx)  | 
14092  | 116  | { | 
14093  | 116  |     Py_ssize_t argidx = ctx->argidx;  | 
14094  |  |  | 
14095  | 116  |     if (argidx < ctx->arglen) { | 
14096  | 116  |         ctx->argidx++;  | 
14097  | 116  |         if (ctx->arglen < 0)  | 
14098  | 46  |             return ctx->args;  | 
14099  | 70  |         else  | 
14100  | 70  |             return PyTuple_GetItem(ctx->args, argidx);  | 
14101  | 116  |     }  | 
14102  | 0  |     PyErr_SetString(PyExc_TypeError,  | 
14103  | 0  |                     "not enough arguments for format string");  | 
14104  | 0  |     return NULL;  | 
14105  | 116  | }  | 
14106  |  |  | 
14107  |  | /* Returns a new reference to a PyUnicode object, or NULL on failure. */  | 
14108  |  |  | 
14109  |  | /* Format a float into the writer if the writer is not NULL, or into *p_output  | 
14110  |  |    otherwise.  | 
14111  |  |  | 
14112  |  |    Return 0 on success, raise an exception and return -1 on error. */  | 
14113  |  | static int  | 
14114  |  | formatfloat(PyObject *v, struct unicode_format_arg_t *arg,  | 
14115  |  |             PyObject **p_output,  | 
14116  |  |             _PyUnicodeWriter *writer)  | 
14117  | 0  | { | 
14118  | 0  |     char *p;  | 
14119  | 0  |     double x;  | 
14120  | 0  |     Py_ssize_t len;  | 
14121  | 0  |     int prec;  | 
14122  | 0  |     int dtoa_flags;  | 
14123  |  | 
  | 
14124  | 0  |     x = PyFloat_AsDouble(v);  | 
14125  | 0  |     if (x == -1.0 && PyErr_Occurred())  | 
14126  | 0  |         return -1;  | 
14127  |  |  | 
14128  | 0  |     prec = arg->prec;  | 
14129  | 0  |     if (prec < 0)  | 
14130  | 0  |         prec = 6;  | 
14131  |  | 
  | 
14132  | 0  |     if (arg->flags & F_ALT)  | 
14133  | 0  |         dtoa_flags = Py_DTSF_ALT;  | 
14134  | 0  |     else  | 
14135  | 0  |         dtoa_flags = 0;  | 
14136  | 0  |     p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);  | 
14137  | 0  |     if (p == NULL)  | 
14138  | 0  |         return -1;  | 
14139  | 0  |     len = strlen(p);  | 
14140  | 0  |     if (writer) { | 
14141  | 0  |         if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) { | 
14142  | 0  |             PyMem_Free(p);  | 
14143  | 0  |             return -1;  | 
14144  | 0  |         }  | 
14145  | 0  |     }  | 
14146  | 0  |     else  | 
14147  | 0  |         *p_output = _PyUnicode_FromASCII(p, len);  | 
14148  | 0  |     PyMem_Free(p);  | 
14149  | 0  |     return 0;  | 
14150  | 0  | }  | 
14151  |  |  | 
14152  |  | /* formatlong() emulates the format codes d, u, o, x and X, and  | 
14153  |  |  * the F_ALT flag, for Python's long (unbounded) ints.  It's not used for  | 
14154  |  |  * Python's regular ints.  | 
14155  |  |  * Return value:  a new PyUnicodeObject*, or NULL if error.  | 
14156  |  |  *     The output string is of the form  | 
14157  |  |  *         "-"? ("0x" | "0X")? digit+ | 
14158  |  |  *     "0x"/"0X" are present only for x and X conversions, with F_ALT  | 
14159  |  |  *         set in flags.  The case of hex digits will be correct,  | 
14160  |  |  *     There will be at least prec digits, zero-filled on the left if  | 
14161  |  |  *         necessary to get that many.  | 
14162  |  |  * val          object to be converted  | 
14163  |  |  * flags        bitmask of format flags; only F_ALT is looked at  | 
14164  |  |  * prec         minimum number of digits; 0-fill on left if needed  | 
14165  |  |  * type         a character in [duoxX]; u acts the same as d  | 
14166  |  |  *  | 
14167  |  |  * CAUTION:  o, x and X conversions on regular ints can never  | 
14168  |  |  * produce a '-' sign, but can for Python's unbounded ints.  | 
14169  |  |  */  | 
14170  |  | PyObject *  | 
14171  |  | _PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)  | 
14172  | 0  | { | 
14173  | 0  |     PyObject *result = NULL;  | 
14174  | 0  |     char *buf;  | 
14175  | 0  |     Py_ssize_t i;  | 
14176  | 0  |     int sign;           /* 1 if '-', else 0 */  | 
14177  | 0  |     int len;            /* number of characters */  | 
14178  | 0  |     Py_ssize_t llen;  | 
14179  | 0  |     int numdigits;      /* len == numnondigits + numdigits */  | 
14180  | 0  |     int numnondigits = 0;  | 
14181  |  |  | 
14182  |  |     /* Avoid exceeding SSIZE_T_MAX */  | 
14183  | 0  |     if (prec > INT_MAX-3) { | 
14184  | 0  |         PyErr_SetString(PyExc_OverflowError,  | 
14185  | 0  |                         "precision too large");  | 
14186  | 0  |         return NULL;  | 
14187  | 0  |     }  | 
14188  |  |  | 
14189  | 0  |     assert(PyLong_Check(val));  | 
14190  |  | 
  | 
14191  | 0  |     switch (type) { | 
14192  | 0  |     default:  | 
14193  | 0  |         Py_UNREACHABLE();  | 
14194  | 0  |     case 'd':  | 
14195  | 0  |     case 'i':  | 
14196  | 0  |     case 'u':  | 
14197  |  |         /* int and int subclasses should print numerically when a numeric */  | 
14198  |  |         /* format code is used (see issue18780) */  | 
14199  | 0  |         result = PyNumber_ToBase(val, 10);  | 
14200  | 0  |         break;  | 
14201  | 0  |     case 'o':  | 
14202  | 0  |         numnondigits = 2;  | 
14203  | 0  |         result = PyNumber_ToBase(val, 8);  | 
14204  | 0  |         break;  | 
14205  | 0  |     case 'x':  | 
14206  | 0  |     case 'X':  | 
14207  | 0  |         numnondigits = 2;  | 
14208  | 0  |         result = PyNumber_ToBase(val, 16);  | 
14209  | 0  |         break;  | 
14210  | 0  |     }  | 
14211  | 0  |     if (!result)  | 
14212  | 0  |         return NULL;  | 
14213  |  |  | 
14214  | 0  |     assert(unicode_modifiable(result));  | 
14215  | 0  |     assert(PyUnicode_IS_READY(result));  | 
14216  | 0  |     assert(PyUnicode_IS_ASCII(result));  | 
14217  |  |  | 
14218  |  |     /* To modify the string in-place, there can only be one reference. */  | 
14219  | 0  |     if (Py_REFCNT(result) != 1) { | 
14220  | 0  |         Py_DECREF(result);  | 
14221  | 0  |         PyErr_BadInternalCall();  | 
14222  | 0  |         return NULL;  | 
14223  | 0  |     }  | 
14224  | 0  |     buf = PyUnicode_DATA(result);  | 
14225  | 0  |     llen = PyUnicode_GET_LENGTH(result);  | 
14226  | 0  |     if (llen > INT_MAX) { | 
14227  | 0  |         Py_DECREF(result);  | 
14228  | 0  |         PyErr_SetString(PyExc_ValueError,  | 
14229  | 0  |                         "string too large in _PyUnicode_FormatLong");  | 
14230  | 0  |         return NULL;  | 
14231  | 0  |     }  | 
14232  | 0  |     len = (int)llen;  | 
14233  | 0  |     sign = buf[0] == '-';  | 
14234  | 0  |     numnondigits += sign;  | 
14235  | 0  |     numdigits = len - numnondigits;  | 
14236  | 0  |     assert(numdigits > 0);  | 
14237  |  |  | 
14238  |  |     /* Get rid of base marker unless F_ALT */  | 
14239  | 0  |     if (((alt) == 0 &&  | 
14240  | 0  |         (type == 'o' || type == 'x' || type == 'X'))) { | 
14241  | 0  |         assert(buf[sign] == '0');  | 
14242  | 0  |         assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||  | 
14243  | 0  |                buf[sign+1] == 'o');  | 
14244  | 0  |         numnondigits -= 2;  | 
14245  | 0  |         buf += 2;  | 
14246  | 0  |         len -= 2;  | 
14247  | 0  |         if (sign)  | 
14248  | 0  |             buf[0] = '-';  | 
14249  | 0  |         assert(len == numnondigits + numdigits);  | 
14250  | 0  |         assert(numdigits > 0);  | 
14251  | 0  |     }  | 
14252  |  |  | 
14253  |  |     /* Fill with leading zeroes to meet minimum width. */  | 
14254  | 0  |     if (prec > numdigits) { | 
14255  | 0  |         PyObject *r1 = PyBytes_FromStringAndSize(NULL,  | 
14256  | 0  |                                 numnondigits + prec);  | 
14257  | 0  |         char *b1;  | 
14258  | 0  |         if (!r1) { | 
14259  | 0  |             Py_DECREF(result);  | 
14260  | 0  |             return NULL;  | 
14261  | 0  |         }  | 
14262  | 0  |         b1 = PyBytes_AS_STRING(r1);  | 
14263  | 0  |         for (i = 0; i < numnondigits; ++i)  | 
14264  | 0  |             *b1++ = *buf++;  | 
14265  | 0  |         for (i = 0; i < prec - numdigits; i++)  | 
14266  | 0  |             *b1++ = '0';  | 
14267  | 0  |         for (i = 0; i < numdigits; i++)  | 
14268  | 0  |             *b1++ = *buf++;  | 
14269  | 0  |         *b1 = '\0';  | 
14270  | 0  |         Py_DECREF(result);  | 
14271  | 0  |         result = r1;  | 
14272  | 0  |         buf = PyBytes_AS_STRING(result);  | 
14273  | 0  |         len = numnondigits + prec;  | 
14274  | 0  |     }  | 
14275  |  |  | 
14276  |  |     /* Fix up case for hex conversions. */  | 
14277  | 0  |     if (type == 'X') { | 
14278  |  |         /* Need to convert all lower case letters to upper case.  | 
14279  |  |            and need to convert 0x to 0X (and -0x to -0X). */  | 
14280  | 0  |         for (i = 0; i < len; i++)  | 
14281  | 0  |             if (buf[i] >= 'a' && buf[i] <= 'x')  | 
14282  | 0  |                 buf[i] -= 'a'-'A';  | 
14283  | 0  |     }  | 
14284  | 0  |     if (!PyUnicode_Check(result)  | 
14285  | 0  |         || buf != PyUnicode_DATA(result)) { | 
14286  | 0  |         PyObject *unicode;  | 
14287  | 0  |         unicode = _PyUnicode_FromASCII(buf, len);  | 
14288  | 0  |         Py_DECREF(result);  | 
14289  | 0  |         result = unicode;  | 
14290  | 0  |     }  | 
14291  | 0  |     else if (len != PyUnicode_GET_LENGTH(result)) { | 
14292  | 0  |         if (PyUnicode_Resize(&result, len) < 0)  | 
14293  | 0  |             Py_CLEAR(result);  | 
14294  | 0  |     }  | 
14295  | 0  |     return result;  | 
14296  | 0  | }  | 
14297  |  |  | 
14298  |  | /* Format an integer or a float as an integer.  | 
14299  |  |  * Return 1 if the number has been formatted into the writer,  | 
14300  |  |  *        0 if the number has been formatted into *p_output  | 
14301  |  |  *       -1 and raise an exception on error */  | 
14302  |  | static int  | 
14303  |  | mainformatlong(PyObject *v,  | 
14304  |  |                struct unicode_format_arg_t *arg,  | 
14305  |  |                PyObject **p_output,  | 
14306  |  |                _PyUnicodeWriter *writer)  | 
14307  | 28  | { | 
14308  | 28  |     PyObject *iobj, *res;  | 
14309  | 28  |     char type = (char)arg->ch;  | 
14310  |  |  | 
14311  | 28  |     if (!PyNumber_Check(v))  | 
14312  | 0  |         goto wrongtype;  | 
14313  |  |  | 
14314  |  |     /* make sure number is a type of integer for o, x, and X */  | 
14315  | 28  |     if (!PyLong_Check(v)) { | 
14316  | 0  |         if (type == 'o' || type == 'x' || type == 'X') { | 
14317  | 0  |             iobj = PyNumber_Index(v);  | 
14318  | 0  |             if (iobj == NULL) { | 
14319  | 0  |                 if (PyErr_ExceptionMatches(PyExc_TypeError))  | 
14320  | 0  |                     goto wrongtype;  | 
14321  | 0  |                 return -1;  | 
14322  | 0  |             }  | 
14323  | 0  |         }  | 
14324  | 0  |         else { | 
14325  | 0  |             iobj = PyNumber_Long(v);  | 
14326  | 0  |             if (iobj == NULL ) { | 
14327  | 0  |                 if (PyErr_ExceptionMatches(PyExc_TypeError))  | 
14328  | 0  |                     goto wrongtype;  | 
14329  | 0  |                 return -1;  | 
14330  | 0  |             }  | 
14331  | 0  |         }  | 
14332  | 0  |         assert(PyLong_Check(iobj));  | 
14333  | 0  |     }  | 
14334  | 28  |     else { | 
14335  | 28  |         iobj = v;  | 
14336  | 28  |         Py_INCREF(iobj);  | 
14337  | 28  |     }  | 
14338  |  |  | 
14339  | 28  |     if (PyLong_CheckExact(v)  | 
14340  | 28  |         && arg->width == -1 && arg->prec == -1  | 
14341  | 28  |         && !(arg->flags & (F_SIGN | F_BLANK))  | 
14342  | 28  |         && type != 'X')  | 
14343  | 28  |     { | 
14344  |  |         /* Fast path */  | 
14345  | 28  |         int alternate = arg->flags & F_ALT;  | 
14346  | 28  |         int base;  | 
14347  |  |  | 
14348  | 28  |         switch(type)  | 
14349  | 28  |         { | 
14350  | 0  |             default:  | 
14351  | 0  |                 Py_UNREACHABLE();  | 
14352  | 28  |             case 'd':  | 
14353  | 28  |             case 'i':  | 
14354  | 28  |             case 'u':  | 
14355  | 28  |                 base = 10;  | 
14356  | 28  |                 break;  | 
14357  | 0  |             case 'o':  | 
14358  | 0  |                 base = 8;  | 
14359  | 0  |                 break;  | 
14360  | 0  |             case 'x':  | 
14361  | 0  |             case 'X':  | 
14362  | 0  |                 base = 16;  | 
14363  | 0  |                 break;  | 
14364  | 28  |         }  | 
14365  |  |  | 
14366  | 28  |         if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) { | 
14367  | 0  |             Py_DECREF(iobj);  | 
14368  | 0  |             return -1;  | 
14369  | 0  |         }  | 
14370  | 28  |         Py_DECREF(iobj);  | 
14371  | 28  |         return 1;  | 
14372  | 28  |     }  | 
14373  |  |  | 
14374  | 0  |     res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);  | 
14375  | 0  |     Py_DECREF(iobj);  | 
14376  | 0  |     if (res == NULL)  | 
14377  | 0  |         return -1;  | 
14378  | 0  |     *p_output = res;  | 
14379  | 0  |     return 0;  | 
14380  |  |  | 
14381  | 0  | wrongtype:  | 
14382  | 0  |     switch(type)  | 
14383  | 0  |     { | 
14384  | 0  |         case 'o':  | 
14385  | 0  |         case 'x':  | 
14386  | 0  |         case 'X':  | 
14387  | 0  |             PyErr_Format(PyExc_TypeError,  | 
14388  | 0  |                     "%%%c format: an integer is required, "  | 
14389  | 0  |                     "not %.200s",  | 
14390  | 0  |                     type, Py_TYPE(v)->tp_name);  | 
14391  | 0  |             break;  | 
14392  | 0  |         default:  | 
14393  | 0  |             PyErr_Format(PyExc_TypeError,  | 
14394  | 0  |                     "%%%c format: a number is required, "  | 
14395  | 0  |                     "not %.200s",  | 
14396  | 0  |                     type, Py_TYPE(v)->tp_name);  | 
14397  | 0  |             break;  | 
14398  | 0  |     }  | 
14399  | 0  |     return -1;  | 
14400  | 0  | }  | 
14401  |  |  | 
14402  |  | static Py_UCS4  | 
14403  |  | formatchar(PyObject *v)  | 
14404  | 0  | { | 
14405  |  |     /* presume that the buffer is at least 3 characters long */  | 
14406  | 0  |     if (PyUnicode_Check(v)) { | 
14407  | 0  |         if (PyUnicode_GET_LENGTH(v) == 1) { | 
14408  | 0  |             return PyUnicode_READ_CHAR(v, 0);  | 
14409  | 0  |         }  | 
14410  | 0  |         goto onError;  | 
14411  | 0  |     }  | 
14412  | 0  |     else { | 
14413  | 0  |         PyObject *iobj;  | 
14414  | 0  |         long x;  | 
14415  |  |         /* make sure number is a type of integer */  | 
14416  | 0  |         if (!PyLong_Check(v)) { | 
14417  | 0  |             iobj = PyNumber_Index(v);  | 
14418  | 0  |             if (iobj == NULL) { | 
14419  | 0  |                 goto onError;  | 
14420  | 0  |             }  | 
14421  | 0  |             x = PyLong_AsLong(iobj);  | 
14422  | 0  |             Py_DECREF(iobj);  | 
14423  | 0  |         }  | 
14424  | 0  |         else { | 
14425  | 0  |             x = PyLong_AsLong(v);  | 
14426  | 0  |         }  | 
14427  | 0  |         if (x == -1 && PyErr_Occurred())  | 
14428  | 0  |             goto onError;  | 
14429  |  |  | 
14430  | 0  |         if (x < 0 || x > MAX_UNICODE) { | 
14431  | 0  |             PyErr_SetString(PyExc_OverflowError,  | 
14432  | 0  |                             "%c arg not in range(0x110000)");  | 
14433  | 0  |             return (Py_UCS4) -1;  | 
14434  | 0  |         }  | 
14435  |  |  | 
14436  | 0  |         return (Py_UCS4) x;  | 
14437  | 0  |     }  | 
14438  |  |  | 
14439  | 0  |   onError:  | 
14440  | 0  |     PyErr_SetString(PyExc_TypeError,  | 
14441  | 0  |                     "%c requires int or char");  | 
14442  | 0  |     return (Py_UCS4) -1;  | 
14443  | 0  | }  | 
14444  |  |  | 
14445  |  | /* Parse options of an argument: flags, width, precision.  | 
14446  |  |    Handle also "%(name)" syntax.  | 
14447  |  |  | 
14448  |  |    Return 0 if the argument has been formatted into arg->str.  | 
14449  |  |    Return 1 if the argument has been written into ctx->writer,  | 
14450  |  |    Raise an exception and return -1 on error. */  | 
14451  |  | static int  | 
14452  |  | unicode_format_arg_parse(struct unicode_formatter_t *ctx,  | 
14453  |  |                          struct unicode_format_arg_t *arg)  | 
14454  | 116  | { | 
14455  | 116  | #define FORMAT_READ(ctx) \  | 
14456  | 457  |         PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)  | 
14457  |  |  | 
14458  | 116  |     PyObject *v;  | 
14459  |  |  | 
14460  | 116  |     if (arg->ch == '(') { | 
14461  |  |         /* Get argument value from a dictionary. Example: "%(name)s". */  | 
14462  | 46  |         Py_ssize_t keystart;  | 
14463  | 46  |         Py_ssize_t keylen;  | 
14464  | 46  |         PyObject *key;  | 
14465  | 46  |         int pcount = 1;  | 
14466  |  |  | 
14467  | 46  |         if (ctx->dict == NULL) { | 
14468  | 0  |             PyErr_SetString(PyExc_TypeError,  | 
14469  | 0  |                             "format requires a mapping");  | 
14470  | 0  |             return -1;  | 
14471  | 0  |         }  | 
14472  | 46  |         ++ctx->fmtpos;  | 
14473  | 46  |         --ctx->fmtcnt;  | 
14474  | 46  |         keystart = ctx->fmtpos;  | 
14475  |  |         /* Skip over balanced parentheses */  | 
14476  | 387  |         while (pcount > 0 && --ctx->fmtcnt >= 0) { | 
14477  | 341  |             arg->ch = FORMAT_READ(ctx);  | 
14478  | 341  |             if (arg->ch == ')')  | 
14479  | 46  |                 --pcount;  | 
14480  | 295  |             else if (arg->ch == '(') | 
14481  | 0  |                 ++pcount;  | 
14482  | 341  |             ctx->fmtpos++;  | 
14483  | 341  |         }  | 
14484  | 46  |         keylen = ctx->fmtpos - keystart - 1;  | 
14485  | 46  |         if (ctx->fmtcnt < 0 || pcount > 0) { | 
14486  | 0  |             PyErr_SetString(PyExc_ValueError,  | 
14487  | 0  |                             "incomplete format key");  | 
14488  | 0  |             return -1;  | 
14489  | 0  |         }  | 
14490  | 46  |         key = PyUnicode_Substring(ctx->fmtstr,  | 
14491  | 46  |                                   keystart, keystart + keylen);  | 
14492  | 46  |         if (key == NULL)  | 
14493  | 0  |             return -1;  | 
14494  | 46  |         if (ctx->args_owned) { | 
14495  | 31  |             ctx->args_owned = 0;  | 
14496  | 31  |             Py_DECREF(ctx->args);  | 
14497  | 31  |         }  | 
14498  | 46  |         ctx->args = PyObject_GetItem(ctx->dict, key);  | 
14499  | 46  |         Py_DECREF(key);  | 
14500  | 46  |         if (ctx->args == NULL)  | 
14501  | 0  |             return -1;  | 
14502  | 46  |         ctx->args_owned = 1;  | 
14503  | 46  |         ctx->arglen = -1;  | 
14504  | 46  |         ctx->argidx = -2;  | 
14505  | 46  |     }  | 
14506  |  |  | 
14507  |  |     /* Parse flags. Example: "%+i" => flags=F_SIGN. */  | 
14508  | 116  |     while (--ctx->fmtcnt >= 0) { | 
14509  | 116  |         arg->ch = FORMAT_READ(ctx);  | 
14510  | 116  |         ctx->fmtpos++;  | 
14511  | 116  |         switch (arg->ch) { | 
14512  | 0  |         case '-': arg->flags |= F_LJUST; continue;  | 
14513  | 0  |         case '+': arg->flags |= F_SIGN; continue;  | 
14514  | 0  |         case ' ': arg->flags |= F_BLANK; continue;  | 
14515  | 0  |         case '#': arg->flags |= F_ALT; continue;  | 
14516  | 0  |         case '0': arg->flags |= F_ZERO; continue;  | 
14517  | 116  |         }  | 
14518  | 116  |         break;  | 
14519  | 116  |     }  | 
14520  |  |  | 
14521  |  |     /* Parse width. Example: "%10s" => width=10 */  | 
14522  | 116  |     if (arg->ch == '*') { | 
14523  | 0  |         v = unicode_format_getnextarg(ctx);  | 
14524  | 0  |         if (v == NULL)  | 
14525  | 0  |             return -1;  | 
14526  | 0  |         if (!PyLong_Check(v)) { | 
14527  | 0  |             PyErr_SetString(PyExc_TypeError,  | 
14528  | 0  |                             "* wants int");  | 
14529  | 0  |             return -1;  | 
14530  | 0  |         }  | 
14531  | 0  |         arg->width = PyLong_AsSsize_t(v);  | 
14532  | 0  |         if (arg->width == -1 && PyErr_Occurred())  | 
14533  | 0  |             return -1;  | 
14534  | 0  |         if (arg->width < 0) { | 
14535  | 0  |             arg->flags |= F_LJUST;  | 
14536  | 0  |             arg->width = -arg->width;  | 
14537  | 0  |         }  | 
14538  | 0  |         if (--ctx->fmtcnt >= 0) { | 
14539  | 0  |             arg->ch = FORMAT_READ(ctx);  | 
14540  | 0  |             ctx->fmtpos++;  | 
14541  | 0  |         }  | 
14542  | 0  |     }  | 
14543  | 116  |     else if (arg->ch >= '0' && arg->ch <= '9') { | 
14544  | 0  |         arg->width = arg->ch - '0';  | 
14545  | 0  |         while (--ctx->fmtcnt >= 0) { | 
14546  | 0  |             arg->ch = FORMAT_READ(ctx);  | 
14547  | 0  |             ctx->fmtpos++;  | 
14548  | 0  |             if (arg->ch < '0' || arg->ch > '9')  | 
14549  | 0  |                 break;  | 
14550  |  |             /* Since arg->ch is unsigned, the RHS would end up as unsigned,  | 
14551  |  |                mixing signed and unsigned comparison. Since arg->ch is between  | 
14552  |  |                '0' and '9', casting to int is safe. */  | 
14553  | 0  |             if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) { | 
14554  | 0  |                 PyErr_SetString(PyExc_ValueError,  | 
14555  | 0  |                                 "width too big");  | 
14556  | 0  |                 return -1;  | 
14557  | 0  |             }  | 
14558  | 0  |             arg->width = arg->width*10 + (arg->ch - '0');  | 
14559  | 0  |         }  | 
14560  | 0  |     }  | 
14561  |  |  | 
14562  |  |     /* Parse precision. Example: "%.3f" => prec=3 */  | 
14563  | 116  |     if (arg->ch == '.') { | 
14564  | 0  |         arg->prec = 0;  | 
14565  | 0  |         if (--ctx->fmtcnt >= 0) { | 
14566  | 0  |             arg->ch = FORMAT_READ(ctx);  | 
14567  | 0  |             ctx->fmtpos++;  | 
14568  | 0  |         }  | 
14569  | 0  |         if (arg->ch == '*') { | 
14570  | 0  |             v = unicode_format_getnextarg(ctx);  | 
14571  | 0  |             if (v == NULL)  | 
14572  | 0  |                 return -1;  | 
14573  | 0  |             if (!PyLong_Check(v)) { | 
14574  | 0  |                 PyErr_SetString(PyExc_TypeError,  | 
14575  | 0  |                                 "* wants int");  | 
14576  | 0  |                 return -1;  | 
14577  | 0  |             }  | 
14578  | 0  |             arg->prec = _PyLong_AsInt(v);  | 
14579  | 0  |             if (arg->prec == -1 && PyErr_Occurred())  | 
14580  | 0  |                 return -1;  | 
14581  | 0  |             if (arg->prec < 0)  | 
14582  | 0  |                 arg->prec = 0;  | 
14583  | 0  |             if (--ctx->fmtcnt >= 0) { | 
14584  | 0  |                 arg->ch = FORMAT_READ(ctx);  | 
14585  | 0  |                 ctx->fmtpos++;  | 
14586  | 0  |             }  | 
14587  | 0  |         }  | 
14588  | 0  |         else if (arg->ch >= '0' && arg->ch <= '9') { | 
14589  | 0  |             arg->prec = arg->ch - '0';  | 
14590  | 0  |             while (--ctx->fmtcnt >= 0) { | 
14591  | 0  |                 arg->ch = FORMAT_READ(ctx);  | 
14592  | 0  |                 ctx->fmtpos++;  | 
14593  | 0  |                 if (arg->ch < '0' || arg->ch > '9')  | 
14594  | 0  |                     break;  | 
14595  | 0  |                 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) { | 
14596  | 0  |                     PyErr_SetString(PyExc_ValueError,  | 
14597  | 0  |                                     "precision too big");  | 
14598  | 0  |                     return -1;  | 
14599  | 0  |                 }  | 
14600  | 0  |                 arg->prec = arg->prec*10 + (arg->ch - '0');  | 
14601  | 0  |             }  | 
14602  | 0  |         }  | 
14603  | 0  |     }  | 
14604  |  |  | 
14605  |  |     /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */  | 
14606  | 116  |     if (ctx->fmtcnt >= 0) { | 
14607  | 116  |         if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') { | 
14608  | 0  |             if (--ctx->fmtcnt >= 0) { | 
14609  | 0  |                 arg->ch = FORMAT_READ(ctx);  | 
14610  | 0  |                 ctx->fmtpos++;  | 
14611  | 0  |             }  | 
14612  | 0  |         }  | 
14613  | 116  |     }  | 
14614  | 116  |     if (ctx->fmtcnt < 0) { | 
14615  | 0  |         PyErr_SetString(PyExc_ValueError,  | 
14616  | 0  |                         "incomplete format");  | 
14617  | 0  |         return -1;  | 
14618  | 0  |     }  | 
14619  | 116  |     return 0;  | 
14620  |  |  | 
14621  | 116  | #undef FORMAT_READ  | 
14622  | 116  | }  | 
14623  |  |  | 
14624  |  | /* Format one argument. Supported conversion specifiers:  | 
14625  |  |  | 
14626  |  |    - "s", "r", "a": any type  | 
14627  |  |    - "i", "d", "u": int or float  | 
14628  |  |    - "o", "x", "X": int  | 
14629  |  |    - "e", "E", "f", "F", "g", "G": float  | 
14630  |  |    - "c": int or str (1 character)  | 
14631  |  |  | 
14632  |  |    When possible, the output is written directly into the Unicode writer  | 
14633  |  |    (ctx->writer). A string is created when padding is required.  | 
14634  |  |  | 
14635  |  |    Return 0 if the argument has been formatted into *p_str,  | 
14636  |  |           1 if the argument has been written into ctx->writer,  | 
14637  |  |          -1 on error. */  | 
14638  |  | static int  | 
14639  |  | unicode_format_arg_format(struct unicode_formatter_t *ctx,  | 
14640  |  |                           struct unicode_format_arg_t *arg,  | 
14641  |  |                           PyObject **p_str)  | 
14642  | 116  | { | 
14643  | 116  |     PyObject *v;  | 
14644  | 116  |     _PyUnicodeWriter *writer = &ctx->writer;  | 
14645  |  |  | 
14646  | 116  |     if (ctx->fmtcnt == 0)  | 
14647  | 28  |         ctx->writer.overallocate = 0;  | 
14648  |  |  | 
14649  | 116  |     v = unicode_format_getnextarg(ctx);  | 
14650  | 116  |     if (v == NULL)  | 
14651  | 0  |         return -1;  | 
14652  |  |  | 
14653  |  |  | 
14654  | 116  |     switch (arg->ch) { | 
14655  | 88  |     case 's':  | 
14656  | 88  |     case 'r':  | 
14657  | 88  |     case 'a':  | 
14658  | 88  |         if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) { | 
14659  |  |             /* Fast path */  | 
14660  | 0  |             if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)  | 
14661  | 0  |                 return -1;  | 
14662  | 0  |             return 1;  | 
14663  | 0  |         }  | 
14664  |  |  | 
14665  | 88  |         if (PyUnicode_CheckExact(v) && arg->ch == 's') { | 
14666  | 88  |             *p_str = v;  | 
14667  | 88  |             Py_INCREF(*p_str);  | 
14668  | 88  |         }  | 
14669  | 0  |         else { | 
14670  | 0  |             if (arg->ch == 's')  | 
14671  | 0  |                 *p_str = PyObject_Str(v);  | 
14672  | 0  |             else if (arg->ch == 'r')  | 
14673  | 0  |                 *p_str = PyObject_Repr(v);  | 
14674  | 0  |             else  | 
14675  | 0  |                 *p_str = PyObject_ASCII(v);  | 
14676  | 0  |         }  | 
14677  | 88  |         break;  | 
14678  |  |  | 
14679  | 0  |     case 'i':  | 
14680  | 28  |     case 'd':  | 
14681  | 28  |     case 'u':  | 
14682  | 28  |     case 'o':  | 
14683  | 28  |     case 'x':  | 
14684  | 28  |     case 'X':  | 
14685  | 28  |     { | 
14686  | 28  |         int ret = mainformatlong(v, arg, p_str, writer);  | 
14687  | 28  |         if (ret != 0)  | 
14688  | 28  |             return ret;  | 
14689  | 0  |         arg->sign = 1;  | 
14690  | 0  |         break;  | 
14691  | 28  |     }  | 
14692  |  |  | 
14693  | 0  |     case 'e':  | 
14694  | 0  |     case 'E':  | 
14695  | 0  |     case 'f':  | 
14696  | 0  |     case 'F':  | 
14697  | 0  |     case 'g':  | 
14698  | 0  |     case 'G':  | 
14699  | 0  |         if (arg->width == -1 && arg->prec == -1  | 
14700  | 0  |             && !(arg->flags & (F_SIGN | F_BLANK)))  | 
14701  | 0  |         { | 
14702  |  |             /* Fast path */  | 
14703  | 0  |             if (formatfloat(v, arg, NULL, writer) == -1)  | 
14704  | 0  |                 return -1;  | 
14705  | 0  |             return 1;  | 
14706  | 0  |         }  | 
14707  |  |  | 
14708  | 0  |         arg->sign = 1;  | 
14709  | 0  |         if (formatfloat(v, arg, p_str, NULL) == -1)  | 
14710  | 0  |             return -1;  | 
14711  | 0  |         break;  | 
14712  |  |  | 
14713  | 0  |     case 'c':  | 
14714  | 0  |     { | 
14715  | 0  |         Py_UCS4 ch = formatchar(v);  | 
14716  | 0  |         if (ch == (Py_UCS4) -1)  | 
14717  | 0  |             return -1;  | 
14718  | 0  |         if (arg->width == -1 && arg->prec == -1) { | 
14719  |  |             /* Fast path */  | 
14720  | 0  |             if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)  | 
14721  | 0  |                 return -1;  | 
14722  | 0  |             return 1;  | 
14723  | 0  |         }  | 
14724  | 0  |         *p_str = PyUnicode_FromOrdinal(ch);  | 
14725  | 0  |         break;  | 
14726  | 0  |     }  | 
14727  |  |  | 
14728  | 0  |     default:  | 
14729  | 0  |         PyErr_Format(PyExc_ValueError,  | 
14730  | 0  |                      "unsupported format character '%c' (0x%x) "  | 
14731  | 0  |                      "at index %zd",  | 
14732  | 0  |                      (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',  | 
14733  | 0  |                      (int)arg->ch,  | 
14734  | 0  |                      ctx->fmtpos - 1);  | 
14735  | 0  |         return -1;  | 
14736  | 116  |     }  | 
14737  | 88  |     if (*p_str == NULL)  | 
14738  | 0  |         return -1;  | 
14739  | 88  |     assert (PyUnicode_Check(*p_str));  | 
14740  | 88  |     return 0;  | 
14741  | 88  | }  | 
14742  |  |  | 
14743  |  | static int  | 
14744  |  | unicode_format_arg_output(struct unicode_formatter_t *ctx,  | 
14745  |  |                           struct unicode_format_arg_t *arg,  | 
14746  |  |                           PyObject *str)  | 
14747  | 88  | { | 
14748  | 88  |     Py_ssize_t len;  | 
14749  | 88  |     enum PyUnicode_Kind kind;  | 
14750  | 88  |     void *pbuf;  | 
14751  | 88  |     Py_ssize_t pindex;  | 
14752  | 88  |     Py_UCS4 signchar;  | 
14753  | 88  |     Py_ssize_t buflen;  | 
14754  | 88  |     Py_UCS4 maxchar;  | 
14755  | 88  |     Py_ssize_t sublen;  | 
14756  | 88  |     _PyUnicodeWriter *writer = &ctx->writer;  | 
14757  | 88  |     Py_UCS4 fill;  | 
14758  |  |  | 
14759  | 88  |     fill = ' ';  | 
14760  | 88  |     if (arg->sign && arg->flags & F_ZERO)  | 
14761  | 0  |         fill = '0';  | 
14762  |  |  | 
14763  | 88  |     if (PyUnicode_READY(str) == -1)  | 
14764  | 0  |         return -1;  | 
14765  |  |  | 
14766  | 88  |     len = PyUnicode_GET_LENGTH(str);  | 
14767  | 88  |     if ((arg->width == -1 || arg->width <= len)  | 
14768  | 88  |         && (arg->prec == -1 || arg->prec >= len)  | 
14769  | 88  |         && !(arg->flags & (F_SIGN | F_BLANK)))  | 
14770  | 88  |     { | 
14771  |  |         /* Fast path */  | 
14772  | 88  |         if (_PyUnicodeWriter_WriteStr(writer, str) == -1)  | 
14773  | 0  |             return -1;  | 
14774  | 88  |         return 0;  | 
14775  | 88  |     }  | 
14776  |  |  | 
14777  |  |     /* Truncate the string for "s", "r" and "a" formats  | 
14778  |  |        if the precision is set */  | 
14779  | 0  |     if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') { | 
14780  | 0  |         if (arg->prec >= 0 && len > arg->prec)  | 
14781  | 0  |             len = arg->prec;  | 
14782  | 0  |     }  | 
14783  |  |  | 
14784  |  |     /* Adjust sign and width */  | 
14785  | 0  |     kind = PyUnicode_KIND(str);  | 
14786  | 0  |     pbuf = PyUnicode_DATA(str);  | 
14787  | 0  |     pindex = 0;  | 
14788  | 0  |     signchar = '\0';  | 
14789  | 0  |     if (arg->sign) { | 
14790  | 0  |         Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);  | 
14791  | 0  |         if (ch == '-' || ch == '+') { | 
14792  | 0  |             signchar = ch;  | 
14793  | 0  |             len--;  | 
14794  | 0  |             pindex++;  | 
14795  | 0  |         }  | 
14796  | 0  |         else if (arg->flags & F_SIGN)  | 
14797  | 0  |             signchar = '+';  | 
14798  | 0  |         else if (arg->flags & F_BLANK)  | 
14799  | 0  |             signchar = ' ';  | 
14800  | 0  |         else  | 
14801  | 0  |             arg->sign = 0;  | 
14802  | 0  |     }  | 
14803  | 0  |     if (arg->width < len)  | 
14804  | 0  |         arg->width = len;  | 
14805  |  |  | 
14806  |  |     /* Prepare the writer */  | 
14807  | 0  |     maxchar = writer->maxchar;  | 
14808  | 0  |     if (!(arg->flags & F_LJUST)) { | 
14809  | 0  |         if (arg->sign) { | 
14810  | 0  |             if ((arg->width-1) > len)  | 
14811  | 0  |                 maxchar = Py_MAX(maxchar, fill);  | 
14812  | 0  |         }  | 
14813  | 0  |         else { | 
14814  | 0  |             if (arg->width > len)  | 
14815  | 0  |                 maxchar = Py_MAX(maxchar, fill);  | 
14816  | 0  |         }  | 
14817  | 0  |     }  | 
14818  | 0  |     if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) { | 
14819  | 0  |         Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);  | 
14820  | 0  |         maxchar = Py_MAX(maxchar, strmaxchar);  | 
14821  | 0  |     }  | 
14822  |  | 
  | 
14823  | 0  |     buflen = arg->width;  | 
14824  | 0  |     if (arg->sign && len == arg->width)  | 
14825  | 0  |         buflen++;  | 
14826  | 0  |     if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)  | 
14827  | 0  |         return -1;  | 
14828  |  |  | 
14829  |  |     /* Write the sign if needed */  | 
14830  | 0  |     if (arg->sign) { | 
14831  | 0  |         if (fill != ' ') { | 
14832  | 0  |             PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);  | 
14833  | 0  |             writer->pos += 1;  | 
14834  | 0  |         }  | 
14835  | 0  |         if (arg->width > len)  | 
14836  | 0  |             arg->width--;  | 
14837  | 0  |     }  | 
14838  |  |  | 
14839  |  |     /* Write the numeric prefix for "x", "X" and "o" formats  | 
14840  |  |        if the alternate form is used.  | 
14841  |  |        For example, write "0x" for the "%#x" format. */  | 
14842  | 0  |     if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) { | 
14843  | 0  |         assert(PyUnicode_READ(kind, pbuf, pindex) == '0');  | 
14844  | 0  |         assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);  | 
14845  | 0  |         if (fill != ' ') { | 
14846  | 0  |             PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');  | 
14847  | 0  |             PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);  | 
14848  | 0  |             writer->pos += 2;  | 
14849  | 0  |             pindex += 2;  | 
14850  | 0  |         }  | 
14851  | 0  |         arg->width -= 2;  | 
14852  | 0  |         if (arg->width < 0)  | 
14853  | 0  |             arg->width = 0;  | 
14854  | 0  |         len -= 2;  | 
14855  | 0  |     }  | 
14856  |  |  | 
14857  |  |     /* Pad left with the fill character if needed */  | 
14858  | 0  |     if (arg->width > len && !(arg->flags & F_LJUST)) { | 
14859  | 0  |         sublen = arg->width - len;  | 
14860  | 0  |         unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);  | 
14861  | 0  |         writer->pos += sublen;  | 
14862  | 0  |         arg->width = len;  | 
14863  | 0  |     }  | 
14864  |  |  | 
14865  |  |     /* If padding with spaces: write sign if needed and/or numeric prefix if  | 
14866  |  |        the alternate form is used */  | 
14867  | 0  |     if (fill == ' ') { | 
14868  | 0  |         if (arg->sign) { | 
14869  | 0  |             PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);  | 
14870  | 0  |             writer->pos += 1;  | 
14871  | 0  |         }  | 
14872  | 0  |         if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) { | 
14873  | 0  |             assert(PyUnicode_READ(kind, pbuf, pindex) == '0');  | 
14874  | 0  |             assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);  | 
14875  | 0  |             PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');  | 
14876  | 0  |             PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);  | 
14877  | 0  |             writer->pos += 2;  | 
14878  | 0  |             pindex += 2;  | 
14879  | 0  |         }  | 
14880  | 0  |     }  | 
14881  |  |  | 
14882  |  |     /* Write characters */  | 
14883  | 0  |     if (len) { | 
14884  | 0  |         _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,  | 
14885  | 0  |                                       str, pindex, len);  | 
14886  | 0  |         writer->pos += len;  | 
14887  | 0  |     }  | 
14888  |  |  | 
14889  |  |     /* Pad right with the fill character if needed */  | 
14890  | 0  |     if (arg->width > len) { | 
14891  | 0  |         sublen = arg->width - len;  | 
14892  | 0  |         unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);  | 
14893  | 0  |         writer->pos += sublen;  | 
14894  | 0  |     }  | 
14895  | 0  |     return 0;  | 
14896  | 0  | }  | 
14897  |  |  | 
14898  |  | /* Helper of PyUnicode_Format(): format one arg.  | 
14899  |  |    Return 0 on success, raise an exception and return -1 on error. */  | 
14900  |  | static int  | 
14901  |  | unicode_format_arg(struct unicode_formatter_t *ctx)  | 
14902  | 116  | { | 
14903  | 116  |     struct unicode_format_arg_t arg;  | 
14904  | 116  |     PyObject *str;  | 
14905  | 116  |     int ret;  | 
14906  |  |  | 
14907  | 116  |     arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);  | 
14908  | 116  |     if (arg.ch == '%') { | 
14909  | 0  |         ctx->fmtpos++;  | 
14910  | 0  |         ctx->fmtcnt--;  | 
14911  | 0  |         if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)  | 
14912  | 0  |             return -1;  | 
14913  | 0  |         return 0;  | 
14914  | 0  |     }  | 
14915  | 116  |     arg.flags = 0;  | 
14916  | 116  |     arg.width = -1;  | 
14917  | 116  |     arg.prec = -1;  | 
14918  | 116  |     arg.sign = 0;  | 
14919  | 116  |     str = NULL;  | 
14920  |  |  | 
14921  | 116  |     ret = unicode_format_arg_parse(ctx, &arg);  | 
14922  | 116  |     if (ret == -1)  | 
14923  | 0  |         return -1;  | 
14924  |  |  | 
14925  | 116  |     ret = unicode_format_arg_format(ctx, &arg, &str);  | 
14926  | 116  |     if (ret == -1)  | 
14927  | 0  |         return -1;  | 
14928  |  |  | 
14929  | 116  |     if (ret != 1) { | 
14930  | 88  |         ret = unicode_format_arg_output(ctx, &arg, str);  | 
14931  | 88  |         Py_DECREF(str);  | 
14932  | 88  |         if (ret == -1)  | 
14933  | 0  |             return -1;  | 
14934  | 88  |     }  | 
14935  |  |  | 
14936  | 116  |     if (ctx->dict && (ctx->argidx < ctx->arglen)) { | 
14937  | 0  |         PyErr_SetString(PyExc_TypeError,  | 
14938  | 0  |                         "not all arguments converted during string formatting");  | 
14939  | 0  |         return -1;  | 
14940  | 0  |     }  | 
14941  | 116  |     return 0;  | 
14942  | 116  | }  | 
14943  |  |  | 
14944  |  | PyObject *  | 
14945  |  | PyUnicode_Format(PyObject *format, PyObject *args)  | 
14946  | 57  | { | 
14947  | 57  |     struct unicode_formatter_t ctx;  | 
14948  |  |  | 
14949  | 57  |     if (format == NULL || args == NULL) { | 
14950  | 0  |         PyErr_BadInternalCall();  | 
14951  | 0  |         return NULL;  | 
14952  | 0  |     }  | 
14953  |  |  | 
14954  | 57  |     if (ensure_unicode(format) < 0)  | 
14955  | 0  |         return NULL;  | 
14956  |  |  | 
14957  | 57  |     ctx.fmtstr = format;  | 
14958  | 57  |     ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);  | 
14959  | 57  |     ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);  | 
14960  | 57  |     ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);  | 
14961  | 57  |     ctx.fmtpos = 0;  | 
14962  |  |  | 
14963  | 57  |     _PyUnicodeWriter_Init(&ctx.writer);  | 
14964  | 57  |     ctx.writer.min_length = ctx.fmtcnt + 100;  | 
14965  | 57  |     ctx.writer.overallocate = 1;  | 
14966  |  |  | 
14967  | 57  |     if (PyTuple_Check(args)) { | 
14968  | 42  |         ctx.arglen = PyTuple_Size(args);  | 
14969  | 42  |         ctx.argidx = 0;  | 
14970  | 42  |     }  | 
14971  | 15  |     else { | 
14972  | 15  |         ctx.arglen = -1;  | 
14973  | 15  |         ctx.argidx = -2;  | 
14974  | 15  |     }  | 
14975  | 57  |     ctx.args_owned = 0;  | 
14976  | 57  |     if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))  | 
14977  | 15  |         ctx.dict = args;  | 
14978  | 42  |     else  | 
14979  | 42  |         ctx.dict = NULL;  | 
14980  | 57  |     ctx.args = args;  | 
14981  |  |  | 
14982  | 290  |     while (--ctx.fmtcnt >= 0) { | 
14983  | 233  |         if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') { | 
14984  | 117  |             Py_ssize_t nonfmtpos;  | 
14985  |  |  | 
14986  | 117  |             nonfmtpos = ctx.fmtpos++;  | 
14987  | 875  |             while (ctx.fmtcnt >= 0 &&  | 
14988  | 846  |                    PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') { | 
14989  | 758  |                 ctx.fmtpos++;  | 
14990  | 758  |                 ctx.fmtcnt--;  | 
14991  | 758  |             }  | 
14992  | 117  |             if (ctx.fmtcnt < 0) { | 
14993  | 29  |                 ctx.fmtpos--;  | 
14994  | 29  |                 ctx.writer.overallocate = 0;  | 
14995  | 29  |             }  | 
14996  |  |  | 
14997  | 117  |             if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,  | 
14998  | 117  |                                                 nonfmtpos, ctx.fmtpos) < 0)  | 
14999  | 0  |                 goto onError;  | 
15000  | 117  |         }  | 
15001  | 116  |         else { | 
15002  | 116  |             ctx.fmtpos++;  | 
15003  | 116  |             if (unicode_format_arg(&ctx) == -1)  | 
15004  | 0  |                 goto onError;  | 
15005  | 116  |         }  | 
15006  | 233  |     }  | 
15007  |  |  | 
15008  | 57  |     if (ctx.argidx < ctx.arglen && !ctx.dict) { | 
15009  | 0  |         PyErr_SetString(PyExc_TypeError,  | 
15010  | 0  |                         "not all arguments converted during string formatting");  | 
15011  | 0  |         goto onError;  | 
15012  | 0  |     }  | 
15013  |  |  | 
15014  | 57  |     if (ctx.args_owned) { | 
15015  | 15  |         Py_DECREF(ctx.args);  | 
15016  | 15  |     }  | 
15017  | 57  |     return _PyUnicodeWriter_Finish(&ctx.writer);  | 
15018  |  |  | 
15019  | 0  |   onError:  | 
15020  | 0  |     _PyUnicodeWriter_Dealloc(&ctx.writer);  | 
15021  | 0  |     if (ctx.args_owned) { | 
15022  | 0  |         Py_DECREF(ctx.args);  | 
15023  | 0  |     }  | 
15024  | 0  |     return NULL;  | 
15025  | 57  | }  | 
15026  |  |  | 
15027  |  | static PyObject *  | 
15028  |  | unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);  | 
15029  |  |  | 
15030  |  | static PyObject *  | 
15031  |  | unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)  | 
15032  | 760  | { | 
15033  | 760  |     PyObject *x = NULL;  | 
15034  | 760  |     static char *kwlist[] = {"object", "encoding", "errors", 0}; | 
15035  | 760  |     char *encoding = NULL;  | 
15036  | 760  |     char *errors = NULL;  | 
15037  |  |  | 
15038  | 760  |     if (type != &PyUnicode_Type)  | 
15039  | 0  |         return unicode_subtype_new(type, args, kwds);  | 
15040  | 760  |     if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",  | 
15041  | 760  |                                      kwlist, &x, &encoding, &errors))  | 
15042  | 0  |         return NULL;  | 
15043  | 760  |     if (x == NULL)  | 
15044  | 0  |         _Py_RETURN_UNICODE_EMPTY();  | 
15045  | 760  |     if (encoding == NULL && errors == NULL)  | 
15046  | 759  |         return PyObject_Str(x);  | 
15047  | 1  |     else  | 
15048  | 1  |         return PyUnicode_FromEncodedObject(x, encoding, errors);  | 
15049  | 760  | }  | 
15050  |  |  | 
15051  |  | static PyObject *  | 
15052  |  | unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)  | 
15053  | 0  | { | 
15054  | 0  |     PyObject *unicode, *self;  | 
15055  | 0  |     Py_ssize_t length, char_size;  | 
15056  | 0  |     int share_wstr, share_utf8;  | 
15057  | 0  |     unsigned int kind;  | 
15058  | 0  |     void *data;  | 
15059  |  | 
  | 
15060  | 0  |     assert(PyType_IsSubtype(type, &PyUnicode_Type));  | 
15061  |  | 
  | 
15062  | 0  |     unicode = unicode_new(&PyUnicode_Type, args, kwds);  | 
15063  | 0  |     if (unicode == NULL)  | 
15064  | 0  |         return NULL;  | 
15065  | 0  |     assert(_PyUnicode_CHECK(unicode));  | 
15066  | 0  |     if (PyUnicode_READY(unicode) == -1) { | 
15067  | 0  |         Py_DECREF(unicode);  | 
15068  | 0  |         return NULL;  | 
15069  | 0  |     }  | 
15070  |  |  | 
15071  | 0  |     self = type->tp_alloc(type, 0);  | 
15072  | 0  |     if (self == NULL) { | 
15073  | 0  |         Py_DECREF(unicode);  | 
15074  | 0  |         return NULL;  | 
15075  | 0  |     }  | 
15076  | 0  |     kind = PyUnicode_KIND(unicode);  | 
15077  | 0  |     length = PyUnicode_GET_LENGTH(unicode);  | 
15078  |  | 
  | 
15079  | 0  |     _PyUnicode_LENGTH(self) = length;  | 
15080  |  | #ifdef Py_DEBUG  | 
15081  |  |     _PyUnicode_HASH(self) = -1;  | 
15082  |  | #else  | 
15083  | 0  |     _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);  | 
15084  | 0  | #endif  | 
15085  | 0  |     _PyUnicode_STATE(self).interned = 0;  | 
15086  | 0  |     _PyUnicode_STATE(self).kind = kind;  | 
15087  | 0  |     _PyUnicode_STATE(self).compact = 0;  | 
15088  | 0  |     _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;  | 
15089  | 0  |     _PyUnicode_STATE(self).ready = 1;  | 
15090  | 0  |     _PyUnicode_WSTR(self) = NULL;  | 
15091  | 0  |     _PyUnicode_UTF8_LENGTH(self) = 0;  | 
15092  | 0  |     _PyUnicode_UTF8(self) = NULL;  | 
15093  | 0  |     _PyUnicode_WSTR_LENGTH(self) = 0;  | 
15094  | 0  |     _PyUnicode_DATA_ANY(self) = NULL;  | 
15095  |  | 
  | 
15096  | 0  |     share_utf8 = 0;  | 
15097  | 0  |     share_wstr = 0;  | 
15098  | 0  |     if (kind == PyUnicode_1BYTE_KIND) { | 
15099  | 0  |         char_size = 1;  | 
15100  | 0  |         if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)  | 
15101  | 0  |             share_utf8 = 1;  | 
15102  | 0  |     }  | 
15103  | 0  |     else if (kind == PyUnicode_2BYTE_KIND) { | 
15104  | 0  |         char_size = 2;  | 
15105  | 0  |         if (sizeof(wchar_t) == 2)  | 
15106  | 0  |             share_wstr = 1;  | 
15107  | 0  |     }  | 
15108  | 0  |     else { | 
15109  | 0  |         assert(kind == PyUnicode_4BYTE_KIND);  | 
15110  | 0  |         char_size = 4;  | 
15111  | 0  |         if (sizeof(wchar_t) == 4)  | 
15112  | 0  |             share_wstr = 1;  | 
15113  | 0  |     }  | 
15114  |  |  | 
15115  |  |     /* Ensure we won't overflow the length. */  | 
15116  | 0  |     if (length > (PY_SSIZE_T_MAX / char_size - 1)) { | 
15117  | 0  |         PyErr_NoMemory();  | 
15118  | 0  |         goto onError;  | 
15119  | 0  |     }  | 
15120  | 0  |     data = PyObject_MALLOC((length + 1) * char_size);  | 
15121  | 0  |     if (data == NULL) { | 
15122  | 0  |         PyErr_NoMemory();  | 
15123  | 0  |         goto onError;  | 
15124  | 0  |     }  | 
15125  |  |  | 
15126  | 0  |     _PyUnicode_DATA_ANY(self) = data;  | 
15127  | 0  |     if (share_utf8) { | 
15128  | 0  |         _PyUnicode_UTF8_LENGTH(self) = length;  | 
15129  | 0  |         _PyUnicode_UTF8(self) = data;  | 
15130  | 0  |     }  | 
15131  | 0  |     if (share_wstr) { | 
15132  | 0  |         _PyUnicode_WSTR_LENGTH(self) = length;  | 
15133  | 0  |         _PyUnicode_WSTR(self) = (wchar_t *)data;  | 
15134  | 0  |     }  | 
15135  |  | 
  | 
15136  | 0  |     memcpy(data, PyUnicode_DATA(unicode),  | 
15137  | 0  |               kind * (length + 1));  | 
15138  | 0  |     assert(_PyUnicode_CheckConsistency(self, 1));  | 
15139  |  | #ifdef Py_DEBUG  | 
15140  |  |     _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);  | 
15141  |  | #endif  | 
15142  | 0  |     Py_DECREF(unicode);  | 
15143  | 0  |     return self;  | 
15144  |  |  | 
15145  | 0  | onError:  | 
15146  | 0  |     Py_DECREF(unicode);  | 
15147  | 0  |     Py_DECREF(self);  | 
15148  | 0  |     return NULL;  | 
15149  | 0  | }  | 
15150  |  |  | 
15151  |  | PyDoc_STRVAR(unicode_doc,  | 
15152  |  | "str(object='') -> str\n\  | 
15153  |  | str(bytes_or_buffer[, encoding[, errors]]) -> str\n\  | 
15154  |  | \n\  | 
15155  |  | Create a new string object from the given object. If encoding or\n\  | 
15156  |  | errors is specified, then the object must expose a data buffer\n\  | 
15157  |  | that will be decoded using the given encoding and error handler.\n\  | 
15158  |  | Otherwise, returns the result of object.__str__() (if defined)\n\  | 
15159  |  | or repr(object).\n\  | 
15160  |  | encoding defaults to sys.getdefaultencoding().\n\  | 
15161  |  | errors defaults to 'strict'.");  | 
15162  |  |  | 
15163  |  | static PyObject *unicode_iter(PyObject *seq);  | 
15164  |  |  | 
15165  |  | PyTypeObject PyUnicode_Type = { | 
15166  |  |     PyVarObject_HEAD_INIT(&PyType_Type, 0)  | 
15167  |  |     "str",                        /* tp_name */  | 
15168  |  |     sizeof(PyUnicodeObject),      /* tp_basicsize */  | 
15169  |  |     0,                            /* tp_itemsize */  | 
15170  |  |     /* Slots */  | 
15171  |  |     (destructor)unicode_dealloc,  /* tp_dealloc */  | 
15172  |  |     0,                            /* tp_vectorcall_offset */  | 
15173  |  |     0,                            /* tp_getattr */  | 
15174  |  |     0,                            /* tp_setattr */  | 
15175  |  |     0,                            /* tp_as_async */  | 
15176  |  |     unicode_repr,                 /* tp_repr */  | 
15177  |  |     &unicode_as_number,           /* tp_as_number */  | 
15178  |  |     &unicode_as_sequence,         /* tp_as_sequence */  | 
15179  |  |     &unicode_as_mapping,          /* tp_as_mapping */  | 
15180  |  |     (hashfunc) unicode_hash,      /* tp_hash*/  | 
15181  |  |     0,                            /* tp_call*/  | 
15182  |  |     (reprfunc) unicode_str,       /* tp_str */  | 
15183  |  |     PyObject_GenericGetAttr,      /* tp_getattro */  | 
15184  |  |     0,                            /* tp_setattro */  | 
15185  |  |     0,                            /* tp_as_buffer */  | 
15186  |  |     Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |  | 
15187  |  |     Py_TPFLAGS_UNICODE_SUBCLASS,   /* tp_flags */  | 
15188  |  |     unicode_doc,                  /* tp_doc */  | 
15189  |  |     0,                            /* tp_traverse */  | 
15190  |  |     0,                            /* tp_clear */  | 
15191  |  |     PyUnicode_RichCompare,        /* tp_richcompare */  | 
15192  |  |     0,                            /* tp_weaklistoffset */  | 
15193  |  |     unicode_iter,                 /* tp_iter */  | 
15194  |  |     0,                            /* tp_iternext */  | 
15195  |  |     unicode_methods,              /* tp_methods */  | 
15196  |  |     0,                            /* tp_members */  | 
15197  |  |     0,                            /* tp_getset */  | 
15198  |  |     &PyBaseObject_Type,           /* tp_base */  | 
15199  |  |     0,                            /* tp_dict */  | 
15200  |  |     0,                            /* tp_descr_get */  | 
15201  |  |     0,                            /* tp_descr_set */  | 
15202  |  |     0,                            /* tp_dictoffset */  | 
15203  |  |     0,                            /* tp_init */  | 
15204  |  |     0,                            /* tp_alloc */  | 
15205  |  |     unicode_new,                  /* tp_new */  | 
15206  |  |     PyObject_Del,                 /* tp_free */  | 
15207  |  | };  | 
15208  |  |  | 
15209  |  | /* Initialize the Unicode implementation */  | 
15210  |  |  | 
15211  |  | PyStatus  | 
15212  |  | _PyUnicode_Init(void)  | 
15213  | 14  | { | 
15214  |  |     /* XXX - move this array to unicodectype.c ? */  | 
15215  | 14  |     Py_UCS2 linebreak[] = { | 
15216  | 14  |         0x000A, /* LINE FEED */  | 
15217  | 14  |         0x000D, /* CARRIAGE RETURN */  | 
15218  | 14  |         0x001C, /* FILE SEPARATOR */  | 
15219  | 14  |         0x001D, /* GROUP SEPARATOR */  | 
15220  | 14  |         0x001E, /* RECORD SEPARATOR */  | 
15221  | 14  |         0x0085, /* NEXT LINE */  | 
15222  | 14  |         0x2028, /* LINE SEPARATOR */  | 
15223  | 14  |         0x2029, /* PARAGRAPH SEPARATOR */  | 
15224  | 14  |     };  | 
15225  |  |  | 
15226  |  |     /* Init the implementation */  | 
15227  | 14  |     _Py_INCREF_UNICODE_EMPTY();  | 
15228  | 14  |     if (!unicode_empty) { | 
15229  | 0  |         return _PyStatus_ERR("Can't create empty string"); | 
15230  | 0  |     }  | 
15231  | 14  |     Py_DECREF(unicode_empty);  | 
15232  |  |  | 
15233  | 14  |     if (PyType_Ready(&PyUnicode_Type) < 0) { | 
15234  | 0  |         return _PyStatus_ERR("Can't initialize unicode type"); | 
15235  | 0  |     }  | 
15236  |  |  | 
15237  |  |     /* initialize the linebreak bloom filter */  | 
15238  | 14  |     bloom_linebreak = make_bloom_mask(  | 
15239  | 14  |         PyUnicode_2BYTE_KIND, linebreak,  | 
15240  | 14  |         Py_ARRAY_LENGTH(linebreak));  | 
15241  |  |  | 
15242  | 14  |     if (PyType_Ready(&EncodingMapType) < 0) { | 
15243  | 0  |          return _PyStatus_ERR("Can't initialize encoding map type"); | 
15244  | 0  |     }  | 
15245  | 14  |     if (PyType_Ready(&PyFieldNameIter_Type) < 0) { | 
15246  | 0  |         return _PyStatus_ERR("Can't initialize field name iterator type"); | 
15247  | 0  |     }  | 
15248  | 14  |     if (PyType_Ready(&PyFormatterIter_Type) < 0) { | 
15249  | 0  |         return _PyStatus_ERR("Can't initialize formatter iter type"); | 
15250  | 0  |     }  | 
15251  | 14  |     return _PyStatus_OK();  | 
15252  | 14  | }  | 
15253  |  |  | 
15254  |  | /* Finalize the Unicode implementation */  | 
15255  |  |  | 
/* Nothing is released by this function; it always reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int released = 0;
    return released;
}
15261  |  |  | 
15262  |  |  | 
15263  |  | void  | 
15264  |  | PyUnicode_InternInPlace(PyObject **p)  | 
15265  | 232k  | { | 
15266  | 232k  |     PyObject *s = *p;  | 
15267  | 232k  |     PyObject *t;  | 
15268  |  | #ifdef Py_DEBUG  | 
15269  |  |     assert(s != NULL);  | 
15270  |  |     assert(_PyUnicode_CHECK(s));  | 
15271  |  | #else  | 
15272  | 232k  |     if (s == NULL || !PyUnicode_Check(s))  | 
15273  | 0  |         return;  | 
15274  | 232k  | #endif  | 
15275  |  |     /* If it's a subclass, we don't really know what putting  | 
15276  |  |        it in the interned dict might do. */  | 
15277  | 232k  |     if (!PyUnicode_CheckExact(s))  | 
15278  | 0  |         return;  | 
15279  | 232k  |     if (PyUnicode_CHECK_INTERNED(s))  | 
15280  | 139k  |         return;  | 
15281  | 92.9k  |     if (interned == NULL) { | 
15282  | 14  |         interned = PyDict_New();  | 
15283  | 14  |         if (interned == NULL) { | 
15284  | 0  |             PyErr_Clear(); /* Don't leave an exception */  | 
15285  | 0  |             return;  | 
15286  | 0  |         }  | 
15287  | 14  |     }  | 
15288  | 92.9k  |     Py_ALLOW_RECURSION  | 
15289  | 92.9k  |     t = PyDict_SetDefault(interned, s, s);  | 
15290  | 92.9k  |     Py_END_ALLOW_RECURSION  | 
15291  | 92.9k  |     if (t == NULL) { | 
15292  | 0  |         PyErr_Clear();  | 
15293  | 0  |         return;  | 
15294  | 0  |     }  | 
15295  | 92.9k  |     if (t != s) { | 
15296  | 49.9k  |         Py_INCREF(t);  | 
15297  | 49.9k  |         Py_SETREF(*p, t);  | 
15298  | 49.9k  |         return;  | 
15299  | 49.9k  |     }  | 
15300  |  |     /* The two references in interned are not counted by refcnt.  | 
15301  |  |        The deallocator will take care of this */  | 
15302  | 43.0k  |     Py_REFCNT(s) -= 2;  | 
15303  | 43.0k  |     _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;  | 
15304  | 43.0k  | }  | 
15305  |  |  | 
15306  |  | void  | 
15307  |  | PyUnicode_InternImmortal(PyObject **p)  | 
15308  | 0  | { | 
15309  | 0  |     PyUnicode_InternInPlace(p);  | 
15310  | 0  |     if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) { | 
15311  | 0  |         _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;  | 
15312  | 0  |         Py_INCREF(*p);  | 
15313  | 0  |     }  | 
15314  | 0  | }  | 
15315  |  |  | 
15316  |  | PyObject *  | 
15317  |  | PyUnicode_InternFromString(const char *cp)  | 
15318  | 36.0k  | { | 
15319  | 36.0k  |     PyObject *s = PyUnicode_FromString(cp);  | 
15320  | 36.0k  |     if (s == NULL)  | 
15321  | 0  |         return NULL;  | 
15322  | 36.0k  |     PyUnicode_InternInPlace(&s);  | 
15323  | 36.0k  |     return s;  | 
15324  | 36.0k  | }  | 
15325  |  |  | 
15326  |  |  | 
#if defined(WITH_VALGRIND) || defined(__INSURE__)
/* Undo interning for every key of the interned dict, then clear and drop the
   dict.  Meant as an aid for leak detectors: strings are not deallocated
   forcibly, they merely get back the references that interning left
   uncounted (one for immortal strings, two for mortal ones) and are flagged
   as not interned. */
static void
unicode_release_interned(void)
{
    Py_ssize_t immortal_size = 0, mortal_size = 0;

    if (interned == NULL || !PyDict_Check(interned)) {
        return;
    }

    PyObject *keys = PyDict_Keys(interned);
    if (keys == NULL || !PyList_Check(keys)) {
        PyErr_Clear();
        return;
    }

    const Py_ssize_t n = PyList_GET_SIZE(keys);
#ifdef INTERNED_STATS
    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
            n);
#endif

    for (Py_ssize_t i = 0; i < n; i++) {
        PyObject *s = PyList_GET_ITEM(keys, i);

        if (PyUnicode_READY(s) == -1) {
            Py_UNREACHABLE();
        }

        switch (PyUnicode_CHECK_INTERNED(s)) {
        case SSTATE_INTERNED_MORTAL:
            /* give back the two uncounted references */
            Py_REFCNT(s) += 2;
            mortal_size += PyUnicode_GET_LENGTH(s);
            break;
        case SSTATE_INTERNED_IMMORTAL:
            /* give back the single uncounted reference */
            Py_REFCNT(s) += 1;
            immortal_size += PyUnicode_GET_LENGTH(s);
            break;
        case SSTATE_NOT_INTERNED:
            /* XXX Shouldn't happen */
            break;
        default:
            Py_FatalError("Inconsistent interned string state.");
        }
        _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
    }

#ifdef INTERNED_STATS
    fprintf(stderr, "total size of all interned strings: "
            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
            "mortal/immortal\n", mortal_size, immortal_size);
#endif

    Py_DECREF(keys);
    PyDict_Clear(interned);
    Py_CLEAR(interned);
}
#endif
15386  |  |  | 
15387  |  |  | 
15388  |  | /********************* Unicode Iterator **************************/  | 
15389  |  |  | 
15390  |  | typedef struct { | 
15391  |  |     PyObject_HEAD  | 
15392  |  |     Py_ssize_t it_index;  | 
15393  |  |     PyObject *it_seq;    /* Set to NULL when iterator is exhausted */  | 
15394  |  | } unicodeiterobject;  | 
15395  |  |  | 
15396  |  | static void  | 
15397  |  | unicodeiter_dealloc(unicodeiterobject *it)  | 
15398  | 66  | { | 
15399  | 66  |     _PyObject_GC_UNTRACK(it);  | 
15400  | 66  |     Py_XDECREF(it->it_seq);  | 
15401  | 66  |     PyObject_GC_Del(it);  | 
15402  | 66  | }  | 
15403  |  |  | 
15404  |  | static int  | 
15405  |  | unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)  | 
15406  | 0  | { | 
15407  | 0  |     Py_VISIT(it->it_seq);  | 
15408  | 0  |     return 0;  | 
15409  | 0  | }  | 
15410  |  |  | 
15411  |  | static PyObject *  | 
15412  |  | unicodeiter_next(unicodeiterobject *it)  | 
15413  | 445  | { | 
15414  | 445  |     PyObject *seq, *item;  | 
15415  |  |  | 
15416  | 445  |     assert(it != NULL);  | 
15417  | 445  |     seq = it->it_seq;  | 
15418  | 445  |     if (seq == NULL)  | 
15419  | 0  |         return NULL;  | 
15420  | 445  |     assert(_PyUnicode_CHECK(seq));  | 
15421  |  |  | 
15422  | 445  |     if (it->it_index < PyUnicode_GET_LENGTH(seq)) { | 
15423  | 393  |         int kind = PyUnicode_KIND(seq);  | 
15424  | 393  |         void *data = PyUnicode_DATA(seq);  | 
15425  | 393  |         Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);  | 
15426  | 393  |         item = PyUnicode_FromOrdinal(chr);  | 
15427  | 393  |         if (item != NULL)  | 
15428  | 393  |             ++it->it_index;  | 
15429  | 393  |         return item;  | 
15430  | 393  |     }  | 
15431  |  |  | 
15432  | 52  |     it->it_seq = NULL;  | 
15433  | 52  |     Py_DECREF(seq);  | 
15434  | 52  |     return NULL;  | 
15435  | 445  | }  | 
15436  |  |  | 
15437  |  | static PyObject *  | 
15438  |  | unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))  | 
15439  | 0  | { | 
15440  | 0  |     Py_ssize_t len = 0;  | 
15441  | 0  |     if (it->it_seq)  | 
15442  | 0  |         len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;  | 
15443  | 0  |     return PyLong_FromSsize_t(len);  | 
15444  | 0  | }  | 
15445  |  |  | 
15446  |  | PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");  | 
15447  |  |  | 
15448  |  | static PyObject *  | 
15449  |  | unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))  | 
15450  | 0  | { | 
15451  | 0  |     _Py_IDENTIFIER(iter);  | 
15452  | 0  |     if (it->it_seq != NULL) { | 
15453  | 0  |         return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter), | 
15454  | 0  |                              it->it_seq, it->it_index);  | 
15455  | 0  |     } else { | 
15456  | 0  |         PyObject *u = (PyObject *)_PyUnicode_New(0);  | 
15457  | 0  |         if (u == NULL)  | 
15458  | 0  |             return NULL;  | 
15459  | 0  |         return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u); | 
15460  | 0  |     }  | 
15461  | 0  | }  | 
15462  |  |  | 
15463  |  | PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");  | 
15464  |  |  | 
15465  |  | static PyObject *  | 
15466  |  | unicodeiter_setstate(unicodeiterobject *it, PyObject *state)  | 
15467  | 0  | { | 
15468  | 0  |     Py_ssize_t index = PyLong_AsSsize_t(state);  | 
15469  | 0  |     if (index == -1 && PyErr_Occurred())  | 
15470  | 0  |         return NULL;  | 
15471  | 0  |     if (it->it_seq != NULL) { | 
15472  | 0  |         if (index < 0)  | 
15473  | 0  |             index = 0;  | 
15474  | 0  |         else if (index > PyUnicode_GET_LENGTH(it->it_seq))  | 
15475  | 0  |             index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */  | 
15476  | 0  |         it->it_index = index;  | 
15477  | 0  |     }  | 
15478  | 0  |     Py_RETURN_NONE;  | 
15479  | 0  | }  | 
15480  |  |  | 
15481  |  | PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");  | 
15482  |  |  | 
15483  |  | static PyMethodDef unicodeiter_methods[] = { | 
15484  |  |     {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS, | 
15485  |  |      length_hint_doc},  | 
15486  |  |     {"__reduce__",      (PyCFunction)unicodeiter_reduce, METH_NOARGS, | 
15487  |  |      reduce_doc},  | 
15488  |  |     {"__setstate__",    (PyCFunction)unicodeiter_setstate, METH_O, | 
15489  |  |      setstate_doc},  | 
15490  |  |     {NULL,      NULL}       /* sentinel */ | 
15491  |  | };  | 
15492  |  |  | 
15493  |  | PyTypeObject PyUnicodeIter_Type = { | 
15494  |  |     PyVarObject_HEAD_INIT(&PyType_Type, 0)  | 
15495  |  |     "str_iterator",         /* tp_name */  | 
15496  |  |     sizeof(unicodeiterobject),      /* tp_basicsize */  | 
15497  |  |     0,                  /* tp_itemsize */  | 
15498  |  |     /* methods */  | 
15499  |  |     (destructor)unicodeiter_dealloc,    /* tp_dealloc */  | 
15500  |  |     0,                  /* tp_vectorcall_offset */  | 
15501  |  |     0,                  /* tp_getattr */  | 
15502  |  |     0,                  /* tp_setattr */  | 
15503  |  |     0,                  /* tp_as_async */  | 
15504  |  |     0,                  /* tp_repr */  | 
15505  |  |     0,                  /* tp_as_number */  | 
15506  |  |     0,                  /* tp_as_sequence */  | 
15507  |  |     0,                  /* tp_as_mapping */  | 
15508  |  |     0,                  /* tp_hash */  | 
15509  |  |     0,                  /* tp_call */  | 
15510  |  |     0,                  /* tp_str */  | 
15511  |  |     PyObject_GenericGetAttr,        /* tp_getattro */  | 
15512  |  |     0,                  /* tp_setattro */  | 
15513  |  |     0,                  /* tp_as_buffer */  | 
15514  |  |     Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */  | 
15515  |  |     0,                  /* tp_doc */  | 
15516  |  |     (traverseproc)unicodeiter_traverse, /* tp_traverse */  | 
15517  |  |     0,                  /* tp_clear */  | 
15518  |  |     0,                  /* tp_richcompare */  | 
15519  |  |     0,                  /* tp_weaklistoffset */  | 
15520  |  |     PyObject_SelfIter,          /* tp_iter */  | 
15521  |  |     (iternextfunc)unicodeiter_next,     /* tp_iternext */  | 
15522  |  |     unicodeiter_methods,            /* tp_methods */  | 
15523  |  |     0,  | 
15524  |  | };  | 
15525  |  |  | 
15526  |  | static PyObject *  | 
15527  |  | unicode_iter(PyObject *seq)  | 
15528  | 66  | { | 
15529  | 66  |     unicodeiterobject *it;  | 
15530  |  |  | 
15531  | 66  |     if (!PyUnicode_Check(seq)) { | 
15532  | 0  |         PyErr_BadInternalCall();  | 
15533  | 0  |         return NULL;  | 
15534  | 0  |     }  | 
15535  | 66  |     if (PyUnicode_READY(seq) == -1)  | 
15536  | 0  |         return NULL;  | 
15537  | 66  |     it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);  | 
15538  | 66  |     if (it == NULL)  | 
15539  | 0  |         return NULL;  | 
15540  | 66  |     it->it_index = 0;  | 
15541  | 66  |     Py_INCREF(seq);  | 
15542  | 66  |     it->it_seq = seq;  | 
15543  | 66  |     _PyObject_GC_TRACK(it);  | 
15544  | 66  |     return (PyObject *)it;  | 
15545  | 66  | }  | 
15546  |  |  | 
15547  |  |  | 
15548  |  | size_t  | 
15549  |  | Py_UNICODE_strlen(const Py_UNICODE *u)  | 
15550  | 0  | { | 
15551  | 0  |     return wcslen(u);  | 
15552  | 0  | }  | 
15553  |  |  | 
15554  |  | Py_UNICODE*  | 
15555  |  | Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)  | 
15556  | 0  | { | 
15557  | 0  |     Py_UNICODE *u = s1;  | 
15558  | 0  |     while ((*u++ = *s2++));  | 
15559  | 0  |     return s1;  | 
15560  | 0  | }  | 
15561  |  |  | 
15562  |  | Py_UNICODE*  | 
15563  |  | Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)  | 
15564  | 0  | { | 
15565  | 0  |     Py_UNICODE *u = s1;  | 
15566  | 0  |     while ((*u++ = *s2++))  | 
15567  | 0  |         if (n-- == 0)  | 
15568  | 0  |             break;  | 
15569  | 0  |     return s1;  | 
15570  | 0  | }  | 
15571  |  |  | 
15572  |  | Py_UNICODE*  | 
15573  |  | Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)  | 
15574  | 0  | { | 
15575  | 0  |     Py_UNICODE *u1 = s1;  | 
15576  | 0  |     u1 += wcslen(u1);  | 
15577  | 0  |     while ((*u1++ = *s2++));  | 
15578  | 0  |     return s1;  | 
15579  | 0  | }  | 
15580  |  |  | 
15581  |  | int  | 
15582  |  | Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)  | 
15583  | 0  | { | 
15584  | 0  |     while (*s1 && *s2 && *s1 == *s2)  | 
15585  | 0  |         s1++, s2++;  | 
15586  | 0  |     if (*s1 && *s2)  | 
15587  | 0  |         return (*s1 < *s2) ? -1 : +1;  | 
15588  | 0  |     if (*s1)  | 
15589  | 0  |         return 1;  | 
15590  | 0  |     if (*s2)  | 
15591  | 0  |         return -1;  | 
15592  | 0  |     return 0;  | 
15593  | 0  | }  | 
15594  |  |  | 
15595  |  | int  | 
15596  |  | Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)  | 
15597  | 0  | { | 
15598  | 0  |     Py_UNICODE u1, u2;  | 
15599  | 0  |     for (; n != 0; n--) { | 
15600  | 0  |         u1 = *s1;  | 
15601  | 0  |         u2 = *s2;  | 
15602  | 0  |         if (u1 != u2)  | 
15603  | 0  |             return (u1 < u2) ? -1 : +1;  | 
15604  | 0  |         if (u1 == '\0')  | 
15605  | 0  |             return 0;  | 
15606  | 0  |         s1++;  | 
15607  | 0  |         s2++;  | 
15608  | 0  |     }  | 
15609  | 0  |     return 0;  | 
15610  | 0  | }  | 
15611  |  |  | 
15612  |  | Py_UNICODE*  | 
15613  |  | Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)  | 
15614  | 0  | { | 
15615  | 0  |     const Py_UNICODE *p;  | 
15616  | 0  |     for (p = s; *p; p++)  | 
15617  | 0  |         if (*p == c)  | 
15618  | 0  |             return (Py_UNICODE*)p;  | 
15619  | 0  |     return NULL;  | 
15620  | 0  | }  | 
15621  |  |  | 
15622  |  | Py_UNICODE*  | 
15623  |  | Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)  | 
15624  | 0  | { | 
15625  | 0  |     const Py_UNICODE *p;  | 
15626  | 0  |     p = s + wcslen(s);  | 
15627  | 0  |     while (p != s) { | 
15628  | 0  |         p--;  | 
15629  | 0  |         if (*p == c)  | 
15630  | 0  |             return (Py_UNICODE*)p;  | 
15631  | 0  |     }  | 
15632  | 0  |     return NULL;  | 
15633  | 0  | }  | 
15634  |  |  | 
15635  |  | Py_UNICODE*  | 
15636  |  | PyUnicode_AsUnicodeCopy(PyObject *unicode)  | 
15637  | 0  | { | 
15638  | 0  |     Py_UNICODE *u, *copy;  | 
15639  | 0  |     Py_ssize_t len, size;  | 
15640  |  | 
  | 
15641  | 0  |     if (!PyUnicode_Check(unicode)) { | 
15642  | 0  |         PyErr_BadArgument();  | 
15643  | 0  |         return NULL;  | 
15644  | 0  |     }  | 
15645  | 0  |     u = PyUnicode_AsUnicodeAndSize(unicode, &len);  | 
15646  | 0  |     if (u == NULL)  | 
15647  | 0  |         return NULL;  | 
15648  |  |     /* Ensure we won't overflow the size. */  | 
15649  | 0  |     if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) { | 
15650  | 0  |         PyErr_NoMemory();  | 
15651  | 0  |         return NULL;  | 
15652  | 0  |     }  | 
15653  | 0  |     size = len + 1; /* copy the null character */  | 
15654  | 0  |     size *= sizeof(Py_UNICODE);  | 
15655  | 0  |     copy = PyMem_Malloc(size);  | 
15656  | 0  |     if (copy == NULL) { | 
15657  | 0  |         PyErr_NoMemory();  | 
15658  | 0  |         return NULL;  | 
15659  | 0  |     }  | 
15660  | 0  |     memcpy(copy, u, size);  | 
15661  | 0  |     return copy;  | 
15662  | 0  | }  | 
15663  |  |  | 
15664  |  |  | 
15665  |  | static int  | 
15666  |  | encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)  | 
15667  | 56  | { | 
15668  | 56  |     int res;  | 
15669  | 56  |     res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);  | 
15670  | 56  |     if (res == -2) { | 
15671  | 0  |         PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);  | 
15672  | 0  |         return -1;  | 
15673  | 0  |     }  | 
15674  | 56  |     if (res < 0) { | 
15675  | 0  |         PyErr_NoMemory();  | 
15676  | 0  |         return -1;  | 
15677  | 0  |     }  | 
15678  | 56  |     return 0;  | 
15679  | 56  | }  | 
15680  |  |  | 
15681  |  |  | 
15682  |  | static int  | 
15683  |  | config_get_codec_name(wchar_t **config_encoding)  | 
15684  | 28  | { | 
15685  | 28  |     char *encoding;  | 
15686  | 28  |     if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) { | 
15687  | 0  |         return -1;  | 
15688  | 0  |     }  | 
15689  |  |  | 
15690  | 28  |     PyObject *name_obj = NULL;  | 
15691  | 28  |     PyObject *codec = _PyCodec_Lookup(encoding);  | 
15692  | 28  |     PyMem_RawFree(encoding);  | 
15693  |  |  | 
15694  | 28  |     if (!codec)  | 
15695  | 0  |         goto error;  | 
15696  |  |  | 
15697  | 28  |     name_obj = PyObject_GetAttrString(codec, "name");  | 
15698  | 28  |     Py_CLEAR(codec);  | 
15699  | 28  |     if (!name_obj) { | 
15700  | 0  |         goto error;  | 
15701  | 0  |     }  | 
15702  |  |  | 
15703  | 28  |     wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);  | 
15704  | 28  |     Py_DECREF(name_obj);  | 
15705  | 28  |     if (wname == NULL) { | 
15706  | 0  |         goto error;  | 
15707  | 0  |     }  | 
15708  |  |  | 
15709  | 28  |     wchar_t *raw_wname = _PyMem_RawWcsdup(wname);  | 
15710  | 28  |     if (raw_wname == NULL) { | 
15711  | 0  |         PyMem_Free(wname);  | 
15712  | 0  |         PyErr_NoMemory();  | 
15713  | 0  |         goto error;  | 
15714  | 0  |     }  | 
15715  |  |  | 
15716  | 28  |     PyMem_RawFree(*config_encoding);  | 
15717  | 28  |     *config_encoding = raw_wname;  | 
15718  |  |  | 
15719  | 28  |     PyMem_Free(wname);  | 
15720  | 28  |     return 0;  | 
15721  |  |  | 
15722  | 0  | error:  | 
15723  | 0  |     Py_XDECREF(codec);  | 
15724  | 0  |     Py_XDECREF(name_obj);  | 
15725  | 0  |     return -1;  | 
15726  | 28  | }  | 
15727  |  |  | 
15728  |  |  | 
15729  |  | static PyStatus  | 
15730  |  | init_stdio_encoding(PyThreadState *tstate)  | 
15731  | 14  | { | 
15732  |  |     /* Update the stdio encoding to the normalized Python codec name. */  | 
15733  | 14  |     PyConfig *config = &tstate->interp->config;  | 
15734  | 14  |     if (config_get_codec_name(&config->stdio_encoding) < 0) { | 
15735  | 0  |         return _PyStatus_ERR("failed to get the Python codec name " | 
15736  | 0  |                              "of the stdio encoding");  | 
15737  | 0  |     }  | 
15738  | 14  |     return _PyStatus_OK();  | 
15739  | 14  | }  | 
15740  |  |  | 
15741  |  |  | 
15742  |  | static int  | 
15743  |  | init_fs_codec(PyInterpreterState *interp)  | 
15744  | 14  | { | 
15745  | 14  |     PyConfig *config = &interp->config;  | 
15746  |  |  | 
15747  | 14  |     _Py_error_handler error_handler;  | 
15748  | 14  |     error_handler = get_error_handler_wide(config->filesystem_errors);  | 
15749  | 14  |     if (error_handler == _Py_ERROR_UNKNOWN) { | 
15750  | 0  |         PyErr_SetString(PyExc_RuntimeError, "unknow filesystem error handler");  | 
15751  | 0  |         return -1;  | 
15752  | 0  |     }  | 
15753  |  |  | 
15754  | 14  |     char *encoding, *errors;  | 
15755  | 14  |     if (encode_wstr_utf8(config->filesystem_encoding,  | 
15756  | 14  |                          &encoding,  | 
15757  | 14  |                          "filesystem_encoding") < 0) { | 
15758  | 0  |         return -1;  | 
15759  | 0  |     }  | 
15760  |  |  | 
15761  | 14  |     if (encode_wstr_utf8(config->filesystem_errors,  | 
15762  | 14  |                          &errors,  | 
15763  | 14  |                          "filesystem_errors") < 0) { | 
15764  | 0  |         PyMem_RawFree(encoding);  | 
15765  | 0  |         return -1;  | 
15766  | 0  |     }  | 
15767  |  |  | 
15768  | 14  |     PyMem_RawFree(interp->fs_codec.encoding);  | 
15769  | 14  |     interp->fs_codec.encoding = encoding;  | 
15770  | 14  |     PyMem_RawFree(interp->fs_codec.errors);  | 
15771  | 14  |     interp->fs_codec.errors = errors;  | 
15772  | 14  |     interp->fs_codec.error_handler = error_handler;  | 
15773  |  |  | 
15774  |  |     /* At this point, PyUnicode_EncodeFSDefault() and  | 
15775  |  |        PyUnicode_DecodeFSDefault() can now use the Python codec rather than  | 
15776  |  |        the C implementation of the filesystem encoding. */  | 
15777  |  |  | 
15778  |  |     /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors  | 
15779  |  |        global configuration variables. */  | 
15780  | 14  |     if (_Py_SetFileSystemEncoding(interp->fs_codec.encoding,  | 
15781  | 14  |                                   interp->fs_codec.errors) < 0) { | 
15782  | 0  |         PyErr_NoMemory();  | 
15783  | 0  |         return -1;  | 
15784  | 0  |     }  | 
15785  | 14  |     return 0;  | 
15786  | 14  | }  | 
15787  |  |  | 
15788  |  |  | 
15789  |  | static PyStatus  | 
15790  |  | init_fs_encoding(PyThreadState *tstate)  | 
15791  | 14  | { | 
15792  | 14  |     PyInterpreterState *interp = tstate->interp;  | 
15793  |  |  | 
15794  |  |     /* Update the filesystem encoding to the normalized Python codec name.  | 
15795  |  |        For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"  | 
15796  |  |        (Python codec name). */  | 
15797  | 14  |     PyConfig *config = &interp->config;  | 
15798  | 14  |     if (config_get_codec_name(&config->filesystem_encoding) < 0) { | 
15799  | 0  |         _Py_DumpPathConfig(tstate);  | 
15800  | 0  |         return _PyStatus_ERR("failed to get the Python codec " | 
15801  | 0  |                              "of the filesystem encoding");  | 
15802  | 0  |     }  | 
15803  |  |  | 
15804  | 14  |     if (init_fs_codec(interp) < 0) { | 
15805  | 0  |         return _PyStatus_ERR("cannot initialize filesystem codec"); | 
15806  | 0  |     }  | 
15807  | 14  |     return _PyStatus_OK();  | 
15808  | 14  | }  | 
15809  |  |  | 
15810  |  |  | 
15811  |  | PyStatus  | 
15812  |  | _PyUnicode_InitEncodings(PyThreadState *tstate)  | 
15813  | 14  | { | 
15814  | 14  |     PyStatus status = init_fs_encoding(tstate);  | 
15815  | 14  |     if (_PyStatus_EXCEPTION(status)) { | 
15816  | 0  |         return status;  | 
15817  | 0  |     }  | 
15818  |  |  | 
15819  | 14  |     return init_stdio_encoding(tstate);  | 
15820  | 14  | }  | 
15821  |  |  | 
15822  |  |  | 
#ifdef MS_WINDOWS
/* Switch the filesystem encoding of the current interpreter to
   mbcs/replace (PEP 529) and re-install the filesystem codec.
   Returns the result of init_fs_codec(), or -1 with MemoryError set if the
   new configuration strings cannot be allocated (the configuration is left
   unchanged in that case). */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
    PyConfig *config = &interp->config;

    /* Allocate both replacements before touching the configuration. */
    wchar_t *new_encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *new_errors = _PyMem_RawWcsdup(L"replace");

    if (new_encoding != NULL && new_errors != NULL) {
        PyMem_RawFree(config->filesystem_encoding);
        config->filesystem_encoding = new_encoding;
        PyMem_RawFree(config->filesystem_errors);
        config->filesystem_errors = new_errors;

        return init_fs_codec(interp);
    }

    PyMem_RawFree(new_encoding);
    PyMem_RawFree(new_errors);
    PyErr_NoMemory();
    return -1;
}
#endif
15848  |  |  | 
15849  |  |  | 
15850  |  | void  | 
15851  |  | _PyUnicode_Fini(void)  | 
15852  | 0  | { | 
15853  |  | #if defined(WITH_VALGRIND) || defined(__INSURE__)  | 
15854  |  |     /* Insure++ is a memory analysis tool that aids in discovering  | 
15855  |  |      * memory leaks and other memory problems.  On Python exit, the  | 
15856  |  |      * interned string dictionaries are flagged as being in use at exit  | 
15857  |  |      * (which it is).  Under normal circumstances, this is fine because  | 
15858  |  |      * the memory will be automatically reclaimed by the system.  Under  | 
15859  |  |      * memory debugging, it's a huge source of useless noise, so we  | 
15860  |  |      * trade off slower shutdown for less distraction in the memory  | 
15861  |  |      * reports.  -baw  | 
15862  |  |      */  | 
15863  |  |     unicode_release_interned();  | 
15864  |  | #endif /* __INSURE__ */  | 
15865  |  | 
  | 
15866  | 0  |     Py_CLEAR(unicode_empty);  | 
15867  |  | 
  | 
15868  | 0  |     for (Py_ssize_t i = 0; i < 256; i++) { | 
15869  | 0  |         Py_CLEAR(unicode_latin1[i]);  | 
15870  | 0  |     }  | 
15871  | 0  |     _PyUnicode_ClearStaticStrings();  | 
15872  | 0  |     (void)PyUnicode_ClearFreeList();  | 
15873  |  | 
  | 
15874  | 0  |     PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();  | 
15875  | 0  |     PyMem_RawFree(interp->fs_codec.encoding);  | 
15876  | 0  |     interp->fs_codec.encoding = NULL;  | 
15877  | 0  |     PyMem_RawFree(interp->fs_codec.errors);  | 
15878  | 0  |     interp->fs_codec.errors = NULL;  | 
15879  | 0  | }  | 
15880  |  |  | 
15881  |  |  | 
15882  |  | /* A _string module, to export formatter_parser and formatter_field_name_split  | 
15883  |  |    to the string.Formatter class implemented in Python. */  | 
15884  |  |  | 
15885  |  | static PyMethodDef _string_methods[] = { | 
15886  |  |     {"formatter_field_name_split", (PyCFunction) formatter_field_name_split, | 
15887  |  |      METH_O, PyDoc_STR("split the argument as a field name")}, | 
15888  |  |     {"formatter_parser", (PyCFunction) formatter_parser, | 
15889  |  |      METH_O, PyDoc_STR("parse the argument as a format string")}, | 
15890  |  |     {NULL, NULL} | 
15891  |  | };  | 
15892  |  |  | 
15893  |  | static struct PyModuleDef _string_module = { | 
15894  |  |     PyModuleDef_HEAD_INIT,  | 
15895  |  |     "_string",  | 
15896  |  |     PyDoc_STR("string helper module"), | 
15897  |  |     0,  | 
15898  |  |     _string_methods,  | 
15899  |  |     NULL,  | 
15900  |  |     NULL,  | 
15901  |  |     NULL,  | 
15902  |  |     NULL  | 
15903  |  | };  | 
15904  |  |  | 
15905  |  | PyMODINIT_FUNC  | 
15906  |  | PyInit__string(void)  | 
15907  | 1  | { | 
15908  | 1  |     return PyModule_Create(&_string_module);  | 
15909  | 1  | }  | 
15910  |  |  | 
15911  |  |  | 
15912  |  | #ifdef __cplusplus  | 
15913  |  | }  | 
15914  |  | #endif  |