Coverage Report

Created: 2025-10-12 06:48

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/cpython/Objects/unicodeobject.c
Line
Count
Source
1
/*
2
3
Unicode implementation based on original code by Fredrik Lundh,
4
modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6
Major speed upgrades to the method implementations at the Reykjavik
7
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9
Copyright (c) Corporation for National Research Initiatives.
10
11
--------------------------------------------------------------------
12
The original string type implementation is:
13
14
  Copyright (c) 1999 by Secret Labs AB
15
  Copyright (c) 1999 by Fredrik Lundh
16
17
By obtaining, using, and/or copying this software and/or its
18
associated documentation, you agree that you have read, understood,
19
and will comply with the following terms and conditions:
20
21
Permission to use, copy, modify, and distribute this software and its
22
associated documentation for any purpose and without fee is hereby
23
granted, provided that the above copyright notice appears in all
24
copies, and that both that copyright notice and this permission notice
25
appear in supporting documentation, and that the name of Secret Labs
26
AB or the author not be used in advertising or publicity pertaining to
27
distribution of the software without specific, written prior
28
permission.
29
30
SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32
FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37
--------------------------------------------------------------------
38
39
*/
40
41
#include "Python.h"
42
#include "pycore_abstract.h"      // _PyIndex_Check()
43
#include "pycore_bytes_methods.h" // _Py_bytes_lower()
44
#include "pycore_bytesobject.h"   // _PyBytes_Repeat()
45
#include "pycore_ceval.h"         // _PyEval_GetBuiltin()
46
#include "pycore_codecs.h"        // _PyCodec_Lookup()
47
#include "pycore_critical_section.h" // Py_*_CRITICAL_SECTION_SEQUENCE_FAST
48
#include "pycore_format.h"        // F_LJUST
49
#include "pycore_freelist.h"      // _Py_FREELIST_FREE(), _Py_FREELIST_POP()
50
#include "pycore_initconfig.h"    // _PyStatus_OK()
51
#include "pycore_interp.h"        // PyInterpreterState.fs_codec
52
#include "pycore_long.h"          // _PyLong_FormatWriter()
53
#include "pycore_object.h"        // _PyObject_GC_TRACK(), _Py_FatalRefcountError()
54
#include "pycore_pathconfig.h"    // _Py_DumpPathConfig()
55
#include "pycore_pyerrors.h"      // _PyUnicodeTranslateError_Create()
56
#include "pycore_pyhash.h"        // _Py_HashSecret_t
57
#include "pycore_pylifecycle.h"   // _Py_SetFileSystemEncoding()
58
#include "pycore_pystate.h"       // _PyInterpreterState_GET()
59
#include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
60
#include "pycore_unicodeobject.h" // struct _Py_unicode_state
61
#include "pycore_unicodeobject_generated.h"  // _PyUnicode_InitStaticStrings()
62
63
#include "stringlib/eq.h"         // unicode_eq()
64
#include <stddef.h>               // ptrdiff_t
65
66
#ifdef MS_WINDOWS
67
#include <windows.h>
68
#endif
69
70
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
71
#  include "pycore_fileutils.h"   // _Py_LocaleUsesNonUnicodeWchar()
72
#endif
73
74
/* Uncomment to display statistics on interned strings at exit
75
   in _PyUnicode_ClearInterned(). */
76
/* #define INTERNED_STATS 1 */
77
78
79
/*[clinic input]
80
class str "PyObject *" "&PyUnicode_Type"
81
[clinic start generated code]*/
82
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
83
84
/*[python input]
85
class Py_UCS4_converter(CConverter):
86
    type = 'Py_UCS4'
87
    converter = 'convert_uc'
88
89
    def converter_init(self):
90
        if self.default is not unspecified:
91
            self.c_default = ascii(self.default)
92
            if len(self.c_default) > 4 or self.c_default[0] != "'":
93
                self.c_default = hex(ord(self.default))
94
95
[python start generated code]*/
96
/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
97
98
/* --- Globals ------------------------------------------------------------
99
100
NOTE: In the interpreter's initialization phase, some globals are currently
101
      initialized dynamically as needed. In the process Unicode objects may
102
      be created before the Unicode type is ready.
103
104
*/
105
106
84.6M
#define MAX_UNICODE _Py_MAX_UNICODE
107
143M
#define ensure_unicode _PyUnicode_EnsureUnicode
108
109
#ifdef Py_DEBUG
110
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
111
#else
112
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
113
#endif
114
115
static inline char* _PyUnicode_UTF8(PyObject *op)
116
267M
{
117
267M
    return FT_ATOMIC_LOAD_PTR_ACQUIRE(_PyCompactUnicodeObject_CAST(op)->utf8);
118
267M
}
119
120
static inline char* PyUnicode_UTF8(PyObject *op)
121
67.6M
{
122
67.6M
    assert(_PyUnicode_CHECK(op));
123
67.6M
    if (PyUnicode_IS_COMPACT_ASCII(op)) {
124
54.8M
        return ((char*)(_PyASCIIObject_CAST(op) + 1));
125
54.8M
    }
126
12.8M
    else {
127
12.8M
         return _PyUnicode_UTF8(op);
128
12.8M
    }
129
67.6M
}
130
131
static inline void PyUnicode_SET_UTF8(PyObject *op, char *utf8)
132
21.2M
{
133
21.2M
    FT_ATOMIC_STORE_PTR_RELEASE(_PyCompactUnicodeObject_CAST(op)->utf8, utf8);
134
21.2M
}
135
136
static inline Py_ssize_t PyUnicode_UTF8_LENGTH(PyObject *op)
137
30.7M
{
138
30.7M
    assert(_PyUnicode_CHECK(op));
139
30.7M
    if (PyUnicode_IS_COMPACT_ASCII(op)) {
140
27.3M
         return _PyASCIIObject_CAST(op)->length;
141
27.3M
    }
142
3.43M
    else {
143
3.43M
         return _PyCompactUnicodeObject_CAST(op)->utf8_length;
144
3.43M
    }
145
30.7M
}
146
147
static inline void PyUnicode_SET_UTF8_LENGTH(PyObject *op, Py_ssize_t length)
148
21.2M
{
149
21.2M
    _PyCompactUnicodeObject_CAST(op)->utf8_length = length;
150
21.2M
}
151
152
#define _PyUnicode_LENGTH(op)                           \
153
597M
    (_PyASCIIObject_CAST(op)->length)
154
#define _PyUnicode_STATE(op)                            \
155
3.77G
    (_PyASCIIObject_CAST(op)->state)
156
#define _PyUnicode_HASH(op)                             \
157
549M
    (_PyASCIIObject_CAST(op)->hash)
158
159
106M
#define PyUnicode_HASH PyUnstable_Unicode_GET_CACHED_HASH
160
161
static inline void PyUnicode_SET_HASH(PyObject *op, Py_hash_t hash)
162
45.0M
{
163
45.0M
    FT_ATOMIC_STORE_SSIZE_RELAXED(_PyASCIIObject_CAST(op)->hash, hash);
164
45.0M
}
165
166
#define _PyUnicode_DATA_ANY(op)                         \
167
44.8M
    (_PyUnicodeObject_CAST(op)->data.any)
168
169
static inline int _PyUnicode_SHARE_UTF8(PyObject *op)
170
0
{
171
0
    assert(_PyUnicode_CHECK(op));
172
0
    assert(!PyUnicode_IS_COMPACT_ASCII(op));
173
0
    return (_PyUnicode_UTF8(op) == PyUnicode_DATA(op));
174
0
}
175
176
/* true if the Unicode object has an allocated UTF-8 memory block
177
   (not shared with other data) */
178
static inline int _PyUnicode_HAS_UTF8_MEMORY(PyObject *op)
179
597M
{
180
597M
    return (!PyUnicode_IS_COMPACT_ASCII(op)
181
244M
            && _PyUnicode_UTF8(op) != NULL
182
10.0M
            && _PyUnicode_UTF8(op) != PyUnicode_DATA(op));
183
597M
}
184
185
186
/* Generic helper macro to convert characters of different types.
187
   from_type and to_type have to be valid type names, begin and end
188
   are pointers to the source characters which should be of type
189
   "from_type *".  to is a pointer of type "to_type *" and points to the
190
   buffer where the result characters are written to. */
191
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
192
165M
    do {                                                \
193
165M
        to_type *_to = (to_type *)(to);                 \
194
165M
        const from_type *_iter = (const from_type *)(begin);\
195
165M
        const from_type *_end = (const from_type *)(end);\
196
165M
        Py_ssize_t n = (_end) - (_iter);                \
197
165M
        const from_type *_unrolled_end =                \
198
165M
            _iter + _Py_SIZE_ROUND_DOWN(n, 4);          \
199
1.08G
        while (_iter < (_unrolled_end)) {               \
200
914M
            _to[0] = (to_type) _iter[0];                \
201
914M
            _to[1] = (to_type) _iter[1];                \
202
914M
            _to[2] = (to_type) _iter[2];                \
203
914M
            _to[3] = (to_type) _iter[3];                \
204
914M
            _iter += 4; _to += 4;                       \
205
914M
        }                                               \
206
375M
        while (_iter < (_end))                          \
207
210M
            *_to++ = (to_type) *_iter++;                \
208
165M
    } while (0)
209
210
265M
#define LATIN1 _Py_LATIN1_CHR
211
212
#ifdef MS_WINDOWS
213
   /* On Windows, overallocating by 50% is the best factor */
214
#  define OVERALLOCATE_FACTOR 2
215
#else
216
   /* On Linux, overallocating by 25% is the best factor */
217
102M
#  define OVERALLOCATE_FACTOR 4
218
#endif
219
220
/* Forward declaration */
221
static inline int
222
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
223
static inline void
224
_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
225
static PyObject *
226
unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
227
                    const char *errors);
228
static PyObject *
229
unicode_decode_utf8(const char *s, Py_ssize_t size,
230
                    _Py_error_handler error_handler, const char *errors,
231
                    Py_ssize_t *consumed);
232
static int
233
unicode_decode_utf8_writer(_PyUnicodeWriter *writer,
234
                           const char *s, Py_ssize_t size,
235
                           _Py_error_handler error_handler, const char *errors,
236
                           Py_ssize_t *consumed);
237
#ifdef Py_DEBUG
238
static inline int unicode_is_finalizing(void);
239
static int unicode_is_singleton(PyObject *unicode);
240
#endif
241
242
243
// Return a reference to the immortal empty string singleton.
244
static inline PyObject* unicode_get_empty(void)
245
120M
{
246
120M
    _Py_DECLARE_STR(empty, "");
247
120M
    return &_Py_STR(empty);
248
120M
}
249
250
/* This dictionary holds per-interpreter interned strings.
251
 * See InternalDocs/string_interning.md for details.
252
 */
253
static inline PyObject *get_interned_dict(PyInterpreterState *interp)
254
3.26M
{
255
3.26M
    return _Py_INTERP_CACHED_OBJECT(interp, interned_strings);
256
3.26M
}
257
258
/* This hashtable holds statically allocated interned strings.
259
 * See InternalDocs/string_interning.md for details.
260
 */
261
3.04M
#define INTERNED_STRINGS _PyRuntime.cached_objects.interned_strings
262
263
/* Get number of all interned strings for the current interpreter. */
264
Py_ssize_t
265
_PyUnicode_InternedSize(void)
266
0
{
267
0
    PyObject *dict = get_interned_dict(_PyInterpreterState_GET());
268
0
    return _Py_hashtable_len(INTERNED_STRINGS) + PyDict_GET_SIZE(dict);
269
0
}
270
271
/* Get number of immortal interned strings for the current interpreter. */
272
Py_ssize_t
273
_PyUnicode_InternedSize_Immortal(void)
274
0
{
275
0
    PyObject *dict = get_interned_dict(_PyInterpreterState_GET());
276
0
    PyObject *key, *value;
277
0
    Py_ssize_t pos = 0;
278
0
    Py_ssize_t count = 0;
279
280
    // It's tempting to keep a count and avoid a loop here. But, this function
281
    // is intended for refleak tests. It spends extra work to report the true
282
    // value, to help detect bugs in optimizations.
283
284
0
    while (PyDict_Next(dict, &pos, &key, &value)) {
285
0
        assert(PyUnicode_CHECK_INTERNED(key) != SSTATE_INTERNED_IMMORTAL_STATIC);
286
0
        if (PyUnicode_CHECK_INTERNED(key) == SSTATE_INTERNED_IMMORTAL) {
287
0
           count++;
288
0
       }
289
0
    }
290
0
    return _Py_hashtable_len(INTERNED_STRINGS) + count;
291
0
}
292
293
static Py_hash_t unicode_hash(PyObject *);
294
295
static Py_uhash_t
296
hashtable_unicode_hash(const void *key)
297
3.04M
{
298
3.04M
    return unicode_hash((PyObject *)key);
299
3.04M
}
300
301
static int
302
hashtable_unicode_compare(const void *key1, const void *key2)
303
284k
{
304
284k
    PyObject *obj1 = (PyObject *)key1;
305
284k
    PyObject *obj2 = (PyObject *)key2;
306
284k
    if (obj1 != NULL && obj2 != NULL) {
307
284k
        return unicode_eq(obj1, obj2);
308
284k
    }
309
0
    else {
310
0
        return obj1 == obj2;
311
0
    }
312
284k
}
313
314
/* Return true if this interpreter should share the main interpreter's
315
   intern_dict.  That's important for interpreters which load basic
316
   single-phase init extension modules (m_size == -1).  There could be interned
317
   immortal strings that are shared between interpreters, due to the
318
   PyDict_Update(mdict, m_copy) call in import_find_extension().
319
320
   It's not safe to deallocate those strings until all interpreters that
321
   potentially use them are freed.  By storing them in the main interpreter, we
322
   ensure they get freed after all other interpreters are freed.
323
*/
324
static bool
325
has_shared_intern_dict(PyInterpreterState *interp)
326
16
{
327
16
    PyInterpreterState *main_interp = _PyInterpreterState_Main();
328
16
    return interp != main_interp  && interp->feature_flags & Py_RTFLAGS_USE_MAIN_OBMALLOC;
329
16
}
330
331
static int
332
init_interned_dict(PyInterpreterState *interp)
333
16
{
334
16
    assert(get_interned_dict(interp) == NULL);
335
16
    PyObject *interned;
336
16
    if (has_shared_intern_dict(interp)) {
337
0
        interned = get_interned_dict(_PyInterpreterState_Main());
338
0
        Py_INCREF(interned);
339
0
    }
340
16
    else {
341
16
        interned = PyDict_New();
342
16
        if (interned == NULL) {
343
0
            return -1;
344
0
        }
345
16
    }
346
16
    _Py_INTERP_CACHED_OBJECT(interp, interned_strings) = interned;
347
16
    return 0;
348
16
}
349
350
static void
351
clear_interned_dict(PyInterpreterState *interp)
352
0
{
353
0
    PyObject *interned = get_interned_dict(interp);
354
0
    if (interned != NULL) {
355
0
        if (!has_shared_intern_dict(interp)) {
356
            // only clear if the dict belongs to this interpreter
357
0
            PyDict_Clear(interned);
358
0
        }
359
0
        Py_DECREF(interned);
360
0
        _Py_INTERP_CACHED_OBJECT(interp, interned_strings) = NULL;
361
0
    }
362
0
}
363
364
static PyStatus
365
init_global_interned_strings(PyInterpreterState *interp)
366
16
{
367
16
    assert(INTERNED_STRINGS == NULL);
368
16
    _Py_hashtable_allocator_t hashtable_alloc = {PyMem_RawMalloc, PyMem_RawFree};
369
370
16
    INTERNED_STRINGS = _Py_hashtable_new_full(
371
16
        hashtable_unicode_hash,
372
16
        hashtable_unicode_compare,
373
        // Objects stored here are immortal and statically allocated,
374
        // so we don't need key_destroy_func & value_destroy_func:
375
16
        NULL,
376
16
        NULL,
377
16
        &hashtable_alloc
378
16
    );
379
16
    if (INTERNED_STRINGS == NULL) {
380
0
        PyErr_Clear();
381
0
        return _PyStatus_ERR("failed to create global interned dict");
382
0
    }
383
384
    /* Intern statically allocated string identifiers, deepfreeze strings,
385
        * and one-byte latin-1 strings.
386
        * This must be done before any module initialization so that statically
387
        * allocated string identifiers are used instead of heap allocated strings.
388
        * Deepfreeze uses the interned identifiers if present to save space
389
        * else generates them and they are interned to speed up dict lookups.
390
    */
391
16
    _PyUnicode_InitStaticStrings(interp);
392
393
4.11k
    for (int i = 0; i < 256; i++) {
394
4.09k
        PyObject *s = LATIN1(i);
395
4.09k
        _PyUnicode_InternStatic(interp, &s);
396
4.09k
        assert(s == LATIN1(i));
397
4.09k
    }
398
#ifdef Py_DEBUG
399
    assert(_PyUnicode_CheckConsistency(&_Py_STR(empty), 1));
400
401
    for (int i = 0; i < 256; i++) {
402
        assert(_PyUnicode_CheckConsistency(LATIN1(i), 1));
403
    }
404
#endif
405
16
    return _PyStatus_OK();
406
16
}
407
408
static void clear_global_interned_strings(void)
409
0
{
410
0
    if (INTERNED_STRINGS != NULL) {
411
0
        _Py_hashtable_destroy(INTERNED_STRINGS);
412
0
        INTERNED_STRINGS = NULL;
413
0
    }
414
0
}
415
416
#define _Py_RETURN_UNICODE_EMPTY()   \
417
53.1M
    do {                             \
418
53.1M
        return unicode_get_empty();  \
419
53.1M
    } while (0)
420
421
422
/* Fast detection of the most frequent whitespace characters */
423
const unsigned char _Py_ascii_whitespace[] = {
424
    0, 0, 0, 0, 0, 0, 0, 0,
425
/*     case 0x0009: * CHARACTER TABULATION */
426
/*     case 0x000A: * LINE FEED */
427
/*     case 0x000B: * LINE TABULATION */
428
/*     case 0x000C: * FORM FEED */
429
/*     case 0x000D: * CARRIAGE RETURN */
430
    0, 1, 1, 1, 1, 1, 0, 0,
431
    0, 0, 0, 0, 0, 0, 0, 0,
432
/*     case 0x001C: * FILE SEPARATOR */
433
/*     case 0x001D: * GROUP SEPARATOR */
434
/*     case 0x001E: * RECORD SEPARATOR */
435
/*     case 0x001F: * UNIT SEPARATOR */
436
    0, 0, 0, 0, 1, 1, 1, 1,
437
/*     case 0x0020: * SPACE */
438
    1, 0, 0, 0, 0, 0, 0, 0,
439
    0, 0, 0, 0, 0, 0, 0, 0,
440
    0, 0, 0, 0, 0, 0, 0, 0,
441
    0, 0, 0, 0, 0, 0, 0, 0,
442
443
    0, 0, 0, 0, 0, 0, 0, 0,
444
    0, 0, 0, 0, 0, 0, 0, 0,
445
    0, 0, 0, 0, 0, 0, 0, 0,
446
    0, 0, 0, 0, 0, 0, 0, 0,
447
    0, 0, 0, 0, 0, 0, 0, 0,
448
    0, 0, 0, 0, 0, 0, 0, 0,
449
    0, 0, 0, 0, 0, 0, 0, 0,
450
    0, 0, 0, 0, 0, 0, 0, 0
451
};
452
453
/* forward */
454
static PyObject* get_latin1_char(unsigned char ch);
455
456
457
static PyObject *
458
_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
459
static PyObject *
460
_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
461
static PyObject *
462
_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
463
464
static PyObject *
465
unicode_encode_call_errorhandler(const char *errors,
466
       PyObject **errorHandler,const char *encoding, const char *reason,
467
       PyObject *unicode, PyObject **exceptionObject,
468
       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
469
470
static void
471
raise_encode_exception(PyObject **exceptionObject,
472
                       const char *encoding,
473
                       PyObject *unicode,
474
                       Py_ssize_t startpos, Py_ssize_t endpos,
475
                       const char *reason);
476
477
/* Fast detection of the most frequent line break characters */
478
static const unsigned char ascii_linebreak[] = {
479
    0, 0, 0, 0, 0, 0, 0, 0,
480
/*         0x000A, * LINE FEED */
481
/*         0x000B, * LINE TABULATION */
482
/*         0x000C, * FORM FEED */
483
/*         0x000D, * CARRIAGE RETURN */
484
    0, 0, 1, 1, 1, 1, 0, 0,
485
    0, 0, 0, 0, 0, 0, 0, 0,
486
/*         0x001C, * FILE SEPARATOR */
487
/*         0x001D, * GROUP SEPARATOR */
488
/*         0x001E, * RECORD SEPARATOR */
489
    0, 0, 0, 0, 1, 1, 1, 0,
490
    0, 0, 0, 0, 0, 0, 0, 0,
491
    0, 0, 0, 0, 0, 0, 0, 0,
492
    0, 0, 0, 0, 0, 0, 0, 0,
493
    0, 0, 0, 0, 0, 0, 0, 0,
494
495
    0, 0, 0, 0, 0, 0, 0, 0,
496
    0, 0, 0, 0, 0, 0, 0, 0,
497
    0, 0, 0, 0, 0, 0, 0, 0,
498
    0, 0, 0, 0, 0, 0, 0, 0,
499
    0, 0, 0, 0, 0, 0, 0, 0,
500
    0, 0, 0, 0, 0, 0, 0, 0,
501
    0, 0, 0, 0, 0, 0, 0, 0,
502
    0, 0, 0, 0, 0, 0, 0, 0
503
};
504
505
static int convert_uc(PyObject *obj, void *addr);
506
507
struct encoding_map;
508
#include "clinic/unicodeobject.c.h"
509
510
_Py_error_handler
511
_Py_GetErrorHandler(const char *errors)
512
518k
{
513
518k
    if (errors == NULL || strcmp(errors, "strict") == 0) {
514
194k
        return _Py_ERROR_STRICT;
515
194k
    }
516
323k
    if (strcmp(errors, "surrogateescape") == 0) {
517
162k
        return _Py_ERROR_SURROGATEESCAPE;
518
162k
    }
519
161k
    if (strcmp(errors, "replace") == 0) {
520
161k
        return _Py_ERROR_REPLACE;
521
161k
    }
522
0
    if (strcmp(errors, "ignore") == 0) {
523
0
        return _Py_ERROR_IGNORE;
524
0
    }
525
0
    if (strcmp(errors, "backslashreplace") == 0) {
526
0
        return _Py_ERROR_BACKSLASHREPLACE;
527
0
    }
528
0
    if (strcmp(errors, "surrogatepass") == 0) {
529
0
        return _Py_ERROR_SURROGATEPASS;
530
0
    }
531
0
    if (strcmp(errors, "xmlcharrefreplace") == 0) {
532
0
        return _Py_ERROR_XMLCHARREFREPLACE;
533
0
    }
534
0
    return _Py_ERROR_OTHER;
535
0
}
536
537
538
static _Py_error_handler
539
get_error_handler_wide(const wchar_t *errors)
540
5.57k
{
541
5.57k
    if (errors == NULL || wcscmp(errors, L"strict") == 0) {
542
0
        return _Py_ERROR_STRICT;
543
0
    }
544
5.57k
    if (wcscmp(errors, L"surrogateescape") == 0) {
545
5.57k
        return _Py_ERROR_SURROGATEESCAPE;
546
5.57k
    }
547
0
    if (wcscmp(errors, L"replace") == 0) {
548
0
        return _Py_ERROR_REPLACE;
549
0
    }
550
0
    if (wcscmp(errors, L"ignore") == 0) {
551
0
        return _Py_ERROR_IGNORE;
552
0
    }
553
0
    if (wcscmp(errors, L"backslashreplace") == 0) {
554
0
        return _Py_ERROR_BACKSLASHREPLACE;
555
0
    }
556
0
    if (wcscmp(errors, L"surrogatepass") == 0) {
557
0
        return _Py_ERROR_SURROGATEPASS;
558
0
    }
559
0
    if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
560
0
        return _Py_ERROR_XMLCHARREFREPLACE;
561
0
    }
562
0
    return _Py_ERROR_OTHER;
563
0
}
564
565
566
static inline int
567
unicode_check_encoding_errors(const char *encoding, const char *errors)
568
22.0M
{
569
22.0M
    if (encoding == NULL && errors == NULL) {
570
11.3M
        return 0;
571
11.3M
    }
572
573
10.7M
    PyInterpreterState *interp = _PyInterpreterState_GET();
574
10.7M
#ifndef Py_DEBUG
575
    /* In release mode, only check in development mode (-X dev) */
576
10.7M
    if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
577
10.7M
        return 0;
578
10.7M
    }
579
#else
580
    /* Always check in debug mode */
581
#endif
582
583
    /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
584
       codec registry is ready: before _PyUnicode_InitEncodings() is called. */
585
0
    if (!interp->unicode.fs_codec.encoding) {
586
0
        return 0;
587
0
    }
588
589
    /* Disable checks during Python finalization. For example, this allows
590
       calling _PyObject_Dump() during finalization for debugging purposes. */
591
0
    if (_PyInterpreterState_GetFinalizing(interp) != NULL) {
592
0
        return 0;
593
0
    }
594
595
0
    if (encoding != NULL
596
        // Fast path for the most common built-in encodings. Even if the codec
597
        // is cached, _PyCodec_Lookup() decodes the bytes string from UTF-8 to
598
        // create a temporary Unicode string (the key in the cache).
599
0
        && strcmp(encoding, "utf-8") != 0
600
0
        && strcmp(encoding, "utf8") != 0
601
0
        && strcmp(encoding, "ascii") != 0)
602
0
    {
603
0
        PyObject *handler = _PyCodec_Lookup(encoding);
604
0
        if (handler == NULL) {
605
0
            return -1;
606
0
        }
607
0
        Py_DECREF(handler);
608
0
    }
609
610
0
    if (errors != NULL
611
        // Fast path for the most common built-in error handlers.
612
0
        && strcmp(errors, "strict") != 0
613
0
        && strcmp(errors, "ignore") != 0
614
0
        && strcmp(errors, "replace") != 0
615
0
        && strcmp(errors, "surrogateescape") != 0
616
0
        && strcmp(errors, "surrogatepass") != 0)
617
0
    {
618
0
        PyObject *handler = PyCodec_LookupError(errors);
619
0
        if (handler == NULL) {
620
0
            return -1;
621
0
        }
622
0
        Py_DECREF(handler);
623
0
    }
624
0
    return 0;
625
0
}
626
627
628
int
629
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
630
0
{
631
0
#define CHECK(expr) \
632
0
    do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
633
634
0
    assert(op != NULL);
635
0
    CHECK(PyUnicode_Check(op));
636
637
0
    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
638
0
    int kind = ascii->state.kind;
639
640
0
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
641
0
        CHECK(kind == PyUnicode_1BYTE_KIND);
642
0
    }
643
0
    else {
644
0
        PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
645
0
        void *data;
646
647
0
        if (ascii->state.compact == 1) {
648
0
            data = compact + 1;
649
0
            CHECK(kind == PyUnicode_1BYTE_KIND
650
0
                                 || kind == PyUnicode_2BYTE_KIND
651
0
                                 || kind == PyUnicode_4BYTE_KIND);
652
0
            CHECK(ascii->state.ascii == 0);
653
0
            CHECK(_PyUnicode_UTF8(op) != data);
654
0
        }
655
0
        else {
656
0
            PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
657
658
0
            data = unicode->data.any;
659
0
            CHECK(kind == PyUnicode_1BYTE_KIND
660
0
                     || kind == PyUnicode_2BYTE_KIND
661
0
                     || kind == PyUnicode_4BYTE_KIND);
662
0
            CHECK(ascii->state.compact == 0);
663
0
            CHECK(data != NULL);
664
0
            if (ascii->state.ascii) {
665
0
                CHECK(_PyUnicode_UTF8(op) == data);
666
0
                CHECK(compact->utf8_length == ascii->length);
667
0
            }
668
0
            else {
669
0
                CHECK(_PyUnicode_UTF8(op) != data);
670
0
            }
671
0
        }
672
0
#ifndef Py_GIL_DISABLED
673
0
        if (_PyUnicode_UTF8(op) == NULL)
674
0
            CHECK(compact->utf8_length == 0);
675
0
#endif
676
0
    }
677
678
    /* check that the best kind is used: O(n) operation */
679
0
    if (check_content) {
680
0
        Py_ssize_t i;
681
0
        Py_UCS4 maxchar = 0;
682
0
        const void *data;
683
0
        Py_UCS4 ch;
684
685
0
        data = PyUnicode_DATA(ascii);
686
0
        for (i=0; i < ascii->length; i++)
687
0
        {
688
0
            ch = PyUnicode_READ(kind, data, i);
689
0
            if (ch > maxchar)
690
0
                maxchar = ch;
691
0
        }
692
0
        if (kind == PyUnicode_1BYTE_KIND) {
693
0
            if (ascii->state.ascii == 0) {
694
0
                CHECK(maxchar >= 128);
695
0
                CHECK(maxchar <= 255);
696
0
            }
697
0
            else
698
0
                CHECK(maxchar < 128);
699
0
        }
700
0
        else if (kind == PyUnicode_2BYTE_KIND) {
701
0
            CHECK(maxchar >= 0x100);
702
0
            CHECK(maxchar <= 0xFFFF);
703
0
        }
704
0
        else {
705
0
            CHECK(maxchar >= 0x10000);
706
0
            CHECK(maxchar <= MAX_UNICODE);
707
0
        }
708
0
        CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
709
0
    }
710
711
    /* Check interning state */
712
#ifdef Py_DEBUG
713
    // Note that we do not check `_Py_IsImmortal(op)`, since stable ABI
714
    // extensions can make immortal strings mortal (but with a high enough
715
    // refcount).
716
    // The other way is extremely unlikely (worth a potential failed assertion
717
    // in a debug build), so we do check `!_Py_IsImmortal(op)`.
718
    switch (PyUnicode_CHECK_INTERNED(op)) {
719
        case SSTATE_NOT_INTERNED:
720
            if (ascii->state.statically_allocated) {
721
                // This state is for two exceptions:
722
                // - strings are currently checked before they're interned
723
                // - the 256 one-latin1-character strings
724
                //   are static but use SSTATE_NOT_INTERNED
725
            }
726
            else {
727
                CHECK(!_Py_IsImmortal(op));
728
            }
729
            break;
730
        case SSTATE_INTERNED_MORTAL:
731
            CHECK(!ascii->state.statically_allocated);
732
            CHECK(!_Py_IsImmortal(op));
733
            break;
734
        case SSTATE_INTERNED_IMMORTAL:
735
            CHECK(!ascii->state.statically_allocated);
736
            break;
737
        case SSTATE_INTERNED_IMMORTAL_STATIC:
738
            CHECK(ascii->state.statically_allocated);
739
            break;
740
        default:
741
            Py_UNREACHABLE();
742
    }
743
#endif
744
745
0
    return 1;
746
747
0
#undef CHECK
748
0
}
749
750
static PyObject*
751
unicode_result(PyObject *unicode)
752
48.9M
{
753
48.9M
    assert(_PyUnicode_CHECK(unicode));
754
755
48.9M
    Py_ssize_t length = PyUnicode_GET_LENGTH(unicode);
756
48.9M
    if (length == 0) {
757
247
        PyObject *empty = unicode_get_empty();
758
247
        if (unicode != empty) {
759
0
            Py_DECREF(unicode);
760
0
        }
761
247
        return empty;
762
247
    }
763
764
48.9M
    if (length == 1) {
765
272k
        int kind = PyUnicode_KIND(unicode);
766
272k
        if (kind == PyUnicode_1BYTE_KIND) {
767
86.9k
            const Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
768
86.9k
            Py_UCS1 ch = data[0];
769
86.9k
            PyObject *latin1_char = LATIN1(ch);
770
86.9k
            if (unicode != latin1_char) {
771
81.3k
                Py_DECREF(unicode);
772
81.3k
            }
773
86.9k
            return latin1_char;
774
86.9k
        }
775
272k
    }
776
777
48.9M
    assert(_PyUnicode_CheckConsistency(unicode, 1));
778
48.8M
    return unicode;
779
48.9M
}
780
781
static PyObject*
782
unicode_result_unchanged(PyObject *unicode)
783
141M
{
784
141M
    if (PyUnicode_CheckExact(unicode)) {
785
138M
        return Py_NewRef(unicode);
786
138M
    }
787
3.24M
    else
788
        /* Subtype -- return genuine unicode string with the same value. */
789
3.24M
        return _PyUnicode_Copy(unicode);
790
141M
}
791
792
/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
793
   ASCII, Latin1, UTF-8, etc. */
794
static char*
795
backslashreplace(PyBytesWriter *writer, char *str,
796
                 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
797
0
{
798
0
    Py_ssize_t size, i;
799
0
    Py_UCS4 ch;
800
0
    int kind;
801
0
    const void *data;
802
803
0
    kind = PyUnicode_KIND(unicode);
804
0
    data = PyUnicode_DATA(unicode);
805
806
0
    size = 0;
807
    /* determine replacement size */
808
0
    for (i = collstart; i < collend; ++i) {
809
0
        Py_ssize_t incr;
810
811
0
        ch = PyUnicode_READ(kind, data, i);
812
0
        if (ch < 0x100)
813
0
            incr = 2+2;
814
0
        else if (ch < 0x10000)
815
0
            incr = 2+4;
816
0
        else {
817
0
            assert(ch <= MAX_UNICODE);
818
0
            incr = 2+8;
819
0
        }
820
0
        if (size > PY_SSIZE_T_MAX - incr) {
821
0
            PyErr_SetString(PyExc_OverflowError,
822
0
                            "encoded result is too long for a Python string");
823
0
            return NULL;
824
0
        }
825
0
        size += incr;
826
0
    }
827
828
0
    str = PyBytesWriter_GrowAndUpdatePointer(writer, size, str);
829
0
    if (str == NULL) {
830
0
        return NULL;
831
0
    }
832
833
    /* generate replacement */
834
0
    for (i = collstart; i < collend; ++i) {
835
0
        ch = PyUnicode_READ(kind, data, i);
836
0
        *str++ = '\\';
837
0
        if (ch >= 0x00010000) {
838
0
            *str++ = 'U';
839
0
            *str++ = Py_hexdigits[(ch>>28)&0xf];
840
0
            *str++ = Py_hexdigits[(ch>>24)&0xf];
841
0
            *str++ = Py_hexdigits[(ch>>20)&0xf];
842
0
            *str++ = Py_hexdigits[(ch>>16)&0xf];
843
0
            *str++ = Py_hexdigits[(ch>>12)&0xf];
844
0
            *str++ = Py_hexdigits[(ch>>8)&0xf];
845
0
        }
846
0
        else if (ch >= 0x100) {
847
0
            *str++ = 'u';
848
0
            *str++ = Py_hexdigits[(ch>>12)&0xf];
849
0
            *str++ = Py_hexdigits[(ch>>8)&0xf];
850
0
        }
851
0
        else
852
0
            *str++ = 'x';
853
0
        *str++ = Py_hexdigits[(ch>>4)&0xf];
854
0
        *str++ = Py_hexdigits[ch&0xf];
855
0
    }
856
0
    return str;
857
0
}
858
859
/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
860
   ASCII, Latin1, UTF-8, etc. */
861
static char*
862
xmlcharrefreplace(PyBytesWriter *writer, char *str,
863
                  PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
864
0
{
865
0
    Py_ssize_t size, i;
866
0
    Py_UCS4 ch;
867
0
    int kind;
868
0
    const void *data;
869
870
0
    kind = PyUnicode_KIND(unicode);
871
0
    data = PyUnicode_DATA(unicode);
872
873
0
    size = 0;
874
    /* determine replacement size */
875
0
    for (i = collstart; i < collend; ++i) {
876
0
        Py_ssize_t incr;
877
878
0
        ch = PyUnicode_READ(kind, data, i);
879
0
        if (ch < 10)
880
0
            incr = 2+1+1;
881
0
        else if (ch < 100)
882
0
            incr = 2+2+1;
883
0
        else if (ch < 1000)
884
0
            incr = 2+3+1;
885
0
        else if (ch < 10000)
886
0
            incr = 2+4+1;
887
0
        else if (ch < 100000)
888
0
            incr = 2+5+1;
889
0
        else if (ch < 1000000)
890
0
            incr = 2+6+1;
891
0
        else {
892
0
            assert(ch <= MAX_UNICODE);
893
0
            incr = 2+7+1;
894
0
        }
895
0
        if (size > PY_SSIZE_T_MAX - incr) {
896
0
            PyErr_SetString(PyExc_OverflowError,
897
0
                            "encoded result is too long for a Python string");
898
0
            return NULL;
899
0
        }
900
0
        size += incr;
901
0
    }
902
903
0
    str = PyBytesWriter_GrowAndUpdatePointer(writer, size, str);
904
0
    if (str == NULL) {
905
0
        return NULL;
906
0
    }
907
908
    /* generate replacement */
909
0
    for (i = collstart; i < collend; ++i) {
910
0
        size = sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
911
0
        if (size < 0) {
912
0
            return NULL;
913
0
        }
914
0
        str += size;
915
0
    }
916
0
    return str;
917
0
}
918
919
/* --- Bloom Filters ----------------------------------------------------- */
920
921
/* stuff to implement simple "bloom filters" for Unicode characters.
922
   to keep things simple, we use a single bitmask, using the low log2(BLOOM_WIDTH)
923
   bits from each unicode character as the bit index. */
924
925
/* the linebreak mask is set up by _PyUnicode_Init() below */
926
927
/* BLOOM_WIDTH is the number of bits used in a bloom mask; it is selected from
   LONG_BIT so that a mask always fits in an unsigned long. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

/* Integer type holding a bloom mask. */
#define BLOOM_MASK unsigned long

/* Starts with every bit set, so BLOOM() never rejects a character. */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;

/* Non-zero if the bit selected by the low bits of `ch` is set in `mask`.
   A zero result means `ch` is definitely not in the set; a non-zero result
   means it may be.  `mask` is parenthesized so that an expression argument
   (e.g. `a | b`) is tested as a whole. */
#define BLOOM(mask, ch)     (((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* True if `ch` is a line break: table lookup for ch < 128, otherwise the
   bloom mask is consulted first and Py_UNICODE_ISLINEBREAK() only runs for
   characters the mask does not rule out. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
946
947
static inline BLOOM_MASK
948
make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
949
8.47M
{
950
8.47M
#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN)             \
951
8.47M
    do {                                               \
952
8.47M
        TYPE *data = (TYPE *)PTR;                      \
953
8.47M
        TYPE *end = data + LEN;                        \
954
8.47M
        Py_UCS4 ch;                                    \
955
18.5M
        for (; data != end; data++) {                  \
956
10.0M
            ch = *data;                                \
957
10.0M
            MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
958
10.0M
        }                                              \
959
8.47M
        break;                                         \
960
8.47M
    } while (0)
961
962
    /* calculate simple bloom-style bitmask for a given unicode string */
963
964
8.47M
    BLOOM_MASK mask;
965
966
8.47M
    mask = 0;
967
8.47M
    switch (kind) {
968
8.47M
    case PyUnicode_1BYTE_KIND:
969
8.47M
        BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
970
8.47M
        break;
971
16
    case PyUnicode_2BYTE_KIND:
972
16
        BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
973
16
        break;
974
0
    case PyUnicode_4BYTE_KIND:
975
0
        BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
976
0
        break;
977
0
    default:
978
0
        Py_UNREACHABLE();
979
8.47M
    }
980
8.47M
    return mask;
981
982
8.47M
#undef BLOOM_UPDATE
983
8.47M
}
984
985
/* Compilation of templated routines */
986
987
1.06M
#define STRINGLIB_GET_EMPTY() unicode_get_empty()
988
989
#include "stringlib/asciilib.h"
990
#include "stringlib/fastsearch.h"
991
#include "stringlib/partition.h"
992
#include "stringlib/split.h"
993
#include "stringlib/count.h"
994
#include "stringlib/find.h"
995
#include "stringlib/find_max_char.h"
996
#include "stringlib/undef.h"
997
998
#include "stringlib/ucs1lib.h"
999
#include "stringlib/fastsearch.h"
1000
#include "stringlib/partition.h"
1001
#include "stringlib/split.h"
1002
#include "stringlib/count.h"
1003
#include "stringlib/find.h"
1004
#include "stringlib/replace.h"
1005
#include "stringlib/repr.h"
1006
#include "stringlib/find_max_char.h"
1007
#include "stringlib/undef.h"
1008
1009
#include "stringlib/ucs2lib.h"
1010
#include "stringlib/fastsearch.h"
1011
#include "stringlib/partition.h"
1012
#include "stringlib/split.h"
1013
#include "stringlib/count.h"
1014
#include "stringlib/find.h"
1015
#include "stringlib/replace.h"
1016
#include "stringlib/repr.h"
1017
#include "stringlib/find_max_char.h"
1018
#include "stringlib/undef.h"
1019
1020
#include "stringlib/ucs4lib.h"
1021
#include "stringlib/fastsearch.h"
1022
#include "stringlib/partition.h"
1023
#include "stringlib/split.h"
1024
#include "stringlib/count.h"
1025
#include "stringlib/find.h"
1026
#include "stringlib/replace.h"
1027
#include "stringlib/repr.h"
1028
#include "stringlib/find_max_char.h"
1029
#include "stringlib/undef.h"
1030
1031
#undef STRINGLIB_GET_EMPTY
1032
1033
/* --- Unicode Object ----------------------------------------------------- */
1034
1035
static inline Py_ssize_t
1036
findchar(const void *s, int kind,
1037
         Py_ssize_t size, Py_UCS4 ch,
1038
         int direction)
1039
106M
{
1040
106M
    switch (kind) {
1041
98.0M
    case PyUnicode_1BYTE_KIND:
1042
98.0M
        if ((Py_UCS1) ch != ch)
1043
3.54k
            return -1;
1044
98.0M
        if (direction > 0)
1045
98.0M
            return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
1046
8.01k
        else
1047
8.01k
            return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
1048
7.40M
    case PyUnicode_2BYTE_KIND:
1049
7.40M
        if ((Py_UCS2) ch != ch)
1050
0
            return -1;
1051
7.40M
        if (direction > 0)
1052
7.38M
            return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
1053
23.7k
        else
1054
23.7k
            return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
1055
828k
    case PyUnicode_4BYTE_KIND:
1056
828k
        if (direction > 0)
1057
743k
            return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
1058
84.2k
        else
1059
84.2k
            return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
1060
0
    default:
1061
0
        Py_UNREACHABLE();
1062
106M
    }
1063
106M
}
1064
1065
#ifdef Py_DEBUG
1066
/* Fill the data of a Unicode string with invalid characters to detect bugs
1067
   earlier.
1068
1069
   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
1070
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
1071
   invalid character in Unicode 6.0. */
1072
static void
1073
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
1074
{
1075
    int kind = PyUnicode_KIND(unicode);
1076
    Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
1077
    Py_ssize_t length = _PyUnicode_LENGTH(unicode);
1078
    if (length <= old_length)
1079
        return;
1080
    memset(data + old_length * kind, 0xff, (length - old_length) * kind);
1081
}
1082
#endif
1083
1084
static PyObject*
1085
resize_copy(PyObject *unicode, Py_ssize_t length)
1086
0
{
1087
0
    Py_ssize_t copy_length;
1088
0
    PyObject *copy;
1089
1090
0
    copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1091
0
    if (copy == NULL)
1092
0
        return NULL;
1093
1094
0
    copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
1095
0
    _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
1096
0
    return copy;
1097
0
}
1098
1099
static PyObject*
1100
resize_compact(PyObject *unicode, Py_ssize_t length)
1101
59.7M
{
1102
59.7M
    Py_ssize_t char_size;
1103
59.7M
    Py_ssize_t struct_size;
1104
59.7M
    Py_ssize_t new_size;
1105
59.7M
    PyObject *new_unicode;
1106
#ifdef Py_DEBUG
1107
    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1108
#endif
1109
1110
59.7M
    if (!_PyUnicode_IsModifiable(unicode)) {
1111
0
        PyObject *copy = resize_copy(unicode, length);
1112
0
        if (copy == NULL) {
1113
0
            return NULL;
1114
0
        }
1115
0
        Py_DECREF(unicode);
1116
0
        return copy;
1117
0
    }
1118
59.7M
    assert(PyUnicode_IS_COMPACT(unicode));
1119
1120
59.7M
    char_size = PyUnicode_KIND(unicode);
1121
59.7M
    if (PyUnicode_IS_ASCII(unicode))
1122
51.1M
        struct_size = sizeof(PyASCIIObject);
1123
8.64M
    else
1124
8.64M
        struct_size = sizeof(PyCompactUnicodeObject);
1125
1126
59.7M
    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1127
0
        PyErr_NoMemory();
1128
0
        return NULL;
1129
0
    }
1130
59.7M
    new_size = (struct_size + (length + 1) * char_size);
1131
1132
59.7M
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1133
0
        PyMem_Free(_PyUnicode_UTF8(unicode));
1134
0
        PyUnicode_SET_UTF8_LENGTH(unicode, 0);
1135
0
        PyUnicode_SET_UTF8(unicode, NULL);
1136
0
    }
1137
#ifdef Py_TRACE_REFS
1138
    _Py_ForgetReference(unicode);
1139
#endif
1140
59.7M
    _PyReftracerTrack(unicode, PyRefTracer_DESTROY);
1141
1142
59.7M
    new_unicode = (PyObject *)PyObject_Realloc(unicode, new_size);
1143
59.7M
    if (new_unicode == NULL) {
1144
0
        _Py_NewReferenceNoTotal(unicode);
1145
0
        PyErr_NoMemory();
1146
0
        return NULL;
1147
0
    }
1148
59.7M
    unicode = new_unicode;
1149
59.7M
    _Py_NewReferenceNoTotal(unicode);
1150
1151
59.7M
    _PyUnicode_LENGTH(unicode) = length;
1152
#ifdef Py_DEBUG
1153
    unicode_fill_invalid(unicode, old_length);
1154
#endif
1155
59.7M
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1156
59.7M
                    length, 0);
1157
59.7M
    assert(_PyUnicode_CheckConsistency(unicode, 0));
1158
59.7M
    return unicode;
1159
59.7M
}
1160
1161
static int
1162
resize_inplace(PyObject *unicode, Py_ssize_t length)
1163
0
{
1164
0
    assert(!PyUnicode_IS_COMPACT(unicode));
1165
0
    assert(Py_REFCNT(unicode) == 1);
1166
1167
0
    Py_ssize_t new_size;
1168
0
    Py_ssize_t char_size;
1169
0
    int share_utf8;
1170
0
    void *data;
1171
#ifdef Py_DEBUG
1172
    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1173
#endif
1174
1175
0
    data = _PyUnicode_DATA_ANY(unicode);
1176
0
    char_size = PyUnicode_KIND(unicode);
1177
0
    share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
1178
1179
0
    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1180
0
        PyErr_NoMemory();
1181
0
        return -1;
1182
0
    }
1183
0
    new_size = (length + 1) * char_size;
1184
1185
0
    if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1186
0
    {
1187
0
        PyMem_Free(_PyUnicode_UTF8(unicode));
1188
0
        PyUnicode_SET_UTF8_LENGTH(unicode, 0);
1189
0
        PyUnicode_SET_UTF8(unicode, NULL);
1190
0
    }
1191
1192
0
    data = (PyObject *)PyObject_Realloc(data, new_size);
1193
0
    if (data == NULL) {
1194
0
        PyErr_NoMemory();
1195
0
        return -1;
1196
0
    }
1197
0
    _PyUnicode_DATA_ANY(unicode) = data;
1198
0
    if (share_utf8) {
1199
0
        PyUnicode_SET_UTF8_LENGTH(unicode, length);
1200
0
        PyUnicode_SET_UTF8(unicode, data);
1201
0
    }
1202
0
    _PyUnicode_LENGTH(unicode) = length;
1203
0
    PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
1204
#ifdef Py_DEBUG
1205
    unicode_fill_invalid(unicode, old_length);
1206
#endif
1207
1208
    /* check for integer overflow */
1209
0
    if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
1210
0
        PyErr_NoMemory();
1211
0
        return -1;
1212
0
    }
1213
0
    assert(_PyUnicode_CheckConsistency(unicode, 0));
1214
0
    return 0;
1215
0
}
1216
1217
static const char*
1218
unicode_kind_name(PyObject *unicode)
1219
0
{
1220
    /* don't check consistency: unicode_kind_name() is called from
1221
       _PyUnicode_Dump() */
1222
0
    if (!PyUnicode_IS_COMPACT(unicode))
1223
0
    {
1224
0
        switch (PyUnicode_KIND(unicode))
1225
0
        {
1226
0
        case PyUnicode_1BYTE_KIND:
1227
0
            if (PyUnicode_IS_ASCII(unicode))
1228
0
                return "legacy ascii";
1229
0
            else
1230
0
                return "legacy latin1";
1231
0
        case PyUnicode_2BYTE_KIND:
1232
0
            return "legacy UCS2";
1233
0
        case PyUnicode_4BYTE_KIND:
1234
0
            return "legacy UCS4";
1235
0
        default:
1236
0
            return "<legacy invalid kind>";
1237
0
        }
1238
0
    }
1239
0
    switch (PyUnicode_KIND(unicode)) {
1240
0
    case PyUnicode_1BYTE_KIND:
1241
0
        if (PyUnicode_IS_ASCII(unicode))
1242
0
            return "ascii";
1243
0
        else
1244
0
            return "latin1";
1245
0
    case PyUnicode_2BYTE_KIND:
1246
0
        return "UCS2";
1247
0
    case PyUnicode_4BYTE_KIND:
1248
0
        return "UCS4";
1249
0
    default:
1250
0
        return "<invalid compact kind>";
1251
0
    }
1252
0
}
1253
1254
#ifdef Py_DEBUG
1255
/* Functions wrapping macros for use in debugger */
1256
const char *_PyUnicode_utf8(void *unicode_raw){
1257
    PyObject *unicode = _PyObject_CAST(unicode_raw);
1258
    return PyUnicode_UTF8(unicode);
1259
}
1260
1261
const void *_PyUnicode_compact_data(void *unicode_raw) {
1262
    PyObject *unicode = _PyObject_CAST(unicode_raw);
1263
    return _PyUnicode_COMPACT_DATA(unicode);
1264
}
1265
const void *_PyUnicode_data(void *unicode_raw) {
1266
    PyObject *unicode = _PyObject_CAST(unicode_raw);
1267
    printf("obj %p\n", (void*)unicode);
1268
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1269
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1270
    printf("ascii op %p\n", (void*)(_PyASCIIObject_CAST(unicode) + 1));
1271
    printf("compact op %p\n", (void*)(_PyCompactUnicodeObject_CAST(unicode) + 1));
1272
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1273
    return PyUnicode_DATA(unicode);
1274
}
1275
1276
void
1277
_PyUnicode_Dump(PyObject *op)
1278
{
1279
    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
1280
    PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
1281
    PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
1282
    const void *data;
1283
1284
    if (ascii->state.compact)
1285
    {
1286
        if (ascii->state.ascii)
1287
            data = (ascii + 1);
1288
        else
1289
            data = (compact + 1);
1290
    }
1291
    else
1292
        data = unicode->data.any;
1293
    printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);
1294
1295
    if (!ascii->state.ascii) {
1296
        printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
1297
    }
1298
    printf(", data=%p\n", data);
1299
}
1300
#endif
1301
1302
1303
PyObject *
1304
PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1305
552M
{
1306
    /* Optimization for empty strings */
1307
552M
    if (size == 0) {
1308
25.4M
        return unicode_get_empty();
1309
25.4M
    }
1310
1311
526M
    PyObject *obj;
1312
526M
    PyCompactUnicodeObject *unicode;
1313
526M
    void *data;
1314
526M
    int kind;
1315
526M
    int is_ascii;
1316
526M
    Py_ssize_t char_size;
1317
526M
    Py_ssize_t struct_size;
1318
1319
526M
    is_ascii = 0;
1320
526M
    struct_size = sizeof(PyCompactUnicodeObject);
1321
526M
    if (maxchar < 128) {
1322
302M
        kind = PyUnicode_1BYTE_KIND;
1323
302M
        char_size = 1;
1324
302M
        is_ascii = 1;
1325
302M
        struct_size = sizeof(PyASCIIObject);
1326
302M
    }
1327
224M
    else if (maxchar < 256) {
1328
25.1M
        kind = PyUnicode_1BYTE_KIND;
1329
25.1M
        char_size = 1;
1330
25.1M
    }
1331
199M
    else if (maxchar < 65536) {
1332
191M
        kind = PyUnicode_2BYTE_KIND;
1333
191M
        char_size = 2;
1334
191M
    }
1335
7.10M
    else {
1336
7.10M
        if (maxchar > MAX_UNICODE) {
1337
0
            PyErr_SetString(PyExc_SystemError,
1338
0
                            "invalid maximum character passed to PyUnicode_New");
1339
0
            return NULL;
1340
0
        }
1341
7.10M
        kind = PyUnicode_4BYTE_KIND;
1342
7.10M
        char_size = 4;
1343
7.10M
    }
1344
1345
    /* Ensure we won't overflow the size. */
1346
526M
    if (size < 0) {
1347
0
        PyErr_SetString(PyExc_SystemError,
1348
0
                        "Negative size passed to PyUnicode_New");
1349
0
        return NULL;
1350
0
    }
1351
526M
    if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1352
0
        return PyErr_NoMemory();
1353
1354
    /* Duplicated allocation code from _PyObject_New() instead of a call to
1355
     * PyObject_New() so we are able to allocate space for the object and
1356
     * it's data buffer.
1357
     */
1358
526M
    obj = (PyObject *) PyObject_Malloc(struct_size + (size + 1) * char_size);
1359
526M
    if (obj == NULL) {
1360
0
        return PyErr_NoMemory();
1361
0
    }
1362
526M
    _PyObject_Init(obj, &PyUnicode_Type);
1363
1364
526M
    unicode = (PyCompactUnicodeObject *)obj;
1365
526M
    if (is_ascii)
1366
302M
        data = ((PyASCIIObject*)obj) + 1;
1367
224M
    else
1368
224M
        data = unicode + 1;
1369
526M
    _PyUnicode_LENGTH(unicode) = size;
1370
526M
    _PyUnicode_HASH(unicode) = -1;
1371
526M
    _PyUnicode_STATE(unicode).interned = 0;
1372
526M
    _PyUnicode_STATE(unicode).kind = kind;
1373
526M
    _PyUnicode_STATE(unicode).compact = 1;
1374
526M
    _PyUnicode_STATE(unicode).ascii = is_ascii;
1375
526M
    _PyUnicode_STATE(unicode).statically_allocated = 0;
1376
526M
    if (is_ascii) {
1377
302M
        ((char*)data)[size] = 0;
1378
302M
    }
1379
224M
    else if (kind == PyUnicode_1BYTE_KIND) {
1380
25.1M
        ((char*)data)[size] = 0;
1381
25.1M
        unicode->utf8 = NULL;
1382
25.1M
        unicode->utf8_length = 0;
1383
25.1M
    }
1384
199M
    else {
1385
199M
        unicode->utf8 = NULL;
1386
199M
        unicode->utf8_length = 0;
1387
199M
        if (kind == PyUnicode_2BYTE_KIND)
1388
191M
            ((Py_UCS2*)data)[size] = 0;
1389
7.10M
        else /* kind == PyUnicode_4BYTE_KIND */
1390
7.10M
            ((Py_UCS4*)data)[size] = 0;
1391
199M
    }
1392
#ifdef Py_DEBUG
1393
    unicode_fill_invalid((PyObject*)unicode, 0);
1394
#endif
1395
526M
    assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1396
526M
    return obj;
1397
526M
}
1398
1399
static int
1400
unicode_check_modifiable(PyObject *unicode)
1401
680
{
1402
680
    if (!_PyUnicode_IsModifiable(unicode)) {
1403
0
        PyErr_SetString(PyExc_SystemError,
1404
0
                        "Cannot modify a string currently used");
1405
0
        return -1;
1406
0
    }
1407
680
    return 0;
1408
680
}
1409
1410
static int
1411
_copy_characters(PyObject *to, Py_ssize_t to_start,
1412
                 PyObject *from, Py_ssize_t from_start,
1413
                 Py_ssize_t how_many, int check_maxchar)
1414
280M
{
1415
280M
    int from_kind, to_kind;
1416
280M
    const void *from_data;
1417
280M
    void *to_data;
1418
1419
280M
    assert(0 <= how_many);
1420
280M
    assert(0 <= from_start);
1421
280M
    assert(0 <= to_start);
1422
280M
    assert(PyUnicode_Check(from));
1423
280M
    assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1424
1425
280M
    assert(to == NULL || PyUnicode_Check(to));
1426
1427
280M
    if (how_many == 0) {
1428
270k
        return 0;
1429
270k
    }
1430
1431
280M
    assert(to != NULL);
1432
280M
    assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1433
1434
280M
    from_kind = PyUnicode_KIND(from);
1435
280M
    from_data = PyUnicode_DATA(from);
1436
280M
    to_kind = PyUnicode_KIND(to);
1437
280M
    to_data = PyUnicode_DATA(to);
1438
1439
#ifdef Py_DEBUG
1440
    if (!check_maxchar
1441
        && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1442
    {
1443
        Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1444
        Py_UCS4 ch;
1445
        Py_ssize_t i;
1446
        for (i=0; i < how_many; i++) {
1447
            ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1448
            assert(ch <= to_maxchar);
1449
        }
1450
    }
1451
#endif
1452
1453
280M
    if (from_kind == to_kind) {
1454
183M
        if (check_maxchar
1455
0
            && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1456
0
        {
1457
            /* Writing Latin-1 characters into an ASCII string requires to
1458
               check that all written characters are pure ASCII */
1459
0
            Py_UCS4 max_char;
1460
0
            max_char = ucs1lib_find_max_char(from_data,
1461
0
                                             (const Py_UCS1*)from_data + how_many);
1462
0
            if (max_char >= 128)
1463
0
                return -1;
1464
0
        }
1465
183M
        memcpy((char*)to_data + to_kind * to_start,
1466
183M
                  (const char*)from_data + from_kind * from_start,
1467
183M
                  to_kind * how_many);
1468
183M
    }
1469
96.9M
    else if (from_kind == PyUnicode_1BYTE_KIND
1470
95.0M
             && to_kind == PyUnicode_2BYTE_KIND)
1471
80.8M
    {
1472
80.8M
        _PyUnicode_CONVERT_BYTES(
1473
80.8M
            Py_UCS1, Py_UCS2,
1474
80.8M
            PyUnicode_1BYTE_DATA(from) + from_start,
1475
80.8M
            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1476
80.8M
            PyUnicode_2BYTE_DATA(to) + to_start
1477
80.8M
            );
1478
80.8M
    }
1479
16.1M
    else if (from_kind == PyUnicode_1BYTE_KIND
1480
14.1M
             && to_kind == PyUnicode_4BYTE_KIND)
1481
14.1M
    {
1482
14.1M
        _PyUnicode_CONVERT_BYTES(
1483
14.1M
            Py_UCS1, Py_UCS4,
1484
14.1M
            PyUnicode_1BYTE_DATA(from) + from_start,
1485
14.1M
            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1486
14.1M
            PyUnicode_4BYTE_DATA(to) + to_start
1487
14.1M
            );
1488
14.1M
    }
1489
1.93M
    else if (from_kind == PyUnicode_2BYTE_KIND
1490
1.90M
             && to_kind == PyUnicode_4BYTE_KIND)
1491
1.90M
    {
1492
1.90M
        _PyUnicode_CONVERT_BYTES(
1493
1.90M
            Py_UCS2, Py_UCS4,
1494
1.90M
            PyUnicode_2BYTE_DATA(from) + from_start,
1495
1.90M
            PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1496
1.90M
            PyUnicode_4BYTE_DATA(to) + to_start
1497
1.90M
            );
1498
1.90M
    }
1499
28.7k
    else {
1500
28.7k
        assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1501
1502
28.7k
        if (!check_maxchar) {
1503
28.7k
            if (from_kind == PyUnicode_2BYTE_KIND
1504
2.84k
                && to_kind == PyUnicode_1BYTE_KIND)
1505
2.84k
            {
1506
2.84k
                _PyUnicode_CONVERT_BYTES(
1507
2.84k
                    Py_UCS2, Py_UCS1,
1508
2.84k
                    PyUnicode_2BYTE_DATA(from) + from_start,
1509
2.84k
                    PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1510
2.84k
                    PyUnicode_1BYTE_DATA(to) + to_start
1511
2.84k
                    );
1512
2.84k
            }
1513
25.8k
            else if (from_kind == PyUnicode_4BYTE_KIND
1514
25.8k
                     && to_kind == PyUnicode_1BYTE_KIND)
1515
8.15k
            {
1516
8.15k
                _PyUnicode_CONVERT_BYTES(
1517
8.15k
                    Py_UCS4, Py_UCS1,
1518
8.15k
                    PyUnicode_4BYTE_DATA(from) + from_start,
1519
8.15k
                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1520
8.15k
                    PyUnicode_1BYTE_DATA(to) + to_start
1521
8.15k
                    );
1522
8.15k
            }
1523
17.7k
            else if (from_kind == PyUnicode_4BYTE_KIND
1524
17.7k
                     && to_kind == PyUnicode_2BYTE_KIND)
1525
17.7k
            {
1526
17.7k
                _PyUnicode_CONVERT_BYTES(
1527
17.7k
                    Py_UCS4, Py_UCS2,
1528
17.7k
                    PyUnicode_4BYTE_DATA(from) + from_start,
1529
17.7k
                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1530
17.7k
                    PyUnicode_2BYTE_DATA(to) + to_start
1531
17.7k
                    );
1532
17.7k
            }
1533
0
            else {
1534
0
                Py_UNREACHABLE();
1535
0
            }
1536
28.7k
        }
1537
0
        else {
1538
0
            const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1539
0
            Py_UCS4 ch;
1540
0
            Py_ssize_t i;
1541
1542
0
            for (i=0; i < how_many; i++) {
1543
0
                ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1544
0
                if (ch > to_maxchar)
1545
0
                    return -1;
1546
0
                PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1547
0
            }
1548
0
        }
1549
28.7k
    }
1550
280M
    return 0;
1551
280M
}
1552
1553
void
1554
_PyUnicode_FastCopyCharacters(
1555
    PyObject *to, Py_ssize_t to_start,
1556
    PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1557
280M
{
1558
280M
    (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1559
280M
}
1560
1561
Py_ssize_t
1562
PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1563
                         PyObject *from, Py_ssize_t from_start,
1564
                         Py_ssize_t how_many)
1565
0
{
1566
0
    int err;
1567
1568
0
    if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1569
0
        PyErr_BadInternalCall();
1570
0
        return -1;
1571
0
    }
1572
1573
0
    if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
1574
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
1575
0
        return -1;
1576
0
    }
1577
0
    if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
1578
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
1579
0
        return -1;
1580
0
    }
1581
0
    if (how_many < 0) {
1582
0
        PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1583
0
        return -1;
1584
0
    }
1585
0
    how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
1586
0
    if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1587
0
        PyErr_Format(PyExc_SystemError,
1588
0
                     "Cannot write %zi characters at %zi "
1589
0
                     "in a string of %zi characters",
1590
0
                     how_many, to_start, PyUnicode_GET_LENGTH(to));
1591
0
        return -1;
1592
0
    }
1593
1594
0
    if (how_many == 0)
1595
0
        return 0;
1596
1597
0
    if (unicode_check_modifiable(to))
1598
0
        return -1;
1599
1600
0
    err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1601
0
    if (err) {
1602
0
        PyErr_Format(PyExc_SystemError,
1603
0
                     "Cannot copy %s characters "
1604
0
                     "into a string of %s characters",
1605
0
                     unicode_kind_name(from),
1606
0
                     unicode_kind_name(to));
1607
0
        return -1;
1608
0
    }
1609
0
    return how_many;
1610
0
}
1611
1612
/* Find the maximum code point and count the number of surrogate pairs so a
1613
   correct string length can be computed before converting a string to UCS4.
1614
   This function counts single surrogates as a character and not as a pair.
1615
1616
   Return 0 on success, or -1 on error. */
1617
static int
1618
find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1619
                        Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1620
16.3k
{
1621
16.3k
    const wchar_t *iter;
1622
16.3k
    Py_UCS4 ch;
1623
1624
16.3k
    assert(num_surrogates != NULL && maxchar != NULL);
1625
16.3k
    *num_surrogates = 0;
1626
16.3k
    *maxchar = 0;
1627
1628
359k
    for (iter = begin; iter < end; ) {
1629
#if SIZEOF_WCHAR_T == 2
1630
        if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1631
            && (iter+1) < end
1632
            && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1633
        {
1634
            ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1635
            ++(*num_surrogates);
1636
            iter += 2;
1637
        }
1638
        else
1639
#endif
1640
342k
        {
1641
342k
            ch = *iter;
1642
342k
            iter++;
1643
342k
        }
1644
342k
        if (ch > *maxchar) {
1645
70.2k
            *maxchar = ch;
1646
70.2k
            if (*maxchar > MAX_UNICODE) {
1647
0
                PyErr_Format(PyExc_ValueError,
1648
0
                             "character U+%x is not in range [U+0000; U+%x]",
1649
0
                             ch, MAX_UNICODE);
1650
0
                return -1;
1651
0
            }
1652
70.2k
        }
1653
342k
    }
1654
16.3k
    return 0;
1655
16.3k
}
1656
1657
static void
1658
unicode_dealloc(PyObject *unicode)
1659
537M
{
1660
#ifdef Py_DEBUG
1661
    if (!unicode_is_finalizing() && unicode_is_singleton(unicode)) {
1662
        _Py_FatalRefcountError("deallocating an Unicode singleton");
1663
    }
1664
#endif
1665
537M
    if (_PyUnicode_STATE(unicode).statically_allocated) {
1666
        /* This should never get called, but we also don't want to SEGV if
1667
        * we accidentally decref an immortal string out of existence. Since
1668
        * the string is an immortal object, just re-set the reference count.
1669
        */
1670
#ifdef Py_DEBUG
1671
        Py_UNREACHABLE();
1672
#endif
1673
0
        _Py_SetImmortal(unicode);
1674
0
        return;
1675
0
    }
1676
537M
    switch (_PyUnicode_STATE(unicode).interned) {
1677
537M
        case SSTATE_NOT_INTERNED:
1678
537M
            break;
1679
541k
        case SSTATE_INTERNED_MORTAL:
1680
            /* Remove the object from the intern dict.
1681
             * Before doing so, we set the refcount to 2: the key and value
1682
             * in the interned_dict.
1683
             */
1684
541k
            assert(Py_REFCNT(unicode) == 0);
1685
541k
            Py_SET_REFCNT(unicode, 2);
1686
#ifdef Py_REF_DEBUG
1687
            /* let's be pedantic with the ref total */
1688
            _Py_IncRefTotal(_PyThreadState_GET());
1689
            _Py_IncRefTotal(_PyThreadState_GET());
1690
#endif
1691
541k
            PyInterpreterState *interp = _PyInterpreterState_GET();
1692
541k
            PyObject *interned = get_interned_dict(interp);
1693
541k
            assert(interned != NULL);
1694
541k
            PyObject *popped;
1695
541k
            int r = PyDict_Pop(interned, unicode, &popped);
1696
541k
            if (r == -1) {
1697
0
                PyErr_FormatUnraisable("Exception ignored while "
1698
0
                                       "removing an interned string %R",
1699
0
                                       unicode);
1700
                // We don't know what happened to the string. It's probably
1701
                // best to leak it:
1702
                // - if it was popped, there are no more references to it
1703
                //   so it can't cause trouble (except wasted memory)
1704
                // - if it wasn't popped, it'll remain interned
1705
0
                _Py_SetImmortal(unicode);
1706
0
                _PyUnicode_STATE(unicode).interned = SSTATE_INTERNED_IMMORTAL;
1707
0
                return;
1708
0
            }
1709
541k
            if (r == 0) {
1710
                // The interned string was not found in the interned_dict.
1711
#ifdef Py_DEBUG
1712
                Py_UNREACHABLE();
1713
#endif
1714
0
                _Py_SetImmortal(unicode);
1715
0
                return;
1716
0
            }
1717
            // Successfully popped.
1718
541k
            assert(popped == unicode);
1719
            // Only our `popped` reference should be left; remove it too.
1720
541k
            assert(Py_REFCNT(unicode) == 1);
1721
541k
            Py_SET_REFCNT(unicode, 0);
1722
#ifdef Py_REF_DEBUG
1723
            /* let's be pedantic with the ref total */
1724
            _Py_DecRefTotal(_PyThreadState_GET());
1725
#endif
1726
541k
            break;
1727
0
        default:
1728
            // As with `statically_allocated` above.
1729
#ifdef Py_REF_DEBUG
1730
            Py_UNREACHABLE();
1731
#endif
1732
0
            _Py_SetImmortal(unicode);
1733
0
            return;
1734
537M
    }
1735
537M
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1736
145k
        PyMem_Free(_PyUnicode_UTF8(unicode));
1737
145k
    }
1738
537M
    if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
1739
11.2M
        PyMem_Free(_PyUnicode_DATA_ANY(unicode));
1740
11.2M
    }
1741
1742
537M
    Py_TYPE(unicode)->tp_free(unicode);
1743
537M
}
1744
1745
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if `unicode` is the shared empty string or
   the shared one-character string for a code point below 256, and 0
   otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    /* The empty string singleton. */
    if (unicode == &_Py_STR(empty)) {
        return 1;
    }

    /* Every other candidate is exactly one character long. */
    if (_PyASCIIObject_CAST(unicode)->length != 1) {
        return 0;
    }

    Py_UCS4 first = PyUnicode_READ_CHAR(unicode, 0);
    if (first >= 256) {
        return 0;
    }
    /* Identity comparison against the cached Latin-1 object. */
    return (LATIN1(first) == unicode) ? 1 : 0;
}
#endif
1763
1764
int
1765
_PyUnicode_IsModifiable(PyObject *unicode)
1766
61.6M
{
1767
61.6M
    assert(_PyUnicode_CHECK(unicode));
1768
61.6M
    if (!_PyObject_IsUniquelyReferenced(unicode))
1769
44.9k
        return 0;
1770
61.5M
    if (PyUnicode_HASH(unicode) != -1)
1771
0
        return 0;
1772
61.5M
    if (PyUnicode_CHECK_INTERNED(unicode))
1773
0
        return 0;
1774
61.5M
    if (!PyUnicode_CheckExact(unicode))
1775
0
        return 0;
1776
#ifdef Py_DEBUG
1777
    /* singleton refcount is greater than 1 */
1778
    assert(!unicode_is_singleton(unicode));
1779
#endif
1780
61.5M
    return 1;
1781
61.5M
}
1782
1783
static int
1784
unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1785
915k
{
1786
915k
    PyObject *unicode;
1787
915k
    Py_ssize_t old_length;
1788
1789
915k
    assert(p_unicode != NULL);
1790
915k
    unicode = *p_unicode;
1791
1792
915k
    assert(unicode != NULL);
1793
915k
    assert(PyUnicode_Check(unicode));
1794
915k
    assert(0 <= length);
1795
1796
915k
    old_length = PyUnicode_GET_LENGTH(unicode);
1797
915k
    if (old_length == length)
1798
0
        return 0;
1799
1800
915k
    if (length == 0) {
1801
0
        PyObject *empty = unicode_get_empty();
1802
0
        Py_SETREF(*p_unicode, empty);
1803
0
        return 0;
1804
0
    }
1805
1806
915k
    if (!_PyUnicode_IsModifiable(unicode)) {
1807
0
        PyObject *copy = resize_copy(unicode, length);
1808
0
        if (copy == NULL)
1809
0
            return -1;
1810
0
        Py_SETREF(*p_unicode, copy);
1811
0
        return 0;
1812
0
    }
1813
1814
915k
    if (PyUnicode_IS_COMPACT(unicode)) {
1815
915k
        PyObject *new_unicode = resize_compact(unicode, length);
1816
915k
        if (new_unicode == NULL)
1817
0
            return -1;
1818
915k
        *p_unicode = new_unicode;
1819
915k
        return 0;
1820
915k
    }
1821
0
    return resize_inplace(unicode, length);
1822
915k
}
1823
1824
int
1825
PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1826
0
{
1827
0
    PyObject *unicode;
1828
0
    if (p_unicode == NULL) {
1829
0
        PyErr_BadInternalCall();
1830
0
        return -1;
1831
0
    }
1832
0
    unicode = *p_unicode;
1833
0
    if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
1834
0
    {
1835
0
        PyErr_BadInternalCall();
1836
0
        return -1;
1837
0
    }
1838
0
    return unicode_resize(p_unicode, length);
1839
0
}
1840
1841
/* Copy an ASCII or latin1 char* string into a Python Unicode string.
1842
1843
   WARNING: The function doesn't copy the terminating null character and
1844
   doesn't check the maximum character (may write a latin1 character in an
1845
   ASCII string). */
1846
static void
1847
unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1848
                   const char *str, Py_ssize_t len)
1849
0
{
1850
0
    int kind = PyUnicode_KIND(unicode);
1851
0
    const void *data = PyUnicode_DATA(unicode);
1852
0
    const char *end = str + len;
1853
1854
0
    assert(index + len <= PyUnicode_GET_LENGTH(unicode));
1855
0
    switch (kind) {
1856
0
    case PyUnicode_1BYTE_KIND: {
1857
#ifdef Py_DEBUG
1858
        if (PyUnicode_IS_ASCII(unicode)) {
1859
            Py_UCS4 maxchar = ucs1lib_find_max_char(
1860
                (const Py_UCS1*)str,
1861
                (const Py_UCS1*)str + len);
1862
            assert(maxchar < 128);
1863
        }
1864
#endif
1865
0
        memcpy((char *) data + index, str, len);
1866
0
        break;
1867
0
    }
1868
0
    case PyUnicode_2BYTE_KIND: {
1869
0
        Py_UCS2 *start = (Py_UCS2 *)data + index;
1870
0
        Py_UCS2 *ucs2 = start;
1871
1872
0
        for (; str < end; ++ucs2, ++str)
1873
0
            *ucs2 = (Py_UCS2)*str;
1874
1875
0
        assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
1876
0
        break;
1877
0
    }
1878
0
    case PyUnicode_4BYTE_KIND: {
1879
0
        Py_UCS4 *start = (Py_UCS4 *)data + index;
1880
0
        Py_UCS4 *ucs4 = start;
1881
1882
0
        for (; str < end; ++ucs4, ++str)
1883
0
            *ucs4 = (Py_UCS4)*str;
1884
1885
0
        assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
1886
0
        break;
1887
0
    }
1888
0
    default:
1889
0
        Py_UNREACHABLE();
1890
0
    }
1891
0
}
1892
1893
static PyObject*
1894
get_latin1_char(Py_UCS1 ch)
1895
265M
{
1896
265M
    PyObject *o = LATIN1(ch);
1897
265M
    return o;
1898
265M
}
1899
1900
static PyObject*
1901
unicode_char(Py_UCS4 ch)
1902
337M
{
1903
337M
    PyObject *unicode;
1904
1905
337M
    assert(ch <= MAX_UNICODE);
1906
1907
337M
    if (ch < 256) {
1908
206M
        return get_latin1_char(ch);
1909
206M
    }
1910
1911
130M
    unicode = PyUnicode_New(1, ch);
1912
130M
    if (unicode == NULL)
1913
0
        return NULL;
1914
1915
130M
    assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
1916
130M
    if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
1917
125M
        PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1918
125M
    } else {
1919
4.43M
        assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1920
4.43M
        PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1921
4.43M
    }
1922
130M
    assert(_PyUnicode_CheckConsistency(unicode, 1));
1923
130M
    return unicode;
1924
130M
}
1925
1926
1927
static inline void
1928
unicode_write_widechar(int kind, void *data,
1929
                       const wchar_t *u, Py_ssize_t size,
1930
                       Py_ssize_t num_surrogates)
1931
16.3k
{
1932
16.3k
    switch (kind) {
1933
16.3k
    case PyUnicode_1BYTE_KIND:
1934
16.3k
        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char, u, u + size, data);
1935
16.3k
        break;
1936
1937
0
    case PyUnicode_2BYTE_KIND:
1938
#if SIZEOF_WCHAR_T == 2
1939
        memcpy(data, u, size * 2);
1940
#else
1941
0
        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2, u, u + size, data);
1942
0
#endif
1943
0
        break;
1944
1945
0
    case PyUnicode_4BYTE_KIND:
1946
0
    {
1947
#if SIZEOF_WCHAR_T == 2
1948
        // Convert a 16-bits wchar_t representation to UCS4, this will decode
1949
        // surrogate pairs.
1950
        const wchar_t *end = u + size;
1951
        Py_UCS4 *ucs4_out = (Py_UCS4*)data;
1952
#  ifndef NDEBUG
1953
        Py_UCS4 *ucs4_end = (Py_UCS4*)data + (size - num_surrogates);
1954
#  endif
1955
        for (const wchar_t *iter = u; iter < end; ) {
1956
            assert(ucs4_out < ucs4_end);
1957
            if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1958
                && (iter+1) < end
1959
                && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1960
            {
1961
                *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1962
                iter += 2;
1963
            }
1964
            else {
1965
                *ucs4_out++ = *iter;
1966
                iter++;
1967
            }
1968
        }
1969
        assert(ucs4_out == ucs4_end);
1970
#else
1971
0
        assert(num_surrogates == 0);
1972
0
        memcpy(data, u, size * 4);
1973
0
#endif
1974
0
        break;
1975
0
    }
1976
0
    default:
1977
0
        Py_UNREACHABLE();
1978
16.3k
    }
1979
16.3k
}
1980
1981
1982
PyObject *
1983
PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
1984
16.3k
{
1985
16.3k
    PyObject *unicode;
1986
16.3k
    Py_UCS4 maxchar = 0;
1987
16.3k
    Py_ssize_t num_surrogates;
1988
1989
16.3k
    if (u == NULL && size != 0) {
1990
0
        PyErr_BadInternalCall();
1991
0
        return NULL;
1992
0
    }
1993
1994
16.3k
    if (size == -1) {
1995
576
        size = wcslen(u);
1996
576
    }
1997
1998
    /* If the Unicode data is known at construction time, we can apply
1999
       some optimizations which share commonly used objects. */
2000
2001
    /* Optimization for empty strings */
2002
16.3k
    if (size == 0)
2003
32
        _Py_RETURN_UNICODE_EMPTY();
2004
2005
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
2006
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
2007
       non-Unicode locales and hence needs conversion to UCS-4 first. */
2008
    if (_Py_LocaleUsesNonUnicodeWchar()) {
2009
        wchar_t* converted = _Py_DecodeNonUnicodeWchar(u, size);
2010
        if (!converted) {
2011
            return NULL;
2012
        }
2013
        PyObject *unicode = _PyUnicode_FromUCS4(converted, size);
2014
        PyMem_Free(converted);
2015
        return unicode;
2016
    }
2017
#endif
2018
2019
    /* Single character Unicode objects in the Latin-1 range are
2020
       shared when using this constructor */
2021
16.3k
    if (size == 1 && (Py_UCS4)*u < 256)
2022
0
        return get_latin1_char((unsigned char)*u);
2023
2024
    /* If not empty and not single character, copy the Unicode data
2025
       into the new object */
2026
16.3k
    if (find_maxchar_surrogates(u, u + size,
2027
16.3k
                                &maxchar, &num_surrogates) == -1)
2028
0
        return NULL;
2029
2030
16.3k
    unicode = PyUnicode_New(size - num_surrogates, maxchar);
2031
16.3k
    if (!unicode)
2032
0
        return NULL;
2033
2034
16.3k
    unicode_write_widechar(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
2035
16.3k
                           u, size, num_surrogates);
2036
2037
16.3k
    return unicode_result(unicode);
2038
16.3k
}
2039
2040
2041
int
2042
PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *pub_writer,
2043
                              const wchar_t *str,
2044
                              Py_ssize_t size)
2045
0
{
2046
0
    _PyUnicodeWriter *writer = (_PyUnicodeWriter *)pub_writer;
2047
2048
0
    if (size < 0) {
2049
0
        size = wcslen(str);
2050
0
    }
2051
2052
0
    if (size == 0) {
2053
0
        return 0;
2054
0
    }
2055
2056
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
2057
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
2058
       non-Unicode locales and hence needs conversion to UCS-4 first. */
2059
    if (_Py_LocaleUsesNonUnicodeWchar()) {
2060
        wchar_t* converted = _Py_DecodeNonUnicodeWchar(str, size);
2061
        if (!converted) {
2062
            return -1;
2063
        }
2064
2065
        int res = PyUnicodeWriter_WriteUCS4(pub_writer, converted, size);
2066
        PyMem_Free(converted);
2067
        return res;
2068
    }
2069
#endif
2070
2071
0
    Py_UCS4 maxchar = 0;
2072
0
    Py_ssize_t num_surrogates;
2073
0
    if (find_maxchar_surrogates(str, str + size,
2074
0
                                &maxchar, &num_surrogates) == -1) {
2075
0
        return -1;
2076
0
    }
2077
2078
0
    if (_PyUnicodeWriter_Prepare(writer, size - num_surrogates, maxchar) < 0) {
2079
0
        return -1;
2080
0
    }
2081
2082
0
    int kind = writer->kind;
2083
0
    void *data = (Py_UCS1*)writer->data + writer->pos * kind;
2084
0
    unicode_write_widechar(kind, data, str, size, num_surrogates);
2085
2086
0
    writer->pos += size - num_surrogates;
2087
0
    return 0;
2088
0
}
2089
2090
2091
PyObject *
2092
PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
2093
564k
{
2094
564k
    if (size < 0) {
2095
0
        PyErr_SetString(PyExc_SystemError,
2096
0
                        "Negative size passed to PyUnicode_FromStringAndSize");
2097
0
        return NULL;
2098
0
    }
2099
564k
    if (u != NULL) {
2100
564k
        return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2101
564k
    }
2102
0
    if (size > 0) {
2103
0
        PyErr_SetString(PyExc_SystemError,
2104
0
            "NULL string with positive size with NULL passed to PyUnicode_FromStringAndSize");
2105
0
        return NULL;
2106
0
    }
2107
0
    return unicode_get_empty();
2108
0
}
2109
2110
PyObject *
2111
PyUnicode_FromString(const char *u)
2112
7.00M
{
2113
7.00M
    size_t size = strlen(u);
2114
7.00M
    if (size > PY_SSIZE_T_MAX) {
2115
0
        PyErr_SetString(PyExc_OverflowError, "input too long");
2116
0
        return NULL;
2117
0
    }
2118
7.00M
    return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
2119
7.00M
}
2120
2121
2122
PyObject *
2123
_PyUnicode_FromId(_Py_Identifier *id)
2124
0
{
2125
0
    PyMutex_Lock((PyMutex *)&id->mutex);
2126
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
2127
0
    struct _Py_unicode_ids *ids = &interp->unicode.ids;
2128
2129
0
    Py_ssize_t index = _Py_atomic_load_ssize(&id->index);
2130
0
    if (index < 0) {
2131
0
        struct _Py_unicode_runtime_ids *rt_ids = &interp->runtime->unicode_state.ids;
2132
2133
0
        PyMutex_Lock(&rt_ids->mutex);
2134
        // Check again to detect concurrent access. Another thread can have
2135
        // initialized the index while this thread waited for the lock.
2136
0
        index = _Py_atomic_load_ssize(&id->index);
2137
0
        if (index < 0) {
2138
0
            assert(rt_ids->next_index < PY_SSIZE_T_MAX);
2139
0
            index = rt_ids->next_index;
2140
0
            rt_ids->next_index++;
2141
0
            _Py_atomic_store_ssize(&id->index, index);
2142
0
        }
2143
0
        PyMutex_Unlock(&rt_ids->mutex);
2144
0
    }
2145
0
    assert(index >= 0);
2146
2147
0
    PyObject *obj;
2148
0
    if (index < ids->size) {
2149
0
        obj = ids->array[index];
2150
0
        if (obj) {
2151
            // Return a borrowed reference
2152
0
            goto end;
2153
0
        }
2154
0
    }
2155
2156
0
    obj = PyUnicode_DecodeUTF8Stateful(id->string, strlen(id->string),
2157
0
                                       NULL, NULL);
2158
0
    if (!obj) {
2159
0
        goto end;
2160
0
    }
2161
0
    _PyUnicode_InternImmortal(interp, &obj);
2162
2163
0
    if (index >= ids->size) {
2164
        // Overallocate to reduce the number of realloc
2165
0
        Py_ssize_t new_size = Py_MAX(index * 2, 16);
2166
0
        Py_ssize_t item_size = sizeof(ids->array[0]);
2167
0
        PyObject **new_array = PyMem_Realloc(ids->array, new_size * item_size);
2168
0
        if (new_array == NULL) {
2169
0
            PyErr_NoMemory();
2170
0
            obj = NULL;
2171
0
            goto end;
2172
0
        }
2173
0
        memset(&new_array[ids->size], 0, (new_size - ids->size) * item_size);
2174
0
        ids->array = new_array;
2175
0
        ids->size = new_size;
2176
0
    }
2177
2178
    // The array stores a strong reference
2179
0
    ids->array[index] = obj;
2180
2181
0
end:
2182
0
    PyMutex_Unlock((PyMutex *)&id->mutex);
2183
    // Return a borrowed reference
2184
0
    return obj;
2185
0
}
2186
2187
2188
static void
2189
unicode_clear_identifiers(struct _Py_unicode_state *state)
2190
0
{
2191
0
    struct _Py_unicode_ids *ids = &state->ids;
2192
0
    for (Py_ssize_t i=0; i < ids->size; i++) {
2193
0
        Py_XDECREF(ids->array[i]);
2194
0
    }
2195
0
    ids->size = 0;
2196
0
    PyMem_Free(ids->array);
2197
0
    ids->array = NULL;
2198
    // Don't reset _PyRuntime next_index: _Py_Identifier.id remains valid
2199
    // after Py_Finalize().
2200
0
}
2201
2202
2203
/* Internal function, doesn't check maximum character */
2204
2205
PyObject*
2206
_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
2207
92.7M
{
2208
92.7M
    const unsigned char *s = (const unsigned char *)buffer;
2209
92.7M
    PyObject *unicode;
2210
92.7M
    if (size == 1) {
2211
#ifdef Py_DEBUG
2212
        assert((unsigned char)s[0] < 128);
2213
#endif
2214
33.6M
        return get_latin1_char(s[0]);
2215
33.6M
    }
2216
59.1M
    unicode = PyUnicode_New(size, 127);
2217
59.1M
    if (!unicode)
2218
0
        return NULL;
2219
59.1M
    memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2220
59.1M
    assert(_PyUnicode_CheckConsistency(unicode, 1));
2221
59.1M
    return unicode;
2222
59.1M
}
2223
2224
static Py_UCS4
2225
kind_maxchar_limit(int kind)
2226
0
{
2227
0
    switch (kind) {
2228
0
    case PyUnicode_1BYTE_KIND:
2229
0
        return 0x80;
2230
0
    case PyUnicode_2BYTE_KIND:
2231
0
        return 0x100;
2232
0
    case PyUnicode_4BYTE_KIND:
2233
0
        return 0x10000;
2234
0
    default:
2235
0
        Py_UNREACHABLE();
2236
0
    }
2237
0
}
2238
2239
static PyObject*
2240
_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
2241
74.8M
{
2242
74.8M
    PyObject *res;
2243
74.8M
    unsigned char max_char;
2244
2245
74.8M
    if (size == 0) {
2246
10.3M
        _Py_RETURN_UNICODE_EMPTY();
2247
10.3M
    }
2248
74.8M
    assert(size > 0);
2249
64.4M
    if (size == 1) {
2250
23.0M
        return get_latin1_char(u[0]);
2251
23.0M
    }
2252
2253
41.4M
    max_char = ucs1lib_find_max_char(u, u + size);
2254
41.4M
    res = PyUnicode_New(size, max_char);
2255
41.4M
    if (!res)
2256
0
        return NULL;
2257
41.4M
    memcpy(PyUnicode_1BYTE_DATA(res), u, size);
2258
41.4M
    assert(_PyUnicode_CheckConsistency(res, 1));
2259
41.4M
    return res;
2260
41.4M
}
2261
2262
static PyObject*
2263
_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
2264
111M
{
2265
111M
    PyObject *res;
2266
111M
    Py_UCS2 max_char;
2267
2268
111M
    if (size == 0)
2269
14.9M
        _Py_RETURN_UNICODE_EMPTY();
2270
111M
    assert(size > 0);
2271
96.2M
    if (size == 1)
2272
65.9M
        return unicode_char(u[0]);
2273
2274
30.2M
    max_char = ucs2lib_find_max_char(u, u + size);
2275
30.2M
    res = PyUnicode_New(size, max_char);
2276
30.2M
    if (!res)
2277
0
        return NULL;
2278
30.2M
    if (max_char >= 256)
2279
17.2M
        memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
2280
13.0M
    else {
2281
13.0M
        _PyUnicode_CONVERT_BYTES(
2282
13.0M
            Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2283
13.0M
    }
2284
30.2M
    assert(_PyUnicode_CheckConsistency(res, 1));
2285
30.2M
    return res;
2286
30.2M
}
2287
2288
static PyObject*
2289
_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
2290
84.1M
{
2291
84.1M
    PyObject *res;
2292
84.1M
    Py_UCS4 max_char;
2293
2294
84.1M
    if (size == 0)
2295
6.54M
        _Py_RETURN_UNICODE_EMPTY();
2296
84.1M
    assert(size > 0);
2297
77.5M
    if (size == 1)
2298
57.9M
        return unicode_char(u[0]);
2299
2300
19.5M
    max_char = ucs4lib_find_max_char(u, u + size);
2301
19.5M
    res = PyUnicode_New(size, max_char);
2302
19.5M
    if (!res)
2303
0
        return NULL;
2304
19.5M
    if (max_char < 256)
2305
14.1M
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2306
19.5M
                                 PyUnicode_1BYTE_DATA(res));
2307
5.41M
    else if (max_char < 0x10000)
2308
3.54M
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2309
5.41M
                                 PyUnicode_2BYTE_DATA(res));
2310
1.86M
    else
2311
1.86M
        memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
2312
19.5M
    assert(_PyUnicode_CheckConsistency(res, 1));
2313
19.5M
    return res;
2314
19.5M
}
2315
2316
2317
int
2318
PyUnicodeWriter_WriteUCS4(PyUnicodeWriter *pub_writer,
2319
                          Py_UCS4 *str,
2320
                          Py_ssize_t size)
2321
0
{
2322
0
    _PyUnicodeWriter *writer = (_PyUnicodeWriter*)pub_writer;
2323
2324
0
    if (size < 0) {
2325
0
        PyErr_SetString(PyExc_ValueError,
2326
0
                        "size must be positive");
2327
0
        return -1;
2328
0
    }
2329
2330
0
    if (size == 0) {
2331
0
        return 0;
2332
0
    }
2333
2334
0
    Py_UCS4 max_char = ucs4lib_find_max_char(str, str + size);
2335
2336
0
    if (_PyUnicodeWriter_Prepare(writer, size, max_char) < 0) {
2337
0
        return -1;
2338
0
    }
2339
2340
0
    int kind = writer->kind;
2341
0
    void *data = (Py_UCS1*)writer->data + writer->pos * kind;
2342
0
    if (kind == PyUnicode_1BYTE_KIND) {
2343
0
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1,
2344
0
                                 str, str + size,
2345
0
                                 data);
2346
0
    }
2347
0
    else if (kind == PyUnicode_2BYTE_KIND) {
2348
0
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2,
2349
0
                                 str, str + size,
2350
0
                                 data);
2351
0
    }
2352
0
    else {
2353
0
        memcpy(data, str, size * sizeof(Py_UCS4));
2354
0
    }
2355
0
    writer->pos += size;
2356
2357
0
    return 0;
2358
0
}
2359
2360
2361
PyObject*
2362
PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2363
209M
{
2364
209M
    if (size < 0) {
2365
0
        PyErr_SetString(PyExc_ValueError, "size must be positive");
2366
0
        return NULL;
2367
0
    }
2368
209M
    switch (kind) {
2369
45.4M
    case PyUnicode_1BYTE_KIND:
2370
45.4M
        return _PyUnicode_FromUCS1(buffer, size);
2371
91.1M
    case PyUnicode_2BYTE_KIND:
2372
91.1M
        return _PyUnicode_FromUCS2(buffer, size);
2373
73.2M
    case PyUnicode_4BYTE_KIND:
2374
73.2M
        return _PyUnicode_FromUCS4(buffer, size);
2375
0
    default:
2376
0
        PyErr_SetString(PyExc_SystemError, "invalid kind");
2377
0
        return NULL;
2378
209M
    }
2379
209M
}
2380
2381
Py_UCS4
2382
_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2383
13.8M
{
2384
13.8M
    int kind;
2385
13.8M
    const void *startptr, *endptr;
2386
2387
13.8M
    assert(0 <= start);
2388
13.8M
    assert(end <= PyUnicode_GET_LENGTH(unicode));
2389
13.8M
    assert(start <= end);
2390
2391
13.8M
    if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2392
0
        return PyUnicode_MAX_CHAR_VALUE(unicode);
2393
2394
13.8M
    if (start == end)
2395
0
        return 127;
2396
2397
13.8M
    if (PyUnicode_IS_ASCII(unicode))
2398
13.7M
        return 127;
2399
2400
36.7k
    kind = PyUnicode_KIND(unicode);
2401
36.7k
    startptr = PyUnicode_DATA(unicode);
2402
36.7k
    endptr = (char *)startptr + end * kind;
2403
36.7k
    startptr = (char *)startptr + start * kind;
2404
36.7k
    switch(kind) {
2405
1.58k
    case PyUnicode_1BYTE_KIND:
2406
1.58k
        return ucs1lib_find_max_char(startptr, endptr);
2407
4.37k
    case PyUnicode_2BYTE_KIND:
2408
4.37k
        return ucs2lib_find_max_char(startptr, endptr);
2409
30.8k
    case PyUnicode_4BYTE_KIND:
2410
30.8k
        return ucs4lib_find_max_char(startptr, endptr);
2411
0
    default:
2412
0
        Py_UNREACHABLE();
2413
36.7k
    }
2414
36.7k
}
2415
2416
/* Ensure that a string uses the most efficient storage, if it is not the
2417
   case: create a new string with of the right kind. Write NULL into *p_unicode
2418
   on error. */
2419
static void
2420
unicode_adjust_maxchar(PyObject **p_unicode)
2421
0
{
2422
0
    PyObject *unicode, *copy;
2423
0
    Py_UCS4 max_char;
2424
0
    Py_ssize_t len;
2425
0
    int kind;
2426
2427
0
    assert(p_unicode != NULL);
2428
0
    unicode = *p_unicode;
2429
0
    if (PyUnicode_IS_ASCII(unicode))
2430
0
        return;
2431
2432
0
    len = PyUnicode_GET_LENGTH(unicode);
2433
0
    kind = PyUnicode_KIND(unicode);
2434
0
    if (kind == PyUnicode_1BYTE_KIND) {
2435
0
        const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2436
0
        max_char = ucs1lib_find_max_char(u, u + len);
2437
0
        if (max_char >= 128)
2438
0
            return;
2439
0
    }
2440
0
    else if (kind == PyUnicode_2BYTE_KIND) {
2441
0
        const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2442
0
        max_char = ucs2lib_find_max_char(u, u + len);
2443
0
        if (max_char >= 256)
2444
0
            return;
2445
0
    }
2446
0
    else if (kind == PyUnicode_4BYTE_KIND) {
2447
0
        const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2448
0
        max_char = ucs4lib_find_max_char(u, u + len);
2449
0
        if (max_char >= 0x10000)
2450
0
            return;
2451
0
    }
2452
0
    else
2453
0
        Py_UNREACHABLE();
2454
2455
0
    copy = PyUnicode_New(len, max_char);
2456
0
    if (copy != NULL)
2457
0
        _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2458
0
    Py_DECREF(unicode);
2459
0
    *p_unicode = copy;
2460
0
}
2461
2462
/* Return a new str object with the same characters as `unicode`.

   Calls PyErr_BadInternalCall() and returns NULL when `unicode` is not a
   str; returns NULL when the allocation fails. */
PyObject*
_PyUnicode_Copy(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadInternalCall();
        return NULL;
    }

    Py_ssize_t nchars = PyUnicode_GET_LENGTH(unicode);
    PyObject *duplicate = PyUnicode_New(nchars,
                                        PyUnicode_MAX_CHAR_VALUE(unicode));
    if (duplicate == NULL) {
        return NULL;
    }
    assert(PyUnicode_KIND(duplicate) == PyUnicode_KIND(unicode));

    /* Both objects share a storage kind, so a raw byte copy is enough. */
    memcpy(PyUnicode_DATA(duplicate), PyUnicode_DATA(unicode),
           nchars * PyUnicode_KIND(unicode));
    assert(_PyUnicode_CheckConsistency(duplicate, 1));
    return duplicate;
}
2484
2485
2486
/* Widen Unicode objects to larger buffers. Don't write terminating null
2487
   character. Return NULL on error. */
2488
2489
static void*
2490
unicode_askind(int skind, void const *data, Py_ssize_t len, int kind)
2491
10.8M
{
2492
10.8M
    void *result;
2493
2494
10.8M
    assert(skind < kind);
2495
10.8M
    switch (kind) {
2496
9.84M
    case PyUnicode_2BYTE_KIND:
2497
9.84M
        result = PyMem_New(Py_UCS2, len);
2498
9.84M
        if (!result)
2499
0
            return PyErr_NoMemory();
2500
9.84M
        assert(skind == PyUnicode_1BYTE_KIND);
2501
9.84M
        _PyUnicode_CONVERT_BYTES(
2502
9.84M
            Py_UCS1, Py_UCS2,
2503
9.84M
            (const Py_UCS1 *)data,
2504
9.84M
            ((const Py_UCS1 *)data) + len,
2505
9.84M
            result);
2506
9.84M
        return result;
2507
1.04M
    case PyUnicode_4BYTE_KIND:
2508
1.04M
        result = PyMem_New(Py_UCS4, len);
2509
1.04M
        if (!result)
2510
0
            return PyErr_NoMemory();
2511
1.04M
        if (skind == PyUnicode_2BYTE_KIND) {
2512
0
            _PyUnicode_CONVERT_BYTES(
2513
0
                Py_UCS2, Py_UCS4,
2514
0
                (const Py_UCS2 *)data,
2515
0
                ((const Py_UCS2 *)data) + len,
2516
0
                result);
2517
0
        }
2518
1.04M
        else {
2519
1.04M
            assert(skind == PyUnicode_1BYTE_KIND);
2520
1.04M
            _PyUnicode_CONVERT_BYTES(
2521
1.04M
                Py_UCS1, Py_UCS4,
2522
1.04M
                (const Py_UCS1 *)data,
2523
1.04M
                ((const Py_UCS1 *)data) + len,
2524
1.04M
                result);
2525
1.04M
        }
2526
1.04M
        return result;
2527
0
    default:
2528
0
        Py_UNREACHABLE();
2529
0
        return NULL;
2530
10.8M
    }
2531
10.8M
}
2532
2533
static Py_UCS4*
2534
as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2535
        int copy_null)
2536
76.4k
{
2537
76.4k
    int kind;
2538
76.4k
    const void *data;
2539
76.4k
    Py_ssize_t len, targetlen;
2540
76.4k
    kind = PyUnicode_KIND(string);
2541
76.4k
    data = PyUnicode_DATA(string);
2542
76.4k
    len = PyUnicode_GET_LENGTH(string);
2543
76.4k
    targetlen = len;
2544
76.4k
    if (copy_null)
2545
0
        targetlen++;
2546
76.4k
    if (!target) {
2547
0
        target = PyMem_New(Py_UCS4, targetlen);
2548
0
        if (!target) {
2549
0
            PyErr_NoMemory();
2550
0
            return NULL;
2551
0
        }
2552
0
    }
2553
76.4k
    else {
2554
76.4k
        if (targetsize < targetlen) {
2555
0
            PyErr_Format(PyExc_SystemError,
2556
0
                         "string is longer than the buffer");
2557
0
            if (copy_null && 0 < targetsize)
2558
0
                target[0] = 0;
2559
0
            return NULL;
2560
0
        }
2561
76.4k
    }
2562
76.4k
    if (kind == PyUnicode_1BYTE_KIND) {
2563
56.9k
        const Py_UCS1 *start = (const Py_UCS1 *) data;
2564
56.9k
        _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2565
56.9k
    }
2566
19.4k
    else if (kind == PyUnicode_2BYTE_KIND) {
2567
14.9k
        const Py_UCS2 *start = (const Py_UCS2 *) data;
2568
14.9k
        _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2569
14.9k
    }
2570
4.56k
    else if (kind == PyUnicode_4BYTE_KIND) {
2571
4.56k
        memcpy(target, data, len * sizeof(Py_UCS4));
2572
4.56k
    }
2573
0
    else {
2574
0
        Py_UNREACHABLE();
2575
0
    }
2576
76.4k
    if (copy_null)
2577
0
        target[len] = 0;
2578
76.4k
    return target;
2579
76.4k
}
2580
2581
Py_UCS4*
2582
PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2583
                 int copy_null)
2584
76.4k
{
2585
76.4k
    if (target == NULL || targetsize < 0) {
2586
0
        PyErr_BadInternalCall();
2587
0
        return NULL;
2588
0
    }
2589
76.4k
    return as_ucs4(string, target, targetsize, copy_null);
2590
76.4k
}
2591
2592
Py_UCS4*
2593
PyUnicode_AsUCS4Copy(PyObject *string)
2594
0
{
2595
0
    return as_ucs4(string, NULL, 0, 1);
2596
0
}
2597
2598
/* maximum number of characters required for output of %jo or %jd or %p.
2599
   We need at most ceil(log8(256)*sizeof(intmax_t)) digits,
2600
   plus 1 for the sign, plus 2 for the 0x prefix (for %p),
2601
   plus 1 for the terminal NUL. */
2602
#define MAX_INTMAX_CHARS (5 + (sizeof(intmax_t)*8-1) / 3)
2603
2604
static int
2605
unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2606
                             Py_ssize_t width, Py_ssize_t precision, int flags)
2607
28.6k
{
2608
28.6k
    Py_ssize_t length, fill, arglen;
2609
28.6k
    Py_UCS4 maxchar;
2610
2611
28.6k
    length = PyUnicode_GET_LENGTH(str);
2612
28.6k
    if ((precision == -1 || precision >= length)
2613
28.5k
        && width <= length)
2614
28.5k
        return _PyUnicodeWriter_WriteStr(writer, str);
2615
2616
47
    if (precision != -1)
2617
47
        length = Py_MIN(precision, length);
2618
2619
47
    arglen = Py_MAX(length, width);
2620
47
    if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2621
26
        maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2622
21
    else
2623
21
        maxchar = writer->maxchar;
2624
2625
47
    if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2626
0
        return -1;
2627
2628
47
    fill = Py_MAX(width - length, 0);
2629
47
    if (fill && !(flags & F_LJUST)) {
2630
0
        if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2631
0
            return -1;
2632
0
        writer->pos += fill;
2633
0
    }
2634
2635
47
    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2636
47
                                  str, 0, length);
2637
47
    writer->pos += length;
2638
2639
47
    if (fill && (flags & F_LJUST)) {
2640
0
        if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2641
0
            return -1;
2642
0
        writer->pos += fill;
2643
0
    }
2644
2645
47
    return 0;
2646
47
}
2647
2648
static int
2649
unicode_fromformat_write_utf8(_PyUnicodeWriter *writer, const char *str,
2650
                              Py_ssize_t width, Py_ssize_t precision, int flags)
2651
4.46M
{
2652
    /* UTF-8 */
2653
4.46M
    Py_ssize_t *pconsumed = NULL;
2654
4.46M
    Py_ssize_t length;
2655
4.46M
    if (precision == -1) {
2656
207k
        length = strlen(str);
2657
207k
    }
2658
4.25M
    else {
2659
4.25M
        length = 0;
2660
17.5M
        while (length < precision && str[length]) {
2661
13.2M
            length++;
2662
13.2M
        }
2663
4.25M
        if (length == precision) {
2664
            /* The input string is not NUL-terminated.  If it ends with an
2665
             * incomplete UTF-8 sequence, truncate the string just before it.
2666
             * Incomplete sequences in the middle and sequences which cannot
2667
             * be valid prefixes are still treated as errors and replaced
2668
             * with \xfffd. */
2669
1.81k
            pconsumed = &length;
2670
1.81k
        }
2671
4.25M
    }
2672
2673
4.46M
    if (width < 0) {
2674
4.46M
        return unicode_decode_utf8_writer(writer, str, length,
2675
4.46M
                                          _Py_ERROR_REPLACE, "replace", pconsumed);
2676
4.46M
    }
2677
2678
0
    PyObject *unicode = PyUnicode_DecodeUTF8Stateful(str, length,
2679
0
                                                     "replace", pconsumed);
2680
0
    if (unicode == NULL)
2681
0
        return -1;
2682
2683
0
    int res = unicode_fromformat_write_str(writer, unicode,
2684
0
                                           width, -1, flags);
2685
0
    Py_DECREF(unicode);
2686
0
    return res;
2687
0
}
2688
2689
static int
2690
unicode_fromformat_write_wcstr(_PyUnicodeWriter *writer, const wchar_t *str,
2691
                              Py_ssize_t width, Py_ssize_t precision, int flags)
2692
0
{
2693
0
    Py_ssize_t length;
2694
0
    if (precision == -1) {
2695
0
        length = wcslen(str);
2696
0
    }
2697
0
    else {
2698
0
        length = 0;
2699
0
        while (length < precision && str[length]) {
2700
0
            length++;
2701
0
        }
2702
0
    }
2703
2704
0
    if (width < 0) {
2705
0
        return PyUnicodeWriter_WriteWideChar((PyUnicodeWriter*)writer,
2706
0
                                             str, length);
2707
0
    }
2708
2709
0
    PyObject *unicode = PyUnicode_FromWideChar(str, length);
2710
0
    if (unicode == NULL)
2711
0
        return -1;
2712
2713
0
    int res = unicode_fromformat_write_str(writer, unicode, width, -1, flags);
2714
0
    Py_DECREF(unicode);
2715
0
    return res;
2716
0
}
2717
2718
0
/* Size modifiers recognized by unicode_fromformat_arg().
   Zero (no constant) means that no size modifier was given. */
#define F_LONG     1    /* "l"  */
#define F_LONGLONG 2    /* "ll" */
#define F_SIZE     3    /* "z"  */
#define F_PTRDIFF  4    /* "t"  */
#define F_INTMAX   5    /* "j"  */
2723
2724
static const char*
2725
unicode_fromformat_arg(_PyUnicodeWriter *writer,
2726
                       const char *f, va_list *vargs)
2727
32.0M
{
2728
32.0M
    const char *p;
2729
32.0M
    Py_ssize_t len;
2730
32.0M
    int flags = 0;
2731
32.0M
    Py_ssize_t width;
2732
32.0M
    Py_ssize_t precision;
2733
2734
32.0M
    p = f;
2735
32.0M
    f++;
2736
32.0M
    if (*f == '%') {
2737
4.23M
        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
2738
0
            return NULL;
2739
4.23M
        f++;
2740
4.23M
        return f;
2741
4.23M
    }
2742
2743
    /* Parse flags. Example: "%-i" => flags=F_LJUST. */
2744
    /* Flags '+', ' ' and '#' are not particularly useful.
2745
     * They are not worth the implementation and maintenance costs.
2746
     * In addition, '#' should add "0" for "o" conversions for compatibility
2747
     * with printf, but it would confuse Python users. */
2748
27.7M
    while (1) {
2749
27.7M
        switch (*f++) {
2750
0
        case '-': flags |= F_LJUST; continue;
2751
2.13k
        case '0': flags |= F_ZERO; continue;
2752
0
        case '#': flags |= F_ALT; continue;
2753
27.7M
        }
2754
27.7M
        f--;
2755
27.7M
        break;
2756
27.7M
    }
2757
2758
    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2759
27.7M
    width = -1;
2760
27.7M
    if (*f == '*') {
2761
0
        width = va_arg(*vargs, int);
2762
0
        if (width < 0) {
2763
0
            flags |= F_LJUST;
2764
0
            width = -width;
2765
0
        }
2766
0
        f++;
2767
0
    }
2768
27.7M
    else if (Py_ISDIGIT((unsigned)*f)) {
2769
2.13k
        width = *f - '0';
2770
2.13k
        f++;
2771
2.13k
        while (Py_ISDIGIT((unsigned)*f)) {
2772
0
            if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2773
0
                PyErr_SetString(PyExc_ValueError,
2774
0
                                "width too big");
2775
0
                return NULL;
2776
0
            }
2777
0
            width = (width * 10) + (*f - '0');
2778
0
            f++;
2779
0
        }
2780
2.13k
    }
2781
27.7M
    precision = -1;
2782
27.7M
    if (*f == '.') {
2783
4.25M
        f++;
2784
4.25M
        if (*f == '*') {
2785
0
            precision = va_arg(*vargs, int);
2786
0
            if (precision < 0) {
2787
0
                precision = -2;
2788
0
            }
2789
0
            f++;
2790
0
        }
2791
4.25M
        else if (Py_ISDIGIT((unsigned)*f)) {
2792
4.25M
            precision = (*f - '0');
2793
4.25M
            f++;
2794
12.7M
            while (Py_ISDIGIT((unsigned)*f)) {
2795
8.50M
                if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2796
0
                    PyErr_SetString(PyExc_ValueError,
2797
0
                                    "precision too big");
2798
0
                    return NULL;
2799
0
                }
2800
8.50M
                precision = (precision * 10) + (*f - '0');
2801
8.50M
                f++;
2802
8.50M
            }
2803
4.25M
        }
2804
4.25M
    }
2805
2806
27.7M
    int sizemod = 0;
2807
27.7M
    if (*f == 'l') {
2808
0
        if (f[1] == 'l') {
2809
0
            sizemod = F_LONGLONG;
2810
0
            f += 2;
2811
0
        }
2812
0
        else {
2813
0
            sizemod = F_LONG;
2814
0
            ++f;
2815
0
        }
2816
0
    }
2817
27.7M
    else if (*f == 'z') {
2818
61.6k
        sizemod = F_SIZE;
2819
61.6k
        ++f;
2820
61.6k
    }
2821
27.7M
    else if (*f == 't') {
2822
0
        sizemod = F_PTRDIFF;
2823
0
        ++f;
2824
0
    }
2825
27.7M
    else if (*f == 'j') {
2826
0
        sizemod = F_INTMAX;
2827
0
        ++f;
2828
0
    }
2829
27.7M
    if (f[0] != '\0' && f[1] == '\0')
2830
4.32M
        writer->overallocate = 0;
2831
2832
27.7M
    switch (*f) {
2833
19.0M
    case 'd': case 'i': case 'o': case 'u': case 'x': case 'X':
2834
19.0M
        break;
2835
4.24M
    case 'c': case 'p':
2836
4.24M
        if (sizemod || width >= 0 || precision >= 0) goto invalid_format;
2837
4.24M
        break;
2838
4.46M
    case 's':
2839
4.46M
    case 'V':
2840
4.46M
        if (sizemod && sizemod != F_LONG) goto invalid_format;
2841
4.46M
        break;
2842
4.46M
    default:
2843
28.6k
        if (sizemod) goto invalid_format;
2844
28.6k
        break;
2845
27.7M
    }
2846
2847
27.7M
    switch (*f) {
2848
4.24M
    case 'c':
2849
4.24M
    {
2850
4.24M
        int ordinal = va_arg(*vargs, int);
2851
4.24M
        if (ordinal < 0 || ordinal > MAX_UNICODE) {
2852
0
            PyErr_SetString(PyExc_OverflowError,
2853
0
                            "character argument not in range(0x110000)");
2854
0
            return NULL;
2855
0
        }
2856
4.24M
        if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
2857
0
            return NULL;
2858
4.24M
        break;
2859
4.24M
    }
2860
2861
19.0M
    case 'd': case 'i':
2862
19.0M
    case 'o': case 'u': case 'x': case 'X':
2863
19.0M
    {
2864
19.0M
        char buffer[MAX_INTMAX_CHARS];
2865
2866
        // Fill buffer using sprinf, with one of many possible format
2867
        // strings, like "%llX" for `long long` in hexadecimal.
2868
        // The type/size is in `sizemod`; the format is in `*f`.
2869
2870
        // Use macros with nested switches to keep the sprintf format strings
2871
        // as compile-time literals, avoiding warnings and maybe allowing
2872
        // optimizations.
2873
2874
        // `SPRINT` macro does one sprintf
2875
        // Example usage: SPRINT("l", "X", unsigned long) expands to
2876
        // sprintf(buffer, "%" "l" "X", va_arg(*vargs, unsigned long))
2877
19.0M
        #define SPRINT(SIZE_SPEC, FMT_CHAR, TYPE) \
2878
19.0M
            sprintf(buffer, "%" SIZE_SPEC FMT_CHAR, va_arg(*vargs, TYPE))
2879
2880
        // One inner switch to handle all format variants
2881
19.0M
        #define DO_SPRINTS(SIZE_SPEC, SIGNED_TYPE, UNSIGNED_TYPE)             \
2882
19.0M
            switch (*f) {                                                     \
2883
0
                case 'o': len = SPRINT(SIZE_SPEC, "o", UNSIGNED_TYPE); break; \
2884
0
                case 'u': len = SPRINT(SIZE_SPEC, "u", UNSIGNED_TYPE); break; \
2885
1.56k
                case 'x': len = SPRINT(SIZE_SPEC, "x", UNSIGNED_TYPE); break; \
2886
1.17k
                case 'X': len = SPRINT(SIZE_SPEC, "X", UNSIGNED_TYPE); break; \
2887
19.0M
                default:  len = SPRINT(SIZE_SPEC, "d", SIGNED_TYPE); break;   \
2888
19.0M
            }
2889
2890
        // Outer switch to handle all the sizes/types
2891
19.0M
        switch (sizemod) {
2892
0
            case F_LONG:     DO_SPRINTS("l", long, unsigned long); break;
2893
0
            case F_LONGLONG: DO_SPRINTS("ll", long long, unsigned long long); break;
2894
61.6k
            case F_SIZE:     DO_SPRINTS("z", Py_ssize_t, size_t); break;
2895
0
            case F_PTRDIFF:  DO_SPRINTS("t", ptrdiff_t, ptrdiff_t); break;
2896
0
            case F_INTMAX:   DO_SPRINTS("j", intmax_t, uintmax_t); break;
2897
18.9M
            default:         DO_SPRINTS("", int, unsigned int); break;
2898
19.0M
        }
2899
19.0M
        #undef SPRINT
2900
19.0M
        #undef DO_SPRINTS
2901
2902
19.0M
        assert(len >= 0);
2903
2904
19.0M
        int sign = (buffer[0] == '-');
2905
19.0M
        len -= sign;
2906
2907
19.0M
        precision = Py_MAX(precision, len);
2908
19.0M
        width = Py_MAX(width, precision + sign);
2909
19.0M
        if ((flags & F_ZERO) && !(flags & F_LJUST)) {
2910
2.13k
            precision = width - sign;
2911
2.13k
        }
2912
2913
19.0M
        Py_ssize_t spacepad = Py_MAX(width - precision - sign, 0);
2914
19.0M
        Py_ssize_t zeropad = Py_MAX(precision - len, 0);
2915
2916
19.0M
        if (_PyUnicodeWriter_Prepare(writer, width, 127) == -1)
2917
0
            return NULL;
2918
2919
19.0M
        if (spacepad && !(flags & F_LJUST)) {
2920
0
            if (PyUnicode_Fill(writer->buffer, writer->pos, spacepad, ' ') == -1)
2921
0
                return NULL;
2922
0
            writer->pos += spacepad;
2923
0
        }
2924
2925
19.0M
        if (sign) {
2926
0
            if (_PyUnicodeWriter_WriteChar(writer, '-') == -1)
2927
0
                return NULL;
2928
0
        }
2929
2930
19.0M
        if (zeropad) {
2931
680
            if (PyUnicode_Fill(writer->buffer, writer->pos, zeropad, '0') == -1)
2932
0
                return NULL;
2933
680
            writer->pos += zeropad;
2934
680
        }
2935
2936
19.0M
        if (_PyUnicodeWriter_WriteASCIIString(writer, &buffer[sign], len) < 0)
2937
0
            return NULL;
2938
2939
19.0M
        if (spacepad && (flags & F_LJUST)) {
2940
0
            if (PyUnicode_Fill(writer->buffer, writer->pos, spacepad, ' ') == -1)
2941
0
                return NULL;
2942
0
            writer->pos += spacepad;
2943
0
        }
2944
19.0M
        break;
2945
19.0M
    }
2946
2947
19.0M
    case 'p':
2948
0
    {
2949
0
        char number[MAX_INTMAX_CHARS];
2950
2951
0
        len = sprintf(number, "%p", va_arg(*vargs, void*));
2952
0
        assert(len >= 0);
2953
2954
        /* %p is ill-defined:  ensure leading 0x. */
2955
0
        if (number[1] == 'X')
2956
0
            number[1] = 'x';
2957
0
        else if (number[1] != 'x') {
2958
0
            memmove(number + 2, number,
2959
0
                    strlen(number) + 1);
2960
0
            number[0] = '0';
2961
0
            number[1] = 'x';
2962
0
            len += 2;
2963
0
        }
2964
2965
0
        if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
2966
0
            return NULL;
2967
0
        break;
2968
0
    }
2969
2970
4.46M
    case 's':
2971
4.46M
    {
2972
4.46M
        if (sizemod) {
2973
0
            const wchar_t *s = va_arg(*vargs, const wchar_t*);
2974
0
            if (unicode_fromformat_write_wcstr(writer, s, width, precision, flags) < 0)
2975
0
                return NULL;
2976
0
        }
2977
4.46M
        else {
2978
            /* UTF-8 */
2979
4.46M
            const char *s = va_arg(*vargs, const char*);
2980
4.46M
            if (unicode_fromformat_write_utf8(writer, s, width, precision, flags) < 0)
2981
0
                return NULL;
2982
4.46M
        }
2983
4.46M
        break;
2984
4.46M
    }
2985
2986
4.46M
    case 'U':
2987
27.9k
    {
2988
27.9k
        PyObject *obj = va_arg(*vargs, PyObject *);
2989
27.9k
        assert(obj && _PyUnicode_CHECK(obj));
2990
2991
27.9k
        if (unicode_fromformat_write_str(writer, obj, width, precision, flags) == -1)
2992
0
            return NULL;
2993
27.9k
        break;
2994
27.9k
    }
2995
2996
27.9k
    case 'V':
2997
605
    {
2998
605
        PyObject *obj = va_arg(*vargs, PyObject *);
2999
605
        const char *str;
3000
605
        const wchar_t *wstr;
3001
605
        if (sizemod) {
3002
0
            wstr = va_arg(*vargs, const wchar_t*);
3003
0
        }
3004
605
        else {
3005
605
            str = va_arg(*vargs, const char *);
3006
605
        }
3007
605
        if (obj) {
3008
0
            assert(_PyUnicode_CHECK(obj));
3009
0
            if (unicode_fromformat_write_str(writer, obj, width, precision, flags) == -1)
3010
0
                return NULL;
3011
0
        }
3012
605
        else if (sizemod) {
3013
0
            assert(wstr != NULL);
3014
0
            if (unicode_fromformat_write_wcstr(writer, wstr, width, precision, flags) < 0)
3015
0
                return NULL;
3016
0
        }
3017
605
        else {
3018
605
            assert(str != NULL);
3019
605
            if (unicode_fromformat_write_utf8(writer, str, width, precision, flags) < 0)
3020
0
                return NULL;
3021
605
        }
3022
605
        break;
3023
605
    }
3024
3025
605
    case 'S':
3026
43
    {
3027
43
        PyObject *obj = va_arg(*vargs, PyObject *);
3028
43
        PyObject *str;
3029
43
        assert(obj);
3030
43
        str = PyObject_Str(obj);
3031
43
        if (!str)
3032
0
            return NULL;
3033
43
        if (unicode_fromformat_write_str(writer, str, width, precision, flags) == -1) {
3034
0
            Py_DECREF(str);
3035
0
            return NULL;
3036
0
        }
3037
43
        Py_DECREF(str);
3038
43
        break;
3039
43
    }
3040
3041
658
    case 'R':
3042
658
    {
3043
658
        PyObject *obj = va_arg(*vargs, PyObject *);
3044
658
        PyObject *repr;
3045
658
        assert(obj);
3046
658
        repr = PyObject_Repr(obj);
3047
658
        if (!repr)
3048
0
            return NULL;
3049
658
        if (unicode_fromformat_write_str(writer, repr, width, precision, flags) == -1) {
3050
0
            Py_DECREF(repr);
3051
0
            return NULL;
3052
0
        }
3053
658
        Py_DECREF(repr);
3054
658
        break;
3055
658
    }
3056
3057
0
    case 'A':
3058
0
    {
3059
0
        PyObject *obj = va_arg(*vargs, PyObject *);
3060
0
        PyObject *ascii;
3061
0
        assert(obj);
3062
0
        ascii = PyObject_ASCII(obj);
3063
0
        if (!ascii)
3064
0
            return NULL;
3065
0
        if (unicode_fromformat_write_str(writer, ascii, width, precision, flags) == -1) {
3066
0
            Py_DECREF(ascii);
3067
0
            return NULL;
3068
0
        }
3069
0
        Py_DECREF(ascii);
3070
0
        break;
3071
0
    }
3072
3073
0
    case 'T':
3074
0
    {
3075
0
        PyObject *obj = va_arg(*vargs, PyObject *);
3076
0
        PyTypeObject *type = (PyTypeObject *)Py_NewRef(Py_TYPE(obj));
3077
3078
0
        PyObject *type_name;
3079
0
        if (flags & F_ALT) {
3080
0
            type_name = _PyType_GetFullyQualifiedName(type, ':');
3081
0
        }
3082
0
        else {
3083
0
            type_name = PyType_GetFullyQualifiedName(type);
3084
0
        }
3085
0
        Py_DECREF(type);
3086
0
        if (!type_name) {
3087
0
            return NULL;
3088
0
        }
3089
3090
0
        if (unicode_fromformat_write_str(writer, type_name,
3091
0
                                         width, precision, flags) == -1) {
3092
0
            Py_DECREF(type_name);
3093
0
            return NULL;
3094
0
        }
3095
0
        Py_DECREF(type_name);
3096
0
        break;
3097
0
    }
3098
3099
0
    case 'N':
3100
0
    {
3101
0
        PyObject *type_raw = va_arg(*vargs, PyObject *);
3102
0
        assert(type_raw != NULL);
3103
3104
0
        if (!PyType_Check(type_raw)) {
3105
0
            PyErr_SetString(PyExc_TypeError, "%N argument must be a type");
3106
0
            return NULL;
3107
0
        }
3108
0
        PyTypeObject *type = (PyTypeObject*)type_raw;
3109
3110
0
        PyObject *type_name;
3111
0
        if (flags & F_ALT) {
3112
0
            type_name = _PyType_GetFullyQualifiedName(type, ':');
3113
0
        }
3114
0
        else {
3115
0
            type_name = PyType_GetFullyQualifiedName(type);
3116
0
        }
3117
0
        if (!type_name) {
3118
0
            return NULL;
3119
0
        }
3120
0
        if (unicode_fromformat_write_str(writer, type_name,
3121
0
                                         width, precision, flags) == -1) {
3122
0
            Py_DECREF(type_name);
3123
0
            return NULL;
3124
0
        }
3125
0
        Py_DECREF(type_name);
3126
0
        break;
3127
0
    }
3128
3129
0
    default:
3130
0
    invalid_format:
3131
0
        PyErr_Format(PyExc_SystemError, "invalid format string: %s", p);
3132
0
        return NULL;
3133
27.7M
    }
3134
3135
27.7M
    f++;
3136
27.7M
    return f;
3137
27.7M
}
3138
3139
static int
3140
unicode_from_format(_PyUnicodeWriter *writer, const char *format, va_list vargs)
3141
13.9M
{
3142
13.9M
    Py_ssize_t len = strlen(format);
3143
13.9M
    writer->min_length += len + 100;
3144
13.9M
    writer->overallocate = 1;
3145
3146
    // Copy varags to be able to pass a reference to a subfunction.
3147
13.9M
    va_list vargs2;
3148
13.9M
    va_copy(vargs2, vargs);
3149
3150
    // _PyUnicodeWriter_WriteASCIIString() below requires the format string
3151
    // to be encoded to ASCII.
3152
13.9M
    int is_ascii = (ucs1lib_find_max_char((Py_UCS1*)format, (Py_UCS1*)format + len) < 128);
3153
13.9M
    if (!is_ascii) {
3154
0
        Py_ssize_t i;
3155
0
        for (i=0; i < len && (unsigned char)format[i] <= 127; i++);
3156
0
        PyErr_Format(PyExc_ValueError,
3157
0
            "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3158
0
            "string, got a non-ASCII byte: 0x%02x",
3159
0
            (unsigned char)format[i]);
3160
0
        goto fail;
3161
0
    }
3162
3163
79.0M
    for (const char *f = format; *f; ) {
3164
65.0M
        if (*f == '%') {
3165
32.0M
            f = unicode_fromformat_arg(writer, f, &vargs2);
3166
32.0M
            if (f == NULL)
3167
0
                goto fail;
3168
32.0M
        }
3169
33.0M
        else {
3170
33.0M
            const char *p = strchr(f, '%');
3171
33.0M
            if (p != NULL) {
3172
23.4M
                len = p - f;
3173
23.4M
            }
3174
9.61M
            else {
3175
9.61M
                len = strlen(f);
3176
9.61M
                writer->overallocate = 0;
3177
9.61M
            }
3178
3179
33.0M
            if (_PyUnicodeWriter_WriteASCIIString(writer, f, len) < 0) {
3180
0
                goto fail;
3181
0
            }
3182
33.0M
            f += len;
3183
33.0M
        }
3184
65.0M
    }
3185
13.9M
    va_end(vargs2);
3186
13.9M
    return 0;
3187
3188
0
  fail:
3189
0
    va_end(vargs2);
3190
0
    return -1;
3191
13.9M
}
3192
3193
PyObject *
3194
PyUnicode_FromFormatV(const char *format, va_list vargs)
3195
13.9M
{
3196
13.9M
    _PyUnicodeWriter writer;
3197
13.9M
    _PyUnicodeWriter_Init(&writer);
3198
3199
13.9M
    if (unicode_from_format(&writer, format, vargs) < 0) {
3200
0
        _PyUnicodeWriter_Dealloc(&writer);
3201
0
        return NULL;
3202
0
    }
3203
13.9M
    return _PyUnicodeWriter_Finish(&writer);
3204
13.9M
}
3205
3206
/* Variadic front end of PyUnicode_FromFormatV().

   Return a new reference, or NULL with an exception set on error. */
PyObject *
PyUnicode_FromFormat(const char *format, ...)
{
    va_list vargs;

    va_start(vargs, format);
    PyObject *result = PyUnicode_FromFormatV(format, vargs);
    va_end(vargs);

    return result;
}
3217
3218
int
3219
PyUnicodeWriter_Format(PyUnicodeWriter *writer, const char *format, ...)
3220
0
{
3221
0
    va_list vargs;
3222
0
    va_start(vargs, format);
3223
0
    int res = _PyUnicodeWriter_FormatV(writer, format, vargs);
3224
0
    va_end(vargs);
3225
0
    return res;
3226
0
}
3227
3228
int
3229
_PyUnicodeWriter_FormatV(PyUnicodeWriter *writer, const char *format,
3230
                         va_list vargs)
3231
0
{
3232
0
    _PyUnicodeWriter *_writer = (_PyUnicodeWriter*)writer;
3233
0
    Py_ssize_t old_pos = _writer->pos;
3234
3235
0
    int res = unicode_from_format(_writer, format, vargs);
3236
3237
0
    if (res < 0) {
3238
0
        _writer->pos = old_pos;
3239
0
    }
3240
0
    return res;
3241
0
}
3242
3243
static Py_ssize_t
3244
unicode_get_widechar_size(PyObject *unicode)
3245
7.00k
{
3246
7.00k
    Py_ssize_t res;
3247
3248
7.00k
    assert(unicode != NULL);
3249
7.00k
    assert(_PyUnicode_CHECK(unicode));
3250
3251
7.00k
    res = _PyUnicode_LENGTH(unicode);
3252
#if SIZEOF_WCHAR_T == 2
3253
    if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3254
        const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3255
        const Py_UCS4 *end = s + res;
3256
        for (; s < end; ++s) {
3257
            if (*s > 0xFFFF) {
3258
                ++res;
3259
            }
3260
        }
3261
    }
3262
#endif
3263
7.00k
    return res;
3264
7.00k
}
3265
3266
static void
3267
unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3268
7.00k
{
3269
7.00k
    assert(unicode != NULL);
3270
7.00k
    assert(_PyUnicode_CHECK(unicode));
3271
3272
7.00k
    if (PyUnicode_KIND(unicode) == sizeof(wchar_t)) {
3273
0
        memcpy(w, PyUnicode_DATA(unicode), size * sizeof(wchar_t));
3274
0
        return;
3275
0
    }
3276
3277
7.00k
    if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3278
7.00k
        const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3279
600k
        for (; size--; ++s, ++w) {
3280
593k
            *w = *s;
3281
593k
        }
3282
7.00k
    }
3283
0
    else {
3284
0
#if SIZEOF_WCHAR_T == 4
3285
0
        assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3286
0
        const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3287
0
        for (; size--; ++s, ++w) {
3288
0
            *w = *s;
3289
0
        }
3290
#else
3291
        assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3292
        const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3293
        for (; size--; ++s, ++w) {
3294
            Py_UCS4 ch = *s;
3295
            if (ch > 0xFFFF) {
3296
                assert(ch <= MAX_UNICODE);
3297
                /* encode surrogate pair in this case */
3298
                *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3299
                if (!size--)
3300
                    break;
3301
                *w = Py_UNICODE_LOW_SURROGATE(ch);
3302
            }
3303
            else {
3304
                *w = ch;
3305
            }
3306
        }
3307
#endif
3308
0
    }
3309
7.00k
}
3310
3311
#ifdef HAVE_WCHAR_H
3312
3313
/* Convert a Unicode object to a wide character string.
3314
3315
   - If w is NULL: return the number of wide characters (including the null
3316
     character) required to convert the unicode object. Ignore size argument.
3317
3318
   - Otherwise: return the number of wide characters (excluding the null
3319
     character) written into w. Write at most size wide characters (including
3320
     the null character). */
3321
Py_ssize_t
3322
PyUnicode_AsWideChar(PyObject *unicode,
3323
                     wchar_t *w,
3324
                     Py_ssize_t size)
3325
5.73k
{
3326
5.73k
    Py_ssize_t res;
3327
3328
5.73k
    if (unicode == NULL) {
3329
0
        PyErr_BadInternalCall();
3330
0
        return -1;
3331
0
    }
3332
5.73k
    if (!PyUnicode_Check(unicode)) {
3333
0
        PyErr_BadArgument();
3334
0
        return -1;
3335
0
    }
3336
3337
5.73k
    res = unicode_get_widechar_size(unicode);
3338
5.73k
    if (w == NULL) {
3339
0
        return res + 1;
3340
0
    }
3341
3342
5.73k
    if (size > res) {
3343
5.73k
        size = res + 1;
3344
5.73k
    }
3345
0
    else {
3346
0
        res = size;
3347
0
    }
3348
5.73k
    unicode_copy_as_widechar(unicode, w, size);
3349
3350
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3351
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
3352
       non-Unicode locales and hence needs conversion first. */
3353
    if (_Py_LocaleUsesNonUnicodeWchar()) {
3354
        if (_Py_EncodeNonUnicodeWchar_InPlace(w, size) < 0) {
3355
            return -1;
3356
        }
3357
    }
3358
#endif
3359
3360
5.73k
    return res;
3361
5.73k
}
3362
3363
wchar_t*
3364
PyUnicode_AsWideCharString(PyObject *unicode,
3365
                           Py_ssize_t *size)
3366
1.26k
{
3367
1.26k
    wchar_t *buffer;
3368
1.26k
    Py_ssize_t buflen;
3369
3370
1.26k
    if (unicode == NULL) {
3371
0
        PyErr_BadInternalCall();
3372
0
        return NULL;
3373
0
    }
3374
1.26k
    if (!PyUnicode_Check(unicode)) {
3375
0
        PyErr_BadArgument();
3376
0
        return NULL;
3377
0
    }
3378
3379
1.26k
    buflen = unicode_get_widechar_size(unicode);
3380
1.26k
    buffer = (wchar_t *) PyMem_New(wchar_t, (buflen + 1));
3381
1.26k
    if (buffer == NULL) {
3382
0
        PyErr_NoMemory();
3383
0
        return NULL;
3384
0
    }
3385
1.26k
    unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3386
3387
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3388
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
3389
       non-Unicode locales and hence needs conversion first. */
3390
    if (_Py_LocaleUsesNonUnicodeWchar()) {
3391
        if (_Py_EncodeNonUnicodeWchar_InPlace(buffer, (buflen + 1)) < 0) {
3392
            return NULL;
3393
        }
3394
    }
3395
#endif
3396
3397
1.26k
    if (size != NULL) {
3398
820
        *size = buflen;
3399
820
    }
3400
448
    else if (wcslen(buffer) != (size_t)buflen) {
3401
0
        PyMem_Free(buffer);
3402
0
        PyErr_SetString(PyExc_ValueError,
3403
0
                        "embedded null character");
3404
0
        return NULL;
3405
0
    }
3406
1.26k
    return buffer;
3407
1.26k
}
3408
3409
#endif /* HAVE_WCHAR_H */
3410
3411
int
3412
_PyUnicode_WideCharString_Converter(PyObject *obj, void *ptr)
3413
0
{
3414
0
    wchar_t **p = (wchar_t **)ptr;
3415
0
    if (obj == NULL) {
3416
0
        PyMem_Free(*p);
3417
0
        *p = NULL;
3418
0
        return 1;
3419
0
    }
3420
0
    if (PyUnicode_Check(obj)) {
3421
0
        *p = PyUnicode_AsWideCharString(obj, NULL);
3422
0
        if (*p == NULL) {
3423
0
            return 0;
3424
0
        }
3425
0
        return Py_CLEANUP_SUPPORTED;
3426
0
    }
3427
0
    PyErr_Format(PyExc_TypeError,
3428
0
                 "argument must be str, not %.50s",
3429
0
                 Py_TYPE(obj)->tp_name);
3430
0
    return 0;
3431
0
}
3432
3433
int
3434
_PyUnicode_WideCharString_Opt_Converter(PyObject *obj, void *ptr)
3435
0
{
3436
0
    wchar_t **p = (wchar_t **)ptr;
3437
0
    if (obj == NULL) {
3438
0
        PyMem_Free(*p);
3439
0
        *p = NULL;
3440
0
        return 1;
3441
0
    }
3442
0
    if (obj == Py_None) {
3443
0
        *p = NULL;
3444
0
        return 1;
3445
0
    }
3446
0
    if (PyUnicode_Check(obj)) {
3447
0
        *p = PyUnicode_AsWideCharString(obj, NULL);
3448
0
        if (*p == NULL) {
3449
0
            return 0;
3450
0
        }
3451
0
        return Py_CLEANUP_SUPPORTED;
3452
0
    }
3453
0
    PyErr_Format(PyExc_TypeError,
3454
0
                 "argument must be str or None, not %.50s",
3455
0
                 Py_TYPE(obj)->tp_name);
3456
0
    return 0;
3457
0
}
3458
3459
PyObject *
3460
PyUnicode_FromOrdinal(int ordinal)
3461
216k
{
3462
216k
    if (ordinal < 0 || ordinal > MAX_UNICODE) {
3463
0
        PyErr_SetString(PyExc_ValueError,
3464
0
                        "chr() arg not in range(0x110000)");
3465
0
        return NULL;
3466
0
    }
3467
3468
216k
    return unicode_char((Py_UCS4)ordinal);
3469
216k
}
3470
3471
/* Return `obj` as an exact str object.

   An exact str is returned as a new reference to the same object; an
   instance of a str subtype is copied with _PyUnicode_Copy().  Any other
   type raises TypeError and returns NULL. */
PyObject *
PyUnicode_FromObject(PyObject *obj)
{
    /* XXX Perhaps we should make this API an alias of
       PyObject_Str() instead ?! */
    if (PyUnicode_CheckExact(obj)) {
        return Py_NewRef(obj);
    }

    if (!PyUnicode_Check(obj)) {
        PyErr_Format(PyExc_TypeError,
                     "Can't convert '%.100s' object to str implicitly",
                     Py_TYPE(obj)->tp_name);
        return NULL;
    }

    /* A str subtype instance: hand back a true str with the same data. */
    return _PyUnicode_Copy(obj);
}
3489
3490
/* Decode the bytes-like object `obj` to str using `encoding` and `errors`.

   bytes objects are decoded directly; other objects are read through the
   buffer interface.  str input is rejected with TypeError.  Empty input
   yields the empty string after `encoding` and `errors` have been checked
   with unicode_check_encoding_errors().  Return NULL on failure. */
PyObject *
PyUnicode_FromEncodedObject(PyObject *obj,
                            const char *encoding,
                            const char *errors)
{
    if (obj == NULL) {
        PyErr_BadInternalCall();
        return NULL;
    }

    /* Decoding bytes objects is the most common case and should be fast */
    if (PyBytes_Check(obj)) {
        if (PyBytes_GET_SIZE(obj) != 0) {
            return PyUnicode_Decode(
                    PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
                    encoding, errors);
        }
        if (unicode_check_encoding_errors(encoding, errors) < 0) {
            return NULL;
        }
        _Py_RETURN_UNICODE_EMPTY();
    }

    if (PyUnicode_Check(obj)) {
        PyErr_SetString(PyExc_TypeError,
                        "decoding str is not supported");
        return NULL;
    }

    /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
    Py_buffer view;
    if (PyObject_GetBuffer(obj, &view, PyBUF_SIMPLE) < 0) {
        PyErr_Format(PyExc_TypeError,
                     "decoding to str: need a bytes-like object, %.80s found",
                     Py_TYPE(obj)->tp_name);
        return NULL;
    }

    if (view.len == 0) {
        PyBuffer_Release(&view);
        if (unicode_check_encoding_errors(encoding, errors) < 0) {
            return NULL;
        }
        _Py_RETURN_UNICODE_EMPTY();
    }

    PyObject *decoded = PyUnicode_Decode((char*) view.buf, view.len,
                                         encoding, errors);
    PyBuffer_Release(&view);
    return decoded;
}
3542
3543
/* Normalize an encoding name into `lower` (a buffer of `lower_len` bytes):
   alphanumeric characters and '.' are copied in lowercase, and every run of
   other characters that sits between two copied characters becomes a single
   '_'.  Leading and trailing runs are dropped.  Similar to
   encodings.normalize_encoding(), but also converts to lowercase.

   Return 1 on success, or 0 on error (the normalized name does not fit in
   lower_len-1 characters). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    assert(encoding != NULL);

    char *out = lower;
    /* Last slot of the buffer, reserved for the terminating NUL. */
    char *const out_limit = &lower[lower_len - 1];
    int pending_sep = 0;

    for (const char *in = encoding; *in != 0; in++) {
        char c = *in;

        if (!(Py_ISALNUM(c) || c == '.')) {
            /* Remember the separator; it is only emitted if another
               name character follows. */
            pending_sep = 1;
            continue;
        }

        if (pending_sep && out != lower) {
            if (out == out_limit) {
                return 0;
            }
            *out++ = '_';
        }
        pending_sep = 0;

        if (out == out_limit) {
            return 0;
        }
        *out++ = Py_TOLOWER(c);
    }

    *out = '\0';
    return 1;
}
3591
3592
PyObject *
3593
PyUnicode_Decode(const char *s,
3594
                 Py_ssize_t size,
3595
                 const char *encoding,
3596
                 const char *errors)
3597
4.93M
{
3598
4.93M
    PyObject *buffer = NULL, *unicode;
3599
4.93M
    Py_buffer info;
3600
4.93M
    char buflower[11];   /* strlen("iso-8859-1\0") == 11, longest shortcut */
3601
3602
4.93M
    if (unicode_check_encoding_errors(encoding, errors) < 0) {
3603
0
        return NULL;
3604
0
    }
3605
3606
4.93M
    if (size == 0) {
3607
0
        _Py_RETURN_UNICODE_EMPTY();
3608
0
    }
3609
3610
4.93M
    if (encoding == NULL) {
3611
36.2k
        return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3612
36.2k
    }
3613
3614
    /* Shortcuts for common default encodings */
3615
4.89M
    if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3616
4.88M
        char *lower = buflower;
3617
3618
        /* Fast paths */
3619
4.88M
        if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3620
834k
            lower += 3;
3621
834k
            if (*lower == '_') {
3622
                /* Match "utf8" and "utf_8" */
3623
834k
                lower++;
3624
834k
            }
3625
3626
834k
            if (lower[0] == '8' && lower[1] == 0) {
3627
833k
                return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3628
833k
            }
3629
918
            else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3630
107
                return PyUnicode_DecodeUTF16(s, size, errors, 0);
3631
107
            }
3632
811
            else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3633
106
                return PyUnicode_DecodeUTF32(s, size, errors, 0);
3634
106
            }
3635
834k
        }
3636
4.05M
        else {
3637
4.05M
            if (strcmp(lower, "ascii") == 0
3638
3.62M
                || strcmp(lower, "us_ascii") == 0) {
3639
578k
                return PyUnicode_DecodeASCII(s, size, errors);
3640
578k
            }
3641
    #ifdef MS_WINDOWS
3642
            else if (strcmp(lower, "mbcs") == 0) {
3643
                return PyUnicode_DecodeMBCS(s, size, errors);
3644
            }
3645
    #endif
3646
3.47M
            else if (strcmp(lower, "latin1") == 0
3647
3.47M
                     || strcmp(lower, "latin_1") == 0
3648
331k
                     || strcmp(lower, "iso_8859_1") == 0
3649
3.16M
                     || strcmp(lower, "iso8859_1") == 0) {
3650
3.16M
                return PyUnicode_DecodeLatin1(s, size, errors);
3651
3.16M
            }
3652
4.05M
        }
3653
4.88M
    }
3654
3655
    /* Decode via the codec registry */
3656
316k
    buffer = NULL;
3657
316k
    if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
3658
0
        goto onError;
3659
316k
    buffer = PyMemoryView_FromBuffer(&info);
3660
316k
    if (buffer == NULL)
3661
0
        goto onError;
3662
316k
    unicode = _PyCodec_DecodeText(buffer, encoding, errors);
3663
316k
    if (unicode == NULL)
3664
133k
        goto onError;
3665
182k
    if (!PyUnicode_Check(unicode)) {
3666
0
        PyErr_Format(PyExc_TypeError,
3667
0
                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
3668
0
                     "use codecs.decode() to decode to arbitrary types",
3669
0
                     encoding,
3670
0
                     Py_TYPE(unicode)->tp_name);
3671
0
        Py_DECREF(unicode);
3672
0
        goto onError;
3673
0
    }
3674
182k
    Py_DECREF(buffer);
3675
182k
    return unicode_result(unicode);
3676
3677
133k
  onError:
3678
133k
    Py_XDECREF(buffer);
3679
133k
    return NULL;
3680
182k
}
3681
3682
/* Decode the str object `unicode` through the codec registry and return
   whatever object the codec produces.  A NULL `encoding` selects
   PyUnicode_GetDefaultEncoding().  Non-str input raises via
   PyErr_BadArgument() and returns NULL. */
PyAPI_FUNC(PyObject *)
PyUnicode_AsDecodedObject(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    const char *codec = (encoding != NULL)
                            ? encoding
                            : PyUnicode_GetDefaultEncoding();

    /* Decode via the codec registry */
    return PyCodec_Decode(unicode, codec, errors);
}
3698
3699
/* Decode the str object `unicode` through the codec registry, requiring
   the codec to return a str.

   `encoding` may be NULL, in which case PyUnicode_GetDefaultEncoding() is
   used.  `errors` is passed to the codec unchanged.

   Return the decoded str (passed through unicode_result()), or NULL with
   an exception set: PyErr_BadArgument() for non-str input, TypeError when
   the decoder returns something other than str. */
PyAPI_FUNC(PyObject *)
PyUnicode_AsDecodedUnicode(PyObject *unicode,
                           const char *encoding,
                           const char *errors)
{
    PyObject *v;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        goto onError;
    }

    if (encoding == NULL)
        encoding = PyUnicode_GetDefaultEncoding();

    /* Decode via the codec registry */
    v = PyCodec_Decode(unicode, encoding, errors);
    if (v == NULL)
        goto onError;
    if (!PyUnicode_Check(v)) {
        /* Report the type the decoder actually returned (v), not the type
           of the input, which is always str at this point. */
        PyErr_Format(PyExc_TypeError,
                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
                     "use codecs.decode() to decode to arbitrary types",
                     encoding,
                     Py_TYPE(v)->tp_name);
        Py_DECREF(v);
        goto onError;
    }
    return unicode_result(v);

  onError:
    return NULL;
}
3732
3733
/* Encode the str object `unicode` through the codec registry and return
   whatever object the codec produces.  A NULL `encoding` selects
   PyUnicode_GetDefaultEncoding().  Return NULL on failure. */
PyAPI_FUNC(PyObject *)
PyUnicode_AsEncodedObject(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    const char *codec = (encoding != NULL)
                            ? encoding
                            : PyUnicode_GetDefaultEncoding();

    /* Encode via the codec registry; a NULL result is passed through. */
    return PyCodec_Encode(unicode, codec, errors);
}
3757
3758
3759
static PyObject *
3760
unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
3761
                      int current_locale)
3762
420
{
3763
420
    Py_ssize_t wlen;
3764
420
    wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3765
420
    if (wstr == NULL) {
3766
0
        return NULL;
3767
0
    }
3768
3769
420
    if ((size_t)wlen != wcslen(wstr)) {
3770
0
        PyErr_SetString(PyExc_ValueError, "embedded null character");
3771
0
        PyMem_Free(wstr);
3772
0
        return NULL;
3773
0
    }
3774
3775
420
    char *str;
3776
420
    size_t error_pos;
3777
420
    const char *reason;
3778
420
    int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
3779
420
                                 current_locale, error_handler);
3780
420
    PyMem_Free(wstr);
3781
3782
420
    if (res != 0) {
3783
0
        if (res == -2) {
3784
0
            PyObject *exc;
3785
0
            exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3786
0
                    "locale", unicode,
3787
0
                    (Py_ssize_t)error_pos,
3788
0
                    (Py_ssize_t)(error_pos+1),
3789
0
                    reason);
3790
0
            if (exc != NULL) {
3791
0
                PyCodec_StrictErrors(exc);
3792
0
                Py_DECREF(exc);
3793
0
            }
3794
0
        }
3795
0
        else if (res == -3) {
3796
0
            PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3797
0
        }
3798
0
        else {
3799
0
            PyErr_NoMemory();
3800
0
        }
3801
0
        return NULL;
3802
0
    }
3803
3804
420
    PyObject *bytes = PyBytes_FromString(str);
3805
420
    PyMem_RawFree(str);
3806
420
    return bytes;
3807
420
}
3808
3809
/* Encode `unicode` to bytes via unicode_encode_locale() with
   current_locale=1, using the error handler named by `errors`. */
PyObject *
PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
{
    return unicode_encode_locale(unicode, _Py_GetErrorHandler(errors), 1);
}
3815
3816
/* Encode `unicode` to bytes with the interpreter's filesystem codec.

   Uses the UTF-8 encoder when the codec is flagged as UTF-8, the named
   codec when one is configured (unless _Py_FORCE_UTF8_FS_ENCODING is
   defined), and otherwise falls back to the config's filesystem error
   handler with either the UTF-8 or the locale encoder. */
PyObject *
PyUnicode_EncodeFSDefault(PyObject *unicode)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    struct _Py_unicode_fs_codec *codec = &interp->unicode.fs_codec;

    if (codec->utf8) {
        return unicode_encode_utf8(unicode,
                                   codec->error_handler,
                                   codec->errors);
    }
#ifndef _Py_FORCE_UTF8_FS_ENCODING
    if (codec->encoding) {
        return PyUnicode_AsEncodedString(unicode,
                                         codec->encoding,
                                         codec->errors);
    }
#endif

    /* Before _PyUnicode_InitEncodings() is called, the Python codec
       machinery is not ready and so cannot be used:
       use wcstombs() in this case. */
    const PyConfig *config = _PyInterpreterState_GetConfig(interp);
    const wchar_t *fs_errors = config->filesystem_errors;
    assert(fs_errors != NULL);
    _Py_error_handler handler = get_error_handler_wide(fs_errors);
    assert(handler != _Py_ERROR_UNKNOWN);
#ifdef _Py_FORCE_UTF8_FS_ENCODING
    return unicode_encode_utf8(unicode, handler, NULL);
#else
    return unicode_encode_locale(unicode, handler, 0);
#endif
}
3849
3850
/* Encode the str object `unicode` to bytes using `encoding` and `errors`.

   A NULL `encoding` selects UTF-8.  Common encoding names (utf-8, utf-16,
   utf-32, ascii, latin-1 and, on Windows, mbcs) are dispatched straight to
   their C encoders; everything else goes through the codec registry.  A
   bytearray returned by a codec is converted to bytes after emitting a
   RuntimeWarning; any other non-bytes result raises TypeError.
   Return NULL on failure. */
PyObject *
PyUnicode_AsEncodedString(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    /* Room for "iso_8859_1" plus its NUL: the longest shortcut name. */
    char normalized[11];

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (unicode_check_encoding_errors(encoding, errors) < 0) {
        return NULL;
    }

    if (encoding == NULL) {
        return _PyUnicode_AsUTF8String(unicode, errors);
    }

    /* Shortcuts for common default encodings */
    if (_Py_normalize_encoding(encoding, normalized, sizeof(normalized))) {
        const char *name = normalized;

        /* Fast paths */
        if (name[0] == 'u' && name[1] == 't' && name[2] == 'f') {
            name += 3;
            if (*name == '_') {
                /* Match "utf8" and "utf_8" */
                name++;
            }

            if (strcmp(name, "8") == 0) {
                return _PyUnicode_AsUTF8String(unicode, errors);
            }
            if (strcmp(name, "16") == 0) {
                return _PyUnicode_EncodeUTF16(unicode, errors, 0);
            }
            if (strcmp(name, "32") == 0) {
                return _PyUnicode_EncodeUTF32(unicode, errors, 0);
            }
        }
        else {
            if (strcmp(name, "ascii") == 0
                || strcmp(name, "us_ascii") == 0) {
                return _PyUnicode_AsASCIIString(unicode, errors);
            }
#ifdef MS_WINDOWS
            if (strcmp(name, "mbcs") == 0) {
                return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
            }
#endif
            if (strcmp(name, "latin1") == 0 ||
                strcmp(name, "latin_1") == 0 ||
                strcmp(name, "iso_8859_1") == 0 ||
                strcmp(name, "iso8859_1") == 0) {
                return _PyUnicode_AsLatin1String(unicode, errors);
            }
        }
    }

    /* Encode via the codec registry */
    PyObject *encoded = _PyCodec_EncodeText(unicode, encoding, errors);
    if (encoded == NULL) {
        return NULL;
    }

    /* The normal path */
    if (PyBytes_Check(encoded)) {
        return encoded;
    }

    if (!PyByteArray_Check(encoded)) {
        PyErr_Format(PyExc_TypeError,
                     "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
                     "use codecs.encode() to encode to arbitrary types",
                     encoding,
                     Py_TYPE(encoded)->tp_name);
        Py_DECREF(encoded);
        return NULL;
    }

    /* If the codec returns a buffer, raise a warning and convert to bytes */
    int warn_failed = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
        "encoder %s returned bytearray instead of bytes; "
        "use codecs.encode() to encode to arbitrary types",
        encoding);
    if (warn_failed) {
        Py_DECREF(encoded);
        return NULL;
    }

    PyObject *as_bytes = PyBytes_FromStringAndSize(
                             PyByteArray_AS_STRING(encoded),
                             PyByteArray_GET_SIZE(encoded));
    Py_DECREF(encoded);
    return as_bytes;
}
3949
3950
/* Encode the str object `unicode` through the codec registry, requiring
   the codec to return a str.  A NULL `encoding` selects
   PyUnicode_GetDefaultEncoding().  Return NULL on failure; a non-str
   encoder result raises TypeError. */
PyAPI_FUNC(PyObject *)
PyUnicode_AsEncodedUnicode(PyObject *unicode,
                           const char *encoding,
                           const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (encoding == NULL) {
        encoding = PyUnicode_GetDefaultEncoding();
    }

    /* Encode via the codec registry */
    PyObject *encoded = PyCodec_Encode(unicode, encoding, errors);
    if (encoded == NULL) {
        return NULL;
    }
    if (PyUnicode_Check(encoded)) {
        return encoded;
    }

    PyErr_Format(PyExc_TypeError,
                 "'%.400s' encoder returned '%.400s' instead of 'str'; "
                 "use codecs.encode() to encode to arbitrary types",
                 encoding,
                 Py_TYPE(encoded)->tp_name);
    Py_DECREF(encoded);
    return NULL;
}
3983
3984
static PyObject*
3985
unicode_decode_locale(const char *str, Py_ssize_t len,
3986
                      _Py_error_handler errors, int current_locale)
3987
15.6k
{
3988
15.6k
    if (str[len] != '\0' || (size_t)len != strlen(str))  {
3989
0
        PyErr_SetString(PyExc_ValueError, "embedded null byte");
3990
0
        return NULL;
3991
0
    }
3992
3993
15.6k
    wchar_t *wstr;
3994
15.6k
    size_t wlen;
3995
15.6k
    const char *reason;
3996
15.6k
    int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
3997
15.6k
                                 current_locale, errors);
3998
15.6k
    if (res != 0) {
3999
0
        if (res == -2) {
4000
0
            PyObject *exc;
4001
0
            exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
4002
0
                                        "locale", str, len,
4003
0
                                        (Py_ssize_t)wlen,
4004
0
                                        (Py_ssize_t)(wlen + 1),
4005
0
                                        reason);
4006
0
            if (exc != NULL) {
4007
0
                PyCodec_StrictErrors(exc);
4008
0
                Py_DECREF(exc);
4009
0
            }
4010
0
        }
4011
0
        else if (res == -3) {
4012
0
            PyErr_SetString(PyExc_ValueError, "unsupported error handler");
4013
0
        }
4014
0
        else {
4015
0
            PyErr_NoMemory();
4016
0
        }
4017
0
        return NULL;
4018
0
    }
4019
4020
15.6k
    PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
4021
15.6k
    PyMem_RawFree(wstr);
4022
15.6k
    return unicode;
4023
15.6k
}
4024
4025
PyObject*
4026
PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
4027
                              const char *errors)
4028
0
{
4029
0
    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
4030
0
    return unicode_decode_locale(str, len, error_handler, 1);
4031
0
}
4032
4033
PyObject*
4034
PyUnicode_DecodeLocale(const char *str, const char *errors)
4035
10.4k
{
4036
10.4k
    Py_ssize_t size = (Py_ssize_t)strlen(str);
4037
10.4k
    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
4038
10.4k
    return unicode_decode_locale(str, size, error_handler, 1);
4039
10.4k
}
4040
4041
4042
PyObject*
4043
0
PyUnicode_DecodeFSDefault(const char *s) {
4044
0
    Py_ssize_t size = (Py_ssize_t)strlen(s);
4045
0
    return PyUnicode_DecodeFSDefaultAndSize(s, size);
4046
0
}
4047
4048
PyObject*
4049
PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
4050
6.72k
{
4051
6.72k
    PyInterpreterState *interp = _PyInterpreterState_GET();
4052
6.72k
    struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
4053
6.72k
    if (fs_codec->utf8) {
4054
1.59k
        return unicode_decode_utf8(s, size,
4055
1.59k
                                   fs_codec->error_handler,
4056
1.59k
                                   fs_codec->errors,
4057
1.59k
                                   NULL);
4058
1.59k
    }
4059
5.13k
#ifndef _Py_FORCE_UTF8_FS_ENCODING
4060
5.13k
    else if (fs_codec->encoding) {
4061
0
        return PyUnicode_Decode(s, size,
4062
0
                                fs_codec->encoding,
4063
0
                                fs_codec->errors);
4064
0
    }
4065
5.13k
#endif
4066
5.13k
    else {
4067
        /* Before _PyUnicode_InitEncodings() is called, the Python codec
4068
           machinery is not ready and so cannot be used:
4069
           use mbstowcs() in this case. */
4070
5.13k
        const PyConfig *config = _PyInterpreterState_GetConfig(interp);
4071
5.13k
        const wchar_t *filesystem_errors = config->filesystem_errors;
4072
5.13k
        assert(filesystem_errors != NULL);
4073
5.13k
        _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
4074
5.13k
        assert(errors != _Py_ERROR_UNKNOWN);
4075
#ifdef _Py_FORCE_UTF8_FS_ENCODING
4076
        return unicode_decode_utf8(s, size, errors, NULL, NULL);
4077
#else
4078
5.13k
        return unicode_decode_locale(s, size, errors, 0);
4079
5.13k
#endif
4080
5.13k
    }
4081
6.72k
}
4082
4083
4084
int
4085
PyUnicode_FSConverter(PyObject* arg, void* addr)
4086
11.1k
{
4087
11.1k
    PyObject *path = NULL;
4088
11.1k
    PyObject *output = NULL;
4089
11.1k
    Py_ssize_t size;
4090
11.1k
    const char *data;
4091
11.1k
    if (arg == NULL) {
4092
0
        Py_DECREF(*(PyObject**)addr);
4093
0
        *(PyObject**)addr = NULL;
4094
0
        return 1;
4095
0
    }
4096
11.1k
    path = PyOS_FSPath(arg);
4097
11.1k
    if (path == NULL) {
4098
0
        return 0;
4099
0
    }
4100
11.1k
    if (PyBytes_Check(path)) {
4101
0
        output = path;
4102
0
    }
4103
11.1k
    else {  // PyOS_FSPath() guarantees its returned value is bytes or str.
4104
11.1k
        output = PyUnicode_EncodeFSDefault(path);
4105
11.1k
        Py_DECREF(path);
4106
11.1k
        if (!output) {
4107
0
            return 0;
4108
0
        }
4109
11.1k
        assert(PyBytes_Check(output));
4110
11.1k
    }
4111
4112
11.1k
    size = PyBytes_GET_SIZE(output);
4113
11.1k
    data = PyBytes_AS_STRING(output);
4114
11.1k
    if ((size_t)size != strlen(data)) {
4115
0
        PyErr_SetString(PyExc_ValueError, "embedded null byte");
4116
0
        Py_DECREF(output);
4117
0
        return 0;
4118
0
    }
4119
11.1k
    *(PyObject**)addr = output;
4120
11.1k
    return Py_CLEANUP_SUPPORTED;
4121
11.1k
}
4122
4123
4124
int
4125
PyUnicode_FSDecoder(PyObject* arg, void* addr)
4126
21.5k
{
4127
21.5k
    if (arg == NULL) {
4128
0
        Py_DECREF(*(PyObject**)addr);
4129
0
        *(PyObject**)addr = NULL;
4130
0
        return 1;
4131
0
    }
4132
4133
21.5k
    PyObject *path = PyOS_FSPath(arg);
4134
21.5k
    if (path == NULL) {
4135
0
        return 0;
4136
0
    }
4137
4138
21.5k
    PyObject *output = NULL;
4139
21.5k
    if (PyUnicode_Check(path)) {
4140
21.5k
        output = path;
4141
21.5k
    }
4142
0
    else if (PyBytes_Check(path)) {
4143
0
        output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path),
4144
0
                                                  PyBytes_GET_SIZE(path));
4145
0
        Py_DECREF(path);
4146
0
        if (!output) {
4147
0
            return 0;
4148
0
        }
4149
0
    }
4150
0
    else {
4151
0
        PyErr_Format(PyExc_TypeError,
4152
0
                     "path should be string, bytes, or os.PathLike, not %.200s",
4153
0
                     Py_TYPE(arg)->tp_name);
4154
0
        Py_DECREF(path);
4155
0
        return 0;
4156
0
    }
4157
4158
21.5k
    if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
4159
21.5k
                 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
4160
0
        PyErr_SetString(PyExc_ValueError, "embedded null character");
4161
0
        Py_DECREF(output);
4162
0
        return 0;
4163
0
    }
4164
21.5k
    *(PyObject**)addr = output;
4165
21.5k
    return Py_CLEANUP_SUPPORTED;
4166
21.5k
}
4167
4168
4169
static int unicode_fill_utf8(PyObject *unicode);
4170
4171
4172
static int
4173
unicode_ensure_utf8(PyObject *unicode)
4174
21.2M
{
4175
21.2M
    int err = 0;
4176
21.2M
    if (PyUnicode_UTF8(unicode) == NULL) {
4177
145k
        Py_BEGIN_CRITICAL_SECTION(unicode);
4178
145k
        if (PyUnicode_UTF8(unicode) == NULL) {
4179
145k
            err = unicode_fill_utf8(unicode);
4180
145k
        }
4181
145k
        Py_END_CRITICAL_SECTION();
4182
145k
    }
4183
21.2M
    return err;
4184
21.2M
}
4185
4186
const char *
4187
PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
4188
21.2M
{
4189
21.2M
    if (!PyUnicode_Check(unicode)) {
4190
0
        PyErr_BadArgument();
4191
0
        if (psize) {
4192
0
            *psize = -1;
4193
0
        }
4194
0
        return NULL;
4195
0
    }
4196
4197
21.2M
    if (unicode_ensure_utf8(unicode) == -1) {
4198
206
        if (psize) {
4199
206
            *psize = -1;
4200
206
        }
4201
206
        return NULL;
4202
206
    }
4203
4204
21.2M
    if (psize) {
4205
21.1M
        *psize = PyUnicode_UTF8_LENGTH(unicode);
4206
21.1M
    }
4207
21.2M
    return PyUnicode_UTF8(unicode);
4208
21.2M
}
4209
4210
const char *
4211
PyUnicode_AsUTF8(PyObject *unicode)
4212
68.0k
{
4213
68.0k
    return PyUnicode_AsUTF8AndSize(unicode, NULL);
4214
68.0k
}
4215
4216
const char *
4217
_PyUnicode_AsUTF8NoNUL(PyObject *unicode)
4218
1.40M
{
4219
1.40M
    Py_ssize_t size;
4220
1.40M
    const char *s = PyUnicode_AsUTF8AndSize(unicode, &size);
4221
1.40M
    if (s && strlen(s) != (size_t)size) {
4222
156
        PyErr_SetString(PyExc_ValueError, "embedded null character");
4223
156
        return NULL;
4224
156
    }
4225
1.40M
    return s;
4226
1.40M
}
4227
4228
/*
4229
PyUnicode_GetSize() has been deprecated since Python 3.3
4230
because it returned length of Py_UNICODE.
4231
4232
But this function is part of stable abi, because it doesn't
4233
include Py_UNICODE in signature and it was not excluded from
4234
stable ABI in PEP 384.
4235
*/
4236
PyAPI_FUNC(Py_ssize_t)
4237
PyUnicode_GetSize(PyObject *unicode)
4238
0
{
4239
0
    PyErr_SetString(PyExc_RuntimeError,
4240
0
                    "PyUnicode_GetSize has been removed.");
4241
0
    return -1;
4242
0
}
4243
4244
Py_ssize_t
4245
PyUnicode_GetLength(PyObject *unicode)
4246
33.6k
{
4247
33.6k
    if (!PyUnicode_Check(unicode)) {
4248
0
        PyErr_BadArgument();
4249
0
        return -1;
4250
0
    }
4251
33.6k
    return PyUnicode_GET_LENGTH(unicode);
4252
33.6k
}
4253
4254
Py_UCS4
4255
PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4256
22
{
4257
22
    const void *data;
4258
22
    int kind;
4259
4260
22
    if (!PyUnicode_Check(unicode)) {
4261
0
        PyErr_BadArgument();
4262
0
        return (Py_UCS4)-1;
4263
0
    }
4264
22
    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4265
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
4266
0
        return (Py_UCS4)-1;
4267
0
    }
4268
22
    data = PyUnicode_DATA(unicode);
4269
22
    kind = PyUnicode_KIND(unicode);
4270
22
    return PyUnicode_READ(kind, data, index);
4271
22
}
4272
4273
int
4274
PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4275
0
{
4276
0
    if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
4277
0
        PyErr_BadArgument();
4278
0
        return -1;
4279
0
    }
4280
0
    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4281
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
4282
0
        return -1;
4283
0
    }
4284
0
    if (unicode_check_modifiable(unicode))
4285
0
        return -1;
4286
0
    if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4287
0
        PyErr_SetString(PyExc_ValueError, "character out of range");
4288
0
        return -1;
4289
0
    }
4290
0
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4291
0
                    index, ch);
4292
0
    return 0;
4293
0
}
4294
4295
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4300
4301
/* create or adjust a UnicodeDecodeError */
4302
static void
4303
make_decode_exception(PyObject **exceptionObject,
4304
                      const char *encoding,
4305
                      const char *input, Py_ssize_t length,
4306
                      Py_ssize_t startpos, Py_ssize_t endpos,
4307
                      const char *reason)
4308
266k
{
4309
266k
    if (*exceptionObject == NULL) {
4310
77.1k
        *exceptionObject = PyUnicodeDecodeError_Create(
4311
77.1k
            encoding, input, length, startpos, endpos, reason);
4312
77.1k
    }
4313
189k
    else {
4314
189k
        if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4315
0
            goto onError;
4316
189k
        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4317
0
            goto onError;
4318
189k
        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4319
0
            goto onError;
4320
189k
    }
4321
266k
    return;
4322
4323
266k
onError:
4324
0
    Py_CLEAR(*exceptionObject);
4325
0
}
4326
4327
#ifdef MS_WINDOWS
4328
static int
4329
widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4330
{
4331
    if (newsize > *size) {
4332
        wchar_t *newbuf = *buf;
4333
        if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4334
            PyErr_NoMemory();
4335
            return -1;
4336
        }
4337
        *buf = newbuf;
4338
    }
4339
    *size = newsize;
4340
    return 0;
4341
}
4342
4343
/* error handling callback helper:
4344
   build arguments, call the callback and check the arguments,
4345
   if no exception occurred, copy the replacement to the output
4346
   and adjust various state variables.
4347
   return 0 on success, -1 on error
4348
*/
4349
4350
static int
4351
unicode_decode_call_errorhandler_wchar(
4352
    const char *errors, PyObject **errorHandler,
4353
    const char *encoding, const char *reason,
4354
    const char **input, const char **inend, Py_ssize_t *startinpos,
4355
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4356
    wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
4357
{
4358
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4359
4360
    PyObject *restuple = NULL;
4361
    PyObject *repunicode = NULL;
4362
    Py_ssize_t outsize;
4363
    Py_ssize_t insize;
4364
    Py_ssize_t requiredsize;
4365
    Py_ssize_t newpos;
4366
    PyObject *inputobj = NULL;
4367
    Py_ssize_t repwlen;
4368
4369
    if (*errorHandler == NULL) {
4370
        *errorHandler = PyCodec_LookupError(errors);
4371
        if (*errorHandler == NULL)
4372
            goto onError;
4373
    }
4374
4375
    make_decode_exception(exceptionObject,
4376
        encoding,
4377
        *input, *inend - *input,
4378
        *startinpos, *endinpos,
4379
        reason);
4380
    if (*exceptionObject == NULL)
4381
        goto onError;
4382
4383
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
4384
    if (restuple == NULL)
4385
        goto onError;
4386
    if (!PyTuple_Check(restuple)) {
4387
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
4388
        goto onError;
4389
    }
4390
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4391
        goto onError;
4392
4393
    /* Copy back the bytes variables, which might have been modified by the
4394
       callback */
4395
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4396
    if (!inputobj)
4397
        goto onError;
4398
    *input = PyBytes_AS_STRING(inputobj);
4399
    insize = PyBytes_GET_SIZE(inputobj);
4400
    *inend = *input + insize;
4401
    /* we can DECREF safely, as the exception has another reference,
4402
       so the object won't go away. */
4403
    Py_DECREF(inputobj);
4404
4405
    if (newpos<0)
4406
        newpos = insize+newpos;
4407
    if (newpos<0 || newpos>insize) {
4408
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4409
        goto onError;
4410
    }
4411
4412
    repwlen = PyUnicode_AsWideChar(repunicode, NULL, 0);
4413
    if (repwlen < 0)
4414
        goto onError;
4415
    repwlen--;
4416
    /* need more space? (at least enough for what we
4417
       have+the replacement+the rest of the string (starting
4418
       at the new input position), so we won't have to check space
4419
       when there are no errors in the rest of the string) */
4420
    requiredsize = *outpos;
4421
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4422
        goto overflow;
4423
    requiredsize += repwlen;
4424
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4425
        goto overflow;
4426
    requiredsize += insize - newpos;
4427
    outsize = *bufsize;
4428
    if (requiredsize > outsize) {
4429
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
4430
            requiredsize = 2*outsize;
4431
        if (widechar_resize(buf, bufsize, requiredsize) < 0) {
4432
            goto onError;
4433
        }
4434
    }
4435
    PyUnicode_AsWideChar(repunicode, *buf + *outpos, repwlen);
4436
    *outpos += repwlen;
4437
    *endinpos = newpos;
4438
    *inptr = *input + newpos;
4439
4440
    /* we made it! */
4441
    Py_DECREF(restuple);
4442
    return 0;
4443
4444
  overflow:
4445
    PyErr_SetString(PyExc_OverflowError,
4446
                    "decoded result is too long for a Python string");
4447
4448
  onError:
4449
    Py_XDECREF(restuple);
4450
    return -1;
4451
}
4452
#endif   /* MS_WINDOWS */
4453
4454
static int
4455
unicode_decode_call_errorhandler_writer(
4456
    const char *errors, PyObject **errorHandler,
4457
    const char *encoding, const char *reason,
4458
    const char **input, const char **inend, Py_ssize_t *startinpos,
4459
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4460
    _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4461
266k
{
4462
266k
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4463
4464
266k
    PyObject *restuple = NULL;
4465
266k
    PyObject *repunicode = NULL;
4466
266k
    Py_ssize_t insize;
4467
266k
    Py_ssize_t newpos;
4468
266k
    Py_ssize_t replen;
4469
266k
    Py_ssize_t remain;
4470
266k
    PyObject *inputobj = NULL;
4471
266k
    int need_to_grow = 0;
4472
266k
    const char *new_inptr;
4473
4474
266k
    if (*errorHandler == NULL) {
4475
77.1k
        *errorHandler = PyCodec_LookupError(errors);
4476
77.1k
        if (*errorHandler == NULL)
4477
0
            goto onError;
4478
77.1k
    }
4479
4480
266k
    make_decode_exception(exceptionObject,
4481
266k
        encoding,
4482
266k
        *input, *inend - *input,
4483
266k
        *startinpos, *endinpos,
4484
266k
        reason);
4485
266k
    if (*exceptionObject == NULL)
4486
0
        goto onError;
4487
4488
266k
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
4489
266k
    if (restuple == NULL)
4490
48.1k
        goto onError;
4491
218k
    if (!PyTuple_Check(restuple)) {
4492
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
4493
0
        goto onError;
4494
0
    }
4495
218k
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4496
0
        goto onError;
4497
4498
    /* Copy back the bytes variables, which might have been modified by the
4499
       callback */
4500
218k
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4501
218k
    if (!inputobj)
4502
0
        goto onError;
4503
218k
    remain = *inend - *input - *endinpos;
4504
218k
    *input = PyBytes_AS_STRING(inputobj);
4505
218k
    insize = PyBytes_GET_SIZE(inputobj);
4506
218k
    *inend = *input + insize;
4507
    /* we can DECREF safely, as the exception has another reference,
4508
       so the object won't go away. */
4509
218k
    Py_DECREF(inputobj);
4510
4511
218k
    if (newpos<0)
4512
0
        newpos = insize+newpos;
4513
218k
    if (newpos<0 || newpos>insize) {
4514
0
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4515
0
        goto onError;
4516
0
    }
4517
4518
218k
    replen = PyUnicode_GET_LENGTH(repunicode);
4519
218k
    if (replen > 1) {
4520
19.3k
        writer->min_length += replen - 1;
4521
19.3k
        need_to_grow = 1;
4522
19.3k
    }
4523
218k
    new_inptr = *input + newpos;
4524
218k
    if (*inend - new_inptr > remain) {
4525
        /* We don't know the decoding algorithm here so we make the worst
4526
           assumption that one byte decodes to one unicode character.
4527
           If unfortunately one byte could decode to more unicode characters,
4528
           the decoder may write out-of-bound then.  Is it possible for the
4529
           algorithms using this function? */
4530
8.38k
        writer->min_length += *inend - new_inptr - remain;
4531
8.38k
        need_to_grow = 1;
4532
8.38k
    }
4533
218k
    if (need_to_grow) {
4534
19.4k
        writer->overallocate = 1;
4535
19.4k
        if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
4536
19.4k
                            PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4537
0
            goto onError;
4538
19.4k
    }
4539
218k
    if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
4540
0
        goto onError;
4541
4542
218k
    *endinpos = newpos;
4543
218k
    *inptr = new_inptr;
4544
4545
    /* we made it! */
4546
218k
    Py_DECREF(restuple);
4547
218k
    return 0;
4548
4549
48.1k
  onError:
4550
48.1k
    Py_XDECREF(restuple);
4551
48.1k
    return -1;
4552
218k
}
4553
4554
/* --- UTF-7 Codec -------------------------------------------------------- */
4555
4556
/* See RFC2152 for details.  We encode conservatively and decode liberally. */
4557
4558
/* Three simple macros defining base-64. */
4559
4560
/* True when c is one of the 64 base-64 alphabet characters
   (A-Z, a-z, 0-9, '+' or '/').  The argument is evaluated several times. */

#define IS_BASE64(c)                        \
    ((c) == '+' || (c) == '/' ||            \
     ('0' <= (c) && (c) <= '9') ||          \
     ('A' <= (c) && (c) <= 'Z') ||          \
     ('a' <= (c) && (c) <= 'z'))
4567
4568
/* Map a base-64 character to its 6-bit value (A-Z -> 0..25, a-z -> 26..51,
   0-9 -> 52..61, '+' -> 62).  Anything else yields 63, which is correct
   for '/' provided the caller has already checked IS_BASE64(c). */

#define FROM_BASE64(c)                                          \
    ((c) == '+' ? 62 :                                          \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :              \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :              \
     ('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :                   \
     63)
4575
4576
/* Base-64 character for the low 6 bits of n; higher bits are ignored. */

#define TO_BASE64(n)                        \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"           \
     "abcdefghijklmnopqrstuvwxyz"           \
     "0123456789+/"[(n) & 0x3f])
4580
4581
/* DECODE_DIRECT: true when a byte found in a UTF-7 string decodes as
 * itself.  Decoding is permissive: every ASCII byte (<= 127) qualifies
 * except '+', which opens a base64 section. */

#define DECODE_DIRECT(c)                                \
    ((c) != '+' && (c) < 128)
4588
4589
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets, indexed by ASCII code (0..127):
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4622
4623
/* ENCODE_DIRECT: true when character c is written to UTF-7 output as
 * itself.  RFC 2152 leaves it to the application whether Set O and
 * whitespace are encoded directly; this implementation encodes directly
 * every non-NUL ASCII character whose utf7_category entry is not 3
 * ("special").  The range checks run first so the table is never indexed
 * out of bounds. */

#define ENCODE_DIRECT(c) \
    ((c) > 0 && (c) < 128 && utf7_category[(c)] != 3)
4631
4632
PyObject *
4633
PyUnicode_DecodeUTF7(const char *s,
4634
                     Py_ssize_t size,
4635
                     const char *errors)
4636
0
{
4637
0
    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4638
0
}
4639
4640
/* The decoder.  The only state we preserve is our read position,
4641
 * i.e. how many characters we have consumed.  So if we end in the
4642
 * middle of a shift sequence we have to back off the read position
4643
 * and the output to the beginning of the sequence, otherwise we lose
4644
 * all the shift state (seen bits, number of bits seen, high
4645
 * surrogate). */
4646
4647
/* Decode `size` bytes of UTF-7 starting at `s`.

   Parameters:
     s, size   input buffer and its length in bytes
     errors    error handler name, passed on to
               unicode_decode_call_errorhandler_writer()
     consumed  if NULL, the input is treated as complete: an inconsistent
               shift state at the end of the input is reported as an
               "unterminated shift sequence" error.  If non-NULL, it
               receives the number of input bytes decoded; when the input
               ends inside a shift sequence this is the position recorded
               in startinpos when the opening '+' was seen, and the output
               is cut back to what had been written before that sequence.

   Returns the decoded string, or NULL with an exception set on error. */
PyObject *
4648
PyUnicode_DecodeUTF7Stateful(const char *s,
4649
                             Py_ssize_t size,
4650
                             const char *errors,
4651
                             Py_ssize_t *consumed)
4652
32.5k
{
4653
32.5k
    const char *starts = s;
4654
32.5k
    Py_ssize_t startinpos;
4655
32.5k
    Py_ssize_t endinpos;
4656
32.5k
    const char *e;
4657
32.5k
    _PyUnicodeWriter writer;
4658
32.5k
    const char *errmsg = "";
4659
32.5k
    int inShift = 0;
4660
32.5k
    Py_ssize_t shiftOutStart;
4661
32.5k
    unsigned int base64bits = 0;
4662
32.5k
    unsigned long base64buffer = 0;
4663
32.5k
    Py_UCS4 surrogate = 0;
4664
32.5k
    PyObject *errorHandler = NULL;
4665
32.5k
    PyObject *exc = NULL;
4666
4667
32.5k
    /* Empty input: report zero bytes consumed and return the empty
       string via _Py_RETURN_UNICODE_EMPTY(). */
    if (size == 0) {
4668
0
        if (consumed)
4669
0
            *consumed = 0;
4670
0
        _Py_RETURN_UNICODE_EMPTY();
4671
0
    }
4672
4673
    /* Start off assuming it's all ASCII. Widen later as necessary. */
4674
32.5k
    _PyUnicodeWriter_Init(&writer);
4675
32.5k
    writer.min_length = size;
4676
4677
32.5k
    shiftOutStart = 0;
4678
32.5k
    e = s + size;
4679
4680
7.84M
    while (s < e) {
4681
7.82M
        Py_UCS4 ch;
4682
7.82M
      restart:
4683
7.82M
        ch = (unsigned char) *s;
4684
4685
7.82M
        if (inShift) { /* in a base-64 section */
4686
258k
            if (IS_BASE64(ch)) { /* consume a base-64 character */
4687
239k
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4688
239k
                base64bits += 6;
4689
239k
                s++;
4690
239k
                if (base64bits >= 16) {
4691
                    /* we have enough bits for a UTF-16 value */
4692
82.4k
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
4693
82.4k
                    base64bits -= 16;
4694
82.4k
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4695
82.4k
                    assert(outCh <= 0xffff);
4696
82.4k
                    if (surrogate) {
4697
                        /* expecting a second surrogate */
4698
8.64k
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4699
3.66k
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
4700
3.66k
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
4701
0
                                goto onError;
4702
3.66k
                            surrogate = 0;
4703
3.66k
                            continue;
4704
3.66k
                        }
4705
4.97k
                        else {
4706
                            /* not a low surrogate: write the pending high
                               surrogate on its own, then handle outCh below */
4.97k
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4707
0
                                goto onError;
4708
4.97k
                            surrogate = 0;
4709
4.97k
                        }
4710
8.64k
                    }
4711
78.7k
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
4712
                        /* first surrogate */
4713
12.7k
                        surrogate = outCh;
4714
12.7k
                    }
4715
65.9k
                    else {
4716
65.9k
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
4717
0
                            goto onError;
4718
65.9k
                    }
4719
78.7k
                }
4720
239k
            }
4721
19.1k
            else { /* now leaving a base-64 section */
4722
19.1k
                inShift = 0;
4723
19.1k
                if (base64bits > 0) { /* left-over bits */
4724
15.5k
                    if (base64bits >= 6) {
4725
                        /* We've seen at least one base-64 character */
4726
8.48k
                        s++;
4727
8.48k
                        errmsg = "partial character in shift sequence";
4728
8.48k
                        goto utf7Error;
4729
8.48k
                    }
4730
7.10k
                    else {
4731
                        /* Some bits remain; they should be zero */
4732
7.10k
                        if (base64buffer != 0) {
4733
1.46k
                            s++;
4734
1.46k
                            errmsg = "non-zero padding bits in shift sequence";
4735
1.46k
                            goto utf7Error;
4736
1.46k
                        }
4737
7.10k
                    }
4738
15.5k
                }
4739
9.22k
                /* a pending high surrogate is written out only when the
                   terminating byte is one that decodes directly */
                if (surrogate && DECODE_DIRECT(ch)) {
4740
3.08k
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4741
0
                        goto onError;
4742
3.08k
                }
4743
9.22k
                surrogate = 0;
4744
9.22k
                if (ch == '-') {
4745
                    /* '-' is absorbed; other terminating
4746
                       characters are preserved */
4747
2.54k
                    s++;
4748
2.54k
                }
4749
9.22k
            }
4750
258k
        }
4751
7.56M
        else if ( ch == '+' ) {
4752
28.9k
            startinpos = s-starts;
4753
28.9k
            s++; /* consume '+' */
4754
28.9k
            if (s < e && *s == '-') { /* '+-' encodes '+' */
4755
2.34k
                s++;
4756
2.34k
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
4757
0
                    goto onError;
4758
2.34k
            }
4759
26.5k
            else if (s < e && !IS_BASE64(*s)) {
4760
3.64k
                s++;
4761
3.64k
                errmsg = "ill-formed sequence";
4762
3.64k
                goto utf7Error;
4763
3.64k
            }
4764
22.9k
            else { /* begin base64-encoded section */
4765
22.9k
                inShift = 1;
4766
22.9k
                surrogate = 0;
4767
22.9k
                shiftOutStart = writer.pos;
4768
22.9k
                base64bits = 0;
4769
22.9k
                base64buffer = 0;
4770
22.9k
            }
4771
28.9k
        }
4772
7.53M
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
4773
7.44M
            s++;
4774
7.44M
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
4775
0
                goto onError;
4776
7.44M
        }
4777
89.4k
        else {
4778
89.4k
            startinpos = s-starts;
4779
89.4k
            s++;
4780
89.4k
            errmsg = "unexpected special character";
4781
89.4k
            goto utf7Error;
4782
89.4k
        }
4783
7.71M
        continue;
4784
7.71M
utf7Error:
4785
103k
        /* Shared error exit of the loop body: hand the range
           [startinpos, endinpos) and errmsg to the error handler helper,
           which receives pointers to starts, e and s and appends the
           replacement text to the writer. */
        endinpos = s-starts;
4786
103k
        if (unicode_decode_call_errorhandler_writer(
4787
103k
                errors, &errorHandler,
4788
103k
                "utf7", errmsg,
4789
103k
                &starts, &e, &startinpos, &endinpos, &exc, &s,
4790
103k
                &writer))
4791
14.1k
            goto onError;
4792
103k
    }
4793
4794
    /* end of string */
4795
4796
18.3k
    if (inShift && !consumed) { /* in shift sequence, no more to follow */
4797
        /* if we're in an inconsistent state, that's an error */
4798
3.76k
        inShift = 0;
4799
3.76k
        if (surrogate ||
4800
3.18k
                (base64bits >= 6) ||
4801
2.53k
                (base64bits > 0 && base64buffer != 0)) {
4802
2.53k
            endinpos = size;
4803
2.53k
            if (unicode_decode_call_errorhandler_writer(
4804
2.53k
                    errors, &errorHandler,
4805
2.53k
                    "utf7", "unterminated shift sequence",
4806
2.53k
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
4807
2.53k
                    &writer))
4808
2.18k
                goto onError;
4809
347
            /* the handler may have left s before e (it sets the resume
               position); if so, jump back into the main loop body */
            if (s < e)
4810
0
                goto restart;
4811
347
        }
4812
3.76k
    }
4813
4814
    /* return state */
4815
16.2k
    if (consumed) {
4816
0
        if (inShift) {
4817
0
            *consumed = startinpos;
4818
0
            /* Characters were written inside the unfinished shift
               sequence and the writer's maxchar is above 127: build the
               result from only the first shiftOutStart characters of the
               writer's buffer instead of finishing the writer. */
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
4819
0
                PyObject *result = PyUnicode_FromKindAndData(
4820
0
                        writer.kind, writer.data, shiftOutStart);
4821
0
                Py_XDECREF(errorHandler);
4822
0
                Py_XDECREF(exc);
4823
0
                _PyUnicodeWriter_Dealloc(&writer);
4824
0
                return result;
4825
0
            }
4826
0
            writer.pos = shiftOutStart; /* back off output */
4827
0
        }
4828
0
        else {
4829
0
            *consumed = s-starts;
4830
0
        }
4831
0
    }
4832
4833
16.2k
    Py_XDECREF(errorHandler);
4834
16.2k
    Py_XDECREF(exc);
4835
16.2k
    return _PyUnicodeWriter_Finish(&writer);
4836
4837
16.2k
  onError:
4838
16.2k
    Py_XDECREF(errorHandler);
4839
16.2k
    Py_XDECREF(exc);
4840
16.2k
    _PyUnicodeWriter_Dealloc(&writer);
4841
16.2k
    return NULL;
4842
16.2k
}
4843
4844
4845
/* Encode the str object `str` as UTF-7 and return the result of
   PyBytesWriter_FinishWithPointer() (NULL with an exception set on
   allocation failure).

   Characters for which ENCODE_DIRECT() is true are copied as-is, '+' is
   written as "+-", and everything else goes into a base64 section made
   of big-endian UTF-16 code units (non-BMP characters become a surrogate
   pair).  `errors` is not referenced by this implementation. */
PyObject *
_PyUnicode_EncodeUTF7(PyObject *str,
                      const char *errors)
{
    Py_ssize_t len = PyUnicode_GET_LENGTH(str);
    if (len == 0) {
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
    }
    int kind = PyUnicode_KIND(str);
    const void *data = PyUnicode_DATA(str);

    /* Worst case is budgeted at 8 output bytes per character; it might
       be possible to tighten this. */
    if (len > PY_SSIZE_T_MAX / 8) {
        return PyErr_NoMemory();
    }
    PyBytesWriter *writer = PyBytesWriter_Create(len * 8);
    if (writer == NULL) {
        return NULL;
    }

    int shifted = 0;                /* inside a base64 section? */
    unsigned int pending_bits = 0;  /* bits in bitbuf not yet emitted */
    unsigned long bitbuf = 0;
    char *p = PyBytesWriter_GetData(writer);

    for (Py_ssize_t pos = 0; pos < len; ++pos) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        if (shifted) {
            if (ENCODE_DIRECT(ch)) {
                /* Shift out: flush the left-over bits, zero-padded on
                   the right to a full base64 character. */
                if (pending_bits) {
                    *p++ = TO_BASE64(bitbuf << (6-pending_bits));
                    bitbuf = 0;
                    pending_bits = 0;
                }
                shifted = 0;
                /* Characters outside the base64 set end the section
                   implicitly, so an explicit '-' is needed only before a
                   base64 character or a literal '-'. */
                if (IS_BASE64(ch) || ch == '-') {
                    *p++ = '-';
                }
                *p++ = (char) ch;
                continue;
            }
            /* otherwise keep going in the current base64 section */
        }
        else if (ch == '+') {
            *p++ = '+';
            *p++ = '-';
            continue;
        }
        else if (ENCODE_DIRECT(ch)) {
            *p++ = (char) ch;
            continue;
        }
        else {
            /* open a base64 section */
            *p++ = '+';
            shifted = 1;
        }

        /* Base64-encode ch as one or two UTF-16 code units. */
        if (ch >= 0x10000) {
            assert(ch <= MAX_UNICODE);

            /* high surrogate first */
            pending_bits += 16;
            bitbuf = (bitbuf << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
            while (pending_bits >= 6) {
                *p++ = TO_BASE64(bitbuf >> (pending_bits-6));
                pending_bits -= 6;
            }
            /* the low surrogate is handled by the common code below */
            ch = Py_UNICODE_LOW_SURROGATE(ch);
        }
        pending_bits += 16;
        bitbuf = (bitbuf << 16) | ch;
        while (pending_bits >= 6) {
            *p++ = TO_BASE64(bitbuf >> (pending_bits-6));
            pending_bits -= 6;
        }
    }

    /* Flush the tail of an open base64 section and close it. */
    if (pending_bits) {
        *p++ = TO_BASE64(bitbuf << (6-pending_bits));
    }
    if (shifted) {
        *p++ = '-';
    }
    return PyBytesWriter_FinishWithPointer(writer, p);
}
4934
4935
#undef IS_BASE64
4936
#undef FROM_BASE64
4937
#undef TO_BASE64
4938
#undef DECODE_DIRECT
4939
#undef ENCODE_DIRECT
4940
4941
/* --- UTF-8 Codec -------------------------------------------------------- */
4942
4943
PyObject *
4944
PyUnicode_DecodeUTF8(const char *s,
4945
                     Py_ssize_t size,
4946
                     const char *errors)
4947
2.18M
{
4948
2.18M
    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4949
2.18M
}
4950
4951
#include "stringlib/asciilib.h"
4952
#include "stringlib/codecs.h"
4953
#include "stringlib/undef.h"
4954
4955
#include "stringlib/ucs1lib.h"
4956
#include "stringlib/codecs.h"
4957
#include "stringlib/undef.h"
4958
4959
#include "stringlib/ucs2lib.h"
4960
#include "stringlib/codecs.h"
4961
#include "stringlib/undef.h"
4962
4963
#include "stringlib/ucs4lib.h"
4964
#include "stringlib/codecs.h"
4965
#include "stringlib/undef.h"
4966
4967
/* size_t-wide bit patterns, selected by the width of size_t:
   ASCII_CHAR_MASK  has the top bit of every byte set; AND-ing a word with
                    it quickly tells whether the word contains a
                    non-ASCII, UTF8-encoded char.
   VECTOR_0101      has the low bit of every byte set and
   VECTOR_00FF      keeps every other byte; both are used to count
                    codepoints in a UTF-8 string. */
#if SIZEOF_SIZE_T == 8
#  define ASCII_CHAR_MASK 0x8080808080808080ULL
#  define VECTOR_0101     0x0101010101010101ULL
#  define VECTOR_00FF     0x00ff00ff00ff00ffULL
#elif SIZEOF_SIZE_T == 4
#  define ASCII_CHAR_MASK 0x80808080U
#  define VECTOR_0101     0x01010101U
#  define VECTOR_00FF     0x00ff00ffU
#else
#  error C 'size_t' size should be either 4 or 8!
#endif
4981
4982
/* ctz(v): index of the lowest set bit of v (count of trailing zero bits).
   HAVE_CTZ is 1 when a compiler intrinsic provides it (GCC/Clang builtin
   or MSVC _BitScanForward*), otherwise 0 and ctz() is not defined.
   The visible callers in this file only pass non-zero values. */
#if defined(__GNUC__) || defined(__clang__)
#  define HAVE_CTZ 1
static inline unsigned int
ctz(size_t v)
{
    return (unsigned int)__builtin_ctzll((unsigned long long)v);
}
#elif defined(_MSC_VER)
#  define HAVE_CTZ 1
static inline unsigned int
ctz(size_t v)
{
    unsigned long bit_index;
#  if SIZEOF_SIZE_T == 4
    _BitScanForward(&bit_index, v);
#  else
    _BitScanForward64(&bit_index, v);
#  endif /* SIZEOF_SIZE_T */
    return bit_index;
}
#else
#  define HAVE_CTZ 0
#endif
5006
#if HAVE_CTZ && PY_LITTLE_ENDIAN
/* Build a size_t from the bytes p[0]..p[size-1] without any unaligned
   access and without reading past p[size-1].  Bytes beyond `size` are
   zero.  A `size` not covered by an explicit case takes the `default`
   path, which loads a full SIZEOF_SIZE_T bytes. */
static size_t
load_unaligned(const unsigned char *p, size_t size)
{
    union {
        size_t word;
        unsigned char bytes[SIZEOF_SIZE_T];
    } packed;

    packed.word = 0;
    /* Byte-wise fill through a union; this layout assumes little endian:
       * a union is faster than bitwise or and shift.
       * big endian machines are rare and hard to maintain. */
    switch (size) {
    default:
#if SIZEOF_SIZE_T == 8
    case 8:
        packed.bytes[7] = p[7];
        _Py_FALLTHROUGH;
    case 7:
        packed.bytes[6] = p[6];
        _Py_FALLTHROUGH;
    case 6:
        packed.bytes[5] = p[5];
        _Py_FALLTHROUGH;
    case 5:
        packed.bytes[4] = p[4];
        _Py_FALLTHROUGH;
#endif
    case 4:
        packed.bytes[3] = p[3];
        _Py_FALLTHROUGH;
    case 3:
        packed.bytes[2] = p[2];
        _Py_FALLTHROUGH;
    case 2:
        packed.bytes[1] = p[1];
        _Py_FALLTHROUGH;
    case 1:
        packed.bytes[0] = p[0];
        break;
    case 0:
        break;
    }
    return packed.word;
}
#endif
5053
5054
/*
5055
 * Find the first non-ASCII character in a byte sequence.
5056
 *
5057
 * This function scans a range of bytes from `start` to `end` and returns the
5058
 * index of the first byte that is not an ASCII character (i.e., has the most
5059
 * significant bit set). If all characters in the range are ASCII, it returns
5060
 * `end - start`.
5061
 */
5062
static Py_ssize_t
5063
find_first_nonascii(const unsigned char *start, const unsigned char *end)
5064
13.4M
{
5065
    // The search is done in `size_t` chunks.
5066
    // The start and end might not be aligned at `size_t` boundaries,
5067
    // so they're handled specially.
5068
5069
13.4M
    const unsigned char *p = start;
5070
5071
13.4M
    if (end - start >= SIZEOF_SIZE_T) {
5072
        // Avoid unaligned read.
5073
3.33M
#if PY_LITTLE_ENDIAN && HAVE_CTZ
5074
3.33M
        size_t u;
5075
3.33M
        memcpy(&u, p, sizeof(size_t));
5076
3.33M
        u &= ASCII_CHAR_MASK;
5077
3.33M
        if (u) {
5078
202k
            return (ctz(u) - 7) / 8;
5079
202k
        }
5080
3.13M
        p = _Py_ALIGN_DOWN(p + SIZEOF_SIZE_T, SIZEOF_SIZE_T);
5081
#else /* PY_LITTLE_ENDIAN && HAVE_CTZ */
5082
        const unsigned char *p2 = _Py_ALIGN_UP(p, SIZEOF_SIZE_T);
5083
        while (p < p2) {
5084
            if (*p & 0x80) {
5085
                return p - start;
5086
            }
5087
            p++;
5088
        }
5089
#endif
5090
5091
3.13M
        const unsigned char *e = end - SIZEOF_SIZE_T;
5092
102M
        while (p <= e) {
5093
99.1M
            size_t u = (*(const size_t *)p) & ASCII_CHAR_MASK;
5094
99.1M
            if (u) {
5095
161k
#if PY_LITTLE_ENDIAN && HAVE_CTZ
5096
161k
                return p - start + (ctz(u) - 7) / 8;
5097
#else
5098
                // big endian and minor compilers are difficult to test.
5099
                // fallback to per byte check.
5100
                break;
5101
#endif
5102
161k
            }
5103
98.9M
            p += SIZEOF_SIZE_T;
5104
98.9M
        }
5105
3.13M
    }
5106
13.1M
#if PY_LITTLE_ENDIAN && HAVE_CTZ
5107
13.4M
    assert((end - p) < SIZEOF_SIZE_T);
5108
    // we can not use *(const size_t*)p to avoid buffer overrun.
5109
13.1M
    size_t u = load_unaligned(p, end - p) & ASCII_CHAR_MASK;
5110
13.1M
    if (u) {
5111
170k
        return p - start + (ctz(u) - 7) / 8;
5112
170k
    }
5113
12.9M
    return end - start;
5114
#else
5115
    while (p < end) {
5116
        if (*p & 0x80) {
5117
            break;
5118
        }
5119
        p++;
5120
    }
5121
    return p - start;
5122
#endif
5123
13.1M
}
5124
5125
/* Return 1 if `ch` is the first byte of a UTF-8 sequence, 0 otherwise.

   A byte starts a sequence when it is ASCII (bit 7 clear, 0xxxxxxx) or a
   lead byte (bit 6 set, 11xxxxxx); only 10xxxxxx continuation bytes yield 0. */
static inline int
scalar_utf8_start_char(unsigned int ch)
{
    unsigned int bit7_clear = (~ch >> 7) & 1u;
    unsigned int bit6_set = (ch >> 6) & 1u;
    return (int)(bit7_clear | bit6_set);
}
5131
5132
static inline size_t
5133
vector_utf8_start_chars(size_t v)
5134
273M
{
5135
273M
    return ((~v >> 7) | (v >> 6)) & VECTOR_0101;
5136
273M
}
5137
5138
5139
// Count the number of UTF-8 code points in a given byte sequence.
5140
static Py_ssize_t
5141
utf8_count_codepoints(const unsigned char *s, const unsigned char *end)
5142
246k
{
5143
246k
    Py_ssize_t len = 0;
5144
5145
246k
    if (end - s >= SIZEOF_SIZE_T) {
5146
179k
        while (!_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) {
5147
16.5k
            len += scalar_utf8_start_char(*s++);
5148
16.5k
        }
5149
5150
1.39M
        while (s + SIZEOF_SIZE_T <= end) {
5151
1.22M
            const unsigned char *e = end;
5152
1.22M
            if (e - s > SIZEOF_SIZE_T * 255) {
5153
1.06M
                e = s + SIZEOF_SIZE_T * 255;
5154
1.06M
            }
5155
1.22M
            Py_ssize_t vstart = 0;
5156
274M
            while (s + SIZEOF_SIZE_T <= e) {
5157
273M
                size_t v = *(size_t*)s;
5158
273M
                size_t vs = vector_utf8_start_chars(v);
5159
273M
                vstart += vs;
5160
273M
                s += SIZEOF_SIZE_T;
5161
273M
            }
5162
1.22M
            vstart = (vstart & VECTOR_00FF) + ((vstart >> 8) & VECTOR_00FF);
5163
1.22M
            vstart += vstart >> 16;
5164
1.22M
#if SIZEOF_SIZE_T == 8
5165
1.22M
            vstart += vstart >> 32;
5166
1.22M
#endif
5167
1.22M
            len += vstart & 0x7ff;
5168
1.22M
        }
5169
162k
    }
5170
916k
    while (s < end) {
5171
669k
        len += scalar_utf8_start_char(*s++);
5172
669k
    }
5173
246k
    return len;
5174
246k
}
5175
5176
static Py_ssize_t
5177
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
5178
5.03M
{
5179
5.03M
#if SIZEOF_SIZE_T <= SIZEOF_VOID_P
5180
5.03M
    if (_Py_IS_ALIGNED(start, ALIGNOF_SIZE_T)
5181
4.96M
        && _Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T))
5182
596k
    {
5183
        /* Fast path, see in STRINGLIB(utf8_decode) for
5184
           an explanation. */
5185
596k
        const char *p = start;
5186
596k
        Py_UCS1 *q = dest;
5187
1.41M
        while (p + SIZEOF_SIZE_T <= end) {
5188
946k
            size_t value = *(const size_t *) p;
5189
946k
            if (value & ASCII_CHAR_MASK)
5190
128k
                break;
5191
818k
            *((size_t *)q) = value;
5192
818k
            p += SIZEOF_SIZE_T;
5193
818k
            q += SIZEOF_SIZE_T;
5194
818k
        }
5195
2.67M
        while (p < end) {
5196
2.22M
            if ((unsigned char)*p & 0x80)
5197
147k
                break;
5198
2.08M
            *q++ = *p++;
5199
2.08M
        }
5200
596k
        return p - start;
5201
596k
    }
5202
4.44M
#endif
5203
4.44M
    Py_ssize_t pos = find_first_nonascii((const unsigned char*)start,
5204
4.44M
                                         (const unsigned char*)end);
5205
4.44M
    memcpy(dest, start, pos);
5206
4.44M
    return pos;
5207
5.03M
}
5208
5209
static int
5210
unicode_decode_utf8_impl(_PyUnicodeWriter *writer,
5211
                         const char *starts, const char *s, const char *end,
5212
                         _Py_error_handler error_handler,
5213
                         const char *errors,
5214
                         Py_ssize_t *consumed)
5215
536k
{
5216
536k
    Py_ssize_t startinpos, endinpos;
5217
536k
    const char *errmsg = "";
5218
536k
    PyObject *error_handler_obj = NULL;
5219
536k
    PyObject *exc = NULL;
5220
5221
162M
    while (s < end) {
5222
162M
        Py_UCS4 ch;
5223
162M
        int kind = writer->kind;
5224
5225
162M
        if (kind == PyUnicode_1BYTE_KIND) {
5226
495k
            if (PyUnicode_IS_ASCII(writer->buffer))
5227
287k
                ch = asciilib_utf8_decode(&s, end, writer->data, &writer->pos);
5228
208k
            else
5229
208k
                ch = ucs1lib_utf8_decode(&s, end, writer->data, &writer->pos);
5230
162M
        } else if (kind == PyUnicode_2BYTE_KIND) {
5231
85.9M
            ch = ucs2lib_utf8_decode(&s, end, writer->data, &writer->pos);
5232
85.9M
        } else {
5233
76.4M
            assert(kind == PyUnicode_4BYTE_KIND);
5234
76.4M
            ch = ucs4lib_utf8_decode(&s, end, writer->data, &writer->pos);
5235
76.4M
        }
5236
5237
162M
        switch (ch) {
5238
472k
        case 0:
5239
472k
            if (s == end || consumed)
5240
451k
                goto End;
5241
21.3k
            errmsg = "unexpected end of data";
5242
21.3k
            startinpos = s - starts;
5243
21.3k
            endinpos = end - starts;
5244
21.3k
            break;
5245
124M
        case 1:
5246
124M
            errmsg = "invalid start byte";
5247
124M
            startinpos = s - starts;
5248
124M
            endinpos = startinpos + 1;
5249
124M
            break;
5250
36.1M
        case 2:
5251
36.1M
            if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5252
0
                && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5253
0
            {
5254
                /* Truncated surrogate code in range D800-DFFF */
5255
0
                goto End;
5256
0
            }
5257
36.1M
            _Py_FALLTHROUGH;
5258
37.1M
        case 3:
5259
37.2M
        case 4:
5260
37.2M
            errmsg = "invalid continuation byte";
5261
37.2M
            startinpos = s - starts;
5262
37.2M
            endinpos = startinpos + ch - 1;
5263
37.2M
            break;
5264
287k
        default:
5265
            // ch doesn't fit into kind, so change the buffer kind to write
5266
            // the character
5267
287k
            if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
5268
0
                goto onError;
5269
287k
            continue;
5270
162M
        }
5271
5272
162M
        if (error_handler == _Py_ERROR_UNKNOWN)
5273
106k
            error_handler = _Py_GetErrorHandler(errors);
5274
5275
162M
        switch (error_handler) {
5276
0
        case _Py_ERROR_IGNORE:
5277
0
            s += (endinpos - startinpos);
5278
0
            break;
5279
5280
162M
        case _Py_ERROR_REPLACE:
5281
162M
            if (_PyUnicodeWriter_WriteCharInline(writer, 0xfffd) < 0)
5282
0
                goto onError;
5283
162M
            s += (endinpos - startinpos);
5284
162M
            break;
5285
5286
3.29k
        case _Py_ERROR_SURROGATEESCAPE:
5287
3.29k
        {
5288
3.29k
            Py_ssize_t i;
5289
5290
3.29k
            if (_PyUnicodeWriter_PrepareKind(writer, PyUnicode_2BYTE_KIND) < 0)
5291
0
                goto onError;
5292
7.00k
            for (i=startinpos; i<endinpos; i++) {
5293
3.70k
                ch = (Py_UCS4)(unsigned char)(starts[i]);
5294
3.70k
                PyUnicode_WRITE(writer->kind, writer->data, writer->pos,
5295
3.70k
                                ch + 0xdc00);
5296
3.70k
                writer->pos++;
5297
3.70k
            }
5298
3.29k
            s += (endinpos - startinpos);
5299
3.29k
            break;
5300
3.29k
        }
5301
5302
915
        default:
5303
915
            if (unicode_decode_call_errorhandler_writer(
5304
915
                    errors, &error_handler_obj,
5305
915
                    "utf-8", errmsg,
5306
915
                    &starts, &end, &startinpos, &endinpos, &exc, &s,
5307
915
                    writer)) {
5308
915
                goto onError;
5309
915
            }
5310
5311
0
            if (_PyUnicodeWriter_Prepare(writer, end - s, 127) < 0) {
5312
0
                return -1;
5313
0
            }
5314
162M
        }
5315
162M
    }
5316
5317
535k
End:
5318
535k
    if (consumed)
5319
902
        *consumed = s - starts;
5320
5321
535k
    Py_XDECREF(error_handler_obj);
5322
535k
    Py_XDECREF(exc);
5323
535k
    return 0;
5324
5325
915
onError:
5326
915
    Py_XDECREF(error_handler_obj);
5327
915
    Py_XDECREF(exc);
5328
915
    return -1;
5329
536k
}
5330
5331
5332
static PyObject *
5333
unicode_decode_utf8(const char *s, Py_ssize_t size,
5334
                    _Py_error_handler error_handler, const char *errors,
5335
                    Py_ssize_t *consumed)
5336
10.7M
{
5337
10.7M
    if (size == 0) {
5338
72.7k
        if (consumed) {
5339
0
            *consumed = 0;
5340
0
        }
5341
72.7k
        _Py_RETURN_UNICODE_EMPTY();
5342
72.7k
    }
5343
5344
    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
5345
10.6M
    if (size == 1 && (unsigned char)s[0] < 128) {
5346
1.64M
        if (consumed) {
5347
0
            *consumed = 1;
5348
0
        }
5349
1.64M
        return get_latin1_char((unsigned char)s[0]);
5350
1.64M
    }
5351
5352
    // I don't know this check is necessary or not. But there is a test
5353
    // case that requires size=PY_SSIZE_T_MAX cause MemoryError.
5354
9.03M
    if (PY_SSIZE_T_MAX - sizeof(PyCompactUnicodeObject) < (size_t)size) {
5355
0
        PyErr_NoMemory();
5356
0
        return NULL;
5357
0
    }
5358
5359
9.03M
    const char *starts = s;
5360
9.03M
    const char *end = s + size;
5361
5362
9.03M
    Py_ssize_t pos = find_first_nonascii((const unsigned char*)starts, (const unsigned char*)end);
5363
9.03M
    if (pos == size) {  // fast path: ASCII string.
5364
8.54M
        PyObject *u = PyUnicode_New(size, 127);
5365
8.54M
        if (u == NULL) {
5366
0
            return NULL;
5367
0
        }
5368
8.54M
        memcpy(PyUnicode_1BYTE_DATA(u), s, size);
5369
8.54M
        if (consumed) {
5370
0
            *consumed = size;
5371
0
        }
5372
8.54M
        return u;
5373
8.54M
    }
5374
5375
489k
    int maxchr = 127;
5376
489k
    Py_ssize_t maxsize = size;
5377
5378
489k
    unsigned char ch = (unsigned char)(s[pos]);
5379
    // error handler other than strict may remove/replace the invalid byte.
5380
    // consumed != NULL allows 1~3 bytes remainings.
5381
    // 0x80 <= ch < 0xc2 is invalid start byte that cause UnicodeDecodeError.
5382
    // otherwise: check the input and decide the maxchr and maxsize to reduce
5383
    // reallocation and copy.
5384
489k
    if (error_handler == _Py_ERROR_STRICT && !consumed && ch >= 0xc2) {
5385
        // we only calculate the number of codepoints and don't determine the exact maxchr.
5386
        // This is because writing fast and portable SIMD code to find maxchr is difficult.
5387
        // If reallocation occurs for a larger maxchar, knowing the exact number of codepoints
5388
        // means that it is no longer necessary to allocate several times the required amount
5389
        // of memory.
5390
246k
        maxsize = utf8_count_codepoints((const unsigned char *)s, (const unsigned char *)end);
5391
246k
        if (ch < 0xc4) { // latin1
5392
130k
            maxchr = 0xff;
5393
130k
        }
5394
116k
        else if (ch < 0xf0) { // ucs2
5395
105k
            maxchr = 0xffff;
5396
105k
        }
5397
10.5k
        else { // ucs4
5398
10.5k
            maxchr = 0x10ffff;
5399
10.5k
        }
5400
246k
    }
5401
489k
    PyObject *u = PyUnicode_New(maxsize, maxchr);
5402
489k
    if (!u) {
5403
0
        return NULL;
5404
0
    }
5405
5406
    // Use _PyUnicodeWriter after fast path is failed.
5407
489k
    _PyUnicodeWriter writer;
5408
489k
    _PyUnicodeWriter_InitWithBuffer(&writer, u);
5409
489k
    if (maxchr <= 255) {
5410
373k
        memcpy(PyUnicode_1BYTE_DATA(u), s, pos);
5411
373k
        s += pos;
5412
373k
        size -= pos;
5413
373k
        writer.pos = pos;
5414
373k
    }
5415
5416
489k
    if (unicode_decode_utf8_impl(&writer, starts, s, end,
5417
489k
                                 error_handler, errors,
5418
489k
                                 consumed) < 0) {
5419
915
        _PyUnicodeWriter_Dealloc(&writer);
5420
915
        return NULL;
5421
915
    }
5422
488k
    return _PyUnicodeWriter_Finish(&writer);
5423
489k
}
5424
5425
5426
// Used by PyUnicodeWriter_WriteUTF8() implementation
5427
static int
5428
unicode_decode_utf8_writer(_PyUnicodeWriter *writer,
5429
                           const char *s, Py_ssize_t size,
5430
                           _Py_error_handler error_handler, const char *errors,
5431
                           Py_ssize_t *consumed)
5432
4.46M
{
5433
4.46M
    if (size == 0) {
5434
8.36k
        if (consumed) {
5435
0
            *consumed = 0;
5436
0
        }
5437
8.36k
        return 0;
5438
8.36k
    }
5439
5440
    // fast path: try ASCII string.
5441
4.45M
    if (_PyUnicodeWriter_Prepare(writer, size, 127) < 0) {
5442
0
        return -1;
5443
0
    }
5444
5445
4.45M
    const char *starts = s;
5446
4.45M
    const char *end = s + size;
5447
4.45M
    Py_ssize_t decoded = 0;
5448
4.45M
    Py_UCS1 *dest = (Py_UCS1*)writer->data + writer->pos * writer->kind;
5449
4.45M
    if (writer->kind == PyUnicode_1BYTE_KIND) {
5450
4.45M
        decoded = ascii_decode(s, end, dest);
5451
4.45M
        writer->pos += decoded;
5452
5453
4.45M
        if (decoded == size) {
5454
4.40M
            if (consumed) {
5455
910
                *consumed = size;
5456
910
            }
5457
4.40M
            return 0;
5458
4.40M
        }
5459
44.3k
        s += decoded;
5460
44.3k
        size -= decoded;
5461
44.3k
    }
5462
5463
46.6k
    return unicode_decode_utf8_impl(writer, starts, s, end,
5464
46.6k
                                    error_handler, errors, consumed);
5465
4.45M
}
5466
5467
5468
PyObject *
5469
PyUnicode_DecodeUTF8Stateful(const char *s,
5470
                             Py_ssize_t size,
5471
                             const char *errors,
5472
                             Py_ssize_t *consumed)
5473
10.7M
{
5474
10.7M
    return unicode_decode_utf8(s, size,
5475
10.7M
                               errors ? _Py_ERROR_UNKNOWN : _Py_ERROR_STRICT,
5476
10.7M
                               errors, consumed);
5477
10.7M
}
5478
5479
5480
/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5481
   non-zero, use strict error handler otherwise.
5482
5483
   On success, write a pointer to a newly allocated wide character string into
5484
   *wstr (use PyMem_RawFree() to free the memory) and write the output length
5485
   (in number of wchar_t units) into *wlen (if wlen is set).
5486
5487
   On memory allocation failure, return -1.
5488
5489
   On decoding error (if surrogateescape is zero), return -2. If wlen is
5490
   non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5491
   is not NULL, write the decoding error message into *reason. */
5492
int
5493
_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
5494
                 const char **reason, _Py_error_handler errors)
5495
5.24k
{
5496
5.24k
    const char *orig_s = s;
5497
5.24k
    const char *e;
5498
5.24k
    wchar_t *unicode;
5499
5.24k
    Py_ssize_t outpos;
5500
5501
5.24k
    int surrogateescape = 0;
5502
5.24k
    int surrogatepass = 0;
5503
5.24k
    switch (errors)
5504
5.24k
    {
5505
0
    case _Py_ERROR_STRICT:
5506
0
        break;
5507
5.24k
    case _Py_ERROR_SURROGATEESCAPE:
5508
5.24k
        surrogateescape = 1;
5509
5.24k
        break;
5510
0
    case _Py_ERROR_SURROGATEPASS:
5511
0
        surrogatepass = 1;
5512
0
        break;
5513
0
    default:
5514
0
        return -3;
5515
5.24k
    }
5516
5517
    /* Note: size will always be longer than the resulting Unicode
5518
       character count */
5519
5.24k
    if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1 < size) {
5520
0
        return -1;
5521
0
    }
5522
5523
5.24k
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
5524
5.24k
    if (!unicode) {
5525
0
        return -1;
5526
0
    }
5527
5528
    /* Unpack UTF-8 encoded data */
5529
5.24k
    e = s + size;
5530
5.24k
    outpos = 0;
5531
5.24k
    while (s < e) {
5532
5.24k
        Py_UCS4 ch;
5533
5.24k
#if SIZEOF_WCHAR_T == 4
5534
5.24k
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
5535
#else
5536
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
5537
#endif
5538
5.24k
        if (ch > 0xFF) {
5539
0
#if SIZEOF_WCHAR_T == 4
5540
0
            Py_UNREACHABLE();
5541
#else
5542
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
5543
            /* write a surrogate pair */
5544
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5545
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5546
#endif
5547
0
        }
5548
5.24k
        else {
5549
5.24k
            if (!ch && s == e) {
5550
5.24k
                break;
5551
5.24k
            }
5552
5553
0
            if (surrogateescape) {
5554
0
                unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5555
0
            }
5556
0
            else {
5557
                /* Is it a valid three-byte code? */
5558
0
                if (surrogatepass
5559
0
                    && (e - s) >= 3
5560
0
                    && (s[0] & 0xf0) == 0xe0
5561
0
                    && (s[1] & 0xc0) == 0x80
5562
0
                    && (s[2] & 0xc0) == 0x80)
5563
0
                {
5564
0
                    ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5565
0
                    s += 3;
5566
0
                    unicode[outpos++] = ch;
5567
0
                }
5568
0
                else {
5569
0
                    PyMem_RawFree(unicode );
5570
0
                    if (reason != NULL) {
5571
0
                        switch (ch) {
5572
0
                        case 0:
5573
0
                            *reason = "unexpected end of data";
5574
0
                            break;
5575
0
                        case 1:
5576
0
                            *reason = "invalid start byte";
5577
0
                            break;
5578
                        /* 2, 3, 4 */
5579
0
                        default:
5580
0
                            *reason = "invalid continuation byte";
5581
0
                            break;
5582
0
                        }
5583
0
                    }
5584
0
                    if (wlen != NULL) {
5585
0
                        *wlen = s - orig_s;
5586
0
                    }
5587
0
                    return -2;
5588
0
                }
5589
0
            }
5590
0
        }
5591
5.24k
    }
5592
5.24k
    unicode[outpos] = L'\0';
5593
5.24k
    if (wlen) {
5594
5.24k
        *wlen = outpos;
5595
5.24k
    }
5596
5.24k
    *wstr = unicode;
5597
5.24k
    return 0;
5598
5.24k
}
5599
5600
5601
wchar_t*
5602
_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5603
                               size_t *wlen)
5604
0
{
5605
0
    wchar_t *wstr;
5606
0
    int res = _Py_DecodeUTF8Ex(arg, arglen,
5607
0
                               &wstr, wlen,
5608
0
                               NULL, _Py_ERROR_SURROGATEESCAPE);
5609
0
    if (res != 0) {
5610
        /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5611
0
        assert(res != -3);
5612
0
        if (wlen) {
5613
0
            *wlen = (size_t)res;
5614
0
        }
5615
0
        return NULL;
5616
0
    }
5617
0
    return wstr;
5618
0
}
5619
5620
5621
/* UTF-8 encoder.
5622
5623
   On success, return 0 and write the newly allocated character string (use
5624
   PyMem_Free() to free the memory) into *str.
5625
5626
   On encoding failure, return -2 and write the position of the invalid
5627
   surrogate character into *error_pos (if error_pos is set) and the decoding
5628
   error message into *reason (if reason is set).
5629
5630
   On memory allocation failure, return -1. */
5631
int
5632
_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
5633
                 const char **reason, int raw_malloc, _Py_error_handler errors)
5634
644
{
5635
644
    const Py_ssize_t max_char_size = 4;
5636
644
    Py_ssize_t len = wcslen(text);
5637
5638
644
    assert(len >= 0);
5639
5640
644
    int surrogateescape = 0;
5641
644
    int surrogatepass = 0;
5642
644
    switch (errors)
5643
644
    {
5644
64
    case _Py_ERROR_STRICT:
5645
64
        break;
5646
580
    case _Py_ERROR_SURROGATEESCAPE:
5647
580
        surrogateescape = 1;
5648
580
        break;
5649
0
    case _Py_ERROR_SURROGATEPASS:
5650
0
        surrogatepass = 1;
5651
0
        break;
5652
0
    default:
5653
0
        return -3;
5654
644
    }
5655
5656
644
    if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5657
0
        return -1;
5658
0
    }
5659
644
    char *bytes;
5660
644
    if (raw_malloc) {
5661
644
        bytes = PyMem_RawMalloc((len + 1) * max_char_size);
5662
644
    }
5663
0
    else {
5664
0
        bytes = PyMem_Malloc((len + 1) * max_char_size);
5665
0
    }
5666
644
    if (bytes == NULL) {
5667
0
        return -1;
5668
0
    }
5669
5670
644
    char *p = bytes;
5671
644
    Py_ssize_t i;
5672
42.9k
    for (i = 0; i < len; ) {
5673
42.3k
        Py_ssize_t ch_pos = i;
5674
42.3k
        Py_UCS4 ch = text[i];
5675
42.3k
        i++;
5676
#if Py_UNICODE_SIZE == 2
5677
        if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5678
            && i < len
5679
            && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5680
        {
5681
            ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5682
            i++;
5683
        }
5684
#endif
5685
5686
42.3k
        if (ch < 0x80) {
5687
            /* Encode ASCII */
5688
42.3k
            *p++ = (char) ch;
5689
5690
42.3k
        }
5691
0
        else if (ch < 0x0800) {
5692
            /* Encode Latin-1 */
5693
0
            *p++ = (char)(0xc0 | (ch >> 6));
5694
0
            *p++ = (char)(0x80 | (ch & 0x3f));
5695
0
        }
5696
0
        else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
5697
            /* surrogateescape error handler */
5698
0
            if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
5699
0
                if (error_pos != NULL) {
5700
0
                    *error_pos = (size_t)ch_pos;
5701
0
                }
5702
0
                if (reason != NULL) {
5703
0
                    *reason = "encoding error";
5704
0
                }
5705
0
                if (raw_malloc) {
5706
0
                    PyMem_RawFree(bytes);
5707
0
                }
5708
0
                else {
5709
0
                    PyMem_Free(bytes);
5710
0
                }
5711
0
                return -2;
5712
0
            }
5713
0
            *p++ = (char)(ch & 0xff);
5714
0
        }
5715
0
        else if (ch < 0x10000) {
5716
0
            *p++ = (char)(0xe0 | (ch >> 12));
5717
0
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5718
0
            *p++ = (char)(0x80 | (ch & 0x3f));
5719
0
        }
5720
0
        else {  /* ch >= 0x10000 */
5721
0
            assert(ch <= MAX_UNICODE);
5722
            /* Encode UCS4 Unicode ordinals */
5723
0
            *p++ = (char)(0xf0 | (ch >> 18));
5724
0
            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5725
0
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5726
0
            *p++ = (char)(0x80 | (ch & 0x3f));
5727
0
        }
5728
42.3k
    }
5729
644
    *p++ = '\0';
5730
5731
644
    size_t final_size = (p - bytes);
5732
644
    char *bytes2;
5733
644
    if (raw_malloc) {
5734
644
        bytes2 = PyMem_RawRealloc(bytes, final_size);
5735
644
    }
5736
0
    else {
5737
0
        bytes2 = PyMem_Realloc(bytes, final_size);
5738
0
    }
5739
644
    if (bytes2 == NULL) {
5740
0
        if (error_pos != NULL) {
5741
0
            *error_pos = (size_t)-1;
5742
0
        }
5743
0
        if (raw_malloc) {
5744
0
            PyMem_RawFree(bytes);
5745
0
        }
5746
0
        else {
5747
0
            PyMem_Free(bytes);
5748
0
        }
5749
0
        return -1;
5750
0
    }
5751
644
    *str = bytes2;
5752
644
    return 0;
5753
644
}
5754
5755
5756
/* Primary internal function which creates utf8 encoded bytes objects.
5757
5758
   Allocation strategy:  if the string is short, convert into a stack buffer
5759
   and allocate exactly as much space needed at the end.  Else allocate the
5760
   maximum possible needed (4 result bytes per Unicode character), and return
5761
   the excess memory at the end.
5762
*/
5763
static PyObject *
5764
unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5765
                    const char *errors)
5766
15.4M
{
5767
15.4M
    if (!PyUnicode_Check(unicode)) {
5768
0
        PyErr_BadArgument();
5769
0
        return NULL;
5770
0
    }
5771
5772
15.4M
    if (PyUnicode_UTF8(unicode))
5773
9.59M
        return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5774
9.59M
                                         PyUnicode_UTF8_LENGTH(unicode));
5775
5776
5.83M
    int kind = PyUnicode_KIND(unicode);
5777
5.83M
    const void *data = PyUnicode_DATA(unicode);
5778
5.83M
    Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5779
5780
5.83M
    PyBytesWriter *writer;
5781
5.83M
    char *end;
5782
5783
5.83M
    switch (kind) {
5784
0
    default:
5785
0
        Py_UNREACHABLE();
5786
4.32M
    case PyUnicode_1BYTE_KIND:
5787
        /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5788
4.32M
        assert(!PyUnicode_IS_ASCII(unicode));
5789
4.32M
        writer = ucs1lib_utf8_encoder(unicode, data, size,
5790
4.32M
                                      error_handler, errors, &end);
5791
4.32M
        break;
5792
1.44M
    case PyUnicode_2BYTE_KIND:
5793
1.44M
        writer = ucs2lib_utf8_encoder(unicode, data, size,
5794
1.44M
                                      error_handler, errors, &end);
5795
1.44M
        break;
5796
65.2k
    case PyUnicode_4BYTE_KIND:
5797
65.2k
        writer = ucs4lib_utf8_encoder(unicode, data, size,
5798
65.2k
                                      error_handler, errors, &end);
5799
65.2k
        break;
5800
5.83M
    }
5801
5802
5.83M
    if (writer == NULL) {
5803
148k
        PyBytesWriter_Discard(writer);
5804
148k
        return NULL;
5805
148k
    }
5806
5.68M
    return PyBytesWriter_FinishWithPointer(writer, end);
5807
5.83M
}
5808
5809
static int
5810
unicode_fill_utf8(PyObject *unicode)
5811
145k
{
5812
145k
    _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(unicode);
5813
    /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5814
145k
    assert(!PyUnicode_IS_ASCII(unicode));
5815
5816
145k
    int kind = PyUnicode_KIND(unicode);
5817
145k
    const void *data = PyUnicode_DATA(unicode);
5818
145k
    Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5819
5820
145k
    PyBytesWriter *writer;
5821
145k
    char *end;
5822
5823
145k
    switch (kind) {
5824
0
    default:
5825
0
        Py_UNREACHABLE();
5826
115k
    case PyUnicode_1BYTE_KIND:
5827
115k
        writer = ucs1lib_utf8_encoder(unicode, data, size,
5828
115k
                                      _Py_ERROR_STRICT, NULL, &end);
5829
115k
        break;
5830
25.0k
    case PyUnicode_2BYTE_KIND:
5831
25.0k
        writer = ucs2lib_utf8_encoder(unicode, data, size,
5832
25.0k
                                      _Py_ERROR_STRICT, NULL, &end);
5833
25.0k
        break;
5834
4.86k
    case PyUnicode_4BYTE_KIND:
5835
4.86k
        writer = ucs4lib_utf8_encoder(unicode, data, size,
5836
4.86k
                                      _Py_ERROR_STRICT, NULL, &end);
5837
4.86k
        break;
5838
145k
    }
5839
145k
    if (writer == NULL) {
5840
206
        return -1;
5841
206
    }
5842
5843
145k
    const char *start = PyBytesWriter_GetData(writer);
5844
145k
    Py_ssize_t len = end - start;
5845
5846
145k
    char *cache = PyMem_Malloc(len + 1);
5847
145k
    if (cache == NULL) {
5848
0
        PyBytesWriter_Discard(writer);
5849
0
        PyErr_NoMemory();
5850
0
        return -1;
5851
0
    }
5852
145k
    memcpy(cache, start, len);
5853
145k
    cache[len] = '\0';
5854
145k
    PyUnicode_SET_UTF8_LENGTH(unicode, len);
5855
145k
    PyUnicode_SET_UTF8(unicode, cache);
5856
145k
    PyBytesWriter_Discard(writer);
5857
145k
    return 0;
5858
145k
}
5859
5860
/* Encode `unicode` to a UTF-8 bytes object; the error handler is passed as
   _Py_ERROR_UNKNOWN together with the `errors` name. */
PyObject *
_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
{
    PyObject *encoded = unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN,
                                            errors);
    return encoded;
}
5865
5866
5867
/* Encode `unicode` to a UTF-8 bytes object, passing NULL as the error
   handler name. */
PyObject *
PyUnicode_AsUTF8String(PyObject *unicode)
{
    const char *errors = NULL;
    return _PyUnicode_AsUTF8String(unicode, errors);
}
5872
5873
/* --- UTF-32 Codec ------------------------------------------------------- */
5874
5875
PyObject *
5876
PyUnicode_DecodeUTF32(const char *s,
5877
                      Py_ssize_t size,
5878
                      const char *errors,
5879
                      int *byteorder)
5880
106
{
5881
106
    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5882
106
}
5883
5884
PyObject *
5885
PyUnicode_DecodeUTF32Stateful(const char *s,
5886
                              Py_ssize_t size,
5887
                              const char *errors,
5888
                              int *byteorder,
5889
                              Py_ssize_t *consumed)
5890
22.7k
{
5891
22.7k
    const char *starts = s;
5892
22.7k
    Py_ssize_t startinpos;
5893
22.7k
    Py_ssize_t endinpos;
5894
22.7k
    _PyUnicodeWriter writer;
5895
22.7k
    const unsigned char *q, *e;
5896
22.7k
    int le, bo = 0;       /* assume native ordering by default */
5897
22.7k
    const char *encoding;
5898
22.7k
    const char *errmsg = "";
5899
22.7k
    PyObject *errorHandler = NULL;
5900
22.7k
    PyObject *exc = NULL;
5901
5902
22.7k
    q = (const unsigned char *)s;
5903
22.7k
    e = q + size;
5904
5905
22.7k
    if (byteorder)
5906
22.6k
        bo = *byteorder;
5907
5908
    /* Check for BOM marks (U+FEFF) in the input and adjust current
5909
       byte order setting accordingly. In native mode, the leading BOM
5910
       mark is skipped, in all other modes, it is copied to the output
5911
       stream as-is (giving a ZWNBSP character). */
5912
22.7k
    if (bo == 0 && size >= 4) {
5913
20.2k
        Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5914
20.2k
        if (bom == 0x0000FEFF) {
5915
130
            bo = -1;
5916
130
            q += 4;
5917
130
        }
5918
20.1k
        else if (bom == 0xFFFE0000) {
5919
204
            bo = 1;
5920
204
            q += 4;
5921
204
        }
5922
20.2k
        if (byteorder)
5923
20.1k
            *byteorder = bo;
5924
20.2k
    }
5925
5926
22.7k
    if (q == e) {
5927
77
        if (consumed)
5928
0
            *consumed = size;
5929
77
        _Py_RETURN_UNICODE_EMPTY();
5930
77
    }
5931
5932
#ifdef WORDS_BIGENDIAN
5933
    le = bo < 0;
5934
#else
5935
22.7k
    le = bo <= 0;
5936
22.7k
#endif
5937
22.7k
    encoding = le ? "utf-32-le" : "utf-32-be";
5938
5939
22.7k
    _PyUnicodeWriter_Init(&writer);
5940
22.7k
    writer.min_length = (e - q + 3) / 4;
5941
22.7k
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
5942
0
        goto onError;
5943
5944
109k
    while (1) {
5945
109k
        Py_UCS4 ch = 0;
5946
109k
        Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
5947
5948
109k
        if (e - q >= 4) {
5949
94.6k
            int kind = writer.kind;
5950
94.6k
            void *data = writer.data;
5951
94.6k
            const unsigned char *last = e - 4;
5952
94.6k
            Py_ssize_t pos = writer.pos;
5953
94.6k
            if (le) {
5954
130k
                do {
5955
130k
                    ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5956
130k
                    if (ch > maxch)
5957
90.1k
                        break;
5958
39.8k
                    if (kind != PyUnicode_1BYTE_KIND &&
5959
9.57k
                        Py_UNICODE_IS_SURROGATE(ch))
5960
239
                        break;
5961
39.6k
                    PyUnicode_WRITE(kind, data, pos++, ch);
5962
39.6k
                    q += 4;
5963
39.6k
                } while (q <= last);
5964
91.5k
            }
5965
3.10k
            else {
5966
5.36k
                do {
5967
5.36k
                    ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5968
5.36k
                    if (ch > maxch)
5969
2.88k
                        break;
5970
2.48k
                    if (kind != PyUnicode_1BYTE_KIND &&
5971
1.88k
                        Py_UNICODE_IS_SURROGATE(ch))
5972
107
                        break;
5973
2.37k
                    PyUnicode_WRITE(kind, data, pos++, ch);
5974
2.37k
                    q += 4;
5975
2.37k
                } while (q <= last);
5976
3.10k
            }
5977
94.6k
            writer.pos = pos;
5978
94.6k
        }
5979
5980
109k
        if (Py_UNICODE_IS_SURROGATE(ch)) {
5981
348
            errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
5982
348
            startinpos = ((const char *)q) - starts;
5983
348
            endinpos = startinpos + 4;
5984
348
        }
5985
109k
        else if (ch <= maxch) {
5986
16.1k
            if (q == e || consumed)
5987
3.37k
                break;
5988
            /* remaining bytes at the end? (size should be divisible by 4) */
5989
12.8k
            errmsg = "truncated data";
5990
12.8k
            startinpos = ((const char *)q) - starts;
5991
12.8k
            endinpos = ((const char *)e) - starts;
5992
12.8k
        }
5993
93.0k
        else {
5994
93.0k
            if (ch < 0x110000) {
5995
4.16k
                if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5996
0
                    goto onError;
5997
4.16k
                q += 4;
5998
4.16k
                continue;
5999
4.16k
            }
6000
88.8k
            errmsg = "code point not in range(0x110000)";
6001
88.8k
            startinpos = ((const char *)q) - starts;
6002
88.8k
            endinpos = startinpos + 4;
6003
88.8k
        }
6004
6005
        /* The remaining input chars are ignored if the callback
6006
           chooses to skip the input */
6007
102k
        if (unicode_decode_call_errorhandler_writer(
6008
102k
                errors, &errorHandler,
6009
102k
                encoding, errmsg,
6010
102k
                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
6011
102k
                &writer))
6012
19.3k
            goto onError;
6013
102k
    }
6014
6015
3.37k
    if (consumed)
6016
0
        *consumed = (const char *)q-starts;
6017
6018
3.37k
    Py_XDECREF(errorHandler);
6019
3.37k
    Py_XDECREF(exc);
6020
3.37k
    return _PyUnicodeWriter_Finish(&writer);
6021
6022
19.3k
  onError:
6023
19.3k
    _PyUnicodeWriter_Dealloc(&writer);
6024
19.3k
    Py_XDECREF(errorHandler);
6025
19.3k
    Py_XDECREF(exc);
6026
19.3k
    return NULL;
6027
22.7k
}
6028
6029
/* Encode the str object `str` as UTF-32 and return a new bytes object.

   errors     error-handler name, used when the per-kind encoder stops
              before the end of the string.
   byteorder  -1 = little endian, 1 = big endian, 0 = native order with a
              leading BOM (U+FEFF) written first.

   Returns NULL with an exception set if `str` is not a str, the output size
   would overflow, allocation fails, or the error handler fails or returns
   an unusable replacement. */
PyObject *
_PyUnicode_EncodeUTF32(PyObject *str,
                       const char *errors,
                       int byteorder)
{
    if (!PyUnicode_Check(str)) {
        PyErr_BadArgument();
        return NULL;
    }

    const int with_bom = (byteorder == 0);
    int kind = PyUnicode_KIND(str);
    const void *data = PyUnicode_DATA(str);
    Py_ssize_t len = PyUnicode_GET_LENGTH(str);

    /* Four bytes per code point plus the optional BOM must fit. */
    if (len > PY_SSIZE_T_MAX / 4 - with_bom) {
        return PyErr_NoMemory();
    }
    Py_ssize_t nunits = len + with_bom;

#if PY_LITTLE_ENDIAN
    int native_ordering = byteorder <= 0;
#else
    int native_ordering = byteorder >= 0;
#endif

    if (kind == PyUnicode_1BYTE_KIND) {
        // gh-139156: Don't use PyBytesWriter API here since it has an overhead
        // on short strings
        PyObject *result = PyBytes_FromStringAndSize(NULL, nunits * 4);
        if (result == NULL) {
            return NULL;
        }

        /* output buffer is 4-bytes aligned */
        assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(result), 4));
        uint32_t *dst = (uint32_t *)PyBytes_AS_STRING(result);
        if (with_bom) {
            *dst++ = 0xFEFF;
        }
        if (len > 0) {
            ucs1lib_utf32_encode((const Py_UCS1 *)data, len,
                                 &dst, native_ordering);
        }
        return result;
    }

    /* Wider strings may hit the error handler, whose replacement can change
       the output size, so they use a growable writer. */
    PyBytesWriter *writer = PyBytesWriter_Create(nunits * 4);
    if (writer == NULL) {
        return NULL;
    }

    /* output buffer is 4-bytes aligned */
    assert(_Py_IS_ALIGNED(PyBytesWriter_GetData(writer), 4));
    uint32_t *out = (uint32_t *)PyBytesWriter_GetData(writer);
    if (with_bom) {
        *out++ = 0xFEFF;
    }
    if (len == 0) {
        return PyBytesWriter_Finish(writer);
    }

    /* Codec name reported in encode errors. */
    const char *encoding = (byteorder == -1) ? "utf-32-le"
                         : (byteorder == 1)  ? "utf-32-be"
                         :                     "utf-32";

    PyObject *handler = NULL;
    PyObject *exc = NULL;
    PyObject *rep = NULL;

    Py_ssize_t pos = 0;
    while (pos < len) {
        /* The helper's return value is the number of code points it
           handled; stopping short of `len` is treated below as an
           unencodable code point at `pos`. */
        if (kind == PyUnicode_2BYTE_KIND) {
            pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        else {
            assert(kind == PyUnicode_4BYTE_KIND);
            pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        if (pos == len) {
            break;
        }

        Py_ssize_t newpos;
        rep = unicode_encode_call_errorhandler(
                errors, &handler,
                encoding, "surrogates not allowed",
                str, &exc, pos, pos + 1, &newpos);
        if (rep == NULL) {
            goto error;
        }

        /* A bytes replacement must be a whole number of 4-byte units; a str
           replacement must be pure ASCII. */
        const int rep_is_bytes = PyBytes_Check(rep);
        Py_ssize_t repsize, moreunits;
        int usable;
        if (rep_is_bytes) {
            repsize = PyBytes_GET_SIZE(rep);
            moreunits = repsize / 4;
            usable = (repsize & 3) == 0;
        }
        else {
            assert(PyUnicode_Check(rep));
            moreunits = repsize = PyUnicode_GET_LENGTH(rep);
            usable = PyUnicode_IS_ASCII(rep) != 0;
        }
        if (!usable) {
            raise_encode_exception(&exc, encoding,
                                   str, pos, pos + 1,
                                   "surrogates not allowed");
            goto error;
        }

        /* Input positions skipped by the handler already have output units
           reserved, so only the surplus needs to be added. */
        moreunits += pos - newpos;
        pos = newpos;

        /* four bytes are reserved for each surrogate */
        if (moreunits > 0) {
            out = PyBytesWriter_GrowAndUpdatePointer(writer, 4 * moreunits, out);
            if (out == NULL) {
                goto error;
            }
        }

        if (rep_is_bytes) {
            memcpy(out, PyBytes_AS_STRING(rep), repsize);
            out += repsize / 4;
        }
        else {
            /* rep is unicode */
            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
            ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
                                 &out, native_ordering);
        }

        Py_CLEAR(rep);
    }

    Py_XDECREF(handler);
    Py_XDECREF(exc);

    /* Cut back to size actually needed. This is necessary for, for example,
       encoding of a string containing isolated surrogates and the 'ignore'
       handler is used. */
    return PyBytesWriter_FinishWithPointer(writer, out);

  error:
    Py_XDECREF(rep);
    Py_XDECREF(handler);
    Py_XDECREF(exc);
    PyBytesWriter_Discard(writer);
    return NULL;
}
6182
6183
/* Encode `unicode` as UTF-32 with a NULL error-handler name and byteorder 0,
   i.e. native byte order preceded by a BOM.  Returns whatever
   _PyUnicode_EncodeUTF32() returns (NULL on failure). */
PyObject *
PyUnicode_AsUTF32String(PyObject *unicode)
{
    PyObject *encoded = _PyUnicode_EncodeUTF32(unicode, NULL, 0);
    return encoded;
}
6188
6189
/* --- UTF-16 Codec ------------------------------------------------------- */
6190
6191
PyObject *
6192
PyUnicode_DecodeUTF16(const char *s,
6193
                      Py_ssize_t size,
6194
                      const char *errors,
6195
                      int *byteorder)
6196
107
{
6197
107
    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
6198
107
}
6199
6200
PyObject *
6201
PyUnicode_DecodeUTF16Stateful(const char *s,
6202
                              Py_ssize_t size,
6203
                              const char *errors,
6204
                              int *byteorder,
6205
                              Py_ssize_t *consumed)
6206
13.5k
{
6207
13.5k
    const char *starts = s;
6208
13.5k
    Py_ssize_t startinpos;
6209
13.5k
    Py_ssize_t endinpos;
6210
13.5k
    _PyUnicodeWriter writer;
6211
13.5k
    const unsigned char *q, *e;
6212
13.5k
    int bo = 0;       /* assume native ordering by default */
6213
13.5k
    int native_ordering;
6214
13.5k
    const char *errmsg = "";
6215
13.5k
    PyObject *errorHandler = NULL;
6216
13.5k
    PyObject *exc = NULL;
6217
13.5k
    const char *encoding;
6218
6219
13.5k
    q = (const unsigned char *)s;
6220
13.5k
    e = q + size;
6221
6222
13.5k
    if (byteorder)
6223
13.4k
        bo = *byteorder;
6224
6225
    /* Check for BOM marks (U+FEFF) in the input and adjust current
6226
       byte order setting accordingly. In native mode, the leading BOM
6227
       mark is skipped, in all other modes, it is copied to the output
6228
       stream as-is (giving a ZWNBSP character). */
6229
13.5k
    if (bo == 0 && size >= 2) {
6230
12.8k
        const Py_UCS4 bom = (q[1] << 8) | q[0];
6231
12.8k
        if (bom == 0xFEFF) {
6232
292
            q += 2;
6233
292
            bo = -1;
6234
292
        }
6235
12.5k
        else if (bom == 0xFFFE) {
6236
1.94k
            q += 2;
6237
1.94k
            bo = 1;
6238
1.94k
        }
6239
12.8k
        if (byteorder)
6240
12.7k
            *byteorder = bo;
6241
12.8k
    }
6242
6243
13.5k
    if (q == e) {
6244
73
        if (consumed)
6245
0
            *consumed = size;
6246
73
        _Py_RETURN_UNICODE_EMPTY();
6247
73
    }
6248
6249
13.4k
#if PY_LITTLE_ENDIAN
6250
13.4k
    native_ordering = bo <= 0;
6251
13.4k
    encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
6252
#else
6253
    native_ordering = bo >= 0;
6254
    encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
6255
#endif
6256
6257
    /* Note: size will always be longer than the resulting Unicode
6258
       character count normally.  Error handler will take care of
6259
       resizing when needed. */
6260
13.4k
    _PyUnicodeWriter_Init(&writer);
6261
13.4k
    writer.min_length = (e - q + 1) / 2;
6262
13.4k
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
6263
0
        goto onError;
6264
6265
50.0k
    while (1) {
6266
50.0k
        Py_UCS4 ch = 0;
6267
50.0k
        if (e - q >= 2) {
6268
42.6k
            int kind = writer.kind;
6269
42.6k
            if (kind == PyUnicode_1BYTE_KIND) {
6270
16.2k
                if (PyUnicode_IS_ASCII(writer.buffer))
6271
12.9k
                    ch = asciilib_utf16_decode(&q, e,
6272
12.9k
                            (Py_UCS1*)writer.data, &writer.pos,
6273
12.9k
                            native_ordering);
6274
3.27k
                else
6275
3.27k
                    ch = ucs1lib_utf16_decode(&q, e,
6276
3.27k
                            (Py_UCS1*)writer.data, &writer.pos,
6277
3.27k
                            native_ordering);
6278
26.4k
            } else if (kind == PyUnicode_2BYTE_KIND) {
6279
11.6k
                ch = ucs2lib_utf16_decode(&q, e,
6280
11.6k
                        (Py_UCS2*)writer.data, &writer.pos,
6281
11.6k
                        native_ordering);
6282
14.8k
            } else {
6283
14.8k
                assert(kind == PyUnicode_4BYTE_KIND);
6284
14.8k
                ch = ucs4lib_utf16_decode(&q, e,
6285
14.8k
                        (Py_UCS4*)writer.data, &writer.pos,
6286
14.8k
                        native_ordering);
6287
14.8k
            }
6288
42.6k
        }
6289
6290
50.0k
        switch (ch)
6291
50.0k
        {
6292
13.3k
        case 0:
6293
            /* remaining byte at the end? (size should be even) */
6294
13.3k
            if (q == e || consumed)
6295
8.70k
                goto End;
6296
4.61k
            errmsg = "truncated data";
6297
4.61k
            startinpos = ((const char *)q) - starts;
6298
4.61k
            endinpos = ((const char *)e) - starts;
6299
4.61k
            break;
6300
            /* The remaining input chars are ignored if the callback
6301
               chooses to skip the input */
6302
1.80k
        case 1:
6303
1.80k
            q -= 2;
6304
1.80k
            if (consumed)
6305
0
                goto End;
6306
1.80k
            errmsg = "unexpected end of data";
6307
1.80k
            startinpos = ((const char *)q) - starts;
6308
1.80k
            endinpos = ((const char *)e) - starts;
6309
1.80k
            break;
6310
13.4k
        case 2:
6311
13.4k
            errmsg = "illegal encoding";
6312
13.4k
            startinpos = ((const char *)q) - 2 - starts;
6313
13.4k
            endinpos = startinpos + 2;
6314
13.4k
            break;
6315
6.49k
        case 3:
6316
6.49k
            errmsg = "illegal UTF-16 surrogate";
6317
6.49k
            startinpos = ((const char *)q) - 4 - starts;
6318
6.49k
            endinpos = startinpos + 2;
6319
6.49k
            break;
6320
14.9k
        default:
6321
14.9k
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
6322
0
                goto onError;
6323
14.9k
            continue;
6324
50.0k
        }
6325
6326
26.3k
        if (unicode_decode_call_errorhandler_writer(
6327
26.3k
                errors,
6328
26.3k
                &errorHandler,
6329
26.3k
                encoding, errmsg,
6330
26.3k
                &starts,
6331
26.3k
                (const char **)&e,
6332
26.3k
                &startinpos,
6333
26.3k
                &endinpos,
6334
26.3k
                &exc,
6335
26.3k
                (const char **)&q,
6336
26.3k
                &writer))
6337
4.77k
            goto onError;
6338
26.3k
    }
6339
6340
8.70k
End:
6341
8.70k
    if (consumed)
6342
0
        *consumed = (const char *)q-starts;
6343
6344
8.70k
    Py_XDECREF(errorHandler);
6345
8.70k
    Py_XDECREF(exc);
6346
8.70k
    return _PyUnicodeWriter_Finish(&writer);
6347
6348
4.77k
  onError:
6349
4.77k
    _PyUnicodeWriter_Dealloc(&writer);
6350
4.77k
    Py_XDECREF(errorHandler);
6351
4.77k
    Py_XDECREF(exc);
6352
4.77k
    return NULL;
6353
13.4k
}
6354
6355
/* Encode the str object `str` as UTF-16 and return a new bytes object.

   errors     error-handler name, used when the per-kind encoder stops
              before the end of the string.
   byteorder  negative = little endian, positive = big endian, 0 = native
              order with a leading BOM (U+FEFF) written first.

   Returns NULL with an exception set if `str` is not a str, the output size
   would overflow, allocation fails, or the error handler fails or returns
   an unusable replacement. */
PyObject *
_PyUnicode_EncodeUTF16(PyObject *str,
                       const char *errors,
                       int byteorder)
{
    if (!PyUnicode_Check(str)) {
        PyErr_BadArgument();
        return NULL;
    }

    const int with_bom = (byteorder == 0);
    int kind = PyUnicode_KIND(str);
    const void *data = PyUnicode_DATA(str);
    Py_ssize_t len = PyUnicode_GET_LENGTH(str);

    /* Code points >= 0x10000 take two 16-bit units; count them so the
       output can be sized up front. */
    Py_ssize_t pairs = 0;
    if (kind == PyUnicode_4BYTE_KIND) {
        const Py_UCS4 *chars = (const Py_UCS4 *)data;
        for (Py_ssize_t i = 0; i < len; i++) {
            if (chars[i] >= 0x10000) {
                pairs++;
            }
        }
    }
    if (len > PY_SSIZE_T_MAX / 2 - pairs - with_bom) {
        return PyErr_NoMemory();
    }
    Py_ssize_t nunits = len + pairs + with_bom;

#if PY_BIG_ENDIAN
    int native_ordering = byteorder >= 0;
#else
    int native_ordering = byteorder <= 0;
#endif

    if (kind == PyUnicode_1BYTE_KIND) {
        // gh-139156: Don't use PyBytesWriter API here since it has an overhead
        // on short strings
        PyObject *result = PyBytes_FromStringAndSize(NULL, nunits * 2);
        if (result == NULL) {
            return NULL;
        }

        /* output buffer is 2-bytes aligned */
        assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(result), 2));
        unsigned short *dst = (unsigned short *)PyBytes_AS_STRING(result);
        if (with_bom) {
            *dst++ = 0xFEFF;
        }
        if (len > 0) {
            ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &dst, native_ordering);
        }
        return result;
    }

    /* Wider strings may hit the error handler, whose replacement can change
       the output size, so they use a growable writer. */
    PyBytesWriter *writer = PyBytesWriter_Create(nunits * 2);
    if (writer == NULL) {
        return NULL;
    }

    /* output buffer is 2-bytes aligned */
    assert(_Py_IS_ALIGNED(PyBytesWriter_GetData(writer), 2));
    unsigned short *out = PyBytesWriter_GetData(writer);
    if (with_bom) {
        *out++ = 0xFEFF;
    }
    if (len == 0) {
        return PyBytesWriter_Finish(writer);
    }

    /* Codec name reported in encode errors. */
    const char *encoding = (byteorder < 0) ? "utf-16-le"
                         : (byteorder > 0) ? "utf-16-be"
                         :                   "utf-16";

    PyObject *handler = NULL;
    PyObject *exc = NULL;
    PyObject *rep = NULL;

    Py_ssize_t pos = 0;
    while (pos < len) {
        /* The helper's return value is the number of code points it
           handled; stopping short of `len` is treated below as an
           unencodable code point at `pos`. */
        if (kind == PyUnicode_2BYTE_KIND) {
            pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        else {
            assert(kind == PyUnicode_4BYTE_KIND);
            pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        if (pos == len) {
            break;
        }

        Py_ssize_t newpos;
        rep = unicode_encode_call_errorhandler(
                errors, &handler,
                encoding, "surrogates not allowed",
                str, &exc, pos, pos + 1, &newpos);
        if (rep == NULL) {
            goto error;
        }

        /* A bytes replacement must be a whole number of 2-byte units; a str
           replacement must be pure ASCII. */
        const int rep_is_bytes = PyBytes_Check(rep);
        Py_ssize_t repsize, moreunits;
        int usable;
        if (rep_is_bytes) {
            repsize = PyBytes_GET_SIZE(rep);
            moreunits = repsize / 2;
            usable = (repsize & 1) == 0;
        }
        else {
            assert(PyUnicode_Check(rep));
            moreunits = repsize = PyUnicode_GET_LENGTH(rep);
            usable = PyUnicode_IS_ASCII(rep) != 0;
        }
        if (!usable) {
            raise_encode_exception(&exc, encoding,
                                   str, pos, pos + 1,
                                   "surrogates not allowed");
            goto error;
        }

        /* Input positions skipped by the handler already have output units
           reserved, so only the surplus needs to be added. */
        moreunits += pos - newpos;
        pos = newpos;

        /* two bytes are reserved for each surrogate */
        if (moreunits > 0) {
            out = PyBytesWriter_GrowAndUpdatePointer(writer, 2 * moreunits, out);
            if (out == NULL) {
                goto error;
            }
        }

        if (rep_is_bytes) {
            memcpy(out, PyBytes_AS_STRING(rep), repsize);
            out += repsize / 2;
        }
        else {
            /* rep is unicode */
            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
            ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
                                 &out, native_ordering);
        }

        Py_CLEAR(rep);
    }

    Py_XDECREF(handler);
    Py_XDECREF(exc);

    /* Cut back to size actually needed. This is necessary for, for example,
       encoding of a string containing isolated surrogates and the 'ignore'
       handler is used. */
    return PyBytesWriter_FinishWithPointer(writer, out);

  error:
    Py_XDECREF(rep);
    Py_XDECREF(handler);
    Py_XDECREF(exc);
    PyBytesWriter_Discard(writer);
    return NULL;
}
6520
6521
/* Encode `unicode` as UTF-16 with a NULL error-handler name and byteorder 0,
   i.e. native byte order preceded by a BOM.  Returns whatever
   _PyUnicode_EncodeUTF16() returns (NULL on failure). */
PyObject *
PyUnicode_AsUTF16String(PyObject *unicode)
{
    PyObject *encoded = _PyUnicode_EncodeUTF16(unicode, NULL, 0);
    return encoded;
}
6526
6527
_PyUnicode_Name_CAPI *
6528
_PyUnicode_GetNameCAPI(void)
6529
2.34k
{
6530
2.34k
    PyInterpreterState *interp = _PyInterpreterState_GET();
6531
2.34k
    _PyUnicode_Name_CAPI *ucnhash_capi;
6532
6533
2.34k
    ucnhash_capi = _Py_atomic_load_ptr(&interp->unicode.ucnhash_capi);
6534
2.34k
    if (ucnhash_capi == NULL) {
6535
1
        ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6536
1
                PyUnicodeData_CAPSULE_NAME, 1);
6537
6538
        // It's fine if we overwrite the value here. It's always the same value.
6539
1
        _Py_atomic_store_ptr(&interp->unicode.ucnhash_capi, ucnhash_capi);
6540
1
    }
6541
2.34k
    return ucnhash_capi;
6542
2.34k
}
6543
6544
/* --- Unicode Escape Codec ----------------------------------------------- */
6545
6546
PyObject *
6547
_PyUnicode_DecodeUnicodeEscapeInternal2(const char *s,
6548
                               Py_ssize_t size,
6549
                               const char *errors,
6550
                               Py_ssize_t *consumed,
6551
                               int *first_invalid_escape_char,
6552
                               const char **first_invalid_escape_ptr)
6553
31.4k
{
6554
31.4k
    const char *starts = s;
6555
31.4k
    const char *initial_starts = starts;
6556
31.4k
    _PyUnicodeWriter writer;
6557
31.4k
    const char *end;
6558
31.4k
    PyObject *errorHandler = NULL;
6559
31.4k
    PyObject *exc = NULL;
6560
31.4k
    _PyUnicode_Name_CAPI *ucnhash_capi;
6561
6562
    // so we can remember if we've seen an invalid escape char or not
6563
31.4k
    *first_invalid_escape_char = -1;
6564
31.4k
    *first_invalid_escape_ptr = NULL;
6565
6566
31.4k
    if (size == 0) {
6567
1.99k
        if (consumed) {
6568
0
            *consumed = 0;
6569
0
        }
6570
1.99k
        _Py_RETURN_UNICODE_EMPTY();
6571
1.99k
    }
6572
    /* Escaped strings will always be longer than the resulting
6573
       Unicode string, so we start with size here and then reduce the
6574
       length after conversion to the true value.
6575
       (but if the error callback returns a long replacement string
6576
       we'll have to allocate more space) */
6577
29.4k
    _PyUnicodeWriter_Init(&writer);
6578
29.4k
    writer.min_length = size;
6579
29.4k
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6580
0
        goto onError;
6581
0
    }
6582
6583
29.4k
    end = s + size;
6584
168k
    while (s < end) {
6585
139k
        unsigned char c = (unsigned char) *s++;
6586
139k
        Py_UCS4 ch;
6587
139k
        int count;
6588
139k
        const char *message;
6589
6590
139k
#define WRITE_ASCII_CHAR(ch)                                                  \
6591
139k
            do {                                                              \
6592
14.2k
                assert(ch <= 127);                                            \
6593
14.2k
                assert(writer.pos < writer.size);                             \
6594
14.2k
                PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch);  \
6595
14.2k
            } while(0)
6596
6597
139k
#define WRITE_CHAR(ch)                                                        \
6598
139k
            do {                                                              \
6599
129k
                if (ch <= writer.maxchar) {                                   \
6600
113k
                    assert(writer.pos < writer.size);                         \
6601
113k
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6602
113k
                }                                                             \
6603
129k
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6604
0
                    goto onError;                                             \
6605
0
                }                                                             \
6606
129k
            } while(0)
6607
6608
        /* Non-escape characters are interpreted as Unicode ordinals */
6609
139k
        if (c != '\\') {
6610
90.2k
            WRITE_CHAR(c);
6611
90.2k
            continue;
6612
90.2k
        }
6613
6614
49.0k
        Py_ssize_t startinpos = s - starts - 1;
6615
        /* \ - Escapes */
6616
49.0k
        if (s >= end) {
6617
0
            message = "\\ at end of string";
6618
0
            goto incomplete;
6619
0
        }
6620
49.0k
        c = (unsigned char) *s++;
6621
6622
49.0k
        assert(writer.pos < writer.size);
6623
49.0k
        switch (c) {
6624
6625
            /* \x escapes */
6626
820
        case '\n': continue;
6627
1.41k
        case '\\': WRITE_ASCII_CHAR('\\'); continue;
6628
858
        case '\'': WRITE_ASCII_CHAR('\''); continue;
6629
1.14k
        case '\"': WRITE_ASCII_CHAR('\"'); continue;
6630
493
        case 'b': WRITE_ASCII_CHAR('\b'); continue;
6631
        /* FF */
6632
913
        case 'f': WRITE_ASCII_CHAR('\014'); continue;
6633
408
        case 't': WRITE_ASCII_CHAR('\t'); continue;
6634
915
        case 'n': WRITE_ASCII_CHAR('\n'); continue;
6635
1.27k
        case 'r': WRITE_ASCII_CHAR('\r'); continue;
6636
        /* VT */
6637
823
        case 'v': WRITE_ASCII_CHAR('\013'); continue;
6638
        /* BEL, not classic C */
6639
713
        case 'a': WRITE_ASCII_CHAR('\007'); continue;
6640
6641
            /* \OOO (octal) escapes */
6642
3.29k
        case '0': case '1': case '2': case '3':
6643
6.23k
        case '4': case '5': case '6': case '7':
6644
6.23k
            ch = c - '0';
6645
6.23k
            if (s < end && '0' <= *s && *s <= '7') {
6646
2.29k
                ch = (ch<<3) + *s++ - '0';
6647
2.29k
                if (s < end && '0' <= *s && *s <= '7') {
6648
1.18k
                    ch = (ch<<3) + *s++ - '0';
6649
1.18k
                }
6650
2.29k
            }
6651
6.23k
            if (ch > 0377) {
6652
1.04k
                if (*first_invalid_escape_char == -1) {
6653
794
                    *first_invalid_escape_char = ch;
6654
794
                    if (starts == initial_starts) {
6655
                        /* Back up 3 chars, since we've already incremented s. */
6656
794
                        *first_invalid_escape_ptr = s - 3;
6657
794
                    }
6658
794
                }
6659
1.04k
            }
6660
6.23k
            WRITE_CHAR(ch);
6661
6.23k
            continue;
6662
6663
            /* hex escapes */
6664
            /* \xXX */
6665
6.23k
        case 'x':
6666
5.96k
            count = 2;
6667
5.96k
            message = "truncated \\xXX escape";
6668
5.96k
            goto hexescape;
6669
6670
            /* \uXXXX */
6671
8.93k
        case 'u':
6672
8.93k
            count = 4;
6673
8.93k
            message = "truncated \\uXXXX escape";
6674
8.93k
            goto hexescape;
6675
6676
            /* \UXXXXXXXX */
6677
10.4k
        case 'U':
6678
10.4k
            count = 8;
6679
10.4k
            message = "truncated \\UXXXXXXXX escape";
6680
25.4k
        hexescape:
6681
157k
            for (ch = 0; count; ++s, --count) {
6682
131k
                if (s >= end) {
6683
5
                    goto incomplete;
6684
5
                }
6685
131k
                c = (unsigned char)*s;
6686
131k
                ch <<= 4;
6687
131k
                if (c >= '0' && c <= '9') {
6688
100k
                    ch += c - '0';
6689
100k
                }
6690
31.2k
                else if (c >= 'a' && c <= 'f') {
6691
31.0k
                    ch += c - ('a' - 10);
6692
31.0k
                }
6693
243
                else if (c >= 'A' && c <= 'F') {
6694
239
                    ch += c - ('A' - 10);
6695
239
                }
6696
4
                else {
6697
4
                    goto error;
6698
4
                }
6699
131k
            }
6700
6701
            /* when we get here, ch is a 32-bit unicode character */
6702
25.3k
            if (ch > MAX_UNICODE) {
6703
1
                message = "illegal Unicode character";
6704
1
                goto error;
6705
1
            }
6706
6707
25.3k
            WRITE_CHAR(ch);
6708
25.3k
            continue;
6709
6710
            /* \N{name} */
6711
25.3k
        case 'N':
6712
2.34k
            ucnhash_capi = _PyUnicode_GetNameCAPI();
6713
2.34k
            if (ucnhash_capi == NULL) {
6714
0
                PyErr_SetString(
6715
0
                        PyExc_UnicodeError,
6716
0
                        "\\N escapes not supported (can't load unicodedata module)"
6717
0
                );
6718
0
                goto onError;
6719
0
            }
6720
6721
2.34k
            message = "malformed \\N character escape";
6722
2.34k
            if (s >= end) {
6723
4
                goto incomplete;
6724
4
            }
6725
2.33k
            if (*s == '{') {
6726
2.33k
                const char *start = ++s;
6727
2.33k
                size_t namelen;
6728
                /* look for the closing brace */
6729
38.3k
                while (s < end && *s != '}')
6730
36.0k
                    s++;
6731
2.33k
                if (s >= end) {
6732
13
                    goto incomplete;
6733
13
                }
6734
2.32k
                namelen = s - start;
6735
2.32k
                if (namelen) {
6736
                    /* found a name.  look it up in the unicode database */
6737
2.31k
                    s++;
6738
2.31k
                    ch = 0xffffffff; /* in case 'getcode' messes up */
6739
2.31k
                    if (namelen <= INT_MAX &&
6740
2.31k
                        ucnhash_capi->getcode(start, (int)namelen,
6741
2.31k
                                              &ch, 0)) {
6742
2.24k
                        assert(ch <= MAX_UNICODE);
6743
2.24k
                        WRITE_CHAR(ch);
6744
2.24k
                        continue;
6745
2.24k
                    }
6746
72
                    message = "unknown Unicode character name";
6747
72
                }
6748
2.32k
            }
6749
78
            goto error;
6750
6751
5.33k
        default:
6752
5.33k
            if (*first_invalid_escape_char == -1) {
6753
4.03k
                *first_invalid_escape_char = c;
6754
4.03k
                if (starts == initial_starts) {
6755
                    /* Back up one char, since we've already incremented s. */
6756
4.03k
                    *first_invalid_escape_ptr = s - 1;
6757
4.03k
                }
6758
4.03k
            }
6759
5.33k
            WRITE_ASCII_CHAR('\\');
6760
5.33k
            WRITE_CHAR(c);
6761
5.33k
            continue;
6762
49.0k
        }
6763
6764
22
      incomplete:
6765
22
        if (consumed) {
6766
0
            *consumed = startinpos;
6767
0
            break;
6768
0
        }
6769
105
      error:;
6770
105
        Py_ssize_t endinpos = s-starts;
6771
105
        writer.min_length = end - s + writer.pos;
6772
105
        if (unicode_decode_call_errorhandler_writer(
6773
105
                errors, &errorHandler,
6774
105
                "unicodeescape", message,
6775
105
                &starts, &end, &startinpos, &endinpos, &exc, &s,
6776
105
                &writer)) {
6777
105
            goto onError;
6778
105
        }
6779
105
        assert(end - s <= writer.size - writer.pos);
6780
6781
0
#undef WRITE_ASCII_CHAR
6782
0
#undef WRITE_CHAR
6783
0
    }
6784
6785
29.3k
    Py_XDECREF(errorHandler);
6786
29.3k
    Py_XDECREF(exc);
6787
29.3k
    return _PyUnicodeWriter_Finish(&writer);
6788
6789
105
  onError:
6790
105
    _PyUnicodeWriter_Dealloc(&writer);
6791
105
    Py_XDECREF(errorHandler);
6792
105
    Py_XDECREF(exc);
6793
105
    return NULL;
6794
29.4k
}
6795
6796
PyObject *
6797
_PyUnicode_DecodeUnicodeEscapeStateful(const char *s,
6798
                              Py_ssize_t size,
6799
                              const char *errors,
6800
                              Py_ssize_t *consumed)
6801
0
{
6802
0
    int first_invalid_escape_char;
6803
0
    const char *first_invalid_escape_ptr;
6804
0
    PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal2(s, size, errors,
6805
0
                                                      consumed,
6806
0
                                                      &first_invalid_escape_char,
6807
0
                                                      &first_invalid_escape_ptr);
6808
0
    if (result == NULL)
6809
0
        return NULL;
6810
0
    if (first_invalid_escape_char != -1) {
6811
0
        if (first_invalid_escape_char > 0xff) {
6812
0
            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6813
0
                                 "\"\\%o\" is an invalid octal escape sequence. "
6814
0
                                 "Such sequences will not work in the future. ",
6815
0
                                 first_invalid_escape_char) < 0)
6816
0
            {
6817
0
                Py_DECREF(result);
6818
0
                return NULL;
6819
0
            }
6820
0
        }
6821
0
        else {
6822
0
            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6823
0
                                 "\"\\%c\" is an invalid escape sequence. "
6824
0
                                 "Such sequences will not work in the future. ",
6825
0
                                 first_invalid_escape_char) < 0)
6826
0
            {
6827
0
                Py_DECREF(result);
6828
0
                return NULL;
6829
0
            }
6830
0
        }
6831
0
    }
6832
0
    return result;
6833
0
}
6834
6835
PyObject *
6836
PyUnicode_DecodeUnicodeEscape(const char *s,
6837
                              Py_ssize_t size,
6838
                              const char *errors)
6839
0
{
6840
0
    return _PyUnicode_DecodeUnicodeEscapeStateful(s, size, errors, NULL);
6841
0
}
6842
6843
/* Return a Unicode-Escape string version of the Unicode object. */
6844
6845
PyObject *
6846
PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
6847
360k
{
6848
360k
    if (!PyUnicode_Check(unicode)) {
6849
0
        PyErr_BadArgument();
6850
0
        return NULL;
6851
0
    }
6852
6853
360k
    Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
6854
360k
    if (len == 0) {
6855
0
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
6856
0
    }
6857
360k
    int kind = PyUnicode_KIND(unicode);
6858
360k
    const void *data = PyUnicode_DATA(unicode);
6859
6860
    /* Initial allocation is based on the longest-possible character
6861
     * escape.
6862
     *
6863
     * For UCS1 strings it's '\xxx', 4 bytes per source character.
6864
     * For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6865
     * For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character. */
6866
360k
    Py_ssize_t expandsize = kind * 2 + 2;
6867
360k
    if (len > PY_SSIZE_T_MAX / expandsize) {
6868
0
        return PyErr_NoMemory();
6869
0
    }
6870
6871
360k
    PyBytesWriter *writer = PyBytesWriter_Create(expandsize * len);
6872
360k
    if (writer == NULL) {
6873
0
        return NULL;
6874
0
    }
6875
360k
    char *p = PyBytesWriter_GetData(writer);
6876
6877
721k
    for (Py_ssize_t i = 0; i < len; i++) {
6878
360k
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6879
6880
        /* U+0000-U+00ff range */
6881
360k
        if (ch < 0x100) {
6882
354k
            if (ch >= ' ' && ch < 127) {
6883
28.5k
                if (ch != '\\') {
6884
                    /* Copy printable US ASCII as-is */
6885
0
                    *p++ = (char) ch;
6886
0
                }
6887
                /* Escape backslashes */
6888
28.5k
                else {
6889
28.5k
                    *p++ = '\\';
6890
28.5k
                    *p++ = '\\';
6891
28.5k
                }
6892
28.5k
            }
6893
6894
            /* Map special whitespace to '\t', \n', '\r' */
6895
325k
            else if (ch == '\t') {
6896
2.72k
                *p++ = '\\';
6897
2.72k
                *p++ = 't';
6898
2.72k
            }
6899
322k
            else if (ch == '\n') {
6900
910
                *p++ = '\\';
6901
910
                *p++ = 'n';
6902
910
            }
6903
322k
            else if (ch == '\r') {
6904
518
                *p++ = '\\';
6905
518
                *p++ = 'r';
6906
518
            }
6907
6908
            /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6909
321k
            else {
6910
321k
                *p++ = '\\';
6911
321k
                *p++ = 'x';
6912
321k
                *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6913
321k
                *p++ = Py_hexdigits[ch & 0x000F];
6914
321k
            }
6915
354k
        }
6916
        /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
6917
6.67k
        else if (ch < 0x10000) {
6918
5.63k
            *p++ = '\\';
6919
5.63k
            *p++ = 'u';
6920
5.63k
            *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6921
5.63k
            *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6922
5.63k
            *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6923
5.63k
            *p++ = Py_hexdigits[ch & 0x000F];
6924
5.63k
        }
6925
        /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6926
1.03k
        else {
6927
6928
            /* Make sure that the first two digits are zero */
6929
1.03k
            assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6930
1.03k
            *p++ = '\\';
6931
1.03k
            *p++ = 'U';
6932
1.03k
            *p++ = '0';
6933
1.03k
            *p++ = '0';
6934
1.03k
            *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6935
1.03k
            *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6936
1.03k
            *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6937
1.03k
            *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6938
1.03k
            *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6939
1.03k
            *p++ = Py_hexdigits[ch & 0x0000000F];
6940
1.03k
        }
6941
360k
    }
6942
6943
360k
    return PyBytesWriter_FinishWithPointer(writer, p);
6944
360k
}
6945
6946
/* --- Raw Unicode Escape Codec ------------------------------------------- */
6947
6948
PyObject *
6949
_PyUnicode_DecodeRawUnicodeEscapeStateful(const char *s,
6950
                                          Py_ssize_t size,
6951
                                          const char *errors,
6952
                                          Py_ssize_t *consumed)
6953
0
{
6954
0
    const char *starts = s;
6955
0
    _PyUnicodeWriter writer;
6956
0
    const char *end;
6957
0
    PyObject *errorHandler = NULL;
6958
0
    PyObject *exc = NULL;
6959
6960
0
    if (size == 0) {
6961
0
        if (consumed) {
6962
0
            *consumed = 0;
6963
0
        }
6964
0
        _Py_RETURN_UNICODE_EMPTY();
6965
0
    }
6966
6967
    /* Escaped strings will always be longer than the resulting
6968
       Unicode string, so we start with size here and then reduce the
6969
       length after conversion to the true value. (But decoding error
6970
       handler might have to resize the string) */
6971
0
    _PyUnicodeWriter_Init(&writer);
6972
0
    writer.min_length = size;
6973
0
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6974
0
        goto onError;
6975
0
    }
6976
6977
0
    end = s + size;
6978
0
    while (s < end) {
6979
0
        unsigned char c = (unsigned char) *s++;
6980
0
        Py_UCS4 ch;
6981
0
        int count;
6982
0
        const char *message;
6983
6984
0
#define WRITE_CHAR(ch)                                                        \
6985
0
            do {                                                              \
6986
0
                if (ch <= writer.maxchar) {                                   \
6987
0
                    assert(writer.pos < writer.size);                         \
6988
0
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6989
0
                }                                                             \
6990
0
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6991
0
                    goto onError;                                             \
6992
0
                }                                                             \
6993
0
            } while(0)
6994
6995
        /* Non-escape characters are interpreted as Unicode ordinals */
6996
0
        if (c != '\\' || (s >= end && !consumed)) {
6997
0
            WRITE_CHAR(c);
6998
0
            continue;
6999
0
        }
7000
7001
0
        Py_ssize_t startinpos = s - starts - 1;
7002
        /* \ - Escapes */
7003
0
        if (s >= end) {
7004
0
            assert(consumed);
7005
            // Set message to silent compiler warning.
7006
            // Actually it is never used.
7007
0
            message = "\\ at end of string";
7008
0
            goto incomplete;
7009
0
        }
7010
7011
0
        c = (unsigned char) *s++;
7012
0
        if (c == 'u') {
7013
0
            count = 4;
7014
0
            message = "truncated \\uXXXX escape";
7015
0
        }
7016
0
        else if (c == 'U') {
7017
0
            count = 8;
7018
0
            message = "truncated \\UXXXXXXXX escape";
7019
0
        }
7020
0
        else {
7021
0
            assert(writer.pos < writer.size);
7022
0
            PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
7023
0
            WRITE_CHAR(c);
7024
0
            continue;
7025
0
        }
7026
7027
        /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
7028
0
        for (ch = 0; count; ++s, --count) {
7029
0
            if (s >= end) {
7030
0
                goto incomplete;
7031
0
            }
7032
0
            c = (unsigned char)*s;
7033
0
            ch <<= 4;
7034
0
            if (c >= '0' && c <= '9') {
7035
0
                ch += c - '0';
7036
0
            }
7037
0
            else if (c >= 'a' && c <= 'f') {
7038
0
                ch += c - ('a' - 10);
7039
0
            }
7040
0
            else if (c >= 'A' && c <= 'F') {
7041
0
                ch += c - ('A' - 10);
7042
0
            }
7043
0
            else {
7044
0
                goto error;
7045
0
            }
7046
0
        }
7047
0
        if (ch > MAX_UNICODE) {
7048
0
            message = "\\Uxxxxxxxx out of range";
7049
0
            goto error;
7050
0
        }
7051
0
        WRITE_CHAR(ch);
7052
0
        continue;
7053
7054
0
      incomplete:
7055
0
        if (consumed) {
7056
0
            *consumed = startinpos;
7057
0
            break;
7058
0
        }
7059
0
      error:;
7060
0
        Py_ssize_t endinpos = s-starts;
7061
0
        writer.min_length = end - s + writer.pos;
7062
0
        if (unicode_decode_call_errorhandler_writer(
7063
0
                errors, &errorHandler,
7064
0
                "rawunicodeescape", message,
7065
0
                &starts, &end, &startinpos, &endinpos, &exc, &s,
7066
0
                &writer)) {
7067
0
            goto onError;
7068
0
        }
7069
0
        assert(end - s <= writer.size - writer.pos);
7070
7071
0
#undef WRITE_CHAR
7072
0
    }
7073
0
    Py_XDECREF(errorHandler);
7074
0
    Py_XDECREF(exc);
7075
0
    return _PyUnicodeWriter_Finish(&writer);
7076
7077
0
  onError:
7078
0
    _PyUnicodeWriter_Dealloc(&writer);
7079
0
    Py_XDECREF(errorHandler);
7080
0
    Py_XDECREF(exc);
7081
0
    return NULL;
7082
0
}
7083
7084
PyObject *
7085
PyUnicode_DecodeRawUnicodeEscape(const char *s,
7086
                                 Py_ssize_t size,
7087
                                 const char *errors)
7088
0
{
7089
0
    return _PyUnicode_DecodeRawUnicodeEscapeStateful(s, size, errors, NULL);
7090
0
}
7091
7092
7093
PyObject *
PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
{
    /* Encode a str object with the Raw-Unicode-Escape codec.  Returns a
       bytes object, or NULL with an exception set on failure. */
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    int kind = PyUnicode_KIND(unicode);
    const void *data = PyUnicode_DATA(unicode);
    Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
    if (len == 0) {
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
    }
    if (kind == PyUnicode_1BYTE_KIND) {
        /* Every character is below 0x100 and would be copied unchanged,
           so the character buffer can be used as the result directly. */
        return PyBytes_FromStringAndSize(data, len);
    }

    /* Worst case per source character: 10 bytes for 4 byte characters,
       6 bytes for 2 byte characters, and 4 for 1 byte characters. */
    Py_ssize_t expandsize = kind * 2 + 2;
    if (len > PY_SSIZE_T_MAX / expandsize) {
        return PyErr_NoMemory();
    }

    PyBytesWriter *writer = PyBytesWriter_Create(expandsize * len);
    if (writer == NULL) {
        return NULL;
    }
    char *p = PyBytesWriter_GetData(writer);

    for (Py_ssize_t pos = 0; pos < len; pos++) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        /* U+0000-U+00ff range: Copy 8-bit characters as-is */
        if (ch < 0x100) {
            *p++ = (char) ch;
            continue;
        }

        int shift;
        *p++ = '\\';
        if (ch < 0x10000) {
            /* U+0100-U+ffff range: '\uHHHH' */
            *p++ = 'u';
            shift = 12;
        }
        else {
            /* U+010000-U+10ffff range: '\U00HHHHHH' */
            assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
            *p++ = 'U';
            *p++ = '0';
            *p++ = '0';
            shift = 20;
        }
        /* Emit the remaining hex digits, most significant first. */
        for (; shift >= 0; shift -= 4) {
            *p++ = Py_hexdigits[(ch >> shift) & 0xf];
        }
    }

    return PyBytesWriter_FinishWithPointer(writer, p);
}
7157
7158
/* --- Latin-1 Codec ------------------------------------------------------ */
7159
7160
PyObject *
7161
PyUnicode_DecodeLatin1(const char *s,
7162
                       Py_ssize_t size,
7163
                       const char *errors)
7164
3.17M
{
7165
    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
7166
3.17M
    return _PyUnicode_FromUCS1((const unsigned char*)s, size);
7167
3.17M
}
7168
7169
/* create or adjust a UnicodeEncodeError */
7170
static void
7171
make_encode_exception(PyObject **exceptionObject,
7172
                      const char *encoding,
7173
                      PyObject *unicode,
7174
                      Py_ssize_t startpos, Py_ssize_t endpos,
7175
                      const char *reason)
7176
197k
{
7177
197k
    if (*exceptionObject == NULL) {
7178
197k
        *exceptionObject = PyObject_CallFunction(
7179
197k
            PyExc_UnicodeEncodeError, "sOnns",
7180
197k
            encoding, unicode, startpos, endpos, reason);
7181
197k
    }
7182
0
    else {
7183
0
        if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
7184
0
            goto onError;
7185
0
        if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
7186
0
            goto onError;
7187
0
        if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
7188
0
            goto onError;
7189
0
        return;
7190
0
      onError:
7191
0
        Py_CLEAR(*exceptionObject);
7192
0
    }
7193
197k
}
7194
7195
/* raises a UnicodeEncodeError */
7196
static void
7197
raise_encode_exception(PyObject **exceptionObject,
7198
                       const char *encoding,
7199
                       PyObject *unicode,
7200
                       Py_ssize_t startpos, Py_ssize_t endpos,
7201
                       const char *reason)
7202
38.7k
{
7203
38.7k
    make_encode_exception(exceptionObject,
7204
38.7k
                          encoding, unicode, startpos, endpos, reason);
7205
38.7k
    if (*exceptionObject != NULL)
7206
38.7k
        PyCodec_StrictErrors(*exceptionObject);
7207
38.7k
}
7208
7209
/* error handling callback helper:
7210
   build arguments, call the callback and check the arguments,
7211
   put the result into newpos and return the replacement string, which
7212
   has to be freed by the caller */
7213
static PyObject *
7214
unicode_encode_call_errorhandler(const char *errors,
7215
                                 PyObject **errorHandler,
7216
                                 const char *encoding, const char *reason,
7217
                                 PyObject *unicode, PyObject **exceptionObject,
7218
                                 Py_ssize_t startpos, Py_ssize_t endpos,
7219
                                 Py_ssize_t *newpos)
7220
159k
{
7221
159k
    static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
7222
159k
    Py_ssize_t len;
7223
159k
    PyObject *restuple;
7224
159k
    PyObject *resunicode;
7225
7226
159k
    if (*errorHandler == NULL) {
7227
159k
        *errorHandler = PyCodec_LookupError(errors);
7228
159k
        if (*errorHandler == NULL)
7229
0
            return NULL;
7230
159k
    }
7231
7232
159k
    len = PyUnicode_GET_LENGTH(unicode);
7233
7234
159k
    make_encode_exception(exceptionObject,
7235
159k
                          encoding, unicode, startpos, endpos, reason);
7236
159k
    if (*exceptionObject == NULL)
7237
0
        return NULL;
7238
7239
159k
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
7240
159k
    if (restuple == NULL)
7241
159k
        return NULL;
7242
0
    if (!PyTuple_Check(restuple)) {
7243
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
7244
0
        Py_DECREF(restuple);
7245
0
        return NULL;
7246
0
    }
7247
0
    if (!PyArg_ParseTuple(restuple, argparse,
7248
0
                          &resunicode, newpos)) {
7249
0
        Py_DECREF(restuple);
7250
0
        return NULL;
7251
0
    }
7252
0
    if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
7253
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
7254
0
        Py_DECREF(restuple);
7255
0
        return NULL;
7256
0
    }
7257
0
    if (*newpos<0)
7258
0
        *newpos = len + *newpos;
7259
0
    if (*newpos<0 || *newpos>len) {
7260
0
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7261
0
        Py_DECREF(restuple);
7262
0
        return NULL;
7263
0
    }
7264
0
    Py_INCREF(resunicode);
7265
0
    Py_DECREF(restuple);
7266
0
    return resunicode;
7267
0
}
7268
7269
static PyObject *
7270
unicode_encode_ucs1(PyObject *unicode,
7271
                    const char *errors,
7272
                    const Py_UCS4 limit)
7273
49.6k
{
7274
    /* input state */
7275
49.6k
    Py_ssize_t pos=0, size;
7276
49.6k
    int kind;
7277
49.6k
    const void *data;
7278
49.6k
    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
7279
49.6k
    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
7280
49.6k
    PyObject *error_handler_obj = NULL;
7281
49.6k
    PyObject *exc = NULL;
7282
49.6k
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
7283
49.6k
    PyObject *rep = NULL;
7284
7285
49.6k
    size = PyUnicode_GET_LENGTH(unicode);
7286
49.6k
    kind = PyUnicode_KIND(unicode);
7287
49.6k
    data = PyUnicode_DATA(unicode);
7288
    /* allocate enough for a simple encoding without
7289
       replacements, if we need more, we'll resize */
7290
49.6k
    if (size == 0)
7291
0
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
7292
7293
    /* output object */
7294
49.6k
    PyBytesWriter *writer = PyBytesWriter_Create(size);
7295
49.6k
    if (writer == NULL) {
7296
0
        return NULL;
7297
0
    }
7298
    /* pointer into the output */
7299
49.6k
    char *str = PyBytesWriter_GetData(writer);
7300
7301
3.53M
    while (pos < size) {
7302
3.53M
        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
7303
7304
        /* can we encode this? */
7305
3.53M
        if (ch < limit) {
7306
            /* no overflow check, because we know that the space is enough */
7307
3.49M
            *str++ = (char)ch;
7308
3.49M
            ++pos;
7309
3.49M
        }
7310
49.6k
        else {
7311
49.6k
            Py_ssize_t newpos, i;
7312
            /* startpos for collecting unencodable chars */
7313
49.6k
            Py_ssize_t collstart = pos;
7314
49.6k
            Py_ssize_t collend = collstart + 1;
7315
            /* find all unecodable characters */
7316
7317
384k
            while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
7318
335k
                ++collend;
7319
7320
            /* Only overallocate the buffer if it's not the last write */
7321
49.6k
            writer->overallocate = (collend < size);
7322
7323
            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
7324
49.6k
            if (error_handler == _Py_ERROR_UNKNOWN)
7325
49.6k
                error_handler = _Py_GetErrorHandler(errors);
7326
7327
49.6k
            switch (error_handler) {
7328
38.7k
            case _Py_ERROR_STRICT:
7329
38.7k
                raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
7330
38.7k
                goto onError;
7331
7332
0
            case _Py_ERROR_REPLACE:
7333
0
                memset(str, '?', collend - collstart);
7334
0
                str += (collend - collstart);
7335
0
                _Py_FALLTHROUGH;
7336
0
            case _Py_ERROR_IGNORE:
7337
0
                pos = collend;
7338
0
                break;
7339
7340
0
            case _Py_ERROR_BACKSLASHREPLACE:
7341
                /* subtract preallocated bytes */
7342
0
                writer->size -= (collend - collstart);
7343
0
                str = backslashreplace(writer, str,
7344
0
                                       unicode, collstart, collend);
7345
0
                if (str == NULL)
7346
0
                    goto onError;
7347
0
                pos = collend;
7348
0
                break;
7349
7350
0
            case _Py_ERROR_XMLCHARREFREPLACE:
7351
                /* subtract preallocated bytes */
7352
0
                writer->size -= (collend - collstart);
7353
0
                str = xmlcharrefreplace(writer, str,
7354
0
                                        unicode, collstart, collend);
7355
0
                if (str == NULL)
7356
0
                    goto onError;
7357
0
                pos = collend;
7358
0
                break;
7359
7360
10.8k
            case _Py_ERROR_SURROGATEESCAPE:
7361
10.8k
                for (i = collstart; i < collend; ++i) {
7362
10.8k
                    ch = PyUnicode_READ(kind, data, i);
7363
10.8k
                    if (ch < 0xdc80 || 0xdcff < ch) {
7364
                        /* Not a UTF-8b surrogate */
7365
10.8k
                        break;
7366
10.8k
                    }
7367
0
                    *str++ = (char)(ch - 0xdc00);
7368
0
                    ++pos;
7369
0
                }
7370
10.8k
                if (i >= collend)
7371
0
                    break;
7372
10.8k
                collstart = pos;
7373
10.8k
                assert(collstart != collend);
7374
10.8k
                _Py_FALLTHROUGH;
7375
7376
10.8k
            default:
7377
10.8k
                rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
7378
10.8k
                                                       encoding, reason, unicode, &exc,
7379
10.8k
                                                       collstart, collend, &newpos);
7380
10.8k
                if (rep == NULL)
7381
10.8k
                    goto onError;
7382
7383
0
                if (newpos < collstart) {
7384
0
                    writer->overallocate = 1;
7385
0
                    str = PyBytesWriter_GrowAndUpdatePointer(writer,
7386
0
                                                             collstart - newpos,
7387
0
                                                             str);
7388
0
                    if (str == NULL) {
7389
0
                        goto onError;
7390
0
                    }
7391
0
                }
7392
0
                else {
7393
                    /* subtract preallocated bytes */
7394
0
                    writer->size -= newpos - collstart;
7395
                    /* Only overallocate the buffer if it's not the last write */
7396
0
                    writer->overallocate = (newpos < size);
7397
0
                }
7398
7399
0
                char *rep_str;
7400
0
                Py_ssize_t rep_len;
7401
0
                if (PyBytes_Check(rep)) {
7402
                    /* Directly copy bytes result to output. */
7403
0
                    rep_str = PyBytes_AS_STRING(rep);
7404
0
                    rep_len = PyBytes_GET_SIZE(rep);
7405
0
                }
7406
0
                else {
7407
0
                    assert(PyUnicode_Check(rep));
7408
7409
0
                    if (limit == 256 ?
7410
0
                        PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
7411
0
                        !PyUnicode_IS_ASCII(rep))
7412
0
                    {
7413
                        /* Not all characters are smaller than limit */
7414
0
                        raise_encode_exception(&exc, encoding, unicode,
7415
0
                                               collstart, collend, reason);
7416
0
                        goto onError;
7417
0
                    }
7418
0
                    assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
7419
0
                    rep_str = PyUnicode_DATA(rep);
7420
0
                    rep_len = PyUnicode_GET_LENGTH(rep);
7421
0
                }
7422
7423
0
                str = PyBytesWriter_GrowAndUpdatePointer(writer, rep_len, str);
7424
0
                if (str == NULL) {
7425
0
                    goto onError;
7426
0
                }
7427
0
                memcpy(str, rep_str, rep_len);
7428
0
                str += rep_len;
7429
7430
0
                pos = newpos;
7431
0
                Py_CLEAR(rep);
7432
49.6k
            }
7433
7434
            /* If overallocation was disabled, ensure that it was the last
7435
               write. Otherwise, we missed an optimization */
7436
49.6k
            assert(writer->overallocate || pos == size);
7437
0
        }
7438
3.53M
    }
7439
7440
0
    Py_XDECREF(error_handler_obj);
7441
0
    Py_XDECREF(exc);
7442
0
    return PyBytesWriter_FinishWithPointer(writer, str);
7443
7444
49.6k
  onError:
7445
49.6k
    Py_XDECREF(rep);
7446
49.6k
    PyBytesWriter_Discard(writer);
7447
49.6k
    Py_XDECREF(error_handler_obj);
7448
49.6k
    Py_XDECREF(exc);
7449
49.6k
    return NULL;
7450
49.6k
}
7451
7452
PyObject *
_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND) {
        /* Non-Latin-1 characters are present: the generic encoder
           applies the 'errors' policy (or raises the exception). */
        return unicode_encode_ucs1(unicode, errors, 256);
    }

    /* Fast path: a one-byte string's buffer is copied into a bytes
       object directly. */
    return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
                                     PyUnicode_GET_LENGTH(unicode));
}
7468
7469
PyObject*
PyUnicode_AsLatin1String(PyObject *unicode)
{
    /* Latin-1 encoding with no explicit error handler name (NULL). */
    PyObject *encoded = _PyUnicode_AsLatin1String(unicode, NULL);
    return encoded;
}
7474
7475
/* --- 7-bit ASCII Codec -------------------------------------------------- */
7476
7477
PyObject *
7478
PyUnicode_DecodeASCII(const char *s,
7479
                      Py_ssize_t size,
7480
                      const char *errors)
7481
595k
{
7482
595k
    const char *starts = s;
7483
595k
    const char *e = s + size;
7484
595k
    PyObject *error_handler_obj = NULL;
7485
595k
    PyObject *exc = NULL;
7486
595k
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
7487
7488
595k
    if (size == 0)
7489
0
        _Py_RETURN_UNICODE_EMPTY();
7490
7491
    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
7492
595k
    if (size == 1 && (unsigned char)s[0] < 128) {
7493
8.29k
        return get_latin1_char((unsigned char)s[0]);
7494
8.29k
    }
7495
7496
    // Shortcut for simple case
7497
587k
    PyObject *u = PyUnicode_New(size, 127);
7498
587k
    if (u == NULL) {
7499
0
        return NULL;
7500
0
    }
7501
587k
    Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
7502
587k
    if (outpos == size) {
7503
440k
        return u;
7504
440k
    }
7505
7506
147k
    _PyUnicodeWriter writer;
7507
147k
    _PyUnicodeWriter_InitWithBuffer(&writer, u);
7508
147k
    writer.pos = outpos;
7509
7510
147k
    s += outpos;
7511
147k
    int kind = writer.kind;
7512
147k
    void *data = writer.data;
7513
147k
    Py_ssize_t startinpos, endinpos;
7514
7515
18.9M
    while (s < e) {
7516
18.7M
        unsigned char c = (unsigned char)*s;
7517
18.7M
        if (c < 128) {
7518
6.13M
            PyUnicode_WRITE(kind, data, writer.pos, c);
7519
6.13M
            writer.pos++;
7520
6.13M
            ++s;
7521
6.13M
            continue;
7522
6.13M
        }
7523
7524
        /* byte outsize range 0x00..0x7f: call the error handler */
7525
7526
12.6M
        if (error_handler == _Py_ERROR_UNKNOWN)
7527
147k
            error_handler = _Py_GetErrorHandler(errors);
7528
7529
12.6M
        switch (error_handler)
7530
12.6M
        {
7531
725k
        case _Py_ERROR_REPLACE:
7532
12.6M
        case _Py_ERROR_SURROGATEESCAPE:
7533
            /* Fast-path: the error handler only writes one character,
7534
               but we may switch to UCS2 at the first write */
7535
12.6M
            if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7536
0
                goto onError;
7537
12.6M
            kind = writer.kind;
7538
12.6M
            data = writer.data;
7539
7540
12.6M
            if (error_handler == _Py_ERROR_REPLACE)
7541
725k
                PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7542
11.9M
            else
7543
11.9M
                PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7544
12.6M
            writer.pos++;
7545
12.6M
            ++s;
7546
12.6M
            break;
7547
7548
0
        case _Py_ERROR_IGNORE:
7549
0
            ++s;
7550
0
            break;
7551
7552
6.66k
        default:
7553
6.66k
            startinpos = s-starts;
7554
6.66k
            endinpos = startinpos + 1;
7555
6.66k
            if (unicode_decode_call_errorhandler_writer(
7556
6.66k
                    errors, &error_handler_obj,
7557
6.66k
                    "ascii", "ordinal not in range(128)",
7558
6.66k
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
7559
6.66k
                    &writer))
7560
6.66k
                goto onError;
7561
0
            kind = writer.kind;
7562
0
            data = writer.data;
7563
12.6M
        }
7564
12.6M
    }
7565
140k
    Py_XDECREF(error_handler_obj);
7566
140k
    Py_XDECREF(exc);
7567
140k
    return _PyUnicodeWriter_Finish(&writer);
7568
7569
6.66k
  onError:
7570
6.66k
    _PyUnicodeWriter_Dealloc(&writer);
7571
6.66k
    Py_XDECREF(error_handler_obj);
7572
6.66k
    Py_XDECREF(exc);
7573
6.66k
    return NULL;
7574
147k
}
7575
7576
PyObject *
_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (!PyUnicode_IS_ASCII(unicode)) {
        /* Non-ASCII characters are present: the generic encoder applies
           the 'errors' policy (or raises the exception). */
        return unicode_encode_ucs1(unicode, errors, 128);
    }

    /* Fast path: an ASCII-only string's buffer is copied into a bytes
       object directly. */
    return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
                                     PyUnicode_GET_LENGTH(unicode));
}
7590
7591
PyObject *
PyUnicode_AsASCIIString(PyObject *unicode)
{
    /* ASCII encoding with no explicit error handler name (NULL). */
    PyObject *encoded = _PyUnicode_AsASCIIString(unicode, NULL);
    return encoded;
}
7596
7597
#ifdef MS_WINDOWS
7598
7599
/* --- MBCS codecs for Windows -------------------------------------------- */
7600
7601
#if SIZEOF_INT < SIZEOF_SIZE_T
7602
#define NEED_RETRY
7603
#endif
7604
7605
/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
7606
   transcoding from UTF-16), but INT_MAX / 4 performs better in
7607
   both cases and also avoids partial characters overrunning the
7608
   length limit in MultiByteToWideChar on Windows. */
7609
#define DECODING_CHUNK_SIZE (INT_MAX/4)
7610
7611
#ifndef WC_ERR_INVALID_CHARS
7612
#  define WC_ERR_INVALID_CHARS 0x0080
7613
#endif
7614
7615
static const char*
7616
code_page_name(UINT code_page, PyObject **obj)
7617
{
7618
    *obj = NULL;
7619
    if (code_page == CP_ACP)
7620
        return "mbcs";
7621
7622
    *obj = PyBytes_FromFormat("cp%u", code_page);
7623
    if (*obj == NULL)
7624
        return NULL;
7625
    return PyBytes_AS_STRING(*obj);
7626
}
7627
7628
static DWORD
7629
decode_code_page_flags(UINT code_page)
7630
{
7631
    if (code_page == CP_UTF7) {
7632
        /* The CP_UTF7 decoder only supports flags=0 */
7633
        return 0;
7634
    }
7635
    else
7636
        return MB_ERR_INVALID_CHARS;
7637
}
7638
7639
/*
7640
 * Decode a byte string from a Windows code page into unicode object in strict
7641
 * mode.
7642
 *
7643
 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7644
 * OSError and returns -1 on other error.
7645
 */
7646
static int
7647
decode_code_page_strict(UINT code_page,
7648
                        wchar_t **buf,
7649
                        Py_ssize_t *bufsize,
7650
                        const char *in,
7651
                        int insize)
7652
{
7653
    DWORD flags = MB_ERR_INVALID_CHARS;
7654
    wchar_t *out;
7655
    DWORD outsize;
7656
7657
    /* First get the size of the result */
7658
    assert(insize > 0);
7659
    while ((outsize = MultiByteToWideChar(code_page, flags,
7660
                                          in, insize, NULL, 0)) <= 0)
7661
    {
7662
        if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7663
            goto error;
7664
        }
7665
        /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7666
        flags = 0;
7667
    }
7668
7669
    /* Extend a wchar_t* buffer */
7670
    Py_ssize_t n = *bufsize;   /* Get the current length */
7671
    if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7672
        return -1;
7673
    }
7674
    out = *buf + n;
7675
7676
    /* Do the conversion */
7677
    outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7678
    if (outsize <= 0)
7679
        goto error;
7680
    return insize;
7681
7682
error:
7683
    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7684
        return -2;
7685
    PyErr_SetFromWindowsErr(0);
7686
    return -1;
7687
}
7688
7689
/*
7690
 * Decode a byte string from a code page into unicode object with an error
7691
 * handler.
7692
 *
7693
 * Returns consumed size if succeed, or raise an OSError or
7694
 * UnicodeDecodeError exception and returns -1 on error.
7695
 */
7696
static int
7697
decode_code_page_errors(UINT code_page,
7698
                        wchar_t **buf,
7699
                        Py_ssize_t *bufsize,
7700
                        const char *in, const int size,
7701
                        const char *errors, int final)
7702
{
7703
    const char *startin = in;
7704
    const char *endin = in + size;
7705
    DWORD flags = MB_ERR_INVALID_CHARS;
7706
    /* Ideally, we should get reason from FormatMessage. This is the Windows
7707
       2000 English version of the message. */
7708
    const char *reason = "No mapping for the Unicode character exists "
7709
                         "in the target code page.";
7710
    /* each step cannot decode more than 1 character, but a character can be
7711
       represented as a surrogate pair */
7712
    wchar_t buffer[2], *out;
7713
    int insize;
7714
    Py_ssize_t outsize;
7715
    PyObject *errorHandler = NULL;
7716
    PyObject *exc = NULL;
7717
    PyObject *encoding_obj = NULL;
7718
    const char *encoding;
7719
    DWORD err;
7720
    int ret = -1;
7721
7722
    assert(size > 0);
7723
7724
    encoding = code_page_name(code_page, &encoding_obj);
7725
    if (encoding == NULL)
7726
        return -1;
7727
7728
    if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
7729
        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7730
           UnicodeDecodeError. */
7731
        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7732
        if (exc != NULL) {
7733
            PyCodec_StrictErrors(exc);
7734
            Py_CLEAR(exc);
7735
        }
7736
        goto error;
7737
    }
7738
7739
    /* Extend a wchar_t* buffer */
7740
    Py_ssize_t n = *bufsize;   /* Get the current length */
7741
    if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7742
        PyErr_NoMemory();
7743
        goto error;
7744
    }
7745
    if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7746
        goto error;
7747
    }
7748
    out = *buf + n;
7749
7750
    /* Decode the byte string character per character */
7751
    while (in < endin)
7752
    {
7753
        /* Decode a character */
7754
        insize = 1;
7755
        do
7756
        {
7757
            outsize = MultiByteToWideChar(code_page, flags,
7758
                                          in, insize,
7759
                                          buffer, Py_ARRAY_LENGTH(buffer));
7760
            if (outsize > 0)
7761
                break;
7762
            err = GetLastError();
7763
            if (err == ERROR_INVALID_FLAGS && flags) {
7764
                /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7765
                flags = 0;
7766
                continue;
7767
            }
7768
            if (err != ERROR_NO_UNICODE_TRANSLATION
7769
                && err != ERROR_INSUFFICIENT_BUFFER)
7770
            {
7771
                PyErr_SetFromWindowsErr(err);
7772
                goto error;
7773
            }
7774
            insize++;
7775
        }
7776
        /* 4=maximum length of a UTF-8 sequence */
7777
        while (insize <= 4 && (in + insize) <= endin);
7778
7779
        if (outsize <= 0) {
7780
            Py_ssize_t startinpos, endinpos, outpos;
7781
7782
            /* last character in partial decode? */
7783
            if (in + insize >= endin && !final)
7784
                break;
7785
7786
            startinpos = in - startin;
7787
            endinpos = startinpos + 1;
7788
            outpos = out - *buf;
7789
            if (unicode_decode_call_errorhandler_wchar(
7790
                    errors, &errorHandler,
7791
                    encoding, reason,
7792
                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
7793
                    buf, bufsize, &outpos))
7794
            {
7795
                goto error;
7796
            }
7797
            out = *buf + outpos;
7798
        }
7799
        else {
7800
            in += insize;
7801
            memcpy(out, buffer, outsize * sizeof(wchar_t));
7802
            out += outsize;
7803
        }
7804
    }
7805
7806
    /* Shrink the buffer */
7807
    assert(out - *buf <= *bufsize);
7808
    *bufsize = out - *buf;
7809
    /* (in - startin) <= size and size is an int */
7810
    ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
7811
7812
error:
7813
    Py_XDECREF(encoding_obj);
7814
    Py_XDECREF(errorHandler);
7815
    Py_XDECREF(exc);
7816
    return ret;
7817
}
7818
7819
static PyObject *
7820
decode_code_page_stateful(int code_page,
7821
                          const char *s, Py_ssize_t size,
7822
                          const char *errors, Py_ssize_t *consumed)
7823
{
7824
    wchar_t *buf = NULL;
7825
    Py_ssize_t bufsize = 0;
7826
    int chunk_size, final, converted, done;
7827
7828
    if (code_page < 0) {
7829
        PyErr_SetString(PyExc_ValueError, "invalid code page number");
7830
        return NULL;
7831
    }
7832
    if (size < 0) {
7833
        PyErr_BadInternalCall();
7834
        return NULL;
7835
    }
7836
7837
    if (consumed)
7838
        *consumed = 0;
7839
7840
    do
7841
    {
7842
#ifdef NEED_RETRY
7843
        if (size > DECODING_CHUNK_SIZE) {
7844
            chunk_size = DECODING_CHUNK_SIZE;
7845
            final = 0;
7846
            done = 0;
7847
        }
7848
        else
7849
#endif
7850
        {
7851
            chunk_size = (int)size;
7852
            final = (consumed == NULL);
7853
            done = 1;
7854
        }
7855
7856
        if (chunk_size == 0 && done) {
7857
            if (buf != NULL)
7858
                break;
7859
            _Py_RETURN_UNICODE_EMPTY();
7860
        }
7861
7862
        converted = decode_code_page_strict(code_page, &buf, &bufsize,
7863
                                            s, chunk_size);
7864
        if (converted == -2)
7865
            converted = decode_code_page_errors(code_page, &buf, &bufsize,
7866
                                                s, chunk_size,
7867
                                                errors, final);
7868
        assert(converted != 0 || done);
7869
7870
        if (converted < 0) {
7871
            PyMem_Free(buf);
7872
            return NULL;
7873
        }
7874
7875
        if (consumed)
7876
            *consumed += converted;
7877
7878
        s += converted;
7879
        size -= converted;
7880
    } while (!done);
7881
7882
    PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7883
    PyMem_Free(buf);
7884
    return v;
7885
}
7886
7887
PyObject *
7888
PyUnicode_DecodeCodePageStateful(int code_page,
7889
                                 const char *s,
7890
                                 Py_ssize_t size,
7891
                                 const char *errors,
7892
                                 Py_ssize_t *consumed)
7893
{
7894
    return decode_code_page_stateful(code_page, s, size, errors, consumed);
7895
}
7896
7897
PyObject *
7898
PyUnicode_DecodeMBCSStateful(const char *s,
7899
                             Py_ssize_t size,
7900
                             const char *errors,
7901
                             Py_ssize_t *consumed)
7902
{
7903
    return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7904
}
7905
7906
PyObject *
7907
PyUnicode_DecodeMBCS(const char *s,
7908
                     Py_ssize_t size,
7909
                     const char *errors)
7910
{
7911
    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7912
}
7913
7914
static DWORD
7915
encode_code_page_flags(UINT code_page, const char *errors)
7916
{
7917
    if (code_page == CP_UTF8) {
7918
        return WC_ERR_INVALID_CHARS;
7919
    }
7920
    else if (code_page == CP_UTF7) {
7921
        /* CP_UTF7 only supports flags=0 */
7922
        return 0;
7923
    }
7924
    else {
7925
        if (errors != NULL && strcmp(errors, "replace") == 0)
7926
            return 0;
7927
        else
7928
            return WC_NO_BEST_FIT_CHARS;
7929
    }
7930
}
7931
7932
/*
7933
 * Encode a Unicode string to a Windows code page into a byte string in strict
7934
 * mode.
7935
 *
7936
 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7937
 * an OSError and returns -1 on other error.
7938
 */
7939
static int
7940
encode_code_page_strict(UINT code_page, PyBytesWriter **writer,
7941
                        PyObject *unicode, Py_ssize_t offset, int len,
7942
                        const char* errors)
7943
{
7944
    BOOL usedDefaultChar = FALSE;
7945
    BOOL *pusedDefaultChar = &usedDefaultChar;
7946
    int outsize;
7947
    wchar_t *p;
7948
    Py_ssize_t size;
7949
    const DWORD flags = encode_code_page_flags(code_page, NULL);
7950
    char *out;
7951
    /* Create a substring so that we can get the UTF-16 representation
7952
       of just the slice under consideration. */
7953
    PyObject *substring;
7954
    int ret = -1;
7955
7956
    assert(len > 0);
7957
7958
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
7959
        pusedDefaultChar = &usedDefaultChar;
7960
    else
7961
        pusedDefaultChar = NULL;
7962
7963
    substring = PyUnicode_Substring(unicode, offset, offset+len);
7964
    if (substring == NULL)
7965
        return -1;
7966
    p = PyUnicode_AsWideCharString(substring, &size);
7967
    Py_CLEAR(substring);
7968
    if (p == NULL) {
7969
        return -1;
7970
    }
7971
    assert(size <= INT_MAX);
7972
7973
    /* First get the size of the result */
7974
    outsize = WideCharToMultiByte(code_page, flags,
7975
                                  p, (int)size,
7976
                                  NULL, 0,
7977
                                  NULL, pusedDefaultChar);
7978
    if (outsize <= 0)
7979
        goto error;
7980
    /* If we used a default char, then we failed! */
7981
    if (pusedDefaultChar && *pusedDefaultChar) {
7982
        ret = -2;
7983
        goto done;
7984
    }
7985
7986
    if (*writer == NULL) {
7987
        /* Create string object */
7988
        *writer = PyBytesWriter_Create(outsize);
7989
        if (*writer == NULL) {
7990
            goto done;
7991
        }
7992
        out = PyBytesWriter_GetData(*writer);
7993
    }
7994
    else {
7995
        /* Extend string object */
7996
        Py_ssize_t n = PyBytesWriter_GetSize(*writer);
7997
        if (PyBytesWriter_Grow(*writer, outsize) < 0) {
7998
            goto done;
7999
        }
8000
        out = (char*)PyBytesWriter_GetData(*writer) + n;
8001
    }
8002
8003
    /* Do the conversion */
8004
    outsize = WideCharToMultiByte(code_page, flags,
8005
                                  p, (int)size,
8006
                                  out, outsize,
8007
                                  NULL, pusedDefaultChar);
8008
    if (outsize <= 0)
8009
        goto error;
8010
    if (pusedDefaultChar && *pusedDefaultChar) {
8011
        ret = -2;
8012
        goto done;
8013
    }
8014
    ret = 0;
8015
8016
done:
8017
    PyMem_Free(p);
8018
    return ret;
8019
8020
error:
8021
    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
8022
        ret = -2;
8023
        goto done;
8024
    }
8025
    PyErr_SetFromWindowsErr(0);
8026
    goto done;
8027
}
8028
8029
/*
8030
 * Encode a Unicode string to a Windows code page into a byte string using an
8031
 * error handler.
8032
 *
8033
 * Returns consumed characters if succeed, or raise an OSError and returns
8034
 * -1 on other error.
8035
 */
8036
static int
8037
encode_code_page_errors(UINT code_page, PyBytesWriter **writer,
8038
                        PyObject *unicode, Py_ssize_t unicode_offset,
8039
                        Py_ssize_t insize, const char* errors)
8040
{
8041
    const DWORD flags = encode_code_page_flags(code_page, errors);
8042
    Py_ssize_t pos = unicode_offset;
8043
    Py_ssize_t endin = unicode_offset + insize;
8044
    /* Ideally, we should get reason from FormatMessage. This is the Windows
8045
       2000 English version of the message. */
8046
    const char *reason = "invalid character";
8047
    /* 4=maximum length of a UTF-8 sequence */
8048
    char buffer[4];
8049
    BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
8050
    Py_ssize_t outsize;
8051
    char *out;
8052
    PyObject *errorHandler = NULL;
8053
    PyObject *exc = NULL;
8054
    PyObject *encoding_obj = NULL;
8055
    const char *encoding;
8056
    Py_ssize_t newpos;
8057
    PyObject *rep;
8058
    int ret = -1;
8059
8060
    assert(insize > 0);
8061
8062
    encoding = code_page_name(code_page, &encoding_obj);
8063
    if (encoding == NULL)
8064
        return -1;
8065
8066
    if (errors == NULL || strcmp(errors, "strict") == 0) {
8067
        /* The last error was ERROR_NO_UNICODE_TRANSLATION,
8068
           then we raise a UnicodeEncodeError. */
8069
        make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
8070
        if (exc != NULL) {
8071
            PyCodec_StrictErrors(exc);
8072
            Py_DECREF(exc);
8073
        }
8074
        Py_XDECREF(encoding_obj);
8075
        return -1;
8076
    }
8077
8078
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
8079
        pusedDefaultChar = &usedDefaultChar;
8080
    else
8081
        pusedDefaultChar = NULL;
8082
8083
    if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
8084
        PyErr_NoMemory();
8085
        goto error;
8086
    }
8087
    outsize = insize * Py_ARRAY_LENGTH(buffer);
8088
8089
    if (*writer == NULL) {
8090
        /* Create string object */
8091
        *writer = PyBytesWriter_Create(outsize);
8092
        if (*writer == NULL) {
8093
            goto error;
8094
        }
8095
        out = PyBytesWriter_GetData(*writer);
8096
    }
8097
    else {
8098
        /* Extend string object */
8099
        Py_ssize_t n = PyBytesWriter_GetSize(*writer);
8100
        if (PyBytesWriter_Grow(*writer, outsize) < 0) {
8101
            goto error;
8102
        }
8103
        out = (char*)PyBytesWriter_GetData(*writer) + n;
8104
    }
8105
8106
    /* Encode the string character per character */
8107
    while (pos < endin)
8108
    {
8109
        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
8110
        wchar_t chars[2];
8111
        int charsize;
8112
        if (ch < 0x10000) {
8113
            chars[0] = (wchar_t)ch;
8114
            charsize = 1;
8115
        }
8116
        else {
8117
            chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
8118
            chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
8119
            charsize = 2;
8120
        }
8121
8122
        outsize = WideCharToMultiByte(code_page, flags,
8123
                                      chars, charsize,
8124
                                      buffer, Py_ARRAY_LENGTH(buffer),
8125
                                      NULL, pusedDefaultChar);
8126
        if (outsize > 0) {
8127
            if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
8128
            {
8129
                pos++;
8130
                memcpy(out, buffer, outsize);
8131
                out += outsize;
8132
                continue;
8133
            }
8134
        }
8135
        else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
8136
            PyErr_SetFromWindowsErr(0);
8137
            goto error;
8138
        }
8139
8140
        rep = unicode_encode_call_errorhandler(
8141
                  errors, &errorHandler, encoding, reason,
8142
                  unicode, &exc,
8143
                  pos, pos + 1, &newpos);
8144
        if (rep == NULL)
8145
            goto error;
8146
8147
        Py_ssize_t morebytes = pos - newpos;
8148
        if (PyBytes_Check(rep)) {
8149
            outsize = PyBytes_GET_SIZE(rep);
8150
            morebytes += outsize;
8151
            if (morebytes > 0) {
8152
                out = PyBytesWriter_GrowAndUpdatePointer(*writer, morebytes, out);
8153
                if (out == NULL) {
8154
                    Py_DECREF(rep);
8155
                    goto error;
8156
                }
8157
            }
8158
            memcpy(out, PyBytes_AS_STRING(rep), outsize);
8159
            out += outsize;
8160
        }
8161
        else {
8162
            Py_ssize_t i;
8163
            int kind;
8164
            const void *data;
8165
8166
            outsize = PyUnicode_GET_LENGTH(rep);
8167
            morebytes += outsize;
8168
            if (morebytes > 0) {
8169
                out = PyBytesWriter_GrowAndUpdatePointer(*writer, morebytes, out);
8170
                if (out == NULL) {
8171
                    Py_DECREF(rep);
8172
                    goto error;
8173
                }
8174
            }
8175
            kind = PyUnicode_KIND(rep);
8176
            data = PyUnicode_DATA(rep);
8177
            for (i=0; i < outsize; i++) {
8178
                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8179
                if (ch > 127) {
8180
                    raise_encode_exception(&exc,
8181
                        encoding, unicode,
8182
                        pos, pos + 1,
8183
                        "unable to encode error handler result to ASCII");
8184
                    Py_DECREF(rep);
8185
                    goto error;
8186
                }
8187
                *out = (unsigned char)ch;
8188
                out++;
8189
            }
8190
        }
8191
        pos = newpos;
8192
        Py_DECREF(rep);
8193
    }
8194
    /* write a NUL byte */
8195
    *out = 0;
8196
    outsize = out - (char*)PyBytesWriter_GetData(*writer);
8197
    assert(outsize <= PyBytesWriter_GetSize(*writer));
8198
    if (PyBytesWriter_Resize(*writer, outsize) < 0) {
8199
        goto error;
8200
    }
8201
    ret = 0;
8202
8203
error:
8204
    Py_XDECREF(encoding_obj);
8205
    Py_XDECREF(errorHandler);
8206
    Py_XDECREF(exc);
8207
    return ret;
8208
}
8209
8210
8211
/* Encode the str object `unicode` to the Windows code page `code_page` and
 * return a new bytes object.
 *
 * When NEED_RETRY is defined the string is processed in chunks of at most
 * DECODING_CHUNK_SIZE characters.  Each chunk is first encoded in strict
 * mode; if that reports an encode error (-2) the chunk is redone with the
 * `errors` handler.
 *
 * Returns NULL with an exception set on error (bad argument for a non-str
 * object, ValueError for a negative code page, or whatever the encoding
 * helpers raised).  An empty string yields the empty bytes constant.
 */
PyObject *
PyUnicode_EncodeCodePage(int code_page,
                         PyObject *unicode,
                         const char *errors)
{
    PyBytesWriter *writer = NULL;
    Py_ssize_t remaining;
    Py_ssize_t offset = 0;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (code_page < 0) {
        PyErr_SetString(PyExc_ValueError, "invalid code page number");
        return NULL;
    }

    remaining = PyUnicode_GET_LENGTH(unicode);
    if (remaining == 0) {
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
    }

    for (;;) {
        int chunk_len, last_chunk, status;

#ifdef NEED_RETRY
        if (remaining > DECODING_CHUNK_SIZE) {
            chunk_len = DECODING_CHUNK_SIZE;
            last_chunk = 0;
        }
        else
#endif
        {
            chunk_len = (int)remaining;
            last_chunk = 1;
        }

        status = encode_code_page_strict(code_page, &writer,
                                         unicode, offset, chunk_len,
                                         errors);
        if (status == -2) {
            status = encode_code_page_errors(code_page, &writer,
                                             unicode, offset,
                                             chunk_len, errors);
        }
        if (status < 0) {
            PyBytesWriter_Discard(writer);
            return NULL;
        }

        offset += chunk_len;
        remaining -= chunk_len;

        if (last_chunk) {
            break;
        }
    }

    return PyBytesWriter_Finish(writer);
}
8269
8270
8271
/* Encode `unicode` with the CP_ACP code page, passing no explicit error
 * handler name (errors == NULL) to PyUnicode_EncodeCodePage().
 */
PyObject *
PyUnicode_AsMBCSString(PyObject *unicode)
{
    PyObject *encoded = PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
    return encoded;
}
8276
8277
#undef NEED_RETRY
8278
8279
#endif /* MS_WINDOWS */
8280
8281
/* --- Character Mapping Codec -------------------------------------------- */
8282
8283
static int
8284
charmap_decode_string(const char *s,
8285
                      Py_ssize_t size,
8286
                      PyObject *mapping,
8287
                      const char *errors,
8288
                      _PyUnicodeWriter *writer)
8289
12.2k
{
8290
12.2k
    const char *starts = s;
8291
12.2k
    const char *e;
8292
12.2k
    Py_ssize_t startinpos, endinpos;
8293
12.2k
    PyObject *errorHandler = NULL, *exc = NULL;
8294
12.2k
    Py_ssize_t maplen;
8295
12.2k
    int mapkind;
8296
12.2k
    const void *mapdata;
8297
12.2k
    Py_UCS4 x;
8298
12.2k
    unsigned char ch;
8299
8300
12.2k
    maplen = PyUnicode_GET_LENGTH(mapping);
8301
12.2k
    mapdata = PyUnicode_DATA(mapping);
8302
12.2k
    mapkind = PyUnicode_KIND(mapping);
8303
8304
12.2k
    e = s + size;
8305
8306
12.2k
    if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
8307
        /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
8308
         * is disabled in encoding aliases, latin1 is preferred because
8309
         * its implementation is faster. */
8310
135
        const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
8311
135
        Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8312
135
        Py_UCS4 maxchar = writer->maxchar;
8313
8314
135
        assert (writer->kind == PyUnicode_1BYTE_KIND);
8315
2.52k
        while (s < e) {
8316
2.39k
            ch = *s;
8317
2.39k
            x = mapdata_ucs1[ch];
8318
2.39k
            if (x > maxchar) {
8319
124
                if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
8320
0
                    goto onError;
8321
124
                maxchar = writer->maxchar;
8322
124
                outdata = (Py_UCS1 *)writer->data;
8323
124
            }
8324
2.39k
            outdata[writer->pos] = x;
8325
2.39k
            writer->pos++;
8326
2.39k
            ++s;
8327
2.39k
        }
8328
135
        return 0;
8329
135
    }
8330
8331
49.0k
    while (s < e) {
8332
42.7k
        if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
8333
42.7k
            int outkind = writer->kind;
8334
42.7k
            const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
8335
42.7k
            if (outkind == PyUnicode_1BYTE_KIND) {
8336
22.9k
                Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8337
22.9k
                Py_UCS4 maxchar = writer->maxchar;
8338
295k
                while (s < e) {
8339
294k
                    ch = *s;
8340
294k
                    x = mapdata_ucs2[ch];
8341
294k
                    if (x > maxchar)
8342
21.5k
                        goto Error;
8343
272k
                    outdata[writer->pos] = x;
8344
272k
                    writer->pos++;
8345
272k
                    ++s;
8346
272k
                }
8347
1.44k
                break;
8348
22.9k
            }
8349
19.8k
            else if (outkind == PyUnicode_2BYTE_KIND) {
8350
19.8k
                Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
8351
587k
                while (s < e) {
8352
582k
                    ch = *s;
8353
582k
                    x = mapdata_ucs2[ch];
8354
582k
                    if (x == 0xFFFE)
8355
15.5k
                        goto Error;
8356
567k
                    outdata[writer->pos] = x;
8357
567k
                    writer->pos++;
8358
567k
                    ++s;
8359
567k
                }
8360
4.34k
                break;
8361
19.8k
            }
8362
42.7k
        }
8363
0
        ch = *s;
8364
8365
0
        if (ch < maplen)
8366
0
            x = PyUnicode_READ(mapkind, mapdata, ch);
8367
0
        else
8368
0
            x = 0xfffe; /* invalid value */
8369
37.0k
Error:
8370
37.0k
        if (x == 0xfffe)
8371
24.5k
        {
8372
            /* undefined mapping */
8373
24.5k
            startinpos = s-starts;
8374
24.5k
            endinpos = startinpos+1;
8375
24.5k
            if (unicode_decode_call_errorhandler_writer(
8376
24.5k
                    errors, &errorHandler,
8377
24.5k
                    "charmap", "character maps to <undefined>",
8378
24.5k
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
8379
24.5k
                    writer)) {
8380
17
                goto onError;
8381
17
            }
8382
24.5k
            continue;
8383
24.5k
        }
8384
8385
12.4k
        if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8386
0
            goto onError;
8387
12.4k
        ++s;
8388
12.4k
    }
8389
12.0k
    Py_XDECREF(errorHandler);
8390
12.0k
    Py_XDECREF(exc);
8391
12.0k
    return 0;
8392
8393
17
onError:
8394
17
    Py_XDECREF(errorHandler);
8395
17
    Py_XDECREF(exc);
8396
17
    return -1;
8397
12.0k
}
8398
8399
static int
8400
charmap_decode_mapping(const char *s,
8401
                       Py_ssize_t size,
8402
                       PyObject *mapping,
8403
                       const char *errors,
8404
                       _PyUnicodeWriter *writer)
8405
0
{
8406
0
    const char *starts = s;
8407
0
    const char *e;
8408
0
    Py_ssize_t startinpos, endinpos;
8409
0
    PyObject *errorHandler = NULL, *exc = NULL;
8410
0
    unsigned char ch;
8411
0
    PyObject *key, *item = NULL;
8412
8413
0
    e = s + size;
8414
8415
0
    while (s < e) {
8416
0
        ch = *s;
8417
8418
        /* Get mapping (char ordinal -> integer, Unicode char or None) */
8419
0
        key = PyLong_FromLong((long)ch);
8420
0
        if (key == NULL)
8421
0
            goto onError;
8422
8423
0
        int rc = PyMapping_GetOptionalItem(mapping, key, &item);
8424
0
        Py_DECREF(key);
8425
0
        if (rc == 0) {
8426
            /* No mapping found means: mapping is undefined. */
8427
0
            goto Undefined;
8428
0
        }
8429
0
        if (item == NULL) {
8430
0
            if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8431
                /* No mapping found means: mapping is undefined. */
8432
0
                PyErr_Clear();
8433
0
                goto Undefined;
8434
0
            } else
8435
0
                goto onError;
8436
0
        }
8437
8438
        /* Apply mapping */
8439
0
        if (item == Py_None)
8440
0
            goto Undefined;
8441
0
        if (PyLong_Check(item)) {
8442
0
            long value = PyLong_AsLong(item);
8443
0
            if (value == 0xFFFE)
8444
0
                goto Undefined;
8445
0
            if (value < 0 || value > MAX_UNICODE) {
8446
0
                PyErr_Format(PyExc_TypeError,
8447
0
                             "character mapping must be in range(0x%x)",
8448
0
                             (unsigned long)MAX_UNICODE + 1);
8449
0
                goto onError;
8450
0
            }
8451
8452
0
            if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8453
0
                goto onError;
8454
0
        }
8455
0
        else if (PyUnicode_Check(item)) {
8456
0
            if (PyUnicode_GET_LENGTH(item) == 1) {
8457
0
                Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8458
0
                if (value == 0xFFFE)
8459
0
                    goto Undefined;
8460
0
                if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8461
0
                    goto onError;
8462
0
            }
8463
0
            else {
8464
0
                writer->overallocate = 1;
8465
0
                if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8466
0
                    goto onError;
8467
0
            }
8468
0
        }
8469
0
        else {
8470
            /* wrong return value */
8471
0
            PyErr_SetString(PyExc_TypeError,
8472
0
                            "character mapping must return integer, None or str");
8473
0
            goto onError;
8474
0
        }
8475
0
        Py_CLEAR(item);
8476
0
        ++s;
8477
0
        continue;
8478
8479
0
Undefined:
8480
        /* undefined mapping */
8481
0
        Py_CLEAR(item);
8482
0
        startinpos = s-starts;
8483
0
        endinpos = startinpos+1;
8484
0
        if (unicode_decode_call_errorhandler_writer(
8485
0
                errors, &errorHandler,
8486
0
                "charmap", "character maps to <undefined>",
8487
0
                &starts, &e, &startinpos, &endinpos, &exc, &s,
8488
0
                writer)) {
8489
0
            goto onError;
8490
0
        }
8491
0
    }
8492
0
    Py_XDECREF(errorHandler);
8493
0
    Py_XDECREF(exc);
8494
0
    return 0;
8495
8496
0
onError:
8497
0
    Py_XDECREF(item);
8498
0
    Py_XDECREF(errorHandler);
8499
0
    Py_XDECREF(exc);
8500
0
    return -1;
8501
0
}
8502
8503
PyObject *
8504
PyUnicode_DecodeCharmap(const char *s,
8505
                        Py_ssize_t size,
8506
                        PyObject *mapping,
8507
                        const char *errors)
8508
12.2k
{
8509
12.2k
    _PyUnicodeWriter writer;
8510
8511
    /* Default to Latin-1 */
8512
12.2k
    if (mapping == NULL)
8513
0
        return PyUnicode_DecodeLatin1(s, size, errors);
8514
8515
12.2k
    if (size == 0)
8516
0
        _Py_RETURN_UNICODE_EMPTY();
8517
12.2k
    _PyUnicodeWriter_Init(&writer);
8518
12.2k
    writer.min_length = size;
8519
12.2k
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
8520
0
        goto onError;
8521
8522
12.2k
    if (PyUnicode_CheckExact(mapping)) {
8523
12.2k
        if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8524
17
            goto onError;
8525
12.2k
    }
8526
0
    else {
8527
0
        if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8528
0
            goto onError;
8529
0
    }
8530
12.1k
    return _PyUnicodeWriter_Finish(&writer);
8531
8532
17
  onError:
8533
17
    _PyUnicodeWriter_Dealloc(&writer);
8534
17
    return NULL;
8535
12.2k
}
8536
8537
/* Charmap encoding: the lookup table */
8538
8539
/*[clinic input]
8540
class EncodingMap "struct encoding_map *" "&EncodingMapType"
8541
[clinic start generated code]*/
8542
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=14e46bbb6c522d22]*/
8543
8544
struct encoding_map {
8545
    PyObject_HEAD
8546
    unsigned char level1[32];
8547
    int count2, count3;
8548
    unsigned char level23[1];
8549
};
8550
8551
/*[clinic input]
8552
EncodingMap.size
8553
8554
Return the size (in bytes) of this object.
8555
[clinic start generated code]*/
8556
8557
static PyObject *
8558
EncodingMap_size_impl(struct encoding_map *self)
8559
/*[clinic end generated code: output=c4c969e4c99342a4 input=004ff13f26bb5366]*/
8560
0
{
8561
0
    return PyLong_FromLong((sizeof(*self) - 1) + 16*self->count2 +
8562
0
                           128*self->count3);
8563
0
}
8564
8565
static PyMethodDef encoding_map_methods[] = {
8566
    ENCODINGMAP_SIZE_METHODDEF
8567
    {NULL, NULL}
8568
};
8569
8570
static PyTypeObject EncodingMapType = {
8571
    PyVarObject_HEAD_INIT(NULL, 0)
8572
    .tp_name = "EncodingMap",
8573
    .tp_basicsize = sizeof(struct encoding_map),
8574
    /* methods */
8575
    .tp_flags = Py_TPFLAGS_DEFAULT,
8576
    .tp_methods = encoding_map_methods,
8577
};
8578
8579
PyObject*
8580
PyUnicode_BuildEncodingMap(PyObject* string)
8581
115
{
8582
115
    PyObject *result;
8583
115
    struct encoding_map *mresult;
8584
115
    int i;
8585
115
    int need_dict = 0;
8586
115
    unsigned char level1[32];
8587
115
    unsigned char level2[512];
8588
115
    unsigned char *mlevel1, *mlevel2, *mlevel3;
8589
115
    int count2 = 0, count3 = 0;
8590
115
    int kind;
8591
115
    const void *data;
8592
115
    int length;
8593
115
    Py_UCS4 ch;
8594
8595
115
    if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
8596
0
        PyErr_BadArgument();
8597
0
        return NULL;
8598
0
    }
8599
115
    kind = PyUnicode_KIND(string);
8600
115
    data = PyUnicode_DATA(string);
8601
115
    length = (int)Py_MIN(PyUnicode_GET_LENGTH(string), 256);
8602
115
    memset(level1, 0xFF, sizeof level1);
8603
115
    memset(level2, 0xFF, sizeof level2);
8604
8605
    /* If there isn't a one-to-one mapping of NULL to \0,
8606
       or if there are non-BMP characters, we need to use
8607
       a mapping dictionary. */
8608
115
    if (PyUnicode_READ(kind, data, 0) != 0)
8609
0
        need_dict = 1;
8610
29.4k
    for (i = 1; i < length; i++) {
8611
29.3k
        int l1, l2;
8612
29.3k
        ch = PyUnicode_READ(kind, data, i);
8613
29.3k
        if (ch == 0 || ch > 0xFFFF) {
8614
0
            need_dict = 1;
8615
0
            break;
8616
0
        }
8617
29.3k
        if (ch == 0xFFFE)
8618
            /* unmapped character */
8619
714
            continue;
8620
28.6k
        l1 = ch >> 11;
8621
28.6k
        l2 = ch >> 7;
8622
28.6k
        if (level1[l1] == 0xFF)
8623
209
            level1[l1] = count2++;
8624
28.6k
        if (level2[l2] == 0xFF)
8625
629
            level2[l2] = count3++;
8626
28.6k
    }
8627
8628
115
    if (count2 >= 0xFF || count3 >= 0xFF)
8629
0
        need_dict = 1;
8630
8631
115
    if (need_dict) {
8632
0
        PyObject *result = PyDict_New();
8633
0
        if (!result)
8634
0
            return NULL;
8635
0
        for (i = 0; i < length; i++) {
8636
0
            Py_UCS4 c = PyUnicode_READ(kind, data, i);
8637
0
            PyObject *key = PyLong_FromLong(c);
8638
0
            if (key == NULL) {
8639
0
                Py_DECREF(result);
8640
0
                return NULL;
8641
0
            }
8642
0
            PyObject *value = PyLong_FromLong(i);
8643
0
            if (value == NULL) {
8644
0
                Py_DECREF(key);
8645
0
                Py_DECREF(result);
8646
0
                return NULL;
8647
0
            }
8648
0
            int rc = PyDict_SetItem(result, key, value);
8649
0
            Py_DECREF(key);
8650
0
            Py_DECREF(value);
8651
0
            if (rc < 0) {
8652
0
                Py_DECREF(result);
8653
0
                return NULL;
8654
0
            }
8655
0
        }
8656
0
        return result;
8657
0
    }
8658
8659
    /* Create a three-level trie */
8660
115
    result = PyObject_Malloc(sizeof(struct encoding_map) +
8661
115
                             16*count2 + 128*count3 - 1);
8662
115
    if (!result) {
8663
0
        return PyErr_NoMemory();
8664
0
    }
8665
8666
115
    _PyObject_Init(result, &EncodingMapType);
8667
115
    mresult = (struct encoding_map*)result;
8668
115
    mresult->count2 = count2;
8669
115
    mresult->count3 = count3;
8670
115
    mlevel1 = mresult->level1;
8671
115
    mlevel2 = mresult->level23;
8672
115
    mlevel3 = mresult->level23 + 16*count2;
8673
115
    memcpy(mlevel1, level1, 32);
8674
115
    memset(mlevel2, 0xFF, 16*count2);
8675
115
    memset(mlevel3, 0, 128*count3);
8676
115
    count3 = 0;
8677
29.4k
    for (i = 1; i < length; i++) {
8678
29.3k
        int o1, o2, o3, i2, i3;
8679
29.3k
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8680
29.3k
        if (ch == 0xFFFE)
8681
            /* unmapped character */
8682
714
            continue;
8683
28.6k
        o1 = ch>>11;
8684
28.6k
        o2 = (ch>>7) & 0xF;
8685
28.6k
        i2 = 16*mlevel1[o1] + o2;
8686
28.6k
        if (mlevel2[i2] == 0xFF)
8687
629
            mlevel2[i2] = count3++;
8688
28.6k
        o3 = ch & 0x7F;
8689
28.6k
        i3 = 128*mlevel2[i2] + o3;
8690
28.6k
        mlevel3[i3] = i;
8691
28.6k
    }
8692
115
    return result;
8693
115
}
8694
8695
static int
8696
encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
8697
0
{
8698
0
    struct encoding_map *map = (struct encoding_map*)mapping;
8699
0
    int l1 = c>>11;
8700
0
    int l2 = (c>>7) & 0xF;
8701
0
    int l3 = c & 0x7F;
8702
0
    int i;
8703
8704
0
    if (c > 0xFFFF)
8705
0
        return -1;
8706
0
    if (c == 0)
8707
0
        return 0;
8708
    /* level 1*/
8709
0
    i = map->level1[l1];
8710
0
    if (i == 0xFF) {
8711
0
        return -1;
8712
0
    }
8713
    /* level 2*/
8714
0
    i = map->level23[16*i+l2];
8715
0
    if (i == 0xFF) {
8716
0
        return -1;
8717
0
    }
8718
    /* level 3 */
8719
0
    i = map->level23[16*map->count2 + 128*i + l3];
8720
0
    if (i == 0) {
8721
0
        return -1;
8722
0
    }
8723
0
    return i;
8724
0
}
8725
8726
/* Lookup the character in the mapping.
8727
   On success, return PyLong, PyBytes or None (if the character can't be found).
8728
   If the result is PyLong, put its value in replace.
8729
   On error, return NULL.
8730
   */
8731
static PyObject *
8732
charmapencode_lookup(Py_UCS4 c, PyObject *mapping, unsigned char *replace)
8733
0
{
8734
0
    PyObject *w = PyLong_FromLong((long)c);
8735
0
    PyObject *x;
8736
8737
0
    if (w == NULL)
8738
0
        return NULL;
8739
0
    int rc = PyMapping_GetOptionalItem(mapping, w, &x);
8740
0
    Py_DECREF(w);
8741
0
    if (rc == 0) {
8742
        /* No mapping found means: mapping is undefined. */
8743
0
        Py_RETURN_NONE;
8744
0
    }
8745
0
    if (x == NULL) {
8746
0
        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8747
            /* No mapping found means: mapping is undefined. */
8748
0
            PyErr_Clear();
8749
0
            Py_RETURN_NONE;
8750
0
        } else
8751
0
            return NULL;
8752
0
    }
8753
0
    else if (x == Py_None)
8754
0
        return x;
8755
0
    else if (PyLong_Check(x)) {
8756
0
        long value = PyLong_AsLong(x);
8757
0
        if (value < 0 || value > 255) {
8758
0
            PyErr_SetString(PyExc_TypeError,
8759
0
                            "character mapping must be in range(256)");
8760
0
            Py_DECREF(x);
8761
0
            return NULL;
8762
0
        }
8763
0
        *replace = (unsigned char)value;
8764
0
        return x;
8765
0
    }
8766
0
    else if (PyBytes_Check(x))
8767
0
        return x;
8768
0
    else {
8769
        /* wrong return value */
8770
0
        PyErr_Format(PyExc_TypeError,
8771
0
                     "character mapping must return integer, bytes or None, not %.400s",
8772
0
                     Py_TYPE(x)->tp_name);
8773
0
        Py_DECREF(x);
8774
0
        return NULL;
8775
0
    }
8776
0
}
8777
8778
static int
8779
charmapencode_resize(PyBytesWriter *writer, Py_ssize_t *outpos, Py_ssize_t requiredsize)
8780
0
{
8781
0
    Py_ssize_t outsize = PyBytesWriter_GetSize(writer);
8782
    /* exponentially overallocate to minimize reallocations */
8783
0
    if (requiredsize < 2 * outsize)
8784
0
        requiredsize = 2 * outsize;
8785
0
    return PyBytesWriter_Resize(writer, requiredsize);
8786
0
}
8787
8788
/* Outcome of trying to encode one character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was written to the output */
    enc_FAILED,         /* the mapping is undefined for the character */
    enc_EXCEPTION       /* an error occurred */
} charmapencode_result;
8791
/* lookup the character, put the result in the output string and adjust
8792
   various state variables. Resize the output bytes object if not enough
8793
   space is available. Return a new reference to the object that
8794
   was put in the output buffer, or Py_None, if the mapping was undefined
8795
   (in which case no character was written) or NULL, if a
8796
   reallocation error occurred. The caller must decref the result */
8797
static charmapencode_result
8798
charmapencode_output(Py_UCS4 c, PyObject *mapping,
8799
                     PyBytesWriter *writer, Py_ssize_t *outpos)
8800
0
{
8801
0
    PyObject *rep;
8802
0
    unsigned char replace;
8803
0
    char *outstart;
8804
0
    Py_ssize_t outsize = _PyBytesWriter_GetSize(writer);
8805
8806
0
    if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8807
0
        int res = encoding_map_lookup(c, mapping);
8808
0
        Py_ssize_t requiredsize = *outpos+1;
8809
0
        if (res == -1) {
8810
0
            return enc_FAILED;
8811
0
        }
8812
8813
0
        if (outsize<requiredsize) {
8814
0
            if (charmapencode_resize(writer, outpos, requiredsize)) {
8815
0
                return enc_EXCEPTION;
8816
0
            }
8817
0
        }
8818
0
        outstart = _PyBytesWriter_GetData(writer);
8819
0
        outstart[(*outpos)++] = (char)res;
8820
0
        return enc_SUCCESS;
8821
0
    }
8822
8823
0
    rep = charmapencode_lookup(c, mapping, &replace);
8824
0
    if (rep==NULL)
8825
0
        return enc_EXCEPTION;
8826
0
    else if (rep==Py_None) {
8827
0
        Py_DECREF(rep);
8828
0
        return enc_FAILED;
8829
0
    } else {
8830
0
        if (PyLong_Check(rep)) {
8831
0
            Py_ssize_t requiredsize = *outpos+1;
8832
0
            if (outsize<requiredsize)
8833
0
                if (charmapencode_resize(writer, outpos, requiredsize)) {
8834
0
                    Py_DECREF(rep);
8835
0
                    return enc_EXCEPTION;
8836
0
                }
8837
0
            outstart = _PyBytesWriter_GetData(writer);
8838
0
            outstart[(*outpos)++] = (char)replace;
8839
0
        }
8840
0
        else {
8841
0
            const char *repchars = PyBytes_AS_STRING(rep);
8842
0
            Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8843
0
            Py_ssize_t requiredsize = *outpos+repsize;
8844
0
            if (outsize<requiredsize)
8845
0
                if (charmapencode_resize(writer, outpos, requiredsize)) {
8846
0
                    Py_DECREF(rep);
8847
0
                    return enc_EXCEPTION;
8848
0
                }
8849
0
            outstart = _PyBytesWriter_GetData(writer);
8850
0
            memcpy(outstart + *outpos, repchars, repsize);
8851
0
            *outpos += repsize;
8852
0
        }
8853
0
    }
8854
0
    Py_DECREF(rep);
8855
0
    return enc_SUCCESS;
8856
0
}
8857
8858
/* handle an error in _PyUnicode_EncodeCharmap()
8859
   Return 0 on success, -1 on error */
8860
static int
8861
charmap_encoding_error(
8862
    PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
8863
    PyObject **exceptionObject,
8864
    _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
8865
    PyBytesWriter *writer, Py_ssize_t *respos)
8866
0
{
8867
0
    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8868
0
    Py_ssize_t size, repsize;
8869
0
    Py_ssize_t newpos;
8870
0
    int kind;
8871
0
    const void *data;
8872
0
    Py_ssize_t index;
8873
    /* startpos for collecting unencodable chars */
8874
0
    Py_ssize_t collstartpos = *inpos;
8875
0
    Py_ssize_t collendpos = *inpos+1;
8876
0
    Py_ssize_t collpos;
8877
0
    const char *encoding = "charmap";
8878
0
    const char *reason = "character maps to <undefined>";
8879
0
    charmapencode_result x;
8880
0
    Py_UCS4 ch;
8881
0
    int val;
8882
8883
0
    size = PyUnicode_GET_LENGTH(unicode);
8884
    /* find all unencodable characters */
8885
0
    while (collendpos < size) {
8886
0
        PyObject *rep;
8887
0
        unsigned char replace;
8888
0
        if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8889
0
            ch = PyUnicode_READ_CHAR(unicode, collendpos);
8890
0
            val = encoding_map_lookup(ch, mapping);
8891
0
            if (val != -1)
8892
0
                break;
8893
0
            ++collendpos;
8894
0
            continue;
8895
0
        }
8896
8897
0
        ch = PyUnicode_READ_CHAR(unicode, collendpos);
8898
0
        rep = charmapencode_lookup(ch, mapping, &replace);
8899
0
        if (rep==NULL)
8900
0
            return -1;
8901
0
        else if (rep!=Py_None) {
8902
0
            Py_DECREF(rep);
8903
0
            break;
8904
0
        }
8905
0
        Py_DECREF(rep);
8906
0
        ++collendpos;
8907
0
    }
8908
    /* cache callback name lookup
8909
     * (if not done yet, i.e. it's the first error) */
8910
0
    if (*error_handler == _Py_ERROR_UNKNOWN)
8911
0
        *error_handler = _Py_GetErrorHandler(errors);
8912
8913
0
    switch (*error_handler) {
8914
0
    case _Py_ERROR_STRICT:
8915
0
        raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8916
0
        return -1;
8917
8918
0
    case _Py_ERROR_REPLACE:
8919
0
        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
8920
0
            x = charmapencode_output('?', mapping, writer, respos);
8921
0
            if (x==enc_EXCEPTION) {
8922
0
                return -1;
8923
0
            }
8924
0
            else if (x==enc_FAILED) {
8925
0
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8926
0
                return -1;
8927
0
            }
8928
0
        }
8929
0
        _Py_FALLTHROUGH;
8930
0
    case _Py_ERROR_IGNORE:
8931
0
        *inpos = collendpos;
8932
0
        break;
8933
8934
0
    case _Py_ERROR_XMLCHARREFREPLACE:
8935
        /* generate replacement (temporarily (mis)uses p) */
8936
0
        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
8937
0
            char buffer[2+29+1+1];
8938
0
            char *cp;
8939
0
            sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
8940
0
            for (cp = buffer; *cp; ++cp) {
8941
0
                x = charmapencode_output(*cp, mapping, writer, respos);
8942
0
                if (x==enc_EXCEPTION)
8943
0
                    return -1;
8944
0
                else if (x==enc_FAILED) {
8945
0
                    raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8946
0
                    return -1;
8947
0
                }
8948
0
            }
8949
0
        }
8950
0
        *inpos = collendpos;
8951
0
        break;
8952
8953
0
    default:
8954
0
        repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
8955
0
                                                      encoding, reason, unicode, exceptionObject,
8956
0
                                                      collstartpos, collendpos, &newpos);
8957
0
        if (repunicode == NULL)
8958
0
            return -1;
8959
0
        if (PyBytes_Check(repunicode)) {
8960
            /* Directly copy bytes result to output. */
8961
0
            Py_ssize_t outsize = PyBytesWriter_GetSize(writer);
8962
0
            Py_ssize_t requiredsize;
8963
0
            repsize = PyBytes_Size(repunicode);
8964
0
            requiredsize = *respos + repsize;
8965
0
            if (requiredsize > outsize)
8966
                /* Make room for all additional bytes. */
8967
0
                if (charmapencode_resize(writer, respos, requiredsize)) {
8968
0
                    Py_DECREF(repunicode);
8969
0
                    return -1;
8970
0
                }
8971
0
            memcpy((char*)PyBytesWriter_GetData(writer) + *respos,
8972
0
                   PyBytes_AsString(repunicode),  repsize);
8973
0
            *respos += repsize;
8974
0
            *inpos = newpos;
8975
0
            Py_DECREF(repunicode);
8976
0
            break;
8977
0
        }
8978
        /* generate replacement  */
8979
0
        repsize = PyUnicode_GET_LENGTH(repunicode);
8980
0
        data = PyUnicode_DATA(repunicode);
8981
0
        kind = PyUnicode_KIND(repunicode);
8982
0
        for (index = 0; index < repsize; index++) {
8983
0
            Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8984
0
            x = charmapencode_output(repch, mapping, writer, respos);
8985
0
            if (x==enc_EXCEPTION) {
8986
0
                Py_DECREF(repunicode);
8987
0
                return -1;
8988
0
            }
8989
0
            else if (x==enc_FAILED) {
8990
0
                Py_DECREF(repunicode);
8991
0
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8992
0
                return -1;
8993
0
            }
8994
0
        }
8995
0
        *inpos = newpos;
8996
0
        Py_DECREF(repunicode);
8997
0
    }
8998
0
    return 0;
8999
0
}
9000
9001
/* Encode `unicode` through `mapping` and return the resulting bytes.

   A NULL mapping delegates to unicode_encode_ucs1() with a limit of 256.
   An EncodingMap is consulted directly through encoding_map_lookup(); any
   other mapping goes through charmapencode_output().  Characters without
   a mapping are passed to charmap_encoding_error(), which applies the
   `errors` policy.

   Return a new reference, or NULL with an exception set. */
PyObject *
_PyUnicode_EncodeCharmap(PyObject *unicode,
                         PyObject *mapping,
                         const char *errors)
{
    /* Default to Latin-1 */
    if (mapping == NULL) {
        return unicode_encode_ucs1(unicode, errors, 256);
    }

    Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
    if (size == 0) {
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
    }
    const void *data = PyUnicode_DATA(unicode);
    int kind = PyUnicode_KIND(unicode);

    PyObject *error_handler_obj = NULL;
    PyObject *exc = NULL;
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
    Py_ssize_t inpos = 0;       /* current input position */
    Py_ssize_t respos = 0;      /* current output position */

    /* allocate enough for a simple encoding without
       replacements, if we need more, we'll resize */
    PyBytesWriter *writer = PyBytesWriter_Create(size);
    if (writer == NULL) {
        goto onError;
    }

    if (Py_IS_TYPE(mapping, &EncodingMapType)) {
        /* Cached view of the writer; refreshed whenever it may have been
           resized. */
        char *outstart = _PyBytesWriter_GetData(writer);
        Py_ssize_t outsize = _PyBytesWriter_GetSize(writer);

        while (inpos<size) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
            int res = encoding_map_lookup(ch, mapping);

            if (res == -1) {
                /* unencodable character: the handler advances inpos */
                if (charmap_encoding_error(unicode, &inpos, mapping,
                                           &exc,
                                           &error_handler, &error_handler_obj, errors,
                                           writer, &respos)) {
                    goto onError;
                }
                outstart = _PyBytesWriter_GetData(writer);
                outsize = _PyBytesWriter_GetSize(writer);
                continue;
            }

            Py_ssize_t requiredsize = respos+1;
            if (outsize<requiredsize) {
                if (charmapencode_resize(writer, &respos, requiredsize)) {
                    goto onError;
                }
                outstart = _PyBytesWriter_GetData(writer);
                outsize = _PyBytesWriter_GetSize(writer);
            }
            outstart[respos++] = (char)res;

            /* done with this character => adjust input position */
            ++inpos;
        }
    }
    else {
        while (inpos<size) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
            /* try to encode it */
            charmapencode_result x = charmapencode_output(ch, mapping, writer, &respos);

            if (x==enc_EXCEPTION) { /* error */
                goto onError;
            }
            if (x!=enc_FAILED) {
                /* done with this character => adjust input position */
                ++inpos;
                continue;
            }
            /* unencodable character: the handler advances inpos */
            if (charmap_encoding_error(unicode, &inpos, mapping,
                                       &exc,
                                       &error_handler, &error_handler_obj, errors,
                                       writer, &respos)) {
                goto onError;
            }
        }
    }

    Py_XDECREF(exc);
    Py_XDECREF(error_handler_obj);

    /* Resize if we allocated too much */
    return PyBytesWriter_FinishWithSize(writer, respos);

  onError:
    PyBytesWriter_Discard(writer);
    Py_XDECREF(exc);
    Py_XDECREF(error_handler_obj);
    return NULL;
}
9109
9110
/* Encode `unicode` through `mapping` with the default (NULL) error policy.
   Both arguments are required: a non-str `unicode` or a NULL `mapping`
   is reported through PyErr_BadArgument() and NULL is returned. */
PyObject *
PyUnicode_AsCharmapString(PyObject *unicode,
                          PyObject *mapping)
{
    int args_ok = PyUnicode_Check(unicode) && mapping != NULL;

    if (!args_ok) {
        PyErr_BadArgument();
        return NULL;
    }
    return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
}
9120
9121
/* create or adjust a UnicodeTranslateError */
9122
static void
9123
make_translate_exception(PyObject **exceptionObject,
9124
                         PyObject *unicode,
9125
                         Py_ssize_t startpos, Py_ssize_t endpos,
9126
                         const char *reason)
9127
0
{
9128
0
    if (*exceptionObject == NULL) {
9129
0
        *exceptionObject = _PyUnicodeTranslateError_Create(
9130
0
            unicode, startpos, endpos, reason);
9131
0
    }
9132
0
    else {
9133
0
        if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
9134
0
            goto onError;
9135
0
        if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
9136
0
            goto onError;
9137
0
        if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
9138
0
            goto onError;
9139
0
        return;
9140
0
      onError:
9141
0
        Py_CLEAR(*exceptionObject);
9142
0
    }
9143
0
}
9144
9145
/* error handling callback helper:
9146
   build arguments, call the callback and check the arguments,
9147
   put the result into newpos and return the replacement string, which
9148
   has to be freed by the caller */
9149
static PyObject *
9150
unicode_translate_call_errorhandler(const char *errors,
9151
                                    PyObject **errorHandler,
9152
                                    const char *reason,
9153
                                    PyObject *unicode, PyObject **exceptionObject,
9154
                                    Py_ssize_t startpos, Py_ssize_t endpos,
9155
                                    Py_ssize_t *newpos)
9156
0
{
9157
0
    static const char *argparse = "Un;translating error handler must return (str, int) tuple";
9158
9159
0
    Py_ssize_t i_newpos;
9160
0
    PyObject *restuple;
9161
0
    PyObject *resunicode;
9162
9163
0
    if (*errorHandler == NULL) {
9164
0
        *errorHandler = PyCodec_LookupError(errors);
9165
0
        if (*errorHandler == NULL)
9166
0
            return NULL;
9167
0
    }
9168
9169
0
    make_translate_exception(exceptionObject,
9170
0
                             unicode, startpos, endpos, reason);
9171
0
    if (*exceptionObject == NULL)
9172
0
        return NULL;
9173
9174
0
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
9175
0
    if (restuple == NULL)
9176
0
        return NULL;
9177
0
    if (!PyTuple_Check(restuple)) {
9178
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
9179
0
        Py_DECREF(restuple);
9180
0
        return NULL;
9181
0
    }
9182
0
    if (!PyArg_ParseTuple(restuple, argparse,
9183
0
                          &resunicode, &i_newpos)) {
9184
0
        Py_DECREF(restuple);
9185
0
        return NULL;
9186
0
    }
9187
0
    if (i_newpos<0)
9188
0
        *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
9189
0
    else
9190
0
        *newpos = i_newpos;
9191
0
    if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
9192
0
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
9193
0
        Py_DECREF(restuple);
9194
0
        return NULL;
9195
0
    }
9196
0
    Py_INCREF(resunicode);
9197
0
    Py_DECREF(restuple);
9198
0
    return resunicode;
9199
0
}
9200
9201
/* Lookup the character ch in the mapping and put the result in result,
9202
   which must be decrefed by the caller.
9203
   The result can be PyLong, PyUnicode, None or NULL.
9204
   If the result is PyLong, put its value in replace.
9205
   Return 0 on success, -1 on error */
9206
static int
9207
charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result, Py_UCS4 *replace)
9208
338
{
9209
338
    PyObject *w = PyLong_FromLong((long)c);
9210
338
    PyObject *x;
9211
9212
338
    if (w == NULL)
9213
0
        return -1;
9214
338
    int rc = PyMapping_GetOptionalItem(mapping, w, &x);
9215
338
    Py_DECREF(w);
9216
338
    if (rc == 0) {
9217
        /* No mapping found means: use 1:1 mapping. */
9218
158
        *result = NULL;
9219
158
        return 0;
9220
158
    }
9221
180
    if (x == NULL) {
9222
0
        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
9223
            /* No mapping found means: use 1:1 mapping. */
9224
0
            PyErr_Clear();
9225
0
            *result = NULL;
9226
0
            return 0;
9227
0
        } else
9228
0
            return -1;
9229
0
    }
9230
180
    else if (x == Py_None) {
9231
0
        *result = x;
9232
0
        return 0;
9233
0
    }
9234
180
    else if (PyLong_Check(x)) {
9235
0
        long value = PyLong_AsLong(x);
9236
0
        if (value < 0 || value > MAX_UNICODE) {
9237
0
            PyErr_Format(PyExc_ValueError,
9238
0
                         "character mapping must be in range(0x%x)",
9239
0
                         MAX_UNICODE+1);
9240
0
            Py_DECREF(x);
9241
0
            return -1;
9242
0
        }
9243
0
        *result = x;
9244
0
        *replace = (Py_UCS4)value;
9245
0
        return 0;
9246
0
    }
9247
180
    else if (PyUnicode_Check(x)) {
9248
180
        *result = x;
9249
180
        return 0;
9250
180
    }
9251
0
    else {
9252
        /* wrong return value */
9253
0
        PyErr_SetString(PyExc_TypeError,
9254
0
                        "character mapping must return integer, None or str");
9255
0
        Py_DECREF(x);
9256
0
        return -1;
9257
0
    }
9258
180
}
9259
9260
/* lookup the character, write the result into the writer.
9261
   Return 1 if the result was written into the writer, return 0 if the mapping
9262
   was undefined, raise an exception return -1 on error. */
9263
static int
9264
charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
9265
                        _PyUnicodeWriter *writer)
9266
208
{
9267
208
    PyObject *item;
9268
208
    Py_UCS4 replace;
9269
9270
208
    if (charmaptranslate_lookup(ch, mapping, &item, &replace))
9271
0
        return -1;
9272
9273
208
    if (item == NULL) {
9274
        /* not found => default to 1:1 mapping */
9275
84
        if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
9276
0
            return -1;
9277
0
        }
9278
84
        return 1;
9279
84
    }
9280
9281
124
    if (item == Py_None) {
9282
0
        Py_DECREF(item);
9283
0
        return 0;
9284
0
    }
9285
9286
124
    if (PyLong_Check(item)) {
9287
0
        if (_PyUnicodeWriter_WriteCharInline(writer, replace) < 0) {
9288
0
            Py_DECREF(item);
9289
0
            return -1;
9290
0
        }
9291
0
        Py_DECREF(item);
9292
0
        return 1;
9293
0
    }
9294
9295
124
    if (!PyUnicode_Check(item)) {
9296
0
        Py_DECREF(item);
9297
0
        return -1;
9298
0
    }
9299
9300
124
    if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
9301
0
        Py_DECREF(item);
9302
0
        return -1;
9303
0
    }
9304
9305
124
    Py_DECREF(item);
9306
124
    return 1;
9307
124
}
9308
9309
static int
9310
unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
9311
                              Py_UCS1 *translate)
9312
130
{
9313
130
    PyObject *item = NULL;
9314
130
    Py_UCS4 replace;
9315
130
    int ret = 0;
9316
9317
130
    if (charmaptranslate_lookup(ch, mapping, &item, &replace)) {
9318
0
        return -1;
9319
0
    }
9320
9321
130
    if (item == Py_None) {
9322
        /* deletion */
9323
0
        translate[ch] = 0xfe;
9324
0
    }
9325
130
    else if (item == NULL) {
9326
        /* not found => default to 1:1 mapping */
9327
74
        translate[ch] = ch;
9328
74
        return 1;
9329
74
    }
9330
56
    else if (PyLong_Check(item)) {
9331
0
        if (replace > 127) {
9332
            /* invalid character or character outside ASCII:
9333
               skip the fast translate */
9334
0
            goto exit;
9335
0
        }
9336
0
        translate[ch] = (Py_UCS1)replace;
9337
0
    }
9338
56
    else if (PyUnicode_Check(item)) {
9339
56
        if (PyUnicode_GET_LENGTH(item) != 1)
9340
56
            goto exit;
9341
9342
0
        replace = PyUnicode_READ_CHAR(item, 0);
9343
0
        if (replace > 127)
9344
0
            goto exit;
9345
0
        translate[ch] = (Py_UCS1)replace;
9346
0
    }
9347
0
    else {
9348
        /* not None, NULL, long or unicode */
9349
0
        goto exit;
9350
0
    }
9351
0
    ret = 1;
9352
9353
56
  exit:
9354
56
    Py_DECREF(item);
9355
56
    return ret;
9356
0
}
9357
9358
/* Fast path for ascii => ascii translation. Return 1 if the whole string
9359
   was translated into writer, return 0 if the input string was partially
9360
   translated into writer, raise an exception and return -1 on error. */
9361
static int
9362
unicode_fast_translate(PyObject *input, PyObject *mapping,
9363
                       _PyUnicodeWriter *writer, int ignore,
9364
                       Py_ssize_t *input_pos)
9365
104
{
9366
104
    Py_UCS1 ascii_table[128], ch, ch2;
9367
104
    Py_ssize_t len;
9368
104
    const Py_UCS1 *in, *end;
9369
104
    Py_UCS1 *out;
9370
104
    int res = 0;
9371
9372
104
    len = PyUnicode_GET_LENGTH(input);
9373
9374
104
    memset(ascii_table, 0xff, 128);
9375
9376
104
    in = PyUnicode_1BYTE_DATA(input);
9377
104
    end = in + len;
9378
9379
104
    assert(PyUnicode_IS_ASCII(writer->buffer));
9380
104
    assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9381
104
    out = PyUnicode_1BYTE_DATA(writer->buffer);
9382
9383
192
    for (; in < end; in++) {
9384
144
        ch = *in;
9385
144
        ch2 = ascii_table[ch];
9386
144
        if (ch2 == 0xff) {
9387
130
            int translate = unicode_fast_translate_lookup(mapping, ch,
9388
130
                                                          ascii_table);
9389
130
            if (translate < 0)
9390
0
                return -1;
9391
130
            if (translate == 0)
9392
56
                goto exit;
9393
74
            ch2 = ascii_table[ch];
9394
74
        }
9395
88
        if (ch2 == 0xfe) {
9396
0
            if (ignore)
9397
0
                continue;
9398
0
            goto exit;
9399
0
        }
9400
88
        assert(ch2 < 128);
9401
88
        *out = ch2;
9402
88
        out++;
9403
88
    }
9404
48
    res = 1;
9405
9406
104
exit:
9407
104
    writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
9408
104
    *input_pos = in - PyUnicode_1BYTE_DATA(input);
9409
104
    return res;
9410
48
}
9411
9412
/* Translate `input` through `mapping` and return a new string, or NULL with
   an exception set.

   ASCII inputs first go through unicode_fast_translate(); whatever it could
   not handle is processed character by character with
   charmaptranslate_output().  A run of consecutive characters that the
   mapping sends to None is either skipped (errors == "ignore") or handed to
   the error handler selected by `errors`, whose replacement string is
   appended to the output. */
static PyObject *
_PyUnicode_TranslateCharmap(PyObject *input,
                            PyObject *mapping,
                            const char *errors)
{
    const char *reason = "character maps to <undefined>";
    PyObject *error_handler = NULL;
    PyObject *exc = NULL;
    _PyUnicodeWriter writer;
    Py_ssize_t pos = 0;
    int ignore;

    if (mapping == NULL) {
        PyErr_BadArgument();
        return NULL;
    }

    const void *data = PyUnicode_DATA(input);
    const int kind = PyUnicode_KIND(input);
    const Py_ssize_t size = PyUnicode_GET_LENGTH(input);

    if (size == 0) {
        return PyUnicode_FromObject(input);
    }

    /* Reserve room for a plain 1:1 translation; the writer grows on demand
       if replacements turn out to be longer. */
    _PyUnicodeWriter_Init(&writer);
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1) {
        goto onError;
    }

    ignore = (errors != NULL && strcmp(errors, "ignore") == 0);

    if (PyUnicode_IS_ASCII(input)) {
        int fast = unicode_fast_translate(input, mapping, &writer, ignore,
                                          &pos);
        if (fast < 0) {
            _PyUnicodeWriter_Dealloc(&writer);
            return NULL;
        }
        if (fast == 1) {
            return _PyUnicodeWriter_Finish(&writer);
        }
        /* fast == 0: continue below from the position the fast path
           stored in pos. */
    }

    while (pos < size) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        int translated = charmaptranslate_output(ch, mapping, &writer);

        if (translated < 0) {
            goto onError;
        }
        if (translated != 0) {
            /* Output was produced; move on to the next character. */
            pos++;
            continue;
        }

        /* Untranslatable character: extend the run over every following
           character whose lookup also yields None. */
        Py_ssize_t run_start = pos;
        Py_ssize_t run_end = pos + 1;
        for (; run_end < size; run_end++) {
            PyObject *item;
            Py_UCS4 replacement;

            ch = PyUnicode_READ(kind, data, run_end);
            if (charmaptranslate_lookup(ch, mapping, &item, &replacement)) {
                goto onError;
            }
            Py_XDECREF(item);
            /* Only the pointer value is compared after the decref. */
            if (item != Py_None) {
                break;
            }
        }

        if (ignore) {
            pos = run_end;
            continue;
        }

        Py_ssize_t resume_at;
        PyObject *replacement_str = unicode_translate_call_errorhandler(
            errors, &error_handler, reason, input, &exc,
            run_start, run_end, &resume_at);
        if (replacement_str == NULL) {
            goto onError;
        }
        int write_status = _PyUnicodeWriter_WriteStr(&writer,
                                                     replacement_str);
        Py_DECREF(replacement_str);
        if (write_status < 0) {
            goto onError;
        }
        pos = resume_at;
    }

    Py_XDECREF(exc);
    Py_XDECREF(error_handler);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(exc);
    Py_XDECREF(error_handler);
    return NULL;
}
9528
9529
/* Public entry point: check that `str` passes ensure_unicode(), then
   delegate to _PyUnicode_TranslateCharmap().  Returns NULL if either step
   fails. */
PyObject *
PyUnicode_Translate(PyObject *str,
                    PyObject *mapping,
                    const char *errors)
{
    return (ensure_unicode(str) < 0)
        ? NULL
        : _PyUnicode_TranslateCharmap(str, mapping, errors);
}
9538
9539
/* Build an ASCII rendition of `unicode`.

   An ASCII input is returned unchanged (new reference).  Otherwise each
   code point is handled as follows:
     - below 127: copied verbatim;
     - Py_UNICODE_ISSPACE(): replaced by ' ';
     - Py_UNICODE_TODECIMAL() >= 0: replaced by the ASCII digit;
     - anything else: replaced by '?', and the result is cut off right
       after that '?'.

   Returns NULL with an exception set when `unicode` is not a str or the
   result cannot be allocated. */
PyObject *
_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadInternalCall();
        return NULL;
    }
    if (PyUnicode_IS_ASCII(unicode)) {
        /* Nothing to transform. */
        return Py_NewRef(unicode);
    }

    const Py_ssize_t length = PyUnicode_GET_LENGTH(unicode);
    PyObject *ascii = PyUnicode_New(length, 127);
    if (ascii == NULL) {
        return NULL;
    }

    Py_UCS1 *dst = PyUnicode_1BYTE_DATA(ascii);
    const int kind = PyUnicode_KIND(unicode);
    const void *src = PyUnicode_DATA(unicode);

    for (Py_ssize_t pos = 0; pos < length; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, src, pos);

        if (ch < 127) {
            dst[pos] = ch;
            continue;
        }
        if (Py_UNICODE_ISSPACE(ch)) {
            dst[pos] = ' ';
            continue;
        }

        const int digit = Py_UNICODE_TODECIMAL(ch);
        if (digit >= 0) {
            dst[pos] = '0' + digit;
            continue;
        }

        /* Unconvertible character: mark it, terminate and shrink the
           result so that it ends right here. */
        dst[pos] = '?';
        dst[pos + 1] = '\0';
        _PyUnicode_LENGTH(ascii) = pos + 1;
        break;
    }

    assert(_PyUnicode_CheckConsistency(ascii, 1));
    return ascii;
}
9584
9585
/* --- Helpers ------------------------------------------------------------ */
9586
9587
/* Helper macro to fix up start/end slice values against a length.

   `end` is clamped to `len`; a negative `end` or `start` is interpreted as
   an offset from the end (len is added) and then clamped to 0.

   `start` and `end` must be modifiable lvalues: they are updated in place.
   All three arguments may be evaluated more than once, so they must be
   free of side effects.  Every use is parenthesized so that an argument
   containing a low-precedence operator (for example a conditional
   expression passed as `len`) is compared and assigned as a whole. */
#define ADJUST_INDICES(start, end, len) \
    do {                                \
        if ((end) > (len)) {            \
            (end) = (len);              \
        }                               \
        else if ((end) < 0) {           \
            (end) += (len);             \
            if ((end) < 0) {            \
                (end) = 0;              \
            }                           \
        }                               \
        if ((start) < 0) {              \
            (start) += (len);           \
            if ((start) < 0) {          \
                (start) = 0;            \
            }                           \
        }                               \
    } while (0)
9606
9607
static Py_ssize_t
9608
any_find_slice(PyObject* s1, PyObject* s2,
9609
               Py_ssize_t start,
9610
               Py_ssize_t end,
9611
               int direction)
9612
17.1M
{
9613
17.1M
    int kind1, kind2;
9614
17.1M
    const void *buf1, *buf2;
9615
17.1M
    Py_ssize_t len1, len2, result;
9616
9617
17.1M
    kind1 = PyUnicode_KIND(s1);
9618
17.1M
    kind2 = PyUnicode_KIND(s2);
9619
17.1M
    if (kind1 < kind2)
9620
0
        return -1;
9621
9622
17.1M
    len1 = PyUnicode_GET_LENGTH(s1);
9623
17.1M
    len2 = PyUnicode_GET_LENGTH(s2);
9624
17.1M
    ADJUST_INDICES(start, end, len1);
9625
17.1M
    if (end - start < len2)
9626
26.4k
        return -1;
9627
9628
17.1M
    buf1 = PyUnicode_DATA(s1);
9629
17.1M
    buf2 = PyUnicode_DATA(s2);
9630
17.1M
    if (len2 == 1) {
9631
17.1M
        Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9632
17.1M
        result = findchar((const char *)buf1 + kind1*start,
9633
17.1M
                          kind1, end - start, ch, direction);
9634
17.1M
        if (result == -1)
9635
207k
            return -1;
9636
16.9M
        else
9637
16.9M
            return start + result;
9638
17.1M
    }
9639
9640
39.8k
    if (kind2 != kind1) {
9641
29.0k
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
9642
29.0k
        if (!buf2)
9643
0
            return -2;
9644
29.0k
    }
9645
9646
39.8k
    if (direction > 0) {
9647
39.8k
        switch (kind1) {
9648
10.7k
        case PyUnicode_1BYTE_KIND:
9649
10.7k
            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9650
6.06k
                result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9651
4.72k
            else
9652
4.72k
                result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9653
10.7k
            break;
9654
20.7k
        case PyUnicode_2BYTE_KIND:
9655
20.7k
            result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9656
20.7k
            break;
9657
8.28k
        case PyUnicode_4BYTE_KIND:
9658
8.28k
            result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9659
8.28k
            break;
9660
0
        default:
9661
0
            Py_UNREACHABLE();
9662
39.8k
        }
9663
39.8k
    }
9664
0
    else {
9665
0
        switch (kind1) {
9666
0
        case PyUnicode_1BYTE_KIND:
9667
0
            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9668
0
                result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9669
0
            else
9670
0
                result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9671
0
            break;
9672
0
        case PyUnicode_2BYTE_KIND:
9673
0
            result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9674
0
            break;
9675
0
        case PyUnicode_4BYTE_KIND:
9676
0
            result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9677
0
            break;
9678
0
        default:
9679
0
            Py_UNREACHABLE();
9680
0
        }
9681
0
    }
9682
9683
39.8k
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
9684
39.8k
    if (kind2 != kind1)
9685
29.0k
        PyMem_Free((void *)buf2);
9686
9687
39.8k
    return result;
9688
39.8k
}
9689
9690
9691
Py_ssize_t
9692
PyUnicode_Count(PyObject *str,
9693
                PyObject *substr,
9694
                Py_ssize_t start,
9695
                Py_ssize_t end)
9696
0
{
9697
0
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9698
0
        return -1;
9699
9700
0
    return unicode_count_impl(str, substr, start, end);
9701
0
}
9702
9703
Py_ssize_t
9704
PyUnicode_Find(PyObject *str,
9705
               PyObject *substr,
9706
               Py_ssize_t start,
9707
               Py_ssize_t end,
9708
               int direction)
9709
0
{
9710
0
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9711
0
        return -2;
9712
9713
0
    return any_find_slice(str, substr, start, end, direction);
9714
0
}
9715
9716
Py_ssize_t
9717
PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9718
                   Py_ssize_t start, Py_ssize_t end,
9719
                   int direction)
9720
479k
{
9721
479k
    int kind;
9722
479k
    Py_ssize_t len, result;
9723
479k
    len = PyUnicode_GET_LENGTH(str);
9724
479k
    ADJUST_INDICES(start, end, len);
9725
479k
    if (end - start < 1)
9726
0
        return -1;
9727
479k
    kind = PyUnicode_KIND(str);
9728
479k
    result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9729
479k
                      kind, end-start, ch, direction);
9730
479k
    if (result == -1)
9731
52.2k
        return -1;
9732
427k
    else
9733
427k
        return start + result;
9734
479k
}
9735
9736
static int
9737
tailmatch(PyObject *self,
9738
          PyObject *substring,
9739
          Py_ssize_t start,
9740
          Py_ssize_t end,
9741
          int direction)
9742
99.7M
{
9743
99.7M
    int kind_self;
9744
99.7M
    int kind_sub;
9745
99.7M
    const void *data_self;
9746
99.7M
    const void *data_sub;
9747
99.7M
    Py_ssize_t offset;
9748
99.7M
    Py_ssize_t i;
9749
99.7M
    Py_ssize_t end_sub;
9750
9751
99.7M
    ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9752
99.7M
    end -= PyUnicode_GET_LENGTH(substring);
9753
99.7M
    if (end < start)
9754
12.2M
        return 0;
9755
9756
87.5M
    if (PyUnicode_GET_LENGTH(substring) == 0)
9757
0
        return 1;
9758
9759
87.5M
    kind_self = PyUnicode_KIND(self);
9760
87.5M
    data_self = PyUnicode_DATA(self);
9761
87.5M
    kind_sub = PyUnicode_KIND(substring);
9762
87.5M
    data_sub = PyUnicode_DATA(substring);
9763
87.5M
    end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9764
9765
87.5M
    if (direction > 0)
9766
8.12M
        offset = end;
9767
79.4M
    else
9768
79.4M
        offset = start;
9769
9770
87.5M
    if (PyUnicode_READ(kind_self, data_self, offset) ==
9771
87.5M
        PyUnicode_READ(kind_sub, data_sub, 0) &&
9772
43.5M
        PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9773
43.5M
        PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9774
        /* If both are of the same kind, memcmp is sufficient */
9775
14.0M
        if (kind_self == kind_sub) {
9776
9.38M
            return ! memcmp((char *)data_self +
9777
9.38M
                                (offset * PyUnicode_KIND(substring)),
9778
9.38M
                            data_sub,
9779
9.38M
                            PyUnicode_GET_LENGTH(substring) *
9780
9.38M
                                PyUnicode_KIND(substring));
9781
9.38M
        }
9782
        /* otherwise we have to compare each character by first accessing it */
9783
4.70M
        else {
9784
            /* We do not need to compare 0 and len(substring)-1 because
9785
               the if statement above ensured already that they are equal
9786
               when we end up here. */
9787
4.76M
            for (i = 1; i < end_sub; ++i) {
9788
65.9k
                if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9789
65.9k
                    PyUnicode_READ(kind_sub, data_sub, i))
9790
4.87k
                    return 0;
9791
65.9k
            }
9792
4.69M
            return 1;
9793
4.70M
        }
9794
14.0M
    }
9795
9796
73.4M
    return 0;
9797
87.5M
}
9798
9799
Py_ssize_t
9800
PyUnicode_Tailmatch(PyObject *str,
9801
                    PyObject *substr,
9802
                    Py_ssize_t start,
9803
                    Py_ssize_t end,
9804
                    int direction)
9805
0
{
9806
0
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9807
0
        return -1;
9808
9809
0
    return tailmatch(str, substr, start, end, direction);
9810
0
}
9811
9812
/* Return a new string of the same length as `self`, filled by
   _Py_bytes_lower() when `lower` is non-zero and by _Py_bytes_upper()
   otherwise.  The result is allocated with a maximum character of 127.
   Returns NULL if the allocation fails. */
static PyObject *
ascii_upper_or_lower(PyObject *self, int lower)
{
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const char *src = PyUnicode_DATA(self);

    PyObject *converted = PyUnicode_New(length, 127);
    if (converted == NULL) {
        return NULL;
    }

    char *dst = PyUnicode_DATA(converted);
    if (lower) {
        _Py_bytes_lower(dst, src, length);
    }
    else {
        _Py_bytes_upper(dst, src, length);
    }
    return converted;
}
9830
9831
static Py_UCS4
9832
handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
9833
32.7k
{
9834
32.7k
    Py_ssize_t j;
9835
32.7k
    int final_sigma;
9836
32.7k
    Py_UCS4 c = 0;   /* initialize to prevent gcc warning */
9837
    /* U+03A3 is in the Final_Sigma context when, it is found like this:
9838
9839
     \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9840
9841
    where ! is a negation and \p{xxx} is a character with property xxx.
9842
    */
9843
81.2k
    for (j = i - 1; j >= 0; j--) {
9844
79.3k
        c = PyUnicode_READ(kind, data, j);
9845
79.3k
        if (!_PyUnicode_IsCaseIgnorable(c))
9846
30.8k
            break;
9847
79.3k
    }
9848
32.7k
    final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9849
32.7k
    if (final_sigma) {
9850
63.2k
        for (j = i + 1; j < length; j++) {
9851
62.1k
            c = PyUnicode_READ(kind, data, j);
9852
62.1k
            if (!_PyUnicode_IsCaseIgnorable(c))
9853
23.2k
                break;
9854
62.1k
        }
9855
24.3k
        final_sigma = j == length || !_PyUnicode_IsCased(c);
9856
24.3k
    }
9857
32.7k
    return (final_sigma) ? 0x3C2 : 0x3C3;
9858
32.7k
}
9859
9860
static int
9861
lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
9862
           Py_UCS4 c, Py_UCS4 *mapped)
9863
87.5M
{
9864
    /* Obscure special case. */
9865
87.5M
    if (c == 0x3A3) {
9866
32.7k
        mapped[0] = handle_capital_sigma(kind, data, length, i);
9867
32.7k
        return 1;
9868
32.7k
    }
9869
87.5M
    return _PyUnicode_ToLowerFull(c, mapped);
9870
87.5M
}
9871
9872
static Py_ssize_t
9873
do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9874
0
{
9875
0
    Py_ssize_t i, k = 0;
9876
0
    int n_res, j;
9877
0
    Py_UCS4 c, mapped[3];
9878
9879
0
    c = PyUnicode_READ(kind, data, 0);
9880
0
    n_res = _PyUnicode_ToTitleFull(c, mapped);
9881
0
    for (j = 0; j < n_res; j++) {
9882
0
        *maxchar = Py_MAX(*maxchar, mapped[j]);
9883
0
        res[k++] = mapped[j];
9884
0
    }
9885
0
    for (i = 1; i < length; i++) {
9886
0
        c = PyUnicode_READ(kind, data, i);
9887
0
        n_res = lower_ucs4(kind, data, length, i, c, mapped);
9888
0
        for (j = 0; j < n_res; j++) {
9889
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9890
0
            res[k++] = mapped[j];
9891
0
        }
9892
0
    }
9893
0
    return k;
9894
0
}
9895
9896
static Py_ssize_t
9897
0
do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9898
0
    Py_ssize_t i, k = 0;
9899
9900
0
    for (i = 0; i < length; i++) {
9901
0
        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9902
0
        int n_res, j;
9903
0
        if (Py_UNICODE_ISUPPER(c)) {
9904
0
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9905
0
        }
9906
0
        else if (Py_UNICODE_ISLOWER(c)) {
9907
0
            n_res = _PyUnicode_ToUpperFull(c, mapped);
9908
0
        }
9909
0
        else {
9910
0
            n_res = 1;
9911
0
            mapped[0] = c;
9912
0
        }
9913
0
        for (j = 0; j < n_res; j++) {
9914
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9915
0
            res[k++] = mapped[j];
9916
0
        }
9917
0
    }
9918
0
    return k;
9919
0
}
9920
9921
static Py_ssize_t
9922
do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
9923
                  Py_UCS4 *maxchar, int lower)
9924
26.6M
{
9925
26.6M
    Py_ssize_t i, k = 0;
9926
9927
114M
    for (i = 0; i < length; i++) {
9928
87.5M
        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9929
87.5M
        int n_res, j;
9930
87.5M
        if (lower)
9931
87.5M
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9932
0
        else
9933
0
            n_res = _PyUnicode_ToUpperFull(c, mapped);
9934
175M
        for (j = 0; j < n_res; j++) {
9935
87.5M
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9936
87.5M
            res[k++] = mapped[j];
9937
87.5M
        }
9938
87.5M
    }
9939
26.6M
    return k;
9940
26.6M
}
9941
9942
static Py_ssize_t
9943
do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9944
0
{
9945
0
    return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9946
0
}
9947
9948
static Py_ssize_t
9949
do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9950
26.6M
{
9951
26.6M
    return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9952
26.6M
}
9953
9954
static Py_ssize_t
9955
do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9956
0
{
9957
0
    Py_ssize_t i, k = 0;
9958
9959
0
    for (i = 0; i < length; i++) {
9960
0
        Py_UCS4 c = PyUnicode_READ(kind, data, i);
9961
0
        Py_UCS4 mapped[3];
9962
0
        int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9963
0
        for (j = 0; j < n_res; j++) {
9964
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9965
0
            res[k++] = mapped[j];
9966
0
        }
9967
0
    }
9968
0
    return k;
9969
0
}
9970
9971
static Py_ssize_t
9972
do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9973
0
{
9974
0
    Py_ssize_t i, k = 0;
9975
0
    int previous_is_cased;
9976
9977
0
    previous_is_cased = 0;
9978
0
    for (i = 0; i < length; i++) {
9979
0
        const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9980
0
        Py_UCS4 mapped[3];
9981
0
        int n_res, j;
9982
9983
0
        if (previous_is_cased)
9984
0
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9985
0
        else
9986
0
            n_res = _PyUnicode_ToTitleFull(c, mapped);
9987
9988
0
        for (j = 0; j < n_res; j++) {
9989
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9990
0
            res[k++] = mapped[j];
9991
0
        }
9992
9993
0
        previous_is_cased = _PyUnicode_IsCased(c);
9994
0
    }
9995
0
    return k;
9996
0
}
9997
9998
static PyObject *
9999
case_operation(PyObject *self,
10000
               Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
10001
26.6M
{
10002
26.6M
    PyObject *res = NULL;
10003
26.6M
    Py_ssize_t length, newlength = 0;
10004
26.6M
    int kind, outkind;
10005
26.6M
    const void *data;
10006
26.6M
    void *outdata;
10007
26.6M
    Py_UCS4 maxchar = 0, *tmp, *tmpend;
10008
10009
26.6M
    kind = PyUnicode_KIND(self);
10010
26.6M
    data = PyUnicode_DATA(self);
10011
26.6M
    length = PyUnicode_GET_LENGTH(self);
10012
26.6M
    if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
10013
0
        PyErr_SetString(PyExc_OverflowError, "string is too long");
10014
0
        return NULL;
10015
0
    }
10016
26.6M
    tmp = PyMem_Malloc(sizeof(Py_UCS4) * 3 * length);
10017
26.6M
    if (tmp == NULL)
10018
0
        return PyErr_NoMemory();
10019
26.6M
    newlength = perform(kind, data, length, tmp, &maxchar);
10020
26.6M
    res = PyUnicode_New(newlength, maxchar);
10021
26.6M
    if (res == NULL)
10022
0
        goto leave;
10023
26.6M
    tmpend = tmp + newlength;
10024
26.6M
    outdata = PyUnicode_DATA(res);
10025
26.6M
    outkind = PyUnicode_KIND(res);
10026
26.6M
    switch (outkind) {
10027
216k
    case PyUnicode_1BYTE_KIND:
10028
216k
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
10029
216k
        break;
10030
26.3M
    case PyUnicode_2BYTE_KIND:
10031
26.3M
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
10032
26.3M
        break;
10033
54.4k
    case PyUnicode_4BYTE_KIND:
10034
54.4k
        memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
10035
54.4k
        break;
10036
0
    default:
10037
0
        Py_UNREACHABLE();
10038
26.6M
    }
10039
26.6M
  leave:
10040
26.6M
    PyMem_Free(tmp);
10041
26.6M
    return res;
10042
26.6M
}
10043
10044
/* Join the items of the iterable `seq` with `separator`.

   `seq` is first converted with PySequence_Fast(); its item array is then
   passed to _PyUnicode_JoinArray() inside the
   Py_BEGIN/END_CRITICAL_SECTION_SEQUENCE_FAST pair opened on `seq`.
   Returns NULL if the conversion or the join fails. */
PyObject *
PyUnicode_Join(PyObject *separator, PyObject *seq)
{
    PyObject *joined;
    PyObject *fast = PySequence_Fast(seq, "can only join an iterable");

    if (fast == NULL) {
        return NULL;
    }

    Py_BEGIN_CRITICAL_SECTION_SEQUENCE_FAST(seq);

    joined = _PyUnicode_JoinArray(separator,
                                  PySequence_Fast_ITEMS(fast),
                                  PySequence_Fast_GET_SIZE(fast));

    Py_END_CRITICAL_SECTION_SEQUENCE_FAST();

    Py_DECREF(fast);
    return joined;
}
10068
10069
PyObject *
10070
_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
10071
61.6M
{
10072
61.6M
    PyObject *res = NULL; /* the result */
10073
61.6M
    PyObject *sep = NULL;
10074
61.6M
    Py_ssize_t seplen;
10075
61.6M
    PyObject *item;
10076
61.6M
    Py_ssize_t sz, i, res_offset;
10077
61.6M
    Py_UCS4 maxchar;
10078
61.6M
    Py_UCS4 item_maxchar;
10079
61.6M
    int use_memcpy;
10080
61.6M
    unsigned char *res_data = NULL, *sep_data = NULL;
10081
61.6M
    PyObject *last_obj;
10082
61.6M
    int kind = 0;
10083
10084
    /* If empty sequence, return u"". */
10085
61.6M
    if (seqlen == 0) {
10086
5.21M
        _Py_RETURN_UNICODE_EMPTY();
10087
5.21M
    }
10088
10089
    /* If singleton sequence with an exact Unicode, return that. */
10090
56.4M
    last_obj = NULL;
10091
56.4M
    if (seqlen == 1) {
10092
7.18M
        if (PyUnicode_CheckExact(items[0])) {
10093
5.43M
            res = items[0];
10094
5.43M
            return Py_NewRef(res);
10095
5.43M
        }
10096
1.75M
        seplen = 0;
10097
1.75M
        maxchar = 0;
10098
1.75M
    }
10099
49.3M
    else {
10100
        /* Set up sep and seplen */
10101
49.3M
        if (separator == NULL) {
10102
            /* fall back to a blank space separator */
10103
0
            sep = PyUnicode_FromOrdinal(' ');
10104
0
            if (!sep)
10105
0
                goto onError;
10106
0
            seplen = 1;
10107
0
            maxchar = 32;
10108
0
        }
10109
49.3M
        else {
10110
49.3M
            if (!PyUnicode_Check(separator)) {
10111
0
                PyErr_Format(PyExc_TypeError,
10112
0
                             "separator: expected str instance,"
10113
0
                             " %.80s found",
10114
0
                             Py_TYPE(separator)->tp_name);
10115
0
                goto onError;
10116
0
            }
10117
49.3M
            sep = separator;
10118
49.3M
            seplen = PyUnicode_GET_LENGTH(separator);
10119
49.3M
            maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10120
            /* inc refcount to keep this code path symmetric with the
10121
               above case of a blank separator */
10122
49.3M
            Py_INCREF(sep);
10123
49.3M
        }
10124
49.3M
        last_obj = sep;
10125
49.3M
    }
10126
10127
    /* There are at least two things to join, or else we have a subclass
10128
     * of str in the sequence.
10129
     * Do a pre-pass to figure out the total amount of space we'll
10130
     * need (sz), and see whether all argument are strings.
10131
     */
10132
51.0M
    sz = 0;
10133
#ifdef Py_DEBUG
10134
    use_memcpy = 0;
10135
#else
10136
51.0M
    use_memcpy = 1;
10137
51.0M
#endif
10138
403M
    for (i = 0; i < seqlen; i++) {
10139
352M
        size_t add_sz;
10140
352M
        item = items[i];
10141
352M
        if (!PyUnicode_Check(item)) {
10142
0
            PyErr_Format(PyExc_TypeError,
10143
0
                         "sequence item %zd: expected str instance,"
10144
0
                         " %.80s found",
10145
0
                         i, Py_TYPE(item)->tp_name);
10146
0
            goto onError;
10147
0
        }
10148
352M
        add_sz = PyUnicode_GET_LENGTH(item);
10149
352M
        item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
10150
352M
        maxchar = Py_MAX(maxchar, item_maxchar);
10151
352M
        if (i != 0) {
10152
301M
            add_sz += seplen;
10153
301M
        }
10154
352M
        if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
10155
0
            PyErr_SetString(PyExc_OverflowError,
10156
0
                            "join() result is too long for a Python string");
10157
0
            goto onError;
10158
0
        }
10159
352M
        sz += add_sz;
10160
352M
        if (use_memcpy && last_obj != NULL) {
10161
288M
            if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10162
4.84M
                use_memcpy = 0;
10163
288M
        }
10164
352M
        last_obj = item;
10165
352M
    }
10166
10167
51.0M
    res = PyUnicode_New(sz, maxchar);
10168
51.0M
    if (res == NULL)
10169
0
        goto onError;
10170
10171
    /* Catenate everything. */
10172
#ifdef Py_DEBUG
10173
    use_memcpy = 0;
10174
#else
10175
51.0M
    if (use_memcpy) {
10176
46.2M
        res_data = PyUnicode_1BYTE_DATA(res);
10177
46.2M
        kind = PyUnicode_KIND(res);
10178
46.2M
        if (seplen != 0)
10179
19.1k
            sep_data = PyUnicode_1BYTE_DATA(sep);
10180
46.2M
    }
10181
51.0M
#endif
10182
51.0M
    if (use_memcpy) {
10183
307M
        for (i = 0; i < seqlen; ++i) {
10184
261M
            Py_ssize_t itemlen;
10185
261M
            item = items[i];
10186
10187
            /* Copy item, and maybe the separator. */
10188
261M
            if (i && seplen != 0) {
10189
25.8k
                memcpy(res_data,
10190
25.8k
                          sep_data,
10191
25.8k
                          kind * seplen);
10192
25.8k
                res_data += kind * seplen;
10193
25.8k
            }
10194
10195
261M
            itemlen = PyUnicode_GET_LENGTH(item);
10196
261M
            if (itemlen != 0) {
10197
225M
                memcpy(res_data,
10198
225M
                          PyUnicode_DATA(item),
10199
225M
                          kind * itemlen);
10200
225M
                res_data += kind * itemlen;
10201
225M
            }
10202
261M
        }
10203
46.2M
        assert(res_data == PyUnicode_1BYTE_DATA(res)
10204
46.2M
                           + kind * PyUnicode_GET_LENGTH(res));
10205
46.2M
    }
10206
4.84M
    else {
10207
96.5M
        for (i = 0, res_offset = 0; i < seqlen; ++i) {
10208
91.6M
            Py_ssize_t itemlen;
10209
91.6M
            item = items[i];
10210
10211
            /* Copy item, and maybe the separator. */
10212
91.6M
            if (i && seplen != 0) {
10213
69.0k
                _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10214
69.0k
                res_offset += seplen;
10215
69.0k
            }
10216
10217
91.6M
            itemlen = PyUnicode_GET_LENGTH(item);
10218
91.6M
            if (itemlen != 0) {
10219
89.8M
                _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
10220
89.8M
                res_offset += itemlen;
10221
89.8M
            }
10222
91.6M
        }
10223
4.84M
        assert(res_offset == PyUnicode_GET_LENGTH(res));
10224
4.84M
    }
10225
10226
51.0M
    Py_XDECREF(sep);
10227
51.0M
    assert(_PyUnicode_CheckConsistency(res, 1));
10228
51.0M
    return res;
10229
10230
0
  onError:
10231
0
    Py_XDECREF(sep);
10232
0
    Py_XDECREF(res);
10233
0
    return NULL;
10234
51.0M
}
10235
10236
void
10237
_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10238
                    Py_UCS4 fill_char)
10239
680
{
10240
680
    const int kind = PyUnicode_KIND(unicode);
10241
680
    void *data = PyUnicode_DATA(unicode);
10242
680
    assert(_PyUnicode_IsModifiable(unicode));
10243
680
    assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10244
680
    assert(start >= 0);
10245
680
    assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10246
680
    _PyUnicode_Fill(kind, data, fill_char, start, length);
10247
680
}
10248
10249
Py_ssize_t
10250
PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10251
               Py_UCS4 fill_char)
10252
680
{
10253
680
    Py_ssize_t maxlen;
10254
10255
680
    if (!PyUnicode_Check(unicode)) {
10256
0
        PyErr_BadInternalCall();
10257
0
        return -1;
10258
0
    }
10259
680
    if (unicode_check_modifiable(unicode))
10260
0
        return -1;
10261
10262
680
    if (start < 0) {
10263
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
10264
0
        return -1;
10265
0
    }
10266
680
    if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10267
0
        PyErr_SetString(PyExc_ValueError,
10268
0
                         "fill character is bigger than "
10269
0
                         "the string maximum character");
10270
0
        return -1;
10271
0
    }
10272
10273
680
    maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10274
680
    length = Py_MIN(maxlen, length);
10275
680
    if (length <= 0)
10276
0
        return 0;
10277
10278
680
    _PyUnicode_FastFill(unicode, start, length, fill_char);
10279
680
    return length;
10280
680
}
10281
10282
static PyObject *
10283
pad(PyObject *self,
10284
    Py_ssize_t left,
10285
    Py_ssize_t right,
10286
    Py_UCS4 fill)
10287
0
{
10288
0
    PyObject *u;
10289
0
    Py_UCS4 maxchar;
10290
0
    int kind;
10291
0
    void *data;
10292
10293
0
    if (left < 0)
10294
0
        left = 0;
10295
0
    if (right < 0)
10296
0
        right = 0;
10297
10298
0
    if (left == 0 && right == 0)
10299
0
        return unicode_result_unchanged(self);
10300
10301
0
    if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10302
0
        right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
10303
0
        PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10304
0
        return NULL;
10305
0
    }
10306
0
    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10307
0
    maxchar = Py_MAX(maxchar, fill);
10308
0
    u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
10309
0
    if (!u)
10310
0
        return NULL;
10311
10312
0
    kind = PyUnicode_KIND(u);
10313
0
    data = PyUnicode_DATA(u);
10314
0
    if (left)
10315
0
        _PyUnicode_Fill(kind, data, fill, 0, left);
10316
0
    if (right)
10317
0
        _PyUnicode_Fill(kind, data, fill,
10318
0
                        left + _PyUnicode_LENGTH(self), right);
10319
0
    _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
10320
0
    assert(_PyUnicode_CheckConsistency(u, 1));
10321
0
    return u;
10322
0
}
10323
10324
/* Split `string` into lines by dispatching to the splitlines
   implementation that matches its storage kind (with a dedicated ASCII
   variant for 1-byte strings).  `keepends` is forwarded unchanged.
   Returns whatever the selected implementation returns, or NULL when
   ensure_unicode() rejects the argument. */
PyObject *
PyUnicode_Splitlines(PyObject *string, int keepends)
{
    if (ensure_unicode(string) < 0) {
        return NULL;
    }

    const Py_ssize_t length = PyUnicode_GET_LENGTH(string);
    PyObject *lines;

    switch (PyUnicode_KIND(string)) {
    case PyUnicode_1BYTE_KIND:
        if (PyUnicode_IS_ASCII(string)) {
            lines = asciilib_splitlines(
                string, PyUnicode_1BYTE_DATA(string), length, keepends);
        }
        else {
            lines = ucs1lib_splitlines(
                string, PyUnicode_1BYTE_DATA(string), length, keepends);
        }
        break;
    case PyUnicode_2BYTE_KIND:
        lines = ucs2lib_splitlines(
            string, PyUnicode_2BYTE_DATA(string), length, keepends);
        break;
    case PyUnicode_4BYTE_KIND:
        lines = ucs4lib_splitlines(
            string, PyUnicode_4BYTE_DATA(string), length, keepends);
        break;
    default:
        Py_UNREACHABLE();
    }
    return lines;
}
10358
10359
static PyObject *
10360
split(PyObject *self,
10361
      PyObject *substring,
10362
      Py_ssize_t maxcount)
10363
22.6M
{
10364
22.6M
    int kind1, kind2;
10365
22.6M
    const void *buf1, *buf2;
10366
22.6M
    Py_ssize_t len1, len2;
10367
22.6M
    PyObject* out;
10368
22.6M
    len1 = PyUnicode_GET_LENGTH(self);
10369
22.6M
    kind1 = PyUnicode_KIND(self);
10370
10371
22.6M
    if (substring == NULL) {
10372
181k
        if (maxcount < 0) {
10373
155k
            maxcount = (len1 - 1) / 2 + 1;
10374
155k
        }
10375
181k
        switch (kind1) {
10376
118k
        case PyUnicode_1BYTE_KIND:
10377
118k
            if (PyUnicode_IS_ASCII(self))
10378
86.3k
                return asciilib_split_whitespace(
10379
86.3k
                    self,  PyUnicode_1BYTE_DATA(self),
10380
86.3k
                    len1, maxcount
10381
86.3k
                    );
10382
32.1k
            else
10383
32.1k
                return ucs1lib_split_whitespace(
10384
32.1k
                    self,  PyUnicode_1BYTE_DATA(self),
10385
32.1k
                    len1, maxcount
10386
32.1k
                    );
10387
51.4k
        case PyUnicode_2BYTE_KIND:
10388
51.4k
            return ucs2lib_split_whitespace(
10389
51.4k
                self,  PyUnicode_2BYTE_DATA(self),
10390
51.4k
                len1, maxcount
10391
51.4k
                );
10392
12.0k
        case PyUnicode_4BYTE_KIND:
10393
12.0k
            return ucs4lib_split_whitespace(
10394
12.0k
                self,  PyUnicode_4BYTE_DATA(self),
10395
12.0k
                len1, maxcount
10396
12.0k
                );
10397
0
        default:
10398
0
            Py_UNREACHABLE();
10399
181k
        }
10400
181k
    }
10401
10402
22.5M
    kind2 = PyUnicode_KIND(substring);
10403
22.5M
    len2 = PyUnicode_GET_LENGTH(substring);
10404
22.5M
    if (maxcount < 0) {
10405
        // if len2 == 0, it will raise ValueError.
10406
11.6M
        maxcount = len2 == 0 ? 0 : (len1 / len2) + 1;
10407
        // handle expected overflow case: (Py_SSIZE_T_MAX / 1) + 1
10408
11.6M
        maxcount = maxcount < 0 ? len1 : maxcount;
10409
11.6M
    }
10410
22.5M
    if (kind1 < kind2 || len1 < len2) {
10411
5.63M
        out = PyList_New(1);
10412
5.63M
        if (out == NULL)
10413
0
            return NULL;
10414
5.63M
        PyList_SET_ITEM(out, 0, Py_NewRef(self));
10415
5.63M
        return out;
10416
5.63M
    }
10417
16.8M
    buf1 = PyUnicode_DATA(self);
10418
16.8M
    buf2 = PyUnicode_DATA(substring);
10419
16.8M
    if (kind2 != kind1) {
10420
221k
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
10421
221k
        if (!buf2)
10422
0
            return NULL;
10423
221k
    }
10424
10425
16.8M
    switch (kind1) {
10426
16.6M
    case PyUnicode_1BYTE_KIND:
10427
16.6M
        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10428
15.5M
            out = asciilib_split(
10429
15.5M
                self,  buf1, len1, buf2, len2, maxcount);
10430
1.09M
        else
10431
1.09M
            out = ucs1lib_split(
10432
1.09M
                self,  buf1, len1, buf2, len2, maxcount);
10433
16.6M
        break;
10434
182k
    case PyUnicode_2BYTE_KIND:
10435
182k
        out = ucs2lib_split(
10436
182k
            self,  buf1, len1, buf2, len2, maxcount);
10437
182k
        break;
10438
38.4k
    case PyUnicode_4BYTE_KIND:
10439
38.4k
        out = ucs4lib_split(
10440
38.4k
            self,  buf1, len1, buf2, len2, maxcount);
10441
38.4k
        break;
10442
0
    default:
10443
0
        out = NULL;
10444
16.8M
    }
10445
16.8M
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10446
16.8M
    if (kind2 != kind1)
10447
221k
        PyMem_Free((void *)buf2);
10448
16.8M
    return out;
10449
16.8M
}
10450
10451
static PyObject *
10452
rsplit(PyObject *self,
10453
       PyObject *substring,
10454
       Py_ssize_t maxcount)
10455
50
{
10456
50
    int kind1, kind2;
10457
50
    const void *buf1, *buf2;
10458
50
    Py_ssize_t len1, len2;
10459
50
    PyObject* out;
10460
10461
50
    len1 = PyUnicode_GET_LENGTH(self);
10462
50
    kind1 = PyUnicode_KIND(self);
10463
10464
50
    if (substring == NULL) {
10465
0
        if (maxcount < 0) {
10466
0
            maxcount = (len1 - 1) / 2 + 1;
10467
0
        }
10468
0
        switch (kind1) {
10469
0
        case PyUnicode_1BYTE_KIND:
10470
0
            if (PyUnicode_IS_ASCII(self))
10471
0
                return asciilib_rsplit_whitespace(
10472
0
                    self,  PyUnicode_1BYTE_DATA(self),
10473
0
                    len1, maxcount
10474
0
                    );
10475
0
            else
10476
0
                return ucs1lib_rsplit_whitespace(
10477
0
                    self,  PyUnicode_1BYTE_DATA(self),
10478
0
                    len1, maxcount
10479
0
                    );
10480
0
        case PyUnicode_2BYTE_KIND:
10481
0
            return ucs2lib_rsplit_whitespace(
10482
0
                self,  PyUnicode_2BYTE_DATA(self),
10483
0
                len1, maxcount
10484
0
                );
10485
0
        case PyUnicode_4BYTE_KIND:
10486
0
            return ucs4lib_rsplit_whitespace(
10487
0
                self,  PyUnicode_4BYTE_DATA(self),
10488
0
                len1, maxcount
10489
0
                );
10490
0
        default:
10491
0
            Py_UNREACHABLE();
10492
0
        }
10493
0
    }
10494
50
    kind2 = PyUnicode_KIND(substring);
10495
50
    len2 = PyUnicode_GET_LENGTH(substring);
10496
50
    if (maxcount < 0) {
10497
        // if len2 == 0, it will raise ValueError.
10498
0
        maxcount = len2 == 0 ? 0 : (len1 / len2) + 1;
10499
        // handle expected overflow case: (Py_SSIZE_T_MAX / 1) + 1
10500
0
        maxcount = maxcount < 0 ? len1 : maxcount;
10501
0
    }
10502
50
    if (kind1 < kind2 || len1 < len2) {
10503
0
        out = PyList_New(1);
10504
0
        if (out == NULL)
10505
0
            return NULL;
10506
0
        PyList_SET_ITEM(out, 0, Py_NewRef(self));
10507
0
        return out;
10508
0
    }
10509
50
    buf1 = PyUnicode_DATA(self);
10510
50
    buf2 = PyUnicode_DATA(substring);
10511
50
    if (kind2 != kind1) {
10512
0
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
10513
0
        if (!buf2)
10514
0
            return NULL;
10515
0
    }
10516
10517
50
    switch (kind1) {
10518
50
    case PyUnicode_1BYTE_KIND:
10519
50
        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10520
50
            out = asciilib_rsplit(
10521
50
                self,  buf1, len1, buf2, len2, maxcount);
10522
0
        else
10523
0
            out = ucs1lib_rsplit(
10524
0
                self,  buf1, len1, buf2, len2, maxcount);
10525
50
        break;
10526
0
    case PyUnicode_2BYTE_KIND:
10527
0
        out = ucs2lib_rsplit(
10528
0
            self,  buf1, len1, buf2, len2, maxcount);
10529
0
        break;
10530
0
    case PyUnicode_4BYTE_KIND:
10531
0
        out = ucs4lib_rsplit(
10532
0
            self,  buf1, len1, buf2, len2, maxcount);
10533
0
        break;
10534
0
    default:
10535
0
        out = NULL;
10536
50
    }
10537
50
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10538
50
    if (kind2 != kind1)
10539
0
        PyMem_Free((void *)buf2);
10540
50
    return out;
10541
50
}
10542
10543
static Py_ssize_t
10544
anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10545
            PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
10546
163M
{
10547
163M
    switch (kind) {
10548
26.9M
    case PyUnicode_1BYTE_KIND:
10549
26.9M
        if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10550
23.0M
            return asciilib_find(buf1, len1, buf2, len2, offset);
10551
3.85M
        else
10552
3.85M
            return ucs1lib_find(buf1, len1, buf2, len2, offset);
10553
62.1M
    case PyUnicode_2BYTE_KIND:
10554
62.1M
        return ucs2lib_find(buf1, len1, buf2, len2, offset);
10555
74.1M
    case PyUnicode_4BYTE_KIND:
10556
74.1M
        return ucs4lib_find(buf1, len1, buf2, len2, offset);
10557
163M
    }
10558
163M
    Py_UNREACHABLE();
10559
163M
}
10560
10561
static Py_ssize_t
10562
anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10563
             PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
10564
41.0M
{
10565
41.0M
    switch (kind) {
10566
36.0M
    case PyUnicode_1BYTE_KIND:
10567
36.0M
        return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10568
4.87M
    case PyUnicode_2BYTE_KIND:
10569
4.87M
        return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10570
126k
    case PyUnicode_4BYTE_KIND:
10571
126k
        return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10572
41.0M
    }
10573
41.0M
    Py_UNREACHABLE();
10574
41.0M
}
10575
10576
static void
10577
replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10578
                      Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10579
1.40M
{
10580
1.40M
    int kind = PyUnicode_KIND(u);
10581
1.40M
    void *data = PyUnicode_DATA(u);
10582
1.40M
    Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10583
1.40M
    if (kind == PyUnicode_1BYTE_KIND) {
10584
601k
        ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10585
601k
                                      (Py_UCS1 *)data + len,
10586
601k
                                      u1, u2, maxcount);
10587
601k
    }
10588
805k
    else if (kind == PyUnicode_2BYTE_KIND) {
10589
791k
        ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10590
791k
                                      (Py_UCS2 *)data + len,
10591
791k
                                      u1, u2, maxcount);
10592
791k
    }
10593
14.4k
    else {
10594
14.4k
        assert(kind == PyUnicode_4BYTE_KIND);
10595
14.4k
        ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10596
14.4k
                                      (Py_UCS4 *)data + len,
10597
14.4k
                                      u1, u2, maxcount);
10598
14.4k
    }
10599
1.40M
}
10600
10601
static PyObject *
10602
replace(PyObject *self, PyObject *str1,
10603
        PyObject *str2, Py_ssize_t maxcount)
10604
79.2M
{
10605
79.2M
    PyObject *u;
10606
79.2M
    const char *sbuf = PyUnicode_DATA(self);
10607
79.2M
    const void *buf1 = PyUnicode_DATA(str1);
10608
79.2M
    const void *buf2 = PyUnicode_DATA(str2);
10609
79.2M
    int srelease = 0, release1 = 0, release2 = 0;
10610
79.2M
    int skind = PyUnicode_KIND(self);
10611
79.2M
    int kind1 = PyUnicode_KIND(str1);
10612
79.2M
    int kind2 = PyUnicode_KIND(str2);
10613
79.2M
    Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10614
79.2M
    Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10615
79.2M
    Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
10616
79.2M
    int mayshrink;
10617
79.2M
    Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
10618
10619
79.2M
    if (slen < len1)
10620
31.9M
        goto nothing;
10621
10622
47.2M
    if (maxcount < 0)
10623
47.2M
        maxcount = PY_SSIZE_T_MAX;
10624
0
    else if (maxcount == 0)
10625
0
        goto nothing;
10626
10627
47.2M
    if (str1 == str2)
10628
0
        goto nothing;
10629
10630
47.2M
    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10631
47.2M
    maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10632
47.2M
    if (maxchar < maxchar_str1)
10633
        /* substring too wide to be present */
10634
0
        goto nothing;
10635
47.2M
    maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10636
    /* Replacing str1 with str2 may cause a maxchar reduction in the
10637
       result string. */
10638
47.2M
    mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
10639
47.2M
    maxchar = Py_MAX(maxchar, maxchar_str2);
10640
10641
47.2M
    if (len1 == len2) {
10642
        /* same length */
10643
6.18M
        if (len1 == 0)
10644
0
            goto nothing;
10645
6.18M
        if (len1 == 1) {
10646
            /* replace characters */
10647
6.18M
            Py_UCS4 u1, u2;
10648
6.18M
            Py_ssize_t pos;
10649
10650
6.18M
            u1 = PyUnicode_READ(kind1, buf1, 0);
10651
6.18M
            pos = findchar(sbuf, skind, slen, u1, 1);
10652
6.18M
            if (pos < 0)
10653
4.78M
                goto nothing;
10654
1.40M
            u2 = PyUnicode_READ(kind2, buf2, 0);
10655
1.40M
            u = PyUnicode_New(slen, maxchar);
10656
1.40M
            if (!u)
10657
0
                goto error;
10658
10659
1.40M
            _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10660
1.40M
            replace_1char_inplace(u, pos, u1, u2, maxcount);
10661
1.40M
        }
10662
0
        else {
10663
0
            int rkind = skind;
10664
0
            char *res;
10665
0
            Py_ssize_t i;
10666
10667
0
            if (kind1 < rkind) {
10668
                /* widen substring */
10669
0
                buf1 = unicode_askind(kind1, buf1, len1, rkind);
10670
0
                if (!buf1) goto error;
10671
0
                release1 = 1;
10672
0
            }
10673
0
            i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
10674
0
            if (i < 0)
10675
0
                goto nothing;
10676
0
            if (rkind > kind2) {
10677
                /* widen replacement */
10678
0
                buf2 = unicode_askind(kind2, buf2, len2, rkind);
10679
0
                if (!buf2) goto error;
10680
0
                release2 = 1;
10681
0
            }
10682
0
            else if (rkind < kind2) {
10683
                /* widen self and buf1 */
10684
0
                rkind = kind2;
10685
0
                if (release1) {
10686
0
                    assert(buf1 != PyUnicode_DATA(str1));
10687
0
                    PyMem_Free((void *)buf1);
10688
0
                    buf1 = PyUnicode_DATA(str1);
10689
0
                    release1 = 0;
10690
0
                }
10691
0
                sbuf = unicode_askind(skind, sbuf, slen, rkind);
10692
0
                if (!sbuf) goto error;
10693
0
                srelease = 1;
10694
0
                buf1 = unicode_askind(kind1, buf1, len1, rkind);
10695
0
                if (!buf1) goto error;
10696
0
                release1 = 1;
10697
0
            }
10698
0
            u = PyUnicode_New(slen, maxchar);
10699
0
            if (!u)
10700
0
                goto error;
10701
0
            assert(PyUnicode_KIND(u) == rkind);
10702
0
            res = PyUnicode_DATA(u);
10703
10704
0
            memcpy(res, sbuf, rkind * slen);
10705
            /* change everything in-place, starting with this one */
10706
0
            memcpy(res + rkind * i,
10707
0
                   buf2,
10708
0
                   rkind * len2);
10709
0
            i += len1;
10710
10711
0
            while ( --maxcount > 0) {
10712
0
                i = anylib_find(rkind, self,
10713
0
                                sbuf+rkind*i, slen-i,
10714
0
                                str1, buf1, len1, i);
10715
0
                if (i == -1)
10716
0
                    break;
10717
0
                memcpy(res + rkind * i,
10718
0
                       buf2,
10719
0
                       rkind * len2);
10720
0
                i += len1;
10721
0
            }
10722
0
        }
10723
6.18M
    }
10724
41.0M
    else {
10725
41.0M
        Py_ssize_t n, i, j, ires;
10726
41.0M
        Py_ssize_t new_size;
10727
41.0M
        int rkind = skind;
10728
41.0M
        char *res;
10729
10730
41.0M
        if (kind1 < rkind) {
10731
            /* widen substring */
10732
5.00M
            buf1 = unicode_askind(kind1, buf1, len1, rkind);
10733
5.00M
            if (!buf1) goto error;
10734
5.00M
            release1 = 1;
10735
5.00M
        }
10736
41.0M
        n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
10737
41.0M
        if (n == 0)
10738
36.0M
            goto nothing;
10739
5.05M
        if (kind2 < rkind) {
10740
            /* widen replacement */
10741
1.08M
            buf2 = unicode_askind(kind2, buf2, len2, rkind);
10742
1.08M
            if (!buf2) goto error;
10743
1.08M
            release2 = 1;
10744
1.08M
        }
10745
3.97M
        else if (kind2 > rkind) {
10746
            /* widen self and buf1 */
10747
0
            rkind = kind2;
10748
0
            sbuf = unicode_askind(skind, sbuf, slen, rkind);
10749
0
            if (!sbuf) goto error;
10750
0
            srelease = 1;
10751
0
            if (release1) {
10752
0
                assert(buf1 != PyUnicode_DATA(str1));
10753
0
                PyMem_Free((void *)buf1);
10754
0
                buf1 = PyUnicode_DATA(str1);
10755
0
                release1 = 0;
10756
0
            }
10757
0
            buf1 = unicode_askind(kind1, buf1, len1, rkind);
10758
0
            if (!buf1) goto error;
10759
0
            release1 = 1;
10760
0
        }
10761
        /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10762
           PyUnicode_GET_LENGTH(str1)); */
10763
5.05M
        if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
10764
0
                PyErr_SetString(PyExc_OverflowError,
10765
0
                                "replace string is too long");
10766
0
                goto error;
10767
0
        }
10768
5.05M
        new_size = slen + n * (len2 - len1);
10769
5.05M
        if (new_size == 0) {
10770
0
            u = unicode_get_empty();
10771
0
            goto done;
10772
0
        }
10773
5.05M
        if (new_size > (PY_SSIZE_T_MAX / rkind)) {
10774
0
            PyErr_SetString(PyExc_OverflowError,
10775
0
                            "replace string is too long");
10776
0
            goto error;
10777
0
        }
10778
5.05M
        u = PyUnicode_New(new_size, maxchar);
10779
5.05M
        if (!u)
10780
0
            goto error;
10781
5.05M
        assert(PyUnicode_KIND(u) == rkind);
10782
5.05M
        res = PyUnicode_DATA(u);
10783
5.05M
        ires = i = 0;
10784
5.05M
        if (len1 > 0) {
10785
168M
            while (n-- > 0) {
10786
                /* look for next match */
10787
163M
                j = anylib_find(rkind, self,
10788
163M
                                sbuf + rkind * i, slen-i,
10789
163M
                                str1, buf1, len1, i);
10790
163M
                if (j == -1)
10791
0
                    break;
10792
163M
                else if (j > i) {
10793
                    /* copy unchanged part [i:j] */
10794
21.0M
                    memcpy(res + rkind * ires,
10795
21.0M
                           sbuf + rkind * i,
10796
21.0M
                           rkind * (j-i));
10797
21.0M
                    ires += j - i;
10798
21.0M
                }
10799
                /* copy substitution string */
10800
163M
                if (len2 > 0) {
10801
163M
                    memcpy(res + rkind * ires,
10802
163M
                           buf2,
10803
163M
                           rkind * len2);
10804
163M
                    ires += len2;
10805
163M
                }
10806
163M
                i = j + len1;
10807
163M
            }
10808
5.05M
            if (i < slen)
10809
                /* copy tail [i:] */
10810
4.97M
                memcpy(res + rkind * ires,
10811
4.97M
                       sbuf + rkind * i,
10812
4.97M
                       rkind * (slen-i));
10813
5.05M
        }
10814
0
        else {
10815
            /* interleave */
10816
0
            while (n > 0) {
10817
0
                memcpy(res + rkind * ires,
10818
0
                       buf2,
10819
0
                       rkind * len2);
10820
0
                ires += len2;
10821
0
                if (--n <= 0)
10822
0
                    break;
10823
0
                memcpy(res + rkind * ires,
10824
0
                       sbuf + rkind * i,
10825
0
                       rkind);
10826
0
                ires++;
10827
0
                i++;
10828
0
            }
10829
0
            memcpy(res + rkind * ires,
10830
0
                   sbuf + rkind * i,
10831
0
                   rkind * (slen-i));
10832
0
        }
10833
5.05M
    }
10834
10835
6.46M
    if (mayshrink) {
10836
0
        unicode_adjust_maxchar(&u);
10837
0
        if (u == NULL)
10838
0
            goto error;
10839
0
    }
10840
10841
6.46M
  done:
10842
6.46M
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
10843
6.46M
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10844
6.46M
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10845
6.46M
    if (srelease)
10846
0
        PyMem_Free((void *)sbuf);
10847
6.46M
    if (release1)
10848
1.08M
        PyMem_Free((void *)buf1);
10849
6.46M
    if (release2)
10850
1.08M
        PyMem_Free((void *)buf2);
10851
6.46M
    assert(_PyUnicode_CheckConsistency(u, 1));
10852
6.46M
    return u;
10853
10854
72.7M
  nothing:
10855
    /* nothing to replace; return original string (when possible) */
10856
72.7M
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
10857
72.7M
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10858
72.7M
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10859
72.7M
    if (srelease)
10860
0
        PyMem_Free((void *)sbuf);
10861
72.7M
    if (release1)
10862
3.92M
        PyMem_Free((void *)buf1);
10863
72.7M
    if (release2)
10864
0
        PyMem_Free((void *)buf2);
10865
72.7M
    return unicode_result_unchanged(self);
10866
10867
0
  error:
10868
0
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
10869
0
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10870
0
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10871
0
    if (srelease)
10872
0
        PyMem_Free((void *)sbuf);
10873
0
    if (release1)
10874
0
        PyMem_Free((void *)buf1);
10875
0
    if (release2)
10876
0
        PyMem_Free((void *)buf2);
10877
0
    return NULL;
10878
6.46M
}
10879
10880
/* --- Unicode Object Methods --------------------------------------------- */
10881
10882
/*[clinic input]
10883
@permit_long_docstring_body
10884
str.title as unicode_title
10885
10886
Return a version of the string where each word is titlecased.
10887
10888
More specifically, words start with uppercased characters and all remaining
10889
cased characters have lower case.
10890
[clinic start generated code]*/
10891
10892
static PyObject *
unicode_title_impl(PyObject *self)
/*[clinic end generated code: output=c75ae03809574902 input=533ce0eb6a7f5d1b]*/
{
    /* Run the shared case-conversion driver with the title-case mapper
       and hand its result (or failure) straight back. */
    PyObject *titled = case_operation(self, do_title);
    return titled;
}
10898
10899
/*[clinic input]
10900
@permit_long_docstring_body
10901
str.capitalize as unicode_capitalize
10902
10903
Return a capitalized version of the string.
10904
10905
More specifically, make the first character have upper case and the rest lower
10906
case.
10907
[clinic start generated code]*/
10908
10909
static PyObject *
unicode_capitalize_impl(PyObject *self)
/*[clinic end generated code: output=e49a4c333cdb7667 input=a4a15ade41f6f9e9]*/
{
    /* Only a non-empty string goes through the case-conversion driver;
       an empty one is returned via unicode_result_unchanged(). */
    if (PyUnicode_GET_LENGTH(self) != 0) {
        return case_operation(self, do_capitalize);
    }
    return unicode_result_unchanged(self);
}
10917
10918
/*[clinic input]
10919
str.casefold as unicode_casefold
10920
10921
Return a version of the string suitable for caseless comparisons.
10922
[clinic start generated code]*/
10923
10924
static PyObject *
unicode_casefold_impl(PyObject *self)
/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
{
    /* Non-ASCII strings need the general case-folding mapper. */
    if (!PyUnicode_IS_ASCII(self)) {
        return case_operation(self, do_casefold);
    }
    /* ASCII fast path; the flag value 1 presumably selects lowercasing
       in ascii_upper_or_lower() -- confirm against that helper. */
    return ascii_upper_or_lower(self, 1);
}
10932
10933
10934
/* Argument converter. Accepts a single Unicode character. */
10935
10936
static int
10937
convert_uc(PyObject *obj, void *addr)
10938
0
{
10939
0
    Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
10940
10941
0
    if (!PyUnicode_Check(obj)) {
10942
0
        PyErr_Format(PyExc_TypeError,
10943
0
                     "The fill character must be a unicode character, "
10944
0
                     "not %.100s", Py_TYPE(obj)->tp_name);
10945
0
        return 0;
10946
0
    }
10947
0
    if (PyUnicode_GET_LENGTH(obj) != 1) {
10948
0
        PyErr_SetString(PyExc_TypeError,
10949
0
                        "The fill character must be exactly one character long");
10950
0
        return 0;
10951
0
    }
10952
0
    *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
10953
0
    return 1;
10954
0
}
10955
10956
/*[clinic input]
10957
str.center as unicode_center
10958
10959
    width: Py_ssize_t
10960
    fillchar: Py_UCS4 = ' '
10961
    /
10962
10963
Return a centered string of length width.
10964
10965
Padding is done using the specified fill character (default is a space).
10966
[clinic start generated code]*/
10967
10968
static PyObject *
10969
unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10970
/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
10971
0
{
10972
0
    Py_ssize_t marg, left;
10973
10974
0
    if (PyUnicode_GET_LENGTH(self) >= width)
10975
0
        return unicode_result_unchanged(self);
10976
10977
0
    marg = width - PyUnicode_GET_LENGTH(self);
10978
0
    left = marg / 2 + (marg & width & 1);
10979
10980
0
    return pad(self, left, marg - left, fillchar);
10981
0
}
10982
10983
/* This function assumes that str1 and str2 are readied by the caller. */
10984
10985
static int
10986
unicode_compare(PyObject *str1, PyObject *str2)
10987
22.5M
{
10988
22.5M
#define COMPARE(TYPE1, TYPE2) \
10989
22.5M
    do { \
10990
21.1M
        TYPE1* p1 = (TYPE1 *)data1; \
10991
21.1M
        TYPE2* p2 = (TYPE2 *)data2; \
10992
21.1M
        TYPE1* end = p1 + len; \
10993
21.1M
        Py_UCS4 c1, c2; \
10994
21.1M
        for (; p1 != end; p1++, p2++) { \
10995
21.1M
            c1 = *p1; \
10996
21.1M
            c2 = *p2; \
10997
21.1M
            if (c1 != c2) \
10998
21.1M
                return (c1 < c2) ? -1 : 1; \
10999
21.1M
        } \
11000
21.1M
    } \
11001
21.1M
    while (0)
11002
11003
22.5M
    int kind1, kind2;
11004
22.5M
    const void *data1, *data2;
11005
22.5M
    Py_ssize_t len1, len2, len;
11006
11007
22.5M
    kind1 = PyUnicode_KIND(str1);
11008
22.5M
    kind2 = PyUnicode_KIND(str2);
11009
22.5M
    data1 = PyUnicode_DATA(str1);
11010
22.5M
    data2 = PyUnicode_DATA(str2);
11011
22.5M
    len1 = PyUnicode_GET_LENGTH(str1);
11012
22.5M
    len2 = PyUnicode_GET_LENGTH(str2);
11013
22.5M
    len = Py_MIN(len1, len2);
11014
11015
22.5M
    switch(kind1) {
11016
1.72M
    case PyUnicode_1BYTE_KIND:
11017
1.72M
    {
11018
1.72M
        switch(kind2) {
11019
70.5k
        case PyUnicode_1BYTE_KIND:
11020
70.5k
        {
11021
70.5k
            int cmp = memcmp(data1, data2, len);
11022
            /* normalize result of memcmp() into the range [-1; 1] */
11023
70.5k
            if (cmp < 0)
11024
45.1k
                return -1;
11025
25.4k
            if (cmp > 0)
11026
24.8k
                return 1;
11027
555
            break;
11028
25.4k
        }
11029
1.31M
        case PyUnicode_2BYTE_KIND:
11030
1.31M
            COMPARE(Py_UCS1, Py_UCS2);
11031
0
            break;
11032
343k
        case PyUnicode_4BYTE_KIND:
11033
343k
            COMPARE(Py_UCS1, Py_UCS4);
11034
0
            break;
11035
0
        default:
11036
0
            Py_UNREACHABLE();
11037
1.72M
        }
11038
555
        break;
11039
1.72M
    }
11040
18.8M
    case PyUnicode_2BYTE_KIND:
11041
18.8M
    {
11042
18.8M
        switch(kind2) {
11043
4.03k
        case PyUnicode_1BYTE_KIND:
11044
4.03k
            COMPARE(Py_UCS2, Py_UCS1);
11045
0
            break;
11046
16.9M
        case PyUnicode_2BYTE_KIND:
11047
16.9M
        {
11048
16.9M
            COMPARE(Py_UCS2, Py_UCS2);
11049
0
            break;
11050
16.9M
        }
11051
1.86M
        case PyUnicode_4BYTE_KIND:
11052
1.86M
            COMPARE(Py_UCS2, Py_UCS4);
11053
0
            break;
11054
0
        default:
11055
0
            Py_UNREACHABLE();
11056
18.8M
        }
11057
0
        break;
11058
18.8M
    }
11059
2.06M
    case PyUnicode_4BYTE_KIND:
11060
2.06M
    {
11061
2.06M
        switch(kind2) {
11062
3.67k
        case PyUnicode_1BYTE_KIND:
11063
3.67k
            COMPARE(Py_UCS4, Py_UCS1);
11064
0
            break;
11065
717k
        case PyUnicode_2BYTE_KIND:
11066
717k
            COMPARE(Py_UCS4, Py_UCS2);
11067
0
            break;
11068
1.34M
        case PyUnicode_4BYTE_KIND:
11069
1.34M
        {
11070
1.34M
#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
11071
1.34M
            int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
11072
            /* normalize result of wmemcmp() into the range [-1; 1] */
11073
1.34M
            if (cmp < 0)
11074
642k
                return -1;
11075
697k
            if (cmp > 0)
11076
697k
                return 1;
11077
#else
11078
            COMPARE(Py_UCS4, Py_UCS4);
11079
#endif
11080
0
            break;
11081
697k
        }
11082
0
        default:
11083
0
            Py_UNREACHABLE();
11084
2.06M
        }
11085
0
        break;
11086
2.06M
    }
11087
0
    default:
11088
0
        Py_UNREACHABLE();
11089
22.5M
    }
11090
11091
555
    if (len1 == len2)
11092
552
        return 0;
11093
3
    if (len1 < len2)
11094
3
        return -1;
11095
0
    else
11096
0
        return 1;
11097
11098
3
#undef COMPARE
11099
3
}
11100
11101
11102
int
11103
_PyUnicode_Equal(PyObject *str1, PyObject *str2)
11104
290M
{
11105
290M
    assert(PyUnicode_Check(str1));
11106
290M
    assert(PyUnicode_Check(str2));
11107
290M
    if (str1 == str2) {
11108
80.0M
        return 1;
11109
80.0M
    }
11110
210M
    return unicode_eq(str1, str2);
11111
290M
}
11112
11113
11114
int
11115
PyUnicode_Equal(PyObject *str1, PyObject *str2)
11116
0
{
11117
0
    if (!PyUnicode_Check(str1)) {
11118
0
        PyErr_Format(PyExc_TypeError,
11119
0
                     "first argument must be str, not %T", str1);
11120
0
        return -1;
11121
0
    }
11122
0
    if (!PyUnicode_Check(str2)) {
11123
0
        PyErr_Format(PyExc_TypeError,
11124
0
                     "second argument must be str, not %T", str2);
11125
0
        return -1;
11126
0
    }
11127
11128
0
    return _PyUnicode_Equal(str1, str2);
11129
0
}
11130
11131
11132
int
11133
PyUnicode_Compare(PyObject *left, PyObject *right)
11134
7.23k
{
11135
7.23k
    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11136
        /* a string is equal to itself */
11137
7.23k
        if (left == right)
11138
0
            return 0;
11139
11140
7.23k
        return unicode_compare(left, right);
11141
7.23k
    }
11142
0
    PyErr_Format(PyExc_TypeError,
11143
0
                 "Can't compare %.100s and %.100s",
11144
0
                 Py_TYPE(left)->tp_name,
11145
0
                 Py_TYPE(right)->tp_name);
11146
0
    return -1;
11147
7.23k
}
11148
11149
int
11150
PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11151
2.01M
{
11152
2.01M
    Py_ssize_t i;
11153
2.01M
    int kind;
11154
2.01M
    Py_UCS4 chr;
11155
11156
2.01M
    assert(_PyUnicode_CHECK(uni));
11157
2.01M
    kind = PyUnicode_KIND(uni);
11158
2.01M
    if (kind == PyUnicode_1BYTE_KIND) {
11159
2.01M
        const void *data = PyUnicode_1BYTE_DATA(uni);
11160
2.01M
        size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
11161
2.01M
        size_t len, len2 = strlen(str);
11162
2.01M
        int cmp;
11163
11164
2.01M
        len = Py_MIN(len1, len2);
11165
2.01M
        cmp = memcmp(data, str, len);
11166
2.01M
        if (cmp != 0) {
11167
1.46M
            if (cmp < 0)
11168
8.17k
                return -1;
11169
1.45M
            else
11170
1.45M
                return 1;
11171
1.46M
        }
11172
549k
        if (len1 > len2)
11173
70
            return 1; /* uni is longer */
11174
549k
        if (len1 < len2)
11175
735
            return -1; /* str is longer */
11176
548k
        return 0;
11177
549k
    }
11178
1.37k
    else {
11179
1.37k
        const void *data = PyUnicode_DATA(uni);
11180
        /* Compare Unicode string and source character set string */
11181
2.59k
        for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
11182
2.35k
            if (chr != (unsigned char)str[i])
11183
1.13k
                return (chr < (unsigned char)(str[i])) ? -1 : 1;
11184
        /* This check keeps Python strings that end in '\0' from comparing equal
11185
         to C strings identical up to that point. */
11186
238
        if (PyUnicode_GET_LENGTH(uni) != i || chr)
11187
238
            return 1; /* uni is longer */
11188
0
        if (str[i])
11189
0
            return -1; /* str is longer */
11190
0
        return 0;
11191
0
    }
11192
2.01M
}
11193
11194
int
11195
PyUnicode_EqualToUTF8(PyObject *unicode, const char *str)
11196
0
{
11197
0
    return PyUnicode_EqualToUTF8AndSize(unicode, str, strlen(str));
11198
0
}
11199
11200
int
11201
PyUnicode_EqualToUTF8AndSize(PyObject *unicode, const char *str, Py_ssize_t size)
11202
0
{
11203
0
    assert(_PyUnicode_CHECK(unicode));
11204
0
    assert(str);
11205
11206
0
    if (PyUnicode_IS_ASCII(unicode)) {
11207
0
        Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
11208
0
        return size == len &&
11209
0
            memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11210
0
    }
11211
0
    if (PyUnicode_UTF8(unicode) != NULL) {
11212
0
        Py_ssize_t len = PyUnicode_UTF8_LENGTH(unicode);
11213
0
        return size == len &&
11214
0
            memcmp(PyUnicode_UTF8(unicode), str, len) == 0;
11215
0
    }
11216
11217
0
    Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
11218
0
    if ((size_t)len >= (size_t)size || (size_t)len < (size_t)size / 4) {
11219
0
        return 0;
11220
0
    }
11221
0
    const unsigned char *s = (const unsigned char *)str;
11222
0
    const unsigned char *ends = s + (size_t)size;
11223
0
    int kind = PyUnicode_KIND(unicode);
11224
0
    const void *data = PyUnicode_DATA(unicode);
11225
    /* Compare Unicode string and UTF-8 string */
11226
0
    for (Py_ssize_t i = 0; i < len; i++) {
11227
0
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11228
0
        if (ch < 0x80) {
11229
0
            if (ends == s || s[0] != ch) {
11230
0
                return 0;
11231
0
            }
11232
0
            s += 1;
11233
0
        }
11234
0
        else if (ch < 0x800) {
11235
0
            if ((ends - s) < 2 ||
11236
0
                s[0] != (0xc0 | (ch >> 6)) ||
11237
0
                s[1] != (0x80 | (ch & 0x3f)))
11238
0
            {
11239
0
                return 0;
11240
0
            }
11241
0
            s += 2;
11242
0
        }
11243
0
        else if (ch < 0x10000) {
11244
0
            if (Py_UNICODE_IS_SURROGATE(ch) ||
11245
0
                (ends - s) < 3 ||
11246
0
                s[0] != (0xe0 | (ch >> 12)) ||
11247
0
                s[1] != (0x80 | ((ch >> 6) & 0x3f)) ||
11248
0
                s[2] != (0x80 | (ch & 0x3f)))
11249
0
            {
11250
0
                return 0;
11251
0
            }
11252
0
            s += 3;
11253
0
        }
11254
0
        else {
11255
0
            assert(ch <= MAX_UNICODE);
11256
0
            if ((ends - s) < 4 ||
11257
0
                s[0] != (0xf0 | (ch >> 18)) ||
11258
0
                s[1] != (0x80 | ((ch >> 12) & 0x3f)) ||
11259
0
                s[2] != (0x80 | ((ch >> 6) & 0x3f)) ||
11260
0
                s[3] != (0x80 | (ch & 0x3f)))
11261
0
            {
11262
0
                return 0;
11263
0
            }
11264
0
            s += 4;
11265
0
        }
11266
0
    }
11267
0
    return s == ends;
11268
0
}
11269
11270
int
11271
_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11272
6.71M
{
11273
6.71M
    size_t len;
11274
6.71M
    assert(_PyUnicode_CHECK(unicode));
11275
6.71M
    assert(str);
11276
#ifndef NDEBUG
11277
    for (const char *p = str; *p; p++) {
11278
        assert((unsigned char)*p < 128);
11279
    }
11280
#endif
11281
6.71M
    if (!PyUnicode_IS_ASCII(unicode))
11282
149k
        return 0;
11283
6.56M
    len = (size_t)PyUnicode_GET_LENGTH(unicode);
11284
6.56M
    return strlen(str) == len &&
11285
437k
           memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11286
6.71M
}
11287
11288
int
11289
_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11290
0
{
11291
0
    PyObject *right_uni;
11292
11293
0
    assert(_PyUnicode_CHECK(left));
11294
0
    assert(right->string);
11295
#ifndef NDEBUG
11296
    for (const char *p = right->string; *p; p++) {
11297
        assert((unsigned char)*p < 128);
11298
    }
11299
#endif
11300
11301
0
    if (!PyUnicode_IS_ASCII(left))
11302
0
        return 0;
11303
11304
0
    right_uni = _PyUnicode_FromId(right);       /* borrowed */
11305
0
    if (right_uni == NULL) {
11306
        /* memory error or bad data */
11307
0
        PyErr_Clear();
11308
0
        return _PyUnicode_EqualToASCIIString(left, right->string);
11309
0
    }
11310
11311
0
    if (left == right_uni)
11312
0
        return 1;
11313
11314
0
    assert(PyUnicode_CHECK_INTERNED(right_uni));
11315
0
    if (PyUnicode_CHECK_INTERNED(left)) {
11316
0
        return 0;
11317
0
    }
11318
11319
0
    Py_hash_t right_hash = PyUnicode_HASH(right_uni);
11320
0
    assert(right_hash != -1);
11321
0
    Py_hash_t hash = PyUnicode_HASH(left);
11322
0
    if (hash != -1 && hash != right_hash) {
11323
0
        return 0;
11324
0
    }
11325
11326
0
    return unicode_eq(left, right_uni);
11327
0
}
11328
11329
/* Rich comparison of two objects as strings.

   Returns NotImplemented unless both operands are str.  Identical objects
   are answered directly from `op`; Py_EQ / Py_NE go through unicode_eq();
   all other operators go through unicode_compare().  An unknown `op` on
   identical operands calls PyErr_BadArgument() and returns NULL. */
PyObject *
PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
{
    if (!PyUnicode_Check(left) || !PyUnicode_Check(right)) {
        Py_RETURN_NOTIMPLEMENTED;
    }

    if (left == right) {
        /* a string is equal to itself */
        if (op == Py_EQ || op == Py_LE || op == Py_GE) {
            Py_RETURN_TRUE;
        }
        if (op == Py_NE || op == Py_LT || op == Py_GT) {
            Py_RETURN_FALSE;
        }
        PyErr_BadArgument();
        return NULL;
    }

    if (op == Py_EQ || op == Py_NE) {
        /* Flip the equality result when the caller asked for "!=". */
        int answer = unicode_eq(left, right);
        answer ^= (op == Py_NE);
        return PyBool_FromLong(answer);
    }

    int order = unicode_compare(left, right);
    Py_RETURN_RICHCOMPARE(order, 0, op);
}
11363
11364
int
11365
PyUnicode_Contains(PyObject *str, PyObject *substr)
11366
91.7M
{
11367
91.7M
    int kind1, kind2;
11368
91.7M
    const void *buf1, *buf2;
11369
91.7M
    Py_ssize_t len1, len2;
11370
91.7M
    int result;
11371
11372
91.7M
    if (!PyUnicode_Check(substr)) {
11373
0
        PyErr_Format(PyExc_TypeError,
11374
0
                     "'in <string>' requires string as left operand, not %.100s",
11375
0
                     Py_TYPE(substr)->tp_name);
11376
0
        return -1;
11377
0
    }
11378
91.7M
    if (ensure_unicode(str) < 0)
11379
0
        return -1;
11380
11381
91.7M
    kind1 = PyUnicode_KIND(str);
11382
91.7M
    kind2 = PyUnicode_KIND(substr);
11383
91.7M
    if (kind1 < kind2)
11384
3.65M
        return 0;
11385
88.1M
    len1 = PyUnicode_GET_LENGTH(str);
11386
88.1M
    len2 = PyUnicode_GET_LENGTH(substr);
11387
88.1M
    if (len1 < len2)
11388
5.65M
        return 0;
11389
82.4M
    buf1 = PyUnicode_DATA(str);
11390
82.4M
    buf2 = PyUnicode_DATA(substr);
11391
82.4M
    if (len2 == 1) {
11392
82.4M
        Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11393
82.4M
        result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
11394
82.4M
        return result;
11395
82.4M
    }
11396
39.0k
    if (kind2 != kind1) {
11397
21.8k
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
11398
21.8k
        if (!buf2)
11399
0
            return -1;
11400
21.8k
    }
11401
11402
39.0k
    switch (kind1) {
11403
17.1k
    case PyUnicode_1BYTE_KIND:
11404
17.1k
        result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11405
17.1k
        break;
11406
17.1k
    case PyUnicode_2BYTE_KIND:
11407
17.1k
        result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11408
17.1k
        break;
11409
4.74k
    case PyUnicode_4BYTE_KIND:
11410
4.74k
        result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11411
4.74k
        break;
11412
0
    default:
11413
0
        Py_UNREACHABLE();
11414
39.0k
    }
11415
11416
39.0k
    assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
11417
39.0k
    if (kind2 != kind1)
11418
21.8k
        PyMem_Free((void *)buf2);
11419
11420
39.0k
    return result;
11421
39.0k
}
11422
11423
/* Concat to string or Unicode object giving a new Unicode object. */
11424
11425
/* Concatenate two str objects into a new object.

   Returns NULL with an exception set when ensure_unicode() fails on
   `left`, when `right` is not a str (TypeError), when the combined length
   would exceed PY_SSIZE_T_MAX (OverflowError), or when allocation fails.
   If either operand is the empty-string singleton, the result is
   PyUnicode_FromObject() of the other operand. */
PyObject *
PyUnicode_Concat(PyObject *left, PyObject *right)
{
    if (ensure_unicode(left) < 0) {
        return NULL;
    }

    if (!PyUnicode_Check(right)) {
        PyErr_Format(PyExc_TypeError,
            "can only concatenate str (not \"%.200s\") to str",
            Py_TYPE(right)->tp_name);
        return NULL;
    }

    /* Shortcuts */
    PyObject *empty = unicode_get_empty();  // Borrowed reference
    if (left == empty) {
        return PyUnicode_FromObject(right);
    }
    if (right == empty) {
        return PyUnicode_FromObject(left);
    }

    const Py_ssize_t left_len = PyUnicode_GET_LENGTH(left);
    const Py_ssize_t right_len = PyUnicode_GET_LENGTH(right);
    /* Check before adding so the sum cannot overflow. */
    if (left_len > PY_SSIZE_T_MAX - right_len) {
        PyErr_SetString(PyExc_OverflowError,
                        "strings are too large to concat");
        return NULL;
    }
    const Py_ssize_t total_len = left_len + right_len;

    /* The result is allocated for the larger of the two max char values. */
    const Py_UCS4 left_max = PyUnicode_MAX_CHAR_VALUE(left);
    const Py_UCS4 right_max = PyUnicode_MAX_CHAR_VALUE(right);
    const Py_UCS4 maxchar = Py_MAX(left_max, right_max);

    /* Concat the two Unicode strings */
    PyObject *joined = PyUnicode_New(total_len, maxchar);
    if (joined == NULL) {
        return NULL;
    }
    _PyUnicode_FastCopyCharacters(joined, 0, left, 0, left_len);
    _PyUnicode_FastCopyCharacters(joined, left_len, right, 0, right_len);
    assert(_PyUnicode_CheckConsistency(joined, 1));
    return joined;
}
11473
11474
void
11475
PyUnicode_Append(PyObject **p_left, PyObject *right)
11476
1.43M
{
11477
1.43M
    PyObject *left, *res;
11478
1.43M
    Py_UCS4 maxchar, maxchar2;
11479
1.43M
    Py_ssize_t left_len, right_len, new_len;
11480
11481
1.43M
    if (p_left == NULL) {
11482
0
        if (!PyErr_Occurred())
11483
0
            PyErr_BadInternalCall();
11484
0
        return;
11485
0
    }
11486
1.43M
    left = *p_left;
11487
1.43M
    if (right == NULL || left == NULL
11488
1.43M
        || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
11489
0
        if (!PyErr_Occurred())
11490
0
            PyErr_BadInternalCall();
11491
0
        goto error;
11492
0
    }
11493
11494
    /* Shortcuts */
11495
1.43M
    PyObject *empty = unicode_get_empty();  // Borrowed reference
11496
1.43M
    if (left == empty) {
11497
474k
        Py_DECREF(left);
11498
474k
        *p_left = Py_NewRef(right);
11499
474k
        return;
11500
474k
    }
11501
961k
    if (right == empty) {
11502
0
        return;
11503
0
    }
11504
11505
961k
    left_len = PyUnicode_GET_LENGTH(left);
11506
961k
    right_len = PyUnicode_GET_LENGTH(right);
11507
961k
    if (left_len > PY_SSIZE_T_MAX - right_len) {
11508
0
        PyErr_SetString(PyExc_OverflowError,
11509
0
                        "strings are too large to concat");
11510
0
        goto error;
11511
0
    }
11512
961k
    new_len = left_len + right_len;
11513
11514
961k
    if (_PyUnicode_IsModifiable(left)
11515
961k
        && PyUnicode_CheckExact(right)
11516
961k
        && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
11517
        /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11518
           to change the structure size, but characters are stored just after
11519
           the structure, and so it requires to move all characters which is
11520
           not so different than duplicating the string. */
11521
915k
        && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11522
915k
    {
11523
        /* append inplace */
11524
915k
        if (unicode_resize(p_left, new_len) != 0)
11525
0
            goto error;
11526
11527
        /* copy 'right' into the newly allocated area of 'left' */
11528
915k
        _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
11529
915k
    }
11530
45.8k
    else {
11531
45.8k
        maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11532
45.8k
        maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11533
45.8k
        maxchar = Py_MAX(maxchar, maxchar2);
11534
11535
        /* Concat the two Unicode strings */
11536
45.8k
        res = PyUnicode_New(new_len, maxchar);
11537
45.8k
        if (res == NULL)
11538
0
            goto error;
11539
45.8k
        _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11540
45.8k
        _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
11541
45.8k
        Py_DECREF(left);
11542
45.8k
        *p_left = res;
11543
45.8k
    }
11544
961k
    assert(_PyUnicode_CheckConsistency(*p_left, 1));
11545
961k
    return;
11546
11547
0
error:
11548
0
    Py_CLEAR(*p_left);
11549
0
}
11550
11551
void
11552
PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11553
0
{
11554
0
    PyUnicode_Append(pleft, right);
11555
0
    Py_XDECREF(right);
11556
0
}
11557
11558
/*[clinic input]
11559
@permit_long_summary
11560
@text_signature "($self, sub[, start[, end]], /)"
11561
str.count as unicode_count -> Py_ssize_t
11562
11563
    self as str: self
11564
    sub as substr: unicode
11565
    start: slice_index(accept={int, NoneType}, c_default='0') = None
11566
    end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
11567
    /
11568
11569
Return the number of non-overlapping occurrences of substring sub in string S[start:end].
11570
11571
Optional arguments start and end are interpreted as in slice notation.
11572
[clinic start generated code]*/
11573
11574
static Py_ssize_t
11575
unicode_count_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
11576
                   Py_ssize_t end)
11577
/*[clinic end generated code: output=8fcc3aef0b18edbf input=8590716ee228b935]*/
11578
20.6M
{
11579
20.6M
    assert(PyUnicode_Check(str));
11580
20.6M
    assert(PyUnicode_Check(substr));
11581
11582
20.6M
    Py_ssize_t result;
11583
20.6M
    int kind1, kind2;
11584
20.6M
    const void *buf1 = NULL, *buf2 = NULL;
11585
20.6M
    Py_ssize_t len1, len2;
11586
11587
20.6M
    kind1 = PyUnicode_KIND(str);
11588
20.6M
    kind2 = PyUnicode_KIND(substr);
11589
20.6M
    if (kind1 < kind2)
11590
0
        return 0;
11591
11592
20.6M
    len1 = PyUnicode_GET_LENGTH(str);
11593
20.6M
    len2 = PyUnicode_GET_LENGTH(substr);
11594
20.6M
    ADJUST_INDICES(start, end, len1);
11595
20.6M
    if (end - start < len2)
11596
87.7k
        return 0;
11597
11598
20.5M
    buf1 = PyUnicode_DATA(str);
11599
20.5M
    buf2 = PyUnicode_DATA(substr);
11600
20.5M
    if (kind2 != kind1) {
11601
4.44M
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
11602
4.44M
        if (!buf2)
11603
0
            goto onError;
11604
4.44M
    }
11605
11606
    // We don't reuse `anylib_count` here because of the explicit casts.
11607
20.5M
    switch (kind1) {
11608
16.1M
    case PyUnicode_1BYTE_KIND:
11609
16.1M
        result = ucs1lib_count(
11610
16.1M
            ((const Py_UCS1*)buf1) + start, end - start,
11611
16.1M
            buf2, len2, PY_SSIZE_T_MAX
11612
16.1M
            );
11613
16.1M
        break;
11614
3.61M
    case PyUnicode_2BYTE_KIND:
11615
3.61M
        result = ucs2lib_count(
11616
3.61M
            ((const Py_UCS2*)buf1) + start, end - start,
11617
3.61M
            buf2, len2, PY_SSIZE_T_MAX
11618
3.61M
            );
11619
3.61M
        break;
11620
834k
    case PyUnicode_4BYTE_KIND:
11621
834k
        result = ucs4lib_count(
11622
834k
            ((const Py_UCS4*)buf1) + start, end - start,
11623
834k
            buf2, len2, PY_SSIZE_T_MAX
11624
834k
            );
11625
834k
        break;
11626
0
    default:
11627
0
        Py_UNREACHABLE();
11628
20.5M
    }
11629
11630
20.5M
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
11631
20.5M
    if (kind2 != kind1)
11632
4.44M
        PyMem_Free((void *)buf2);
11633
11634
20.5M
    return result;
11635
0
  onError:
11636
0
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
11637
0
    if (kind2 != kind1)
11638
0
        PyMem_Free((void *)buf2);
11639
0
    return -1;
11640
20.5M
}
11641
11642
/*[clinic input]
11643
str.encode as unicode_encode
11644
11645
    encoding: str(c_default="NULL") = 'utf-8'
11646
        The encoding in which to encode the string.
11647
    errors: str(c_default="NULL") = 'strict'
11648
        The error handling scheme to use for encoding errors.
11649
        The default is 'strict' meaning that encoding errors raise a
11650
        UnicodeEncodeError.  Other possible values are 'ignore', 'replace' and
11651
        'xmlcharrefreplace' as well as any other name registered with
11652
        codecs.register_error that can handle UnicodeEncodeErrors.
11653
11654
Encode the string using the codec registered for encoding.
11655
[clinic start generated code]*/
11656
11657
static PyObject *
unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
{
    /* Thin wrapper: all work is delegated to PyUnicode_AsEncodedString(),
       whose result (or NULL) is returned unchanged. */
    PyObject *encoded = PyUnicode_AsEncodedString(self, encoding, errors);
    return encoded;
}
11663
11664
/*[clinic input]
11665
str.expandtabs as unicode_expandtabs
11666
11667
    tabsize: int = 8
11668
11669
Return a copy where all tab characters are expanded using spaces.
11670
11671
If tabsize is not given, a tab size of 8 characters is assumed.
11672
[clinic start generated code]*/
11673
11674
static PyObject *
unicode_expandtabs_impl(PyObject *self, int tabsize)
/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
{
    /* Two passes over the input: the first computes the output length
       (and detects overflow), the second writes the expanded text.  When
       the string has no tab at all, the result comes from
       unicode_result_unchanged().  With tabsize <= 0, tabs are dropped. */
    PyObject *expanded;
    void *dest_data;
    Py_ssize_t pos, out_len, column, pad;
    Py_UCS4 ch;

    const Py_ssize_t src_len = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *src_data = PyUnicode_DATA(self);
    int saw_tab = 0;

    /* First pass: determine size of output string */
    out_len = 0;
    column = 0;
    for (pos = 0; pos < src_len; pos++) {
        ch = PyUnicode_READ(kind, src_data, pos);
        if (ch == '\t') {
            saw_tab = 1;
            if (tabsize > 0) {
                pad = tabsize - (column % tabsize); /* cannot overflow */
                if (out_len > PY_SSIZE_T_MAX - pad) {
                    goto overflow;
                }
                column += pad;
                out_len += pad;
            }
            continue;
        }

        if (out_len > PY_SSIZE_T_MAX - 1) {
            goto overflow;
        }
        column++;
        out_len++;
        /* A line break restarts the column count. */
        if (ch == '\n' || ch == '\r') {
            column = 0;
        }
    }
    if (!saw_tab) {
        return unicode_result_unchanged(self);
    }

    /* Second pass: create output string and fill it */
    expanded = PyUnicode_New(out_len, PyUnicode_MAX_CHAR_VALUE(self));
    if (!expanded) {
        return NULL;
    }
    dest_data = PyUnicode_DATA(expanded);

    out_len = 0;
    column = 0;
    for (pos = 0; pos < src_len; pos++) {
        ch = PyUnicode_READ(kind, src_data, pos);
        if (ch == '\t') {
            if (tabsize > 0) {
                pad = tabsize - (column % tabsize);
                column += pad;
                _PyUnicode_Fill(kind, dest_data, ' ', out_len, pad);
                out_len += pad;
            }
            continue;
        }

        column++;
        PyUnicode_WRITE(kind, dest_data, out_len, ch);
        out_len++;
        if (ch == '\n' || ch == '\r') {
            column = 0;
        }
    }
    assert (out_len == PyUnicode_GET_LENGTH(expanded));
    return unicode_result(expanded);

  overflow:
    PyErr_SetString(PyExc_OverflowError, "new string is too long");
    return NULL;
}
11749
11750
/*[clinic input]
11751
@permit_long_summary
11752
str.find as unicode_find = str.count
11753
11754
Return the lowest index in S where substring sub is found, such that sub is contained within S[start:end].
11755
11756
Optional arguments start and end are interpreted as in slice notation.
11757
Return -1 on failure.
11758
[clinic start generated code]*/
11759
11760
static Py_ssize_t
11761
unicode_find_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
11762
                  Py_ssize_t end)
11763
/*[clinic end generated code: output=51dbe6255712e278 input=3a9d650fe4c24695]*/
11764
16.4M
{
11765
16.4M
    Py_ssize_t result = any_find_slice(str, substr, start, end, 1);
11766
16.4M
    if (result < 0) {
11767
228k
        return -1;
11768
228k
    }
11769
16.2M
    return result;
11770
16.4M
}
11771
11772
static PyObject *
11773
unicode_getitem(PyObject *self, Py_ssize_t index)
11774
55.3M
{
11775
55.3M
    const void *data;
11776
55.3M
    int kind;
11777
55.3M
    Py_UCS4 ch;
11778
11779
55.3M
    if (!PyUnicode_Check(self)) {
11780
0
        PyErr_BadArgument();
11781
0
        return NULL;
11782
0
    }
11783
55.3M
    if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11784
387
        PyErr_SetString(PyExc_IndexError, "string index out of range");
11785
387
        return NULL;
11786
387
    }
11787
55.3M
    kind = PyUnicode_KIND(self);
11788
55.3M
    data = PyUnicode_DATA(self);
11789
55.3M
    ch = PyUnicode_READ(kind, data, index);
11790
55.3M
    return unicode_char(ch);
11791
55.3M
}
11792
11793
/* Believe it or not, this produces the same value for ASCII strings
11794
   as bytes_hash(). */
11795
static Py_hash_t
11796
unicode_hash(PyObject *self)
11797
45.2M
{
11798
45.2M
    Py_uhash_t x;  /* Unsigned for defined overflow behavior. */
11799
11800
#ifdef Py_DEBUG
11801
    assert(_Py_HashSecret_Initialized);
11802
#endif
11803
45.2M
    Py_hash_t hash = PyUnicode_HASH(self);
11804
45.2M
    if (hash != -1) {
11805
252k
        return hash;
11806
252k
    }
11807
45.0M
    x = Py_HashBuffer(PyUnicode_DATA(self),
11808
45.0M
                      PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
11809
11810
45.0M
    PyUnicode_SET_HASH(self, x);
11811
45.0M
    return x;
11812
45.2M
}
11813
11814
/*[clinic input]
11815
@permit_long_summary
11816
str.index as unicode_index = str.count
11817
11818
Return the lowest index in S where substring sub is found, such that sub is contained within S[start:end].
11819
11820
Optional arguments start and end are interpreted as in slice notation.
11821
Raises ValueError when the substring is not found.
11822
[clinic start generated code]*/
11823
11824
static Py_ssize_t
11825
unicode_index_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
11826
                   Py_ssize_t end)
11827
/*[clinic end generated code: output=77558288837cdf40 input=ae5e48f69ed75b06]*/
11828
592k
{
11829
592k
    Py_ssize_t result = any_find_slice(str, substr, start, end, 1);
11830
592k
    if (result == -1) {
11831
0
        PyErr_SetString(PyExc_ValueError, "substring not found");
11832
0
    }
11833
592k
    else if (result < 0) {
11834
0
        return -1;
11835
0
    }
11836
592k
    return result;
11837
592k
}
11838
11839
/*[clinic input]
11840
str.isascii as unicode_isascii
11841
11842
Return True if all characters in the string are ASCII, False otherwise.
11843
11844
ASCII characters have code points in the range U+0000-U+007F.
11845
Empty string is ASCII too.
11846
[clinic start generated code]*/
11847
11848
static PyObject *
unicode_isascii_impl(PyObject *self)
/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
{
    /* The answer is read from PyUnicode_IS_ASCII(); no characters are
       scanned here. */
    const long is_ascii = PyUnicode_IS_ASCII(self);
    return PyBool_FromLong(is_ascii);
}
11854
11855
/*[clinic input]
11856
@permit_long_docstring_body
11857
str.islower as unicode_islower
11858
11859
Return True if the string is a lowercase string, False otherwise.
11860
11861
A string is lowercase if all cased characters in the string are lowercase and
11862
there is at least one cased character in the string.
11863
[clinic start generated code]*/
11864
11865
static PyObject *
unicode_islower_impl(PyObject *self)
/*[clinic end generated code: output=dbd41995bd005b81 input=c6fc0295241a1aaa]*/
{
    /* True when no character is uppercase or titlecase and at least one
       is lowercase.  The empty string yields False. */
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (length == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
    }

    int seen_lower = 0;
    for (Py_ssize_t pos = 0; pos < length; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
            Py_RETURN_FALSE;
        }
        /* Once a lowercase character has been seen, skip the lookup. */
        if (!seen_lower && Py_UNICODE_ISLOWER(ch)) {
            seen_lower = 1;
        }
    }
    return PyBool_FromLong(seen_lower);
}
11898
11899
/*[clinic input]
11900
@permit_long_docstring_body
11901
str.isupper as unicode_isupper
11902
11903
Return True if the string is an uppercase string, False otherwise.
11904
11905
A string is uppercase if all cased characters in the string are uppercase and
11906
there is at least one cased character in the string.
11907
[clinic start generated code]*/
11908
11909
static PyObject *
unicode_isupper_impl(PyObject *self)
/*[clinic end generated code: output=049209c8e7f15f59 input=8d5cb33e67efde72]*/
{
    /* True when no character is lowercase or titlecase and at least one
       is uppercase.  The empty string yields False. */
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (length == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
    }

    int seen_upper = 0;
    for (Py_ssize_t pos = 0; pos < length; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) {
            Py_RETURN_FALSE;
        }
        /* Once an uppercase character has been seen, skip the lookup. */
        if (!seen_upper && Py_UNICODE_ISUPPER(ch)) {
            seen_upper = 1;
        }
    }
    return PyBool_FromLong(seen_upper);
}
11942
11943
/*[clinic input]
11944
str.istitle as unicode_istitle
11945
11946
Return True if the string is a title-cased string, False otherwise.
11947
11948
In a title-cased string, upper- and title-case characters may only
11949
follow uncased characters and lowercase characters only cased ones.
11950
[clinic start generated code]*/
11951
11952
static PyObject *
11953
unicode_istitle_impl(PyObject *self)
11954
/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
11955
0
{
11956
0
    Py_ssize_t i, length;
11957
0
    int kind;
11958
0
    const void *data;
11959
0
    int cased, previous_is_cased;
11960
11961
0
    length = PyUnicode_GET_LENGTH(self);
11962
0
    kind = PyUnicode_KIND(self);
11963
0
    data = PyUnicode_DATA(self);
11964
11965
    /* Shortcut for single character strings */
11966
0
    if (length == 1) {
11967
0
        Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11968
0
        return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11969
0
                               (Py_UNICODE_ISUPPER(ch) != 0));
11970
0
    }
11971
11972
    /* Special case for empty strings */
11973
0
    if (length == 0)
11974
0
        Py_RETURN_FALSE;
11975
11976
0
    cased = 0;
11977
0
    previous_is_cased = 0;
11978
0
    for (i = 0; i < length; i++) {
11979
0
        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11980
11981
0
        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11982
0
            if (previous_is_cased)
11983
0
                Py_RETURN_FALSE;
11984
0
            previous_is_cased = 1;
11985
0
            cased = 1;
11986
0
        }
11987
0
        else if (Py_UNICODE_ISLOWER(ch)) {
11988
0
            if (!previous_is_cased)
11989
0
                Py_RETURN_FALSE;
11990
0
            previous_is_cased = 1;
11991
0
            cased = 1;
11992
0
        }
11993
0
        else
11994
0
            previous_is_cased = 0;
11995
0
    }
11996
0
    return PyBool_FromLong(cased);
11997
0
}
11998
11999
/*[clinic input]
12000
@permit_long_docstring_body
12001
str.isspace as unicode_isspace
12002
12003
Return True if the string is a whitespace string, False otherwise.
12004
12005
A string is whitespace if all characters in the string are whitespace and there
12006
is at least one character in the string.
12007
[clinic start generated code]*/
12008
12009
static PyObject *
12010
unicode_isspace_impl(PyObject *self)
12011
/*[clinic end generated code: output=163a63bfa08ac2b9 input=44fe05e248c6e159]*/
12012
21.1M
{
12013
21.1M
    Py_ssize_t i, length;
12014
21.1M
    int kind;
12015
21.1M
    const void *data;
12016
12017
21.1M
    length = PyUnicode_GET_LENGTH(self);
12018
21.1M
    kind = PyUnicode_KIND(self);
12019
21.1M
    data = PyUnicode_DATA(self);
12020
12021
    /* Shortcut for single character strings */
12022
21.1M
    if (length == 1)
12023
21.1M
        return PyBool_FromLong(
12024
21.1M
            Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
12025
12026
    /* Special case for empty strings */
12027
0
    if (length == 0)
12028
0
        Py_RETURN_FALSE;
12029
12030
0
    for (i = 0; i < length; i++) {
12031
0
        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12032
0
        if (!Py_UNICODE_ISSPACE(ch))
12033
0
            Py_RETURN_FALSE;
12034
0
    }
12035
0
    Py_RETURN_TRUE;
12036
0
}
12037
12038
/*[clinic input]
12039
@permit_long_docstring_body
12040
str.isalpha as unicode_isalpha
12041
12042
Return True if the string is an alphabetic string, False otherwise.
12043
12044
A string is alphabetic if all characters in the string are alphabetic and there
12045
is at least one character in the string.
12046
[clinic start generated code]*/
12047
12048
static PyObject *
12049
unicode_isalpha_impl(PyObject *self)
12050
/*[clinic end generated code: output=cc81b9ac3883ec4f input=c233000624a56e0d]*/
12051
0
{
12052
0
    Py_ssize_t i, length;
12053
0
    int kind;
12054
0
    const void *data;
12055
12056
0
    length = PyUnicode_GET_LENGTH(self);
12057
0
    kind = PyUnicode_KIND(self);
12058
0
    data = PyUnicode_DATA(self);
12059
12060
    /* Shortcut for single character strings */
12061
0
    if (length == 1)
12062
0
        return PyBool_FromLong(
12063
0
            Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
12064
12065
    /* Special case for empty strings */
12066
0
    if (length == 0)
12067
0
        Py_RETURN_FALSE;
12068
12069
0
    for (i = 0; i < length; i++) {
12070
0
        if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
12071
0
            Py_RETURN_FALSE;
12072
0
    }
12073
0
    Py_RETURN_TRUE;
12074
0
}
12075
12076
/*[clinic input]
12077
@permit_long_docstring_body
12078
str.isalnum as unicode_isalnum
12079
12080
Return True if the string is an alpha-numeric string, False otherwise.
12081
12082
A string is alpha-numeric if all characters in the string are alpha-numeric and
12083
there is at least one character in the string.
12084
[clinic start generated code]*/
12085
12086
static PyObject *
12087
unicode_isalnum_impl(PyObject *self)
12088
/*[clinic end generated code: output=a5a23490ffc3660c input=5d63ba9c9bafdb6b]*/
12089
23.8M
{
12090
23.8M
    int kind;
12091
23.8M
    const void *data;
12092
23.8M
    Py_ssize_t len, i;
12093
12094
23.8M
    kind = PyUnicode_KIND(self);
12095
23.8M
    data = PyUnicode_DATA(self);
12096
23.8M
    len = PyUnicode_GET_LENGTH(self);
12097
12098
    /* Shortcut for single character strings */
12099
23.8M
    if (len == 1) {
12100
23.8M
        const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12101
23.8M
        return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12102
23.8M
    }
12103
12104
    /* Special case for empty strings */
12105
0
    if (len == 0)
12106
0
        Py_RETURN_FALSE;
12107
12108
0
    for (i = 0; i < len; i++) {
12109
0
        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12110
0
        if (!Py_UNICODE_ISALNUM(ch))
12111
0
            Py_RETURN_FALSE;
12112
0
    }
12113
0
    Py_RETURN_TRUE;
12114
0
}
12115
12116
/*[clinic input]
12117
@permit_long_docstring_body
12118
str.isdecimal as unicode_isdecimal
12119
12120
Return True if the string is a decimal string, False otherwise.
12121
12122
A string is a decimal string if all characters in the string are decimal and
12123
there is at least one character in the string.
12124
[clinic start generated code]*/
12125
12126
static PyObject *
12127
unicode_isdecimal_impl(PyObject *self)
12128
/*[clinic end generated code: output=fb2dcdb62d3fc548 input=8e84a58b414935a3]*/
12129
0
{
12130
0
    Py_ssize_t i, length;
12131
0
    int kind;
12132
0
    const void *data;
12133
12134
0
    length = PyUnicode_GET_LENGTH(self);
12135
0
    kind = PyUnicode_KIND(self);
12136
0
    data = PyUnicode_DATA(self);
12137
12138
    /* Shortcut for single character strings */
12139
0
    if (length == 1)
12140
0
        return PyBool_FromLong(
12141
0
            Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
12142
12143
    /* Special case for empty strings */
12144
0
    if (length == 0)
12145
0
        Py_RETURN_FALSE;
12146
12147
0
    for (i = 0; i < length; i++) {
12148
0
        if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
12149
0
            Py_RETURN_FALSE;
12150
0
    }
12151
0
    Py_RETURN_TRUE;
12152
0
}
12153
12154
/*[clinic input]
12155
@permit_long_docstring_body
12156
str.isdigit as unicode_isdigit
12157
12158
Return True if the string is a digit string, False otherwise.
12159
12160
A string is a digit string if all characters in the string are digits and there
12161
is at least one character in the string.
12162
[clinic start generated code]*/
12163
12164
static PyObject *
12165
unicode_isdigit_impl(PyObject *self)
12166
/*[clinic end generated code: output=10a6985311da6858 input=99e284affb54d4a0]*/
12167
1.91M
{
12168
1.91M
    Py_ssize_t i, length;
12169
1.91M
    int kind;
12170
1.91M
    const void *data;
12171
12172
1.91M
    length = PyUnicode_GET_LENGTH(self);
12173
1.91M
    kind = PyUnicode_KIND(self);
12174
1.91M
    data = PyUnicode_DATA(self);
12175
12176
    /* Shortcut for single character strings */
12177
1.91M
    if (length == 1) {
12178
1.91M
        const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12179
1.91M
        return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12180
1.91M
    }
12181
12182
    /* Special case for empty strings */
12183
306
    if (length == 0)
12184
0
        Py_RETURN_FALSE;
12185
12186
1.09k
    for (i = 0; i < length; i++) {
12187
786
        if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
12188
0
            Py_RETURN_FALSE;
12189
786
    }
12190
306
    Py_RETURN_TRUE;
12191
306
}
12192
12193
/*[clinic input]
12194
@permit_long_docstring_body
12195
str.isnumeric as unicode_isnumeric
12196
12197
Return True if the string is a numeric string, False otherwise.
12198
12199
A string is numeric if all characters in the string are numeric and there is at
12200
least one character in the string.
12201
[clinic start generated code]*/
12202
12203
static PyObject *
12204
unicode_isnumeric_impl(PyObject *self)
12205
/*[clinic end generated code: output=9172a32d9013051a input=e9f5b6b8b29b0ee6]*/
12206
0
{
12207
0
    Py_ssize_t i, length;
12208
0
    int kind;
12209
0
    const void *data;
12210
12211
0
    length = PyUnicode_GET_LENGTH(self);
12212
0
    kind = PyUnicode_KIND(self);
12213
0
    data = PyUnicode_DATA(self);
12214
12215
    /* Shortcut for single character strings */
12216
0
    if (length == 1)
12217
0
        return PyBool_FromLong(
12218
0
            Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
12219
12220
    /* Special case for empty strings */
12221
0
    if (length == 0)
12222
0
        Py_RETURN_FALSE;
12223
12224
0
    for (i = 0; i < length; i++) {
12225
0
        if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
12226
0
            Py_RETURN_FALSE;
12227
0
    }
12228
0
    Py_RETURN_TRUE;
12229
0
}
12230
12231
Py_ssize_t
12232
_PyUnicode_ScanIdentifier(PyObject *self)
12233
14.4k
{
12234
14.4k
    Py_ssize_t i;
12235
14.4k
    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12236
14.4k
    if (len == 0) {
12237
        /* an empty string is not a valid identifier */
12238
0
        return 0;
12239
0
    }
12240
12241
14.4k
    int kind = PyUnicode_KIND(self);
12242
14.4k
    const void *data = PyUnicode_DATA(self);
12243
14.4k
    Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12244
    /* PEP 3131 says that the first character must be in
12245
       XID_Start and subsequent characters in XID_Continue,
12246
       and for the ASCII range, the 2.x rules apply (i.e
12247
       start with letters and underscore, continue with
12248
       letters, digits, underscore). However, given the current
12249
       definition of XID_Start and XID_Continue, it is sufficient
12250
       to check just for these, except that _ must be allowed
12251
       as starting an identifier.  */
12252
14.4k
    if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12253
493
        return 0;
12254
493
    }
12255
12256
52.4k
    for (i = 1; i < len; i++) {
12257
38.6k
        ch = PyUnicode_READ(kind, data, i);
12258
38.6k
        if (!_PyUnicode_IsXidContinue(ch)) {
12259
254
            return i;
12260
254
        }
12261
38.6k
    }
12262
13.7k
    return i;
12263
13.9k
}
12264
12265
int
12266
PyUnicode_IsIdentifier(PyObject *self)
12267
958
{
12268
958
    Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
12269
958
    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12270
    /* an empty string is not a valid identifier */
12271
958
    return len && i == len;
12272
958
}
12273
12274
/*[clinic input]
12275
@permit_long_docstring_body
12276
str.isidentifier as unicode_isidentifier
12277
12278
Return True if the string is a valid Python identifier, False otherwise.
12279
12280
Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
12281
such as "def" or "class".
12282
[clinic start generated code]*/
12283
12284
static PyObject *
12285
unicode_isidentifier_impl(PyObject *self)
12286
/*[clinic end generated code: output=fe585a9666572905 input=86315dd889d7bd04]*/
12287
496
{
12288
496
    return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12289
496
}
12290
12291
/*[clinic input]
12292
@permit_long_summary
12293
str.isprintable as unicode_isprintable
12294
12295
Return True if all characters in the string are printable, False otherwise.
12296
12297
A character is printable if repr() may use it in its output.
12298
[clinic start generated code]*/
12299
12300
static PyObject *
12301
unicode_isprintable_impl(PyObject *self)
12302
/*[clinic end generated code: output=3ab9626cd32dd1a0 input=18345ba847084ec5]*/
12303
1.12M
{
12304
1.12M
    Py_ssize_t i, length;
12305
1.12M
    int kind;
12306
1.12M
    const void *data;
12307
12308
1.12M
    length = PyUnicode_GET_LENGTH(self);
12309
1.12M
    kind = PyUnicode_KIND(self);
12310
1.12M
    data = PyUnicode_DATA(self);
12311
12312
    /* Shortcut for single character strings */
12313
1.12M
    if (length == 1)
12314
1.12M
        return PyBool_FromLong(
12315
1.12M
            Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
12316
12317
0
    for (i = 0; i < length; i++) {
12318
0
        if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
12319
0
            Py_RETURN_FALSE;
12320
0
        }
12321
0
    }
12322
0
    Py_RETURN_TRUE;
12323
0
}
12324
12325
/*[clinic input]
12326
@permit_long_docstring_body
12327
str.join as unicode_join
12328
12329
    iterable: object
12330
    /
12331
12332
Concatenate any number of strings.
12333
12334
The string whose method is called is inserted in between each given string.
12335
The result is returned as a new string.
12336
12337
Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12338
[clinic start generated code]*/
12339
12340
static PyObject *
12341
unicode_join(PyObject *self, PyObject *iterable)
12342
/*[clinic end generated code: output=6857e7cecfe7bf98 input=bac724ed412ef3f8]*/
12343
20.7M
{
12344
20.7M
    return PyUnicode_Join(self, iterable);
12345
20.7M
}
12346
12347
static Py_ssize_t
12348
unicode_length(PyObject *self)
12349
41.6M
{
12350
41.6M
    return PyUnicode_GET_LENGTH(self);
12351
41.6M
}
12352
12353
/*[clinic input]
12354
str.ljust as unicode_ljust
12355
12356
    width: Py_ssize_t
12357
    fillchar: Py_UCS4 = ' '
12358
    /
12359
12360
Return a left-justified string of length width.
12361
12362
Padding is done using the specified fill character (default is a space).
12363
[clinic start generated code]*/
12364
12365
static PyObject *
12366
unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12367
/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
12368
0
{
12369
0
    if (PyUnicode_GET_LENGTH(self) >= width)
12370
0
        return unicode_result_unchanged(self);
12371
12372
0
    return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
12373
0
}
12374
12375
/*[clinic input]
12376
str.lower as unicode_lower
12377
12378
Return a copy of the string converted to lowercase.
12379
[clinic start generated code]*/
12380
12381
static PyObject *
12382
unicode_lower_impl(PyObject *self)
12383
/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
12384
106M
{
12385
106M
    if (PyUnicode_IS_ASCII(self))
12386
80.1M
        return ascii_upper_or_lower(self, 1);
12387
26.6M
    return case_operation(self, do_lower);
12388
106M
}
12389
12390
60.0M
#define LEFTSTRIP 0
12391
77.7M
#define RIGHTSTRIP 1
12392
37.3M
#define BOTHSTRIP 2
12393
12394
/* Arrays indexed by above */
12395
static const char *stripfuncnames[] = {"lstrip", "rstrip", "strip"};
12396
12397
0
#define STRIPNAME(i) (stripfuncnames[i])
12398
12399
/* externally visible for str.strip(unicode) */
12400
PyObject *
12401
_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
12402
8.47M
{
12403
8.47M
    const void *data;
12404
8.47M
    int kind;
12405
8.47M
    Py_ssize_t i, j, len;
12406
8.47M
    BLOOM_MASK sepmask;
12407
8.47M
    Py_ssize_t seplen;
12408
12409
8.47M
    kind = PyUnicode_KIND(self);
12410
8.47M
    data = PyUnicode_DATA(self);
12411
8.47M
    len = PyUnicode_GET_LENGTH(self);
12412
8.47M
    seplen = PyUnicode_GET_LENGTH(sepobj);
12413
8.47M
    sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12414
8.47M
                              PyUnicode_DATA(sepobj),
12415
8.47M
                              seplen);
12416
12417
8.47M
    i = 0;
12418
8.47M
    if (striptype != RIGHTSTRIP) {
12419
428k
        while (i < len) {
12420
425k
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12421
425k
            if (!BLOOM(sepmask, ch))
12422
390k
                break;
12423
35.3k
            if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12424
2.36k
                break;
12425
33.0k
            i++;
12426
33.0k
        }
12427
395k
    }
12428
12429
8.47M
    j = len;
12430
8.47M
    if (striptype != LEFTSTRIP) {
12431
8.08M
        j--;
12432
8.47M
        while (j >= i) {
12433
3.78M
            Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12434
3.78M
            if (!BLOOM(sepmask, ch))
12435
3.36M
                break;
12436
418k
            if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12437
26.7k
                break;
12438
391k
            j--;
12439
391k
        }
12440
12441
8.08M
        j++;
12442
8.08M
    }
12443
12444
8.47M
    return PyUnicode_Substring(self, i, j);
12445
8.47M
}
12446
12447
PyObject*
12448
PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12449
286M
{
12450
286M
    const unsigned char *data;
12451
286M
    int kind;
12452
286M
    Py_ssize_t length;
12453
12454
286M
    length = PyUnicode_GET_LENGTH(self);
12455
286M
    end = Py_MIN(end, length);
12456
12457
286M
    if (start == 0 && end == length)
12458
52.5M
        return unicode_result_unchanged(self);
12459
12460
234M
    if (start < 0 || end < 0) {
12461
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
12462
0
        return NULL;
12463
0
    }
12464
234M
    if (start >= length || end < start)
12465
175k
        _Py_RETURN_UNICODE_EMPTY();
12466
12467
234M
    length = end - start;
12468
234M
    if (PyUnicode_IS_ASCII(self)) {
12469
46.6M
        data = PyUnicode_1BYTE_DATA(self);
12470
46.6M
        return _PyUnicode_FromASCII((const char*)(data + start), length);
12471
46.6M
    }
12472
187M
    else {
12473
187M
        kind = PyUnicode_KIND(self);
12474
187M
        data = PyUnicode_1BYTE_DATA(self);
12475
187M
        return PyUnicode_FromKindAndData(kind,
12476
187M
                                         data + kind * start,
12477
187M
                                         length);
12478
187M
    }
12479
234M
}
12480
12481
static PyObject *
12482
do_strip(PyObject *self, int striptype)
12483
49.8M
{
12484
49.8M
    Py_ssize_t len, i, j;
12485
12486
49.8M
    len = PyUnicode_GET_LENGTH(self);
12487
12488
49.8M
    if (PyUnicode_IS_ASCII(self)) {
12489
41.2M
        const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12490
12491
41.2M
        i = 0;
12492
41.2M
        if (striptype != RIGHTSTRIP) {
12493
31.7M
            while (i < len) {
12494
20.1M
                Py_UCS1 ch = data[i];
12495
20.1M
                if (!_Py_ascii_whitespace[ch])
12496
19.6M
                    break;
12497
456k
                i++;
12498
456k
            }
12499
31.2M
        }
12500
12501
41.2M
        j = len;
12502
41.2M
        if (striptype != LEFTSTRIP) {
12503
40.9M
            j--;
12504
51.6M
            while (j >= i) {
12505
34.4M
                Py_UCS1 ch = data[j];
12506
34.4M
                if (!_Py_ascii_whitespace[ch])
12507
23.7M
                    break;
12508
10.7M
                j--;
12509
10.7M
            }
12510
40.9M
            j++;
12511
40.9M
        }
12512
41.2M
    }
12513
8.67M
    else {
12514
8.67M
        int kind = PyUnicode_KIND(self);
12515
8.67M
        const void *data = PyUnicode_DATA(self);
12516
12517
8.67M
        i = 0;
12518
8.67M
        if (striptype != RIGHTSTRIP) {
12519
8.86M
            while (i < len) {
12520
8.86M
                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12521
8.86M
                if (!Py_UNICODE_ISSPACE(ch))
12522
7.33M
                    break;
12523
1.52M
                i++;
12524
1.52M
            }
12525
7.33M
        }
12526
12527
8.67M
        j = len;
12528
8.67M
        if (striptype != LEFTSTRIP) {
12529
7.74M
            j--;
12530
8.46M
            while (j >= i) {
12531
8.43M
                Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12532
8.43M
                if (!Py_UNICODE_ISSPACE(ch))
12533
7.71M
                    break;
12534
719k
                j--;
12535
719k
            }
12536
7.74M
            j++;
12537
7.74M
        }
12538
8.67M
    }
12539
12540
49.8M
    return PyUnicode_Substring(self, i, j);
12541
49.8M
}
12542
12543
12544
static PyObject *
12545
do_argstrip(PyObject *self, int striptype, PyObject *sep)
12546
58.3M
{
12547
58.3M
    if (sep != Py_None) {
12548
8.47M
        if (PyUnicode_Check(sep))
12549
8.47M
            return _PyUnicode_XStrip(self, striptype, sep);
12550
0
        else {
12551
0
            PyErr_Format(PyExc_TypeError,
12552
0
                         "%s arg must be None or str",
12553
0
                         STRIPNAME(striptype));
12554
0
            return NULL;
12555
0
        }
12556
8.47M
    }
12557
12558
49.8M
    return do_strip(self, striptype);
12559
58.3M
}
12560
12561
12562
/*[clinic input]
12563
@permit_long_summary
12564
str.strip as unicode_strip
12565
12566
    chars: object = None
12567
    /
12568
12569
Return a copy of the string with leading and trailing whitespace removed.
12570
12571
If chars is given and not None, remove characters in chars instead.
12572
[clinic start generated code]*/
12573
12574
static PyObject *
12575
unicode_strip_impl(PyObject *self, PyObject *chars)
12576
/*[clinic end generated code: output=ca19018454345d57 input=8bc6353450345fbd]*/
12577
37.3M
{
12578
37.3M
    return do_argstrip(self, BOTHSTRIP, chars);
12579
37.3M
}
12580
12581
12582
/*[clinic input]
12583
str.lstrip as unicode_lstrip
12584
12585
    chars: object = None
12586
    /
12587
12588
Return a copy of the string with leading whitespace removed.
12589
12590
If chars is given and not None, remove characters in chars instead.
12591
[clinic start generated code]*/
12592
12593
static PyObject *
12594
unicode_lstrip_impl(PyObject *self, PyObject *chars)
12595
/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
12596
1.62M
{
12597
1.62M
    return do_argstrip(self, LEFTSTRIP, chars);
12598
1.62M
}
12599
12600
12601
/*[clinic input]
12602
str.rstrip as unicode_rstrip
12603
12604
    chars: object = None
12605
    /
12606
12607
Return a copy of the string with trailing whitespace removed.
12608
12609
If chars is given and not None, remove characters in chars instead.
12610
[clinic start generated code]*/
12611
12612
static PyObject *
12613
unicode_rstrip_impl(PyObject *self, PyObject *chars)
12614
/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
12615
19.3M
{
12616
19.3M
    return do_argstrip(self, RIGHTSTRIP, chars);
12617
19.3M
}
12618
12619
12620
static PyObject*
12621
unicode_repeat(PyObject *str, Py_ssize_t len)
12622
418k
{
12623
418k
    PyObject *u;
12624
418k
    Py_ssize_t nchars, n;
12625
12626
418k
    if (len < 1)
12627
36.6k
        _Py_RETURN_UNICODE_EMPTY();
12628
12629
    /* no repeat, return original string */
12630
381k
    if (len == 1)
12631
120k
        return unicode_result_unchanged(str);
12632
12633
260k
    if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
12634
0
        PyErr_SetString(PyExc_OverflowError,
12635
0
                        "repeated string is too long");
12636
0
        return NULL;
12637
0
    }
12638
260k
    nchars = len * PyUnicode_GET_LENGTH(str);
12639
12640
260k
    u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
12641
260k
    if (!u)
12642
0
        return NULL;
12643
260k
    assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
12644
12645
260k
    if (PyUnicode_GET_LENGTH(str) == 1) {
12646
259k
        int kind = PyUnicode_KIND(str);
12647
259k
        Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
12648
259k
        if (kind == PyUnicode_1BYTE_KIND) {
12649
259k
            void *to = PyUnicode_DATA(u);
12650
259k
            memset(to, (unsigned char)fill_char, len);
12651
259k
        }
12652
0
        else if (kind == PyUnicode_2BYTE_KIND) {
12653
0
            Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
12654
0
            for (n = 0; n < len; ++n)
12655
0
                ucs2[n] = fill_char;
12656
0
        } else {
12657
0
            Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12658
0
            assert(kind == PyUnicode_4BYTE_KIND);
12659
0
            for (n = 0; n < len; ++n)
12660
0
                ucs4[n] = fill_char;
12661
0
        }
12662
259k
    }
12663
1.81k
    else {
12664
1.81k
        Py_ssize_t char_size = PyUnicode_KIND(str);
12665
1.81k
        char *to = (char *) PyUnicode_DATA(u);
12666
1.81k
        _PyBytes_Repeat(to, nchars * char_size, PyUnicode_DATA(str),
12667
1.81k
            PyUnicode_GET_LENGTH(str) * char_size);
12668
1.81k
    }
12669
12670
260k
    assert(_PyUnicode_CheckConsistency(u, 1));
12671
260k
    return u;
12672
260k
}
12673
12674
PyObject *
12675
PyUnicode_Replace(PyObject *str,
12676
                  PyObject *substr,
12677
                  PyObject *replstr,
12678
                  Py_ssize_t maxcount)
12679
2
{
12680
2
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12681
2
            ensure_unicode(replstr) < 0)
12682
0
        return NULL;
12683
2
    return replace(str, substr, replstr, maxcount);
12684
2
}
12685
12686
/*[clinic input]
12687
@permit_long_docstring_body
12688
str.replace as unicode_replace
12689
12690
    old: unicode
12691
    new: unicode
12692
    /
12693
    count: Py_ssize_t = -1
12694
        Maximum number of occurrences to replace.
12695
        -1 (the default value) means replace all occurrences.
12696
12697
Return a copy with all occurrences of substring old replaced by new.
12698
12699
If the optional argument count is given, only the first count occurrences are
12700
replaced.
12701
[clinic start generated code]*/
12702
12703
static PyObject *
12704
unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12705
                     Py_ssize_t count)
12706
/*[clinic end generated code: output=b63f1a8b5eebf448 input=f27ca92ac46b65a1]*/
12707
79.2M
{
12708
79.2M
    return replace(self, old, new, count);
12709
79.2M
}
12710
12711
/*[clinic input]
12712
@permit_long_docstring_body
12713
str.removeprefix as unicode_removeprefix
12714
12715
    prefix: unicode
12716
    /
12717
12718
Return a str with the given prefix string removed if present.
12719
12720
If the string starts with the prefix string, return string[len(prefix):].
12721
Otherwise, return a copy of the original string.
12722
[clinic start generated code]*/
12723
12724
static PyObject *
12725
unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
12726
/*[clinic end generated code: output=f1e5945e9763bcb9 input=1989a856dbb813f1]*/
12727
0
{
12728
0
    int match = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);
12729
0
    if (match == -1) {
12730
0
        return NULL;
12731
0
    }
12732
0
    if (match) {
12733
0
        return PyUnicode_Substring(self, PyUnicode_GET_LENGTH(prefix),
12734
0
                                   PyUnicode_GET_LENGTH(self));
12735
0
    }
12736
0
    return unicode_result_unchanged(self);
12737
0
}
12738
12739
/*[clinic input]
12740
str.removesuffix as unicode_removesuffix
12741
12742
    suffix: unicode
12743
    /
12744
12745
Return a str with the given suffix string removed if present.
12746
12747
If the string ends with the suffix string and that suffix is not empty,
12748
return string[:-len(suffix)]. Otherwise, return a copy of the original
12749
string.
12750
[clinic start generated code]*/
12751
12752
static PyObject *
12753
unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
12754
/*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
12755
0
{
12756
0
    int match = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);
12757
0
    if (match == -1) {
12758
0
        return NULL;
12759
0
    }
12760
0
    if (match) {
12761
0
        return PyUnicode_Substring(self, 0, PyUnicode_GET_LENGTH(self)
12762
0
                                            - PyUnicode_GET_LENGTH(suffix));
12763
0
    }
12764
0
    return unicode_result_unchanged(self);
12765
0
}
12766
12767
static PyObject *
12768
unicode_repr(PyObject *unicode)
12769
4.23M
{
12770
4.23M
    Py_ssize_t isize = PyUnicode_GET_LENGTH(unicode);
12771
4.23M
    const void *idata = PyUnicode_DATA(unicode);
12772
12773
    /* Compute length of output, quote characters, and
12774
       maximum character */
12775
4.23M
    Py_ssize_t osize = 0;
12776
4.23M
    Py_UCS4 maxch = 127;
12777
4.23M
    Py_ssize_t squote = 0;
12778
4.23M
    Py_ssize_t dquote = 0;
12779
4.23M
    int ikind = PyUnicode_KIND(unicode);
12780
141M
    for (Py_ssize_t i = 0; i < isize; i++) {
12781
137M
        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12782
137M
        Py_ssize_t incr = 1;
12783
137M
        switch (ch) {
12784
160k
        case '\'': squote++; break;
12785
591k
        case '"':  dquote++; break;
12786
147k
        case '\\': case '\t': case '\r': case '\n':
12787
147k
            incr = 2;
12788
147k
            break;
12789
136M
        default:
12790
            /* Fast-path ASCII */
12791
136M
            if (ch < ' ' || ch == 0x7f)
12792
89.2M
                incr = 4; /* \xHH */
12793
47.5M
            else if (ch < 0x7f)
12794
40.4M
                ;
12795
7.08M
            else if (Py_UNICODE_ISPRINTABLE(ch))
12796
6.97M
                maxch = (ch > maxch) ? ch : maxch;
12797
103k
            else if (ch < 0x100)
12798
28.8k
                incr = 4; /* \xHH */
12799
74.8k
            else if (ch < 0x10000)
12800
52.0k
                incr = 6; /* \uHHHH */
12801
22.7k
            else
12802
22.7k
                incr = 10; /* \uHHHHHHHH */
12803
137M
        }
12804
137M
        if (osize > PY_SSIZE_T_MAX - incr) {
12805
0
            PyErr_SetString(PyExc_OverflowError,
12806
0
                            "string is too long to generate repr");
12807
0
            return NULL;
12808
0
        }
12809
137M
        osize += incr;
12810
137M
    }
12811
12812
4.23M
    Py_UCS4 quote = '\'';
12813
4.23M
    int changed = (osize != isize);
12814
4.23M
    if (squote) {
12815
73.4k
        changed = 1;
12816
73.4k
        if (dquote)
12817
            /* Both squote and dquote present. Use squote,
12818
               and escape them */
12819
7.61k
            osize += squote;
12820
65.7k
        else
12821
65.7k
            quote = '"';
12822
73.4k
    }
12823
4.23M
    osize += 2;   /* quotes */
12824
12825
4.23M
    PyObject *repr = PyUnicode_New(osize, maxch);
12826
4.23M
    if (repr == NULL)
12827
0
        return NULL;
12828
4.23M
    int okind = PyUnicode_KIND(repr);
12829
4.23M
    void *odata = PyUnicode_DATA(repr);
12830
12831
4.23M
    if (!changed) {
12832
3.60M
        PyUnicode_WRITE(okind, odata, 0, quote);
12833
12834
3.60M
        _PyUnicode_FastCopyCharacters(repr, 1,
12835
3.60M
                                      unicode, 0,
12836
3.60M
                                      isize);
12837
12838
3.60M
        PyUnicode_WRITE(okind, odata, osize-1, quote);
12839
3.60M
    }
12840
632k
    else {
12841
632k
        switch (okind) {
12842
409k
        case PyUnicode_1BYTE_KIND:
12843
409k
            ucs1lib_repr(unicode, quote, odata);
12844
409k
            break;
12845
218k
        case PyUnicode_2BYTE_KIND:
12846
218k
            ucs2lib_repr(unicode, quote, odata);
12847
218k
            break;
12848
4.14k
        default:
12849
4.14k
            assert(okind == PyUnicode_4BYTE_KIND);
12850
4.14k
            ucs4lib_repr(unicode, quote, odata);
12851
632k
        }
12852
632k
    }
12853
12854
4.23M
    assert(_PyUnicode_CheckConsistency(repr, 1));
12855
4.23M
    return repr;
12856
4.23M
}
12857
12858
/*[clinic input]
12859
@permit_long_summary
12860
str.rfind as unicode_rfind = str.count
12861
12862
Return the highest index in S where substring sub is found, such that sub is contained within S[start:end].
12863
12864
Optional arguments start and end are interpreted as in slice notation.
12865
Return -1 on failure.
12866
[clinic start generated code]*/
12867
12868
static Py_ssize_t
12869
unicode_rfind_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
12870
                   Py_ssize_t end)
12871
/*[clinic end generated code: output=880b29f01dd014c8 input=7f7e97d5cd3299a2]*/
12872
9.83k
{
12873
9.83k
    Py_ssize_t result = any_find_slice(str, substr, start, end, -1);
12874
9.83k
    if (result < 0) {
12875
6.59k
        return -1;
12876
6.59k
    }
12877
3.23k
    return result;
12878
9.83k
}
12879
12880
/*[clinic input]
12881
@permit_long_summary
12882
str.rindex as unicode_rindex = str.count
12883
12884
Return the highest index in S where substring sub is found, such that sub is contained within S[start:end].
12885
12886
Optional arguments start and end are interpreted as in slice notation.
12887
Raises ValueError when the substring is not found.
12888
[clinic start generated code]*/
12889
12890
static Py_ssize_t
12891
unicode_rindex_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
12892
                    Py_ssize_t end)
12893
/*[clinic end generated code: output=5f3aef124c867fe1 input=0363a324740b3e62]*/
12894
107k
{
12895
107k
    Py_ssize_t result = any_find_slice(str, substr, start, end, -1);
12896
107k
    if (result == -1) {
12897
0
        PyErr_SetString(PyExc_ValueError, "substring not found");
12898
0
    }
12899
107k
    else if (result < 0) {
12900
0
        return -1;
12901
0
    }
12902
107k
    return result;
12903
107k
}
12904
12905
/*[clinic input]
12906
str.rjust as unicode_rjust
12907
12908
    width: Py_ssize_t
12909
    fillchar: Py_UCS4 = ' '
12910
    /
12911
12912
Return a right-justified string of length width.
12913
12914
Padding is done using the specified fill character (default is a space).
12915
[clinic start generated code]*/
12916
12917
static PyObject *
12918
unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12919
/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
12920
0
{
12921
0
    if (PyUnicode_GET_LENGTH(self) >= width)
12922
0
        return unicode_result_unchanged(self);
12923
12924
0
    return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
12925
0
}
12926
12927
PyObject *
12928
PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12929
0
{
12930
0
    if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
12931
0
        return NULL;
12932
12933
0
    return split(s, sep, maxsplit);
12934
0
}
12935
12936
/*[clinic input]
12937
@permit_long_summary
12938
str.split as unicode_split
12939
12940
    sep: object = None
12941
        The separator used to split the string.
12942
12943
        When set to None (the default value), will split on any whitespace
12944
        character (including \n \r \t \f and spaces) and will discard
12945
        empty strings from the result.
12946
    maxsplit: Py_ssize_t = -1
12947
        Maximum number of splits.
12948
        -1 (the default value) means no limit.
12949
12950
Return a list of the substrings in the string, using sep as the separator string.
12951
12952
Splitting starts at the front of the string and works to the end.
12953
12954
Note, str.split() is mainly useful for data that has been intentionally
12955
delimited.  With natural text that includes punctuation, consider using
12956
the regular expression module.
12957
12958
[clinic start generated code]*/
12959
12960
static PyObject *
12961
unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12962
/*[clinic end generated code: output=3a65b1db356948dc input=2c1fd08a78e038b8]*/
12963
22.6M
{
12964
22.6M
    if (sep == Py_None)
12965
181k
        return split(self, NULL, maxsplit);
12966
22.5M
    if (PyUnicode_Check(sep))
12967
22.5M
        return split(self, sep, maxsplit);
12968
12969
0
    PyErr_Format(PyExc_TypeError,
12970
0
                 "must be str or None, not %.100s",
12971
0
                 Py_TYPE(sep)->tp_name);
12972
0
    return NULL;
12973
22.5M
}
12974
12975
/* Partition `str_obj` around `sep_obj` and return the tuple built by the
   kind-specific *_partition() helper.

   Returns NULL if either argument fails ensure_unicode() or if the
   separator buffer cannot be converted.  When the separator has a wider
   kind than the string, or is longer than it, the result is
   (str_obj, empty, empty) without calling any helper. */
PyObject *
PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
{
    if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0) {
        return NULL;
    }

    const int str_kind = PyUnicode_KIND(str_obj);
    const int sep_kind = PyUnicode_KIND(sep_obj);
    const Py_ssize_t str_len = PyUnicode_GET_LENGTH(str_obj);
    const Py_ssize_t sep_len = PyUnicode_GET_LENGTH(sep_obj);

    if (str_kind < sep_kind || str_len < sep_len) {
        PyObject *empty = unicode_get_empty();  // Borrowed reference
        return PyTuple_Pack(3, str_obj, empty, empty);
    }

    const void *str_buf = PyUnicode_DATA(str_obj);
    const void *sep_buf = PyUnicode_DATA(sep_obj);

    /* When the kinds differ, work on a copy of the separator data in the
       string's kind; that copy is released with PyMem_Free() below. */
    const int converted = (sep_kind != str_kind);
    if (converted) {
        sep_buf = unicode_askind(sep_kind, sep_buf, sep_len, str_kind);
        if (sep_buf == NULL) {
            return NULL;
        }
    }

    PyObject *result;
    switch (str_kind) {
    case PyUnicode_1BYTE_KIND:
        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) {
            result = asciilib_partition(str_obj, str_buf, str_len,
                                        sep_obj, sep_buf, sep_len);
        }
        else {
            result = ucs1lib_partition(str_obj, str_buf, str_len,
                                       sep_obj, sep_buf, sep_len);
        }
        break;
    case PyUnicode_2BYTE_KIND:
        result = ucs2lib_partition(str_obj, str_buf, str_len,
                                   sep_obj, sep_buf, sep_len);
        break;
    case PyUnicode_4BYTE_KIND:
        result = ucs4lib_partition(str_obj, str_buf, str_len,
                                   sep_obj, sep_buf, sep_len);
        break;
    default:
        Py_UNREACHABLE();
    }

    assert((sep_kind == str_kind) == (sep_buf == PyUnicode_DATA(sep_obj)));
    if (converted) {
        PyMem_Free((void *)sep_buf);
    }

    return result;
}
13025
13026
13027
/* Partition `str_obj` around `sep_obj` using the kind-specific
   *_rpartition() helper and return the tuple it builds.

   Returns NULL if either argument fails ensure_unicode() or if the
   separator buffer cannot be converted.  When the separator has a wider
   kind than the string, or is longer than it, the result is
   (empty, empty, str_obj) without calling any helper. */
PyObject *
PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
{
    if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0) {
        return NULL;
    }

    const int str_kind = PyUnicode_KIND(str_obj);
    const int sep_kind = PyUnicode_KIND(sep_obj);
    const Py_ssize_t str_len = PyUnicode_GET_LENGTH(str_obj);
    const Py_ssize_t sep_len = PyUnicode_GET_LENGTH(sep_obj);

    if (str_kind < sep_kind || str_len < sep_len) {
        PyObject *empty = unicode_get_empty();  // Borrowed reference
        return PyTuple_Pack(3, empty, empty, str_obj);
    }

    const void *str_buf = PyUnicode_DATA(str_obj);
    const void *sep_buf = PyUnicode_DATA(sep_obj);

    /* When the kinds differ, work on a copy of the separator data in the
       string's kind; that copy is released with PyMem_Free() below. */
    const int converted = (sep_kind != str_kind);
    if (converted) {
        sep_buf = unicode_askind(sep_kind, sep_buf, sep_len, str_kind);
        if (sep_buf == NULL) {
            return NULL;
        }
    }

    PyObject *result;
    switch (str_kind) {
    case PyUnicode_1BYTE_KIND:
        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) {
            result = asciilib_rpartition(str_obj, str_buf, str_len,
                                         sep_obj, sep_buf, sep_len);
        }
        else {
            result = ucs1lib_rpartition(str_obj, str_buf, str_len,
                                        sep_obj, sep_buf, sep_len);
        }
        break;
    case PyUnicode_2BYTE_KIND:
        result = ucs2lib_rpartition(str_obj, str_buf, str_len,
                                    sep_obj, sep_buf, sep_len);
        break;
    case PyUnicode_4BYTE_KIND:
        result = ucs4lib_rpartition(str_obj, str_buf, str_len,
                                    sep_obj, sep_buf, sep_len);
        break;
    default:
        Py_UNREACHABLE();
    }

    assert((sep_kind == str_kind) == (sep_buf == PyUnicode_DATA(sep_obj)));
    if (converted) {
        PyMem_Free((void *)sep_buf);
    }

    return result;
}
13077
13078
/*[clinic input]
13079
@permit_long_docstring_body
13080
str.partition as unicode_partition
13081
13082
    sep: object
13083
    /
13084
13085
Partition the string into three parts using the given separator.
13086
13087
This will search for the separator in the string.  If the separator is found,
13088
returns a 3-tuple containing the part before the separator, the separator
13089
itself, and the part after it.
13090
13091
If the separator is not found, returns a 3-tuple containing the original string
13092
and two empty strings.
13093
[clinic start generated code]*/
13094
13095
static PyObject *
unicode_partition(PyObject *self, PyObject *sep)
/*[clinic end generated code: output=e4ced7bd253ca3c4 input=4d854b520d7b0e97]*/
{
    /* The method is a direct forward to the C-level partition function. */
    PyObject *parts = PyUnicode_Partition(self, sep);
    return parts;
}
13101
13102
/*[clinic input]
13103
@permit_long_docstring_body
13104
str.rpartition as unicode_rpartition = str.partition
13105
13106
Partition the string into three parts using the given separator.
13107
13108
This will search for the separator in the string, starting at the end. If
13109
the separator is found, returns a 3-tuple containing the part before the
13110
separator, the separator itself, and the part after it.
13111
13112
If the separator is not found, returns a 3-tuple containing two empty strings
13113
and the original string.
13114
[clinic start generated code]*/
13115
13116
static PyObject *
unicode_rpartition(PyObject *self, PyObject *sep)
/*[clinic end generated code: output=1aa13cf1156572aa input=a6adabe91e75b486]*/
{
    /* The method is a direct forward to the C-level rpartition function. */
    PyObject *parts = PyUnicode_RPartition(self, sep);
    return parts;
}
13122
13123
PyObject *
13124
PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
13125
0
{
13126
0
    if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
13127
0
        return NULL;
13128
13129
0
    return rsplit(s, sep, maxsplit);
13130
0
}
13131
13132
/*[clinic input]
13133
@permit_long_summary
13134
str.rsplit as unicode_rsplit = str.split
13135
13136
Return a list of the substrings in the string, using sep as the separator string.
13137
13138
Splitting starts at the end of the string and works to the front.
13139
[clinic start generated code]*/
13140
13141
static PyObject *
13142
unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13143
/*[clinic end generated code: output=c2b815c63bcabffc input=0f762e30d267fa83]*/
13144
50
{
13145
50
    if (sep == Py_None)
13146
0
        return rsplit(self, NULL, maxsplit);
13147
50
    if (PyUnicode_Check(sep))
13148
50
        return rsplit(self, sep, maxsplit);
13149
13150
0
    PyErr_Format(PyExc_TypeError,
13151
0
                 "must be str or None, not %.100s",
13152
0
                 Py_TYPE(sep)->tp_name);
13153
0
    return NULL;
13154
50
}
13155
13156
/*[clinic input]
13157
@permit_long_docstring_body
13158
str.splitlines as unicode_splitlines
13159
13160
    keepends: bool = False
13161
13162
Return a list of the lines in the string, breaking at line boundaries.
13163
13164
Line breaks are not included in the resulting list unless keepends is given and
13165
true.
13166
[clinic start generated code]*/
13167
13168
static PyObject *
unicode_splitlines_impl(PyObject *self, int keepends)
/*[clinic end generated code: output=f664dcdad153ec40 input=39eeafbfef61c827]*/
{
    /* The method is a direct forward to PyUnicode_Splitlines(). */
    PyObject *lines = PyUnicode_Splitlines(self, keepends);
    return lines;
}
13174
13175
static
13176
PyObject *unicode_str(PyObject *self)
13177
3.17M
{
13178
3.17M
    return unicode_result_unchanged(self);
13179
3.17M
}
13180
13181
/*[clinic input]
13182
@permit_long_summary
13183
str.swapcase as unicode_swapcase
13184
13185
Convert uppercase characters to lowercase and lowercase characters to uppercase.
13186
[clinic start generated code]*/
13187
13188
static PyObject *
unicode_swapcase_impl(PyObject *self)
/*[clinic end generated code: output=5d28966bf6d7b2af input=85bc39a9b4e8ee91]*/
{
    /* Run the generic case-conversion driver with the swapcase callback. */
    PyObject *swapped = case_operation(self, do_swapcase);
    return swapped;
}
13194
13195
/*[clinic input]
13196
13197
@staticmethod
13198
str.maketrans as unicode_maketrans
13199
13200
  x: object
13201
13202
  y: unicode=NULL
13203
13204
  z: unicode=NULL
13205
13206
  /
13207
13208
Return a translation table usable for str.translate().
13209
13210
If there is only one argument, it must be a dictionary mapping Unicode
13211
ordinals (integers) or characters to Unicode ordinals, strings or None.
13212
Character keys will be then converted to ordinals.
13213
If there are two arguments, they must be strings of equal length, and
13214
in the resulting dictionary, each character in x will be mapped to the
13215
character at the same position in y. If there is a third argument, it
13216
must be a string, whose characters will be mapped to None in the result.
13217
[clinic start generated code]*/
13218
13219
static PyObject *
unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
{
    /* The translation table being built; released on every error path. */
    PyObject *table = PyDict_New();
    if (table == NULL) {
        return NULL;
    }

    if (y == NULL) {
        /* One-argument form: x must be a dict */
        if (!PyDict_CheckExact(x)) {
            PyErr_SetString(PyExc_TypeError, "if you give only one argument "
                            "to maketrans it must be a dict");
            goto err;
        }

        /* Copy entries into the new dict, converting string keys to
           int keys.  `key` and `value` are filled in by PyDict_Next(). */
        PyObject *key, *value;
        Py_ssize_t pos = 0;
        while (PyDict_Next(x, &pos, &key, &value)) {
            if (PyUnicode_Check(key)) {
                /* A string key must be one character long; it is replaced
                   by an int built from that character. */
                if (PyUnicode_GET_LENGTH(key) != 1) {
                    PyErr_SetString(PyExc_ValueError, "string keys in translate "
                                    "table must be of length 1");
                    goto err;
                }
                const int key_kind = PyUnicode_KIND(key);
                const void *key_data = PyUnicode_DATA(key);
                PyObject *ordinal =
                    PyLong_FromLong(PyUnicode_READ(key_kind, key_data, 0));
                if (ordinal == NULL) {
                    goto err;
                }
                int status = PyDict_SetItem(table, ordinal, value);
                Py_DECREF(ordinal);
                if (status < 0) {
                    goto err;
                }
            }
            else if (PyLong_Check(key)) {
                /* Integer keys are stored as they are. */
                if (PyDict_SetItem(table, key, value) < 0) {
                    goto err;
                }
            }
            else {
                PyErr_SetString(PyExc_TypeError, "keys in translate table must "
                                "be strings or integers");
                goto err;
            }
        }
    }
    else {
        /* Two- or three-argument form: x must be a string too, of the
           same length as y */
        if (!PyUnicode_Check(x)) {
            PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
                            "be a string if there is a second argument");
            goto err;
        }
        const Py_ssize_t npairs = PyUnicode_GET_LENGTH(x);
        if (npairs != PyUnicode_GET_LENGTH(y)) {
            PyErr_SetString(PyExc_ValueError, "the first two maketrans "
                            "arguments must have equal length");
            goto err;
        }

        /* Create entries mapping the i-th char of x to the i-th char of y */
        const int x_kind = PyUnicode_KIND(x);
        const int y_kind = PyUnicode_KIND(y);
        const void *x_data = PyUnicode_DATA(x);
        const void *y_data = PyUnicode_DATA(y);
        for (Py_ssize_t idx = 0; idx < npairs; idx++) {
            PyObject *src = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, idx));
            if (src == NULL) {
                goto err;
            }
            PyObject *dst = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, idx));
            if (dst == NULL) {
                Py_DECREF(src);
                goto err;
            }
            int status = PyDict_SetItem(table, src, dst);
            Py_DECREF(src);
            Py_DECREF(dst);
            if (status < 0) {
                goto err;
            }
        }

        /* Create entries mapping every char of z to None */
        if (z != NULL) {
            const int z_kind = PyUnicode_KIND(z);
            const void *z_data = PyUnicode_DATA(z);
            const Py_ssize_t ndeleted = PyUnicode_GET_LENGTH(z);
            for (Py_ssize_t idx = 0; idx < ndeleted; idx++) {
                PyObject *src =
                    PyLong_FromLong(PyUnicode_READ(z_kind, z_data, idx));
                if (src == NULL) {
                    goto err;
                }
                int status = PyDict_SetItem(table, src, Py_None);
                Py_DECREF(src);
                if (status < 0) {
                    goto err;
                }
            }
        }
    }
    return table;

  err:
    Py_DECREF(table);
    return NULL;
}
13324
13325
/*[clinic input]
13326
@permit_long_docstring_body
13327
str.translate as unicode_translate
13328
13329
    table: object
13330
        Translation table, which must be a mapping of Unicode ordinals to
13331
        Unicode ordinals, strings, or None.
13332
    /
13333
13334
Replace each character in the string using the given translation table.
13335
13336
The table must implement lookup/indexing via __getitem__, for instance a
13337
dictionary or list.  If this operation raises LookupError, the character is
13338
left untouched.  Characters mapped to None are deleted.
13339
[clinic start generated code]*/
13340
13341
static PyObject *
unicode_translate(PyObject *self, PyObject *table)
/*[clinic end generated code: output=3cb448ff2fd96bf3 input=699e5fa0ebf9f5e9]*/
{
    /* Translate through the charmap helper, always passing "ignore" as
       the errors argument. */
    PyObject *translated = _PyUnicode_TranslateCharmap(self, table, "ignore");
    return translated;
}
13347
13348
/*[clinic input]
13349
str.upper as unicode_upper
13350
13351
Return a copy of the string converted to uppercase.
13352
[clinic start generated code]*/
13353
13354
static PyObject *
unicode_upper_impl(PyObject *self)
/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
{
    /* Non-ASCII strings go through the generic case-conversion driver. */
    if (!PyUnicode_IS_ASCII(self)) {
        return case_operation(self, do_upper);
    }

    /* ASCII strings use the dedicated helper, called with flag 0. */
    return ascii_upper_or_lower(self, 0);
}
13362
13363
/*[clinic input]
13364
@permit_long_summary
13365
str.zfill as unicode_zfill
13366
13367
    width: Py_ssize_t
13368
    /
13369
13370
Pad a numeric string with zeros on the left, to fill a field of the given width.
13371
13372
The string is never truncated.
13373
[clinic start generated code]*/
13374
13375
static PyObject *
13376
unicode_zfill_impl(PyObject *self, Py_ssize_t width)
13377
/*[clinic end generated code: output=e13fb6bdf8e3b9df input=25a4ee0ea3e58ce0]*/
13378
0
{
13379
0
    Py_ssize_t fill;
13380
0
    PyObject *u;
13381
0
    int kind;
13382
0
    const void *data;
13383
0
    Py_UCS4 chr;
13384
13385
0
    if (PyUnicode_GET_LENGTH(self) >= width)
13386
0
        return unicode_result_unchanged(self);
13387
13388
0
    fill = width - PyUnicode_GET_LENGTH(self);
13389
13390
0
    u = pad(self, fill, 0, '0');
13391
13392
0
    if (u == NULL)
13393
0
        return NULL;
13394
13395
0
    kind = PyUnicode_KIND(u);
13396
0
    data = PyUnicode_DATA(u);
13397
0
    chr = PyUnicode_READ(kind, data, fill);
13398
13399
0
    if (chr == '+' || chr == '-') {
13400
        /* move sign to beginning of string */
13401
0
        PyUnicode_WRITE(kind, data, 0, chr);
13402
0
        PyUnicode_WRITE(kind, data, fill, '0');
13403
0
    }
13404
13405
0
    assert(_PyUnicode_CheckConsistency(u, 1));
13406
0
    return u;
13407
0
}
13408
13409
/*[clinic input]
13410
@permit_long_summary
13411
@text_signature "($self, prefix[, start[, end]], /)"
13412
str.startswith as unicode_startswith
13413
13414
    prefix as subobj: object
13415
        A string or a tuple of strings to try.
13416
    start: slice_index(accept={int, NoneType}, c_default='0') = None
13417
        Optional start position. Default: start of the string.
13418
    end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
13419
        Optional stop position. Default: end of the string.
13420
    /
13421
13422
Return True if the string starts with the specified prefix, False otherwise.
13423
[clinic start generated code]*/
13424
13425
static PyObject *
13426
unicode_startswith_impl(PyObject *self, PyObject *subobj, Py_ssize_t start,
13427
                        Py_ssize_t end)
13428
/*[clinic end generated code: output=4bd7cfd0803051d4 input=766bdbd33df251dc]*/
13429
69.9M
{
13430
69.9M
    if (PyTuple_Check(subobj)) {
13431
8.71M
        Py_ssize_t i;
13432
31.6M
        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13433
22.9M
            PyObject *substring = PyTuple_GET_ITEM(subobj, i);
13434
22.9M
            if (!PyUnicode_Check(substring)) {
13435
0
                PyErr_Format(PyExc_TypeError,
13436
0
                             "tuple for startswith must only contain str, "
13437
0
                             "not %.100s",
13438
0
                             Py_TYPE(substring)->tp_name);
13439
0
                return NULL;
13440
0
            }
13441
22.9M
            int result = tailmatch(self, substring, start, end, -1);
13442
22.9M
            if (result < 0) {
13443
0
                return NULL;
13444
0
            }
13445
22.9M
            if (result) {
13446
37.5k
                Py_RETURN_TRUE;
13447
37.5k
            }
13448
22.9M
        }
13449
        /* nothing matched */
13450
8.71M
        Py_RETURN_FALSE;
13451
8.71M
    }
13452
61.2M
    if (!PyUnicode_Check(subobj)) {
13453
0
        PyErr_Format(PyExc_TypeError,
13454
0
                     "startswith first arg must be str or "
13455
0
                     "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13456
0
        return NULL;
13457
0
    }
13458
61.2M
    int result = tailmatch(self, subobj, start, end, -1);
13459
61.2M
    if (result < 0) {
13460
0
        return NULL;
13461
0
    }
13462
61.2M
    return PyBool_FromLong(result);
13463
61.2M
}
13464
13465
13466
/*[clinic input]
13467
@permit_long_summary
13468
@text_signature "($self, suffix[, start[, end]], /)"
13469
str.endswith as unicode_endswith
13470
13471
    suffix as subobj: object
13472
        A string or a tuple of strings to try.
13473
    start: slice_index(accept={int, NoneType}, c_default='0') = None
13474
        Optional start position. Default: start of the string.
13475
    end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
13476
        Optional stop position. Default: end of the string.
13477
    /
13478
13479
Return True if the string ends with the specified suffix, False otherwise.
13480
[clinic start generated code]*/
13481
13482
static PyObject *
13483
unicode_endswith_impl(PyObject *self, PyObject *subobj, Py_ssize_t start,
13484
                      Py_ssize_t end)
13485
/*[clinic end generated code: output=cce6f8ceb0102ca9 input=b66bf6d5547ba1aa]*/
13486
15.4M
{
13487
15.4M
    if (PyTuple_Check(subobj)) {
13488
178k
        Py_ssize_t i;
13489
323k
        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13490
304k
            PyObject *substring = PyTuple_GET_ITEM(subobj, i);
13491
304k
            if (!PyUnicode_Check(substring)) {
13492
0
                PyErr_Format(PyExc_TypeError,
13493
0
                             "tuple for endswith must only contain str, "
13494
0
                             "not %.100s",
13495
0
                             Py_TYPE(substring)->tp_name);
13496
0
                return NULL;
13497
0
            }
13498
304k
            int result = tailmatch(self, substring, start, end, +1);
13499
304k
            if (result < 0) {
13500
0
                return NULL;
13501
0
            }
13502
304k
            if (result) {
13503
159k
                Py_RETURN_TRUE;
13504
159k
            }
13505
304k
        }
13506
178k
        Py_RETURN_FALSE;
13507
178k
    }
13508
15.2M
    if (!PyUnicode_Check(subobj)) {
13509
0
        PyErr_Format(PyExc_TypeError,
13510
0
                     "endswith first arg must be str or "
13511
0
                     "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13512
0
        return NULL;
13513
0
    }
13514
15.2M
    int result = tailmatch(self, subobj, start, end, +1);
13515
15.2M
    if (result < 0) {
13516
0
        return NULL;
13517
0
    }
13518
15.2M
    return PyBool_FromLong(result);
13519
15.2M
}
13520
13521
13522
static inline void
13523
_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
13524
66.5M
{
13525
66.5M
    writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13526
66.5M
    writer->data = PyUnicode_DATA(writer->buffer);
13527
13528
66.5M
    if (!writer->readonly) {
13529
66.4M
        writer->kind = PyUnicode_KIND(writer->buffer);
13530
66.4M
        writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13531
66.4M
    }
13532
19.2k
    else {
13533
        /* use a value smaller than PyUnicode_1BYTE_KIND() so
13534
           _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13535
19.2k
        writer->kind = 0;
13536
19.2k
        assert(writer->kind <= PyUnicode_1BYTE_KIND);
13537
13538
        /* Copy-on-write mode: set buffer size to 0 so
13539
         * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13540
         * next write. */
13541
19.2k
        writer->size = 0;
13542
19.2k
    }
13543
66.5M
}
13544
13545
13546
void
13547
_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
13548
52.2M
{
13549
52.2M
    memset(writer, 0, sizeof(*writer));
13550
13551
    /* ASCII is the bare minimum */
13552
52.2M
    writer->min_char = 127;
13553
13554
    /* use a kind value smaller than PyUnicode_1BYTE_KIND so
13555
       _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13556
52.2M
    assert(writer->kind == 0);
13557
52.2M
    assert(writer->kind < PyUnicode_1BYTE_KIND);
13558
52.2M
}
13559
13560
13561
PyUnicodeWriter*
13562
PyUnicodeWriter_Create(Py_ssize_t length)
13563
4.90M
{
13564
4.90M
    if (length < 0) {
13565
0
        PyErr_SetString(PyExc_ValueError,
13566
0
                        "length must be positive");
13567
0
        return NULL;
13568
0
    }
13569
13570
4.90M
    const size_t size = sizeof(_PyUnicodeWriter);
13571
4.90M
    PyUnicodeWriter *pub_writer;
13572
4.90M
    pub_writer = _Py_FREELIST_POP_MEM(unicode_writers);
13573
4.90M
    if (pub_writer == NULL) {
13574
2.74M
        pub_writer = (PyUnicodeWriter *)PyMem_Malloc(size);
13575
2.74M
        if (pub_writer == NULL) {
13576
0
            return (PyUnicodeWriter *)PyErr_NoMemory();
13577
0
        }
13578
2.74M
    }
13579
4.90M
    _PyUnicodeWriter *writer = (_PyUnicodeWriter *)pub_writer;
13580
13581
4.90M
    _PyUnicodeWriter_Init(writer);
13582
4.90M
    if (_PyUnicodeWriter_Prepare(writer, length, 127) < 0) {
13583
0
        PyUnicodeWriter_Discard(pub_writer);
13584
0
        return NULL;
13585
0
    }
13586
4.90M
    writer->overallocate = 1;
13587
13588
4.90M
    return pub_writer;
13589
4.90M
}
13590
13591
13592
/* Release `writer`: deallocate its contents, then hand the struct to the
   unicode_writers freelist (with PyMem_Free as the release function).
   A NULL writer is accepted and ignored. */
void
PyUnicodeWriter_Discard(PyUnicodeWriter *writer)
{
    if (writer != NULL) {
        _PyUnicodeWriter_Dealloc((_PyUnicodeWriter*)writer);
        _Py_FREELIST_FREE(unicode_writers, writer, PyMem_Free);
    }
}
13600
13601
13602
// Initialize _PyUnicodeWriter with initial buffer
13603
static inline void
13604
_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
13605
637k
{
13606
637k
    memset(writer, 0, sizeof(*writer));
13607
637k
    writer->buffer = buffer;
13608
637k
    _PyUnicodeWriter_Update(writer);
13609
637k
    writer->min_length = writer->size;
13610
637k
}
13611
13612
13613
int
13614
_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13615
                                 Py_ssize_t length, Py_UCS4 maxchar)
13616
65.8M
{
13617
65.8M
    Py_ssize_t newlen;
13618
65.8M
    PyObject *newbuffer;
13619
13620
65.8M
    assert(length >= 0);
13621
65.8M
    assert(maxchar <= MAX_UNICODE);
13622
13623
    /* ensure that the _PyUnicodeWriter_Prepare macro was used */
13624
65.8M
    assert((maxchar > writer->maxchar && length >= 0)
13625
65.8M
           || length > 0);
13626
13627
65.8M
    if (length > PY_SSIZE_T_MAX - writer->pos) {
13628
0
        PyErr_NoMemory();
13629
0
        return -1;
13630
0
    }
13631
65.8M
    newlen = writer->pos + length;
13632
13633
65.8M
    maxchar = Py_MAX(maxchar, writer->min_char);
13634
13635
65.8M
    if (writer->buffer == NULL) {
13636
48.0M
        assert(!writer->readonly);
13637
48.0M
        if (writer->overallocate
13638
36.7M
            && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13639
            /* overallocate to limit the number of realloc() */
13640
36.7M
            newlen += newlen / OVERALLOCATE_FACTOR;
13641
36.7M
        }
13642
48.0M
        if (newlen < writer->min_length)
13643
43.0M
            newlen = writer->min_length;
13644
13645
48.0M
        writer->buffer = PyUnicode_New(newlen, maxchar);
13646
48.0M
        if (writer->buffer == NULL)
13647
0
            return -1;
13648
48.0M
    }
13649
17.8M
    else if (newlen > writer->size) {
13650
15.0M
        if (writer->overallocate
13651
14.6M
            && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13652
            /* overallocate to limit the number of realloc() */
13653
14.6M
            newlen += newlen / OVERALLOCATE_FACTOR;
13654
14.6M
        }
13655
15.0M
        if (newlen < writer->min_length)
13656
1.22k
            newlen = writer->min_length;
13657
13658
15.0M
        if (maxchar > writer->maxchar || writer->readonly) {
13659
            /* resize + widen */
13660
3.71M
            maxchar = Py_MAX(maxchar, writer->maxchar);
13661
3.71M
            newbuffer = PyUnicode_New(newlen, maxchar);
13662
3.71M
            if (newbuffer == NULL)
13663
0
                return -1;
13664
3.71M
            _PyUnicode_FastCopyCharacters(newbuffer, 0,
13665
3.71M
                                          writer->buffer, 0, writer->pos);
13666
3.71M
            Py_DECREF(writer->buffer);
13667
3.71M
            writer->readonly = 0;
13668
3.71M
        }
13669
11.3M
        else {
13670
11.3M
            newbuffer = resize_compact(writer->buffer, newlen);
13671
11.3M
            if (newbuffer == NULL)
13672
0
                return -1;
13673
11.3M
        }
13674
15.0M
        writer->buffer = newbuffer;
13675
15.0M
    }
13676
2.78M
    else if (maxchar > writer->maxchar) {
13677
2.78M
        assert(!writer->readonly);
13678
2.78M
        newbuffer = PyUnicode_New(writer->size, maxchar);
13679
2.78M
        if (newbuffer == NULL)
13680
0
            return -1;
13681
2.78M
        _PyUnicode_FastCopyCharacters(newbuffer, 0,
13682
2.78M
                                      writer->buffer, 0, writer->pos);
13683
2.78M
        Py_SETREF(writer->buffer, newbuffer);
13684
2.78M
    }
13685
65.8M
    _PyUnicodeWriter_Update(writer);
13686
65.8M
    return 0;
13687
13688
65.8M
#undef OVERALLOCATE_FACTOR
13689
65.8M
}
13690
13691
int
13692
_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13693
                                     int kind)
13694
141k
{
13695
141k
    Py_UCS4 maxchar;
13696
13697
    /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13698
141k
    assert(writer->kind < kind);
13699
13700
141k
    switch (kind)
13701
141k
    {
13702
0
    case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13703
141k
    case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13704
0
    case PyUnicode_4BYTE_KIND: maxchar = MAX_UNICODE; break;
13705
0
    default:
13706
0
        Py_UNREACHABLE();
13707
141k
    }
13708
13709
141k
    return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13710
141k
}
13711
13712
int
13713
_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13714
107M
{
13715
107M
    return _PyUnicodeWriter_WriteCharInline(writer, ch);
13716
107M
}
13717
13718
int
13719
PyUnicodeWriter_WriteChar(PyUnicodeWriter *writer, Py_UCS4 ch)
13720
72.9M
{
13721
72.9M
    if (ch > MAX_UNICODE) {
13722
0
        PyErr_SetString(PyExc_ValueError,
13723
0
                        "character must be in range(0x110000)");
13724
0
        return -1;
13725
0
    }
13726
13727
72.9M
    return _PyUnicodeWriter_WriteChar((_PyUnicodeWriter*)writer, ch);
13728
72.9M
}
13729
13730
int
13731
_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13732
65.3M
{
13733
65.3M
    assert(PyUnicode_Check(str));
13734
13735
65.3M
    Py_UCS4 maxchar;
13736
65.3M
    Py_ssize_t len;
13737
13738
65.3M
    len = PyUnicode_GET_LENGTH(str);
13739
65.3M
    if (len == 0)
13740
22.4M
        return 0;
13741
42.9M
    maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13742
42.9M
    if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
13743
23.8M
        if (writer->buffer == NULL && !writer->overallocate) {
13744
11.1k
            assert(_PyUnicode_CheckConsistency(str, 1));
13745
11.1k
            writer->readonly = 1;
13746
11.1k
            writer->buffer = Py_NewRef(str);
13747
11.1k
            _PyUnicodeWriter_Update(writer);
13748
11.1k
            writer->pos += len;
13749
11.1k
            return 0;
13750
11.1k
        }
13751
23.8M
        if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13752
0
            return -1;
13753
23.8M
    }
13754
42.9M
    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13755
42.9M
                                  str, 0, len);
13756
42.9M
    writer->pos += len;
13757
42.9M
    return 0;
13758
42.9M
}
13759
13760
int
13761
PyUnicodeWriter_WriteStr(PyUnicodeWriter *writer, PyObject *obj)
13762
4.28M
{
13763
4.28M
    PyTypeObject *type = Py_TYPE(obj);
13764
4.28M
    if (type == &PyUnicode_Type) {
13765
4.28M
        return _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, obj);
13766
4.28M
    }
13767
13768
0
    if (type == &PyLong_Type) {
13769
0
        return _PyLong_FormatWriter((_PyUnicodeWriter*)writer, obj, 10, 0);
13770
0
    }
13771
13772
0
    PyObject *str = PyObject_Str(obj);
13773
0
    if (str == NULL) {
13774
0
        return -1;
13775
0
    }
13776
13777
0
    int res = _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, str);
13778
0
    Py_DECREF(str);
13779
0
    return res;
13780
0
}
13781
13782
13783
int
13784
PyUnicodeWriter_WriteRepr(PyUnicodeWriter *writer, PyObject *obj)
13785
8.84M
{
13786
8.84M
    if (Py_TYPE(obj) == &PyLong_Type) {
13787
910k
        return _PyLong_FormatWriter((_PyUnicodeWriter*)writer, obj, 10, 0);
13788
910k
    }
13789
13790
7.92M
    PyObject *repr = PyObject_Repr(obj);
13791
7.92M
    if (repr == NULL) {
13792
0
        return -1;
13793
0
    }
13794
13795
7.92M
    int res = _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, repr);
13796
7.92M
    Py_DECREF(repr);
13797
7.92M
    return res;
13798
7.92M
}
13799
13800
13801
int
13802
_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13803
                                Py_ssize_t start, Py_ssize_t end)
13804
66.4M
{
13805
66.4M
    assert(0 <= start);
13806
66.4M
    assert(end <= PyUnicode_GET_LENGTH(str));
13807
66.4M
    assert(start <= end);
13808
13809
66.4M
    if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13810
116
        return _PyUnicodeWriter_WriteStr(writer, str);
13811
13812
66.4M
    Py_ssize_t len = end - start;
13813
66.4M
    if (len == 0) {
13814
0
        return 0;
13815
0
    }
13816
13817
66.4M
    Py_UCS4 maxchar;
13818
66.4M
    if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar) {
13819
13.8M
        maxchar = _PyUnicode_FindMaxChar(str, start, end);
13820
13.8M
    }
13821
52.6M
    else {
13822
52.6M
        maxchar = writer->maxchar;
13823
52.6M
    }
13824
66.4M
    if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0) {
13825
0
        return -1;
13826
0
    }
13827
13828
66.4M
    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13829
66.4M
                                  str, start, len);
13830
66.4M
    writer->pos += len;
13831
66.4M
    return 0;
13832
66.4M
}
13833
13834
13835
int
13836
PyUnicodeWriter_WriteSubstring(PyUnicodeWriter *writer, PyObject *str,
13837
                               Py_ssize_t start, Py_ssize_t end)
13838
616k
{
13839
616k
    if (!PyUnicode_Check(str)) {
13840
0
        PyErr_Format(PyExc_TypeError, "expect str, not %T", str);
13841
0
        return -1;
13842
0
    }
13843
616k
    if (start < 0 || start > end) {
13844
0
        PyErr_Format(PyExc_ValueError, "invalid start argument");
13845
0
        return -1;
13846
0
    }
13847
616k
    if (end > PyUnicode_GET_LENGTH(str)) {
13848
0
        PyErr_Format(PyExc_ValueError, "invalid end argument");
13849
0
        return -1;
13850
0
    }
13851
13852
616k
    return _PyUnicodeWriter_WriteSubstring((_PyUnicodeWriter*)writer, str,
13853
616k
                                           start, end);
13854
616k
}
13855
13856
13857
int
13858
_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13859
                                  const char *ascii, Py_ssize_t len)
13860
52.5M
{
13861
52.5M
    if (len == -1)
13862
0
        len = strlen(ascii);
13863
13864
52.5M
    assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
13865
13866
52.5M
    if (writer->buffer == NULL && !writer->overallocate) {
13867
8.09k
        PyObject *str;
13868
13869
8.09k
        str = _PyUnicode_FromASCII(ascii, len);
13870
8.09k
        if (str == NULL)
13871
0
            return -1;
13872
13873
8.09k
        writer->readonly = 1;
13874
8.09k
        writer->buffer = str;
13875
8.09k
        _PyUnicodeWriter_Update(writer);
13876
8.09k
        writer->pos += len;
13877
8.09k
        return 0;
13878
8.09k
    }
13879
13880
52.5M
    if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13881
0
        return -1;
13882
13883
52.5M
    switch (writer->kind)
13884
52.5M
    {
13885
52.5M
    case PyUnicode_1BYTE_KIND:
13886
52.5M
    {
13887
52.5M
        const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13888
52.5M
        Py_UCS1 *data = writer->data;
13889
13890
52.5M
        memcpy(data + writer->pos, str, len);
13891
52.5M
        break;
13892
0
    }
13893
12.1k
    case PyUnicode_2BYTE_KIND:
13894
12.1k
    {
13895
12.1k
        _PyUnicode_CONVERT_BYTES(
13896
12.1k
            Py_UCS1, Py_UCS2,
13897
12.1k
            ascii, ascii + len,
13898
12.1k
            (Py_UCS2 *)writer->data + writer->pos);
13899
12.1k
        break;
13900
0
    }
13901
3.42k
    case PyUnicode_4BYTE_KIND:
13902
3.42k
    {
13903
3.42k
        _PyUnicode_CONVERT_BYTES(
13904
3.42k
            Py_UCS1, Py_UCS4,
13905
3.42k
            ascii, ascii + len,
13906
3.42k
            (Py_UCS4 *)writer->data + writer->pos);
13907
3.42k
        break;
13908
0
    }
13909
0
    default:
13910
0
        Py_UNREACHABLE();
13911
52.5M
    }
13912
13913
52.5M
    writer->pos += len;
13914
52.5M
    return 0;
13915
52.5M
}
13916
13917
13918
int
13919
PyUnicodeWriter_WriteASCII(PyUnicodeWriter *writer,
13920
                           const char *str,
13921
                           Py_ssize_t size)
13922
440k
{
13923
440k
    assert(writer != NULL);
13924
440k
    _Py_AssertHoldsTstate();
13925
13926
440k
    _PyUnicodeWriter *priv_writer = (_PyUnicodeWriter*)writer;
13927
440k
    return _PyUnicodeWriter_WriteASCIIString(priv_writer, str, size);
13928
440k
}
13929
13930
13931
int
13932
PyUnicodeWriter_WriteUTF8(PyUnicodeWriter *writer,
13933
                          const char *str,
13934
                          Py_ssize_t size)
13935
0
{
13936
0
    if (size < 0) {
13937
0
        size = strlen(str);
13938
0
    }
13939
13940
0
    _PyUnicodeWriter *_writer = (_PyUnicodeWriter*)writer;
13941
0
    Py_ssize_t old_pos = _writer->pos;
13942
0
    int res = unicode_decode_utf8_writer(_writer, str, size,
13943
0
                                         _Py_ERROR_STRICT, NULL, NULL);
13944
0
    if (res < 0) {
13945
0
        _writer->pos = old_pos;
13946
0
    }
13947
0
    return res;
13948
0
}
13949
13950
13951
int
13952
PyUnicodeWriter_DecodeUTF8Stateful(PyUnicodeWriter *writer,
13953
                                   const char *string,
13954
                                   Py_ssize_t length,
13955
                                   const char *errors,
13956
                                   Py_ssize_t *consumed)
13957
0
{
13958
0
    if (length < 0) {
13959
0
        length = strlen(string);
13960
0
    }
13961
13962
0
    _PyUnicodeWriter *_writer = (_PyUnicodeWriter*)writer;
13963
0
    Py_ssize_t old_pos = _writer->pos;
13964
0
    int res = unicode_decode_utf8_writer(_writer, string, length,
13965
0
                                         _Py_ERROR_UNKNOWN, errors, consumed);
13966
0
    if (res < 0) {
13967
0
        _writer->pos = old_pos;
13968
0
        if (consumed) {
13969
0
            *consumed = 0;
13970
0
        }
13971
0
    }
13972
0
    return res;
13973
0
}
13974
13975
13976
int
13977
_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13978
                                   const char *str, Py_ssize_t len)
13979
0
{
13980
0
    Py_UCS4 maxchar;
13981
13982
0
    maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
13983
0
    if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13984
0
        return -1;
13985
0
    unicode_write_cstr(writer->buffer, writer->pos, str, len);
13986
0
    writer->pos += len;
13987
0
    return 0;
13988
0
}
13989
13990
PyObject *
13991
_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
13992
48.6M
{
13993
48.6M
    PyObject *str;
13994
13995
48.6M
    if (writer->pos == 0) {
13996
864
        Py_CLEAR(writer->buffer);
13997
864
        _Py_RETURN_UNICODE_EMPTY();
13998
864
    }
13999
14000
48.6M
    str = writer->buffer;
14001
48.6M
    writer->buffer = NULL;
14002
14003
48.6M
    if (writer->readonly) {
14004
17.9k
        assert(PyUnicode_GET_LENGTH(str) == writer->pos);
14005
17.9k
        return str;
14006
17.9k
    }
14007
14008
48.6M
    if (PyUnicode_GET_LENGTH(str) != writer->pos) {
14009
47.5M
        PyObject *str2;
14010
47.5M
        str2 = resize_compact(str, writer->pos);
14011
47.5M
        if (str2 == NULL) {
14012
0
            Py_DECREF(str);
14013
0
            return NULL;
14014
0
        }
14015
47.5M
        str = str2;
14016
47.5M
    }
14017
14018
48.6M
    assert(_PyUnicode_CheckConsistency(str, 1));
14019
48.6M
    return unicode_result(str);
14020
48.6M
}
14021
14022
14023
PyObject*
14024
PyUnicodeWriter_Finish(PyUnicodeWriter *writer)
14025
4.90M
{
14026
4.90M
    PyObject *str = _PyUnicodeWriter_Finish((_PyUnicodeWriter*)writer);
14027
4.90M
    assert(((_PyUnicodeWriter*)writer)->buffer == NULL);
14028
4.90M
    _Py_FREELIST_FREE(unicode_writers, writer, PyMem_Free);
14029
4.90M
    return str;
14030
4.90M
}
14031
14032
14033
void
14034
_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
14035
4.28M
{
14036
4.28M
    Py_CLEAR(writer->buffer);
14037
4.28M
}
14038
14039
#include "stringlib/unicode_format.h"
14040
14041
/* Docstring attached to the "format" entry of unicode_methods. */
PyDoc_STRVAR(format__doc__,
14042
             "format($self, /, *args, **kwargs)\n\
14043
--\n\
14044
\n\
14045
Return a formatted version of the string, using substitutions from args and kwargs.\n\
14046
The substitutions are identified by braces ('{' and '}').");
14047
14048
/* Docstring attached to the "format_map" entry of unicode_methods. */
PyDoc_STRVAR(format_map__doc__,
14049
             "format_map($self, mapping, /)\n\
14050
--\n\
14051
\n\
14052
Return a formatted version of the string, using substitutions from mapping.\n\
14053
The substitutions are identified by braces ('{' and '}').");
14054
14055
/*[clinic input]
14056
str.__format__ as unicode___format__
14057
14058
    format_spec: unicode
14059
    /
14060
14061
Return a formatted version of the string as described by format_spec.
14062
[clinic start generated code]*/
14063
14064
static PyObject *
unicode___format___impl(PyObject *self, PyObject *format_spec)
/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
{
    /* Format `self` according to the whole of `format_spec` into a
       temporary writer, then turn the writer into the result string. */
    _PyUnicodeWriter writer;
    _PyUnicodeWriter_Init(&writer);

    const Py_ssize_t spec_end = PyUnicode_GET_LENGTH(format_spec);
    if (_PyUnicode_FormatAdvancedWriter(&writer,
                                        self, format_spec, 0,
                                        spec_end) != -1) {
        return _PyUnicodeWriter_Finish(&writer);
    }

    /* Formatting failed: release whatever the writer accumulated. */
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
14081
14082
/*[clinic input]
14083
str.__sizeof__ as unicode_sizeof
14084
14085
Return the size of the string in memory, in bytes.
14086
[clinic start generated code]*/
14087
14088
static PyObject *
unicode_sizeof_impl(PyObject *self)
/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
{
    /* The size is the object header plus the character block (length + 1
       code units), plus a separately stored UTF-8 copy when there is one. */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    Py_ssize_t size;

    if (PyUnicode_IS_COMPACT_ASCII(self)) {
        /* Compact ASCII: header followed by one byte per character. */
        size = sizeof(PyASCIIObject) + nchars + 1;
    }
    else if (PyUnicode_IS_COMPACT(self)) {
        /* Compact non-ASCII: header followed by the character data. */
        size = sizeof(PyCompactUnicodeObject) +
            (nchars + 1) * PyUnicode_KIND(self);
    }
    else {
        /* Two-block object: the base object, plus the character block if
           one is present. */
        size = sizeof(PyUnicodeObject);
        if (_PyUnicode_DATA_ANY(self)) {
            size += (nchars + 1) * PyUnicode_KIND(self);
        }
    }

    if (_PyUnicode_HAS_UTF8_MEMORY(self)) {
        size += PyUnicode_UTF8_LENGTH(self) + 1;
    }

    return PyLong_FromSsize_t(size);
}
14116
14117
/* __getnewargs__: return a 1-tuple holding a copy of `v` made with
   _PyUnicode_Copy().  Returns NULL if the copy fails. */
static PyObject *
unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
{
    PyObject *dup = _PyUnicode_Copy(v);

    /* The "N" format hands the reference to `dup` over to the tuple. */
    return (dup != NULL) ? Py_BuildValue("(N)", dup) : NULL;
}
14125
14126
/*
14127
This function searches for the longest common leading whitespace
14128
of all lines in [src, end).
14129
It returns the length of the common leading whitespace and sets `output` to
14130
point to the beginning of the common leading whitespace if length > 0.
14131
*/
14132
static Py_ssize_t
search_longest_common_leading_whitespace(
    const char *const src,
    const char *const end,
    const char **output)
{
    /* [common, common + common_len) is the longest run of spaces/tabs shared
       by every non-blank line seen so far. */
    const char *common = NULL;
    Py_ssize_t common_len = 0;

    const char *cursor = src;
    while (cursor < end) {
        const char *const line = cursor;
        const char *indent_end = NULL;

        /* Walk to the end of the line, remembering where its indentation
           stops (the first character that is neither ' ' nor '\t'). */
        for (; cursor < end && *cursor != '\n'; ++cursor) {
            if (indent_end != NULL || *cursor == ' ' || *cursor == '\t') {
                continue;
            }
            if (cursor == line) {
                /* A line with no indentation at all: nothing is common. */
                return 0;
            }
            indent_end = cursor;
        }
        /* Step over the '\n' (or past `end`, which terminates the loop). */
        ++cursor;

        if (indent_end == NULL) {
            /* Empty or whitespace-only line: it does not constrain the
               margin. */
            continue;
        }

        if (common == NULL) {
            /* First non-blank line: its indentation is the candidate. */
            common = line;
            common_len = indent_end - line;
            assert(common_len > 0);
            continue;
        }

        /* Shrink the candidate to the prefix it shares with this line's
           indentation. */
        Py_ssize_t shared = 0;
        while (shared < common_len
               && line + shared < indent_end
               && common[shared] == line[shared])
        {
            ++shared;
        }

        common_len = shared;
        if (common_len == 0) {
            return 0;
        }
    }

    assert(common_len >= 0);
    if (common_len > 0) {
        *output = common;
    }
    return common_len;
}
14205
14206
/* Dedent a string.
14207
   Behaviour is expected to be an exact match of `textwrap.dedent`.
14208
   Return a new reference on success, NULL with exception set on error.
14209
   */
14210
PyObject *
14211
_PyUnicode_Dedent(PyObject *unicode)
14212
0
{
14213
0
    Py_ssize_t src_len = 0;
14214
0
    const char *src = PyUnicode_AsUTF8AndSize(unicode, &src_len);
14215
0
    if (!src) {
14216
0
        return NULL;
14217
0
    }
14218
0
    assert(src_len >= 0);
14219
0
    if (src_len == 0) {
14220
0
        return Py_NewRef(unicode);
14221
0
    }
14222
14223
0
    const char *const end = src + src_len;
14224
14225
    // [whitespace_start, whitespace_start + whitespace_len)
14226
    // describes the current longest common leading whitespace
14227
0
    const char *whitespace_start = NULL;
14228
0
    Py_ssize_t whitespace_len = search_longest_common_leading_whitespace(
14229
0
        src, end, &whitespace_start);
14230
14231
0
    if (whitespace_len == 0) {
14232
0
        return Py_NewRef(unicode);
14233
0
    }
14234
14235
    // now we should trigger a dedent
14236
0
    char *dest = PyMem_Malloc(src_len);
14237
0
    if (!dest) {
14238
0
        PyErr_NoMemory();
14239
0
        return NULL;
14240
0
    }
14241
0
    char *dest_iter = dest;
14242
14243
0
    for (const char *iter = src; iter < end; ++iter) {
14244
0
        const char *line_start = iter;
14245
0
        bool in_leading_space = true;
14246
14247
        // iterate over a line to find the end of a line
14248
0
        while (iter < end && *iter != '\n') {
14249
0
            if (in_leading_space && *iter != ' ' && *iter != '\t') {
14250
0
                in_leading_space = false;
14251
0
            }
14252
0
            ++iter;
14253
0
        }
14254
14255
        // invariant: *iter == '\n' or iter == end
14256
0
        bool append_newline = iter < end;
14257
14258
        // if this line has all white space, write '\n' and continue
14259
0
        if (in_leading_space && append_newline) {
14260
0
            *dest_iter++ = '\n';
14261
0
            continue;
14262
0
        }
14263
14264
        /* copy [new_line_start + whitespace_len, iter) to buffer, then
14265
            conditionally append '\n' */
14266
14267
0
        Py_ssize_t new_line_len = iter - line_start - whitespace_len;
14268
0
        assert(new_line_len >= 0);
14269
0
        memcpy(dest_iter, line_start + whitespace_len, new_line_len);
14270
14271
0
        dest_iter += new_line_len;
14272
14273
0
        if (append_newline) {
14274
0
            *dest_iter++ = '\n';
14275
0
        }
14276
0
    }
14277
14278
0
    PyObject *res = PyUnicode_FromStringAndSize(dest, dest_iter - dest);
14279
0
    PyMem_Free(dest);
14280
0
    return res;
14281
0
}
14282
14283
/* Method table: a list of UNICODE_*_METHODDEF macros (defined outside this
   chunk; presumably Argument Clinic output -- confirm), plus hand-written
   entries for "format", "format_map" and "__getnewargs__".  The table ends
   with a {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {
14284
    UNICODE_ENCODE_METHODDEF
14285
    UNICODE_REPLACE_METHODDEF
14286
    UNICODE_SPLIT_METHODDEF
14287
    UNICODE_RSPLIT_METHODDEF
14288
    UNICODE_JOIN_METHODDEF
14289
    UNICODE_CAPITALIZE_METHODDEF
14290
    UNICODE_CASEFOLD_METHODDEF
14291
    UNICODE_TITLE_METHODDEF
14292
    UNICODE_CENTER_METHODDEF
14293
    UNICODE_COUNT_METHODDEF
14294
    UNICODE_EXPANDTABS_METHODDEF
14295
    UNICODE_FIND_METHODDEF
14296
    UNICODE_PARTITION_METHODDEF
14297
    UNICODE_INDEX_METHODDEF
14298
    UNICODE_LJUST_METHODDEF
14299
    UNICODE_LOWER_METHODDEF
14300
    UNICODE_LSTRIP_METHODDEF
14301
    UNICODE_RFIND_METHODDEF
14302
    UNICODE_RINDEX_METHODDEF
14303
    UNICODE_RJUST_METHODDEF
14304
    UNICODE_RSTRIP_METHODDEF
14305
    UNICODE_RPARTITION_METHODDEF
14306
    UNICODE_SPLITLINES_METHODDEF
14307
    UNICODE_STRIP_METHODDEF
14308
    UNICODE_SWAPCASE_METHODDEF
14309
    UNICODE_TRANSLATE_METHODDEF
14310
    UNICODE_UPPER_METHODDEF
14311
    UNICODE_STARTSWITH_METHODDEF
14312
    UNICODE_ENDSWITH_METHODDEF
14313
    UNICODE_REMOVEPREFIX_METHODDEF
14314
    UNICODE_REMOVESUFFIX_METHODDEF
14315
    UNICODE_ISASCII_METHODDEF
14316
    UNICODE_ISLOWER_METHODDEF
14317
    UNICODE_ISUPPER_METHODDEF
14318
    UNICODE_ISTITLE_METHODDEF
14319
    UNICODE_ISSPACE_METHODDEF
14320
    UNICODE_ISDECIMAL_METHODDEF
14321
    UNICODE_ISDIGIT_METHODDEF
14322
    UNICODE_ISNUMERIC_METHODDEF
14323
    UNICODE_ISALPHA_METHODDEF
14324
    UNICODE_ISALNUM_METHODDEF
14325
    UNICODE_ISIDENTIFIER_METHODDEF
14326
    UNICODE_ISPRINTABLE_METHODDEF
14327
    UNICODE_ZFILL_METHODDEF
14328
    /* format and format_map take their docstrings from format__doc__ and
       format_map__doc__. */
    {"format", _PyCFunction_CAST(do_string_format), METH_VARARGS | METH_KEYWORDS, format__doc__},
14329
    {"format_map", do_string_format_map, METH_O, format_map__doc__},
14330
    UNICODE___FORMAT___METHODDEF
14331
    UNICODE_MAKETRANS_METHODDEF
14332
    UNICODE_SIZEOF_METHODDEF
14333
    /* Registered without a docstring. */
    {"__getnewargs__",  unicode_getnewargs, METH_NOARGS},
14334
    {NULL, NULL}
14335
};
14336
14337
/* nb_remainder slot: `v % w`.

   Formatting is delegated to PyUnicode_Format() when the left operand is a
   str; otherwise NotImplemented is returned. */
static PyObject *
unicode_mod(PyObject *v, PyObject *w)
{
    if (PyUnicode_Check(v)) {
        return PyUnicode_Format(v, w);
    }
    Py_RETURN_NOTIMPLEMENTED;
}
14344
14345
static PyNumberMethods unicode_as_number = {
14346
    0,              /*nb_add*/
14347
    0,              /*nb_subtract*/
14348
    0,              /*nb_multiply*/
14349
    unicode_mod,            /*nb_remainder*/
14350
};
14351
14352
static PySequenceMethods unicode_as_sequence = {
14353
    unicode_length,     /* sq_length */
14354
    PyUnicode_Concat,   /* sq_concat */
14355
    unicode_repeat,     /* sq_repeat */
14356
    unicode_getitem,    /* sq_item */
14357
    0,                  /* sq_slice */
14358
    0,                  /* sq_ass_item */
14359
    0,                  /* sq_ass_slice */
14360
    PyUnicode_Contains, /* sq_contains */
14361
};
14362
14363
/* mp_subscript slot: self[item].

   `item` may be an index-like object or a slice.  Anything else raises
   TypeError.  Returns a new reference, or NULL with an exception set. */
static PyObject*
unicode_subscript(PyObject* self, PyObject* item)
{
    if (_PyIndex_Check(item)) {
        Py_ssize_t index = PyNumber_AsSsize_t(item, PyExc_IndexError);
        if (index == -1 && PyErr_Occurred()) {
            return NULL;
        }
        /* Negative indices count from the end. */
        if (index < 0) {
            index += PyUnicode_GET_LENGTH(self);
        }
        return unicode_getitem(self, index);
    }

    if (!PySlice_Check(item)) {
        PyErr_Format(PyExc_TypeError, "string indices must be integers, not '%.200s'",
                     Py_TYPE(item)->tp_name);
        return NULL;
    }

    Py_ssize_t start, stop, step;
    if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
        return NULL;
    }
    const Py_ssize_t slicelength =
        PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
                              &start, &stop, step);

    if (slicelength <= 0) {
        _Py_RETURN_UNICODE_EMPTY();
    }

    if (step == 1) {
        /* Contiguous slice: either the whole string or a substring. */
        if (start == 0 && slicelength == PyUnicode_GET_LENGTH(self)) {
            return unicode_result_unchanged(self);
        }
        return PyUnicode_Substring(self, start, start + slicelength);
    }

    /* General case: a strided slice, copied character by character. */
    const int src_kind = PyUnicode_KIND(self);
    const void *src_data = PyUnicode_DATA(self);

    /* Find the largest selected character so the result gets the
       narrowest storage that fits. */
    Py_UCS4 max_char;
    if (PyUnicode_IS_ASCII(self)) {
        max_char = 127;
    }
    else {
        const Py_UCS4 kind_limit = kind_maxchar_limit(src_kind);
        size_t pos = start;
        max_char = 0;
        for (Py_ssize_t n = 0; n < slicelength; n++, pos += step) {
            Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, pos);
            if (ch <= max_char) {
                continue;
            }
            max_char = ch;
            if (max_char >= kind_limit) {
                /* Reached kind_limit for this kind; stop scanning. */
                break;
            }
        }
    }

    PyObject *result = PyUnicode_New(slicelength, max_char);
    if (result == NULL) {
        return NULL;
    }
    const int dest_kind = PyUnicode_KIND(result);
    void *dest_data = PyUnicode_DATA(result);

    size_t pos = start;
    for (Py_ssize_t n = 0; n < slicelength; n++, pos += step) {
        Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, pos);
        PyUnicode_WRITE(dest_kind, dest_data, n, ch);
    }
    assert(_PyUnicode_CheckConsistency(result, 1));
    return result;
}
14432
14433
static PyMappingMethods unicode_as_mapping = {
14434
    unicode_length,     /* mp_length */
14435
    unicode_subscript,  /* mp_subscript */
14436
    0,                  /* mp_ass_subscript */
14437
};
14438
14439
14440
static PyObject *
14441
unicode_subtype_new(PyTypeObject *type, PyObject *unicode);
14442
14443
/*[clinic input]
14444
@classmethod
14445
str.__new__ as unicode_new
14446
14447
    object as x: object = NULL
14448
    encoding: str = NULL
14449
    errors: str = NULL
14450
14451
[clinic start generated code]*/
14452
14453
static PyObject *
14454
unicode_new_impl(PyTypeObject *type, PyObject *x, const char *encoding,
14455
                 const char *errors)
14456
/*[clinic end generated code: output=fc72d4878b0b57e9 input=e81255e5676d174e]*/
14457
11.2M
{
14458
11.2M
    PyObject *unicode;
14459
11.2M
    if (x == NULL) {
14460
0
        unicode = unicode_get_empty();
14461
0
    }
14462
11.2M
    else if (encoding == NULL && errors == NULL) {
14463
11.2M
        unicode = PyObject_Str(x);
14464
11.2M
    }
14465
0
    else {
14466
0
        unicode = PyUnicode_FromEncodedObject(x, encoding, errors);
14467
0
    }
14468
14469
11.2M
    if (unicode != NULL && type != &PyUnicode_Type) {
14470
11.2M
        Py_SETREF(unicode, unicode_subtype_new(type, unicode));
14471
11.2M
    }
14472
11.2M
    return unicode;
14473
11.2M
}
14474
14475
static const char *
14476
arg_as_utf8(PyObject *obj, const char *name)
14477
1.40M
{
14478
1.40M
    if (!PyUnicode_Check(obj)) {
14479
0
        PyErr_Format(PyExc_TypeError,
14480
0
                     "str() argument '%s' must be str, not %T",
14481
0
                     name, obj);
14482
0
        return NULL;
14483
0
    }
14484
1.40M
    return _PyUnicode_AsUTF8NoNUL(obj);
14485
1.40M
}
14486
14487
/* Vectorcall entry point for calling the exact str type.

   Calls with keyword arguments are repackaged into a tuple and a dict and
   handed to unicode_new().  Positional-only calls are handled here:
   str() -> empty string, str(object) -> PyObject_Str(object), and
   str(object, encoding[, errors]) -> PyUnicode_FromEncodedObject(). */
static PyObject *
unicode_vectorcall(PyObject *type, PyObject *const *args,
                   size_t nargsf, PyObject *kwnames)
{
    assert(Py_Is(_PyType_CAST(type), &PyUnicode_Type));

    const Py_ssize_t nargs = PyVectorcall_NARGS(nargsf);
    const int has_keywords =
        (kwnames != NULL && PyTuple_GET_SIZE(kwnames) != 0);

    if (has_keywords) {
        // Fallback to unicode_new()
        PyObject *positional = PyTuple_FromArray(args, nargs);
        if (positional == NULL) {
            return NULL;
        }
        PyObject *keywords = _PyStack_AsDict(args + nargs, kwnames);
        if (keywords == NULL) {
            Py_DECREF(positional);
            return NULL;
        }
        PyObject *made = unicode_new(_PyType_CAST(type), positional, keywords);
        Py_DECREF(positional);
        Py_DECREF(keywords);
        return made;
    }

    if (!_PyArg_CheckPositional("str", nargs, 0, 3)) {
        return NULL;
    }
    if (nargs == 0) {
        return unicode_get_empty();
    }

    PyObject *object = args[0];
    if (nargs == 1) {
        return PyObject_Str(object);
    }

    /* Two or three arguments: decode `object`. */
    const char *encoding = arg_as_utf8(args[1], "encoding");
    if (encoding == NULL) {
        return NULL;
    }

    const char *errors = NULL;
    if (nargs == 3) {
        errors = arg_as_utf8(args[2], "errors");
        if (errors == NULL) {
            return NULL;
        }
    }
    return PyUnicode_FromEncodedObject(object, encoding, errors);
}
14533
14534
static PyObject *
14535
unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
14536
11.2M
{
14537
11.2M
    PyObject *self;
14538
11.2M
    Py_ssize_t length, char_size;
14539
11.2M
    int share_utf8;
14540
11.2M
    int kind;
14541
11.2M
    void *data;
14542
14543
11.2M
    assert(PyType_IsSubtype(type, &PyUnicode_Type));
14544
11.2M
    assert(_PyUnicode_CHECK(unicode));
14545
14546
11.2M
    self = type->tp_alloc(type, 0);
14547
11.2M
    if (self == NULL) {
14548
0
        return NULL;
14549
0
    }
14550
11.2M
    kind = PyUnicode_KIND(unicode);
14551
11.2M
    length = PyUnicode_GET_LENGTH(unicode);
14552
14553
11.2M
    _PyUnicode_LENGTH(self) = length;
14554
#ifdef Py_DEBUG
14555
    _PyUnicode_HASH(self) = -1;
14556
#else
14557
11.2M
    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14558
11.2M
#endif
14559
11.2M
    _PyUnicode_STATE(self).interned = 0;
14560
11.2M
    _PyUnicode_STATE(self).kind = kind;
14561
11.2M
    _PyUnicode_STATE(self).compact = 0;
14562
11.2M
    _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
14563
11.2M
    _PyUnicode_STATE(self).statically_allocated = 0;
14564
11.2M
    PyUnicode_SET_UTF8_LENGTH(self, 0);
14565
11.2M
    PyUnicode_SET_UTF8(self, NULL);
14566
11.2M
    _PyUnicode_DATA_ANY(self) = NULL;
14567
14568
11.2M
    share_utf8 = 0;
14569
11.2M
    if (kind == PyUnicode_1BYTE_KIND) {
14570
9.97M
        char_size = 1;
14571
9.97M
        if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14572
9.93M
            share_utf8 = 1;
14573
9.97M
    }
14574
1.24M
    else if (kind == PyUnicode_2BYTE_KIND) {
14575
1.19M
        char_size = 2;
14576
1.19M
    }
14577
51.3k
    else {
14578
51.3k
        assert(kind == PyUnicode_4BYTE_KIND);
14579
51.3k
        char_size = 4;
14580
51.3k
    }
14581
14582
    /* Ensure we won't overflow the length. */
14583
11.2M
    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14584
0
        PyErr_NoMemory();
14585
0
        goto onError;
14586
0
    }
14587
11.2M
    data = PyMem_Malloc((length + 1) * char_size);
14588
11.2M
    if (data == NULL) {
14589
0
        PyErr_NoMemory();
14590
0
        goto onError;
14591
0
    }
14592
14593
11.2M
    _PyUnicode_DATA_ANY(self) = data;
14594
11.2M
    if (share_utf8) {
14595
9.93M
        PyUnicode_SET_UTF8_LENGTH(self, length);
14596
9.93M
        PyUnicode_SET_UTF8(self, data);
14597
9.93M
    }
14598
14599
11.2M
    memcpy(data, PyUnicode_DATA(unicode), kind * (length + 1));
14600
11.2M
    assert(_PyUnicode_CheckConsistency(self, 1));
14601
#ifdef Py_DEBUG
14602
    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14603
#endif
14604
11.2M
    return self;
14605
14606
0
onError:
14607
0
    Py_DECREF(self);
14608
0
    return NULL;
14609
11.2M
}
14610
14611
void
14612
_PyUnicode_ExactDealloc(PyObject *op)
14613
76.6M
{
14614
76.6M
    assert(PyUnicode_CheckExact(op));
14615
76.6M
    unicode_dealloc(op);
14616
76.6M
}
14617
14618
PyDoc_STRVAR(unicode_doc,
14619
"str(object='') -> str\n\
14620
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
14621
\n\
14622
Create a new string object from the given object. If encoding or\n\
14623
errors is specified, then the object must expose a data buffer\n\
14624
that will be decoded using the given encoding and error handler.\n\
14625
Otherwise, returns the result of object.__str__() (if defined)\n\
14626
or repr(object).\n\
14627
encoding defaults to 'utf-8'.\n\
14628
errors defaults to 'strict'.");
14629
14630
static PyObject *unicode_iter(PyObject *seq);
14631
14632
PyTypeObject PyUnicode_Type = {
14633
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
14634
    "str",                        /* tp_name */
14635
    sizeof(PyUnicodeObject),      /* tp_basicsize */
14636
    0,                            /* tp_itemsize */
14637
    /* Slots */
14638
    unicode_dealloc,              /* tp_dealloc */
14639
    0,                            /* tp_vectorcall_offset */
14640
    0,                            /* tp_getattr */
14641
    0,                            /* tp_setattr */
14642
    0,                            /* tp_as_async */
14643
    unicode_repr,                 /* tp_repr */
14644
    &unicode_as_number,           /* tp_as_number */
14645
    &unicode_as_sequence,         /* tp_as_sequence */
14646
    &unicode_as_mapping,          /* tp_as_mapping */
14647
    unicode_hash,                 /* tp_hash*/
14648
    0,                            /* tp_call*/
14649
    unicode_str,                  /* tp_str */
14650
    PyObject_GenericGetAttr,      /* tp_getattro */
14651
    0,                            /* tp_setattro */
14652
    0,                            /* tp_as_buffer */
14653
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
14654
        Py_TPFLAGS_UNICODE_SUBCLASS |
14655
        _Py_TPFLAGS_MATCH_SELF, /* tp_flags */
14656
    unicode_doc,                  /* tp_doc */
14657
    0,                            /* tp_traverse */
14658
    0,                            /* tp_clear */
14659
    PyUnicode_RichCompare,        /* tp_richcompare */
14660
    0,                            /* tp_weaklistoffset */
14661
    unicode_iter,                 /* tp_iter */
14662
    0,                            /* tp_iternext */
14663
    unicode_methods,              /* tp_methods */
14664
    0,                            /* tp_members */
14665
    0,                            /* tp_getset */
14666
    0,                            /* tp_base */
14667
    0,                            /* tp_dict */
14668
    0,                            /* tp_descr_get */
14669
    0,                            /* tp_descr_set */
14670
    0,                            /* tp_dictoffset */
14671
    0,                            /* tp_init */
14672
    0,                            /* tp_alloc */
14673
    unicode_new,                  /* tp_new */
14674
    PyObject_Free,                /* tp_free */
14675
    .tp_vectorcall = unicode_vectorcall,
14676
};
14677
14678
/* Initialize the Unicode implementation */
14679
14680
static void
14681
_init_global_state(void)
14682
16
{
14683
16
    static int initialized = 0;
14684
16
    if (initialized) {
14685
0
        return;
14686
0
    }
14687
16
    initialized = 1;
14688
14689
    /* initialize the linebreak bloom filter */
14690
16
    const Py_UCS2 linebreak[] = {
14691
16
        0x000A, /* LINE FEED */
14692
16
        0x000D, /* CARRIAGE RETURN */
14693
16
        0x001C, /* FILE SEPARATOR */
14694
16
        0x001D, /* GROUP SEPARATOR */
14695
16
        0x001E, /* RECORD SEPARATOR */
14696
16
        0x0085, /* NEXT LINE */
14697
16
        0x2028, /* LINE SEPARATOR */
14698
16
        0x2029, /* PARAGRAPH SEPARATOR */
14699
16
    };
14700
16
    bloom_linebreak = make_bloom_mask(
14701
16
        PyUnicode_2BYTE_KIND, linebreak,
14702
16
        Py_ARRAY_LENGTH(linebreak));
14703
16
}
14704
14705
void
14706
_PyUnicode_InitState(PyInterpreterState *interp)
14707
16
{
14708
16
    if (!_Py_IsMainInterpreter(interp)) {
14709
0
        return;
14710
0
    }
14711
16
    _init_global_state();
14712
16
}
14713
14714
14715
PyStatus
14716
_PyUnicode_InitGlobalObjects(PyInterpreterState *interp)
14717
16
{
14718
16
    if (_Py_IsMainInterpreter(interp)) {
14719
16
        PyStatus status = init_global_interned_strings(interp);
14720
16
        if (_PyStatus_EXCEPTION(status)) {
14721
0
            return status;
14722
0
        }
14723
16
    }
14724
16
    assert(INTERNED_STRINGS);
14725
14726
16
    if (init_interned_dict(interp)) {
14727
0
        PyErr_Clear();
14728
0
        return _PyStatus_ERR("failed to create interned dict");
14729
0
    }
14730
14731
16
    return _PyStatus_OK();
14732
16
}
14733
14734
14735
PyStatus
14736
_PyUnicode_InitTypes(PyInterpreterState *interp)
14737
16
{
14738
16
    if (_PyStaticType_InitBuiltin(interp, &EncodingMapType) < 0) {
14739
0
        goto error;
14740
0
    }
14741
16
    if (_PyStaticType_InitBuiltin(interp, &PyFieldNameIter_Type) < 0) {
14742
0
        goto error;
14743
0
    }
14744
16
    if (_PyStaticType_InitBuiltin(interp, &PyFormatterIter_Type) < 0) {
14745
0
        goto error;
14746
0
    }
14747
16
    return _PyStatus_OK();
14748
14749
0
error:
14750
0
    return _PyStatus_ERR("Can't initialize unicode types");
14751
16
}
14752
14753
static /* non-null */ PyObject*
14754
intern_static(PyInterpreterState *interp, PyObject *s /* stolen */)
14755
17.1k
{
14756
    // Note that this steals a reference to `s`, but in many cases that
14757
    // stolen ref is returned, requiring no decref/incref.
14758
14759
17.1k
    assert(s != NULL);
14760
17.1k
    assert(_PyUnicode_CHECK(s));
14761
17.1k
    assert(_PyUnicode_STATE(s).statically_allocated);
14762
17.1k
    assert(!PyUnicode_CHECK_INTERNED(s));
14763
14764
#ifdef Py_DEBUG
14765
    /* We must not add process-global interned string if there's already a
14766
     * per-interpreter interned_dict, which might contain duplicates.
14767
     */
14768
    PyObject *interned = get_interned_dict(interp);
14769
    assert(interned == NULL);
14770
#endif
14771
14772
    /* Look in the global cache first. */
14773
17.1k
    PyObject *r = (PyObject *)_Py_hashtable_get(INTERNED_STRINGS, s);
14774
    /* We should only init each string once */
14775
17.1k
    assert(r == NULL);
14776
    /* but just in case (for the non-debug build), handle this */
14777
17.1k
    if (r != NULL && r != s) {
14778
0
        assert(_PyUnicode_STATE(r).interned == SSTATE_INTERNED_IMMORTAL_STATIC);
14779
0
        assert(_PyUnicode_CHECK(r));
14780
0
        Py_DECREF(s);
14781
0
        return Py_NewRef(r);
14782
0
    }
14783
14784
17.1k
    if (_Py_hashtable_set(INTERNED_STRINGS, s, s) < -1) {
14785
0
        Py_FatalError("failed to intern static string");
14786
0
    }
14787
14788
17.1k
    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_IMMORTAL_STATIC;
14789
17.1k
    return s;
14790
17.1k
}
14791
14792
void
14793
_PyUnicode_InternStatic(PyInterpreterState *interp, PyObject **p)
14794
17.1k
{
14795
    // This should only be called as part of runtime initialization
14796
17.1k
    assert(!Py_IsInitialized());
14797
14798
17.1k
    *p = intern_static(interp, *p);
14799
17.1k
    assert(*p);
14800
17.1k
}
14801
14802
static void
14803
immortalize_interned(PyObject *s)
14804
100k
{
14805
100k
    assert(PyUnicode_CHECK_INTERNED(s) == SSTATE_INTERNED_MORTAL);
14806
100k
    assert(!_Py_IsImmortal(s));
14807
#ifdef Py_REF_DEBUG
14808
    /* The reference count value should be excluded from the RefTotal.
14809
       The decrements to these objects will not be registered so they
14810
       need to be accounted for in here. */
14811
    for (Py_ssize_t i = 0; i < Py_REFCNT(s); i++) {
14812
        _Py_DecRefTotal(_PyThreadState_GET());
14813
    }
14814
#endif
14815
100k
    FT_ATOMIC_STORE_UINT8_RELAXED(_PyUnicode_STATE(s).interned, SSTATE_INTERNED_IMMORTAL);
14816
100k
    _Py_SetImmortal(s);
14817
100k
}
14818
14819
static /* non-null */ PyObject*
14820
intern_common(PyInterpreterState *interp, PyObject *s /* stolen */,
14821
              bool immortalize)
14822
36.2M
{
14823
    // Note that this steals a reference to `s`, but in many cases that
14824
    // stolen ref is returned, requiring no decref/incref.
14825
14826
#ifdef Py_DEBUG
14827
    assert(s != NULL);
14828
    assert(_PyUnicode_CHECK(s));
14829
#else
14830
36.2M
    if (s == NULL || !PyUnicode_Check(s)) {
14831
0
        return s;
14832
0
    }
14833
36.2M
#endif
14834
14835
    /* If it's a subclass, we don't really know what putting
14836
       it in the interned dict might do. */
14837
36.2M
    if (!PyUnicode_CheckExact(s)) {
14838
0
        return s;
14839
0
    }
14840
14841
    /* Is it already interned? */
14842
36.2M
    switch (PyUnicode_CHECK_INTERNED(s)) {
14843
3.00M
        case SSTATE_NOT_INTERNED:
14844
            // no, go on
14845
3.00M
            break;
14846
19.3k
        case SSTATE_INTERNED_MORTAL:
14847
            // yes but we might need to make it immortal
14848
19.3k
            if (immortalize) {
14849
5.44k
                immortalize_interned(s);
14850
5.44k
            }
14851
19.3k
            return s;
14852
33.2M
        default:
14853
            // all done
14854
33.2M
            return s;
14855
36.2M
    }
14856
14857
    /* Statically allocated strings must be already interned. */
14858
36.2M
    assert(!_PyUnicode_STATE(s).statically_allocated);
14859
14860
#if Py_GIL_DISABLED
14861
    /* In the free-threaded build, all interned strings are immortal */
14862
    immortalize = 1;
14863
#endif
14864
14865
    /* If it's already immortal, intern it as such */
14866
3.00M
    if (_Py_IsImmortal(s)) {
14867
0
        immortalize = 1;
14868
0
    }
14869
14870
    /* if it's a short string, get the singleton */
14871
3.00M
    if (PyUnicode_GET_LENGTH(s) == 1 &&
14872
24.9k
                PyUnicode_KIND(s) == PyUnicode_1BYTE_KIND) {
14873
0
        PyObject *r = LATIN1(*(unsigned char*)PyUnicode_DATA(s));
14874
0
        assert(PyUnicode_CHECK_INTERNED(r));
14875
0
        Py_DECREF(s);
14876
0
        return r;
14877
0
    }
14878
#ifdef Py_DEBUG
14879
    assert(!unicode_is_singleton(s));
14880
#endif
14881
14882
    /* Look in the global cache now. */
14883
3.00M
    {
14884
3.00M
        PyObject *r = (PyObject *)_Py_hashtable_get(INTERNED_STRINGS, s);
14885
3.00M
        if (r != NULL) {
14886
284k
            assert(_PyUnicode_STATE(r).statically_allocated);
14887
284k
            assert(r != s);  // r must be statically_allocated; s is not
14888
284k
            Py_DECREF(s);
14889
284k
            return Py_NewRef(r);
14890
284k
        }
14891
3.00M
    }
14892
14893
    /* Do a setdefault on the per-interpreter cache. */
14894
2.72M
    PyObject *interned = get_interned_dict(interp);
14895
2.72M
    assert(interned != NULL);
14896
#ifdef Py_GIL_DISABLED
14897
#  define INTERN_MUTEX &_Py_INTERP_CACHED_OBJECT(interp, interned_mutex)
14898
#endif
14899
2.72M
    FT_MUTEX_LOCK(INTERN_MUTEX);
14900
2.72M
    PyObject *t;
14901
2.72M
    {
14902
2.72M
        int res = PyDict_SetDefaultRef(interned, s, s, &t);
14903
2.72M
        if (res < 0) {
14904
0
            PyErr_Clear();
14905
0
            FT_MUTEX_UNLOCK(INTERN_MUTEX);
14906
0
            return s;
14907
0
        }
14908
2.72M
        else if (res == 1) {
14909
            // value was already present (not inserted)
14910
2.06M
            Py_DECREF(s);
14911
2.06M
            if (immortalize &&
14912
574k
                    PyUnicode_CHECK_INTERNED(t) == SSTATE_INTERNED_MORTAL) {
14913
4.19k
                immortalize_interned(t);
14914
4.19k
            }
14915
2.06M
            FT_MUTEX_UNLOCK(INTERN_MUTEX);
14916
2.06M
            return t;
14917
2.06M
        }
14918
656k
        else {
14919
            // value was newly inserted
14920
656k
            assert (s == t);
14921
656k
            Py_DECREF(t);
14922
656k
        }
14923
2.72M
    }
14924
14925
    /* NOT_INTERNED -> INTERNED_MORTAL */
14926
14927
2.72M
    assert(_PyUnicode_STATE(s).interned == SSTATE_NOT_INTERNED);
14928
14929
656k
    if (!_Py_IsImmortal(s)) {
14930
        /* The two references in interned dict (key and value) are not counted.
14931
        unicode_dealloc() and _PyUnicode_ClearInterned() take care of this. */
14932
656k
        Py_DECREF(s);
14933
656k
        Py_DECREF(s);
14934
656k
    }
14935
656k
    FT_ATOMIC_STORE_UINT8_RELAXED(_PyUnicode_STATE(s).interned, SSTATE_INTERNED_MORTAL);
14936
14937
    /* INTERNED_MORTAL -> INTERNED_IMMORTAL (if needed) */
14938
14939
#ifdef Py_DEBUG
14940
    if (_Py_IsImmortal(s)) {
14941
        assert(immortalize);
14942
    }
14943
#endif
14944
656k
    if (immortalize) {
14945
90.3k
        immortalize_interned(s);
14946
90.3k
    }
14947
14948
656k
    FT_MUTEX_UNLOCK(INTERN_MUTEX);
14949
656k
    return s;
14950
2.72M
}
14951
14952
void
14953
_PyUnicode_InternImmortal(PyInterpreterState *interp, PyObject **p)
14954
2.58M
{
14955
2.58M
    *p = intern_common(interp, *p, 1);
14956
2.58M
    assert(*p);
14957
2.58M
}
14958
14959
void
14960
_PyUnicode_InternMortal(PyInterpreterState *interp, PyObject **p)
14961
33.6M
{
14962
33.6M
    *p = intern_common(interp, *p, 0);
14963
33.6M
    assert(*p);
14964
33.6M
}
14965
14966
14967
void
14968
_PyUnicode_InternInPlace(PyInterpreterState *interp, PyObject **p)
14969
0
{
14970
0
    _PyUnicode_InternImmortal(interp, p);
14971
0
    return;
14972
0
}
14973
14974
void
14975
PyUnicode_InternInPlace(PyObject **p)
14976
0
{
14977
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
14978
0
    _PyUnicode_InternMortal(interp, p);
14979
0
}
14980
14981
// Public-looking name kept for the stable ABI; user should not call this:
14982
PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
14983
void
14984
PyUnicode_InternImmortal(PyObject **p)
14985
0
{
14986
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
14987
0
    _PyUnicode_InternImmortal(interp, p);
14988
0
}
14989
14990
PyObject *
14991
PyUnicode_InternFromString(const char *cp)
14992
1.21M
{
14993
1.21M
    PyObject *s = PyUnicode_FromString(cp);
14994
1.21M
    if (s == NULL) {
14995
0
        return NULL;
14996
0
    }
14997
1.21M
    PyInterpreterState *interp = _PyInterpreterState_GET();
14998
1.21M
    _PyUnicode_InternMortal(interp, &s);
14999
1.21M
    return s;
15000
1.21M
}
15001
15002
15003
void
15004
_PyUnicode_ClearInterned(PyInterpreterState *interp)
15005
0
{
15006
0
    PyObject *interned = get_interned_dict(interp);
15007
0
    if (interned == NULL) {
15008
0
        return;
15009
0
    }
15010
0
    assert(PyDict_CheckExact(interned));
15011
15012
0
    if (has_shared_intern_dict(interp)) {
15013
        // the dict doesn't belong to this interpreter, skip the debug
15014
        // checks on it and just clear the pointer to it
15015
0
        clear_interned_dict(interp);
15016
0
        return;
15017
0
    }
15018
15019
#ifdef INTERNED_STATS
15020
    fprintf(stderr, "releasing %zd interned strings\n",
15021
            PyDict_GET_SIZE(interned));
15022
15023
    Py_ssize_t total_length = 0;
15024
#endif
15025
0
    Py_ssize_t pos = 0;
15026
0
    PyObject *s, *ignored_value;
15027
0
    while (PyDict_Next(interned, &pos, &s, &ignored_value)) {
15028
0
        int shared = 0;
15029
0
        switch (PyUnicode_CHECK_INTERNED(s)) {
15030
0
        case SSTATE_INTERNED_IMMORTAL:
15031
            /* Make immortal interned strings mortal again. */
15032
            // Skip the Immortal Instance check and restore
15033
            // the two references (key and value) ignored
15034
            // by PyUnicode_InternInPlace().
15035
0
            _Py_SetMortal(s, 2);
15036
#ifdef Py_REF_DEBUG
15037
            /* let's be pedantic with the ref total */
15038
            _Py_IncRefTotal(_PyThreadState_GET());
15039
            _Py_IncRefTotal(_PyThreadState_GET());
15040
#endif
15041
#ifdef INTERNED_STATS
15042
            total_length += PyUnicode_GET_LENGTH(s);
15043
#endif
15044
0
            break;
15045
0
        case SSTATE_INTERNED_IMMORTAL_STATIC:
15046
            /* It is shared between interpreters, so we should unmark it
15047
               only when this is the last interpreter in which it's
15048
               interned.  We immortalize all the statically initialized
15049
               strings during startup, so we can rely on the
15050
               main interpreter to be the last one. */
15051
0
            if (!_Py_IsMainInterpreter(interp)) {
15052
0
                shared = 1;
15053
0
            }
15054
0
            break;
15055
0
        case SSTATE_INTERNED_MORTAL:
15056
            // Restore 2 references held by the interned dict; these will
15057
            // be decref'd by clear_interned_dict's PyDict_Clear.
15058
0
            _Py_RefcntAdd(s, 2);
15059
#ifdef Py_REF_DEBUG
15060
            /* let's be pedantic with the ref total */
15061
            _Py_IncRefTotal(_PyThreadState_GET());
15062
            _Py_IncRefTotal(_PyThreadState_GET());
15063
#endif
15064
0
            break;
15065
0
        case SSTATE_NOT_INTERNED:
15066
0
            _Py_FALLTHROUGH;
15067
0
        default:
15068
0
            Py_UNREACHABLE();
15069
0
        }
15070
0
        if (!shared) {
15071
0
            FT_ATOMIC_STORE_UINT8_RELAXED(_PyUnicode_STATE(s).interned, SSTATE_NOT_INTERNED);
15072
0
        }
15073
0
    }
15074
#ifdef INTERNED_STATS
15075
    fprintf(stderr,
15076
            "total length of all interned strings: %zd characters\n",
15077
            total_length);
15078
#endif
15079
15080
0
    struct _Py_unicode_state *state = &interp->unicode;
15081
0
    struct _Py_unicode_ids *ids = &state->ids;
15082
0
    for (Py_ssize_t i=0; i < ids->size; i++) {
15083
0
        Py_XINCREF(ids->array[i]);
15084
0
    }
15085
0
    clear_interned_dict(interp);
15086
0
    if (_Py_IsMainInterpreter(interp)) {
15087
0
        clear_global_interned_strings();
15088
0
    }
15089
0
}
15090
15091
15092
/********************* Unicode Iterator **************************/
15093
15094
typedef struct {
15095
    PyObject_HEAD
15096
    Py_ssize_t it_index;
15097
    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
15098
} unicodeiterobject;
15099
15100
static void
15101
unicodeiter_dealloc(PyObject *op)
15102
1.80M
{
15103
1.80M
    unicodeiterobject *it = (unicodeiterobject *)op;
15104
1.80M
    _PyObject_GC_UNTRACK(it);
15105
1.80M
    Py_XDECREF(it->it_seq);
15106
1.80M
    PyObject_GC_Del(it);
15107
1.80M
}
15108
15109
static int
15110
unicodeiter_traverse(PyObject *op, visitproc visit, void *arg)
15111
11
{
15112
11
    unicodeiterobject *it = (unicodeiterobject *)op;
15113
11
    Py_VISIT(it->it_seq);
15114
11
    return 0;
15115
11
}
15116
15117
/* tp_iternext for str_iterator: return the next character as a str.
   When the end is reached the string is released and it_seq is set to
   NULL; this call and all later ones return NULL. */
static PyObject *
unicodeiter_next(PyObject *op)
{
    unicodeiterobject *it = (unicodeiterobject *)op;
    assert(it != NULL);

    PyObject *seq = it->it_seq;
    if (seq == NULL) {
        return NULL;
    }
    assert(_PyUnicode_CHECK(seq));

    if (it->it_index >= PyUnicode_GET_LENGTH(seq)) {
        /* Exhausted: drop the string so later calls return at once. */
        it->it_seq = NULL;
        Py_DECREF(seq);
        return NULL;
    }

    int kind = PyUnicode_KIND(seq);
    const void *data = PyUnicode_DATA(seq);
    Py_UCS4 ch = PyUnicode_READ(kind, data, it->it_index);
    it->it_index++;
    return unicode_char(ch);
}
15141
15142
/* tp_iternext for str_ascii_iterator: the string must be compact ASCII
   (asserted), so each character is returned as the matching entry of the
   ascii singleton table.  When the end is reached the string is released
   and it_seq is set to NULL; this call and all later ones return NULL. */
static PyObject *
unicode_ascii_iter_next(PyObject *op)
{
    unicodeiterobject *it = (unicodeiterobject *)op;
    assert(it != NULL);

    PyObject *seq = it->it_seq;
    if (seq == NULL) {
        return NULL;
    }
    assert(_PyUnicode_CHECK(seq));
    assert(PyUnicode_IS_COMPACT_ASCII(seq));

    if (it->it_index >= PyUnicode_GET_LENGTH(seq)) {
        /* Exhausted: drop the string so later calls return at once. */
        it->it_seq = NULL;
        Py_DECREF(seq);
        return NULL;
    }

    /* The characters are read from directly after the PyASCIIObject
       header. */
    const void *data = ((void*)(_PyASCIIObject_CAST(seq) + 1));
    Py_UCS1 ch = (Py_UCS1)PyUnicode_READ(PyUnicode_1BYTE_KIND,
                                         data, it->it_index);
    it->it_index++;
    return (PyObject*)&_Py_SINGLETON(strings).ascii[ch];
}
15164
15165
/* __length_hint__: number of characters still to be produced, or 0 once
   the iterator is exhausted.  Returned as a Python int. */
static PyObject *
unicodeiter_len(PyObject *op, PyObject *Py_UNUSED(ignored))
{
    unicodeiterobject *it = (unicodeiterobject *)op;
    Py_ssize_t remaining = (it->it_seq != NULL)
        ? PyUnicode_GET_LENGTH(it->it_seq) - it->it_index
        : 0;

    return PyLong_FromSsize_t(remaining);
}
15174
15175
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15176
15177
/* __reduce__: return (iter, (string,), index) for a live iterator, or
   (iter, ('',)) for an exhausted one, where `iter` is the builtin. */
static PyObject *
unicodeiter_reduce(PyObject *op, PyObject *Py_UNUSED(ignored))
{
    unicodeiterobject *it = (unicodeiterobject *)op;

    /* _PyEval_GetBuiltin can invoke arbitrary code, so it must be called
     * before the iterator's fields are read.
     * see issue #101765 */
    PyObject *iter_builtin = _PyEval_GetBuiltin(&_Py_ID(iter));

    if (it->it_seq != NULL) {
        return Py_BuildValue("N(O)n", iter_builtin, it->it_seq, it->it_index);
    }

    PyObject *empty = unicode_get_empty();
    if (empty == NULL) {
        Py_XDECREF(iter_builtin);
        return NULL;
    }
    return Py_BuildValue("N(N)", iter_builtin, empty);
}
15198
15199
PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15200
15201
/* __setstate__: set the iterator position from the integer `state`,
   clamped to [0, len(string)].  An exhausted iterator is left unchanged.
   Returns None, or NULL if `state` cannot be converted to Py_ssize_t. */
static PyObject *
unicodeiter_setstate(PyObject *op, PyObject *state)
{
    unicodeiterobject *it = (unicodeiterobject *)op;

    Py_ssize_t index = PyLong_AsSsize_t(state);
    if (index == -1 && PyErr_Occurred()) {
        return NULL;
    }

    if (it->it_seq != NULL) {
        if (index < 0) {
            index = 0;
        }
        else if (index > PyUnicode_GET_LENGTH(it->it_seq)) {
            /* iterator truncated */
            index = PyUnicode_GET_LENGTH(it->it_seq);
        }
        it->it_index = index;
    }
    Py_RETURN_NONE;
}
15217
15218
PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15219
15220
static PyMethodDef unicodeiter_methods[] = {
15221
    {"__length_hint__", unicodeiter_len, METH_NOARGS, length_hint_doc},
15222
    {"__reduce__",      unicodeiter_reduce, METH_NOARGS, reduce_doc},
15223
    {"__setstate__",    unicodeiter_setstate, METH_O, setstate_doc},
15224
    {NULL,      NULL}       /* sentinel */
15225
};
15226
15227
PyTypeObject PyUnicodeIter_Type = {
15228
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
15229
    "str_iterator",         /* tp_name */
15230
    sizeof(unicodeiterobject),      /* tp_basicsize */
15231
    0,                  /* tp_itemsize */
15232
    /* methods */
15233
    unicodeiter_dealloc,/* tp_dealloc */
15234
    0,                  /* tp_vectorcall_offset */
15235
    0,                  /* tp_getattr */
15236
    0,                  /* tp_setattr */
15237
    0,                  /* tp_as_async */
15238
    0,                  /* tp_repr */
15239
    0,                  /* tp_as_number */
15240
    0,                  /* tp_as_sequence */
15241
    0,                  /* tp_as_mapping */
15242
    0,                  /* tp_hash */
15243
    0,                  /* tp_call */
15244
    0,                  /* tp_str */
15245
    PyObject_GenericGetAttr,        /* tp_getattro */
15246
    0,                  /* tp_setattro */
15247
    0,                  /* tp_as_buffer */
15248
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15249
    0,                  /* tp_doc */
15250
    unicodeiter_traverse, /* tp_traverse */
15251
    0,                  /* tp_clear */
15252
    0,                  /* tp_richcompare */
15253
    0,                  /* tp_weaklistoffset */
15254
    PyObject_SelfIter,          /* tp_iter */
15255
    unicodeiter_next,   /* tp_iternext */
15256
    unicodeiter_methods,            /* tp_methods */
15257
    0,
15258
};
15259
15260
PyTypeObject _PyUnicodeASCIIIter_Type = {
15261
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
15262
    .tp_name = "str_ascii_iterator",
15263
    .tp_basicsize = sizeof(unicodeiterobject),
15264
    .tp_dealloc = unicodeiter_dealloc,
15265
    .tp_getattro = PyObject_GenericGetAttr,
15266
    .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,
15267
    .tp_traverse = unicodeiter_traverse,
15268
    .tp_iter = PyObject_SelfIter,
15269
    .tp_iternext = unicode_ascii_iter_next,
15270
    .tp_methods = unicodeiter_methods,
15271
};
15272
15273
/* tp_iter of str: create a GC-tracked iterator positioned at index 0
   that holds a new reference to `seq`.  Compact ASCII strings get a
   str_ascii_iterator, all others a str_iterator.

   Returns NULL after PyErr_BadInternalCall() if `seq` is not a str, or
   NULL if the iterator cannot be allocated. */
static PyObject *
unicode_iter(PyObject *seq)
{
    if (!PyUnicode_Check(seq)) {
        PyErr_BadInternalCall();
        return NULL;
    }

    PyTypeObject *iter_type = PyUnicode_IS_COMPACT_ASCII(seq)
        ? &_PyUnicodeASCIIIter_Type
        : &PyUnicodeIter_Type;

    unicodeiterobject *iter = PyObject_GC_New(unicodeiterobject, iter_type);
    if (iter == NULL) {
        return NULL;
    }

    iter->it_index = 0;
    iter->it_seq = Py_NewRef(seq);
    _PyObject_GC_TRACK(iter);
    return (PyObject *)iter;
}
15295
15296
static int
15297
encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
15298
64
{
15299
64
    int res;
15300
64
    res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
15301
64
    if (res == -2) {
15302
0
        PyErr_Format(PyExc_RuntimeError, "cannot encode %s", name);
15303
0
        return -1;
15304
0
    }
15305
64
    if (res < 0) {
15306
0
        PyErr_NoMemory();
15307
0
        return -1;
15308
0
    }
15309
64
    return 0;
15310
64
}
15311
15312
15313
static int
15314
config_get_codec_name(wchar_t **config_encoding)
15315
32
{
15316
32
    char *encoding;
15317
32
    if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
15318
0
        return -1;
15319
0
    }
15320
15321
32
    PyObject *name_obj = NULL;
15322
32
    PyObject *codec = _PyCodec_Lookup(encoding);
15323
32
    PyMem_RawFree(encoding);
15324
15325
32
    if (!codec)
15326
0
        goto error;
15327
15328
32
    name_obj = PyObject_GetAttrString(codec, "name");
15329
32
    Py_CLEAR(codec);
15330
32
    if (!name_obj) {
15331
0
        goto error;
15332
0
    }
15333
15334
32
    wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
15335
32
    Py_DECREF(name_obj);
15336
32
    if (wname == NULL) {
15337
0
        goto error;
15338
0
    }
15339
15340
32
    wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
15341
32
    if (raw_wname == NULL) {
15342
0
        PyMem_Free(wname);
15343
0
        PyErr_NoMemory();
15344
0
        goto error;
15345
0
    }
15346
15347
32
    PyMem_RawFree(*config_encoding);
15348
32
    *config_encoding = raw_wname;
15349
15350
32
    PyMem_Free(wname);
15351
32
    return 0;
15352
15353
0
error:
15354
0
    Py_XDECREF(codec);
15355
0
    Py_XDECREF(name_obj);
15356
0
    return -1;
15357
32
}
15358
15359
15360
static PyStatus
15361
init_stdio_encoding(PyInterpreterState *interp)
15362
16
{
15363
    /* Update the stdio encoding to the normalized Python codec name. */
15364
16
    PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
15365
16
    if (config_get_codec_name(&config->stdio_encoding) < 0) {
15366
0
        return _PyStatus_ERR("failed to get the Python codec name "
15367
0
                             "of the stdio encoding");
15368
0
    }
15369
16
    return _PyStatus_OK();
15370
16
}
15371
15372
15373
static int
15374
init_fs_codec(PyInterpreterState *interp)
15375
16
{
15376
16
    const PyConfig *config = _PyInterpreterState_GetConfig(interp);
15377
15378
16
    _Py_error_handler error_handler;
15379
16
    error_handler = get_error_handler_wide(config->filesystem_errors);
15380
16
    if (error_handler == _Py_ERROR_UNKNOWN) {
15381
0
        PyErr_SetString(PyExc_RuntimeError, "unknown filesystem error handler");
15382
0
        return -1;
15383
0
    }
15384
15385
16
    char *encoding, *errors;
15386
16
    if (encode_wstr_utf8(config->filesystem_encoding,
15387
16
                         &encoding,
15388
16
                         "filesystem_encoding") < 0) {
15389
0
        return -1;
15390
0
    }
15391
15392
16
    if (encode_wstr_utf8(config->filesystem_errors,
15393
16
                         &errors,
15394
16
                         "filesystem_errors") < 0) {
15395
0
        PyMem_RawFree(encoding);
15396
0
        return -1;
15397
0
    }
15398
15399
16
    struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
15400
16
    PyMem_RawFree(fs_codec->encoding);
15401
16
    fs_codec->encoding = encoding;
15402
    /* encoding has been normalized by init_fs_encoding() */
15403
16
    fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
15404
16
    PyMem_RawFree(fs_codec->errors);
15405
16
    fs_codec->errors = errors;
15406
16
    fs_codec->error_handler = error_handler;
15407
15408
#ifdef _Py_FORCE_UTF8_FS_ENCODING
15409
    assert(fs_codec->utf8 == 1);
15410
#endif
15411
15412
    /* At this point, PyUnicode_EncodeFSDefault() and
15413
       PyUnicode_DecodeFSDefault() can now use the Python codec rather than
15414
       the C implementation of the filesystem encoding. */
15415
15416
    /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
15417
       global configuration variables. */
15418
16
    if (_Py_IsMainInterpreter(interp)) {
15419
15420
16
        if (_Py_SetFileSystemEncoding(fs_codec->encoding,
15421
16
                                      fs_codec->errors) < 0) {
15422
0
            PyErr_NoMemory();
15423
0
            return -1;
15424
0
        }
15425
16
    }
15426
16
    return 0;
15427
16
}
15428
15429
15430
static PyStatus
15431
init_fs_encoding(PyThreadState *tstate)
15432
16
{
15433
16
    PyInterpreterState *interp = tstate->interp;
15434
15435
    /* Update the filesystem encoding to the normalized Python codec name.
15436
       For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
15437
       (Python codec name). */
15438
16
    PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
15439
16
    if (config_get_codec_name(&config->filesystem_encoding) < 0) {
15440
0
        _Py_DumpPathConfig(tstate);
15441
0
        return _PyStatus_ERR("failed to get the Python codec "
15442
0
                             "of the filesystem encoding");
15443
0
    }
15444
15445
16
    if (init_fs_codec(interp) < 0) {
15446
0
        return _PyStatus_ERR("cannot initialize filesystem codec");
15447
0
    }
15448
16
    return _PyStatus_OK();
15449
16
}
15450
15451
15452
PyStatus
15453
_PyUnicode_InitEncodings(PyThreadState *tstate)
15454
16
{
15455
16
    PyStatus status = _PyCodec_InitRegistry(tstate->interp);
15456
16
    if (_PyStatus_EXCEPTION(status)) {
15457
0
        return status;
15458
0
    }
15459
16
    status = init_fs_encoding(tstate);
15460
16
    if (_PyStatus_EXCEPTION(status)) {
15461
0
        return status;
15462
0
    }
15463
15464
16
    return init_stdio_encoding(tstate->interp);
15465
16
}
15466
15467
15468
static void
15469
_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
15470
0
{
15471
0
    PyMem_RawFree(fs_codec->encoding);
15472
0
    fs_codec->encoding = NULL;
15473
0
    fs_codec->utf8 = 0;
15474
0
    PyMem_RawFree(fs_codec->errors);
15475
0
    fs_codec->errors = NULL;
15476
0
    fs_codec->error_handler = _Py_ERROR_UNKNOWN;
15477
0
}
15478
15479
15480
#ifdef MS_WINDOWS
15481
int
15482
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
15483
{
15484
    PyInterpreterState *interp = _PyInterpreterState_GET();
15485
    PyConfig *config = (PyConfig *)_PyInterpreterState_GetConfig(interp);
15486
15487
    /* Set the filesystem encoding to mbcs/replace (PEP 529) */
15488
    wchar_t *encoding = _PyMem_RawWcsdup(L"mbcs");
15489
    wchar_t *errors = _PyMem_RawWcsdup(L"replace");
15490
    if (encoding == NULL || errors == NULL) {
15491
        PyMem_RawFree(encoding);
15492
        PyMem_RawFree(errors);
15493
        PyErr_NoMemory();
15494
        return -1;
15495
    }
15496
15497
    PyMem_RawFree(config->filesystem_encoding);
15498
    config->filesystem_encoding = encoding;
15499
    PyMem_RawFree(config->filesystem_errors);
15500
    config->filesystem_errors = errors;
15501
15502
    return init_fs_codec(interp);
15503
}
15504
#endif
15505
15506
15507
#ifdef Py_DEBUG
15508
static inline int
15509
unicode_is_finalizing(void)
15510
{
15511
    return (get_interned_dict(_PyInterpreterState_Main()) == NULL);
15512
}
15513
#endif
15514
15515
15516
void
15517
_PyUnicode_FiniTypes(PyInterpreterState *interp)
15518
0
{
15519
0
    _PyStaticType_FiniBuiltin(interp, &EncodingMapType);
15520
0
    _PyStaticType_FiniBuiltin(interp, &PyFieldNameIter_Type);
15521
0
    _PyStaticType_FiniBuiltin(interp, &PyFormatterIter_Type);
15522
0
}
15523
15524
15525
void
15526
_PyUnicode_Fini(PyInterpreterState *interp)
15527
0
{
15528
0
    struct _Py_unicode_state *state = &interp->unicode;
15529
15530
0
    if (!has_shared_intern_dict(interp)) {
15531
        // _PyUnicode_ClearInterned() must be called before _PyUnicode_Fini()
15532
0
        assert(get_interned_dict(interp) == NULL);
15533
0
    }
15534
15535
0
    _PyUnicode_FiniEncodings(&state->fs_codec);
15536
15537
    // bpo-47182: force a unicodedata CAPI capsule re-import on
15538
    // subsequent initialization of interpreter.
15539
0
    interp->unicode.ucnhash_capi = NULL;
15540
15541
0
    unicode_clear_identifiers(state);
15542
0
}
15543
15544
/* A _string module, to export formatter_parser and formatter_field_name_split
15545
   to the string.Formatter class implemented in Python. */
15546
15547
static PyMethodDef _string_methods[] = {
15548
    {"formatter_field_name_split", formatter_field_name_split,
15549
     METH_O, PyDoc_STR("split the argument as a field name")},
15550
    {"formatter_parser", formatter_parser,
15551
     METH_O, PyDoc_STR("parse the argument as a format string")},
15552
    {NULL, NULL}
15553
};
15554
15555
static PyModuleDef_Slot module_slots[] = {
15556
    {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
15557
    {Py_mod_gil, Py_MOD_GIL_NOT_USED},
15558
    {0, NULL}
15559
};
15560
15561
static struct PyModuleDef _string_module = {
15562
    PyModuleDef_HEAD_INIT,
15563
    .m_name = "_string",
15564
    .m_doc = PyDoc_STR("string helper module"),
15565
    .m_size = 0,
15566
    .m_methods = _string_methods,
15567
    .m_slots = module_slots,
15568
};
15569
15570
PyMODINIT_FUNC
15571
PyInit__string(void)
15572
6
{
15573
6
    return PyModuleDef_Init(&_string_module);
15574
6
}
15575
15576
15577
#undef PyUnicode_KIND
15578
int PyUnicode_KIND(PyObject *op)
15579
0
{
15580
0
    if (!PyUnicode_Check(op)) {
15581
0
        PyErr_Format(PyExc_TypeError, "expect str, got %T", op);
15582
0
        return -1;
15583
0
    }
15584
0
    return _PyASCIIObject_CAST(op)->state.kind;
15585
0
}
15586
15587
#undef PyUnicode_DATA
15588
void* PyUnicode_DATA(PyObject *op)
15589
0
{
15590
0
    if (!PyUnicode_Check(op)) {
15591
0
        PyErr_Format(PyExc_TypeError, "expect str, got %T", op);
15592
0
        return NULL;
15593
0
    }
15594
0
    return _PyUnicode_DATA(op);
15595
0
}