Coverage Report

Created: 2026-04-12 06:54

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/cpython/Objects/unicodeobject.c
Line
Count
Source
1
/*
2
3
Unicode implementation based on original code by Fredrik Lundh,
4
modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6
Major speed upgrades to the method implementations at the Reykjavik
7
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9
Copyright (c) Corporation for National Research Initiatives.
10
11
--------------------------------------------------------------------
12
The original string type implementation is:
13
14
  Copyright (c) 1999 by Secret Labs AB
15
  Copyright (c) 1999 by Fredrik Lundh
16
17
By obtaining, using, and/or copying this software and/or its
18
associated documentation, you agree that you have read, understood,
19
and will comply with the following terms and conditions:
20
21
Permission to use, copy, modify, and distribute this software and its
22
associated documentation for any purpose and without fee is hereby
23
granted, provided that the above copyright notice appears in all
24
copies, and that both that copyright notice and this permission notice
25
appear in supporting documentation, and that the name of Secret Labs
26
AB or the author not be used in advertising or publicity pertaining to
27
distribution of the software without specific, written prior
28
permission.
29
30
SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32
FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37
--------------------------------------------------------------------
38
39
*/
40
41
#include "Python.h"
42
#include "pycore_abstract.h"      // _PyIndex_Check()
43
#include "pycore_bytes_methods.h" // _Py_bytes_lower()
44
#include "pycore_bytesobject.h"   // _PyBytes_Repeat()
45
#include "pycore_ceval.h"         // _PyEval_GetBuiltin()
46
#include "pycore_codecs.h"        // _PyCodec_Lookup()
47
#include "pycore_critical_section.h" // Py_*_CRITICAL_SECTION_SEQUENCE_FAST
48
#include "pycore_format.h"        // F_LJUST
49
#include "pycore_initconfig.h"    // _PyStatus_OK()
50
#include "pycore_interp.h"        // PyInterpreterState.fs_codec
51
#include "pycore_long.h"          // _PyLong_FormatWriter()
52
#include "pycore_object.h"        // _PyObject_GC_TRACK(), _Py_FatalRefcountError()
53
#include "pycore_pathconfig.h"    // _Py_DumpPathConfig()
54
#include "pycore_pyerrors.h"      // _PyUnicodeTranslateError_Create()
55
#include "pycore_pyhash.h"        // _Py_HashSecret_t
56
#include "pycore_pylifecycle.h"   // _Py_SetFileSystemEncoding()
57
#include "pycore_pystate.h"       // _PyInterpreterState_GET()
58
#include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
59
#include "pycore_unicodectype.h"  // _PyUnicode_IsXidStart
60
#include "pycore_unicodeobject.h" // struct _Py_unicode_state
61
#include "pycore_unicodeobject_generated.h"  // _PyUnicode_InitStaticStrings()
62
63
#include "stringlib/eq.h"         // unicode_eq()
64
#include <stddef.h>               // ptrdiff_t
65
66
#ifdef MS_WINDOWS
67
#include <windows.h>
68
#endif
69
70
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
71
#  include "pycore_fileutils.h"   // _Py_LocaleUsesNonUnicodeWchar()
72
#endif
73
74
/* Uncomment to display statistics on interned strings at exit
75
   in _PyUnicode_ClearInterned(). */
76
/* #define INTERNED_STATS 1 */
77
78
79
/*[clinic input]
80
class str "PyObject *" "&PyUnicode_Type"
81
[clinic start generated code]*/
82
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
83
84
/*[python input]
85
class Py_UCS4_converter(CConverter):
86
    type = 'Py_UCS4'
87
    converter = 'convert_uc'
88
89
    def c_default_init(self):
90
        import libclinic
91
        self.c_default = libclinic.c_unichar_repr(self.default)
92
93
[python start generated code]*/
94
/*[python end generated code: output=da39a3ee5e6b4b0d input=22f057b68fd9a65a]*/
95
96
/* --- Globals ------------------------------------------------------------
97
98
NOTE: In the interpreter's initialization phase, some globals are currently
99
      initialized dynamically as needed. In the process Unicode objects may
100
      be created before the Unicode type is ready.
101
102
*/
103
104
16.0M
#define MAX_UNICODE _Py_MAX_UNICODE
105
269M
#define ensure_unicode _PyUnicode_EnsureUnicode
106
107
#ifdef Py_DEBUG
108
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
109
#else
110
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
111
#endif
112
113
/* Return the `utf8` pointer stored in the compact unicode header of `op`.
   The field is read through FT_ATOMIC_LOAD_PTR_ACQUIRE. */
static inline char *
_PyUnicode_UTF8(PyObject *op)
{
    PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
    return FT_ATOMIC_LOAD_PTR_ACQUIRE(compact->utf8);
}
117
118
/* Return the UTF-8 buffer of `op`.
   For a compact ASCII string this is the memory immediately following the
   PyASCIIObject header; otherwise it is whatever _PyUnicode_UTF8() reports
   (which callers must be prepared to find NULL). */
static inline char *
PyUnicode_UTF8(PyObject *op)
{
    assert(_PyUnicode_CHECK(op));
    if (!PyUnicode_IS_COMPACT_ASCII(op)) {
        return _PyUnicode_UTF8(op);
    }
    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
    return (char *)(ascii + 1);
}
128
129
/* Store `utf8` into the `utf8` field of the compact unicode header of `op`,
   using FT_ATOMIC_STORE_PTR_RELEASE. */
static inline void
PyUnicode_SET_UTF8(PyObject *op, char *utf8)
{
    PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
    FT_ATOMIC_STORE_PTR_RELEASE(compact->utf8, utf8);
}
133
134
/* Return the length of the UTF-8 representation of `op`.
   Compact ASCII strings report their `length` field; every other string
   reports the `utf8_length` field of its compact header. */
static inline Py_ssize_t
PyUnicode_UTF8_LENGTH(PyObject *op)
{
    assert(_PyUnicode_CHECK(op));
    if (!PyUnicode_IS_COMPACT_ASCII(op)) {
        return _PyCompactUnicodeObject_CAST(op)->utf8_length;
    }
    return _PyASCIIObject_CAST(op)->length;
}
144
145
/* Set the `utf8_length` field of the compact unicode header of `op`. */
static inline void
PyUnicode_SET_UTF8_LENGTH(PyObject *op, Py_ssize_t length)
{
    PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
    compact->utf8_length = length;
}
149
150
/* Direct (lvalue) access to fields of the PyASCIIObject header of `op`. */
#define _PyUnicode_LENGTH(op)   (_PyASCIIObject_CAST(op)->length)
#define _PyUnicode_STATE(op)    (_PyASCIIObject_CAST(op)->state)
#define _PyUnicode_HASH(op)     (_PyASCIIObject_CAST(op)->hash)
156
157
1.05G
#define PyUnicode_HASH PyUnstable_Unicode_GET_CACHED_HASH
158
159
/* Store `hash` into the `hash` field of the PyASCIIObject header of `op`,
   using FT_ATOMIC_STORE_SSIZE_RELAXED. */
static inline void
PyUnicode_SET_HASH(PyObject *op, Py_hash_t hash)
{
    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
    FT_ATOMIC_STORE_SSIZE_RELAXED(ascii->hash, hash);
}
163
164
#define _PyUnicode_DATA_ANY(op)                         \
165
64.6M
    (_PyUnicodeObject_CAST(op)->data.any)
166
167
/* Return non-zero if the UTF-8 pointer of `op` is the very same address as
   its character data, i.e. both representations share one buffer.
   Must not be called on a compact ASCII string. */
static inline int
_PyUnicode_SHARE_UTF8(PyObject *op)
{
    assert(_PyUnicode_CHECK(op));
    assert(!PyUnicode_IS_COMPACT_ASCII(op));

    const char *utf8 = _PyUnicode_UTF8(op);
    const void *data = PyUnicode_DATA(op);
    return utf8 == data;
}
173
174
/* true if the Unicode object has an allocated UTF-8 memory block
175
   (not shared with other data) */
176
static inline int _PyUnicode_HAS_UTF8_MEMORY(PyObject *op)
177
579M
{
178
579M
    return (!PyUnicode_IS_COMPACT_ASCII(op)
179
185M
            && _PyUnicode_UTF8(op) != NULL
180
13.7M
            && _PyUnicode_UTF8(op) != PyUnicode_DATA(op));
181
579M
}
182
183
184
200M
#define LATIN1 _Py_LATIN1_CHR
185
186
/* Forward declaration */
187
static PyObject *
188
unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
189
                    const char *errors);
190
static PyObject *
191
unicode_decode_utf8(const char *s, Py_ssize_t size,
192
                    _Py_error_handler error_handler, const char *errors,
193
                    Py_ssize_t *consumed);
194
#ifdef Py_DEBUG
195
static inline int unicode_is_finalizing(void);
196
static int unicode_is_singleton(PyObject *unicode);
197
#endif
198
199
200
// Return a reference to the immortal empty string singleton.
201
PyObject*
202
_PyUnicode_GetEmpty(void)
203
111M
{
204
111M
    _Py_DECLARE_STR(empty, "");
205
111M
    return &_Py_STR(empty);
206
111M
}
207
208
/* This dictionary holds per-interpreter interned strings.
209
 * See InternalDocs/string_interning.md for details.
210
 */
211
static inline PyObject *get_interned_dict(PyInterpreterState *interp)
212
5.73M
{
213
5.73M
    return _Py_INTERP_CACHED_OBJECT(interp, interned_strings);
214
5.73M
}
215
216
/* This hashtable holds statically allocated interned strings.
217
 * See InternalDocs/string_interning.md for details.
218
 */
219
5.77M
#define INTERNED_STRINGS _PyRuntime.cached_objects.interned_strings
220
221
/* Get number of all interned strings for the current interpreter. */
222
Py_ssize_t
223
_PyUnicode_InternedSize(void)
224
0
{
225
0
    PyObject *dict = get_interned_dict(_PyInterpreterState_GET());
226
0
    return _Py_hashtable_len(INTERNED_STRINGS) + PyDict_GET_SIZE(dict);
227
0
}
228
229
/* Get number of immortal interned strings for the current interpreter. */
230
Py_ssize_t
231
_PyUnicode_InternedSize_Immortal(void)
232
0
{
233
0
    PyObject *dict = get_interned_dict(_PyInterpreterState_GET());
234
0
    PyObject *key, *value;
235
0
    Py_ssize_t pos = 0;
236
0
    Py_ssize_t count = 0;
237
238
    // It's tempting to keep a count and avoid a loop here. But, this function
239
    // is intended for refleak tests. It spends extra work to report the true
240
    // value, to help detect bugs in optimizations.
241
242
0
    while (PyDict_Next(dict, &pos, &key, &value)) {
243
0
        assert(PyUnicode_CHECK_INTERNED(key) != SSTATE_INTERNED_IMMORTAL_STATIC);
244
0
        if (PyUnicode_CHECK_INTERNED(key) == SSTATE_INTERNED_IMMORTAL) {
245
0
           count++;
246
0
       }
247
0
    }
248
0
    return _Py_hashtable_len(INTERNED_STRINGS) + count;
249
0
}
250
251
static Py_hash_t unicode_hash(PyObject *);
252
253
static Py_uhash_t
254
hashtable_unicode_hash(const void *key)
255
5.77M
{
256
5.77M
    return unicode_hash((PyObject *)key);
257
5.77M
}
258
259
static int
260
hashtable_unicode_compare(const void *key1, const void *key2)
261
538k
{
262
538k
    PyObject *obj1 = (PyObject *)key1;
263
538k
    PyObject *obj2 = (PyObject *)key2;
264
538k
    if (obj1 != NULL && obj2 != NULL) {
265
538k
        return unicode_eq(obj1, obj2);
266
538k
    }
267
0
    else {
268
0
        return obj1 == obj2;
269
0
    }
270
538k
}
271
272
/* Return true if this interpreter should share the main interpreter's
273
   intern_dict.  That's important for interpreters which load basic
274
   single-phase init extension modules (m_size == -1).  There could be interned
275
   immortal strings that are shared between interpreters, due to the
276
   PyDict_Update(mdict, m_copy) call in import_find_extension().
277
278
   It's not safe to deallocate those strings until all interpreters that
279
   potentially use them are freed.  By storing them in the main interpreter, we
280
   ensure they get freed after all other interpreters are freed.
281
*/
282
static bool
283
has_shared_intern_dict(PyInterpreterState *interp)
284
36
{
285
36
    PyInterpreterState *main_interp = _PyInterpreterState_Main();
286
36
    return interp != main_interp  && interp->feature_flags & Py_RTFLAGS_USE_MAIN_OBMALLOC;
287
36
}
288
289
static int
290
init_interned_dict(PyInterpreterState *interp)
291
36
{
292
36
    assert(get_interned_dict(interp) == NULL);
293
36
    PyObject *interned;
294
36
    if (has_shared_intern_dict(interp)) {
295
0
        interned = get_interned_dict(_PyInterpreterState_Main());
296
0
        Py_INCREF(interned);
297
0
    }
298
36
    else {
299
36
        interned = PyDict_New();
300
36
        if (interned == NULL) {
301
0
            return -1;
302
0
        }
303
36
    }
304
36
    _Py_INTERP_CACHED_OBJECT(interp, interned_strings) = interned;
305
36
    return 0;
306
36
}
307
308
static void
309
clear_interned_dict(PyInterpreterState *interp)
310
0
{
311
0
    PyObject *interned = get_interned_dict(interp);
312
0
    if (interned != NULL) {
313
0
        if (!has_shared_intern_dict(interp)) {
314
            // only clear if the dict belongs to this interpreter
315
0
            PyDict_Clear(interned);
316
0
        }
317
0
        Py_DECREF(interned);
318
0
        _Py_INTERP_CACHED_OBJECT(interp, interned_strings) = NULL;
319
0
    }
320
0
}
321
322
static PyStatus
323
init_global_interned_strings(PyInterpreterState *interp)
324
36
{
325
36
    assert(INTERNED_STRINGS == NULL);
326
36
    _Py_hashtable_allocator_t hashtable_alloc = {PyMem_RawMalloc, PyMem_RawFree};
327
328
36
    INTERNED_STRINGS = _Py_hashtable_new_full(
329
36
        hashtable_unicode_hash,
330
36
        hashtable_unicode_compare,
331
        // Objects stored here are immortal and statically allocated,
332
        // so we don't need key_destroy_func & value_destroy_func:
333
36
        NULL,
334
36
        NULL,
335
36
        &hashtable_alloc
336
36
    );
337
36
    if (INTERNED_STRINGS == NULL) {
338
0
        PyErr_Clear();
339
0
        return _PyStatus_ERR("failed to create global interned dict");
340
0
    }
341
342
    /* Intern statically allocated string identifiers, deepfreeze strings,
343
        * and one-byte latin-1 strings.
344
        * This must be done before any module initialization so that statically
345
        * allocated string identifiers are used instead of heap allocated strings.
346
        * Deepfreeze uses the interned identifiers if present to save space
347
        * else generates them and they are interned to speed up dict lookups.
348
    */
349
36
    _PyUnicode_InitStaticStrings(interp);
350
351
9.25k
    for (int i = 0; i < 256; i++) {
352
9.21k
        PyObject *s = LATIN1(i);
353
9.21k
        _PyUnicode_InternStatic(interp, &s);
354
9.21k
        assert(s == LATIN1(i));
355
9.21k
    }
356
#ifdef Py_DEBUG
357
    assert(_PyUnicode_CheckConsistency(&_Py_STR(empty), 1));
358
359
    for (int i = 0; i < 256; i++) {
360
        assert(_PyUnicode_CheckConsistency(LATIN1(i), 1));
361
    }
362
#endif
363
36
    return _PyStatus_OK();
364
36
}
365
366
static void clear_global_interned_strings(void)
367
0
{
368
0
    if (INTERNED_STRINGS != NULL) {
369
0
        _Py_hashtable_destroy(INTERNED_STRINGS);
370
0
        INTERNED_STRINGS = NULL;
371
0
    }
372
0
}
373
374
/* Statement macro: return the empty string singleton from the enclosing
   function. */
#define _Py_RETURN_UNICODE_EMPTY()          \
    do {                                    \
        return _PyUnicode_GetEmpty();       \
    } while (0)
378
379
380
/* Fast detection of the most frequent whitespace characters.
   Lookup table indexed by an ASCII code (0..127): 1 for whitespace,
   0 for everything else (entries not listed are zero-initialized). */
const unsigned char _Py_ascii_whitespace[128] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1,     /* SPACE */
};
410
411
/* forward */
412
static PyObject* get_latin1_char(unsigned char ch);
413
414
415
static PyObject *
416
_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
417
static PyObject *
418
_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
419
static PyObject *
420
_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
421
422
static PyObject *
423
unicode_encode_call_errorhandler(const char *errors,
424
       PyObject **errorHandler,const char *encoding, const char *reason,
425
       PyObject *unicode, PyObject **exceptionObject,
426
       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
427
428
static void
429
raise_encode_exception(PyObject **exceptionObject,
430
                       const char *encoding,
431
                       PyObject *unicode,
432
                       Py_ssize_t startpos, Py_ssize_t endpos,
433
                       const char *reason);
434
435
/* Same for linebreaks: lookup table indexed by an ASCII code (0..127),
   1 for line-breaking characters, 0 for everything else (entries not
   listed are zero-initialized). */
static const unsigned char ascii_linebreak[128] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
};
462
463
static int convert_uc(PyObject *obj, void *addr);
464
465
struct encoding_map;
466
#include "clinic/unicodeobject.c.h"
467
468
_Py_error_handler
469
_Py_GetErrorHandler(const char *errors)
470
3.43M
{
471
3.43M
    if (errors == NULL || strcmp(errors, "strict") == 0) {
472
2.59M
        return _Py_ERROR_STRICT;
473
2.59M
    }
474
839k
    if (strcmp(errors, "surrogateescape") == 0) {
475
601k
        return _Py_ERROR_SURROGATEESCAPE;
476
601k
    }
477
237k
    if (strcmp(errors, "replace") == 0) {
478
237k
        return _Py_ERROR_REPLACE;
479
237k
    }
480
0
    if (strcmp(errors, "ignore") == 0) {
481
0
        return _Py_ERROR_IGNORE;
482
0
    }
483
0
    if (strcmp(errors, "backslashreplace") == 0) {
484
0
        return _Py_ERROR_BACKSLASHREPLACE;
485
0
    }
486
0
    if (strcmp(errors, "surrogatepass") == 0) {
487
0
        return _Py_ERROR_SURROGATEPASS;
488
0
    }
489
0
    if (strcmp(errors, "xmlcharrefreplace") == 0) {
490
0
        return _Py_ERROR_XMLCHARREFREPLACE;
491
0
    }
492
0
    return _Py_ERROR_OTHER;
493
0
}
494
495
496
static _Py_error_handler
497
get_error_handler_wide(const wchar_t *errors)
498
12.5k
{
499
12.5k
    if (errors == NULL || wcscmp(errors, L"strict") == 0) {
500
0
        return _Py_ERROR_STRICT;
501
0
    }
502
12.5k
    if (wcscmp(errors, L"surrogateescape") == 0) {
503
12.5k
        return _Py_ERROR_SURROGATEESCAPE;
504
12.5k
    }
505
0
    if (wcscmp(errors, L"replace") == 0) {
506
0
        return _Py_ERROR_REPLACE;
507
0
    }
508
0
    if (wcscmp(errors, L"ignore") == 0) {
509
0
        return _Py_ERROR_IGNORE;
510
0
    }
511
0
    if (wcscmp(errors, L"backslashreplace") == 0) {
512
0
        return _Py_ERROR_BACKSLASHREPLACE;
513
0
    }
514
0
    if (wcscmp(errors, L"surrogatepass") == 0) {
515
0
        return _Py_ERROR_SURROGATEPASS;
516
0
    }
517
0
    if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
518
0
        return _Py_ERROR_XMLCHARREFREPLACE;
519
0
    }
520
0
    return _Py_ERROR_OTHER;
521
0
}
522
523
524
static inline int
525
unicode_check_encoding_errors(const char *encoding, const char *errors)
526
42.4M
{
527
42.4M
    if (encoding == NULL && errors == NULL) {
528
13.1M
        return 0;
529
13.1M
    }
530
531
29.3M
    PyInterpreterState *interp = _PyInterpreterState_GET();
532
29.3M
#ifndef Py_DEBUG
533
    /* In release mode, only check in development mode (-X dev) */
534
29.3M
    if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
535
29.3M
        return 0;
536
29.3M
    }
537
#else
538
    /* Always check in debug mode */
539
#endif
540
541
    /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
542
       codec registry is ready: before_PyUnicode_InitEncodings() is called. */
543
0
    if (!interp->unicode.fs_codec.encoding) {
544
0
        return 0;
545
0
    }
546
547
    /* Disable checks during Python finalization. For example, it allows to
548
     * call PyObject_Dump() during finalization for debugging purpose.
549
     */
550
0
    if (_PyInterpreterState_GetFinalizing(interp) != NULL) {
551
0
        return 0;
552
0
    }
553
554
0
    if (encoding != NULL
555
        // Fast path for the most common built-in encodings. Even if the codec
556
        // is cached, _PyCodec_Lookup() decodes the bytes string from UTF-8 to
557
        // create a temporary Unicode string (the key in the cache).
558
0
        && strcmp(encoding, "utf-8") != 0
559
0
        && strcmp(encoding, "utf8") != 0
560
0
        && strcmp(encoding, "ascii") != 0)
561
0
    {
562
0
        PyObject *handler = _PyCodec_Lookup(encoding);
563
0
        if (handler == NULL) {
564
0
            return -1;
565
0
        }
566
0
        Py_DECREF(handler);
567
0
    }
568
569
0
    if (errors != NULL
570
        // Fast path for the most common built-in error handlers.
571
0
        && strcmp(errors, "strict") != 0
572
0
        && strcmp(errors, "ignore") != 0
573
0
        && strcmp(errors, "replace") != 0
574
0
        && strcmp(errors, "surrogateescape") != 0
575
0
        && strcmp(errors, "surrogatepass") != 0)
576
0
    {
577
0
        PyObject *handler = PyCodec_LookupError(errors);
578
0
        if (handler == NULL) {
579
0
            return -1;
580
0
        }
581
0
        Py_DECREF(handler);
582
0
    }
583
0
    return 0;
584
0
}
585
586
587
int
588
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
589
0
{
590
0
#define CHECK(expr) \
591
0
    do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
592
593
0
    assert(op != NULL);
594
0
    CHECK(PyUnicode_Check(op));
595
596
0
    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
597
0
    int kind = ascii->state.kind;
598
599
0
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
600
0
        CHECK(kind == PyUnicode_1BYTE_KIND);
601
0
    }
602
0
    else {
603
0
        PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
604
0
        void *data;
605
606
0
        if (ascii->state.compact == 1) {
607
0
            data = compact + 1;
608
0
            CHECK(kind == PyUnicode_1BYTE_KIND
609
0
                                 || kind == PyUnicode_2BYTE_KIND
610
0
                                 || kind == PyUnicode_4BYTE_KIND);
611
0
            CHECK(ascii->state.ascii == 0);
612
0
            CHECK(_PyUnicode_UTF8(op) != data);
613
0
        }
614
0
        else {
615
0
            PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
616
617
0
            data = unicode->data.any;
618
0
            CHECK(kind == PyUnicode_1BYTE_KIND
619
0
                     || kind == PyUnicode_2BYTE_KIND
620
0
                     || kind == PyUnicode_4BYTE_KIND);
621
0
            CHECK(ascii->state.compact == 0);
622
0
            CHECK(data != NULL);
623
0
            if (ascii->state.ascii) {
624
0
                CHECK(_PyUnicode_UTF8(op) == data);
625
0
                CHECK(compact->utf8_length == ascii->length);
626
0
            }
627
0
            else {
628
0
                CHECK(_PyUnicode_UTF8(op) != data);
629
0
            }
630
0
        }
631
0
#ifndef Py_GIL_DISABLED
632
0
        if (_PyUnicode_UTF8(op) == NULL)
633
0
            CHECK(compact->utf8_length == 0);
634
0
#endif
635
0
    }
636
637
    /* check that the best kind is used: O(n) operation */
638
0
    if (check_content) {
639
0
        Py_ssize_t i;
640
0
        Py_UCS4 maxchar = 0;
641
0
        const void *data;
642
0
        Py_UCS4 ch;
643
644
0
        data = PyUnicode_DATA(ascii);
645
0
        for (i=0; i < ascii->length; i++)
646
0
        {
647
0
            ch = PyUnicode_READ(kind, data, i);
648
0
            if (ch > maxchar)
649
0
                maxchar = ch;
650
0
        }
651
0
        if (kind == PyUnicode_1BYTE_KIND) {
652
0
            if (ascii->state.ascii == 0) {
653
0
                CHECK(maxchar >= 128);
654
0
                CHECK(maxchar <= 255);
655
0
            }
656
0
            else
657
0
                CHECK(maxchar < 128);
658
0
        }
659
0
        else if (kind == PyUnicode_2BYTE_KIND) {
660
0
            CHECK(maxchar >= 0x100);
661
0
            CHECK(maxchar <= 0xFFFF);
662
0
        }
663
0
        else {
664
0
            CHECK(maxchar >= 0x10000);
665
0
            CHECK(maxchar <= MAX_UNICODE);
666
0
        }
667
0
        CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
668
0
    }
669
670
    /* Check interning state */
671
#ifdef Py_DEBUG
672
    // Note that we do not check `_Py_IsImmortal(op)`, since stable ABI
673
    // extensions can make immortal strings mortal (but with a high enough
674
    // refcount).
675
    // The other way is extremely unlikely (worth a potential failed assertion
676
    // in a debug build), so we do check `!_Py_IsImmortal(op)`.
677
    switch (PyUnicode_CHECK_INTERNED(op)) {
678
        case SSTATE_NOT_INTERNED:
679
            if (ascii->state.statically_allocated) {
680
                // This state is for two exceptions:
681
                // - strings are currently checked before they're interned
682
                // - the 256 one-latin1-character strings
683
                //   are static but use SSTATE_NOT_INTERNED
684
            }
685
            else {
686
                CHECK(!_Py_IsImmortal(op));
687
            }
688
            break;
689
        case SSTATE_INTERNED_MORTAL:
690
            CHECK(!ascii->state.statically_allocated);
691
            CHECK(!_Py_IsImmortal(op));
692
            break;
693
        case SSTATE_INTERNED_IMMORTAL:
694
            CHECK(!ascii->state.statically_allocated);
695
            break;
696
        case SSTATE_INTERNED_IMMORTAL_STATIC:
697
            CHECK(ascii->state.statically_allocated);
698
            break;
699
        default:
700
            Py_UNREACHABLE();
701
    }
702
#endif
703
704
0
    return 1;
705
706
0
#undef CHECK
707
0
}
708
709
/* Normalize a freshly built string before handing it to a caller.

   An empty string is replaced by the empty singleton and a one-character
   1-byte-kind string by the matching LATIN1() singleton; in both cases the
   reference to `unicode` is released unless it already is that singleton.
   Any other string is returned unchanged. */
PyObject *
_PyUnicode_Result(PyObject *unicode)
{
    assert(_PyUnicode_CHECK(unicode));

    PyObject *singleton = NULL;
    Py_ssize_t length = PyUnicode_GET_LENGTH(unicode);

    if (length == 0) {
        singleton = _PyUnicode_GetEmpty();
    }
    else if (length == 1 && PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
        const Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
        Py_UCS1 ch = data[0];
        singleton = LATIN1(ch);
    }

    if (singleton == NULL) {
        /* no shared singleton applies: keep the caller's object */
        assert(_PyUnicode_CheckConsistency(unicode, 1));
        return unicode;
    }

    if (unicode != singleton) {
        Py_DECREF(unicode);
    }
    return singleton;
}
739
1.48M
#define unicode_result _PyUnicode_Result
740
741
/* Return `unicode` as the result of an operation that did not modify it.
   An exact str instance is returned as a new reference to the same object;
   an instance of a subtype is copied with _PyUnicode_Copy() so the caller
   gets a genuine unicode string with the same value. */
static PyObject *
unicode_result_unchanged(PyObject *unicode)
{
    if (!PyUnicode_CheckExact(unicode)) {
        return _PyUnicode_Copy(unicode);
    }
    return Py_NewRef(unicode);
}
751
752
/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
753
   ASCII, Latin1, UTF-8, etc. */
754
static char*
755
backslashreplace(PyBytesWriter *writer, char *str,
756
                 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
757
0
{
758
0
    Py_ssize_t size, i;
759
0
    Py_UCS4 ch;
760
0
    int kind;
761
0
    const void *data;
762
763
0
    kind = PyUnicode_KIND(unicode);
764
0
    data = PyUnicode_DATA(unicode);
765
766
0
    size = 0;
767
    /* determine replacement size */
768
0
    for (i = collstart; i < collend; ++i) {
769
0
        Py_ssize_t incr;
770
771
0
        ch = PyUnicode_READ(kind, data, i);
772
0
        if (ch < 0x100)
773
0
            incr = 2+2;
774
0
        else if (ch < 0x10000)
775
0
            incr = 2+4;
776
0
        else {
777
0
            assert(ch <= MAX_UNICODE);
778
0
            incr = 2+8;
779
0
        }
780
0
        if (size > PY_SSIZE_T_MAX - incr) {
781
0
            PyErr_SetString(PyExc_OverflowError,
782
0
                            "encoded result is too long for a Python string");
783
0
            return NULL;
784
0
        }
785
0
        size += incr;
786
0
    }
787
788
0
    str = PyBytesWriter_GrowAndUpdatePointer(writer, size, str);
789
0
    if (str == NULL) {
790
0
        return NULL;
791
0
    }
792
793
    /* generate replacement */
794
0
    for (i = collstart; i < collend; ++i) {
795
0
        ch = PyUnicode_READ(kind, data, i);
796
0
        *str++ = '\\';
797
0
        if (ch >= 0x00010000) {
798
0
            *str++ = 'U';
799
0
            *str++ = Py_hexdigits[(ch>>28)&0xf];
800
0
            *str++ = Py_hexdigits[(ch>>24)&0xf];
801
0
            *str++ = Py_hexdigits[(ch>>20)&0xf];
802
0
            *str++ = Py_hexdigits[(ch>>16)&0xf];
803
0
            *str++ = Py_hexdigits[(ch>>12)&0xf];
804
0
            *str++ = Py_hexdigits[(ch>>8)&0xf];
805
0
        }
806
0
        else if (ch >= 0x100) {
807
0
            *str++ = 'u';
808
0
            *str++ = Py_hexdigits[(ch>>12)&0xf];
809
0
            *str++ = Py_hexdigits[(ch>>8)&0xf];
810
0
        }
811
0
        else
812
0
            *str++ = 'x';
813
0
        *str++ = Py_hexdigits[(ch>>4)&0xf];
814
0
        *str++ = Py_hexdigits[ch&0xf];
815
0
    }
816
0
    return str;
817
0
}
818
819
/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
820
   ASCII, Latin1, UTF-8, etc. */
821
static char*
822
xmlcharrefreplace(PyBytesWriter *writer, char *str,
823
                  PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
824
0
{
825
0
    Py_ssize_t size, i;
826
0
    Py_UCS4 ch;
827
0
    int kind;
828
0
    const void *data;
829
830
0
    kind = PyUnicode_KIND(unicode);
831
0
    data = PyUnicode_DATA(unicode);
832
833
0
    size = 0;
834
    /* determine replacement size */
835
0
    for (i = collstart; i < collend; ++i) {
836
0
        Py_ssize_t incr;
837
838
0
        ch = PyUnicode_READ(kind, data, i);
839
0
        if (ch < 10)
840
0
            incr = 2+1+1;
841
0
        else if (ch < 100)
842
0
            incr = 2+2+1;
843
0
        else if (ch < 1000)
844
0
            incr = 2+3+1;
845
0
        else if (ch < 10000)
846
0
            incr = 2+4+1;
847
0
        else if (ch < 100000)
848
0
            incr = 2+5+1;
849
0
        else if (ch < 1000000)
850
0
            incr = 2+6+1;
851
0
        else {
852
0
            assert(ch <= MAX_UNICODE);
853
0
            incr = 2+7+1;
854
0
        }
855
0
        if (size > PY_SSIZE_T_MAX - incr) {
856
0
            PyErr_SetString(PyExc_OverflowError,
857
0
                            "encoded result is too long for a Python string");
858
0
            return NULL;
859
0
        }
860
0
        size += incr;
861
0
    }
862
863
0
    str = PyBytesWriter_GrowAndUpdatePointer(writer, size, str);
864
0
    if (str == NULL) {
865
0
        return NULL;
866
0
    }
867
868
    /* generate replacement */
869
0
    for (i = collstart; i < collend; ++i) {
870
0
        size = sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
871
0
        if (size < 0) {
872
0
            return NULL;
873
0
        }
874
0
        str += size;
875
0
    }
876
0
    return str;
877
0
}
878
879
/* --- Bloom Filters ----------------------------------------------------- */
880
881
/* Stuff to implement simple "bloom filters" for Unicode characters.
882
   To keep things simple, we use a single bitmask, using the low-order
883
   log2(BLOOM_WIDTH) bits of each Unicode character as the bit index. */
884
885
/* the linebreak mask is set up by _PyUnicode_Init() below */
886
887
/* Number of bits in the bloom bitmask: the largest of 128/64/32 that is
   not greater than LONG_BIT. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

/* Integer type holding one bloom bitmask. */
#define BLOOM_MASK unsigned long

/* Starts with every bit set, so BLOOM() reports a possible match for any
   character until the mask is replaced. */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;

/* Non-zero if the bit selected by the low bits of `ch` is set in `mask`.
   Both arguments are parenthesized so that any expression may be passed. */
#define BLOOM(mask, ch)     (((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Line-break test: characters below 128 are looked up in ascii_linebreak[];
   for the others the bloom mask is consulted first and
   Py_UNICODE_ISLINEBREAK() is only called when the mask does not rule the
   character out.  Note that `ch` is evaluated more than once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
906
907
static inline BLOOM_MASK
908
make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
909
5.12M
{
910
5.12M
#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN)             \
911
5.12M
    do {                                               \
912
5.12M
        TYPE *data = (TYPE *)PTR;                      \
913
5.12M
        TYPE *end = data + LEN;                        \
914
5.12M
        Py_UCS4 ch;                                    \
915
12.1M
        for (; data != end; data++) {                  \
916
6.99M
            ch = *data;                                \
917
6.99M
            MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
918
6.99M
        }                                              \
919
5.12M
        break;                                         \
920
5.12M
    } while (0)
921
922
    /* calculate simple bloom-style bitmask for a given unicode string */
923
924
5.12M
    BLOOM_MASK mask;
925
926
5.12M
    mask = 0;
927
5.12M
    switch (kind) {
928
5.12M
    case PyUnicode_1BYTE_KIND:
929
5.12M
        BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
930
5.12M
        break;
931
36
    case PyUnicode_2BYTE_KIND:
932
36
        BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
933
36
        break;
934
0
    case PyUnicode_4BYTE_KIND:
935
0
        BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
936
0
        break;
937
0
    default:
938
0
        Py_UNREACHABLE();
939
5.12M
    }
940
5.12M
    return mask;
941
942
5.12M
#undef BLOOM_UPDATE
943
5.12M
}
944
945
/* Compilation of templated routines */
946
947
1.03M
#define STRINGLIB_GET_EMPTY() _PyUnicode_GetEmpty()
948
949
#include "stringlib/asciilib.h"
950
#include "stringlib/fastsearch.h"
951
#include "stringlib/partition.h"
952
#include "stringlib/split.h"
953
#include "stringlib/count.h"
954
#include "stringlib/find.h"
955
#include "stringlib/find_max_char.h"
956
#include "stringlib/undef.h"
957
958
#include "stringlib/ucs1lib.h"
959
#include "stringlib/fastsearch.h"
960
#include "stringlib/partition.h"
961
#include "stringlib/split.h"
962
#include "stringlib/count.h"
963
#include "stringlib/find.h"
964
#include "stringlib/replace.h"
965
#include "stringlib/repr.h"
966
#include "stringlib/find_max_char.h"
967
#include "stringlib/undef.h"
968
969
#include "stringlib/ucs2lib.h"
970
#include "stringlib/fastsearch.h"
971
#include "stringlib/partition.h"
972
#include "stringlib/split.h"
973
#include "stringlib/count.h"
974
#include "stringlib/find.h"
975
#include "stringlib/replace.h"
976
#include "stringlib/repr.h"
977
#include "stringlib/find_max_char.h"
978
#include "stringlib/undef.h"
979
980
#include "stringlib/ucs4lib.h"
981
#include "stringlib/fastsearch.h"
982
#include "stringlib/partition.h"
983
#include "stringlib/split.h"
984
#include "stringlib/count.h"
985
#include "stringlib/find.h"
986
#include "stringlib/replace.h"
987
#include "stringlib/repr.h"
988
#include "stringlib/find_max_char.h"
989
#include "stringlib/undef.h"
990
991
#undef STRINGLIB_GET_EMPTY
992
993
/* --- Unicode Object ----------------------------------------------------- */
994
995
static inline Py_ssize_t
996
findchar(const void *s, int kind,
997
         Py_ssize_t size, Py_UCS4 ch,
998
         int direction)
999
216M
{
1000
216M
    switch (kind) {
1001
208M
    case PyUnicode_1BYTE_KIND:
1002
208M
        if ((Py_UCS1) ch != ch)
1003
3.72k
            return -1;
1004
208M
        if (direction > 0)
1005
208M
            return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
1006
68.2k
        else
1007
68.2k
            return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
1008
5.01M
    case PyUnicode_2BYTE_KIND:
1009
5.01M
        if ((Py_UCS2) ch != ch)
1010
0
            return -1;
1011
5.01M
        if (direction > 0)
1012
4.72M
            return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
1013
292k
        else
1014
292k
            return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
1015
3.03M
    case PyUnicode_4BYTE_KIND:
1016
3.03M
        if (direction > 0)
1017
2.93M
            return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
1018
98.9k
        else
1019
98.9k
            return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
1020
0
    default:
1021
0
        Py_UNREACHABLE();
1022
216M
    }
1023
216M
}
1024
1025
#ifdef Py_DEBUG
1026
/* Fill the data of a Unicode string with invalid characters to detect bugs
1027
   earlier.
1028
1029
   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
1030
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
1031
   invalid character in Unicode 6.0. */
1032
static void
1033
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
1034
{
1035
    int kind = PyUnicode_KIND(unicode);
1036
    Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
1037
    Py_ssize_t length = _PyUnicode_LENGTH(unicode);
1038
    if (length <= old_length)
1039
        return;
1040
    memset(data + old_length * kind, 0xff, (length - old_length) * kind);
1041
}
1042
#endif
1043
1044
static PyObject*
1045
resize_copy(PyObject *unicode, Py_ssize_t length)
1046
0
{
1047
0
    Py_ssize_t copy_length;
1048
0
    PyObject *copy;
1049
1050
0
    copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1051
0
    if (copy == NULL)
1052
0
        return NULL;
1053
1054
0
    copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
1055
0
    _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
1056
0
    return copy;
1057
0
}
1058
1059
PyObject*
1060
_PyUnicode_ResizeCompact(PyObject *unicode, Py_ssize_t length)
1061
66.0M
{
1062
66.0M
    Py_ssize_t char_size;
1063
66.0M
    Py_ssize_t struct_size;
1064
66.0M
    Py_ssize_t new_size;
1065
66.0M
    PyObject *new_unicode;
1066
#ifdef Py_DEBUG
1067
    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1068
#endif
1069
1070
66.0M
    if (!_PyUnicode_IsModifiable(unicode)) {
1071
0
        PyObject *copy = resize_copy(unicode, length);
1072
0
        if (copy == NULL) {
1073
0
            return NULL;
1074
0
        }
1075
0
        Py_DECREF(unicode);
1076
0
        return copy;
1077
0
    }
1078
66.0M
    assert(PyUnicode_IS_COMPACT(unicode));
1079
1080
66.0M
    char_size = PyUnicode_KIND(unicode);
1081
66.0M
    if (PyUnicode_IS_ASCII(unicode))
1082
43.2M
        struct_size = sizeof(PyASCIIObject);
1083
22.8M
    else
1084
22.8M
        struct_size = sizeof(PyCompactUnicodeObject);
1085
1086
66.0M
    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1087
0
        PyErr_NoMemory();
1088
0
        return NULL;
1089
0
    }
1090
66.0M
    new_size = (struct_size + (length + 1) * char_size);
1091
1092
66.0M
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1093
0
        PyMem_Free(_PyUnicode_UTF8(unicode));
1094
0
        PyUnicode_SET_UTF8_LENGTH(unicode, 0);
1095
0
        PyUnicode_SET_UTF8(unicode, NULL);
1096
0
    }
1097
#ifdef Py_TRACE_REFS
1098
    _Py_ForgetReference(unicode);
1099
#endif
1100
66.0M
    _PyReftracerTrack(unicode, PyRefTracer_DESTROY);
1101
1102
66.0M
    new_unicode = (PyObject *)PyObject_Realloc(unicode, new_size);
1103
66.0M
    if (new_unicode == NULL) {
1104
0
        _Py_NewReferenceNoTotal(unicode);
1105
0
        PyErr_NoMemory();
1106
0
        return NULL;
1107
0
    }
1108
66.0M
    unicode = new_unicode;
1109
66.0M
    _Py_NewReferenceNoTotal(unicode);
1110
1111
66.0M
    _PyUnicode_LENGTH(unicode) = length;
1112
#ifdef Py_DEBUG
1113
    unicode_fill_invalid(unicode, old_length);
1114
#endif
1115
66.0M
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1116
66.0M
                    length, 0);
1117
66.0M
    assert(_PyUnicode_CheckConsistency(unicode, 0));
1118
66.0M
    return unicode;
1119
66.0M
}
1120
1121
static int
1122
resize_inplace(PyObject *unicode, Py_ssize_t length)
1123
0
{
1124
0
    assert(!PyUnicode_IS_COMPACT(unicode));
1125
0
    assert(Py_REFCNT(unicode) == 1);
1126
1127
0
    Py_ssize_t new_size;
1128
0
    Py_ssize_t char_size;
1129
0
    int share_utf8;
1130
0
    void *data;
1131
#ifdef Py_DEBUG
1132
    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1133
#endif
1134
1135
0
    data = _PyUnicode_DATA_ANY(unicode);
1136
0
    char_size = PyUnicode_KIND(unicode);
1137
0
    share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
1138
1139
0
    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1140
0
        PyErr_NoMemory();
1141
0
        return -1;
1142
0
    }
1143
0
    new_size = (length + 1) * char_size;
1144
1145
0
    if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1146
0
    {
1147
0
        PyMem_Free(_PyUnicode_UTF8(unicode));
1148
0
        PyUnicode_SET_UTF8_LENGTH(unicode, 0);
1149
0
        PyUnicode_SET_UTF8(unicode, NULL);
1150
0
    }
1151
1152
0
    data = (PyObject *)PyObject_Realloc(data, new_size);
1153
0
    if (data == NULL) {
1154
0
        PyErr_NoMemory();
1155
0
        return -1;
1156
0
    }
1157
0
    _PyUnicode_DATA_ANY(unicode) = data;
1158
0
    if (share_utf8) {
1159
0
        PyUnicode_SET_UTF8_LENGTH(unicode, length);
1160
0
        PyUnicode_SET_UTF8(unicode, data);
1161
0
    }
1162
0
    _PyUnicode_LENGTH(unicode) = length;
1163
0
    PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
1164
#ifdef Py_DEBUG
1165
    unicode_fill_invalid(unicode, old_length);
1166
#endif
1167
1168
    /* check for integer overflow */
1169
0
    if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
1170
0
        PyErr_NoMemory();
1171
0
        return -1;
1172
0
    }
1173
0
    assert(_PyUnicode_CheckConsistency(unicode, 0));
1174
0
    return 0;
1175
0
}
1176
1177
static const char*
1178
unicode_kind_name(PyObject *unicode)
1179
0
{
1180
    /* don't check consistency: unicode_kind_name() is called from
1181
       _PyUnicode_Dump() */
1182
0
    if (!PyUnicode_IS_COMPACT(unicode))
1183
0
    {
1184
0
        switch (PyUnicode_KIND(unicode))
1185
0
        {
1186
0
        case PyUnicode_1BYTE_KIND:
1187
0
            if (PyUnicode_IS_ASCII(unicode))
1188
0
                return "legacy ascii";
1189
0
            else
1190
0
                return "legacy latin1";
1191
0
        case PyUnicode_2BYTE_KIND:
1192
0
            return "legacy UCS2";
1193
0
        case PyUnicode_4BYTE_KIND:
1194
0
            return "legacy UCS4";
1195
0
        default:
1196
0
            return "<legacy invalid kind>";
1197
0
        }
1198
0
    }
1199
0
    switch (PyUnicode_KIND(unicode)) {
1200
0
    case PyUnicode_1BYTE_KIND:
1201
0
        if (PyUnicode_IS_ASCII(unicode))
1202
0
            return "ascii";
1203
0
        else
1204
0
            return "latin1";
1205
0
    case PyUnicode_2BYTE_KIND:
1206
0
        return "UCS2";
1207
0
    case PyUnicode_4BYTE_KIND:
1208
0
        return "UCS4";
1209
0
    default:
1210
0
        return "<invalid compact kind>";
1211
0
    }
1212
0
}
1213
1214
#ifdef Py_DEBUG
1215
/* Functions wrapping macros for use in debugger */
1216
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
const char *
_PyUnicode_utf8(void *unicode_raw)
{
    return PyUnicode_UTF8(_PyObject_CAST(unicode_raw));
}
1220
1221
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
const void *
_PyUnicode_compact_data(void *unicode_raw)
{
    return _PyUnicode_COMPACT_DATA(_PyObject_CAST(unicode_raw));
}
1225
const void *_PyUnicode_data(void *unicode_raw) {
1226
    PyObject *unicode = _PyObject_CAST(unicode_raw);
1227
    printf("obj %p\n", (void*)unicode);
1228
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1229
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1230
    printf("ascii op %p\n", (void*)(_PyASCIIObject_CAST(unicode) + 1));
1231
    printf("compact op %p\n", (void*)(_PyCompactUnicodeObject_CAST(unicode) + 1));
1232
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1233
    return PyUnicode_DATA(unicode);
1234
}
1235
1236
void
1237
_PyUnicode_Dump(PyObject *op)
1238
{
1239
    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
1240
    PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
1241
    PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
1242
    const void *data;
1243
1244
    if (ascii->state.compact)
1245
    {
1246
        if (ascii->state.ascii)
1247
            data = (ascii + 1);
1248
        else
1249
            data = (compact + 1);
1250
    }
1251
    else
1252
        data = unicode->data.any;
1253
    printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);
1254
1255
    if (!ascii->state.ascii) {
1256
        printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
1257
    }
1258
    printf(", data=%p\n", data);
1259
}
1260
#endif
1261
1262
1263
PyObject *
1264
PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1265
523M
{
1266
    /* Optimization for empty strings */
1267
523M
    if (size == 0) {
1268
23.8M
        return _PyUnicode_GetEmpty();
1269
23.8M
    }
1270
1271
499M
    PyObject *obj;
1272
499M
    PyCompactUnicodeObject *unicode;
1273
499M
    void *data;
1274
499M
    int kind;
1275
499M
    int is_ascii;
1276
499M
    Py_ssize_t char_size;
1277
499M
    Py_ssize_t struct_size;
1278
1279
499M
    is_ascii = 0;
1280
499M
    struct_size = sizeof(PyCompactUnicodeObject);
1281
499M
    if (maxchar < 128) {
1282
352M
        kind = PyUnicode_1BYTE_KIND;
1283
352M
        char_size = 1;
1284
352M
        is_ascii = 1;
1285
352M
        struct_size = sizeof(PyASCIIObject);
1286
352M
    }
1287
146M
    else if (maxchar < 256) {
1288
14.4M
        kind = PyUnicode_1BYTE_KIND;
1289
14.4M
        char_size = 1;
1290
14.4M
    }
1291
132M
    else if (maxchar < 65536) {
1292
125M
        kind = PyUnicode_2BYTE_KIND;
1293
125M
        char_size = 2;
1294
125M
    }
1295
7.17M
    else {
1296
7.17M
        if (maxchar > MAX_UNICODE) {
1297
0
            PyErr_SetString(PyExc_SystemError,
1298
0
                            "invalid maximum character passed to PyUnicode_New");
1299
0
            return NULL;
1300
0
        }
1301
7.17M
        kind = PyUnicode_4BYTE_KIND;
1302
7.17M
        char_size = 4;
1303
7.17M
    }
1304
1305
    /* Ensure we won't overflow the size. */
1306
499M
    if (size < 0) {
1307
0
        PyErr_SetString(PyExc_SystemError,
1308
0
                        "Negative size passed to PyUnicode_New");
1309
0
        return NULL;
1310
0
    }
1311
499M
    if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1312
0
        return PyErr_NoMemory();
1313
1314
    /* Duplicated allocation code from _PyObject_New() instead of a call to
1315
     * PyObject_New() so we are able to allocate space for the object and
1316
     * it's data buffer.
1317
     */
1318
499M
    obj = (PyObject *) PyObject_Malloc(struct_size + (size + 1) * char_size);
1319
499M
    if (obj == NULL) {
1320
0
        return PyErr_NoMemory();
1321
0
    }
1322
499M
    _PyObject_Init(obj, &PyUnicode_Type);
1323
1324
499M
    unicode = (PyCompactUnicodeObject *)obj;
1325
499M
    if (is_ascii)
1326
352M
        data = ((PyASCIIObject*)obj) + 1;
1327
146M
    else
1328
146M
        data = unicode + 1;
1329
499M
    _PyUnicode_LENGTH(unicode) = size;
1330
499M
    _PyUnicode_HASH(unicode) = -1;
1331
499M
    _PyUnicode_STATE(unicode).interned = 0;
1332
499M
    _PyUnicode_STATE(unicode).kind = kind;
1333
499M
    _PyUnicode_STATE(unicode).compact = 1;
1334
499M
    _PyUnicode_STATE(unicode).ascii = is_ascii;
1335
499M
    _PyUnicode_STATE(unicode).statically_allocated = 0;
1336
499M
    if (is_ascii) {
1337
352M
        ((char*)data)[size] = 0;
1338
352M
    }
1339
146M
    else if (kind == PyUnicode_1BYTE_KIND) {
1340
14.4M
        ((char*)data)[size] = 0;
1341
14.4M
        unicode->utf8 = NULL;
1342
14.4M
        unicode->utf8_length = 0;
1343
14.4M
    }
1344
132M
    else {
1345
132M
        unicode->utf8 = NULL;
1346
132M
        unicode->utf8_length = 0;
1347
132M
        if (kind == PyUnicode_2BYTE_KIND)
1348
125M
            ((Py_UCS2*)data)[size] = 0;
1349
7.17M
        else /* kind == PyUnicode_4BYTE_KIND */
1350
7.17M
            ((Py_UCS4*)data)[size] = 0;
1351
132M
    }
1352
#ifdef Py_DEBUG
1353
    unicode_fill_invalid((PyObject*)unicode, 0);
1354
#endif
1355
499M
    assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1356
499M
    return obj;
1357
499M
}
1358
1359
static int
1360
unicode_check_modifiable(PyObject *unicode)
1361
641
{
1362
641
    if (!_PyUnicode_IsModifiable(unicode)) {
1363
0
        PyErr_SetString(PyExc_SystemError,
1364
0
                        "Cannot modify a string currently used");
1365
0
        return -1;
1366
0
    }
1367
641
    return 0;
1368
641
}
1369
1370
static int
1371
_copy_characters(PyObject *to, Py_ssize_t to_start,
1372
                 PyObject *from, Py_ssize_t from_start,
1373
                 Py_ssize_t how_many, int check_maxchar)
1374
283M
{
1375
283M
    int from_kind, to_kind;
1376
283M
    const void *from_data;
1377
283M
    void *to_data;
1378
1379
283M
    assert(0 <= how_many);
1380
283M
    assert(0 <= from_start);
1381
283M
    assert(0 <= to_start);
1382
283M
    assert(PyUnicode_Check(from));
1383
283M
    assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1384
1385
283M
    assert(to == NULL || PyUnicode_Check(to));
1386
1387
283M
    if (how_many == 0) {
1388
4.32M
        return 0;
1389
4.32M
    }
1390
1391
283M
    assert(to != NULL);
1392
278M
    assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1393
1394
278M
    from_kind = PyUnicode_KIND(from);
1395
278M
    from_data = PyUnicode_DATA(from);
1396
278M
    to_kind = PyUnicode_KIND(to);
1397
278M
    to_data = PyUnicode_DATA(to);
1398
1399
#ifdef Py_DEBUG
1400
    if (!check_maxchar
1401
        && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1402
    {
1403
        Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1404
        Py_UCS4 ch;
1405
        Py_ssize_t i;
1406
        for (i=0; i < how_many; i++) {
1407
            ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1408
            assert(ch <= to_maxchar);
1409
        }
1410
    }
1411
#endif
1412
1413
278M
    if (from_kind == to_kind) {
1414
183M
        if (check_maxchar
1415
0
            && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1416
0
        {
1417
            /* Writing Latin-1 characters into an ASCII string requires to
1418
               check that all written characters are pure ASCII */
1419
0
            Py_UCS4 max_char;
1420
0
            max_char = ucs1lib_find_max_char(from_data,
1421
0
                                             (const Py_UCS1*)from_data + how_many);
1422
0
            if (max_char >= 128)
1423
0
                return -1;
1424
0
        }
1425
183M
        memcpy((char*)to_data + to_kind * to_start,
1426
183M
                  (const char*)from_data + from_kind * from_start,
1427
183M
                  to_kind * how_many);
1428
183M
    }
1429
95.3M
    else if (from_kind == PyUnicode_1BYTE_KIND
1430
93.5M
             && to_kind == PyUnicode_2BYTE_KIND)
1431
81.9M
    {
1432
81.9M
        _PyUnicode_CONVERT_BYTES(
1433
81.9M
            Py_UCS1, Py_UCS2,
1434
81.9M
            PyUnicode_1BYTE_DATA(from) + from_start,
1435
81.9M
            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1436
81.9M
            PyUnicode_2BYTE_DATA(to) + to_start
1437
81.9M
            );
1438
81.9M
    }
1439
13.4M
    else if (from_kind == PyUnicode_1BYTE_KIND
1440
11.6M
             && to_kind == PyUnicode_4BYTE_KIND)
1441
11.6M
    {
1442
11.6M
        _PyUnicode_CONVERT_BYTES(
1443
11.6M
            Py_UCS1, Py_UCS4,
1444
11.6M
            PyUnicode_1BYTE_DATA(from) + from_start,
1445
11.6M
            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1446
11.6M
            PyUnicode_4BYTE_DATA(to) + to_start
1447
11.6M
            );
1448
11.6M
    }
1449
1.74M
    else if (from_kind == PyUnicode_2BYTE_KIND
1450
1.73M
             && to_kind == PyUnicode_4BYTE_KIND)
1451
1.72M
    {
1452
1.72M
        _PyUnicode_CONVERT_BYTES(
1453
1.72M
            Py_UCS2, Py_UCS4,
1454
1.72M
            PyUnicode_2BYTE_DATA(from) + from_start,
1455
1.72M
            PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1456
1.72M
            PyUnicode_4BYTE_DATA(to) + to_start
1457
1.72M
            );
1458
1.72M
    }
1459
15.5k
    else {
1460
15.5k
        assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1461
1462
15.5k
        if (!check_maxchar) {
1463
15.5k
            if (from_kind == PyUnicode_2BYTE_KIND
1464
3.23k
                && to_kind == PyUnicode_1BYTE_KIND)
1465
3.23k
            {
1466
3.23k
                _PyUnicode_CONVERT_BYTES(
1467
3.23k
                    Py_UCS2, Py_UCS1,
1468
3.23k
                    PyUnicode_2BYTE_DATA(from) + from_start,
1469
3.23k
                    PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1470
3.23k
                    PyUnicode_1BYTE_DATA(to) + to_start
1471
3.23k
                    );
1472
3.23k
            }
1473
12.2k
            else if (from_kind == PyUnicode_4BYTE_KIND
1474
12.2k
                     && to_kind == PyUnicode_1BYTE_KIND)
1475
7.74k
            {
1476
7.74k
                _PyUnicode_CONVERT_BYTES(
1477
7.74k
                    Py_UCS4, Py_UCS1,
1478
7.74k
                    PyUnicode_4BYTE_DATA(from) + from_start,
1479
7.74k
                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1480
7.74k
                    PyUnicode_1BYTE_DATA(to) + to_start
1481
7.74k
                    );
1482
7.74k
            }
1483
4.54k
            else if (from_kind == PyUnicode_4BYTE_KIND
1484
4.54k
                     && to_kind == PyUnicode_2BYTE_KIND)
1485
4.54k
            {
1486
4.54k
                _PyUnicode_CONVERT_BYTES(
1487
4.54k
                    Py_UCS4, Py_UCS2,
1488
4.54k
                    PyUnicode_4BYTE_DATA(from) + from_start,
1489
4.54k
                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1490
4.54k
                    PyUnicode_2BYTE_DATA(to) + to_start
1491
4.54k
                    );
1492
4.54k
            }
1493
0
            else {
1494
0
                Py_UNREACHABLE();
1495
0
            }
1496
15.5k
        }
1497
0
        else {
1498
0
            const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1499
0
            Py_UCS4 ch;
1500
0
            Py_ssize_t i;
1501
1502
0
            for (i=0; i < how_many; i++) {
1503
0
                ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1504
0
                if (ch > to_maxchar)
1505
0
                    return -1;
1506
0
                PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1507
0
            }
1508
0
        }
1509
15.5k
    }
1510
278M
    return 0;
1511
278M
}
1512
1513
void
1514
_PyUnicode_FastCopyCharacters(
1515
    PyObject *to, Py_ssize_t to_start,
1516
    PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1517
283M
{
1518
283M
    (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1519
283M
}
1520
1521
Py_ssize_t
1522
PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1523
                         PyObject *from, Py_ssize_t from_start,
1524
                         Py_ssize_t how_many)
1525
0
{
1526
0
    int err;
1527
1528
0
    if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1529
0
        PyErr_BadInternalCall();
1530
0
        return -1;
1531
0
    }
1532
1533
0
    if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
1534
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
1535
0
        return -1;
1536
0
    }
1537
0
    if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
1538
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
1539
0
        return -1;
1540
0
    }
1541
0
    if (how_many < 0) {
1542
0
        PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1543
0
        return -1;
1544
0
    }
1545
0
    how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
1546
0
    if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1547
0
        PyErr_Format(PyExc_SystemError,
1548
0
                     "Cannot write %zi characters at %zi "
1549
0
                     "in a string of %zi characters",
1550
0
                     how_many, to_start, PyUnicode_GET_LENGTH(to));
1551
0
        return -1;
1552
0
    }
1553
1554
0
    if (how_many == 0)
1555
0
        return 0;
1556
1557
0
    if (unicode_check_modifiable(to))
1558
0
        return -1;
1559
1560
0
    err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1561
0
    if (err) {
1562
0
        PyErr_Format(PyExc_SystemError,
1563
0
                     "Cannot copy %s characters "
1564
0
                     "into a string of %s characters",
1565
0
                     unicode_kind_name(from),
1566
0
                     unicode_kind_name(to));
1567
0
        return -1;
1568
0
    }
1569
0
    return how_many;
1570
0
}
1571
1572
/* Find the maximum code point and count the number of surrogate pairs so a
1573
   correct string length can be computed before converting a string to UCS4.
1574
   This function counts single surrogates as a character and not as a pair.
1575
1576
   Return 0 on success, or -1 on error. */
1577
static int
1578
find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1579
                        Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1580
619k
{
1581
619k
    const wchar_t *iter;
1582
619k
    Py_UCS4 ch;
1583
1584
619k
    assert(num_surrogates != NULL && maxchar != NULL);
1585
619k
    *num_surrogates = 0;
1586
619k
    *maxchar = 0;
1587
1588
15.2M
    for (iter = begin; iter < end; ) {
1589
#if SIZEOF_WCHAR_T == 2
1590
        if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1591
            && (iter+1) < end
1592
            && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1593
        {
1594
            ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1595
            ++(*num_surrogates);
1596
            iter += 2;
1597
        }
1598
        else
1599
#endif
1600
14.6M
        {
1601
14.6M
            ch = *iter;
1602
14.6M
            iter++;
1603
14.6M
        }
1604
14.6M
        if (ch > *maxchar) {
1605
2.46M
            *maxchar = ch;
1606
2.46M
            if (*maxchar > MAX_UNICODE) {
1607
0
                PyErr_Format(PyExc_ValueError,
1608
0
                             "character U+%x is not in range [U+0000; U+%x]",
1609
0
                             ch, MAX_UNICODE);
1610
0
                return -1;
1611
0
            }
1612
2.46M
        }
1613
14.6M
    }
1614
619k
    return 0;
1615
619k
}
1616
1617
static void
1618
unicode_dealloc(PyObject *unicode)
1619
513M
{
1620
#ifdef Py_DEBUG
1621
    if (!unicode_is_finalizing() && unicode_is_singleton(unicode)) {
1622
        _Py_FatalRefcountError("deallocating an Unicode singleton");
1623
    }
1624
#endif
1625
513M
    if (_PyUnicode_STATE(unicode).statically_allocated) {
1626
        /* This should never get called, but we also don't want to SEGV if
1627
        * we accidentally decref an immortal string out of existence. Since
1628
        * the string is an immortal object, just re-set the reference count.
1629
        */
1630
#ifdef Py_DEBUG
1631
        Py_UNREACHABLE();
1632
#endif
1633
0
        _Py_SetImmortal(unicode);
1634
0
        return;
1635
0
    }
1636
513M
    switch (_PyUnicode_STATE(unicode).interned) {
1637
513M
        case SSTATE_NOT_INTERNED:
1638
513M
            break;
1639
573k
        case SSTATE_INTERNED_MORTAL:
1640
            /* Remove the object from the intern dict.
1641
             * Before doing so, we set the refcount to 2: the key and value
1642
             * in the interned_dict.
1643
             */
1644
573k
            assert(Py_REFCNT(unicode) == 0);
1645
573k
            Py_SET_REFCNT(unicode, 2);
1646
#ifdef Py_REF_DEBUG
1647
            /* let's be pedantic with the ref total */
1648
            _Py_IncRefTotal(_PyThreadState_GET());
1649
            _Py_IncRefTotal(_PyThreadState_GET());
1650
#endif
1651
573k
            PyInterpreterState *interp = _PyInterpreterState_GET();
1652
573k
            PyObject *interned = get_interned_dict(interp);
1653
573k
            assert(interned != NULL);
1654
573k
            PyObject *popped;
1655
573k
            int r = PyDict_Pop(interned, unicode, &popped);
1656
573k
            if (r == -1) {
1657
0
                PyErr_FormatUnraisable("Exception ignored while "
1658
0
                                       "removing an interned string %R",
1659
0
                                       unicode);
1660
                // We don't know what happened to the string. It's probably
1661
                // best to leak it:
1662
                // - if it was popped, there are no more references to it
1663
                //   so it can't cause trouble (except wasted memory)
1664
                // - if it wasn't popped, it'll remain interned
1665
0
                _Py_SetImmortal(unicode);
1666
0
                _PyUnicode_STATE(unicode).interned = SSTATE_INTERNED_IMMORTAL;
1667
0
                return;
1668
0
            }
1669
573k
            if (r == 0) {
1670
                // The interned string was not found in the interned_dict.
1671
#ifdef Py_DEBUG
1672
                Py_UNREACHABLE();
1673
#endif
1674
0
                _Py_SetImmortal(unicode);
1675
0
                return;
1676
0
            }
1677
            // Successfully popped.
1678
573k
            assert(popped == unicode);
1679
            // Only our `popped` reference should be left; remove it too.
1680
573k
            assert(Py_REFCNT(unicode) == 1);
1681
573k
            Py_SET_REFCNT(unicode, 0);
1682
#ifdef Py_REF_DEBUG
1683
            /* let's be pedantic with the ref total */
1684
            _Py_DecRefTotal(_PyThreadState_GET());
1685
#endif
1686
573k
            break;
1687
0
        default:
1688
            // As with `statically_allocated` above.
1689
#ifdef Py_REF_DEBUG
1690
            Py_UNREACHABLE();
1691
#endif
1692
0
            _Py_SetImmortal(unicode);
1693
0
            return;
1694
513M
    }
1695
513M
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1696
158k
        PyMem_Free(_PyUnicode_UTF8(unicode));
1697
158k
    }
1698
513M
    if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
1699
16.1M
        PyMem_Free(_PyUnicode_DATA_ANY(unicode));
1700
16.1M
    }
1701
1702
513M
    Py_TYPE(unicode)->tp_free(unicode);
1703
513M
}
1704
1705
#ifdef Py_DEBUG
1706
static int
1707
unicode_is_singleton(PyObject *unicode)
1708
{
1709
    if (unicode == &_Py_STR(empty)) {
1710
        return 1;
1711
    }
1712
1713
    PyASCIIObject *ascii = _PyASCIIObject_CAST(unicode);
1714
    if (ascii->length == 1) {
1715
        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1716
        if (ch < 256 && LATIN1(ch) == unicode) {
1717
            return 1;
1718
        }
1719
    }
1720
    return 0;
1721
}
1722
#endif
1723
1724
int
1725
_PyUnicode_IsModifiable(PyObject *unicode)
1726
73.5M
{
1727
73.5M
    assert(_PyUnicode_CHECK(unicode));
1728
73.5M
    if (!_PyObject_IsUniquelyReferenced(unicode))
1729
2.75M
        return 0;
1730
70.7M
    if (PyUnicode_HASH(unicode) != -1)
1731
0
        return 0;
1732
70.7M
    if (PyUnicode_CHECK_INTERNED(unicode))
1733
0
        return 0;
1734
70.7M
    if (!PyUnicode_CheckExact(unicode))
1735
0
        return 0;
1736
#ifdef Py_DEBUG
1737
    /* singleton refcount is greater than 1 */
1738
    assert(!unicode_is_singleton(unicode));
1739
#endif
1740
70.7M
    return 1;
1741
70.7M
}
1742
1743
static int
1744
unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1745
2.34M
{
1746
2.34M
    PyObject *unicode;
1747
2.34M
    Py_ssize_t old_length;
1748
1749
2.34M
    assert(p_unicode != NULL);
1750
2.34M
    unicode = *p_unicode;
1751
1752
2.34M
    assert(unicode != NULL);
1753
2.34M
    assert(PyUnicode_Check(unicode));
1754
2.34M
    assert(0 <= length);
1755
1756
2.34M
    old_length = PyUnicode_GET_LENGTH(unicode);
1757
2.34M
    if (old_length == length)
1758
0
        return 0;
1759
1760
2.34M
    if (length == 0) {
1761
0
        PyObject *empty = _PyUnicode_GetEmpty();
1762
0
        Py_SETREF(*p_unicode, empty);
1763
0
        return 0;
1764
0
    }
1765
1766
2.34M
    if (!_PyUnicode_IsModifiable(unicode)) {
1767
0
        PyObject *copy = resize_copy(unicode, length);
1768
0
        if (copy == NULL)
1769
0
            return -1;
1770
0
        Py_SETREF(*p_unicode, copy);
1771
0
        return 0;
1772
0
    }
1773
1774
2.34M
    if (PyUnicode_IS_COMPACT(unicode)) {
1775
2.34M
        PyObject *new_unicode = _PyUnicode_ResizeCompact(unicode, length);
1776
2.34M
        if (new_unicode == NULL)
1777
0
            return -1;
1778
2.34M
        *p_unicode = new_unicode;
1779
2.34M
        return 0;
1780
2.34M
    }
1781
0
    return resize_inplace(unicode, length);
1782
2.34M
}
1783
1784
int
1785
PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1786
0
{
1787
0
    PyObject *unicode;
1788
0
    if (p_unicode == NULL) {
1789
0
        PyErr_BadInternalCall();
1790
0
        return -1;
1791
0
    }
1792
0
    unicode = *p_unicode;
1793
0
    if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
1794
0
    {
1795
0
        PyErr_BadInternalCall();
1796
0
        return -1;
1797
0
    }
1798
0
    return unicode_resize(p_unicode, length);
1799
0
}
1800
1801
static PyObject*
1802
get_latin1_char(Py_UCS1 ch)
1803
200M
{
1804
200M
    PyObject *o = LATIN1(ch);
1805
200M
    return o;
1806
200M
}
1807
1808
static PyObject*
1809
unicode_char(Py_UCS4 ch)
1810
189M
{
1811
189M
    PyObject *unicode;
1812
1813
189M
    assert(ch <= MAX_UNICODE);
1814
1815
189M
    if (ch < 256) {
1816
111M
        return get_latin1_char(ch);
1817
111M
    }
1818
1819
77.6M
    unicode = PyUnicode_New(1, ch);
1820
77.6M
    if (unicode == NULL)
1821
0
        return NULL;
1822
1823
77.6M
    assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
1824
77.6M
    if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
1825
72.7M
        PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1826
72.7M
    } else {
1827
4.88M
        assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1828
4.88M
        PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1829
4.88M
    }
1830
77.6M
    assert(_PyUnicode_CheckConsistency(unicode, 1));
1831
77.6M
    return unicode;
1832
77.6M
}
1833
1834
1835
static inline void
1836
unicode_write_widechar(int kind, void *data,
1837
                       const wchar_t *u, Py_ssize_t size,
1838
                       Py_ssize_t num_surrogates)
1839
619k
{
1840
619k
    switch (kind) {
1841
548k
    case PyUnicode_1BYTE_KIND:
1842
548k
        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char, u, u + size, data);
1843
548k
        break;
1844
1845
69.7k
    case PyUnicode_2BYTE_KIND:
1846
#if SIZEOF_WCHAR_T == 2
1847
        memcpy(data, u, size * 2);
1848
#else
1849
69.7k
        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2, u, u + size, data);
1850
69.7k
#endif
1851
69.7k
        break;
1852
1853
789
    case PyUnicode_4BYTE_KIND:
1854
789
    {
1855
#if SIZEOF_WCHAR_T == 2
1856
        // Convert a 16-bits wchar_t representation to UCS4, this will decode
1857
        // surrogate pairs.
1858
        const wchar_t *end = u + size;
1859
        Py_UCS4 *ucs4_out = (Py_UCS4*)data;
1860
#  ifndef NDEBUG
1861
        Py_UCS4 *ucs4_end = (Py_UCS4*)data + (size - num_surrogates);
1862
#  endif
1863
        for (const wchar_t *iter = u; iter < end; ) {
1864
            assert(ucs4_out < ucs4_end);
1865
            if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1866
                && (iter+1) < end
1867
                && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1868
            {
1869
                *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1870
                iter += 2;
1871
            }
1872
            else {
1873
                *ucs4_out++ = *iter;
1874
                iter++;
1875
            }
1876
        }
1877
        assert(ucs4_out == ucs4_end);
1878
#else
1879
789
        assert(num_surrogates == 0);
1880
789
        memcpy(data, u, size * 4);
1881
789
#endif
1882
789
        break;
1883
0
    }
1884
0
    default:
1885
0
        Py_UNREACHABLE();
1886
619k
    }
1887
619k
}
1888
1889
1890
PyObject *
1891
PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
1892
1.17M
{
1893
1.17M
    PyObject *unicode;
1894
1.17M
    Py_UCS4 maxchar = 0;
1895
1.17M
    Py_ssize_t num_surrogates;
1896
1897
1.17M
    if (u == NULL && size != 0) {
1898
0
        PyErr_BadInternalCall();
1899
0
        return NULL;
1900
0
    }
1901
1902
1.17M
    if (size == -1) {
1903
1.29k
        size = wcslen(u);
1904
1.29k
    }
1905
1906
    /* If the Unicode data is known at construction time, we can apply
1907
       some optimizations which share commonly used objects. */
1908
1909
    /* Optimization for empty strings */
1910
1.17M
    if (size == 0)
1911
448k
        _Py_RETURN_UNICODE_EMPTY();
1912
1913
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
1914
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
1915
       non-Unicode locales and hence needs conversion to UCS-4 first. */
1916
    if (_Py_LocaleUsesNonUnicodeWchar()) {
1917
        wchar_t* converted = _Py_DecodeNonUnicodeWchar(u, size);
1918
        if (!converted) {
1919
            return NULL;
1920
        }
1921
        PyObject *unicode = _PyUnicode_FromUCS4(converted, size);
1922
        PyMem_Free(converted);
1923
        return unicode;
1924
    }
1925
#endif
1926
1927
    /* Single character Unicode objects in the Latin-1 range are
1928
       shared when using this constructor */
1929
722k
    if (size == 1 && (Py_UCS4)*u < 256)
1930
103k
        return get_latin1_char((unsigned char)*u);
1931
1932
    /* If not empty and not single character, copy the Unicode data
1933
       into the new object */
1934
619k
    if (find_maxchar_surrogates(u, u + size,
1935
619k
                                &maxchar, &num_surrogates) == -1)
1936
0
        return NULL;
1937
1938
619k
    unicode = PyUnicode_New(size - num_surrogates, maxchar);
1939
619k
    if (!unicode)
1940
0
        return NULL;
1941
1942
619k
    unicode_write_widechar(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1943
619k
                           u, size, num_surrogates);
1944
1945
619k
    return unicode_result(unicode);
1946
619k
}
1947
1948
1949
int
1950
PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *pub_writer,
1951
                              const wchar_t *str,
1952
                              Py_ssize_t size)
1953
0
{
1954
0
    _PyUnicodeWriter *writer = (_PyUnicodeWriter *)pub_writer;
1955
1956
0
    if (size < 0) {
1957
0
        size = wcslen(str);
1958
0
    }
1959
1960
0
    if (size == 0) {
1961
0
        return 0;
1962
0
    }
1963
1964
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
1965
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
1966
       non-Unicode locales and hence needs conversion to UCS-4 first. */
1967
    if (_Py_LocaleUsesNonUnicodeWchar()) {
1968
        wchar_t* converted = _Py_DecodeNonUnicodeWchar(str, size);
1969
        if (!converted) {
1970
            return -1;
1971
        }
1972
1973
        int res = PyUnicodeWriter_WriteUCS4(pub_writer, converted, size);
1974
        PyMem_Free(converted);
1975
        return res;
1976
    }
1977
#endif
1978
1979
0
    Py_UCS4 maxchar = 0;
1980
0
    Py_ssize_t num_surrogates;
1981
0
    if (find_maxchar_surrogates(str, str + size,
1982
0
                                &maxchar, &num_surrogates) == -1) {
1983
0
        return -1;
1984
0
    }
1985
1986
0
    if (_PyUnicodeWriter_Prepare(writer, size - num_surrogates, maxchar) < 0) {
1987
0
        return -1;
1988
0
    }
1989
1990
0
    int kind = writer->kind;
1991
0
    void *data = (Py_UCS1*)writer->data + writer->pos * kind;
1992
0
    unicode_write_widechar(kind, data, str, size, num_surrogates);
1993
1994
0
    writer->pos += size - num_surrogates;
1995
0
    return 0;
1996
0
}
1997
1998
1999
PyObject *
2000
PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
2001
7.16M
{
2002
7.16M
    if (size < 0) {
2003
0
        PyErr_SetString(PyExc_SystemError,
2004
0
                        "Negative size passed to PyUnicode_FromStringAndSize");
2005
0
        return NULL;
2006
0
    }
2007
7.16M
    if (u != NULL) {
2008
7.16M
        return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2009
7.16M
    }
2010
0
    if (size > 0) {
2011
0
        PyErr_SetString(PyExc_SystemError,
2012
0
            "NULL string with positive size with NULL passed to PyUnicode_FromStringAndSize");
2013
0
        return NULL;
2014
0
    }
2015
0
    return _PyUnicode_GetEmpty();
2016
0
}
2017
2018
PyObject *
2019
PyUnicode_FromString(const char *u)
2020
13.3M
{
2021
13.3M
    size_t size = strlen(u);
2022
13.3M
    if (size > PY_SSIZE_T_MAX) {
2023
0
        PyErr_SetString(PyExc_OverflowError, "input too long");
2024
0
        return NULL;
2025
0
    }
2026
13.3M
    return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
2027
13.3M
}
2028
2029
2030
PyObject *
2031
_PyUnicode_FromId(_Py_Identifier *id)
2032
0
{
2033
0
    PyMutex_Lock((PyMutex *)&id->mutex);
2034
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
2035
0
    struct _Py_unicode_ids *ids = &interp->unicode.ids;
2036
2037
0
    Py_ssize_t index = _Py_atomic_load_ssize(&id->index);
2038
0
    if (index < 0) {
2039
0
        struct _Py_unicode_runtime_ids *rt_ids = &interp->runtime->unicode_state.ids;
2040
2041
0
        PyMutex_Lock(&rt_ids->mutex);
2042
        // Check again to detect concurrent access. Another thread can have
2043
        // initialized the index while this thread waited for the lock.
2044
0
        index = _Py_atomic_load_ssize(&id->index);
2045
0
        if (index < 0) {
2046
0
            assert(rt_ids->next_index < PY_SSIZE_T_MAX);
2047
0
            index = rt_ids->next_index;
2048
0
            rt_ids->next_index++;
2049
0
            _Py_atomic_store_ssize(&id->index, index);
2050
0
        }
2051
0
        PyMutex_Unlock(&rt_ids->mutex);
2052
0
    }
2053
0
    assert(index >= 0);
2054
2055
0
    PyObject *obj;
2056
0
    if (index < ids->size) {
2057
0
        obj = ids->array[index];
2058
0
        if (obj) {
2059
            // Return a borrowed reference
2060
0
            goto end;
2061
0
        }
2062
0
    }
2063
2064
0
    obj = PyUnicode_DecodeUTF8Stateful(id->string, strlen(id->string),
2065
0
                                       NULL, NULL);
2066
0
    if (!obj) {
2067
0
        goto end;
2068
0
    }
2069
0
    _PyUnicode_InternImmortal(interp, &obj);
2070
2071
0
    if (index >= ids->size) {
2072
        // Overallocate to reduce the number of realloc
2073
0
        Py_ssize_t new_size = Py_MAX(index * 2, 16);
2074
0
        Py_ssize_t item_size = sizeof(ids->array[0]);
2075
0
        PyObject **new_array = PyMem_Realloc(ids->array, new_size * item_size);
2076
0
        if (new_array == NULL) {
2077
0
            PyErr_NoMemory();
2078
0
            obj = NULL;
2079
0
            goto end;
2080
0
        }
2081
0
        memset(&new_array[ids->size], 0, (new_size - ids->size) * item_size);
2082
0
        ids->array = new_array;
2083
0
        ids->size = new_size;
2084
0
    }
2085
2086
    // The array stores a strong reference
2087
0
    ids->array[index] = obj;
2088
2089
0
end:
2090
0
    PyMutex_Unlock((PyMutex *)&id->mutex);
2091
    // Return a borrowed reference
2092
0
    return obj;
2093
0
}
2094
2095
2096
static void
2097
unicode_clear_identifiers(struct _Py_unicode_state *state)
2098
0
{
2099
0
    struct _Py_unicode_ids *ids = &state->ids;
2100
0
    for (Py_ssize_t i=0; i < ids->size; i++) {
2101
0
        Py_XDECREF(ids->array[i]);
2102
0
    }
2103
0
    ids->size = 0;
2104
0
    PyMem_Free(ids->array);
2105
0
    ids->array = NULL;
2106
    // Don't reset _PyRuntime next_index: _Py_Identifier.id remains valid
2107
    // after Py_Finalize().
2108
0
}
2109
2110
2111
/* Internal function, doesn't check maximum character */
2112
2113
PyObject*
2114
_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
2115
121M
{
2116
121M
    const unsigned char *s = (const unsigned char *)buffer;
2117
121M
    PyObject *unicode;
2118
121M
    if (size == 1) {
2119
#ifdef Py_DEBUG
2120
        assert((unsigned char)s[0] < 128);
2121
#endif
2122
45.0M
        return get_latin1_char(s[0]);
2123
45.0M
    }
2124
76.5M
    unicode = PyUnicode_New(size, 127);
2125
76.5M
    if (!unicode)
2126
0
        return NULL;
2127
76.5M
    memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2128
76.5M
    assert(_PyUnicode_CheckConsistency(unicode, 1));
2129
76.5M
    return unicode;
2130
76.5M
}
2131
2132
static Py_UCS4
2133
kind_maxchar_limit(int kind)
2134
0
{
2135
0
    switch (kind) {
2136
0
    case PyUnicode_1BYTE_KIND:
2137
0
        return 0x80;
2138
0
    case PyUnicode_2BYTE_KIND:
2139
0
        return 0x100;
2140
0
    case PyUnicode_4BYTE_KIND:
2141
0
        return 0x10000;
2142
0
    default:
2143
0
        Py_UNREACHABLE();
2144
0
    }
2145
0
}
2146
2147
static PyObject*
2148
_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
2149
47.6M
{
2150
47.6M
    PyObject *res;
2151
47.6M
    unsigned char max_char;
2152
2153
47.6M
    if (size == 0) {
2154
7.18M
        _Py_RETURN_UNICODE_EMPTY();
2155
7.18M
    }
2156
47.6M
    assert(size > 0);
2157
40.5M
    if (size == 1) {
2158
10.3M
        return get_latin1_char(u[0]);
2159
10.3M
    }
2160
2161
30.1M
    max_char = ucs1lib_find_max_char(u, u + size);
2162
30.1M
    res = PyUnicode_New(size, max_char);
2163
30.1M
    if (!res)
2164
0
        return NULL;
2165
30.1M
    memcpy(PyUnicode_1BYTE_DATA(res), u, size);
2166
30.1M
    assert(_PyUnicode_CheckConsistency(res, 1));
2167
30.1M
    return res;
2168
30.1M
}
2169
2170
static PyObject*
2171
_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
2172
102M
{
2173
102M
    PyObject *res;
2174
102M
    Py_UCS2 max_char;
2175
2176
102M
    if (size == 0)
2177
17.8M
        _Py_RETURN_UNICODE_EMPTY();
2178
102M
    assert(size > 0);
2179
84.9M
    if (size == 1)
2180
55.8M
        return unicode_char(u[0]);
2181
2182
29.0M
    max_char = ucs2lib_find_max_char(u, u + size);
2183
29.0M
    res = PyUnicode_New(size, max_char);
2184
29.0M
    if (!res)
2185
0
        return NULL;
2186
29.0M
    if (max_char >= 256)
2187
17.5M
        memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
2188
11.5M
    else {
2189
11.5M
        _PyUnicode_CONVERT_BYTES(
2190
11.5M
            Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2191
11.5M
    }
2192
29.0M
    assert(_PyUnicode_CheckConsistency(res, 1));
2193
29.0M
    return res;
2194
29.0M
}
2195
2196
static PyObject*
2197
_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
2198
62.3M
{
2199
62.3M
    PyObject *res;
2200
62.3M
    Py_UCS4 max_char;
2201
2202
62.3M
    if (size == 0)
2203
8.21M
        _Py_RETURN_UNICODE_EMPTY();
2204
62.3M
    assert(size > 0);
2205
54.1M
    if (size == 1)
2206
36.4M
        return unicode_char(u[0]);
2207
2208
17.7M
    max_char = ucs4lib_find_max_char(u, u + size);
2209
17.7M
    res = PyUnicode_New(size, max_char);
2210
17.7M
    if (!res)
2211
0
        return NULL;
2212
17.7M
    if (max_char < 256)
2213
12.4M
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2214
17.7M
                                 PyUnicode_1BYTE_DATA(res));
2215
5.31M
    else if (max_char < 0x10000)
2216
3.97M
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2217
5.31M
                                 PyUnicode_2BYTE_DATA(res));
2218
1.34M
    else
2219
1.34M
        memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
2220
17.7M
    assert(_PyUnicode_CheckConsistency(res, 1));
2221
17.7M
    return res;
2222
17.7M
}
2223
2224
2225
int
2226
PyUnicodeWriter_WriteUCS4(PyUnicodeWriter *pub_writer,
2227
                          const Py_UCS4 *str,
2228
                          Py_ssize_t size)
2229
0
{
2230
0
    _PyUnicodeWriter *writer = (_PyUnicodeWriter*)pub_writer;
2231
2232
0
    if (size < 0) {
2233
0
        PyErr_SetString(PyExc_ValueError,
2234
0
                        "size must be positive");
2235
0
        return -1;
2236
0
    }
2237
2238
0
    if (size == 0) {
2239
0
        return 0;
2240
0
    }
2241
2242
0
    Py_UCS4 max_char = ucs4lib_find_max_char(str, str + size);
2243
2244
0
    if (_PyUnicodeWriter_Prepare(writer, size, max_char) < 0) {
2245
0
        return -1;
2246
0
    }
2247
2248
0
    int kind = writer->kind;
2249
0
    void *data = (Py_UCS1*)writer->data + writer->pos * kind;
2250
0
    if (kind == PyUnicode_1BYTE_KIND) {
2251
0
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1,
2252
0
                                 str, str + size,
2253
0
                                 data);
2254
0
    }
2255
0
    else if (kind == PyUnicode_2BYTE_KIND) {
2256
0
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2,
2257
0
                                 str, str + size,
2258
0
                                 data);
2259
0
    }
2260
0
    else {
2261
0
        memcpy(data, str, size * sizeof(Py_UCS4));
2262
0
    }
2263
0
    writer->pos += size;
2264
2265
0
    return 0;
2266
0
}
2267
2268
2269
PyObject*
2270
PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2271
156M
{
2272
156M
    if (size < 0) {
2273
0
        PyErr_SetString(PyExc_ValueError, "size must be positive");
2274
0
        return NULL;
2275
0
    }
2276
156M
    switch (kind) {
2277
22.1M
    case PyUnicode_1BYTE_KIND:
2278
22.1M
        return _PyUnicode_FromUCS1(buffer, size);
2279
83.9M
    case PyUnicode_2BYTE_KIND:
2280
83.9M
        return _PyUnicode_FromUCS2(buffer, size);
2281
50.8M
    case PyUnicode_4BYTE_KIND:
2282
50.8M
        return _PyUnicode_FromUCS4(buffer, size);
2283
0
    default:
2284
0
        PyErr_SetString(PyExc_SystemError, "invalid kind");
2285
0
        return NULL;
2286
156M
    }
2287
156M
}
2288
2289
Py_UCS4
2290
_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2291
15.8M
{
2292
15.8M
    int kind;
2293
15.8M
    const void *startptr, *endptr;
2294
2295
15.8M
    assert(0 <= start);
2296
15.8M
    assert(end <= PyUnicode_GET_LENGTH(unicode));
2297
15.8M
    assert(start <= end);
2298
2299
15.8M
    if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2300
81.7k
        return PyUnicode_MAX_CHAR_VALUE(unicode);
2301
2302
15.7M
    if (start == end)
2303
0
        return 127;
2304
2305
15.7M
    if (PyUnicode_IS_ASCII(unicode))
2306
15.7M
        return 127;
2307
2308
25.0k
    kind = PyUnicode_KIND(unicode);
2309
25.0k
    startptr = PyUnicode_DATA(unicode);
2310
25.0k
    endptr = (char *)startptr + end * kind;
2311
25.0k
    startptr = (char *)startptr + start * kind;
2312
25.0k
    switch(kind) {
2313
1.11k
    case PyUnicode_1BYTE_KIND:
2314
1.11k
        return ucs1lib_find_max_char(startptr, endptr);
2315
4.70k
    case PyUnicode_2BYTE_KIND:
2316
4.70k
        return ucs2lib_find_max_char(startptr, endptr);
2317
19.2k
    case PyUnicode_4BYTE_KIND:
2318
19.2k
        return ucs4lib_find_max_char(startptr, endptr);
2319
0
    default:
2320
0
        Py_UNREACHABLE();
2321
25.0k
    }
2322
25.0k
}
2323
2324
/* Ensure that a string uses the most efficient storage, if it is not the
2325
   case: create a new string with of the right kind. Write NULL into *p_unicode
2326
   on error. */
2327
static void
2328
unicode_adjust_maxchar(PyObject **p_unicode)
2329
0
{
2330
0
    PyObject *unicode, *copy;
2331
0
    Py_UCS4 max_char;
2332
0
    Py_ssize_t len;
2333
0
    int kind;
2334
2335
0
    assert(p_unicode != NULL);
2336
0
    unicode = *p_unicode;
2337
0
    if (PyUnicode_IS_ASCII(unicode))
2338
0
        return;
2339
2340
0
    len = PyUnicode_GET_LENGTH(unicode);
2341
0
    kind = PyUnicode_KIND(unicode);
2342
0
    if (kind == PyUnicode_1BYTE_KIND) {
2343
0
        const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2344
0
        max_char = ucs1lib_find_max_char(u, u + len);
2345
0
        if (max_char >= 128)
2346
0
            return;
2347
0
    }
2348
0
    else if (kind == PyUnicode_2BYTE_KIND) {
2349
0
        const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2350
0
        max_char = ucs2lib_find_max_char(u, u + len);
2351
0
        if (max_char >= 256)
2352
0
            return;
2353
0
    }
2354
0
    else if (kind == PyUnicode_4BYTE_KIND) {
2355
0
        const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2356
0
        max_char = ucs4lib_find_max_char(u, u + len);
2357
0
        if (max_char >= 0x10000)
2358
0
            return;
2359
0
    }
2360
0
    else
2361
0
        Py_UNREACHABLE();
2362
2363
0
    copy = PyUnicode_New(len, max_char);
2364
0
    if (copy != NULL)
2365
0
        _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2366
0
    Py_DECREF(unicode);
2367
0
    *p_unicode = copy;
2368
0
}
2369
2370
/* Return a new string with the same length, maximum character value and
   character data as `unicode`.  A non-str argument raises via
   PyErr_BadInternalCall().  Return NULL on failure. */
PyObject*
_PyUnicode_Copy(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadInternalCall();
        return NULL;
    }

    Py_ssize_t nchars = PyUnicode_GET_LENGTH(unicode);
    PyObject *duplicate = PyUnicode_New(nchars,
                                        PyUnicode_MAX_CHAR_VALUE(unicode));
    if (duplicate == NULL) {
        return NULL;
    }
    assert(PyUnicode_KIND(duplicate) == PyUnicode_KIND(unicode));

    /* Same kind on both sides, so a raw byte copy is enough. */
    memcpy(PyUnicode_DATA(duplicate), PyUnicode_DATA(unicode),
           nchars * PyUnicode_KIND(unicode));
    assert(_PyUnicode_CheckConsistency(duplicate, 1));
    return duplicate;
}
2392
2393
2394
/* Widen Unicode objects to larger buffers. Don't write terminating null
2395
   character. Return NULL on error. */
2396
2397
static void*
2398
unicode_askind(int skind, void const *data, Py_ssize_t len, int kind)
2399
8.38M
{
2400
8.38M
    void *result;
2401
2402
8.38M
    assert(skind < kind);
2403
8.38M
    switch (kind) {
2404
5.39M
    case PyUnicode_2BYTE_KIND:
2405
5.39M
        result = PyMem_New(Py_UCS2, len);
2406
5.39M
        if (!result)
2407
0
            return PyErr_NoMemory();
2408
5.39M
        assert(skind == PyUnicode_1BYTE_KIND);
2409
5.39M
        _PyUnicode_CONVERT_BYTES(
2410
5.39M
            Py_UCS1, Py_UCS2,
2411
5.39M
            (const Py_UCS1 *)data,
2412
5.39M
            ((const Py_UCS1 *)data) + len,
2413
5.39M
            result);
2414
5.39M
        return result;
2415
2.98M
    case PyUnicode_4BYTE_KIND:
2416
2.98M
        result = PyMem_New(Py_UCS4, len);
2417
2.98M
        if (!result)
2418
0
            return PyErr_NoMemory();
2419
2.98M
        if (skind == PyUnicode_2BYTE_KIND) {
2420
0
            _PyUnicode_CONVERT_BYTES(
2421
0
                Py_UCS2, Py_UCS4,
2422
0
                (const Py_UCS2 *)data,
2423
0
                ((const Py_UCS2 *)data) + len,
2424
0
                result);
2425
0
        }
2426
2.98M
        else {
2427
2.98M
            assert(skind == PyUnicode_1BYTE_KIND);
2428
2.98M
            _PyUnicode_CONVERT_BYTES(
2429
2.98M
                Py_UCS1, Py_UCS4,
2430
2.98M
                (const Py_UCS1 *)data,
2431
2.98M
                ((const Py_UCS1 *)data) + len,
2432
2.98M
                result);
2433
2.98M
        }
2434
2.98M
        return result;
2435
0
    default:
2436
0
        Py_UNREACHABLE();
2437
0
        return NULL;
2438
8.38M
    }
2439
8.38M
}
2440
2441
static Py_UCS4*
2442
as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2443
        int copy_null)
2444
76.5k
{
2445
76.5k
    int kind;
2446
76.5k
    const void *data;
2447
76.5k
    Py_ssize_t len, targetlen;
2448
76.5k
    kind = PyUnicode_KIND(string);
2449
76.5k
    data = PyUnicode_DATA(string);
2450
76.5k
    len = PyUnicode_GET_LENGTH(string);
2451
76.5k
    targetlen = len;
2452
76.5k
    if (copy_null)
2453
0
        targetlen++;
2454
76.5k
    if (!target) {
2455
0
        target = PyMem_New(Py_UCS4, targetlen);
2456
0
        if (!target) {
2457
0
            PyErr_NoMemory();
2458
0
            return NULL;
2459
0
        }
2460
0
    }
2461
76.5k
    else {
2462
76.5k
        if (targetsize < targetlen) {
2463
0
            PyErr_Format(PyExc_SystemError,
2464
0
                         "string is longer than the buffer");
2465
0
            if (copy_null && 0 < targetsize)
2466
0
                target[0] = 0;
2467
0
            return NULL;
2468
0
        }
2469
76.5k
    }
2470
76.5k
    if (kind == PyUnicode_1BYTE_KIND) {
2471
54.3k
        const Py_UCS1 *start = (const Py_UCS1 *) data;
2472
54.3k
        _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2473
54.3k
    }
2474
22.1k
    else if (kind == PyUnicode_2BYTE_KIND) {
2475
15.8k
        const Py_UCS2 *start = (const Py_UCS2 *) data;
2476
15.8k
        _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2477
15.8k
    }
2478
6.31k
    else if (kind == PyUnicode_4BYTE_KIND) {
2479
6.31k
        memcpy(target, data, len * sizeof(Py_UCS4));
2480
6.31k
    }
2481
0
    else {
2482
0
        Py_UNREACHABLE();
2483
0
    }
2484
76.5k
    if (copy_null)
2485
0
        target[len] = 0;
2486
76.5k
    return target;
2487
76.5k
}
2488
2489
Py_UCS4*
2490
PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2491
                 int copy_null)
2492
76.5k
{
2493
76.5k
    if (target == NULL || targetsize < 0) {
2494
0
        PyErr_BadInternalCall();
2495
0
        return NULL;
2496
0
    }
2497
76.5k
    return as_ucs4(string, target, targetsize, copy_null);
2498
76.5k
}
2499
2500
Py_UCS4*
2501
PyUnicode_AsUCS4Copy(PyObject *string)
2502
0
{
2503
0
    return as_ucs4(string, NULL, 0, 1);
2504
0
}
2505
2506
/* Maximum number of characters required for the output of %jo, %jd or %p.
   Octal is the widest form: at most ceil(log8(256) * sizeof(intmax_t))
   digits, plus 1 for the sign, plus 2 for the 0x prefix (for %p),
   plus 1 for the terminating NUL. */
#define MAX_INTMAX_CHARS (5 + (sizeof(intmax_t)*8-1) / 3)
2511
2512
static int
2513
unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2514
                             Py_ssize_t width, Py_ssize_t precision, int flags)
2515
9.84M
{
2516
9.84M
    Py_ssize_t length, fill, arglen;
2517
9.84M
    Py_UCS4 maxchar;
2518
2519
9.84M
    length = PyUnicode_GET_LENGTH(str);
2520
9.84M
    if ((precision == -1 || precision >= length)
2521
9.84M
        && width <= length)
2522
9.84M
        return _PyUnicodeWriter_WriteStr(writer, str);
2523
2524
53
    if (precision != -1)
2525
53
        length = Py_MIN(precision, length);
2526
2527
53
    arglen = Py_MAX(length, width);
2528
53
    if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2529
25
        maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2530
28
    else
2531
28
        maxchar = writer->maxchar;
2532
2533
53
    if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2534
0
        return -1;
2535
2536
53
    fill = Py_MAX(width - length, 0);
2537
53
    if (fill && !(flags & F_LJUST)) {
2538
0
        if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2539
0
            return -1;
2540
0
        writer->pos += fill;
2541
0
    }
2542
2543
53
    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2544
53
                                  str, 0, length);
2545
53
    writer->pos += length;
2546
2547
53
    if (fill && (flags & F_LJUST)) {
2548
0
        if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2549
0
            return -1;
2550
0
        writer->pos += fill;
2551
0
    }
2552
2553
53
    return 0;
2554
53
}
2555
2556
static int
2557
unicode_fromformat_write_utf8(_PyUnicodeWriter *writer, const char *str,
2558
                              Py_ssize_t width, Py_ssize_t precision, int flags)
2559
3.62M
{
2560
    /* UTF-8 */
2561
3.62M
    Py_ssize_t *pconsumed = NULL;
2562
3.62M
    Py_ssize_t length;
2563
3.62M
    if (precision == -1) {
2564
1.23M
        length = strlen(str);
2565
1.23M
    }
2566
2.38M
    else {
2567
2.38M
        length = 0;
2568
37.8M
        while (length < precision && str[length]) {
2569
35.4M
            length++;
2570
35.4M
        }
2571
2.38M
        if (length == precision) {
2572
            /* The input string is not NUL-terminated.  If it ends with an
2573
             * incomplete UTF-8 sequence, truncate the string just before it.
2574
             * Incomplete sequences in the middle and sequences which cannot
2575
             * be valid prefixes are still treated as errors and replaced
2576
             * with \xfffd. */
2577
1.83k
            pconsumed = &length;
2578
1.83k
        }
2579
2.38M
    }
2580
2581
3.62M
    if (width < 0) {
2582
3.62M
        return _PyUnicode_DecodeUTF8Writer(writer, str, length,
2583
3.62M
                                           _Py_ERROR_REPLACE, "replace", pconsumed);
2584
3.62M
    }
2585
2586
0
    PyObject *unicode = PyUnicode_DecodeUTF8Stateful(str, length,
2587
0
                                                     "replace", pconsumed);
2588
0
    if (unicode == NULL)
2589
0
        return -1;
2590
2591
0
    int res = unicode_fromformat_write_str(writer, unicode,
2592
0
                                           width, -1, flags);
2593
0
    Py_DECREF(unicode);
2594
0
    return res;
2595
0
}
2596
2597
static int
2598
unicode_fromformat_write_wcstr(_PyUnicodeWriter *writer, const wchar_t *str,
2599
                              Py_ssize_t width, Py_ssize_t precision, int flags)
2600
0
{
2601
0
    Py_ssize_t length;
2602
0
    if (precision == -1) {
2603
0
        length = wcslen(str);
2604
0
    }
2605
0
    else {
2606
0
        length = 0;
2607
0
        while (length < precision && str[length]) {
2608
0
            length++;
2609
0
        }
2610
0
    }
2611
2612
0
    if (width < 0) {
2613
0
        return PyUnicodeWriter_WriteWideChar((PyUnicodeWriter*)writer,
2614
0
                                             str, length);
2615
0
    }
2616
2617
0
    PyObject *unicode = PyUnicode_FromWideChar(str, length);
2618
0
    if (unicode == NULL)
2619
0
        return -1;
2620
2621
0
    int res = unicode_fromformat_write_str(writer, unicode, width, -1, flags);
2622
0
    Py_DECREF(unicode);
2623
0
    return res;
2624
0
}
2625
2626
0
/* Size-modifier codes for the format parser below.
   NOTE(review): presumably these correspond to the l, ll, z, t and j
   length modifiers -- confirm against unicode_fromformat_arg(). */
#define F_LONG     1
#define F_LONGLONG 2
#define F_SIZE     3
#define F_PTRDIFF  4
#define F_INTMAX   5
2631
2632
/* Process one '%' conversion specification of a format string and append
 * the formatted text to `writer`.
 *
 * f      points at the '%' that starts the specification.
 * vargs  supplies the arguments consumed by the specification (including
 *        '*' width/precision values).
 *
 * Return a pointer to the first character after the specification, or NULL
 * on failure.  The failure paths that originate in this function set
 * ValueError (width/precision too big), OverflowError (%c out of range),
 * TypeError (%N argument is not a type) or SystemError (invalid format). */
static const char*
2633
unicode_fromformat_arg(_PyUnicodeWriter *writer,
2634
                       const char *f, va_list *vargs)
2635
28.6M
{
2636
28.6M
    const char *p;
2637
28.6M
    Py_ssize_t len;
2638
28.6M
    int flags = 0;
2639
28.6M
    Py_ssize_t width;
2640
28.6M
    Py_ssize_t precision;
2641
2642
28.6M
    /* Keep the start of the specification for the invalid_format message. */
    p = f;
2643
28.6M
    f++;
2644
28.6M
    if (*f == '%') {
2645
1.20M
        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
2646
0
            return NULL;
2647
1.20M
        f++;
2648
1.20M
        return f;
2649
1.20M
    }
2650
2651
    /* Parse flags. Example: "%-i" => flags=F_LJUST. */
2652
    /* Flags '+', ' ' and '#' are not particularly useful.
2653
     * They are not worth the implementation and maintenance costs.
2654
     * In addition, '#' should add "0" for "o" conversions for compatibility
2655
     * with printf, but it would confuse Python users. */
2656
27.4M
    while (1) {
2657
27.4M
        switch (*f++) {
2658
0
        case '-': flags |= F_LJUST; continue;
2659
1.66k
        case '0': flags |= F_ZERO; continue;
2660
0
        case '#': flags |= F_ALT; continue;
2661
27.4M
        }
2662
27.4M
        f--;
2663
27.4M
        break;
2664
27.4M
    }
2665
2666
    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2667
27.4M
    width = -1;
2668
27.4M
    if (*f == '*') {
2669
0
        width = va_arg(*vargs, int);
2670
0
        if (width < 0) {
2671
0
            flags |= F_LJUST;
2672
0
            width = -width;
2673
0
        }
2674
0
        f++;
2675
0
    }
2676
27.4M
    else if (Py_ISDIGIT((unsigned)*f)) {
2677
1.66k
        width = *f - '0';
2678
1.66k
        f++;
2679
1.66k
        while (Py_ISDIGIT((unsigned)*f)) {
2680
0
            if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2681
0
                PyErr_SetString(PyExc_ValueError,
2682
0
                                "width too big");
2683
0
                return NULL;
2684
0
            }
2685
0
            width = (width * 10) + (*f - '0');
2686
0
            f++;
2687
0
        }
2688
1.66k
    }
2689
27.4M
    precision = -1;
2690
27.4M
    if (*f == '.') {
2691
5.68M
        f++;
2692
5.68M
        if (*f == '*') {
2693
0
            precision = va_arg(*vargs, int);
2694
0
            if (precision < 0) {
2695
0
                precision = -2;
2696
0
            }
2697
0
            f++;
2698
0
        }
2699
5.68M
        else if (Py_ISDIGIT((unsigned)*f)) {
2700
5.68M
            precision = (*f - '0');
2701
5.68M
            f++;
2702
17.0M
            while (Py_ISDIGIT((unsigned)*f)) {
2703
11.3M
                if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2704
0
                    PyErr_SetString(PyExc_ValueError,
2705
0
                                    "precision too big");
2706
0
                    return NULL;
2707
0
                }
2708
11.3M
                precision = (precision * 10) + (*f - '0');
2709
11.3M
                f++;
2710
11.3M
            }
2711
5.68M
        }
2712
5.68M
    }
2713
2714
27.4M
    int sizemod = 0;
2715
27.4M
    if (*f == 'l') {
2716
0
        if (f[1] == 'l') {
2717
0
            sizemod = F_LONGLONG;
2718
0
            f += 2;
2719
0
        }
2720
0
        else {
2721
0
            sizemod = F_LONG;
2722
0
            ++f;
2723
0
        }
2724
0
    }
2725
27.4M
    else if (*f == 'z') {
2726
105k
        sizemod = F_SIZE;
2727
105k
        ++f;
2728
105k
    }
2729
27.3M
    else if (*f == 't') {
2730
0
        sizemod = F_PTRDIFF;
2731
0
        ++f;
2732
0
    }
2733
27.3M
    else if (*f == 'j') {
2734
0
        sizemod = F_INTMAX;
2735
0
        ++f;
2736
0
    }
2737
27.4M
    /* The conversion character is the last character of the format string:
     * nothing follows this argument, so stop overallocating. */
    if (f[0] != '\0' && f[1] == '\0')
2738
5.60M
        writer->overallocate = 0;
2739
2740
27.4M
    /* First pass: reject size modifiers, widths and precisions that the
     * conversion character does not accept. */
    switch (*f) {
2741
11.8M
    case 'd': case 'i': case 'o': case 'u': case 'x': case 'X':
2742
11.8M
        break;
2743
2.20M
    case 'c': case 'p':
2744
2.20M
        if (sizemod || width >= 0 || precision >= 0) goto invalid_format;
2745
2.20M
        break;
2746
3.62M
    case 's':
2747
3.62M
    case 'V':
2748
3.62M
        if (sizemod && sizemod != F_LONG) goto invalid_format;
2749
3.62M
        break;
2750
9.84M
    default:
2751
9.84M
        if (sizemod) goto invalid_format;
2752
9.84M
        break;
2753
27.4M
    }
2754
2755
27.4M
    /* Second pass: consume the argument(s) and write the formatted text. */
    switch (*f) {
2756
2.19M
    case 'c':
2757
2.19M
    {
2758
2.19M
        int ordinal = va_arg(*vargs, int);
2759
2.19M
        if (ordinal < 0 || ordinal > MAX_UNICODE) {
2760
0
            PyErr_SetString(PyExc_OverflowError,
2761
0
                            "character argument not in range(0x110000)");
2762
0
            return NULL;
2763
0
        }
2764
2.19M
        if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
2765
0
            return NULL;
2766
2.19M
        break;
2767
2.19M
    }
2768
2769
11.7M
    case 'd': case 'i':
2770
11.8M
    case 'o': case 'u': case 'x': case 'X':
2771
11.8M
    {
2772
11.8M
        char buffer[MAX_INTMAX_CHARS];
2773
2774
        // Fill buffer using sprintf, with one of many possible format
2775
        // strings, like "%llX" for `long long` in hexadecimal.
2776
        // The type/size is in `sizemod`; the format is in `*f`.
2777
2778
        // Use macros with nested switches to keep the sprintf format strings
2779
        // as compile-time literals, avoiding warnings and maybe allowing
2780
        // optimizations.
2781
2782
        // `SPRINT` macro does one sprintf
2783
        // Example usage: SPRINT("l", "X", unsigned long) expands to
2784
        // sprintf(buffer, "%" "l" "X", va_arg(*vargs, unsigned long))
2785
11.8M
        #define SPRINT(SIZE_SPEC, FMT_CHAR, TYPE) \
2786
11.8M
            sprintf(buffer, "%" SIZE_SPEC FMT_CHAR, va_arg(*vargs, TYPE))
2787
2788
        // One inner switch to handle all format variants
2789
11.8M
        #define DO_SPRINTS(SIZE_SPEC, SIGNED_TYPE, UNSIGNED_TYPE)             \
2790
11.8M
            switch (*f) {                                                     \
2791
0
                case 'o': len = SPRINT(SIZE_SPEC, "o", UNSIGNED_TYPE); break; \
2792
16.9k
                case 'u': len = SPRINT(SIZE_SPEC, "u", UNSIGNED_TYPE); break; \
2793
1.29k
                case 'x': len = SPRINT(SIZE_SPEC, "x", UNSIGNED_TYPE); break; \
2794
953
                case 'X': len = SPRINT(SIZE_SPEC, "X", UNSIGNED_TYPE); break; \
2795
11.7M
                default:  len = SPRINT(SIZE_SPEC, "d", SIGNED_TYPE); break;   \
2796
11.8M
            }
2797
2798
        // Outer switch to handle all the sizes/types
2799
11.8M
        switch (sizemod) {
2800
0
            case F_LONG:     DO_SPRINTS("l", long, unsigned long); break;
2801
0
            case F_LONGLONG: DO_SPRINTS("ll", long long, unsigned long long); break;
2802
105k
            case F_SIZE:     DO_SPRINTS("z", Py_ssize_t, size_t); break;
2803
0
            case F_PTRDIFF:  DO_SPRINTS("t", ptrdiff_t, ptrdiff_t); break;
2804
0
            case F_INTMAX:   DO_SPRINTS("j", intmax_t, uintmax_t); break;
2805
11.7M
            default:         DO_SPRINTS("", int, unsigned int); break;
2806
11.8M
        }
2807
11.8M
        #undef SPRINT
2808
11.8M
        #undef DO_SPRINTS
2809
2810
11.8M
        assert(len >= 0);
2811
2812
11.8M
        /* A leading '-' is written separately below, so `len` counts only
         * the characters that follow it. */
        int sign = (buffer[0] == '-');
2813
11.8M
        len -= sign;
2814
2815
11.8M
        precision = Py_MAX(precision, len);
2816
11.8M
        width = Py_MAX(width, precision + sign);
2817
11.8M
        /* With the '0' flag and no left-justification the whole field is
         * filled with zeros: widen the precision so spacepad becomes 0. */
        if ((flags & F_ZERO) && !(flags & F_LJUST)) {
2818
1.66k
            precision = width - sign;
2819
1.66k
        }
2820
2821
11.8M
        Py_ssize_t spacepad = Py_MAX(width - precision - sign, 0);
2822
11.8M
        Py_ssize_t zeropad = Py_MAX(precision - len, 0);
2823
2824
11.8M
        if (_PyUnicodeWriter_Prepare(writer, width, 127) == -1)
2825
0
            return NULL;
2826
2827
11.8M
        if (spacepad && !(flags & F_LJUST)) {
2828
0
            if (PyUnicode_Fill(writer->buffer, writer->pos, spacepad, ' ') == -1)
2829
0
                return NULL;
2830
0
            writer->pos += spacepad;
2831
0
        }
2832
2833
11.8M
        if (sign) {
2834
818
            if (_PyUnicodeWriter_WriteChar(writer, '-') == -1)
2835
0
                return NULL;
2836
818
        }
2837
2838
11.8M
        if (zeropad) {
2839
641
            if (PyUnicode_Fill(writer->buffer, writer->pos, zeropad, '0') == -1)
2840
0
                return NULL;
2841
641
            writer->pos += zeropad;
2842
641
        }
2843
2844
11.8M
        if (_PyUnicodeWriter_WriteASCIIString(writer, &buffer[sign], len) < 0)
2845
0
            return NULL;
2846
2847
11.8M
        if (spacepad && (flags & F_LJUST)) {
2848
0
            if (PyUnicode_Fill(writer->buffer, writer->pos, spacepad, ' ') == -1)
2849
0
                return NULL;
2850
0
            writer->pos += spacepad;
2851
0
        }
2852
11.8M
        break;
2853
11.8M
    }
2854
2855
11.8M
    case 'p':
2856
2.92k
    {
2857
2.92k
        char number[MAX_INTMAX_CHARS];
2858
2859
2.92k
        len = sprintf(number, "%p", va_arg(*vargs, void*));
2860
2.92k
        assert(len >= 0);
2861
2862
        /* %p is ill-defined:  ensure leading 0x. */
2863
2.92k
        if (number[1] == 'X')
2864
0
            number[1] = 'x';
2865
2.92k
        else if (number[1] != 'x') {
2866
0
            memmove(number + 2, number,
2867
0
                    strlen(number) + 1);
2868
0
            number[0] = '0';
2869
0
            number[1] = 'x';
2870
0
            len += 2;
2871
0
        }
2872
2873
2.92k
        if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
2874
0
            return NULL;
2875
2.92k
        break;
2876
2.92k
    }
2877
2878
3.62M
    case 's':
2879
3.62M
    {
2880
3.62M
        /* Only F_LONG survives the validation above: "%ls" takes wchar_t*. */
        if (sizemod) {
2881
0
            const wchar_t *s = va_arg(*vargs, const wchar_t*);
2882
0
            if (unicode_fromformat_write_wcstr(writer, s, width, precision, flags) < 0)
2883
0
                return NULL;
2884
0
        }
2885
3.62M
        else {
2886
            /* UTF-8 */
2887
3.62M
            const char *s = va_arg(*vargs, const char*);
2888
3.62M
            if (unicode_fromformat_write_utf8(writer, s, width, precision, flags) < 0)
2889
0
                return NULL;
2890
3.62M
        }
2891
3.62M
        break;
2892
3.62M
    }
2893
2894
4.34M
    case 'U':
2895
4.34M
    {
2896
4.34M
        PyObject *obj = va_arg(*vargs, PyObject *);
2897
4.34M
        assert(obj && _PyUnicode_CHECK(obj));
2898
2899
4.34M
        if (unicode_fromformat_write_str(writer, obj, width, precision, flags) == -1)
2900
0
            return NULL;
2901
4.34M
        break;
2902
4.34M
    }
2903
2904
4.34M
    case 'V':
2905
577
    {
2906
577
        /* Two arguments are always consumed: the object, then the C string
         * (char* or, with "l", wchar_t*) used when the object is NULL. */
        PyObject *obj = va_arg(*vargs, PyObject *);
2907
577
        const char *str;
2908
577
        const wchar_t *wstr;
2909
577
        if (sizemod) {
2910
0
            wstr = va_arg(*vargs, const wchar_t*);
2911
0
        }
2912
577
        else {
2913
577
            str = va_arg(*vargs, const char *);
2914
577
        }
2915
577
        if (obj) {
2916
0
            assert(_PyUnicode_CHECK(obj));
2917
0
            if (unicode_fromformat_write_str(writer, obj, width, precision, flags) == -1)
2918
0
                return NULL;
2919
0
        }
2920
577
        else if (sizemod) {
2921
0
            assert(wstr != NULL);
2922
0
            if (unicode_fromformat_write_wcstr(writer, wstr, width, precision, flags) < 0)
2923
0
                return NULL;
2924
0
        }
2925
577
        else {
2926
577
            assert(str != NULL);
2927
577
            if (unicode_fromformat_write_utf8(writer, str, width, precision, flags) < 0)
2928
0
                return NULL;
2929
577
        }
2930
577
        break;
2931
577
    }
2932
2933
1.87k
    case 'S':
2934
1.87k
    {
2935
1.87k
        PyObject *obj = va_arg(*vargs, PyObject *);
2936
1.87k
        PyObject *str;
2937
1.87k
        assert(obj);
2938
1.87k
        str = PyObject_Str(obj);
2939
1.87k
        if (!str)
2940
0
            return NULL;
2941
1.87k
        if (unicode_fromformat_write_str(writer, str, width, precision, flags) == -1) {
2942
0
            Py_DECREF(str);
2943
0
            return NULL;
2944
0
        }
2945
1.87k
        Py_DECREF(str);
2946
1.87k
        break;
2947
1.87k
    }
2948
2949
3.30M
    case 'R':
2950
3.30M
    {
2951
3.30M
        PyObject *obj = va_arg(*vargs, PyObject *);
2952
3.30M
        PyObject *repr;
2953
3.30M
        assert(obj);
2954
3.30M
        repr = PyObject_Repr(obj);
2955
3.30M
        if (!repr)
2956
0
            return NULL;
2957
3.30M
        if (unicode_fromformat_write_str(writer, repr, width, precision, flags) == -1) {
2958
0
            Py_DECREF(repr);
2959
0
            return NULL;
2960
0
        }
2961
3.30M
        Py_DECREF(repr);
2962
3.30M
        break;
2963
3.30M
    }
2964
2965
0
    case 'A':
2966
0
    {
2967
0
        PyObject *obj = va_arg(*vargs, PyObject *);
2968
0
        PyObject *ascii;
2969
0
        assert(obj);
2970
0
        ascii = PyObject_ASCII(obj);
2971
0
        if (!ascii)
2972
0
            return NULL;
2973
0
        if (unicode_fromformat_write_str(writer, ascii, width, precision, flags) == -1) {
2974
0
            Py_DECREF(ascii);
2975
0
            return NULL;
2976
0
        }
2977
0
        Py_DECREF(ascii);
2978
0
        break;
2979
0
    }
2980
2981
2.19M
    case 'T':
2982
2.19M
    {
2983
2.19M
        PyObject *obj = va_arg(*vargs, PyObject *);
2984
2.19M
        /* Hold a reference to the type while its name is computed. */
        PyTypeObject *type = (PyTypeObject *)Py_NewRef(Py_TYPE(obj));
2985
2986
2.19M
        PyObject *type_name;
2987
2.19M
        if (flags & F_ALT) {
2988
0
            type_name = _PyType_GetFullyQualifiedName(type, ':');
2989
0
        }
2990
2.19M
        else {
2991
2.19M
            type_name = PyType_GetFullyQualifiedName(type);
2992
2.19M
        }
2993
2.19M
        Py_DECREF(type);
2994
2.19M
        if (!type_name) {
2995
0
            return NULL;
2996
0
        }
2997
2998
2.19M
        if (unicode_fromformat_write_str(writer, type_name,
2999
2.19M
                                         width, precision, flags) == -1) {
3000
0
            Py_DECREF(type_name);
3001
0
            return NULL;
3002
0
        }
3003
2.19M
        Py_DECREF(type_name);
3004
2.19M
        break;
3005
2.19M
    }
3006
3007
0
    case 'N':
3008
0
    {
3009
0
        PyObject *type_raw = va_arg(*vargs, PyObject *);
3010
0
        assert(type_raw != NULL);
3011
3012
0
        if (!PyType_Check(type_raw)) {
3013
0
            PyErr_SetString(PyExc_TypeError, "%N argument must be a type");
3014
0
            return NULL;
3015
0
        }
3016
0
        PyTypeObject *type = (PyTypeObject*)type_raw;
3017
3018
0
        PyObject *type_name;
3019
0
        if (flags & F_ALT) {
3020
0
            type_name = _PyType_GetFullyQualifiedName(type, ':');
3021
0
        }
3022
0
        else {
3023
0
            type_name = PyType_GetFullyQualifiedName(type);
3024
0
        }
3025
0
        if (!type_name) {
3026
0
            return NULL;
3027
0
        }
3028
0
        if (unicode_fromformat_write_str(writer, type_name,
3029
0
                                         width, precision, flags) == -1) {
3030
0
            Py_DECREF(type_name);
3031
0
            return NULL;
3032
0
        }
3033
0
        Py_DECREF(type_name);
3034
0
        break;
3035
0
    }
3036
3037
0
    default:
3038
0
    invalid_format:
3039
0
        PyErr_Format(PyExc_SystemError, "invalid format string: %s", p);
3040
0
        return NULL;
3041
27.4M
    }
3042
3043
27.4M
    /* Step over the conversion character itself. */
    f++;
3044
27.4M
    return f;
3045
27.4M
}
3046
3047
static int
3048
unicode_from_format(_PyUnicodeWriter *writer, const char *format, va_list vargs)
3049
13.3M
{
3050
13.3M
    Py_ssize_t len = strlen(format);
3051
13.3M
    writer->min_length += len + 100;
3052
13.3M
    writer->overallocate = 1;
3053
3054
    // Copy varags to be able to pass a reference to a subfunction.
3055
13.3M
    va_list vargs2;
3056
13.3M
    va_copy(vargs2, vargs);
3057
3058
    // _PyUnicodeWriter_WriteASCIIString() below requires the format string
3059
    // to be encoded to ASCII.
3060
13.3M
    int is_ascii = (ucs1lib_find_max_char((Py_UCS1*)format, (Py_UCS1*)format + len) < 128);
3061
13.3M
    if (!is_ascii) {
3062
0
        Py_ssize_t i;
3063
0
        for (i=0; i < len && (unsigned char)format[i] <= 127; i++);
3064
0
        PyErr_Format(PyExc_ValueError,
3065
0
            "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3066
0
            "string, got a non-ASCII byte: 0x%02x",
3067
0
            (unsigned char)format[i]);
3068
0
        goto fail;
3069
0
    }
3070
3071
74.2M
    for (const char *f = format; *f; ) {
3072
60.8M
        if (*f == '%') {
3073
28.6M
            f = unicode_fromformat_arg(writer, f, &vargs2);
3074
28.6M
            if (f == NULL)
3075
0
                goto fail;
3076
28.6M
        }
3077
32.2M
        else {
3078
32.2M
            const char *p = strchr(f, '%');
3079
32.2M
            if (p != NULL) {
3080
24.4M
                len = p - f;
3081
24.4M
            }
3082
7.78M
            else {
3083
7.78M
                len = strlen(f);
3084
7.78M
                writer->overallocate = 0;
3085
7.78M
            }
3086
3087
32.2M
            if (_PyUnicodeWriter_WriteASCIIString(writer, f, len) < 0) {
3088
0
                goto fail;
3089
0
            }
3090
32.2M
            f += len;
3091
32.2M
        }
3092
60.8M
    }
3093
13.3M
    va_end(vargs2);
3094
13.3M
    return 0;
3095
3096
0
  fail:
3097
0
    va_end(vargs2);
3098
0
    return -1;
3099
13.3M
}
3100
3101
PyObject *
3102
PyUnicode_FromFormatV(const char *format, va_list vargs)
3103
13.3M
{
3104
13.3M
    _PyUnicodeWriter writer;
3105
13.3M
    _PyUnicodeWriter_Init(&writer);
3106
3107
13.3M
    if (unicode_from_format(&writer, format, vargs) < 0) {
3108
0
        _PyUnicodeWriter_Dealloc(&writer);
3109
0
        return NULL;
3110
0
    }
3111
13.3M
    return _PyUnicodeWriter_Finish(&writer);
3112
13.3M
}
3113
3114
/* Variadic front end of PyUnicode_FromFormatV(). */
PyObject *
PyUnicode_FromFormat(const char *format, ...)
{
    va_list ap;

    va_start(ap, format);
    PyObject *result = PyUnicode_FromFormatV(format, ap);
    va_end(ap);

    return result;
}
3125
3126
int
3127
PyUnicodeWriter_Format(PyUnicodeWriter *writer, const char *format, ...)
3128
0
{
3129
0
    va_list vargs;
3130
0
    va_start(vargs, format);
3131
0
    int res = _PyUnicodeWriter_FormatV(writer, format, vargs);
3132
0
    va_end(vargs);
3133
0
    return res;
3134
0
}
3135
3136
int
3137
_PyUnicodeWriter_FormatV(PyUnicodeWriter *writer, const char *format,
3138
                         va_list vargs)
3139
0
{
3140
0
    _PyUnicodeWriter *_writer = (_PyUnicodeWriter*)writer;
3141
0
    Py_ssize_t old_pos = _writer->pos;
3142
3143
0
    int res = unicode_from_format(_writer, format, vargs);
3144
3145
0
    if (res < 0) {
3146
0
        _writer->pos = old_pos;
3147
0
    }
3148
0
    return res;
3149
0
}
3150
3151
static Py_ssize_t
3152
unicode_get_widechar_size(PyObject *unicode)
3153
278k
{
3154
278k
    Py_ssize_t res;
3155
3156
278k
    assert(unicode != NULL);
3157
278k
    assert(_PyUnicode_CHECK(unicode));
3158
3159
278k
    res = _PyUnicode_LENGTH(unicode);
3160
#if SIZEOF_WCHAR_T == 2
3161
    if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3162
        const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3163
        const Py_UCS4 *end = s + res;
3164
        for (; s < end; ++s) {
3165
            if (*s > 0xFFFF) {
3166
                ++res;
3167
            }
3168
        }
3169
    }
3170
#endif
3171
278k
    return res;
3172
278k
}
3173
3174
static void
3175
unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3176
278k
{
3177
278k
    assert(unicode != NULL);
3178
278k
    assert(_PyUnicode_CHECK(unicode));
3179
3180
278k
    if (PyUnicode_KIND(unicode) == sizeof(wchar_t)) {
3181
789
        memcpy(w, PyUnicode_DATA(unicode), size * sizeof(wchar_t));
3182
789
        return;
3183
789
    }
3184
3185
277k
    if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3186
207k
        const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3187
4.15M
        for (; size--; ++s, ++w) {
3188
3.94M
            *w = *s;
3189
3.94M
        }
3190
207k
    }
3191
69.7k
    else {
3192
69.7k
#if SIZEOF_WCHAR_T == 4
3193
69.7k
        assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3194
69.7k
        const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3195
1.45M
        for (; size--; ++s, ++w) {
3196
1.38M
            *w = *s;
3197
1.38M
        }
3198
#else
3199
        assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3200
        const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3201
        for (; size--; ++s, ++w) {
3202
            Py_UCS4 ch = *s;
3203
            if (ch > 0xFFFF) {
3204
                assert(ch <= MAX_UNICODE);
3205
                /* encode surrogate pair in this case */
3206
                *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3207
                if (!size--)
3208
                    break;
3209
                *w = Py_UNICODE_LOW_SURROGATE(ch);
3210
            }
3211
            else {
3212
                *w = ch;
3213
            }
3214
        }
3215
#endif
3216
69.7k
    }
3217
277k
}
3218
3219
#ifdef HAVE_WCHAR_H
3220
3221
/* Convert a Unicode object to a wide character string.
3222
3223
   - If w is NULL: return the number of wide characters (including the null
3224
     character) required to convert the unicode object. Ignore size argument.
3225
3226
   - Otherwise: return the number of wide characters (excluding the null
3227
     character) written into w. Write at most size wide characters (including
3228
     the null character). */
3229
Py_ssize_t
3230
PyUnicode_AsWideChar(PyObject *unicode,
3231
                     wchar_t *w,
3232
                     Py_ssize_t size)
3233
2.10k
{
3234
2.10k
    Py_ssize_t res;
3235
3236
2.10k
    if (unicode == NULL) {
3237
0
        PyErr_BadInternalCall();
3238
0
        return -1;
3239
0
    }
3240
2.10k
    if (!PyUnicode_Check(unicode)) {
3241
0
        PyErr_BadArgument();
3242
0
        return -1;
3243
0
    }
3244
3245
2.10k
    res = unicode_get_widechar_size(unicode);
3246
2.10k
    if (w == NULL) {
3247
0
        return res + 1;
3248
0
    }
3249
3250
2.10k
    if (size > res) {
3251
2.10k
        size = res + 1;
3252
2.10k
    }
3253
0
    else {
3254
0
        res = size;
3255
0
    }
3256
2.10k
    unicode_copy_as_widechar(unicode, w, size);
3257
3258
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3259
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
3260
       non-Unicode locales and hence needs conversion first. */
3261
    if (_Py_LocaleUsesNonUnicodeWchar()) {
3262
        if (_Py_EncodeNonUnicodeWchar_InPlace(w, size) < 0) {
3263
            return -1;
3264
        }
3265
    }
3266
#endif
3267
3268
2.10k
    return res;
3269
2.10k
}
3270
3271
wchar_t*
3272
PyUnicode_AsWideCharString(PyObject *unicode,
3273
                           Py_ssize_t *size)
3274
276k
{
3275
276k
    wchar_t *buffer;
3276
276k
    Py_ssize_t buflen;
3277
3278
276k
    if (unicode == NULL) {
3279
0
        PyErr_BadInternalCall();
3280
0
        return NULL;
3281
0
    }
3282
276k
    if (!PyUnicode_Check(unicode)) {
3283
0
        PyErr_BadArgument();
3284
0
        return NULL;
3285
0
    }
3286
3287
276k
    buflen = unicode_get_widechar_size(unicode);
3288
276k
    buffer = (wchar_t *) PyMem_New(wchar_t, (buflen + 1));
3289
276k
    if (buffer == NULL) {
3290
0
        PyErr_NoMemory();
3291
0
        return NULL;
3292
0
    }
3293
276k
    unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3294
3295
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3296
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
3297
       non-Unicode locales and hence needs conversion first. */
3298
    if (_Py_LocaleUsesNonUnicodeWchar()) {
3299
        if (_Py_EncodeNonUnicodeWchar_InPlace(buffer, (buflen + 1)) < 0) {
3300
            return NULL;
3301
        }
3302
    }
3303
#endif
3304
3305
276k
    if (size != NULL) {
3306
274k
        *size = buflen;
3307
274k
    }
3308
1.11k
    else if (wcslen(buffer) != (size_t)buflen) {
3309
0
        PyMem_Free(buffer);
3310
0
        PyErr_SetString(PyExc_ValueError,
3311
0
                        "embedded null character");
3312
0
        return NULL;
3313
0
    }
3314
276k
    return buffer;
3315
276k
}
3316
3317
#endif /* HAVE_WCHAR_H */
3318
3319
int
3320
_PyUnicode_WideCharString_Converter(PyObject *obj, void *ptr)
3321
0
{
3322
0
    wchar_t **p = (wchar_t **)ptr;
3323
0
    if (obj == NULL) {
3324
0
        PyMem_Free(*p);
3325
0
        *p = NULL;
3326
0
        return 1;
3327
0
    }
3328
0
    if (PyUnicode_Check(obj)) {
3329
0
        *p = PyUnicode_AsWideCharString(obj, NULL);
3330
0
        if (*p == NULL) {
3331
0
            return 0;
3332
0
        }
3333
0
        return Py_CLEANUP_SUPPORTED;
3334
0
    }
3335
0
    PyErr_Format(PyExc_TypeError,
3336
0
                 "argument must be str, not %.50s",
3337
0
                 Py_TYPE(obj)->tp_name);
3338
0
    return 0;
3339
0
}
3340
3341
int
3342
_PyUnicode_WideCharString_Opt_Converter(PyObject *obj, void *ptr)
3343
0
{
3344
0
    wchar_t **p = (wchar_t **)ptr;
3345
0
    if (obj == NULL) {
3346
0
        PyMem_Free(*p);
3347
0
        *p = NULL;
3348
0
        return 1;
3349
0
    }
3350
0
    if (obj == Py_None) {
3351
0
        *p = NULL;
3352
0
        return 1;
3353
0
    }
3354
0
    if (PyUnicode_Check(obj)) {
3355
0
        *p = PyUnicode_AsWideCharString(obj, NULL);
3356
0
        if (*p == NULL) {
3357
0
            return 0;
3358
0
        }
3359
0
        return Py_CLEANUP_SUPPORTED;
3360
0
    }
3361
0
    PyErr_Format(PyExc_TypeError,
3362
0
                 "argument must be str or None, not %.50s",
3363
0
                 Py_TYPE(obj)->tp_name);
3364
0
    return 0;
3365
0
}
3366
3367
PyObject *
3368
PyUnicode_FromOrdinal(int ordinal)
3369
4.14M
{
3370
4.14M
    if (ordinal < 0 || ordinal > MAX_UNICODE) {
3371
32
        PyErr_SetString(PyExc_ValueError,
3372
32
                        "chr() arg not in range(0x110000)");
3373
32
        return NULL;
3374
32
    }
3375
3376
4.14M
    return unicode_char((Py_UCS4)ordinal);
3377
4.14M
}
3378
3379
/* Return a true str object for `obj`.
 *
 * An exact str is returned as a new reference to the same object; an
 * instance of a str subtype is copied via _PyUnicode_Copy().  Any other
 * object raises TypeError and NULL is returned.
 */
PyObject *
PyUnicode_FromObject(PyObject *obj)
{
    /* XXX Perhaps we should make this API an alias of
       PyObject_Str() instead ?! */
    if (!PyUnicode_Check(obj)) {
        PyErr_Format(PyExc_TypeError,
                     "Can't convert '%.100s' object to str implicitly",
                     Py_TYPE(obj)->tp_name);
        return NULL;
    }

    if (PyUnicode_CheckExact(obj)) {
        return Py_NewRef(obj);
    }

    /* For a Unicode subtype that's not a Unicode object,
       return a true Unicode object with the same data. */
    return _PyUnicode_Copy(obj);
}
3397
3398
/* Decode a bytes-like object into a str object.
 *
 * obj       bytes object, or any object exporting a simple buffer.
 * encoding  passed through to PyUnicode_Decode().
 * errors    passed through to PyUnicode_Decode().
 *
 * Empty input yields the empty string after the encoding/errors arguments
 * have been checked.  str objects and objects without buffer support raise
 * TypeError.  Return NULL on error.
 */
PyObject *
PyUnicode_FromEncodedObject(PyObject *obj,
                            const char *encoding,
                            const char *errors)
{
    if (obj == NULL) {
        PyErr_BadInternalCall();
        return NULL;
    }

    /* Decoding bytes objects is the most common case and should be fast */
    if (PyBytes_Check(obj)) {
        if (PyBytes_GET_SIZE(obj) != 0) {
            return PyUnicode_Decode(
                    PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
                    encoding, errors);
        }
        if (unicode_check_encoding_errors(encoding, errors) < 0) {
            return NULL;
        }
        _Py_RETURN_UNICODE_EMPTY();
    }

    if (PyUnicode_Check(obj)) {
        PyErr_SetString(PyExc_TypeError,
                        "decoding str is not supported");
        return NULL;
    }

    /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
    Py_buffer view;
    if (PyObject_GetBuffer(obj, &view, PyBUF_SIMPLE) < 0) {
        PyErr_Format(PyExc_TypeError,
                     "decoding to str: need a bytes-like object, %.80s found",
                     Py_TYPE(obj)->tp_name);
        return NULL;
    }

    if (view.len == 0) {
        PyBuffer_Release(&view);
        if (unicode_check_encoding_errors(encoding, errors) < 0) {
            return NULL;
        }
        _Py_RETURN_UNICODE_EMPTY();
    }

    PyObject *decoded = PyUnicode_Decode((char*) view.buf, view.len,
                                         encoding, errors);
    /* The view is no longer needed whether or not decoding succeeded. */
    PyBuffer_Release(&view);
    return decoded;
}
3450
3451
/* Normalize an encoding name in the manner of
   encodings.normalize_encoding(), optionally converting it to lowercase
   when *to_lower* is true.

   Alphanumeric characters and '.' are copied to *lower*; every run of
   other characters that sits between two copied characters becomes a
   single '_', and leading or trailing runs are dropped.

   Return 1 on success, or 0 on error (the normalized name does not fit
   in lower_len-1 characters). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len,
                       int to_lower)
{
    assert(encoding != NULL);

    char *out = lower;
    /* Last slot of the buffer, reserved for the terminating null. */
    char *const last = &lower[lower_len - 1];
    int pending_sep = 0;

    for (const char *in = encoding; *in != 0; in++) {
        const char ch = *in;

        if (!(Py_ISALNUM(ch) || ch == '.')) {
            /* Punctuation: remember it, emit at most one '_' later. */
            pending_sep = 1;
            continue;
        }

        if (pending_sep && out != lower) {
            if (out == last) {
                return 0;
            }
            *out++ = '_';
        }
        pending_sep = 0;

        if (out == last) {
            return 0;
        }
        *out++ = to_lower ? Py_TOLOWER(ch) : ch;
    }

    *out = '\0';
    return 1;
}
3500
3501
PyObject *
3502
PyUnicode_Decode(const char *s,
3503
                 Py_ssize_t size,
3504
                 const char *encoding,
3505
                 const char *errors)
3506
20.2M
{
3507
20.2M
    PyObject *buffer = NULL, *unicode;
3508
20.2M
    Py_buffer info;
3509
20.2M
    char buflower[11];   /* strlen("iso-8859-1\0") == 11, longest shortcut */
3510
3511
20.2M
    if (unicode_check_encoding_errors(encoding, errors) < 0) {
3512
0
        return NULL;
3513
0
    }
3514
3515
20.2M
    if (size == 0) {
3516
0
        _Py_RETURN_UNICODE_EMPTY();
3517
0
    }
3518
3519
20.2M
    if (encoding == NULL) {
3520
42.0k
        return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3521
42.0k
    }
3522
3523
    /* Shortcuts for common default encodings */
3524
20.1M
    if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower), 1)) {
3525
20.1M
        char *lower = buflower;
3526
3527
        /* Fast paths */
3528
20.1M
        if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3529
4.26M
            lower += 3;
3530
4.26M
            if (*lower == '_') {
3531
                /* Match "utf8" and "utf_8" */
3532
4.26M
                lower++;
3533
4.26M
            }
3534
3535
4.26M
            if (lower[0] == '8' && lower[1] == 0) {
3536
4.26M
                return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3537
4.26M
            }
3538
628
            else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3539
113
                return PyUnicode_DecodeUTF16(s, size, errors, 0);
3540
113
            }
3541
515
            else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3542
83
                return PyUnicode_DecodeUTF32(s, size, errors, 0);
3543
83
            }
3544
4.26M
        }
3545
15.9M
        else {
3546
15.9M
            if (strcmp(lower, "ascii") == 0
3547
12.1M
                || strcmp(lower, "us_ascii") == 0) {
3548
12.1M
                return PyUnicode_DecodeASCII(s, size, errors);
3549
12.1M
            }
3550
    #ifdef MS_WINDOWS
3551
            else if (strcmp(lower, "mbcs") == 0) {
3552
                return PyUnicode_DecodeMBCS(s, size, errors);
3553
            }
3554
    #endif
3555
3.79M
            else if (strcmp(lower, "latin1") == 0
3556
3.79M
                     || strcmp(lower, "latin_1") == 0
3557
938k
                     || strcmp(lower, "iso_8859_1") == 0
3558
2.88M
                     || strcmp(lower, "iso8859_1") == 0) {
3559
2.88M
                return PyUnicode_DecodeLatin1(s, size, errors);
3560
2.88M
            }
3561
15.9M
        }
3562
20.1M
    }
3563
3564
    /* Decode via the codec registry */
3565
917k
    buffer = NULL;
3566
917k
    if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
3567
0
        goto onError;
3568
917k
    buffer = PyMemoryView_FromBuffer(&info);
3569
917k
    if (buffer == NULL)
3570
0
        goto onError;
3571
917k
    unicode = _PyCodec_DecodeText(buffer, encoding, errors);
3572
917k
    if (unicode == NULL)
3573
76.1k
        goto onError;
3574
841k
    if (!PyUnicode_Check(unicode)) {
3575
0
        PyErr_Format(PyExc_TypeError,
3576
0
                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
3577
0
                     "use codecs.decode() to decode to arbitrary types",
3578
0
                     encoding,
3579
0
                     Py_TYPE(unicode)->tp_name);
3580
0
        Py_DECREF(unicode);
3581
0
        goto onError;
3582
0
    }
3583
841k
    Py_DECREF(buffer);
3584
841k
    return unicode_result(unicode);
3585
3586
76.1k
  onError:
3587
76.1k
    Py_XDECREF(buffer);
3588
76.1k
    return NULL;
3589
841k
}
3590
3591
PyAPI_FUNC(PyObject *)
PyUnicode_AsDecodedObject(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    /* Decode a str object through the codec registry and return whatever
       object the codec produces.  A NULL encoding selects the default
       encoding.  Returns NULL with an exception set on failure. */
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    const char *codec_name = encoding;
    if (codec_name == NULL) {
        codec_name = PyUnicode_GetDefaultEncoding();
    }

    return PyCodec_Decode(unicode, codec_name, errors);
}
3607
3608
PyAPI_FUNC(PyObject *)
PyUnicode_AsDecodedUnicode(PyObject *unicode,
                           const char *encoding,
                           const char *errors)
{
    /* Decode a str object through the codec registry, requiring the codec
       to return a str.

       unicode:  the str object to decode; anything else is a bad argument.
       encoding: codec name, or NULL for the default encoding.
       errors:   error-handler name passed to the codec.

       Returns a new str object, or NULL with an exception set.  A
       TypeError is raised when the codec returns a non-str object. */
    PyObject *v;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (encoding == NULL) {
        encoding = PyUnicode_GetDefaultEncoding();
    }

    /* Decode via the codec registry */
    v = PyCodec_Decode(unicode, encoding, errors);
    if (v == NULL) {
        return NULL;
    }
    if (!PyUnicode_Check(v)) {
        /* Report the type of the object the decoder actually returned
           (v), not the type of the input, which is always 'str' here. */
        PyErr_Format(PyExc_TypeError,
                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
                     "use codecs.decode() to decode to arbitrary types",
                     encoding,
                     Py_TYPE(v)->tp_name);
        Py_DECREF(v);
        return NULL;
    }
    return unicode_result(v);
}
3641
3642
PyAPI_FUNC(PyObject *)
PyUnicode_AsEncodedObject(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    /* Encode a str object through the codec registry and return whatever
       object the codec produces.  A NULL encoding selects the default
       encoding.  Returns NULL with an exception set on failure. */
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    const char *codec_name = encoding;
    if (codec_name == NULL) {
        codec_name = PyUnicode_GetDefaultEncoding();
    }

    /* The codec result (or NULL on error) is passed straight through. */
    return PyCodec_Encode(unicode, codec_name, errors);
}
3666
3667
3668
static PyObject *
3669
unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
3670
                      int current_locale)
3671
900
{
3672
900
    Py_ssize_t wlen;
3673
900
    wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3674
900
    if (wstr == NULL) {
3675
0
        return NULL;
3676
0
    }
3677
3678
900
    if ((size_t)wlen != wcslen(wstr)) {
3679
0
        PyErr_SetString(PyExc_ValueError, "embedded null character");
3680
0
        PyMem_Free(wstr);
3681
0
        return NULL;
3682
0
    }
3683
3684
900
    char *str;
3685
900
    size_t error_pos;
3686
900
    const char *reason;
3687
900
    int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
3688
900
                                 current_locale, error_handler);
3689
900
    PyMem_Free(wstr);
3690
3691
900
    if (res != 0) {
3692
0
        if (res == -2) {
3693
0
            PyObject *exc;
3694
0
            exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3695
0
                    "locale", unicode,
3696
0
                    (Py_ssize_t)error_pos,
3697
0
                    (Py_ssize_t)(error_pos+1),
3698
0
                    reason);
3699
0
            if (exc != NULL) {
3700
0
                PyCodec_StrictErrors(exc);
3701
0
                Py_DECREF(exc);
3702
0
            }
3703
0
        }
3704
0
        else if (res == -3) {
3705
0
            PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3706
0
        }
3707
0
        else {
3708
0
            PyErr_NoMemory();
3709
0
        }
3710
0
        return NULL;
3711
0
    }
3712
3713
900
    PyObject *bytes = PyBytes_FromString(str);
3714
900
    PyMem_RawFree(str);
3715
900
    return bytes;
3716
900
}
3717
3718
PyObject *
PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
{
    /* Encode `unicode` with unicode_encode_locale(), passing
       current_locale=1 and the handler that the `errors` name maps to. */
    _Py_error_handler handler = _Py_GetErrorHandler(errors);
    PyObject *encoded = unicode_encode_locale(unicode, handler, 1);
    return encoded;
}
3724
3725
PyObject *
PyUnicode_EncodeFSDefault(PyObject *unicode)
{
    /* Encode `unicode` to bytes with the interpreter's filesystem codec.
       Returns a new bytes object, or NULL with an exception set. */
    PyInterpreterState *interp = _PyInterpreterState_GET();
    struct _Py_unicode_fs_codec *codec = &interp->unicode.fs_codec;

    if (codec->utf8) {
        return unicode_encode_utf8(unicode,
                                   codec->error_handler,
                                   codec->errors);
    }

#ifndef _Py_FORCE_UTF8_FS_ENCODING
    if (codec->encoding) {
        return PyUnicode_AsEncodedString(unicode,
                                         codec->encoding,
                                         codec->errors);
    }
#endif

    /* Before _PyUnicode_InitEncodings() is called, the Python codec
       machinery is not ready and so cannot be used:
       use wcstombs() in this case. */
    const PyConfig *config = _PyInterpreterState_GetConfig(interp);
    const wchar_t *fs_errors = config->filesystem_errors;
    assert(fs_errors != NULL);
    _Py_error_handler handler = get_error_handler_wide(fs_errors);
    assert(handler != _Py_ERROR_UNKNOWN);
#ifdef _Py_FORCE_UTF8_FS_ENCODING
    return unicode_encode_utf8(unicode, handler, NULL);
#else
    return unicode_encode_locale(unicode, handler, 0);
#endif
}
3758
3759
PyObject *
PyUnicode_AsEncodedString(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    /* Encode a str object to bytes using `encoding` and the `errors`
       policy.  Returns a new bytes object, or NULL with an exception set.

       A NULL encoding encodes as UTF-8.  A handful of common encoding
       names are dispatched directly to their built-in encoders; everything
       else goes through the codec registry. */

    /* Room for the longest shortcut name plus its terminator:
       sizeof("iso_8859_1") == 11. */
    char normalized[11];

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (unicode_check_encoding_errors(encoding, errors) < 0) {
        return NULL;
    }
    if (encoding == NULL) {
        return _PyUnicode_AsUTF8String(unicode, errors);
    }

    /* Fast paths: only taken when the normalized (lower-cased) name fits
       in the small buffer. */
    if (_Py_normalize_encoding(encoding, normalized, sizeof(normalized), 1)) {
        if (strncmp(normalized, "utf", 3) == 0) {
            const char *bits = normalized + 3;
            if (*bits == '_') {
                /* Accept both "utf8" and "utf_8" spellings. */
                bits++;
            }
            if (strcmp(bits, "8") == 0) {
                return _PyUnicode_AsUTF8String(unicode, errors);
            }
            if (strcmp(bits, "16") == 0) {
                return _PyUnicode_EncodeUTF16(unicode, errors, 0);
            }
            if (strcmp(bits, "32") == 0) {
                return _PyUnicode_EncodeUTF32(unicode, errors, 0);
            }
            /* Any other "utf..." name falls through to the registry. */
        }
        else if (strcmp(normalized, "ascii") == 0
                 || strcmp(normalized, "us_ascii") == 0) {
            return _PyUnicode_AsASCIIString(unicode, errors);
        }
#ifdef MS_WINDOWS
        else if (strcmp(normalized, "mbcs") == 0) {
            return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
        }
#endif
        else if (strcmp(normalized, "latin1") == 0
                 || strcmp(normalized, "latin_1") == 0
                 || strcmp(normalized, "iso_8859_1") == 0
                 || strcmp(normalized, "iso8859_1") == 0) {
            return _PyUnicode_AsLatin1String(unicode, errors);
        }
    }

    /* Slow path: encode via the codec registry. */
    PyObject *encoded = _PyCodec_EncodeText(unicode, encoding, errors);
    if (encoded == NULL) {
        return NULL;
    }

    /* The normal case: the codec returned bytes. */
    if (PyBytes_Check(encoded)) {
        return encoded;
    }

    /* A bytearray result is tolerated: warn, then copy it into bytes. */
    if (PyByteArray_Check(encoded)) {
        if (PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
                "encoder %s returned bytearray instead of bytes; "
                "use codecs.encode() to encode to arbitrary types",
                encoding))
        {
            Py_DECREF(encoded);
            return NULL;
        }

        PyObject *copy = PyBytes_FromStringAndSize(
            PyByteArray_AS_STRING(encoded),
            PyByteArray_GET_SIZE(encoded));
        Py_DECREF(encoded);
        return copy;
    }

    /* Anything else is a type error. */
    PyErr_Format(PyExc_TypeError,
                 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
                 "use codecs.encode() to encode to arbitrary types",
                 encoding,
                 Py_TYPE(encoded)->tp_name);
    Py_DECREF(encoded);
    return NULL;
}
3858
3859
PyAPI_FUNC(PyObject *)
PyUnicode_AsEncodedUnicode(PyObject *unicode,
                           const char *encoding,
                           const char *errors)
{
    /* Encode a str object through the codec registry, requiring the codec
       to return a str.  A NULL encoding selects the default encoding.
       Returns the codec result, or NULL with an exception set. */
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    const char *codec_name = encoding;
    if (codec_name == NULL) {
        codec_name = PyUnicode_GetDefaultEncoding();
    }

    PyObject *encoded = PyCodec_Encode(unicode, codec_name, errors);
    if (encoded == NULL) {
        return NULL;
    }
    if (PyUnicode_Check(encoded)) {
        return encoded;
    }

    /* The codec produced something other than str. */
    PyErr_Format(PyExc_TypeError,
                 "'%.400s' encoder returned '%.400s' instead of 'str'; "
                 "use codecs.encode() to encode to arbitrary types",
                 codec_name,
                 Py_TYPE(encoded)->tp_name);
    Py_DECREF(encoded);
    return NULL;
}
3892
3893
static PyObject*
3894
unicode_decode_locale(const char *str, Py_ssize_t len,
3895
                      _Py_error_handler errors, int current_locale)
3896
417k
{
3897
417k
    if (str[len] != '\0' || (size_t)len != strlen(str))  {
3898
0
        PyErr_SetString(PyExc_ValueError, "embedded null byte");
3899
0
        return NULL;
3900
0
    }
3901
3902
417k
    wchar_t *wstr;
3903
417k
    size_t wlen;
3904
417k
    const char *reason;
3905
417k
    int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
3906
417k
                                 current_locale, errors);
3907
417k
    if (res != 0) {
3908
0
        if (res == -2) {
3909
0
            PyObject *exc;
3910
0
            exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3911
0
                                        "locale", str, len,
3912
0
                                        (Py_ssize_t)wlen,
3913
0
                                        (Py_ssize_t)(wlen + 1),
3914
0
                                        reason);
3915
0
            if (exc != NULL) {
3916
0
                PyCodec_StrictErrors(exc);
3917
0
                Py_DECREF(exc);
3918
0
            }
3919
0
        }
3920
0
        else if (res == -3) {
3921
0
            PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3922
0
        }
3923
0
        else {
3924
0
            PyErr_NoMemory();
3925
0
        }
3926
0
        return NULL;
3927
0
    }
3928
3929
417k
    PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3930
417k
    PyMem_RawFree(wstr);
3931
417k
    return unicode;
3932
417k
}
3933
3934
PyObject*
3935
PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3936
                              const char *errors)
3937
0
{
3938
0
    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3939
0
    return unicode_decode_locale(str, len, error_handler, 1);
3940
0
}
3941
3942
PyObject*
3943
PyUnicode_DecodeLocale(const char *str, const char *errors)
3944
406k
{
3945
406k
    Py_ssize_t size = (Py_ssize_t)strlen(str);
3946
406k
    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3947
406k
    return unicode_decode_locale(str, size, error_handler, 1);
3948
406k
}
3949
3950
3951
PyObject*
3952
213
PyUnicode_DecodeFSDefault(const char *s) {
3953
213
    Py_ssize_t size = (Py_ssize_t)strlen(s);
3954
213
    return PyUnicode_DecodeFSDefaultAndSize(s, size);
3955
213
}
3956
3957
PyObject*
3958
PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3959
253k
{
3960
253k
    PyInterpreterState *interp = _PyInterpreterState_GET();
3961
253k
    struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3962
253k
    if (fs_codec->utf8) {
3963
242k
        return unicode_decode_utf8(s, size,
3964
242k
                                   fs_codec->error_handler,
3965
242k
                                   fs_codec->errors,
3966
242k
                                   NULL);
3967
242k
    }
3968
11.5k
#ifndef _Py_FORCE_UTF8_FS_ENCODING
3969
11.5k
    else if (fs_codec->encoding) {
3970
0
        return PyUnicode_Decode(s, size,
3971
0
                                fs_codec->encoding,
3972
0
                                fs_codec->errors);
3973
0
    }
3974
11.5k
#endif
3975
11.5k
    else {
3976
        /* Before _PyUnicode_InitEncodings() is called, the Python codec
3977
           machinery is not ready and so cannot be used:
3978
           use mbstowcs() in this case. */
3979
11.5k
        const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3980
11.5k
        const wchar_t *filesystem_errors = config->filesystem_errors;
3981
11.5k
        assert(filesystem_errors != NULL);
3982
11.5k
        _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3983
11.5k
        assert(errors != _Py_ERROR_UNKNOWN);
3984
#ifdef _Py_FORCE_UTF8_FS_ENCODING
3985
        return unicode_decode_utf8(s, size, errors, NULL, NULL);
3986
#else
3987
11.5k
        return unicode_decode_locale(s, size, errors, 0);
3988
11.5k
#endif
3989
11.5k
    }
3990
253k
}
3991
3992
3993
int
3994
PyUnicode_FSConverter(PyObject* arg, void* addr)
3995
331k
{
3996
331k
    PyObject *path = NULL;
3997
331k
    PyObject *output = NULL;
3998
331k
    Py_ssize_t size;
3999
331k
    const char *data;
4000
331k
    if (arg == NULL) {
4001
0
        Py_DECREF(*(PyObject**)addr);
4002
0
        *(PyObject**)addr = NULL;
4003
0
        return 1;
4004
0
    }
4005
331k
    path = PyOS_FSPath(arg);
4006
331k
    if (path == NULL) {
4007
0
        return 0;
4008
0
    }
4009
331k
    if (PyBytes_Check(path)) {
4010
0
        output = path;
4011
0
    }
4012
331k
    else {  // PyOS_FSPath() guarantees its returned value is bytes or str.
4013
331k
        output = PyUnicode_EncodeFSDefault(path);
4014
331k
        Py_DECREF(path);
4015
331k
        if (!output) {
4016
0
            return 0;
4017
0
        }
4018
331k
        assert(PyBytes_Check(output));
4019
331k
    }
4020
4021
331k
    size = PyBytes_GET_SIZE(output);
4022
331k
    data = PyBytes_AS_STRING(output);
4023
331k
    if ((size_t)size != strlen(data)) {
4024
0
        PyErr_SetString(PyExc_ValueError, "embedded null byte");
4025
0
        Py_DECREF(output);
4026
0
        return 0;
4027
0
    }
4028
331k
    *(PyObject**)addr = output;
4029
331k
    return Py_CLEANUP_SUPPORTED;
4030
331k
}
4031
4032
4033
int
4034
PyUnicode_FSDecoder(PyObject* arg, void* addr)
4035
101k
{
4036
101k
    if (arg == NULL) {
4037
0
        Py_DECREF(*(PyObject**)addr);
4038
0
        *(PyObject**)addr = NULL;
4039
0
        return 1;
4040
0
    }
4041
4042
101k
    PyObject *path = PyOS_FSPath(arg);
4043
101k
    if (path == NULL) {
4044
0
        return 0;
4045
0
    }
4046
4047
101k
    PyObject *output = NULL;
4048
101k
    if (PyUnicode_Check(path)) {
4049
101k
        output = path;
4050
101k
    }
4051
0
    else if (PyBytes_Check(path)) {
4052
0
        output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path),
4053
0
                                                  PyBytes_GET_SIZE(path));
4054
0
        Py_DECREF(path);
4055
0
        if (!output) {
4056
0
            return 0;
4057
0
        }
4058
0
    }
4059
0
    else {
4060
0
        PyErr_Format(PyExc_TypeError,
4061
0
                     "path should be string, bytes, or os.PathLike, not %.200s",
4062
0
                     Py_TYPE(arg)->tp_name);
4063
0
        Py_DECREF(path);
4064
0
        return 0;
4065
0
    }
4066
4067
101k
    if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
4068
101k
                 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
4069
0
        PyErr_SetString(PyExc_ValueError, "embedded null character");
4070
0
        Py_DECREF(output);
4071
0
        return 0;
4072
0
    }
4073
101k
    *(PyObject**)addr = output;
4074
101k
    return Py_CLEANUP_SUPPORTED;
4075
101k
}
4076
4077
4078
static int unicode_fill_utf8(PyObject *unicode);
4079
4080
4081
static int
4082
unicode_ensure_utf8(PyObject *unicode)
4083
69.6M
{
4084
69.6M
    int err = 0;
4085
69.6M
    if (PyUnicode_UTF8(unicode) == NULL) {
4086
159k
        Py_BEGIN_CRITICAL_SECTION(unicode);
4087
159k
        if (PyUnicode_UTF8(unicode) == NULL) {
4088
159k
            err = unicode_fill_utf8(unicode);
4089
159k
        }
4090
159k
        Py_END_CRITICAL_SECTION();
4091
159k
    }
4092
69.6M
    return err;
4093
69.6M
}
4094
4095
const char *
4096
PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
4097
69.6M
{
4098
69.6M
    if (!PyUnicode_Check(unicode)) {
4099
0
        PyErr_BadArgument();
4100
0
        if (psize) {
4101
0
            *psize = -1;
4102
0
        }
4103
0
        return NULL;
4104
0
    }
4105
4106
69.6M
    if (unicode_ensure_utf8(unicode) == -1) {
4107
207
        if (psize) {
4108
207
            *psize = -1;
4109
207
        }
4110
207
        return NULL;
4111
207
    }
4112
4113
69.6M
    if (psize) {
4114
69.4M
        *psize = PyUnicode_UTF8_LENGTH(unicode);
4115
69.4M
    }
4116
69.6M
    return PyUnicode_UTF8(unicode);
4117
69.6M
}
4118
4119
const char *
4120
PyUnicode_AsUTF8(PyObject *unicode)
4121
251k
{
4122
251k
    return PyUnicode_AsUTF8AndSize(unicode, NULL);
4123
251k
}
4124
4125
const char *
4126
_PyUnicode_AsUTF8NoNUL(PyObject *unicode)
4127
555k
{
4128
555k
    Py_ssize_t size;
4129
555k
    const char *s = PyUnicode_AsUTF8AndSize(unicode, &size);
4130
555k
    if (s && strlen(s) != (size_t)size) {
4131
0
        PyErr_SetString(PyExc_ValueError, "embedded null character");
4132
0
        return NULL;
4133
0
    }
4134
555k
    return s;
4135
555k
}
4136
4137
/*
4138
PyUnicode_GetSize() has been deprecated since Python 3.3
4139
because it returned length of Py_UNICODE.
4140
4141
But this function is part of stable abi, because it doesn't
4142
include Py_UNICODE in signature and it was not excluded from
4143
stable ABI in PEP 384.
4144
*/
4145
PyAPI_FUNC(Py_ssize_t)
4146
PyUnicode_GetSize(PyObject *unicode)
4147
0
{
4148
0
    PyErr_SetString(PyExc_RuntimeError,
4149
0
                    "PyUnicode_GetSize has been removed.");
4150
0
    return -1;
4151
0
}
4152
4153
Py_ssize_t
4154
PyUnicode_GetLength(PyObject *unicode)
4155
26.2k
{
4156
26.2k
    if (!PyUnicode_Check(unicode)) {
4157
0
        PyErr_BadArgument();
4158
0
        return -1;
4159
0
    }
4160
26.2k
    return PyUnicode_GET_LENGTH(unicode);
4161
26.2k
}
4162
4163
Py_UCS4
4164
PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4165
20
{
4166
20
    const void *data;
4167
20
    int kind;
4168
4169
20
    if (!PyUnicode_Check(unicode)) {
4170
0
        PyErr_BadArgument();
4171
0
        return (Py_UCS4)-1;
4172
0
    }
4173
20
    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4174
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
4175
0
        return (Py_UCS4)-1;
4176
0
    }
4177
20
    data = PyUnicode_DATA(unicode);
4178
20
    kind = PyUnicode_KIND(unicode);
4179
20
    return PyUnicode_READ(kind, data, index);
4180
20
}
4181
4182
int
4183
PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4184
0
{
4185
0
    if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
4186
0
        PyErr_BadArgument();
4187
0
        return -1;
4188
0
    }
4189
0
    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4190
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
4191
0
        return -1;
4192
0
    }
4193
0
    if (unicode_check_modifiable(unicode))
4194
0
        return -1;
4195
0
    if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4196
0
        PyErr_SetString(PyExc_ValueError, "character out of range");
4197
0
        return -1;
4198
0
    }
4199
0
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4200
0
                    index, ch);
4201
0
    return 0;
4202
0
}
4203
4204
const char *
PyUnicode_GetDefaultEncoding(void)
{
    /* The default encoding is fixed; the returned string is a constant
       that the caller must not modify or free. */
    const char *default_encoding = "utf-8";
    return default_encoding;
}
4209
4210
/* create or adjust a UnicodeDecodeError */
4211
static void
4212
make_decode_exception(PyObject **exceptionObject,
4213
                      const char *encoding,
4214
                      const char *input, Py_ssize_t length,
4215
                      Py_ssize_t startpos, Py_ssize_t endpos,
4216
                      const char *reason)
4217
2.71M
{
4218
2.71M
    if (*exceptionObject == NULL) {
4219
2.49M
        *exceptionObject = PyUnicodeDecodeError_Create(
4220
2.49M
            encoding, input, length, startpos, endpos, reason);
4221
2.49M
    }
4222
226k
    else {
4223
226k
        if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4224
0
            goto onError;
4225
226k
        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4226
0
            goto onError;
4227
226k
        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4228
0
            goto onError;
4229
226k
    }
4230
2.71M
    return;
4231
4232
2.71M
onError:
4233
0
    Py_CLEAR(*exceptionObject);
4234
0
}
4235
4236
#ifdef MS_WINDOWS
4237
static int
4238
widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4239
{
4240
    if (newsize > *size) {
4241
        wchar_t *newbuf = *buf;
4242
        if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4243
            PyErr_NoMemory();
4244
            return -1;
4245
        }
4246
        *buf = newbuf;
4247
    }
4248
    *size = newsize;
4249
    return 0;
4250
}
4251
4252
/* error handling callback helper:
4253
   build arguments, call the callback and check the arguments,
4254
   if no exception occurred, copy the replacement to the output
4255
   and adjust various state variables.
4256
   return 0 on success, -1 on error
4257
*/
4258
4259
static int
4260
unicode_decode_call_errorhandler_wchar(
4261
    const char *errors, PyObject **errorHandler,
4262
    const char *encoding, const char *reason,
4263
    const char **input, const char **inend, Py_ssize_t *startinpos,
4264
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4265
    wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
4266
{
4267
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4268
4269
    PyObject *restuple = NULL;
4270
    PyObject *repunicode = NULL;
4271
    Py_ssize_t outsize;
4272
    Py_ssize_t insize;
4273
    Py_ssize_t requiredsize;
4274
    Py_ssize_t newpos;
4275
    PyObject *inputobj = NULL;
4276
    Py_ssize_t repwlen;
4277
4278
    if (*errorHandler == NULL) {
4279
        *errorHandler = PyCodec_LookupError(errors);
4280
        if (*errorHandler == NULL)
4281
            goto onError;
4282
    }
4283
4284
    make_decode_exception(exceptionObject,
4285
        encoding,
4286
        *input, *inend - *input,
4287
        *startinpos, *endinpos,
4288
        reason);
4289
    if (*exceptionObject == NULL)
4290
        goto onError;
4291
4292
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
4293
    if (restuple == NULL)
4294
        goto onError;
4295
    if (!PyTuple_Check(restuple)) {
4296
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
4297
        goto onError;
4298
    }
4299
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4300
        goto onError;
4301
4302
    /* Copy back the bytes variables, which might have been modified by the
4303
       callback */
4304
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4305
    if (!inputobj)
4306
        goto onError;
4307
    *input = PyBytes_AS_STRING(inputobj);
4308
    insize = PyBytes_GET_SIZE(inputobj);
4309
    *inend = *input + insize;
4310
    /* we can DECREF safely, as the exception has another reference,
4311
       so the object won't go away. */
4312
    Py_DECREF(inputobj);
4313
4314
    if (newpos<0)
4315
        newpos = insize+newpos;
4316
    if (newpos<0 || newpos>insize) {
4317
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4318
        goto onError;
4319
    }
4320
4321
    repwlen = PyUnicode_AsWideChar(repunicode, NULL, 0);
4322
    if (repwlen < 0)
4323
        goto onError;
4324
    repwlen--;
4325
    /* need more space? (at least enough for what we
4326
       have+the replacement+the rest of the string (starting
4327
       at the new input position), so we won't have to check space
4328
       when there are no errors in the rest of the string) */
4329
    requiredsize = *outpos;
4330
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4331
        goto overflow;
4332
    requiredsize += repwlen;
4333
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4334
        goto overflow;
4335
    requiredsize += insize - newpos;
4336
    outsize = *bufsize;
4337
    if (requiredsize > outsize) {
4338
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
4339
            requiredsize = 2*outsize;
4340
        if (widechar_resize(buf, bufsize, requiredsize) < 0) {
4341
            goto onError;
4342
        }
4343
    }
4344
    PyUnicode_AsWideChar(repunicode, *buf + *outpos, repwlen);
4345
    *outpos += repwlen;
4346
    *endinpos = newpos;
4347
    *inptr = *input + newpos;
4348
4349
    /* we made it! */
4350
    Py_DECREF(restuple);
4351
    return 0;
4352
4353
  overflow:
4354
    PyErr_SetString(PyExc_OverflowError,
4355
                    "decoded result is too long for a Python string");
4356
4357
  onError:
4358
    Py_XDECREF(restuple);
4359
    return -1;
4360
}
4361
#endif   /* MS_WINDOWS */
4362
4363
static int
4364
unicode_decode_call_errorhandler_writer(
4365
    const char *errors, PyObject **errorHandler,
4366
    const char *encoding, const char *reason,
4367
    const char **input, const char **inend, Py_ssize_t *startinpos,
4368
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4369
    _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4370
2.71M
{
4371
2.71M
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4372
4373
2.71M
    PyObject *restuple = NULL;
4374
2.71M
    PyObject *repunicode = NULL;
4375
2.71M
    Py_ssize_t insize;
4376
2.71M
    Py_ssize_t newpos;
4377
2.71M
    Py_ssize_t replen;
4378
2.71M
    Py_ssize_t remain;
4379
2.71M
    PyObject *inputobj = NULL;
4380
2.71M
    int need_to_grow = 0;
4381
2.71M
    const char *new_inptr;
4382
4383
2.71M
    if (*errorHandler == NULL) {
4384
2.49M
        *errorHandler = PyCodec_LookupError(errors);
4385
2.49M
        if (*errorHandler == NULL)
4386
0
            goto onError;
4387
2.49M
    }
4388
4389
2.71M
    make_decode_exception(exceptionObject,
4390
2.71M
        encoding,
4391
2.71M
        *input, *inend - *input,
4392
2.71M
        *startinpos, *endinpos,
4393
2.71M
        reason);
4394
2.71M
    if (*exceptionObject == NULL)
4395
0
        goto onError;
4396
4397
2.71M
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
4398
2.71M
    if (restuple == NULL)
4399
2.45M
        goto onError;
4400
268k
    if (!PyTuple_Check(restuple)) {
4401
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
4402
0
        goto onError;
4403
0
    }
4404
268k
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4405
0
        goto onError;
4406
4407
    /* Copy back the bytes variables, which might have been modified by the
4408
       callback */
4409
268k
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4410
268k
    if (!inputobj)
4411
0
        goto onError;
4412
268k
    remain = *inend - *input - *endinpos;
4413
268k
    *input = PyBytes_AS_STRING(inputobj);
4414
268k
    insize = PyBytes_GET_SIZE(inputobj);
4415
268k
    *inend = *input + insize;
4416
    /* we can DECREF safely, as the exception has another reference,
4417
       so the object won't go away. */
4418
268k
    Py_DECREF(inputobj);
4419
4420
268k
    if (newpos<0)
4421
0
        newpos = insize+newpos;
4422
268k
    if (newpos<0 || newpos>insize) {
4423
0
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4424
0
        goto onError;
4425
0
    }
4426
4427
268k
    replen = PyUnicode_GET_LENGTH(repunicode);
4428
268k
    if (replen > 1) {
4429
31.2k
        writer->min_length += replen - 1;
4430
31.2k
        need_to_grow = 1;
4431
31.2k
    }
4432
268k
    new_inptr = *input + newpos;
4433
268k
    if (*inend - new_inptr > remain) {
4434
        /* We don't know the decoding algorithm here so we make the worst
4435
           assumption that one byte decodes to one unicode character.
4436
           If unfortunately one byte could decode to more unicode characters,
4437
           the decoder may write out-of-bound then.  Is it possible for the
4438
           algorithms using this function? */
4439
15.9k
        writer->min_length += *inend - new_inptr - remain;
4440
15.9k
        need_to_grow = 1;
4441
15.9k
    }
4442
268k
    if (need_to_grow) {
4443
31.4k
        writer->overallocate = 1;
4444
31.4k
        if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
4445
31.4k
                            PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4446
0
            goto onError;
4447
31.4k
    }
4448
268k
    if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
4449
0
        goto onError;
4450
4451
268k
    *endinpos = newpos;
4452
268k
    *inptr = new_inptr;
4453
4454
    /* we made it! */
4455
268k
    Py_DECREF(restuple);
4456
268k
    return 0;
4457
4458
2.45M
  onError:
4459
2.45M
    Py_XDECREF(restuple);
4460
2.45M
    return -1;
4461
268k
}
4462
4463
/* --- UTF-7 Codec -------------------------------------------------------- */
4464
4465
/* See RFC2152 for details.  We encode conservatively and decode liberally. */
4466
4467
/* Three simple macros defining base-64. */
4468
4469
/* Nonzero iff c is one of the 64 base-64 alphabet characters:
   'A'-'Z', 'a'-'z', '0'-'9', '+' or '/'.  The argument is evaluated
   more than once, so it must be free of side effects. */

#define IS_BASE64(c)                        \
    (('A' <= (c) && (c) <= 'Z') ||          \
     ('a' <= (c) && (c) <= 'z') ||          \
     ('0' <= (c) && (c) <= '9') ||          \
     (c) == '+' || (c) == '/')
4476
4477
/* Map a base-64 character to its 6-bit value:
   'A'-'Z' -> 0..25, 'a'-'z' -> 26..51, '0'-'9' -> 52..61, '+' -> 62,
   anything else -> 63 (so the caller must already know c is base-64).
   The argument is evaluated more than once. */

#define FROM_BASE64(c)                                                  \
    (('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :                           \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)
4484
4485
/* Base-64 character for the low 6 bits of n; higher bits are masked off. */

#define TO_BASE64(n)                        \
    (("ABCDEFGHIJKLMNOPQRSTUVWXYZ"          \
      "abcdefghijklmnopqrstuvwxyz"          \
      "0123456789+/")[(n) & 0x3f])
4489
4490
/* DECODE_DIRECT: true when a byte met outside a shift sequence decodes
 * to itself.  Decoding is permissive: every value up to 127 qualifies
 * except '+', which opens a base64 section. */

#define DECODE_DIRECT(c)                                \
    ((c) < 128 && (c) != '+')
4497
4498
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This table, indexed by ASCII code 0..127,
 * identifies these different sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4531
4532
/* ENCODE_DIRECT: true when the encoder may emit character c as itself.
 * RFC 2152 leaves it to the application whether Set O and whitespace
 * are written directly; as written here, every code in 1..127 whose
 * utf7_category entry is not 3 ("special") is encoded directly, and
 * NUL and everything from 128 up is not. */

#define ENCODE_DIRECT(c) \
    ((c) < 128 && (c) > 0 && utf7_category[(c)] != 3)
4540
4541
PyObject *
4542
PyUnicode_DecodeUTF7(const char *s,
4543
                     Py_ssize_t size,
4544
                     const char *errors)
4545
0
{
4546
0
    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4547
0
}
4548
4549
/* The decoder.  The only state we preserve is our read position,
4550
 * i.e. how many characters we have consumed.  So if we end in the
4551
 * middle of a shift sequence we have to back off the read position
4552
 * and the output to the beginning of the sequence, otherwise we lose
4553
 * all the shift state (seen bits, number of bits seen, high
4554
 * surrogate). */
4555
4556
PyObject *
4557
PyUnicode_DecodeUTF7Stateful(const char *s,
4558
                             Py_ssize_t size,
4559
                             const char *errors,
4560
                             Py_ssize_t *consumed)
4561
20.4k
{
4562
20.4k
    const char *starts = s;
4563
20.4k
    Py_ssize_t startinpos;
4564
20.4k
    Py_ssize_t endinpos;
4565
20.4k
    const char *e;
4566
20.4k
    _PyUnicodeWriter writer;
4567
20.4k
    const char *errmsg = "";
4568
20.4k
    int inShift = 0;
4569
20.4k
    Py_ssize_t shiftOutStart;
4570
20.4k
    unsigned int base64bits = 0;
4571
20.4k
    unsigned long base64buffer = 0;
4572
20.4k
    Py_UCS4 surrogate = 0;
4573
20.4k
    PyObject *errorHandler = NULL;
4574
20.4k
    PyObject *exc = NULL;
4575
4576
20.4k
    if (size == 0) {
4577
0
        if (consumed)
4578
0
            *consumed = 0;
4579
0
        _Py_RETURN_UNICODE_EMPTY();
4580
0
    }
4581
4582
    /* Start off assuming it's all ASCII. Widen later as necessary. */
4583
20.4k
    _PyUnicodeWriter_Init(&writer);
4584
20.4k
    writer.min_length = size;
4585
4586
20.4k
    shiftOutStart = 0;
4587
20.4k
    e = s + size;
4588
4589
4.94M
    while (s < e) {
4590
4.93M
        Py_UCS4 ch;
4591
4.93M
      restart:
4592
4.93M
        ch = (unsigned char) *s;
4593
4594
4.93M
        if (inShift) { /* in a base-64 section */
4595
229k
            if (IS_BASE64(ch)) { /* consume a base-64 character */
4596
215k
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4597
215k
                base64bits += 6;
4598
215k
                s++;
4599
215k
                if (base64bits >= 16) {
4600
                    /* we have enough bits for a UTF-16 value */
4601
75.4k
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
4602
75.4k
                    base64bits -= 16;
4603
75.4k
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4604
75.4k
                    assert(outCh <= 0xffff);
4605
75.4k
                    if (surrogate) {
4606
                        /* expecting a second surrogate */
4607
6.76k
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4608
2.41k
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
4609
2.41k
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
4610
0
                                goto onError;
4611
2.41k
                            surrogate = 0;
4612
2.41k
                            continue;
4613
2.41k
                        }
4614
4.34k
                        else {
4615
4.34k
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4616
0
                                goto onError;
4617
4.34k
                            surrogate = 0;
4618
4.34k
                        }
4619
6.76k
                    }
4620
73.0k
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
4621
                        /* first surrogate */
4622
10.4k
                        surrogate = outCh;
4623
10.4k
                    }
4624
62.5k
                    else {
4625
62.5k
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
4626
0
                            goto onError;
4627
62.5k
                    }
4628
73.0k
                }
4629
215k
            }
4630
13.8k
            else { /* now leaving a base-64 section */
4631
13.8k
                inShift = 0;
4632
13.8k
                if (base64bits > 0) { /* left-over bits */
4633
11.4k
                    if (base64bits >= 6) {
4634
                        /* We've seen at least one base-64 character */
4635
5.72k
                        s++;
4636
5.72k
                        errmsg = "partial character in shift sequence";
4637
5.72k
                        goto utf7Error;
4638
5.72k
                    }
4639
5.73k
                    else {
4640
                        /* Some bits remain; they should be zero */
4641
5.73k
                        if (base64buffer != 0) {
4642
1.53k
                            s++;
4643
1.53k
                            errmsg = "non-zero padding bits in shift sequence";
4644
1.53k
                            goto utf7Error;
4645
1.53k
                        }
4646
5.73k
                    }
4647
11.4k
                }
4648
6.56k
                if (surrogate && DECODE_DIRECT(ch)) {
4649
2.52k
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4650
0
                        goto onError;
4651
2.52k
                }
4652
6.56k
                surrogate = 0;
4653
6.56k
                if (ch == '-') {
4654
                    /* '-' is absorbed; other terminating
4655
                       characters are preserved */
4656
1.92k
                    s++;
4657
1.92k
                }
4658
6.56k
            }
4659
229k
        }
4660
4.70M
        else if ( ch == '+' ) {
4661
21.8k
            startinpos = s-starts;
4662
21.8k
            s++; /* consume '+' */
4663
21.8k
            if (s < e && *s == '-') { /* '+-' encodes '+' */
4664
1.63k
                s++;
4665
1.63k
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
4666
0
                    goto onError;
4667
1.63k
            }
4668
20.1k
            else if (s < e && !IS_BASE64(*s)) {
4669
2.78k
                s++;
4670
2.78k
                errmsg = "ill-formed sequence";
4671
2.78k
                goto utf7Error;
4672
2.78k
            }
4673
17.3k
            else { /* begin base64-encoded section */
4674
17.3k
                inShift = 1;
4675
17.3k
                surrogate = 0;
4676
17.3k
                shiftOutStart = writer.pos;
4677
17.3k
                base64bits = 0;
4678
17.3k
                base64buffer = 0;
4679
17.3k
            }
4680
21.8k
        }
4681
4.68M
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
4682
4.58M
            s++;
4683
4.58M
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
4684
0
                goto onError;
4685
4.58M
        }
4686
105k
        else {
4687
105k
            startinpos = s-starts;
4688
105k
            s++;
4689
105k
            errmsg = "unexpected special character";
4690
105k
            goto utf7Error;
4691
105k
        }
4692
4.81M
        continue;
4693
4.81M
utf7Error:
4694
115k
        endinpos = s-starts;
4695
115k
        if (unicode_decode_call_errorhandler_writer(
4696
115k
                errors, &errorHandler,
4697
115k
                "utf7", errmsg,
4698
115k
                &starts, &e, &startinpos, &endinpos, &exc, &s,
4699
115k
                &writer))
4700
9.12k
            goto onError;
4701
115k
    }
4702
4703
    /* end of string */
4704
4705
11.3k
    if (inShift && !consumed) { /* in shift sequence, no more to follow */
4706
        /* if we're in an inconsistent state, that's an error */
4707
3.56k
        inShift = 0;
4708
3.56k
        if (surrogate ||
4709
3.01k
                (base64bits >= 6) ||
4710
2.30k
                (base64bits > 0 && base64buffer != 0)) {
4711
2.30k
            endinpos = size;
4712
2.30k
            if (unicode_decode_call_errorhandler_writer(
4713
2.30k
                    errors, &errorHandler,
4714
2.30k
                    "utf7", "unterminated shift sequence",
4715
2.30k
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
4716
2.30k
                    &writer))
4717
1.90k
                goto onError;
4718
401
            if (s < e)
4719
0
                goto restart;
4720
401
        }
4721
3.56k
    }
4722
4723
    /* return state */
4724
9.41k
    if (consumed) {
4725
0
        if (inShift) {
4726
0
            *consumed = startinpos;
4727
0
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
4728
0
                PyObject *result = PyUnicode_FromKindAndData(
4729
0
                        writer.kind, writer.data, shiftOutStart);
4730
0
                Py_XDECREF(errorHandler);
4731
0
                Py_XDECREF(exc);
4732
0
                _PyUnicodeWriter_Dealloc(&writer);
4733
0
                return result;
4734
0
            }
4735
0
            writer.pos = shiftOutStart; /* back off output */
4736
0
        }
4737
0
        else {
4738
0
            *consumed = s-starts;
4739
0
        }
4740
0
    }
4741
4742
9.41k
    Py_XDECREF(errorHandler);
4743
9.41k
    Py_XDECREF(exc);
4744
9.41k
    return _PyUnicodeWriter_Finish(&writer);
4745
4746
11.0k
  onError:
4747
11.0k
    Py_XDECREF(errorHandler);
4748
11.0k
    Py_XDECREF(exc);
4749
11.0k
    _PyUnicodeWriter_Dealloc(&writer);
4750
11.0k
    return NULL;
4751
9.41k
}
4752
4753
4754
/* Encode the string object `str` as UTF-7 and return the result of
   PyBytesWriter_FinishWithPointer(), or NULL if the output buffer could
   not be created.  An empty string yields the empty-bytes constant.

   Characters for which ENCODE_DIRECT() holds are copied as-is, '+' is
   written as "+-", and everything else goes into a base-64 section as
   UTF-16 code units (non-BMP characters as a surrogate pair).

   `errors` is not referenced anywhere in this function. */
PyObject *
_PyUnicode_EncodeUTF7(PyObject *str,
                      const char *errors)
{
    Py_ssize_t len = PyUnicode_GET_LENGTH(str);
    if (len == 0) {
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
    }
    int kind = PyUnicode_KIND(str);
    const void *data = PyUnicode_DATA(str);

    /* The buffer is sized at 8 output bytes per input character; refuse
       lengths for which len * 8 would overflow.  It might be possible to
       tighten this worst case. */
    if (len > PY_SSIZE_T_MAX / 8) {
        return PyErr_NoMemory();
    }
    PyBytesWriter *writer = PyBytesWriter_Create(len * 8);
    if (writer == NULL) {
        return NULL;
    }

    char *out = PyBytesWriter_GetData(writer);
    int in_shift = 0;
    unsigned int nbits = 0;         /* bits in bitbuf not yet written */
    unsigned long bitbuf = 0;

    for (Py_ssize_t i = 0; i < len; ++i) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);

        if (!in_shift) {
            if (ch == '+') {
                *out++ = '+';
                *out++ = '-';
                continue;
            }
            if (ENCODE_DIRECT(ch)) {
                *out++ = (char) ch;
                continue;
            }
            /* open a base-64 section and encode ch below */
            *out++ = '+';
            in_shift = 1;
        }
        else if (ENCODE_DIRECT(ch)) {
            /* shifting out: flush whatever bits are still buffered */
            if (nbits) {
                *out++ = TO_BASE64(bitbuf << (6-nbits));
                bitbuf = 0;
                nbits = 0;
            }
            in_shift = 0;
            /* A character outside the base-64 set ends the section
               implicitly; an explicit '-' is needed only when ch would
               otherwise read as base-64 or is itself '-'. */
            if (IS_BASE64(ch) || ch == '-') {
                *out++ = '-';
            }
            *out++ = (char) ch;
            continue;
        }

        /* Base-64 encode ch, 16 bits per UTF-16 code unit. */
        if (ch >= 0x10000) {
            assert(ch <= MAX_UNICODE);

            /* high surrogate first */
            nbits += 16;
            bitbuf = (bitbuf << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
            while (nbits >= 6) {
                *out++ = TO_BASE64(bitbuf >> (nbits-6));
                nbits -= 6;
            }
            /* the low surrogate is emitted by the common code below */
            ch = Py_UNICODE_LOW_SURROGATE(ch);
        }
        nbits += 16;
        bitbuf = (bitbuf << 16) | ch;
        while (nbits >= 6) {
            *out++ = TO_BASE64(bitbuf >> (nbits-6));
            nbits -= 6;
        }
    }

    /* Flush trailing bits and close a section that is still open. */
    if (nbits) {
        *out++ = TO_BASE64(bitbuf << (6-nbits));
    }
    if (in_shift) {
        *out++ = '-';
    }
    return PyBytesWriter_FinishWithPointer(writer, out);
}
4843
4844
#undef IS_BASE64
4845
#undef FROM_BASE64
4846
#undef TO_BASE64
4847
#undef DECODE_DIRECT
4848
#undef ENCODE_DIRECT
4849
4850
/* --- UTF-8 Codec -------------------------------------------------------- */
4851
4852
PyObject *
4853
PyUnicode_DecodeUTF8(const char *s,
4854
                     Py_ssize_t size,
4855
                     const char *errors)
4856
74.0M
{
4857
74.0M
    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4858
74.0M
}
4859
4860
#include "stringlib/asciilib.h"
4861
#include "stringlib/codecs.h"
4862
#include "stringlib/undef.h"
4863
4864
#include "stringlib/ucs1lib.h"
4865
#include "stringlib/codecs.h"
4866
#include "stringlib/undef.h"
4867
4868
#include "stringlib/ucs2lib.h"
4869
#include "stringlib/codecs.h"
4870
#include "stringlib/undef.h"
4871
4872
#include "stringlib/ucs4lib.h"
4873
#include "stringlib/codecs.h"
4874
#include "stringlib/undef.h"
4875
4876
/* Word-sized bit patterns, one variant per supported size_t width:
     ASCII_CHAR_MASK - 0x80 in every byte; ANDing a size_t with it gives a
                       nonzero result iff some byte has its top bit set,
                       i.e. is not ASCII.
     VECTOR_0101     - 0x01 in every byte; used when counting code points
                       in a UTF-8 string.
     VECTOR_00FF     - 0xff in every other byte; used to fold the per-byte
                       counters of that count together. */
#if SIZEOF_SIZE_T == 8
# define ASCII_CHAR_MASK 0x8080808080808080ULL
# define VECTOR_0101     0x0101010101010101ULL
# define VECTOR_00FF     0x00ff00ff00ff00ffULL
#elif SIZEOF_SIZE_T == 4
# define ASCII_CHAR_MASK 0x80808080U
# define VECTOR_0101     0x01010101U
# define VECTOR_00FF     0x00ff00ffU
#else
# error C 'size_t' size should be either 4 or 8!
#endif
4890
4891
/* ctz(v): index of the lowest set bit of v ("count trailing zeros").
   HAVE_CTZ is 1 when an implementation is available for this compiler
   and 0 otherwise.  Every call visible in this file passes a value that
   was just tested to be nonzero. */
#if defined(__clang__) || defined(__GNUC__)
#define HAVE_CTZ 1
static inline unsigned int
ctz(size_t v)
{
    /* GCC/Clang: widen to unsigned long long and use the builtin. */
    const unsigned long long wide = (unsigned long long)v;
    return (unsigned int)__builtin_ctzll(wide);
}
#elif defined(_MSC_VER)
#define HAVE_CTZ 1
static inline unsigned int
ctz(size_t v)
{
    /* MSVC: pick the bit-scan intrinsic matching the width of size_t. */
    unsigned long index;
#if SIZEOF_SIZE_T == 4
    _BitScanForward(&index, v);
#else
    _BitScanForward64(&index, v);
#endif /* SIZEOF_SIZE_T */
    return index;
}
#else
#define HAVE_CTZ 0
#endif
4914
4915
#if HAVE_CTZ && PY_LITTLE_ENDIAN
/* Build a size_t from the bytes p[0]..p[size-1], reading one byte at a
   time so that there is neither an unaligned word access nor a read
   past p[size-1].  Bytes that are not loaded stay zero.  At most
   SIZEOF_SIZE_T bytes are read: any `size` without a case of its own is
   handled by `default`, which loads a full word. */
static size_t
load_unaligned(const unsigned char *p, size_t size)
{
    union {
        size_t value;
        unsigned char bytes[SIZEOF_SIZE_T];
    } word;
    word.value = 0;

    /* The cases deliberately fall through from the highest byte down to
       byte 0.  Filling the union byte by byte relies on little-endian
       layout; the original authors chose it because:
       * a union is faster than bitwise or and shift.
       * big endian machines are rare and hard to maintain. */
    switch (size) {
    default:
#if SIZEOF_SIZE_T == 8
    case 8:
        word.bytes[7] = p[7];
        _Py_FALLTHROUGH;
    case 7:
        word.bytes[6] = p[6];
        _Py_FALLTHROUGH;
    case 6:
        word.bytes[5] = p[5];
        _Py_FALLTHROUGH;
    case 5:
        word.bytes[4] = p[4];
        _Py_FALLTHROUGH;
#endif
    case 4:
        word.bytes[3] = p[3];
        _Py_FALLTHROUGH;
    case 3:
        word.bytes[2] = p[2];
        _Py_FALLTHROUGH;
    case 2:
        word.bytes[1] = p[1];
        _Py_FALLTHROUGH;
    case 1:
        word.bytes[0] = p[0];
        break;
    case 0:
        /* nothing to load */
        break;
    }
    return word.value;
}
#endif
4962
4963
/*
4964
 * Find the first non-ASCII character in a byte sequence.
4965
 *
4966
 * This function scans a range of bytes from `start` to `end` and returns the
4967
 * index of the first byte that is not an ASCII character (i.e., has the most
4968
 * significant bit set). If all characters in the range are ASCII, it returns
4969
 * `end - start`.
4970
 */
4971
static Py_ssize_t
4972
find_first_nonascii(const unsigned char *start, const unsigned char *end)
4973
66.0M
{
4974
    // The search is done in `size_t` chunks.
4975
    // The start and end might not be aligned at `size_t` boundaries,
4976
    // so they're handled specially.
4977
4978
66.0M
    const unsigned char *p = start;
4979
4980
66.0M
    if (end - start >= SIZEOF_SIZE_T) {
4981
        // Avoid unaligned read.
4982
26.0M
#if PY_LITTLE_ENDIAN && HAVE_CTZ
4983
26.0M
        size_t u;
4984
26.0M
        memcpy(&u, p, sizeof(size_t));
4985
26.0M
        u &= ASCII_CHAR_MASK;
4986
26.0M
        if (u) {
4987
6.95M
            return (ctz(u) - 7) / 8;
4988
6.95M
        }
4989
19.0M
        p = _Py_ALIGN_DOWN(p + SIZEOF_SIZE_T, SIZEOF_SIZE_T);
4990
#else /* PY_LITTLE_ENDIAN && HAVE_CTZ */
4991
        const unsigned char *p2 = _Py_ALIGN_UP(p, SIZEOF_SIZE_T);
4992
        while (p < p2) {
4993
            if (*p & 0x80) {
4994
                return p - start;
4995
            }
4996
            p++;
4997
        }
4998
#endif
4999
5000
19.0M
        const unsigned char *e = end - SIZEOF_SIZE_T;
5001
113M
        while (p <= e) {
5002
96.1M
            size_t u = (*(const size_t *)p) & ASCII_CHAR_MASK;
5003
96.1M
            if (u) {
5004
1.68M
#if PY_LITTLE_ENDIAN && HAVE_CTZ
5005
1.68M
                return p - start + (ctz(u) - 7) / 8;
5006
#else
5007
                // big endian and minor compilers are difficult to test.
5008
                // fallback to per byte check.
5009
                break;
5010
#endif
5011
1.68M
            }
5012
94.4M
            p += SIZEOF_SIZE_T;
5013
94.4M
        }
5014
19.0M
    }
5015
57.3M
#if PY_LITTLE_ENDIAN && HAVE_CTZ
5016
66.0M
    assert((end - p) < SIZEOF_SIZE_T);
5017
    // we can not use *(const size_t*)p to avoid buffer overrun.
5018
57.3M
    size_t u = load_unaligned(p, end - p) & ASCII_CHAR_MASK;
5019
57.3M
    if (u) {
5020
7.59M
        return p - start + (ctz(u) - 7) / 8;
5021
7.59M
    }
5022
49.7M
    return end - start;
5023
#else
5024
    while (p < end) {
5025
        if (*p & 0x80) {
5026
            break;
5027
        }
5028
        p++;
5029
    }
5030
    return p - start;
5031
#endif
5032
57.3M
}
5033
5034
/* Return 1 if byte `ch` can begin a UTF-8 sequence, i.e. it has the form
   0xxxxxxx or 11xxxxxx, and 0 for a continuation byte (10xxxxxx). */
static inline int
scalar_utf8_start_char(unsigned int ch)
{
    const unsigned int top_bit_clear = ~ch >> 7;    /* bit 0 set for 0xxxxxxx */
    const unsigned int second_bit_set = ch >> 6;    /* bit 0 set for x1xxxxxx */
    return (top_bit_clear | second_bit_set) & 1;
}
5040
5041
static inline size_t
5042
vector_utf8_start_chars(size_t v)
5043
58.2M
{
5044
58.2M
    return ((~v >> 7) | (v >> 6)) & VECTOR_0101;
5045
58.2M
}
5046
5047
5048
// Count the number of UTF-8 code points in a given byte sequence.
5049
static Py_ssize_t
5050
utf8_count_codepoints(const unsigned char *s, const unsigned char *end)
5051
358k
{
5052
358k
    Py_ssize_t len = 0;
5053
5054
358k
    if (end - s >= SIZEOF_SIZE_T) {
5055
298k
        while (!_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) {
5056
18.7k
            len += scalar_utf8_start_char(*s++);
5057
18.7k
        }
5058
5059
779k
        while (s + SIZEOF_SIZE_T <= end) {
5060
499k
            const unsigned char *e = end;
5061
499k
            if (e - s > SIZEOF_SIZE_T * 255) {
5062
220k
                e = s + SIZEOF_SIZE_T * 255;
5063
220k
            }
5064
499k
            Py_ssize_t vstart = 0;
5065
58.7M
            while (s + SIZEOF_SIZE_T <= e) {
5066
58.2M
                size_t v = *(size_t*)s;
5067
58.2M
                size_t vs = vector_utf8_start_chars(v);
5068
58.2M
                vstart += vs;
5069
58.2M
                s += SIZEOF_SIZE_T;
5070
58.2M
            }
5071
499k
            vstart = (vstart & VECTOR_00FF) + ((vstart >> 8) & VECTOR_00FF);
5072
499k
            vstart += vstart >> 16;
5073
499k
#if SIZEOF_SIZE_T == 8
5074
499k
            vstart += vstart >> 32;
5075
499k
#endif
5076
499k
            len += vstart & 0x7ff;
5077
499k
        }
5078
280k
    }
5079
1.23M
    while (s < end) {
5080
872k
        len += scalar_utf8_start_char(*s++);
5081
872k
    }
5082
358k
    return len;
5083
358k
}
5084
5085
static Py_ssize_t
5086
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
5087
15.7M
{
5088
15.7M
#if SIZEOF_SIZE_T <= SIZEOF_VOID_P
5089
15.7M
    if (_Py_IS_ALIGNED(start, ALIGNOF_SIZE_T)
5090
15.5M
        && _Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T))
5091
12.1M
    {
5092
        /* Fast path, see in STRINGLIB(utf8_decode) for
5093
           an explanation. */
5094
12.1M
        const char *p = start;
5095
12.1M
        Py_UCS1 *q = dest;
5096
15.6M
        while (p + SIZEOF_SIZE_T <= end) {
5097
6.03M
            size_t value = *(const size_t *) p;
5098
6.03M
            if (value & ASCII_CHAR_MASK)
5099
2.56M
                break;
5100
3.47M
            *((size_t *)q) = value;
5101
3.47M
            p += SIZEOF_SIZE_T;
5102
3.47M
            q += SIZEOF_SIZE_T;
5103
3.47M
        }
5104
58.4M
        while (p < end) {
5105
48.9M
            if ((unsigned char)*p & 0x80)
5106
2.58M
                break;
5107
46.3M
            *q++ = *p++;
5108
46.3M
        }
5109
12.1M
        return p - start;
5110
12.1M
    }
5111
3.60M
#endif
5112
3.60M
    Py_ssize_t pos = find_first_nonascii((const unsigned char*)start,
5113
3.60M
                                         (const unsigned char*)end);
5114
3.60M
    memcpy(dest, start, pos);
5115
3.60M
    return pos;
5116
15.7M
}
5117
5118
static int
5119
unicode_decode_utf8_impl(_PyUnicodeWriter *writer,
5120
                         const char *starts, const char *s, const char *end,
5121
                         _Py_error_handler error_handler,
5122
                         const char *errors,
5123
                         Py_ssize_t *consumed)
5124
16.2M
{
5125
16.2M
    Py_ssize_t startinpos, endinpos;
5126
16.2M
    const char *errmsg = "";
5127
16.2M
    PyObject *error_handler_obj = NULL;
5128
16.2M
    PyObject *exc = NULL;
5129
5130
343M
    while (s < end) {
5131
337M
        Py_UCS4 ch;
5132
337M
        int kind = writer->kind;
5133
5134
337M
        if (kind == PyUnicode_1BYTE_KIND) {
5135
16.6M
            if (PyUnicode_IS_ASCII(writer->buffer))
5136
15.8M
                ch = asciilib_utf8_decode(&s, end, writer->data, &writer->pos);
5137
774k
            else
5138
774k
                ch = ucs1lib_utf8_decode(&s, end, writer->data, &writer->pos);
5139
320M
        } else if (kind == PyUnicode_2BYTE_KIND) {
5140
119M
            ch = ucs2lib_utf8_decode(&s, end, writer->data, &writer->pos);
5141
200M
        } else {
5142
200M
            assert(kind == PyUnicode_4BYTE_KIND);
5143
200M
            ch = ucs4lib_utf8_decode(&s, end, writer->data, &writer->pos);
5144
200M
        }
5145
5146
337M
        switch (ch) {
5147
10.2M
        case 0:
5148
10.2M
            if (s == end || consumed)
5149
10.2M
                goto End;
5150
26.2k
            errmsg = "unexpected end of data";
5151
26.2k
            startinpos = s - starts;
5152
26.2k
            endinpos = end - starts;
5153
26.2k
            break;
5154
233M
        case 1:
5155
233M
            errmsg = "invalid start byte";
5156
233M
            startinpos = s - starts;
5157
233M
            endinpos = startinpos + 1;
5158
233M
            break;
5159
75.7M
        case 2:
5160
75.7M
            if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5161
0
                && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5162
0
            {
5163
                /* Truncated surrogate code in range D800-DFFF */
5164
0
                goto End;
5165
0
            }
5166
75.7M
            _Py_FALLTHROUGH;
5167
77.1M
        case 3:
5168
77.3M
        case 4:
5169
77.3M
            errmsg = "invalid continuation byte";
5170
77.3M
            startinpos = s - starts;
5171
77.3M
            endinpos = startinpos + ch - 1;
5172
77.3M
            break;
5173
15.9M
        default:
5174
            // ch doesn't fit into kind, so change the buffer kind to write
5175
            // the character
5176
15.9M
            if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
5177
0
                goto onError;
5178
15.9M
            continue;
5179
337M
        }
5180
5181
311M
        if (error_handler == _Py_ERROR_UNKNOWN)
5182
171k
            error_handler = _Py_GetErrorHandler(errors);
5183
5184
311M
        switch (error_handler) {
5185
0
        case _Py_ERROR_IGNORE:
5186
0
            s += (endinpos - startinpos);
5187
0
            break;
5188
5189
307M
        case _Py_ERROR_REPLACE:
5190
307M
            if (_PyUnicodeWriter_WriteCharInline(writer, 0xfffd) < 0)
5191
0
                goto onError;
5192
307M
            s += (endinpos - startinpos);
5193
307M
            break;
5194
5195
3.21M
        case _Py_ERROR_SURROGATEESCAPE:
5196
3.21M
        {
5197
3.21M
            Py_ssize_t i;
5198
5199
3.21M
            if (_PyUnicodeWriter_PrepareKind(writer, PyUnicode_2BYTE_KIND) < 0)
5200
0
                goto onError;
5201
6.42M
            for (i=startinpos; i<endinpos; i++) {
5202
3.21M
                ch = (Py_UCS4)(unsigned char)(starts[i]);
5203
3.21M
                PyUnicode_WRITE(writer->kind, writer->data, writer->pos,
5204
3.21M
                                ch + 0xdc00);
5205
3.21M
                writer->pos++;
5206
3.21M
            }
5207
3.21M
            s += (endinpos - startinpos);
5208
3.21M
            break;
5209
3.21M
        }
5210
5211
1.36k
        default:
5212
1.36k
            if (unicode_decode_call_errorhandler_writer(
5213
1.36k
                    errors, &error_handler_obj,
5214
1.36k
                    "utf-8", errmsg,
5215
1.36k
                    &starts, &end, &startinpos, &endinpos, &exc, &s,
5216
1.36k
                    writer)) {
5217
1.36k
                goto onError;
5218
1.36k
            }
5219
5220
0
            if (_PyUnicodeWriter_Prepare(writer, end - s, 127) < 0) {
5221
0
                goto onError;
5222
0
            }
5223
311M
        }
5224
311M
    }
5225
5226
16.2M
End:
5227
16.2M
    if (consumed)
5228
696
        *consumed = s - starts;
5229
5230
16.2M
    Py_XDECREF(error_handler_obj);
5231
16.2M
    Py_XDECREF(exc);
5232
16.2M
    return 0;
5233
5234
1.36k
onError:
5235
1.36k
    Py_XDECREF(error_handler_obj);
5236
1.36k
    Py_XDECREF(exc);
5237
1.36k
    return -1;
5238
16.2M
}
5239
5240
5241
static PyObject *
5242
unicode_decode_utf8(const char *s, Py_ssize_t size,
5243
                    _Py_error_handler error_handler, const char *errors,
5244
                    Py_ssize_t *consumed)
5245
99.1M
{
5246
99.1M
    if (size == 0) {
5247
3.54M
        if (consumed) {
5248
0
            *consumed = 0;
5249
0
        }
5250
3.54M
        _Py_RETURN_UNICODE_EMPTY();
5251
3.54M
    }
5252
5253
    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
5254
95.6M
    if (size == 1 && (unsigned char)s[0] < 128) {
5255
33.2M
        if (consumed) {
5256
0
            *consumed = 1;
5257
0
        }
5258
33.2M
        return get_latin1_char((unsigned char)s[0]);
5259
33.2M
    }
5260
5261
    // I don't know whether this check is necessary or not. But there is a test
5262
    // case that requires size=PY_SSIZE_T_MAX to cause MemoryError.
5263
62.3M
    if (PY_SSIZE_T_MAX - sizeof(PyCompactUnicodeObject) < (size_t)size) {
5264
0
        PyErr_NoMemory();
5265
0
        return NULL;
5266
0
    }
5267
5268
62.3M
    const char *starts = s;
5269
62.3M
    const char *end = s + size;
5270
5271
62.3M
    Py_ssize_t pos = find_first_nonascii((const unsigned char*)starts, (const unsigned char*)end);
5272
62.3M
    if (pos == size) {  // fast path: ASCII string.
5273
46.2M
        PyObject *u = PyUnicode_New(size, 127);
5274
46.2M
        if (u == NULL) {
5275
0
            return NULL;
5276
0
        }
5277
46.2M
        memcpy(PyUnicode_1BYTE_DATA(u), s, size);
5278
46.2M
        if (consumed) {
5279
102
            *consumed = size;
5280
102
        }
5281
46.2M
        return u;
5282
46.2M
    }
5283
5284
16.1M
    int maxchr = 127;
5285
16.1M
    Py_ssize_t maxsize = size;
5286
5287
16.1M
    unsigned char ch = (unsigned char)(s[pos]);
5288
    // error handler other than strict may remove/replace the invalid byte.
5289
    // consumed != NULL allows 1~3 remaining bytes.
5290
    // 0x80 <= ch < 0xc2 is an invalid start byte that causes UnicodeDecodeError.
5291
    // otherwise: check the input and decide the maxchr and maxsize to reduce
5292
    // reallocation and copy.
5293
16.1M
    if (error_handler == _Py_ERROR_STRICT && !consumed && ch >= 0xc2) {
5294
        // we only calculate the number of codepoints and don't determine the exact maxchr.
5295
        // This is because writing fast and portable SIMD code to find maxchr is difficult.
5296
        // If reallocation occurs for a larger maxchar, knowing the exact number of codepoints
5297
        // means that it is no longer necessary to allocate several times the required amount
5298
        // of memory.
5299
358k
        maxsize = utf8_count_codepoints((const unsigned char *)s, (const unsigned char *)end);
5300
358k
        if (ch < 0xc4) { // latin1
5301
238k
            maxchr = 0xff;
5302
238k
        }
5303
120k
        else if (ch < 0xf0) { // ucs2
5304
108k
            maxchr = 0xffff;
5305
108k
        }
5306
12.8k
        else { // ucs4
5307
12.8k
            maxchr = 0x10ffff;
5308
12.8k
        }
5309
358k
    }
5310
16.1M
    PyObject *u = PyUnicode_New(maxsize, maxchr);
5311
16.1M
    if (!u) {
5312
0
        return NULL;
5313
0
    }
5314
5315
    // Use _PyUnicodeWriter after the fast path has failed.
5316
16.1M
    _PyUnicodeWriter writer;
5317
16.1M
    _PyUnicodeWriter_InitWithBuffer(&writer, u);
5318
16.1M
    if (maxchr <= 255) {
5319
16.0M
        memcpy(PyUnicode_1BYTE_DATA(u), s, pos);
5320
16.0M
        s += pos;
5321
16.0M
        writer.pos = pos;
5322
16.0M
    }
5323
5324
16.1M
    if (unicode_decode_utf8_impl(&writer, starts, s, end,
5325
16.1M
                                 error_handler, errors,
5326
16.1M
                                 consumed) < 0) {
5327
1.36k
        _PyUnicodeWriter_Dealloc(&writer);
5328
1.36k
        return NULL;
5329
1.36k
    }
5330
16.1M
    return _PyUnicodeWriter_Finish(&writer);
5331
16.1M
}
5332
5333
5334
// Used by PyUnicodeWriter_WriteUTF8() implementation
5335
int
5336
_PyUnicode_DecodeUTF8Writer(_PyUnicodeWriter *writer,
5337
                            const char *s, Py_ssize_t size,
5338
                            _Py_error_handler error_handler, const char *errors,
5339
                            Py_ssize_t *consumed)
5340
3.62M
{
5341
3.62M
    if (size == 0) {
5342
6.21k
        if (consumed) {
5343
0
            *consumed = 0;
5344
0
        }
5345
6.21k
        return 0;
5346
6.21k
    }
5347
5348
    // fast path: try ASCII string.
5349
3.61M
    if (_PyUnicodeWriter_Prepare(writer, size, 127) < 0) {
5350
0
        return -1;
5351
0
    }
5352
5353
3.61M
    const char *starts = s;
5354
3.61M
    const char *end = s + size;
5355
3.61M
    Py_ssize_t decoded = 0;
5356
3.61M
    Py_UCS1 *dest = (Py_UCS1*)writer->data + writer->pos * writer->kind;
5357
3.61M
    if (writer->kind == PyUnicode_1BYTE_KIND) {
5358
3.61M
        decoded = ascii_decode(s, end, dest);
5359
3.61M
        writer->pos += decoded;
5360
5361
3.61M
        if (decoded == size) {
5362
3.56M
            if (consumed) {
5363
1.14k
                *consumed = size;
5364
1.14k
            }
5365
3.56M
            return 0;
5366
3.56M
        }
5367
42.8k
        s += decoded;
5368
42.8k
    }
5369
5370
44.8k
    return unicode_decode_utf8_impl(writer, starts, s, end,
5371
44.8k
                                    error_handler, errors, consumed);
5372
3.61M
}
5373
5374
5375
PyObject *
5376
PyUnicode_DecodeUTF8Stateful(const char *s,
5377
                             Py_ssize_t size,
5378
                             const char *errors,
5379
                             Py_ssize_t *consumed)
5380
98.9M
{
5381
98.9M
    return unicode_decode_utf8(s, size,
5382
98.9M
                               errors ? _Py_ERROR_UNKNOWN : _Py_ERROR_STRICT,
5383
98.9M
                               errors, consumed);
5384
98.9M
}
5385
5386
5387
/* UTF-8 decoder: 'errors' must be _Py_ERROR_STRICT, _Py_ERROR_SURROGATEESCAPE
5388
   or _Py_ERROR_SURROGATEPASS; any other error handler makes it return -3.
5389
5390
   On success, write a pointer to a newly allocated wide character string into
5391
   *wstr (use PyMem_RawFree() to free the memory) and write the output length
5392
   (in number of wchar_t units) into *wlen (if wlen is set).
5393
5394
   On memory allocation failure, return -1.
5395
5396
   On decoding error (when the surrogateescape handler is not used), return -2. If wlen is
5397
   non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5398
   is not NULL, write the decoding error message into *reason. */
5399
int
5400
_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
5401
                 const char **reason, _Py_error_handler errors)
5402
11.8k
{
5403
11.8k
    const char *orig_s = s;
5404
11.8k
    const char *e;
5405
11.8k
    wchar_t *unicode;
5406
11.8k
    Py_ssize_t outpos;
5407
5408
11.8k
    int surrogateescape = 0;
5409
11.8k
    int surrogatepass = 0;
5410
11.8k
    switch (errors)
5411
11.8k
    {
5412
0
    case _Py_ERROR_STRICT:
5413
0
        break;
5414
11.8k
    case _Py_ERROR_SURROGATEESCAPE:
5415
11.8k
        surrogateescape = 1;
5416
11.8k
        break;
5417
0
    case _Py_ERROR_SURROGATEPASS:
5418
0
        surrogatepass = 1;
5419
0
        break;
5420
0
    default:
5421
0
        return -3;
5422
11.8k
    }
5423
5424
    /* Note: size will always be longer than the resulting Unicode
5425
       character count */
5426
11.8k
    if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1 < size) {
5427
0
        return -1;
5428
0
    }
5429
5430
11.8k
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
5431
11.8k
    if (!unicode) {
5432
0
        return -1;
5433
0
    }
5434
5435
    /* Unpack UTF-8 encoded data */
5436
11.8k
    e = s + size;
5437
11.8k
    outpos = 0;
5438
11.8k
    while (s < e) {
5439
11.8k
        Py_UCS4 ch;
5440
11.8k
#if SIZEOF_WCHAR_T == 4
5441
11.8k
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
5442
#else
5443
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
5444
#endif
5445
11.8k
        if (ch > 0xFF) {
5446
0
#if SIZEOF_WCHAR_T == 4
5447
0
            Py_UNREACHABLE();
5448
#else
5449
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
5450
            /* write a surrogate pair */
5451
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5452
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5453
#endif
5454
0
        }
5455
11.8k
        else {
5456
11.8k
            if (!ch && s == e) {
5457
11.8k
                break;
5458
11.8k
            }
5459
5460
0
            if (surrogateescape) {
5461
0
                unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5462
0
            }
5463
0
            else {
5464
                /* Is it a valid three-byte code? */
5465
0
                if (surrogatepass
5466
0
                    && (e - s) >= 3
5467
0
                    && (s[0] & 0xf0) == 0xe0
5468
0
                    && (s[1] & 0xc0) == 0x80
5469
0
                    && (s[2] & 0xc0) == 0x80)
5470
0
                {
5471
0
                    ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5472
0
                    s += 3;
5473
0
                    unicode[outpos++] = ch;
5474
0
                }
5475
0
                else {
5476
0
                    PyMem_RawFree(unicode );
5477
0
                    if (reason != NULL) {
5478
0
                        switch (ch) {
5479
0
                        case 0:
5480
0
                            *reason = "unexpected end of data";
5481
0
                            break;
5482
0
                        case 1:
5483
0
                            *reason = "invalid start byte";
5484
0
                            break;
5485
                        /* 2, 3, 4 */
5486
0
                        default:
5487
0
                            *reason = "invalid continuation byte";
5488
0
                            break;
5489
0
                        }
5490
0
                    }
5491
0
                    if (wlen != NULL) {
5492
0
                        *wlen = s - orig_s;
5493
0
                    }
5494
0
                    return -2;
5495
0
                }
5496
0
            }
5497
0
        }
5498
11.8k
    }
5499
11.8k
    unicode[outpos] = L'\0';
5500
11.8k
    if (wlen) {
5501
11.8k
        *wlen = outpos;
5502
11.8k
    }
5503
11.8k
    *wstr = unicode;
5504
11.8k
    return 0;
5505
11.8k
}
5506
5507
5508
wchar_t*
5509
_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5510
                               size_t *wlen)
5511
0
{
5512
0
    wchar_t *wstr;
5513
0
    int res = _Py_DecodeUTF8Ex(arg, arglen,
5514
0
                               &wstr, wlen,
5515
0
                               NULL, _Py_ERROR_SURROGATEESCAPE);
5516
0
    if (res != 0) {
5517
        /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5518
0
        assert(res != -3);
5519
0
        if (wlen) {
5520
0
            *wlen = (size_t)res;
5521
0
        }
5522
0
        return NULL;
5523
0
    }
5524
0
    return wstr;
5525
0
}
5526
5527
5528
/* UTF-8 encoder.
5529
5530
   On success, return 0 and write the newly allocated character string (use
5531
   PyMem_Free() to free the memory) into *str.
5532
5533
   On encoding failure, return -2 and write the position of the invalid
5534
   surrogate character into *error_pos (if error_pos is set) and the encoding
5535
   error message into *reason (if reason is set).
5536
5537
   On memory allocation failure, return -1. */
5538
int
5539
_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
5540
                 const char **reason, int raw_malloc, _Py_error_handler errors)
5541
1.51k
{
5542
1.51k
    const Py_ssize_t max_char_size = 4;
5543
1.51k
    Py_ssize_t len = wcslen(text);
5544
5545
1.51k
    assert(len >= 0);
5546
5547
1.51k
    int surrogateescape = 0;
5548
1.51k
    int surrogatepass = 0;
5549
1.51k
    switch (errors)
5550
1.51k
    {
5551
144
    case _Py_ERROR_STRICT:
5552
144
        break;
5553
1.36k
    case _Py_ERROR_SURROGATEESCAPE:
5554
1.36k
        surrogateescape = 1;
5555
1.36k
        break;
5556
0
    case _Py_ERROR_SURROGATEPASS:
5557
0
        surrogatepass = 1;
5558
0
        break;
5559
0
    default:
5560
0
        return -3;
5561
1.51k
    }
5562
5563
1.51k
    if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5564
0
        return -1;
5565
0
    }
5566
1.51k
    char *bytes;
5567
1.51k
    if (raw_malloc) {
5568
1.51k
        bytes = PyMem_RawMalloc((len + 1) * max_char_size);
5569
1.51k
    }
5570
0
    else {
5571
0
        bytes = PyMem_Malloc((len + 1) * max_char_size);
5572
0
    }
5573
1.51k
    if (bytes == NULL) {
5574
0
        return -1;
5575
0
    }
5576
5577
1.51k
    char *p = bytes;
5578
1.51k
    Py_ssize_t i;
5579
99.6k
    for (i = 0; i < len; ) {
5580
98.1k
        Py_ssize_t ch_pos = i;
5581
98.1k
        Py_UCS4 ch = text[i];
5582
98.1k
        i++;
5583
98.1k
        if (sizeof(wchar_t) == 2
5584
0
            && Py_UNICODE_IS_HIGH_SURROGATE(ch)
5585
0
            && i < len
5586
0
            && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5587
0
        {
5588
0
            ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5589
0
            i++;
5590
0
        }
5591
5592
98.1k
        if (ch < 0x80) {
5593
            /* Encode ASCII */
5594
98.1k
            *p++ = (char) ch;
5595
5596
98.1k
        }
5597
0
        else if (ch < 0x0800) {
5598
            /* Encode as a two-byte sequence (U+0080..U+07FF) */
5599
0
            *p++ = (char)(0xc0 | (ch >> 6));
5600
0
            *p++ = (char)(0x80 | (ch & 0x3f));
5601
0
        }
5602
0
        else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
5603
            /* surrogateescape error handler */
5604
0
            if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
5605
0
                if (error_pos != NULL) {
5606
0
                    *error_pos = (size_t)ch_pos;
5607
0
                }
5608
0
                if (reason != NULL) {
5609
0
                    *reason = "encoding error";
5610
0
                }
5611
0
                if (raw_malloc) {
5612
0
                    PyMem_RawFree(bytes);
5613
0
                }
5614
0
                else {
5615
0
                    PyMem_Free(bytes);
5616
0
                }
5617
0
                return -2;
5618
0
            }
5619
0
            *p++ = (char)(ch & 0xff);
5620
0
        }
5621
0
        else if (ch < 0x10000) {
5622
0
            *p++ = (char)(0xe0 | (ch >> 12));
5623
0
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5624
0
            *p++ = (char)(0x80 | (ch & 0x3f));
5625
0
        }
5626
0
        else {  /* ch >= 0x10000 */
5627
0
            assert(ch <= MAX_UNICODE);
5628
            /* Encode UCS4 Unicode ordinals */
5629
0
            *p++ = (char)(0xf0 | (ch >> 18));
5630
0
            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5631
0
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5632
0
            *p++ = (char)(0x80 | (ch & 0x3f));
5633
0
        }
5634
98.1k
    }
5635
1.51k
    *p++ = '\0';
5636
5637
1.51k
    size_t final_size = (p - bytes);
5638
1.51k
    char *bytes2;
5639
1.51k
    if (raw_malloc) {
5640
1.51k
        bytes2 = PyMem_RawRealloc(bytes, final_size);
5641
1.51k
    }
5642
0
    else {
5643
0
        bytes2 = PyMem_Realloc(bytes, final_size);
5644
0
    }
5645
1.51k
    if (bytes2 == NULL) {
5646
0
        if (error_pos != NULL) {
5647
0
            *error_pos = (size_t)-1;
5648
0
        }
5649
0
        if (raw_malloc) {
5650
0
            PyMem_RawFree(bytes);
5651
0
        }
5652
0
        else {
5653
0
            PyMem_Free(bytes);
5654
0
        }
5655
0
        return -1;
5656
0
    }
5657
1.51k
    *str = bytes2;
5658
1.51k
    return 0;
5659
1.51k
}
5660
5661
5662
/* Primary internal function which creates utf8 encoded bytes objects.
5663
5664
   Allocation strategy:  if the string is short, convert into a stack buffer
5665
   and allocate exactly as much space needed at the end.  Else allocate the
5666
   maximum possible needed (4 result bytes per Unicode character), and return
5667
   the excess memory at the end.
5668
*/
5669
static PyObject *
5670
unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5671
                    const char *errors)
5672
19.4M
{
5673
19.4M
    if (!PyUnicode_Check(unicode)) {
5674
0
        PyErr_BadArgument();
5675
0
        return NULL;
5676
0
    }
5677
5678
19.4M
    if (PyUnicode_UTF8(unicode))
5679
9.96M
        return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5680
9.96M
                                         PyUnicode_UTF8_LENGTH(unicode));
5681
5682
9.53M
    int kind = PyUnicode_KIND(unicode);
5683
9.53M
    const void *data = PyUnicode_DATA(unicode);
5684
9.53M
    Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5685
5686
9.53M
    PyBytesWriter *writer;
5687
9.53M
    char *end;
5688
5689
9.53M
    switch (kind) {
5690
0
    default:
5691
0
        Py_UNREACHABLE();
5692
6.31M
    case PyUnicode_1BYTE_KIND:
5693
        /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5694
6.31M
        assert(!PyUnicode_IS_ASCII(unicode));
5695
6.31M
        writer = ucs1lib_utf8_encoder(unicode, data, size,
5696
6.31M
                                      error_handler, errors, &end);
5697
6.31M
        break;
5698
2.02M
    case PyUnicode_2BYTE_KIND:
5699
2.02M
        writer = ucs2lib_utf8_encoder(unicode, data, size,
5700
2.02M
                                      error_handler, errors, &end);
5701
2.02M
        break;
5702
1.19M
    case PyUnicode_4BYTE_KIND:
5703
1.19M
        writer = ucs4lib_utf8_encoder(unicode, data, size,
5704
1.19M
                                      error_handler, errors, &end);
5705
1.19M
        break;
5706
9.53M
    }
5707
5708
9.53M
    if (writer == NULL) {
5709
151k
        PyBytesWriter_Discard(writer);
5710
151k
        return NULL;
5711
151k
    }
5712
9.38M
    return PyBytesWriter_FinishWithPointer(writer, end);
5713
9.53M
}
5714
5715
static int
5716
unicode_fill_utf8(PyObject *unicode)
5717
159k
{
5718
159k
    _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(unicode);
5719
    /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5720
159k
    assert(!PyUnicode_IS_ASCII(unicode));
5721
5722
159k
    int kind = PyUnicode_KIND(unicode);
5723
159k
    const void *data = PyUnicode_DATA(unicode);
5724
159k
    Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5725
5726
159k
    PyBytesWriter *writer;
5727
159k
    char *end;
5728
5729
159k
    switch (kind) {
5730
0
    default:
5731
0
        Py_UNREACHABLE();
5732
121k
    case PyUnicode_1BYTE_KIND:
5733
121k
        writer = ucs1lib_utf8_encoder(unicode, data, size,
5734
121k
                                      _Py_ERROR_STRICT, NULL, &end);
5735
121k
        break;
5736
31.9k
    case PyUnicode_2BYTE_KIND:
5737
31.9k
        writer = ucs2lib_utf8_encoder(unicode, data, size,
5738
31.9k
                                      _Py_ERROR_STRICT, NULL, &end);
5739
31.9k
        break;
5740
6.70k
    case PyUnicode_4BYTE_KIND:
5741
6.70k
        writer = ucs4lib_utf8_encoder(unicode, data, size,
5742
6.70k
                                      _Py_ERROR_STRICT, NULL, &end);
5743
6.70k
        break;
5744
159k
    }
5745
159k
    if (writer == NULL) {
5746
207
        return -1;
5747
207
    }
5748
5749
159k
    const char *start = PyBytesWriter_GetData(writer);
5750
159k
    Py_ssize_t len = end - start;
5751
5752
159k
    char *cache = PyMem_Malloc(len + 1);
5753
159k
    if (cache == NULL) {
5754
0
        PyBytesWriter_Discard(writer);
5755
0
        PyErr_NoMemory();
5756
0
        return -1;
5757
0
    }
5758
159k
    memcpy(cache, start, len);
5759
159k
    cache[len] = '\0';
5760
159k
    PyUnicode_SET_UTF8_LENGTH(unicode, len);
5761
159k
    PyUnicode_SET_UTF8(unicode, cache);
5762
159k
    PyBytesWriter_Discard(writer);
5763
159k
    return 0;
5764
159k
}
5765
5766
PyObject *
5767
_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5768
18.1M
{
5769
18.1M
    return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5770
18.1M
}
5771
5772
5773
PyObject *
5774
PyUnicode_AsUTF8String(PyObject *unicode)
5775
2.40k
{
5776
2.40k
    return _PyUnicode_AsUTF8String(unicode, NULL);
5777
2.40k
}
5778
5779
/* --- UTF-32 Codec ------------------------------------------------------- */
5780
5781
PyObject *
5782
PyUnicode_DecodeUTF32(const char *s,
5783
                      Py_ssize_t size,
5784
                      const char *errors,
5785
                      int *byteorder)
5786
83
{
5787
83
    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5788
83
}
5789
5790
PyObject *
5791
PyUnicode_DecodeUTF32Stateful(const char *s,
5792
                              Py_ssize_t size,
5793
                              const char *errors,
5794
                              int *byteorder,
5795
                              Py_ssize_t *consumed)
5796
40.9k
{
5797
40.9k
    const char *starts = s;
5798
40.9k
    Py_ssize_t startinpos;
5799
40.9k
    Py_ssize_t endinpos;
5800
40.9k
    _PyUnicodeWriter writer;
5801
40.9k
    const unsigned char *q, *e;
5802
40.9k
    int le, bo = 0;       /* assume native ordering by default */
5803
40.9k
    const char *encoding;
5804
40.9k
    const char *errmsg = "";
5805
40.9k
    PyObject *errorHandler = NULL;
5806
40.9k
    PyObject *exc = NULL;
5807
5808
40.9k
    q = (const unsigned char *)s;
5809
40.9k
    e = q + size;
5810
5811
40.9k
    if (byteorder)
5812
40.8k
        bo = *byteorder;
5813
5814
    /* Check for BOM marks (U+FEFF) in the input and adjust current
5815
       byte order setting accordingly. In native mode, the leading BOM
5816
       mark is skipped, in all other modes, it is copied to the output
5817
       stream as-is (giving a ZWNBSP character). */
5818
40.9k
    if (bo == 0 && size >= 4) {
5819
38.7k
        Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5820
38.7k
        if (bom == 0x0000FEFF) {
5821
194
            bo = -1;
5822
194
            q += 4;
5823
194
        }
5824
38.5k
        else if (bom == 0xFFFE0000) {
5825
475
            bo = 1;
5826
475
            q += 4;
5827
475
        }
5828
38.7k
        if (byteorder)
5829
38.6k
            *byteorder = bo;
5830
38.7k
    }
5831
5832
40.9k
    if (q == e) {
5833
100
        if (consumed)
5834
0
            *consumed = size;
5835
100
        _Py_RETURN_UNICODE_EMPTY();
5836
100
    }
5837
5838
#ifdef WORDS_BIGENDIAN
5839
    le = bo < 0;
5840
#else
5841
40.8k
    le = bo <= 0;
5842
40.8k
#endif
5843
40.8k
    encoding = le ? "utf-32-le" : "utf-32-be";
5844
5845
40.8k
    _PyUnicodeWriter_Init(&writer);
5846
40.8k
    writer.min_length = (e - q + 3) / 4;
5847
40.8k
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
5848
0
        goto onError;
5849
5850
123k
    while (1) {
5851
123k
        Py_UCS4 ch = 0;
5852
123k
        Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
5853
5854
123k
        if (e - q >= 4) {
5855
95.0k
            int kind = writer.kind;
5856
95.0k
            void *data = writer.data;
5857
95.0k
            const unsigned char *last = e - 4;
5858
95.0k
            Py_ssize_t pos = writer.pos;
5859
95.0k
            if (le) {
5860
2.14M
                do {
5861
2.14M
                    ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5862
2.14M
                    if (ch > maxch)
5863
89.2k
                        break;
5864
2.06M
                    if (kind != PyUnicode_1BYTE_KIND &&
5865
2.03M
                        Py_UNICODE_IS_SURROGATE(ch))
5866
135
                        break;
5867
2.06M
                    PyUnicode_WRITE(kind, data, pos++, ch);
5868
2.06M
                    q += 4;
5869
2.06M
                } while (q <= last);
5870
90.5k
            }
5871
4.48k
            else {
5872
7.12k
                do {
5873
7.12k
                    ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5874
7.12k
                    if (ch > maxch)
5875
4.04k
                        break;
5876
3.08k
                    if (kind != PyUnicode_1BYTE_KIND &&
5877
2.79k
                        Py_UNICODE_IS_SURROGATE(ch))
5878
130
                        break;
5879
2.95k
                    PyUnicode_WRITE(kind, data, pos++, ch);
5880
2.95k
                    q += 4;
5881
2.95k
                } while (q <= last);
5882
4.48k
            }
5883
95.0k
            writer.pos = pos;
5884
95.0k
        }
5885
5886
123k
        if (Py_UNICODE_IS_SURROGATE(ch)) {
5887
267
            errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
5888
267
            startinpos = ((const char *)q) - starts;
5889
267
            endinpos = startinpos + 4;
5890
267
        }
5891
123k
        else if (ch <= maxch) {
5892
30.0k
            if (q == e || consumed)
5893
6.32k
                break;
5894
            /* remaining bytes at the end? (size should be divisible by 4) */
5895
23.7k
            errmsg = "truncated data";
5896
23.7k
            startinpos = ((const char *)q) - starts;
5897
23.7k
            endinpos = ((const char *)e) - starts;
5898
23.7k
        }
5899
93.2k
        else {
5900
93.2k
            if (ch < 0x110000) {
5901
5.26k
                if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5902
0
                    goto onError;
5903
5.26k
                q += 4;
5904
5.26k
                continue;
5905
5.26k
            }
5906
88.0k
            errmsg = "code point not in range(0x110000)";
5907
88.0k
            startinpos = ((const char *)q) - starts;
5908
88.0k
            endinpos = startinpos + 4;
5909
88.0k
        }
5910
5911
        /* The remaining input chars are ignored if the callback
5912
           chooses to skip the input */
5913
112k
        if (unicode_decode_call_errorhandler_writer(
5914
112k
                errors, &errorHandler,
5915
112k
                encoding, errmsg,
5916
112k
                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
5917
112k
                &writer))
5918
34.4k
            goto onError;
5919
112k
    }
5920
5921
6.32k
    if (consumed)
5922
0
        *consumed = (const char *)q-starts;
5923
5924
6.32k
    Py_XDECREF(errorHandler);
5925
6.32k
    Py_XDECREF(exc);
5926
6.32k
    return _PyUnicodeWriter_Finish(&writer);
5927
5928
34.4k
  onError:
5929
34.4k
    _PyUnicodeWriter_Dealloc(&writer);
5930
34.4k
    Py_XDECREF(errorHandler);
5931
34.4k
    Py_XDECREF(exc);
5932
34.4k
    return NULL;
5933
40.8k
}
5934
5935
PyObject *
5936
_PyUnicode_EncodeUTF32(PyObject *str,
5937
                       const char *errors,
5938
                       int byteorder)
5939
0
{
5940
0
    if (!PyUnicode_Check(str)) {
5941
0
        PyErr_BadArgument();
5942
0
        return NULL;
5943
0
    }
5944
0
    int kind = PyUnicode_KIND(str);
5945
0
    const void *data = PyUnicode_DATA(str);
5946
0
    Py_ssize_t len = PyUnicode_GET_LENGTH(str);
5947
5948
0
    if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
5949
0
        return PyErr_NoMemory();
5950
0
    Py_ssize_t nsize = len + (byteorder == 0);
5951
5952
0
#if PY_LITTLE_ENDIAN
5953
0
    int native_ordering = byteorder <= 0;
5954
#else
5955
    int native_ordering = byteorder >= 0;
5956
#endif
5957
5958
0
    if (kind == PyUnicode_1BYTE_KIND) {
5959
        // gh-139156: Don't use PyBytesWriter API here since it has an overhead
5960
        // on short strings
5961
0
        PyObject *v = PyBytes_FromStringAndSize(NULL, nsize * 4);
5962
0
        if (v == NULL) {
5963
0
            return NULL;
5964
0
        }
5965
5966
        /* output buffer is 4-bytes aligned */
5967
0
        assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
5968
0
        uint32_t *out = (uint32_t *)PyBytes_AS_STRING(v);
5969
0
        if (byteorder == 0) {
5970
0
            *out++ = 0xFEFF;
5971
0
        }
5972
0
        if (len > 0) {
5973
0
            ucs1lib_utf32_encode((const Py_UCS1 *)data, len,
5974
0
                                 &out, native_ordering);
5975
0
        }
5976
0
        return v;
5977
0
    }
5978
5979
0
    PyBytesWriter *writer = PyBytesWriter_Create(nsize * 4);
5980
0
    if (writer == NULL) {
5981
0
        return NULL;
5982
0
    }
5983
5984
    /* output buffer is 4-bytes aligned */
5985
0
    assert(_Py_IS_ALIGNED(PyBytesWriter_GetData(writer), 4));
5986
0
    uint32_t *out = (uint32_t *)PyBytesWriter_GetData(writer);
5987
0
    if (byteorder == 0) {
5988
0
        *out++ = 0xFEFF;
5989
0
    }
5990
0
    if (len == 0) {
5991
0
        return PyBytesWriter_Finish(writer);
5992
0
    }
5993
5994
0
    const char *encoding;
5995
0
    if (byteorder == -1)
5996
0
        encoding = "utf-32-le";
5997
0
    else if (byteorder == 1)
5998
0
        encoding = "utf-32-be";
5999
0
    else
6000
0
        encoding = "utf-32";
6001
6002
0
    PyObject *errorHandler = NULL;
6003
0
    PyObject *exc = NULL;
6004
0
    PyObject *rep = NULL;
6005
6006
0
    for (Py_ssize_t pos = 0; pos < len; ) {
6007
0
        if (kind == PyUnicode_2BYTE_KIND) {
6008
0
            pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
6009
0
                                        &out, native_ordering);
6010
0
        }
6011
0
        else {
6012
0
            assert(kind == PyUnicode_4BYTE_KIND);
6013
0
            pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
6014
0
                                        &out, native_ordering);
6015
0
        }
6016
0
        if (pos == len)
6017
0
            break;
6018
6019
0
        Py_ssize_t newpos;
6020
0
        rep = unicode_encode_call_errorhandler(
6021
0
                errors, &errorHandler,
6022
0
                encoding, "surrogates not allowed",
6023
0
                str, &exc, pos, pos + 1, &newpos);
6024
0
        if (!rep)
6025
0
            goto error;
6026
6027
0
        Py_ssize_t repsize, moreunits;
6028
0
        if (PyBytes_Check(rep)) {
6029
0
            repsize = PyBytes_GET_SIZE(rep);
6030
0
            if (repsize & 3) {
6031
0
                raise_encode_exception(&exc, encoding,
6032
0
                                       str, pos, pos + 1,
6033
0
                                       "surrogates not allowed");
6034
0
                goto error;
6035
0
            }
6036
0
            moreunits = repsize / 4;
6037
0
        }
6038
0
        else {
6039
0
            assert(PyUnicode_Check(rep));
6040
0
            moreunits = repsize = PyUnicode_GET_LENGTH(rep);
6041
0
            if (!PyUnicode_IS_ASCII(rep)) {
6042
0
                raise_encode_exception(&exc, encoding,
6043
0
                                       str, pos, pos + 1,
6044
0
                                       "surrogates not allowed");
6045
0
                goto error;
6046
0
            }
6047
0
        }
6048
0
        moreunits += pos - newpos;
6049
0
        pos = newpos;
6050
6051
        /* four bytes are reserved for each surrogate */
6052
0
        if (moreunits > 0) {
6053
0
            out = PyBytesWriter_GrowAndUpdatePointer(writer, 4 * moreunits, out);
6054
0
            if (out == NULL) {
6055
0
                goto error;
6056
0
            }
6057
0
        }
6058
6059
0
        if (PyBytes_Check(rep)) {
6060
0
            memcpy(out, PyBytes_AS_STRING(rep), repsize);
6061
0
            out += repsize / 4;
6062
0
        }
6063
0
        else {
6064
            /* rep is unicode */
6065
0
            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6066
0
            ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6067
0
                                 &out, native_ordering);
6068
0
        }
6069
6070
0
        Py_CLEAR(rep);
6071
0
    }
6072
6073
0
    Py_XDECREF(errorHandler);
6074
0
    Py_XDECREF(exc);
6075
6076
    /* Cut back to size actually needed. This is necessary, for example,
6077
       when encoding a string containing isolated surrogates while the
6078
       'ignore' handler is used. */
6079
0
    return PyBytesWriter_FinishWithPointer(writer, out);
6080
6081
0
  error:
6082
0
    Py_XDECREF(rep);
6083
0
    Py_XDECREF(errorHandler);
6084
0
    Py_XDECREF(exc);
6085
0
    PyBytesWriter_Discard(writer);
6086
0
    return NULL;
6087
0
}
6088
6089
PyObject *
6090
PyUnicode_AsUTF32String(PyObject *unicode)
6091
0
{
6092
0
    return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
6093
0
}
6094
6095
/* --- UTF-16 Codec ------------------------------------------------------- */
6096
6097
PyObject *
6098
PyUnicode_DecodeUTF16(const char *s,
6099
                      Py_ssize_t size,
6100
                      const char *errors,
6101
                      int *byteorder)
6102
113
{
6103
113
    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
6104
113
}
6105
6106
PyObject *
6107
PyUnicode_DecodeUTF16Stateful(const char *s,
6108
                              Py_ssize_t size,
6109
                              const char *errors,
6110
                              int *byteorder,
6111
                              Py_ssize_t *consumed)
6112
14.2k
{
6113
14.2k
    const char *starts = s;
6114
14.2k
    Py_ssize_t startinpos;
6115
14.2k
    Py_ssize_t endinpos;
6116
14.2k
    _PyUnicodeWriter writer;
6117
14.2k
    const unsigned char *q, *e;
6118
14.2k
    int bo = 0;       /* assume native ordering by default */
6119
14.2k
    int native_ordering;
6120
14.2k
    const char *errmsg = "";
6121
14.2k
    PyObject *errorHandler = NULL;
6122
14.2k
    PyObject *exc = NULL;
6123
14.2k
    const char *encoding;
6124
6125
14.2k
    q = (const unsigned char *)s;
6126
14.2k
    e = q + size;
6127
6128
14.2k
    if (byteorder)
6129
14.1k
        bo = *byteorder;
6130
6131
    /* Check for BOM marks (U+FEFF) in the input and adjust current
6132
       byte order setting accordingly. In native mode, the leading BOM
6133
       mark is skipped, in all other modes, it is copied to the output
6134
       stream as-is (giving a ZWNBSP character). */
6135
14.2k
    if (bo == 0 && size >= 2) {
6136
13.5k
        const Py_UCS4 bom = (q[1] << 8) | q[0];
6137
13.5k
        if (bom == 0xFEFF) {
6138
312
            q += 2;
6139
312
            bo = -1;
6140
312
        }
6141
13.1k
        else if (bom == 0xFFFE) {
6142
2.45k
            q += 2;
6143
2.45k
            bo = 1;
6144
2.45k
        }
6145
13.5k
        if (byteorder)
6146
13.3k
            *byteorder = bo;
6147
13.5k
    }
6148
6149
14.2k
    if (q == e) {
6150
94
        if (consumed)
6151
0
            *consumed = size;
6152
94
        _Py_RETURN_UNICODE_EMPTY();
6153
94
    }
6154
6155
14.1k
#if PY_LITTLE_ENDIAN
6156
14.1k
    native_ordering = bo <= 0;
6157
14.1k
    encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
6158
#else
6159
    native_ordering = bo >= 0;
6160
    encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
6161
#endif
6162
6163
    /* Note: size will always be longer than the resulting Unicode
6164
       character count normally.  Error handler will take care of
6165
       resizing when needed. */
6166
14.1k
    _PyUnicodeWriter_Init(&writer);
6167
14.1k
    writer.min_length = (e - q + 1) / 2;
6168
14.1k
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
6169
0
        goto onError;
6170
6171
51.0k
    while (1) {
6172
51.0k
        Py_UCS4 ch = 0;
6173
51.0k
        if (e - q >= 2) {
6174
43.0k
            int kind = writer.kind;
6175
43.0k
            if (kind == PyUnicode_1BYTE_KIND) {
6176
17.1k
                if (PyUnicode_IS_ASCII(writer.buffer))
6177
13.6k
                    ch = asciilib_utf16_decode(&q, e,
6178
13.6k
                            (Py_UCS1*)writer.data, &writer.pos,
6179
13.6k
                            native_ordering);
6180
3.50k
                else
6181
3.50k
                    ch = ucs1lib_utf16_decode(&q, e,
6182
3.50k
                            (Py_UCS1*)writer.data, &writer.pos,
6183
3.50k
                            native_ordering);
6184
25.8k
            } else if (kind == PyUnicode_2BYTE_KIND) {
6185
11.8k
                ch = ucs2lib_utf16_decode(&q, e,
6186
11.8k
                        (Py_UCS2*)writer.data, &writer.pos,
6187
11.8k
                        native_ordering);
6188
14.0k
            } else {
6189
14.0k
                assert(kind == PyUnicode_4BYTE_KIND);
6190
14.0k
                ch = ucs4lib_utf16_decode(&q, e,
6191
14.0k
                        (Py_UCS4*)writer.data, &writer.pos,
6192
14.0k
                        native_ordering);
6193
14.0k
            }
6194
43.0k
        }
6195
6196
51.0k
        switch (ch)
6197
51.0k
        {
6198
14.8k
        case 0:
6199
            /* remaining byte at the end? (size should be even) */
6200
14.8k
            if (q == e || consumed)
6201
9.68k
                goto End;
6202
5.14k
            errmsg = "truncated data";
6203
5.14k
            startinpos = ((const char *)q) - starts;
6204
5.14k
            endinpos = ((const char *)e) - starts;
6205
5.14k
            break;
6206
            /* The remaining input chars are ignored if the callback
6207
               chooses to skip the input */
6208
1.57k
        case 1:
6209
1.57k
            q -= 2;
6210
1.57k
            if (consumed)
6211
0
                goto End;
6212
1.57k
            errmsg = "unexpected end of data";
6213
1.57k
            startinpos = ((const char *)q) - starts;
6214
1.57k
            endinpos = ((const char *)e) - starts;
6215
1.57k
            break;
6216
12.0k
        case 2:
6217
12.0k
            errmsg = "illegal encoding";
6218
12.0k
            startinpos = ((const char *)q) - 2 - starts;
6219
12.0k
            endinpos = startinpos + 2;
6220
12.0k
            break;
6221
7.28k
        case 3:
6222
7.28k
            errmsg = "illegal UTF-16 surrogate";
6223
7.28k
            startinpos = ((const char *)q) - 4 - starts;
6224
7.28k
            endinpos = startinpos + 2;
6225
7.28k
            break;
6226
15.3k
        default:
6227
15.3k
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
6228
0
                goto onError;
6229
15.3k
            continue;
6230
51.0k
        }
6231
6232
26.0k
        if (unicode_decode_call_errorhandler_writer(
6233
26.0k
                errors,
6234
26.0k
                &errorHandler,
6235
26.0k
                encoding, errmsg,
6236
26.0k
                &starts,
6237
26.0k
                (const char **)&e,
6238
26.0k
                &startinpos,
6239
26.0k
                &endinpos,
6240
26.0k
                &exc,
6241
26.0k
                (const char **)&q,
6242
26.0k
                &writer))
6243
4.51k
            goto onError;
6244
26.0k
    }
6245
6246
9.68k
End:
6247
9.68k
    if (consumed)
6248
0
        *consumed = (const char *)q-starts;
6249
6250
9.68k
    Py_XDECREF(errorHandler);
6251
9.68k
    Py_XDECREF(exc);
6252
9.68k
    return _PyUnicodeWriter_Finish(&writer);
6253
6254
4.51k
  onError:
6255
4.51k
    _PyUnicodeWriter_Dealloc(&writer);
6256
4.51k
    Py_XDECREF(errorHandler);
6257
4.51k
    Py_XDECREF(exc);
6258
4.51k
    return NULL;
6259
14.1k
}
6260
6261
/* Encode the str object `str` as UTF-16 and return a bytes object.

   `byteorder` < 0 selects little endian, > 0 big endian, and 0 selects
   native order with a leading BOM (0xFEFF) code unit.  `errors` is passed
   to the encoding error handler, which is invoked for characters the
   low-level encoder refuses (reported as "surrogates not allowed").

   Returns NULL with an exception set when `str` is not a str object, when
   the output size would overflow, on allocation failure, or when the error
   handler fails or returns an unusable replacement. */
PyObject *
_PyUnicode_EncodeUTF16(PyObject *str,
                       const char *errors,
                       int byteorder)
{
    if (!PyUnicode_Check(str)) {
        PyErr_BadArgument();
        return NULL;
    }

    const int kind = PyUnicode_KIND(str);
    const void *data = PyUnicode_DATA(str);
    const Py_ssize_t len = PyUnicode_GET_LENGTH(str);
    /* one extra code unit for the BOM when no explicit order is requested */
    const int with_bom = (byteorder == 0);

    /* Characters >= 0x10000 need two code units; only a 4-byte string can
       contain any. */
    Py_ssize_t pairs = 0;
    if (kind == PyUnicode_4BYTE_KIND) {
        const Py_UCS4 *chars = (const Py_UCS4 *)data;
        for (Py_ssize_t i = 0; i < len; i++) {
            if (chars[i] >= 0x10000) {
                pairs++;
            }
        }
    }

    /* Guard the "* 2" done on the unit count below. */
    if (len > PY_SSIZE_T_MAX / 2 - pairs - with_bom) {
        return PyErr_NoMemory();
    }
    const Py_ssize_t nsize = len + pairs + with_bom;

#if PY_BIG_ENDIAN
    const int native_ordering = byteorder >= 0;
#else
    const int native_ordering = byteorder <= 0;
#endif

    if (kind == PyUnicode_1BYTE_KIND) {
        // gh-139156: a plain bytes object is used here rather than the
        // PyBytesWriter API, which has an overhead on short strings.
        PyObject *bytes = PyBytes_FromStringAndSize(NULL, nsize * 2);
        if (bytes == NULL) {
            return NULL;
        }

        /* output buffer is 2-bytes aligned */
        assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(bytes), 2));
        unsigned short *units = (unsigned short *)PyBytes_AS_STRING(bytes);
        if (with_bom) {
            *units++ = 0xFEFF;
        }
        if (len > 0) {
            ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &units,
                                 native_ordering);
        }
        return bytes;
    }

    PyBytesWriter *writer = PyBytesWriter_Create(nsize * 2);
    if (writer == NULL) {
        return NULL;
    }

    /* output buffer is 2-bytes aligned */
    assert(_Py_IS_ALIGNED(PyBytesWriter_GetData(writer), 2));
    unsigned short *out = PyBytesWriter_GetData(writer);
    if (with_bom) {
        *out++ = 0xFEFF;
    }
    if (len == 0) {
        return PyBytesWriter_Finish(writer);
    }

    /* Codec name reported to the error handler. */
    const char *encoding = (byteorder < 0) ? "utf-16-le"
                         : (byteorder > 0) ? "utf-16-be"
                         : "utf-16";

    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *rep = NULL;

    Py_ssize_t pos = 0;
    while (pos < len) {
        /* Encode as far as the low-level encoder gets; it returns the
           number of characters it consumed. */
        if (kind == PyUnicode_2BYTE_KIND) {
            pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos,
                                        len - pos, &out, native_ordering);
        }
        else {
            assert(kind == PyUnicode_4BYTE_KIND);
            pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos,
                                        len - pos, &out, native_ordering);
        }
        if (pos == len) {
            break;
        }

        /* The encoder stopped early: hand the offending character to the
           error handler. */
        Py_ssize_t newpos;
        rep = unicode_encode_call_errorhandler(
                errors, &errorHandler,
                encoding, "surrogates not allowed",
                str, &exc, pos, pos + 1, &newpos);
        if (rep == NULL) {
            goto error;
        }

        const int rep_is_bytes = PyBytes_Check(rep);
        Py_ssize_t repsize;
        Py_ssize_t moreunits;
        if (rep_is_bytes) {
            /* Raw bytes must be a whole number of 16-bit units. */
            repsize = PyBytes_GET_SIZE(rep);
            if (repsize & 1) {
                raise_encode_exception(&exc, encoding,
                                       str, pos, pos + 1,
                                       "surrogates not allowed");
                goto error;
            }
            moreunits = repsize / 2;
        }
        else {
            /* A str replacement is only accepted when it is pure ASCII. */
            assert(PyUnicode_Check(rep));
            repsize = PyUnicode_GET_LENGTH(rep);
            moreunits = repsize;
            if (!PyUnicode_IS_ASCII(rep)) {
                raise_encode_exception(&exc, encoding,
                                       str, pos, pos + 1,
                                       "surrogates not allowed");
                goto error;
            }
        }
        /* Units already reserved for the characters the handler skipped
           (or re-queued) are credited against the replacement. */
        moreunits += pos - newpos;
        pos = newpos;

        /* two bytes are reserved for each surrogate */
        if (moreunits > 0) {
            out = PyBytesWriter_GrowAndUpdatePointer(writer, 2 * moreunits,
                                                     out);
            if (out == NULL) {
                goto error;
            }
        }

        if (rep_is_bytes) {
            memcpy(out, PyBytes_AS_STRING(rep), repsize);
            out += repsize / 2;
        }
        else {
            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
            ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
                                 &out, native_ordering);
        }

        Py_CLEAR(rep);
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);

    /* Trim to the bytes actually written: the buffer may be larger than
       needed, e.g. when isolated surrogates were dropped by the 'ignore'
       handler. */
    return PyBytesWriter_FinishWithPointer(writer, out);

  error:
    Py_XDECREF(rep);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    PyBytesWriter_Discard(writer);
    return NULL;
}
6426
6427
/* Encode `unicode` with _PyUnicode_EncodeUTF16() using no explicit error
   handler name (NULL) and byteorder 0, and return whatever it returns. */
PyObject *
PyUnicode_AsUTF16String(PyObject *unicode)
{
    PyObject *encoded = _PyUnicode_EncodeUTF16(unicode, NULL, 0);
    return encoded;
}
6432
6433
_PyUnicode_Name_CAPI *
6434
_PyUnicode_GetNameCAPI(void)
6435
2.21k
{
6436
2.21k
    PyInterpreterState *interp = _PyInterpreterState_GET();
6437
2.21k
    _PyUnicode_Name_CAPI *ucnhash_capi;
6438
6439
2.21k
    ucnhash_capi = _Py_atomic_load_ptr(&interp->unicode.ucnhash_capi);
6440
2.21k
    if (ucnhash_capi == NULL) {
6441
1
        ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6442
1
                PyUnicodeData_CAPSULE_NAME, 1);
6443
6444
        // It's fine if we overwrite the value here. It's always the same value.
6445
1
        _Py_atomic_store_ptr(&interp->unicode.ucnhash_capi, ucnhash_capi);
6446
1
    }
6447
2.21k
    return ucnhash_capi;
6448
2.21k
}
6449
6450
/* --- Unicode Escape Codec ----------------------------------------------- */
6451
6452
PyObject *
6453
_PyUnicode_DecodeUnicodeEscapeInternal2(const char *s,
6454
                               Py_ssize_t size,
6455
                               const char *errors,
6456
                               Py_ssize_t *consumed,
6457
                               int *first_invalid_escape_char,
6458
                               const char **first_invalid_escape_ptr)
6459
31.6k
{
6460
31.6k
    const char *starts = s;
6461
31.6k
    const char *initial_starts = starts;
6462
31.6k
    _PyUnicodeWriter writer;
6463
31.6k
    const char *end;
6464
31.6k
    PyObject *errorHandler = NULL;
6465
31.6k
    PyObject *exc = NULL;
6466
31.6k
    _PyUnicode_Name_CAPI *ucnhash_capi;
6467
6468
    // so we can remember if we've seen an invalid escape char or not
6469
31.6k
    *first_invalid_escape_char = -1;
6470
31.6k
    *first_invalid_escape_ptr = NULL;
6471
6472
31.6k
    if (size == 0) {
6473
2.14k
        if (consumed) {
6474
0
            *consumed = 0;
6475
0
        }
6476
2.14k
        _Py_RETURN_UNICODE_EMPTY();
6477
2.14k
    }
6478
    /* Escaped strings will always be longer than the resulting
6479
       Unicode string, so we start with size here and then reduce the
6480
       length after conversion to the true value.
6481
       (but if the error callback returns a long replacement string
6482
       we'll have to allocate more space) */
6483
29.5k
    _PyUnicodeWriter_Init(&writer);
6484
29.5k
    writer.min_length = size;
6485
29.5k
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6486
0
        goto onError;
6487
0
    }
6488
6489
29.5k
    end = s + size;
6490
205k
    while (s < end) {
6491
175k
        unsigned char c = (unsigned char) *s++;
6492
175k
        Py_UCS4 ch;
6493
175k
        int count;
6494
175k
        const char *message;
6495
6496
175k
#define WRITE_ASCII_CHAR(ch)                                                  \
6497
175k
            do {                                                              \
6498
15.8k
                assert(ch <= 127);                                            \
6499
15.8k
                assert(writer.pos < writer.size);                             \
6500
15.8k
                PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch);  \
6501
15.8k
            } while(0)
6502
6503
175k
#define WRITE_CHAR(ch)                                                        \
6504
175k
            do {                                                              \
6505
165k
                if (ch <= writer.maxchar) {                                   \
6506
149k
                    assert(writer.pos < writer.size);                         \
6507
149k
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6508
149k
                }                                                             \
6509
165k
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6510
0
                    goto onError;                                             \
6511
0
                }                                                             \
6512
165k
            } while(0)
6513
6514
        /* Non-escape characters are interpreted as Unicode ordinals */
6515
175k
        if (c != '\\') {
6516
123k
            WRITE_CHAR(c);
6517
123k
            continue;
6518
123k
        }
6519
6520
52.0k
        Py_ssize_t startinpos = s - starts - 1;
6521
        /* \ - Escapes */
6522
52.0k
        if (s >= end) {
6523
0
            message = "\\ at end of string";
6524
0
            goto incomplete;
6525
0
        }
6526
52.0k
        c = (unsigned char) *s++;
6527
6528
52.0k
        assert(writer.pos < writer.size);
6529
52.0k
        switch (c) {
6530
6531
            /* \x escapes */
6532
727
        case '\n': continue;
6533
1.53k
        case '\\': WRITE_ASCII_CHAR('\\'); continue;
6534
766
        case '\'': WRITE_ASCII_CHAR('\''); continue;
6535
1.19k
        case '\"': WRITE_ASCII_CHAR('\"'); continue;
6536
1.03k
        case 'b': WRITE_ASCII_CHAR('\b'); continue;
6537
        /* FF */
6538
731
        case 'f': WRITE_ASCII_CHAR('\014'); continue;
6539
714
        case 't': WRITE_ASCII_CHAR('\t'); continue;
6540
1.06k
        case 'n': WRITE_ASCII_CHAR('\n'); continue;
6541
1.43k
        case 'r': WRITE_ASCII_CHAR('\r'); continue;
6542
        /* VT */
6543
888
        case 'v': WRITE_ASCII_CHAR('\013'); continue;
6544
        /* BEL, not classic C */
6545
588
        case 'a': WRITE_ASCII_CHAR('\007'); continue;
6546
6547
            /* \OOO (octal) escapes */
6548
4.45k
        case '0': case '1': case '2': case '3':
6549
7.55k
        case '4': case '5': case '6': case '7':
6550
7.55k
            ch = c - '0';
6551
7.55k
            if (s < end && '0' <= *s && *s <= '7') {
6552
4.02k
                ch = (ch<<3) + *s++ - '0';
6553
4.02k
                if (s < end && '0' <= *s && *s <= '7') {
6554
3.09k
                    ch = (ch<<3) + *s++ - '0';
6555
3.09k
                }
6556
4.02k
            }
6557
7.55k
            if (ch > 0377) {
6558
1.41k
                if (*first_invalid_escape_char == -1) {
6559
1.05k
                    *first_invalid_escape_char = ch;
6560
1.05k
                    if (starts == initial_starts) {
6561
                        /* Back up 3 chars, since we've already incremented s. */
6562
1.05k
                        *first_invalid_escape_ptr = s - 3;
6563
1.05k
                    }
6564
1.05k
                }
6565
1.41k
            }
6566
7.55k
            WRITE_CHAR(ch);
6567
7.55k
            continue;
6568
6569
            /* hex escapes */
6570
            /* \xXX */
6571
7.55k
        case 'x':
6572
6.09k
            count = 2;
6573
6.09k
            message = "truncated \\xXX escape";
6574
6.09k
            goto hexescape;
6575
6576
            /* \uXXXX */
6577
6.36k
        case 'u':
6578
6.36k
            count = 4;
6579
6.36k
            message = "truncated \\uXXXX escape";
6580
6.36k
            goto hexescape;
6581
6582
            /* \UXXXXXXXX */
6583
13.1k
        case 'U':
6584
13.1k
            count = 8;
6585
13.1k
            message = "truncated \\UXXXXXXXX escape";
6586
25.6k
        hexescape:
6587
168k
            for (ch = 0; count; ++s, --count) {
6588
142k
                if (s >= end) {
6589
12
                    goto incomplete;
6590
12
                }
6591
142k
                c = (unsigned char)*s;
6592
142k
                ch <<= 4;
6593
142k
                if (c >= '0' && c <= '9') {
6594
109k
                    ch += c - '0';
6595
109k
                }
6596
33.1k
                else if (c >= 'a' && c <= 'f') {
6597
32.8k
                    ch += c - ('a' - 10);
6598
32.8k
                }
6599
290
                else if (c >= 'A' && c <= 'F') {
6600
277
                    ch += c - ('A' - 10);
6601
277
                }
6602
13
                else {
6603
13
                    goto error;
6604
13
                }
6605
142k
            }
6606
6607
            /* when we get here, ch is a 32-bit unicode character */
6608
25.5k
            if (ch > MAX_UNICODE) {
6609
1
                message = "illegal Unicode character";
6610
1
                goto error;
6611
1
            }
6612
6613
25.5k
            WRITE_CHAR(ch);
6614
25.5k
            continue;
6615
6616
            /* \N{name} */
6617
25.5k
        case 'N':
6618
2.21k
            ucnhash_capi = _PyUnicode_GetNameCAPI();
6619
2.21k
            if (ucnhash_capi == NULL) {
6620
0
                PyErr_SetString(
6621
0
                        PyExc_UnicodeError,
6622
0
                        "\\N escapes not supported (can't load unicodedata module)"
6623
0
                );
6624
0
                goto onError;
6625
0
            }
6626
6627
2.21k
            message = "malformed \\N character escape";
6628
2.21k
            if (s >= end) {
6629
2
                goto incomplete;
6630
2
            }
6631
2.21k
            if (*s == '{') {
6632
2.20k
                const char *start = ++s;
6633
2.20k
                size_t namelen;
6634
                /* look for the closing brace */
6635
28.3k
                while (s < end && *s != '}')
6636
26.1k
                    s++;
6637
2.20k
                if (s >= end) {
6638
8
                    goto incomplete;
6639
8
                }
6640
2.20k
                namelen = s - start;
6641
2.20k
                if (namelen) {
6642
                    /* found a name.  look it up in the unicode database */
6643
2.19k
                    s++;
6644
2.19k
                    ch = 0xffffffff; /* in case 'getcode' messes up */
6645
2.19k
                    if (namelen <= INT_MAX &&
6646
2.19k
                        ucnhash_capi->getcode(start, (int)namelen,
6647
2.19k
                                              &ch, 0)) {
6648
2.13k
                        assert(ch <= MAX_UNICODE);
6649
2.13k
                        WRITE_CHAR(ch);
6650
2.13k
                        continue;
6651
2.13k
                    }
6652
65
                    message = "unknown Unicode character name";
6653
65
                }
6654
2.20k
            }
6655
75
            goto error;
6656
6657
5.93k
        default:
6658
5.93k
            if (*first_invalid_escape_char == -1) {
6659
3.85k
                *first_invalid_escape_char = c;
6660
3.85k
                if (starts == initial_starts) {
6661
                    /* Back up one char, since we've already incremented s. */
6662
3.85k
                    *first_invalid_escape_ptr = s - 1;
6663
3.85k
                }
6664
3.85k
            }
6665
5.93k
            WRITE_ASCII_CHAR('\\');
6666
5.93k
            WRITE_CHAR(c);
6667
5.93k
            continue;
6668
52.0k
        }
6669
6670
22
      incomplete:
6671
22
        if (consumed) {
6672
0
            *consumed = startinpos;
6673
0
            break;
6674
0
        }
6675
111
      error:;
6676
111
        Py_ssize_t endinpos = s-starts;
6677
111
        writer.min_length = end - s + writer.pos;
6678
111
        if (unicode_decode_call_errorhandler_writer(
6679
111
                errors, &errorHandler,
6680
111
                "unicodeescape", message,
6681
111
                &starts, &end, &startinpos, &endinpos, &exc, &s,
6682
111
                &writer)) {
6683
111
            goto onError;
6684
111
        }
6685
111
        assert(end - s <= writer.size - writer.pos);
6686
6687
0
#undef WRITE_ASCII_CHAR
6688
0
#undef WRITE_CHAR
6689
0
    }
6690
6691
29.4k
    Py_XDECREF(errorHandler);
6692
29.4k
    Py_XDECREF(exc);
6693
29.4k
    return _PyUnicodeWriter_Finish(&writer);
6694
6695
111
  onError:
6696
111
    _PyUnicodeWriter_Dealloc(&writer);
6697
111
    Py_XDECREF(errorHandler);
6698
111
    Py_XDECREF(exc);
6699
111
    return NULL;
6700
29.5k
}
6701
6702
PyObject *
6703
_PyUnicode_DecodeUnicodeEscapeStateful(const char *s,
6704
                              Py_ssize_t size,
6705
                              const char *errors,
6706
                              Py_ssize_t *consumed)
6707
44
{
6708
44
    int first_invalid_escape_char;
6709
44
    const char *first_invalid_escape_ptr;
6710
44
    PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal2(s, size, errors,
6711
44
                                                      consumed,
6712
44
                                                      &first_invalid_escape_char,
6713
44
                                                      &first_invalid_escape_ptr);
6714
44
    if (result == NULL)
6715
0
        return NULL;
6716
44
    if (first_invalid_escape_char != -1) {
6717
0
        if (first_invalid_escape_char > 0xff) {
6718
0
            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6719
0
                                 "\"\\%o\" is an invalid octal escape sequence. "
6720
0
                                 "Such sequences will not work in the future. ",
6721
0
                                 first_invalid_escape_char) < 0)
6722
0
            {
6723
0
                Py_DECREF(result);
6724
0
                return NULL;
6725
0
            }
6726
0
        }
6727
0
        else {
6728
0
            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6729
0
                                 "\"\\%c\" is an invalid escape sequence. "
6730
0
                                 "Such sequences will not work in the future. ",
6731
0
                                 first_invalid_escape_char) < 0)
6732
0
            {
6733
0
                Py_DECREF(result);
6734
0
                return NULL;
6735
0
            }
6736
0
        }
6737
0
    }
6738
44
    return result;
6739
44
}
6740
6741
PyObject *
6742
PyUnicode_DecodeUnicodeEscape(const char *s,
6743
                              Py_ssize_t size,
6744
                              const char *errors)
6745
0
{
6746
0
    return _PyUnicode_DecodeUnicodeEscapeStateful(s, size, errors, NULL);
6747
0
}
6748
6749
/* Return a Unicode-Escape string version of the Unicode object. */
6750
6751
/* Return a bytes object holding the unicode-escape form of `unicode`.

   Printable ASCII is copied unchanged except for the backslash, which is
   doubled.  TAB, LF and CR become \t, \n and \r.  Every other character
   below 0x100 becomes \xHH, characters below 0x10000 become \uHHHH, and
   the rest become \U00HHHHHH.

   Returns NULL with an exception set when `unicode` is not a str object,
   when the worst-case output size would overflow, or when the writer
   cannot be created. */
PyObject *
PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    const Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
    if (len == 0) {
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
    }

    const int kind = PyUnicode_KIND(unicode);
    const void *data = PyUnicode_DATA(unicode);

    /* Reserve room for the longest escape each character could need:
     *   UCS1: '\xHH'       ->  4 bytes
     *   UCS2: '\uHHHH'     ->  6 bytes
     *   UCS4: '\U00HHHHHH' -> 10 bytes
     * which is kind * 2 + 2 for kind 1, 2 and 4 respectively. */
    const Py_ssize_t expandsize = kind * 2 + 2;
    if (len > PY_SSIZE_T_MAX / expandsize) {
        return PyErr_NoMemory();
    }

    PyBytesWriter *writer = PyBytesWriter_Create(expandsize * len);
    if (writer == NULL) {
        return NULL;
    }
    char *p = PyBytesWriter_GetData(writer);

    for (Py_ssize_t i = 0; i < len; i++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);

        if (ch == '\\') {
            /* Escape backslashes */
            *p++ = '\\';
            *p++ = '\\';
        }
        else if (ch >= ' ' && ch < 127) {
            /* Copy printable US ASCII as-is */
            *p++ = (char) ch;
        }
        else if (ch == '\t') {
            *p++ = '\\';
            *p++ = 't';
        }
        else if (ch == '\n') {
            *p++ = '\\';
            *p++ = 'n';
        }
        else if (ch == '\r') {
            *p++ = '\\';
            *p++ = 'r';
        }
        else if (ch < 0x100) {
            /* Remaining control and 8-bit characters: '\xHH' */
            *p++ = '\\';
            *p++ = 'x';
            for (int shift = 4; shift >= 0; shift -= 4) {
                *p++ = Py_hexdigits[(ch >> shift) & 0x000F];
            }
        }
        else if (ch < 0x10000) {
            /* U+0100-U+ffff range: '\uHHHH' */
            *p++ = '\\';
            *p++ = 'u';
            for (int shift = 12; shift >= 0; shift -= 4) {
                *p++ = Py_hexdigits[(ch >> shift) & 0x000F];
            }
        }
        else {
            /* U+010000-U+10ffff range: '\U00HHHHHH'.
               The assert guarantees the two leading digits are zero. */
            assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
            *p++ = '\\';
            *p++ = 'U';
            *p++ = '0';
            *p++ = '0';
            for (int shift = 20; shift >= 0; shift -= 4) {
                *p++ = Py_hexdigits[(ch >> shift) & 0x0000000F];
            }
        }
    }

    /* Shrink the over-allocated buffer to what was actually written. */
    return PyBytesWriter_FinishWithPointer(writer, p);
}
6851
6852
/* --- Raw Unicode Escape Codec ------------------------------------------- */
6853
6854
PyObject *
6855
_PyUnicode_DecodeRawUnicodeEscapeStateful(const char *s,
6856
                                          Py_ssize_t size,
6857
                                          const char *errors,
6858
                                          Py_ssize_t *consumed)
6859
34
{
6860
34
    const char *starts = s;
6861
34
    _PyUnicodeWriter writer;
6862
34
    const char *end;
6863
34
    PyObject *errorHandler = NULL;
6864
34
    PyObject *exc = NULL;
6865
6866
34
    if (size == 0) {
6867
0
        if (consumed) {
6868
0
            *consumed = 0;
6869
0
        }
6870
0
        _Py_RETURN_UNICODE_EMPTY();
6871
0
    }
6872
6873
    /* Escaped strings will always be longer than the resulting
6874
       Unicode string, so we start with size here and then reduce the
6875
       length after conversion to the true value. (But decoding error
6876
       handler might have to resize the string) */
6877
34
    _PyUnicodeWriter_Init(&writer);
6878
34
    writer.min_length = size;
6879
34
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6880
0
        goto onError;
6881
0
    }
6882
6883
34
    end = s + size;
6884
37.6k
    while (s < end) {
6885
37.6k
        unsigned char c = (unsigned char) *s++;
6886
37.6k
        Py_UCS4 ch;
6887
37.6k
        int count;
6888
37.6k
        const char *message;
6889
6890
37.6k
#define WRITE_CHAR(ch)                                                        \
6891
37.6k
            do {                                                              \
6892
37.6k
                if (ch <= writer.maxchar) {                                   \
6893
37.6k
                    assert(writer.pos < writer.size);                         \
6894
37.6k
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6895
37.6k
                }                                                             \
6896
37.6k
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6897
0
                    goto onError;                                             \
6898
0
                }                                                             \
6899
37.6k
            } while(0)
6900
6901
        /* Non-escape characters are interpreted as Unicode ordinals */
6902
37.6k
        if (c != '\\' || (s >= end && !consumed)) {
6903
37.6k
            WRITE_CHAR(c);
6904
37.6k
            continue;
6905
37.6k
        }
6906
6907
0
        Py_ssize_t startinpos = s - starts - 1;
6908
        /* \ - Escapes */
6909
0
        if (s >= end) {
6910
0
            assert(consumed);
6911
            // Set message to silent compiler warning.
6912
            // Actually it is never used.
6913
0
            message = "\\ at end of string";
6914
0
            goto incomplete;
6915
0
        }
6916
6917
0
        c = (unsigned char) *s++;
6918
0
        if (c == 'u') {
6919
0
            count = 4;
6920
0
            message = "truncated \\uXXXX escape";
6921
0
        }
6922
0
        else if (c == 'U') {
6923
0
            count = 8;
6924
0
            message = "truncated \\UXXXXXXXX escape";
6925
0
        }
6926
0
        else {
6927
0
            assert(writer.pos < writer.size);
6928
0
            PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6929
0
            WRITE_CHAR(c);
6930
0
            continue;
6931
0
        }
6932
6933
        /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6934
0
        for (ch = 0; count; ++s, --count) {
6935
0
            if (s >= end) {
6936
0
                goto incomplete;
6937
0
            }
6938
0
            c = (unsigned char)*s;
6939
0
            ch <<= 4;
6940
0
            if (c >= '0' && c <= '9') {
6941
0
                ch += c - '0';
6942
0
            }
6943
0
            else if (c >= 'a' && c <= 'f') {
6944
0
                ch += c - ('a' - 10);
6945
0
            }
6946
0
            else if (c >= 'A' && c <= 'F') {
6947
0
                ch += c - ('A' - 10);
6948
0
            }
6949
0
            else {
6950
0
                goto error;
6951
0
            }
6952
0
        }
6953
0
        if (ch > MAX_UNICODE) {
6954
0
            message = "\\Uxxxxxxxx out of range";
6955
0
            goto error;
6956
0
        }
6957
0
        WRITE_CHAR(ch);
6958
0
        continue;
6959
6960
0
      incomplete:
6961
0
        if (consumed) {
6962
0
            *consumed = startinpos;
6963
0
            break;
6964
0
        }
6965
0
      error:;
6966
0
        Py_ssize_t endinpos = s-starts;
6967
0
        writer.min_length = end - s + writer.pos;
6968
0
        if (unicode_decode_call_errorhandler_writer(
6969
0
                errors, &errorHandler,
6970
0
                "rawunicodeescape", message,
6971
0
                &starts, &end, &startinpos, &endinpos, &exc, &s,
6972
0
                &writer)) {
6973
0
            goto onError;
6974
0
        }
6975
0
        assert(end - s <= writer.size - writer.pos);
6976
6977
0
#undef WRITE_CHAR
6978
0
    }
6979
34
    Py_XDECREF(errorHandler);
6980
34
    Py_XDECREF(exc);
6981
34
    return _PyUnicodeWriter_Finish(&writer);
6982
6983
0
  onError:
6984
0
    _PyUnicodeWriter_Dealloc(&writer);
6985
0
    Py_XDECREF(errorHandler);
6986
0
    Py_XDECREF(exc);
6987
0
    return NULL;
6988
34
}
6989
6990
PyObject *
6991
PyUnicode_DecodeRawUnicodeEscape(const char *s,
6992
                                 Py_ssize_t size,
6993
                                 const char *errors)
6994
0
{
6995
0
    return _PyUnicode_DecodeRawUnicodeEscapeStateful(s, size, errors, NULL);
6996
0
}
6997
6998
6999
/* Encode `unicode` to bytes using raw-unicode-escape.

   Code points below 0x100 are written as a single byte, code points below
   0x10000 as \uHHHH and the remainder as \U00HHHHHH.  Calls
   PyErr_BadArgument() and returns NULL if `unicode` is not a str; returns
   NULL as well when the output buffer cannot be sized or created. */
PyObject *
PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    const int kind = PyUnicode_KIND(unicode);
    const void *data = PyUnicode_DATA(unicode);
    const Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);

    if (len == 0) {
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
    }
    if (kind == PyUnicode_1BYTE_KIND) {
        /* Every character already fits in one byte: copy straight through. */
        return PyBytes_FromStringAndSize(data, len);
    }

    /* Worst case per character: 10 bytes for 4-byte kind, 6 bytes for
       2-byte kind (and 4 for 1-byte kind, per the same formula). */
    const Py_ssize_t expandsize = kind * 2 + 2;
    if (len > PY_SSIZE_T_MAX / expandsize) {
        return PyErr_NoMemory();
    }

    PyBytesWriter *writer = PyBytesWriter_Create(expandsize * len);
    if (writer == NULL) {
        return NULL;
    }
    char *out = PyBytesWriter_GetData(writer);

    for (Py_ssize_t i = 0; i < len; i++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);

        /* U+0000-U+00ff: copied as-is. */
        if (ch < 0x100) {
            *out++ = (char)ch;
            continue;
        }

        *out++ = '\\';
        if (ch < 0x10000) {
            /* U+0100-U+ffff: '\uHHHH' */
            *out++ = 'u';
        }
        else {
            /* U+010000-U+10ffff: '\U00HHHHHH' */
            assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
            *out++ = 'U';
            *out++ = '0';
            *out++ = '0';
            *out++ = Py_hexdigits[(ch >> 20) & 0xf];
            *out++ = Py_hexdigits[(ch >> 16) & 0xf];
        }
        /* Both escape forms end with the same four low hex digits. */
        *out++ = Py_hexdigits[(ch >> 12) & 0xf];
        *out++ = Py_hexdigits[(ch >> 8) & 0xf];
        *out++ = Py_hexdigits[(ch >> 4) & 0xf];
        *out++ = Py_hexdigits[ch & 0xf];
    }

    return PyBytesWriter_FinishWithPointer(writer, out);
}
7063
7064
/* --- Latin-1 Codec ------------------------------------------------------ */
7065
7066
PyObject *
7067
PyUnicode_DecodeLatin1(const char *s,
7068
                       Py_ssize_t size,
7069
                       const char *errors)
7070
2.88M
{
7071
    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
7072
2.88M
    return _PyUnicode_FromUCS1((const unsigned char*)s, size);
7073
2.88M
}
7074
7075
/* create or adjust a UnicodeEncodeError */
7076
static void
7077
make_encode_exception(PyObject **exceptionObject,
7078
                      const char *encoding,
7079
                      PyObject *unicode,
7080
                      Py_ssize_t startpos, Py_ssize_t endpos,
7081
                      const char *reason)
7082
203k
{
7083
203k
    if (*exceptionObject == NULL) {
7084
203k
        *exceptionObject = PyObject_CallFunction(
7085
203k
            PyExc_UnicodeEncodeError, "sOnns",
7086
203k
            encoding, unicode, startpos, endpos, reason);
7087
203k
    }
7088
0
    else {
7089
0
        if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
7090
0
            goto onError;
7091
0
        if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
7092
0
            goto onError;
7093
0
        if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
7094
0
            goto onError;
7095
0
        return;
7096
0
      onError:
7097
0
        Py_CLEAR(*exceptionObject);
7098
0
    }
7099
203k
}
7100
7101
/* raises a UnicodeEncodeError */
7102
static void
7103
raise_encode_exception(PyObject **exceptionObject,
7104
                       const char *encoding,
7105
                       PyObject *unicode,
7106
                       Py_ssize_t startpos, Py_ssize_t endpos,
7107
                       const char *reason)
7108
43.0k
{
7109
43.0k
    make_encode_exception(exceptionObject,
7110
43.0k
                          encoding, unicode, startpos, endpos, reason);
7111
43.0k
    if (*exceptionObject != NULL)
7112
43.0k
        PyCodec_StrictErrors(*exceptionObject);
7113
43.0k
}
7114
7115
/* error handling callback helper:
7116
   build arguments, call the callback and check the arguments,
7117
   put the result into newpos and return the replacement string, which
7118
   has to be freed by the caller */
7119
static PyObject *
7120
unicode_encode_call_errorhandler(const char *errors,
7121
                                 PyObject **errorHandler,
7122
                                 const char *encoding, const char *reason,
7123
                                 PyObject *unicode, PyObject **exceptionObject,
7124
                                 Py_ssize_t startpos, Py_ssize_t endpos,
7125
                                 Py_ssize_t *newpos)
7126
160k
{
7127
160k
    static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
7128
160k
    Py_ssize_t len;
7129
160k
    PyObject *restuple;
7130
160k
    PyObject *resunicode;
7131
7132
160k
    if (*errorHandler == NULL) {
7133
160k
        *errorHandler = PyCodec_LookupError(errors);
7134
160k
        if (*errorHandler == NULL)
7135
0
            return NULL;
7136
160k
    }
7137
7138
160k
    len = PyUnicode_GET_LENGTH(unicode);
7139
7140
160k
    make_encode_exception(exceptionObject,
7141
160k
                          encoding, unicode, startpos, endpos, reason);
7142
160k
    if (*exceptionObject == NULL)
7143
0
        return NULL;
7144
7145
160k
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
7146
160k
    if (restuple == NULL)
7147
160k
        return NULL;
7148
0
    if (!PyTuple_Check(restuple)) {
7149
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
7150
0
        Py_DECREF(restuple);
7151
0
        return NULL;
7152
0
    }
7153
0
    if (!PyArg_ParseTuple(restuple, argparse,
7154
0
                          &resunicode, newpos)) {
7155
0
        Py_DECREF(restuple);
7156
0
        return NULL;
7157
0
    }
7158
0
    if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
7159
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
7160
0
        Py_DECREF(restuple);
7161
0
        return NULL;
7162
0
    }
7163
0
    if (*newpos<0)
7164
0
        *newpos = len + *newpos;
7165
0
    if (*newpos<0 || *newpos>len) {
7166
0
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7167
0
        Py_DECREF(restuple);
7168
0
        return NULL;
7169
0
    }
7170
0
    Py_INCREF(resunicode);
7171
0
    Py_DECREF(restuple);
7172
0
    return resunicode;
7173
0
}
7174
7175
static PyObject *
7176
unicode_encode_ucs1(PyObject *unicode,
7177
                    const char *errors,
7178
                    const Py_UCS4 limit)
7179
58.3k
{
7180
    /* input state */
7181
58.3k
    Py_ssize_t pos=0, size;
7182
58.3k
    int kind;
7183
58.3k
    const void *data;
7184
58.3k
    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
7185
58.3k
    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
7186
58.3k
    PyObject *error_handler_obj = NULL;
7187
58.3k
    PyObject *exc = NULL;
7188
58.3k
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
7189
58.3k
    PyObject *rep = NULL;
7190
7191
58.3k
    size = PyUnicode_GET_LENGTH(unicode);
7192
58.3k
    kind = PyUnicode_KIND(unicode);
7193
58.3k
    data = PyUnicode_DATA(unicode);
7194
    /* allocate enough for a simple encoding without
7195
       replacements, if we need more, we'll resize */
7196
58.3k
    if (size == 0)
7197
0
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
7198
7199
    /* output object */
7200
58.3k
    PyBytesWriter *writer = PyBytesWriter_Create(size);
7201
58.3k
    if (writer == NULL) {
7202
0
        return NULL;
7203
0
    }
7204
    /* pointer into the output */
7205
58.3k
    char *str = PyBytesWriter_GetData(writer);
7206
7207
5.29M
    while (pos < size) {
7208
5.29M
        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
7209
7210
        /* can we encode this? */
7211
5.29M
        if (ch < limit) {
7212
            /* no overflow check, because we know that the space is enough */
7213
5.23M
            *str++ = (char)ch;
7214
5.23M
            ++pos;
7215
5.23M
        }
7216
58.5k
        else {
7217
58.5k
            Py_ssize_t newpos, i;
7218
            /* startpos for collecting unencodable chars */
7219
58.5k
            Py_ssize_t collstart = pos;
7220
58.5k
            Py_ssize_t collend = collstart + 1;
7221
            /* find all unecodable characters */
7222
7223
578k
            while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
7224
520k
                ++collend;
7225
7226
            /* Only overallocate the buffer if it's not the last write */
7227
58.5k
            writer->overallocate = (collend < size);
7228
7229
            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
7230
58.5k
            if (error_handler == _Py_ERROR_UNKNOWN)
7231
58.3k
                error_handler = _Py_GetErrorHandler(errors);
7232
7233
58.5k
            switch (error_handler) {
7234
43.0k
            case _Py_ERROR_STRICT:
7235
43.0k
                raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
7236
43.0k
                goto onError;
7237
7238
6.07k
            case _Py_ERROR_REPLACE:
7239
6.07k
                memset(str, '?', collend - collstart);
7240
6.07k
                str += (collend - collstart);
7241
6.07k
                _Py_FALLTHROUGH;
7242
6.07k
            case _Py_ERROR_IGNORE:
7243
6.07k
                pos = collend;
7244
6.07k
                break;
7245
7246
0
            case _Py_ERROR_BACKSLASHREPLACE:
7247
                /* subtract preallocated bytes */
7248
0
                writer->size -= (collend - collstart);
7249
0
                str = backslashreplace(writer, str,
7250
0
                                       unicode, collstart, collend);
7251
0
                if (str == NULL)
7252
0
                    goto onError;
7253
0
                pos = collend;
7254
0
                break;
7255
7256
0
            case _Py_ERROR_XMLCHARREFREPLACE:
7257
                /* subtract preallocated bytes */
7258
0
                writer->size -= (collend - collstart);
7259
0
                str = xmlcharrefreplace(writer, str,
7260
0
                                        unicode, collstart, collend);
7261
0
                if (str == NULL)
7262
0
                    goto onError;
7263
0
                pos = collend;
7264
0
                break;
7265
7266
9.42k
            case _Py_ERROR_SURROGATEESCAPE:
7267
9.42k
                for (i = collstart; i < collend; ++i) {
7268
9.42k
                    ch = PyUnicode_READ(kind, data, i);
7269
9.42k
                    if (ch < 0xdc80 || 0xdcff < ch) {
7270
                        /* Not a UTF-8b surrogate */
7271
9.42k
                        break;
7272
9.42k
                    }
7273
0
                    *str++ = (char)(ch - 0xdc00);
7274
0
                    ++pos;
7275
0
                }
7276
9.42k
                if (i >= collend)
7277
0
                    break;
7278
9.42k
                collstart = pos;
7279
9.42k
                assert(collstart != collend);
7280
9.42k
                _Py_FALLTHROUGH;
7281
7282
9.42k
            default:
7283
9.42k
                rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
7284
9.42k
                                                       encoding, reason, unicode, &exc,
7285
9.42k
                                                       collstart, collend, &newpos);
7286
9.42k
                if (rep == NULL)
7287
9.42k
                    goto onError;
7288
7289
0
                if (newpos < collstart) {
7290
0
                    writer->overallocate = 1;
7291
0
                    str = PyBytesWriter_GrowAndUpdatePointer(writer,
7292
0
                                                             collstart - newpos,
7293
0
                                                             str);
7294
0
                    if (str == NULL) {
7295
0
                        goto onError;
7296
0
                    }
7297
0
                }
7298
0
                else {
7299
                    /* subtract preallocated bytes */
7300
0
                    writer->size -= newpos - collstart;
7301
                    /* Only overallocate the buffer if it's not the last write */
7302
0
                    writer->overallocate = (newpos < size);
7303
0
                }
7304
7305
0
                char *rep_str;
7306
0
                Py_ssize_t rep_len;
7307
0
                if (PyBytes_Check(rep)) {
7308
                    /* Directly copy bytes result to output. */
7309
0
                    rep_str = PyBytes_AS_STRING(rep);
7310
0
                    rep_len = PyBytes_GET_SIZE(rep);
7311
0
                }
7312
0
                else {
7313
0
                    assert(PyUnicode_Check(rep));
7314
7315
0
                    if (limit == 256 ?
7316
0
                        PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
7317
0
                        !PyUnicode_IS_ASCII(rep))
7318
0
                    {
7319
                        /* Not all characters are smaller than limit */
7320
0
                        raise_encode_exception(&exc, encoding, unicode,
7321
0
                                               collstart, collend, reason);
7322
0
                        goto onError;
7323
0
                    }
7324
0
                    assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
7325
0
                    rep_str = PyUnicode_DATA(rep);
7326
0
                    rep_len = PyUnicode_GET_LENGTH(rep);
7327
0
                }
7328
7329
0
                str = PyBytesWriter_GrowAndUpdatePointer(writer, rep_len, str);
7330
0
                if (str == NULL) {
7331
0
                    goto onError;
7332
0
                }
7333
0
                memcpy(str, rep_str, rep_len);
7334
0
                str += rep_len;
7335
7336
0
                pos = newpos;
7337
0
                Py_CLEAR(rep);
7338
58.5k
            }
7339
7340
            /* If overallocation was disabled, ensure that it was the last
7341
               write. Otherwise, we missed an optimization */
7342
58.5k
            assert(writer->overallocate || pos == size);
7343
6.07k
        }
7344
5.29M
    }
7345
7346
5.84k
    Py_XDECREF(error_handler_obj);
7347
5.84k
    Py_XDECREF(exc);
7348
5.84k
    return PyBytesWriter_FinishWithPointer(writer, str);
7349
7350
52.4k
  onError:
7351
52.4k
    Py_XDECREF(rep);
7352
52.4k
    PyBytesWriter_Discard(writer);
7353
52.4k
    Py_XDECREF(error_handler_obj);
7354
52.4k
    Py_XDECREF(exc);
7355
52.4k
    return NULL;
7356
58.3k
}
7357
7358
/* Encode `unicode` as Latin-1 using the error handler named by `errors`.

   Calls PyErr_BadArgument() and returns NULL if `unicode` is not a str.
   A string of 1-byte kind is copied directly into a bytes object; any
   other kind is passed to unicode_encode_ucs1() with a limit of 256. */
PyObject *
_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND) {
        /* Characters outside Latin-1 are present: let the general encoder
           apply the error handling. */
        return unicode_encode_ucs1(unicode, errors, 256);
    }

    /* Fast path: the character data already is the Latin-1 encoding. */
    return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
                                     PyUnicode_GET_LENGTH(unicode));
}
7374
7375
/* Encode `unicode` as Latin-1, passing NULL as the error handler name. */
PyObject*
PyUnicode_AsLatin1String(PyObject *unicode)
{
    PyObject *encoded = _PyUnicode_AsLatin1String(unicode, NULL);

    return encoded;
}
7380
7381
/* --- 7-bit ASCII Codec -------------------------------------------------- */
7382
7383
PyObject *
7384
PyUnicode_DecodeASCII(const char *s,
7385
                      Py_ssize_t size,
7386
                      const char *errors)
7387
12.1M
{
7388
12.1M
    const char *starts = s;
7389
12.1M
    const char *e = s + size;
7390
12.1M
    PyObject *error_handler_obj = NULL;
7391
12.1M
    PyObject *exc = NULL;
7392
12.1M
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
7393
7394
12.1M
    if (size == 0)
7395
0
        _Py_RETURN_UNICODE_EMPTY();
7396
7397
    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
7398
12.1M
    if (size == 1 && (unsigned char)s[0] < 128) {
7399
13.8k
        return get_latin1_char((unsigned char)s[0]);
7400
13.8k
    }
7401
7402
    // Shortcut for simple case
7403
12.1M
    PyObject *u = PyUnicode_New(size, 127);
7404
12.1M
    if (u == NULL) {
7405
0
        return NULL;
7406
0
    }
7407
12.1M
    Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
7408
12.1M
    if (outpos == size) {
7409
9.53M
        return u;
7410
9.53M
    }
7411
7412
2.58M
    _PyUnicodeWriter writer;
7413
2.58M
    _PyUnicodeWriter_InitWithBuffer(&writer, u);
7414
2.58M
    writer.pos = outpos;
7415
7416
2.58M
    s += outpos;
7417
2.58M
    int kind = writer.kind;
7418
2.58M
    void *data = writer.data;
7419
2.58M
    Py_ssize_t startinpos, endinpos;
7420
7421
22.3M
    while (s < e) {
7422
22.1M
        unsigned char c = (unsigned char)*s;
7423
22.1M
        if (c < 128) {
7424
7.86M
            PyUnicode_WRITE(kind, data, writer.pos, c);
7425
7.86M
            writer.pos++;
7426
7.86M
            ++s;
7427
7.86M
            continue;
7428
7.86M
        }
7429
7430
        /* byte outsize range 0x00..0x7f: call the error handler */
7431
7432
14.2M
        if (error_handler == _Py_ERROR_UNKNOWN)
7433
2.58M
            error_handler = _Py_GetErrorHandler(errors);
7434
7435
14.2M
        switch (error_handler)
7436
14.2M
        {
7437
1.01M
        case _Py_ERROR_REPLACE:
7438
11.8M
        case _Py_ERROR_SURROGATEESCAPE:
7439
            /* Fast-path: the error handler only writes one character,
7440
               but we may switch to UCS2 at the first write */
7441
11.8M
            if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7442
0
                goto onError;
7443
11.8M
            kind = writer.kind;
7444
11.8M
            data = writer.data;
7445
7446
11.8M
            if (error_handler == _Py_ERROR_REPLACE)
7447
1.01M
                PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7448
10.8M
            else
7449
10.8M
                PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7450
11.8M
            writer.pos++;
7451
11.8M
            ++s;
7452
11.8M
            break;
7453
7454
0
        case _Py_ERROR_IGNORE:
7455
0
            ++s;
7456
0
            break;
7457
7458
2.39M
        default:
7459
2.39M
            startinpos = s-starts;
7460
2.39M
            endinpos = startinpos + 1;
7461
2.39M
            if (unicode_decode_call_errorhandler_writer(
7462
2.39M
                    errors, &error_handler_obj,
7463
2.39M
                    "ascii", "ordinal not in range(128)",
7464
2.39M
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
7465
2.39M
                    &writer))
7466
2.39M
                goto onError;
7467
0
            kind = writer.kind;
7468
0
            data = writer.data;
7469
14.2M
        }
7470
14.2M
    }
7471
185k
    Py_XDECREF(error_handler_obj);
7472
185k
    Py_XDECREF(exc);
7473
185k
    return _PyUnicodeWriter_Finish(&writer);
7474
7475
2.39M
  onError:
7476
2.39M
    _PyUnicodeWriter_Dealloc(&writer);
7477
2.39M
    Py_XDECREF(error_handler_obj);
7478
2.39M
    Py_XDECREF(exc);
7479
2.39M
    return NULL;
7480
2.58M
}
7481
7482
/* Encode `unicode` as ASCII using the error handler named by `errors`.

   Calls PyErr_BadArgument() and returns NULL if `unicode` is not a str.
   An ASCII-only string is copied directly into a bytes object; anything
   else is passed to unicode_encode_ucs1() with a limit of 128. */
PyObject *
_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (!PyUnicode_IS_ASCII(unicode)) {
        /* Non-ASCII characters are present: let the general encoder apply
           the error handling. */
        return unicode_encode_ucs1(unicode, errors, 128);
    }

    /* Fast path: the character data already is the ASCII encoding. */
    return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
                                     PyUnicode_GET_LENGTH(unicode));
}
7496
7497
/* Encode `unicode` as ASCII, passing NULL as the error handler name. */
PyObject *
PyUnicode_AsASCIIString(PyObject *unicode)
{
    PyObject *encoded = _PyUnicode_AsASCIIString(unicode, NULL);

    return encoded;
}
7502
7503
#ifdef MS_WINDOWS
7504
7505
/* --- MBCS codecs for Windows -------------------------------------------- */
7506
7507
#if SIZEOF_INT < SIZEOF_SIZE_T
7508
#define NEED_RETRY
7509
#endif
7510
7511
/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
7512
   transcoding from UTF-16), but INT_MAX / 4 performs better in
7513
   both cases also and avoids partial characters overrunning the
7514
   length limit in MultiByteToWideChar on Windows */
7515
#define DECODING_CHUNK_SIZE (INT_MAX/4)
7516
7517
#ifndef WC_ERR_INVALID_CHARS
7518
#  define WC_ERR_INVALID_CHARS 0x0080
7519
#endif
7520
7521
static const char*
7522
code_page_name(UINT code_page, PyObject **obj)
7523
{
7524
    *obj = NULL;
7525
    if (code_page == CP_ACP)
7526
        return "mbcs";
7527
7528
    *obj = PyBytes_FromFormat("cp%u", code_page);
7529
    if (*obj == NULL)
7530
        return NULL;
7531
    return PyBytes_AS_STRING(*obj);
7532
}
7533
7534
static DWORD
7535
decode_code_page_flags(UINT code_page)
7536
{
7537
    if (code_page == CP_UTF7) {
7538
        /* The CP_UTF7 decoder only supports flags=0 */
7539
        return 0;
7540
    }
7541
    else
7542
        return MB_ERR_INVALID_CHARS;
7543
}
7544
7545
/*
7546
 * Decode a byte string from a Windows code page into unicode object in strict
7547
 * mode.
7548
 *
7549
 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7550
 * OSError and returns -1 on other error.
7551
 */
7552
static int
7553
decode_code_page_strict(UINT code_page,
7554
                        wchar_t **buf,
7555
                        Py_ssize_t *bufsize,
7556
                        const char *in,
7557
                        int insize)
7558
{
7559
    DWORD flags = MB_ERR_INVALID_CHARS;
7560
    wchar_t *out;
7561
    DWORD outsize;
7562
7563
    /* First get the size of the result */
7564
    assert(insize > 0);
7565
    while ((outsize = MultiByteToWideChar(code_page, flags,
7566
                                          in, insize, NULL, 0)) <= 0)
7567
    {
7568
        if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7569
            goto error;
7570
        }
7571
        /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7572
        flags = 0;
7573
    }
7574
7575
    /* Extend a wchar_t* buffer */
7576
    Py_ssize_t n = *bufsize;   /* Get the current length */
7577
    if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7578
        return -1;
7579
    }
7580
    out = *buf + n;
7581
7582
    /* Do the conversion */
7583
    outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7584
    if (outsize <= 0)
7585
        goto error;
7586
    return insize;
7587
7588
error:
7589
    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7590
        return -2;
7591
    PyErr_SetFromWindowsErr(0);
7592
    return -1;
7593
}
7594
7595
/*
7596
 * Decode a byte string from a code page into unicode object with an error
7597
 * handler.
7598
 *
7599
 * Returns consumed size if succeed, or raise an OSError or
7600
 * UnicodeDecodeError exception and returns -1 on error.
7601
 */
7602
static int
7603
decode_code_page_errors(UINT code_page,
7604
                        wchar_t **buf,
7605
                        Py_ssize_t *bufsize,
7606
                        const char *in, const int size,
7607
                        const char *errors, int final)
7608
{
7609
    const char *startin = in;
7610
    const char *endin = in + size;
7611
    DWORD flags = MB_ERR_INVALID_CHARS;
7612
    /* Ideally, we should get reason from FormatMessage. This is the Windows
7613
       2000 English version of the message. */
7614
    const char *reason = "No mapping for the Unicode character exists "
7615
                         "in the target code page.";
7616
    /* each step cannot decode more than 1 character, but a character can be
7617
       represented as a surrogate pair */
7618
    wchar_t buffer[2], *out;
7619
    int insize;
7620
    Py_ssize_t outsize;
7621
    PyObject *errorHandler = NULL;
7622
    PyObject *exc = NULL;
7623
    PyObject *encoding_obj = NULL;
7624
    const char *encoding;
7625
    DWORD err;
7626
    int ret = -1;
7627
7628
    assert(size > 0);
7629
7630
    encoding = code_page_name(code_page, &encoding_obj);
7631
    if (encoding == NULL)
7632
        return -1;
7633
7634
    if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
7635
        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7636
           UnicodeDecodeError. */
7637
        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7638
        if (exc != NULL) {
7639
            PyCodec_StrictErrors(exc);
7640
            Py_CLEAR(exc);
7641
        }
7642
        goto error;
7643
    }
7644
7645
    /* Extend a wchar_t* buffer */
7646
    Py_ssize_t n = *bufsize;   /* Get the current length */
7647
    if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7648
        PyErr_NoMemory();
7649
        goto error;
7650
    }
7651
    if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7652
        goto error;
7653
    }
7654
    out = *buf + n;
7655
7656
    /* Decode the byte string character per character */
7657
    while (in < endin)
7658
    {
7659
        /* Decode a character */
7660
        insize = 1;
7661
        do
7662
        {
7663
            outsize = MultiByteToWideChar(code_page, flags,
7664
                                          in, insize,
7665
                                          buffer, Py_ARRAY_LENGTH(buffer));
7666
            if (outsize > 0)
7667
                break;
7668
            err = GetLastError();
7669
            if (err == ERROR_INVALID_FLAGS && flags) {
7670
                /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7671
                flags = 0;
7672
                continue;
7673
            }
7674
            if (err != ERROR_NO_UNICODE_TRANSLATION
7675
                && err != ERROR_INSUFFICIENT_BUFFER)
7676
            {
7677
                PyErr_SetFromWindowsErr(err);
7678
                goto error;
7679
            }
7680
            insize++;
7681
        }
7682
        /* 4=maximum length of a UTF-8 sequence */
7683
        while (insize <= 4 && (in + insize) <= endin);
7684
7685
        if (outsize <= 0) {
7686
            Py_ssize_t startinpos, endinpos, outpos;
7687
7688
            /* last character in partial decode? */
7689
            if (in + insize >= endin && !final)
7690
                break;
7691
7692
            startinpos = in - startin;
7693
            endinpos = startinpos + 1;
7694
            outpos = out - *buf;
7695
            if (unicode_decode_call_errorhandler_wchar(
7696
                    errors, &errorHandler,
7697
                    encoding, reason,
7698
                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
7699
                    buf, bufsize, &outpos))
7700
            {
7701
                goto error;
7702
            }
7703
            out = *buf + outpos;
7704
        }
7705
        else {
7706
            in += insize;
7707
            memcpy(out, buffer, outsize * sizeof(wchar_t));
7708
            out += outsize;
7709
        }
7710
    }
7711
7712
    /* Shrink the buffer */
7713
    assert(out - *buf <= *bufsize);
7714
    *bufsize = out - *buf;
7715
    /* (in - startin) <= size and size is an int */
7716
    ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
7717
7718
error:
7719
    Py_XDECREF(encoding_obj);
7720
    Py_XDECREF(errorHandler);
7721
    Py_XDECREF(exc);
7722
    return ret;
7723
}
7724
7725
static PyObject *
7726
decode_code_page_stateful(int code_page,
7727
                          const char *s, Py_ssize_t size,
7728
                          const char *errors, Py_ssize_t *consumed)
7729
{
7730
    wchar_t *buf = NULL;
7731
    Py_ssize_t bufsize = 0;
7732
    int chunk_size, final, converted, done;
7733
7734
    if (code_page < 0) {
7735
        PyErr_SetString(PyExc_ValueError, "invalid code page number");
7736
        return NULL;
7737
    }
7738
    if (size < 0) {
7739
        PyErr_BadInternalCall();
7740
        return NULL;
7741
    }
7742
7743
    if (consumed)
7744
        *consumed = 0;
7745
7746
    do
7747
    {
7748
#ifdef NEED_RETRY
7749
        if (size > DECODING_CHUNK_SIZE) {
7750
            chunk_size = DECODING_CHUNK_SIZE;
7751
            final = 0;
7752
            done = 0;
7753
        }
7754
        else
7755
#endif
7756
        {
7757
            chunk_size = (int)size;
7758
            final = (consumed == NULL);
7759
            done = 1;
7760
        }
7761
7762
        if (chunk_size == 0 && done) {
7763
            if (buf != NULL)
7764
                break;
7765
            _Py_RETURN_UNICODE_EMPTY();
7766
        }
7767
7768
        converted = decode_code_page_strict(code_page, &buf, &bufsize,
7769
                                            s, chunk_size);
7770
        if (converted == -2)
7771
            converted = decode_code_page_errors(code_page, &buf, &bufsize,
7772
                                                s, chunk_size,
7773
                                                errors, final);
7774
        assert(converted != 0 || done);
7775
7776
        if (converted < 0) {
7777
            PyMem_Free(buf);
7778
            return NULL;
7779
        }
7780
7781
        if (consumed)
7782
            *consumed += converted;
7783
7784
        s += converted;
7785
        size -= converted;
7786
    } while (!done);
7787
7788
    PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7789
    PyMem_Free(buf);
7790
    return v;
7791
}
7792
7793
PyObject *
7794
PyUnicode_DecodeCodePageStateful(int code_page,
7795
                                 const char *s,
7796
                                 Py_ssize_t size,
7797
                                 const char *errors,
7798
                                 Py_ssize_t *consumed)
7799
{
7800
    return decode_code_page_stateful(code_page, s, size, errors, consumed);
7801
}
7802
7803
PyObject *
7804
PyUnicode_DecodeMBCSStateful(const char *s,
7805
                             Py_ssize_t size,
7806
                             const char *errors,
7807
                             Py_ssize_t *consumed)
7808
{
7809
    return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7810
}
7811
7812
PyObject *
7813
PyUnicode_DecodeMBCS(const char *s,
7814
                     Py_ssize_t size,
7815
                     const char *errors)
7816
{
7817
    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7818
}
7819
7820
static DWORD
7821
encode_code_page_flags(UINT code_page, const char *errors)
7822
{
7823
    if (code_page == CP_UTF8) {
7824
        return WC_ERR_INVALID_CHARS;
7825
    }
7826
    else if (code_page == CP_UTF7) {
7827
        /* CP_UTF7 only supports flags=0 */
7828
        return 0;
7829
    }
7830
    else {
7831
        if (errors != NULL && strcmp(errors, "replace") == 0)
7832
            return 0;
7833
        else
7834
            return WC_NO_BEST_FIT_CHARS;
7835
    }
7836
}
7837
7838
/*
7839
 * Encode a Unicode string to a Windows code page into a byte string in strict
7840
 * mode.
7841
 *
7842
 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7843
 * an OSError and returns -1 on other error.
7844
 */
7845
static int
7846
encode_code_page_strict(UINT code_page, PyBytesWriter **writer,
7847
                        PyObject *unicode, Py_ssize_t offset, int len,
7848
                        const char* errors)
7849
{
7850
    BOOL usedDefaultChar = FALSE;
7851
    BOOL *pusedDefaultChar = &usedDefaultChar;
7852
    int outsize;
7853
    wchar_t *p;
7854
    Py_ssize_t size;
7855
    const DWORD flags = encode_code_page_flags(code_page, NULL);
7856
    char *out;
7857
    /* Create a substring so that we can get the UTF-16 representation
7858
       of just the slice under consideration. */
7859
    PyObject *substring;
7860
    int ret = -1;
7861
7862
    assert(len > 0);
7863
7864
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
7865
        pusedDefaultChar = &usedDefaultChar;
7866
    else
7867
        pusedDefaultChar = NULL;
7868
7869
    substring = PyUnicode_Substring(unicode, offset, offset+len);
7870
    if (substring == NULL)
7871
        return -1;
7872
    p = PyUnicode_AsWideCharString(substring, &size);
7873
    Py_CLEAR(substring);
7874
    if (p == NULL) {
7875
        return -1;
7876
    }
7877
    assert(size <= INT_MAX);
7878
7879
    /* First get the size of the result */
7880
    outsize = WideCharToMultiByte(code_page, flags,
7881
                                  p, (int)size,
7882
                                  NULL, 0,
7883
                                  NULL, pusedDefaultChar);
7884
    if (outsize <= 0)
7885
        goto error;
7886
    /* If we used a default char, then we failed! */
7887
    if (pusedDefaultChar && *pusedDefaultChar) {
7888
        ret = -2;
7889
        goto done;
7890
    }
7891
7892
    if (*writer == NULL) {
7893
        /* Create string object */
7894
        *writer = PyBytesWriter_Create(outsize);
7895
        if (*writer == NULL) {
7896
            goto done;
7897
        }
7898
        out = PyBytesWriter_GetData(*writer);
7899
    }
7900
    else {
7901
        /* Extend string object */
7902
        Py_ssize_t n = PyBytesWriter_GetSize(*writer);
7903
        if (PyBytesWriter_Grow(*writer, outsize) < 0) {
7904
            goto done;
7905
        }
7906
        out = (char*)PyBytesWriter_GetData(*writer) + n;
7907
    }
7908
7909
    /* Do the conversion */
7910
    outsize = WideCharToMultiByte(code_page, flags,
7911
                                  p, (int)size,
7912
                                  out, outsize,
7913
                                  NULL, pusedDefaultChar);
7914
    if (outsize <= 0)
7915
        goto error;
7916
    if (pusedDefaultChar && *pusedDefaultChar) {
7917
        ret = -2;
7918
        goto done;
7919
    }
7920
    ret = 0;
7921
7922
done:
7923
    PyMem_Free(p);
7924
    return ret;
7925
7926
error:
7927
    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
7928
        ret = -2;
7929
        goto done;
7930
    }
7931
    PyErr_SetFromWindowsErr(0);
7932
    goto done;
7933
}
7934
7935
/*
7936
 * Encode a Unicode string to a Windows code page into a byte string using an
7937
 * error handler.
7938
 *
7939
 * Returns consumed characters if succeed, or raise an OSError and returns
7940
 * -1 on other error.
7941
 */
7942
static int
7943
encode_code_page_errors(UINT code_page, PyBytesWriter **writer,
7944
                        PyObject *unicode, Py_ssize_t unicode_offset,
7945
                        Py_ssize_t insize, const char* errors)
7946
{
7947
    const DWORD flags = encode_code_page_flags(code_page, errors);
7948
    Py_ssize_t pos = unicode_offset;
7949
    Py_ssize_t endin = unicode_offset + insize;
7950
    /* Ideally, we should get reason from FormatMessage. This is the Windows
7951
       2000 English version of the message. */
7952
    const char *reason = "invalid character";
7953
    /* 4=maximum length of a UTF-8 sequence */
7954
    char buffer[4];
7955
    BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7956
    Py_ssize_t outsize;
7957
    char *out;
7958
    PyObject *errorHandler = NULL;
7959
    PyObject *exc = NULL;
7960
    PyObject *encoding_obj = NULL;
7961
    const char *encoding;
7962
    Py_ssize_t newpos;
7963
    PyObject *rep;
7964
    int ret = -1;
7965
7966
    assert(insize > 0);
7967
7968
    encoding = code_page_name(code_page, &encoding_obj);
7969
    if (encoding == NULL)
7970
        return -1;
7971
7972
    if (errors == NULL || strcmp(errors, "strict") == 0) {
7973
        /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7974
           then we raise a UnicodeEncodeError. */
7975
        make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
7976
        if (exc != NULL) {
7977
            PyCodec_StrictErrors(exc);
7978
            Py_DECREF(exc);
7979
        }
7980
        Py_XDECREF(encoding_obj);
7981
        return -1;
7982
    }
7983
7984
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
7985
        pusedDefaultChar = &usedDefaultChar;
7986
    else
7987
        pusedDefaultChar = NULL;
7988
7989
    if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7990
        PyErr_NoMemory();
7991
        goto error;
7992
    }
7993
    outsize = insize * Py_ARRAY_LENGTH(buffer);
7994
7995
    if (*writer == NULL) {
7996
        /* Create string object */
7997
        *writer = PyBytesWriter_Create(outsize);
7998
        if (*writer == NULL) {
7999
            goto error;
8000
        }
8001
        out = PyBytesWriter_GetData(*writer);
8002
    }
8003
    else {
8004
        /* Extend string object */
8005
        Py_ssize_t n = PyBytesWriter_GetSize(*writer);
8006
        if (PyBytesWriter_Grow(*writer, outsize) < 0) {
8007
            goto error;
8008
        }
8009
        out = (char*)PyBytesWriter_GetData(*writer) + n;
8010
    }
8011
8012
    /* Encode the string character per character */
8013
    while (pos < endin)
8014
    {
8015
        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
8016
        wchar_t chars[2];
8017
        int charsize;
8018
        if (ch < 0x10000) {
8019
            chars[0] = (wchar_t)ch;
8020
            charsize = 1;
8021
        }
8022
        else {
8023
            chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
8024
            chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
8025
            charsize = 2;
8026
        }
8027
8028
        outsize = WideCharToMultiByte(code_page, flags,
8029
                                      chars, charsize,
8030
                                      buffer, Py_ARRAY_LENGTH(buffer),
8031
                                      NULL, pusedDefaultChar);
8032
        if (outsize > 0) {
8033
            if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
8034
            {
8035
                pos++;
8036
                memcpy(out, buffer, outsize);
8037
                out += outsize;
8038
                continue;
8039
            }
8040
        }
8041
        else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
8042
            PyErr_SetFromWindowsErr(0);
8043
            goto error;
8044
        }
8045
8046
        rep = unicode_encode_call_errorhandler(
8047
                  errors, &errorHandler, encoding, reason,
8048
                  unicode, &exc,
8049
                  pos, pos + 1, &newpos);
8050
        if (rep == NULL)
8051
            goto error;
8052
8053
        Py_ssize_t morebytes = pos - newpos;
8054
        if (PyBytes_Check(rep)) {
8055
            outsize = PyBytes_GET_SIZE(rep);
8056
            morebytes += outsize;
8057
            if (morebytes > 0) {
8058
                out = PyBytesWriter_GrowAndUpdatePointer(*writer, morebytes, out);
8059
                if (out == NULL) {
8060
                    Py_DECREF(rep);
8061
                    goto error;
8062
                }
8063
            }
8064
            memcpy(out, PyBytes_AS_STRING(rep), outsize);
8065
            out += outsize;
8066
        }
8067
        else {
8068
            Py_ssize_t i;
8069
            int kind;
8070
            const void *data;
8071
8072
            outsize = PyUnicode_GET_LENGTH(rep);
8073
            morebytes += outsize;
8074
            if (morebytes > 0) {
8075
                out = PyBytesWriter_GrowAndUpdatePointer(*writer, morebytes, out);
8076
                if (out == NULL) {
8077
                    Py_DECREF(rep);
8078
                    goto error;
8079
                }
8080
            }
8081
            kind = PyUnicode_KIND(rep);
8082
            data = PyUnicode_DATA(rep);
8083
            for (i=0; i < outsize; i++) {
8084
                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8085
                if (ch > 127) {
8086
                    raise_encode_exception(&exc,
8087
                        encoding, unicode,
8088
                        pos, pos + 1,
8089
                        "unable to encode error handler result to ASCII");
8090
                    Py_DECREF(rep);
8091
                    goto error;
8092
                }
8093
                *out = (unsigned char)ch;
8094
                out++;
8095
            }
8096
        }
8097
        pos = newpos;
8098
        Py_DECREF(rep);
8099
    }
8100
    /* write a NUL byte */
8101
    *out = 0;
8102
    outsize = out - (char*)PyBytesWriter_GetData(*writer);
8103
    assert(outsize <= PyBytesWriter_GetSize(*writer));
8104
    if (PyBytesWriter_Resize(*writer, outsize) < 0) {
8105
        goto error;
8106
    }
8107
    ret = 0;
8108
8109
error:
8110
    Py_XDECREF(encoding_obj);
8111
    Py_XDECREF(errorHandler);
8112
    Py_XDECREF(exc);
8113
    return ret;
8114
}
8115
8116
8117
/* Encode the str object UNICODE to the Windows code page CODE_PAGE and
   return the result from PyBytesWriter_Finish().

   The string is processed in chunks.  Each chunk is first given to
   encode_code_page_strict(); if that reports an unencodable chunk (-2) the
   same chunk is handed to encode_code_page_errors() with ERRORS.

   Returns NULL with an exception set on failure. */
PyObject *
PyUnicode_EncodeCodePage(int code_page,
                         PyObject *unicode,
                         const char *errors)
{
    PyBytesWriter *writer = NULL;
    Py_ssize_t remaining;
    Py_ssize_t offset = 0;
    int chunk_len, status, last_chunk;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    remaining = PyUnicode_GET_LENGTH(unicode);

    if (code_page < 0) {
        PyErr_SetString(PyExc_ValueError, "invalid code page number");
        return NULL;
    }

    if (remaining == 0) {
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
    }

    for (;;) {
#ifdef NEED_RETRY
        if (remaining > DECODING_CHUNK_SIZE) {
            chunk_len = DECODING_CHUNK_SIZE;
            last_chunk = 0;
        }
        else
#endif
        {
            chunk_len = (int)remaining;
            last_chunk = 1;
        }

        status = encode_code_page_strict(code_page, &writer,
                                         unicode, offset, chunk_len,
                                         errors);
        if (status == -2) {
            status = encode_code_page_errors(code_page, &writer,
                                             unicode, offset,
                                             chunk_len, errors);
        }
        if (status < 0) {
            PyBytesWriter_Discard(writer);
            return NULL;
        }

        offset += chunk_len;
        remaining -= chunk_len;

        if (last_chunk) {
            break;
        }
    }

    return PyBytesWriter_Finish(writer);
}
8175
8176
8177
/* Encode UNICODE with the CP_ACP code page and a NULL error-handler name
   by delegating to PyUnicode_EncodeCodePage(). */
PyObject *
PyUnicode_AsMBCSString(PyObject *unicode)
{
    const char *default_errors = NULL;

    return PyUnicode_EncodeCodePage(CP_ACP, unicode, default_errors);
}
8182
8183
#undef NEED_RETRY
8184
8185
#endif /* MS_WINDOWS */
8186
8187
/* --- Character Mapping Codec -------------------------------------------- */
8188
8189
static int
8190
charmap_decode_string(const char *s,
8191
                      Py_ssize_t size,
8192
                      PyObject *mapping,
8193
                      const char *errors,
8194
                      _PyUnicodeWriter *writer)
8195
685k
{
8196
685k
    const char *starts = s;
8197
685k
    const char *e;
8198
685k
    Py_ssize_t startinpos, endinpos;
8199
685k
    PyObject *errorHandler = NULL, *exc = NULL;
8200
685k
    Py_ssize_t maplen;
8201
685k
    int mapkind;
8202
685k
    const void *mapdata;
8203
685k
    Py_UCS4 x;
8204
685k
    unsigned char ch;
8205
8206
685k
    maplen = PyUnicode_GET_LENGTH(mapping);
8207
685k
    mapdata = PyUnicode_DATA(mapping);
8208
685k
    mapkind = PyUnicode_KIND(mapping);
8209
8210
685k
    e = s + size;
8211
8212
685k
    if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
8213
        /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
8214
         * is disabled in encoding aliases, latin1 is preferred because
8215
         * its implementation is faster. */
8216
370
        const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
8217
370
        Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8218
370
        Py_UCS4 maxchar = writer->maxchar;
8219
8220
370
        assert (writer->kind == PyUnicode_1BYTE_KIND);
8221
104k
        while (s < e) {
8222
104k
            ch = *s;
8223
104k
            x = mapdata_ucs1[ch];
8224
104k
            if (x > maxchar) {
8225
365
                if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
8226
0
                    goto onError;
8227
365
                maxchar = writer->maxchar;
8228
365
                outdata = (Py_UCS1 *)writer->data;
8229
365
            }
8230
104k
            outdata[writer->pos] = x;
8231
104k
            writer->pos++;
8232
104k
            ++s;
8233
104k
        }
8234
370
        return 0;
8235
370
    }
8236
8237
807k
    while (s < e) {
8238
790k
        if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
8239
790k
            int outkind = writer->kind;
8240
790k
            const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
8241
790k
            if (outkind == PyUnicode_1BYTE_KIND) {
8242
725k
                Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8243
725k
                Py_UCS4 maxchar = writer->maxchar;
8244
20.3M
                while (s < e) {
8245
19.6M
                    ch = *s;
8246
19.6M
                    x = mapdata_ucs2[ch];
8247
19.6M
                    if (x > maxchar)
8248
83.4k
                        goto Error;
8249
19.5M
                    outdata[writer->pos] = x;
8250
19.5M
                    writer->pos++;
8251
19.5M
                    ++s;
8252
19.5M
                }
8253
641k
                break;
8254
725k
            }
8255
65.8k
            else if (outkind == PyUnicode_2BYTE_KIND) {
8256
65.8k
                Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
8257
37.9M
                while (s < e) {
8258
37.9M
                    ch = *s;
8259
37.9M
                    x = mapdata_ucs2[ch];
8260
37.9M
                    if (x == 0xFFFE)
8261
39.2k
                        goto Error;
8262
37.9M
                    outdata[writer->pos] = x;
8263
37.9M
                    writer->pos++;
8264
37.9M
                    ++s;
8265
37.9M
                }
8266
26.6k
                break;
8267
65.8k
            }
8268
790k
        }
8269
0
        ch = *s;
8270
8271
0
        if (ch < maplen)
8272
0
            x = PyUnicode_READ(mapkind, mapdata, ch);
8273
0
        else
8274
0
            x = 0xfffe; /* invalid value */
8275
122k
Error:
8276
122k
        if (x == 0xfffe)
8277
62.5k
        {
8278
            /* undefined mapping */
8279
62.5k
            startinpos = s-starts;
8280
62.5k
            endinpos = startinpos+1;
8281
62.5k
            if (unicode_decode_call_errorhandler_writer(
8282
62.5k
                    errors, &errorHandler,
8283
62.5k
                    "charmap", "character maps to <undefined>",
8284
62.5k
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
8285
62.5k
                    writer)) {
8286
368
                goto onError;
8287
368
            }
8288
62.1k
            continue;
8289
62.5k
        }
8290
8291
60.1k
        if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8292
0
            goto onError;
8293
60.1k
        ++s;
8294
60.1k
    }
8295
685k
    Py_XDECREF(errorHandler);
8296
685k
    Py_XDECREF(exc);
8297
685k
    return 0;
8298
8299
368
onError:
8300
368
    Py_XDECREF(errorHandler);
8301
368
    Py_XDECREF(exc);
8302
368
    return -1;
8303
685k
}
8304
8305
static int
8306
charmap_decode_mapping(const char *s,
8307
                       Py_ssize_t size,
8308
                       PyObject *mapping,
8309
                       const char *errors,
8310
                       _PyUnicodeWriter *writer)
8311
0
{
8312
0
    const char *starts = s;
8313
0
    const char *e;
8314
0
    Py_ssize_t startinpos, endinpos;
8315
0
    PyObject *errorHandler = NULL, *exc = NULL;
8316
0
    unsigned char ch;
8317
0
    PyObject *key, *item = NULL;
8318
8319
0
    e = s + size;
8320
8321
0
    while (s < e) {
8322
0
        ch = *s;
8323
8324
        /* Get mapping (char ordinal -> integer, Unicode char or None) */
8325
0
        key = PyLong_FromLong((long)ch);
8326
0
        if (key == NULL)
8327
0
            goto onError;
8328
8329
0
        int rc = PyMapping_GetOptionalItem(mapping, key, &item);
8330
0
        Py_DECREF(key);
8331
0
        if (rc == 0) {
8332
            /* No mapping found means: mapping is undefined. */
8333
0
            goto Undefined;
8334
0
        }
8335
0
        if (item == NULL) {
8336
0
            if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8337
                /* No mapping found means: mapping is undefined. */
8338
0
                PyErr_Clear();
8339
0
                goto Undefined;
8340
0
            } else
8341
0
                goto onError;
8342
0
        }
8343
8344
        /* Apply mapping */
8345
0
        if (item == Py_None)
8346
0
            goto Undefined;
8347
0
        if (PyLong_Check(item)) {
8348
0
            long value = PyLong_AsLong(item);
8349
0
            if (value == 0xFFFE)
8350
0
                goto Undefined;
8351
0
            if (value < 0 || value > MAX_UNICODE) {
8352
0
                PyErr_Format(PyExc_TypeError,
8353
0
                             "character mapping must be in range(0x%lx)",
8354
0
                             (unsigned long)MAX_UNICODE + 1);
8355
0
                goto onError;
8356
0
            }
8357
8358
0
            if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8359
0
                goto onError;
8360
0
        }
8361
0
        else if (PyUnicode_Check(item)) {
8362
0
            if (PyUnicode_GET_LENGTH(item) == 1) {
8363
0
                Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8364
0
                if (value == 0xFFFE)
8365
0
                    goto Undefined;
8366
0
                if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8367
0
                    goto onError;
8368
0
            }
8369
0
            else {
8370
0
                writer->overallocate = 1;
8371
0
                if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8372
0
                    goto onError;
8373
0
            }
8374
0
        }
8375
0
        else {
8376
            /* wrong return value */
8377
0
            PyErr_SetString(PyExc_TypeError,
8378
0
                            "character mapping must return integer, None or str");
8379
0
            goto onError;
8380
0
        }
8381
0
        Py_CLEAR(item);
8382
0
        ++s;
8383
0
        continue;
8384
8385
0
Undefined:
8386
        /* undefined mapping */
8387
0
        Py_CLEAR(item);
8388
0
        startinpos = s-starts;
8389
0
        endinpos = startinpos+1;
8390
0
        if (unicode_decode_call_errorhandler_writer(
8391
0
                errors, &errorHandler,
8392
0
                "charmap", "character maps to <undefined>",
8393
0
                &starts, &e, &startinpos, &endinpos, &exc, &s,
8394
0
                writer)) {
8395
0
            goto onError;
8396
0
        }
8397
0
    }
8398
0
    Py_XDECREF(errorHandler);
8399
0
    Py_XDECREF(exc);
8400
0
    return 0;
8401
8402
0
onError:
8403
0
    Py_XDECREF(item);
8404
0
    Py_XDECREF(errorHandler);
8405
0
    Py_XDECREF(exc);
8406
0
    return -1;
8407
0
}
8408
8409
PyObject *
8410
PyUnicode_DecodeCharmap(const char *s,
8411
                        Py_ssize_t size,
8412
                        PyObject *mapping,
8413
                        const char *errors)
8414
685k
{
8415
685k
    _PyUnicodeWriter writer;
8416
8417
    /* Default to Latin-1 */
8418
685k
    if (mapping == NULL)
8419
12
        return PyUnicode_DecodeLatin1(s, size, errors);
8420
8421
685k
    if (size == 0)
8422
0
        _Py_RETURN_UNICODE_EMPTY();
8423
685k
    _PyUnicodeWriter_Init(&writer);
8424
685k
    writer.min_length = size;
8425
685k
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
8426
0
        goto onError;
8427
8428
685k
    if (PyUnicode_CheckExact(mapping)) {
8429
685k
        if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8430
368
            goto onError;
8431
685k
    }
8432
0
    else {
8433
0
        if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8434
0
            goto onError;
8435
0
    }
8436
685k
    return _PyUnicodeWriter_Finish(&writer);
8437
8438
368
  onError:
8439
368
    _PyUnicodeWriter_Dealloc(&writer);
8440
368
    return NULL;
8441
685k
}
8442
8443
/* Charmap encoding: the lookup table */
8444
8445
/*[clinic input]
8446
class EncodingMap "struct encoding_map *" "&EncodingMapType"
8447
[clinic start generated code]*/
8448
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=14e46bbb6c522d22]*/
8449
8450
/* Three-level lookup table from a BMP code point to a byte value, as built
   by PyUnicode_BuildEncodingMap() and read by encoding_map_lookup(). */
struct encoding_map {
8451
    PyObject_HEAD
8452
    /* First level, indexed by (code point >> 11); 0xFF marks an unused
       slot, any other value selects a 16-byte level-2 block. */
    unsigned char level1[32];
8453
    /* Number of level-2 blocks (16 bytes each) and level-3 blocks
       (128 bytes each) stored in level23. */
    int count2, count3;
8454
    /* Level-2 blocks (16*count2 bytes) followed by level-3 blocks
       (128*count3 bytes).  Declared with one element; the builder
       allocates the object with the extra space it needs. */
    unsigned char level23[1];
8455
};
8456
8457
/*[clinic input]
8458
EncodingMap.size
8459
8460
Return the size (in bytes) of this object.
8461
[clinic start generated code]*/
8462
8463
static PyObject *
8464
EncodingMap_size_impl(struct encoding_map *self)
8465
/*[clinic end generated code: output=c4c969e4c99342a4 input=004ff13f26bb5366]*/
8466
0
{
8467
0
    return PyLong_FromLong((sizeof(*self) - 1) + 16*self->count2 +
8468
0
                           128*self->count3);
8469
0
}
8470
8471
/* Method table referenced by EncodingMapType.tp_methods.
   NOTE(review): ENCODINGMAP_SIZE_METHODDEF is defined outside this view;
   presumably it is the Argument Clinic entry for EncodingMap.size. */
static PyMethodDef encoding_map_methods[] = {
8472
    ENCODINGMAP_SIZE_METHODDEF
8473
    {NULL, NULL}
8474
};
8475
8476
/* Type object for the trie objects that PyUnicode_BuildEncodingMap()
   initializes with _PyObject_Init().  tp_basicsize covers only the fixed
   struct; the builder allocates each instance with additional room for
   the level-2/level-3 tables. */
static PyTypeObject EncodingMapType = {
8477
    PyVarObject_HEAD_INIT(NULL, 0)
8478
    .tp_name = "EncodingMap",
8479
    .tp_basicsize = sizeof(struct encoding_map),
8480
    /* methods */
8481
    .tp_flags = Py_TPFLAGS_DEFAULT,
8482
    .tp_methods = encoding_map_methods,
8483
};
8484
8485
/* Build an encoding map from STRING, a decoding table in which the
   character at index i is what byte value i decodes to.  Only the first
   256 characters are considered.

   The usual result is an EncodingMap trie object.  A dict mapping code
   point -> byte value is returned instead when character 0 is not
   U+0000, when one of the following characters is U+0000 or lies outside
   the BMP, or when the trie would need 0xFF or more blocks at a level.

   Returns a new reference, or NULL with an exception set. */
PyObject*
PyUnicode_BuildEncodingMap(PyObject* string)
{
    unsigned char level1[32];
    unsigned char level2[512];
    int count2 = 0, count3 = 0;
    int need_dict = 0;
    int i;

    if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
        PyErr_BadArgument();
        return NULL;
    }

    const int kind = PyUnicode_KIND(string);
    const void *data = PyUnicode_DATA(string);
    const int length = (int)Py_MIN(PyUnicode_GET_LENGTH(string), 256);

    memset(level1, 0xFF, sizeof level1);
    memset(level2, 0xFF, sizeof level2);

    /* If there isn't a one-to-one mapping of NULL to \0,
       or if there are non-BMP characters, we need to use
       a mapping dictionary. */
    if (PyUnicode_READ(kind, data, 0) != 0) {
        need_dict = 1;
    }

    /* First pass: count the level-2 and level-3 blocks the trie needs. */
    for (i = 1; i < length; i++) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);

        if (ch == 0 || ch > 0xFFFF) {
            need_dict = 1;
            break;
        }
        if (ch == 0xFFFE) {
            /* unmapped character */
            continue;
        }

        int l1 = ch >> 11;
        int l2 = ch >> 7;
        if (level1[l1] == 0xFF) {
            level1[l1] = count2++;
        }
        if (level2[l2] == 0xFF) {
            level2[l2] = count3++;
        }
    }

    /* 0xFF is the "unused" marker, so block numbers must stay below it. */
    if (count2 >= 0xFF || count3 >= 0xFF) {
        need_dict = 1;
    }

    if (need_dict) {
        PyObject *dict = PyDict_New();
        if (dict == NULL) {
            return NULL;
        }
        for (i = 0; i < length; i++) {
            Py_UCS4 c = PyUnicode_READ(kind, data, i);
            PyObject *key = PyLong_FromLong(c);
            PyObject *value = (key != NULL) ? PyLong_FromLong(i) : NULL;

            if (value == NULL) {
                Py_XDECREF(key);
                Py_DECREF(dict);
                return NULL;
            }
            int rc = PyDict_SetItem(dict, key, value);
            Py_DECREF(key);
            Py_DECREF(value);
            if (rc < 0) {
                Py_DECREF(dict);
                return NULL;
            }
        }
        return dict;
    }

    /* Create a three-level trie */
    PyObject *result = PyObject_Malloc(sizeof(struct encoding_map) +
                                       16*count2 + 128*count3 - 1);
    if (result == NULL) {
        return PyErr_NoMemory();
    }

    _PyObject_Init(result, &EncodingMapType);
    struct encoding_map *map = (struct encoding_map*)result;
    map->count2 = count2;
    map->count3 = count3;

    unsigned char *mlevel1 = map->level1;
    unsigned char *mlevel2 = map->level23;
    unsigned char *mlevel3 = map->level23 + 16*count2;

    memcpy(mlevel1, level1, 32);
    memset(mlevel2, 0xFF, 16*count2);
    memset(mlevel3, 0, 128*count3);

    /* Second pass: hand out level-3 blocks in order of first use and
       store each byte value in its slot. */
    int next3 = 0;
    for (i = 1; i < length; i++) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);

        if (ch == 0xFFFE) {
            /* unmapped character */
            continue;
        }

        int o1 = ch>>11;
        int o2 = (ch>>7) & 0xF;
        int i2 = 16*mlevel1[o1] + o2;
        if (mlevel2[i2] == 0xFF) {
            mlevel2[i2] = next3++;
        }

        int o3 = ch & 0x7F;
        int i3 = 128*mlevel2[i2] + o3;
        mlevel3[i3] = i;
    }
    return result;
}
8600
8601
static int
8602
encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
8603
0
{
8604
0
    struct encoding_map *map = (struct encoding_map*)mapping;
8605
0
    int l1 = c>>11;
8606
0
    int l2 = (c>>7) & 0xF;
8607
0
    int l3 = c & 0x7F;
8608
0
    int i;
8609
8610
0
    if (c > 0xFFFF)
8611
0
        return -1;
8612
0
    if (c == 0)
8613
0
        return 0;
8614
    /* level 1*/
8615
0
    i = map->level1[l1];
8616
0
    if (i == 0xFF) {
8617
0
        return -1;
8618
0
    }
8619
    /* level 2*/
8620
0
    i = map->level23[16*i+l2];
8621
0
    if (i == 0xFF) {
8622
0
        return -1;
8623
0
    }
8624
    /* level 3 */
8625
0
    i = map->level23[16*map->count2 + 128*i + l3];
8626
0
    if (i == 0) {
8627
0
        return -1;
8628
0
    }
8629
0
    return i;
8630
0
}
8631
8632
/* Lookup the character in the mapping.
8633
   On success, return PyLong, PyBytes or None (if the character can't be found).
8634
   If the result is PyLong, put its value in replace.
8635
   On error, return NULL.
8636
   */
8637
static PyObject *
8638
charmapencode_lookup(Py_UCS4 c, PyObject *mapping, unsigned char *replace)
8639
0
{
8640
0
    PyObject *w = PyLong_FromLong((long)c);
8641
0
    PyObject *x;
8642
8643
0
    if (w == NULL)
8644
0
        return NULL;
8645
0
    int rc = PyMapping_GetOptionalItem(mapping, w, &x);
8646
0
    Py_DECREF(w);
8647
0
    if (rc == 0) {
8648
        /* No mapping found means: mapping is undefined. */
8649
0
        Py_RETURN_NONE;
8650
0
    }
8651
0
    if (x == NULL) {
8652
0
        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8653
            /* No mapping found means: mapping is undefined. */
8654
0
            PyErr_Clear();
8655
0
            Py_RETURN_NONE;
8656
0
        } else
8657
0
            return NULL;
8658
0
    }
8659
0
    else if (x == Py_None)
8660
0
        return x;
8661
0
    else if (PyLong_Check(x)) {
8662
0
        long value = PyLong_AsLong(x);
8663
0
        if (value < 0 || value > 255) {
8664
0
            PyErr_SetString(PyExc_TypeError,
8665
0
                            "character mapping must be in range(256)");
8666
0
            Py_DECREF(x);
8667
0
            return NULL;
8668
0
        }
8669
0
        *replace = (unsigned char)value;
8670
0
        return x;
8671
0
    }
8672
0
    else if (PyBytes_Check(x))
8673
0
        return x;
8674
0
    else {
8675
        /* wrong return value */
8676
0
        PyErr_Format(PyExc_TypeError,
8677
0
                     "character mapping must return integer, bytes or None, not %.400s",
8678
0
                     Py_TYPE(x)->tp_name);
8679
0
        Py_DECREF(x);
8680
0
        return NULL;
8681
0
    }
8682
0
}
8683
8684
static int
8685
charmapencode_resize(PyBytesWriter *writer, Py_ssize_t *outpos, Py_ssize_t requiredsize)
8686
0
{
8687
0
    Py_ssize_t outsize = PyBytesWriter_GetSize(writer);
8688
    /* exponentially overallocate to minimize reallocations */
8689
0
    if (requiredsize < 2 * outsize)
8690
0
        requiredsize = 2 * outsize;
8691
0
    return PyBytesWriter_Resize(writer, requiredsize);
8692
0
}
8693
8694
/* Outcome of trying to encode a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred; an exception is set */
} charmapencode_result;
8697
/* lookup the character, put the result in the output string and adjust
8698
   various state variables. Resize the output bytes object if not enough
8699
   space is available. Return a new reference to the object that
8700
   was put in the output buffer, or Py_None, if the mapping was undefined
8701
   (in which case no character was written) or NULL, if a
8702
   reallocation error occurred. The caller must decref the result */
8703
static charmapencode_result
8704
charmapencode_output(Py_UCS4 c, PyObject *mapping,
8705
                     PyBytesWriter *writer, Py_ssize_t *outpos)
8706
0
{
8707
0
    PyObject *rep;
8708
0
    unsigned char replace;
8709
0
    char *outstart;
8710
0
    Py_ssize_t outsize = _PyBytesWriter_GetSize(writer);
8711
8712
0
    if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8713
0
        int res = encoding_map_lookup(c, mapping);
8714
0
        Py_ssize_t requiredsize = *outpos+1;
8715
0
        if (res == -1) {
8716
0
            return enc_FAILED;
8717
0
        }
8718
8719
0
        if (outsize<requiredsize) {
8720
0
            if (charmapencode_resize(writer, outpos, requiredsize)) {
8721
0
                return enc_EXCEPTION;
8722
0
            }
8723
0
        }
8724
0
        outstart = _PyBytesWriter_GetData(writer);
8725
0
        outstart[(*outpos)++] = (char)res;
8726
0
        return enc_SUCCESS;
8727
0
    }
8728
8729
0
    rep = charmapencode_lookup(c, mapping, &replace);
8730
0
    if (rep==NULL)
8731
0
        return enc_EXCEPTION;
8732
0
    else if (rep==Py_None) {
8733
0
        Py_DECREF(rep);
8734
0
        return enc_FAILED;
8735
0
    } else {
8736
0
        if (PyLong_Check(rep)) {
8737
0
            Py_ssize_t requiredsize = *outpos+1;
8738
0
            if (outsize<requiredsize)
8739
0
                if (charmapencode_resize(writer, outpos, requiredsize)) {
8740
0
                    Py_DECREF(rep);
8741
0
                    return enc_EXCEPTION;
8742
0
                }
8743
0
            outstart = _PyBytesWriter_GetData(writer);
8744
0
            outstart[(*outpos)++] = (char)replace;
8745
0
        }
8746
0
        else {
8747
0
            const char *repchars = PyBytes_AS_STRING(rep);
8748
0
            Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8749
0
            Py_ssize_t requiredsize = *outpos+repsize;
8750
0
            if (outsize<requiredsize)
8751
0
                if (charmapencode_resize(writer, outpos, requiredsize)) {
8752
0
                    Py_DECREF(rep);
8753
0
                    return enc_EXCEPTION;
8754
0
                }
8755
0
            outstart = _PyBytesWriter_GetData(writer);
8756
0
            memcpy(outstart + *outpos, repchars, repsize);
8757
0
            *outpos += repsize;
8758
0
        }
8759
0
    }
8760
0
    Py_DECREF(rep);
8761
0
    return enc_SUCCESS;
8762
0
}
8763
8764
/* handle an error in _PyUnicode_EncodeCharmap()
8765
   Return 0 on success, -1 on error */
8766
static int
8767
charmap_encoding_error(
8768
    PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
8769
    PyObject **exceptionObject,
8770
    _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
8771
    PyBytesWriter *writer, Py_ssize_t *respos)
8772
0
{
8773
0
    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8774
0
    Py_ssize_t size, repsize;
8775
0
    Py_ssize_t newpos;
8776
0
    int kind;
8777
0
    const void *data;
8778
0
    Py_ssize_t index;
8779
    /* startpos for collecting unencodable chars */
8780
0
    Py_ssize_t collstartpos = *inpos;
8781
0
    Py_ssize_t collendpos = *inpos+1;
8782
0
    Py_ssize_t collpos;
8783
0
    const char *encoding = "charmap";
8784
0
    const char *reason = "character maps to <undefined>";
8785
0
    charmapencode_result x;
8786
0
    Py_UCS4 ch;
8787
0
    int val;
8788
8789
0
    size = PyUnicode_GET_LENGTH(unicode);
8790
    /* find all unencodable characters */
8791
0
    while (collendpos < size) {
8792
0
        PyObject *rep;
8793
0
        unsigned char replace;
8794
0
        if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8795
0
            ch = PyUnicode_READ_CHAR(unicode, collendpos);
8796
0
            val = encoding_map_lookup(ch, mapping);
8797
0
            if (val != -1)
8798
0
                break;
8799
0
            ++collendpos;
8800
0
            continue;
8801
0
        }
8802
8803
0
        ch = PyUnicode_READ_CHAR(unicode, collendpos);
8804
0
        rep = charmapencode_lookup(ch, mapping, &replace);
8805
0
        if (rep==NULL)
8806
0
            return -1;
8807
0
        else if (rep!=Py_None) {
8808
0
            Py_DECREF(rep);
8809
0
            break;
8810
0
        }
8811
0
        Py_DECREF(rep);
8812
0
        ++collendpos;
8813
0
    }
8814
    /* cache callback name lookup
8815
     * (if not done yet, i.e. it's the first error) */
8816
0
    if (*error_handler == _Py_ERROR_UNKNOWN)
8817
0
        *error_handler = _Py_GetErrorHandler(errors);
8818
8819
0
    switch (*error_handler) {
8820
0
    case _Py_ERROR_STRICT:
8821
0
        raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8822
0
        return -1;
8823
8824
0
    case _Py_ERROR_REPLACE:
8825
0
        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
8826
0
            x = charmapencode_output('?', mapping, writer, respos);
8827
0
            if (x==enc_EXCEPTION) {
8828
0
                return -1;
8829
0
            }
8830
0
            else if (x==enc_FAILED) {
8831
0
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8832
0
                return -1;
8833
0
            }
8834
0
        }
8835
0
        _Py_FALLTHROUGH;
8836
0
    case _Py_ERROR_IGNORE:
8837
0
        *inpos = collendpos;
8838
0
        break;
8839
8840
0
    case _Py_ERROR_XMLCHARREFREPLACE:
8841
        /* generate replacement (temporarily (mis)uses p) */
8842
0
        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
8843
0
            char buffer[2+29+1+1];
8844
0
            char *cp;
8845
0
            sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
8846
0
            for (cp = buffer; *cp; ++cp) {
8847
0
                x = charmapencode_output(*cp, mapping, writer, respos);
8848
0
                if (x==enc_EXCEPTION)
8849
0
                    return -1;
8850
0
                else if (x==enc_FAILED) {
8851
0
                    raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8852
0
                    return -1;
8853
0
                }
8854
0
            }
8855
0
        }
8856
0
        *inpos = collendpos;
8857
0
        break;
8858
8859
0
    default:
8860
0
        repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
8861
0
                                                      encoding, reason, unicode, exceptionObject,
8862
0
                                                      collstartpos, collendpos, &newpos);
8863
0
        if (repunicode == NULL)
8864
0
            return -1;
8865
0
        if (PyBytes_Check(repunicode)) {
8866
            /* Directly copy bytes result to output. */
8867
0
            Py_ssize_t outsize = PyBytesWriter_GetSize(writer);
8868
0
            Py_ssize_t requiredsize;
8869
0
            repsize = PyBytes_Size(repunicode);
8870
0
            requiredsize = *respos + repsize;
8871
0
            if (requiredsize > outsize)
8872
                /* Make room for all additional bytes. */
8873
0
                if (charmapencode_resize(writer, respos, requiredsize)) {
8874
0
                    Py_DECREF(repunicode);
8875
0
                    return -1;
8876
0
                }
8877
0
            memcpy((char*)PyBytesWriter_GetData(writer) + *respos,
8878
0
                   PyBytes_AsString(repunicode),  repsize);
8879
0
            *respos += repsize;
8880
0
            *inpos = newpos;
8881
0
            Py_DECREF(repunicode);
8882
0
            break;
8883
0
        }
8884
        /* generate replacement  */
8885
0
        repsize = PyUnicode_GET_LENGTH(repunicode);
8886
0
        data = PyUnicode_DATA(repunicode);
8887
0
        kind = PyUnicode_KIND(repunicode);
8888
0
        for (index = 0; index < repsize; index++) {
8889
0
            Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8890
0
            x = charmapencode_output(repch, mapping, writer, respos);
8891
0
            if (x==enc_EXCEPTION) {
8892
0
                Py_DECREF(repunicode);
8893
0
                return -1;
8894
0
            }
8895
0
            else if (x==enc_FAILED) {
8896
0
                Py_DECREF(repunicode);
8897
0
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8898
0
                return -1;
8899
0
            }
8900
0
        }
8901
0
        *inpos = newpos;
8902
0
        Py_DECREF(repunicode);
8903
0
    }
8904
0
    return 0;
8905
0
}
8906
8907
/* Encode unicode to bytes using mapping, handling unencodable characters
   according to errors. A NULL mapping selects Latin-1 encoding.
   Return a new bytes object, or NULL with an exception set. */
PyObject *
_PyUnicode_EncodeCharmap(PyObject *unicode,
                         PyObject *mapping,
                         const char *errors)
{
    /* Default to Latin-1 */
    if (mapping == NULL) {
        return unicode_encode_ucs1(unicode, errors, 256);
    }

    const Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
    if (size == 0) {
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
    }
    const void *data = PyUnicode_DATA(unicode);
    const int kind = PyUnicode_KIND(unicode);

    /* error handling state shared by all calls to charmap_encoding_error() */
    PyObject *error_handler_obj = NULL;
    PyObject *exc = NULL;
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
    /* current input and output positions */
    Py_ssize_t inpos = 0;
    Py_ssize_t respos = 0;

    /* allocate enough for a simple encoding without
       replacements, if we need more, we'll resize */
    PyBytesWriter *writer = PyBytesWriter_Create(size);
    if (writer == NULL) {
        goto onError;
    }

    if (Py_IS_TYPE(mapping, &EncodingMapType)) {
        /* Table-driven path: cache the buffer pointer and size, and refresh
           both whenever the writer may have been resized. */
        char *outstart = _PyBytesWriter_GetData(writer);
        Py_ssize_t outsize = _PyBytesWriter_GetSize(writer);

        while (inpos < size) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
            int res = encoding_map_lookup(ch, mapping);

            if (res == -1) {
                /* unencodable character */
                if (charmap_encoding_error(unicode, &inpos, mapping,
                                           &exc,
                                           &error_handler, &error_handler_obj, errors,
                                           writer, &respos)) {
                    goto onError;
                }
                outstart = _PyBytesWriter_GetData(writer);
                outsize = _PyBytesWriter_GetSize(writer);
                continue;
            }

            Py_ssize_t requiredsize = respos + 1;
            if (outsize < requiredsize) {
                if (charmapencode_resize(writer, &respos, requiredsize)) {
                    goto onError;
                }
                outstart = _PyBytesWriter_GetData(writer);
                outsize = _PyBytesWriter_GetSize(writer);
            }
            outstart[respos++] = (char)res;

            /* done with this character => adjust input position */
            ++inpos;
        }
    }
    else {
        /* Generic mapping object. */
        while (inpos < size) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
            charmapencode_result x = charmapencode_output(ch, mapping, writer, &respos);

            if (x == enc_EXCEPTION) {
                goto onError;
            }
            if (x != enc_FAILED) {
                /* done with this character => adjust input position */
                ++inpos;
                continue;
            }
            /* unencodable character: the handler advances inpos itself */
            if (charmap_encoding_error(unicode, &inpos, mapping,
                                       &exc,
                                       &error_handler, &error_handler_obj, errors,
                                       writer, &respos)) {
                goto onError;
            }
        }
    }

    Py_XDECREF(exc);
    Py_XDECREF(error_handler_obj);

    /* Resize if we allocated too much */
    return PyBytesWriter_FinishWithSize(writer, respos);

  onError:
    PyBytesWriter_Discard(writer);
    Py_XDECREF(exc);
    Py_XDECREF(error_handler_obj);
    return NULL;
}
9015
9016
/* Encode the str object unicode with the given mapping, using the default
   error handling (errors == NULL).
   Raise a bad-argument error and return NULL if unicode is not a str or
   mapping is NULL. */
PyObject *
PyUnicode_AsCharmapString(PyObject *unicode,
                          PyObject *mapping)
{
    if (PyUnicode_Check(unicode) && mapping != NULL) {
        return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
    }
    PyErr_BadArgument();
    return NULL;
}
9026
9027
/* create or adjust a UnicodeTranslateError */
9028
static void
9029
make_translate_exception(PyObject **exceptionObject,
9030
                         PyObject *unicode,
9031
                         Py_ssize_t startpos, Py_ssize_t endpos,
9032
                         const char *reason)
9033
0
{
9034
0
    if (*exceptionObject == NULL) {
9035
0
        *exceptionObject = _PyUnicodeTranslateError_Create(
9036
0
            unicode, startpos, endpos, reason);
9037
0
    }
9038
0
    else {
9039
0
        if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
9040
0
            goto onError;
9041
0
        if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
9042
0
            goto onError;
9043
0
        if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
9044
0
            goto onError;
9045
0
        return;
9046
0
      onError:
9047
0
        Py_CLEAR(*exceptionObject);
9048
0
    }
9049
0
}
9050
9051
/* error handling callback helper:
9052
   build arguments, call the callback and check the arguments,
9053
   put the result into newpos and return the replacement string, which
9054
   has to be freed by the caller */
9055
static PyObject *
9056
unicode_translate_call_errorhandler(const char *errors,
9057
                                    PyObject **errorHandler,
9058
                                    const char *reason,
9059
                                    PyObject *unicode, PyObject **exceptionObject,
9060
                                    Py_ssize_t startpos, Py_ssize_t endpos,
9061
                                    Py_ssize_t *newpos)
9062
0
{
9063
0
    static const char *argparse = "Un;translating error handler must return (str, int) tuple";
9064
9065
0
    Py_ssize_t i_newpos;
9066
0
    PyObject *restuple;
9067
0
    PyObject *resunicode;
9068
9069
0
    if (*errorHandler == NULL) {
9070
0
        *errorHandler = PyCodec_LookupError(errors);
9071
0
        if (*errorHandler == NULL)
9072
0
            return NULL;
9073
0
    }
9074
9075
0
    make_translate_exception(exceptionObject,
9076
0
                             unicode, startpos, endpos, reason);
9077
0
    if (*exceptionObject == NULL)
9078
0
        return NULL;
9079
9080
0
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
9081
0
    if (restuple == NULL)
9082
0
        return NULL;
9083
0
    if (!PyTuple_Check(restuple)) {
9084
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
9085
0
        Py_DECREF(restuple);
9086
0
        return NULL;
9087
0
    }
9088
0
    if (!PyArg_ParseTuple(restuple, argparse,
9089
0
                          &resunicode, &i_newpos)) {
9090
0
        Py_DECREF(restuple);
9091
0
        return NULL;
9092
0
    }
9093
0
    if (i_newpos<0)
9094
0
        *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
9095
0
    else
9096
0
        *newpos = i_newpos;
9097
0
    if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
9098
0
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
9099
0
        Py_DECREF(restuple);
9100
0
        return NULL;
9101
0
    }
9102
0
    Py_INCREF(resunicode);
9103
0
    Py_DECREF(restuple);
9104
0
    return resunicode;
9105
0
}
9106
9107
/* Lookup the character ch in the mapping and put the result in result,
9108
   which must be decrefed by the caller.
9109
   The result can be PyLong, PyUnicode, None or NULL.
9110
   If the result is PyLong, put its value in replace.
9111
   Return 0 on success, -1 on error */
9112
static int
9113
charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result, Py_UCS4 *replace)
9114
18.8k
{
9115
18.8k
    PyObject *w = PyLong_FromLong((long)c);
9116
18.8k
    PyObject *x;
9117
9118
18.8k
    if (w == NULL)
9119
0
        return -1;
9120
18.8k
    int rc = PyMapping_GetOptionalItem(mapping, w, &x);
9121
18.8k
    Py_DECREF(w);
9122
18.8k
    if (rc == 0) {
9123
        /* No mapping found means: use 1:1 mapping. */
9124
6.35k
        *result = NULL;
9125
6.35k
        return 0;
9126
6.35k
    }
9127
12.4k
    if (x == NULL) {
9128
0
        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
9129
            /* No mapping found means: use 1:1 mapping. */
9130
0
            PyErr_Clear();
9131
0
            *result = NULL;
9132
0
            return 0;
9133
0
        } else
9134
0
            return -1;
9135
0
    }
9136
12.4k
    else if (x == Py_None) {
9137
0
        *result = x;
9138
0
        return 0;
9139
0
    }
9140
12.4k
    else if (PyLong_Check(x)) {
9141
0
        long value = PyLong_AsLong(x);
9142
0
        if (value < 0 || value > MAX_UNICODE) {
9143
0
            PyErr_Format(PyExc_ValueError,
9144
0
                         "character mapping must be in range(0x%lx)",
9145
0
                         (unsigned long)MAX_UNICODE + 1);
9146
0
            Py_DECREF(x);
9147
0
            return -1;
9148
0
        }
9149
0
        *result = x;
9150
0
        *replace = (Py_UCS4)value;
9151
0
        return 0;
9152
0
    }
9153
12.4k
    else if (PyUnicode_Check(x)) {
9154
12.4k
        *result = x;
9155
12.4k
        return 0;
9156
12.4k
    }
9157
0
    else {
9158
        /* wrong return value */
9159
0
        PyErr_SetString(PyExc_TypeError,
9160
0
                        "character mapping must return integer, None or str");
9161
0
        Py_DECREF(x);
9162
0
        return -1;
9163
0
    }
9164
12.4k
}
9165
9166
/* lookup the character, write the result into the writer.
9167
   Return 1 if the result was written into the writer, return 0 if the mapping
9168
   was undefined, raise an exception return -1 on error. */
9169
static int
9170
charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
9171
                        _PyUnicodeWriter *writer)
9172
6.40k
{
9173
6.40k
    PyObject *item;
9174
6.40k
    Py_UCS4 replace;
9175
9176
6.40k
    if (charmaptranslate_lookup(ch, mapping, &item, &replace))
9177
0
        return -1;
9178
9179
6.40k
    if (item == NULL) {
9180
        /* not found => default to 1:1 mapping */
9181
115
        if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
9182
0
            return -1;
9183
0
        }
9184
115
        return 1;
9185
115
    }
9186
9187
6.29k
    if (item == Py_None) {
9188
0
        Py_DECREF(item);
9189
0
        return 0;
9190
0
    }
9191
9192
6.29k
    if (PyLong_Check(item)) {
9193
0
        if (_PyUnicodeWriter_WriteCharInline(writer, replace) < 0) {
9194
0
            Py_DECREF(item);
9195
0
            return -1;
9196
0
        }
9197
0
        Py_DECREF(item);
9198
0
        return 1;
9199
0
    }
9200
9201
6.29k
    if (!PyUnicode_Check(item)) {
9202
0
        Py_DECREF(item);
9203
0
        return -1;
9204
0
    }
9205
9206
6.29k
    if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
9207
0
        Py_DECREF(item);
9208
0
        return -1;
9209
0
    }
9210
9211
6.29k
    Py_DECREF(item);
9212
6.29k
    return 1;
9213
6.29k
}
9214
9215
static int
9216
unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
9217
                              Py_UCS1 *translate)
9218
12.4k
{
9219
12.4k
    PyObject *item = NULL;
9220
12.4k
    Py_UCS4 replace;
9221
12.4k
    int ret = 0;
9222
9223
12.4k
    if (charmaptranslate_lookup(ch, mapping, &item, &replace)) {
9224
0
        return -1;
9225
0
    }
9226
9227
12.4k
    if (item == Py_None) {
9228
        /* deletion */
9229
0
        translate[ch] = 0xfe;
9230
0
    }
9231
12.4k
    else if (item == NULL) {
9232
        /* not found => default to 1:1 mapping */
9233
6.24k
        translate[ch] = ch;
9234
6.24k
        return 1;
9235
6.24k
    }
9236
6.18k
    else if (PyLong_Check(item)) {
9237
0
        if (replace > 127) {
9238
            /* invalid character or character outside ASCII:
9239
               skip the fast translate */
9240
0
            goto exit;
9241
0
        }
9242
0
        translate[ch] = (Py_UCS1)replace;
9243
0
    }
9244
6.18k
    else if (PyUnicode_Check(item)) {
9245
6.18k
        if (PyUnicode_GET_LENGTH(item) != 1)
9246
6.18k
            goto exit;
9247
9248
0
        replace = PyUnicode_READ_CHAR(item, 0);
9249
0
        if (replace > 127)
9250
0
            goto exit;
9251
0
        translate[ch] = (Py_UCS1)replace;
9252
0
    }
9253
0
    else {
9254
        /* not None, NULL, long or unicode */
9255
0
        goto exit;
9256
0
    }
9257
0
    ret = 1;
9258
9259
6.18k
  exit:
9260
6.18k
    Py_DECREF(item);
9261
6.18k
    return ret;
9262
0
}
9263
9264
/* Fast path for ascii => ascii translation. Return 1 if the whole string
9265
   was translated into writer, return 0 if the input string was partially
9266
   translated into writer, raise an exception and return -1 on error. */
9267
static int
9268
unicode_fast_translate(PyObject *input, PyObject *mapping,
9269
                       _PyUnicodeWriter *writer, int ignore,
9270
                       Py_ssize_t *input_pos)
9271
12.3k
{
9272
12.3k
    Py_UCS1 ascii_table[128], ch, ch2;
9273
12.3k
    Py_ssize_t len;
9274
12.3k
    const Py_UCS1 *in, *end;
9275
12.3k
    Py_UCS1 *out;
9276
12.3k
    int res = 0;
9277
9278
12.3k
    len = PyUnicode_GET_LENGTH(input);
9279
9280
12.3k
    memset(ascii_table, 0xff, 128);
9281
9282
12.3k
    in = PyUnicode_1BYTE_DATA(input);
9283
12.3k
    end = in + len;
9284
9285
12.3k
    assert(PyUnicode_IS_ASCII(writer->buffer));
9286
12.3k
    assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9287
12.3k
    out = PyUnicode_1BYTE_DATA(writer->buffer);
9288
9289
18.6k
    for (; in < end; in++) {
9290
12.4k
        ch = *in;
9291
12.4k
        ch2 = ascii_table[ch];
9292
12.4k
        if (ch2 == 0xff) {
9293
12.4k
            int translate = unicode_fast_translate_lookup(mapping, ch,
9294
12.4k
                                                          ascii_table);
9295
12.4k
            if (translate < 0)
9296
0
                return -1;
9297
12.4k
            if (translate == 0)
9298
6.18k
                goto exit;
9299
6.24k
            ch2 = ascii_table[ch];
9300
6.24k
        }
9301
6.28k
        if (ch2 == 0xfe) {
9302
0
            if (ignore)
9303
0
                continue;
9304
0
            goto exit;
9305
0
        }
9306
6.28k
        assert(ch2 < 128);
9307
6.28k
        *out = ch2;
9308
6.28k
        out++;
9309
6.28k
    }
9310
6.17k
    res = 1;
9311
9312
12.3k
exit:
9313
12.3k
    writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
9314
12.3k
    *input_pos = in - PyUnicode_1BYTE_DATA(input);
9315
12.3k
    return res;
9316
6.17k
}
9317
9318
/* Translate the str object input through mapping, handling characters
   mapped to None according to errors.
   Return a new str object, or NULL with an exception set. */
static PyObject *
_PyUnicode_TranslateCharmap(PyObject *input,
                            PyObject *mapping,
                            const char *errors)
{
    if (mapping == NULL) {
        PyErr_BadArgument();
        return NULL;
    }

    /* input object */
    const void *data = PyUnicode_DATA(input);
    const int kind = PyUnicode_KIND(input);
    const Py_ssize_t size = PyUnicode_GET_LENGTH(input);
    if (size == 0) {
        return PyUnicode_FromObject(input);
    }

    /* error handler state */
    const char *reason = "character maps to <undefined>";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    int ignore;
    Py_ssize_t i = 0;

    /* allocate enough for a simple 1:1 translation without
       replacements, if we need more, we'll resize */
    _PyUnicodeWriter writer;
    _PyUnicodeWriter_Init(&writer);
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1) {
        goto onError;
    }

    ignore = (errors != NULL && strcmp(errors, "ignore") == 0);

    if (PyUnicode_IS_ASCII(input)) {
        /* Try the ascii => ascii fast path first; it reports in i how far
           it got so the generic loop can take over from there. */
        int res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
        if (res < 0) {
            _PyUnicodeWriter_Dealloc(&writer);
            return NULL;
        }
        if (res == 1) {
            return _PyUnicodeWriter_Finish(&writer);
        }
    }

    while (i < size) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
        int translate = charmaptranslate_output(ch, mapping, &writer);
        if (translate < 0) {
            goto onError;
        }
        if (translate != 0) {
            /* it worked => adjust input pointer */
            ++i;
            continue;
        }

        /* untranslatable character: collect the whole run of them */
        const Py_ssize_t collstart = i;
        Py_ssize_t collend = i + 1;
        for (; collend < size; ++collend) {
            PyObject *x;
            Py_UCS4 replace;
            ch = PyUnicode_READ(kind, data, collend);
            if (charmaptranslate_lookup(ch, mapping, &x, &replace)) {
                goto onError;
            }
            int is_none = (x == Py_None);
            Py_XDECREF(x);
            if (!is_none) {
                break;
            }
        }

        if (ignore) {
            i = collend;
            continue;
        }

        Py_ssize_t newpos;
        PyObject *repunicode = unicode_translate_call_errorhandler(
            errors, &errorHandler,
            reason, input, &exc,
            collstart, collend, &newpos);
        if (repunicode == NULL) {
            goto onError;
        }
        int failed = (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0);
        Py_DECREF(repunicode);
        if (failed) {
            goto onError;
        }
        i = newpos;
    }

    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return NULL;
}
9434
9435
/* Translate str through mapping after checking that str is a str object.
   Return a new str object, or NULL with an exception set. */
PyObject *
PyUnicode_Translate(PyObject *str,
                    PyObject *mapping,
                    const char *errors)
{
    if (ensure_unicode(str) >= 0) {
        return _PyUnicode_TranslateCharmap(str, mapping, errors);
    }
    return NULL;
}
9444
9445
/* Return an ASCII version of unicode in which every Unicode space becomes
   ' ' and every Unicode decimal digit becomes the matching ASCII digit.
   An ASCII input is returned unchanged (new reference). At the first
   character that is neither below 127, a space nor a decimal digit, a '?'
   is stored and the result is truncated right after it.
   Return NULL with an exception set on error. */
PyObject *
_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadInternalCall();
        return NULL;
    }
    if (PyUnicode_IS_ASCII(unicode)) {
        /* If the string is already ASCII, just return the same string */
        return Py_NewRef(unicode);
    }

    const Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
    PyObject *result = PyUnicode_New(len, 127);
    if (result == NULL) {
        return NULL;
    }

    Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
    const int kind = PyUnicode_KIND(unicode);
    const void *data = PyUnicode_DATA(unicode);

    for (Py_ssize_t i = 0; i < len; ++i) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);

        if (ch < 127) {
            out[i] = ch;
            continue;
        }
        if (Py_UNICODE_ISSPACE(ch)) {
            out[i] = ' ';
            continue;
        }

        const int decimal = Py_UNICODE_TODECIMAL(ch);
        if (decimal >= 0) {
            out[i] = '0' + decimal;
            continue;
        }

        /* Not convertible: mark it, terminate and shorten the result. */
        out[i] = '?';
        out[i+1] = '\0';
        _PyUnicode_LENGTH(result) = i + 1;
        break;
    }

    assert(_PyUnicode_CheckConsistency(result, 1));
    return result;
}
9490
9491
/* --- Helpers ------------------------------------------------------------ */
9492
9493
/* Helper macro to fix up start/end slice values for a sequence of length
   len: end is clipped to len, negative values are taken relative to len,
   and anything still negative becomes 0.

   start and end must be modifiable lvalues; they are updated in place.
   Every argument is parenthesized so that expression arguments (for
   example a conditional expression passed as len) expand correctly.
   Arguments are evaluated more than once, so they must not have side
   effects. */
#define ADJUST_INDICES(start, end, len) \
    do {                                \
        if ((end) > (len)) {            \
            (end) = (len);              \
        }                               \
        else if ((end) < 0) {           \
            (end) += (len);             \
            if ((end) < 0) {            \
                (end) = 0;              \
            }                           \
        }                               \
        if ((start) < 0) {              \
            (start) += (len);           \
            if ((start) < 0) {          \
                (start) = 0;            \
            }                           \
        }                               \
    } while (0)
9512
9513
static Py_ssize_t
9514
any_find_slice(PyObject* s1, PyObject* s2,
9515
               Py_ssize_t start,
9516
               Py_ssize_t end,
9517
               int direction)
9518
25.6M
{
9519
25.6M
    int kind1, kind2;
9520
25.6M
    const void *buf1, *buf2;
9521
25.6M
    Py_ssize_t len1, len2, result;
9522
9523
25.6M
    kind1 = PyUnicode_KIND(s1);
9524
25.6M
    kind2 = PyUnicode_KIND(s2);
9525
25.6M
    if (kind1 < kind2)
9526
0
        return -1;
9527
9528
25.6M
    len1 = PyUnicode_GET_LENGTH(s1);
9529
25.6M
    len2 = PyUnicode_GET_LENGTH(s2);
9530
25.6M
    ADJUST_INDICES(start, end, len1);
9531
25.6M
    if (end - start < len2)
9532
1.43M
        return -1;
9533
9534
24.1M
    buf1 = PyUnicode_DATA(s1);
9535
24.1M
    buf2 = PyUnicode_DATA(s2);
9536
24.1M
    if (len2 == 1) {
9537
23.1M
        Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9538
23.1M
        result = findchar((const char *)buf1 + kind1*start,
9539
23.1M
                          kind1, end - start, ch, direction);
9540
23.1M
        if (result == -1)
9541
3.94M
            return -1;
9542
19.1M
        else
9543
19.1M
            return start + result;
9544
23.1M
    }
9545
9546
1.03M
    if (kind2 != kind1) {
9547
333k
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
9548
333k
        if (!buf2)
9549
0
            return -2;
9550
333k
    }
9551
9552
1.03M
    if (direction > 0) {
9553
1.03M
        switch (kind1) {
9554
704k
        case PyUnicode_1BYTE_KIND:
9555
704k
            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9556
407k
                result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9557
296k
            else
9558
296k
                result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9559
704k
            break;
9560
257k
        case PyUnicode_2BYTE_KIND:
9561
257k
            result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9562
257k
            break;
9563
75.3k
        case PyUnicode_4BYTE_KIND:
9564
75.3k
            result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9565
75.3k
            break;
9566
0
        default:
9567
0
            Py_UNREACHABLE();
9568
1.03M
        }
9569
1.03M
    }
9570
0
    else {
9571
0
        switch (kind1) {
9572
0
        case PyUnicode_1BYTE_KIND:
9573
0
            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9574
0
                result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9575
0
            else
9576
0
                result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9577
0
            break;
9578
0
        case PyUnicode_2BYTE_KIND:
9579
0
            result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9580
0
            break;
9581
0
        case PyUnicode_4BYTE_KIND:
9582
0
            result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9583
0
            break;
9584
0
        default:
9585
0
            Py_UNREACHABLE();
9586
0
        }
9587
0
    }
9588
9589
1.03M
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
9590
1.03M
    if (kind2 != kind1)
9591
333k
        PyMem_Free((void *)buf2);
9592
9593
1.03M
    return result;
9594
1.03M
}
9595
9596
9597
Py_ssize_t
9598
PyUnicode_Count(PyObject *str,
9599
                PyObject *substr,
9600
                Py_ssize_t start,
9601
                Py_ssize_t end)
9602
0
{
9603
0
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9604
0
        return -1;
9605
9606
0
    return unicode_count_impl(str, substr, start, end);
9607
0
}
9608
9609
Py_ssize_t
9610
PyUnicode_Find(PyObject *str,
9611
               PyObject *substr,
9612
               Py_ssize_t start,
9613
               Py_ssize_t end,
9614
               int direction)
9615
0
{
9616
0
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9617
0
        return -2;
9618
9619
0
    return any_find_slice(str, substr, start, end, direction);
9620
0
}
9621
9622
Py_ssize_t
9623
PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9624
                   Py_ssize_t start, Py_ssize_t end,
9625
                   int direction)
9626
3.91M
{
9627
3.91M
    int kind;
9628
3.91M
    Py_ssize_t len, result;
9629
3.91M
    len = PyUnicode_GET_LENGTH(str);
9630
3.91M
    ADJUST_INDICES(start, end, len);
9631
3.91M
    if (end - start < 1)
9632
0
        return -1;
9633
3.91M
    kind = PyUnicode_KIND(str);
9634
3.91M
    result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9635
3.91M
                      kind, end-start, ch, direction);
9636
3.91M
    if (result == -1)
9637
2.85M
        return -1;
9638
1.06M
    else
9639
1.06M
        return start + result;
9640
3.91M
}
9641
9642
static int
9643
tailmatch(PyObject *self,
9644
          PyObject *substring,
9645
          Py_ssize_t start,
9646
          Py_ssize_t end,
9647
          int direction)
9648
60.5M
{
9649
60.5M
    int kind_self;
9650
60.5M
    int kind_sub;
9651
60.5M
    const void *data_self;
9652
60.5M
    const void *data_sub;
9653
60.5M
    Py_ssize_t offset;
9654
60.5M
    Py_ssize_t i;
9655
60.5M
    Py_ssize_t end_sub;
9656
9657
60.5M
    ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9658
60.5M
    end -= PyUnicode_GET_LENGTH(substring);
9659
60.5M
    if (end < start)
9660
7.97M
        return 0;
9661
9662
52.5M
    if (PyUnicode_GET_LENGTH(substring) == 0)
9663
0
        return 1;
9664
9665
52.5M
    kind_self = PyUnicode_KIND(self);
9666
52.5M
    data_self = PyUnicode_DATA(self);
9667
52.5M
    kind_sub = PyUnicode_KIND(substring);
9668
52.5M
    data_sub = PyUnicode_DATA(substring);
9669
52.5M
    end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9670
9671
52.5M
    if (direction > 0)
9672
7.31M
        offset = end;
9673
45.2M
    else
9674
45.2M
        offset = start;
9675
9676
52.5M
    if (PyUnicode_READ(kind_self, data_self, offset) ==
9677
52.5M
        PyUnicode_READ(kind_sub, data_sub, 0) &&
9678
36.3M
        PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9679
36.3M
        PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9680
        /* If both are of the same kind, memcmp is sufficient */
9681
14.9M
        if (kind_self == kind_sub) {
9682
7.27M
            return ! memcmp((char *)data_self +
9683
7.27M
                                (offset * PyUnicode_KIND(substring)),
9684
7.27M
                            data_sub,
9685
7.27M
                            PyUnicode_GET_LENGTH(substring) *
9686
7.27M
                                PyUnicode_KIND(substring));
9687
7.27M
        }
9688
        /* otherwise we have to compare each character by first accessing it */
9689
7.67M
        else {
9690
            /* We do not need to compare 0 and len(substring)-1 because
9691
               the if statement above ensured already that they are equal
9692
               when we end up here. */
9693
7.78M
            for (i = 1; i < end_sub; ++i) {
9694
114k
                if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9695
114k
                    PyUnicode_READ(kind_sub, data_sub, i))
9696
6.83k
                    return 0;
9697
114k
            }
9698
7.66M
            return 1;
9699
7.67M
        }
9700
14.9M
    }
9701
9702
37.5M
    return 0;
9703
52.5M
}
9704
9705
Py_ssize_t
9706
PyUnicode_Tailmatch(PyObject *str,
9707
                    PyObject *substr,
9708
                    Py_ssize_t start,
9709
                    Py_ssize_t end,
9710
                    int direction)
9711
246
{
9712
246
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9713
0
        return -1;
9714
9715
246
    return tailmatch(str, substr, start, end, direction);
9716
246
}
9717
9718
/* Create a new string of the same length as `self` holding its lower-cased
   (lower != 0) or upper-cased (lower == 0) bytes, produced by
   _Py_bytes_lower() / _Py_bytes_upper().

   The result is allocated with maxchar 127 and the source data is treated
   as plain chars, so `self` is presumably ASCII-only -- NOTE(review):
   confirm against the callers, which are outside this block.

   Returns a new reference, or NULL if PyUnicode_New() fails. */
static PyObject *
ascii_upper_or_lower(PyObject *self, int lower)
{
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const char *src = PyUnicode_DATA(self);

    PyObject *converted = PyUnicode_New(nchars, 127);
    if (converted == NULL) {
        return NULL;
    }

    char *dst = PyUnicode_DATA(converted);
    if (!lower) {
        _Py_bytes_upper(dst, src, nchars);
    }
    else {
        _Py_bytes_lower(dst, src, nchars);
    }
    return converted;
}
9736
9737
static Py_UCS4
9738
handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
9739
541k
{
9740
541k
    Py_ssize_t j;
9741
541k
    int final_sigma;
9742
541k
    Py_UCS4 c = 0;   /* initialize to prevent gcc warning */
9743
    /* U+03A3 is in the Final_Sigma context when, it is found like this:
9744
9745
     \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9746
9747
    where ! is a negation and \p{xxx} is a character with property xxx.
9748
    */
9749
1.00M
    for (j = i - 1; j >= 0; j--) {
9750
998k
        c = PyUnicode_READ(kind, data, j);
9751
998k
        if (!_PyUnicode_IsCaseIgnorable(c))
9752
536k
            break;
9753
998k
    }
9754
541k
    final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9755
541k
    if (final_sigma) {
9756
841k
        for (j = i + 1; j < length; j++) {
9757
839k
            c = PyUnicode_READ(kind, data, j);
9758
839k
            if (!_PyUnicode_IsCaseIgnorable(c))
9759
420k
                break;
9760
839k
        }
9761
422k
        final_sigma = j == length || !_PyUnicode_IsCased(c);
9762
422k
    }
9763
541k
    return (final_sigma) ? 0x3C2 : 0x3C3;
9764
541k
}
9765
9766
static int
9767
lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
9768
           Py_UCS4 c, Py_UCS4 *mapped)
9769
120M
{
9770
    /* Obscure special case. */
9771
120M
    if (c == 0x3A3) {
9772
541k
        mapped[0] = handle_capital_sigma(kind, data, length, i);
9773
541k
        return 1;
9774
541k
    }
9775
120M
    return _PyUnicode_ToLowerFull(c, mapped);
9776
120M
}
9777
9778
static Py_ssize_t
9779
do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9780
0
{
9781
0
    Py_ssize_t i, k = 0;
9782
0
    int n_res, j;
9783
0
    Py_UCS4 c, mapped[3];
9784
9785
0
    c = PyUnicode_READ(kind, data, 0);
9786
0
    n_res = _PyUnicode_ToTitleFull(c, mapped);
9787
0
    for (j = 0; j < n_res; j++) {
9788
0
        *maxchar = Py_MAX(*maxchar, mapped[j]);
9789
0
        res[k++] = mapped[j];
9790
0
    }
9791
0
    for (i = 1; i < length; i++) {
9792
0
        c = PyUnicode_READ(kind, data, i);
9793
0
        n_res = lower_ucs4(kind, data, length, i, c, mapped);
9794
0
        for (j = 0; j < n_res; j++) {
9795
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9796
0
            res[k++] = mapped[j];
9797
0
        }
9798
0
    }
9799
0
    return k;
9800
0
}
9801
9802
static Py_ssize_t
9803
0
do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9804
0
    Py_ssize_t i, k = 0;
9805
9806
0
    for (i = 0; i < length; i++) {
9807
0
        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9808
0
        int n_res, j;
9809
0
        if (Py_UNICODE_ISUPPER(c)) {
9810
0
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9811
0
        }
9812
0
        else if (Py_UNICODE_ISLOWER(c)) {
9813
0
            n_res = _PyUnicode_ToUpperFull(c, mapped);
9814
0
        }
9815
0
        else {
9816
0
            n_res = 1;
9817
0
            mapped[0] = c;
9818
0
        }
9819
0
        for (j = 0; j < n_res; j++) {
9820
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9821
0
            res[k++] = mapped[j];
9822
0
        }
9823
0
    }
9824
0
    return k;
9825
0
}
9826
9827
static Py_ssize_t
9828
do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
9829
                  Py_UCS4 *maxchar, int lower)
9830
3.08M
{
9831
3.08M
    Py_ssize_t i, k = 0;
9832
9833
123M
    for (i = 0; i < length; i++) {
9834
120M
        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9835
120M
        int n_res, j;
9836
120M
        if (lower)
9837
120M
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9838
0
        else
9839
0
            n_res = _PyUnicode_ToUpperFull(c, mapped);
9840
241M
        for (j = 0; j < n_res; j++) {
9841
120M
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9842
120M
            res[k++] = mapped[j];
9843
120M
        }
9844
120M
    }
9845
3.08M
    return k;
9846
3.08M
}
9847
9848
static Py_ssize_t
9849
do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9850
0
{
9851
0
    return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9852
0
}
9853
9854
static Py_ssize_t
9855
do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9856
3.08M
{
9857
3.08M
    return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9858
3.08M
}
9859
9860
static Py_ssize_t
9861
do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9862
0
{
9863
0
    Py_ssize_t i, k = 0;
9864
9865
0
    for (i = 0; i < length; i++) {
9866
0
        Py_UCS4 c = PyUnicode_READ(kind, data, i);
9867
0
        Py_UCS4 mapped[3];
9868
0
        int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9869
0
        for (j = 0; j < n_res; j++) {
9870
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9871
0
            res[k++] = mapped[j];
9872
0
        }
9873
0
    }
9874
0
    return k;
9875
0
}
9876
9877
static Py_ssize_t
9878
do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9879
0
{
9880
0
    Py_ssize_t i, k = 0;
9881
0
    int previous_is_cased;
9882
9883
0
    previous_is_cased = 0;
9884
0
    for (i = 0; i < length; i++) {
9885
0
        const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9886
0
        Py_UCS4 mapped[3];
9887
0
        int n_res, j;
9888
9889
0
        if (previous_is_cased)
9890
0
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9891
0
        else
9892
0
            n_res = _PyUnicode_ToTitleFull(c, mapped);
9893
9894
0
        for (j = 0; j < n_res; j++) {
9895
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9896
0
            res[k++] = mapped[j];
9897
0
        }
9898
9899
0
        previous_is_cased = _PyUnicode_IsCased(c);
9900
0
    }
9901
0
    return k;
9902
0
}
9903
9904
static PyObject *
9905
case_operation(PyObject *self,
9906
               Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9907
3.08M
{
9908
3.08M
    PyObject *res = NULL;
9909
3.08M
    Py_ssize_t length, newlength = 0;
9910
3.08M
    int kind, outkind;
9911
3.08M
    const void *data;
9912
3.08M
    void *outdata;
9913
3.08M
    Py_UCS4 maxchar = 0, *tmp, *tmpend;
9914
9915
3.08M
    kind = PyUnicode_KIND(self);
9916
3.08M
    data = PyUnicode_DATA(self);
9917
3.08M
    length = PyUnicode_GET_LENGTH(self);
9918
3.08M
    if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
9919
0
        PyErr_SetString(PyExc_OverflowError, "string is too long");
9920
0
        return NULL;
9921
0
    }
9922
3.08M
    tmp = PyMem_Malloc(sizeof(Py_UCS4) * 3 * length);
9923
3.08M
    if (tmp == NULL)
9924
0
        return PyErr_NoMemory();
9925
3.08M
    newlength = perform(kind, data, length, tmp, &maxchar);
9926
3.08M
    res = PyUnicode_New(newlength, maxchar);
9927
3.08M
    if (res == NULL)
9928
0
        goto leave;
9929
3.08M
    tmpend = tmp + newlength;
9930
3.08M
    outdata = PyUnicode_DATA(res);
9931
3.08M
    outkind = PyUnicode_KIND(res);
9932
3.08M
    switch (outkind) {
9933
176k
    case PyUnicode_1BYTE_KIND:
9934
176k
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9935
176k
        break;
9936
2.87M
    case PyUnicode_2BYTE_KIND:
9937
2.87M
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9938
2.87M
        break;
9939
39.9k
    case PyUnicode_4BYTE_KIND:
9940
39.9k
        memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9941
39.9k
        break;
9942
0
    default:
9943
0
        Py_UNREACHABLE();
9944
3.08M
    }
9945
3.08M
  leave:
9946
3.08M
    PyMem_Free(tmp);
9947
3.08M
    return res;
9948
3.08M
}
9949
9950
/* Join the items of the iterable `seq` with `separator`.

   `seq` is first converted with PySequence_Fast(); if that fails NULL is
   returned.  The item array is then handed to _PyUnicode_JoinArray() inside
   a critical section taken on `seq`, and the temporary fast sequence is
   released before returning.

   Returns whatever _PyUnicode_JoinArray() returned. */
PyObject *
PyUnicode_Join(PyObject *separator, PyObject *seq)
{
    PyObject *res;
    PyObject *fseq = PySequence_Fast(seq, "can only join an iterable");

    if (fseq == NULL) {
        return NULL;
    }

    Py_BEGIN_CRITICAL_SECTION_SEQUENCE_FAST(seq);

    res = _PyUnicode_JoinArray(separator,
                               PySequence_Fast_ITEMS(fseq),
                               PySequence_Fast_GET_SIZE(fseq));

    Py_END_CRITICAL_SECTION_SEQUENCE_FAST();

    Py_DECREF(fseq);
    return res;
}
9974
9975
PyObject *
9976
_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
9977
42.7M
{
9978
42.7M
    PyObject *res = NULL; /* the result */
9979
42.7M
    PyObject *sep = NULL;
9980
42.7M
    Py_ssize_t seplen;
9981
42.7M
    PyObject *item;
9982
42.7M
    Py_ssize_t sz, i, res_offset;
9983
42.7M
    Py_UCS4 maxchar;
9984
42.7M
    Py_UCS4 item_maxchar;
9985
42.7M
    int use_memcpy;
9986
42.7M
    unsigned char *res_data = NULL, *sep_data = NULL;
9987
42.7M
    PyObject *last_obj;
9988
42.7M
    int kind = 0;
9989
9990
    /* If empty sequence, return u"". */
9991
42.7M
    if (seqlen == 0) {
9992
7.62M
        _Py_RETURN_UNICODE_EMPTY();
9993
7.62M
    }
9994
9995
    /* If singleton sequence with an exact Unicode, return that. */
9996
35.1M
    last_obj = NULL;
9997
35.1M
    if (seqlen == 1) {
9998
12.6M
        if (PyUnicode_CheckExact(items[0])) {
9999
11.2M
            res = items[0];
10000
11.2M
            return Py_NewRef(res);
10001
11.2M
        }
10002
1.34M
        seplen = 0;
10003
1.34M
        maxchar = 0;
10004
1.34M
    }
10005
22.4M
    else {
10006
        /* Set up sep and seplen */
10007
22.4M
        if (separator == NULL) {
10008
            /* fall back to a blank space separator */
10009
0
            sep = PyUnicode_FromOrdinal(' ');
10010
0
            if (!sep)
10011
0
                goto onError;
10012
0
            seplen = 1;
10013
0
            maxchar = 32;
10014
0
        }
10015
22.4M
        else {
10016
22.4M
            if (!PyUnicode_Check(separator)) {
10017
0
                PyErr_Format(PyExc_TypeError,
10018
0
                             "separator: expected str instance,"
10019
0
                             " %.80s found",
10020
0
                             Py_TYPE(separator)->tp_name);
10021
0
                goto onError;
10022
0
            }
10023
22.4M
            sep = separator;
10024
22.4M
            seplen = PyUnicode_GET_LENGTH(separator);
10025
22.4M
            maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10026
            /* inc refcount to keep this code path symmetric with the
10027
               above case of a blank separator */
10028
22.4M
            Py_INCREF(sep);
10029
22.4M
        }
10030
22.4M
        last_obj = sep;
10031
22.4M
    }
10032
10033
    /* There are at least two things to join, or else we have a subclass
10034
     * of str in the sequence.
10035
     * Do a pre-pass to figure out the total amount of space we'll
10036
     * need (sz), and see whether all argument are strings.
10037
     */
10038
23.8M
    sz = 0;
10039
#ifdef Py_DEBUG
10040
    use_memcpy = 0;
10041
#else
10042
23.8M
    use_memcpy = 1;
10043
23.8M
#endif
10044
211M
    for (i = 0; i < seqlen; i++) {
10045
187M
        size_t add_sz;
10046
187M
        item = items[i];
10047
187M
        if (!PyUnicode_Check(item)) {
10048
0
            PyErr_Format(PyExc_TypeError,
10049
0
                         "sequence item %zd: expected str instance,"
10050
0
                         " %.80s found",
10051
0
                         i, Py_TYPE(item)->tp_name);
10052
0
            goto onError;
10053
0
        }
10054
187M
        add_sz = PyUnicode_GET_LENGTH(item);
10055
187M
        item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
10056
187M
        maxchar = Py_MAX(maxchar, item_maxchar);
10057
187M
        if (i != 0) {
10058
163M
            add_sz += seplen;
10059
163M
        }
10060
187M
        if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
10061
0
            PyErr_SetString(PyExc_OverflowError,
10062
0
                            "join() result is too long for a Python string");
10063
0
            goto onError;
10064
0
        }
10065
187M
        sz += add_sz;
10066
187M
        if (use_memcpy && last_obj != NULL) {
10067
120M
            if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10068
3.56M
                use_memcpy = 0;
10069
120M
        }
10070
187M
        last_obj = item;
10071
187M
    }
10072
10073
23.8M
    res = PyUnicode_New(sz, maxchar);
10074
23.8M
    if (res == NULL)
10075
0
        goto onError;
10076
10077
    /* Catenate everything. */
10078
#ifdef Py_DEBUG
10079
    use_memcpy = 0;
10080
#else
10081
23.8M
    if (use_memcpy) {
10082
20.2M
        res_data = PyUnicode_1BYTE_DATA(res);
10083
20.2M
        kind = PyUnicode_KIND(res);
10084
20.2M
        if (seplen != 0)
10085
196k
            sep_data = PyUnicode_1BYTE_DATA(sep);
10086
20.2M
    }
10087
23.8M
#endif
10088
23.8M
    if (use_memcpy) {
10089
123M
        for (i = 0; i < seqlen; ++i) {
10090
103M
            Py_ssize_t itemlen;
10091
103M
            item = items[i];
10092
10093
            /* Copy item, and maybe the separator. */
10094
103M
            if (i && seplen != 0) {
10095
744k
                memcpy(res_data,
10096
744k
                          sep_data,
10097
744k
                          kind * seplen);
10098
744k
                res_data += kind * seplen;
10099
744k
            }
10100
10101
103M
            itemlen = PyUnicode_GET_LENGTH(item);
10102
103M
            if (itemlen != 0) {
10103
90.2M
                memcpy(res_data,
10104
90.2M
                          PyUnicode_DATA(item),
10105
90.2M
                          kind * itemlen);
10106
90.2M
                res_data += kind * itemlen;
10107
90.2M
            }
10108
103M
        }
10109
20.2M
        assert(res_data == PyUnicode_1BYTE_DATA(res)
10110
20.2M
                           + kind * PyUnicode_GET_LENGTH(res));
10111
20.2M
    }
10112
3.56M
    else {
10113
87.9M
        for (i = 0, res_offset = 0; i < seqlen; ++i) {
10114
84.4M
            Py_ssize_t itemlen;
10115
84.4M
            item = items[i];
10116
10117
            /* Copy item, and maybe the separator. */
10118
84.4M
            if (i && seplen != 0) {
10119
1.08M
                _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10120
1.08M
                res_offset += seplen;
10121
1.08M
            }
10122
10123
84.4M
            itemlen = PyUnicode_GET_LENGTH(item);
10124
84.4M
            if (itemlen != 0) {
10125
82.9M
                _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
10126
82.9M
                res_offset += itemlen;
10127
82.9M
            }
10128
84.4M
        }
10129
3.56M
        assert(res_offset == PyUnicode_GET_LENGTH(res));
10130
3.56M
    }
10131
10132
23.8M
    Py_XDECREF(sep);
10133
23.8M
    assert(_PyUnicode_CheckConsistency(res, 1));
10134
23.8M
    return res;
10135
10136
0
  onError:
10137
0
    Py_XDECREF(sep);
10138
0
    Py_XDECREF(res);
10139
0
    return NULL;
10140
23.8M
}
10141
10142
void
10143
_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10144
                    Py_UCS4 fill_char)
10145
641
{
10146
641
    const int kind = PyUnicode_KIND(unicode);
10147
641
    void *data = PyUnicode_DATA(unicode);
10148
641
    assert(_PyUnicode_IsModifiable(unicode));
10149
641
    assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10150
641
    assert(start >= 0);
10151
641
    assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10152
641
    _PyUnicode_Fill(kind, data, fill_char, start, length);
10153
641
}
10154
10155
Py_ssize_t
10156
PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10157
               Py_UCS4 fill_char)
10158
641
{
10159
641
    Py_ssize_t maxlen;
10160
10161
641
    if (!PyUnicode_Check(unicode)) {
10162
0
        PyErr_BadInternalCall();
10163
0
        return -1;
10164
0
    }
10165
641
    if (unicode_check_modifiable(unicode))
10166
0
        return -1;
10167
10168
641
    if (start < 0) {
10169
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
10170
0
        return -1;
10171
0
    }
10172
641
    if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10173
0
        PyErr_SetString(PyExc_ValueError,
10174
0
                         "fill character is bigger than "
10175
0
                         "the string maximum character");
10176
0
        return -1;
10177
0
    }
10178
10179
641
    maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10180
641
    length = Py_MIN(maxlen, length);
10181
641
    if (length <= 0)
10182
0
        return 0;
10183
10184
641
    _PyUnicode_FastFill(unicode, start, length, fill_char);
10185
641
    return length;
10186
641
}
10187
10188
static PyObject *
10189
pad(PyObject *self,
10190
    Py_ssize_t left,
10191
    Py_ssize_t right,
10192
    Py_UCS4 fill)
10193
68
{
10194
68
    PyObject *u;
10195
68
    Py_UCS4 maxchar;
10196
68
    int kind;
10197
68
    void *data;
10198
10199
68
    if (left < 0)
10200
0
        left = 0;
10201
68
    if (right < 0)
10202
0
        right = 0;
10203
10204
68
    if (left == 0 && right == 0)
10205
0
        return unicode_result_unchanged(self);
10206
10207
68
    if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10208
68
        right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
10209
0
        PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10210
0
        return NULL;
10211
0
    }
10212
68
    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10213
68
    maxchar = Py_MAX(maxchar, fill);
10214
68
    u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
10215
68
    if (!u)
10216
0
        return NULL;
10217
10218
68
    kind = PyUnicode_KIND(u);
10219
68
    data = PyUnicode_DATA(u);
10220
68
    if (left)
10221
0
        _PyUnicode_Fill(kind, data, fill, 0, left);
10222
68
    if (right)
10223
68
        _PyUnicode_Fill(kind, data, fill,
10224
68
                        left + _PyUnicode_LENGTH(self), right);
10225
68
    _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
10226
68
    assert(_PyUnicode_CheckConsistency(u, 1));
10227
68
    return u;
10228
68
}
10229
10230
/* Split `string` into lines by dispatching to the stringlib *_splitlines
   routine that matches its storage kind (with a dedicated ASCII variant for
   1-byte strings), passing `keepends` through.

   Returns the routine's result, or NULL if ensure_unicode() rejects
   `string`. */
PyObject *
PyUnicode_Splitlines(PyObject *string, int keepends)
{
    if (ensure_unicode(string) < 0) {
        return NULL;
    }

    PyObject *lines;
    const Py_ssize_t length = PyUnicode_GET_LENGTH(string);

    switch (PyUnicode_KIND(string)) {
    case PyUnicode_1BYTE_KIND:
        if (PyUnicode_IS_ASCII(string)) {
            lines = asciilib_splitlines(
                string, PyUnicode_1BYTE_DATA(string), length, keepends);
        }
        else {
            lines = ucs1lib_splitlines(
                string, PyUnicode_1BYTE_DATA(string), length, keepends);
        }
        break;
    case PyUnicode_2BYTE_KIND:
        lines = ucs2lib_splitlines(
            string, PyUnicode_2BYTE_DATA(string), length, keepends);
        break;
    case PyUnicode_4BYTE_KIND:
        lines = ucs4lib_splitlines(
            string, PyUnicode_4BYTE_DATA(string), length, keepends);
        break;
    default:
        Py_UNREACHABLE();
    }
    return lines;
}
10264
10265
static PyObject *
10266
split(PyObject *self,
10267
      PyObject *substring,
10268
      Py_ssize_t maxcount)
10269
22.0M
{
10270
22.0M
    int kind1, kind2;
10271
22.0M
    const void *buf1, *buf2;
10272
22.0M
    Py_ssize_t len1, len2;
10273
22.0M
    PyObject* out;
10274
22.0M
    len1 = PyUnicode_GET_LENGTH(self);
10275
22.0M
    kind1 = PyUnicode_KIND(self);
10276
10277
22.0M
    if (substring == NULL) {
10278
162k
        if (maxcount < 0) {
10279
136k
            maxcount = (len1 - 1) / 2 + 1;
10280
136k
        }
10281
162k
        switch (kind1) {
10282
105k
        case PyUnicode_1BYTE_KIND:
10283
105k
            if (PyUnicode_IS_ASCII(self))
10284
78.3k
                return asciilib_split_whitespace(
10285
78.3k
                    self,  PyUnicode_1BYTE_DATA(self),
10286
78.3k
                    len1, maxcount
10287
78.3k
                    );
10288
26.9k
            else
10289
26.9k
                return ucs1lib_split_whitespace(
10290
26.9k
                    self,  PyUnicode_1BYTE_DATA(self),
10291
26.9k
                    len1, maxcount
10292
26.9k
                    );
10293
45.6k
        case PyUnicode_2BYTE_KIND:
10294
45.6k
            return ucs2lib_split_whitespace(
10295
45.6k
                self,  PyUnicode_2BYTE_DATA(self),
10296
45.6k
                len1, maxcount
10297
45.6k
                );
10298
11.0k
        case PyUnicode_4BYTE_KIND:
10299
11.0k
            return ucs4lib_split_whitespace(
10300
11.0k
                self,  PyUnicode_4BYTE_DATA(self),
10301
11.0k
                len1, maxcount
10302
11.0k
                );
10303
0
        default:
10304
0
            Py_UNREACHABLE();
10305
162k
        }
10306
162k
    }
10307
10308
21.9M
    kind2 = PyUnicode_KIND(substring);
10309
21.9M
    len2 = PyUnicode_GET_LENGTH(substring);
10310
21.9M
    if (maxcount < 0) {
10311
        // if len2 == 0, it will raise ValueError.
10312
16.2M
        maxcount = len2 == 0 ? 0 : (len1 / len2) + 1;
10313
        // handle expected overflow case: (Py_SSIZE_T_MAX / 1) + 1
10314
16.2M
        maxcount = maxcount < 0 ? len1 : maxcount;
10315
16.2M
    }
10316
21.9M
    if (kind1 < kind2 || len1 < len2) {
10317
1.11M
        out = PyList_New(1);
10318
1.11M
        if (out == NULL)
10319
0
            return NULL;
10320
1.11M
        PyList_SET_ITEM(out, 0, Py_NewRef(self));
10321
1.11M
        return out;
10322
1.11M
    }
10323
20.8M
    buf1 = PyUnicode_DATA(self);
10324
20.8M
    buf2 = PyUnicode_DATA(substring);
10325
20.8M
    if (kind2 != kind1) {
10326
269k
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
10327
269k
        if (!buf2)
10328
0
            return NULL;
10329
269k
    }
10330
10331
20.8M
    switch (kind1) {
10332
20.5M
    case PyUnicode_1BYTE_KIND:
10333
20.5M
        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10334
19.2M
            out = asciilib_split(
10335
19.2M
                self,  buf1, len1, buf2, len2, maxcount);
10336
1.28M
        else
10337
1.28M
            out = ucs1lib_split(
10338
1.28M
                self,  buf1, len1, buf2, len2, maxcount);
10339
20.5M
        break;
10340
235k
    case PyUnicode_2BYTE_KIND:
10341
235k
        out = ucs2lib_split(
10342
235k
            self,  buf1, len1, buf2, len2, maxcount);
10343
235k
        break;
10344
34.1k
    case PyUnicode_4BYTE_KIND:
10345
34.1k
        out = ucs4lib_split(
10346
34.1k
            self,  buf1, len1, buf2, len2, maxcount);
10347
34.1k
        break;
10348
0
    default:
10349
0
        out = NULL;
10350
20.8M
    }
10351
20.8M
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10352
20.8M
    if (kind2 != kind1)
10353
269k
        PyMem_Free((void *)buf2);
10354
20.8M
    return out;
10355
20.8M
}
10356
10357
static PyObject *
10358
rsplit(PyObject *self,
10359
       PyObject *substring,
10360
       Py_ssize_t maxcount)
10361
66
{
10362
66
    int kind1, kind2;
10363
66
    const void *buf1, *buf2;
10364
66
    Py_ssize_t len1, len2;
10365
66
    PyObject* out;
10366
10367
66
    len1 = PyUnicode_GET_LENGTH(self);
10368
66
    kind1 = PyUnicode_KIND(self);
10369
10370
66
    if (substring == NULL) {
10371
0
        if (maxcount < 0) {
10372
0
            maxcount = (len1 - 1) / 2 + 1;
10373
0
        }
10374
0
        switch (kind1) {
10375
0
        case PyUnicode_1BYTE_KIND:
10376
0
            if (PyUnicode_IS_ASCII(self))
10377
0
                return asciilib_rsplit_whitespace(
10378
0
                    self,  PyUnicode_1BYTE_DATA(self),
10379
0
                    len1, maxcount
10380
0
                    );
10381
0
            else
10382
0
                return ucs1lib_rsplit_whitespace(
10383
0
                    self,  PyUnicode_1BYTE_DATA(self),
10384
0
                    len1, maxcount
10385
0
                    );
10386
0
        case PyUnicode_2BYTE_KIND:
10387
0
            return ucs2lib_rsplit_whitespace(
10388
0
                self,  PyUnicode_2BYTE_DATA(self),
10389
0
                len1, maxcount
10390
0
                );
10391
0
        case PyUnicode_4BYTE_KIND:
10392
0
            return ucs4lib_rsplit_whitespace(
10393
0
                self,  PyUnicode_4BYTE_DATA(self),
10394
0
                len1, maxcount
10395
0
                );
10396
0
        default:
10397
0
            Py_UNREACHABLE();
10398
0
        }
10399
0
    }
10400
66
    kind2 = PyUnicode_KIND(substring);
10401
66
    len2 = PyUnicode_GET_LENGTH(substring);
10402
66
    if (maxcount < 0) {
10403
        // if len2 == 0, it will raise ValueError.
10404
0
        maxcount = len2 == 0 ? 0 : (len1 / len2) + 1;
10405
        // handle expected overflow case: (Py_SSIZE_T_MAX / 1) + 1
10406
0
        maxcount = maxcount < 0 ? len1 : maxcount;
10407
0
    }
10408
66
    if (kind1 < kind2 || len1 < len2) {
10409
0
        out = PyList_New(1);
10410
0
        if (out == NULL)
10411
0
            return NULL;
10412
0
        PyList_SET_ITEM(out, 0, Py_NewRef(self));
10413
0
        return out;
10414
0
    }
10415
66
    buf1 = PyUnicode_DATA(self);
10416
66
    buf2 = PyUnicode_DATA(substring);
10417
66
    if (kind2 != kind1) {
10418
0
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
10419
0
        if (!buf2)
10420
0
            return NULL;
10421
0
    }
10422
10423
66
    switch (kind1) {
10424
66
    case PyUnicode_1BYTE_KIND:
10425
66
        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10426
66
            out = asciilib_rsplit(
10427
66
                self,  buf1, len1, buf2, len2, maxcount);
10428
0
        else
10429
0
            out = ucs1lib_rsplit(
10430
0
                self,  buf1, len1, buf2, len2, maxcount);
10431
66
        break;
10432
0
    case PyUnicode_2BYTE_KIND:
10433
0
        out = ucs2lib_rsplit(
10434
0
            self,  buf1, len1, buf2, len2, maxcount);
10435
0
        break;
10436
0
    case PyUnicode_4BYTE_KIND:
10437
0
        out = ucs4lib_rsplit(
10438
0
            self,  buf1, len1, buf2, len2, maxcount);
10439
0
        break;
10440
0
    default:
10441
0
        out = NULL;
10442
66
    }
10443
66
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10444
66
    if (kind2 != kind1)
10445
0
        PyMem_Free((void *)buf2);
10446
66
    return out;
10447
66
}
10448
10449
static Py_ssize_t
10450
anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10451
            PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
10452
24.1M
{
10453
24.1M
    switch (kind) {
10454
9.02M
    case PyUnicode_1BYTE_KIND:
10455
9.02M
        if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10456
4.65M
            return asciilib_find(buf1, len1, buf2, len2, offset);
10457
4.36M
        else
10458
4.36M
            return ucs1lib_find(buf1, len1, buf2, len2, offset);
10459
6.95M
    case PyUnicode_2BYTE_KIND:
10460
6.95M
        return ucs2lib_find(buf1, len1, buf2, len2, offset);
10461
8.17M
    case PyUnicode_4BYTE_KIND:
10462
8.17M
        return ucs4lib_find(buf1, len1, buf2, len2, offset);
10463
24.1M
    }
10464
24.1M
    Py_UNREACHABLE();
10465
24.1M
}
10466
10467
static Py_ssize_t
10468
anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10469
             PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
10470
13.5M
{
10471
13.5M
    switch (kind) {
10472
12.6M
    case PyUnicode_1BYTE_KIND:
10473
12.6M
        return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10474
765k
    case PyUnicode_2BYTE_KIND:
10475
765k
        return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10476
90.0k
    case PyUnicode_4BYTE_KIND:
10477
90.0k
        return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10478
13.5M
    }
10479
13.5M
    Py_UNREACHABLE();
10480
13.5M
}
10481
10482
static void
10483
replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10484
                      Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10485
72.2k
{
10486
72.2k
    int kind = PyUnicode_KIND(u);
10487
72.2k
    void *data = PyUnicode_DATA(u);
10488
72.2k
    Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10489
72.2k
    if (kind == PyUnicode_1BYTE_KIND) {
10490
35.2k
        ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10491
35.2k
                                      (Py_UCS1 *)data + len,
10492
35.2k
                                      u1, u2, maxcount);
10493
35.2k
    }
10494
37.0k
    else if (kind == PyUnicode_2BYTE_KIND) {
10495
29.3k
        ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10496
29.3k
                                      (Py_UCS2 *)data + len,
10497
29.3k
                                      u1, u2, maxcount);
10498
29.3k
    }
10499
7.67k
    else {
10500
7.67k
        assert(kind == PyUnicode_4BYTE_KIND);
10501
7.67k
        ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10502
7.67k
                                      (Py_UCS4 *)data + len,
10503
7.67k
                                      u1, u2, maxcount);
10504
7.67k
    }
10505
72.2k
}
10506
10507
static PyObject *
10508
replace(PyObject *self, PyObject *str1,
10509
        PyObject *str2, Py_ssize_t maxcount)
10510
21.9M
{
10511
21.9M
    PyObject *u;
10512
21.9M
    const char *sbuf = PyUnicode_DATA(self);
10513
21.9M
    const void *buf1 = PyUnicode_DATA(str1);
10514
21.9M
    const void *buf2 = PyUnicode_DATA(str2);
10515
21.9M
    int srelease = 0, release1 = 0, release2 = 0;
10516
21.9M
    int skind = PyUnicode_KIND(self);
10517
21.9M
    int kind1 = PyUnicode_KIND(str1);
10518
21.9M
    int kind2 = PyUnicode_KIND(str2);
10519
21.9M
    Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10520
21.9M
    Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10521
21.9M
    Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
10522
21.9M
    int mayshrink;
10523
21.9M
    Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
10524
10525
21.9M
    if (slen < len1)
10526
7.95M
        goto nothing;
10527
10528
13.9M
    if (maxcount < 0)
10529
13.9M
        maxcount = PY_SSIZE_T_MAX;
10530
0
    else if (maxcount == 0)
10531
0
        goto nothing;
10532
10533
13.9M
    if (str1 == str2)
10534
29.1k
        goto nothing;
10535
10536
13.9M
    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10537
13.9M
    maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10538
13.9M
    if (maxchar < maxchar_str1)
10539
        /* substring too wide to be present */
10540
0
        goto nothing;
10541
13.9M
    maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10542
    /* Replacing str1 with str2 may cause a maxchar reduction in the
10543
       result string. */
10544
13.9M
    mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
10545
13.9M
    maxchar = Py_MAX(maxchar, maxchar_str2);
10546
10547
13.9M
    if (len1 == len2) {
10548
        /* same length */
10549
436k
        if (len1 == 0)
10550
0
            goto nothing;
10551
436k
        if (len1 == 1) {
10552
            /* replace characters */
10553
428k
            Py_UCS4 u1, u2;
10554
428k
            Py_ssize_t pos;
10555
10556
428k
            u1 = PyUnicode_READ(kind1, buf1, 0);
10557
428k
            pos = findchar(sbuf, skind, slen, u1, 1);
10558
428k
            if (pos < 0)
10559
356k
                goto nothing;
10560
72.2k
            u2 = PyUnicode_READ(kind2, buf2, 0);
10561
72.2k
            u = PyUnicode_New(slen, maxchar);
10562
72.2k
            if (!u)
10563
0
                goto error;
10564
10565
72.2k
            _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10566
72.2k
            replace_1char_inplace(u, pos, u1, u2, maxcount);
10567
72.2k
        }
10568
7.51k
        else {
10569
7.51k
            int rkind = skind;
10570
7.51k
            char *res;
10571
7.51k
            Py_ssize_t i;
10572
10573
7.51k
            if (kind1 < rkind) {
10574
                /* widen substring */
10575
0
                buf1 = unicode_askind(kind1, buf1, len1, rkind);
10576
0
                if (!buf1) goto error;
10577
0
                release1 = 1;
10578
0
            }
10579
7.51k
            i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
10580
7.51k
            if (i < 0)
10581
7.51k
                goto nothing;
10582
0
            if (rkind > kind2) {
10583
                /* widen replacement */
10584
0
                buf2 = unicode_askind(kind2, buf2, len2, rkind);
10585
0
                if (!buf2) goto error;
10586
0
                release2 = 1;
10587
0
            }
10588
0
            else if (rkind < kind2) {
10589
                /* widen self and buf1 */
10590
0
                rkind = kind2;
10591
0
                if (release1) {
10592
0
                    assert(buf1 != PyUnicode_DATA(str1));
10593
0
                    PyMem_Free((void *)buf1);
10594
0
                    buf1 = PyUnicode_DATA(str1);
10595
0
                    release1 = 0;
10596
0
                }
10597
0
                sbuf = unicode_askind(skind, sbuf, slen, rkind);
10598
0
                if (!sbuf) goto error;
10599
0
                srelease = 1;
10600
0
                buf1 = unicode_askind(kind1, buf1, len1, rkind);
10601
0
                if (!buf1) goto error;
10602
0
                release1 = 1;
10603
0
            }
10604
0
            u = PyUnicode_New(slen, maxchar);
10605
0
            if (!u)
10606
0
                goto error;
10607
0
            assert(PyUnicode_KIND(u) == rkind);
10608
0
            res = PyUnicode_DATA(u);
10609
10610
0
            memcpy(res, sbuf, rkind * slen);
10611
            /* change everything in-place, starting with this one */
10612
0
            memcpy(res + rkind * i,
10613
0
                   buf2,
10614
0
                   rkind * len2);
10615
0
            i += len1;
10616
10617
0
            while ( --maxcount > 0) {
10618
0
                i = anylib_find(rkind, self,
10619
0
                                sbuf+rkind*i, slen-i,
10620
0
                                str1, buf1, len1, i);
10621
0
                if (i == -1)
10622
0
                    break;
10623
0
                memcpy(res + rkind * i,
10624
0
                       buf2,
10625
0
                       rkind * len2);
10626
0
                i += len1;
10627
0
            }
10628
0
        }
10629
436k
    }
10630
13.5M
    else {
10631
13.5M
        Py_ssize_t n, i, j, ires;
10632
13.5M
        Py_ssize_t new_size;
10633
13.5M
        int rkind = skind;
10634
13.5M
        char *res;
10635
10636
13.5M
        if (kind1 < rkind) {
10637
            /* widen substring */
10638
855k
            buf1 = unicode_askind(kind1, buf1, len1, rkind);
10639
855k
            if (!buf1) goto error;
10640
855k
            release1 = 1;
10641
855k
        }
10642
13.5M
        n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
10643
13.5M
        if (n == 0)
10644
11.8M
            goto nothing;
10645
1.63M
        if (kind2 < rkind) {
10646
            /* widen replacement */
10647
54.3k
            buf2 = unicode_askind(kind2, buf2, len2, rkind);
10648
54.3k
            if (!buf2) goto error;
10649
54.3k
            release2 = 1;
10650
54.3k
        }
10651
1.57M
        else if (kind2 > rkind) {
10652
            /* widen self and buf1 */
10653
0
            rkind = kind2;
10654
0
            sbuf = unicode_askind(skind, sbuf, slen, rkind);
10655
0
            if (!sbuf) goto error;
10656
0
            srelease = 1;
10657
0
            if (release1) {
10658
0
                assert(buf1 != PyUnicode_DATA(str1));
10659
0
                PyMem_Free((void *)buf1);
10660
0
                buf1 = PyUnicode_DATA(str1);
10661
0
                release1 = 0;
10662
0
            }
10663
0
            buf1 = unicode_askind(kind1, buf1, len1, rkind);
10664
0
            if (!buf1) goto error;
10665
0
            release1 = 1;
10666
0
        }
10667
        /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10668
           PyUnicode_GET_LENGTH(str1)); */
10669
1.63M
        if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
10670
0
                PyErr_SetString(PyExc_OverflowError,
10671
0
                                "replace string is too long");
10672
0
                goto error;
10673
0
        }
10674
1.63M
        new_size = slen + n * (len2 - len1);
10675
1.63M
        if (new_size == 0) {
10676
0
            u = _PyUnicode_GetEmpty();
10677
0
            goto done;
10678
0
        }
10679
1.63M
        if (new_size > (PY_SSIZE_T_MAX / rkind)) {
10680
0
            PyErr_SetString(PyExc_OverflowError,
10681
0
                            "replace string is too long");
10682
0
            goto error;
10683
0
        }
10684
1.63M
        u = PyUnicode_New(new_size, maxchar);
10685
1.63M
        if (!u)
10686
0
            goto error;
10687
1.63M
        assert(PyUnicode_KIND(u) == rkind);
10688
1.63M
        res = PyUnicode_DATA(u);
10689
1.63M
        ires = i = 0;
10690
1.63M
        if (len1 > 0) {
10691
25.7M
            while (n-- > 0) {
10692
                /* look for next match */
10693
24.1M
                j = anylib_find(rkind, self,
10694
24.1M
                                sbuf + rkind * i, slen-i,
10695
24.1M
                                str1, buf1, len1, i);
10696
24.1M
                if (j == -1)
10697
0
                    break;
10698
24.1M
                else if (j > i) {
10699
                    /* copy unchanged part [i:j] */
10700
5.48M
                    memcpy(res + rkind * ires,
10701
5.48M
                           sbuf + rkind * i,
10702
5.48M
                           rkind * (j-i));
10703
5.48M
                    ires += j - i;
10704
5.48M
                }
10705
                /* copy substitution string */
10706
24.1M
                if (len2 > 0) {
10707
24.1M
                    memcpy(res + rkind * ires,
10708
24.1M
                           buf2,
10709
24.1M
                           rkind * len2);
10710
24.1M
                    ires += len2;
10711
24.1M
                }
10712
24.1M
                i = j + len1;
10713
24.1M
            }
10714
1.63M
            if (i < slen)
10715
                /* copy tail [i:] */
10716
1.62M
                memcpy(res + rkind * ires,
10717
1.62M
                       sbuf + rkind * i,
10718
1.62M
                       rkind * (slen-i));
10719
1.63M
        }
10720
0
        else {
10721
            /* interleave */
10722
0
            while (n > 0) {
10723
0
                memcpy(res + rkind * ires,
10724
0
                       buf2,
10725
0
                       rkind * len2);
10726
0
                ires += len2;
10727
0
                if (--n <= 0)
10728
0
                    break;
10729
0
                memcpy(res + rkind * ires,
10730
0
                       sbuf + rkind * i,
10731
0
                       rkind);
10732
0
                ires++;
10733
0
                i++;
10734
0
            }
10735
0
            memcpy(res + rkind * ires,
10736
0
                   sbuf + rkind * i,
10737
0
                   rkind * (slen-i));
10738
0
        }
10739
1.63M
    }
10740
10741
1.70M
    if (mayshrink) {
10742
0
        unicode_adjust_maxchar(&u);
10743
0
        if (u == NULL)
10744
0
            goto error;
10745
0
    }
10746
10747
1.70M
  done:
10748
1.70M
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
10749
1.70M
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10750
1.70M
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10751
1.70M
    if (srelease)
10752
0
        PyMem_Free((void *)sbuf);
10753
1.70M
    if (release1)
10754
54.3k
        PyMem_Free((void *)buf1);
10755
1.70M
    if (release2)
10756
54.3k
        PyMem_Free((void *)buf2);
10757
1.70M
    assert(_PyUnicode_CheckConsistency(u, 1));
10758
1.70M
    return u;
10759
10760
20.2M
  nothing:
10761
    /* nothing to replace; return original string (when possible) */
10762
20.2M
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
10763
20.2M
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10764
20.2M
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10765
20.2M
    if (srelease)
10766
0
        PyMem_Free((void *)sbuf);
10767
20.2M
    if (release1)
10768
801k
        PyMem_Free((void *)buf1);
10769
20.2M
    if (release2)
10770
0
        PyMem_Free((void *)buf2);
10771
20.2M
    return unicode_result_unchanged(self);
10772
10773
0
  error:
10774
0
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
10775
0
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10776
0
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10777
0
    if (srelease)
10778
0
        PyMem_Free((void *)sbuf);
10779
0
    if (release1)
10780
0
        PyMem_Free((void *)buf1);
10781
0
    if (release2)
10782
0
        PyMem_Free((void *)buf2);
10783
0
    return NULL;
10784
1.70M
}
10785
10786
/* --- Unicode Object Methods --------------------------------------------- */
10787
10788
/*[clinic input]
10789
@permit_long_docstring_body
10790
str.title as unicode_title
10791
10792
Return a version of the string where each word is titlecased.
10793
10794
More specifically, words start with uppercased characters and all remaining
10795
cased characters have lower case.
10796
[clinic start generated code]*/
10797
10798
static PyObject *
unicode_title_impl(PyObject *self)
/*[clinic end generated code: output=c75ae03809574902 input=533ce0eb6a7f5d1b]*/
{
    /* Title-casing is delegated to the shared case_operation() driver with
       the do_title callback. */
    PyObject *titled = case_operation(self, do_title);
    return titled;
}
10804
10805
/*[clinic input]
10806
@permit_long_docstring_body
10807
str.capitalize as unicode_capitalize
10808
10809
Return a capitalized version of the string.
10810
10811
More specifically, make the first character have upper case and the rest lower
10812
case.
10813
[clinic start generated code]*/
10814
10815
static PyObject *
unicode_capitalize_impl(PyObject *self)
/*[clinic end generated code: output=e49a4c333cdb7667 input=a4a15ade41f6f9e9]*/
{
    /* Only a non-empty string goes through case_operation(); an empty one
       is handed to unicode_result_unchanged(). */
    if (PyUnicode_GET_LENGTH(self) != 0) {
        return case_operation(self, do_capitalize);
    }
    return unicode_result_unchanged(self);
}
10823
10824
/*[clinic input]
10825
str.casefold as unicode_casefold
10826
10827
Return a version of the string suitable for caseless comparisons.
10828
[clinic start generated code]*/
10829
10830
static PyObject *
unicode_casefold_impl(PyObject *self)
/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
{
    /* Non-ASCII strings use the generic case_operation() driver; ASCII
       strings take the ascii_upper_or_lower() path (second argument 1). */
    if (!PyUnicode_IS_ASCII(self)) {
        return case_operation(self, do_casefold);
    }
    return ascii_upper_or_lower(self, 1);
}
10838
10839
10840
/* Argument converter. Accepts a single Unicode character. */
10841
10842
static int
10843
convert_uc(PyObject *obj, void *addr)
10844
130
{
10845
130
    Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
10846
10847
130
    if (!PyUnicode_Check(obj)) {
10848
0
        PyErr_Format(PyExc_TypeError,
10849
0
                     "The fill character must be a unicode character, "
10850
0
                     "not %.100s", Py_TYPE(obj)->tp_name);
10851
0
        return 0;
10852
0
    }
10853
130
    if (PyUnicode_GET_LENGTH(obj) != 1) {
10854
0
        PyErr_SetString(PyExc_TypeError,
10855
0
                        "The fill character must be exactly one character long");
10856
0
        return 0;
10857
0
    }
10858
130
    *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
10859
130
    return 1;
10860
130
}
10861
10862
/*[clinic input]
10863
str.center as unicode_center
10864
10865
    width: Py_ssize_t
10866
    fillchar: Py_UCS4 = ' '
10867
    /
10868
10869
Return a centered string of length width.
10870
10871
Padding is done using the specified fill character (default is a space).
10872
[clinic start generated code]*/
10873
10874
static PyObject *
10875
unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10876
/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
10877
0
{
10878
0
    Py_ssize_t marg, left;
10879
10880
0
    if (PyUnicode_GET_LENGTH(self) >= width)
10881
0
        return unicode_result_unchanged(self);
10882
10883
0
    marg = width - PyUnicode_GET_LENGTH(self);
10884
0
    left = marg / 2 + (marg & width & 1);
10885
10886
0
    return pad(self, left, marg - left, fillchar);
10887
0
}
10888
10889
/* This function assumes that str1 and str2 are readied by the caller. */
10890
10891
static int
10892
unicode_compare(PyObject *str1, PyObject *str2)
10893
14.7M
{
10894
14.7M
#define COMPARE(TYPE1, TYPE2) \
10895
14.7M
    do { \
10896
13.8M
        TYPE1* p1 = (TYPE1 *)data1; \
10897
13.8M
        TYPE2* p2 = (TYPE2 *)data2; \
10898
13.8M
        TYPE1* end = p1 + len; \
10899
13.8M
        Py_UCS4 c1, c2; \
10900
13.8M
        for (; p1 != end; p1++, p2++) { \
10901
13.8M
            c1 = *p1; \
10902
13.8M
            c2 = *p2; \
10903
13.8M
            if (c1 != c2) \
10904
13.8M
                return (c1 < c2) ? -1 : 1; \
10905
13.8M
        } \
10906
13.8M
    } \
10907
13.8M
    while (0)
10908
10909
14.7M
    int kind1, kind2;
10910
14.7M
    const void *data1, *data2;
10911
14.7M
    Py_ssize_t len1, len2, len;
10912
10913
14.7M
    kind1 = PyUnicode_KIND(str1);
10914
14.7M
    kind2 = PyUnicode_KIND(str2);
10915
14.7M
    data1 = PyUnicode_DATA(str1);
10916
14.7M
    data2 = PyUnicode_DATA(str2);
10917
14.7M
    len1 = PyUnicode_GET_LENGTH(str1);
10918
14.7M
    len2 = PyUnicode_GET_LENGTH(str2);
10919
14.7M
    len = Py_MIN(len1, len2);
10920
10921
14.7M
    switch(kind1) {
10922
775k
    case PyUnicode_1BYTE_KIND:
10923
775k
    {
10924
775k
        switch(kind2) {
10925
315k
        case PyUnicode_1BYTE_KIND:
10926
315k
        {
10927
315k
            int cmp = memcmp(data1, data2, len);
10928
            /* normalize result of memcmp() into the range [-1; 1] */
10929
315k
            if (cmp < 0)
10930
285k
                return -1;
10931
30.1k
            if (cmp > 0)
10932
24.1k
                return 1;
10933
6.04k
            break;
10934
30.1k
        }
10935
450k
        case PyUnicode_2BYTE_KIND:
10936
450k
            COMPARE(Py_UCS1, Py_UCS2);
10937
0
            break;
10938
9.38k
        case PyUnicode_4BYTE_KIND:
10939
9.38k
            COMPARE(Py_UCS1, Py_UCS4);
10940
0
            break;
10941
0
        default:
10942
0
            Py_UNREACHABLE();
10943
775k
        }
10944
6.04k
        break;
10945
775k
    }
10946
13.2M
    case PyUnicode_2BYTE_KIND:
10947
13.2M
    {
10948
13.2M
        switch(kind2) {
10949
2.59k
        case PyUnicode_1BYTE_KIND:
10950
2.59k
            COMPARE(Py_UCS2, Py_UCS1);
10951
0
            break;
10952
12.8M
        case PyUnicode_2BYTE_KIND:
10953
12.8M
        {
10954
12.8M
            COMPARE(Py_UCS2, Py_UCS2);
10955
0
            break;
10956
12.8M
        }
10957
358k
        case PyUnicode_4BYTE_KIND:
10958
358k
            COMPARE(Py_UCS2, Py_UCS4);
10959
0
            break;
10960
0
        default:
10961
0
            Py_UNREACHABLE();
10962
13.2M
        }
10963
0
        break;
10964
13.2M
    }
10965
781k
    case PyUnicode_4BYTE_KIND:
10966
781k
    {
10967
781k
        switch(kind2) {
10968
319
        case PyUnicode_1BYTE_KIND:
10969
319
            COMPARE(Py_UCS4, Py_UCS1);
10970
0
            break;
10971
156k
        case PyUnicode_2BYTE_KIND:
10972
156k
            COMPARE(Py_UCS4, Py_UCS2);
10973
0
            break;
10974
624k
        case PyUnicode_4BYTE_KIND:
10975
624k
        {
10976
624k
#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10977
624k
            int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10978
            /* normalize result of wmemcmp() into the range [-1; 1] */
10979
624k
            if (cmp < 0)
10980
295k
                return -1;
10981
328k
            if (cmp > 0)
10982
328k
                return 1;
10983
#else
10984
            COMPARE(Py_UCS4, Py_UCS4);
10985
#endif
10986
0
            break;
10987
328k
        }
10988
0
        default:
10989
0
            Py_UNREACHABLE();
10990
781k
        }
10991
0
        break;
10992
781k
    }
10993
0
    default:
10994
0
        Py_UNREACHABLE();
10995
14.7M
    }
10996
10997
6.04k
    if (len1 == len2)
10998
6.00k
        return 0;
10999
36
    if (len1 < len2)
11000
10
        return -1;
11001
26
    else
11002
26
        return 1;
11003
11004
36
#undef COMPARE
11005
36
}
11006
11007
11008
int
11009
_PyUnicode_Equal(PyObject *str1, PyObject *str2)
11010
604M
{
11011
604M
    assert(PyUnicode_Check(str1));
11012
604M
    assert(PyUnicode_Check(str2));
11013
604M
    if (str1 == str2) {
11014
86.7M
        return 1;
11015
86.7M
    }
11016
518M
    return unicode_eq(str1, str2);
11017
604M
}
11018
11019
11020
int
11021
PyUnicode_Equal(PyObject *str1, PyObject *str2)
11022
0
{
11023
0
    if (!PyUnicode_Check(str1)) {
11024
0
        PyErr_Format(PyExc_TypeError,
11025
0
                     "first argument must be str, not %T", str1);
11026
0
        return -1;
11027
0
    }
11028
0
    if (!PyUnicode_Check(str2)) {
11029
0
        PyErr_Format(PyExc_TypeError,
11030
0
                     "second argument must be str, not %T", str2);
11031
0
        return -1;
11032
0
    }
11033
11034
0
    return _PyUnicode_Equal(str1, str2);
11035
0
}
11036
11037
11038
int
11039
PyUnicode_Compare(PyObject *left, PyObject *right)
11040
261k
{
11041
261k
    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11042
        /* a string is equal to itself */
11043
261k
        if (left == right)
11044
0
            return 0;
11045
11046
261k
        return unicode_compare(left, right);
11047
261k
    }
11048
0
    PyErr_Format(PyExc_TypeError,
11049
0
                 "Can't compare %.100s and %.100s",
11050
0
                 Py_TYPE(left)->tp_name,
11051
0
                 Py_TYPE(right)->tp_name);
11052
0
    return -1;
11053
261k
}
11054
11055
int
11056
PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11057
4.61M
{
11058
4.61M
    Py_ssize_t i;
11059
4.61M
    int kind;
11060
4.61M
    Py_UCS4 chr;
11061
11062
4.61M
    assert(_PyUnicode_CHECK(uni));
11063
4.61M
    kind = PyUnicode_KIND(uni);
11064
4.61M
    if (kind == PyUnicode_1BYTE_KIND) {
11065
4.61M
        const void *data = PyUnicode_1BYTE_DATA(uni);
11066
4.61M
        size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
11067
4.61M
        size_t len, len2 = strlen(str);
11068
4.61M
        int cmp;
11069
11070
4.61M
        len = Py_MIN(len1, len2);
11071
4.61M
        cmp = memcmp(data, str, len);
11072
4.61M
        if (cmp != 0) {
11073
4.32M
            if (cmp < 0)
11074
49.5k
                return -1;
11075
4.27M
            else
11076
4.27M
                return 1;
11077
4.32M
        }
11078
290k
        if (len1 > len2)
11079
198
            return 1; /* uni is longer */
11080
290k
        if (len1 < len2)
11081
675
            return -1; /* str is longer */
11082
289k
        return 0;
11083
290k
    }
11084
1.50k
    else {
11085
1.50k
        const void *data = PyUnicode_DATA(uni);
11086
        /* Compare Unicode string and source character set string */
11087
2.61k
        for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
11088
2.41k
            if (chr != (unsigned char)str[i])
11089
1.29k
                return (chr < (unsigned char)(str[i])) ? -1 : 1;
11090
        /* This check keeps Python strings that end in '\0' from comparing equal
11091
         to C strings identical up to that point. */
11092
203
        if (PyUnicode_GET_LENGTH(uni) != i || chr)
11093
203
            return 1; /* uni is longer */
11094
0
        if (str[i])
11095
0
            return -1; /* str is longer */
11096
0
        return 0;
11097
0
    }
11098
4.61M
}
11099
11100
int
11101
PyUnicode_EqualToUTF8(PyObject *unicode, const char *str)
11102
24
{
11103
24
    return PyUnicode_EqualToUTF8AndSize(unicode, str, strlen(str));
11104
24
}
11105
11106
int
11107
PyUnicode_EqualToUTF8AndSize(PyObject *unicode, const char *str, Py_ssize_t size)
11108
24
{
11109
24
    assert(_PyUnicode_CHECK(unicode));
11110
24
    assert(str);
11111
11112
24
    if (PyUnicode_IS_ASCII(unicode)) {
11113
24
        Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
11114
24
        return size == len &&
11115
0
            memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11116
24
    }
11117
0
    if (PyUnicode_UTF8(unicode) != NULL) {
11118
0
        Py_ssize_t len = PyUnicode_UTF8_LENGTH(unicode);
11119
0
        return size == len &&
11120
0
            memcmp(PyUnicode_UTF8(unicode), str, len) == 0;
11121
0
    }
11122
11123
0
    Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
11124
0
    if ((size_t)len >= (size_t)size || (size_t)len < (size_t)size / 4) {
11125
0
        return 0;
11126
0
    }
11127
0
    const unsigned char *s = (const unsigned char *)str;
11128
0
    const unsigned char *ends = s + (size_t)size;
11129
0
    int kind = PyUnicode_KIND(unicode);
11130
0
    const void *data = PyUnicode_DATA(unicode);
11131
    /* Compare Unicode string and UTF-8 string */
11132
0
    for (Py_ssize_t i = 0; i < len; i++) {
11133
0
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11134
0
        if (ch < 0x80) {
11135
0
            if (ends == s || s[0] != ch) {
11136
0
                return 0;
11137
0
            }
11138
0
            s += 1;
11139
0
        }
11140
0
        else if (ch < 0x800) {
11141
0
            if ((ends - s) < 2 ||
11142
0
                s[0] != (0xc0 | (ch >> 6)) ||
11143
0
                s[1] != (0x80 | (ch & 0x3f)))
11144
0
            {
11145
0
                return 0;
11146
0
            }
11147
0
            s += 2;
11148
0
        }
11149
0
        else if (ch < 0x10000) {
11150
0
            if (Py_UNICODE_IS_SURROGATE(ch) ||
11151
0
                (ends - s) < 3 ||
11152
0
                s[0] != (0xe0 | (ch >> 12)) ||
11153
0
                s[1] != (0x80 | ((ch >> 6) & 0x3f)) ||
11154
0
                s[2] != (0x80 | (ch & 0x3f)))
11155
0
            {
11156
0
                return 0;
11157
0
            }
11158
0
            s += 3;
11159
0
        }
11160
0
        else {
11161
0
            assert(ch <= MAX_UNICODE);
11162
0
            if ((ends - s) < 4 ||
11163
0
                s[0] != (0xf0 | (ch >> 18)) ||
11164
0
                s[1] != (0x80 | ((ch >> 12) & 0x3f)) ||
11165
0
                s[2] != (0x80 | ((ch >> 6) & 0x3f)) ||
11166
0
                s[3] != (0x80 | (ch & 0x3f)))
11167
0
            {
11168
0
                return 0;
11169
0
            }
11170
0
            s += 4;
11171
0
        }
11172
0
    }
11173
0
    return s == ends;
11174
0
}
11175
11176
int
11177
_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11178
38.4M
{
11179
38.4M
    size_t len;
11180
38.4M
    assert(_PyUnicode_CHECK(unicode));
11181
38.4M
    assert(str);
11182
#ifndef NDEBUG
11183
    for (const char *p = str; *p; p++) {
11184
        assert((unsigned char)*p < 128);
11185
    }
11186
#endif
11187
38.4M
    if (!PyUnicode_IS_ASCII(unicode))
11188
143k
        return 0;
11189
38.2M
    len = (size_t)PyUnicode_GET_LENGTH(unicode);
11190
38.2M
    return strlen(str) == len &&
11191
698k
           memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11192
38.4M
}
11193
11194
PyObject *
PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
{
    /* Rich comparison of two str objects.

       Returns a new reference to True/False, NotImplemented when either
       operand is not a str, or NULL with an exception set when 'op' is not
       a valid comparison code in the identity shortcut. */
    if (!PyUnicode_Check(left) || !PyUnicode_Check(right)) {
        Py_RETURN_NOTIMPLEMENTED;
    }

    if (left == right) {
        /* Identity shortcut: a string always equals itself. */
        switch (op) {
        case Py_EQ:
        case Py_LE:
        case Py_GE:
            Py_RETURN_TRUE;
        case Py_NE:
        case Py_LT:
        case Py_GT:
            Py_RETURN_FALSE;
        default:
            PyErr_BadArgument();
            return NULL;
        }
    }

    if (op == Py_EQ || op == Py_NE) {
        /* (In)equality only needs the equality test; flip it for "!=". */
        int is_equal = unicode_eq(left, right);
        return PyBool_FromLong(is_equal ^ (op == Py_NE));
    }

    /* Ordering comparisons need the three-way result. */
    int order = unicode_compare(left, right);
    Py_RETURN_RICHCOMPARE(order, 0, op);
}
11228
11229
int
11230
PyUnicode_Contains(PyObject *str, PyObject *substr)
11231
224M
{
11232
224M
    int kind1, kind2;
11233
224M
    const void *buf1, *buf2;
11234
224M
    Py_ssize_t len1, len2;
11235
224M
    int result;
11236
11237
224M
    if (!PyUnicode_Check(substr)) {
11238
0
        PyErr_Format(PyExc_TypeError,
11239
0
                     "'in <string>' requires string as left operand, not %.100s",
11240
0
                     Py_TYPE(substr)->tp_name);
11241
0
        return -1;
11242
0
    }
11243
224M
    if (ensure_unicode(str) < 0)
11244
0
        return -1;
11245
11246
224M
    kind1 = PyUnicode_KIND(str);
11247
224M
    kind2 = PyUnicode_KIND(substr);
11248
224M
    if (kind1 < kind2)
11249
13.5M
        return 0;
11250
211M
    len1 = PyUnicode_GET_LENGTH(str);
11251
211M
    len2 = PyUnicode_GET_LENGTH(substr);
11252
211M
    if (len1 < len2)
11253
1.11M
        return 0;
11254
210M
    buf1 = PyUnicode_DATA(str);
11255
210M
    buf2 = PyUnicode_DATA(substr);
11256
210M
    if (len2 == 1) {
11257
189M
        Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11258
189M
        result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
11259
189M
        return result;
11260
189M
    }
11261
20.8M
    if (kind2 != kind1) {
11262
18.5k
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
11263
18.5k
        if (!buf2)
11264
0
            return -1;
11265
18.5k
    }
11266
11267
20.8M
    switch (kind1) {
11268
20.8M
    case PyUnicode_1BYTE_KIND:
11269
20.8M
        result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11270
20.8M
        break;
11271
14.0k
    case PyUnicode_2BYTE_KIND:
11272
14.0k
        result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11273
14.0k
        break;
11274
4.50k
    case PyUnicode_4BYTE_KIND:
11275
4.50k
        result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11276
4.50k
        break;
11277
0
    default:
11278
0
        Py_UNREACHABLE();
11279
20.8M
    }
11280
11281
20.8M
    assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
11282
20.8M
    if (kind2 != kind1)
11283
18.5k
        PyMem_Free((void *)buf2);
11284
11285
20.8M
    return result;
11286
20.8M
}
11287
11288
/* Concat to string or Unicode object giving a new Unicode object. */
11289
11290
PyObject *
PyUnicode_Concat(PyObject *left, PyObject *right)
{
    /* Return left + right.

       Returns NULL with an exception set on failure: ensure_unicode()
       rejects 'left', 'right' is not a str (TypeError), the combined
       length would exceed PY_SSIZE_T_MAX (OverflowError), or the result
       cannot be created. */
    if (ensure_unicode(left) < 0) {
        return NULL;
    }
    if (!PyUnicode_Check(right)) {
        PyErr_Format(PyExc_TypeError,
            "can only concatenate str (not \"%.200s\") to str",
            Py_TYPE(right)->tp_name);
        return NULL;
    }

    /* Shortcuts: concatenating with the empty-string singleton just
       yields the other operand. */
    PyObject *empty = _PyUnicode_GetEmpty();  // Borrowed reference
    if (left == empty) {
        return PyUnicode_FromObject(right);
    }
    if (right == empty) {
        return PyUnicode_FromObject(left);
    }

    Py_ssize_t nleft = PyUnicode_GET_LENGTH(left);
    Py_ssize_t nright = PyUnicode_GET_LENGTH(right);
    /* Check before adding so the sum itself can never overflow. */
    if (nleft > PY_SSIZE_T_MAX - nright) {
        PyErr_SetString(PyExc_OverflowError,
                        "strings are too large to concat");
        return NULL;
    }

    /* The result must be able to store the widest character of either
       operand. */
    Py_UCS4 left_max = PyUnicode_MAX_CHAR_VALUE(left);
    Py_UCS4 right_max = PyUnicode_MAX_CHAR_VALUE(right);
    Py_UCS4 widest = Py_MAX(left_max, right_max);

    PyObject *joined = PyUnicode_New(nleft + nright, widest);
    if (joined == NULL) {
        return NULL;
    }
    _PyUnicode_FastCopyCharacters(joined, 0, left, 0, nleft);
    _PyUnicode_FastCopyCharacters(joined, nleft, right, 0, nright);
    assert(_PyUnicode_CheckConsistency(joined, 1));
    return joined;
}
11338
11339
void
11340
PyUnicode_Append(PyObject **p_left, PyObject *right)
11341
5.61M
{
11342
5.61M
    PyObject *left, *res;
11343
5.61M
    Py_UCS4 maxchar, maxchar2;
11344
5.61M
    Py_ssize_t left_len, right_len, new_len;
11345
11346
5.61M
    if (p_left == NULL) {
11347
0
        if (!PyErr_Occurred())
11348
0
            PyErr_BadInternalCall();
11349
0
        return;
11350
0
    }
11351
5.61M
    left = *p_left;
11352
5.61M
    if (right == NULL || left == NULL
11353
5.61M
        || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
11354
0
        if (!PyErr_Occurred())
11355
0
            PyErr_BadInternalCall();
11356
0
        goto error;
11357
0
    }
11358
11359
    /* Shortcuts */
11360
5.61M
    PyObject *empty = _PyUnicode_GetEmpty();  // Borrowed reference
11361
5.61M
    if (left == empty) {
11362
495k
        Py_DECREF(left);
11363
495k
        *p_left = Py_NewRef(right);
11364
495k
        return;
11365
495k
    }
11366
5.12M
    if (right == empty) {
11367
13.3k
        return;
11368
13.3k
    }
11369
11370
5.10M
    left_len = PyUnicode_GET_LENGTH(left);
11371
5.10M
    right_len = PyUnicode_GET_LENGTH(right);
11372
5.10M
    if (left_len > PY_SSIZE_T_MAX - right_len) {
11373
0
        PyErr_SetString(PyExc_OverflowError,
11374
0
                        "strings are too large to concat");
11375
0
        goto error;
11376
0
    }
11377
5.10M
    new_len = left_len + right_len;
11378
11379
5.10M
    if (_PyUnicode_IsModifiable(left)
11380
5.10M
        && PyUnicode_CheckExact(right)
11381
5.10M
        && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
11382
        /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11383
           to change the structure size, but characters are stored just after
11384
           the structure, and so it requires to move all characters which is
11385
           not so different than duplicating the string. */
11386
2.34M
        && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11387
2.34M
    {
11388
        /* append inplace */
11389
2.34M
        if (unicode_resize(p_left, new_len) != 0)
11390
0
            goto error;
11391
11392
        /* copy 'right' into the newly allocated area of 'left' */
11393
2.34M
        _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
11394
2.34M
    }
11395
2.76M
    else {
11396
2.76M
        maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11397
2.76M
        maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11398
2.76M
        maxchar = Py_MAX(maxchar, maxchar2);
11399
11400
        /* Concat the two Unicode strings */
11401
2.76M
        res = PyUnicode_New(new_len, maxchar);
11402
2.76M
        if (res == NULL)
11403
0
            goto error;
11404
2.76M
        _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11405
2.76M
        _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
11406
2.76M
        Py_DECREF(left);
11407
2.76M
        *p_left = res;
11408
2.76M
    }
11409
5.10M
    assert(_PyUnicode_CheckConsistency(*p_left, 1));
11410
5.10M
    return;
11411
11412
0
error:
11413
0
    Py_CLEAR(*p_left);
11414
0
}
11415
11416
void
11417
PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11418
8
{
11419
8
    PyUnicode_Append(pleft, right);
11420
8
    Py_XDECREF(right);
11421
8
}
11422
11423
/*[clinic input]
11424
@permit_long_summary
11425
@text_signature "($self, sub[, start[, end]], /)"
11426
str.count as unicode_count -> Py_ssize_t
11427
11428
    self as str: self
11429
    sub as substr: unicode
11430
    start: slice_index(accept={int, NoneType}, c_default='0') = None
11431
    end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
11432
    /
11433
11434
Return the number of non-overlapping occurrences of substring sub in string S[start:end].
11435
11436
Optional arguments start and end are interpreted as in slice notation.
11437
[clinic start generated code]*/
11438
11439
static Py_ssize_t
11440
unicode_count_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
11441
                   Py_ssize_t end)
11442
/*[clinic end generated code: output=8fcc3aef0b18edbf input=8590716ee228b935]*/
11443
26.7M
{
11444
26.7M
    assert(PyUnicode_Check(str));
11445
26.7M
    assert(PyUnicode_Check(substr));
11446
11447
26.7M
    Py_ssize_t result;
11448
26.7M
    int kind1, kind2;
11449
26.7M
    const void *buf1 = NULL, *buf2 = NULL;
11450
26.7M
    Py_ssize_t len1, len2;
11451
11452
26.7M
    kind1 = PyUnicode_KIND(str);
11453
26.7M
    kind2 = PyUnicode_KIND(substr);
11454
26.7M
    if (kind1 < kind2)
11455
0
        return 0;
11456
11457
26.7M
    len1 = PyUnicode_GET_LENGTH(str);
11458
26.7M
    len2 = PyUnicode_GET_LENGTH(substr);
11459
26.7M
    ADJUST_INDICES(start, end, len1);
11460
26.7M
    if (end - start < len2)
11461
3.02M
        return 0;
11462
11463
23.7M
    buf1 = PyUnicode_DATA(str);
11464
23.7M
    buf2 = PyUnicode_DATA(substr);
11465
23.7M
    if (kind2 != kind1) {
11466
6.76M
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
11467
6.76M
        if (!buf2)
11468
0
            goto onError;
11469
6.76M
    }
11470
11471
    // We don't reuse `anylib_count` here because of the explicit casts.
11472
23.7M
    switch (kind1) {
11473
17.0M
    case PyUnicode_1BYTE_KIND:
11474
17.0M
        result = ucs1lib_count(
11475
17.0M
            ((const Py_UCS1*)buf1) + start, end - start,
11476
17.0M
            buf2, len2, PY_SSIZE_T_MAX
11477
17.0M
            );
11478
17.0M
        break;
11479
4.01M
    case PyUnicode_2BYTE_KIND:
11480
4.01M
        result = ucs2lib_count(
11481
4.01M
            ((const Py_UCS2*)buf1) + start, end - start,
11482
4.01M
            buf2, len2, PY_SSIZE_T_MAX
11483
4.01M
            );
11484
4.01M
        break;
11485
2.75M
    case PyUnicode_4BYTE_KIND:
11486
2.75M
        result = ucs4lib_count(
11487
2.75M
            ((const Py_UCS4*)buf1) + start, end - start,
11488
2.75M
            buf2, len2, PY_SSIZE_T_MAX
11489
2.75M
            );
11490
2.75M
        break;
11491
0
    default:
11492
0
        Py_UNREACHABLE();
11493
23.7M
    }
11494
11495
23.7M
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
11496
23.7M
    if (kind2 != kind1)
11497
6.76M
        PyMem_Free((void *)buf2);
11498
11499
23.7M
    return result;
11500
0
  onError:
11501
0
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
11502
0
    if (kind2 != kind1)
11503
0
        PyMem_Free((void *)buf2);
11504
0
    return -1;
11505
23.7M
}
11506
11507
/*[clinic input]
11508
str.encode as unicode_encode
11509
11510
    encoding: str(c_default="NULL") = 'utf-8'
11511
        The encoding in which to encode the string.
11512
    errors: str(c_default="NULL") = 'strict'
11513
        The error handling scheme to use for encoding errors.
11514
        The default is 'strict' meaning that encoding errors raise a
11515
        UnicodeEncodeError.  Other possible values are 'ignore', 'replace' and
11516
        'xmlcharrefreplace' as well as any other name registered with
11517
        codecs.register_error that can handle UnicodeEncodeErrors.
11518
11519
Encode the string using the codec registered for encoding.
11520
[clinic start generated code]*/
11521
11522
static PyObject *
unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
{
    /* str.encode(): delegate entirely to PyUnicode_AsEncodedString() and
       hand its result (including a failure result) straight back. */
    PyObject *encoded = PyUnicode_AsEncodedString(self, encoding, errors);
    return encoded;
}
11528
11529
/*[clinic input]
11530
str.expandtabs as unicode_expandtabs
11531
11532
    tabsize: int = 8
11533
11534
Return a copy where all tab characters are expanded using spaces.
11535
11536
If tabsize is not given, a tab size of 8 characters is assumed.
11537
[clinic start generated code]*/
11538
11539
static PyObject *
unicode_expandtabs_impl(PyObject *self, int tabsize)
/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
{
    /* str.expandtabs(): replace each tab with the spaces needed to reach
       the next multiple of 'tabsize' columns; tabs are dropped when
       'tabsize' is not positive.  The column counter restarts after '\n'
       and '\r'.

       Returns the unchanged-string result when there is no tab, and NULL
       with OverflowError if the expanded length would exceed
       PY_SSIZE_T_MAX. */
    Py_ssize_t pos, column, out_len, pad;
    Py_UCS4 ch;
    PyObject *expanded;
    void *dest;

    const Py_ssize_t src_len = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *src = PyUnicode_DATA(self);
    int saw_tab = 0;

    /* First pass: determine size of output string */
    out_len = 0;
    column = 0;
    for (pos = 0; pos < src_len; pos++) {
        ch = PyUnicode_READ(kind, src, pos);
        if (ch != '\t') {
            if (out_len > PY_SSIZE_T_MAX - 1) {
                goto overflow;
            }
            column++;
            out_len++;
            if (ch == '\n' || ch == '\r') {
                column = 0;
            }
            continue;
        }

        saw_tab = 1;
        if (tabsize > 0) {
            pad = tabsize - (column % tabsize); /* cannot overflow */
            if (out_len > PY_SSIZE_T_MAX - pad) {
                goto overflow;
            }
            column += pad;
            out_len += pad;
        }
    }
    if (!saw_tab) {
        return unicode_result_unchanged(self);
    }

    /* Second pass: create output string and fill it */
    expanded = PyUnicode_New(out_len, PyUnicode_MAX_CHAR_VALUE(self));
    if (expanded == NULL) {
        return NULL;
    }
    dest = PyUnicode_DATA(expanded);

    /* 'out_len' now serves as the write position in the output. */
    out_len = 0;
    column = 0;
    for (pos = 0; pos < src_len; pos++) {
        ch = PyUnicode_READ(kind, src, pos);
        if (ch != '\t') {
            column++;
            PyUnicode_WRITE(kind, dest, out_len, ch);
            out_len++;
            if (ch == '\n' || ch == '\r') {
                column = 0;
            }
            continue;
        }

        if (tabsize > 0) {
            pad = tabsize - (column % tabsize);
            column += pad;
            _PyUnicode_Fill(kind, dest, ' ', out_len, pad);
            out_len += pad;
        }
    }
    assert (out_len == PyUnicode_GET_LENGTH(expanded));
    return unicode_result(expanded);

  overflow:
    PyErr_SetString(PyExc_OverflowError, "new string is too long");
    return NULL;
}
11614
11615
/*[clinic input]
11616
@permit_long_summary
11617
str.find as unicode_find = str.count
11618
11619
Return the lowest index in S where substring sub is found, such that sub is contained within S[start:end].
11620
11621
Optional arguments start and end are interpreted as in slice notation.
11622
Return -1 on failure.
11623
[clinic start generated code]*/
11624
11625
static Py_ssize_t
11626
unicode_find_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
11627
                  Py_ssize_t end)
11628
/*[clinic end generated code: output=51dbe6255712e278 input=3a9d650fe4c24695]*/
11629
25.1M
{
11630
25.1M
    Py_ssize_t result = any_find_slice(str, substr, start, end, 1);
11631
25.1M
    if (result < 0) {
11632
5.36M
        return -1;
11633
5.36M
    }
11634
19.7M
    return result;
11635
25.1M
}
11636
11637
static PyObject *
11638
unicode_getitem(PyObject *self, Py_ssize_t index)
11639
44.9M
{
11640
44.9M
    const void *data;
11641
44.9M
    int kind;
11642
44.9M
    Py_UCS4 ch;
11643
11644
44.9M
    if (!PyUnicode_Check(self)) {
11645
0
        PyErr_BadArgument();
11646
0
        return NULL;
11647
0
    }
11648
44.9M
    if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11649
15.4k
        PyErr_SetString(PyExc_IndexError, "string index out of range");
11650
15.4k
        return NULL;
11651
15.4k
    }
11652
44.9M
    kind = PyUnicode_KIND(self);
11653
44.9M
    data = PyUnicode_DATA(self);
11654
44.9M
    ch = PyUnicode_READ(kind, data, index);
11655
44.9M
    return unicode_char(ch);
11656
44.9M
}
11657
11658
/* Believe it or not, this produces the same value for ASCII strings
11659
   as bytes_hash(). */
11660
static Py_hash_t
11661
unicode_hash(PyObject *self)
11662
987M
{
11663
987M
    Py_uhash_t x;  /* Unsigned for defined overflow behavior. */
11664
11665
#ifdef Py_DEBUG
11666
    assert(_Py_HashSecret_Initialized);
11667
#endif
11668
987M
    Py_hash_t hash = PyUnicode_HASH(self);
11669
987M
    if (hash != -1) {
11670
940M
        return hash;
11671
940M
    }
11672
47.0M
    x = Py_HashBuffer(PyUnicode_DATA(self),
11673
47.0M
                      PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
11674
11675
47.0M
    PyUnicode_SET_HASH(self, x);
11676
47.0M
    return x;
11677
987M
}
11678
11679
/*[clinic input]
11680
@permit_long_summary
11681
str.index as unicode_index = str.count
11682
11683
Return the lowest index in S where substring sub is found, such that sub is contained within S[start:end].
11684
11685
Optional arguments start and end are interpreted as in slice notation.
11686
Raises ValueError when the substring is not found.
11687
[clinic start generated code]*/
11688
11689
static Py_ssize_t
11690
unicode_index_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
11691
                   Py_ssize_t end)
11692
/*[clinic end generated code: output=77558288837cdf40 input=ae5e48f69ed75b06]*/
11693
45.2k
{
11694
45.2k
    Py_ssize_t result = any_find_slice(str, substr, start, end, 1);
11695
45.2k
    if (result == -1) {
11696
712
        PyErr_SetString(PyExc_ValueError, "substring not found");
11697
712
    }
11698
44.5k
    else if (result < 0) {
11699
0
        return -1;
11700
0
    }
11701
45.2k
    return result;
11702
45.2k
}
11703
11704
/*[clinic input]
11705
str.isascii as unicode_isascii
11706
11707
Return True if all characters in the string are ASCII, False otherwise.
11708
11709
ASCII characters have code points in the range U+0000-U+007F.
11710
Empty string is ASCII too.
11711
[clinic start generated code]*/
11712
11713
static PyObject *
unicode_isascii_impl(PyObject *self)
/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
{
    /* str.isascii(): answered directly from the string's ASCII flag,
       without scanning the characters. */
    if (PyUnicode_IS_ASCII(self)) {
        Py_RETURN_TRUE;
    }
    Py_RETURN_FALSE;
}
11719
11720
/*[clinic input]
11721
@permit_long_docstring_body
11722
str.islower as unicode_islower
11723
11724
Return True if the string is a lowercase string, False otherwise.
11725
11726
A string is lowercase if all cased characters in the string are lowercase and
11727
there is at least one cased character in the string.
11728
[clinic start generated code]*/
11729
11730
static PyObject *
unicode_islower_impl(PyObject *self)
/*[clinic end generated code: output=dbd41995bd005b81 input=c6fc0295241a1aaa]*/
{
    /* str.islower(): True when no character is uppercase or titlecase and
       at least one character is lowercase. */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (nchars == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
    }

    int seen_lower = 0;
    for (Py_ssize_t pos = 0; pos < nchars; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        /* Any uppercase or titlecase character disqualifies the string. */
        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
            Py_RETURN_FALSE;
        }
        /* Once a lowercase character is seen, stop testing for one. */
        if (!seen_lower && Py_UNICODE_ISLOWER(ch)) {
            seen_lower = 1;
        }
    }
    return PyBool_FromLong(seen_lower);
}
11763
11764
/*[clinic input]
11765
@permit_long_docstring_body
11766
str.isupper as unicode_isupper
11767
11768
Return True if the string is an uppercase string, False otherwise.
11769
11770
A string is uppercase if all cased characters in the string are uppercase and
11771
there is at least one cased character in the string.
11772
[clinic start generated code]*/
11773
11774
static PyObject *
unicode_isupper_impl(PyObject *self)
/*[clinic end generated code: output=049209c8e7f15f59 input=8d5cb33e67efde72]*/
{
    /* str.isupper(): True when no character is lowercase or titlecase and
       at least one character is uppercase. */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (nchars == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
    }

    int seen_upper = 0;
    for (Py_ssize_t pos = 0; pos < nchars; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        /* Any lowercase or titlecase character disqualifies the string. */
        if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) {
            Py_RETURN_FALSE;
        }
        /* Once an uppercase character is seen, stop testing for one. */
        if (!seen_upper && Py_UNICODE_ISUPPER(ch)) {
            seen_upper = 1;
        }
    }
    return PyBool_FromLong(seen_upper);
}
11807
11808
/*[clinic input]
11809
str.istitle as unicode_istitle
11810
11811
Return True if the string is a title-cased string, False otherwise.
11812
11813
In a title-cased string, upper- and title-case characters may only
11814
follow uncased characters and lowercase characters only cased ones.
11815
[clinic start generated code]*/
11816
11817
static PyObject *
unicode_istitle_impl(PyObject *self)
/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
{
    /* str.istitle(): upper/titlecase characters may only follow uncased
       characters, lowercase characters may only follow cased ones, and at
       least one cased character must be present. */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (nchars == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong((Py_UNICODE_ISTITLE(only) != 0) ||
                               (Py_UNICODE_ISUPPER(only) != 0));
    }

    int any_cased = 0;      /* saw at least one cased character */
    int after_cased = 0;    /* the previous character was cased */
    for (Py_ssize_t pos = 0; pos < nchars; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
            /* A capital in the middle of a word breaks title case. */
            if (after_cased) {
                Py_RETURN_FALSE;
            }
        }
        else if (Py_UNICODE_ISLOWER(ch)) {
            /* A word must not start with a lowercase character. */
            if (!after_cased) {
                Py_RETURN_FALSE;
            }
        }
        else {
            /* Uncased character: the next cased one starts a new word. */
            after_cased = 0;
            continue;
        }
        after_cased = 1;
        any_cased = 1;
    }
    return PyBool_FromLong(any_cased);
}
11863
11864
/*[clinic input]
11865
@permit_long_docstring_body
11866
str.isspace as unicode_isspace
11867
11868
Return True if the string is a whitespace string, False otherwise.
11869
11870
A string is whitespace if all characters in the string are whitespace and there
11871
is at least one character in the string.
11872
[clinic start generated code]*/
11873
11874
static PyObject *
unicode_isspace_impl(PyObject *self)
/*[clinic end generated code: output=163a63bfa08ac2b9 input=44fe05e248c6e159]*/
{
    /* str.isspace(): True when the string is non-empty and every
       character is whitespace. */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (nchars == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
    }

    /* Stop at the first character that is not whitespace. */
    Py_ssize_t pos = 0;
    while (pos < nchars) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISSPACE(ch)) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
11902
11903
/*[clinic input]
11904
@permit_long_docstring_body
11905
str.isalpha as unicode_isalpha
11906
11907
Return True if the string is an alphabetic string, False otherwise.
11908
11909
A string is alphabetic if all characters in the string are alphabetic and there
11910
is at least one character in the string.
11911
[clinic start generated code]*/
11912
11913
static PyObject *
unicode_isalpha_impl(PyObject *self)
/*[clinic end generated code: output=cc81b9ac3883ec4f input=c233000624a56e0d]*/
{
    /* str.isalpha(): True when the string is non-empty and every
       character is alphabetic. */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (nchars == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
    }

    /* Stop at the first character that is not alphabetic. */
    Py_ssize_t pos = 0;
    while (pos < nchars) {
        if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, pos))) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
11940
11941
/*[clinic input]
11942
@permit_long_docstring_body
11943
str.isalnum as unicode_isalnum
11944
11945
Return True if the string is an alpha-numeric string, False otherwise.
11946
11947
A string is alpha-numeric if all characters in the string are alpha-numeric and
11948
there is at least one character in the string.
11949
[clinic start generated code]*/
11950
11951
static PyObject *
unicode_isalnum_impl(PyObject *self)
/*[clinic end generated code: output=a5a23490ffc3660c input=5d63ba9c9bafdb6b]*/
{
    /* str.isalnum(): True when the string is non-empty and every
       character is alpha-numeric. */
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);

    /* Special case for empty strings */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (nchars == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISALNUM(only));
    }

    /* Stop at the first character that is not alpha-numeric. */
    Py_ssize_t pos = 0;
    while (pos < nchars) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISALNUM(ch)) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
11980
11981
/*[clinic input]
11982
@permit_long_docstring_body
11983
str.isdecimal as unicode_isdecimal
11984
11985
Return True if the string is a decimal string, False otherwise.
11986
11987
A string is a decimal string if all characters in the string are decimal and
11988
there is at least one character in the string.
11989
[clinic start generated code]*/
11990
11991
static PyObject *
unicode_isdecimal_impl(PyObject *self)
/*[clinic end generated code: output=fb2dcdb62d3fc548 input=8e84a58b414935a3]*/
{
    /* str.isdecimal(): True when the string is non-empty and every
       character is a decimal character. */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (nchars == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
    }

    /* Stop at the first character that is not decimal. */
    Py_ssize_t pos = 0;
    while (pos < nchars) {
        if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, pos))) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
12018
12019
/*[clinic input]
12020
@permit_long_docstring_body
12021
str.isdigit as unicode_isdigit
12022
12023
Return True if the string is a digit string, False otherwise.
12024
12025
A string is a digit string if all characters in the string are digits and there
12026
is at least one character in the string.
12027
[clinic start generated code]*/
12028
12029
static PyObject *
unicode_isdigit_impl(PyObject *self)
/*[clinic end generated code: output=10a6985311da6858 input=99e284affb54d4a0]*/
{
    /* str.isdigit(): True when the string is non-empty and every
       character is a digit. */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (nchars == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISDIGIT(only));
    }

    /* Stop at the first character that is not a digit. */
    Py_ssize_t pos = 0;
    while (pos < nchars) {
        if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, pos))) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
12057
12058
/*[clinic input]
12059
@permit_long_docstring_body
12060
str.isnumeric as unicode_isnumeric
12061
12062
Return True if the string is a numeric string, False otherwise.
12063
12064
A string is numeric if all characters in the string are numeric and there is at
12065
least one character in the string.
12066
[clinic start generated code]*/
12067
12068
static PyObject *
unicode_isnumeric_impl(PyObject *self)
/*[clinic end generated code: output=9172a32d9013051a input=e9f5b6b8b29b0ee6]*/
{
    /* Implements str.isnumeric(): True only when the string is non-empty and
       every code point satisfies Py_UNICODE_ISNUMERIC. */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *chars = PyUnicode_DATA(self);

    /* An empty string is never numeric. */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }

    /* One-character strings are answered straight from that character. */
    if (nchars == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, chars, 0);
        return PyBool_FromLong(Py_UNICODE_ISNUMERIC(only));
    }

    /* General case: bail out on the first non-numeric character. */
    for (Py_ssize_t pos = 0; pos < nchars; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, chars, pos);
        if (!Py_UNICODE_ISNUMERIC(ch)) {
            Py_RETURN_FALSE;
        }
    }
    Py_RETURN_TRUE;
}
12095
12096
Py_ssize_t
12097
_PyUnicode_ScanIdentifier(PyObject *self)
12098
64.6k
{
12099
64.6k
    Py_ssize_t i;
12100
64.6k
    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12101
64.6k
    if (len == 0) {
12102
        /* an empty string is not a valid identifier */
12103
0
        return 0;
12104
0
    }
12105
12106
64.6k
    int kind = PyUnicode_KIND(self);
12107
64.6k
    const void *data = PyUnicode_DATA(self);
12108
64.6k
    Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12109
    /* PEP 3131 says that the first character must be in
12110
       XID_Start and subsequent characters in XID_Continue,
12111
       and for the ASCII range, the 2.x rules apply (i.e
12112
       start with letters and underscore, continue with
12113
       letters, digits, underscore). However, given the current
12114
       definition of XID_Start and XID_Continue, it is sufficient
12115
       to check just for these, except that _ must be allowed
12116
       as starting an identifier.  */
12117
64.6k
    if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12118
785
        return 0;
12119
785
    }
12120
12121
535k
    for (i = 1; i < len; i++) {
12122
471k
        ch = PyUnicode_READ(kind, data, i);
12123
471k
        if (!_PyUnicode_IsXidContinue(ch)) {
12124
345
            return i;
12125
345
        }
12126
471k
    }
12127
63.5k
    return i;
12128
63.8k
}
12129
12130
int
12131
PyUnicode_IsIdentifier(PyObject *self)
12132
53.0k
{
12133
53.0k
    Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
12134
53.0k
    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12135
    /* an empty string is not a valid identifier */
12136
53.0k
    return len && i == len;
12137
53.0k
}
12138
12139
/*[clinic input]
12140
@permit_long_docstring_body
12141
str.isidentifier as unicode_isidentifier
12142
12143
Return True if the string is a valid Python identifier, False otherwise.
12144
12145
Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
12146
such as "def" or "class".
12147
[clinic start generated code]*/
12148
12149
static PyObject *
unicode_isidentifier_impl(PyObject *self)
/*[clinic end generated code: output=fe585a9666572905 input=86315dd889d7bd04]*/
{
    /* Wrap the C-level int answer in a Python bool. */
    const int is_ident = PyUnicode_IsIdentifier(self);
    return PyBool_FromLong(is_ident);
}
12155
12156
/*[clinic input]
12157
@permit_long_summary
12158
str.isprintable as unicode_isprintable
12159
12160
Return True if all characters in the string are printable, False otherwise.
12161
12162
A character is printable if repr() may use it in its output.
12163
[clinic start generated code]*/
12164
12165
static PyObject *
unicode_isprintable_impl(PyObject *self)
/*[clinic end generated code: output=3ab9626cd32dd1a0 input=18345ba847084ec5]*/
{
    /* Implements str.isprintable(): True when every code point satisfies
       Py_UNICODE_ISPRINTABLE.  There is no empty-string special case, so an
       empty string falls through the loop and yields True. */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *chars = PyUnicode_DATA(self);

    /* One-character strings are answered straight from that character. */
    if (nchars == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, chars, 0);
        return PyBool_FromLong(Py_UNICODE_ISPRINTABLE(only));
    }

    Py_ssize_t pos = 0;
    while (pos < nchars) {
        const Py_UCS4 ch = PyUnicode_READ(kind, chars, pos);
        if (!Py_UNICODE_ISPRINTABLE(ch)) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
12189
12190
/*[clinic input]
12191
@permit_long_docstring_body
12192
str.join as unicode_join
12193
12194
    iterable: object
12195
    /
12196
12197
Concatenate any number of strings.
12198
12199
The string whose method is called is inserted in between each given string.
12200
The result is returned as a new string.
12201
12202
Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12203
[clinic start generated code]*/
12204
12205
static PyObject *
unicode_join(PyObject *self, PyObject *iterable)
/*[clinic end generated code: output=6857e7cecfe7bf98 input=bac724ed412ef3f8]*/
{
    /* str.join() forwards directly to PyUnicode_Join, with `self` as the
       separator argument. */
    PyObject *joined = PyUnicode_Join(self, iterable);
    return joined;
}
12211
12212
static Py_ssize_t
12213
unicode_length(PyObject *self)
12214
26.5M
{
12215
26.5M
    return PyUnicode_GET_LENGTH(self);
12216
26.5M
}
12217
12218
/*[clinic input]
12219
str.ljust as unicode_ljust
12220
12221
    width: Py_ssize_t
12222
    fillchar: Py_UCS4 = ' '
12223
    /
12224
12225
Return a left-justified string of length width.
12226
12227
Padding is done using the specified fill character (default is a space).
12228
[clinic start generated code]*/
12229
12230
static PyObject *
12231
unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12232
/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
12233
130
{
12234
130
    if (PyUnicode_GET_LENGTH(self) >= width)
12235
62
        return unicode_result_unchanged(self);
12236
12237
68
    return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
12238
130
}
12239
12240
/*[clinic input]
12241
str.lower as unicode_lower
12242
12243
Return a copy of the string converted to lowercase.
12244
[clinic start generated code]*/
12245
12246
static PyObject *
unicode_lower_impl(PyObject *self)
/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
{
    /* Implements str.lower().  Non-ASCII strings go through the generic
       case_operation() machinery with do_lower; ASCII strings use the
       dedicated ascii_upper_or_lower() helper (the flag value 1 presumably
       selects lowercasing -- confirm against the helper). */
    if (!PyUnicode_IS_ASCII(self)) {
        return case_operation(self, do_lower);
    }
    return ascii_upper_or_lower(self, 1);
}
12254
12255
70.3M
/* Strip modes understood by the strip helpers below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names indexed by the strip modes above.  Both the strings and the
   table itself are constant, so the whole array can live in read-only
   storage. */
static const char *const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Name of the str method for strip mode `i`; `i` must be one of
   LEFTSTRIP, RIGHTSTRIP or BOTHSTRIP. */
#define STRIPNAME(i) (stripfuncnames[i])
12263
12264
/* externally visible for str.strip(unicode) */
12265
PyObject *
12266
_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
12267
5.12M
{
12268
5.12M
    const void *data;
12269
5.12M
    int kind;
12270
5.12M
    Py_ssize_t i, j, len;
12271
5.12M
    BLOOM_MASK sepmask;
12272
5.12M
    Py_ssize_t seplen;
12273
12274
5.12M
    kind = PyUnicode_KIND(self);
12275
5.12M
    data = PyUnicode_DATA(self);
12276
5.12M
    len = PyUnicode_GET_LENGTH(self);
12277
5.12M
    seplen = PyUnicode_GET_LENGTH(sepobj);
12278
5.12M
    sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12279
5.12M
                              PyUnicode_DATA(sepobj),
12280
5.12M
                              seplen);
12281
12282
5.12M
    i = 0;
12283
5.12M
    if (striptype != RIGHTSTRIP) {
12284
498k
        while (i < len) {
12285
495k
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12286
495k
            if (!BLOOM(sepmask, ch))
12287
463k
                break;
12288
32.0k
            if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12289
2.71k
                break;
12290
29.3k
            i++;
12291
29.3k
        }
12292
468k
    }
12293
12294
5.12M
    j = len;
12295
5.12M
    if (striptype != LEFTSTRIP) {
12296
4.65M
        j--;
12297
5.43M
        while (j >= i) {
12298
4.05M
            Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12299
4.05M
            if (!BLOOM(sepmask, ch))
12300
3.17M
                break;
12301
883k
            if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12302
98.5k
                break;
12303
785k
            j--;
12304
785k
        }
12305
12306
4.65M
        j++;
12307
4.65M
    }
12308
12309
5.12M
    return PyUnicode_Substring(self, i, j);
12310
5.12M
}
12311
12312
/* Slice an exact str `container` with the bounds objects `start_o` and
   `stop_o`.  The bounds are converted by _PyEval_UnpackIndices against the
   string length; if that conversion reports failure, NULL is returned. */
PyObject*
_PyUnicode_BinarySlice(PyObject *container, PyObject *start_o, PyObject *stop_o)
{
    Py_ssize_t lo, hi;

    assert(PyUnicode_CheckExact(container));
    const Py_ssize_t total = PyUnicode_GET_LENGTH(container);
    if (_PyEval_UnpackIndices(start_o, stop_o, total, &lo, &hi)) {
        return PyUnicode_Substring(container, lo, hi);
    }
    return NULL;
}
12323
12324
PyObject*
12325
PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12326
269M
{
12327
269M
    const unsigned char *data;
12328
269M
    int kind;
12329
269M
    Py_ssize_t length;
12330
12331
269M
    length = PyUnicode_GET_LENGTH(self);
12332
269M
    end = Py_MIN(end, length);
12333
12334
269M
    if (start == 0 && end == length)
12335
70.6M
        return unicode_result_unchanged(self);
12336
12337
198M
    if (start < 0 || end < 0) {
12338
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
12339
0
        return NULL;
12340
0
    }
12341
198M
    if (start >= length || end < start)
12342
5.05M
        _Py_RETURN_UNICODE_EMPTY();
12343
12344
193M
    length = end - start;
12345
193M
    if (PyUnicode_IS_ASCII(self)) {
12346
66.0M
        data = PyUnicode_1BYTE_DATA(self);
12347
66.0M
        return _PyUnicode_FromASCII((const char*)(data + start), length);
12348
66.0M
    }
12349
127M
    else {
12350
127M
        kind = PyUnicode_KIND(self);
12351
127M
        data = PyUnicode_1BYTE_DATA(self);
12352
127M
        return PyUnicode_FromKindAndData(kind,
12353
127M
                                         data + kind * start,
12354
127M
                                         length);
12355
127M
    }
12356
193M
}
12357
12358
/* Strip whitespace from the left and/or right end of `self`, depending on
   `striptype` (LEFTSTRIP, RIGHTSTRIP or BOTHSTRIP), and return the kept
   slice via PyUnicode_Substring.  ASCII strings are classified with the
   _Py_ascii_whitespace table, all others with Py_UNICODE_ISSPACE. */
static PyObject *
do_strip(PyObject *self, int striptype)
{
    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
    const int trim_left = (striptype != RIGHTSTRIP);
    const int trim_right = (striptype != LEFTSTRIP);

    /* [lo, hi) is the range of characters that survive. */
    Py_ssize_t lo = 0;
    Py_ssize_t hi = len;

    if (PyUnicode_IS_ASCII(self)) {
        const Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(self);

        if (trim_left) {
            while (lo < len && _Py_ascii_whitespace[bytes[lo]]) {
                lo++;
            }
        }
        if (trim_right) {
            while (hi > lo && _Py_ascii_whitespace[bytes[hi - 1]]) {
                hi--;
            }
        }
    }
    else {
        const int kind = PyUnicode_KIND(self);
        const void *data = PyUnicode_DATA(self);

        if (trim_left) {
            for (; lo < len; lo++) {
                Py_UCS4 ch = PyUnicode_READ(kind, data, lo);
                if (!Py_UNICODE_ISSPACE(ch)) {
                    break;
                }
            }
        }
        if (trim_right) {
            for (; hi > lo; hi--) {
                Py_UCS4 ch = PyUnicode_READ(kind, data, hi - 1);
                if (!Py_UNICODE_ISSPACE(ch)) {
                    break;
                }
            }
        }
    }

    return PyUnicode_Substring(self, lo, hi);
}
12419
12420
12421
/* Shared back end of str.strip / str.lstrip / str.rstrip.

   With `sep` equal to None, whitespace is stripped.  With a str `sep`, the
   characters contained in it are stripped.  Any other `sep` raises
   TypeError (naming the calling method) and returns NULL. */
static PyObject *
do_argstrip(PyObject *self, int striptype, PyObject *sep)
{
    if (sep == Py_None) {
        return do_strip(self, striptype);
    }

    if (!PyUnicode_Check(sep)) {
        PyErr_Format(PyExc_TypeError,
                     "%s arg must be None or str",
                     STRIPNAME(striptype));
        return NULL;
    }

    return _PyUnicode_XStrip(self, striptype, sep);
}
12437
12438
12439
/*[clinic input]
12440
@permit_long_summary
12441
str.strip as unicode_strip
12442
12443
    chars: object = None
12444
    /
12445
12446
Return a copy of the string with leading and trailing whitespace removed.
12447
12448
If chars is given and not None, remove characters in chars instead.
12449
[clinic start generated code]*/
12450
12451
static PyObject *
unicode_strip_impl(PyObject *self, PyObject *chars)
/*[clinic end generated code: output=ca19018454345d57 input=8bc6353450345fbd]*/
{
    /* str.strip(): trim both ends. */
    PyObject *stripped = do_argstrip(self, BOTHSTRIP, chars);
    return stripped;
}
12457
12458
12459
/*[clinic input]
12460
str.lstrip as unicode_lstrip
12461
12462
    chars: object = None
12463
    /
12464
12465
Return a copy of the string with leading whitespace removed.
12466
12467
If chars is given and not None, remove characters in chars instead.
12468
[clinic start generated code]*/
12469
12470
static PyObject *
unicode_lstrip_impl(PyObject *self, PyObject *chars)
/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
{
    /* str.lstrip(): trim the leading end only. */
    PyObject *stripped = do_argstrip(self, LEFTSTRIP, chars);
    return stripped;
}
12476
12477
12478
/*[clinic input]
12479
str.rstrip as unicode_rstrip
12480
12481
    chars: object = None
12482
    /
12483
12484
Return a copy of the string with trailing whitespace removed.
12485
12486
If chars is given and not None, remove characters in chars instead.
12487
[clinic start generated code]*/
12488
12489
static PyObject *
unicode_rstrip_impl(PyObject *self, PyObject *chars)
/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
{
    /* str.rstrip(): trim the trailing end only. */
    PyObject *stripped = do_argstrip(self, RIGHTSTRIP, chars);
    return stripped;
}
12495
12496
12497
static PyObject*
12498
unicode_repeat(PyObject *str, Py_ssize_t len)
12499
414k
{
12500
414k
    PyObject *u;
12501
414k
    Py_ssize_t nchars, n;
12502
12503
414k
    if (len < 1)
12504
27.8k
        _Py_RETURN_UNICODE_EMPTY();
12505
12506
    /* no repeat, return original string */
12507
386k
    if (len == 1)
12508
29.4k
        return unicode_result_unchanged(str);
12509
12510
357k
    if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
12511
0
        PyErr_SetString(PyExc_OverflowError,
12512
0
                        "repeated string is too long");
12513
0
        return NULL;
12514
0
    }
12515
357k
    nchars = len * PyUnicode_GET_LENGTH(str);
12516
12517
357k
    u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
12518
357k
    if (!u)
12519
0
        return NULL;
12520
357k
    assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
12521
12522
357k
    if (PyUnicode_GET_LENGTH(str) == 1) {
12523
354k
        int kind = PyUnicode_KIND(str);
12524
354k
        Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
12525
354k
        if (kind == PyUnicode_1BYTE_KIND) {
12526
354k
            void *to = PyUnicode_DATA(u);
12527
354k
            memset(to, (unsigned char)fill_char, len);
12528
354k
        }
12529
0
        else if (kind == PyUnicode_2BYTE_KIND) {
12530
0
            Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
12531
0
            for (n = 0; n < len; ++n)
12532
0
                ucs2[n] = fill_char;
12533
0
        } else {
12534
0
            Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12535
0
            assert(kind == PyUnicode_4BYTE_KIND);
12536
0
            for (n = 0; n < len; ++n)
12537
0
                ucs4[n] = fill_char;
12538
0
        }
12539
354k
    }
12540
2.40k
    else {
12541
2.40k
        Py_ssize_t char_size = PyUnicode_KIND(str);
12542
2.40k
        char *to = (char *) PyUnicode_DATA(u);
12543
2.40k
        _PyBytes_Repeat(to, nchars * char_size, PyUnicode_DATA(str),
12544
2.40k
            PyUnicode_GET_LENGTH(str) * char_size);
12545
2.40k
    }
12546
12547
357k
    assert(_PyUnicode_CheckConsistency(u, 1));
12548
357k
    return u;
12549
357k
}
12550
12551
PyObject *
12552
PyUnicode_Replace(PyObject *str,
12553
                  PyObject *substr,
12554
                  PyObject *replstr,
12555
                  Py_ssize_t maxcount)
12556
0
{
12557
0
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12558
0
            ensure_unicode(replstr) < 0)
12559
0
        return NULL;
12560
0
    return replace(str, substr, replstr, maxcount);
12561
0
}
12562
12563
/*[clinic input]
12564
str.replace as unicode_replace
12565
12566
    old: unicode
12567
    new: unicode
12568
    /
12569
    count: Py_ssize_t = -1
12570
        Maximum number of occurrences to replace.
12571
        -1 (the default value) means replace all occurrences.
12572
12573
Return a copy with all occurrences of substring old replaced by new.
12574
12575
If count is given, only the first count occurrences are replaced.
12576
If count is not specified or -1, then all occurrences are replaced.
12577
[clinic start generated code]*/
12578
12579
static PyObject *
12580
unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12581
                     Py_ssize_t count)
12582
/*[clinic end generated code: output=b63f1a8b5eebf448 input=d15a6886b05e2edc]*/
12583
21.9M
{
12584
21.9M
    return replace(self, old, new, count);
12585
21.9M
}
12586
12587
/*[clinic input]
12588
@permit_long_docstring_body
12589
str.removeprefix as unicode_removeprefix
12590
12591
    prefix: unicode
12592
    /
12593
12594
Return a str with the given prefix string removed if present.
12595
12596
If the string starts with the prefix string, return string[len(prefix):].
12597
Otherwise, return a copy of the original string.
12598
[clinic start generated code]*/
12599
12600
static PyObject *
unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
/*[clinic end generated code: output=f1e5945e9763bcb9 input=1989a856dbb813f1]*/
{
    /* str.removeprefix(): tailmatch() is asked (direction -1) whether
       `self` begins with `prefix`.  A result of -1 is propagated as NULL. */
    const int matched = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);

    if (matched == -1) {
        return NULL;
    }
    if (!matched) {
        return unicode_result_unchanged(self);
    }

    /* Keep everything after the prefix. */
    const Py_ssize_t skip = PyUnicode_GET_LENGTH(prefix);
    return PyUnicode_Substring(self, skip, PyUnicode_GET_LENGTH(self));
}
12614
12615
/*[clinic input]
12616
str.removesuffix as unicode_removesuffix
12617
12618
    suffix: unicode
12619
    /
12620
12621
Return a str with the given suffix string removed if present.
12622
12623
If the string ends with the suffix string and that suffix is not empty,
12624
return string[:-len(suffix)]. Otherwise, return a copy of the original
12625
string.
12626
[clinic start generated code]*/
12627
12628
static PyObject *
unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
/*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
{
    /* str.removesuffix(): tailmatch() is asked (direction +1) whether
       `self` ends with `suffix`.  A result of -1 is propagated as NULL. */
    const int matched = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);

    if (matched == -1) {
        return NULL;
    }
    if (!matched) {
        return unicode_result_unchanged(self);
    }

    /* Keep everything before the suffix. */
    const Py_ssize_t keep = PyUnicode_GET_LENGTH(self)
                            - PyUnicode_GET_LENGTH(suffix);
    return PyUnicode_Substring(self, 0, keep);
}
12642
12643
/* Build the repr() of a str object.

   Works in two passes: the first computes the output length, the largest
   code point that will be emitted verbatim, and the quote counts; the
   second allocates the result and fills it in.  Returns a new str, or NULL
   with OverflowError set when the repr length cannot be represented in a
   Py_ssize_t (or with whatever error PyUnicode_New reports). */
static PyObject *
unicode_repr(PyObject *unicode)
{
    Py_ssize_t isize = PyUnicode_GET_LENGTH(unicode);
    const void *idata = PyUnicode_DATA(unicode);

    /* Compute length of output, quote characters, and
       maximum character */
    Py_ssize_t osize = 0;
    Py_UCS4 maxch = 127;
    Py_ssize_t squote = 0;
    Py_ssize_t dquote = 0;
    int ikind = PyUnicode_KIND(unicode);
    for (Py_ssize_t i = 0; i < isize; i++) {
        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
        Py_ssize_t incr = 1;
        switch (ch) {
        case '\'': squote++; break;
        case '"':  dquote++; break;
        case '\\': case '\t': case '\r': case '\n':
            incr = 2;
            break;
        default:
            /* Fast-path ASCII */
            if (ch < ' ' || ch == 0x7f)
                incr = 4; /* \xHH */
            else if (ch < 0x7f)
                ;
            else if (Py_UNICODE_ISPRINTABLE(ch))
                maxch = (ch > maxch) ? ch : maxch;
            else if (ch < 0x100)
                incr = 4; /* \xHH */
            else if (ch < 0x10000)
                incr = 6; /* \uHHHH */
            else
                incr = 10; /* \uHHHHHHHH */
        }
        if (osize > PY_SSIZE_T_MAX - incr) {
            PyErr_SetString(PyExc_OverflowError,
                            "string is too long to generate repr");
            return NULL;
        }
        osize += incr;
    }

    Py_UCS4 quote = '\'';
    int changed = (osize != isize);
    /* Number of quote characters that need a backslash in the output. */
    Py_ssize_t escaped_quotes = 0;
    if (squote) {
        changed = 1;
        if (dquote)
            /* Both squote and dquote present. Use squote,
               and escape them */
            escaped_quotes = squote;
        else
            quote = '"';
    }

    /* The loop above only guards the per-character growth.  The quote
       escapes and the two delimiting quotes are added afterwards, so check
       them as well: signed overflow would be undefined behaviour.
       squote is in [0, PY_SSIZE_T_MAX], so the subtraction cannot wrap. */
    if (osize > PY_SSIZE_T_MAX - 2 - escaped_quotes) {
        PyErr_SetString(PyExc_OverflowError,
                        "string is too long to generate repr");
        return NULL;
    }
    osize += escaped_quotes;
    osize += 2;   /* quotes */

    PyObject *repr = PyUnicode_New(osize, maxch);
    if (repr == NULL)
        return NULL;
    int okind = PyUnicode_KIND(repr);
    void *odata = PyUnicode_DATA(repr);

    if (!changed) {
        /* Nothing needs escaping: copy the text between two quotes. */
        PyUnicode_WRITE(okind, odata, 0, quote);

        _PyUnicode_FastCopyCharacters(repr, 1,
                                      unicode, 0,
                                      isize);

        PyUnicode_WRITE(okind, odata, osize-1, quote);
    }
    else {
        /* Dispatch to the escaping writer for the output width. */
        switch (okind) {
        case PyUnicode_1BYTE_KIND:
            ucs1lib_repr(unicode, quote, odata);
            break;
        case PyUnicode_2BYTE_KIND:
            ucs2lib_repr(unicode, quote, odata);
            break;
        default:
            assert(okind == PyUnicode_4BYTE_KIND);
            ucs4lib_repr(unicode, quote, odata);
        }
    }

    assert(_PyUnicode_CheckConsistency(repr, 1));
    return repr;
}
12733
12734
/*[clinic input]
12735
@permit_long_summary
12736
str.rfind as unicode_rfind = str.count
12737
12738
Return the highest index in S where substring sub is found, such that sub is contained within S[start:end].
12739
12740
Optional arguments start and end are interpreted as in slice notation.
12741
Return -1 on failure.
12742
[clinic start generated code]*/
12743
12744
static Py_ssize_t
12745
unicode_rfind_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
12746
                   Py_ssize_t end)
12747
/*[clinic end generated code: output=880b29f01dd014c8 input=7f7e97d5cd3299a2]*/
12748
340k
{
12749
340k
    Py_ssize_t result = any_find_slice(str, substr, start, end, -1);
12750
340k
    if (result < 0) {
12751
10.7k
        return -1;
12752
10.7k
    }
12753
329k
    return result;
12754
340k
}
12755
12756
/*[clinic input]
12757
@permit_long_summary
12758
str.rindex as unicode_rindex = str.count
12759
12760
Return the highest index in S where substring sub is found, such that sub is contained within S[start:end].
12761
12762
Optional arguments start and end are interpreted as in slice notation.
12763
Raises ValueError when the substring is not found.
12764
[clinic start generated code]*/
12765
12766
static Py_ssize_t
12767
unicode_rindex_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
12768
                    Py_ssize_t end)
12769
/*[clinic end generated code: output=5f3aef124c867fe1 input=0363a324740b3e62]*/
12770
120k
{
12771
120k
    Py_ssize_t result = any_find_slice(str, substr, start, end, -1);
12772
120k
    if (result == -1) {
12773
0
        PyErr_SetString(PyExc_ValueError, "substring not found");
12774
0
    }
12775
120k
    else if (result < 0) {
12776
0
        return -1;
12777
0
    }
12778
120k
    return result;
12779
120k
}
12780
12781
/*[clinic input]
12782
str.rjust as unicode_rjust
12783
12784
    width: Py_ssize_t
12785
    fillchar: Py_UCS4 = ' '
12786
    /
12787
12788
Return a right-justified string of length width.
12789
12790
Padding is done using the specified fill character (default is a space).
12791
[clinic start generated code]*/
12792
12793
static PyObject *
12794
unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12795
/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
12796
0
{
12797
0
    if (PyUnicode_GET_LENGTH(self) >= width)
12798
0
        return unicode_result_unchanged(self);
12799
12800
0
    return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
12801
0
}
12802
12803
PyObject *
12804
PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12805
0
{
12806
0
    if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
12807
0
        return NULL;
12808
12809
0
    return split(s, sep, maxsplit);
12810
0
}
12811
12812
/*[clinic input]
12813
@permit_long_summary
12814
str.split as unicode_split
12815
12816
    sep: object = None
12817
        The separator used to split the string.
12818
12819
        When set to None (the default value), will split on any whitespace
12820
        character (including \n \r \t \f and spaces) and will discard
12821
        empty strings from the result.
12822
    maxsplit: Py_ssize_t = -1
12823
        Maximum number of splits.
12824
        -1 (the default value) means no limit.
12825
12826
Return a list of the substrings in the string, using sep as the separator string.
12827
12828
Splitting starts at the front of the string and works to the end.
12829
12830
Note, str.split() is mainly useful for data that has been intentionally
12831
delimited.  With natural text that includes punctuation, consider using
12832
the regular expression module.
12833
12834
[clinic start generated code]*/
12835
12836
static PyObject *
12837
unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12838
/*[clinic end generated code: output=3a65b1db356948dc input=2c1fd08a78e038b8]*/
12839
22.0M
{
12840
22.0M
    if (sep == Py_None)
12841
162k
        return split(self, NULL, maxsplit);
12842
21.9M
    if (PyUnicode_Check(sep))
12843
21.9M
        return split(self, sep, maxsplit);
12844
12845
0
    PyErr_Format(PyExc_TypeError,
12846
0
                 "must be str or None, not %.100s",
12847
0
                 Py_TYPE(sep)->tp_name);
12848
0
    return NULL;
12849
21.9M
}
12850
12851
/* Split `str_obj` around the first occurrence of `sep_obj` by dispatching
   to the width-specific *_partition helper.

   When the separator cannot possibly occur (it has a wider kind or is
   longer than the string), the tuple (str_obj, "", "") is built directly.
   Returns NULL if either argument fails ensure_unicode() or if widening
   the separator buffer fails. */
PyObject *
PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
{
    if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0) {
        return NULL;
    }

    const int str_kind = PyUnicode_KIND(str_obj);
    const int sep_kind = PyUnicode_KIND(sep_obj);
    const Py_ssize_t str_len = PyUnicode_GET_LENGTH(str_obj);
    const Py_ssize_t sep_len = PyUnicode_GET_LENGTH(sep_obj);

    if (str_kind < sep_kind || str_len < sep_len) {
        PyObject *empty = _PyUnicode_GetEmpty();  // Borrowed reference
        return PyTuple_Pack(3, str_obj, empty, empty);
    }

    const void *str_buf = PyUnicode_DATA(str_obj);
    const void *sep_buf = PyUnicode_DATA(sep_obj);
    /* The helpers compare buffers of equal width, so a narrower separator
       is converted to the string's kind in a temporary buffer. */
    const int converted = (sep_kind != str_kind);
    if (converted) {
        sep_buf = unicode_askind(sep_kind, sep_buf, sep_len, str_kind);
        if (sep_buf == NULL) {
            return NULL;
        }
    }

    PyObject *result;
    if (str_kind == PyUnicode_1BYTE_KIND) {
        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) {
            result = asciilib_partition(str_obj, str_buf, str_len,
                                        sep_obj, sep_buf, sep_len);
        }
        else {
            result = ucs1lib_partition(str_obj, str_buf, str_len,
                                       sep_obj, sep_buf, sep_len);
        }
    }
    else if (str_kind == PyUnicode_2BYTE_KIND) {
        result = ucs2lib_partition(str_obj, str_buf, str_len,
                                   sep_obj, sep_buf, sep_len);
    }
    else if (str_kind == PyUnicode_4BYTE_KIND) {
        result = ucs4lib_partition(str_obj, str_buf, str_len,
                                   sep_obj, sep_buf, sep_len);
    }
    else {
        Py_UNREACHABLE();
    }

    assert((sep_kind == str_kind) == (sep_buf == PyUnicode_DATA(sep_obj)));
    if (converted) {
        /* Release the temporary widened copy of the separator. */
        PyMem_Free((void *)sep_buf);
    }

    return result;
}
12901
12902
12903
/* Split `str_obj` around `sep_obj` by dispatching to the width-specific
   *_rpartition helper.

   When the separator cannot possibly occur (it has a wider kind or is
   longer than the string), the tuple ("", "", str_obj) is built directly.
   Returns NULL if either argument fails ensure_unicode() or if widening
   the separator buffer fails. */
PyObject *
PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
{
    if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0) {
        return NULL;
    }

    const int str_kind = PyUnicode_KIND(str_obj);
    const int sep_kind = PyUnicode_KIND(sep_obj);
    const Py_ssize_t str_len = PyUnicode_GET_LENGTH(str_obj);
    const Py_ssize_t sep_len = PyUnicode_GET_LENGTH(sep_obj);

    if (str_kind < sep_kind || str_len < sep_len) {
        PyObject *empty = _PyUnicode_GetEmpty();  // Borrowed reference
        return PyTuple_Pack(3, empty, empty, str_obj);
    }

    const void *str_buf = PyUnicode_DATA(str_obj);
    const void *sep_buf = PyUnicode_DATA(sep_obj);
    /* The helpers compare buffers of equal width, so a narrower separator
       is converted to the string's kind in a temporary buffer. */
    const int converted = (sep_kind != str_kind);
    if (converted) {
        sep_buf = unicode_askind(sep_kind, sep_buf, sep_len, str_kind);
        if (sep_buf == NULL) {
            return NULL;
        }
    }

    PyObject *result;
    if (str_kind == PyUnicode_1BYTE_KIND) {
        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) {
            result = asciilib_rpartition(str_obj, str_buf, str_len,
                                         sep_obj, sep_buf, sep_len);
        }
        else {
            result = ucs1lib_rpartition(str_obj, str_buf, str_len,
                                        sep_obj, sep_buf, sep_len);
        }
    }
    else if (str_kind == PyUnicode_2BYTE_KIND) {
        result = ucs2lib_rpartition(str_obj, str_buf, str_len,
                                    sep_obj, sep_buf, sep_len);
    }
    else if (str_kind == PyUnicode_4BYTE_KIND) {
        result = ucs4lib_rpartition(str_obj, str_buf, str_len,
                                    sep_obj, sep_buf, sep_len);
    }
    else {
        Py_UNREACHABLE();
    }

    assert((sep_kind == str_kind) == (sep_buf == PyUnicode_DATA(sep_obj)));
    if (converted) {
        /* Release the temporary widened copy of the separator. */
        PyMem_Free((void *)sep_buf);
    }

    return result;
}
12953
12954
/*[clinic input]
12955
@permit_long_docstring_body
12956
str.partition as unicode_partition
12957
12958
    sep: object
12959
    /
12960
12961
Partition the string into three parts using the given separator.
12962
12963
This will search for the separator in the string.  If the separator is found,
12964
returns a 3-tuple containing the part before the separator, the separator
12965
itself, and the part after it.
12966
12967
If the separator is not found, returns a 3-tuple containing the original string
12968
and two empty strings.
12969
[clinic start generated code]*/
12970
12971
static PyObject *
unicode_partition(PyObject *self, PyObject *sep)
/*[clinic end generated code: output=e4ced7bd253ca3c4 input=4d854b520d7b0e97]*/
{
    /* str.partition(): thin method wrapper that forwards both arguments to
       PyUnicode_Partition() and returns its result (NULL included). */
    PyObject *parts = PyUnicode_Partition(self, sep);
    return parts;
}
12977
12978
/*[clinic input]
12979
@permit_long_docstring_body
12980
str.rpartition as unicode_rpartition = str.partition
12981
12982
Partition the string into three parts using the given separator.
12983
12984
This will search for the separator in the string, starting at the end. If
12985
the separator is found, returns a 3-tuple containing the part before the
12986
separator, the separator itself, and the part after it.
12987
12988
If the separator is not found, returns a 3-tuple containing two empty strings
12989
and the original string.
12990
[clinic start generated code]*/
12991
12992
static PyObject *
unicode_rpartition(PyObject *self, PyObject *sep)
/*[clinic end generated code: output=1aa13cf1156572aa input=a6adabe91e75b486]*/
{
    /* str.rpartition(): thin method wrapper that forwards both arguments to
       PyUnicode_RPartition() and returns its result (NULL included). */
    PyObject *parts = PyUnicode_RPartition(self, sep);
    return parts;
}
12998
12999
PyObject *
13000
PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
13001
0
{
13002
0
    if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
13003
0
        return NULL;
13004
13005
0
    return rsplit(s, sep, maxsplit);
13006
0
}
13007
13008
/*[clinic input]
13009
@permit_long_summary
13010
str.rsplit as unicode_rsplit = str.split
13011
13012
Return a list of the substrings in the string, using sep as the separator string.
13013
13014
Splitting starts at the end of the string and works to the front.
13015
[clinic start generated code]*/
13016
13017
static PyObject *
13018
unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13019
/*[clinic end generated code: output=c2b815c63bcabffc input=0f762e30d267fa83]*/
13020
66
{
13021
66
    if (sep == Py_None)
13022
0
        return rsplit(self, NULL, maxsplit);
13023
66
    if (PyUnicode_Check(sep))
13024
66
        return rsplit(self, sep, maxsplit);
13025
13026
0
    PyErr_Format(PyExc_TypeError,
13027
0
                 "must be str or None, not %.100s",
13028
0
                 Py_TYPE(sep)->tp_name);
13029
0
    return NULL;
13030
66
}
13031
13032
/*[clinic input]
13033
@permit_long_docstring_body
13034
str.splitlines as unicode_splitlines
13035
13036
    keepends: bool = False
13037
13038
Return a list of the lines in the string, breaking at line boundaries.
13039
13040
Line breaks are not included in the resulting list unless keepends is given and
13041
true.
13042
[clinic start generated code]*/
13043
13044
static PyObject *
unicode_splitlines_impl(PyObject *self, int keepends)
/*[clinic end generated code: output=f664dcdad153ec40 input=39eeafbfef61c827]*/
{
    /* str.splitlines(): delegates directly to PyUnicode_Splitlines(). */
    PyObject *lines = PyUnicode_Splitlines(self, keepends);
    return lines;
}
13050
13051
static
13052
PyObject *unicode_str(PyObject *self)
13053
3.02M
{
13054
3.02M
    return unicode_result_unchanged(self);
13055
3.02M
}
13056
13057
/*[clinic input]
13058
@permit_long_summary
13059
str.swapcase as unicode_swapcase
13060
13061
Convert uppercase characters to lowercase and lowercase characters to uppercase.
13062
[clinic start generated code]*/
13063
13064
static PyObject *
unicode_swapcase_impl(PyObject *self)
/*[clinic end generated code: output=5d28966bf6d7b2af input=85bc39a9b4e8ee91]*/
{
    /* str.swapcase(): runs the shared case_operation() driver with the
       do_swapcase callback. */
    PyObject *swapped = case_operation(self, do_swapcase);
    return swapped;
}
13070
13071
static int
13072
unicode_maketrans_from_dict(PyObject *x, PyObject *newdict)
13073
0
{
13074
0
    PyObject *key, *value;
13075
0
    Py_ssize_t i = 0;
13076
0
    int res;
13077
0
    while (PyDict_Next(x, &i, &key, &value)) {
13078
0
        if (PyUnicode_Check(key)) {
13079
0
            PyObject *newkey;
13080
0
            int kind;
13081
0
            const void *data;
13082
0
            if (PyUnicode_GET_LENGTH(key) != 1) {
13083
0
                PyErr_SetString(PyExc_ValueError, "string keys in translate"
13084
0
                                "table must be of length 1");
13085
0
                return -1;
13086
0
            }
13087
0
            kind = PyUnicode_KIND(key);
13088
0
            data = PyUnicode_DATA(key);
13089
0
            newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
13090
0
            if (!newkey)
13091
0
                return -1;
13092
0
            res = PyDict_SetItem(newdict, newkey, value);
13093
0
            Py_DECREF(newkey);
13094
0
            if (res < 0)
13095
0
                return -1;
13096
0
        }
13097
0
        else if (PyLong_Check(key)) {
13098
0
            if (PyDict_SetItem(newdict, key, value) < 0)
13099
0
                return -1;
13100
0
        }
13101
0
        else {
13102
0
            PyErr_SetString(PyExc_TypeError, "keys in translate table must"
13103
0
                            "be strings or integers");
13104
0
            return -1;
13105
0
        }
13106
0
    }
13107
0
    return 0;
13108
0
}
13109
13110
/*[clinic input]
13111
13112
@staticmethod
13113
str.maketrans as unicode_maketrans
13114
13115
  x: object
13116
13117
  y: unicode=NULL
13118
13119
  z: unicode=NULL
13120
13121
  /
13122
13123
Return a translation table usable for str.translate().
13124
13125
If there is only one argument, it must be a dictionary mapping Unicode
13126
ordinals (integers) or characters to Unicode ordinals, strings or None.
13127
Character keys will be then converted to ordinals.
13128
If there are two arguments, they must be strings of equal length, and
13129
in the resulting dictionary, each character in x will be mapped to the
13130
character at the same position in y. If there is a third argument, it
13131
must be a string, whose characters will be mapped to None in the result.
13132
[clinic start generated code]*/
13133
13134
static PyObject *
unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
{
    /* Build and return a new dict usable as a translation table.
       Returns NULL with an exception set on any failure. */
    PyObject *table = PyDict_New();
    if (table == NULL) {
        return NULL;
    }

    if (y == NULL) {
        /* One-argument form: x must be a dict. */
        if (!PyAnyDict_CheckExact(x)) {
            PyErr_SetString(PyExc_TypeError, "if you give only one argument "
                            "to maketrans it must be a dict");
            goto error;
        }
        /* Copy the entries, converting string keys to int keys, while
           holding the critical section on x. */
        int status;
        Py_BEGIN_CRITICAL_SECTION(x);
        status = unicode_maketrans_from_dict(x, table);
        Py_END_CRITICAL_SECTION();
        if (status < 0) {
            goto error;
        }
    }
    else {
        /* Two/three-argument form: x must be a str as long as y. */
        if (!PyUnicode_Check(x)) {
            PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
                            "be a string if there is a second argument");
            goto error;
        }
        const Py_ssize_t x_len = PyUnicode_GET_LENGTH(x);
        if (x_len != PyUnicode_GET_LENGTH(y)) {
            PyErr_SetString(PyExc_ValueError, "the first two maketrans "
                            "arguments must have equal length");
            goto error;
        }

        /* Map the code point at each index of x to the one in y. */
        const int x_kind = PyUnicode_KIND(x);
        const int y_kind = PyUnicode_KIND(y);
        const void *x_data = PyUnicode_DATA(x);
        const void *y_data = PyUnicode_DATA(y);
        for (Py_ssize_t idx = 0; idx < x_len; idx++) {
            PyObject *from = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, idx));
            if (from == NULL) {
                goto error;
            }
            PyObject *to = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, idx));
            if (to == NULL) {
                Py_DECREF(from);
                goto error;
            }
            int rc = PyDict_SetItem(table, from, to);
            Py_DECREF(from);
            Py_DECREF(to);
            if (rc < 0) {
                goto error;
            }
        }

        /* Map every code point of z (when given) to None. */
        if (z != NULL) {
            const int z_kind = PyUnicode_KIND(z);
            const void *z_data = PyUnicode_DATA(z);
            const Py_ssize_t z_len = PyUnicode_GET_LENGTH(z);
            for (Py_ssize_t idx = 0; idx < z_len; idx++) {
                PyObject *removed = PyLong_FromLong(
                    PyUnicode_READ(z_kind, z_data, idx));
                if (removed == NULL) {
                    goto error;
                }
                int rc = PyDict_SetItem(table, removed, Py_None);
                Py_DECREF(removed);
                if (rc < 0) {
                    goto error;
                }
            }
        }
    }
    return table;

  error:
    Py_DECREF(table);
    return NULL;
}
13214
13215
/*[clinic input]
13216
@permit_long_docstring_body
13217
str.translate as unicode_translate
13218
13219
    table: object
13220
        Translation table, which must be a mapping of Unicode ordinals to
13221
        Unicode ordinals, strings, or None.
13222
    /
13223
13224
Replace each character in the string using the given translation table.
13225
13226
The table must implement lookup/indexing via __getitem__, for instance a
13227
dictionary or list.  If this operation raises LookupError, the character is
13228
left untouched.  Characters mapped to None are deleted.
13229
[clinic start generated code]*/
13230
13231
static PyObject *
unicode_translate(PyObject *self, PyObject *table)
/*[clinic end generated code: output=3cb448ff2fd96bf3 input=699e5fa0ebf9f5e9]*/
{
    /* str.translate(): delegates to _PyUnicode_TranslateCharmap() with the
       error-handler name fixed to "ignore". */
    PyObject *translated = _PyUnicode_TranslateCharmap(self, table, "ignore");
    return translated;
}
13237
13238
/*[clinic input]
13239
str.upper as unicode_upper
13240
13241
Return a copy of the string converted to uppercase.
13242
[clinic start generated code]*/
13243
13244
static PyObject *
unicode_upper_impl(PyObject *self)
/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
{
    /* str.upper(): non-ASCII strings go through the generic
       case_operation() driver; ASCII strings take the dedicated
       ascii_upper_or_lower() path (second argument 0). */
    if (!PyUnicode_IS_ASCII(self)) {
        return case_operation(self, do_upper);
    }
    return ascii_upper_or_lower(self, 0);
}
13252
13253
/*[clinic input]
13254
@permit_long_summary
13255
str.zfill as unicode_zfill
13256
13257
    width: Py_ssize_t
13258
    /
13259
13260
Pad a numeric string with zeros on the left, to fill a field of the given width.
13261
13262
The string is never truncated.
13263
[clinic start generated code]*/
13264
13265
static PyObject *
13266
unicode_zfill_impl(PyObject *self, Py_ssize_t width)
13267
/*[clinic end generated code: output=e13fb6bdf8e3b9df input=25a4ee0ea3e58ce0]*/
13268
0
{
13269
0
    Py_ssize_t fill;
13270
0
    PyObject *u;
13271
0
    int kind;
13272
0
    const void *data;
13273
0
    Py_UCS4 chr;
13274
13275
0
    if (PyUnicode_GET_LENGTH(self) >= width)
13276
0
        return unicode_result_unchanged(self);
13277
13278
0
    fill = width - PyUnicode_GET_LENGTH(self);
13279
13280
0
    u = pad(self, fill, 0, '0');
13281
13282
0
    if (u == NULL)
13283
0
        return NULL;
13284
13285
0
    kind = PyUnicode_KIND(u);
13286
0
    data = PyUnicode_DATA(u);
13287
0
    chr = PyUnicode_READ(kind, data, fill);
13288
13289
0
    if (chr == '+' || chr == '-') {
13290
        /* move sign to beginning of string */
13291
0
        PyUnicode_WRITE(kind, data, 0, chr);
13292
0
        PyUnicode_WRITE(kind, data, fill, '0');
13293
0
    }
13294
13295
0
    assert(_PyUnicode_CheckConsistency(u, 1));
13296
0
    return u;
13297
0
}
13298
13299
/*[clinic input]
13300
@permit_long_summary
13301
@text_signature "($self, prefix[, start[, end]], /)"
13302
str.startswith as unicode_startswith
13303
13304
    prefix as subobj: object
13305
        A string or a tuple of strings to try.
13306
    start: slice_index(accept={int, NoneType}, c_default='0') = None
13307
        Optional start position. Default: start of the string.
13308
    end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
13309
        Optional stop position. Default: end of the string.
13310
    /
13311
13312
Return True if the string starts with the specified prefix, False otherwise.
13313
[clinic start generated code]*/
13314
13315
static PyObject *
13316
unicode_startswith_impl(PyObject *self, PyObject *subobj, Py_ssize_t start,
13317
                        Py_ssize_t end)
13318
/*[clinic end generated code: output=4bd7cfd0803051d4 input=766bdbd33df251dc]*/
13319
46.3M
{
13320
46.3M
    if (PyTuple_Check(subobj)) {
13321
2.12M
        Py_ssize_t i;
13322
7.72M
        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13323
5.62M
            PyObject *substring = PyTuple_GET_ITEM(subobj, i);
13324
5.62M
            if (!PyUnicode_Check(substring)) {
13325
0
                PyErr_Format(PyExc_TypeError,
13326
0
                             "tuple for startswith must only contain str, "
13327
0
                             "not %.100s",
13328
0
                             Py_TYPE(substring)->tp_name);
13329
0
                return NULL;
13330
0
            }
13331
5.62M
            int result = tailmatch(self, substring, start, end, -1);
13332
5.62M
            if (result < 0) {
13333
0
                return NULL;
13334
0
            }
13335
5.62M
            if (result) {
13336
18.8k
                Py_RETURN_TRUE;
13337
18.8k
            }
13338
5.62M
        }
13339
        /* nothing matched */
13340
2.12M
        Py_RETURN_FALSE;
13341
2.12M
    }
13342
44.2M
    if (!PyUnicode_Check(subobj)) {
13343
0
        PyErr_Format(PyExc_TypeError,
13344
0
                     "startswith first arg must be str or "
13345
0
                     "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13346
0
        return NULL;
13347
0
    }
13348
44.2M
    int result = tailmatch(self, subobj, start, end, -1);
13349
44.2M
    if (result < 0) {
13350
0
        return NULL;
13351
0
    }
13352
44.2M
    return PyBool_FromLong(result);
13353
44.2M
}
13354
13355
13356
/*[clinic input]
13357
@permit_long_summary
13358
@text_signature "($self, suffix[, start[, end]], /)"
13359
str.endswith as unicode_endswith
13360
13361
    suffix as subobj: object
13362
        A string or a tuple of strings to try.
13363
    start: slice_index(accept={int, NoneType}, c_default='0') = None
13364
        Optional start position. Default: start of the string.
13365
    end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
13366
        Optional stop position. Default: end of the string.
13367
    /
13368
13369
Return True if the string ends with the specified suffix, False otherwise.
13370
[clinic start generated code]*/
13371
13372
static PyObject *
13373
unicode_endswith_impl(PyObject *self, PyObject *subobj, Py_ssize_t start,
13374
                      Py_ssize_t end)
13375
/*[clinic end generated code: output=cce6f8ceb0102ca9 input=b66bf6d5547ba1aa]*/
13376
10.5M
{
13377
10.5M
    if (PyTuple_Check(subobj)) {
13378
186k
        Py_ssize_t i;
13379
355k
        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13380
329k
            PyObject *substring = PyTuple_GET_ITEM(subobj, i);
13381
329k
            if (!PyUnicode_Check(substring)) {
13382
0
                PyErr_Format(PyExc_TypeError,
13383
0
                             "tuple for endswith must only contain str, "
13384
0
                             "not %.100s",
13385
0
                             Py_TYPE(substring)->tp_name);
13386
0
                return NULL;
13387
0
            }
13388
329k
            int result = tailmatch(self, substring, start, end, +1);
13389
329k
            if (result < 0) {
13390
0
                return NULL;
13391
0
            }
13392
329k
            if (result) {
13393
160k
                Py_RETURN_TRUE;
13394
160k
            }
13395
329k
        }
13396
186k
        Py_RETURN_FALSE;
13397
186k
    }
13398
10.3M
    if (!PyUnicode_Check(subobj)) {
13399
0
        PyErr_Format(PyExc_TypeError,
13400
0
                     "endswith first arg must be str or "
13401
0
                     "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13402
0
        return NULL;
13403
0
    }
13404
10.3M
    int result = tailmatch(self, subobj, start, end, +1);
13405
10.3M
    if (result < 0) {
13406
0
        return NULL;
13407
0
    }
13408
10.3M
    return PyBool_FromLong(result);
13409
10.3M
}
13410
13411
13412
#include "stringlib/unicode_format.h"
13413
13414
PyDoc_STRVAR(format__doc__,
13415
             "format($self, /, *args, **kwargs)\n\
13416
--\n\
13417
\n\
13418
Return a formatted version of the string, using substitutions from args and kwargs.\n\
13419
The substitutions are identified by braces ('{' and '}').");
13420
13421
PyDoc_STRVAR(format_map__doc__,
13422
             "format_map($self, mapping, /)\n\
13423
--\n\
13424
\n\
13425
Return a formatted version of the string, using substitutions from mapping.\n\
13426
The substitutions are identified by braces ('{' and '}').");
13427
13428
/*[clinic input]
13429
str.__format__ as unicode___format__
13430
13431
    format_spec: unicode
13432
    /
13433
13434
Return a formatted version of the string as described by format_spec.
13435
[clinic start generated code]*/
13436
13437
static PyObject *
unicode___format___impl(PyObject *self, PyObject *format_spec)
/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
{
    /* str.__format__(): formats `self` into a fresh writer using the whole
       of `format_spec` (range 0 .. its length). */
    _PyUnicodeWriter writer;
    _PyUnicodeWriter_Init(&writer);

    const Py_ssize_t spec_end = PyUnicode_GET_LENGTH(format_spec);
    if (_PyUnicode_FormatAdvancedWriter(&writer, self, format_spec,
                                        0, spec_end) == -1)
    {
        /* release whatever the writer accumulated before failing */
        _PyUnicodeWriter_Dealloc(&writer);
        return NULL;
    }
    return _PyUnicodeWriter_Finish(&writer);
}
13454
13455
/*[clinic input]
13456
str.__sizeof__ as unicode_sizeof
13457
13458
Return the size of the string in memory, in bytes.
13459
[clinic start generated code]*/
13460
13461
static PyObject *
unicode_sizeof_impl(PyObject *self)
/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
{
    /* str.__sizeof__(): header size plus character storage, where the
       storage is (length + 1) units of the string's kind. */
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    Py_ssize_t size;

    if (PyUnicode_IS_COMPACT_ASCII(self)) {
        /* compact ASCII: one byte per character plus one extra */
        size = sizeof(PyASCIIObject) + length + 1;
    }
    else if (PyUnicode_IS_COMPACT(self)) {
        size = sizeof(PyCompactUnicodeObject) +
            (length + 1) * PyUnicode_KIND(self);
    }
    else {
        /* Two-block object: count the base object, and the character
           block only when one is present. */
        size = sizeof(PyUnicodeObject);
        if (_PyUnicode_DATA_ANY(self)) {
            size += (length + 1) * PyUnicode_KIND(self);
        }
    }

    /* add the separately held UTF-8 buffer when the object has one */
    if (_PyUnicode_HAS_UTF8_MEMORY(self)) {
        size += PyUnicode_UTF8_LENGTH(self) + 1;
    }

    return PyLong_FromSsize_t(size);
}
13489
13490
/* __getnewargs__(): return a 1-tuple holding the result of
   _PyUnicode_Copy(v), or NULL if the copy fails. */
static PyObject *
unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
{
    PyObject *copy = _PyUnicode_Copy(v);
    if (copy == NULL) {
        return NULL;
    }
    /* the "N" format unit hands our reference over to the tuple */
    PyObject *args = Py_BuildValue("(N)", copy);
    return args;
}
13498
13499
/*
13500
This function searchs the longest common leading whitespace
13501
of all lines in the [src, end).
13502
It returns the length of the common leading whitespace and sets `output` to
13503
point to the beginning of the common leading whitespace if length > 0.
13504
*/
13505
static Py_ssize_t
13506
search_longest_common_leading_whitespace(
13507
    const char *const src,
13508
    const char *const end,
13509
    const char **output)
13510
0
{
13511
    // [_start, _start + _len)
13512
    // describes the current longest common leading whitespace
13513
0
    const char *_start = NULL;
13514
0
    Py_ssize_t _len = 0;
13515
13516
0
    for (const char *iter = src; iter < end; ++iter) {
13517
0
        const char *line_start = iter;
13518
0
        const char *leading_whitespace_end = NULL;
13519
13520
        // scan the whole line
13521
0
        while (iter < end && *iter != '\n') {
13522
0
            if (!leading_whitespace_end && *iter != ' ' && *iter != '\t') {
13523
                /* `iter` points to the first non-whitespace character
13524
                   in this line */
13525
0
                if (iter == line_start) {
13526
                    // some line has no indent, fast exit!
13527
0
                    return 0;
13528
0
                }
13529
0
                leading_whitespace_end = iter;
13530
0
            }
13531
0
            ++iter;
13532
0
        }
13533
13534
        // if this line has all white space, skip it
13535
0
        if (!leading_whitespace_end) {
13536
0
            continue;
13537
0
        }
13538
13539
0
        if (!_start) {
13540
            // update the first leading whitespace
13541
0
            _start = line_start;
13542
0
            _len = leading_whitespace_end - line_start;
13543
0
            assert(_len > 0);
13544
0
        }
13545
0
        else {
13546
            /* We then compare with the current longest leading whitespace.
13547
13548
               [line_start, leading_whitespace_end) is the leading
13549
               whitespace of this line,
13550
13551
               [_start, _start + _len) is the leading whitespace of the
13552
               current longest leading whitespace. */
13553
0
            Py_ssize_t new_len = 0;
13554
0
            const char *_iter = _start, *line_iter = line_start;
13555
13556
0
            while (_iter < _start + _len && line_iter < leading_whitespace_end
13557
0
                   && *_iter == *line_iter)
13558
0
            {
13559
0
                ++_iter;
13560
0
                ++line_iter;
13561
0
                ++new_len;
13562
0
            }
13563
13564
0
            _len = new_len;
13565
0
            if (_len == 0) {
13566
                // No common things now, fast exit!
13567
0
                return 0;
13568
0
            }
13569
0
        }
13570
0
    }
13571
13572
0
    assert(_len >= 0);
13573
0
    if (_len > 0) {
13574
0
        *output = _start;
13575
0
    }
13576
0
    return _len;
13577
0
}
13578
13579
/* Dedent a string.
13580
   Intended to dedent Python source. Unlike `textwrap.dedent`, this
13581
   only supports spaces and tabs and doesn't normalize empty lines.
13582
   Return a new reference on success, NULL with exception set on error.
13583
   */
13584
PyObject *
13585
_PyUnicode_Dedent(PyObject *unicode)
13586
0
{
13587
0
    Py_ssize_t src_len = 0;
13588
0
    const char *src = PyUnicode_AsUTF8AndSize(unicode, &src_len);
13589
0
    if (!src) {
13590
0
        return NULL;
13591
0
    }
13592
0
    assert(src_len >= 0);
13593
0
    if (src_len == 0) {
13594
0
        return Py_NewRef(unicode);
13595
0
    }
13596
13597
0
    const char *const end = src + src_len;
13598
13599
    // [whitespace_start, whitespace_start + whitespace_len)
13600
    // describes the current longest common leading whitespace
13601
0
    const char *whitespace_start = NULL;
13602
0
    Py_ssize_t whitespace_len = search_longest_common_leading_whitespace(
13603
0
        src, end, &whitespace_start);
13604
13605
0
    if (whitespace_len == 0) {
13606
0
        return Py_NewRef(unicode);
13607
0
    }
13608
13609
    // now we should trigger a dedent
13610
0
    char *dest = PyMem_Malloc(src_len);
13611
0
    if (!dest) {
13612
0
        PyErr_NoMemory();
13613
0
        return NULL;
13614
0
    }
13615
0
    char *dest_iter = dest;
13616
13617
0
    for (const char *iter = src; iter < end; ++iter) {
13618
0
        const char *line_start = iter;
13619
0
        bool in_leading_space = true;
13620
13621
        // iterate over a line to find the end of a line
13622
0
        while (iter < end && *iter != '\n') {
13623
0
            if (in_leading_space && *iter != ' ' && *iter != '\t') {
13624
0
                in_leading_space = false;
13625
0
            }
13626
0
            ++iter;
13627
0
        }
13628
13629
        // invariant: *iter == '\n' or iter == end
13630
0
        bool append_newline = iter < end;
13631
13632
        // if this line has all white space, write '\n' and continue
13633
0
        if (in_leading_space && append_newline) {
13634
0
            *dest_iter++ = '\n';
13635
0
            continue;
13636
0
        }
13637
13638
        /* copy [new_line_start + whitespace_len, iter) to buffer, then
13639
            conditionally append '\n' */
13640
13641
0
        Py_ssize_t new_line_len = iter - line_start - whitespace_len;
13642
0
        assert(new_line_len >= 0);
13643
0
        memcpy(dest_iter, line_start + whitespace_len, new_line_len);
13644
13645
0
        dest_iter += new_line_len;
13646
13647
0
        if (append_newline) {
13648
0
            *dest_iter++ = '\n';
13649
0
        }
13650
0
    }
13651
13652
0
    PyObject *res = PyUnicode_FromStringAndSize(dest, dest_iter - dest);
13653
0
    PyMem_Free(dest);
13654
0
    return res;
13655
0
}
13656
13657
static PyMethodDef unicode_methods[] = {
13658
    UNICODE_ENCODE_METHODDEF
13659
    UNICODE_REPLACE_METHODDEF
13660
    UNICODE_SPLIT_METHODDEF
13661
    UNICODE_RSPLIT_METHODDEF
13662
    UNICODE_JOIN_METHODDEF
13663
    UNICODE_CAPITALIZE_METHODDEF
13664
    UNICODE_CASEFOLD_METHODDEF
13665
    UNICODE_TITLE_METHODDEF
13666
    UNICODE_CENTER_METHODDEF
13667
    UNICODE_COUNT_METHODDEF
13668
    UNICODE_EXPANDTABS_METHODDEF
13669
    UNICODE_FIND_METHODDEF
13670
    UNICODE_PARTITION_METHODDEF
13671
    UNICODE_INDEX_METHODDEF
13672
    UNICODE_LJUST_METHODDEF
13673
    UNICODE_LOWER_METHODDEF
13674
    UNICODE_LSTRIP_METHODDEF
13675
    UNICODE_RFIND_METHODDEF
13676
    UNICODE_RINDEX_METHODDEF
13677
    UNICODE_RJUST_METHODDEF
13678
    UNICODE_RSTRIP_METHODDEF
13679
    UNICODE_RPARTITION_METHODDEF
13680
    UNICODE_SPLITLINES_METHODDEF
13681
    UNICODE_STRIP_METHODDEF
13682
    UNICODE_SWAPCASE_METHODDEF
13683
    UNICODE_TRANSLATE_METHODDEF
13684
    UNICODE_UPPER_METHODDEF
13685
    UNICODE_STARTSWITH_METHODDEF
13686
    UNICODE_ENDSWITH_METHODDEF
13687
    UNICODE_REMOVEPREFIX_METHODDEF
13688
    UNICODE_REMOVESUFFIX_METHODDEF
13689
    UNICODE_ISASCII_METHODDEF
13690
    UNICODE_ISLOWER_METHODDEF
13691
    UNICODE_ISUPPER_METHODDEF
13692
    UNICODE_ISTITLE_METHODDEF
13693
    UNICODE_ISSPACE_METHODDEF
13694
    UNICODE_ISDECIMAL_METHODDEF
13695
    UNICODE_ISDIGIT_METHODDEF
13696
    UNICODE_ISNUMERIC_METHODDEF
13697
    UNICODE_ISALPHA_METHODDEF
13698
    UNICODE_ISALNUM_METHODDEF
13699
    UNICODE_ISIDENTIFIER_METHODDEF
13700
    UNICODE_ISPRINTABLE_METHODDEF
13701
    UNICODE_ZFILL_METHODDEF
13702
    {"format", _PyCFunction_CAST(do_string_format), METH_VARARGS | METH_KEYWORDS, format__doc__},
13703
    {"format_map", do_string_format_map, METH_O, format_map__doc__},
13704
    UNICODE___FORMAT___METHODDEF
13705
    UNICODE_MAKETRANS_METHODDEF
13706
    UNICODE_SIZEOF_METHODDEF
13707
    {"__getnewargs__",  unicode_getnewargs, METH_NOARGS},
13708
    {NULL, NULL}
13709
};
13710
13711
/* nb_remainder slot: `v % w`.  Formats through PyUnicode_Format() when the
   left operand is a str; otherwise returns NotImplemented. */
static PyObject *
unicode_mod(PyObject *v, PyObject *w)
{
    if (PyUnicode_Check(v)) {
        return PyUnicode_Format(v, w);
    }
    Py_RETURN_NOTIMPLEMENTED;
}
13718
13719
static PyNumberMethods unicode_as_number = {
13720
    0,              /*nb_add*/
13721
    0,              /*nb_subtract*/
13722
    0,              /*nb_multiply*/
13723
    unicode_mod,            /*nb_remainder*/
13724
};
13725
13726
static PySequenceMethods unicode_as_sequence = {
13727
    unicode_length,     /* sq_length */
13728
    PyUnicode_Concat,   /* sq_concat */
13729
    unicode_repeat,     /* sq_repeat */
13730
    unicode_getitem,    /* sq_item */
13731
    0,                  /* sq_slice */
13732
    0,                  /* sq_ass_item */
13733
    0,                  /* sq_ass_slice */
13734
    PyUnicode_Contains, /* sq_contains */
13735
};
13736
13737
/* mp_subscript slot: `self[item]` for an index-like object or a slice.
   Any other `item` type raises TypeError. */
static PyObject*
unicode_subscript(PyObject* self, PyObject* item)
{
    if (_PyIndex_Check(item)) {
        Py_ssize_t index = PyNumber_AsSsize_t(item, PyExc_IndexError);
        if (index == -1 && PyErr_Occurred()) {
            return NULL;
        }
        if (index < 0) {
            /* negative indices count from the end */
            index += PyUnicode_GET_LENGTH(self);
        }
        return unicode_getitem(self, index);
    }

    if (!PySlice_Check(item)) {
        PyErr_Format(PyExc_TypeError, "string indices must be integers, not '%.200s'",
                     Py_TYPE(item)->tp_name);
        return NULL;
    }

    Py_ssize_t start, stop, step;
    if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
        return NULL;
    }
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const Py_ssize_t slicelength = PySlice_AdjustIndices(length,
                                                         &start, &stop, step);

    if (slicelength <= 0) {
        _Py_RETURN_UNICODE_EMPTY();
    }
    if (step == 1) {
        /* contiguous slice: either the whole string or a substring */
        if (start == 0 && slicelength == length) {
            return unicode_result_unchanged(self);
        }
        return PyUnicode_Substring(self, start, start + slicelength);
    }

    /* General case: extended slice with step != 1. */
    const int src_kind = PyUnicode_KIND(self);
    const void *src_data = PyUnicode_DATA(self);

    /* First pass: find the largest selected code point so the result can
       be allocated with a fitting max_char.  The position is kept in a
       size_t, as before, so adding a negative or very large step wraps
       instead of overflowing a signed integer. */
    Py_UCS4 max_char = 127;
    if (!PyUnicode_IS_ASCII(self)) {
        const Py_UCS4 kind_limit = kind_maxchar_limit(src_kind);
        size_t pos = start;
        max_char = 0;
        for (Py_ssize_t n = 0; n < slicelength; n++, pos += step) {
            Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, pos);
            if (ch > max_char) {
                max_char = ch;
                if (max_char >= kind_limit) {
                    /* the scan stops once the limit for this kind is hit */
                    break;
                }
            }
        }
    }

    PyObject *result = PyUnicode_New(slicelength, max_char);
    if (result == NULL) {
        return NULL;
    }
    const int dest_kind = PyUnicode_KIND(result);
    void *dest_data = PyUnicode_DATA(result);

    /* Second pass: copy the selected characters. */
    size_t pos = start;
    for (Py_ssize_t n = 0; n < slicelength; n++, pos += step) {
        Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, pos);
        PyUnicode_WRITE(dest_kind, dest_data, n, ch);
    }
    assert(_PyUnicode_CheckConsistency(result, 1));
    return result;
}
13806
13807
static PyMappingMethods unicode_as_mapping = {
13808
    unicode_length,     /* mp_length */
13809
    unicode_subscript,  /* mp_subscript */
13810
    0,                  /* mp_ass_subscript */
13811
};
13812
13813
13814
static PyObject *
13815
unicode_subtype_new(PyTypeObject *type, PyObject *unicode);
13816
13817
/*[clinic input]
13818
@classmethod
13819
str.__new__ as unicode_new
13820
13821
    object as x: object = NULL
13822
    encoding: str = NULL
13823
    errors: str = NULL
13824
13825
[clinic start generated code]*/
13826
13827
static PyObject *
13828
unicode_new_impl(PyTypeObject *type, PyObject *x, const char *encoding,
13829
                 const char *errors)
13830
/*[clinic end generated code: output=fc72d4878b0b57e9 input=e81255e5676d174e]*/
13831
16.1M
{
13832
16.1M
    PyObject *unicode;
13833
16.1M
    if (x == NULL) {
13834
0
        unicode = _PyUnicode_GetEmpty();
13835
0
    }
13836
16.1M
    else if (encoding == NULL && errors == NULL) {
13837
16.1M
        unicode = PyObject_Str(x);
13838
16.1M
    }
13839
0
    else {
13840
0
        unicode = PyUnicode_FromEncodedObject(x, encoding, errors);
13841
0
    }
13842
13843
16.1M
    if (unicode != NULL && type != &PyUnicode_Type) {
13844
16.1M
        Py_SETREF(unicode, unicode_subtype_new(type, unicode));
13845
16.1M
    }
13846
16.1M
    return unicode;
13847
16.1M
}
13848
13849
static const char *
13850
arg_as_utf8(PyObject *obj, const char *name)
13851
553k
{
13852
553k
    if (!PyUnicode_Check(obj)) {
13853
0
        PyErr_Format(PyExc_TypeError,
13854
0
                     "str() argument '%s' must be str, not %T",
13855
0
                     name, obj);
13856
0
        return NULL;
13857
0
    }
13858
553k
    return _PyUnicode_AsUTF8NoNUL(obj);
13859
553k
}
13860
13861
static PyObject *
13862
unicode_vectorcall(PyObject *type, PyObject *const *args,
13863
                   size_t nargsf, PyObject *kwnames)
13864
299k
{
13865
299k
    assert(Py_Is(_PyType_CAST(type), &PyUnicode_Type));
13866
13867
299k
    Py_ssize_t nargs = PyVectorcall_NARGS(nargsf);
13868
299k
    if (kwnames != NULL && PyTuple_GET_SIZE(kwnames) != 0) {
13869
        // Fallback to unicode_new()
13870
0
        PyObject *tuple = PyTuple_FromArray(args, nargs);
13871
0
        if (tuple == NULL) {
13872
0
            return NULL;
13873
0
        }
13874
0
        PyObject *dict = _PyStack_AsDict(args + nargs, kwnames);
13875
0
        if (dict == NULL) {
13876
0
            Py_DECREF(tuple);
13877
0
            return NULL;
13878
0
        }
13879
0
        PyObject *ret = unicode_new(_PyType_CAST(type), tuple, dict);
13880
0
        Py_DECREF(tuple);
13881
0
        Py_DECREF(dict);
13882
0
        return ret;
13883
0
    }
13884
299k
    if (!_PyArg_CheckPositional("str", nargs, 0, 3)) {
13885
0
        return NULL;
13886
0
    }
13887
299k
    if (nargs == 0) {
13888
4.48k
        return _PyUnicode_GetEmpty();
13889
4.48k
    }
13890
295k
    PyObject *object = args[0];
13891
295k
    if (nargs == 1) {
13892
1.30k
        return PyObject_Str(object);
13893
1.30k
    }
13894
294k
    const char *encoding = arg_as_utf8(args[1], "encoding");
13895
294k
    if (encoding == NULL) {
13896
0
        return NULL;
13897
0
    }
13898
294k
    const char *errors = NULL;
13899
294k
    if (nargs == 3) {
13900
259k
        errors = arg_as_utf8(args[2], "errors");
13901
259k
        if (errors == NULL) {
13902
0
            return NULL;
13903
0
        }
13904
259k
    }
13905
294k
    return PyUnicode_FromEncodedObject(object, encoding, errors);
13906
294k
}
13907
13908
static PyObject *
13909
unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
13910
16.1M
{
13911
16.1M
    PyObject *self;
13912
16.1M
    Py_ssize_t length, char_size;
13913
16.1M
    int share_utf8;
13914
16.1M
    int kind;
13915
16.1M
    void *data;
13916
13917
16.1M
    assert(PyType_IsSubtype(type, &PyUnicode_Type));
13918
16.1M
    assert(_PyUnicode_CHECK(unicode));
13919
13920
16.1M
    self = type->tp_alloc(type, 0);
13921
16.1M
    if (self == NULL) {
13922
0
        return NULL;
13923
0
    }
13924
16.1M
    kind = PyUnicode_KIND(unicode);
13925
16.1M
    length = PyUnicode_GET_LENGTH(unicode);
13926
13927
16.1M
    _PyUnicode_LENGTH(self) = length;
13928
#ifdef Py_DEBUG
13929
    _PyUnicode_HASH(self) = -1;
13930
#else
13931
16.1M
    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13932
16.1M
#endif
13933
16.1M
    _PyUnicode_STATE(self).interned = 0;
13934
16.1M
    _PyUnicode_STATE(self).kind = kind;
13935
16.1M
    _PyUnicode_STATE(self).compact = 0;
13936
16.1M
    _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
13937
16.1M
    _PyUnicode_STATE(self).statically_allocated = 0;
13938
16.1M
    PyUnicode_SET_UTF8_LENGTH(self, 0);
13939
16.1M
    PyUnicode_SET_UTF8(self, NULL);
13940
16.1M
    _PyUnicode_DATA_ANY(self) = NULL;
13941
13942
16.1M
    share_utf8 = 0;
13943
16.1M
    if (kind == PyUnicode_1BYTE_KIND) {
13944
13.6M
        char_size = 1;
13945
13.6M
        if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13946
13.6M
            share_utf8 = 1;
13947
13.6M
    }
13948
2.49M
    else if (kind == PyUnicode_2BYTE_KIND) {
13949
2.43M
        char_size = 2;
13950
2.43M
    }
13951
58.1k
    else {
13952
58.1k
        assert(kind == PyUnicode_4BYTE_KIND);
13953
58.1k
        char_size = 4;
13954
58.1k
    }
13955
13956
    /* Ensure we won't overflow the length. */
13957
16.1M
    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13958
0
        PyErr_NoMemory();
13959
0
        goto onError;
13960
0
    }
13961
16.1M
    data = PyMem_Malloc((length + 1) * char_size);
13962
16.1M
    if (data == NULL) {
13963
0
        PyErr_NoMemory();
13964
0
        goto onError;
13965
0
    }
13966
13967
16.1M
    _PyUnicode_DATA_ANY(self) = data;
13968
16.1M
    if (share_utf8) {
13969
13.6M
        PyUnicode_SET_UTF8_LENGTH(self, length);
13970
13.6M
        PyUnicode_SET_UTF8(self, data);
13971
13.6M
    }
13972
13973
16.1M
    memcpy(data, PyUnicode_DATA(unicode), kind * (length + 1));
13974
16.1M
    assert(_PyUnicode_CheckConsistency(self, 1));
13975
#ifdef Py_DEBUG
13976
    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13977
#endif
13978
16.1M
    return self;
13979
13980
0
onError:
13981
0
    Py_DECREF(self);
13982
0
    return NULL;
13983
16.1M
}
13984
13985
void
13986
_PyUnicode_ExactDealloc(PyObject *op)
13987
69.3M
{
13988
69.3M
    assert(PyUnicode_CheckExact(op));
13989
69.3M
    unicode_dealloc(op);
13990
69.3M
}
13991
13992
PyDoc_STRVAR(unicode_doc,
13993
"str(object='') -> str\n\
13994
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
13995
\n\
13996
Create a new string object from the given object. If encoding or\n\
13997
errors is specified, then the object must expose a data buffer\n\
13998
that will be decoded using the given encoding and error handler.\n\
13999
Otherwise, returns the result of object.__str__() (if defined)\n\
14000
or repr(object).\n\
14001
encoding defaults to 'utf-8'.\n\
14002
errors defaults to 'strict'.");
14003
14004
static PyObject *unicode_iter(PyObject *seq);
14005
14006
PyTypeObject PyUnicode_Type = {
14007
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
14008
    "str",                        /* tp_name */
14009
    sizeof(PyUnicodeObject),      /* tp_basicsize */
14010
    0,                            /* tp_itemsize */
14011
    /* Slots */
14012
    unicode_dealloc,              /* tp_dealloc */
14013
    0,                            /* tp_vectorcall_offset */
14014
    0,                            /* tp_getattr */
14015
    0,                            /* tp_setattr */
14016
    0,                            /* tp_as_async */
14017
    unicode_repr,                 /* tp_repr */
14018
    &unicode_as_number,           /* tp_as_number */
14019
    &unicode_as_sequence,         /* tp_as_sequence */
14020
    &unicode_as_mapping,          /* tp_as_mapping */
14021
    unicode_hash,                 /* tp_hash*/
14022
    0,                            /* tp_call*/
14023
    unicode_str,                  /* tp_str */
14024
    PyObject_GenericGetAttr,      /* tp_getattro */
14025
    0,                            /* tp_setattro */
14026
    0,                            /* tp_as_buffer */
14027
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
14028
        Py_TPFLAGS_UNICODE_SUBCLASS |
14029
        _Py_TPFLAGS_MATCH_SELF, /* tp_flags */
14030
    unicode_doc,                  /* tp_doc */
14031
    0,                            /* tp_traverse */
14032
    0,                            /* tp_clear */
14033
    PyUnicode_RichCompare,        /* tp_richcompare */
14034
    0,                            /* tp_weaklistoffset */
14035
    unicode_iter,                 /* tp_iter */
14036
    0,                            /* tp_iternext */
14037
    unicode_methods,              /* tp_methods */
14038
    0,                            /* tp_members */
14039
    0,                            /* tp_getset */
14040
    0,                            /* tp_base */
14041
    0,                            /* tp_dict */
14042
    0,                            /* tp_descr_get */
14043
    0,                            /* tp_descr_set */
14044
    0,                            /* tp_dictoffset */
14045
    0,                            /* tp_init */
14046
    0,                            /* tp_alloc */
14047
    unicode_new,                  /* tp_new */
14048
    PyObject_Free,                /* tp_free */
14049
    .tp_vectorcall = unicode_vectorcall,
14050
};
14051
14052
/* Initialize the Unicode implementation */
14053
14054
static void
14055
_init_global_state(void)
14056
36
{
14057
36
    static int initialized = 0;
14058
36
    if (initialized) {
14059
0
        return;
14060
0
    }
14061
36
    initialized = 1;
14062
14063
    /* initialize the linebreak bloom filter */
14064
36
    const Py_UCS2 linebreak[] = {
14065
36
        0x000A, /* LINE FEED */
14066
36
        0x000D, /* CARRIAGE RETURN */
14067
36
        0x001C, /* FILE SEPARATOR */
14068
36
        0x001D, /* GROUP SEPARATOR */
14069
36
        0x001E, /* RECORD SEPARATOR */
14070
36
        0x0085, /* NEXT LINE */
14071
36
        0x2028, /* LINE SEPARATOR */
14072
36
        0x2029, /* PARAGRAPH SEPARATOR */
14073
36
    };
14074
36
    bloom_linebreak = make_bloom_mask(
14075
36
        PyUnicode_2BYTE_KIND, linebreak,
14076
36
        Py_ARRAY_LENGTH(linebreak));
14077
36
}
14078
14079
void
14080
_PyUnicode_InitState(PyInterpreterState *interp)
14081
36
{
14082
36
    if (!_Py_IsMainInterpreter(interp)) {
14083
0
        return;
14084
0
    }
14085
36
    _init_global_state();
14086
36
}
14087
14088
14089
PyStatus
14090
_PyUnicode_InitGlobalObjects(PyInterpreterState *interp)
14091
36
{
14092
36
    if (_Py_IsMainInterpreter(interp)) {
14093
36
        PyStatus status = init_global_interned_strings(interp);
14094
36
        if (_PyStatus_EXCEPTION(status)) {
14095
0
            return status;
14096
0
        }
14097
36
    }
14098
36
    assert(INTERNED_STRINGS);
14099
14100
36
    if (init_interned_dict(interp)) {
14101
0
        PyErr_Clear();
14102
0
        return _PyStatus_ERR("failed to create interned dict");
14103
0
    }
14104
14105
36
    return _PyStatus_OK();
14106
36
}
14107
14108
14109
PyStatus
14110
_PyUnicode_InitTypes(PyInterpreterState *interp)
14111
36
{
14112
36
    if (_PyStaticType_InitBuiltin(interp, &EncodingMapType) < 0) {
14113
0
        goto error;
14114
0
    }
14115
36
    if (_PyStaticType_InitBuiltin(interp, &PyFieldNameIter_Type) < 0) {
14116
0
        goto error;
14117
0
    }
14118
36
    if (_PyStaticType_InitBuiltin(interp, &PyFormatterIter_Type) < 0) {
14119
0
        goto error;
14120
0
    }
14121
36
    return _PyStatus_OK();
14122
14123
0
error:
14124
0
    return _PyStatus_ERR("Can't initialize unicode types");
14125
36
}
14126
14127
static /* non-null */ PyObject*
14128
intern_static(PyInterpreterState *interp, PyObject *s /* stolen */)
14129
40.1k
{
14130
    // Note that this steals a reference to `s`, but in many cases that
14131
    // stolen ref is returned, requiring no decref/incref.
14132
14133
40.1k
    assert(s != NULL);
14134
40.1k
    assert(_PyUnicode_CHECK(s));
14135
40.1k
    assert(_PyUnicode_STATE(s).statically_allocated);
14136
40.1k
    assert(!PyUnicode_CHECK_INTERNED(s));
14137
14138
#ifdef Py_DEBUG
14139
    /* We must not add process-global interned string if there's already a
14140
     * per-interpreter interned_dict, which might contain duplicates.
14141
     */
14142
    PyObject *interned = get_interned_dict(interp);
14143
    assert(interned == NULL);
14144
#endif
14145
14146
    /* Look in the global cache first. */
14147
40.1k
    PyObject *r = (PyObject *)_Py_hashtable_get(INTERNED_STRINGS, s);
14148
    /* We should only init each string once */
14149
40.1k
    assert(r == NULL);
14150
    /* but just in case (for the non-debug build), handle this */
14151
40.1k
    if (r != NULL && r != s) {
14152
0
        assert(_PyUnicode_STATE(r).interned == SSTATE_INTERNED_IMMORTAL_STATIC);
14153
0
        assert(_PyUnicode_CHECK(r));
14154
0
        Py_DECREF(s);
14155
0
        return Py_NewRef(r);
14156
0
    }
14157
14158
40.1k
    if (_Py_hashtable_set(INTERNED_STRINGS, s, s) < -1) {
14159
0
        Py_FatalError("failed to intern static string");
14160
0
    }
14161
14162
40.1k
    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_IMMORTAL_STATIC;
14163
40.1k
    return s;
14164
40.1k
}
14165
14166
void
14167
_PyUnicode_InternStatic(PyInterpreterState *interp, PyObject **p)
14168
40.1k
{
14169
    // This should only be called as part of runtime initialization
14170
40.1k
    assert(!Py_IsInitialized());
14171
14172
40.1k
    *p = intern_static(interp, *p);
14173
40.1k
    assert(*p);
14174
40.1k
}
14175
14176
static void
14177
immortalize_interned(PyObject *s)
14178
286k
{
14179
286k
    assert(PyUnicode_CHECK_INTERNED(s) == SSTATE_INTERNED_MORTAL);
14180
286k
    assert(!_Py_IsImmortal(s));
14181
#ifdef Py_REF_DEBUG
14182
    /* The reference count value should be excluded from the RefTotal.
14183
       The decrements to these objects will not be registered so they
14184
       need to be accounted for in here. */
14185
    for (Py_ssize_t i = 0; i < Py_REFCNT(s); i++) {
14186
        _Py_DecRefTotal(_PyThreadState_GET());
14187
    }
14188
#endif
14189
286k
    _Py_SetImmortal(s);
14190
    // The switch to SSTATE_INTERNED_IMMORTAL must be the last thing done here
14191
    // to synchronize with the check in intern_common() that avoids locking if
14192
    // the string is already immortal.
14193
286k
    FT_ATOMIC_STORE_UINT8(_PyUnicode_STATE(s).interned, SSTATE_INTERNED_IMMORTAL);
14194
286k
}
14195
14196
static /* non-null */ PyObject*
14197
intern_common(PyInterpreterState *interp, PyObject *s /* stolen */,
14198
              bool immortalize)
14199
97.4M
{
14200
    // Note that this steals a reference to `s`, but in many cases that
14201
    // stolen ref is returned, requiring no decref/incref.
14202
14203
#ifdef Py_DEBUG
14204
    assert(s != NULL);
14205
    assert(_PyUnicode_CHECK(s));
14206
#else
14207
97.4M
    if (s == NULL || !PyUnicode_Check(s)) {
14208
0
        return s;
14209
0
    }
14210
97.4M
#endif
14211
14212
    /* If it's a subclass, we don't really know what putting
14213
       it in the interned dict might do. */
14214
97.4M
    if (!PyUnicode_CheckExact(s)) {
14215
0
        return s;
14216
0
    }
14217
14218
    /* Is it already interned? */
14219
97.4M
    switch (PyUnicode_CHECK_INTERNED(s)) {
14220
5.69M
        case SSTATE_NOT_INTERNED:
14221
            // no, go on
14222
5.69M
            break;
14223
28.8k
        case SSTATE_INTERNED_MORTAL:
14224
            // yes but we might need to make it immortal
14225
28.8k
            if (immortalize) {
14226
1.80k
                immortalize_interned(s);
14227
1.80k
            }
14228
28.8k
            return s;
14229
91.7M
        default:
14230
            // all done
14231
91.7M
            return s;
14232
97.4M
    }
14233
14234
    /* Statically allocated strings must be already interned. */
14235
97.4M
    assert(!_PyUnicode_STATE(s).statically_allocated);
14236
14237
#if Py_GIL_DISABLED
14238
    /* In the free-threaded build, all interned strings are immortal */
14239
    immortalize = 1;
14240
#endif
14241
14242
    /* If it's already immortal, intern it as such */
14243
5.69M
    if (_Py_IsImmortal(s)) {
14244
0
        immortalize = 1;
14245
0
    }
14246
14247
    /* if it's a short string, get the singleton */
14248
5.69M
    if (PyUnicode_GET_LENGTH(s) == 1 &&
14249
19.8k
                PyUnicode_KIND(s) == PyUnicode_1BYTE_KIND) {
14250
0
        PyObject *r = LATIN1(*(unsigned char*)PyUnicode_DATA(s));
14251
0
        assert(PyUnicode_CHECK_INTERNED(r));
14252
0
        Py_DECREF(s);
14253
0
        return r;
14254
0
    }
14255
#ifdef Py_DEBUG
14256
    assert(!unicode_is_singleton(s));
14257
#endif
14258
14259
    /* Look in the global cache now. */
14260
5.69M
    {
14261
5.69M
        PyObject *r = (PyObject *)_Py_hashtable_get(INTERNED_STRINGS, s);
14262
5.69M
        if (r != NULL) {
14263
538k
            assert(_PyUnicode_STATE(r).statically_allocated);
14264
538k
            assert(r != s);  // r must be statically_allocated; s is not
14265
538k
            Py_DECREF(s);
14266
538k
            return Py_NewRef(r);
14267
538k
        }
14268
5.69M
    }
14269
14270
    /* Do a setdefault on the per-interpreter cache. */
14271
5.15M
    PyObject *interned = get_interned_dict(interp);
14272
5.15M
    assert(interned != NULL);
14273
#ifdef Py_GIL_DISABLED
14274
#  define INTERN_MUTEX &_Py_INTERP_CACHED_OBJECT(interp, interned_mutex)
14275
    // Lock-free fast path: check if there's already an interned copy that
14276
    // is in its final immortal state.
14277
    PyObject *r;
14278
    int res = PyDict_GetItemRef(interned, s, &r);
14279
    if (res < 0) {
14280
        PyErr_Clear();
14281
        return s;
14282
    }
14283
    if (res > 0) {
14284
        unsigned int state = _Py_atomic_load_uint8(&_PyUnicode_STATE(r).interned);
14285
        if (state == SSTATE_INTERNED_IMMORTAL) {
14286
            Py_DECREF(s);
14287
            return r;
14288
        }
14289
        // Not yet fully interned; fall through to the locking path.
14290
        Py_DECREF(r);
14291
    }
14292
#endif
14293
5.15M
    FT_MUTEX_LOCK(INTERN_MUTEX);
14294
5.15M
    PyObject *t;
14295
5.15M
    {
14296
5.15M
        int res = PyDict_SetDefaultRef(interned, s, s, &t);
14297
5.15M
        if (res < 0) {
14298
0
            PyErr_Clear();
14299
0
            FT_MUTEX_UNLOCK(INTERN_MUTEX);
14300
0
            return s;
14301
0
        }
14302
5.15M
        else if (res == 1) {
14303
            // value was already present (not inserted)
14304
4.27M
            Py_DECREF(s);
14305
4.27M
            if (immortalize &&
14306
1.07M
                    PyUnicode_CHECK_INTERNED(t) == SSTATE_INTERNED_MORTAL) {
14307
10.1k
                immortalize_interned(t);
14308
10.1k
            }
14309
4.27M
            FT_MUTEX_UNLOCK(INTERN_MUTEX);
14310
4.27M
            return t;
14311
4.27M
        }
14312
882k
        else {
14313
            // value was newly inserted
14314
882k
            assert (s == t);
14315
882k
            Py_DECREF(t);
14316
882k
        }
14317
5.15M
    }
14318
14319
    /* NOT_INTERNED -> INTERNED_MORTAL */
14320
14321
5.15M
    assert(_PyUnicode_STATE(s).interned == SSTATE_NOT_INTERNED);
14322
14323
882k
    if (!_Py_IsImmortal(s)) {
14324
        /* The two references in interned dict (key and value) are not counted.
14325
        unicode_dealloc() and _PyUnicode_ClearInterned() take care of this. */
14326
882k
        Py_DECREF(s);
14327
882k
        Py_DECREF(s);
14328
882k
    }
14329
882k
    FT_ATOMIC_STORE_UINT8(_PyUnicode_STATE(s).interned, SSTATE_INTERNED_MORTAL);
14330
14331
    /* INTERNED_MORTAL -> INTERNED_IMMORTAL (if needed) */
14332
14333
#ifdef Py_DEBUG
14334
    if (_Py_IsImmortal(s)) {
14335
        assert(immortalize);
14336
    }
14337
#endif
14338
882k
    if (immortalize) {
14339
274k
        immortalize_interned(s);
14340
274k
    }
14341
14342
882k
    FT_MUTEX_UNLOCK(INTERN_MUTEX);
14343
882k
    return s;
14344
5.15M
}
14345
14346
void
14347
_PyUnicode_InternImmortal(PyInterpreterState *interp, PyObject **p)
14348
15.0M
{
14349
15.0M
    *p = intern_common(interp, *p, 1);
14350
15.0M
    assert(*p);
14351
15.0M
}
14352
14353
void
14354
_PyUnicode_InternMortal(PyInterpreterState *interp, PyObject **p)
14355
82.4M
{
14356
82.4M
    *p = intern_common(interp, *p, 0);
14357
82.4M
    assert(*p);
14358
82.4M
}
14359
14360
14361
void
14362
_PyUnicode_InternInPlace(PyInterpreterState *interp, PyObject **p)
14363
0
{
14364
0
    _PyUnicode_InternImmortal(interp, p);
14365
0
    return;
14366
0
}
14367
14368
void
14369
PyUnicode_InternInPlace(PyObject **p)
14370
0
{
14371
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
14372
0
    _PyUnicode_InternMortal(interp, p);
14373
0
}
14374
14375
// Public-looking name kept for the stable ABI; user should not call this:
14376
PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
14377
void
14378
PyUnicode_InternImmortal(PyObject **p)
14379
0
{
14380
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
14381
0
    _PyUnicode_InternImmortal(interp, p);
14382
0
}
14383
14384
PyObject *
14385
PyUnicode_InternFromString(const char *cp)
14386
1.32M
{
14387
1.32M
    PyObject *s = PyUnicode_FromString(cp);
14388
1.32M
    if (s == NULL) {
14389
0
        return NULL;
14390
0
    }
14391
1.32M
    PyInterpreterState *interp = _PyInterpreterState_GET();
14392
1.32M
    _PyUnicode_InternMortal(interp, &s);
14393
1.32M
    return s;
14394
1.32M
}
14395
14396
14397
void
14398
_PyUnicode_ClearInterned(PyInterpreterState *interp)
14399
0
{
14400
0
    PyObject *interned = get_interned_dict(interp);
14401
0
    if (interned == NULL) {
14402
0
        return;
14403
0
    }
14404
0
    assert(PyDict_CheckExact(interned));
14405
14406
0
    if (has_shared_intern_dict(interp)) {
14407
        // the dict doesn't belong to this interpreter, skip the debug
14408
        // checks on it and just clear the pointer to it
14409
0
        clear_interned_dict(interp);
14410
0
        return;
14411
0
    }
14412
14413
#ifdef INTERNED_STATS
14414
    fprintf(stderr, "releasing %zd interned strings\n",
14415
            PyDict_GET_SIZE(interned));
14416
14417
    Py_ssize_t total_length = 0;
14418
#endif
14419
0
    Py_ssize_t pos = 0;
14420
0
    PyObject *s, *ignored_value;
14421
0
    while (PyDict_Next(interned, &pos, &s, &ignored_value)) {
14422
0
        int shared = 0;
14423
0
        switch (PyUnicode_CHECK_INTERNED(s)) {
14424
0
        case SSTATE_INTERNED_IMMORTAL:
14425
            /* Make immortal interned strings mortal again. */
14426
            // Skip the Immortal Instance check and restore
14427
            // the two references (key and value) ignored
14428
            // by PyUnicode_InternInPlace().
14429
0
            _Py_SetMortal(s, 2);
14430
#ifdef Py_REF_DEBUG
14431
            /* let's be pedantic with the ref total */
14432
            _Py_IncRefTotal(_PyThreadState_GET());
14433
            _Py_IncRefTotal(_PyThreadState_GET());
14434
#endif
14435
#ifdef INTERNED_STATS
14436
            total_length += PyUnicode_GET_LENGTH(s);
14437
#endif
14438
0
            break;
14439
0
        case SSTATE_INTERNED_IMMORTAL_STATIC:
14440
            /* It is shared between interpreters, so we should unmark it
14441
               only when this is the last interpreter in which it's
14442
               interned.  We immortalize all the statically initialized
14443
               strings during startup, so we can rely on the
14444
               main interpreter to be the last one. */
14445
0
            if (!_Py_IsMainInterpreter(interp)) {
14446
0
                shared = 1;
14447
0
            }
14448
0
            break;
14449
0
        case SSTATE_INTERNED_MORTAL:
14450
            // Restore 2 references held by the interned dict; these will
14451
            // be decref'd by clear_interned_dict's PyDict_Clear.
14452
0
            _Py_RefcntAdd(s, 2);
14453
#ifdef Py_REF_DEBUG
14454
            /* let's be pedantic with the ref total */
14455
            _Py_IncRefTotal(_PyThreadState_GET());
14456
            _Py_IncRefTotal(_PyThreadState_GET());
14457
#endif
14458
0
            break;
14459
0
        case SSTATE_NOT_INTERNED:
14460
0
            _Py_FALLTHROUGH;
14461
0
        default:
14462
0
            Py_UNREACHABLE();
14463
0
        }
14464
0
        if (!shared) {
14465
0
            FT_ATOMIC_STORE_UINT8_RELAXED(_PyUnicode_STATE(s).interned, SSTATE_NOT_INTERNED);
14466
0
        }
14467
0
    }
14468
#ifdef INTERNED_STATS
14469
    fprintf(stderr,
14470
            "total length of all interned strings: %zd characters\n",
14471
            total_length);
14472
#endif
14473
14474
0
    struct _Py_unicode_state *state = &interp->unicode;
14475
0
    struct _Py_unicode_ids *ids = &state->ids;
14476
0
    for (Py_ssize_t i=0; i < ids->size; i++) {
14477
0
        Py_XINCREF(ids->array[i]);
14478
0
    }
14479
0
    clear_interned_dict(interp);
14480
0
    if (_Py_IsMainInterpreter(interp)) {
14481
0
        clear_global_interned_strings();
14482
0
    }
14483
0
}
14484
14485
14486
/********************* Unicode Iterator **************************/
14487
14488
/* State shared by the str iterator types defined in this file. */
typedef struct {
14489
    PyObject_HEAD
14490
    Py_ssize_t it_index; /* Index of the next character to return */
14491
    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
14492
} unicodeiterobject;
14493
14494
static void
14495
unicodeiter_dealloc(PyObject *op)
14496
1.75M
{
14497
1.75M
    unicodeiterobject *it = (unicodeiterobject *)op;
14498
1.75M
    _PyObject_GC_UNTRACK(it);
14499
1.75M
    Py_XDECREF(it->it_seq);
14500
1.75M
    PyObject_GC_Del(it);
14501
1.75M
}
14502
14503
static int
14504
unicodeiter_traverse(PyObject *op, visitproc visit, void *arg)
14505
5
{
14506
5
    unicodeiterobject *it = (unicodeiterobject *)op;
14507
5
    Py_VISIT(it->it_seq);
14508
5
    return 0;
14509
5
}
14510
14511
/* tp_iternext for str_iterator: return the next character as a string.
   Returns NULL without setting an exception when the iterator is
   exhausted; at that point the reference to the string is released. */
static PyObject *
unicodeiter_next(PyObject *op)
{
    unicodeiterobject *it = (unicodeiterobject *)op;
    assert(it != NULL);

    PyObject *seq = it->it_seq;
    if (seq == NULL) {
        return NULL;
    }
    assert(_PyUnicode_CHECK(seq));

    if (it->it_index >= PyUnicode_GET_LENGTH(seq)) {
        /* Exhausted: clear the field before dropping the reference. */
        it->it_seq = NULL;
        Py_DECREF(seq);
        return NULL;
    }

    int kind = PyUnicode_KIND(seq);
    const void *data = PyUnicode_DATA(seq);
    Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
    it->it_index += 1;
    return unicode_char(chr);
}
14535
14536
/* tp_iternext for str_ascii_iterator: the string must be compact ASCII
   (asserted), so each character is returned directly from the table of
   single-character ASCII string singletons.  Returns NULL without setting
   an exception when exhausted. */
static PyObject *
unicode_ascii_iter_next(PyObject *op)
{
    unicodeiterobject *it = (unicodeiterobject *)op;
    assert(it != NULL);

    PyObject *seq = it->it_seq;
    if (seq == NULL) {
        return NULL;
    }
    assert(_PyUnicode_CHECK(seq));
    assert(PyUnicode_IS_COMPACT_ASCII(seq));

    if (it->it_index >= PyUnicode_GET_LENGTH(seq)) {
        /* Exhausted: clear the field before dropping the reference. */
        it->it_seq = NULL;
        Py_DECREF(seq);
        return NULL;
    }

    /* The character data starts right after the PyASCIIObject header. */
    const void *data = ((void*)(_PyASCIIObject_CAST(seq) + 1));
    Py_UCS1 chr = (Py_UCS1)PyUnicode_READ(PyUnicode_1BYTE_KIND,
                                          data, it->it_index);
    it->it_index += 1;
    return (PyObject*)&_Py_SINGLETON(strings).ascii[chr];
}
14558
14559
/* __length_hint__: number of characters left to iterate, or 0 once the
   iterator no longer holds its string. */
static PyObject *
unicodeiter_len(PyObject *op, PyObject *Py_UNUSED(ignored))
{
    unicodeiterobject *it = (unicodeiterobject *)op;
    Py_ssize_t remaining = 0;

    if (it->it_seq != NULL) {
        remaining = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
    }
    return PyLong_FromSsize_t(remaining);
}
14568
14569
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14570
14571
/* __reduce__: return (iter, (seq,), index) while the iterator still holds
   its string, otherwise (iter, (empty_string,)). */
static PyObject *
unicodeiter_reduce(PyObject *op, PyObject *Py_UNUSED(ignored))
{
    unicodeiterobject *it = (unicodeiterobject *)op;

    /* _PyEval_GetBuiltin can invoke arbitrary code,
     * call must be before access of iterator pointers.
     * see issue #101765 */
    PyObject *iter_builtin = _PyEval_GetBuiltin(&_Py_ID(iter));

    if (it->it_seq != NULL) {
        return Py_BuildValue("N(O)n", iter_builtin, it->it_seq, it->it_index);
    }

    PyObject *empty = _PyUnicode_GetEmpty();
    if (empty == NULL) {
        Py_XDECREF(iter_builtin);
        return NULL;
    }
    return Py_BuildValue("N(N)", iter_builtin, empty);
}
14592
14593
PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14594
14595
/* __setstate__: set the iteration index from `state`, clamped to
   [0, len(seq)].  An iterator that no longer holds its string is left
   untouched.  Returns None, or NULL if `state` cannot be converted to
   Py_ssize_t. */
static PyObject *
unicodeiter_setstate(PyObject *op, PyObject *state)
{
    unicodeiterobject *it = (unicodeiterobject *)op;

    Py_ssize_t index = PyLong_AsSsize_t(state);
    if (index == -1 && PyErr_Occurred()) {
        return NULL;
    }

    if (it->it_seq != NULL) {
        Py_ssize_t length = PyUnicode_GET_LENGTH(it->it_seq);
        if (index < 0) {
            index = 0;
        }
        else if (index > length) {
            index = length; /* iterator truncated */
        }
        it->it_index = index;
    }
    Py_RETURN_NONE;
}
14611
14612
PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14613
14614
static PyMethodDef unicodeiter_methods[] = {
14615
    {"__length_hint__", unicodeiter_len, METH_NOARGS, length_hint_doc},
14616
    {"__reduce__",      unicodeiter_reduce, METH_NOARGS, reduce_doc},
14617
    {"__setstate__",    unicodeiter_setstate, METH_O, setstate_doc},
14618
    {NULL,      NULL}       /* sentinel */
14619
};
14620
14621
PyTypeObject PyUnicodeIter_Type = {
14622
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
14623
    "str_iterator",         /* tp_name */
14624
    sizeof(unicodeiterobject),      /* tp_basicsize */
14625
    0,                  /* tp_itemsize */
14626
    /* methods */
14627
    unicodeiter_dealloc,/* tp_dealloc */
14628
    0,                  /* tp_vectorcall_offset */
14629
    0,                  /* tp_getattr */
14630
    0,                  /* tp_setattr */
14631
    0,                  /* tp_as_async */
14632
    0,                  /* tp_repr */
14633
    0,                  /* tp_as_number */
14634
    0,                  /* tp_as_sequence */
14635
    0,                  /* tp_as_mapping */
14636
    0,                  /* tp_hash */
14637
    0,                  /* tp_call */
14638
    0,                  /* tp_str */
14639
    PyObject_GenericGetAttr,        /* tp_getattro */
14640
    0,                  /* tp_setattro */
14641
    0,                  /* tp_as_buffer */
14642
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14643
    0,                  /* tp_doc */
14644
    unicodeiter_traverse, /* tp_traverse */
14645
    0,                  /* tp_clear */
14646
    0,                  /* tp_richcompare */
14647
    0,                  /* tp_weaklistoffset */
14648
    PyObject_SelfIter,          /* tp_iter */
14649
    unicodeiter_next,   /* tp_iternext */
14650
    unicodeiter_methods,            /* tp_methods */
14651
    0,
14652
};
14653
14654
/* Type object for str_ascii_iterator.  It uses the same object layout,
   dealloc, traverse and method table as str_iterator and differs only in
   its tp_iternext (unicode_ascii_iter_next). */
PyTypeObject _PyUnicodeASCIIIter_Type = {
14655
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
14656
    .tp_name = "str_ascii_iterator",
14657
    .tp_basicsize = sizeof(unicodeiterobject),
14658
    .tp_dealloc = unicodeiter_dealloc,
14659
    .tp_getattro = PyObject_GenericGetAttr,
14660
    .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,
14661
    .tp_traverse = unicodeiter_traverse,
14662
    .tp_iter = PyObject_SelfIter,
14663
    .tp_iternext = unicode_ascii_iter_next,
14664
    .tp_methods = unicodeiter_methods,
14665
};
14666
14667
/* Create a new iterator object over the str `seq`.

   Compact ASCII strings get a _PyUnicodeASCIIIter_Type iterator, every other
   string gets a PyUnicodeIter_Type iterator.  The iterator holds a new strong
   reference to `seq`, starts at index 0 and is GC-tracked before it is
   returned.

   Returns a new reference, or NULL with an exception set if `seq` is not a
   str (bad internal call) or the allocation failed. */
static PyObject *
unicode_iter(PyObject *seq)
{
    if (!PyUnicode_Check(seq)) {
        PyErr_BadInternalCall();
        return NULL;
    }

    /* Choose the concrete iterator type first so there is one allocation
       site. */
    PyTypeObject *iter_type;
    if (PyUnicode_IS_COMPACT_ASCII(seq)) {
        iter_type = &_PyUnicodeASCIIIter_Type;
    }
    else {
        iter_type = &PyUnicodeIter_Type;
    }

    unicodeiterobject *iter = PyObject_GC_New(unicodeiterobject, iter_type);
    if (iter == NULL) {
        return NULL;
    }

    iter->it_index = 0;
    iter->it_seq = Py_NewRef(seq);
    _PyObject_GC_TRACK(iter);
    return (PyObject *)iter;
}
14689
14690
static int
14691
encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
14692
144
{
14693
144
    int res;
14694
144
    res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
14695
144
    if (res == -2) {
14696
0
        PyErr_Format(PyExc_RuntimeError, "cannot encode %s", name);
14697
0
        return -1;
14698
0
    }
14699
144
    if (res < 0) {
14700
0
        PyErr_NoMemory();
14701
0
        return -1;
14702
0
    }
14703
144
    return 0;
14704
144
}
14705
14706
14707
static int
14708
config_get_codec_name(wchar_t **config_encoding)
14709
72
{
14710
72
    char *encoding;
14711
72
    if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
14712
0
        return -1;
14713
0
    }
14714
14715
72
    PyObject *name_obj = NULL;
14716
72
    PyObject *codec = _PyCodec_Lookup(encoding);
14717
72
    PyMem_RawFree(encoding);
14718
14719
72
    if (!codec)
14720
0
        goto error;
14721
14722
72
    name_obj = PyObject_GetAttrString(codec, "name");
14723
72
    Py_CLEAR(codec);
14724
72
    if (!name_obj) {
14725
0
        goto error;
14726
0
    }
14727
14728
72
    wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
14729
72
    Py_DECREF(name_obj);
14730
72
    if (wname == NULL) {
14731
0
        goto error;
14732
0
    }
14733
14734
72
    wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
14735
72
    if (raw_wname == NULL) {
14736
0
        PyMem_Free(wname);
14737
0
        PyErr_NoMemory();
14738
0
        goto error;
14739
0
    }
14740
14741
72
    PyMem_RawFree(*config_encoding);
14742
72
    *config_encoding = raw_wname;
14743
14744
72
    PyMem_Free(wname);
14745
72
    return 0;
14746
14747
0
error:
14748
0
    Py_XDECREF(codec);
14749
0
    Py_XDECREF(name_obj);
14750
0
    return -1;
14751
72
}
14752
14753
14754
static PyStatus
14755
init_stdio_encoding(PyInterpreterState *interp)
14756
36
{
14757
    /* Update the stdio encoding to the normalized Python codec name. */
14758
36
    PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
14759
36
    if (config_get_codec_name(&config->stdio_encoding) < 0) {
14760
0
        return _PyStatus_ERR("failed to get the Python codec name "
14761
0
                             "of the stdio encoding");
14762
0
    }
14763
36
    return _PyStatus_OK();
14764
36
}
14765
14766
14767
static int
14768
init_fs_codec(PyInterpreterState *interp)
14769
36
{
14770
36
    const PyConfig *config = _PyInterpreterState_GetConfig(interp);
14771
14772
36
    _Py_error_handler error_handler;
14773
36
    error_handler = get_error_handler_wide(config->filesystem_errors);
14774
36
    if (error_handler == _Py_ERROR_UNKNOWN) {
14775
0
        PyErr_SetString(PyExc_RuntimeError, "unknown filesystem error handler");
14776
0
        return -1;
14777
0
    }
14778
14779
36
    char *encoding, *errors;
14780
36
    if (encode_wstr_utf8(config->filesystem_encoding,
14781
36
                         &encoding,
14782
36
                         "filesystem_encoding") < 0) {
14783
0
        return -1;
14784
0
    }
14785
14786
36
    if (encode_wstr_utf8(config->filesystem_errors,
14787
36
                         &errors,
14788
36
                         "filesystem_errors") < 0) {
14789
0
        PyMem_RawFree(encoding);
14790
0
        return -1;
14791
0
    }
14792
14793
36
    struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
14794
36
    PyMem_RawFree(fs_codec->encoding);
14795
36
    fs_codec->encoding = encoding;
14796
    /* encoding has been normalized by init_fs_encoding() */
14797
36
    fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
14798
36
    PyMem_RawFree(fs_codec->errors);
14799
36
    fs_codec->errors = errors;
14800
36
    fs_codec->error_handler = error_handler;
14801
14802
#ifdef _Py_FORCE_UTF8_FS_ENCODING
14803
    assert(fs_codec->utf8 == 1);
14804
#endif
14805
14806
    /* At this point, PyUnicode_EncodeFSDefault() and
14807
       PyUnicode_DecodeFSDefault() can now use the Python codec rather than
14808
       the C implementation of the filesystem encoding. */
14809
14810
    /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
14811
       global configuration variables. */
14812
36
    if (_Py_IsMainInterpreter(interp)) {
14813
14814
36
        if (_Py_SetFileSystemEncoding(fs_codec->encoding,
14815
36
                                      fs_codec->errors) < 0) {
14816
0
            PyErr_NoMemory();
14817
0
            return -1;
14818
0
        }
14819
36
    }
14820
36
    return 0;
14821
36
}
14822
14823
14824
static PyStatus
14825
init_fs_encoding(PyThreadState *tstate)
14826
36
{
14827
36
    PyInterpreterState *interp = tstate->interp;
14828
14829
    /* Update the filesystem encoding to the normalized Python codec name.
14830
       For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
14831
       (Python codec name). */
14832
36
    PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
14833
36
    if (config_get_codec_name(&config->filesystem_encoding) < 0) {
14834
0
        _Py_DumpPathConfig(tstate);
14835
0
        return _PyStatus_ERR("failed to get the Python codec "
14836
0
                             "of the filesystem encoding");
14837
0
    }
14838
14839
36
    if (init_fs_codec(interp) < 0) {
14840
0
        return _PyStatus_ERR("cannot initialize filesystem codec");
14841
0
    }
14842
36
    return _PyStatus_OK();
14843
36
}
14844
14845
14846
PyStatus
14847
_PyUnicode_InitEncodings(PyThreadState *tstate)
14848
36
{
14849
36
    PyStatus status = _PyCodec_InitRegistry(tstate->interp);
14850
36
    if (_PyStatus_EXCEPTION(status)) {
14851
0
        return status;
14852
0
    }
14853
36
    status = init_fs_encoding(tstate);
14854
36
    if (_PyStatus_EXCEPTION(status)) {
14855
0
        return status;
14856
0
    }
14857
14858
36
    return init_stdio_encoding(tstate->interp);
14859
36
}
14860
14861
14862
static void
14863
_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
14864
0
{
14865
0
    PyMem_RawFree(fs_codec->encoding);
14866
0
    fs_codec->encoding = NULL;
14867
0
    fs_codec->utf8 = 0;
14868
0
    PyMem_RawFree(fs_codec->errors);
14869
0
    fs_codec->errors = NULL;
14870
0
    fs_codec->error_handler = _Py_ERROR_UNKNOWN;
14871
0
}
14872
14873
14874
#ifdef MS_WINDOWS
/* Switch the current interpreter's filesystem encoding to "mbcs" with the
   "replace" error handler (PEP 529) and re-initialize the filesystem codec.

   Returns the result of init_fs_codec(), or -1 with MemoryError set if the
   new configuration strings cannot be allocated (the configuration is left
   unchanged in that case). */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    PyConfig *config = (PyConfig *)_PyInterpreterState_GetConfig(interp);

    /* Allocate both replacement strings up front so the configuration is
       only modified once both allocations have succeeded. */
    wchar_t *new_encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *new_errors = _PyMem_RawWcsdup(L"replace");
    if (new_encoding == NULL || new_errors == NULL) {
        PyMem_RawFree(new_encoding);
        PyMem_RawFree(new_errors);
        PyErr_NoMemory();
        return -1;
    }

    PyMem_RawFree(config->filesystem_encoding);
    PyMem_RawFree(config->filesystem_errors);
    config->filesystem_encoding = new_encoding;
    config->filesystem_errors = new_errors;

    return init_fs_codec(interp);
}
#endif
14899
14900
14901
#ifdef Py_DEBUG
/* Debug-build helper: returns non-zero when the main interpreter no longer
   has an interned-strings dict, 0 otherwise. */
static inline int
unicode_is_finalizing(void)
{
    PyInterpreterState *main_interp = _PyInterpreterState_Main();
    return get_interned_dict(main_interp) == NULL;
}
#endif
14908
14909
14910
void
14911
_PyUnicode_FiniTypes(PyInterpreterState *interp)
14912
0
{
14913
0
    _PyStaticType_FiniBuiltin(interp, &EncodingMapType);
14914
0
    _PyStaticType_FiniBuiltin(interp, &PyFieldNameIter_Type);
14915
0
    _PyStaticType_FiniBuiltin(interp, &PyFormatterIter_Type);
14916
0
}
14917
14918
14919
void
14920
_PyUnicode_Fini(PyInterpreterState *interp)
14921
0
{
14922
0
    struct _Py_unicode_state *state = &interp->unicode;
14923
14924
0
    if (!has_shared_intern_dict(interp)) {
14925
        // _PyUnicode_ClearInterned() must be called before _PyUnicode_Fini()
14926
0
        assert(get_interned_dict(interp) == NULL);
14927
0
    }
14928
14929
0
    _PyUnicode_FiniEncodings(&state->fs_codec);
14930
14931
    // bpo-47182: force a unicodedata CAPI capsule re-import on
14932
    // subsequent initialization of interpreter.
14933
0
    interp->unicode.ucnhash_capi = NULL;
14934
14935
0
    unicode_clear_identifiers(state);
14936
0
}
14937
14938
/* A _string module, to export formatter_parser and formatter_field_name_split
14939
   to the string.Formatter class implemented in Python. */
14940
14941
static PyMethodDef _string_methods[] = {
14942
    {"formatter_field_name_split", formatter_field_name_split,
14943
     METH_O, PyDoc_STR("split the argument as a field name")},
14944
    {"formatter_parser", formatter_parser,
14945
     METH_O, PyDoc_STR("parse the argument as a format string")},
14946
    {NULL, NULL}
14947
};
14948
14949
static PyModuleDef_Slot module_slots[] = {
14950
    _Py_ABI_SLOT,
14951
    {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
14952
    {Py_mod_gil, Py_MOD_GIL_NOT_USED},
14953
    {0, NULL}
14954
};
14955
14956
static struct PyModuleDef _string_module = {
14957
    PyModuleDef_HEAD_INIT,
14958
    .m_name = "_string",
14959
    .m_doc = PyDoc_STR("string helper module"),
14960
    .m_size = 0,
14961
    .m_methods = _string_methods,
14962
    .m_slots = module_slots,
14963
};
14964
14965
PyMODINIT_FUNC
14966
PyInit__string(void)
14967
8
{
14968
8
    return PyModuleDef_Init(&_string_module);
14969
8
}
14970
14971
14972
#undef PyUnicode_KIND
14973
int PyUnicode_KIND(PyObject *op)
14974
0
{
14975
0
    if (!PyUnicode_Check(op)) {
14976
0
        PyErr_Format(PyExc_TypeError, "expect str, got %T", op);
14977
0
        return -1;
14978
0
    }
14979
0
    return _PyASCIIObject_CAST(op)->state.kind;
14980
0
}
14981
14982
#undef PyUnicode_DATA
14983
void* PyUnicode_DATA(PyObject *op)
14984
0
{
14985
0
    if (!PyUnicode_Check(op)) {
14986
0
        PyErr_Format(PyExc_TypeError, "expect str, got %T", op);
14987
0
        return NULL;
14988
0
    }
14989
0
    return _PyUnicode_DATA(op);
14990
0
}