Coverage Report

Created: 2025-11-11 06:44

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/cpython/Objects/unicodeobject.c
Line
Count
Source
1
/*
2
3
Unicode implementation based on original code by Fredrik Lundh,
4
modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6
Major speed upgrades to the method implementations at the Reykjavik
7
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9
Copyright (c) Corporation for National Research Initiatives.
10
11
--------------------------------------------------------------------
12
The original string type implementation is:
13
14
  Copyright (c) 1999 by Secret Labs AB
15
  Copyright (c) 1999 by Fredrik Lundh
16
17
By obtaining, using, and/or copying this software and/or its
18
associated documentation, you agree that you have read, understood,
19
and will comply with the following terms and conditions:
20
21
Permission to use, copy, modify, and distribute this software and its
22
associated documentation for any purpose and without fee is hereby
23
granted, provided that the above copyright notice appears in all
24
copies, and that both that copyright notice and this permission notice
25
appear in supporting documentation, and that the name of Secret Labs
26
AB or the author not be used in advertising or publicity pertaining to
27
distribution of the software without specific, written prior
28
permission.
29
30
SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32
FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37
--------------------------------------------------------------------
38
39
*/
40
41
#include "Python.h"
42
#include "pycore_abstract.h"      // _PyIndex_Check()
43
#include "pycore_bytes_methods.h" // _Py_bytes_lower()
44
#include "pycore_bytesobject.h"   // _PyBytes_Repeat()
45
#include "pycore_ceval.h"         // _PyEval_GetBuiltin()
46
#include "pycore_codecs.h"        // _PyCodec_Lookup()
47
#include "pycore_critical_section.h" // Py_*_CRITICAL_SECTION_SEQUENCE_FAST
48
#include "pycore_format.h"        // F_LJUST
49
#include "pycore_initconfig.h"    // _PyStatus_OK()
50
#include "pycore_interp.h"        // PyInterpreterState.fs_codec
51
#include "pycore_long.h"          // _PyLong_FormatWriter()
52
#include "pycore_object.h"        // _PyObject_GC_TRACK(), _Py_FatalRefcountError()
53
#include "pycore_pathconfig.h"    // _Py_DumpPathConfig()
54
#include "pycore_pyerrors.h"      // _PyUnicodeTranslateError_Create()
55
#include "pycore_pyhash.h"        // _Py_HashSecret_t
56
#include "pycore_pylifecycle.h"   // _Py_SetFileSystemEncoding()
57
#include "pycore_pystate.h"       // _PyInterpreterState_GET()
58
#include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
59
#include "pycore_unicodectype.h"  // _PyUnicode_IsXidStart
60
#include "pycore_unicodeobject.h" // struct _Py_unicode_state
61
#include "pycore_unicodeobject_generated.h"  // _PyUnicode_InitStaticStrings()
62
63
#include "stringlib/eq.h"         // unicode_eq()
64
#include <stddef.h>               // ptrdiff_t
65
66
#ifdef MS_WINDOWS
67
#include <windows.h>
68
#endif
69
70
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
71
#  include "pycore_fileutils.h"   // _Py_LocaleUsesNonUnicodeWchar()
72
#endif
73
74
/* Uncomment to display statistics on interned strings at exit
75
   in _PyUnicode_ClearInterned(). */
76
/* #define INTERNED_STATS 1 */
77
78
79
/*[clinic input]
80
class str "PyObject *" "&PyUnicode_Type"
81
[clinic start generated code]*/
82
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
83
84
/*[python input]
85
class Py_UCS4_converter(CConverter):
86
    type = 'Py_UCS4'
87
    converter = 'convert_uc'
88
89
    def converter_init(self):
90
        if self.default is not unspecified:
91
            self.c_default = ascii(self.default)
92
            if len(self.c_default) > 4 or self.c_default[0] != "'":
93
                self.c_default = hex(ord(self.default))
94
95
[python start generated code]*/
96
/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
97
98
/* --- Globals ------------------------------------------------------------
99
100
NOTE: In the interpreter's initialization phase, some globals are currently
101
      initialized dynamically as needed. In the process Unicode objects may
102
      be created before the Unicode type is ready.
103
104
*/
105
106
10.5M
#define MAX_UNICODE _Py_MAX_UNICODE
107
142M
#define ensure_unicode _PyUnicode_EnsureUnicode
108
109
#ifdef Py_DEBUG
110
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
111
#else
112
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
113
#endif
114
115
/* Read the `utf8` pointer stored in the compact-unicode header of `op`
   (loaded through FT_ATOMIC_LOAD_PTR_ACQUIRE). */
static inline char *
_PyUnicode_UTF8(PyObject *op)
{
    PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
    return FT_ATOMIC_LOAD_PTR_ACQUIRE(compact->utf8);
}
119
120
/* Return the UTF-8 representation pointer of `op`.
   For a compact ASCII string this is the memory located right after the
   PyASCIIObject header; otherwise it is whatever _PyUnicode_UTF8() reads
   from the compact-unicode header. */
static inline char *
PyUnicode_UTF8(PyObject *op)
{
    assert(_PyUnicode_CHECK(op));
    if (!PyUnicode_IS_COMPACT_ASCII(op)) {
        return _PyUnicode_UTF8(op);
    }
    return (char *)(_PyASCIIObject_CAST(op) + 1);
}
130
131
/* Store `utf8` into the `utf8` field of the compact-unicode header of `op`
   (stored through FT_ATOMIC_STORE_PTR_RELEASE). */
static inline void
PyUnicode_SET_UTF8(PyObject *op, char *utf8)
{
    PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
    FT_ATOMIC_STORE_PTR_RELEASE(compact->utf8, utf8);
}
135
136
/* Return the length associated with the UTF-8 representation of `op`:
   the `length` field for a compact ASCII string, the `utf8_length` field
   of the compact-unicode header otherwise. */
static inline Py_ssize_t
PyUnicode_UTF8_LENGTH(PyObject *op)
{
    assert(_PyUnicode_CHECK(op));
    return PyUnicode_IS_COMPACT_ASCII(op)
        ? _PyASCIIObject_CAST(op)->length
        : _PyCompactUnicodeObject_CAST(op)->utf8_length;
}
146
147
/* Record `length` in the `utf8_length` field of the compact-unicode header
   of `op`. */
static inline void
PyUnicode_SET_UTF8_LENGTH(PyObject *op, Py_ssize_t length)
{
    PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
    compact->utf8_length = length;
}
151
152
#define _PyUnicode_LENGTH(op)                           \
153
527M
    (_PyASCIIObject_CAST(op)->length)
154
#define _PyUnicode_STATE(op)                            \
155
3.33G
    (_PyASCIIObject_CAST(op)->state)
156
#define _PyUnicode_HASH(op)                             \
157
485M
    (_PyASCIIObject_CAST(op)->hash)
158
159
99.9M
#define PyUnicode_HASH PyUnstable_Unicode_GET_CACHED_HASH
160
161
/* Store `hash` in the `hash` field of the PyASCIIObject header of `op`
   (stored through FT_ATOMIC_STORE_SSIZE_RELAXED). */
static inline void
PyUnicode_SET_HASH(PyObject *op, Py_hash_t hash)
{
    PyASCIIObject *header = _PyASCIIObject_CAST(op);
    FT_ATOMIC_STORE_SSIZE_RELAXED(header->hash, hash);
}
165
166
#define _PyUnicode_DATA_ANY(op)                         \
167
42.5M
    (_PyUnicodeObject_CAST(op)->data.any)
168
169
/* Return non-zero when the UTF-8 pointer of `op` is the very same address
   as its character data, i.e. both representations share one buffer.
   Must not be called on compact ASCII strings. */
static inline int
_PyUnicode_SHARE_UTF8(PyObject *op)
{
    assert(_PyUnicode_CHECK(op));
    assert(!PyUnicode_IS_COMPACT_ASCII(op));

    const void *utf8 = _PyUnicode_UTF8(op);
    return utf8 == PyUnicode_DATA(op);
}
175
176
/* true if the Unicode object has an allocated UTF-8 memory block
177
   (not shared with other data) */
178
static inline int _PyUnicode_HAS_UTF8_MEMORY(PyObject *op)
179
527M
{
180
527M
    return (!PyUnicode_IS_COMPACT_ASCII(op)
181
180M
            && _PyUnicode_UTF8(op) != NULL
182
9.52M
            && _PyUnicode_UTF8(op) != PyUnicode_DATA(op));
183
527M
}
184
185
186
245M
#define LATIN1 _Py_LATIN1_CHR
187
188
/* Forward declaration */
189
static PyObject *
190
unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
191
                    const char *errors);
192
static PyObject *
193
unicode_decode_utf8(const char *s, Py_ssize_t size,
194
                    _Py_error_handler error_handler, const char *errors,
195
                    Py_ssize_t *consumed);
196
#ifdef Py_DEBUG
197
static inline int unicode_is_finalizing(void);
198
static int unicode_is_singleton(PyObject *unicode);
199
#endif
200
201
202
// Return a reference to the immortal empty string singleton.
203
PyObject*
204
_PyUnicode_GetEmpty(void)
205
111M
{
206
111M
    _Py_DECLARE_STR(empty, "");
207
111M
    return &_Py_STR(empty);
208
111M
}
209
210
/* This dictionary holds per-interpreter interned strings.
211
 * See InternalDocs/string_interning.md for details.
212
 */
213
static inline PyObject *get_interned_dict(PyInterpreterState *interp)
214
3.80M
{
215
3.80M
    return _Py_INTERP_CACHED_OBJECT(interp, interned_strings);
216
3.80M
}
217
218
/* This hashtable holds statically allocated interned strings.
219
 * See InternalDocs/string_interning.md for details.
220
 */
221
3.54M
#define INTERNED_STRINGS _PyRuntime.cached_objects.interned_strings
222
223
/* Get number of all interned strings for the current interpreter. */
224
Py_ssize_t
225
_PyUnicode_InternedSize(void)
226
0
{
227
0
    PyObject *dict = get_interned_dict(_PyInterpreterState_GET());
228
0
    return _Py_hashtable_len(INTERNED_STRINGS) + PyDict_GET_SIZE(dict);
229
0
}
230
231
/* Get number of immortal interned strings for the current interpreter. */
232
Py_ssize_t
233
_PyUnicode_InternedSize_Immortal(void)
234
0
{
235
0
    PyObject *dict = get_interned_dict(_PyInterpreterState_GET());
236
0
    PyObject *key, *value;
237
0
    Py_ssize_t pos = 0;
238
0
    Py_ssize_t count = 0;
239
240
    // It's tempting to keep a count and avoid a loop here. But, this function
241
    // is intended for refleak tests. It spends extra work to report the true
242
    // value, to help detect bugs in optimizations.
243
244
0
    while (PyDict_Next(dict, &pos, &key, &value)) {
245
0
        assert(PyUnicode_CHECK_INTERNED(key) != SSTATE_INTERNED_IMMORTAL_STATIC);
246
0
        if (PyUnicode_CHECK_INTERNED(key) == SSTATE_INTERNED_IMMORTAL) {
247
0
           count++;
248
0
       }
249
0
    }
250
0
    return _Py_hashtable_len(INTERNED_STRINGS) + count;
251
0
}
252
253
static Py_hash_t unicode_hash(PyObject *);
254
255
static Py_uhash_t
256
hashtable_unicode_hash(const void *key)
257
3.54M
{
258
3.54M
    return unicode_hash((PyObject *)key);
259
3.54M
}
260
261
static int
262
hashtable_unicode_compare(const void *key1, const void *key2)
263
347k
{
264
347k
    PyObject *obj1 = (PyObject *)key1;
265
347k
    PyObject *obj2 = (PyObject *)key2;
266
347k
    if (obj1 != NULL && obj2 != NULL) {
267
347k
        return unicode_eq(obj1, obj2);
268
347k
    }
269
0
    else {
270
0
        return obj1 == obj2;
271
0
    }
272
347k
}
273
274
/* Return true if this interpreter should share the main interpreter's
275
   intern_dict.  That's important for interpreters which load basic
276
   single-phase init extension modules (m_size == -1).  There could be interned
277
   immortal strings that are shared between interpreters, due to the
278
   PyDict_Update(mdict, m_copy) call in import_find_extension().
279
280
   It's not safe to deallocate those strings until all interpreters that
281
   potentially use them are freed.  By storing them in the main interpreter, we
282
   ensure they get freed after all other interpreters are freed.
283
*/
284
static bool
285
has_shared_intern_dict(PyInterpreterState *interp)
286
22
{
287
22
    PyInterpreterState *main_interp = _PyInterpreterState_Main();
288
22
    return interp != main_interp  && interp->feature_flags & Py_RTFLAGS_USE_MAIN_OBMALLOC;
289
22
}
290
291
static int
292
init_interned_dict(PyInterpreterState *interp)
293
22
{
294
22
    assert(get_interned_dict(interp) == NULL);
295
22
    PyObject *interned;
296
22
    if (has_shared_intern_dict(interp)) {
297
0
        interned = get_interned_dict(_PyInterpreterState_Main());
298
0
        Py_INCREF(interned);
299
0
    }
300
22
    else {
301
22
        interned = PyDict_New();
302
22
        if (interned == NULL) {
303
0
            return -1;
304
0
        }
305
22
    }
306
22
    _Py_INTERP_CACHED_OBJECT(interp, interned_strings) = interned;
307
22
    return 0;
308
22
}
309
310
static void
311
clear_interned_dict(PyInterpreterState *interp)
312
0
{
313
0
    PyObject *interned = get_interned_dict(interp);
314
0
    if (interned != NULL) {
315
0
        if (!has_shared_intern_dict(interp)) {
316
            // only clear if the dict belongs to this interpreter
317
0
            PyDict_Clear(interned);
318
0
        }
319
0
        Py_DECREF(interned);
320
0
        _Py_INTERP_CACHED_OBJECT(interp, interned_strings) = NULL;
321
0
    }
322
0
}
323
324
static PyStatus
325
init_global_interned_strings(PyInterpreterState *interp)
326
22
{
327
22
    assert(INTERNED_STRINGS == NULL);
328
22
    _Py_hashtable_allocator_t hashtable_alloc = {PyMem_RawMalloc, PyMem_RawFree};
329
330
22
    INTERNED_STRINGS = _Py_hashtable_new_full(
331
22
        hashtable_unicode_hash,
332
22
        hashtable_unicode_compare,
333
        // Objects stored here are immortal and statically allocated,
334
        // so we don't need key_destroy_func & value_destroy_func:
335
22
        NULL,
336
22
        NULL,
337
22
        &hashtable_alloc
338
22
    );
339
22
    if (INTERNED_STRINGS == NULL) {
340
0
        PyErr_Clear();
341
0
        return _PyStatus_ERR("failed to create global interned dict");
342
0
    }
343
344
    /* Intern statically allocated string identifiers, deepfreeze strings,
345
        * and one-byte latin-1 strings.
346
        * This must be done before any module initialization so that statically
347
        * allocated string identifiers are used instead of heap allocated strings.
348
        * Deepfreeze uses the interned identifiers if present to save space
349
        * else generates them and they are interned to speed up dict lookups.
350
    */
351
22
    _PyUnicode_InitStaticStrings(interp);
352
353
5.65k
    for (int i = 0; i < 256; i++) {
354
5.63k
        PyObject *s = LATIN1(i);
355
5.63k
        _PyUnicode_InternStatic(interp, &s);
356
5.63k
        assert(s == LATIN1(i));
357
5.63k
    }
358
#ifdef Py_DEBUG
359
    assert(_PyUnicode_CheckConsistency(&_Py_STR(empty), 1));
360
361
    for (int i = 0; i < 256; i++) {
362
        assert(_PyUnicode_CheckConsistency(LATIN1(i), 1));
363
    }
364
#endif
365
22
    return _PyStatus_OK();
366
22
}
367
368
static void clear_global_interned_strings(void)
369
0
{
370
0
    if (INTERNED_STRINGS != NULL) {
371
0
        _Py_hashtable_destroy(INTERNED_STRINGS);
372
0
        INTERNED_STRINGS = NULL;
373
0
    }
374
0
}
375
376
#define _Py_RETURN_UNICODE_EMPTY()   \
377
45.4M
    do {                             \
378
45.4M
        return _PyUnicode_GetEmpty();\
379
45.4M
    } while (0)
380
381
382
/* Fast detection of the most frequent whitespace characters.
   The table has one entry per ASCII code point (0..127); an entry is 1 when
   the code point is whitespace.  Entries not listed below are 0. */
const unsigned char _Py_ascii_whitespace[128] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1,     /* SPACE */
};
412
413
/* forward */
414
static PyObject* get_latin1_char(unsigned char ch);
415
416
417
static PyObject *
418
_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
419
static PyObject *
420
_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
421
static PyObject *
422
_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
423
424
static PyObject *
425
unicode_encode_call_errorhandler(const char *errors,
426
       PyObject **errorHandler,const char *encoding, const char *reason,
427
       PyObject *unicode, PyObject **exceptionObject,
428
       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
429
430
static void
431
raise_encode_exception(PyObject **exceptionObject,
432
                       const char *encoding,
433
                       PyObject *unicode,
434
                       Py_ssize_t startpos, Py_ssize_t endpos,
435
                       const char *reason);
436
437
/* Fast detection of ASCII line break characters.
   The table has one entry per ASCII code point (0..127); an entry is 1 when
   the code point is a line break.  Entries not listed below are 0. */
static const unsigned char ascii_linebreak[128] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
};
464
465
static int convert_uc(PyObject *obj, void *addr);
466
467
struct encoding_map;
468
#include "clinic/unicodeobject.c.h"
469
470
_Py_error_handler
471
_Py_GetErrorHandler(const char *errors)
472
634k
{
473
634k
    if (errors == NULL || strcmp(errors, "strict") == 0) {
474
230k
        return _Py_ERROR_STRICT;
475
230k
    }
476
403k
    if (strcmp(errors, "surrogateescape") == 0) {
477
187k
        return _Py_ERROR_SURROGATEESCAPE;
478
187k
    }
479
215k
    if (strcmp(errors, "replace") == 0) {
480
215k
        return _Py_ERROR_REPLACE;
481
215k
    }
482
0
    if (strcmp(errors, "ignore") == 0) {
483
0
        return _Py_ERROR_IGNORE;
484
0
    }
485
0
    if (strcmp(errors, "backslashreplace") == 0) {
486
0
        return _Py_ERROR_BACKSLASHREPLACE;
487
0
    }
488
0
    if (strcmp(errors, "surrogatepass") == 0) {
489
0
        return _Py_ERROR_SURROGATEPASS;
490
0
    }
491
0
    if (strcmp(errors, "xmlcharrefreplace") == 0) {
492
0
        return _Py_ERROR_XMLCHARREFREPLACE;
493
0
    }
494
0
    return _Py_ERROR_OTHER;
495
0
}
496
497
498
static _Py_error_handler
499
get_error_handler_wide(const wchar_t *errors)
500
7.63k
{
501
7.63k
    if (errors == NULL || wcscmp(errors, L"strict") == 0) {
502
0
        return _Py_ERROR_STRICT;
503
0
    }
504
7.63k
    if (wcscmp(errors, L"surrogateescape") == 0) {
505
7.63k
        return _Py_ERROR_SURROGATEESCAPE;
506
7.63k
    }
507
0
    if (wcscmp(errors, L"replace") == 0) {
508
0
        return _Py_ERROR_REPLACE;
509
0
    }
510
0
    if (wcscmp(errors, L"ignore") == 0) {
511
0
        return _Py_ERROR_IGNORE;
512
0
    }
513
0
    if (wcscmp(errors, L"backslashreplace") == 0) {
514
0
        return _Py_ERROR_BACKSLASHREPLACE;
515
0
    }
516
0
    if (wcscmp(errors, L"surrogatepass") == 0) {
517
0
        return _Py_ERROR_SURROGATEPASS;
518
0
    }
519
0
    if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
520
0
        return _Py_ERROR_XMLCHARREFREPLACE;
521
0
    }
522
0
    return _Py_ERROR_OTHER;
523
0
}
524
525
526
static inline int
527
unicode_check_encoding_errors(const char *encoding, const char *errors)
528
24.7M
{
529
24.7M
    if (encoding == NULL && errors == NULL) {
530
13.5M
        return 0;
531
13.5M
    }
532
533
11.1M
    PyInterpreterState *interp = _PyInterpreterState_GET();
534
11.1M
#ifndef Py_DEBUG
535
    /* In release mode, only check in development mode (-X dev) */
536
11.1M
    if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
537
11.1M
        return 0;
538
11.1M
    }
539
#else
540
    /* Always check in debug mode */
541
#endif
542
543
    /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
544
       codec registry is ready: before_PyUnicode_InitEncodings() is called. */
545
0
    if (!interp->unicode.fs_codec.encoding) {
546
0
        return 0;
547
0
    }
548
549
    /* Disable checks during Python finalization. For example, it allows to
550
       call _PyObject_Dump() during finalization for debugging purpose. */
551
0
    if (_PyInterpreterState_GetFinalizing(interp) != NULL) {
552
0
        return 0;
553
0
    }
554
555
0
    if (encoding != NULL
556
        // Fast path for the most common built-in encodings. Even if the codec
557
        // is cached, _PyCodec_Lookup() decodes the bytes string from UTF-8 to
558
        // create a temporary Unicode string (the key in the cache).
559
0
        && strcmp(encoding, "utf-8") != 0
560
0
        && strcmp(encoding, "utf8") != 0
561
0
        && strcmp(encoding, "ascii") != 0)
562
0
    {
563
0
        PyObject *handler = _PyCodec_Lookup(encoding);
564
0
        if (handler == NULL) {
565
0
            return -1;
566
0
        }
567
0
        Py_DECREF(handler);
568
0
    }
569
570
0
    if (errors != NULL
571
        // Fast path for the most common built-in error handlers.
572
0
        && strcmp(errors, "strict") != 0
573
0
        && strcmp(errors, "ignore") != 0
574
0
        && strcmp(errors, "replace") != 0
575
0
        && strcmp(errors, "surrogateescape") != 0
576
0
        && strcmp(errors, "surrogatepass") != 0)
577
0
    {
578
0
        PyObject *handler = PyCodec_LookupError(errors);
579
0
        if (handler == NULL) {
580
0
            return -1;
581
0
        }
582
0
        Py_DECREF(handler);
583
0
    }
584
0
    return 0;
585
0
}
586
587
588
int
589
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
590
0
{
591
0
#define CHECK(expr) \
592
0
    do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
593
594
0
    assert(op != NULL);
595
0
    CHECK(PyUnicode_Check(op));
596
597
0
    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
598
0
    int kind = ascii->state.kind;
599
600
0
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
601
0
        CHECK(kind == PyUnicode_1BYTE_KIND);
602
0
    }
603
0
    else {
604
0
        PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
605
0
        void *data;
606
607
0
        if (ascii->state.compact == 1) {
608
0
            data = compact + 1;
609
0
            CHECK(kind == PyUnicode_1BYTE_KIND
610
0
                                 || kind == PyUnicode_2BYTE_KIND
611
0
                                 || kind == PyUnicode_4BYTE_KIND);
612
0
            CHECK(ascii->state.ascii == 0);
613
0
            CHECK(_PyUnicode_UTF8(op) != data);
614
0
        }
615
0
        else {
616
0
            PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
617
618
0
            data = unicode->data.any;
619
0
            CHECK(kind == PyUnicode_1BYTE_KIND
620
0
                     || kind == PyUnicode_2BYTE_KIND
621
0
                     || kind == PyUnicode_4BYTE_KIND);
622
0
            CHECK(ascii->state.compact == 0);
623
0
            CHECK(data != NULL);
624
0
            if (ascii->state.ascii) {
625
0
                CHECK(_PyUnicode_UTF8(op) == data);
626
0
                CHECK(compact->utf8_length == ascii->length);
627
0
            }
628
0
            else {
629
0
                CHECK(_PyUnicode_UTF8(op) != data);
630
0
            }
631
0
        }
632
0
#ifndef Py_GIL_DISABLED
633
0
        if (_PyUnicode_UTF8(op) == NULL)
634
0
            CHECK(compact->utf8_length == 0);
635
0
#endif
636
0
    }
637
638
    /* check that the best kind is used: O(n) operation */
639
0
    if (check_content) {
640
0
        Py_ssize_t i;
641
0
        Py_UCS4 maxchar = 0;
642
0
        const void *data;
643
0
        Py_UCS4 ch;
644
645
0
        data = PyUnicode_DATA(ascii);
646
0
        for (i=0; i < ascii->length; i++)
647
0
        {
648
0
            ch = PyUnicode_READ(kind, data, i);
649
0
            if (ch > maxchar)
650
0
                maxchar = ch;
651
0
        }
652
0
        if (kind == PyUnicode_1BYTE_KIND) {
653
0
            if (ascii->state.ascii == 0) {
654
0
                CHECK(maxchar >= 128);
655
0
                CHECK(maxchar <= 255);
656
0
            }
657
0
            else
658
0
                CHECK(maxchar < 128);
659
0
        }
660
0
        else if (kind == PyUnicode_2BYTE_KIND) {
661
0
            CHECK(maxchar >= 0x100);
662
0
            CHECK(maxchar <= 0xFFFF);
663
0
        }
664
0
        else {
665
0
            CHECK(maxchar >= 0x10000);
666
0
            CHECK(maxchar <= MAX_UNICODE);
667
0
        }
668
0
        CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
669
0
    }
670
671
    /* Check interning state */
672
#ifdef Py_DEBUG
673
    // Note that we do not check `_Py_IsImmortal(op)`, since stable ABI
674
    // extensions can make immortal strings mortal (but with a high enough
675
    // refcount).
676
    // The other way is extremely unlikely (worth a potential failed assertion
677
    // in a debug build), so we do check `!_Py_IsImmortal(op)`.
678
    switch (PyUnicode_CHECK_INTERNED(op)) {
679
        case SSTATE_NOT_INTERNED:
680
            if (ascii->state.statically_allocated) {
681
                // This state is for two exceptions:
682
                // - strings are currently checked before they're interned
683
                // - the 256 one-latin1-character strings
684
                //   are static but use SSTATE_NOT_INTERNED
685
            }
686
            else {
687
                CHECK(!_Py_IsImmortal(op));
688
            }
689
            break;
690
        case SSTATE_INTERNED_MORTAL:
691
            CHECK(!ascii->state.statically_allocated);
692
            CHECK(!_Py_IsImmortal(op));
693
            break;
694
        case SSTATE_INTERNED_IMMORTAL:
695
            CHECK(!ascii->state.statically_allocated);
696
            break;
697
        case SSTATE_INTERNED_IMMORTAL_STATIC:
698
            CHECK(ascii->state.statically_allocated);
699
            break;
700
        default:
701
            Py_UNREACHABLE();
702
    }
703
#endif
704
705
0
    return 1;
706
707
0
#undef CHECK
708
0
}
709
710
/* Normalize a str result: an empty string is replaced by the empty string
   singleton and a one-character 1-byte-kind string by the matching LATIN1()
   singleton.  When a singleton is substituted for a different object, the
   reference to `unicode` is released.  Any other string is returned
   unchanged. */
PyObject*
_PyUnicode_Result(PyObject *unicode)
{
    assert(_PyUnicode_CHECK(unicode));

    PyObject *singleton = NULL;
    switch (PyUnicode_GET_LENGTH(unicode)) {
    case 0:
        singleton = _PyUnicode_GetEmpty();
        break;
    case 1:
        if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
            const Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
            Py_UCS1 ch = data[0];
            singleton = LATIN1(ch);
        }
        break;
    default:
        break;
    }

    if (singleton == NULL) {
        /* no singleton applies: hand the string back as is */
        assert(_PyUnicode_CheckConsistency(unicode, 1));
        return unicode;
    }
    if (unicode != singleton) {
        Py_DECREF(unicode);
    }
    return singleton;
}
740
499k
#define unicode_result _PyUnicode_Result
741
742
/* Return `unicode` as a result whose value is unchanged: a new reference
   to the object itself when it is an exact str, otherwise a copy made with
   _PyUnicode_Copy(). */
static PyObject*
unicode_result_unchanged(PyObject *unicode)
{
    if (!PyUnicode_CheckExact(unicode)) {
        /* Subtype -- return genuine unicode string with the same value. */
        return _PyUnicode_Copy(unicode);
    }
    return Py_NewRef(unicode);
}
752
753
/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
754
   ASCII, Latin1, UTF-8, etc. */
755
static char*
756
backslashreplace(PyBytesWriter *writer, char *str,
757
                 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
758
0
{
759
0
    Py_ssize_t size, i;
760
0
    Py_UCS4 ch;
761
0
    int kind;
762
0
    const void *data;
763
764
0
    kind = PyUnicode_KIND(unicode);
765
0
    data = PyUnicode_DATA(unicode);
766
767
0
    size = 0;
768
    /* determine replacement size */
769
0
    for (i = collstart; i < collend; ++i) {
770
0
        Py_ssize_t incr;
771
772
0
        ch = PyUnicode_READ(kind, data, i);
773
0
        if (ch < 0x100)
774
0
            incr = 2+2;
775
0
        else if (ch < 0x10000)
776
0
            incr = 2+4;
777
0
        else {
778
0
            assert(ch <= MAX_UNICODE);
779
0
            incr = 2+8;
780
0
        }
781
0
        if (size > PY_SSIZE_T_MAX - incr) {
782
0
            PyErr_SetString(PyExc_OverflowError,
783
0
                            "encoded result is too long for a Python string");
784
0
            return NULL;
785
0
        }
786
0
        size += incr;
787
0
    }
788
789
0
    str = PyBytesWriter_GrowAndUpdatePointer(writer, size, str);
790
0
    if (str == NULL) {
791
0
        return NULL;
792
0
    }
793
794
    /* generate replacement */
795
0
    for (i = collstart; i < collend; ++i) {
796
0
        ch = PyUnicode_READ(kind, data, i);
797
0
        *str++ = '\\';
798
0
        if (ch >= 0x00010000) {
799
0
            *str++ = 'U';
800
0
            *str++ = Py_hexdigits[(ch>>28)&0xf];
801
0
            *str++ = Py_hexdigits[(ch>>24)&0xf];
802
0
            *str++ = Py_hexdigits[(ch>>20)&0xf];
803
0
            *str++ = Py_hexdigits[(ch>>16)&0xf];
804
0
            *str++ = Py_hexdigits[(ch>>12)&0xf];
805
0
            *str++ = Py_hexdigits[(ch>>8)&0xf];
806
0
        }
807
0
        else if (ch >= 0x100) {
808
0
            *str++ = 'u';
809
0
            *str++ = Py_hexdigits[(ch>>12)&0xf];
810
0
            *str++ = Py_hexdigits[(ch>>8)&0xf];
811
0
        }
812
0
        else
813
0
            *str++ = 'x';
814
0
        *str++ = Py_hexdigits[(ch>>4)&0xf];
815
0
        *str++ = Py_hexdigits[ch&0xf];
816
0
    }
817
0
    return str;
818
0
}
819
820
/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
821
   ASCII, Latin1, UTF-8, etc. */
822
static char*
823
xmlcharrefreplace(PyBytesWriter *writer, char *str,
824
                  PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
825
0
{
826
0
    Py_ssize_t size, i;
827
0
    Py_UCS4 ch;
828
0
    int kind;
829
0
    const void *data;
830
831
0
    kind = PyUnicode_KIND(unicode);
832
0
    data = PyUnicode_DATA(unicode);
833
834
0
    size = 0;
835
    /* determine replacement size */
836
0
    for (i = collstart; i < collend; ++i) {
837
0
        Py_ssize_t incr;
838
839
0
        ch = PyUnicode_READ(kind, data, i);
840
0
        if (ch < 10)
841
0
            incr = 2+1+1;
842
0
        else if (ch < 100)
843
0
            incr = 2+2+1;
844
0
        else if (ch < 1000)
845
0
            incr = 2+3+1;
846
0
        else if (ch < 10000)
847
0
            incr = 2+4+1;
848
0
        else if (ch < 100000)
849
0
            incr = 2+5+1;
850
0
        else if (ch < 1000000)
851
0
            incr = 2+6+1;
852
0
        else {
853
0
            assert(ch <= MAX_UNICODE);
854
0
            incr = 2+7+1;
855
0
        }
856
0
        if (size > PY_SSIZE_T_MAX - incr) {
857
0
            PyErr_SetString(PyExc_OverflowError,
858
0
                            "encoded result is too long for a Python string");
859
0
            return NULL;
860
0
        }
861
0
        size += incr;
862
0
    }
863
864
0
    str = PyBytesWriter_GrowAndUpdatePointer(writer, size, str);
865
0
    if (str == NULL) {
866
0
        return NULL;
867
0
    }
868
869
    /* generate replacement */
870
0
    for (i = collstart; i < collend; ++i) {
871
0
        size = sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
872
0
        if (size < 0) {
873
0
            return NULL;
874
0
        }
875
0
        str += size;
876
0
    }
877
0
    return str;
878
0
}
879
880
/* --- Bloom Filters ----------------------------------------------------- */
881
882
/* Stuff to implement simple "bloom filters" for Unicode characters.
883
   To keep things simple, we use a single bitmask, using the low bits of
884
   each Unicode character (5, 6 or 7 bits, depending on BLOOM_WIDTH) as the bit index. */
885
886
/* the linebreak mask is set up by _PyUnicode_Init() below */
887
888
/* Number of bits in a bloom mask: 32, 64 or 128, whichever is the largest
   value not exceeding LONG_BIT. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

/* Integer type used to store one bloom mask. */
#define BLOOM_MASK unsigned long

/* Starts with every bit set, so BLOOM(bloom_linebreak, ch) is non-zero for
   any character as long as this initial value is in place. */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;

/* Non-zero if the bit selected by the low bits of `ch` is set in `mask`.
   Both arguments are parenthesized so that compound expressions can be
   passed safely. */
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Line break test: characters below 128 are looked up directly in
   ascii_linebreak[]; for the others the bloom mask is consulted first and
   Py_UNICODE_ISLINEBREAK() is only called when the mask bit is set. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
907
908
static inline BLOOM_MASK
909
make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
910
9.05M
{
911
9.05M
#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN)             \
912
9.05M
    do {                                               \
913
9.05M
        TYPE *data = (TYPE *)PTR;                      \
914
9.05M
        TYPE *end = data + LEN;                        \
915
9.05M
        Py_UCS4 ch;                                    \
916
20.1M
        for (; data != end; data++) {                  \
917
11.0M
            ch = *data;                                \
918
11.0M
            MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
919
11.0M
        }                                              \
920
9.05M
        break;                                         \
921
9.05M
    } while (0)
922
923
    /* calculate simple bloom-style bitmask for a given unicode string */
924
925
9.05M
    BLOOM_MASK mask;
926
927
9.05M
    mask = 0;
928
9.05M
    switch (kind) {
929
9.05M
    case PyUnicode_1BYTE_KIND:
930
9.05M
        BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
931
9.05M
        break;
932
22
    case PyUnicode_2BYTE_KIND:
933
22
        BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
934
22
        break;
935
0
    case PyUnicode_4BYTE_KIND:
936
0
        BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
937
0
        break;
938
0
    default:
939
0
        Py_UNREACHABLE();
940
9.05M
    }
941
9.05M
    return mask;
942
943
9.05M
#undef BLOOM_UPDATE
944
9.05M
}
945
946
/* Compilation of templated routines */
947
948
1.46M
#define STRINGLIB_GET_EMPTY() _PyUnicode_GetEmpty()
949
950
#include "stringlib/asciilib.h"
951
#include "stringlib/fastsearch.h"
952
#include "stringlib/partition.h"
953
#include "stringlib/split.h"
954
#include "stringlib/count.h"
955
#include "stringlib/find.h"
956
#include "stringlib/find_max_char.h"
957
#include "stringlib/undef.h"
958
959
#include "stringlib/ucs1lib.h"
960
#include "stringlib/fastsearch.h"
961
#include "stringlib/partition.h"
962
#include "stringlib/split.h"
963
#include "stringlib/count.h"
964
#include "stringlib/find.h"
965
#include "stringlib/replace.h"
966
#include "stringlib/repr.h"
967
#include "stringlib/find_max_char.h"
968
#include "stringlib/undef.h"
969
970
#include "stringlib/ucs2lib.h"
971
#include "stringlib/fastsearch.h"
972
#include "stringlib/partition.h"
973
#include "stringlib/split.h"
974
#include "stringlib/count.h"
975
#include "stringlib/find.h"
976
#include "stringlib/replace.h"
977
#include "stringlib/repr.h"
978
#include "stringlib/find_max_char.h"
979
#include "stringlib/undef.h"
980
981
#include "stringlib/ucs4lib.h"
982
#include "stringlib/fastsearch.h"
983
#include "stringlib/partition.h"
984
#include "stringlib/split.h"
985
#include "stringlib/count.h"
986
#include "stringlib/find.h"
987
#include "stringlib/replace.h"
988
#include "stringlib/repr.h"
989
#include "stringlib/find_max_char.h"
990
#include "stringlib/undef.h"
991
992
#undef STRINGLIB_GET_EMPTY
993
994
/* --- Unicode Object ----------------------------------------------------- */
995
996
static inline Py_ssize_t
997
findchar(const void *s, int kind,
998
         Py_ssize_t size, Py_UCS4 ch,
999
         int direction)
1000
115M
{
1001
115M
    switch (kind) {
1002
103M
    case PyUnicode_1BYTE_KIND:
1003
103M
        if ((Py_UCS1) ch != ch)
1004
3.65k
            return -1;
1005
103M
        if (direction > 0)
1006
103M
            return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
1007
8.06k
        else
1008
8.06k
            return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
1009
10.6M
    case PyUnicode_2BYTE_KIND:
1010
10.6M
        if ((Py_UCS2) ch != ch)
1011
0
            return -1;
1012
10.6M
        if (direction > 0)
1013
10.5M
            return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
1014
26.6k
        else
1015
26.6k
            return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
1016
1.49M
    case PyUnicode_4BYTE_KIND:
1017
1.49M
        if (direction > 0)
1018
1.38M
            return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
1019
110k
        else
1020
110k
            return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
1021
0
    default:
1022
0
        Py_UNREACHABLE();
1023
115M
    }
1024
115M
}
1025
1026
#ifdef Py_DEBUG
1027
/* Fill the data of a Unicode string with invalid characters to detect bugs
1028
   earlier.
1029
1030
   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
1031
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
1032
   invalid character in Unicode 6.0. */
1033
static void
1034
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
1035
{
1036
    int kind = PyUnicode_KIND(unicode);
1037
    Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
1038
    Py_ssize_t length = _PyUnicode_LENGTH(unicode);
1039
    if (length <= old_length)
1040
        return;
1041
    memset(data + old_length * kind, 0xff, (length - old_length) * kind);
1042
}
1043
#endif
1044
1045
static PyObject*
1046
resize_copy(PyObject *unicode, Py_ssize_t length)
1047
0
{
1048
0
    Py_ssize_t copy_length;
1049
0
    PyObject *copy;
1050
1051
0
    copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1052
0
    if (copy == NULL)
1053
0
        return NULL;
1054
1055
0
    copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
1056
0
    _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
1057
0
    return copy;
1058
0
}
1059
1060
PyObject*
1061
_PyUnicode_ResizeCompact(PyObject *unicode, Py_ssize_t length)
1062
52.9M
{
1063
52.9M
    Py_ssize_t char_size;
1064
52.9M
    Py_ssize_t struct_size;
1065
52.9M
    Py_ssize_t new_size;
1066
52.9M
    PyObject *new_unicode;
1067
#ifdef Py_DEBUG
1068
    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1069
#endif
1070
1071
52.9M
    if (!_PyUnicode_IsModifiable(unicode)) {
1072
0
        PyObject *copy = resize_copy(unicode, length);
1073
0
        if (copy == NULL) {
1074
0
            return NULL;
1075
0
        }
1076
0
        Py_DECREF(unicode);
1077
0
        return copy;
1078
0
    }
1079
52.9M
    assert(PyUnicode_IS_COMPACT(unicode));
1080
1081
52.9M
    char_size = PyUnicode_KIND(unicode);
1082
52.9M
    if (PyUnicode_IS_ASCII(unicode))
1083
43.6M
        struct_size = sizeof(PyASCIIObject);
1084
9.23M
    else
1085
9.23M
        struct_size = sizeof(PyCompactUnicodeObject);
1086
1087
52.9M
    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1088
0
        PyErr_NoMemory();
1089
0
        return NULL;
1090
0
    }
1091
52.9M
    new_size = (struct_size + (length + 1) * char_size);
1092
1093
52.9M
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1094
0
        PyMem_Free(_PyUnicode_UTF8(unicode));
1095
0
        PyUnicode_SET_UTF8_LENGTH(unicode, 0);
1096
0
        PyUnicode_SET_UTF8(unicode, NULL);
1097
0
    }
1098
#ifdef Py_TRACE_REFS
1099
    _Py_ForgetReference(unicode);
1100
#endif
1101
52.9M
    _PyReftracerTrack(unicode, PyRefTracer_DESTROY);
1102
1103
52.9M
    new_unicode = (PyObject *)PyObject_Realloc(unicode, new_size);
1104
52.9M
    if (new_unicode == NULL) {
1105
0
        _Py_NewReferenceNoTotal(unicode);
1106
0
        PyErr_NoMemory();
1107
0
        return NULL;
1108
0
    }
1109
52.9M
    unicode = new_unicode;
1110
52.9M
    _Py_NewReferenceNoTotal(unicode);
1111
1112
52.9M
    _PyUnicode_LENGTH(unicode) = length;
1113
#ifdef Py_DEBUG
1114
    unicode_fill_invalid(unicode, old_length);
1115
#endif
1116
52.9M
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1117
52.9M
                    length, 0);
1118
52.9M
    assert(_PyUnicode_CheckConsistency(unicode, 0));
1119
52.9M
    return unicode;
1120
52.9M
}
1121
1122
static int
1123
resize_inplace(PyObject *unicode, Py_ssize_t length)
1124
0
{
1125
0
    assert(!PyUnicode_IS_COMPACT(unicode));
1126
0
    assert(Py_REFCNT(unicode) == 1);
1127
1128
0
    Py_ssize_t new_size;
1129
0
    Py_ssize_t char_size;
1130
0
    int share_utf8;
1131
0
    void *data;
1132
#ifdef Py_DEBUG
1133
    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1134
#endif
1135
1136
0
    data = _PyUnicode_DATA_ANY(unicode);
1137
0
    char_size = PyUnicode_KIND(unicode);
1138
0
    share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
1139
1140
0
    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1141
0
        PyErr_NoMemory();
1142
0
        return -1;
1143
0
    }
1144
0
    new_size = (length + 1) * char_size;
1145
1146
0
    if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1147
0
    {
1148
0
        PyMem_Free(_PyUnicode_UTF8(unicode));
1149
0
        PyUnicode_SET_UTF8_LENGTH(unicode, 0);
1150
0
        PyUnicode_SET_UTF8(unicode, NULL);
1151
0
    }
1152
1153
0
    data = (PyObject *)PyObject_Realloc(data, new_size);
1154
0
    if (data == NULL) {
1155
0
        PyErr_NoMemory();
1156
0
        return -1;
1157
0
    }
1158
0
    _PyUnicode_DATA_ANY(unicode) = data;
1159
0
    if (share_utf8) {
1160
0
        PyUnicode_SET_UTF8_LENGTH(unicode, length);
1161
0
        PyUnicode_SET_UTF8(unicode, data);
1162
0
    }
1163
0
    _PyUnicode_LENGTH(unicode) = length;
1164
0
    PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
1165
#ifdef Py_DEBUG
1166
    unicode_fill_invalid(unicode, old_length);
1167
#endif
1168
1169
    /* check for integer overflow */
1170
0
    if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
1171
0
        PyErr_NoMemory();
1172
0
        return -1;
1173
0
    }
1174
0
    assert(_PyUnicode_CheckConsistency(unicode, 0));
1175
0
    return 0;
1176
0
}
1177
1178
static const char*
1179
unicode_kind_name(PyObject *unicode)
1180
0
{
1181
    /* don't check consistency: unicode_kind_name() is called from
1182
       _PyUnicode_Dump() */
1183
0
    if (!PyUnicode_IS_COMPACT(unicode))
1184
0
    {
1185
0
        switch (PyUnicode_KIND(unicode))
1186
0
        {
1187
0
        case PyUnicode_1BYTE_KIND:
1188
0
            if (PyUnicode_IS_ASCII(unicode))
1189
0
                return "legacy ascii";
1190
0
            else
1191
0
                return "legacy latin1";
1192
0
        case PyUnicode_2BYTE_KIND:
1193
0
            return "legacy UCS2";
1194
0
        case PyUnicode_4BYTE_KIND:
1195
0
            return "legacy UCS4";
1196
0
        default:
1197
0
            return "<legacy invalid kind>";
1198
0
        }
1199
0
    }
1200
0
    switch (PyUnicode_KIND(unicode)) {
1201
0
    case PyUnicode_1BYTE_KIND:
1202
0
        if (PyUnicode_IS_ASCII(unicode))
1203
0
            return "ascii";
1204
0
        else
1205
0
            return "latin1";
1206
0
    case PyUnicode_2BYTE_KIND:
1207
0
        return "UCS2";
1208
0
    case PyUnicode_4BYTE_KIND:
1209
0
        return "UCS4";
1210
0
    default:
1211
0
        return "<invalid compact kind>";
1212
0
    }
1213
0
}
1214
1215
#ifdef Py_DEBUG
1216
/* Functions wrapping macros for use in debugger */
1217
const char *_PyUnicode_utf8(void *unicode_raw){
1218
    PyObject *unicode = _PyObject_CAST(unicode_raw);
1219
    return PyUnicode_UTF8(unicode);
1220
}
1221
1222
const void *_PyUnicode_compact_data(void *unicode_raw) {
1223
    PyObject *unicode = _PyObject_CAST(unicode_raw);
1224
    return _PyUnicode_COMPACT_DATA(unicode);
1225
}
1226
const void *_PyUnicode_data(void *unicode_raw) {
1227
    PyObject *unicode = _PyObject_CAST(unicode_raw);
1228
    printf("obj %p\n", (void*)unicode);
1229
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1230
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1231
    printf("ascii op %p\n", (void*)(_PyASCIIObject_CAST(unicode) + 1));
1232
    printf("compact op %p\n", (void*)(_PyCompactUnicodeObject_CAST(unicode) + 1));
1233
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1234
    return PyUnicode_DATA(unicode);
1235
}
1236
1237
void
1238
_PyUnicode_Dump(PyObject *op)
1239
{
1240
    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
1241
    PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
1242
    PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
1243
    const void *data;
1244
1245
    if (ascii->state.compact)
1246
    {
1247
        if (ascii->state.ascii)
1248
            data = (ascii + 1);
1249
        else
1250
            data = (compact + 1);
1251
    }
1252
    else
1253
        data = unicode->data.any;
1254
    printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);
1255
1256
    if (!ascii->state.ascii) {
1257
        printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
1258
    }
1259
    printf(", data=%p\n", data);
1260
}
1261
#endif
1262
1263
1264
PyObject *
1265
PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1266
486M
{
1267
    /* Optimization for empty strings */
1268
486M
    if (size == 0) {
1269
22.9M
        return _PyUnicode_GetEmpty();
1270
22.9M
    }
1271
1272
463M
    PyObject *obj;
1273
463M
    PyCompactUnicodeObject *unicode;
1274
463M
    void *data;
1275
463M
    int kind;
1276
463M
    int is_ascii;
1277
463M
    Py_ssize_t char_size;
1278
463M
    Py_ssize_t struct_size;
1279
1280
463M
    is_ascii = 0;
1281
463M
    struct_size = sizeof(PyCompactUnicodeObject);
1282
463M
    if (maxchar < 128) {
1283
303M
        kind = PyUnicode_1BYTE_KIND;
1284
303M
        char_size = 1;
1285
303M
        is_ascii = 1;
1286
303M
        struct_size = sizeof(PyASCIIObject);
1287
303M
    }
1288
160M
    else if (maxchar < 256) {
1289
14.8M
        kind = PyUnicode_1BYTE_KIND;
1290
14.8M
        char_size = 1;
1291
14.8M
    }
1292
145M
    else if (maxchar < 65536) {
1293
139M
        kind = PyUnicode_2BYTE_KIND;
1294
139M
        char_size = 2;
1295
139M
    }
1296
6.00M
    else {
1297
6.00M
        if (maxchar > MAX_UNICODE) {
1298
0
            PyErr_SetString(PyExc_SystemError,
1299
0
                            "invalid maximum character passed to PyUnicode_New");
1300
0
            return NULL;
1301
0
        }
1302
6.00M
        kind = PyUnicode_4BYTE_KIND;
1303
6.00M
        char_size = 4;
1304
6.00M
    }
1305
1306
    /* Ensure we won't overflow the size. */
1307
463M
    if (size < 0) {
1308
0
        PyErr_SetString(PyExc_SystemError,
1309
0
                        "Negative size passed to PyUnicode_New");
1310
0
        return NULL;
1311
0
    }
1312
463M
    if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1313
0
        return PyErr_NoMemory();
1314
1315
    /* Duplicated allocation code from _PyObject_New() instead of a call to
1316
     * PyObject_New() so we are able to allocate space for the object and
1317
     * it's data buffer.
1318
     */
1319
463M
    obj = (PyObject *) PyObject_Malloc(struct_size + (size + 1) * char_size);
1320
463M
    if (obj == NULL) {
1321
0
        return PyErr_NoMemory();
1322
0
    }
1323
463M
    _PyObject_Init(obj, &PyUnicode_Type);
1324
1325
463M
    unicode = (PyCompactUnicodeObject *)obj;
1326
463M
    if (is_ascii)
1327
303M
        data = ((PyASCIIObject*)obj) + 1;
1328
160M
    else
1329
160M
        data = unicode + 1;
1330
463M
    _PyUnicode_LENGTH(unicode) = size;
1331
463M
    _PyUnicode_HASH(unicode) = -1;
1332
463M
    _PyUnicode_STATE(unicode).interned = 0;
1333
463M
    _PyUnicode_STATE(unicode).kind = kind;
1334
463M
    _PyUnicode_STATE(unicode).compact = 1;
1335
463M
    _PyUnicode_STATE(unicode).ascii = is_ascii;
1336
463M
    _PyUnicode_STATE(unicode).statically_allocated = 0;
1337
463M
    if (is_ascii) {
1338
303M
        ((char*)data)[size] = 0;
1339
303M
    }
1340
160M
    else if (kind == PyUnicode_1BYTE_KIND) {
1341
14.8M
        ((char*)data)[size] = 0;
1342
14.8M
        unicode->utf8 = NULL;
1343
14.8M
        unicode->utf8_length = 0;
1344
14.8M
    }
1345
145M
    else {
1346
145M
        unicode->utf8 = NULL;
1347
145M
        unicode->utf8_length = 0;
1348
145M
        if (kind == PyUnicode_2BYTE_KIND)
1349
139M
            ((Py_UCS2*)data)[size] = 0;
1350
6.00M
        else /* kind == PyUnicode_4BYTE_KIND */
1351
6.00M
            ((Py_UCS4*)data)[size] = 0;
1352
145M
    }
1353
#ifdef Py_DEBUG
1354
    unicode_fill_invalid((PyObject*)unicode, 0);
1355
#endif
1356
463M
    assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1357
463M
    return obj;
1358
463M
}
1359
1360
static int
1361
unicode_check_modifiable(PyObject *unicode)
1362
641
{
1363
641
    if (!_PyUnicode_IsModifiable(unicode)) {
1364
0
        PyErr_SetString(PyExc_SystemError,
1365
0
                        "Cannot modify a string currently used");
1366
0
        return -1;
1367
0
    }
1368
641
    return 0;
1369
641
}
1370
1371
static int
1372
_copy_characters(PyObject *to, Py_ssize_t to_start,
1373
                 PyObject *from, Py_ssize_t from_start,
1374
                 Py_ssize_t how_many, int check_maxchar)
1375
284M
{
1376
284M
    int from_kind, to_kind;
1377
284M
    const void *from_data;
1378
284M
    void *to_data;
1379
1380
284M
    assert(0 <= how_many);
1381
284M
    assert(0 <= from_start);
1382
284M
    assert(0 <= to_start);
1383
284M
    assert(PyUnicode_Check(from));
1384
284M
    assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1385
1386
284M
    assert(to == NULL || PyUnicode_Check(to));
1387
1388
284M
    if (how_many == 0) {
1389
270k
        return 0;
1390
270k
    }
1391
1392
284M
    assert(to != NULL);
1393
284M
    assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1394
1395
284M
    from_kind = PyUnicode_KIND(from);
1396
284M
    from_data = PyUnicode_DATA(from);
1397
284M
    to_kind = PyUnicode_KIND(to);
1398
284M
    to_data = PyUnicode_DATA(to);
1399
1400
#ifdef Py_DEBUG
1401
    if (!check_maxchar
1402
        && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1403
    {
1404
        Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1405
        Py_UCS4 ch;
1406
        Py_ssize_t i;
1407
        for (i=0; i < how_many; i++) {
1408
            ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1409
            assert(ch <= to_maxchar);
1410
        }
1411
    }
1412
#endif
1413
1414
284M
    if (from_kind == to_kind) {
1415
180M
        if (check_maxchar
1416
0
            && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1417
0
        {
1418
            /* Writing Latin-1 characters into an ASCII string requires to
1419
               check that all written characters are pure ASCII */
1420
0
            Py_UCS4 max_char;
1421
0
            max_char = ucs1lib_find_max_char(from_data,
1422
0
                                             (const Py_UCS1*)from_data + how_many);
1423
0
            if (max_char >= 128)
1424
0
                return -1;
1425
0
        }
1426
180M
        memcpy((char*)to_data + to_kind * to_start,
1427
180M
                  (const char*)from_data + from_kind * from_start,
1428
180M
                  to_kind * how_many);
1429
180M
    }
1430
103M
    else if (from_kind == PyUnicode_1BYTE_KIND
1431
101M
             && to_kind == PyUnicode_2BYTE_KIND)
1432
86.1M
    {
1433
86.1M
        _PyUnicode_CONVERT_BYTES(
1434
86.1M
            Py_UCS1, Py_UCS2,
1435
86.1M
            PyUnicode_1BYTE_DATA(from) + from_start,
1436
86.1M
            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1437
86.1M
            PyUnicode_2BYTE_DATA(to) + to_start
1438
86.1M
            );
1439
86.1M
    }
1440
17.7M
    else if (from_kind == PyUnicode_1BYTE_KIND
1441
15.7M
             && to_kind == PyUnicode_4BYTE_KIND)
1442
15.7M
    {
1443
15.7M
        _PyUnicode_CONVERT_BYTES(
1444
15.7M
            Py_UCS1, Py_UCS4,
1445
15.7M
            PyUnicode_1BYTE_DATA(from) + from_start,
1446
15.7M
            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1447
15.7M
            PyUnicode_4BYTE_DATA(to) + to_start
1448
15.7M
            );
1449
15.7M
    }
1450
1.94M
    else if (from_kind == PyUnicode_2BYTE_KIND
1451
1.91M
             && to_kind == PyUnicode_4BYTE_KIND)
1452
1.91M
    {
1453
1.91M
        _PyUnicode_CONVERT_BYTES(
1454
1.91M
            Py_UCS2, Py_UCS4,
1455
1.91M
            PyUnicode_2BYTE_DATA(from) + from_start,
1456
1.91M
            PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1457
1.91M
            PyUnicode_4BYTE_DATA(to) + to_start
1458
1.91M
            );
1459
1.91M
    }
1460
30.3k
    else {
1461
30.3k
        assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1462
1463
30.3k
        if (!check_maxchar) {
1464
30.3k
            if (from_kind == PyUnicode_2BYTE_KIND
1465
2.54k
                && to_kind == PyUnicode_1BYTE_KIND)
1466
2.54k
            {
1467
2.54k
                _PyUnicode_CONVERT_BYTES(
1468
2.54k
                    Py_UCS2, Py_UCS1,
1469
2.54k
                    PyUnicode_2BYTE_DATA(from) + from_start,
1470
2.54k
                    PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1471
2.54k
                    PyUnicode_1BYTE_DATA(to) + to_start
1472
2.54k
                    );
1473
2.54k
            }
1474
27.7k
            else if (from_kind == PyUnicode_4BYTE_KIND
1475
27.7k
                     && to_kind == PyUnicode_1BYTE_KIND)
1476
10.8k
            {
1477
10.8k
                _PyUnicode_CONVERT_BYTES(
1478
10.8k
                    Py_UCS4, Py_UCS1,
1479
10.8k
                    PyUnicode_4BYTE_DATA(from) + from_start,
1480
10.8k
                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1481
10.8k
                    PyUnicode_1BYTE_DATA(to) + to_start
1482
10.8k
                    );
1483
10.8k
            }
1484
16.8k
            else if (from_kind == PyUnicode_4BYTE_KIND
1485
16.8k
                     && to_kind == PyUnicode_2BYTE_KIND)
1486
16.8k
            {
1487
16.8k
                _PyUnicode_CONVERT_BYTES(
1488
16.8k
                    Py_UCS4, Py_UCS2,
1489
16.8k
                    PyUnicode_4BYTE_DATA(from) + from_start,
1490
16.8k
                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1491
16.8k
                    PyUnicode_2BYTE_DATA(to) + to_start
1492
16.8k
                    );
1493
16.8k
            }
1494
0
            else {
1495
0
                Py_UNREACHABLE();
1496
0
            }
1497
30.3k
        }
1498
0
        else {
1499
0
            const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1500
0
            Py_UCS4 ch;
1501
0
            Py_ssize_t i;
1502
1503
0
            for (i=0; i < how_many; i++) {
1504
0
                ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1505
0
                if (ch > to_maxchar)
1506
0
                    return -1;
1507
0
                PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1508
0
            }
1509
0
        }
1510
30.3k
    }
1511
284M
    return 0;
1512
284M
}
1513
1514
void
1515
_PyUnicode_FastCopyCharacters(
1516
    PyObject *to, Py_ssize_t to_start,
1517
    PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1518
284M
{
1519
284M
    (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1520
284M
}
1521
1522
Py_ssize_t
1523
PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1524
                         PyObject *from, Py_ssize_t from_start,
1525
                         Py_ssize_t how_many)
1526
0
{
1527
0
    int err;
1528
1529
0
    if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1530
0
        PyErr_BadInternalCall();
1531
0
        return -1;
1532
0
    }
1533
1534
0
    if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
1535
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
1536
0
        return -1;
1537
0
    }
1538
0
    if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
1539
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
1540
0
        return -1;
1541
0
    }
1542
0
    if (how_many < 0) {
1543
0
        PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1544
0
        return -1;
1545
0
    }
1546
0
    how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
1547
0
    if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1548
0
        PyErr_Format(PyExc_SystemError,
1549
0
                     "Cannot write %zi characters at %zi "
1550
0
                     "in a string of %zi characters",
1551
0
                     how_many, to_start, PyUnicode_GET_LENGTH(to));
1552
0
        return -1;
1553
0
    }
1554
1555
0
    if (how_many == 0)
1556
0
        return 0;
1557
1558
0
    if (unicode_check_modifiable(to))
1559
0
        return -1;
1560
1561
0
    err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1562
0
    if (err) {
1563
0
        PyErr_Format(PyExc_SystemError,
1564
0
                     "Cannot copy %s characters "
1565
0
                     "into a string of %s characters",
1566
0
                     unicode_kind_name(from),
1567
0
                     unicode_kind_name(to));
1568
0
        return -1;
1569
0
    }
1570
0
    return how_many;
1571
0
}
1572
1573
/* Find the maximum code point and count the number of surrogate pairs so a
1574
   correct string length can be computed before converting a string to UCS4.
1575
   This function counts single surrogates as a character and not as a pair.
1576
1577
   Return 0 on success, or -1 on error. */
1578
static int
1579
find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1580
                        Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1581
18.9k
{
1582
18.9k
    const wchar_t *iter;
1583
18.9k
    Py_UCS4 ch;
1584
1585
18.9k
    assert(num_surrogates != NULL && maxchar != NULL);
1586
18.9k
    *num_surrogates = 0;
1587
18.9k
    *maxchar = 0;
1588
1589
402k
    for (iter = begin; iter < end; ) {
1590
#if SIZEOF_WCHAR_T == 2
1591
        if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1592
            && (iter+1) < end
1593
            && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1594
        {
1595
            ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1596
            ++(*num_surrogates);
1597
            iter += 2;
1598
        }
1599
        else
1600
#endif
1601
384k
        {
1602
384k
            ch = *iter;
1603
384k
            iter++;
1604
384k
        }
1605
384k
        if (ch > *maxchar) {
1606
79.1k
            *maxchar = ch;
1607
79.1k
            if (*maxchar > MAX_UNICODE) {
1608
0
                PyErr_Format(PyExc_ValueError,
1609
0
                             "character U+%x is not in range [U+0000; U+%x]",
1610
0
                             ch, MAX_UNICODE);
1611
0
                return -1;
1612
0
            }
1613
79.1k
        }
1614
384k
    }
1615
18.9k
    return 0;
1616
18.9k
}
1617
1618
static void
1619
unicode_dealloc(PyObject *unicode)
1620
474M
{
1621
#ifdef Py_DEBUG
1622
    if (!unicode_is_finalizing() && unicode_is_singleton(unicode)) {
1623
        _Py_FatalRefcountError("deallocating an Unicode singleton");
1624
    }
1625
#endif
1626
474M
    if (_PyUnicode_STATE(unicode).statically_allocated) {
1627
        /* This should never get called, but we also don't want to SEGV if
1628
        * we accidentally decref an immortal string out of existence. Since
1629
        * the string is an immortal object, just re-set the reference count.
1630
        */
1631
#ifdef Py_DEBUG
1632
        Py_UNREACHABLE();
1633
#endif
1634
0
        _Py_SetImmortal(unicode);
1635
0
        return;
1636
0
    }
1637
474M
    switch (_PyUnicode_STATE(unicode).interned) {
1638
473M
        case SSTATE_NOT_INTERNED:
1639
473M
            break;
1640
652k
        case SSTATE_INTERNED_MORTAL:
1641
            /* Remove the object from the intern dict.
1642
             * Before doing so, we set the refcount to 2: the key and value
1643
             * in the interned_dict.
1644
             */
1645
652k
            assert(Py_REFCNT(unicode) == 0);
1646
652k
            Py_SET_REFCNT(unicode, 2);
1647
#ifdef Py_REF_DEBUG
1648
            /* let's be pedantic with the ref total */
1649
            _Py_IncRefTotal(_PyThreadState_GET());
1650
            _Py_IncRefTotal(_PyThreadState_GET());
1651
#endif
1652
652k
            PyInterpreterState *interp = _PyInterpreterState_GET();
1653
652k
            PyObject *interned = get_interned_dict(interp);
1654
652k
            assert(interned != NULL);
1655
652k
            PyObject *popped;
1656
652k
            int r = PyDict_Pop(interned, unicode, &popped);
1657
652k
            if (r == -1) {
1658
0
                PyErr_FormatUnraisable("Exception ignored while "
1659
0
                                       "removing an interned string %R",
1660
0
                                       unicode);
1661
                // We don't know what happened to the string. It's probably
1662
                // best to leak it:
1663
                // - if it was popped, there are no more references to it
1664
                //   so it can't cause trouble (except wasted memory)
1665
                // - if it wasn't popped, it'll remain interned
1666
0
                _Py_SetImmortal(unicode);
1667
0
                _PyUnicode_STATE(unicode).interned = SSTATE_INTERNED_IMMORTAL;
1668
0
                return;
1669
0
            }
1670
652k
            if (r == 0) {
1671
                // The interned string was not found in the interned_dict.
1672
#ifdef Py_DEBUG
1673
                Py_UNREACHABLE();
1674
#endif
1675
0
                _Py_SetImmortal(unicode);
1676
0
                return;
1677
0
            }
1678
            // Successfully popped.
1679
652k
            assert(popped == unicode);
1680
            // Only our `popped` reference should be left; remove it too.
1681
652k
            assert(Py_REFCNT(unicode) == 1);
1682
652k
            Py_SET_REFCNT(unicode, 0);
1683
#ifdef Py_REF_DEBUG
1684
            /* let's be pedantic with the ref total */
1685
            _Py_DecRefTotal(_PyThreadState_GET());
1686
#endif
1687
652k
            break;
1688
0
        default:
1689
            // As with `statically_allocated` above.
1690
#ifdef Py_REF_DEBUG
1691
            Py_UNREACHABLE();
1692
#endif
1693
0
            _Py_SetImmortal(unicode);
1694
0
            return;
1695
474M
    }
1696
474M
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1697
179k
        PyMem_Free(_PyUnicode_UTF8(unicode));
1698
179k
    }
1699
474M
    if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
1700
10.6M
        PyMem_Free(_PyUnicode_DATA_ANY(unicode));
1701
10.6M
    }
1702
1703
474M
    Py_TYPE(unicode)->tp_free(unicode);
1704
474M
}
1705
1706
#ifdef Py_DEBUG
1707
static int
1708
unicode_is_singleton(PyObject *unicode)
1709
{
1710
    if (unicode == &_Py_STR(empty)) {
1711
        return 1;
1712
    }
1713
1714
    PyASCIIObject *ascii = _PyASCIIObject_CAST(unicode);
1715
    if (ascii->length == 1) {
1716
        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1717
        if (ch < 256 && LATIN1(ch) == unicode) {
1718
            return 1;
1719
        }
1720
    }
1721
    return 0;
1722
}
1723
#endif
1724
1725
int
1726
_PyUnicode_IsModifiable(PyObject *unicode)
1727
54.4M
{
1728
54.4M
    assert(_PyUnicode_CHECK(unicode));
1729
54.4M
    if (!_PyObject_IsUniquelyReferenced(unicode))
1730
43.0k
        return 0;
1731
54.4M
    if (PyUnicode_HASH(unicode) != -1)
1732
0
        return 0;
1733
54.4M
    if (PyUnicode_CHECK_INTERNED(unicode))
1734
0
        return 0;
1735
54.4M
    if (!PyUnicode_CheckExact(unicode))
1736
0
        return 0;
1737
#ifdef Py_DEBUG
1738
    /* singleton refcount is greater than 1 */
1739
    assert(!unicode_is_singleton(unicode));
1740
#endif
1741
54.4M
    return 1;
1742
54.4M
}
1743
1744
static int
1745
unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1746
763k
{
1747
763k
    PyObject *unicode;
1748
763k
    Py_ssize_t old_length;
1749
1750
763k
    assert(p_unicode != NULL);
1751
763k
    unicode = *p_unicode;
1752
1753
763k
    assert(unicode != NULL);
1754
763k
    assert(PyUnicode_Check(unicode));
1755
763k
    assert(0 <= length);
1756
1757
763k
    old_length = PyUnicode_GET_LENGTH(unicode);
1758
763k
    if (old_length == length)
1759
0
        return 0;
1760
1761
763k
    if (length == 0) {
1762
0
        PyObject *empty = _PyUnicode_GetEmpty();
1763
0
        Py_SETREF(*p_unicode, empty);
1764
0
        return 0;
1765
0
    }
1766
1767
763k
    if (!_PyUnicode_IsModifiable(unicode)) {
1768
0
        PyObject *copy = resize_copy(unicode, length);
1769
0
        if (copy == NULL)
1770
0
            return -1;
1771
0
        Py_SETREF(*p_unicode, copy);
1772
0
        return 0;
1773
0
    }
1774
1775
763k
    if (PyUnicode_IS_COMPACT(unicode)) {
1776
763k
        PyObject *new_unicode = _PyUnicode_ResizeCompact(unicode, length);
1777
763k
        if (new_unicode == NULL)
1778
0
            return -1;
1779
763k
        *p_unicode = new_unicode;
1780
763k
        return 0;
1781
763k
    }
1782
0
    return resize_inplace(unicode, length);
1783
763k
}
1784
1785
int
1786
PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1787
0
{
1788
0
    PyObject *unicode;
1789
0
    if (p_unicode == NULL) {
1790
0
        PyErr_BadInternalCall();
1791
0
        return -1;
1792
0
    }
1793
0
    unicode = *p_unicode;
1794
0
    if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
1795
0
    {
1796
0
        PyErr_BadInternalCall();
1797
0
        return -1;
1798
0
    }
1799
0
    return unicode_resize(p_unicode, length);
1800
0
}
1801
1802
static PyObject*
1803
get_latin1_char(Py_UCS1 ch)
1804
245M
{
1805
245M
    PyObject *o = LATIN1(ch);
1806
245M
    return o;
1807
245M
}
1808
1809
static PyObject*
1810
unicode_char(Py_UCS4 ch)
1811
291M
{
1812
291M
    PyObject *unicode;
1813
1814
291M
    assert(ch <= MAX_UNICODE);
1815
1816
291M
    if (ch < 256) {
1817
198M
        return get_latin1_char(ch);
1818
198M
    }
1819
1820
93.0M
    unicode = PyUnicode_New(1, ch);
1821
93.0M
    if (unicode == NULL)
1822
0
        return NULL;
1823
1824
93.0M
    assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
1825
93.0M
    if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
1826
89.3M
        PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1827
89.3M
    } else {
1828
3.68M
        assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1829
3.68M
        PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1830
3.68M
    }
1831
93.0M
    assert(_PyUnicode_CheckConsistency(unicode, 1));
1832
93.0M
    return unicode;
1833
93.0M
}
1834
1835
1836
static inline void
1837
unicode_write_widechar(int kind, void *data,
1838
                       const wchar_t *u, Py_ssize_t size,
1839
                       Py_ssize_t num_surrogates)
1840
18.9k
{
1841
18.9k
    switch (kind) {
1842
18.9k
    case PyUnicode_1BYTE_KIND:
1843
18.9k
        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char, u, u + size, data);
1844
18.9k
        break;
1845
1846
0
    case PyUnicode_2BYTE_KIND:
1847
#if SIZEOF_WCHAR_T == 2
1848
        memcpy(data, u, size * 2);
1849
#else
1850
0
        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2, u, u + size, data);
1851
0
#endif
1852
0
        break;
1853
1854
0
    case PyUnicode_4BYTE_KIND:
1855
0
    {
1856
#if SIZEOF_WCHAR_T == 2
1857
        // Convert a 16-bits wchar_t representation to UCS4, this will decode
1858
        // surrogate pairs.
1859
        const wchar_t *end = u + size;
1860
        Py_UCS4 *ucs4_out = (Py_UCS4*)data;
1861
#  ifndef NDEBUG
1862
        Py_UCS4 *ucs4_end = (Py_UCS4*)data + (size - num_surrogates);
1863
#  endif
1864
        for (const wchar_t *iter = u; iter < end; ) {
1865
            assert(ucs4_out < ucs4_end);
1866
            if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1867
                && (iter+1) < end
1868
                && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1869
            {
1870
                *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1871
                iter += 2;
1872
            }
1873
            else {
1874
                *ucs4_out++ = *iter;
1875
                iter++;
1876
            }
1877
        }
1878
        assert(ucs4_out == ucs4_end);
1879
#else
1880
0
        assert(num_surrogates == 0);
1881
0
        memcpy(data, u, size * 4);
1882
0
#endif
1883
0
        break;
1884
0
    }
1885
0
    default:
1886
0
        Py_UNREACHABLE();
1887
18.9k
    }
1888
18.9k
}
1889
1890
1891
PyObject *
1892
PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
1893
19.0k
{
1894
19.0k
    PyObject *unicode;
1895
19.0k
    Py_UCS4 maxchar = 0;
1896
19.0k
    Py_ssize_t num_surrogates;
1897
1898
19.0k
    if (u == NULL && size != 0) {
1899
0
        PyErr_BadInternalCall();
1900
0
        return NULL;
1901
0
    }
1902
1903
19.0k
    if (size == -1) {
1904
792
        size = wcslen(u);
1905
792
    }
1906
1907
    /* If the Unicode data is known at construction time, we can apply
1908
       some optimizations which share commonly used objects. */
1909
1910
    /* Optimization for empty strings */
1911
19.0k
    if (size == 0)
1912
44
        _Py_RETURN_UNICODE_EMPTY();
1913
1914
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
1915
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
1916
       non-Unicode locales and hence needs conversion to UCS-4 first. */
1917
    if (_Py_LocaleUsesNonUnicodeWchar()) {
1918
        wchar_t* converted = _Py_DecodeNonUnicodeWchar(u, size);
1919
        if (!converted) {
1920
            return NULL;
1921
        }
1922
        PyObject *unicode = _PyUnicode_FromUCS4(converted, size);
1923
        PyMem_Free(converted);
1924
        return unicode;
1925
    }
1926
#endif
1927
1928
    /* Single character Unicode objects in the Latin-1 range are
1929
       shared when using this constructor */
1930
18.9k
    if (size == 1 && (Py_UCS4)*u < 256)
1931
0
        return get_latin1_char((unsigned char)*u);
1932
1933
    /* If not empty and not single character, copy the Unicode data
1934
       into the new object */
1935
18.9k
    if (find_maxchar_surrogates(u, u + size,
1936
18.9k
                                &maxchar, &num_surrogates) == -1)
1937
0
        return NULL;
1938
1939
18.9k
    unicode = PyUnicode_New(size - num_surrogates, maxchar);
1940
18.9k
    if (!unicode)
1941
0
        return NULL;
1942
1943
18.9k
    unicode_write_widechar(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1944
18.9k
                           u, size, num_surrogates);
1945
1946
18.9k
    return unicode_result(unicode);
1947
18.9k
}
1948
1949
1950
int
1951
PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *pub_writer,
1952
                              const wchar_t *str,
1953
                              Py_ssize_t size)
1954
0
{
1955
0
    _PyUnicodeWriter *writer = (_PyUnicodeWriter *)pub_writer;
1956
1957
0
    if (size < 0) {
1958
0
        size = wcslen(str);
1959
0
    }
1960
1961
0
    if (size == 0) {
1962
0
        return 0;
1963
0
    }
1964
1965
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
1966
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
1967
       non-Unicode locales and hence needs conversion to UCS-4 first. */
1968
    if (_Py_LocaleUsesNonUnicodeWchar()) {
1969
        wchar_t* converted = _Py_DecodeNonUnicodeWchar(str, size);
1970
        if (!converted) {
1971
            return -1;
1972
        }
1973
1974
        int res = PyUnicodeWriter_WriteUCS4(pub_writer, converted, size);
1975
        PyMem_Free(converted);
1976
        return res;
1977
    }
1978
#endif
1979
1980
0
    Py_UCS4 maxchar = 0;
1981
0
    Py_ssize_t num_surrogates;
1982
0
    if (find_maxchar_surrogates(str, str + size,
1983
0
                                &maxchar, &num_surrogates) == -1) {
1984
0
        return -1;
1985
0
    }
1986
1987
0
    if (_PyUnicodeWriter_Prepare(writer, size - num_surrogates, maxchar) < 0) {
1988
0
        return -1;
1989
0
    }
1990
1991
0
    int kind = writer->kind;
1992
0
    void *data = (Py_UCS1*)writer->data + writer->pos * kind;
1993
0
    unicode_write_widechar(kind, data, str, size, num_surrogates);
1994
1995
0
    writer->pos += size - num_surrogates;
1996
0
    return 0;
1997
0
}
1998
1999
2000
PyObject *
2001
PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
2002
676k
{
2003
676k
    if (size < 0) {
2004
0
        PyErr_SetString(PyExc_SystemError,
2005
0
                        "Negative size passed to PyUnicode_FromStringAndSize");
2006
0
        return NULL;
2007
0
    }
2008
676k
    if (u != NULL) {
2009
676k
        return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2010
676k
    }
2011
0
    if (size > 0) {
2012
0
        PyErr_SetString(PyExc_SystemError,
2013
0
            "NULL string with positive size with NULL passed to PyUnicode_FromStringAndSize");
2014
0
        return NULL;
2015
0
    }
2016
0
    return _PyUnicode_GetEmpty();
2017
0
}
2018
2019
PyObject *
2020
PyUnicode_FromString(const char *u)
2021
7.36M
{
2022
7.36M
    size_t size = strlen(u);
2023
7.36M
    if (size > PY_SSIZE_T_MAX) {
2024
0
        PyErr_SetString(PyExc_OverflowError, "input too long");
2025
0
        return NULL;
2026
0
    }
2027
7.36M
    return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
2028
7.36M
}
2029
2030
2031
PyObject *
2032
_PyUnicode_FromId(_Py_Identifier *id)
2033
0
{
2034
0
    PyMutex_Lock((PyMutex *)&id->mutex);
2035
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
2036
0
    struct _Py_unicode_ids *ids = &interp->unicode.ids;
2037
2038
0
    Py_ssize_t index = _Py_atomic_load_ssize(&id->index);
2039
0
    if (index < 0) {
2040
0
        struct _Py_unicode_runtime_ids *rt_ids = &interp->runtime->unicode_state.ids;
2041
2042
0
        PyMutex_Lock(&rt_ids->mutex);
2043
        // Check again to detect concurrent access. Another thread can have
2044
        // initialized the index while this thread waited for the lock.
2045
0
        index = _Py_atomic_load_ssize(&id->index);
2046
0
        if (index < 0) {
2047
0
            assert(rt_ids->next_index < PY_SSIZE_T_MAX);
2048
0
            index = rt_ids->next_index;
2049
0
            rt_ids->next_index++;
2050
0
            _Py_atomic_store_ssize(&id->index, index);
2051
0
        }
2052
0
        PyMutex_Unlock(&rt_ids->mutex);
2053
0
    }
2054
0
    assert(index >= 0);
2055
2056
0
    PyObject *obj;
2057
0
    if (index < ids->size) {
2058
0
        obj = ids->array[index];
2059
0
        if (obj) {
2060
            // Return a borrowed reference
2061
0
            goto end;
2062
0
        }
2063
0
    }
2064
2065
0
    obj = PyUnicode_DecodeUTF8Stateful(id->string, strlen(id->string),
2066
0
                                       NULL, NULL);
2067
0
    if (!obj) {
2068
0
        goto end;
2069
0
    }
2070
0
    _PyUnicode_InternImmortal(interp, &obj);
2071
2072
0
    if (index >= ids->size) {
2073
        // Overallocate to reduce the number of realloc
2074
0
        Py_ssize_t new_size = Py_MAX(index * 2, 16);
2075
0
        Py_ssize_t item_size = sizeof(ids->array[0]);
2076
0
        PyObject **new_array = PyMem_Realloc(ids->array, new_size * item_size);
2077
0
        if (new_array == NULL) {
2078
0
            PyErr_NoMemory();
2079
0
            obj = NULL;
2080
0
            goto end;
2081
0
        }
2082
0
        memset(&new_array[ids->size], 0, (new_size - ids->size) * item_size);
2083
0
        ids->array = new_array;
2084
0
        ids->size = new_size;
2085
0
    }
2086
2087
    // The array stores a strong reference
2088
0
    ids->array[index] = obj;
2089
2090
0
end:
2091
0
    PyMutex_Unlock((PyMutex *)&id->mutex);
2092
    // Return a borrowed reference
2093
0
    return obj;
2094
0
}
2095
2096
2097
static void
2098
unicode_clear_identifiers(struct _Py_unicode_state *state)
2099
0
{
2100
0
    struct _Py_unicode_ids *ids = &state->ids;
2101
0
    for (Py_ssize_t i=0; i < ids->size; i++) {
2102
0
        Py_XDECREF(ids->array[i]);
2103
0
    }
2104
0
    ids->size = 0;
2105
0
    PyMem_Free(ids->array);
2106
0
    ids->array = NULL;
2107
    // Don't reset _PyRuntime next_index: _Py_Identifier.id remains valid
2108
    // after Py_Finalize().
2109
0
}
2110
2111
2112
/* Internal function, doesn't check maximum character */
2113
2114
PyObject*
2115
_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
2116
99.2M
{
2117
99.2M
    const unsigned char *s = (const unsigned char *)buffer;
2118
99.2M
    PyObject *unicode;
2119
99.2M
    if (size == 1) {
2120
#ifdef Py_DEBUG
2121
        assert((unsigned char)s[0] < 128);
2122
#endif
2123
32.4M
        return get_latin1_char(s[0]);
2124
32.4M
    }
2125
66.8M
    unicode = PyUnicode_New(size, 127);
2126
66.8M
    if (!unicode)
2127
0
        return NULL;
2128
66.8M
    memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2129
66.8M
    assert(_PyUnicode_CheckConsistency(unicode, 1));
2130
66.8M
    return unicode;
2131
66.8M
}
2132
2133
static Py_UCS4
2134
kind_maxchar_limit(int kind)
2135
0
{
2136
0
    switch (kind) {
2137
0
    case PyUnicode_1BYTE_KIND:
2138
0
        return 0x80;
2139
0
    case PyUnicode_2BYTE_KIND:
2140
0
        return 0x100;
2141
0
    case PyUnicode_4BYTE_KIND:
2142
0
        return 0x10000;
2143
0
    default:
2144
0
        Py_UNREACHABLE();
2145
0
    }
2146
0
}
2147
2148
static PyObject*
2149
_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
2150
50.4M
{
2151
50.4M
    PyObject *res;
2152
50.4M
    unsigned char max_char;
2153
2154
50.4M
    if (size == 0) {
2155
6.69M
        _Py_RETURN_UNICODE_EMPTY();
2156
6.69M
    }
2157
50.4M
    assert(size > 0);
2158
43.7M
    if (size == 1) {
2159
13.0M
        return get_latin1_char(u[0]);
2160
13.0M
    }
2161
2162
30.7M
    max_char = ucs1lib_find_max_char(u, u + size);
2163
30.7M
    res = PyUnicode_New(size, max_char);
2164
30.7M
    if (!res)
2165
0
        return NULL;
2166
30.7M
    memcpy(PyUnicode_1BYTE_DATA(res), u, size);
2167
30.7M
    assert(_PyUnicode_CheckConsistency(res, 1));
2168
30.7M
    return res;
2169
30.7M
}
2170
2171
static PyObject*
2172
_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
2173
96.1M
{
2174
96.1M
    PyObject *res;
2175
96.1M
    Py_UCS2 max_char;
2176
2177
96.1M
    if (size == 0)
2178
12.0M
        _Py_RETURN_UNICODE_EMPTY();
2179
96.1M
    assert(size > 0);
2180
84.1M
    if (size == 1)
2181
54.9M
        return unicode_char(u[0]);
2182
2183
29.1M
    max_char = ucs2lib_find_max_char(u, u + size);
2184
29.1M
    res = PyUnicode_New(size, max_char);
2185
29.1M
    if (!res)
2186
0
        return NULL;
2187
29.1M
    if (max_char >= 256)
2188
17.2M
        memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
2189
11.9M
    else {
2190
11.9M
        _PyUnicode_CONVERT_BYTES(
2191
11.9M
            Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2192
11.9M
    }
2193
29.1M
    assert(_PyUnicode_CheckConsistency(res, 1));
2194
29.1M
    return res;
2195
29.1M
}
2196
2197
static PyObject*
2198
_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
2199
70.6M
{
2200
70.6M
    PyObject *res;
2201
70.6M
    Py_UCS4 max_char;
2202
2203
70.6M
    if (size == 0)
2204
8.23M
        _Py_RETURN_UNICODE_EMPTY();
2205
70.6M
    assert(size > 0);
2206
62.4M
    if (size == 1)
2207
43.6M
        return unicode_char(u[0]);
2208
2209
18.8M
    max_char = ucs4lib_find_max_char(u, u + size);
2210
18.8M
    res = PyUnicode_New(size, max_char);
2211
18.8M
    if (!res)
2212
0
        return NULL;
2213
18.8M
    if (max_char < 256)
2214
13.4M
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2215
18.8M
                                 PyUnicode_1BYTE_DATA(res));
2216
5.36M
    else if (max_char < 0x10000)
2217
3.83M
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2218
5.36M
                                 PyUnicode_2BYTE_DATA(res));
2219
1.53M
    else
2220
1.53M
        memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
2221
18.8M
    assert(_PyUnicode_CheckConsistency(res, 1));
2222
18.8M
    return res;
2223
18.8M
}
2224
2225
2226
int
2227
PyUnicodeWriter_WriteUCS4(PyUnicodeWriter *pub_writer,
2228
                          Py_UCS4 *str,
2229
                          Py_ssize_t size)
2230
0
{
2231
0
    _PyUnicodeWriter *writer = (_PyUnicodeWriter*)pub_writer;
2232
2233
0
    if (size < 0) {
2234
0
        PyErr_SetString(PyExc_ValueError,
2235
0
                        "size must be positive");
2236
0
        return -1;
2237
0
    }
2238
2239
0
    if (size == 0) {
2240
0
        return 0;
2241
0
    }
2242
2243
0
    Py_UCS4 max_char = ucs4lib_find_max_char(str, str + size);
2244
2245
0
    if (_PyUnicodeWriter_Prepare(writer, size, max_char) < 0) {
2246
0
        return -1;
2247
0
    }
2248
2249
0
    int kind = writer->kind;
2250
0
    void *data = (Py_UCS1*)writer->data + writer->pos * kind;
2251
0
    if (kind == PyUnicode_1BYTE_KIND) {
2252
0
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1,
2253
0
                                 str, str + size,
2254
0
                                 data);
2255
0
    }
2256
0
    else if (kind == PyUnicode_2BYTE_KIND) {
2257
0
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2,
2258
0
                                 str, str + size,
2259
0
                                 data);
2260
0
    }
2261
0
    else {
2262
0
        memcpy(data, str, size * sizeof(Py_UCS4));
2263
0
    }
2264
0
    writer->pos += size;
2265
2266
0
    return 0;
2267
0
}
2268
2269
2270
PyObject*
2271
PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2272
162M
{
2273
162M
    if (size < 0) {
2274
0
        PyErr_SetString(PyExc_ValueError, "size must be positive");
2275
0
        return NULL;
2276
0
    }
2277
162M
    switch (kind) {
2278
23.3M
    case PyUnicode_1BYTE_KIND:
2279
23.3M
        return _PyUnicode_FromUCS1(buffer, size);
2280
80.9M
    case PyUnicode_2BYTE_KIND:
2281
80.9M
        return _PyUnicode_FromUCS2(buffer, size);
2282
58.1M
    case PyUnicode_4BYTE_KIND:
2283
58.1M
        return _PyUnicode_FromUCS4(buffer, size);
2284
0
    default:
2285
0
        PyErr_SetString(PyExc_SystemError, "invalid kind");
2286
0
        return NULL;
2287
162M
    }
2288
162M
}
2289
2290
Py_UCS4
2291
_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2292
11.2M
{
2293
11.2M
    int kind;
2294
11.2M
    const void *startptr, *endptr;
2295
2296
11.2M
    assert(0 <= start);
2297
11.2M
    assert(end <= PyUnicode_GET_LENGTH(unicode));
2298
11.2M
    assert(start <= end);
2299
2300
11.2M
    if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2301
0
        return PyUnicode_MAX_CHAR_VALUE(unicode);
2302
2303
11.2M
    if (start == end)
2304
0
        return 127;
2305
2306
11.2M
    if (PyUnicode_IS_ASCII(unicode))
2307
11.2M
        return 127;
2308
2309
38.4k
    kind = PyUnicode_KIND(unicode);
2310
38.4k
    startptr = PyUnicode_DATA(unicode);
2311
38.4k
    endptr = (char *)startptr + end * kind;
2312
38.4k
    startptr = (char *)startptr + start * kind;
2313
38.4k
    switch(kind) {
2314
1.55k
    case PyUnicode_1BYTE_KIND:
2315
1.55k
        return ucs1lib_find_max_char(startptr, endptr);
2316
5.08k
    case PyUnicode_2BYTE_KIND:
2317
5.08k
        return ucs2lib_find_max_char(startptr, endptr);
2318
31.8k
    case PyUnicode_4BYTE_KIND:
2319
31.8k
        return ucs4lib_find_max_char(startptr, endptr);
2320
0
    default:
2321
0
        Py_UNREACHABLE();
2322
38.4k
    }
2323
38.4k
}
2324
2325
/* Ensure that a string uses the most efficient storage, if it is not the
2326
   case: create a new string with of the right kind. Write NULL into *p_unicode
2327
   on error. */
2328
static void
2329
unicode_adjust_maxchar(PyObject **p_unicode)
2330
0
{
2331
0
    PyObject *unicode, *copy;
2332
0
    Py_UCS4 max_char;
2333
0
    Py_ssize_t len;
2334
0
    int kind;
2335
2336
0
    assert(p_unicode != NULL);
2337
0
    unicode = *p_unicode;
2338
0
    if (PyUnicode_IS_ASCII(unicode))
2339
0
        return;
2340
2341
0
    len = PyUnicode_GET_LENGTH(unicode);
2342
0
    kind = PyUnicode_KIND(unicode);
2343
0
    if (kind == PyUnicode_1BYTE_KIND) {
2344
0
        const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2345
0
        max_char = ucs1lib_find_max_char(u, u + len);
2346
0
        if (max_char >= 128)
2347
0
            return;
2348
0
    }
2349
0
    else if (kind == PyUnicode_2BYTE_KIND) {
2350
0
        const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2351
0
        max_char = ucs2lib_find_max_char(u, u + len);
2352
0
        if (max_char >= 256)
2353
0
            return;
2354
0
    }
2355
0
    else if (kind == PyUnicode_4BYTE_KIND) {
2356
0
        const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2357
0
        max_char = ucs4lib_find_max_char(u, u + len);
2358
0
        if (max_char >= 0x10000)
2359
0
            return;
2360
0
    }
2361
0
    else
2362
0
        Py_UNREACHABLE();
2363
2364
0
    copy = PyUnicode_New(len, max_char);
2365
0
    if (copy != NULL)
2366
0
        _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2367
0
    Py_DECREF(unicode);
2368
0
    *p_unicode = copy;
2369
0
}
2370
2371
/* Return a new string with the same length, kind and character data as
   `unicode`.

   Raises via PyErr_BadInternalCall() and returns NULL if `unicode` is
   not a str; returns NULL if the allocation fails. */
PyObject*
_PyUnicode_Copy(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadInternalCall();
        return NULL;
    }

    Py_ssize_t nchars = PyUnicode_GET_LENGTH(unicode);
    PyObject *duplicate = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(unicode));
    if (duplicate == NULL) {
        return NULL;
    }
    assert(PyUnicode_KIND(duplicate) == PyUnicode_KIND(unicode));

    /* Same kind on both sides, so a raw byte copy is enough. */
    memcpy(PyUnicode_DATA(duplicate), PyUnicode_DATA(unicode),
           nchars * PyUnicode_KIND(unicode));
    assert(_PyUnicode_CheckConsistency(duplicate, 1));
    return duplicate;
}
2393
2394
2395
/* Widen Unicode objects to larger buffers. Don't write terminating null
2396
   character. Return NULL on error. */
2397
2398
static void*
2399
unicode_askind(int skind, void const *data, Py_ssize_t len, int kind)
2400
14.7M
{
2401
14.7M
    void *result;
2402
2403
14.7M
    assert(skind < kind);
2404
14.7M
    switch (kind) {
2405
13.3M
    case PyUnicode_2BYTE_KIND:
2406
13.3M
        result = PyMem_New(Py_UCS2, len);
2407
13.3M
        if (!result)
2408
0
            return PyErr_NoMemory();
2409
13.3M
        assert(skind == PyUnicode_1BYTE_KIND);
2410
13.3M
        _PyUnicode_CONVERT_BYTES(
2411
13.3M
            Py_UCS1, Py_UCS2,
2412
13.3M
            (const Py_UCS1 *)data,
2413
13.3M
            ((const Py_UCS1 *)data) + len,
2414
13.3M
            result);
2415
13.3M
        return result;
2416
1.36M
    case PyUnicode_4BYTE_KIND:
2417
1.36M
        result = PyMem_New(Py_UCS4, len);
2418
1.36M
        if (!result)
2419
0
            return PyErr_NoMemory();
2420
1.36M
        if (skind == PyUnicode_2BYTE_KIND) {
2421
0
            _PyUnicode_CONVERT_BYTES(
2422
0
                Py_UCS2, Py_UCS4,
2423
0
                (const Py_UCS2 *)data,
2424
0
                ((const Py_UCS2 *)data) + len,
2425
0
                result);
2426
0
        }
2427
1.36M
        else {
2428
1.36M
            assert(skind == PyUnicode_1BYTE_KIND);
2429
1.36M
            _PyUnicode_CONVERT_BYTES(
2430
1.36M
                Py_UCS1, Py_UCS4,
2431
1.36M
                (const Py_UCS1 *)data,
2432
1.36M
                ((const Py_UCS1 *)data) + len,
2433
1.36M
                result);
2434
1.36M
        }
2435
1.36M
        return result;
2436
0
    default:
2437
0
        Py_UNREACHABLE();
2438
0
        return NULL;
2439
14.7M
    }
2440
14.7M
}
2441
2442
static Py_UCS4*
2443
as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2444
        int copy_null)
2445
78.4k
{
2446
78.4k
    int kind;
2447
78.4k
    const void *data;
2448
78.4k
    Py_ssize_t len, targetlen;
2449
78.4k
    kind = PyUnicode_KIND(string);
2450
78.4k
    data = PyUnicode_DATA(string);
2451
78.4k
    len = PyUnicode_GET_LENGTH(string);
2452
78.4k
    targetlen = len;
2453
78.4k
    if (copy_null)
2454
0
        targetlen++;
2455
78.4k
    if (!target) {
2456
0
        target = PyMem_New(Py_UCS4, targetlen);
2457
0
        if (!target) {
2458
0
            PyErr_NoMemory();
2459
0
            return NULL;
2460
0
        }
2461
0
    }
2462
78.4k
    else {
2463
78.4k
        if (targetsize < targetlen) {
2464
0
            PyErr_Format(PyExc_SystemError,
2465
0
                         "string is longer than the buffer");
2466
0
            if (copy_null && 0 < targetsize)
2467
0
                target[0] = 0;
2468
0
            return NULL;
2469
0
        }
2470
78.4k
    }
2471
78.4k
    if (kind == PyUnicode_1BYTE_KIND) {
2472
60.0k
        const Py_UCS1 *start = (const Py_UCS1 *) data;
2473
60.0k
        _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2474
60.0k
    }
2475
18.3k
    else if (kind == PyUnicode_2BYTE_KIND) {
2476
13.8k
        const Py_UCS2 *start = (const Py_UCS2 *) data;
2477
13.8k
        _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2478
13.8k
    }
2479
4.53k
    else if (kind == PyUnicode_4BYTE_KIND) {
2480
4.53k
        memcpy(target, data, len * sizeof(Py_UCS4));
2481
4.53k
    }
2482
0
    else {
2483
0
        Py_UNREACHABLE();
2484
0
    }
2485
78.4k
    if (copy_null)
2486
0
        target[len] = 0;
2487
78.4k
    return target;
2488
78.4k
}
2489
2490
Py_UCS4*
2491
PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2492
                 int copy_null)
2493
78.4k
{
2494
78.4k
    if (target == NULL || targetsize < 0) {
2495
0
        PyErr_BadInternalCall();
2496
0
        return NULL;
2497
0
    }
2498
78.4k
    return as_ucs4(string, target, targetsize, copy_null);
2499
78.4k
}
2500
2501
Py_UCS4*
2502
PyUnicode_AsUCS4Copy(PyObject *string)
2503
0
{
2504
0
    return as_ucs4(string, NULL, 0, 1);
2505
0
}
2506
2507
/* maximum number of characters required for output of %jo or %jd or %p.
2508
   We need at most ceil(log8(256)*sizeof(intmax_t)) digits,
2509
   plus 1 for the sign, plus 2 for the 0x prefix (for %p),
2510
   plus 1 for the terminal NUL. */
2511
#define MAX_INTMAX_CHARS (5 + (sizeof(intmax_t)*8-1) / 3)
2512
2513
static int
2514
unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2515
                             Py_ssize_t width, Py_ssize_t precision, int flags)
2516
28.7k
{
2517
28.7k
    Py_ssize_t length, fill, arglen;
2518
28.7k
    Py_UCS4 maxchar;
2519
2520
28.7k
    length = PyUnicode_GET_LENGTH(str);
2521
28.7k
    if ((precision == -1 || precision >= length)
2522
28.7k
        && width <= length)
2523
28.7k
        return _PyUnicodeWriter_WriteStr(writer, str);
2524
2525
44
    if (precision != -1)
2526
44
        length = Py_MIN(precision, length);
2527
2528
44
    arglen = Py_MAX(length, width);
2529
44
    if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2530
22
        maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2531
22
    else
2532
22
        maxchar = writer->maxchar;
2533
2534
44
    if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2535
0
        return -1;
2536
2537
44
    fill = Py_MAX(width - length, 0);
2538
44
    if (fill && !(flags & F_LJUST)) {
2539
0
        if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2540
0
            return -1;
2541
0
        writer->pos += fill;
2542
0
    }
2543
2544
44
    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2545
44
                                  str, 0, length);
2546
44
    writer->pos += length;
2547
2548
44
    if (fill && (flags & F_LJUST)) {
2549
0
        if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2550
0
            return -1;
2551
0
        writer->pos += fill;
2552
0
    }
2553
2554
44
    return 0;
2555
44
}
2556
2557
static int
2558
unicode_fromformat_write_utf8(_PyUnicodeWriter *writer, const char *str,
2559
                              Py_ssize_t width, Py_ssize_t precision, int flags)
2560
4.49M
{
2561
    /* UTF-8 */
2562
4.49M
    Py_ssize_t *pconsumed = NULL;
2563
4.49M
    Py_ssize_t length;
2564
4.49M
    if (precision == -1) {
2565
242k
        length = strlen(str);
2566
242k
    }
2567
4.24M
    else {
2568
4.24M
        length = 0;
2569
17.5M
        while (length < precision && str[length]) {
2570
13.3M
            length++;
2571
13.3M
        }
2572
4.24M
        if (length == precision) {
2573
            /* The input string is not NUL-terminated.  If it ends with an
2574
             * incomplete UTF-8 sequence, truncate the string just before it.
2575
             * Incomplete sequences in the middle and sequences which cannot
2576
             * be valid prefixes are still treated as errors and replaced
2577
             * with \xfffd. */
2578
1.75k
            pconsumed = &length;
2579
1.75k
        }
2580
4.24M
    }
2581
2582
4.49M
    if (width < 0) {
2583
4.49M
        return _PyUnicode_DecodeUTF8Writer(writer, str, length,
2584
4.49M
                                           _Py_ERROR_REPLACE, "replace", pconsumed);
2585
4.49M
    }
2586
2587
0
    PyObject *unicode = PyUnicode_DecodeUTF8Stateful(str, length,
2588
0
                                                     "replace", pconsumed);
2589
0
    if (unicode == NULL)
2590
0
        return -1;
2591
2592
0
    int res = unicode_fromformat_write_str(writer, unicode,
2593
0
                                           width, -1, flags);
2594
0
    Py_DECREF(unicode);
2595
0
    return res;
2596
0
}
2597
2598
static int
2599
unicode_fromformat_write_wcstr(_PyUnicodeWriter *writer, const wchar_t *str,
2600
                              Py_ssize_t width, Py_ssize_t precision, int flags)
2601
0
{
2602
0
    Py_ssize_t length;
2603
0
    if (precision == -1) {
2604
0
        length = wcslen(str);
2605
0
    }
2606
0
    else {
2607
0
        length = 0;
2608
0
        while (length < precision && str[length]) {
2609
0
            length++;
2610
0
        }
2611
0
    }
2612
2613
0
    if (width < 0) {
2614
0
        return PyUnicodeWriter_WriteWideChar((PyUnicodeWriter*)writer,
2615
0
                                             str, length);
2616
0
    }
2617
2618
0
    PyObject *unicode = PyUnicode_FromWideChar(str, length);
2619
0
    if (unicode == NULL)
2620
0
        return -1;
2621
2622
0
    int res = unicode_fromformat_write_str(writer, unicode, width, -1, flags);
2623
0
    Py_DECREF(unicode);
2624
0
    return res;
2625
0
}
2626
2627
0
#define F_LONG 1
2628
0
#define F_LONGLONG 2
2629
90.9k
#define F_SIZE 3
2630
0
#define F_PTRDIFF 4
2631
0
#define F_INTMAX 5
2632
2633
/* Process a single '%' directive of a PyUnicode_FromFormat()-style format.

   `f` must point at the '%' that starts the directive.  The directive is
   parsed as: flags ('-', '0', '#'), width (digits or '*'), optional
   '.' precision (digits or '*'), optional size modifier ("l", "ll", "z",
   "t", "j") and a conversion character.  Arguments are pulled from
   `*vargs` with va_arg() in the order the directive requires them, and the
   formatted text is appended to `writer`.

   Returns a pointer to the first character after the directive, or NULL on
   failure.  The failure paths in this function itself set ValueError,
   OverflowError, TypeError or SystemError; when a helper call fails the
   function just returns NULL (presumably the helper has set the
   exception -- confirm against the helpers). */
static const char*
unicode_fromformat_arg(_PyUnicodeWriter *writer,
                       const char *f, va_list *vargs)
{
    /* Start of the directive, kept for the "invalid format string" error. */
    const char *p;
    Py_ssize_t len;
    int flags = 0;
    Py_ssize_t width;
    Py_ssize_t precision;

    p = f;
    f++;
    /* "%%" produces a literal percent sign and consumes no argument. */
    if (*f == '%') {
        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
            return NULL;
        f++;
        return f;
    }

    /* Parse flags. Example: "%-i" => flags=F_LJUST. */
    /* Flags '+', ' ' and '#' are not particularly useful.
     * They are not worth the implementation and maintenance costs.
     * In addition, '#' should add "0" for "o" conversions for compatibility
     * with printf, but it would confuse Python users. */
    while (1) {
        switch (*f++) {
        case '-': flags |= F_LJUST; continue;
        case '0': flags |= F_ZERO; continue;
        case '#': flags |= F_ALT; continue;
        }
        /* Not a flag character: undo the increment done by the switch and
           leave the loop. */
        f--;
        break;
    }

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    /* width == -1 means "no width given". */
    width = -1;
    if (*f == '*') {
        /* Width supplied as an int argument; a negative value selects
           left justification with the absolute value as the width. */
        width = va_arg(*vargs, int);
        if (width < 0) {
            flags |= F_LJUST;
            width = -width;
        }
        f++;
    }
    else if (Py_ISDIGIT((unsigned)*f)) {
        width = *f - '0';
        f++;
        while (Py_ISDIGIT((unsigned)*f)) {
            /* Reject the next digit if width*10 + digit would exceed
               PY_SSIZE_T_MAX. */
            if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                PyErr_SetString(PyExc_ValueError,
                                "width too big");
                return NULL;
            }
            width = (width * 10) + (*f - '0');
            f++;
        }
    }
    /* precision == -1 means "no precision given". */
    precision = -1;
    if (*f == '.') {
        f++;
        if (*f == '*') {
            precision = va_arg(*vargs, int);
            /* Any negative explicit precision is normalized to -2, which
               keeps it distinct from the -1 "not given" default. */
            if (precision < 0) {
                precision = -2;
            }
            f++;
        }
        else if (Py_ISDIGIT((unsigned)*f)) {
            precision = (*f - '0');
            f++;
            while (Py_ISDIGIT((unsigned)*f)) {
                /* Same overflow guard as for the width. */
                if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                    PyErr_SetString(PyExc_ValueError,
                                    "precision too big");
                    return NULL;
                }
                precision = (precision * 10) + (*f - '0');
                f++;
            }
        }
    }

    /* Optional size modifier; 0 means none. */
    int sizemod = 0;
    if (*f == 'l') {
        if (f[1] == 'l') {
            sizemod = F_LONGLONG;
            f += 2;
        }
        else {
            sizemod = F_LONG;
            ++f;
        }
    }
    else if (*f == 'z') {
        sizemod = F_SIZE;
        ++f;
    }
    else if (*f == 't') {
        sizemod = F_PTRDIFF;
        ++f;
    }
    else if (*f == 'j') {
        sizemod = F_INTMAX;
        ++f;
    }
    /* The conversion character is the last character of the format string:
       turn off over-allocation in the writer. */
    if (f[0] != '\0' && f[1] == '\0')
        writer->overallocate = 0;

    /* First pass over the conversion character: only validates which
       modifiers are allowed for it.  `goto invalid_format` jumps to the
       label inside the default case of the second switch below. */
    switch (*f) {
    case 'd': case 'i': case 'o': case 'u': case 'x': case 'X':
        break;
    case 'c': case 'p':
        /* No size modifier, width or precision for %c and %p. */
        if (sizemod || width >= 0 || precision >= 0) goto invalid_format;
        break;
    case 's':
    case 'V':
        /* Only "l" (wide-character string) is accepted here. */
        if (sizemod && sizemod != F_LONG) goto invalid_format;
        break;
    default:
        if (sizemod) goto invalid_format;
        break;
    }

    /* Second pass: fetch the argument(s) and write the output. */
    switch (*f) {
    case 'c':
    {
        int ordinal = va_arg(*vargs, int);
        if (ordinal < 0 || ordinal > MAX_UNICODE) {
            PyErr_SetString(PyExc_OverflowError,
                            "character argument not in range(0x110000)");
            return NULL;
        }
        if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
            return NULL;
        break;
    }

    case 'd': case 'i':
    case 'o': case 'u': case 'x': case 'X':
    {
        char buffer[MAX_INTMAX_CHARS];

        // Fill buffer using sprintf, with one of many possible format
        // strings, like "%llX" for `long long` in hexadecimal.
        // The type/size is in `sizemod`; the format is in `*f`.

        // Use macros with nested switches to keep the sprintf format strings
        // as compile-time literals, avoiding warnings and maybe allowing
        // optimizations.

        // `SPRINT` macro does one sprintf
        // Example usage: SPRINT("l", "X", unsigned long) expands to
        // sprintf(buffer, "%" "l" "X", va_arg(*vargs, unsigned long))
        #define SPRINT(SIZE_SPEC, FMT_CHAR, TYPE) \
            sprintf(buffer, "%" SIZE_SPEC FMT_CHAR, va_arg(*vargs, TYPE))

        // One inner switch to handle all format variants
        #define DO_SPRINTS(SIZE_SPEC, SIGNED_TYPE, UNSIGNED_TYPE)             \
            switch (*f) {                                                     \
                case 'o': len = SPRINT(SIZE_SPEC, "o", UNSIGNED_TYPE); break; \
                case 'u': len = SPRINT(SIZE_SPEC, "u", UNSIGNED_TYPE); break; \
                case 'x': len = SPRINT(SIZE_SPEC, "x", UNSIGNED_TYPE); break; \
                case 'X': len = SPRINT(SIZE_SPEC, "X", UNSIGNED_TYPE); break; \
                default:  len = SPRINT(SIZE_SPEC, "d", SIGNED_TYPE); break;   \
            }

        // Outer switch to handle all the sizes/types
        switch (sizemod) {
            case F_LONG:     DO_SPRINTS("l", long, unsigned long); break;
            case F_LONGLONG: DO_SPRINTS("ll", long long, unsigned long long); break;
            case F_SIZE:     DO_SPRINTS("z", Py_ssize_t, size_t); break;
            case F_PTRDIFF:  DO_SPRINTS("t", ptrdiff_t, ptrdiff_t); break;
            case F_INTMAX:   DO_SPRINTS("j", intmax_t, uintmax_t); break;
            default:         DO_SPRINTS("", int, unsigned int); break;
        }
        #undef SPRINT
        #undef DO_SPRINTS

        assert(len >= 0);

        /* From here on `len` counts the digits only; the '-' sign is
           written separately. */
        int sign = (buffer[0] == '-');
        len -= sign;

        /* precision becomes the number of digit positions (at least len);
           width becomes the total field size (at least digits + sign). */
        precision = Py_MAX(precision, len);
        width = Py_MAX(width, precision + sign);
        /* With the '0' flag (and no left justification) the whole field
           after the sign is filled with zeros, so no space padding is left. */
        if ((flags & F_ZERO) && !(flags & F_LJUST)) {
            precision = width - sign;
        }

        Py_ssize_t spacepad = Py_MAX(width - precision - sign, 0);
        Py_ssize_t zeropad = Py_MAX(precision - len, 0);

        /* Make room for `width` characters before filling the buffer
           directly below (127: presumably the maximum code point of the
           output, i.e. ASCII -- confirm against _PyUnicodeWriter_Prepare). */
        if (_PyUnicodeWriter_Prepare(writer, width, 127) == -1)
            return NULL;

        /* Right justification: spaces go before the sign and digits. */
        if (spacepad && !(flags & F_LJUST)) {
            if (PyUnicode_Fill(writer->buffer, writer->pos, spacepad, ' ') == -1)
                return NULL;
            writer->pos += spacepad;
        }

        if (sign) {
            if (_PyUnicodeWriter_WriteChar(writer, '-') == -1)
                return NULL;
        }

        if (zeropad) {
            if (PyUnicode_Fill(writer->buffer, writer->pos, zeropad, '0') == -1)
                return NULL;
            writer->pos += zeropad;
        }

        /* The digits, skipping the sign character if there is one. */
        if (_PyUnicodeWriter_WriteASCIIString(writer, &buffer[sign], len) < 0)
            return NULL;

        /* Left justification: spaces go after the digits. */
        if (spacepad && (flags & F_LJUST)) {
            if (PyUnicode_Fill(writer->buffer, writer->pos, spacepad, ' ') == -1)
                return NULL;
            writer->pos += spacepad;
        }
        break;
    }

    case 'p':
    {
        char number[MAX_INTMAX_CHARS];

        len = sprintf(number, "%p", va_arg(*vargs, void*));
        assert(len >= 0);

        /* %p is ill-defined:  ensure leading 0x. */
        if (number[1] == 'X')
            number[1] = 'x';
        else if (number[1] != 'x') {
            /* Shift the text (including its NUL terminator) two characters
               to the right and insert "0x" in front.
               NOTE(review): relies on MAX_INTMAX_CHARS leaving room for the
               two extra characters -- confirm. */
            memmove(number + 2, number,
                    strlen(number) + 1);
            number[0] = '0';
            number[1] = 'x';
            len += 2;
        }

        if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
            return NULL;
        break;
    }

    case 's':
    {
        if (sizemod) {
            /* "%ls": wide-character string (only F_LONG passes the
               validation switch above). */
            const wchar_t *s = va_arg(*vargs, const wchar_t*);
            if (unicode_fromformat_write_wcstr(writer, s, width, precision, flags) < 0)
                return NULL;
        }
        else {
            /* UTF-8 */
            const char *s = va_arg(*vargs, const char*);
            if (unicode_fromformat_write_utf8(writer, s, width, precision, flags) < 0)
                return NULL;
        }
        break;
    }

    case 'U':
    {
        /* A str object, written as is. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        assert(obj && _PyUnicode_CHECK(obj));

        if (unicode_fromformat_write_str(writer, obj, width, precision, flags) == -1)
            return NULL;
        break;
    }

    case 'V':
    {
        /* Two arguments are always consumed: a str object (may be NULL) and
           a fallback C string (wide with the "l" modifier), which is used
           only when the object is NULL. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        const char *str;
        const wchar_t *wstr;
        if (sizemod) {
            wstr = va_arg(*vargs, const wchar_t*);
        }
        else {
            str = va_arg(*vargs, const char *);
        }
        if (obj) {
            assert(_PyUnicode_CHECK(obj));
            if (unicode_fromformat_write_str(writer, obj, width, precision, flags) == -1)
                return NULL;
        }
        else if (sizemod) {
            assert(wstr != NULL);
            if (unicode_fromformat_write_wcstr(writer, wstr, width, precision, flags) < 0)
                return NULL;
        }
        else {
            assert(str != NULL);
            if (unicode_fromformat_write_utf8(writer, str, width, precision, flags) < 0)
                return NULL;
        }
        break;
    }

    case 'S':
    {
        /* Writes PyObject_Str(obj). */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *str;
        assert(obj);
        str = PyObject_Str(obj);
        if (!str)
            return NULL;
        if (unicode_fromformat_write_str(writer, str, width, precision, flags) == -1) {
            Py_DECREF(str);
            return NULL;
        }
        Py_DECREF(str);
        break;
    }

    case 'R':
    {
        /* Writes PyObject_Repr(obj). */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *repr;
        assert(obj);
        repr = PyObject_Repr(obj);
        if (!repr)
            return NULL;
        if (unicode_fromformat_write_str(writer, repr, width, precision, flags) == -1) {
            Py_DECREF(repr);
            return NULL;
        }
        Py_DECREF(repr);
        break;
    }

    case 'A':
    {
        /* Writes PyObject_ASCII(obj). */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *ascii;
        assert(obj);
        ascii = PyObject_ASCII(obj);
        if (!ascii)
            return NULL;
        if (unicode_fromformat_write_str(writer, ascii, width, precision, flags) == -1) {
            Py_DECREF(ascii);
            return NULL;
        }
        Py_DECREF(ascii);
        break;
    }

    case 'T':
    {
        /* Writes the fully qualified name of the type of the object
           argument.  A strong reference to the type is held while the name
           is computed. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyTypeObject *type = (PyTypeObject *)Py_NewRef(Py_TYPE(obj));

        PyObject *type_name;
        if (flags & F_ALT) {
            /* '#' flag: the private variant is called with ':' as its
               second argument. */
            type_name = _PyType_GetFullyQualifiedName(type, ':');
        }
        else {
            type_name = PyType_GetFullyQualifiedName(type);
        }
        Py_DECREF(type);
        if (!type_name) {
            return NULL;
        }

        if (unicode_fromformat_write_str(writer, type_name,
                                         width, precision, flags) == -1) {
            Py_DECREF(type_name);
            return NULL;
        }
        Py_DECREF(type_name);
        break;
    }

    case 'N':
    {
        /* Like 'T', but the argument is the type itself rather than an
           instance; anything that is not a type raises TypeError. */
        PyObject *type_raw = va_arg(*vargs, PyObject *);
        assert(type_raw != NULL);

        if (!PyType_Check(type_raw)) {
            PyErr_SetString(PyExc_TypeError, "%N argument must be a type");
            return NULL;
        }
        PyTypeObject *type = (PyTypeObject*)type_raw;

        PyObject *type_name;
        if (flags & F_ALT) {
            type_name = _PyType_GetFullyQualifiedName(type, ':');
        }
        else {
            type_name = PyType_GetFullyQualifiedName(type);
        }
        if (!type_name) {
            return NULL;
        }
        if (unicode_fromformat_write_str(writer, type_name,
                                         width, precision, flags) == -1) {
            Py_DECREF(type_name);
            return NULL;
        }
        Py_DECREF(type_name);
        break;
    }

    default:
    invalid_format:
        /* Unknown conversion character, or a modifier combination rejected
           by the validation switch.  `p` is the start of the directive. */
        PyErr_Format(PyExc_SystemError, "invalid format string: %s", p);
        return NULL;
    }

    /* Step over the conversion character. */
    f++;
    return f;
}
3047
3048
static int
3049
unicode_from_format(_PyUnicodeWriter *writer, const char *format, va_list vargs)
3050
10.8M
{
3051
10.8M
    Py_ssize_t len = strlen(format);
3052
10.8M
    writer->min_length += len + 100;
3053
10.8M
    writer->overallocate = 1;
3054
3055
    // Copy varags to be able to pass a reference to a subfunction.
3056
10.8M
    va_list vargs2;
3057
10.8M
    va_copy(vargs2, vargs);
3058
3059
    // _PyUnicodeWriter_WriteASCIIString() below requires the format string
3060
    // to be encoded to ASCII.
3061
10.8M
    int is_ascii = (ucs1lib_find_max_char((Py_UCS1*)format, (Py_UCS1*)format + len) < 128);
3062
10.8M
    if (!is_ascii) {
3063
0
        Py_ssize_t i;
3064
0
        for (i=0; i < len && (unsigned char)format[i] <= 127; i++);
3065
0
        PyErr_Format(PyExc_ValueError,
3066
0
            "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3067
0
            "string, got a non-ASCII byte: 0x%02x",
3068
0
            (unsigned char)format[i]);
3069
0
        goto fail;
3070
0
    }
3071
3072
60.4M
    for (const char *f = format; *f; ) {
3073
49.6M
        if (*f == '%') {
3074
25.8M
            f = unicode_fromformat_arg(writer, f, &vargs2);
3075
25.8M
            if (f == NULL)
3076
0
                goto fail;
3077
25.8M
        }
3078
23.8M
        else {
3079
23.8M
            const char *p = strchr(f, '%');
3080
23.8M
            if (p != NULL) {
3081
17.2M
                len = p - f;
3082
17.2M
            }
3083
6.52M
            else {
3084
6.52M
                len = strlen(f);
3085
6.52M
                writer->overallocate = 0;
3086
6.52M
            }
3087
3088
23.8M
            if (_PyUnicodeWriter_WriteASCIIString(writer, f, len) < 0) {
3089
0
                goto fail;
3090
0
            }
3091
23.8M
            f += len;
3092
23.8M
        }
3093
49.6M
    }
3094
10.8M
    va_end(vargs2);
3095
10.8M
    return 0;
3096
3097
0
  fail:
3098
0
    va_end(vargs2);
3099
0
    return -1;
3100
10.8M
}
3101
3102
PyObject *
3103
PyUnicode_FromFormatV(const char *format, va_list vargs)
3104
10.8M
{
3105
10.8M
    _PyUnicodeWriter writer;
3106
10.8M
    _PyUnicodeWriter_Init(&writer);
3107
3108
10.8M
    if (unicode_from_format(&writer, format, vargs) < 0) {
3109
0
        _PyUnicodeWriter_Dealloc(&writer);
3110
0
        return NULL;
3111
0
    }
3112
10.8M
    return _PyUnicodeWriter_Finish(&writer);
3113
10.8M
}
3114
3115
/* Variadic front end of PyUnicode_FromFormatV(): collects the arguments
   into a va_list and forwards them.  Returns whatever
   PyUnicode_FromFormatV() returns. */
PyObject *
PyUnicode_FromFormat(const char *format, ...)
{
    va_list ap;

    va_start(ap, format);
    PyObject *result = PyUnicode_FromFormatV(format, ap);
    va_end(ap);

    return result;
}
3126
3127
int
3128
PyUnicodeWriter_Format(PyUnicodeWriter *writer, const char *format, ...)
3129
0
{
3130
0
    va_list vargs;
3131
0
    va_start(vargs, format);
3132
0
    int res = _PyUnicodeWriter_FormatV(writer, format, vargs);
3133
0
    va_end(vargs);
3134
0
    return res;
3135
0
}
3136
3137
int
3138
_PyUnicodeWriter_FormatV(PyUnicodeWriter *writer, const char *format,
3139
                         va_list vargs)
3140
0
{
3141
0
    _PyUnicodeWriter *_writer = (_PyUnicodeWriter*)writer;
3142
0
    Py_ssize_t old_pos = _writer->pos;
3143
3144
0
    int res = unicode_from_format(_writer, format, vargs);
3145
3146
0
    if (res < 0) {
3147
0
        _writer->pos = old_pos;
3148
0
    }
3149
0
    return res;
3150
0
}
3151
3152
static Py_ssize_t
3153
unicode_get_widechar_size(PyObject *unicode)
3154
7.62k
{
3155
7.62k
    Py_ssize_t res;
3156
3157
7.62k
    assert(unicode != NULL);
3158
7.62k
    assert(_PyUnicode_CHECK(unicode));
3159
3160
7.62k
    res = _PyUnicode_LENGTH(unicode);
3161
#if SIZEOF_WCHAR_T == 2
3162
    if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3163
        const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3164
        const Py_UCS4 *end = s + res;
3165
        for (; s < end; ++s) {
3166
            if (*s > 0xFFFF) {
3167
                ++res;
3168
            }
3169
        }
3170
    }
3171
#endif
3172
7.62k
    return res;
3173
7.62k
}
3174
3175
static void
3176
unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3177
7.62k
{
3178
7.62k
    assert(unicode != NULL);
3179
7.62k
    assert(_PyUnicode_CHECK(unicode));
3180
3181
7.62k
    if (PyUnicode_KIND(unicode) == sizeof(wchar_t)) {
3182
0
        memcpy(w, PyUnicode_DATA(unicode), size * sizeof(wchar_t));
3183
0
        return;
3184
0
    }
3185
3186
7.62k
    if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3187
7.62k
        const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3188
639k
        for (; size--; ++s, ++w) {
3189
631k
            *w = *s;
3190
631k
        }
3191
7.62k
    }
3192
0
    else {
3193
0
#if SIZEOF_WCHAR_T == 4
3194
0
        assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3195
0
        const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3196
0
        for (; size--; ++s, ++w) {
3197
0
            *w = *s;
3198
0
        }
3199
#else
3200
        assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3201
        const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3202
        for (; size--; ++s, ++w) {
3203
            Py_UCS4 ch = *s;
3204
            if (ch > 0xFFFF) {
3205
                assert(ch <= MAX_UNICODE);
3206
                /* encode surrogate pair in this case */
3207
                *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3208
                if (!size--)
3209
                    break;
3210
                *w = Py_UNICODE_LOW_SURROGATE(ch);
3211
            }
3212
            else {
3213
                *w = ch;
3214
            }
3215
        }
3216
#endif
3217
0
    }
3218
7.62k
}
3219
3220
#ifdef HAVE_WCHAR_H
3221
3222
/* Convert a Unicode object to a wide character string.
3223
3224
   - If w is NULL: return the number of wide characters (including the null
3225
     character) required to convert the unicode object. Ignore size argument.
3226
3227
   - Otherwise: return the number of wide characters (excluding the null
3228
     character) written into w. Write at most size wide characters (including
3229
     the null character). */
3230
Py_ssize_t
3231
PyUnicode_AsWideChar(PyObject *unicode,
3232
                     wchar_t *w,
3233
                     Py_ssize_t size)
3234
5.90k
{
3235
5.90k
    Py_ssize_t res;
3236
3237
5.90k
    if (unicode == NULL) {
3238
0
        PyErr_BadInternalCall();
3239
0
        return -1;
3240
0
    }
3241
5.90k
    if (!PyUnicode_Check(unicode)) {
3242
0
        PyErr_BadArgument();
3243
0
        return -1;
3244
0
    }
3245
3246
5.90k
    res = unicode_get_widechar_size(unicode);
3247
5.90k
    if (w == NULL) {
3248
0
        return res + 1;
3249
0
    }
3250
3251
5.90k
    if (size > res) {
3252
5.90k
        size = res + 1;
3253
5.90k
    }
3254
0
    else {
3255
0
        res = size;
3256
0
    }
3257
5.90k
    unicode_copy_as_widechar(unicode, w, size);
3258
3259
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3260
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
3261
       non-Unicode locales and hence needs conversion first. */
3262
    if (_Py_LocaleUsesNonUnicodeWchar()) {
3263
        if (_Py_EncodeNonUnicodeWchar_InPlace(w, size) < 0) {
3264
            return -1;
3265
        }
3266
    }
3267
#endif
3268
3269
5.90k
    return res;
3270
5.90k
}
3271
3272
wchar_t*
3273
PyUnicode_AsWideCharString(PyObject *unicode,
3274
                           Py_ssize_t *size)
3275
1.71k
{
3276
1.71k
    wchar_t *buffer;
3277
1.71k
    Py_ssize_t buflen;
3278
3279
1.71k
    if (unicode == NULL) {
3280
0
        PyErr_BadInternalCall();
3281
0
        return NULL;
3282
0
    }
3283
1.71k
    if (!PyUnicode_Check(unicode)) {
3284
0
        PyErr_BadArgument();
3285
0
        return NULL;
3286
0
    }
3287
3288
1.71k
    buflen = unicode_get_widechar_size(unicode);
3289
1.71k
    buffer = (wchar_t *) PyMem_New(wchar_t, (buflen + 1));
3290
1.71k
    if (buffer == NULL) {
3291
0
        PyErr_NoMemory();
3292
0
        return NULL;
3293
0
    }
3294
1.71k
    unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3295
3296
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3297
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
3298
       non-Unicode locales and hence needs conversion first. */
3299
    if (_Py_LocaleUsesNonUnicodeWchar()) {
3300
        if (_Py_EncodeNonUnicodeWchar_InPlace(buffer, (buflen + 1)) < 0) {
3301
            return NULL;
3302
        }
3303
    }
3304
#endif
3305
3306
1.71k
    if (size != NULL) {
3307
1.10k
        *size = buflen;
3308
1.10k
    }
3309
616
    else if (wcslen(buffer) != (size_t)buflen) {
3310
0
        PyMem_Free(buffer);
3311
0
        PyErr_SetString(PyExc_ValueError,
3312
0
                        "embedded null character");
3313
0
        return NULL;
3314
0
    }
3315
1.71k
    return buffer;
3316
1.71k
}
3317
3318
#endif /* HAVE_WCHAR_H */
3319
3320
int
3321
_PyUnicode_WideCharString_Converter(PyObject *obj, void *ptr)
3322
0
{
3323
0
    wchar_t **p = (wchar_t **)ptr;
3324
0
    if (obj == NULL) {
3325
0
        PyMem_Free(*p);
3326
0
        *p = NULL;
3327
0
        return 1;
3328
0
    }
3329
0
    if (PyUnicode_Check(obj)) {
3330
0
        *p = PyUnicode_AsWideCharString(obj, NULL);
3331
0
        if (*p == NULL) {
3332
0
            return 0;
3333
0
        }
3334
0
        return Py_CLEANUP_SUPPORTED;
3335
0
    }
3336
0
    PyErr_Format(PyExc_TypeError,
3337
0
                 "argument must be str, not %.50s",
3338
0
                 Py_TYPE(obj)->tp_name);
3339
0
    return 0;
3340
0
}
3341
3342
int
3343
_PyUnicode_WideCharString_Opt_Converter(PyObject *obj, void *ptr)
3344
0
{
3345
0
    wchar_t **p = (wchar_t **)ptr;
3346
0
    if (obj == NULL) {
3347
0
        PyMem_Free(*p);
3348
0
        *p = NULL;
3349
0
        return 1;
3350
0
    }
3351
0
    if (obj == Py_None) {
3352
0
        *p = NULL;
3353
0
        return 1;
3354
0
    }
3355
0
    if (PyUnicode_Check(obj)) {
3356
0
        *p = PyUnicode_AsWideCharString(obj, NULL);
3357
0
        if (*p == NULL) {
3358
0
            return 0;
3359
0
        }
3360
0
        return Py_CLEANUP_SUPPORTED;
3361
0
    }
3362
0
    PyErr_Format(PyExc_TypeError,
3363
0
                 "argument must be str or None, not %.50s",
3364
0
                 Py_TYPE(obj)->tp_name);
3365
0
    return 0;
3366
0
}
3367
3368
PyObject *
3369
PyUnicode_FromOrdinal(int ordinal)
3370
203k
{
3371
203k
    if (ordinal < 0 || ordinal > MAX_UNICODE) {
3372
0
        PyErr_SetString(PyExc_ValueError,
3373
0
                        "chr() arg not in range(0x110000)");
3374
0
        return NULL;
3375
0
    }
3376
3377
203k
    return unicode_char((Py_UCS4)ordinal);
3378
203k
}
3379
3380
/* Return `obj` as an exact str object.

   - An exact str instance is returned as a new reference to `obj`.
   - An instance of a str subtype is copied with _PyUnicode_Copy().
   - Any other object raises TypeError and yields NULL.

   XXX Perhaps this API should be an alias of PyObject_Str() instead ?! */
PyObject *
PyUnicode_FromObject(PyObject *obj)
{
    if (PyUnicode_CheckExact(obj)) {
        return Py_NewRef(obj);
    }

    if (!PyUnicode_Check(obj)) {
        PyErr_Format(PyExc_TypeError,
                     "Can't convert '%.100s' object to str implicitly",
                     Py_TYPE(obj)->tp_name);
        return NULL;
    }

    /* A Unicode subtype instance: hand back a true Unicode object holding
       the same data. */
    return _PyUnicode_Copy(obj);
}
3398
3399
/* Decode the bytes held by `obj` into a str object.

   `obj` may be a bytes object or any object exporting a simple buffer;
   str objects are rejected with TypeError.  `encoding` and `errors` are
   forwarded to PyUnicode_Decode().  For empty input the encoding and error
   handler names are still checked with unicode_check_encoding_errors()
   before the empty string is returned.

   Returns a new reference, or NULL on error. */
PyObject *
PyUnicode_FromEncodedObject(PyObject *obj,
                            const char *encoding,
                            const char *errors)
{
    if (obj == NULL) {
        PyErr_BadInternalCall();
        return NULL;
    }

    /* bytes is by far the most common input, so it gets a dedicated path
       that avoids the buffer protocol. */
    if (PyBytes_Check(obj)) {
        if (PyBytes_GET_SIZE(obj) != 0) {
            return PyUnicode_Decode(
                    PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
                    encoding, errors);
        }
        if (unicode_check_encoding_errors(encoding, errors) < 0) {
            return NULL;
        }
        _Py_RETURN_UNICODE_EMPTY();
    }

    if (PyUnicode_Check(obj)) {
        PyErr_SetString(PyExc_TypeError,
                        "decoding str is not supported");
        return NULL;
    }

    /* Everything else must expose its bytes through the PEP 3118 buffer
       interface. */
    Py_buffer view;
    if (PyObject_GetBuffer(obj, &view, PyBUF_SIMPLE) < 0) {
        PyErr_Format(PyExc_TypeError,
                     "decoding to str: need a bytes-like object, %.80s found",
                     Py_TYPE(obj)->tp_name);
        return NULL;
    }

    if (view.len != 0) {
        PyObject *decoded = PyUnicode_Decode((char*) view.buf, view.len,
                                             encoding, errors);
        PyBuffer_Release(&view);
        return decoded;
    }

    /* Empty buffer: release it first, then validate the arguments. */
    PyBuffer_Release(&view);
    if (unicode_check_encoding_errors(encoding, errors) < 0) {
        return NULL;
    }
    _Py_RETURN_UNICODE_EMPTY();
}
3451
3452
/* Normalize an encoding name like encodings.normalize_encoding(), and
   additionally convert it to lowercase if *to_lower* is true.

   Alphanumeric characters and '.' are copied to *lower*.  Every run of
   other characters between two copied characters is replaced by a single
   '_'; such runs at the start or at the end of the name are dropped.
   The result is always null-terminated on success.

   encoding:  null-terminated input name (must not be NULL).
   lower:     output buffer.
   lower_len: size of the output buffer in bytes, including the room for
              the terminating null character.
   to_lower:  if true, copied characters go through Py_TOLOWER().

   Return 1 on success, or 0 on error (the normalized name does not fit in
   lower_len-1 characters, or lower_len is 0). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len,
                       int to_lower)
{
    const char *e;
    char *l;
    char *l_end;
    int punct;

    assert(encoding != NULL);

    /* An empty buffer cannot even hold the terminator.  Bail out before
       computing lower_len - 1, which would wrap around for a size_t and
       make every bounds check below useless. */
    if (lower_len == 0) {
        return 0;
    }

    e = encoding;
    l = lower;
    /* Last usable slot: reserved for the terminating null character. */
    l_end = &lower[lower_len - 1];
    /* Set while skipping a run of characters that are not copied. */
    punct = 0;
    while (1) {
        char c = *e;
        if (c == 0) {
            break;
        }

        if (Py_ISALNUM(c) || c == '.') {
            /* Emit one '_' for the skipped run, but only if something has
               already been written (no leading underscore). */
            if (punct && l != lower) {
                if (l == l_end) {
                    return 0;
                }
                *l++ = '_';
            }
            punct = 0;

            if (l == l_end) {
                return 0;
            }
            *l++ = to_lower ? Py_TOLOWER(c) : c;
        }
        else {
            punct = 1;
        }

        e++;
    }
    *l = '\0';
    return 1;
}
3501
3502
PyObject *
3503
PyUnicode_Decode(const char *s,
3504
                 Py_ssize_t size,
3505
                 const char *encoding,
3506
                 const char *errors)
3507
5.47M
{
3508
5.47M
    PyObject *buffer = NULL, *unicode;
3509
5.47M
    Py_buffer info;
3510
5.47M
    char buflower[11];   /* strlen("iso-8859-1\0") == 11, longest shortcut */
3511
3512
5.47M
    if (unicode_check_encoding_errors(encoding, errors) < 0) {
3513
0
        return NULL;
3514
0
    }
3515
3516
5.47M
    if (size == 0) {
3517
0
        _Py_RETURN_UNICODE_EMPTY();
3518
0
    }
3519
3520
5.47M
    if (encoding == NULL) {
3521
39.3k
        return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3522
39.3k
    }
3523
3524
    /* Shortcuts for common default encodings */
3525
5.43M
    if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower), 1)) {
3526
5.43M
        char *lower = buflower;
3527
3528
        /* Fast paths */
3529
5.43M
        if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3530
872k
            lower += 3;
3531
872k
            if (*lower == '_') {
3532
                /* Match "utf8" and "utf_8" */
3533
871k
                lower++;
3534
871k
            }
3535
3536
872k
            if (lower[0] == '8' && lower[1] == 0) {
3537
871k
                return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3538
871k
            }
3539
460
            else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3540
96
                return PyUnicode_DecodeUTF16(s, size, errors, 0);
3541
96
            }
3542
364
            else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3543
118
                return PyUnicode_DecodeUTF32(s, size, errors, 0);
3544
118
            }
3545
872k
        }
3546
4.56M
        else {
3547
4.56M
            if (strcmp(lower, "ascii") == 0
3548
4.18M
                || strcmp(lower, "us_ascii") == 0) {
3549
570k
                return PyUnicode_DecodeASCII(s, size, errors);
3550
570k
            }
3551
    #ifdef MS_WINDOWS
3552
            else if (strcmp(lower, "mbcs") == 0) {
3553
                return PyUnicode_DecodeMBCS(s, size, errors);
3554
            }
3555
    #endif
3556
3.99M
            else if (strcmp(lower, "latin1") == 0
3557
3.99M
                     || strcmp(lower, "latin_1") == 0
3558
332k
                     || strcmp(lower, "iso_8859_1") == 0
3559
3.68M
                     || strcmp(lower, "iso8859_1") == 0) {
3560
3.68M
                return PyUnicode_DecodeLatin1(s, size, errors);
3561
3.68M
            }
3562
4.56M
        }
3563
5.43M
    }
3564
3565
    /* Decode via the codec registry */
3566
311k
    buffer = NULL;
3567
311k
    if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
3568
0
        goto onError;
3569
311k
    buffer = PyMemoryView_FromBuffer(&info);
3570
311k
    if (buffer == NULL)
3571
0
        goto onError;
3572
311k
    unicode = _PyCodec_DecodeText(buffer, encoding, errors);
3573
311k
    if (unicode == NULL)
3574
79.1k
        goto onError;
3575
232k
    if (!PyUnicode_Check(unicode)) {
3576
0
        PyErr_Format(PyExc_TypeError,
3577
0
                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
3578
0
                     "use codecs.decode() to decode to arbitrary types",
3579
0
                     encoding,
3580
0
                     Py_TYPE(unicode)->tp_name);
3581
0
        Py_DECREF(unicode);
3582
0
        goto onError;
3583
0
    }
3584
232k
    Py_DECREF(buffer);
3585
232k
    return unicode_result(unicode);
3586
3587
79.1k
  onError:
3588
79.1k
    Py_XDECREF(buffer);
3589
79.1k
    return NULL;
3590
232k
}
3591
3592
/* Run `unicode` through the decoder registered for `encoding` and return
   whatever object the codec produces.  A NULL encoding selects the default
   encoding.  Raises via PyErr_BadArgument() if `unicode` is not a str. */
PyAPI_FUNC(PyObject *)
PyUnicode_AsDecodedObject(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    const char *codec_name =
        (encoding != NULL) ? encoding : PyUnicode_GetDefaultEncoding();

    /* Decode via the codec registry */
    return PyCodec_Decode(unicode, codec_name, errors);
}
3608
3609
/* Run `unicode` through the decoder registered for `encoding` and require
   the result to be a str.

   A NULL encoding selects the default encoding.  Returns the decoded str,
   or NULL with an exception set: a bad-argument error when `unicode` is not
   a str, whatever the codec raised, or TypeError when the codec returned a
   non-str object. */
PyAPI_FUNC(PyObject *)
PyUnicode_AsDecodedUnicode(PyObject *unicode,
                           const char *encoding,
                           const char *errors)
{
    PyObject *v;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (encoding == NULL)
        encoding = PyUnicode_GetDefaultEncoding();

    /* Decode via the codec registry */
    v = PyCodec_Decode(unicode, encoding, errors);
    if (v == NULL)
        return NULL;
    if (!PyUnicode_Check(v)) {
        /* Report the type of the decoder's *result* (v).  Reporting the
           input's type would always print 'str', since the input was
           already checked to be a str above. */
        PyErr_Format(PyExc_TypeError,
                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
                     "use codecs.decode() to decode to arbitrary types",
                     encoding,
                     Py_TYPE(v)->tp_name);
        Py_DECREF(v);
        return NULL;
    }
    return unicode_result(v);
}
3642
3643
/* Run `unicode` through the encoder registered for `encoding` and return
   whatever object the codec produces.  A NULL encoding selects the default
   encoding.  Returns NULL with an exception set on failure. */
PyAPI_FUNC(PyObject *)
PyUnicode_AsEncodedObject(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    const char *codec_name =
        (encoding != NULL) ? encoding : PyUnicode_GetDefaultEncoding();

    /* Encode via the codec registry; a NULL result is passed straight
       through to the caller. */
    return PyCodec_Encode(unicode, codec_name, errors);
}
3667
3668
3669
static PyObject *
3670
unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
3671
                      int current_locale)
3672
552
{
3673
552
    Py_ssize_t wlen;
3674
552
    wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3675
552
    if (wstr == NULL) {
3676
0
        return NULL;
3677
0
    }
3678
3679
552
    if ((size_t)wlen != wcslen(wstr)) {
3680
0
        PyErr_SetString(PyExc_ValueError, "embedded null character");
3681
0
        PyMem_Free(wstr);
3682
0
        return NULL;
3683
0
    }
3684
3685
552
    char *str;
3686
552
    size_t error_pos;
3687
552
    const char *reason;
3688
552
    int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
3689
552
                                 current_locale, error_handler);
3690
552
    PyMem_Free(wstr);
3691
3692
552
    if (res != 0) {
3693
0
        if (res == -2) {
3694
0
            PyObject *exc;
3695
0
            exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3696
0
                    "locale", unicode,
3697
0
                    (Py_ssize_t)error_pos,
3698
0
                    (Py_ssize_t)(error_pos+1),
3699
0
                    reason);
3700
0
            if (exc != NULL) {
3701
0
                PyCodec_StrictErrors(exc);
3702
0
                Py_DECREF(exc);
3703
0
            }
3704
0
        }
3705
0
        else if (res == -3) {
3706
0
            PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3707
0
        }
3708
0
        else {
3709
0
            PyErr_NoMemory();
3710
0
        }
3711
0
        return NULL;
3712
0
    }
3713
3714
552
    PyObject *bytes = PyBytes_FromString(str);
3715
552
    PyMem_RawFree(str);
3716
552
    return bytes;
3717
552
}
3718
3719
/* Encode `unicode` with the current locale encoding (current_locale=1),
   using the error handler named by `errors`. */
PyObject *
PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
{
    return unicode_encode_locale(unicode, _Py_GetErrorHandler(errors), 1);
}
3725
3726
/* Encode `unicode` with the interpreter's filesystem codec.

   Uses the UTF-8 encoder when the cached codec state says UTF-8, otherwise
   the named codec when one is recorded, otherwise a locale-based fallback
   driven by the configured filesystem error handler. */
PyObject *
PyUnicode_EncodeFSDefault(PyObject *unicode)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    struct _Py_unicode_fs_codec *codec = &interp->unicode.fs_codec;

    if (codec->utf8) {
        return unicode_encode_utf8(unicode,
                                   codec->error_handler,
                                   codec->errors);
    }
#ifndef _Py_FORCE_UTF8_FS_ENCODING
    if (codec->encoding) {
        return PyUnicode_AsEncodedString(unicode,
                                         codec->encoding,
                                         codec->errors);
    }
#endif

    /* Before _PyUnicode_InitEncodings() is called, the Python codec
       machinery is not ready and so cannot be used:
       use wcstombs() in this case. */
    const PyConfig *config = _PyInterpreterState_GetConfig(interp);
    const wchar_t *fs_errors = config->filesystem_errors;
    assert(fs_errors != NULL);
    _Py_error_handler handler = get_error_handler_wide(fs_errors);
    assert(handler != _Py_ERROR_UNKNOWN);
#ifdef _Py_FORCE_UTF8_FS_ENCODING
    return unicode_encode_utf8(unicode, handler, NULL);
#else
    return unicode_encode_locale(unicode, handler, 0);
#endif
}
3759
3760
/* Encode `unicode` to a bytes object with the codec named `encoding`.

   A NULL encoding selects UTF-8.  The names utf8/utf16/utf32, ascii, latin1
   (and mbcs on Windows) are routed directly to the built-in encoders; every
   other name goes through the codec registry.  A registry codec returning a
   bytearray triggers a RuntimeWarning and the result is copied into bytes;
   any other non-bytes result raises TypeError. */
PyObject *
PyUnicode_AsEncodedString(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    /* Room for the longest shortcut name, "iso_8859_1", plus its NUL. */
    char normalized[11];

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (unicode_check_encoding_errors(encoding, errors) < 0) {
        return NULL;
    }
    if (encoding == NULL) {
        return _PyUnicode_AsUTF8String(unicode, errors);
    }

    /* Fast paths: only tried when the normalized, lower-cased name fits in
       the small buffer. */
    if (_Py_normalize_encoding(encoding, normalized, sizeof(normalized), 1)) {
        if (strncmp(normalized, "utf", 3) == 0) {
            const char *suffix = normalized + 3;
            if (*suffix == '_') {
                /* Accept both "utf8" and "utf_8" spellings. */
                suffix++;
            }

            if (strcmp(suffix, "8") == 0) {
                return _PyUnicode_AsUTF8String(unicode, errors);
            }
            if (strcmp(suffix, "16") == 0) {
                return _PyUnicode_EncodeUTF16(unicode, errors, 0);
            }
            if (strcmp(suffix, "32") == 0) {
                return _PyUnicode_EncodeUTF32(unicode, errors, 0);
            }
            /* Any other "utf*" name falls through to the registry. */
        }
        else {
            if (strcmp(normalized, "ascii") == 0
                || strcmp(normalized, "us_ascii") == 0) {
                return _PyUnicode_AsASCIIString(unicode, errors);
            }
#ifdef MS_WINDOWS
            if (strcmp(normalized, "mbcs") == 0) {
                return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
            }
#endif
            if (strcmp(normalized, "latin1") == 0
                || strcmp(normalized, "latin_1") == 0
                || strcmp(normalized, "iso_8859_1") == 0
                || strcmp(normalized, "iso8859_1") == 0) {
                return _PyUnicode_AsLatin1String(unicode, errors);
            }
        }
    }

    /* Slow path: encode via the codec registry. */
    PyObject *encoded = _PyCodec_EncodeText(unicode, encoding, errors);
    if (encoded == NULL) {
        return NULL;
    }

    /* The normal case: the codec produced bytes. */
    if (PyBytes_Check(encoded)) {
        return encoded;
    }

    if (!PyByteArray_Check(encoded)) {
        PyErr_Format(PyExc_TypeError,
                     "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
                     "use codecs.encode() to encode to arbitrary types",
                     encoding,
                     Py_TYPE(encoded)->tp_name);
        Py_DECREF(encoded);
        return NULL;
    }

    /* The codec returned a bytearray: warn, then convert to bytes.  If the
       warning itself raised, give up and return NULL. */
    PyObject *as_bytes = NULL;
    if (PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
            "encoder %s returned bytearray instead of bytes; "
            "use codecs.encode() to encode to arbitrary types",
            encoding) == 0)
    {
        as_bytes = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(encoded),
                                             PyByteArray_GET_SIZE(encoded));
    }
    Py_DECREF(encoded);
    return as_bytes;
}
3859
3860
/* Run `unicode` through the encoder registered for `encoding` and require
   the result to be a str.  A NULL encoding selects the default encoding.
   Returns NULL with an exception set on failure; a codec that returns a
   non-str object raises TypeError. */
PyAPI_FUNC(PyObject *)
PyUnicode_AsEncodedUnicode(PyObject *unicode,
                           const char *encoding,
                           const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (encoding == NULL) {
        encoding = PyUnicode_GetDefaultEncoding();
    }

    /* Encode via the codec registry */
    PyObject *encoded = PyCodec_Encode(unicode, encoding, errors);
    if (encoded == NULL) {
        return NULL;
    }
    if (PyUnicode_Check(encoded)) {
        return encoded;
    }

    PyErr_Format(PyExc_TypeError,
                 "'%.400s' encoder returned '%.400s' instead of 'str'; "
                 "use codecs.encode() to encode to arbitrary types",
                 encoding,
                 Py_TYPE(encoded)->tp_name);
    Py_DECREF(encoded);
    return NULL;
}
3893
3894
static PyObject*
3895
unicode_decode_locale(const char *str, Py_ssize_t len,
3896
                      _Py_error_handler errors, int current_locale)
3897
17.9k
{
3898
17.9k
    if (str[len] != '\0' || (size_t)len != strlen(str))  {
3899
0
        PyErr_SetString(PyExc_ValueError, "embedded null byte");
3900
0
        return NULL;
3901
0
    }
3902
3903
17.9k
    wchar_t *wstr;
3904
17.9k
    size_t wlen;
3905
17.9k
    const char *reason;
3906
17.9k
    int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
3907
17.9k
                                 current_locale, errors);
3908
17.9k
    if (res != 0) {
3909
0
        if (res == -2) {
3910
0
            PyObject *exc;
3911
0
            exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3912
0
                                        "locale", str, len,
3913
0
                                        (Py_ssize_t)wlen,
3914
0
                                        (Py_ssize_t)(wlen + 1),
3915
0
                                        reason);
3916
0
            if (exc != NULL) {
3917
0
                PyCodec_StrictErrors(exc);
3918
0
                Py_DECREF(exc);
3919
0
            }
3920
0
        }
3921
0
        else if (res == -3) {
3922
0
            PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3923
0
        }
3924
0
        else {
3925
0
            PyErr_NoMemory();
3926
0
        }
3927
0
        return NULL;
3928
0
    }
3929
3930
17.9k
    PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3931
17.9k
    PyMem_RawFree(wstr);
3932
17.9k
    return unicode;
3933
17.9k
}
3934
3935
PyObject*
3936
PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3937
                              const char *errors)
3938
0
{
3939
0
    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3940
0
    return unicode_decode_locale(str, len, error_handler, 1);
3941
0
}
3942
3943
PyObject*
3944
PyUnicode_DecodeLocale(const char *str, const char *errors)
3945
10.9k
{
3946
10.9k
    Py_ssize_t size = (Py_ssize_t)strlen(str);
3947
10.9k
    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3948
10.9k
    return unicode_decode_locale(str, size, error_handler, 1);
3949
10.9k
}
3950
3951
3952
PyObject*
3953
0
PyUnicode_DecodeFSDefault(const char *s) {
3954
0
    Py_ssize_t size = (Py_ssize_t)strlen(s);
3955
0
    return PyUnicode_DecodeFSDefaultAndSize(s, size);
3956
0
}
3957
3958
PyObject*
3959
PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3960
9.24k
{
3961
9.24k
    PyInterpreterState *interp = _PyInterpreterState_GET();
3962
9.24k
    struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3963
9.24k
    if (fs_codec->utf8) {
3964
2.17k
        return unicode_decode_utf8(s, size,
3965
2.17k
                                   fs_codec->error_handler,
3966
2.17k
                                   fs_codec->errors,
3967
2.17k
                                   NULL);
3968
2.17k
    }
3969
7.06k
#ifndef _Py_FORCE_UTF8_FS_ENCODING
3970
7.06k
    else if (fs_codec->encoding) {
3971
0
        return PyUnicode_Decode(s, size,
3972
0
                                fs_codec->encoding,
3973
0
                                fs_codec->errors);
3974
0
    }
3975
7.06k
#endif
3976
7.06k
    else {
3977
        /* Before _PyUnicode_InitEncodings() is called, the Python codec
3978
           machinery is not ready and so cannot be used:
3979
           use mbstowcs() in this case. */
3980
7.06k
        const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3981
7.06k
        const wchar_t *filesystem_errors = config->filesystem_errors;
3982
7.06k
        assert(filesystem_errors != NULL);
3983
7.06k
        _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3984
7.06k
        assert(errors != _Py_ERROR_UNKNOWN);
3985
#ifdef _Py_FORCE_UTF8_FS_ENCODING
3986
        return unicode_decode_utf8(s, size, errors, NULL, NULL);
3987
#else
3988
7.06k
        return unicode_decode_locale(s, size, errors, 0);
3989
7.06k
#endif
3990
7.06k
    }
3991
9.24k
}
3992
3993
3994
int
3995
PyUnicode_FSConverter(PyObject* arg, void* addr)
3996
11.5k
{
3997
11.5k
    PyObject *path = NULL;
3998
11.5k
    PyObject *output = NULL;
3999
11.5k
    Py_ssize_t size;
4000
11.5k
    const char *data;
4001
11.5k
    if (arg == NULL) {
4002
0
        Py_DECREF(*(PyObject**)addr);
4003
0
        *(PyObject**)addr = NULL;
4004
0
        return 1;
4005
0
    }
4006
11.5k
    path = PyOS_FSPath(arg);
4007
11.5k
    if (path == NULL) {
4008
0
        return 0;
4009
0
    }
4010
11.5k
    if (PyBytes_Check(path)) {
4011
0
        output = path;
4012
0
    }
4013
11.5k
    else {  // PyOS_FSPath() guarantees its returned value is bytes or str.
4014
11.5k
        output = PyUnicode_EncodeFSDefault(path);
4015
11.5k
        Py_DECREF(path);
4016
11.5k
        if (!output) {
4017
0
            return 0;
4018
0
        }
4019
11.5k
        assert(PyBytes_Check(output));
4020
11.5k
    }
4021
4022
11.5k
    size = PyBytes_GET_SIZE(output);
4023
11.5k
    data = PyBytes_AS_STRING(output);
4024
11.5k
    if ((size_t)size != strlen(data)) {
4025
0
        PyErr_SetString(PyExc_ValueError, "embedded null byte");
4026
0
        Py_DECREF(output);
4027
0
        return 0;
4028
0
    }
4029
11.5k
    *(PyObject**)addr = output;
4030
11.5k
    return Py_CLEANUP_SUPPORTED;
4031
11.5k
}
4032
4033
4034
int
4035
PyUnicode_FSDecoder(PyObject* arg, void* addr)
4036
21.4k
{
4037
21.4k
    if (arg == NULL) {
4038
0
        Py_DECREF(*(PyObject**)addr);
4039
0
        *(PyObject**)addr = NULL;
4040
0
        return 1;
4041
0
    }
4042
4043
21.4k
    PyObject *path = PyOS_FSPath(arg);
4044
21.4k
    if (path == NULL) {
4045
0
        return 0;
4046
0
    }
4047
4048
21.4k
    PyObject *output = NULL;
4049
21.4k
    if (PyUnicode_Check(path)) {
4050
21.4k
        output = path;
4051
21.4k
    }
4052
0
    else if (PyBytes_Check(path)) {
4053
0
        output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path),
4054
0
                                                  PyBytes_GET_SIZE(path));
4055
0
        Py_DECREF(path);
4056
0
        if (!output) {
4057
0
            return 0;
4058
0
        }
4059
0
    }
4060
0
    else {
4061
0
        PyErr_Format(PyExc_TypeError,
4062
0
                     "path should be string, bytes, or os.PathLike, not %.200s",
4063
0
                     Py_TYPE(arg)->tp_name);
4064
0
        Py_DECREF(path);
4065
0
        return 0;
4066
0
    }
4067
4068
21.4k
    if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
4069
21.4k
                 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
4070
0
        PyErr_SetString(PyExc_ValueError, "embedded null character");
4071
0
        Py_DECREF(output);
4072
0
        return 0;
4073
0
    }
4074
21.4k
    *(PyObject**)addr = output;
4075
21.4k
    return Py_CLEANUP_SUPPORTED;
4076
21.4k
}
4077
4078
4079
static int unicode_fill_utf8(PyObject *unicode);
4080
4081
4082
static int
4083
unicode_ensure_utf8(PyObject *unicode)
4084
22.6M
{
4085
22.6M
    int err = 0;
4086
22.6M
    if (PyUnicode_UTF8(unicode) == NULL) {
4087
186k
        Py_BEGIN_CRITICAL_SECTION(unicode);
4088
186k
        if (PyUnicode_UTF8(unicode) == NULL) {
4089
186k
            err = unicode_fill_utf8(unicode);
4090
186k
        }
4091
186k
        Py_END_CRITICAL_SECTION();
4092
186k
    }
4093
22.6M
    return err;
4094
22.6M
}
4095
4096
const char *
4097
PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
4098
22.6M
{
4099
22.6M
    if (!PyUnicode_Check(unicode)) {
4100
0
        PyErr_BadArgument();
4101
0
        if (psize) {
4102
0
            *psize = -1;
4103
0
        }
4104
0
        return NULL;
4105
0
    }
4106
4107
22.6M
    if (unicode_ensure_utf8(unicode) == -1) {
4108
207
        if (psize) {
4109
207
            *psize = -1;
4110
207
        }
4111
207
        return NULL;
4112
207
    }
4113
4114
22.6M
    if (psize) {
4115
22.5M
        *psize = PyUnicode_UTF8_LENGTH(unicode);
4116
22.5M
    }
4117
22.6M
    return PyUnicode_UTF8(unicode);
4118
22.6M
}
4119
4120
const char *
4121
PyUnicode_AsUTF8(PyObject *unicode)
4122
67.5k
{
4123
67.5k
    return PyUnicode_AsUTF8AndSize(unicode, NULL);
4124
67.5k
}
4125
4126
const char *
4127
_PyUnicode_AsUTF8NoNUL(PyObject *unicode)
4128
822k
{
4129
822k
    Py_ssize_t size;
4130
822k
    const char *s = PyUnicode_AsUTF8AndSize(unicode, &size);
4131
822k
    if (s && strlen(s) != (size_t)size) {
4132
0
        PyErr_SetString(PyExc_ValueError, "embedded null character");
4133
0
        return NULL;
4134
0
    }
4135
822k
    return s;
4136
822k
}
4137
4138
/*
4139
PyUnicode_GetSize() has been deprecated since Python 3.3
4140
because it returned length of Py_UNICODE.
4141
4142
But this function is part of stable abi, because it doesn't
4143
include Py_UNICODE in signature and it was not excluded from
4144
stable ABI in PEP 384.
4145
*/
4146
PyAPI_FUNC(Py_ssize_t)
4147
PyUnicode_GetSize(PyObject *unicode)
4148
0
{
4149
0
    PyErr_SetString(PyExc_RuntimeError,
4150
0
                    "PyUnicode_GetSize has been removed.");
4151
0
    return -1;
4152
0
}
4153
4154
Py_ssize_t
4155
PyUnicode_GetLength(PyObject *unicode)
4156
19.2k
{
4157
19.2k
    if (!PyUnicode_Check(unicode)) {
4158
0
        PyErr_BadArgument();
4159
0
        return -1;
4160
0
    }
4161
19.2k
    return PyUnicode_GET_LENGTH(unicode);
4162
19.2k
}
4163
4164
Py_UCS4
4165
PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4166
21
{
4167
21
    const void *data;
4168
21
    int kind;
4169
4170
21
    if (!PyUnicode_Check(unicode)) {
4171
0
        PyErr_BadArgument();
4172
0
        return (Py_UCS4)-1;
4173
0
    }
4174
21
    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4175
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
4176
0
        return (Py_UCS4)-1;
4177
0
    }
4178
21
    data = PyUnicode_DATA(unicode);
4179
21
    kind = PyUnicode_KIND(unicode);
4180
21
    return PyUnicode_READ(kind, data, index);
4181
21
}
4182
4183
int
4184
PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4185
0
{
4186
0
    if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
4187
0
        PyErr_BadArgument();
4188
0
        return -1;
4189
0
    }
4190
0
    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4191
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
4192
0
        return -1;
4193
0
    }
4194
0
    if (unicode_check_modifiable(unicode))
4195
0
        return -1;
4196
0
    if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4197
0
        PyErr_SetString(PyExc_ValueError, "character out of range");
4198
0
        return -1;
4199
0
    }
4200
0
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4201
0
                    index, ch);
4202
0
    return 0;
4203
0
}
4204
4205
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";
    return default_encoding;
}
4210
4211
/* create or adjust a UnicodeDecodeError */
4212
static void
4213
make_decode_exception(PyObject **exceptionObject,
4214
                      const char *encoding,
4215
                      const char *input, Py_ssize_t length,
4216
                      Py_ssize_t startpos, Py_ssize_t endpos,
4217
                      const char *reason)
4218
314k
{
4219
314k
    if (*exceptionObject == NULL) {
4220
99.9k
        *exceptionObject = PyUnicodeDecodeError_Create(
4221
99.9k
            encoding, input, length, startpos, endpos, reason);
4222
99.9k
    }
4223
214k
    else {
4224
214k
        if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4225
0
            goto onError;
4226
214k
        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4227
0
            goto onError;
4228
214k
        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4229
0
            goto onError;
4230
214k
    }
4231
314k
    return;
4232
4233
314k
onError:
4234
0
    Py_CLEAR(*exceptionObject);
4235
0
}
4236
4237
#ifdef MS_WINDOWS
4238
static int
4239
widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4240
{
4241
    if (newsize > *size) {
4242
        wchar_t *newbuf = *buf;
4243
        if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4244
            PyErr_NoMemory();
4245
            return -1;
4246
        }
4247
        *buf = newbuf;
4248
    }
4249
    *size = newsize;
4250
    return 0;
4251
}
4252
4253
/* error handling callback helper:
4254
   build arguments, call the callback and check the arguments,
4255
   if no exception occurred, copy the replacement to the output
4256
   and adjust various state variables.
4257
   return 0 on success, -1 on error
4258
*/
4259
4260
static int
4261
unicode_decode_call_errorhandler_wchar(
4262
    const char *errors, PyObject **errorHandler,
4263
    const char *encoding, const char *reason,
4264
    const char **input, const char **inend, Py_ssize_t *startinpos,
4265
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4266
    wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
4267
{
4268
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4269
4270
    PyObject *restuple = NULL;
4271
    PyObject *repunicode = NULL;
4272
    Py_ssize_t outsize;
4273
    Py_ssize_t insize;
4274
    Py_ssize_t requiredsize;
4275
    Py_ssize_t newpos;
4276
    PyObject *inputobj = NULL;
4277
    Py_ssize_t repwlen;
4278
4279
    if (*errorHandler == NULL) {
4280
        *errorHandler = PyCodec_LookupError(errors);
4281
        if (*errorHandler == NULL)
4282
            goto onError;
4283
    }
4284
4285
    make_decode_exception(exceptionObject,
4286
        encoding,
4287
        *input, *inend - *input,
4288
        *startinpos, *endinpos,
4289
        reason);
4290
    if (*exceptionObject == NULL)
4291
        goto onError;
4292
4293
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
4294
    if (restuple == NULL)
4295
        goto onError;
4296
    if (!PyTuple_Check(restuple)) {
4297
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
4298
        goto onError;
4299
    }
4300
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4301
        goto onError;
4302
4303
    /* Copy back the bytes variables, which might have been modified by the
4304
       callback */
4305
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4306
    if (!inputobj)
4307
        goto onError;
4308
    *input = PyBytes_AS_STRING(inputobj);
4309
    insize = PyBytes_GET_SIZE(inputobj);
4310
    *inend = *input + insize;
4311
    /* we can DECREF safely, as the exception has another reference,
4312
       so the object won't go away. */
4313
    Py_DECREF(inputobj);
4314
4315
    if (newpos<0)
4316
        newpos = insize+newpos;
4317
    if (newpos<0 || newpos>insize) {
4318
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4319
        goto onError;
4320
    }
4321
4322
    repwlen = PyUnicode_AsWideChar(repunicode, NULL, 0);
4323
    if (repwlen < 0)
4324
        goto onError;
4325
    repwlen--;
4326
    /* need more space? (at least enough for what we
4327
       have+the replacement+the rest of the string (starting
4328
       at the new input position), so we won't have to check space
4329
       when there are no errors in the rest of the string) */
4330
    requiredsize = *outpos;
4331
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4332
        goto overflow;
4333
    requiredsize += repwlen;
4334
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4335
        goto overflow;
4336
    requiredsize += insize - newpos;
4337
    outsize = *bufsize;
4338
    if (requiredsize > outsize) {
4339
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
4340
            requiredsize = 2*outsize;
4341
        if (widechar_resize(buf, bufsize, requiredsize) < 0) {
4342
            goto onError;
4343
        }
4344
    }
4345
    PyUnicode_AsWideChar(repunicode, *buf + *outpos, repwlen);
4346
    *outpos += repwlen;
4347
    *endinpos = newpos;
4348
    *inptr = *input + newpos;
4349
4350
    /* we made it! */
4351
    Py_DECREF(restuple);
4352
    return 0;
4353
4354
  overflow:
4355
    PyErr_SetString(PyExc_OverflowError,
4356
                    "decoded result is too long for a Python string");
4357
4358
  onError:
4359
    Py_XDECREF(restuple);
4360
    return -1;
4361
}
4362
#endif   /* MS_WINDOWS */
4363
4364
static int
4365
unicode_decode_call_errorhandler_writer(
4366
    const char *errors, PyObject **errorHandler,
4367
    const char *encoding, const char *reason,
4368
    const char **input, const char **inend, Py_ssize_t *startinpos,
4369
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4370
    _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4371
314k
{
4372
314k
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4373
4374
314k
    PyObject *restuple = NULL;
4375
314k
    PyObject *repunicode = NULL;
4376
314k
    Py_ssize_t insize;
4377
314k
    Py_ssize_t newpos;
4378
314k
    Py_ssize_t replen;
4379
314k
    Py_ssize_t remain;
4380
314k
    PyObject *inputobj = NULL;
4381
314k
    int need_to_grow = 0;
4382
314k
    const char *new_inptr;
4383
4384
314k
    if (*errorHandler == NULL) {
4385
99.9k
        *errorHandler = PyCodec_LookupError(errors);
4386
99.9k
        if (*errorHandler == NULL)
4387
0
            goto onError;
4388
99.9k
    }
4389
4390
314k
    make_decode_exception(exceptionObject,
4391
314k
        encoding,
4392
314k
        *input, *inend - *input,
4393
314k
        *startinpos, *endinpos,
4394
314k
        reason);
4395
314k
    if (*exceptionObject == NULL)
4396
0
        goto onError;
4397
4398
314k
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
4399
314k
    if (restuple == NULL)
4400
62.0k
        goto onError;
4401
252k
    if (!PyTuple_Check(restuple)) {
4402
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
4403
0
        goto onError;
4404
0
    }
4405
252k
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4406
0
        goto onError;
4407
4408
    /* Copy back the bytes variables, which might have been modified by the
4409
       callback */
4410
252k
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4411
252k
    if (!inputobj)
4412
0
        goto onError;
4413
252k
    remain = *inend - *input - *endinpos;
4414
252k
    *input = PyBytes_AS_STRING(inputobj);
4415
252k
    insize = PyBytes_GET_SIZE(inputobj);
4416
252k
    *inend = *input + insize;
4417
    /* we can DECREF safely, as the exception has another reference,
4418
       so the object won't go away. */
4419
252k
    Py_DECREF(inputobj);
4420
4421
252k
    if (newpos<0)
4422
0
        newpos = insize+newpos;
4423
252k
    if (newpos<0 || newpos>insize) {
4424
0
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4425
0
        goto onError;
4426
0
    }
4427
4428
252k
    replen = PyUnicode_GET_LENGTH(repunicode);
4429
252k
    if (replen > 1) {
4430
25.6k
        writer->min_length += replen - 1;
4431
25.6k
        need_to_grow = 1;
4432
25.6k
    }
4433
252k
    new_inptr = *input + newpos;
4434
252k
    if (*inend - new_inptr > remain) {
4435
        /* We don't know the decoding algorithm here so we make the worst
4436
           assumption that one byte decodes to one unicode character.
4437
           If unfortunately one byte could decode to more unicode characters,
4438
           the decoder may write out-of-bound then.  Is it possible for the
4439
           algorithms using this function? */
4440
13.3k
        writer->min_length += *inend - new_inptr - remain;
4441
13.3k
        need_to_grow = 1;
4442
13.3k
    }
4443
252k
    if (need_to_grow) {
4444
25.8k
        writer->overallocate = 1;
4445
25.8k
        if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
4446
25.8k
                            PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4447
0
            goto onError;
4448
25.8k
    }
4449
252k
    if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
4450
0
        goto onError;
4451
4452
252k
    *endinpos = newpos;
4453
252k
    *inptr = new_inptr;
4454
4455
    /* we made it! */
4456
252k
    Py_DECREF(restuple);
4457
252k
    return 0;
4458
4459
62.0k
  onError:
4460
62.0k
    Py_XDECREF(restuple);
4461
62.0k
    return -1;
4462
252k
}
4463
4464
/* --- UTF-7 Codec -------------------------------------------------------- */
4465
4466
/* See RFC2152 for details.  We encode conservatively and decode liberally. */
4467
4468
/* Three simple macros defining base-64. */
4469
4470
/* True when c is one of the 64 base-64 alphabet characters
   (A-Z, a-z, 0-9, '+' or '/').  Note: c is evaluated more than once. */

#define IS_BASE64(c) \
    ((c) == '+' || (c) == '/' ||       \
     ((c) >= '0' && (c) <= '9') ||     \
     ((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z'))
4477
4478
/* Map the base-64 character c to its 6-bit value (0..63).  The result is
   only meaningful when IS_BASE64(c) holds: any character that matches
   none of the tests falls through to the last branch and yields 63. */

#define FROM_BASE64(c)                                                  \
    (((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     (c) == '+' ? 62 : 63)
4485
4486
/* Base-64 character for the low six bits of n; any higher bits of n are
   masked off before the alphabet lookup. */

#define TO_BASE64(n)                        \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"           \
     "abcdefghijklmnopqrstuvwxyz"           \
     "0123456789+/"[(n) & 0x3f])
4490
4491
/* DECODE_DIRECT: true when a byte met in a UTF-7 string decodes to
 * itself.  Decoding is permissive: every value up to 127 is accepted
 * as-is except '+', which opens a base-64 section.  The argument is
 * evaluated twice. */

#define DECODE_DIRECT(c)                                \
    ((c) != '+' && (c) <= 127)
4498
4499
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is read-only, so
 * it is declared const to keep it from being modified by accident.
 */

static const
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4532
4533
/* ENCODE_DIRECT: true when the encoder may emit c as itself.  RFC 2152
 * leaves it to the application whether Set O and whitespace are written
 * directly; as written here, every character in 1..127 whose
 * utf7_category is not 3 (special) is direct, which covers Set D, Set O
 * and whitespace.  The range tests run before the table lookup, so the
 * index is always within the 128-entry table.  The argument is evaluated
 * more than once. */

#define ENCODE_DIRECT(c) \
    ((c) > 0 && (c) < 128 && utf7_category[(c)] != 3)
4541
4542
PyObject *
4543
PyUnicode_DecodeUTF7(const char *s,
4544
                     Py_ssize_t size,
4545
                     const char *errors)
4546
0
{
4547
0
    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4548
0
}
4549
4550
/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate).
 *
 * Parameters:
 *   s, size   input bytes and their count
 *   errors    error-handler name, passed through to
 *             unicode_decode_call_errorhandler_writer()
 *   consumed  if non-NULL, receives the number of input bytes accounted
 *             for; an unfinished shift sequence at the end of the input
 *             is then not an error but is left unconsumed
 *
 * Returns a new string object, or NULL when a writer call or the error
 * handler fails. */

PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;      /* input offset of the '+' or offending byte */
    Py_ssize_t endinpos;        /* input offset just past the faulty span */
    const char *e;
    _PyUnicodeWriter writer;
    const char *errmsg = "";
    int inShift = 0;            /* non-zero while inside a base-64 section */
    Py_ssize_t shiftOutStart;   /* writer.pos when the current section began */
    unsigned int base64bits = 0;        /* number of pending bits in base64buffer */
    unsigned long base64buffer = 0;     /* pending decoded base-64 bits */
    Py_UCS4 surrogate = 0;      /* pending high surrogate, 0 if none */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* Start off assuming it's all ASCII. Widen later as necessary. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;

    shiftOutStart = 0;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch;
        /* Re-entry point used after the end-of-input error handler has
           moved s back inside the buffer (see "end of string" below). */
      restart:
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    assert(outCh <= 0xffff);
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
                                goto onError;
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* no low surrogate followed: emit the pending
                               high surrogate on its own, then handle outCh
                               below */
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                                goto onError;
                            surrogate = 0;
                        }
                    }
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
                            goto onError;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        s++;
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            s++;
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                /* A high surrogate still pending at the end of the section
                   is written out only when the terminating byte decodes
                   directly; otherwise it is dropped. */
                if (surrogate && DECODE_DIRECT(ch)) {
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                        goto onError;
                }
                surrogate = 0;
                if (ch == '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    s++;
                }
            }
        }
        else if ( ch == '+' ) {
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
                    goto onError;
            }
            else if (s < e && !IS_BASE64(*s)) {
                s++;
                errmsg = "ill-formed sequence";
                goto utf7Error;
            }
            else { /* begin base64-encoded section */
                /* also taken when '+' is the last input byte (s == e) */
                inShift = 1;
                surrogate = 0;
                shiftOutStart = writer.pos;
                base64bits = 0;
                base64buffer = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            s++;
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
        }
        else {
            /* byte above 127 outside a shift sequence */
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
utf7Error:
        /* Report input[startinpos:endinpos] to the error handler.  The
           handler receives the addresses of starts, e and s, so it may
           update them. */
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        inShift = 0;
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            endinpos = size;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
            /* the handler may have left s before e; resume decoding there */
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            /* Unfinished shift sequence: report only the bytes before its
               '+' as consumed and discard the output produced since. */
            *consumed = startinpos;
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
                /* Build the result directly from the first shiftOutStart
                   characters instead of truncating the writer.
                   NOTE(review): presumably because the discarded characters
                   may have raised writer.maxchar -- confirm. */
                PyObject *result = PyUnicode_FromKindAndData(
                        writer.kind, writer.data, shiftOutStart);
                Py_XDECREF(errorHandler);
                Py_XDECREF(exc);
                _PyUnicodeWriter_Dealloc(&writer);
                return result;
            }
            writer.pos = shiftOutStart; /* back off output */
        }
        else {
            *consumed = s-starts;
        }
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    /* common failure exit: drop the handler/exception references and the
       partially built output */
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
4753
4754
4755
/* Encode the string `str` as UTF-7 and return the result built by a
   PyBytesWriter (NULL on failure).  Characters for which ENCODE_DIRECT()
   holds are copied as-is, '+' outside a shift sequence becomes "+-", and
   everything else is written as base-64 encoded UTF-16 inside a
   "+...-" shift sequence.  `errors` is accepted for the interface but is
   not consulted by this function. */
PyObject *
_PyUnicode_EncodeUTF7(PyObject *str,
                      const char *errors)
{
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(str);
    if (nchars == 0) {
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
    }
    const int kind = PyUnicode_KIND(str);
    const void *data = PyUnicode_DATA(str);

    /* Eight output bytes are reserved per input character; reject
       lengths for which that product would overflow.
       It might be possible to tighten this worst case */
    if (nchars > PY_SSIZE_T_MAX / 8) {
        return PyErr_NoMemory();
    }
    PyBytesWriter *writer = PyBytesWriter_Create(nchars * 8);
    if (writer == NULL) {
        return NULL;
    }

    int shifted = 0;            /* non-zero while inside a "+..." section */
    unsigned int nbits = 0;     /* bits in bitbuf not yet written out */
    unsigned long bitbuf = 0;   /* accumulator of UTF-16 bits */
    char *out = PyBytesWriter_GetData(writer);

    for (Py_ssize_t idx = 0; idx < nchars; ++idx) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, idx);
        const int direct = ENCODE_DIRECT(ch);

        if (direct) {
            if (shifted) {
                /* shifting out: flush the left-over bits, zero-padded
                   to a full base-64 digit */
                if (nbits) {
                    *out++ = TO_BASE64(bitbuf << (6-nbits));
                    bitbuf = 0;
                    nbits = 0;
                }
                shifted = 0;
                /* Characters outside the base-64 alphabet end the
                   sequence implicitly; an explicit '-' is only needed
                   before a base-64 character or a literal '-'. */
                if (IS_BASE64(ch) || ch == '-') {
                    *out++ = '-';
                }
            }
            *out++ = (char) ch;
            continue;
        }

        if (!shifted) {
            /* Both remaining cases start with '+': the escape "+-" for a
               literal plus sign, or the opening of a shift sequence. */
            *out++ = '+';
            if (ch == '+') {
                *out++ = '-';
                continue;
            }
            shifted = 1;
        }

        /* Split ch into one UTF-16 code unit, or a surrogate pair for
           characters beyond the BMP. */
        Py_UCS4 units[2];
        int nunits = 0;
        if (ch >= 0x10000) {
            assert(ch <= MAX_UNICODE);
            units[nunits++] = Py_UNICODE_HIGH_SURROGATE(ch);
            units[nunits++] = Py_UNICODE_LOW_SURROGATE(ch);
        }
        else {
            units[nunits++] = ch;
        }

        /* Append 16 bits per unit and drain complete 6-bit groups. */
        for (int k = 0; k < nunits; k++) {
            nbits += 16;
            bitbuf = (bitbuf << 16) | units[k];
            while (nbits >= 6) {
                *out++ = TO_BASE64(bitbuf >> (nbits-6));
                nbits -= 6;
            }
        }
    }

    /* Flush the final partial digit and close an open shift sequence. */
    if (nbits) {
        *out++ = TO_BASE64(bitbuf << (6-nbits));
    }
    if (shifted) {
        *out++ = '-';
    }
    return PyBytesWriter_FinishWithPointer(writer, out);
}
4844
4845
#undef IS_BASE64
4846
#undef FROM_BASE64
4847
#undef TO_BASE64
4848
#undef DECODE_DIRECT
4849
#undef ENCODE_DIRECT
4850
4851
/* --- UTF-8 Codec -------------------------------------------------------- */
4852
4853
PyObject *
4854
PyUnicode_DecodeUTF8(const char *s,
4855
                     Py_ssize_t size,
4856
                     const char *errors)
4857
2.35M
{
4858
2.35M
    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4859
2.35M
}
4860
4861
#include "stringlib/asciilib.h"
4862
#include "stringlib/codecs.h"
4863
#include "stringlib/undef.h"
4864
4865
#include "stringlib/ucs1lib.h"
4866
#include "stringlib/codecs.h"
4867
#include "stringlib/undef.h"
4868
4869
#include "stringlib/ucs2lib.h"
4870
#include "stringlib/codecs.h"
4871
#include "stringlib/undef.h"
4872
4873
#include "stringlib/ucs4lib.h"
4874
#include "stringlib/codecs.h"
4875
#include "stringlib/undef.h"
4876
4877
/* Word-wide bit patterns for the UTF-8 fast paths, one set per supported
   width of 'size_t':
     ASCII_CHAR_MASK  bit 7 of every byte; non-zero after masking means
                      the word holds a non-ASCII, UTF8-encoded char.
     VECTOR_0101      bit 0 of every byte   (used to count codepoints)
     VECTOR_00FF      every other byte set  (used to count codepoints) */
#if (SIZEOF_SIZE_T == 4)
# define ASCII_CHAR_MASK 0x80808080U
# define VECTOR_0101     0x01010101U
# define VECTOR_00FF     0x00ff00ffU
#elif (SIZEOF_SIZE_T == 8)
# define ASCII_CHAR_MASK 0x8080808080808080ULL
# define VECTOR_0101     0x0101010101010101ULL
# define VECTOR_00FF     0x00ff00ff00ff00ffULL
#else
# error C 'size_t' size should be either 4 or 8!
#endif
4891
4892
/* ctz(v): index of the lowest set bit of v (count of trailing zeros).
   HAVE_CTZ records whether a compiler intrinsic is available for it.
   The callers visible in this file only pass values already tested to
   be non-zero. */
#if (defined(__clang__) || defined(__GNUC__))
#define HAVE_CTZ 1
static inline unsigned int
ctz(size_t v)
{
    /* widen first so one builtin serves both 4- and 8-byte size_t */
    const unsigned long long wide = (unsigned long long)v;
    return __builtin_ctzll(wide);
}
#elif defined(_MSC_VER)
#define HAVE_CTZ 1
static inline unsigned int
ctz(size_t v)
{
    unsigned long bit_index;
#if SIZEOF_SIZE_T == 4
    _BitScanForward(&bit_index, v);
#else
    _BitScanForward64(&bit_index, v);
#endif /* SIZEOF_SIZE_T */
    return bit_index;
}
#else
#define HAVE_CTZ 0
#endif
4915
4916
#if HAVE_CTZ && PY_LITTLE_ENDIAN
/* Build a size_t from p[0]..p[size-1] one byte at a time, so that no
   unaligned word access is made and nothing past p[size-1] is read.
   Bytes beyond `size` are left zero.  A `size` larger than
   SIZEOF_SIZE_T behaves like SIZEOF_SIZE_T. */
static size_t
load_unaligned(const unsigned char *p, size_t size)
{
    union {
        size_t value;
        unsigned char bytes[SIZEOF_SIZE_T];
    } word;
    word.value = 0;
    /* The byte-wise fill below assumes little endian because:
     * - filling a union is faster than bitwise or and shift;
     * - big endian machines are rare and hard to maintain.
     * Each case copies one byte and falls through to the lower ones. */
    switch (size) {
    case 0:
        break;
    default:
#if SIZEOF_SIZE_T == 8
    case 8:
        word.bytes[7] = p[7];
        _Py_FALLTHROUGH;
    case 7:
        word.bytes[6] = p[6];
        _Py_FALLTHROUGH;
    case 6:
        word.bytes[5] = p[5];
        _Py_FALLTHROUGH;
    case 5:
        word.bytes[4] = p[4];
        _Py_FALLTHROUGH;
#endif
    case 4:
        word.bytes[3] = p[3];
        _Py_FALLTHROUGH;
    case 3:
        word.bytes[2] = p[2];
        _Py_FALLTHROUGH;
    case 2:
        word.bytes[1] = p[1];
        _Py_FALLTHROUGH;
    case 1:
        word.bytes[0] = p[0];
        break;
    }
    return word.value;
}
#endif
4963
4964
/*
4965
 * Find the first non-ASCII character in a byte sequence.
4966
 *
4967
 * This function scans a range of bytes from `start` to `end` and returns the
4968
 * index of the first byte that is not an ASCII character (i.e., has the most
4969
 * significant bit set). If all characters in the range are ASCII, it returns
4970
 * `end - start`.
4971
 */
4972
static Py_ssize_t
4973
find_first_nonascii(const unsigned char *start, const unsigned char *end)
4974
14.1M
{
4975
    // The search is done in `size_t` chunks.
4976
    // The start and end might not be aligned at `size_t` boundaries,
4977
    // so they're handled specially.
4978
4979
14.1M
    const unsigned char *p = start;
4980
4981
14.1M
    if (end - start >= SIZEOF_SIZE_T) {
4982
        // Avoid unaligned read.
4983
4.10M
#if PY_LITTLE_ENDIAN && HAVE_CTZ
4984
4.10M
        size_t u;
4985
4.10M
        memcpy(&u, p, sizeof(size_t));
4986
4.10M
        u &= ASCII_CHAR_MASK;
4987
4.10M
        if (u) {
4988
281k
            return (ctz(u) - 7) / 8;
4989
281k
        }
4990
3.81M
        p = _Py_ALIGN_DOWN(p + SIZEOF_SIZE_T, SIZEOF_SIZE_T);
4991
#else /* PY_LITTLE_ENDIAN && HAVE_CTZ */
4992
        const unsigned char *p2 = _Py_ALIGN_UP(p, SIZEOF_SIZE_T);
4993
        while (p < p2) {
4994
            if (*p & 0x80) {
4995
                return p - start;
4996
            }
4997
            p++;
4998
        }
4999
#endif
5000
5001
3.81M
        const unsigned char *e = end - SIZEOF_SIZE_T;
5002
107M
        while (p <= e) {
5003
103M
            size_t u = (*(const size_t *)p) & ASCII_CHAR_MASK;
5004
103M
            if (u) {
5005
269k
#if PY_LITTLE_ENDIAN && HAVE_CTZ
5006
269k
                return p - start + (ctz(u) - 7) / 8;
5007
#else
5008
                // big endian and minor compilers are difficult to test.
5009
                // fallback to per byte check.
5010
                break;
5011
#endif
5012
269k
            }
5013
103M
            p += SIZEOF_SIZE_T;
5014
103M
        }
5015
3.81M
    }
5016
13.5M
#if PY_LITTLE_ENDIAN && HAVE_CTZ
5017
14.1M
    assert((end - p) < SIZEOF_SIZE_T);
5018
    // we can not use *(const size_t*)p to avoid buffer overrun.
5019
13.5M
    size_t u = load_unaligned(p, end - p) & ASCII_CHAR_MASK;
5020
13.5M
    if (u) {
5021
195k
        return p - start + (ctz(u) - 7) / 8;
5022
195k
    }
5023
13.3M
    return end - start;
5024
#else
5025
    while (p < end) {
5026
        if (*p & 0x80) {
5027
            break;
5028
        }
5029
        p++;
5030
    }
5031
    return p - start;
5032
#endif
5033
13.5M
}
5034
5035
/* Return 1 when the byte `ch` can start a UTF-8 sequence, i.e. has the
   form 0xxxxxxx or 11xxxxxx, and 0 for a continuation byte 10xxxxxx. */
static inline int
scalar_utf8_start_char(unsigned int ch)
{
    const unsigned int top_bit_clear = (~ch >> 7) & 1u;   /* 0xxxxxxx */
    const unsigned int second_bit_set = (ch >> 6) & 1u;   /* x1xxxxxx */
    return top_bit_clear | second_bit_set;
}
5041
5042
static inline size_t
5043
vector_utf8_start_chars(size_t v)
5044
305M
{
5045
305M
    return ((~v >> 7) | (v >> 6)) & VECTOR_0101;
5046
305M
}
5047
5048
5049
// Count the number of UTF-8 code points in a given byte sequence.
5050
static Py_ssize_t
5051
utf8_count_codepoints(const unsigned char *s, const unsigned char *end)
5052
421k
{
5053
421k
    Py_ssize_t len = 0;
5054
5055
421k
    if (end - s >= SIZEOF_SIZE_T) {
5056
348k
        while (!_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) {
5057
17.2k
            len += scalar_utf8_start_char(*s++);
5058
17.2k
        }
5059
5060
1.85M
        while (s + SIZEOF_SIZE_T <= end) {
5061
1.51M
            const unsigned char *e = end;
5062
1.51M
            if (e - s > SIZEOF_SIZE_T * 255) {
5063
1.19M
                e = s + SIZEOF_SIZE_T * 255;
5064
1.19M
            }
5065
1.51M
            Py_ssize_t vstart = 0;
5066
306M
            while (s + SIZEOF_SIZE_T <= e) {
5067
305M
                size_t v = *(size_t*)s;
5068
305M
                size_t vs = vector_utf8_start_chars(v);
5069
305M
                vstart += vs;
5070
305M
                s += SIZEOF_SIZE_T;
5071
305M
            }
5072
1.51M
            vstart = (vstart & VECTOR_00FF) + ((vstart >> 8) & VECTOR_00FF);
5073
1.51M
            vstart += vstart >> 16;
5074
1.51M
#if SIZEOF_SIZE_T == 8
5075
1.51M
            vstart += vstart >> 32;
5076
1.51M
#endif
5077
1.51M
            len += vstart & 0x7ff;
5078
1.51M
        }
5079
330k
    }
5080
1.42M
    while (s < end) {
5081
1.00M
        len += scalar_utf8_start_char(*s++);
5082
1.00M
    }
5083
421k
    return len;
5084
421k
}
5085
5086
static Py_ssize_t
5087
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
5088
5.07M
{
5089
5.07M
#if SIZEOF_SIZE_T <= SIZEOF_VOID_P
5090
5.07M
    if (_Py_IS_ALIGNED(start, ALIGNOF_SIZE_T)
5091
5.05M
        && _Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T))
5092
666k
    {
5093
        /* Fast path, see in STRINGLIB(utf8_decode) for
5094
           an explanation. */
5095
666k
        const char *p = start;
5096
666k
        Py_UCS1 *q = dest;
5097
1.67M
        while (p + SIZEOF_SIZE_T <= end) {
5098
1.18M
            size_t value = *(const size_t *) p;
5099
1.18M
            if (value & ASCII_CHAR_MASK)
5100
173k
                break;
5101
1.00M
            *((size_t *)q) = value;
5102
1.00M
            p += SIZEOF_SIZE_T;
5103
1.00M
            q += SIZEOF_SIZE_T;
5104
1.00M
        }
5105
2.63M
        while (p < end) {
5106
2.16M
            if ((unsigned char)*p & 0x80)
5107
196k
                break;
5108
1.97M
            *q++ = *p++;
5109
1.97M
        }
5110
666k
        return p - start;
5111
666k
    }
5112
4.40M
#endif
5113
4.40M
    Py_ssize_t pos = find_first_nonascii((const unsigned char*)start,
5114
4.40M
                                         (const unsigned char*)end);
5115
4.40M
    memcpy(dest, start, pos);
5116
4.40M
    return pos;
5117
5.07M
}
5118
5119
static int
5120
unicode_decode_utf8_impl(_PyUnicodeWriter *writer,
5121
                         const char *starts, const char *s, const char *end,
5122
                         _Py_error_handler error_handler,
5123
                         const char *errors,
5124
                         Py_ssize_t *consumed)
5125
749k
{
5126
749k
    Py_ssize_t startinpos, endinpos;
5127
749k
    const char *errmsg = "";
5128
749k
    PyObject *error_handler_obj = NULL;
5129
749k
    PyObject *exc = NULL;
5130
5131
165M
    while (s < end) {
5132
165M
        Py_UCS4 ch;
5133
165M
        int kind = writer->kind;
5134
5135
165M
        if (kind == PyUnicode_1BYTE_KIND) {
5136
711k
            if (PyUnicode_IS_ASCII(writer->buffer))
5137
325k
                ch = asciilib_utf8_decode(&s, end, writer->data, &writer->pos);
5138
385k
            else
5139
385k
                ch = ucs1lib_utf8_decode(&s, end, writer->data, &writer->pos);
5140
164M
        } else if (kind == PyUnicode_2BYTE_KIND) {
5141
78.6M
            ch = ucs2lib_utf8_decode(&s, end, writer->data, &writer->pos);
5142
85.9M
        } else {
5143
85.9M
            assert(kind == PyUnicode_4BYTE_KIND);
5144
85.9M
            ch = ucs4lib_utf8_decode(&s, end, writer->data, &writer->pos);
5145
85.9M
        }
5146
5147
165M
        switch (ch) {
5148
670k
        case 0:
5149
670k
            if (s == end || consumed)
5150
644k
                goto End;
5151
26.2k
            errmsg = "unexpected end of data";
5152
26.2k
            startinpos = s - starts;
5153
26.2k
            endinpos = end - starts;
5154
26.2k
            break;
5155
131M
        case 1:
5156
131M
            errmsg = "invalid start byte";
5157
131M
            startinpos = s - starts;
5158
131M
            endinpos = startinpos + 1;
5159
131M
            break;
5160
31.0M
        case 2:
5161
31.0M
            if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5162
0
                && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5163
0
            {
5164
                /* Truncated surrogate code in range D800-DFFF */
5165
0
                goto End;
5166
0
            }
5167
31.0M
            _Py_FALLTHROUGH;
5168
32.4M
        case 3:
5169
32.5M
        case 4:
5170
32.5M
            errmsg = "invalid continuation byte";
5171
32.5M
            startinpos = s - starts;
5172
32.5M
            endinpos = startinpos + ch - 1;
5173
32.5M
            break;
5174
321k
        default:
5175
            // ch doesn't fit into kind, so change the buffer kind to write
5176
            // the character
5177
321k
            if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
5178
0
                goto onError;
5179
321k
            continue;
5180
165M
        }
5181
5182
164M
        if (error_handler == _Py_ERROR_UNKNOWN)
5183
129k
            error_handler = _Py_GetErrorHandler(errors);
5184
5185
164M
        switch (error_handler) {
5186
0
        case _Py_ERROR_IGNORE:
5187
0
            s += (endinpos - startinpos);
5188
0
            break;
5189
5190
164M
        case _Py_ERROR_REPLACE:
5191
164M
            if (_PyUnicodeWriter_WriteCharInline(writer, 0xfffd) < 0)
5192
0
                goto onError;
5193
164M
            s += (endinpos - startinpos);
5194
164M
            break;
5195
5196
2.19k
        case _Py_ERROR_SURROGATEESCAPE:
5197
2.19k
        {
5198
2.19k
            Py_ssize_t i;
5199
5200
2.19k
            if (_PyUnicodeWriter_PrepareKind(writer, PyUnicode_2BYTE_KIND) < 0)
5201
0
                goto onError;
5202
4.67k
            for (i=startinpos; i<endinpos; i++) {
5203
2.47k
                ch = (Py_UCS4)(unsigned char)(starts[i]);
5204
2.47k
                PyUnicode_WRITE(writer->kind, writer->data, writer->pos,
5205
2.47k
                                ch + 0xdc00);
5206
2.47k
                writer->pos++;
5207
2.47k
            }
5208
2.19k
            s += (endinpos - startinpos);
5209
2.19k
            break;
5210
2.19k
        }
5211
5212
782
        default:
5213
782
            if (unicode_decode_call_errorhandler_writer(
5214
782
                    errors, &error_handler_obj,
5215
782
                    "utf-8", errmsg,
5216
782
                    &starts, &end, &startinpos, &endinpos, &exc, &s,
5217
782
                    writer)) {
5218
782
                goto onError;
5219
782
            }
5220
5221
0
            if (_PyUnicodeWriter_Prepare(writer, end - s, 127) < 0) {
5222
0
                return -1;
5223
0
            }
5224
164M
        }
5225
164M
    }
5226
5227
748k
End:
5228
748k
    if (consumed)
5229
903
        *consumed = s - starts;
5230
5231
748k
    Py_XDECREF(error_handler_obj);
5232
748k
    Py_XDECREF(exc);
5233
748k
    return 0;
5234
5235
782
onError:
5236
782
    Py_XDECREF(error_handler_obj);
5237
782
    Py_XDECREF(exc);
5238
782
    return -1;
5239
749k
}
5240
5241
5242
static PyObject *
5243
unicode_decode_utf8(const char *s, Py_ssize_t size,
5244
                    _Py_error_handler error_handler, const char *errors,
5245
                    Py_ssize_t *consumed)
5246
11.4M
{
5247
11.4M
    if (size == 0) {
5248
64.1k
        if (consumed) {
5249
0
            *consumed = 0;
5250
0
        }
5251
64.1k
        _Py_RETURN_UNICODE_EMPTY();
5252
64.1k
    }
5253
5254
    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
5255
11.3M
    if (size == 1 && (unsigned char)s[0] < 128) {
5256
1.69M
        if (consumed) {
5257
0
            *consumed = 1;
5258
0
        }
5259
1.69M
        return get_latin1_char((unsigned char)s[0]);
5260
1.69M
    }
5261
5262
    // I don't know this check is necessary or not. But there is a test
5263
    // case that requires size=PY_SSIZE_T_MAX cause MemoryError.
5264
9.69M
    if (PY_SSIZE_T_MAX - sizeof(PyCompactUnicodeObject) < (size_t)size) {
5265
0
        PyErr_NoMemory();
5266
0
        return NULL;
5267
0
    }
5268
5269
9.69M
    const char *starts = s;
5270
9.69M
    const char *end = s + size;
5271
5272
9.69M
    Py_ssize_t pos = find_first_nonascii((const unsigned char*)starts, (const unsigned char*)end);
5273
9.69M
    if (pos == size) {  // fast path: ASCII string.
5274
8.99M
        PyObject *u = PyUnicode_New(size, 127);
5275
8.99M
        if (u == NULL) {
5276
0
            return NULL;
5277
0
        }
5278
8.99M
        memcpy(PyUnicode_1BYTE_DATA(u), s, size);
5279
8.99M
        if (consumed) {
5280
0
            *consumed = size;
5281
0
        }
5282
8.99M
        return u;
5283
8.99M
    }
5284
5285
696k
    int maxchr = 127;
5286
696k
    Py_ssize_t maxsize = size;
5287
5288
696k
    unsigned char ch = (unsigned char)(s[pos]);
5289
    // error handler other than strict may remove/replace the invalid byte.
5290
    // consumed != NULL allows 1~3 bytes remainings.
5291
    // 0x80 <= ch < 0xc2 is invalid start byte that cause UnicodeDecodeError.
5292
    // otherwise: check the input and decide the maxchr and maxsize to reduce
5293
    // reallocation and copy.
5294
696k
    if (error_handler == _Py_ERROR_STRICT && !consumed && ch >= 0xc2) {
5295
        // we only calculate the number of codepoints and don't determine the exact maxchr.
5296
        // This is because writing fast and portable SIMD code to find maxchr is difficult.
5297
        // If reallocation occurs for a larger maxchar, knowing the exact number of codepoints
5298
        // means that it is no longer necessary to allocate several times the required amount
5299
        // of memory.
5300
421k
        maxsize = utf8_count_codepoints((const unsigned char *)s, (const unsigned char *)end);
5301
421k
        if (ch < 0xc4) { // latin1
5302
298k
            maxchr = 0xff;
5303
298k
        }
5304
122k
        else if (ch < 0xf0) { // ucs2
5305
112k
            maxchr = 0xffff;
5306
112k
        }
5307
10.0k
        else { // ucs4
5308
10.0k
            maxchr = 0x10ffff;
5309
10.0k
        }
5310
421k
    }
5311
696k
    PyObject *u = PyUnicode_New(maxsize, maxchr);
5312
696k
    if (!u) {
5313
0
        return NULL;
5314
0
    }
5315
5316
    // Use _PyUnicodeWriter after fast path is failed.
5317
696k
    _PyUnicodeWriter writer;
5318
696k
    _PyUnicodeWriter_InitWithBuffer(&writer, u);
5319
696k
    if (maxchr <= 255) {
5320
574k
        memcpy(PyUnicode_1BYTE_DATA(u), s, pos);
5321
574k
        s += pos;
5322
574k
        writer.pos = pos;
5323
574k
    }
5324
5325
696k
    if (unicode_decode_utf8_impl(&writer, starts, s, end,
5326
696k
                                 error_handler, errors,
5327
696k
                                 consumed) < 0) {
5328
782
        _PyUnicodeWriter_Dealloc(&writer);
5329
782
        return NULL;
5330
782
    }
5331
696k
    return _PyUnicodeWriter_Finish(&writer);
5332
696k
}
5333
5334
5335
// Used by PyUnicodeWriter_WriteUTF8() implementation
5336
int
5337
_PyUnicode_DecodeUTF8Writer(_PyUnicodeWriter *writer,
5338
                            const char *s, Py_ssize_t size,
5339
                            _Py_error_handler error_handler, const char *errors,
5340
                            Py_ssize_t *consumed)
5341
4.49M
{
5342
4.49M
    if (size == 0) {
5343
8.33k
        if (consumed) {
5344
0
            *consumed = 0;
5345
0
        }
5346
8.33k
        return 0;
5347
8.33k
    }
5348
5349
    // fast path: try ASCII string.
5350
4.48M
    if (_PyUnicodeWriter_Prepare(writer, size, 127) < 0) {
5351
0
        return -1;
5352
0
    }
5353
5354
4.48M
    const char *starts = s;
5355
4.48M
    const char *end = s + size;
5356
4.48M
    Py_ssize_t decoded = 0;
5357
4.48M
    Py_UCS1 *dest = (Py_UCS1*)writer->data + writer->pos * writer->kind;
5358
4.48M
    if (writer->kind == PyUnicode_1BYTE_KIND) {
5359
4.48M
        decoded = ascii_decode(s, end, dest);
5360
4.48M
        writer->pos += decoded;
5361
5362
4.48M
        if (decoded == size) {
5363
4.43M
            if (consumed) {
5364
852
                *consumed = size;
5365
852
            }
5366
4.43M
            return 0;
5367
4.43M
        }
5368
50.1k
        s += decoded;
5369
50.1k
    }
5370
5371
52.4k
    return unicode_decode_utf8_impl(writer, starts, s, end,
5372
52.4k
                                    error_handler, errors, consumed);
5373
4.48M
}
5374
5375
5376
PyObject *
5377
PyUnicode_DecodeUTF8Stateful(const char *s,
5378
                             Py_ssize_t size,
5379
                             const char *errors,
5380
                             Py_ssize_t *consumed)
5381
11.4M
{
5382
11.4M
    return unicode_decode_utf8(s, size,
5383
11.4M
                               errors ? _Py_ERROR_UNKNOWN : _Py_ERROR_STRICT,
5384
11.4M
                               errors, consumed);
5385
11.4M
}
5386
5387
5388
/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5389
   non-zero, use strict error handler otherwise.
5390
5391
   On success, write a pointer to a newly allocated wide character string into
5392
   *wstr (use PyMem_RawFree() to free the memory) and write the output length
5393
   (in number of wchar_t units) into *wlen (if wlen is set).
5394
5395
   On memory allocation failure, return -1.
5396
5397
   On decoding error (if surrogateescape is zero), return -2. If wlen is
5398
   non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5399
   is not NULL, write the decoding error message into *reason. */
5400
int
5401
_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
5402
                 const char **reason, _Py_error_handler errors)
5403
7.21k
{
5404
7.21k
    const char *orig_s = s;
5405
7.21k
    const char *e;
5406
7.21k
    wchar_t *unicode;
5407
7.21k
    Py_ssize_t outpos;
5408
5409
7.21k
    int surrogateescape = 0;
5410
7.21k
    int surrogatepass = 0;
5411
7.21k
    switch (errors)
5412
7.21k
    {
5413
0
    case _Py_ERROR_STRICT:
5414
0
        break;
5415
7.21k
    case _Py_ERROR_SURROGATEESCAPE:
5416
7.21k
        surrogateescape = 1;
5417
7.21k
        break;
5418
0
    case _Py_ERROR_SURROGATEPASS:
5419
0
        surrogatepass = 1;
5420
0
        break;
5421
0
    default:
5422
0
        return -3;
5423
7.21k
    }
5424
5425
    /* Note: size will always be longer than the resulting Unicode
5426
       character count */
5427
7.21k
    if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1 < size) {
5428
0
        return -1;
5429
0
    }
5430
5431
7.21k
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
5432
7.21k
    if (!unicode) {
5433
0
        return -1;
5434
0
    }
5435
5436
    /* Unpack UTF-8 encoded data */
5437
7.21k
    e = s + size;
5438
7.21k
    outpos = 0;
5439
7.21k
    while (s < e) {
5440
7.21k
        Py_UCS4 ch;
5441
7.21k
#if SIZEOF_WCHAR_T == 4
5442
7.21k
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
5443
#else
5444
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
5445
#endif
5446
7.21k
        if (ch > 0xFF) {
5447
0
#if SIZEOF_WCHAR_T == 4
5448
0
            Py_UNREACHABLE();
5449
#else
5450
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
5451
            /* write a surrogate pair */
5452
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5453
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5454
#endif
5455
0
        }
5456
7.21k
        else {
5457
7.21k
            if (!ch && s == e) {
5458
7.21k
                break;
5459
7.21k
            }
5460
5461
0
            if (surrogateescape) {
5462
0
                unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5463
0
            }
5464
0
            else {
5465
                /* Is it a valid three-byte code? */
5466
0
                if (surrogatepass
5467
0
                    && (e - s) >= 3
5468
0
                    && (s[0] & 0xf0) == 0xe0
5469
0
                    && (s[1] & 0xc0) == 0x80
5470
0
                    && (s[2] & 0xc0) == 0x80)
5471
0
                {
5472
0
                    ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5473
0
                    s += 3;
5474
0
                    unicode[outpos++] = ch;
5475
0
                }
5476
0
                else {
5477
0
                    PyMem_RawFree(unicode );
5478
0
                    if (reason != NULL) {
5479
0
                        switch (ch) {
5480
0
                        case 0:
5481
0
                            *reason = "unexpected end of data";
5482
0
                            break;
5483
0
                        case 1:
5484
0
                            *reason = "invalid start byte";
5485
0
                            break;
5486
                        /* 2, 3, 4 */
5487
0
                        default:
5488
0
                            *reason = "invalid continuation byte";
5489
0
                            break;
5490
0
                        }
5491
0
                    }
5492
0
                    if (wlen != NULL) {
5493
0
                        *wlen = s - orig_s;
5494
0
                    }
5495
0
                    return -2;
5496
0
                }
5497
0
            }
5498
0
        }
5499
7.21k
    }
5500
7.21k
    unicode[outpos] = L'\0';
5501
7.21k
    if (wlen) {
5502
7.21k
        *wlen = outpos;
5503
7.21k
    }
5504
7.21k
    *wstr = unicode;
5505
7.21k
    return 0;
5506
7.21k
}
5507
5508
5509
wchar_t*
5510
_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5511
                               size_t *wlen)
5512
0
{
5513
0
    wchar_t *wstr;
5514
0
    int res = _Py_DecodeUTF8Ex(arg, arglen,
5515
0
                               &wstr, wlen,
5516
0
                               NULL, _Py_ERROR_SURROGATEESCAPE);
5517
0
    if (res != 0) {
5518
        /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5519
0
        assert(res != -3);
5520
0
        if (wlen) {
5521
0
            *wlen = (size_t)res;
5522
0
        }
5523
0
        return NULL;
5524
0
    }
5525
0
    return wstr;
5526
0
}
5527
5528
5529
/* UTF-8 encoder.
5530
5531
   On success, return 0 and write the newly allocated character string (use
5532
   PyMem_Free() to free the memory) into *str.
5533
5534
   On encoding failure, return -2 and write the position of the invalid
5535
   surrogate character into *error_pos (if error_pos is set) and the decoding
5536
   error message into *reason (if reason is set).
5537
5538
   On memory allocation failure, return -1. */
5539
int
5540
_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
5541
                 const char **reason, int raw_malloc, _Py_error_handler errors)
5542
860
{
5543
860
    const Py_ssize_t max_char_size = 4;
5544
860
    Py_ssize_t len = wcslen(text);
5545
5546
860
    assert(len >= 0);
5547
5548
860
    int surrogateescape = 0;
5549
860
    int surrogatepass = 0;
5550
860
    switch (errors)
5551
860
    {
5552
88
    case _Py_ERROR_STRICT:
5553
88
        break;
5554
772
    case _Py_ERROR_SURROGATEESCAPE:
5555
772
        surrogateescape = 1;
5556
772
        break;
5557
0
    case _Py_ERROR_SURROGATEPASS:
5558
0
        surrogatepass = 1;
5559
0
        break;
5560
0
    default:
5561
0
        return -3;
5562
860
    }
5563
5564
860
    if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5565
0
        return -1;
5566
0
    }
5567
860
    char *bytes;
5568
860
    if (raw_malloc) {
5569
860
        bytes = PyMem_RawMalloc((len + 1) * max_char_size);
5570
860
    }
5571
0
    else {
5572
0
        bytes = PyMem_Malloc((len + 1) * max_char_size);
5573
0
    }
5574
860
    if (bytes == NULL) {
5575
0
        return -1;
5576
0
    }
5577
5578
860
    char *p = bytes;
5579
860
    Py_ssize_t i;
5580
56.2k
    for (i = 0; i < len; ) {
5581
55.3k
        Py_ssize_t ch_pos = i;
5582
55.3k
        Py_UCS4 ch = text[i];
5583
55.3k
        i++;
5584
#if Py_UNICODE_SIZE == 2
5585
        if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5586
            && i < len
5587
            && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5588
        {
5589
            ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5590
            i++;
5591
        }
5592
#endif
5593
5594
55.3k
        if (ch < 0x80) {
5595
            /* Encode ASCII */
5596
55.3k
            *p++ = (char) ch;
5597
5598
55.3k
        }
5599
0
        else if (ch < 0x0800) {
5600
            /* Encode Latin-1 */
5601
0
            *p++ = (char)(0xc0 | (ch >> 6));
5602
0
            *p++ = (char)(0x80 | (ch & 0x3f));
5603
0
        }
5604
0
        else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
5605
            /* surrogateescape error handler */
5606
0
            if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
5607
0
                if (error_pos != NULL) {
5608
0
                    *error_pos = (size_t)ch_pos;
5609
0
                }
5610
0
                if (reason != NULL) {
5611
0
                    *reason = "encoding error";
5612
0
                }
5613
0
                if (raw_malloc) {
5614
0
                    PyMem_RawFree(bytes);
5615
0
                }
5616
0
                else {
5617
0
                    PyMem_Free(bytes);
5618
0
                }
5619
0
                return -2;
5620
0
            }
5621
0
            *p++ = (char)(ch & 0xff);
5622
0
        }
5623
0
        else if (ch < 0x10000) {
5624
0
            *p++ = (char)(0xe0 | (ch >> 12));
5625
0
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5626
0
            *p++ = (char)(0x80 | (ch & 0x3f));
5627
0
        }
5628
0
        else {  /* ch >= 0x10000 */
5629
0
            assert(ch <= MAX_UNICODE);
5630
            /* Encode UCS4 Unicode ordinals */
5631
0
            *p++ = (char)(0xf0 | (ch >> 18));
5632
0
            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5633
0
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5634
0
            *p++ = (char)(0x80 | (ch & 0x3f));
5635
0
        }
5636
55.3k
    }
5637
860
    *p++ = '\0';
5638
5639
860
    size_t final_size = (p - bytes);
5640
860
    char *bytes2;
5641
860
    if (raw_malloc) {
5642
860
        bytes2 = PyMem_RawRealloc(bytes, final_size);
5643
860
    }
5644
0
    else {
5645
0
        bytes2 = PyMem_Realloc(bytes, final_size);
5646
0
    }
5647
860
    if (bytes2 == NULL) {
5648
0
        if (error_pos != NULL) {
5649
0
            *error_pos = (size_t)-1;
5650
0
        }
5651
0
        if (raw_malloc) {
5652
0
            PyMem_RawFree(bytes);
5653
0
        }
5654
0
        else {
5655
0
            PyMem_Free(bytes);
5656
0
        }
5657
0
        return -1;
5658
0
    }
5659
860
    *str = bytes2;
5660
860
    return 0;
5661
860
}
5662
5663
5664
/* Primary internal function which creates utf8 encoded bytes objects.
5665
5666
   Allocation strategy:  if the string is short, convert into a stack buffer
5667
   and allocate exactly as much space needed at the end.  Else allocate the
5668
   maximum possible needed (4 result bytes per Unicode character), and return
5669
   the excess memory at the end.
5670
*/
5671
static PyObject *
5672
unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5673
                    const char *errors)
5674
18.2M
{
5675
18.2M
    if (!PyUnicode_Check(unicode)) {
5676
0
        PyErr_BadArgument();
5677
0
        return NULL;
5678
0
    }
5679
5680
18.2M
    if (PyUnicode_UTF8(unicode))
5681
10.7M
        return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5682
10.7M
                                         PyUnicode_UTF8_LENGTH(unicode));
5683
5684
7.46M
    int kind = PyUnicode_KIND(unicode);
5685
7.46M
    const void *data = PyUnicode_DATA(unicode);
5686
7.46M
    Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5687
5688
7.46M
    PyBytesWriter *writer;
5689
7.46M
    char *end;
5690
5691
7.46M
    switch (kind) {
5692
0
    default:
5693
0
        Py_UNREACHABLE();
5694
5.92M
    case PyUnicode_1BYTE_KIND:
5695
        /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5696
5.92M
        assert(!PyUnicode_IS_ASCII(unicode));
5697
5.92M
        writer = ucs1lib_utf8_encoder(unicode, data, size,
5698
5.92M
                                      error_handler, errors, &end);
5699
5.92M
        break;
5700
1.48M
    case PyUnicode_2BYTE_KIND:
5701
1.48M
        writer = ucs2lib_utf8_encoder(unicode, data, size,
5702
1.48M
                                      error_handler, errors, &end);
5703
1.48M
        break;
5704
63.6k
    case PyUnicode_4BYTE_KIND:
5705
63.6k
        writer = ucs4lib_utf8_encoder(unicode, data, size,
5706
63.6k
                                      error_handler, errors, &end);
5707
63.6k
        break;
5708
7.46M
    }
5709
5710
7.46M
    if (writer == NULL) {
5711
178k
        PyBytesWriter_Discard(writer);
5712
178k
        return NULL;
5713
178k
    }
5714
7.29M
    return PyBytesWriter_FinishWithPointer(writer, end);
5715
7.46M
}
5716
5717
static int
5718
unicode_fill_utf8(PyObject *unicode)
5719
186k
{
5720
186k
    _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(unicode);
5721
    /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5722
186k
    assert(!PyUnicode_IS_ASCII(unicode));
5723
5724
186k
    int kind = PyUnicode_KIND(unicode);
5725
186k
    const void *data = PyUnicode_DATA(unicode);
5726
186k
    Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5727
5728
186k
    PyBytesWriter *writer;
5729
186k
    char *end;
5730
5731
186k
    switch (kind) {
5732
0
    default:
5733
0
        Py_UNREACHABLE();
5734
151k
    case PyUnicode_1BYTE_KIND:
5735
151k
        writer = ucs1lib_utf8_encoder(unicode, data, size,
5736
151k
                                      _Py_ERROR_STRICT, NULL, &end);
5737
151k
        break;
5738
28.8k
    case PyUnicode_2BYTE_KIND:
5739
28.8k
        writer = ucs2lib_utf8_encoder(unicode, data, size,
5740
28.8k
                                      _Py_ERROR_STRICT, NULL, &end);
5741
28.8k
        break;
5742
5.80k
    case PyUnicode_4BYTE_KIND:
5743
5.80k
        writer = ucs4lib_utf8_encoder(unicode, data, size,
5744
5.80k
                                      _Py_ERROR_STRICT, NULL, &end);
5745
5.80k
        break;
5746
186k
    }
5747
186k
    if (writer == NULL) {
5748
207
        return -1;
5749
207
    }
5750
5751
186k
    const char *start = PyBytesWriter_GetData(writer);
5752
186k
    Py_ssize_t len = end - start;
5753
5754
186k
    char *cache = PyMem_Malloc(len + 1);
5755
186k
    if (cache == NULL) {
5756
0
        PyBytesWriter_Discard(writer);
5757
0
        PyErr_NoMemory();
5758
0
        return -1;
5759
0
    }
5760
186k
    memcpy(cache, start, len);
5761
186k
    cache[len] = '\0';
5762
186k
    PyUnicode_SET_UTF8_LENGTH(unicode, len);
5763
186k
    PyUnicode_SET_UTF8(unicode, cache);
5764
186k
    PyBytesWriter_Discard(writer);
5765
186k
    return 0;
5766
186k
}
5767
5768
/* Encode 'unicode' to UTF-8 through unicode_encode_utf8().  The error
   handler is passed as _Py_ERROR_UNKNOWN together with the 'errors' string.
   Returns whatever unicode_encode_utf8() returns. */
PyObject *
_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
{
    PyObject *encoded = unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
    return encoded;
}
5773
5774
5775
/* Encode 'unicode' to UTF-8 with no explicit error handler name
   (errors == NULL); thin wrapper around _PyUnicode_AsUTF8String(). */
PyObject *
PyUnicode_AsUTF8String(PyObject *unicode)
{
    const char *errors = NULL;
    return _PyUnicode_AsUTF8String(unicode, errors);
}
5780
5781
/* --- UTF-32 Codec ------------------------------------------------------- */
5782
5783
PyObject *
5784
PyUnicode_DecodeUTF32(const char *s,
5785
                      Py_ssize_t size,
5786
                      const char *errors,
5787
                      int *byteorder)
5788
118
{
5789
118
    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5790
118
}
5791
5792
PyObject *
5793
PyUnicode_DecodeUTF32Stateful(const char *s,
5794
                              Py_ssize_t size,
5795
                              const char *errors,
5796
                              int *byteorder,
5797
                              Py_ssize_t *consumed)
5798
32.9k
{
5799
32.9k
    const char *starts = s;
5800
32.9k
    Py_ssize_t startinpos;
5801
32.9k
    Py_ssize_t endinpos;
5802
32.9k
    _PyUnicodeWriter writer;
5803
32.9k
    const unsigned char *q, *e;
5804
32.9k
    int le, bo = 0;       /* assume native ordering by default */
5805
32.9k
    const char *encoding;
5806
32.9k
    const char *errmsg = "";
5807
32.9k
    PyObject *errorHandler = NULL;
5808
32.9k
    PyObject *exc = NULL;
5809
5810
32.9k
    q = (const unsigned char *)s;
5811
32.9k
    e = q + size;
5812
5813
32.9k
    if (byteorder)
5814
32.8k
        bo = *byteorder;
5815
5816
    /* Check for BOM marks (U+FEFF) in the input and adjust current
5817
       byte order setting accordingly. In native mode, the leading BOM
5818
       mark is skipped, in all other modes, it is copied to the output
5819
       stream as-is (giving a ZWNBSP character). */
5820
32.9k
    if (bo == 0 && size >= 4) {
5821
30.5k
        Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5822
30.5k
        if (bom == 0x0000FEFF) {
5823
83
            bo = -1;
5824
83
            q += 4;
5825
83
        }
5826
30.4k
        else if (bom == 0xFFFE0000) {
5827
248
            bo = 1;
5828
248
            q += 4;
5829
248
        }
5830
30.5k
        if (byteorder)
5831
30.4k
            *byteorder = bo;
5832
30.5k
    }
5833
5834
32.9k
    if (q == e) {
5835
103
        if (consumed)
5836
0
            *consumed = size;
5837
103
        _Py_RETURN_UNICODE_EMPTY();
5838
103
    }
5839
5840
#ifdef WORDS_BIGENDIAN
5841
    le = bo < 0;
5842
#else
5843
32.8k
    le = bo <= 0;
5844
32.8k
#endif
5845
32.8k
    encoding = le ? "utf-32-le" : "utf-32-be";
5846
5847
32.8k
    _PyUnicodeWriter_Init(&writer);
5848
32.8k
    writer.min_length = (e - q + 3) / 4;
5849
32.8k
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
5850
0
        goto onError;
5851
5852
121k
    while (1) {
5853
121k
        Py_UCS4 ch = 0;
5854
121k
        Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
5855
5856
121k
        if (e - q >= 4) {
5857
100k
            int kind = writer.kind;
5858
100k
            void *data = writer.data;
5859
100k
            const unsigned char *last = e - 4;
5860
100k
            Py_ssize_t pos = writer.pos;
5861
100k
            if (le) {
5862
119k
                do {
5863
119k
                    ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5864
119k
                    if (ch > maxch)
5865
96.1k
                        break;
5866
23.0k
                    if (kind != PyUnicode_1BYTE_KIND &&
5867
7.37k
                        Py_UNICODE_IS_SURROGATE(ch))
5868
189
                        break;
5869
22.8k
                    PyUnicode_WRITE(kind, data, pos++, ch);
5870
22.8k
                    q += 4;
5871
22.8k
                } while (q <= last);
5872
97.4k
            }
5873
3.32k
            else {
5874
6.25k
                do {
5875
6.25k
                    ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5876
6.25k
                    if (ch > maxch)
5877
3.07k
                        break;
5878
3.18k
                    if (kind != PyUnicode_1BYTE_KIND &&
5879
2.60k
                        Py_UNICODE_IS_SURROGATE(ch))
5880
103
                        break;
5881
3.08k
                    PyUnicode_WRITE(kind, data, pos++, ch);
5882
3.08k
                    q += 4;
5883
3.08k
                } while (q <= last);
5884
3.32k
            }
5885
100k
            writer.pos = pos;
5886
100k
        }
5887
5888
121k
        if (Py_UNICODE_IS_SURROGATE(ch)) {
5889
295
            errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
5890
295
            startinpos = ((const char *)q) - starts;
5891
295
            endinpos = startinpos + 4;
5892
295
        }
5893
120k
        else if (ch <= maxch) {
5894
21.5k
            if (q == e || consumed)
5895
3.61k
                break;
5896
            /* remaining bytes at the end? (size should be divisible by 4) */
5897
17.9k
            errmsg = "truncated data";
5898
17.9k
            startinpos = ((const char *)q) - starts;
5899
17.9k
            endinpos = ((const char *)e) - starts;
5900
17.9k
        }
5901
99.2k
        else {
5902
99.2k
            if (ch < 0x110000) {
5903
4.14k
                if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5904
0
                    goto onError;
5905
4.14k
                q += 4;
5906
4.14k
                continue;
5907
4.14k
            }
5908
95.0k
            errmsg = "code point not in range(0x110000)";
5909
95.0k
            startinpos = ((const char *)q) - starts;
5910
95.0k
            endinpos = startinpos + 4;
5911
95.0k
        }
5912
5913
        /* The remaining input chars are ignored if the callback
5914
           chooses to skip the input */
5915
113k
        if (unicode_decode_call_errorhandler_writer(
5916
113k
                errors, &errorHandler,
5917
113k
                encoding, errmsg,
5918
113k
                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
5919
113k
                &writer))
5920
29.2k
            goto onError;
5921
113k
    }
5922
5923
3.61k
    if (consumed)
5924
0
        *consumed = (const char *)q-starts;
5925
5926
3.61k
    Py_XDECREF(errorHandler);
5927
3.61k
    Py_XDECREF(exc);
5928
3.61k
    return _PyUnicodeWriter_Finish(&writer);
5929
5930
29.2k
  onError:
5931
29.2k
    _PyUnicodeWriter_Dealloc(&writer);
5932
29.2k
    Py_XDECREF(errorHandler);
5933
29.2k
    Py_XDECREF(exc);
5934
29.2k
    return NULL;
5935
32.8k
}
5936
5937
/* Encode the string 'str' to UTF-32 and return a new bytes object.

   byteorder: -1 selects "utf-32-le", 1 selects "utf-32-be"; 0 selects
       "utf-32", in which case a BOM (0xFEFF) is written first.
   errors: error handler name used for code points the kind-specific encoder
       stops at (reported as "surrogates not allowed").

   The error handler may return bytes, whose length must be a multiple of 4,
   or an ASCII-only string; any other replacement raises the encode
   exception.  Return NULL with an exception set on failure. */
PyObject *
_PyUnicode_EncodeUTF32(PyObject *str,
                       const char *errors,
                       int byteorder)
{
    if (!PyUnicode_Check(str)) {
        PyErr_BadArgument();
        return NULL;
    }
    int kind = PyUnicode_KIND(str);
    const void *data = PyUnicode_DATA(str);
    Py_ssize_t len = PyUnicode_GET_LENGTH(str);

    /* One extra 4-byte unit is needed for the BOM when byteorder == 0. */
    const int with_bom = (byteorder == 0);
    if (len > PY_SSIZE_T_MAX / 4 - with_bom) {
        return PyErr_NoMemory();
    }
    Py_ssize_t nsize = len + with_bom;

#if PY_LITTLE_ENDIAN
    int native_ordering = byteorder <= 0;
#else
    int native_ordering = byteorder >= 0;
#endif

    if (kind == PyUnicode_1BYTE_KIND) {
        // gh-139156: Don't use PyBytesWriter API here since it has an overhead
        // on short strings
        PyObject *result = PyBytes_FromStringAndSize(NULL, nsize * 4);
        if (result == NULL) {
            return NULL;
        }

        /* output buffer is 4-bytes aligned */
        assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(result), 4));
        uint32_t *dst = (uint32_t *)PyBytes_AS_STRING(result);
        if (with_bom) {
            *dst++ = 0xFEFF;
        }
        if (len > 0) {
            ucs1lib_utf32_encode((const Py_UCS1 *)data, len,
                                 &dst, native_ordering);
        }
        return result;
    }

    PyBytesWriter *writer = PyBytesWriter_Create(nsize * 4);
    if (writer == NULL) {
        return NULL;
    }

    /* output buffer is 4-bytes aligned */
    assert(_Py_IS_ALIGNED(PyBytesWriter_GetData(writer), 4));
    uint32_t *out = (uint32_t *)PyBytesWriter_GetData(writer);
    if (with_bom) {
        *out++ = 0xFEFF;
    }
    if (len == 0) {
        return PyBytesWriter_Finish(writer);
    }

    /* Codec name reported in encode errors. */
    const char *encoding;
    switch (byteorder) {
    case -1:
        encoding = "utf-32-le";
        break;
    case 1:
        encoding = "utf-32-be";
        break;
    default:
        encoding = "utf-32";
        break;
    }

    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *rep = NULL;

    Py_ssize_t pos = 0;
    while (pos < len) {
        /* Encode as many characters as the kind-specific encoder accepts;
           it returns how many input characters it consumed. */
        if (kind == PyUnicode_2BYTE_KIND) {
            pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        else {
            assert(kind == PyUnicode_4BYTE_KIND);
            pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        if (pos == len) {
            break;
        }

        /* The encoder stopped early: ask the error handler what to emit for
           the character at 'pos'. */
        Py_ssize_t newpos;
        rep = unicode_encode_call_errorhandler(
                errors, &errorHandler,
                encoding, "surrogates not allowed",
                str, &exc, pos, pos + 1, &newpos);
        if (rep == NULL) {
            goto error;
        }

        Py_ssize_t repsize, moreunits;
        const int rep_is_bytes = PyBytes_Check(rep);
        if (rep_is_bytes) {
            repsize = PyBytes_GET_SIZE(rep);
            /* The replacement must be a whole number of 4-byte units. */
            if (repsize & 3) {
                raise_encode_exception(&exc, encoding,
                                       str, pos, pos + 1,
                                       "surrogates not allowed");
                goto error;
            }
            moreunits = repsize / 4;
        }
        else {
            assert(PyUnicode_Check(rep));
            moreunits = repsize = PyUnicode_GET_LENGTH(rep);
            if (!PyUnicode_IS_ASCII(rep)) {
                raise_encode_exception(&exc, encoding,
                                       str, pos, pos + 1,
                                       "surrogates not allowed");
                goto error;
            }
        }
        /* Input characters skipped by the handler release their reserved
           output units. */
        moreunits += pos - newpos;
        pos = newpos;

        /* four bytes are reserved for each surrogate */
        if (moreunits > 0) {
            out = PyBytesWriter_GrowAndUpdatePointer(writer, 4 * moreunits, out);
            if (out == NULL) {
                goto error;
            }
        }

        if (rep_is_bytes) {
            memcpy(out, PyBytes_AS_STRING(rep), repsize);
            out += repsize / 4;
        }
        else {
            /* rep is unicode */
            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
            ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
                                 &out, native_ordering);
        }

        Py_CLEAR(rep);
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);

    /* Cut back to size actually needed. This is necessary for, for example,
       encoding of a string containing isolated surrogates and the 'ignore'
       handler is used. */
    return PyBytesWriter_FinishWithPointer(writer, out);

  error:
    Py_XDECREF(rep);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    PyBytesWriter_Discard(writer);
    return NULL;
}
6090
6091
/* Encode 'unicode' to UTF-32 with errors == NULL and byteorder == 0 (the
   "utf-32" codec, which writes a BOM); thin wrapper around
   _PyUnicode_EncodeUTF32(). */
PyObject *
PyUnicode_AsUTF32String(PyObject *unicode)
{
    const char *errors = NULL;
    const int byteorder = 0;
    return _PyUnicode_EncodeUTF32(unicode, errors, byteorder);
}
6096
6097
/* --- UTF-16 Codec ------------------------------------------------------- */
6098
6099
PyObject *
6100
PyUnicode_DecodeUTF16(const char *s,
6101
                      Py_ssize_t size,
6102
                      const char *errors,
6103
                      int *byteorder)
6104
96
{
6105
96
    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
6106
96
}
6107
6108
PyObject *
6109
PyUnicode_DecodeUTF16Stateful(const char *s,
6110
                              Py_ssize_t size,
6111
                              const char *errors,
6112
                              int *byteorder,
6113
                              Py_ssize_t *consumed)
6114
13.6k
{
6115
13.6k
    const char *starts = s;
6116
13.6k
    Py_ssize_t startinpos;
6117
13.6k
    Py_ssize_t endinpos;
6118
13.6k
    _PyUnicodeWriter writer;
6119
13.6k
    const unsigned char *q, *e;
6120
13.6k
    int bo = 0;       /* assume native ordering by default */
6121
13.6k
    int native_ordering;
6122
13.6k
    const char *errmsg = "";
6123
13.6k
    PyObject *errorHandler = NULL;
6124
13.6k
    PyObject *exc = NULL;
6125
13.6k
    const char *encoding;
6126
6127
13.6k
    q = (const unsigned char *)s;
6128
13.6k
    e = q + size;
6129
6130
13.6k
    if (byteorder)
6131
13.5k
        bo = *byteorder;
6132
6133
    /* Check for BOM marks (U+FEFF) in the input and adjust current
6134
       byte order setting accordingly. In native mode, the leading BOM
6135
       mark is skipped, in all other modes, it is copied to the output
6136
       stream as-is (giving a ZWNBSP character). */
6137
13.6k
    if (bo == 0 && size >= 2) {
6138
12.9k
        const Py_UCS4 bom = (q[1] << 8) | q[0];
6139
12.9k
        if (bom == 0xFEFF) {
6140
348
            q += 2;
6141
348
            bo = -1;
6142
348
        }
6143
12.6k
        else if (bom == 0xFFFE) {
6144
2.10k
            q += 2;
6145
2.10k
            bo = 1;
6146
2.10k
        }
6147
12.9k
        if (byteorder)
6148
12.8k
            *byteorder = bo;
6149
12.9k
    }
6150
6151
13.6k
    if (q == e) {
6152
76
        if (consumed)
6153
0
            *consumed = size;
6154
76
        _Py_RETURN_UNICODE_EMPTY();
6155
76
    }
6156
6157
13.6k
#if PY_LITTLE_ENDIAN
6158
13.6k
    native_ordering = bo <= 0;
6159
13.6k
    encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
6160
#else
6161
    native_ordering = bo >= 0;
6162
    encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
6163
#endif
6164
6165
    /* Note: size will always be longer than the resulting Unicode
6166
       character count normally.  Error handler will take care of
6167
       resizing when needed. */
6168
13.6k
    _PyUnicodeWriter_Init(&writer);
6169
13.6k
    writer.min_length = (e - q + 1) / 2;
6170
13.6k
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
6171
0
        goto onError;
6172
6173
52.5k
    while (1) {
6174
52.5k
        Py_UCS4 ch = 0;
6175
52.5k
        if (e - q >= 2) {
6176
45.2k
            int kind = writer.kind;
6177
45.2k
            if (kind == PyUnicode_1BYTE_KIND) {
6178
16.6k
                if (PyUnicode_IS_ASCII(writer.buffer))
6179
13.0k
                    ch = asciilib_utf16_decode(&q, e,
6180
13.0k
                            (Py_UCS1*)writer.data, &writer.pos,
6181
13.0k
                            native_ordering);
6182
3.60k
                else
6183
3.60k
                    ch = ucs1lib_utf16_decode(&q, e,
6184
3.60k
                            (Py_UCS1*)writer.data, &writer.pos,
6185
3.60k
                            native_ordering);
6186
28.5k
            } else if (kind == PyUnicode_2BYTE_KIND) {
6187
11.8k
                ch = ucs2lib_utf16_decode(&q, e,
6188
11.8k
                        (Py_UCS2*)writer.data, &writer.pos,
6189
11.8k
                        native_ordering);
6190
16.7k
            } else {
6191
16.7k
                assert(kind == PyUnicode_4BYTE_KIND);
6192
16.7k
                ch = ucs4lib_utf16_decode(&q, e,
6193
16.7k
                        (Py_UCS4*)writer.data, &writer.pos,
6194
16.7k
                        native_ordering);
6195
16.7k
            }
6196
45.2k
        }
6197
6198
52.5k
        switch (ch)
6199
52.5k
        {
6200
13.7k
        case 0:
6201
            /* remaining byte at the end? (size should be even) */
6202
13.7k
            if (q == e || consumed)
6203
8.51k
                goto End;
6204
5.23k
            errmsg = "truncated data";
6205
5.23k
            startinpos = ((const char *)q) - starts;
6206
5.23k
            endinpos = ((const char *)e) - starts;
6207
5.23k
            break;
6208
            /* The remaining input chars are ignored if the callback
6209
               chooses to skip the input */
6210
1.51k
        case 1:
6211
1.51k
            q -= 2;
6212
1.51k
            if (consumed)
6213
0
                goto End;
6214
1.51k
            errmsg = "unexpected end of data";
6215
1.51k
            startinpos = ((const char *)q) - starts;
6216
1.51k
            endinpos = ((const char *)e) - starts;
6217
1.51k
            break;
6218
14.8k
        case 2:
6219
14.8k
            errmsg = "illegal encoding";
6220
14.8k
            startinpos = ((const char *)q) - 2 - starts;
6221
14.8k
            endinpos = startinpos + 2;
6222
14.8k
            break;
6223
6.29k
        case 3:
6224
6.29k
            errmsg = "illegal UTF-16 surrogate";
6225
6.29k
            startinpos = ((const char *)q) - 4 - starts;
6226
6.29k
            endinpos = startinpos + 2;
6227
6.29k
            break;
6228
16.1k
        default:
6229
16.1k
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
6230
0
                goto onError;
6231
16.1k
            continue;
6232
52.5k
        }
6233
6234
27.8k
        if (unicode_decode_call_errorhandler_writer(
6235
27.8k
                errors,
6236
27.8k
                &errorHandler,
6237
27.8k
                encoding, errmsg,
6238
27.8k
                &starts,
6239
27.8k
                (const char **)&e,
6240
27.8k
                &startinpos,
6241
27.8k
                &endinpos,
6242
27.8k
                &exc,
6243
27.8k
                (const char **)&q,
6244
27.8k
                &writer))
6245
5.08k
            goto onError;
6246
27.8k
    }
6247
6248
8.51k
End:
6249
8.51k
    if (consumed)
6250
0
        *consumed = (const char *)q-starts;
6251
6252
8.51k
    Py_XDECREF(errorHandler);
6253
8.51k
    Py_XDECREF(exc);
6254
8.51k
    return _PyUnicodeWriter_Finish(&writer);
6255
6256
5.08k
  onError:
6257
5.08k
    _PyUnicodeWriter_Dealloc(&writer);
6258
5.08k
    Py_XDECREF(errorHandler);
6259
5.08k
    Py_XDECREF(exc);
6260
5.08k
    return NULL;
6261
13.6k
}
6262
6263
PyObject *
_PyUnicode_EncodeUTF16(PyObject *str,
                       const char *errors,
                       int byteorder)
{
    /* Encode `str` as UTF-16 and return a new bytes object (NULL with an
     * exception set on failure).
     *
     *   byteorder < 0   encoding is reported as "utf-16-le", no BOM
     *   byteorder > 0   encoding is reported as "utf-16-be", no BOM
     *   byteorder == 0  encoding is reported as "utf-16"; a 0xFEFF unit is
     *                   written first
     *
     * `errors` is forwarded to unicode_encode_call_errorhandler() whenever
     * the ucsNlib encoder stops before the end of the input (presumably on
     * a lone surrogate, judging by the error message used below). */
    if (!PyUnicode_Check(str)) {
        PyErr_BadArgument();
        return NULL;
    }

    int kind = PyUnicode_KIND(str);
    const void *data = PyUnicode_DATA(str);
    Py_ssize_t len = PyUnicode_GET_LENGTH(str);
    /* 1 when a BOM unit has to be emitted, 0 otherwise. */
    const int bom_units = (byteorder == 0);

    /* Code points >= 0x10000 take one extra 16-bit unit each; only a
     * 4-byte string can contain them. */
    Py_ssize_t pairs = 0;
    if (kind == PyUnicode_4BYTE_KIND) {
        const Py_UCS4 *ucs4 = (const Py_UCS4 *)data;
        for (Py_ssize_t i = 0; i < len; i++) {
            if (ucs4[i] >= 0x10000) {
                pairs++;
            }
        }
    }

    /* Refuse sizes for which nsize * 2 would not fit in Py_ssize_t. */
    if (len > PY_SSIZE_T_MAX / 2 - pairs - bom_units) {
        return PyErr_NoMemory();
    }
    Py_ssize_t nsize = len + pairs + bom_units;

#if PY_BIG_ENDIAN
    int native_ordering = byteorder >= 0;
#else
    int native_ordering = byteorder <= 0;
#endif

    if (kind == PyUnicode_1BYTE_KIND) {
        // gh-139156: the plain bytes constructor is used here rather than
        // the PyBytesWriter API because the latter has an overhead on
        // short strings.
        PyObject *bytes = PyBytes_FromStringAndSize(NULL, nsize * 2);
        if (bytes == NULL) {
            return NULL;
        }

        /* The 16-bit stores below rely on a 2-byte aligned buffer. */
        assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(bytes), 2));
        unsigned short *out = (unsigned short *)PyBytes_AS_STRING(bytes);
        if (bom_units) {
            *out++ = 0xFEFF;
        }
        if (len > 0) {
            ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out,
                                 native_ordering);
        }
        return bytes;
    }

    PyBytesWriter *writer = PyBytesWriter_Create(nsize * 2);
    if (writer == NULL) {
        return NULL;
    }

    /* The 16-bit stores below rely on a 2-byte aligned buffer. */
    assert(_Py_IS_ALIGNED(PyBytesWriter_GetData(writer), 2));
    unsigned short *out = PyBytesWriter_GetData(writer);
    if (bom_units) {
        *out++ = 0xFEFF;
    }
    if (len == 0) {
        return PyBytesWriter_Finish(writer);
    }

    /* Codec name handed to the error handler / exception. */
    const char *encoding = (byteorder < 0) ? "utf-16-le"
                         : (byteorder > 0) ? "utf-16-be"
                         : "utf-16";

    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *rep = NULL;

    Py_ssize_t pos = 0;
    while (pos < len) {
        /* Encode as far as the helper gets; it returns the number of
         * input characters it consumed. */
        if (kind == PyUnicode_2BYTE_KIND) {
            pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos,
                                        len - pos, &out, native_ordering);
        }
        else {
            assert(kind == PyUnicode_4BYTE_KIND);
            pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos,
                                        len - pos, &out, native_ordering);
        }
        if (pos == len) {
            break;
        }

        /* The encoder stopped early: ask the error handler what to emit
         * for the character at `pos`. */
        Py_ssize_t newpos;
        rep = unicode_encode_call_errorhandler(
                errors, &errorHandler,
                encoding, "surrogates not allowed",
                str, &exc, pos, pos + 1, &newpos);
        if (rep == NULL) {
            goto error;
        }

        /* A bytes replacement must consist of whole 16-bit units; a str
         * replacement must be pure ASCII. */
        Py_ssize_t repsize, moreunits;
        int rep_valid;
        const int rep_is_bytes = PyBytes_Check(rep);
        if (rep_is_bytes) {
            repsize = PyBytes_GET_SIZE(rep);
            rep_valid = ((repsize & 1) == 0);
            moreunits = repsize / 2;
        }
        else {
            assert(PyUnicode_Check(rep));
            repsize = PyUnicode_GET_LENGTH(rep);
            rep_valid = PyUnicode_IS_ASCII(rep);
            moreunits = repsize;
        }
        if (!rep_valid) {
            raise_encode_exception(&exc, encoding,
                                   str, pos, pos + 1,
                                   "surrogates not allowed");
            goto error;
        }

        /* Units already budgeted for the characters being skipped are
         * credited against the replacement's needs. */
        moreunits += pos - newpos;
        pos = newpos;

        /* two bytes are reserved for each surrogate */
        if (moreunits > 0) {
            out = PyBytesWriter_GrowAndUpdatePointer(writer, 2 * moreunits,
                                                     out);
            if (out == NULL) {
                goto error;
            }
        }

        if (rep_is_bytes) {
            /* Raw bytes are copied verbatim. */
            memcpy(out, PyBytes_AS_STRING(rep), repsize);
            out += repsize / 2;
        }
        else {
            /* ASCII str replacement: widen through the UCS1 encoder. */
            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
            ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
                                 &out, native_ordering);
        }

        Py_CLEAR(rep);
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);

    /* Trim to what was actually written; the output can be shorter than
     * the estimate, e.g. when isolated surrogates are dropped by the
     * 'ignore' handler. */
    return PyBytesWriter_FinishWithPointer(writer, out);

  error:
    Py_XDECREF(rep);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    PyBytesWriter_Discard(writer);
    return NULL;
}
6428
6429
PyObject *
PyUnicode_AsUTF16String(PyObject *unicode)
{
    /* Convenience wrapper around _PyUnicode_EncodeUTF16() with no error
     * handler name (NULL) and byteorder 0. */
    const char *errors = NULL;
    const int byteorder = 0;

    return _PyUnicode_EncodeUTF16(unicode, errors, byteorder);
}
6434
6435
_PyUnicode_Name_CAPI *
6436
_PyUnicode_GetNameCAPI(void)
6437
1.61k
{
6438
1.61k
    PyInterpreterState *interp = _PyInterpreterState_GET();
6439
1.61k
    _PyUnicode_Name_CAPI *ucnhash_capi;
6440
6441
1.61k
    ucnhash_capi = _Py_atomic_load_ptr(&interp->unicode.ucnhash_capi);
6442
1.61k
    if (ucnhash_capi == NULL) {
6443
1
        ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6444
1
                PyUnicodeData_CAPSULE_NAME, 1);
6445
6446
        // It's fine if we overwrite the value here. It's always the same value.
6447
1
        _Py_atomic_store_ptr(&interp->unicode.ucnhash_capi, ucnhash_capi);
6448
1
    }
6449
1.61k
    return ucnhash_capi;
6450
1.61k
}
6451
6452
/* --- Unicode Escape Codec ----------------------------------------------- */
6453
6454
PyObject *
6455
_PyUnicode_DecodeUnicodeEscapeInternal2(const char *s,
6456
                               Py_ssize_t size,
6457
                               const char *errors,
6458
                               Py_ssize_t *consumed,
6459
                               int *first_invalid_escape_char,
6460
                               const char **first_invalid_escape_ptr)
6461
30.7k
{
6462
30.7k
    const char *starts = s;
6463
30.7k
    const char *initial_starts = starts;
6464
30.7k
    _PyUnicodeWriter writer;
6465
30.7k
    const char *end;
6466
30.7k
    PyObject *errorHandler = NULL;
6467
30.7k
    PyObject *exc = NULL;
6468
30.7k
    _PyUnicode_Name_CAPI *ucnhash_capi;
6469
6470
    // so we can remember if we've seen an invalid escape char or not
6471
30.7k
    *first_invalid_escape_char = -1;
6472
30.7k
    *first_invalid_escape_ptr = NULL;
6473
6474
30.7k
    if (size == 0) {
6475
1.78k
        if (consumed) {
6476
0
            *consumed = 0;
6477
0
        }
6478
1.78k
        _Py_RETURN_UNICODE_EMPTY();
6479
1.78k
    }
6480
    /* Escaped strings will always be longer than the resulting
6481
       Unicode string, so we start with size here and then reduce the
6482
       length after conversion to the true value.
6483
       (but if the error callback returns a long replacement string
6484
       we'll have to allocate more space) */
6485
28.9k
    _PyUnicodeWriter_Init(&writer);
6486
28.9k
    writer.min_length = size;
6487
28.9k
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6488
0
        goto onError;
6489
0
    }
6490
6491
28.9k
    end = s + size;
6492
175k
    while (s < end) {
6493
146k
        unsigned char c = (unsigned char) *s++;
6494
146k
        Py_UCS4 ch;
6495
146k
        int count;
6496
146k
        const char *message;
6497
6498
146k
#define WRITE_ASCII_CHAR(ch)                                                  \
6499
146k
            do {                                                              \
6500
15.2k
                assert(ch <= 127);                                            \
6501
15.2k
                assert(writer.pos < writer.size);                             \
6502
15.2k
                PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch);  \
6503
15.2k
            } while(0)
6504
6505
146k
#define WRITE_CHAR(ch)                                                        \
6506
146k
            do {                                                              \
6507
135k
                if (ch <= writer.maxchar) {                                   \
6508
120k
                    assert(writer.pos < writer.size);                         \
6509
120k
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6510
120k
                }                                                             \
6511
135k
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6512
0
                    goto onError;                                             \
6513
0
                }                                                             \
6514
135k
            } while(0)
6515
6516
        /* Non-escape characters are interpreted as Unicode ordinals */
6517
146k
        if (c != '\\') {
6518
96.1k
            WRITE_CHAR(c);
6519
96.1k
            continue;
6520
96.1k
        }
6521
6522
50.7k
        Py_ssize_t startinpos = s - starts - 1;
6523
        /* \ - Escapes */
6524
50.7k
        if (s >= end) {
6525
0
            message = "\\ at end of string";
6526
0
            goto incomplete;
6527
0
        }
6528
50.7k
        c = (unsigned char) *s++;
6529
6530
50.7k
        assert(writer.pos < writer.size);
6531
50.7k
        switch (c) {
6532
6533
            /* \x escapes */
6534
670
        case '\n': continue;
6535
1.47k
        case '\\': WRITE_ASCII_CHAR('\\'); continue;
6536
911
        case '\'': WRITE_ASCII_CHAR('\''); continue;
6537
1.18k
        case '\"': WRITE_ASCII_CHAR('\"'); continue;
6538
1.02k
        case 'b': WRITE_ASCII_CHAR('\b'); continue;
6539
        /* FF */
6540
735
        case 'f': WRITE_ASCII_CHAR('\014'); continue;
6541
933
        case 't': WRITE_ASCII_CHAR('\t'); continue;
6542
982
        case 'n': WRITE_ASCII_CHAR('\n'); continue;
6543
1.46k
        case 'r': WRITE_ASCII_CHAR('\r'); continue;
6544
        /* VT */
6545
828
        case 'v': WRITE_ASCII_CHAR('\013'); continue;
6546
        /* BEL, not classic C */
6547
706
        case 'a': WRITE_ASCII_CHAR('\007'); continue;
6548
6549
            /* \OOO (octal) escapes */
6550
3.76k
        case '0': case '1': case '2': case '3':
6551
6.74k
        case '4': case '5': case '6': case '7':
6552
6.74k
            ch = c - '0';
6553
6.74k
            if (s < end && '0' <= *s && *s <= '7') {
6554
2.47k
                ch = (ch<<3) + *s++ - '0';
6555
2.47k
                if (s < end && '0' <= *s && *s <= '7') {
6556
1.23k
                    ch = (ch<<3) + *s++ - '0';
6557
1.23k
                }
6558
2.47k
            }
6559
6.74k
            if (ch > 0377) {
6560
1.07k
                if (*first_invalid_escape_char == -1) {
6561
747
                    *first_invalid_escape_char = ch;
6562
747
                    if (starts == initial_starts) {
6563
                        /* Back up 3 chars, since we've already incremented s. */
6564
747
                        *first_invalid_escape_ptr = s - 3;
6565
747
                    }
6566
747
                }
6567
1.07k
            }
6568
6.74k
            WRITE_CHAR(ch);
6569
6.74k
            continue;
6570
6571
            /* hex escapes */
6572
            /* \xXX */
6573
6.74k
        case 'x':
6574
6.01k
            count = 2;
6575
6.01k
            message = "truncated \\xXX escape";
6576
6.01k
            goto hexescape;
6577
6578
            /* \uXXXX */
6579
9.24k
        case 'u':
6580
9.24k
            count = 4;
6581
9.24k
            message = "truncated \\uXXXX escape";
6582
9.24k
            goto hexescape;
6583
6584
            /* \UXXXXXXXX */
6585
11.1k
        case 'U':
6586
11.1k
            count = 8;
6587
11.1k
            message = "truncated \\UXXXXXXXX escape";
6588
26.4k
        hexescape:
6589
164k
            for (ch = 0; count; ++s, --count) {
6590
138k
                if (s >= end) {
6591
6
                    goto incomplete;
6592
6
                }
6593
138k
                c = (unsigned char)*s;
6594
138k
                ch <<= 4;
6595
138k
                if (c >= '0' && c <= '9') {
6596
106k
                    ch += c - '0';
6597
106k
                }
6598
31.9k
                else if (c >= 'a' && c <= 'f') {
6599
31.6k
                    ch += c - ('a' - 10);
6600
31.6k
                }
6601
258
                else if (c >= 'A' && c <= 'F') {
6602
249
                    ch += c - ('A' - 10);
6603
249
                }
6604
9
                else {
6605
9
                    goto error;
6606
9
                }
6607
138k
            }
6608
6609
            /* when we get here, ch is a 32-bit unicode character */
6610
26.4k
            if (ch > MAX_UNICODE) {
6611
1
                message = "illegal Unicode character";
6612
1
                goto error;
6613
1
            }
6614
6615
26.4k
            WRITE_CHAR(ch);
6616
26.4k
            continue;
6617
6618
            /* \N{name} */
6619
26.4k
        case 'N':
6620
1.61k
            ucnhash_capi = _PyUnicode_GetNameCAPI();
6621
1.61k
            if (ucnhash_capi == NULL) {
6622
0
                PyErr_SetString(
6623
0
                        PyExc_UnicodeError,
6624
0
                        "\\N escapes not supported (can't load unicodedata module)"
6625
0
                );
6626
0
                goto onError;
6627
0
            }
6628
6629
1.61k
            message = "malformed \\N character escape";
6630
1.61k
            if (s >= end) {
6631
4
                goto incomplete;
6632
4
            }
6633
1.61k
            if (*s == '{') {
6634
1.61k
                const char *start = ++s;
6635
1.61k
                size_t namelen;
6636
                /* look for the closing brace */
6637
22.3k
                while (s < end && *s != '}')
6638
20.7k
                    s++;
6639
1.61k
                if (s >= end) {
6640
14
                    goto incomplete;
6641
14
                }
6642
1.59k
                namelen = s - start;
6643
1.59k
                if (namelen) {
6644
                    /* found a name.  look it up in the unicode database */
6645
1.59k
                    s++;
6646
1.59k
                    ch = 0xffffffff; /* in case 'getcode' messes up */
6647
1.59k
                    if (namelen <= INT_MAX &&
6648
1.59k
                        ucnhash_capi->getcode(start, (int)namelen,
6649
1.59k
                                              &ch, 0)) {
6650
1.53k
                        assert(ch <= MAX_UNICODE);
6651
1.53k
                        WRITE_CHAR(ch);
6652
1.53k
                        continue;
6653
1.53k
                    }
6654
64
                    message = "unknown Unicode character name";
6655
64
                }
6656
1.59k
            }
6657
70
            goto error;
6658
6659
5.01k
        default:
6660
5.01k
            if (*first_invalid_escape_char == -1) {
6661
3.73k
                *first_invalid_escape_char = c;
6662
3.73k
                if (starts == initial_starts) {
6663
                    /* Back up one char, since we've already incremented s. */
6664
3.73k
                    *first_invalid_escape_ptr = s - 1;
6665
3.73k
                }
6666
3.73k
            }
6667
5.01k
            WRITE_ASCII_CHAR('\\');
6668
5.01k
            WRITE_CHAR(c);
6669
5.01k
            continue;
6670
50.7k
        }
6671
6672
24
      incomplete:
6673
24
        if (consumed) {
6674
0
            *consumed = startinpos;
6675
0
            break;
6676
0
        }
6677
104
      error:;
6678
104
        Py_ssize_t endinpos = s-starts;
6679
104
        writer.min_length = end - s + writer.pos;
6680
104
        if (unicode_decode_call_errorhandler_writer(
6681
104
                errors, &errorHandler,
6682
104
                "unicodeescape", message,
6683
104
                &starts, &end, &startinpos, &endinpos, &exc, &s,
6684
104
                &writer)) {
6685
104
            goto onError;
6686
104
        }
6687
104
        assert(end - s <= writer.size - writer.pos);
6688
6689
0
#undef WRITE_ASCII_CHAR
6690
0
#undef WRITE_CHAR
6691
0
    }
6692
6693
28.8k
    Py_XDECREF(errorHandler);
6694
28.8k
    Py_XDECREF(exc);
6695
28.8k
    return _PyUnicodeWriter_Finish(&writer);
6696
6697
104
  onError:
6698
104
    _PyUnicodeWriter_Dealloc(&writer);
6699
104
    Py_XDECREF(errorHandler);
6700
104
    Py_XDECREF(exc);
6701
104
    return NULL;
6702
28.9k
}
6703
6704
PyObject *
6705
_PyUnicode_DecodeUnicodeEscapeStateful(const char *s,
6706
                              Py_ssize_t size,
6707
                              const char *errors,
6708
                              Py_ssize_t *consumed)
6709
0
{
6710
0
    int first_invalid_escape_char;
6711
0
    const char *first_invalid_escape_ptr;
6712
0
    PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal2(s, size, errors,
6713
0
                                                      consumed,
6714
0
                                                      &first_invalid_escape_char,
6715
0
                                                      &first_invalid_escape_ptr);
6716
0
    if (result == NULL)
6717
0
        return NULL;
6718
0
    if (first_invalid_escape_char != -1) {
6719
0
        if (first_invalid_escape_char > 0xff) {
6720
0
            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6721
0
                                 "\"\\%o\" is an invalid octal escape sequence. "
6722
0
                                 "Such sequences will not work in the future. ",
6723
0
                                 first_invalid_escape_char) < 0)
6724
0
            {
6725
0
                Py_DECREF(result);
6726
0
                return NULL;
6727
0
            }
6728
0
        }
6729
0
        else {
6730
0
            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6731
0
                                 "\"\\%c\" is an invalid escape sequence. "
6732
0
                                 "Such sequences will not work in the future. ",
6733
0
                                 first_invalid_escape_char) < 0)
6734
0
            {
6735
0
                Py_DECREF(result);
6736
0
                return NULL;
6737
0
            }
6738
0
        }
6739
0
    }
6740
0
    return result;
6741
0
}
6742
6743
PyObject *
6744
PyUnicode_DecodeUnicodeEscape(const char *s,
6745
                              Py_ssize_t size,
6746
                              const char *errors)
6747
0
{
6748
0
    return _PyUnicode_DecodeUnicodeEscapeStateful(s, size, errors, NULL);
6749
0
}
6750
6751
/* Return a Unicode-Escape string version of the Unicode object. */
6752
6753
PyObject *
6754
PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
6755
312k
{
6756
312k
    if (!PyUnicode_Check(unicode)) {
6757
0
        PyErr_BadArgument();
6758
0
        return NULL;
6759
0
    }
6760
6761
312k
    Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
6762
312k
    if (len == 0) {
6763
0
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
6764
0
    }
6765
312k
    int kind = PyUnicode_KIND(unicode);
6766
312k
    const void *data = PyUnicode_DATA(unicode);
6767
6768
    /* Initial allocation is based on the longest-possible character
6769
     * escape.
6770
     *
6771
     * For UCS1 strings it's '\xxx', 4 bytes per source character.
6772
     * For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6773
     * For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character. */
6774
312k
    Py_ssize_t expandsize = kind * 2 + 2;
6775
312k
    if (len > PY_SSIZE_T_MAX / expandsize) {
6776
0
        return PyErr_NoMemory();
6777
0
    }
6778
6779
312k
    PyBytesWriter *writer = PyBytesWriter_Create(expandsize * len);
6780
312k
    if (writer == NULL) {
6781
0
        return NULL;
6782
0
    }
6783
312k
    char *p = PyBytesWriter_GetData(writer);
6784
6785
624k
    for (Py_ssize_t i = 0; i < len; i++) {
6786
312k
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6787
6788
        /* U+0000-U+00ff range */
6789
312k
        if (ch < 0x100) {
6790
305k
            if (ch >= ' ' && ch < 127) {
6791
24.0k
                if (ch != '\\') {
6792
                    /* Copy printable US ASCII as-is */
6793
0
                    *p++ = (char) ch;
6794
0
                }
6795
                /* Escape backslashes */
6796
24.0k
                else {
6797
24.0k
                    *p++ = '\\';
6798
24.0k
                    *p++ = '\\';
6799
24.0k
                }
6800
24.0k
            }
6801
6802
            /* Map special whitespace to '\t', \n', '\r' */
6803
281k
            else if (ch == '\t') {
6804
2.81k
                *p++ = '\\';
6805
2.81k
                *p++ = 't';
6806
2.81k
            }
6807
278k
            else if (ch == '\n') {
6808
4.18k
                *p++ = '\\';
6809
4.18k
                *p++ = 'n';
6810
4.18k
            }
6811
274k
            else if (ch == '\r') {
6812
521
                *p++ = '\\';
6813
521
                *p++ = 'r';
6814
521
            }
6815
6816
            /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6817
273k
            else {
6818
273k
                *p++ = '\\';
6819
273k
                *p++ = 'x';
6820
273k
                *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6821
273k
                *p++ = Py_hexdigits[ch & 0x000F];
6822
273k
            }
6823
305k
        }
6824
        /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
6825
6.77k
        else if (ch < 0x10000) {
6826
5.57k
            *p++ = '\\';
6827
5.57k
            *p++ = 'u';
6828
5.57k
            *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6829
5.57k
            *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6830
5.57k
            *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6831
5.57k
            *p++ = Py_hexdigits[ch & 0x000F];
6832
5.57k
        }
6833
        /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6834
1.19k
        else {
6835
6836
            /* Make sure that the first two digits are zero */
6837
1.19k
            assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6838
1.19k
            *p++ = '\\';
6839
1.19k
            *p++ = 'U';
6840
1.19k
            *p++ = '0';
6841
1.19k
            *p++ = '0';
6842
1.19k
            *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6843
1.19k
            *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6844
1.19k
            *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6845
1.19k
            *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6846
1.19k
            *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6847
1.19k
            *p++ = Py_hexdigits[ch & 0x0000000F];
6848
1.19k
        }
6849
312k
    }
6850
6851
312k
    return PyBytesWriter_FinishWithPointer(writer, p);
6852
312k
}
6853
6854
/* --- Raw Unicode Escape Codec ------------------------------------------- */
6855
6856
PyObject *
6857
_PyUnicode_DecodeRawUnicodeEscapeStateful(const char *s,
6858
                                          Py_ssize_t size,
6859
                                          const char *errors,
6860
                                          Py_ssize_t *consumed)
6861
0
{
6862
0
    const char *starts = s;
6863
0
    _PyUnicodeWriter writer;
6864
0
    const char *end;
6865
0
    PyObject *errorHandler = NULL;
6866
0
    PyObject *exc = NULL;
6867
6868
0
    if (size == 0) {
6869
0
        if (consumed) {
6870
0
            *consumed = 0;
6871
0
        }
6872
0
        _Py_RETURN_UNICODE_EMPTY();
6873
0
    }
6874
6875
    /* Escaped strings will always be longer than the resulting
6876
       Unicode string, so we start with size here and then reduce the
6877
       length after conversion to the true value. (But decoding error
6878
       handler might have to resize the string) */
6879
0
    _PyUnicodeWriter_Init(&writer);
6880
0
    writer.min_length = size;
6881
0
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6882
0
        goto onError;
6883
0
    }
6884
6885
0
    end = s + size;
6886
0
    while (s < end) {
6887
0
        unsigned char c = (unsigned char) *s++;
6888
0
        Py_UCS4 ch;
6889
0
        int count;
6890
0
        const char *message;
6891
6892
0
#define WRITE_CHAR(ch)                                                        \
6893
0
            do {                                                              \
6894
0
                if (ch <= writer.maxchar) {                                   \
6895
0
                    assert(writer.pos < writer.size);                         \
6896
0
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6897
0
                }                                                             \
6898
0
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6899
0
                    goto onError;                                             \
6900
0
                }                                                             \
6901
0
            } while(0)
6902
6903
        /* Non-escape characters are interpreted as Unicode ordinals */
6904
0
        if (c != '\\' || (s >= end && !consumed)) {
6905
0
            WRITE_CHAR(c);
6906
0
            continue;
6907
0
        }
6908
6909
0
        Py_ssize_t startinpos = s - starts - 1;
6910
        /* \ - Escapes */
6911
0
        if (s >= end) {
6912
0
            assert(consumed);
6913
            // Set message to silent compiler warning.
6914
            // Actually it is never used.
6915
0
            message = "\\ at end of string";
6916
0
            goto incomplete;
6917
0
        }
6918
6919
0
        c = (unsigned char) *s++;
6920
0
        if (c == 'u') {
6921
0
            count = 4;
6922
0
            message = "truncated \\uXXXX escape";
6923
0
        }
6924
0
        else if (c == 'U') {
6925
0
            count = 8;
6926
0
            message = "truncated \\UXXXXXXXX escape";
6927
0
        }
6928
0
        else {
6929
0
            assert(writer.pos < writer.size);
6930
0
            PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6931
0
            WRITE_CHAR(c);
6932
0
            continue;
6933
0
        }
6934
6935
        /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6936
0
        for (ch = 0; count; ++s, --count) {
6937
0
            if (s >= end) {
6938
0
                goto incomplete;
6939
0
            }
6940
0
            c = (unsigned char)*s;
6941
0
            ch <<= 4;
6942
0
            if (c >= '0' && c <= '9') {
6943
0
                ch += c - '0';
6944
0
            }
6945
0
            else if (c >= 'a' && c <= 'f') {
6946
0
                ch += c - ('a' - 10);
6947
0
            }
6948
0
            else if (c >= 'A' && c <= 'F') {
6949
0
                ch += c - ('A' - 10);
6950
0
            }
6951
0
            else {
6952
0
                goto error;
6953
0
            }
6954
0
        }
6955
0
        if (ch > MAX_UNICODE) {
6956
0
            message = "\\Uxxxxxxxx out of range";
6957
0
            goto error;
6958
0
        }
6959
0
        WRITE_CHAR(ch);
6960
0
        continue;
6961
6962
0
      incomplete:
6963
0
        if (consumed) {
6964
0
            *consumed = startinpos;
6965
0
            break;
6966
0
        }
6967
0
      error:;
6968
0
        Py_ssize_t endinpos = s-starts;
6969
0
        writer.min_length = end - s + writer.pos;
6970
0
        if (unicode_decode_call_errorhandler_writer(
6971
0
                errors, &errorHandler,
6972
0
                "rawunicodeescape", message,
6973
0
                &starts, &end, &startinpos, &endinpos, &exc, &s,
6974
0
                &writer)) {
6975
0
            goto onError;
6976
0
        }
6977
0
        assert(end - s <= writer.size - writer.pos);
6978
6979
0
#undef WRITE_CHAR
6980
0
    }
6981
0
    Py_XDECREF(errorHandler);
6982
0
    Py_XDECREF(exc);
6983
0
    return _PyUnicodeWriter_Finish(&writer);
6984
6985
0
  onError:
6986
0
    _PyUnicodeWriter_Dealloc(&writer);
6987
0
    Py_XDECREF(errorHandler);
6988
0
    Py_XDECREF(exc);
6989
0
    return NULL;
6990
0
}
6991
6992
PyObject *
6993
PyUnicode_DecodeRawUnicodeEscape(const char *s,
6994
                                 Py_ssize_t size,
6995
                                 const char *errors)
6996
0
{
6997
0
    return _PyUnicode_DecodeRawUnicodeEscapeStateful(s, size, errors, NULL);
6998
0
}
6999
7000
7001
/* Encode `unicode` with the raw-unicode-escape codec and return a bytes
   object.

   Code points below 0x100 are copied as single bytes, code points below
   0x10000 become '\uHHHH' and everything else becomes '\U00HHHHHH'.
   Calls PyErr_BadArgument() and returns NULL if `unicode` is not a str;
   returns NULL as well if the output writer cannot be created. */
PyObject *
PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    const int kind = PyUnicode_KIND(unicode);
    const void *data = PyUnicode_DATA(unicode);
    const Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);

    if (len == 0) {
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
    }
    if (kind == PyUnicode_1BYTE_KIND) {
        /* Every character takes the "copy as-is" path, so the character
           buffer is the encoded result. */
        return PyBytes_FromStringAndSize(data, len);
    }

    /* Worst-case output bytes per character: 6 ('\uHHHH') for the 2-byte
       kind and 10 ('\U00HHHHHH') for the 4-byte kind. */
    const Py_ssize_t expandsize = 2 + 2 * kind;
    if (len > PY_SSIZE_T_MAX / expandsize) {
        return PyErr_NoMemory();
    }

    PyBytesWriter *writer = PyBytesWriter_Create(expandsize * len);
    if (writer == NULL) {
        return NULL;
    }
    char *out = PyBytesWriter_GetData(writer);

    Py_ssize_t i = 0;
    while (i < len) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
        i++;

        if (ch >= 0x10000) {
            /* U+010000-U+10ffff: '\U00HHHHHH' */
            assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
            out[0] = '\\';
            out[1] = 'U';
            out[2] = '0';
            out[3] = '0';
            out[4] = Py_hexdigits[(ch >> 20) & 0xf];
            out[5] = Py_hexdigits[(ch >> 16) & 0xf];
            out[6] = Py_hexdigits[(ch >> 12) & 0xf];
            out[7] = Py_hexdigits[(ch >> 8) & 0xf];
            out[8] = Py_hexdigits[(ch >> 4) & 0xf];
            out[9] = Py_hexdigits[ch & 15];
            out += 10;
        }
        else if (ch >= 0x100) {
            /* U+0100-U+ffff: '\uHHHH' */
            out[0] = '\\';
            out[1] = 'u';
            out[2] = Py_hexdigits[(ch >> 12) & 0xf];
            out[3] = Py_hexdigits[(ch >> 8) & 0xf];
            out[4] = Py_hexdigits[(ch >> 4) & 0xf];
            out[5] = Py_hexdigits[ch & 15];
            out += 6;
        }
        else {
            /* U+0000-U+00ff: emitted unchanged */
            *out++ = (char) ch;
        }
    }

    /* `out` marks the end of the bytes actually written. */
    return PyBytesWriter_FinishWithPointer(writer, out);
}
7065
7066
/* --- Latin-1 Codec ------------------------------------------------------ */
7067
7068
PyObject *
7069
PyUnicode_DecodeLatin1(const char *s,
7070
                       Py_ssize_t size,
7071
                       const char *errors)
7072
3.69M
{
7073
    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
7074
3.69M
    return _PyUnicode_FromUCS1((const unsigned char*)s, size);
7075
3.69M
}
7076
7077
/* create or adjust a UnicodeEncodeError */
7078
static void
7079
make_encode_exception(PyObject **exceptionObject,
7080
                      const char *encoding,
7081
                      PyObject *unicode,
7082
                      Py_ssize_t startpos, Py_ssize_t endpos,
7083
                      const char *reason)
7084
229k
{
7085
229k
    if (*exceptionObject == NULL) {
7086
229k
        *exceptionObject = PyObject_CallFunction(
7087
229k
            PyExc_UnicodeEncodeError, "sOnns",
7088
229k
            encoding, unicode, startpos, endpos, reason);
7089
229k
    }
7090
0
    else {
7091
0
        if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
7092
0
            goto onError;
7093
0
        if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
7094
0
            goto onError;
7095
0
        if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
7096
0
            goto onError;
7097
0
        return;
7098
0
      onError:
7099
0
        Py_CLEAR(*exceptionObject);
7100
0
    }
7101
229k
}
7102
7103
/* raises a UnicodeEncodeError */
7104
static void
7105
raise_encode_exception(PyObject **exceptionObject,
7106
                       const char *encoding,
7107
                       PyObject *unicode,
7108
                       Py_ssize_t startpos, Py_ssize_t endpos,
7109
                       const char *reason)
7110
39.4k
{
7111
39.4k
    make_encode_exception(exceptionObject,
7112
39.4k
                          encoding, unicode, startpos, endpos, reason);
7113
39.4k
    if (*exceptionObject != NULL)
7114
39.4k
        PyCodec_StrictErrors(*exceptionObject);
7115
39.4k
}
7116
7117
/* error handling callback helper:
7118
   build arguments, call the callback and check the arguments,
7119
   put the result into newpos and return the replacement string, which
7120
   has to be freed by the caller */
7121
static PyObject *
7122
unicode_encode_call_errorhandler(const char *errors,
7123
                                 PyObject **errorHandler,
7124
                                 const char *encoding, const char *reason,
7125
                                 PyObject *unicode, PyObject **exceptionObject,
7126
                                 Py_ssize_t startpos, Py_ssize_t endpos,
7127
                                 Py_ssize_t *newpos)
7128
189k
{
7129
189k
    static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
7130
189k
    Py_ssize_t len;
7131
189k
    PyObject *restuple;
7132
189k
    PyObject *resunicode;
7133
7134
189k
    if (*errorHandler == NULL) {
7135
189k
        *errorHandler = PyCodec_LookupError(errors);
7136
189k
        if (*errorHandler == NULL)
7137
0
            return NULL;
7138
189k
    }
7139
7140
189k
    len = PyUnicode_GET_LENGTH(unicode);
7141
7142
189k
    make_encode_exception(exceptionObject,
7143
189k
                          encoding, unicode, startpos, endpos, reason);
7144
189k
    if (*exceptionObject == NULL)
7145
0
        return NULL;
7146
7147
189k
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
7148
189k
    if (restuple == NULL)
7149
189k
        return NULL;
7150
0
    if (!PyTuple_Check(restuple)) {
7151
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
7152
0
        Py_DECREF(restuple);
7153
0
        return NULL;
7154
0
    }
7155
0
    if (!PyArg_ParseTuple(restuple, argparse,
7156
0
                          &resunicode, newpos)) {
7157
0
        Py_DECREF(restuple);
7158
0
        return NULL;
7159
0
    }
7160
0
    if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
7161
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
7162
0
        Py_DECREF(restuple);
7163
0
        return NULL;
7164
0
    }
7165
0
    if (*newpos<0)
7166
0
        *newpos = len + *newpos;
7167
0
    if (*newpos<0 || *newpos>len) {
7168
0
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7169
0
        Py_DECREF(restuple);
7170
0
        return NULL;
7171
0
    }
7172
0
    Py_INCREF(resunicode);
7173
0
    Py_DECREF(restuple);
7174
0
    return resunicode;
7175
0
}
7176
7177
static PyObject *
7178
unicode_encode_ucs1(PyObject *unicode,
7179
                    const char *errors,
7180
                    const Py_UCS4 limit)
7181
50.4k
{
7182
    /* input state */
7183
50.4k
    Py_ssize_t pos=0, size;
7184
50.4k
    int kind;
7185
50.4k
    const void *data;
7186
50.4k
    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
7187
50.4k
    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
7188
50.4k
    PyObject *error_handler_obj = NULL;
7189
50.4k
    PyObject *exc = NULL;
7190
50.4k
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
7191
50.4k
    PyObject *rep = NULL;
7192
7193
50.4k
    size = PyUnicode_GET_LENGTH(unicode);
7194
50.4k
    kind = PyUnicode_KIND(unicode);
7195
50.4k
    data = PyUnicode_DATA(unicode);
7196
    /* allocate enough for a simple encoding without
7197
       replacements, if we need more, we'll resize */
7198
50.4k
    if (size == 0)
7199
0
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
7200
7201
    /* output object */
7202
50.4k
    PyBytesWriter *writer = PyBytesWriter_Create(size);
7203
50.4k
    if (writer == NULL) {
7204
0
        return NULL;
7205
0
    }
7206
    /* pointer into the output */
7207
50.4k
    char *str = PyBytesWriter_GetData(writer);
7208
7209
3.41M
    while (pos < size) {
7210
3.41M
        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
7211
7212
        /* can we encode this? */
7213
3.41M
        if (ch < limit) {
7214
            /* no overflow check, because we know that the space is enough */
7215
3.35M
            *str++ = (char)ch;
7216
3.35M
            ++pos;
7217
3.35M
        }
7218
50.4k
        else {
7219
50.4k
            Py_ssize_t newpos, i;
7220
            /* startpos for collecting unencodable chars */
7221
50.4k
            Py_ssize_t collstart = pos;
7222
50.4k
            Py_ssize_t collend = collstart + 1;
7223
            /* find all unecodable characters */
7224
7225
355k
            while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
7226
304k
                ++collend;
7227
7228
            /* Only overallocate the buffer if it's not the last write */
7229
50.4k
            writer->overallocate = (collend < size);
7230
7231
            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
7232
50.4k
            if (error_handler == _Py_ERROR_UNKNOWN)
7233
50.4k
                error_handler = _Py_GetErrorHandler(errors);
7234
7235
50.4k
            switch (error_handler) {
7236
39.4k
            case _Py_ERROR_STRICT:
7237
39.4k
                raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
7238
39.4k
                goto onError;
7239
7240
0
            case _Py_ERROR_REPLACE:
7241
0
                memset(str, '?', collend - collstart);
7242
0
                str += (collend - collstart);
7243
0
                _Py_FALLTHROUGH;
7244
0
            case _Py_ERROR_IGNORE:
7245
0
                pos = collend;
7246
0
                break;
7247
7248
0
            case _Py_ERROR_BACKSLASHREPLACE:
7249
                /* subtract preallocated bytes */
7250
0
                writer->size -= (collend - collstart);
7251
0
                str = backslashreplace(writer, str,
7252
0
                                       unicode, collstart, collend);
7253
0
                if (str == NULL)
7254
0
                    goto onError;
7255
0
                pos = collend;
7256
0
                break;
7257
7258
0
            case _Py_ERROR_XMLCHARREFREPLACE:
7259
                /* subtract preallocated bytes */
7260
0
                writer->size -= (collend - collstart);
7261
0
                str = xmlcharrefreplace(writer, str,
7262
0
                                        unicode, collstart, collend);
7263
0
                if (str == NULL)
7264
0
                    goto onError;
7265
0
                pos = collend;
7266
0
                break;
7267
7268
10.9k
            case _Py_ERROR_SURROGATEESCAPE:
7269
10.9k
                for (i = collstart; i < collend; ++i) {
7270
10.9k
                    ch = PyUnicode_READ(kind, data, i);
7271
10.9k
                    if (ch < 0xdc80 || 0xdcff < ch) {
7272
                        /* Not a UTF-8b surrogate */
7273
10.9k
                        break;
7274
10.9k
                    }
7275
0
                    *str++ = (char)(ch - 0xdc00);
7276
0
                    ++pos;
7277
0
                }
7278
10.9k
                if (i >= collend)
7279
0
                    break;
7280
10.9k
                collstart = pos;
7281
10.9k
                assert(collstart != collend);
7282
10.9k
                _Py_FALLTHROUGH;
7283
7284
10.9k
            default:
7285
10.9k
                rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
7286
10.9k
                                                       encoding, reason, unicode, &exc,
7287
10.9k
                                                       collstart, collend, &newpos);
7288
10.9k
                if (rep == NULL)
7289
10.9k
                    goto onError;
7290
7291
0
                if (newpos < collstart) {
7292
0
                    writer->overallocate = 1;
7293
0
                    str = PyBytesWriter_GrowAndUpdatePointer(writer,
7294
0
                                                             collstart - newpos,
7295
0
                                                             str);
7296
0
                    if (str == NULL) {
7297
0
                        goto onError;
7298
0
                    }
7299
0
                }
7300
0
                else {
7301
                    /* subtract preallocated bytes */
7302
0
                    writer->size -= newpos - collstart;
7303
                    /* Only overallocate the buffer if it's not the last write */
7304
0
                    writer->overallocate = (newpos < size);
7305
0
                }
7306
7307
0
                char *rep_str;
7308
0
                Py_ssize_t rep_len;
7309
0
                if (PyBytes_Check(rep)) {
7310
                    /* Directly copy bytes result to output. */
7311
0
                    rep_str = PyBytes_AS_STRING(rep);
7312
0
                    rep_len = PyBytes_GET_SIZE(rep);
7313
0
                }
7314
0
                else {
7315
0
                    assert(PyUnicode_Check(rep));
7316
7317
0
                    if (limit == 256 ?
7318
0
                        PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
7319
0
                        !PyUnicode_IS_ASCII(rep))
7320
0
                    {
7321
                        /* Not all characters are smaller than limit */
7322
0
                        raise_encode_exception(&exc, encoding, unicode,
7323
0
                                               collstart, collend, reason);
7324
0
                        goto onError;
7325
0
                    }
7326
0
                    assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
7327
0
                    rep_str = PyUnicode_DATA(rep);
7328
0
                    rep_len = PyUnicode_GET_LENGTH(rep);
7329
0
                }
7330
7331
0
                str = PyBytesWriter_GrowAndUpdatePointer(writer, rep_len, str);
7332
0
                if (str == NULL) {
7333
0
                    goto onError;
7334
0
                }
7335
0
                memcpy(str, rep_str, rep_len);
7336
0
                str += rep_len;
7337
7338
0
                pos = newpos;
7339
0
                Py_CLEAR(rep);
7340
50.4k
            }
7341
7342
            /* If overallocation was disabled, ensure that it was the last
7343
               write. Otherwise, we missed an optimization */
7344
50.4k
            assert(writer->overallocate || pos == size);
7345
0
        }
7346
3.41M
    }
7347
7348
0
    Py_XDECREF(error_handler_obj);
7349
0
    Py_XDECREF(exc);
7350
0
    return PyBytesWriter_FinishWithPointer(writer, str);
7351
7352
50.4k
  onError:
7353
50.4k
    Py_XDECREF(rep);
7354
50.4k
    PyBytesWriter_Discard(writer);
7355
50.4k
    Py_XDECREF(error_handler_obj);
7356
50.4k
    Py_XDECREF(exc);
7357
50.4k
    return NULL;
7358
50.4k
}
7359
7360
/* Encode `unicode` to Latin-1 using the error handler named by `errors`.

   Calls PyErr_BadArgument() and returns NULL if `unicode` is not a str.
   A string of the 1-byte kind is converted by copying its character
   buffer; any other string is passed to unicode_encode_ucs1() with a
   limit of 256. */
PyObject *
_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND) {
        /* Non-Latin-1 characters are present: the generic encoder applies
           the error handler or raises. */
        return unicode_encode_ucs1(unicode, errors, 256);
    }

    /* Fast path: the character data is the encoded form. */
    return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
                                     PyUnicode_GET_LENGTH(unicode));
}
7376
7377
/* Encode `unicode` to Latin-1, passing NULL as the error handler name to
   _PyUnicode_AsLatin1String(). */
PyObject*
PyUnicode_AsLatin1String(PyObject *unicode)
{
    PyObject *encoded = _PyUnicode_AsLatin1String(unicode, NULL);

    return encoded;
}
7382
7383
/* --- 7-bit ASCII Codec -------------------------------------------------- */
7384
7385
PyObject *
7386
PyUnicode_DecodeASCII(const char *s,
7387
                      Py_ssize_t size,
7388
                      const char *errors)
7389
599k
{
7390
599k
    const char *starts = s;
7391
599k
    const char *e = s + size;
7392
599k
    PyObject *error_handler_obj = NULL;
7393
599k
    PyObject *exc = NULL;
7394
599k
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
7395
7396
599k
    if (size == 0)
7397
0
        _Py_RETURN_UNICODE_EMPTY();
7398
7399
    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
7400
599k
    if (size == 1 && (unsigned char)s[0] < 128) {
7401
7.81k
        return get_latin1_char((unsigned char)s[0]);
7402
7.81k
    }
7403
7404
    // Shortcut for simple case
7405
591k
    PyObject *u = PyUnicode_New(size, 127);
7406
591k
    if (u == NULL) {
7407
0
        return NULL;
7408
0
    }
7409
591k
    Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
7410
591k
    if (outpos == size) {
7411
395k
        return u;
7412
395k
    }
7413
7414
196k
    _PyUnicodeWriter writer;
7415
196k
    _PyUnicodeWriter_InitWithBuffer(&writer, u);
7416
196k
    writer.pos = outpos;
7417
7418
196k
    s += outpos;
7419
196k
    int kind = writer.kind;
7420
196k
    void *data = writer.data;
7421
196k
    Py_ssize_t startinpos, endinpos;
7422
7423
19.8M
    while (s < e) {
7424
19.6M
        unsigned char c = (unsigned char)*s;
7425
19.6M
        if (c < 128) {
7426
7.42M
            PyUnicode_WRITE(kind, data, writer.pos, c);
7427
7.42M
            writer.pos++;
7428
7.42M
            ++s;
7429
7.42M
            continue;
7430
7.42M
        }
7431
7432
        /* byte outsize range 0x00..0x7f: call the error handler */
7433
7434
12.2M
        if (error_handler == _Py_ERROR_UNKNOWN)
7435
196k
            error_handler = _Py_GetErrorHandler(errors);
7436
7437
12.2M
        switch (error_handler)
7438
12.2M
        {
7439
992k
        case _Py_ERROR_REPLACE:
7440
12.2M
        case _Py_ERROR_SURROGATEESCAPE:
7441
            /* Fast-path: the error handler only writes one character,
7442
               but we may switch to UCS2 at the first write */
7443
12.2M
            if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7444
0
                goto onError;
7445
12.2M
            kind = writer.kind;
7446
12.2M
            data = writer.data;
7447
7448
12.2M
            if (error_handler == _Py_ERROR_REPLACE)
7449
992k
                PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7450
11.2M
            else
7451
11.2M
                PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7452
12.2M
            writer.pos++;
7453
12.2M
            ++s;
7454
12.2M
            break;
7455
7456
0
        case _Py_ERROR_IGNORE:
7457
0
            ++s;
7458
0
            break;
7459
7460
12.0k
        default:
7461
12.0k
            startinpos = s-starts;
7462
12.0k
            endinpos = startinpos + 1;
7463
12.0k
            if (unicode_decode_call_errorhandler_writer(
7464
12.0k
                    errors, &error_handler_obj,
7465
12.0k
                    "ascii", "ordinal not in range(128)",
7466
12.0k
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
7467
12.0k
                    &writer))
7468
12.0k
                goto onError;
7469
0
            kind = writer.kind;
7470
0
            data = writer.data;
7471
12.2M
        }
7472
12.2M
    }
7473
184k
    Py_XDECREF(error_handler_obj);
7474
184k
    Py_XDECREF(exc);
7475
184k
    return _PyUnicodeWriter_Finish(&writer);
7476
7477
12.0k
  onError:
7478
12.0k
    _PyUnicodeWriter_Dealloc(&writer);
7479
12.0k
    Py_XDECREF(error_handler_obj);
7480
12.0k
    Py_XDECREF(exc);
7481
12.0k
    return NULL;
7482
196k
}
7483
7484
/* Encode `unicode` to ASCII using the error handler named by `errors`.

   Calls PyErr_BadArgument() and returns NULL if `unicode` is not a str.
   An ASCII-only string is converted by copying its character buffer; any
   other string is passed to unicode_encode_ucs1() with a limit of 128. */
PyObject *
_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (!PyUnicode_IS_ASCII(unicode)) {
        /* Non-ASCII characters are present: the generic encoder applies
           the error handler or raises. */
        return unicode_encode_ucs1(unicode, errors, 128);
    }

    /* Fast path: the character data is the encoded form. */
    return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
                                     PyUnicode_GET_LENGTH(unicode));
}
7498
7499
/* Encode `unicode` to ASCII, passing NULL as the error handler name to
   _PyUnicode_AsASCIIString(). */
PyObject *
PyUnicode_AsASCIIString(PyObject *unicode)
{
    PyObject *encoded = _PyUnicode_AsASCIIString(unicode, NULL);

    return encoded;
}
7504
7505
#ifdef MS_WINDOWS
7506
7507
/* --- MBCS codecs for Windows -------------------------------------------- */
7508
7509
#if SIZEOF_INT < SIZEOF_SIZE_T
7510
#define NEED_RETRY
7511
#endif
7512
7513
/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
7514
   transcoding from UTF-16), but INT_MAX / 4 performs better in
7515
   both cases and also avoids partial characters overrunning the
7516
   length limit in MultiByteToWideChar on Windows */
7517
#define DECODING_CHUNK_SIZE (INT_MAX/4)
7518
7519
#ifndef WC_ERR_INVALID_CHARS
7520
#  define WC_ERR_INVALID_CHARS 0x0080
7521
#endif
7522
7523
static const char*
7524
code_page_name(UINT code_page, PyObject **obj)
7525
{
7526
    *obj = NULL;
7527
    if (code_page == CP_ACP)
7528
        return "mbcs";
7529
7530
    *obj = PyBytes_FromFormat("cp%u", code_page);
7531
    if (*obj == NULL)
7532
        return NULL;
7533
    return PyBytes_AS_STRING(*obj);
7534
}
7535
7536
static DWORD
7537
decode_code_page_flags(UINT code_page)
7538
{
7539
    if (code_page == CP_UTF7) {
7540
        /* The CP_UTF7 decoder only supports flags=0 */
7541
        return 0;
7542
    }
7543
    else
7544
        return MB_ERR_INVALID_CHARS;
7545
}
7546
7547
/*
7548
 * Decode a byte string from a Windows code page into unicode object in strict
7549
 * mode.
7550
 *
7551
 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7552
 * OSError and returns -1 on other error.
7553
 */
7554
static int
7555
decode_code_page_strict(UINT code_page,
7556
                        wchar_t **buf,
7557
                        Py_ssize_t *bufsize,
7558
                        const char *in,
7559
                        int insize)
7560
{
7561
    DWORD flags = MB_ERR_INVALID_CHARS;
7562
    wchar_t *out;
7563
    DWORD outsize;
7564
7565
    /* First get the size of the result */
7566
    assert(insize > 0);
7567
    while ((outsize = MultiByteToWideChar(code_page, flags,
7568
                                          in, insize, NULL, 0)) <= 0)
7569
    {
7570
        if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7571
            goto error;
7572
        }
7573
        /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7574
        flags = 0;
7575
    }
7576
7577
    /* Extend a wchar_t* buffer */
7578
    Py_ssize_t n = *bufsize;   /* Get the current length */
7579
    if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7580
        return -1;
7581
    }
7582
    out = *buf + n;
7583
7584
    /* Do the conversion */
7585
    outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7586
    if (outsize <= 0)
7587
        goto error;
7588
    return insize;
7589
7590
error:
7591
    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7592
        return -2;
7593
    PyErr_SetFromWindowsErr(0);
7594
    return -1;
7595
}
7596
7597
/*
7598
 * Decode a byte string from a code page into unicode object with an error
7599
 * handler.
7600
 *
7601
 * Returns consumed size if succeed, or raise an OSError or
7602
 * UnicodeDecodeError exception and returns -1 on error.
7603
 */
7604
static int
7605
decode_code_page_errors(UINT code_page,
7606
                        wchar_t **buf,
7607
                        Py_ssize_t *bufsize,
7608
                        const char *in, const int size,
7609
                        const char *errors, int final)
7610
{
7611
    const char *startin = in;
7612
    const char *endin = in + size;
7613
    DWORD flags = MB_ERR_INVALID_CHARS;
7614
    /* Ideally, we should get reason from FormatMessage. This is the Windows
7615
       2000 English version of the message. */
7616
    const char *reason = "No mapping for the Unicode character exists "
7617
                         "in the target code page.";
7618
    /* each step cannot decode more than 1 character, but a character can be
7619
       represented as a surrogate pair */
7620
    wchar_t buffer[2], *out;
7621
    int insize;
7622
    Py_ssize_t outsize;
7623
    PyObject *errorHandler = NULL;
7624
    PyObject *exc = NULL;
7625
    PyObject *encoding_obj = NULL;
7626
    const char *encoding;
7627
    DWORD err;
7628
    int ret = -1;
7629
7630
    assert(size > 0);
7631
7632
    encoding = code_page_name(code_page, &encoding_obj);
7633
    if (encoding == NULL)
7634
        return -1;
7635
7636
    if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
7637
        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7638
           UnicodeDecodeError. */
7639
        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7640
        if (exc != NULL) {
7641
            PyCodec_StrictErrors(exc);
7642
            Py_CLEAR(exc);
7643
        }
7644
        goto error;
7645
    }
7646
7647
    /* Extend a wchar_t* buffer */
7648
    Py_ssize_t n = *bufsize;   /* Get the current length */
7649
    if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7650
        PyErr_NoMemory();
7651
        goto error;
7652
    }
7653
    if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7654
        goto error;
7655
    }
7656
    out = *buf + n;
7657
7658
    /* Decode the byte string character per character */
7659
    while (in < endin)
7660
    {
7661
        /* Decode a character */
7662
        insize = 1;
7663
        do
7664
        {
7665
            outsize = MultiByteToWideChar(code_page, flags,
7666
                                          in, insize,
7667
                                          buffer, Py_ARRAY_LENGTH(buffer));
7668
            if (outsize > 0)
7669
                break;
7670
            err = GetLastError();
7671
            if (err == ERROR_INVALID_FLAGS && flags) {
7672
                /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7673
                flags = 0;
7674
                continue;
7675
            }
7676
            if (err != ERROR_NO_UNICODE_TRANSLATION
7677
                && err != ERROR_INSUFFICIENT_BUFFER)
7678
            {
7679
                PyErr_SetFromWindowsErr(err);
7680
                goto error;
7681
            }
7682
            insize++;
7683
        }
7684
        /* 4=maximum length of a UTF-8 sequence */
7685
        while (insize <= 4 && (in + insize) <= endin);
7686
7687
        if (outsize <= 0) {
7688
            Py_ssize_t startinpos, endinpos, outpos;
7689
7690
            /* last character in partial decode? */
7691
            if (in + insize >= endin && !final)
7692
                break;
7693
7694
            startinpos = in - startin;
7695
            endinpos = startinpos + 1;
7696
            outpos = out - *buf;
7697
            if (unicode_decode_call_errorhandler_wchar(
7698
                    errors, &errorHandler,
7699
                    encoding, reason,
7700
                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
7701
                    buf, bufsize, &outpos))
7702
            {
7703
                goto error;
7704
            }
7705
            out = *buf + outpos;
7706
        }
7707
        else {
7708
            in += insize;
7709
            memcpy(out, buffer, outsize * sizeof(wchar_t));
7710
            out += outsize;
7711
        }
7712
    }
7713
7714
    /* Shrink the buffer */
7715
    assert(out - *buf <= *bufsize);
7716
    *bufsize = out - *buf;
7717
    /* (in - startin) <= size and size is an int */
7718
    ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
7719
7720
error:
7721
    Py_XDECREF(encoding_obj);
7722
    Py_XDECREF(errorHandler);
7723
    Py_XDECREF(exc);
7724
    return ret;
7725
}
7726
7727
static PyObject *
7728
decode_code_page_stateful(int code_page,
7729
                          const char *s, Py_ssize_t size,
7730
                          const char *errors, Py_ssize_t *consumed)
7731
{
7732
    wchar_t *buf = NULL;
7733
    Py_ssize_t bufsize = 0;
7734
    int chunk_size, final, converted, done;
7735
7736
    if (code_page < 0) {
7737
        PyErr_SetString(PyExc_ValueError, "invalid code page number");
7738
        return NULL;
7739
    }
7740
    if (size < 0) {
7741
        PyErr_BadInternalCall();
7742
        return NULL;
7743
    }
7744
7745
    if (consumed)
7746
        *consumed = 0;
7747
7748
    do
7749
    {
7750
#ifdef NEED_RETRY
7751
        if (size > DECODING_CHUNK_SIZE) {
7752
            chunk_size = DECODING_CHUNK_SIZE;
7753
            final = 0;
7754
            done = 0;
7755
        }
7756
        else
7757
#endif
7758
        {
7759
            chunk_size = (int)size;
7760
            final = (consumed == NULL);
7761
            done = 1;
7762
        }
7763
7764
        if (chunk_size == 0 && done) {
7765
            if (buf != NULL)
7766
                break;
7767
            _Py_RETURN_UNICODE_EMPTY();
7768
        }
7769
7770
        converted = decode_code_page_strict(code_page, &buf, &bufsize,
7771
                                            s, chunk_size);
7772
        if (converted == -2)
7773
            converted = decode_code_page_errors(code_page, &buf, &bufsize,
7774
                                                s, chunk_size,
7775
                                                errors, final);
7776
        assert(converted != 0 || done);
7777
7778
        if (converted < 0) {
7779
            PyMem_Free(buf);
7780
            return NULL;
7781
        }
7782
7783
        if (consumed)
7784
            *consumed += converted;
7785
7786
        s += converted;
7787
        size -= converted;
7788
    } while (!done);
7789
7790
    PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7791
    PyMem_Free(buf);
7792
    return v;
7793
}
7794
7795
PyObject *
7796
PyUnicode_DecodeCodePageStateful(int code_page,
7797
                                 const char *s,
7798
                                 Py_ssize_t size,
7799
                                 const char *errors,
7800
                                 Py_ssize_t *consumed)
7801
{
7802
    return decode_code_page_stateful(code_page, s, size, errors, consumed);
7803
}
7804
7805
PyObject *
7806
PyUnicode_DecodeMBCSStateful(const char *s,
7807
                             Py_ssize_t size,
7808
                             const char *errors,
7809
                             Py_ssize_t *consumed)
7810
{
7811
    return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7812
}
7813
7814
PyObject *
7815
PyUnicode_DecodeMBCS(const char *s,
7816
                     Py_ssize_t size,
7817
                     const char *errors)
7818
{
7819
    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7820
}
7821
7822
static DWORD
7823
encode_code_page_flags(UINT code_page, const char *errors)
7824
{
7825
    if (code_page == CP_UTF8) {
7826
        return WC_ERR_INVALID_CHARS;
7827
    }
7828
    else if (code_page == CP_UTF7) {
7829
        /* CP_UTF7 only supports flags=0 */
7830
        return 0;
7831
    }
7832
    else {
7833
        if (errors != NULL && strcmp(errors, "replace") == 0)
7834
            return 0;
7835
        else
7836
            return WC_NO_BEST_FIT_CHARS;
7837
    }
7838
}
7839
7840
/*
7841
 * Encode a Unicode string to a Windows code page into a byte string in strict
7842
 * mode.
7843
 *
7844
 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7845
 * an OSError and returns -1 on other error.
7846
 */
7847
static int
7848
encode_code_page_strict(UINT code_page, PyBytesWriter **writer,
7849
                        PyObject *unicode, Py_ssize_t offset, int len,
7850
                        const char* errors)
7851
{
7852
    BOOL usedDefaultChar = FALSE;
7853
    BOOL *pusedDefaultChar = &usedDefaultChar;
7854
    int outsize;
7855
    wchar_t *p;
7856
    Py_ssize_t size;
7857
    const DWORD flags = encode_code_page_flags(code_page, NULL);
7858
    char *out;
7859
    /* Create a substring so that we can get the UTF-16 representation
7860
       of just the slice under consideration. */
7861
    PyObject *substring;
7862
    int ret = -1;
7863
7864
    assert(len > 0);
7865
7866
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
7867
        pusedDefaultChar = &usedDefaultChar;
7868
    else
7869
        pusedDefaultChar = NULL;
7870
7871
    substring = PyUnicode_Substring(unicode, offset, offset+len);
7872
    if (substring == NULL)
7873
        return -1;
7874
    p = PyUnicode_AsWideCharString(substring, &size);
7875
    Py_CLEAR(substring);
7876
    if (p == NULL) {
7877
        return -1;
7878
    }
7879
    assert(size <= INT_MAX);
7880
7881
    /* First get the size of the result */
7882
    outsize = WideCharToMultiByte(code_page, flags,
7883
                                  p, (int)size,
7884
                                  NULL, 0,
7885
                                  NULL, pusedDefaultChar);
7886
    if (outsize <= 0)
7887
        goto error;
7888
    /* If we used a default char, then we failed! */
7889
    if (pusedDefaultChar && *pusedDefaultChar) {
7890
        ret = -2;
7891
        goto done;
7892
    }
7893
7894
    if (*writer == NULL) {
7895
        /* Create string object */
7896
        *writer = PyBytesWriter_Create(outsize);
7897
        if (*writer == NULL) {
7898
            goto done;
7899
        }
7900
        out = PyBytesWriter_GetData(*writer);
7901
    }
7902
    else {
7903
        /* Extend string object */
7904
        Py_ssize_t n = PyBytesWriter_GetSize(*writer);
7905
        if (PyBytesWriter_Grow(*writer, outsize) < 0) {
7906
            goto done;
7907
        }
7908
        out = (char*)PyBytesWriter_GetData(*writer) + n;
7909
    }
7910
7911
    /* Do the conversion */
7912
    outsize = WideCharToMultiByte(code_page, flags,
7913
                                  p, (int)size,
7914
                                  out, outsize,
7915
                                  NULL, pusedDefaultChar);
7916
    if (outsize <= 0)
7917
        goto error;
7918
    if (pusedDefaultChar && *pusedDefaultChar) {
7919
        ret = -2;
7920
        goto done;
7921
    }
7922
    ret = 0;
7923
7924
done:
7925
    PyMem_Free(p);
7926
    return ret;
7927
7928
error:
7929
    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
7930
        ret = -2;
7931
        goto done;
7932
    }
7933
    PyErr_SetFromWindowsErr(0);
7934
    goto done;
7935
}
7936
7937
/*
7938
 * Encode a Unicode string to a Windows code page into a byte string using an
7939
 * error handler.
7940
 *
7941
 * Returns consumed characters if succeed, or raise an OSError and returns
7942
 * -1 on other error.
7943
 */
7944
static int
7945
encode_code_page_errors(UINT code_page, PyBytesWriter **writer,
7946
                        PyObject *unicode, Py_ssize_t unicode_offset,
7947
                        Py_ssize_t insize, const char* errors)
7948
{
7949
    const DWORD flags = encode_code_page_flags(code_page, errors);
7950
    Py_ssize_t pos = unicode_offset;
7951
    Py_ssize_t endin = unicode_offset + insize;
7952
    /* Ideally, we should get reason from FormatMessage. This is the Windows
7953
       2000 English version of the message. */
7954
    const char *reason = "invalid character";
7955
    /* 4=maximum length of a UTF-8 sequence */
7956
    char buffer[4];
7957
    BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7958
    Py_ssize_t outsize;
7959
    char *out;
7960
    PyObject *errorHandler = NULL;
7961
    PyObject *exc = NULL;
7962
    PyObject *encoding_obj = NULL;
7963
    const char *encoding;
7964
    Py_ssize_t newpos;
7965
    PyObject *rep;
7966
    int ret = -1;
7967
7968
    assert(insize > 0);
7969
7970
    encoding = code_page_name(code_page, &encoding_obj);
7971
    if (encoding == NULL)
7972
        return -1;
7973
7974
    if (errors == NULL || strcmp(errors, "strict") == 0) {
7975
        /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7976
           then we raise a UnicodeEncodeError. */
7977
        make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
7978
        if (exc != NULL) {
7979
            PyCodec_StrictErrors(exc);
7980
            Py_DECREF(exc);
7981
        }
7982
        Py_XDECREF(encoding_obj);
7983
        return -1;
7984
    }
7985
7986
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
7987
        pusedDefaultChar = &usedDefaultChar;
7988
    else
7989
        pusedDefaultChar = NULL;
7990
7991
    if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7992
        PyErr_NoMemory();
7993
        goto error;
7994
    }
7995
    outsize = insize * Py_ARRAY_LENGTH(buffer);
7996
7997
    if (*writer == NULL) {
7998
        /* Create string object */
7999
        *writer = PyBytesWriter_Create(outsize);
8000
        if (*writer == NULL) {
8001
            goto error;
8002
        }
8003
        out = PyBytesWriter_GetData(*writer);
8004
    }
8005
    else {
8006
        /* Extend string object */
8007
        Py_ssize_t n = PyBytesWriter_GetSize(*writer);
8008
        if (PyBytesWriter_Grow(*writer, outsize) < 0) {
8009
            goto error;
8010
        }
8011
        out = (char*)PyBytesWriter_GetData(*writer) + n;
8012
    }
8013
8014
    /* Encode the string character per character */
8015
    while (pos < endin)
8016
    {
8017
        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
8018
        wchar_t chars[2];
8019
        int charsize;
8020
        if (ch < 0x10000) {
8021
            chars[0] = (wchar_t)ch;
8022
            charsize = 1;
8023
        }
8024
        else {
8025
            chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
8026
            chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
8027
            charsize = 2;
8028
        }
8029
8030
        outsize = WideCharToMultiByte(code_page, flags,
8031
                                      chars, charsize,
8032
                                      buffer, Py_ARRAY_LENGTH(buffer),
8033
                                      NULL, pusedDefaultChar);
8034
        if (outsize > 0) {
8035
            if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
8036
            {
8037
                pos++;
8038
                memcpy(out, buffer, outsize);
8039
                out += outsize;
8040
                continue;
8041
            }
8042
        }
8043
        else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
8044
            PyErr_SetFromWindowsErr(0);
8045
            goto error;
8046
        }
8047
8048
        rep = unicode_encode_call_errorhandler(
8049
                  errors, &errorHandler, encoding, reason,
8050
                  unicode, &exc,
8051
                  pos, pos + 1, &newpos);
8052
        if (rep == NULL)
8053
            goto error;
8054
8055
        Py_ssize_t morebytes = pos - newpos;
8056
        if (PyBytes_Check(rep)) {
8057
            outsize = PyBytes_GET_SIZE(rep);
8058
            morebytes += outsize;
8059
            if (morebytes > 0) {
8060
                out = PyBytesWriter_GrowAndUpdatePointer(*writer, morebytes, out);
8061
                if (out == NULL) {
8062
                    Py_DECREF(rep);
8063
                    goto error;
8064
                }
8065
            }
8066
            memcpy(out, PyBytes_AS_STRING(rep), outsize);
8067
            out += outsize;
8068
        }
8069
        else {
8070
            Py_ssize_t i;
8071
            int kind;
8072
            const void *data;
8073
8074
            outsize = PyUnicode_GET_LENGTH(rep);
8075
            morebytes += outsize;
8076
            if (morebytes > 0) {
8077
                out = PyBytesWriter_GrowAndUpdatePointer(*writer, morebytes, out);
8078
                if (out == NULL) {
8079
                    Py_DECREF(rep);
8080
                    goto error;
8081
                }
8082
            }
8083
            kind = PyUnicode_KIND(rep);
8084
            data = PyUnicode_DATA(rep);
8085
            for (i=0; i < outsize; i++) {
8086
                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8087
                if (ch > 127) {
8088
                    raise_encode_exception(&exc,
8089
                        encoding, unicode,
8090
                        pos, pos + 1,
8091
                        "unable to encode error handler result to ASCII");
8092
                    Py_DECREF(rep);
8093
                    goto error;
8094
                }
8095
                *out = (unsigned char)ch;
8096
                out++;
8097
            }
8098
        }
8099
        pos = newpos;
8100
        Py_DECREF(rep);
8101
    }
8102
    /* write a NUL byte */
8103
    *out = 0;
8104
    outsize = out - (char*)PyBytesWriter_GetData(*writer);
8105
    assert(outsize <= PyBytesWriter_GetSize(*writer));
8106
    if (PyBytesWriter_Resize(*writer, outsize) < 0) {
8107
        goto error;
8108
    }
8109
    ret = 0;
8110
8111
error:
8112
    Py_XDECREF(encoding_obj);
8113
    Py_XDECREF(errorHandler);
8114
    Py_XDECREF(exc);
8115
    return ret;
8116
}
8117
8118
8119
/* Encode `unicode` to the Windows code page `code_page`.

   The string is processed in chunks.  Each chunk is first tried with
   encode_code_page_strict(); when that reports an encode error (-2) the
   same chunk is redone with encode_code_page_errors() and the `errors`
   handler.  Returns the finished bytes object, the empty bytes constant for
   an empty string, or NULL on failure. */
PyObject *
PyUnicode_EncodeCodePage(int code_page,
                         PyObject *unicode,
                         const char *errors)
{
    PyBytesWriter *writer = NULL;
    Py_ssize_t remaining;
    Py_ssize_t start = 0;
    int chunk_len, ret, last_chunk;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (code_page < 0) {
        PyErr_SetString(PyExc_ValueError, "invalid code page number");
        return NULL;
    }

    remaining = PyUnicode_GET_LENGTH(unicode);
    if (remaining == 0) {
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
    }

    for (;;) {
#ifdef NEED_RETRY
        if (remaining > DECODING_CHUNK_SIZE) {
            chunk_len = DECODING_CHUNK_SIZE;
            last_chunk = 0;
        }
        else
#endif
        {
            chunk_len = (int)remaining;
            last_chunk = 1;
        }

        ret = encode_code_page_strict(code_page, &writer,
                                      unicode, start, chunk_len,
                                      errors);
        if (ret == -2) {
            /* Strict encoding failed: redo this chunk with the handler */
            ret = encode_code_page_errors(code_page, &writer,
                                          unicode, start,
                                          chunk_len, errors);
        }
        if (ret < 0) {
            PyBytesWriter_Discard(writer);
            return NULL;
        }

        start += chunk_len;
        remaining -= chunk_len;
        if (last_chunk) {
            break;
        }
    }

    return PyBytesWriter_Finish(writer);
}
8177
8178
8179
/* Encode `unicode` with the CP_ACP code page and no explicit error handler
   (errors == NULL) by delegating to PyUnicode_EncodeCodePage(). */
PyObject *
PyUnicode_AsMBCSString(PyObject *unicode)
{
    const char *errors = NULL;

    return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
}
8184
8185
#undef NEED_RETRY
8186
8187
#endif /* MS_WINDOWS */
8188
8189
/* --- Character Mapping Codec -------------------------------------------- */
8190
8191
static int
8192
charmap_decode_string(const char *s,
8193
                      Py_ssize_t size,
8194
                      PyObject *mapping,
8195
                      const char *errors,
8196
                      _PyUnicodeWriter *writer)
8197
24.6k
{
8198
24.6k
    const char *starts = s;
8199
24.6k
    const char *e;
8200
24.6k
    Py_ssize_t startinpos, endinpos;
8201
24.6k
    PyObject *errorHandler = NULL, *exc = NULL;
8202
24.6k
    Py_ssize_t maplen;
8203
24.6k
    int mapkind;
8204
24.6k
    const void *mapdata;
8205
24.6k
    Py_UCS4 x;
8206
24.6k
    unsigned char ch;
8207
8208
24.6k
    maplen = PyUnicode_GET_LENGTH(mapping);
8209
24.6k
    mapdata = PyUnicode_DATA(mapping);
8210
24.6k
    mapkind = PyUnicode_KIND(mapping);
8211
8212
24.6k
    e = s + size;
8213
8214
24.6k
    if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
8215
        /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
8216
         * is disabled in encoding aliases, latin1 is preferred because
8217
         * its implementation is faster. */
8218
164
        const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
8219
164
        Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8220
164
        Py_UCS4 maxchar = writer->maxchar;
8221
8222
164
        assert (writer->kind == PyUnicode_1BYTE_KIND);
8223
9.70k
        while (s < e) {
8224
9.54k
            ch = *s;
8225
9.54k
            x = mapdata_ucs1[ch];
8226
9.54k
            if (x > maxchar) {
8227
153
                if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
8228
0
                    goto onError;
8229
153
                maxchar = writer->maxchar;
8230
153
                outdata = (Py_UCS1 *)writer->data;
8231
153
            }
8232
9.54k
            outdata[writer->pos] = x;
8233
9.54k
            writer->pos++;
8234
9.54k
            ++s;
8235
9.54k
        }
8236
164
        return 0;
8237
164
    }
8238
8239
102k
    while (s < e) {
8240
88.9k
        if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
8241
88.9k
            int outkind = writer->kind;
8242
88.9k
            const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
8243
88.9k
            if (outkind == PyUnicode_1BYTE_KIND) {
8244
47.4k
                Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8245
47.4k
                Py_UCS4 maxchar = writer->maxchar;
8246
297k
                while (s < e) {
8247
295k
                    ch = *s;
8248
295k
                    x = mapdata_ucs2[ch];
8249
295k
                    if (x > maxchar)
8250
45.3k
                        goto Error;
8251
249k
                    outdata[writer->pos] = x;
8252
249k
                    writer->pos++;
8253
249k
                    ++s;
8254
249k
                }
8255
2.09k
                break;
8256
47.4k
            }
8257
41.5k
            else if (outkind == PyUnicode_2BYTE_KIND) {
8258
41.5k
                Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
8259
1.54M
                while (s < e) {
8260
1.54M
                    ch = *s;
8261
1.54M
                    x = mapdata_ucs2[ch];
8262
1.54M
                    if (x == 0xFFFE)
8263
33.0k
                        goto Error;
8264
1.50M
                    outdata[writer->pos] = x;
8265
1.50M
                    writer->pos++;
8266
1.50M
                    ++s;
8267
1.50M
                }
8268
8.51k
                break;
8269
41.5k
            }
8270
88.9k
        }
8271
0
        ch = *s;
8272
8273
0
        if (ch < maplen)
8274
0
            x = PyUnicode_READ(mapkind, mapdata, ch);
8275
0
        else
8276
0
            x = 0xfffe; /* invalid value */
8277
78.3k
Error:
8278
78.3k
        if (x == 0xfffe)
8279
53.5k
        {
8280
            /* undefined mapping */
8281
53.5k
            startinpos = s-starts;
8282
53.5k
            endinpos = startinpos+1;
8283
53.5k
            if (unicode_decode_call_errorhandler_writer(
8284
53.5k
                    errors, &errorHandler,
8285
53.5k
                    "charmap", "character maps to <undefined>",
8286
53.5k
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
8287
53.5k
                    writer)) {
8288
18
                goto onError;
8289
18
            }
8290
53.5k
            continue;
8291
53.5k
        }
8292
8293
24.7k
        if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8294
0
            goto onError;
8295
24.7k
        ++s;
8296
24.7k
    }
8297
24.4k
    Py_XDECREF(errorHandler);
8298
24.4k
    Py_XDECREF(exc);
8299
24.4k
    return 0;
8300
8301
18
onError:
8302
18
    Py_XDECREF(errorHandler);
8303
18
    Py_XDECREF(exc);
8304
18
    return -1;
8305
24.4k
}
8306
8307
static int
8308
charmap_decode_mapping(const char *s,
8309
                       Py_ssize_t size,
8310
                       PyObject *mapping,
8311
                       const char *errors,
8312
                       _PyUnicodeWriter *writer)
8313
0
{
8314
0
    const char *starts = s;
8315
0
    const char *e;
8316
0
    Py_ssize_t startinpos, endinpos;
8317
0
    PyObject *errorHandler = NULL, *exc = NULL;
8318
0
    unsigned char ch;
8319
0
    PyObject *key, *item = NULL;
8320
8321
0
    e = s + size;
8322
8323
0
    while (s < e) {
8324
0
        ch = *s;
8325
8326
        /* Get mapping (char ordinal -> integer, Unicode char or None) */
8327
0
        key = PyLong_FromLong((long)ch);
8328
0
        if (key == NULL)
8329
0
            goto onError;
8330
8331
0
        int rc = PyMapping_GetOptionalItem(mapping, key, &item);
8332
0
        Py_DECREF(key);
8333
0
        if (rc == 0) {
8334
            /* No mapping found means: mapping is undefined. */
8335
0
            goto Undefined;
8336
0
        }
8337
0
        if (item == NULL) {
8338
0
            if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8339
                /* No mapping found means: mapping is undefined. */
8340
0
                PyErr_Clear();
8341
0
                goto Undefined;
8342
0
            } else
8343
0
                goto onError;
8344
0
        }
8345
8346
        /* Apply mapping */
8347
0
        if (item == Py_None)
8348
0
            goto Undefined;
8349
0
        if (PyLong_Check(item)) {
8350
0
            long value = PyLong_AsLong(item);
8351
0
            if (value == 0xFFFE)
8352
0
                goto Undefined;
8353
0
            if (value < 0 || value > MAX_UNICODE) {
8354
0
                PyErr_Format(PyExc_TypeError,
8355
0
                             "character mapping must be in range(0x%x)",
8356
0
                             (unsigned long)MAX_UNICODE + 1);
8357
0
                goto onError;
8358
0
            }
8359
8360
0
            if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8361
0
                goto onError;
8362
0
        }
8363
0
        else if (PyUnicode_Check(item)) {
8364
0
            if (PyUnicode_GET_LENGTH(item) == 1) {
8365
0
                Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8366
0
                if (value == 0xFFFE)
8367
0
                    goto Undefined;
8368
0
                if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8369
0
                    goto onError;
8370
0
            }
8371
0
            else {
8372
0
                writer->overallocate = 1;
8373
0
                if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8374
0
                    goto onError;
8375
0
            }
8376
0
        }
8377
0
        else {
8378
            /* wrong return value */
8379
0
            PyErr_SetString(PyExc_TypeError,
8380
0
                            "character mapping must return integer, None or str");
8381
0
            goto onError;
8382
0
        }
8383
0
        Py_CLEAR(item);
8384
0
        ++s;
8385
0
        continue;
8386
8387
0
Undefined:
8388
        /* undefined mapping */
8389
0
        Py_CLEAR(item);
8390
0
        startinpos = s-starts;
8391
0
        endinpos = startinpos+1;
8392
0
        if (unicode_decode_call_errorhandler_writer(
8393
0
                errors, &errorHandler,
8394
0
                "charmap", "character maps to <undefined>",
8395
0
                &starts, &e, &startinpos, &endinpos, &exc, &s,
8396
0
                writer)) {
8397
0
            goto onError;
8398
0
        }
8399
0
    }
8400
0
    Py_XDECREF(errorHandler);
8401
0
    Py_XDECREF(exc);
8402
0
    return 0;
8403
8404
0
onError:
8405
0
    Py_XDECREF(item);
8406
0
    Py_XDECREF(errorHandler);
8407
0
    Py_XDECREF(exc);
8408
0
    return -1;
8409
0
}
8410
8411
PyObject *
8412
PyUnicode_DecodeCharmap(const char *s,
8413
                        Py_ssize_t size,
8414
                        PyObject *mapping,
8415
                        const char *errors)
8416
24.6k
{
8417
24.6k
    _PyUnicodeWriter writer;
8418
8419
    /* Default to Latin-1 */
8420
24.6k
    if (mapping == NULL)
8421
0
        return PyUnicode_DecodeLatin1(s, size, errors);
8422
8423
24.6k
    if (size == 0)
8424
0
        _Py_RETURN_UNICODE_EMPTY();
8425
24.6k
    _PyUnicodeWriter_Init(&writer);
8426
24.6k
    writer.min_length = size;
8427
24.6k
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
8428
0
        goto onError;
8429
8430
24.6k
    if (PyUnicode_CheckExact(mapping)) {
8431
24.6k
        if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8432
18
            goto onError;
8433
24.6k
    }
8434
0
    else {
8435
0
        if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8436
0
            goto onError;
8437
0
    }
8438
24.6k
    return _PyUnicodeWriter_Finish(&writer);
8439
8440
18
  onError:
8441
18
    _PyUnicodeWriter_Dealloc(&writer);
8442
18
    return NULL;
8443
24.6k
}
8444
8445
/* Charmap encoding: the lookup table */
8446
8447
/*[clinic input]
8448
class EncodingMap "struct encoding_map *" "&EncodingMapType"
8449
[clinic start generated code]*/
8450
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=14e46bbb6c522d22]*/
8451
8452
/* Three-level lookup trie built by PyUnicode_BuildEncodingMap() and read by
   encoding_map_lookup().  It maps a BMP code point to a byte value. */
struct encoding_map {
8453
    PyObject_HEAD
8454
    /* Indexed by (ch >> 11).  Holds the number of the level-2 table for
       that range, or 0xFF when the range has no entries. */
    unsigned char level1[32];
8455
    /* Number of 16-entry level-2 tables and of 128-entry level-3 tables
       stored in level23. */
    int count2, count3;
8456
    /* Variable-length tail: 16*count2 bytes of level-2 tables followed by
       128*count3 bytes of level-3 tables.  Declared with one element; the
       allocation in PyUnicode_BuildEncodingMap() adds the real size minus
       that one byte. */
    unsigned char level23[1];
8457
};
8458
8459
/*[clinic input]
8460
EncodingMap.size
8461
8462
Return the size (in bytes) of this object.
8463
[clinic start generated code]*/
8464
8465
static PyObject *
8466
EncodingMap_size_impl(struct encoding_map *self)
8467
/*[clinic end generated code: output=c4c969e4c99342a4 input=004ff13f26bb5366]*/
8468
0
{
8469
0
    return PyLong_FromLong((sizeof(*self) - 1) + 16*self->count2 +
8470
0
                           128*self->count3);
8471
0
}
8472
8473
/* Method table referenced by EncodingMapType.tp_methods.  It holds the
   ENCODINGMAP_SIZE_METHODDEF entry (presumably the Argument Clinic
   generated definition for EncodingMap.size -- verify in the clinic
   header) followed by the terminating sentinel. */
static PyMethodDef encoding_map_methods[] = {
8474
    ENCODINGMAP_SIZE_METHODDEF
8475
    {NULL, NULL}
8476
};
8477
8478
/* Type object for the trie built by PyUnicode_BuildEncodingMap(), which
   initializes its result with this type.  Only the name, the basic size,
   the default flags and the method table are set here. */
static PyTypeObject EncodingMapType = {
8479
    PyVarObject_HEAD_INIT(NULL, 0)
8480
    .tp_name = "EncodingMap",
8481
    /* Size of the fixed part only; the level23 tail is sized per object
       when it is allocated. */
    .tp_basicsize = sizeof(struct encoding_map),
8482
    /* methods */
8483
    .tp_flags = Py_TPFLAGS_DEFAULT,
8484
    .tp_methods = encoding_map_methods,
8485
};
8486
8487
PyObject*
8488
PyUnicode_BuildEncodingMap(PyObject* string)
8489
113
{
8490
113
    PyObject *result;
8491
113
    struct encoding_map *mresult;
8492
113
    int i;
8493
113
    int need_dict = 0;
8494
113
    unsigned char level1[32];
8495
113
    unsigned char level2[512];
8496
113
    unsigned char *mlevel1, *mlevel2, *mlevel3;
8497
113
    int count2 = 0, count3 = 0;
8498
113
    int kind;
8499
113
    const void *data;
8500
113
    int length;
8501
113
    Py_UCS4 ch;
8502
8503
113
    if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
8504
0
        PyErr_BadArgument();
8505
0
        return NULL;
8506
0
    }
8507
113
    kind = PyUnicode_KIND(string);
8508
113
    data = PyUnicode_DATA(string);
8509
113
    length = (int)Py_MIN(PyUnicode_GET_LENGTH(string), 256);
8510
113
    memset(level1, 0xFF, sizeof level1);
8511
113
    memset(level2, 0xFF, sizeof level2);
8512
8513
    /* If there isn't a one-to-one mapping of NULL to \0,
8514
       or if there are non-BMP characters, we need to use
8515
       a mapping dictionary. */
8516
113
    if (PyUnicode_READ(kind, data, 0) != 0)
8517
0
        need_dict = 1;
8518
28.9k
    for (i = 1; i < length; i++) {
8519
28.8k
        int l1, l2;
8520
28.8k
        ch = PyUnicode_READ(kind, data, i);
8521
28.8k
        if (ch == 0 || ch > 0xFFFF) {
8522
0
            need_dict = 1;
8523
0
            break;
8524
0
        }
8525
28.8k
        if (ch == 0xFFFE)
8526
            /* unmapped character */
8527
725
            continue;
8528
28.0k
        l1 = ch >> 11;
8529
28.0k
        l2 = ch >> 7;
8530
28.0k
        if (level1[l1] == 0xFF)
8531
205
            level1[l1] = count2++;
8532
28.0k
        if (level2[l2] == 0xFF)
8533
617
            level2[l2] = count3++;
8534
28.0k
    }
8535
8536
113
    if (count2 >= 0xFF || count3 >= 0xFF)
8537
0
        need_dict = 1;
8538
8539
113
    if (need_dict) {
8540
0
        PyObject *result = PyDict_New();
8541
0
        if (!result)
8542
0
            return NULL;
8543
0
        for (i = 0; i < length; i++) {
8544
0
            Py_UCS4 c = PyUnicode_READ(kind, data, i);
8545
0
            PyObject *key = PyLong_FromLong(c);
8546
0
            if (key == NULL) {
8547
0
                Py_DECREF(result);
8548
0
                return NULL;
8549
0
            }
8550
0
            PyObject *value = PyLong_FromLong(i);
8551
0
            if (value == NULL) {
8552
0
                Py_DECREF(key);
8553
0
                Py_DECREF(result);
8554
0
                return NULL;
8555
0
            }
8556
0
            int rc = PyDict_SetItem(result, key, value);
8557
0
            Py_DECREF(key);
8558
0
            Py_DECREF(value);
8559
0
            if (rc < 0) {
8560
0
                Py_DECREF(result);
8561
0
                return NULL;
8562
0
            }
8563
0
        }
8564
0
        return result;
8565
0
    }
8566
8567
    /* Create a three-level trie */
8568
113
    result = PyObject_Malloc(sizeof(struct encoding_map) +
8569
113
                             16*count2 + 128*count3 - 1);
8570
113
    if (!result) {
8571
0
        return PyErr_NoMemory();
8572
0
    }
8573
8574
113
    _PyObject_Init(result, &EncodingMapType);
8575
113
    mresult = (struct encoding_map*)result;
8576
113
    mresult->count2 = count2;
8577
113
    mresult->count3 = count3;
8578
113
    mlevel1 = mresult->level1;
8579
113
    mlevel2 = mresult->level23;
8580
113
    mlevel3 = mresult->level23 + 16*count2;
8581
113
    memcpy(mlevel1, level1, 32);
8582
113
    memset(mlevel2, 0xFF, 16*count2);
8583
113
    memset(mlevel3, 0, 128*count3);
8584
113
    count3 = 0;
8585
28.9k
    for (i = 1; i < length; i++) {
8586
28.8k
        int o1, o2, o3, i2, i3;
8587
28.8k
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8588
28.8k
        if (ch == 0xFFFE)
8589
            /* unmapped character */
8590
725
            continue;
8591
28.0k
        o1 = ch>>11;
8592
28.0k
        o2 = (ch>>7) & 0xF;
8593
28.0k
        i2 = 16*mlevel1[o1] + o2;
8594
28.0k
        if (mlevel2[i2] == 0xFF)
8595
617
            mlevel2[i2] = count3++;
8596
28.0k
        o3 = ch & 0x7F;
8597
28.0k
        i3 = 128*mlevel2[i2] + o3;
8598
28.0k
        mlevel3[i3] = i;
8599
28.0k
    }
8600
113
    return result;
8601
113
}
8602
8603
static int
8604
encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
8605
0
{
8606
0
    struct encoding_map *map = (struct encoding_map*)mapping;
8607
0
    int l1 = c>>11;
8608
0
    int l2 = (c>>7) & 0xF;
8609
0
    int l3 = c & 0x7F;
8610
0
    int i;
8611
8612
0
    if (c > 0xFFFF)
8613
0
        return -1;
8614
0
    if (c == 0)
8615
0
        return 0;
8616
    /* level 1*/
8617
0
    i = map->level1[l1];
8618
0
    if (i == 0xFF) {
8619
0
        return -1;
8620
0
    }
8621
    /* level 2*/
8622
0
    i = map->level23[16*i+l2];
8623
0
    if (i == 0xFF) {
8624
0
        return -1;
8625
0
    }
8626
    /* level 3 */
8627
0
    i = map->level23[16*map->count2 + 128*i + l3];
8628
0
    if (i == 0) {
8629
0
        return -1;
8630
0
    }
8631
0
    return i;
8632
0
}
8633
8634
/* Lookup the character in the mapping.
8635
   On success, return PyLong, PyBytes or None (if the character can't be found).
8636
   If the result is PyLong, put its value in replace.
8637
   On error, return NULL.
8638
   */
8639
static PyObject *
8640
charmapencode_lookup(Py_UCS4 c, PyObject *mapping, unsigned char *replace)
8641
0
{
8642
0
    PyObject *w = PyLong_FromLong((long)c);
8643
0
    PyObject *x;
8644
8645
0
    if (w == NULL)
8646
0
        return NULL;
8647
0
    int rc = PyMapping_GetOptionalItem(mapping, w, &x);
8648
0
    Py_DECREF(w);
8649
0
    if (rc == 0) {
8650
        /* No mapping found means: mapping is undefined. */
8651
0
        Py_RETURN_NONE;
8652
0
    }
8653
0
    if (x == NULL) {
8654
0
        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8655
            /* No mapping found means: mapping is undefined. */
8656
0
            PyErr_Clear();
8657
0
            Py_RETURN_NONE;
8658
0
        } else
8659
0
            return NULL;
8660
0
    }
8661
0
    else if (x == Py_None)
8662
0
        return x;
8663
0
    else if (PyLong_Check(x)) {
8664
0
        long value = PyLong_AsLong(x);
8665
0
        if (value < 0 || value > 255) {
8666
0
            PyErr_SetString(PyExc_TypeError,
8667
0
                            "character mapping must be in range(256)");
8668
0
            Py_DECREF(x);
8669
0
            return NULL;
8670
0
        }
8671
0
        *replace = (unsigned char)value;
8672
0
        return x;
8673
0
    }
8674
0
    else if (PyBytes_Check(x))
8675
0
        return x;
8676
0
    else {
8677
        /* wrong return value */
8678
0
        PyErr_Format(PyExc_TypeError,
8679
0
                     "character mapping must return integer, bytes or None, not %.400s",
8680
0
                     Py_TYPE(x)->tp_name);
8681
0
        Py_DECREF(x);
8682
0
        return NULL;
8683
0
    }
8684
0
}
8685
8686
static int
8687
charmapencode_resize(PyBytesWriter *writer, Py_ssize_t *outpos, Py_ssize_t requiredsize)
8688
0
{
8689
0
    Py_ssize_t outsize = PyBytesWriter_GetSize(writer);
8690
    /* exponentially overallocate to minimize reallocations */
8691
0
    if (requiredsize < 2 * outsize)
8692
0
        requiredsize = 2 * outsize;
8693
0
    return PyBytesWriter_Resize(writer, requiredsize);
8694
0
}
8695
8696
/* Result codes returned by charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,
    enc_FAILED,
    enc_EXCEPTION
} charmapencode_result;
8699
/* lookup the character, put the result in the output string and adjust
   various state variables. Resize the output bytes object if not enough
   space is available. Return enc_SUCCESS if the character was written,
   enc_FAILED if the mapping was undefined (in which case no character
   was written), or enc_EXCEPTION if the lookup or a reallocation
   failed. */
8705
static charmapencode_result
8706
charmapencode_output(Py_UCS4 c, PyObject *mapping,
8707
                     PyBytesWriter *writer, Py_ssize_t *outpos)
8708
0
{
8709
0
    PyObject *rep;
8710
0
    unsigned char replace;
8711
0
    char *outstart;
8712
0
    Py_ssize_t outsize = _PyBytesWriter_GetSize(writer);
8713
8714
0
    if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8715
0
        int res = encoding_map_lookup(c, mapping);
8716
0
        Py_ssize_t requiredsize = *outpos+1;
8717
0
        if (res == -1) {
8718
0
            return enc_FAILED;
8719
0
        }
8720
8721
0
        if (outsize<requiredsize) {
8722
0
            if (charmapencode_resize(writer, outpos, requiredsize)) {
8723
0
                return enc_EXCEPTION;
8724
0
            }
8725
0
        }
8726
0
        outstart = _PyBytesWriter_GetData(writer);
8727
0
        outstart[(*outpos)++] = (char)res;
8728
0
        return enc_SUCCESS;
8729
0
    }
8730
8731
0
    rep = charmapencode_lookup(c, mapping, &replace);
8732
0
    if (rep==NULL)
8733
0
        return enc_EXCEPTION;
8734
0
    else if (rep==Py_None) {
8735
0
        Py_DECREF(rep);
8736
0
        return enc_FAILED;
8737
0
    } else {
8738
0
        if (PyLong_Check(rep)) {
8739
0
            Py_ssize_t requiredsize = *outpos+1;
8740
0
            if (outsize<requiredsize)
8741
0
                if (charmapencode_resize(writer, outpos, requiredsize)) {
8742
0
                    Py_DECREF(rep);
8743
0
                    return enc_EXCEPTION;
8744
0
                }
8745
0
            outstart = _PyBytesWriter_GetData(writer);
8746
0
            outstart[(*outpos)++] = (char)replace;
8747
0
        }
8748
0
        else {
8749
0
            const char *repchars = PyBytes_AS_STRING(rep);
8750
0
            Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8751
0
            Py_ssize_t requiredsize = *outpos+repsize;
8752
0
            if (outsize<requiredsize)
8753
0
                if (charmapencode_resize(writer, outpos, requiredsize)) {
8754
0
                    Py_DECREF(rep);
8755
0
                    return enc_EXCEPTION;
8756
0
                }
8757
0
            outstart = _PyBytesWriter_GetData(writer);
8758
0
            memcpy(outstart + *outpos, repchars, repsize);
8759
0
            *outpos += repsize;
8760
0
        }
8761
0
    }
8762
0
    Py_DECREF(rep);
8763
0
    return enc_SUCCESS;
8764
0
}
8765
8766
/* handle an error in _PyUnicode_EncodeCharmap()
8767
   Return 0 on success, -1 on error */
8768
static int
8769
charmap_encoding_error(
8770
    PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
8771
    PyObject **exceptionObject,
8772
    _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
8773
    PyBytesWriter *writer, Py_ssize_t *respos)
8774
0
{
8775
0
    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8776
0
    Py_ssize_t size, repsize;
8777
0
    Py_ssize_t newpos;
8778
0
    int kind;
8779
0
    const void *data;
8780
0
    Py_ssize_t index;
8781
    /* startpos for collecting unencodable chars */
8782
0
    Py_ssize_t collstartpos = *inpos;
8783
0
    Py_ssize_t collendpos = *inpos+1;
8784
0
    Py_ssize_t collpos;
8785
0
    const char *encoding = "charmap";
8786
0
    const char *reason = "character maps to <undefined>";
8787
0
    charmapencode_result x;
8788
0
    Py_UCS4 ch;
8789
0
    int val;
8790
8791
0
    size = PyUnicode_GET_LENGTH(unicode);
8792
    /* find all unencodable characters */
8793
0
    while (collendpos < size) {
8794
0
        PyObject *rep;
8795
0
        unsigned char replace;
8796
0
        if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8797
0
            ch = PyUnicode_READ_CHAR(unicode, collendpos);
8798
0
            val = encoding_map_lookup(ch, mapping);
8799
0
            if (val != -1)
8800
0
                break;
8801
0
            ++collendpos;
8802
0
            continue;
8803
0
        }
8804
8805
0
        ch = PyUnicode_READ_CHAR(unicode, collendpos);
8806
0
        rep = charmapencode_lookup(ch, mapping, &replace);
8807
0
        if (rep==NULL)
8808
0
            return -1;
8809
0
        else if (rep!=Py_None) {
8810
0
            Py_DECREF(rep);
8811
0
            break;
8812
0
        }
8813
0
        Py_DECREF(rep);
8814
0
        ++collendpos;
8815
0
    }
8816
    /* cache callback name lookup
8817
     * (if not done yet, i.e. it's the first error) */
8818
0
    if (*error_handler == _Py_ERROR_UNKNOWN)
8819
0
        *error_handler = _Py_GetErrorHandler(errors);
8820
8821
0
    switch (*error_handler) {
8822
0
    case _Py_ERROR_STRICT:
8823
0
        raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8824
0
        return -1;
8825
8826
0
    case _Py_ERROR_REPLACE:
8827
0
        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
8828
0
            x = charmapencode_output('?', mapping, writer, respos);
8829
0
            if (x==enc_EXCEPTION) {
8830
0
                return -1;
8831
0
            }
8832
0
            else if (x==enc_FAILED) {
8833
0
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8834
0
                return -1;
8835
0
            }
8836
0
        }
8837
0
        _Py_FALLTHROUGH;
8838
0
    case _Py_ERROR_IGNORE:
8839
0
        *inpos = collendpos;
8840
0
        break;
8841
8842
0
    case _Py_ERROR_XMLCHARREFREPLACE:
8843
        /* generate replacement (temporarily (mis)uses p) */
8844
0
        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
8845
0
            char buffer[2+29+1+1];
8846
0
            char *cp;
8847
0
            sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
8848
0
            for (cp = buffer; *cp; ++cp) {
8849
0
                x = charmapencode_output(*cp, mapping, writer, respos);
8850
0
                if (x==enc_EXCEPTION)
8851
0
                    return -1;
8852
0
                else if (x==enc_FAILED) {
8853
0
                    raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8854
0
                    return -1;
8855
0
                }
8856
0
            }
8857
0
        }
8858
0
        *inpos = collendpos;
8859
0
        break;
8860
8861
0
    default:
8862
0
        repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
8863
0
                                                      encoding, reason, unicode, exceptionObject,
8864
0
                                                      collstartpos, collendpos, &newpos);
8865
0
        if (repunicode == NULL)
8866
0
            return -1;
8867
0
        if (PyBytes_Check(repunicode)) {
8868
            /* Directly copy bytes result to output. */
8869
0
            Py_ssize_t outsize = PyBytesWriter_GetSize(writer);
8870
0
            Py_ssize_t requiredsize;
8871
0
            repsize = PyBytes_Size(repunicode);
8872
0
            requiredsize = *respos + repsize;
8873
0
            if (requiredsize > outsize)
8874
                /* Make room for all additional bytes. */
8875
0
                if (charmapencode_resize(writer, respos, requiredsize)) {
8876
0
                    Py_DECREF(repunicode);
8877
0
                    return -1;
8878
0
                }
8879
0
            memcpy((char*)PyBytesWriter_GetData(writer) + *respos,
8880
0
                   PyBytes_AsString(repunicode),  repsize);
8881
0
            *respos += repsize;
8882
0
            *inpos = newpos;
8883
0
            Py_DECREF(repunicode);
8884
0
            break;
8885
0
        }
8886
        /* generate replacement  */
8887
0
        repsize = PyUnicode_GET_LENGTH(repunicode);
8888
0
        data = PyUnicode_DATA(repunicode);
8889
0
        kind = PyUnicode_KIND(repunicode);
8890
0
        for (index = 0; index < repsize; index++) {
8891
0
            Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8892
0
            x = charmapencode_output(repch, mapping, writer, respos);
8893
0
            if (x==enc_EXCEPTION) {
8894
0
                Py_DECREF(repunicode);
8895
0
                return -1;
8896
0
            }
8897
0
            else if (x==enc_FAILED) {
8898
0
                Py_DECREF(repunicode);
8899
0
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8900
0
                return -1;
8901
0
            }
8902
0
        }
8903
0
        *inpos = newpos;
8904
0
        Py_DECREF(repunicode);
8905
0
    }
8906
0
    return 0;
8907
0
}
8908
8909
/* Encode `unicode` to a bytes object using the character `mapping`.

   A NULL mapping selects Latin-1.  Characters the mapping cannot encode
   are passed to the error handler named by `errors`.

   Returns a new bytes object, or NULL with an exception set. */
PyObject *
_PyUnicode_EncodeCharmap(PyObject *unicode,
                         PyObject *mapping,
                         const char *errors)
{
    /* Default to Latin-1 */
    if (mapping == NULL) {
        return unicode_encode_ucs1(unicode, errors, 256);
    }

    Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
    if (size == 0) {
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
    }
    int kind = PyUnicode_KIND(unicode);
    const void *data = PyUnicode_DATA(unicode);

    /* error-handling state shared by every call to charmap_encoding_error() */
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
    PyObject *error_handler_obj = NULL;
    PyObject *exc = NULL;

    /* read position in the input, write position in the output */
    Py_ssize_t inpos = 0;
    Py_ssize_t respos = 0;

    /* Start with room for one byte per character; grown on demand. */
    PyBytesWriter *writer = PyBytesWriter_Create(size);
    if (writer == NULL) {
        goto onError;
    }

    if (Py_IS_TYPE(mapping, &EncodingMapType)) {
        /* EncodingMap fast path: write bytes straight into the buffer */
        char *out = _PyBytesWriter_GetData(writer);
        Py_ssize_t capacity = _PyBytesWriter_GetSize(writer);

        while (inpos < size) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
            int byte = encoding_map_lookup(ch, mapping);

            if (byte == -1) {
                /* unencodable: let the error handler deal with it */
                if (charmap_encoding_error(unicode, &inpos, mapping,
                                           &exc,
                                           &error_handler, &error_handler_obj, errors,
                                           writer, &respos)) {
                    goto onError;
                }
                /* the handler may have resized the writer */
                out = _PyBytesWriter_GetData(writer);
                capacity = _PyBytesWriter_GetSize(writer);
                continue;
            }

            Py_ssize_t needed = respos + 1;
            if (capacity < needed) {
                if (charmapencode_resize(writer, &respos, needed)) {
                    goto onError;
                }
                out = _PyBytesWriter_GetData(writer);
                capacity = _PyBytesWriter_GetSize(writer);
            }
            out[respos++] = (char)byte;
            ++inpos;
        }
    }
    else {
        /* generic mapping object */
        while (inpos < size) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);

            switch (charmapencode_output(ch, mapping, writer, &respos)) {
            case enc_EXCEPTION:
                goto onError;

            case enc_FAILED:
                /* unencodable character; the handler advances inpos */
                if (charmap_encoding_error(unicode, &inpos, mapping,
                                           &exc,
                                           &error_handler, &error_handler_obj, errors,
                                           writer, &respos)) {
                    goto onError;
                }
                break;

            default:
                /* encoded: move on to the next character */
                ++inpos;
                break;
            }
        }
    }

    Py_XDECREF(exc);
    Py_XDECREF(error_handler_obj);

    /* Trim the result to the number of bytes actually written. */
    return PyBytesWriter_FinishWithSize(writer, respos);

  onError:
    PyBytesWriter_Discard(writer);
    Py_XDECREF(exc);
    Py_XDECREF(error_handler_obj);
    return NULL;
}
9017
9018
/* Encode `unicode` with `mapping` using the default error handling
   (errors == NULL).  Both arguments are required: a non-str `unicode` or a
   NULL `mapping` is reported through PyErr_BadArgument(). */
PyObject *
PyUnicode_AsCharmapString(PyObject *unicode,
                          PyObject *mapping)
{
    if (PyUnicode_Check(unicode) && mapping != NULL) {
        return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
    }

    PyErr_BadArgument();
    return NULL;
}
9028
9029
/* create or adjust a UnicodeTranslateError */
9030
static void
9031
make_translate_exception(PyObject **exceptionObject,
9032
                         PyObject *unicode,
9033
                         Py_ssize_t startpos, Py_ssize_t endpos,
9034
                         const char *reason)
9035
0
{
9036
0
    if (*exceptionObject == NULL) {
9037
0
        *exceptionObject = _PyUnicodeTranslateError_Create(
9038
0
            unicode, startpos, endpos, reason);
9039
0
    }
9040
0
    else {
9041
0
        if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
9042
0
            goto onError;
9043
0
        if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
9044
0
            goto onError;
9045
0
        if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
9046
0
            goto onError;
9047
0
        return;
9048
0
      onError:
9049
0
        Py_CLEAR(*exceptionObject);
9050
0
    }
9051
0
}
9052
9053
/* error handling callback helper:
9054
   build arguments, call the callback and check the arguments,
9055
   put the result into newpos and return the replacement string, which
9056
   has to be freed by the caller */
9057
static PyObject *
9058
unicode_translate_call_errorhandler(const char *errors,
9059
                                    PyObject **errorHandler,
9060
                                    const char *reason,
9061
                                    PyObject *unicode, PyObject **exceptionObject,
9062
                                    Py_ssize_t startpos, Py_ssize_t endpos,
9063
                                    Py_ssize_t *newpos)
9064
0
{
9065
0
    static const char *argparse = "Un;translating error handler must return (str, int) tuple";
9066
9067
0
    Py_ssize_t i_newpos;
9068
0
    PyObject *restuple;
9069
0
    PyObject *resunicode;
9070
9071
0
    if (*errorHandler == NULL) {
9072
0
        *errorHandler = PyCodec_LookupError(errors);
9073
0
        if (*errorHandler == NULL)
9074
0
            return NULL;
9075
0
    }
9076
9077
0
    make_translate_exception(exceptionObject,
9078
0
                             unicode, startpos, endpos, reason);
9079
0
    if (*exceptionObject == NULL)
9080
0
        return NULL;
9081
9082
0
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
9083
0
    if (restuple == NULL)
9084
0
        return NULL;
9085
0
    if (!PyTuple_Check(restuple)) {
9086
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
9087
0
        Py_DECREF(restuple);
9088
0
        return NULL;
9089
0
    }
9090
0
    if (!PyArg_ParseTuple(restuple, argparse,
9091
0
                          &resunicode, &i_newpos)) {
9092
0
        Py_DECREF(restuple);
9093
0
        return NULL;
9094
0
    }
9095
0
    if (i_newpos<0)
9096
0
        *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
9097
0
    else
9098
0
        *newpos = i_newpos;
9099
0
    if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
9100
0
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
9101
0
        Py_DECREF(restuple);
9102
0
        return NULL;
9103
0
    }
9104
0
    Py_INCREF(resunicode);
9105
0
    Py_DECREF(restuple);
9106
0
    return resunicode;
9107
0
}
9108
9109
/* Lookup the character ch in the mapping and put the result in result,
9110
   which must be decrefed by the caller.
9111
   The result can be PyLong, PyUnicode, None or NULL.
9112
   If the result is PyLong, put its value in replace.
9113
   Return 0 on success, -1 on error */
9114
static int
9115
charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result, Py_UCS4 *replace)
9116
338
{
9117
338
    PyObject *w = PyLong_FromLong((long)c);
9118
338
    PyObject *x;
9119
9120
338
    if (w == NULL)
9121
0
        return -1;
9122
338
    int rc = PyMapping_GetOptionalItem(mapping, w, &x);
9123
338
    Py_DECREF(w);
9124
338
    if (rc == 0) {
9125
        /* No mapping found means: use 1:1 mapping. */
9126
158
        *result = NULL;
9127
158
        return 0;
9128
158
    }
9129
180
    if (x == NULL) {
9130
0
        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
9131
            /* No mapping found means: use 1:1 mapping. */
9132
0
            PyErr_Clear();
9133
0
            *result = NULL;
9134
0
            return 0;
9135
0
        } else
9136
0
            return -1;
9137
0
    }
9138
180
    else if (x == Py_None) {
9139
0
        *result = x;
9140
0
        return 0;
9141
0
    }
9142
180
    else if (PyLong_Check(x)) {
9143
0
        long value = PyLong_AsLong(x);
9144
0
        if (value < 0 || value > MAX_UNICODE) {
9145
0
            PyErr_Format(PyExc_ValueError,
9146
0
                         "character mapping must be in range(0x%x)",
9147
0
                         MAX_UNICODE+1);
9148
0
            Py_DECREF(x);
9149
0
            return -1;
9150
0
        }
9151
0
        *result = x;
9152
0
        *replace = (Py_UCS4)value;
9153
0
        return 0;
9154
0
    }
9155
180
    else if (PyUnicode_Check(x)) {
9156
180
        *result = x;
9157
180
        return 0;
9158
180
    }
9159
0
    else {
9160
        /* wrong return value */
9161
0
        PyErr_SetString(PyExc_TypeError,
9162
0
                        "character mapping must return integer, None or str");
9163
0
        Py_DECREF(x);
9164
0
        return -1;
9165
0
    }
9166
180
}
9167
9168
/* lookup the character, write the result into the writer.
9169
   Return 1 if the result was written into the writer, return 0 if the mapping
9170
   was undefined, raise an exception return -1 on error. */
9171
static int
9172
charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
9173
                        _PyUnicodeWriter *writer)
9174
210
{
9175
210
    PyObject *item;
9176
210
    Py_UCS4 replace;
9177
9178
210
    if (charmaptranslate_lookup(ch, mapping, &item, &replace))
9179
0
        return -1;
9180
9181
210
    if (item == NULL) {
9182
        /* not found => default to 1:1 mapping */
9183
86
        if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
9184
0
            return -1;
9185
0
        }
9186
86
        return 1;
9187
86
    }
9188
9189
124
    if (item == Py_None) {
9190
0
        Py_DECREF(item);
9191
0
        return 0;
9192
0
    }
9193
9194
124
    if (PyLong_Check(item)) {
9195
0
        if (_PyUnicodeWriter_WriteCharInline(writer, replace) < 0) {
9196
0
            Py_DECREF(item);
9197
0
            return -1;
9198
0
        }
9199
0
        Py_DECREF(item);
9200
0
        return 1;
9201
0
    }
9202
9203
124
    if (!PyUnicode_Check(item)) {
9204
0
        Py_DECREF(item);
9205
0
        return -1;
9206
0
    }
9207
9208
124
    if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
9209
0
        Py_DECREF(item);
9210
0
        return -1;
9211
0
    }
9212
9213
124
    Py_DECREF(item);
9214
124
    return 1;
9215
124
}
9216
9217
static int
9218
unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
9219
                              Py_UCS1 *translate)
9220
128
{
9221
128
    PyObject *item = NULL;
9222
128
    Py_UCS4 replace;
9223
128
    int ret = 0;
9224
9225
128
    if (charmaptranslate_lookup(ch, mapping, &item, &replace)) {
9226
0
        return -1;
9227
0
    }
9228
9229
128
    if (item == Py_None) {
9230
        /* deletion */
9231
0
        translate[ch] = 0xfe;
9232
0
    }
9233
128
    else if (item == NULL) {
9234
        /* not found => default to 1:1 mapping */
9235
72
        translate[ch] = ch;
9236
72
        return 1;
9237
72
    }
9238
56
    else if (PyLong_Check(item)) {
9239
0
        if (replace > 127) {
9240
            /* invalid character or character outside ASCII:
9241
               skip the fast translate */
9242
0
            goto exit;
9243
0
        }
9244
0
        translate[ch] = (Py_UCS1)replace;
9245
0
    }
9246
56
    else if (PyUnicode_Check(item)) {
9247
56
        if (PyUnicode_GET_LENGTH(item) != 1)
9248
56
            goto exit;
9249
9250
0
        replace = PyUnicode_READ_CHAR(item, 0);
9251
0
        if (replace > 127)
9252
0
            goto exit;
9253
0
        translate[ch] = (Py_UCS1)replace;
9254
0
    }
9255
0
    else {
9256
        /* not None, NULL, long or unicode */
9257
0
        goto exit;
9258
0
    }
9259
0
    ret = 1;
9260
9261
56
  exit:
9262
56
    Py_DECREF(item);
9263
56
    return ret;
9264
0
}
9265
9266
/* Fast path for ascii => ascii translation. Return 1 if the whole string
9267
   was translated into writer, return 0 if the input string was partially
9268
   translated into writer, raise an exception and return -1 on error. */
9269
static int
9270
unicode_fast_translate(PyObject *input, PyObject *mapping,
9271
                       _PyUnicodeWriter *writer, int ignore,
9272
                       Py_ssize_t *input_pos)
9273
104
{
9274
104
    Py_UCS1 ascii_table[128], ch, ch2;
9275
104
    Py_ssize_t len;
9276
104
    const Py_UCS1 *in, *end;
9277
104
    Py_UCS1 *out;
9278
104
    int res = 0;
9279
9280
104
    len = PyUnicode_GET_LENGTH(input);
9281
9282
104
    memset(ascii_table, 0xff, 128);
9283
9284
104
    in = PyUnicode_1BYTE_DATA(input);
9285
104
    end = in + len;
9286
9287
104
    assert(PyUnicode_IS_ASCII(writer->buffer));
9288
104
    assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9289
104
    out = PyUnicode_1BYTE_DATA(writer->buffer);
9290
9291
190
    for (; in < end; in++) {
9292
142
        ch = *in;
9293
142
        ch2 = ascii_table[ch];
9294
142
        if (ch2 == 0xff) {
9295
128
            int translate = unicode_fast_translate_lookup(mapping, ch,
9296
128
                                                          ascii_table);
9297
128
            if (translate < 0)
9298
0
                return -1;
9299
128
            if (translate == 0)
9300
56
                goto exit;
9301
72
            ch2 = ascii_table[ch];
9302
72
        }
9303
86
        if (ch2 == 0xfe) {
9304
0
            if (ignore)
9305
0
                continue;
9306
0
            goto exit;
9307
0
        }
9308
86
        assert(ch2 < 128);
9309
86
        *out = ch2;
9310
86
        out++;
9311
86
    }
9312
48
    res = 1;
9313
9314
104
exit:
9315
104
    writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
9316
104
    *input_pos = in - PyUnicode_1BYTE_DATA(input);
9317
104
    return res;
9318
48
}
9319
9320
/* Translate `input` character by character through `mapping`.

   Characters without an entry are copied unchanged.  Characters mapped to
   None are untranslatable: they are dropped when `errors` is "ignore", and
   otherwise handed to the error handler named by `errors`.

   Returns a new str, or NULL with an exception set. */
static PyObject *
_PyUnicode_TranslateCharmap(PyObject *input,
                            PyObject *mapping,
                            const char *errors)
{
    const char *reason = "character maps to <undefined>";
    /* error handler state, created lazily on the first error */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    /* output buffer */
    _PyUnicodeWriter writer;
    /* current position in the input */
    Py_ssize_t pos = 0;
    int ignore;

    if (mapping == NULL) {
        PyErr_BadArgument();
        return NULL;
    }

    const void *data = PyUnicode_DATA(input);
    int kind = PyUnicode_KIND(input);
    Py_ssize_t size = PyUnicode_GET_LENGTH(input);

    if (size == 0) {
        return PyUnicode_FromObject(input);
    }

    /* allocate enough for a simple 1:1 translation without
       replacements, if we need more, we'll resize */
    _PyUnicodeWriter_Init(&writer);
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1) {
        goto onError;
    }

    ignore = (errors != NULL && strcmp(errors, "ignore") == 0);

    if (PyUnicode_IS_ASCII(input)) {
        /* try the table-driven ASCII path first; it reports in `pos`
           how far it got */
        int fast = unicode_fast_translate(input, mapping, &writer, ignore, &pos);
        if (fast < 0) {
            _PyUnicodeWriter_Dealloc(&writer);
            return NULL;
        }
        if (fast == 1) {
            return _PyUnicodeWriter_Finish(&writer);
        }
    }
    else {
        pos = 0;
    }

    while (pos < size) {
        /* try to translate the current character */
        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        int translated = charmaptranslate_output(ch, mapping, &writer);
        if (translated < 0) {
            goto onError;
        }
        if (translated != 0) {
            /* it worked => advance the input position */
            ++pos;
            continue;
        }

        /* untranslatable character: collect the whole run of them */
        Py_ssize_t collstart = pos;
        Py_ssize_t collend = pos + 1;
        for (; collend < size; ++collend) {
            PyObject *x;
            Py_UCS4 replace;
            int is_none;

            ch = PyUnicode_READ(kind, data, collend);
            if (charmaptranslate_lookup(ch, mapping, &x, &replace)) {
                goto onError;
            }
            is_none = (x == Py_None);
            Py_XDECREF(x);
            if (!is_none) {
                break;
            }
        }

        if (ignore) {
            pos = collend;
            continue;
        }

        /* let the error handler supply a replacement and resume position */
        Py_ssize_t newpos;
        PyObject *repunicode = unicode_translate_call_errorhandler(
            errors, &errorHandler,
            reason, input, &exc,
            collstart, collend, &newpos);
        if (repunicode == NULL) {
            goto onError;
        }
        int rc = _PyUnicodeWriter_WriteStr(&writer, repunicode);
        Py_DECREF(repunicode);
        if (rc < 0) {
            goto onError;
        }
        pos = newpos;
    }

    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return NULL;
}
9436
9437
/* Public entry point for character-map translation: checks `str` with
   ensure_unicode() and delegates to _PyUnicode_TranslateCharmap().
   Returns NULL when that check fails. */
PyObject *
PyUnicode_Translate(PyObject *str,
                    PyObject *mapping,
                    const char *errors)
{
    if (ensure_unicode(str) < 0) {
        return NULL;
    }

    PyObject *translated = _PyUnicode_TranslateCharmap(str, mapping, errors);
    return translated;
}
9446
9447
/* Return an ASCII version of `unicode` in which whitespace characters are
   replaced by ' ' and decimal digits by their ASCII digit.

   An ASCII string is returned as-is (new reference).  Otherwise a new
   string is built: code points below 127 are copied unchanged; the first
   character that is neither whitespace nor a decimal digit is replaced by
   '?' and the result is truncated right after it.

   Returns NULL with an exception set on error. */
PyObject *
_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadInternalCall();
        return NULL;
    }
    if (PyUnicode_IS_ASCII(unicode)) {
        /* If the string is already ASCII, just return the same string */
        return Py_NewRef(unicode);
    }

    Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
    PyObject *result = PyUnicode_New(len, 127);
    if (result == NULL) {
        return NULL;
    }

    Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
    int kind = PyUnicode_KIND(unicode);
    const void *data = PyUnicode_DATA(unicode);

    for (Py_ssize_t i = 0; i < len; ++i) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);

        if (ch < 127) {
            out[i] = ch;
            continue;
        }
        if (Py_UNICODE_ISSPACE(ch)) {
            out[i] = ' ';
            continue;
        }

        int decimal = Py_UNICODE_TODECIMAL(ch);
        if (decimal < 0) {
            /* not convertible: mark the spot and cut the string here */
            out[i] = '?';
            out[i+1] = '\0';
            _PyUnicode_LENGTH(result) = i + 1;
            break;
        }
        out[i] = '0' + decimal;
    }

    assert(_PyUnicode_CheckConsistency(result, 1));
    return result;
}
9492
9493
/* --- Helpers ------------------------------------------------------------ */
9494
9495
/* Helper macro to fix up start/end slice values against a length `len`.

   `end` is clipped to `len`; a negative `end` or `start` is taken relative
   to `len` and then clamped to 0.  `start` and `end` must be modifiable
   lvalues and are updated in place.  All three arguments are evaluated
   more than once, so they must not have side effects.  Every use of a
   parameter is parenthesized so that expression arguments (e.g. a
   conditional expression passed as `len`) keep their meaning. */
#define ADJUST_INDICES(start, end, len) \
    do {                                \
        if ((end) > (len)) {            \
            (end) = (len);              \
        }                               \
        else if ((end) < 0) {           \
            (end) += (len);             \
            if ((end) < 0) {            \
                (end) = 0;              \
            }                           \
        }                               \
        if ((start) < 0) {              \
            (start) += (len);           \
            if ((start) < 0) {          \
                (start) = 0;            \
            }                           \
        }                               \
    } while (0)
9514
9515
static Py_ssize_t
9516
any_find_slice(PyObject* s1, PyObject* s2,
9517
               Py_ssize_t start,
9518
               Py_ssize_t end,
9519
               int direction)
9520
30.3M
{
9521
30.3M
    int kind1, kind2;
9522
30.3M
    const void *buf1, *buf2;
9523
30.3M
    Py_ssize_t len1, len2, result;
9524
9525
30.3M
    kind1 = PyUnicode_KIND(s1);
9526
30.3M
    kind2 = PyUnicode_KIND(s2);
9527
30.3M
    if (kind1 < kind2)
9528
0
        return -1;
9529
9530
30.3M
    len1 = PyUnicode_GET_LENGTH(s1);
9531
30.3M
    len2 = PyUnicode_GET_LENGTH(s2);
9532
30.3M
    ADJUST_INDICES(start, end, len1);
9533
30.3M
    if (end - start < len2)
9534
3.07M
        return -1;
9535
9536
27.3M
    buf1 = PyUnicode_DATA(s1);
9537
27.3M
    buf2 = PyUnicode_DATA(s2);
9538
27.3M
    if (len2 == 1) {
9539
27.2M
        Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9540
27.2M
        result = findchar((const char *)buf1 + kind1*start,
9541
27.2M
                          kind1, end - start, ch, direction);
9542
27.2M
        if (result == -1)
9543
3.88M
            return -1;
9544
23.3M
        else
9545
23.3M
            return start + result;
9546
27.2M
    }
9547
9548
68.3k
    if (kind2 != kind1) {
9549
46.9k
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
9550
46.9k
        if (!buf2)
9551
0
            return -2;
9552
46.9k
    }
9553
9554
68.3k
    if (direction > 0) {
9555
68.3k
        switch (kind1) {
9556
21.3k
        case PyUnicode_1BYTE_KIND:
9557
21.3k
            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9558
6.58k
                result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9559
14.7k
            else
9560
14.7k
                result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9561
21.3k
            break;
9562
20.3k
        case PyUnicode_2BYTE_KIND:
9563
20.3k
            result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9564
20.3k
            break;
9565
26.5k
        case PyUnicode_4BYTE_KIND:
9566
26.5k
            result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9567
26.5k
            break;
9568
0
        default:
9569
0
            Py_UNREACHABLE();
9570
68.3k
        }
9571
68.3k
    }
9572
0
    else {
9573
0
        switch (kind1) {
9574
0
        case PyUnicode_1BYTE_KIND:
9575
0
            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9576
0
                result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9577
0
            else
9578
0
                result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9579
0
            break;
9580
0
        case PyUnicode_2BYTE_KIND:
9581
0
            result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9582
0
            break;
9583
0
        case PyUnicode_4BYTE_KIND:
9584
0
            result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9585
0
            break;
9586
0
        default:
9587
0
            Py_UNREACHABLE();
9588
0
        }
9589
0
    }
9590
9591
68.3k
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
9592
68.3k
    if (kind2 != kind1)
9593
46.9k
        PyMem_Free((void *)buf2);
9594
9595
68.3k
    return result;
9596
68.3k
}
9597
9598
9599
Py_ssize_t
9600
PyUnicode_Count(PyObject *str,
9601
                PyObject *substr,
9602
                Py_ssize_t start,
9603
                Py_ssize_t end)
9604
0
{
9605
0
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9606
0
        return -1;
9607
9608
0
    return unicode_count_impl(str, substr, start, end);
9609
0
}
9610
9611
Py_ssize_t
9612
PyUnicode_Find(PyObject *str,
9613
               PyObject *substr,
9614
               Py_ssize_t start,
9615
               Py_ssize_t end,
9616
               int direction)
9617
0
{
9618
0
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9619
0
        return -2;
9620
9621
0
    return any_find_slice(str, substr, start, end, direction);
9622
0
}
9623
9624
Py_ssize_t
9625
PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9626
                   Py_ssize_t start, Py_ssize_t end,
9627
                   int direction)
9628
993k
{
9629
993k
    int kind;
9630
993k
    Py_ssize_t len, result;
9631
993k
    len = PyUnicode_GET_LENGTH(str);
9632
993k
    ADJUST_INDICES(start, end, len);
9633
993k
    if (end - start < 1)
9634
0
        return -1;
9635
993k
    kind = PyUnicode_KIND(str);
9636
993k
    result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9637
993k
                      kind, end-start, ch, direction);
9638
993k
    if (result == -1)
9639
454k
        return -1;
9640
538k
    else
9641
538k
        return start + result;
9642
993k
}
9643
9644
static int
9645
tailmatch(PyObject *self,
9646
          PyObject *substring,
9647
          Py_ssize_t start,
9648
          Py_ssize_t end,
9649
          int direction)
9650
104M
{
9651
104M
    int kind_self;
9652
104M
    int kind_sub;
9653
104M
    const void *data_self;
9654
104M
    const void *data_sub;
9655
104M
    Py_ssize_t offset;
9656
104M
    Py_ssize_t i;
9657
104M
    Py_ssize_t end_sub;
9658
9659
104M
    ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9660
104M
    end -= PyUnicode_GET_LENGTH(substring);
9661
104M
    if (end < start)
9662
10.9M
        return 0;
9663
9664
93.5M
    if (PyUnicode_GET_LENGTH(substring) == 0)
9665
0
        return 1;
9666
9667
93.5M
    kind_self = PyUnicode_KIND(self);
9668
93.5M
    data_self = PyUnicode_DATA(self);
9669
93.5M
    kind_sub = PyUnicode_KIND(substring);
9670
93.5M
    data_sub = PyUnicode_DATA(substring);
9671
93.5M
    end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9672
9673
93.5M
    if (direction > 0)
9674
7.93M
        offset = end;
9675
85.6M
    else
9676
85.6M
        offset = start;
9677
9678
93.5M
    if (PyUnicode_READ(kind_self, data_self, offset) ==
9679
93.5M
        PyUnicode_READ(kind_sub, data_sub, 0) &&
9680
47.9M
        PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9681
47.9M
        PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9682
        /* If both are of the same kind, memcmp is sufficient */
9683
16.4M
        if (kind_self == kind_sub) {
9684
9.69M
            return ! memcmp((char *)data_self +
9685
9.69M
                                (offset * PyUnicode_KIND(substring)),
9686
9.69M
                            data_sub,
9687
9.69M
                            PyUnicode_GET_LENGTH(substring) *
9688
9.69M
                                PyUnicode_KIND(substring));
9689
9.69M
        }
9690
        /* otherwise we have to compare each character by first accessing it */
9691
6.78M
        else {
9692
            /* We do not need to compare 0 and len(substring)-1 because
9693
               the if statement above ensured already that they are equal
9694
               when we end up here. */
9695
6.84M
            for (i = 1; i < end_sub; ++i) {
9696
56.4k
                if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9697
56.4k
                    PyUnicode_READ(kind_sub, data_sub, i))
9698
3.25k
                    return 0;
9699
56.4k
            }
9700
6.78M
            return 1;
9701
6.78M
        }
9702
16.4M
    }
9703
9704
77.0M
    return 0;
9705
93.5M
}
9706
9707
Py_ssize_t
9708
PyUnicode_Tailmatch(PyObject *str,
9709
                    PyObject *substr,
9710
                    Py_ssize_t start,
9711
                    Py_ssize_t end,
9712
                    int direction)
9713
0
{
9714
0
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9715
0
        return -1;
9716
9717
0
    return tailmatch(str, substr, start, end, direction);
9718
0
}
9719
9720
/* Build a new string of the same length as `self`, created with
   PyUnicode_New(len, 127), by running the byte-wise helper
   _Py_bytes_lower() (when `lower` is non-zero) or _Py_bytes_upper()
   (otherwise) over the data of `self`.

   Returns a new reference, or NULL if PyUnicode_New() fails. */
static PyObject *
ascii_upper_or_lower(PyObject *self, int lower)
{
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);

    PyObject *converted = PyUnicode_New(nchars, 127);
    if (converted == NULL) {
        return NULL;
    }

    const char *src = PyUnicode_DATA(self);
    char *dst = PyUnicode_DATA(converted);
    if (lower) {
        _Py_bytes_lower(dst, src, nchars);
    }
    else {
        _Py_bytes_upper(dst, src, nchars);
    }
    return converted;
}
9738
9739
static Py_UCS4
9740
handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
9741
190k
{
9742
190k
    Py_ssize_t j;
9743
190k
    int final_sigma;
9744
190k
    Py_UCS4 c = 0;   /* initialize to prevent gcc warning */
9745
    /* U+03A3 is in the Final_Sigma context when, it is found like this:
9746
9747
     \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9748
9749
    where ! is a negation and \p{xxx} is a character with property xxx.
9750
    */
9751
378k
    for (j = i - 1; j >= 0; j--) {
9752
376k
        c = PyUnicode_READ(kind, data, j);
9753
376k
        if (!_PyUnicode_IsCaseIgnorable(c))
9754
188k
            break;
9755
376k
    }
9756
190k
    final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9757
190k
    if (final_sigma) {
9758
304k
        for (j = i + 1; j < length; j++) {
9759
300k
            c = PyUnicode_READ(kind, data, j);
9760
300k
            if (!_PyUnicode_IsCaseIgnorable(c))
9761
139k
                break;
9762
300k
        }
9763
143k
        final_sigma = j == length || !_PyUnicode_IsCased(c);
9764
143k
    }
9765
190k
    return (final_sigma) ? 0x3C2 : 0x3C3;
9766
190k
}
9767
9768
static int
9769
lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
9770
           Py_UCS4 c, Py_UCS4 *mapped)
9771
76.4M
{
9772
    /* Obscure special case. */
9773
76.4M
    if (c == 0x3A3) {
9774
190k
        mapped[0] = handle_capital_sigma(kind, data, length, i);
9775
190k
        return 1;
9776
190k
    }
9777
76.2M
    return _PyUnicode_ToLowerFull(c, mapped);
9778
76.4M
}
9779
9780
static Py_ssize_t
9781
do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9782
0
{
9783
0
    Py_ssize_t i, k = 0;
9784
0
    int n_res, j;
9785
0
    Py_UCS4 c, mapped[3];
9786
9787
0
    c = PyUnicode_READ(kind, data, 0);
9788
0
    n_res = _PyUnicode_ToTitleFull(c, mapped);
9789
0
    for (j = 0; j < n_res; j++) {
9790
0
        *maxchar = Py_MAX(*maxchar, mapped[j]);
9791
0
        res[k++] = mapped[j];
9792
0
    }
9793
0
    for (i = 1; i < length; i++) {
9794
0
        c = PyUnicode_READ(kind, data, i);
9795
0
        n_res = lower_ucs4(kind, data, length, i, c, mapped);
9796
0
        for (j = 0; j < n_res; j++) {
9797
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9798
0
            res[k++] = mapped[j];
9799
0
        }
9800
0
    }
9801
0
    return k;
9802
0
}
9803
9804
static Py_ssize_t
9805
0
do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9806
0
    Py_ssize_t i, k = 0;
9807
9808
0
    for (i = 0; i < length; i++) {
9809
0
        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9810
0
        int n_res, j;
9811
0
        if (Py_UNICODE_ISUPPER(c)) {
9812
0
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9813
0
        }
9814
0
        else if (Py_UNICODE_ISLOWER(c)) {
9815
0
            n_res = _PyUnicode_ToUpperFull(c, mapped);
9816
0
        }
9817
0
        else {
9818
0
            n_res = 1;
9819
0
            mapped[0] = c;
9820
0
        }
9821
0
        for (j = 0; j < n_res; j++) {
9822
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9823
0
            res[k++] = mapped[j];
9824
0
        }
9825
0
    }
9826
0
    return k;
9827
0
}
9828
9829
static Py_ssize_t
9830
do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
9831
                  Py_UCS4 *maxchar, int lower)
9832
7.11M
{
9833
7.11M
    Py_ssize_t i, k = 0;
9834
9835
83.5M
    for (i = 0; i < length; i++) {
9836
76.4M
        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9837
76.4M
        int n_res, j;
9838
76.4M
        if (lower)
9839
76.4M
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9840
0
        else
9841
0
            n_res = _PyUnicode_ToUpperFull(c, mapped);
9842
152M
        for (j = 0; j < n_res; j++) {
9843
76.4M
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9844
76.4M
            res[k++] = mapped[j];
9845
76.4M
        }
9846
76.4M
    }
9847
7.11M
    return k;
9848
7.11M
}
9849
9850
static Py_ssize_t
9851
do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9852
0
{
9853
0
    return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9854
0
}
9855
9856
static Py_ssize_t
9857
do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9858
7.11M
{
9859
7.11M
    return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9860
7.11M
}
9861
9862
static Py_ssize_t
9863
do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9864
0
{
9865
0
    Py_ssize_t i, k = 0;
9866
9867
0
    for (i = 0; i < length; i++) {
9868
0
        Py_UCS4 c = PyUnicode_READ(kind, data, i);
9869
0
        Py_UCS4 mapped[3];
9870
0
        int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9871
0
        for (j = 0; j < n_res; j++) {
9872
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9873
0
            res[k++] = mapped[j];
9874
0
        }
9875
0
    }
9876
0
    return k;
9877
0
}
9878
9879
static Py_ssize_t
9880
do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9881
0
{
9882
0
    Py_ssize_t i, k = 0;
9883
0
    int previous_is_cased;
9884
9885
0
    previous_is_cased = 0;
9886
0
    for (i = 0; i < length; i++) {
9887
0
        const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9888
0
        Py_UCS4 mapped[3];
9889
0
        int n_res, j;
9890
9891
0
        if (previous_is_cased)
9892
0
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9893
0
        else
9894
0
            n_res = _PyUnicode_ToTitleFull(c, mapped);
9895
9896
0
        for (j = 0; j < n_res; j++) {
9897
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9898
0
            res[k++] = mapped[j];
9899
0
        }
9900
9901
0
        previous_is_cased = _PyUnicode_IsCased(c);
9902
0
    }
9903
0
    return k;
9904
0
}
9905
9906
static PyObject *
9907
case_operation(PyObject *self,
9908
               Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9909
7.11M
{
9910
7.11M
    PyObject *res = NULL;
9911
7.11M
    Py_ssize_t length, newlength = 0;
9912
7.11M
    int kind, outkind;
9913
7.11M
    const void *data;
9914
7.11M
    void *outdata;
9915
7.11M
    Py_UCS4 maxchar = 0, *tmp, *tmpend;
9916
9917
7.11M
    kind = PyUnicode_KIND(self);
9918
7.11M
    data = PyUnicode_DATA(self);
9919
7.11M
    length = PyUnicode_GET_LENGTH(self);
9920
7.11M
    if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
9921
0
        PyErr_SetString(PyExc_OverflowError, "string is too long");
9922
0
        return NULL;
9923
0
    }
9924
7.11M
    tmp = PyMem_Malloc(sizeof(Py_UCS4) * 3 * length);
9925
7.11M
    if (tmp == NULL)
9926
0
        return PyErr_NoMemory();
9927
7.11M
    newlength = perform(kind, data, length, tmp, &maxchar);
9928
7.11M
    res = PyUnicode_New(newlength, maxchar);
9929
7.11M
    if (res == NULL)
9930
0
        goto leave;
9931
7.11M
    tmpend = tmp + newlength;
9932
7.11M
    outdata = PyUnicode_DATA(res);
9933
7.11M
    outkind = PyUnicode_KIND(res);
9934
7.11M
    switch (outkind) {
9935
223k
    case PyUnicode_1BYTE_KIND:
9936
223k
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9937
223k
        break;
9938
6.85M
    case PyUnicode_2BYTE_KIND:
9939
6.85M
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9940
6.85M
        break;
9941
42.7k
    case PyUnicode_4BYTE_KIND:
9942
42.7k
        memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9943
42.7k
        break;
9944
0
    default:
9945
0
        Py_UNREACHABLE();
9946
7.11M
    }
9947
7.11M
  leave:
9948
7.11M
    PyMem_Free(tmp);
9949
7.11M
    return res;
9950
7.11M
}
9951
9952
/* Join the items of `seq` with `separator`.

   `seq` is first converted with PySequence_Fast(); the item array of the
   converted object is then handed to _PyUnicode_JoinArray() inside the
   Py_BEGIN/END_CRITICAL_SECTION_SEQUENCE_FAST bracket taken on `seq`.

   Returns the result of _PyUnicode_JoinArray(), or NULL if
   PySequence_Fast() fails. */
PyObject *
PyUnicode_Join(PyObject *separator, PyObject *seq)
{
    PyObject *joined;
    PyObject **elements;
    Py_ssize_t count;

    PyObject *fast = PySequence_Fast(seq, "can only join an iterable");
    if (fast == NULL) {
        return NULL;
    }

    Py_BEGIN_CRITICAL_SECTION_SEQUENCE_FAST(seq);

    elements = PySequence_Fast_ITEMS(fast);
    count = PySequence_Fast_GET_SIZE(fast);
    joined = _PyUnicode_JoinArray(separator, elements, count);

    Py_END_CRITICAL_SECTION_SEQUENCE_FAST();

    Py_DECREF(fast);
    return joined;
}
9976
9977
PyObject *
9978
_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
9979
65.9M
{
9980
65.9M
    PyObject *res = NULL; /* the result */
9981
65.9M
    PyObject *sep = NULL;
9982
65.9M
    Py_ssize_t seplen;
9983
65.9M
    PyObject *item;
9984
65.9M
    Py_ssize_t sz, i, res_offset;
9985
65.9M
    Py_UCS4 maxchar;
9986
65.9M
    Py_UCS4 item_maxchar;
9987
65.9M
    int use_memcpy;
9988
65.9M
    unsigned char *res_data = NULL, *sep_data = NULL;
9989
65.9M
    PyObject *last_obj;
9990
65.9M
    int kind = 0;
9991
9992
    /* If empty sequence, return u"". */
9993
65.9M
    if (seqlen == 0) {
9994
7.21M
        _Py_RETURN_UNICODE_EMPTY();
9995
7.21M
    }
9996
9997
    /* If singleton sequence with an exact Unicode, return that. */
9998
58.6M
    last_obj = NULL;
9999
58.6M
    if (seqlen == 1) {
10000
8.33M
        if (PyUnicode_CheckExact(items[0])) {
10001
6.66M
            res = items[0];
10002
6.66M
            return Py_NewRef(res);
10003
6.66M
        }
10004
1.67M
        seplen = 0;
10005
1.67M
        maxchar = 0;
10006
1.67M
    }
10007
50.3M
    else {
10008
        /* Set up sep and seplen */
10009
50.3M
        if (separator == NULL) {
10010
            /* fall back to a blank space separator */
10011
0
            sep = PyUnicode_FromOrdinal(' ');
10012
0
            if (!sep)
10013
0
                goto onError;
10014
0
            seplen = 1;
10015
0
            maxchar = 32;
10016
0
        }
10017
50.3M
        else {
10018
50.3M
            if (!PyUnicode_Check(separator)) {
10019
0
                PyErr_Format(PyExc_TypeError,
10020
0
                             "separator: expected str instance,"
10021
0
                             " %.80s found",
10022
0
                             Py_TYPE(separator)->tp_name);
10023
0
                goto onError;
10024
0
            }
10025
50.3M
            sep = separator;
10026
50.3M
            seplen = PyUnicode_GET_LENGTH(separator);
10027
50.3M
            maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10028
            /* inc refcount to keep this code path symmetric with the
10029
               above case of a blank separator */
10030
50.3M
            Py_INCREF(sep);
10031
50.3M
        }
10032
50.3M
        last_obj = sep;
10033
50.3M
    }
10034
10035
    /* There are at least two things to join, or else we have a subclass
10036
     * of str in the sequence.
10037
     * Do a pre-pass to figure out the total amount of space we'll
10038
     * need (sz), and see whether all argument are strings.
10039
     */
10040
52.0M
    sz = 0;
10041
#ifdef Py_DEBUG
10042
    use_memcpy = 0;
10043
#else
10044
52.0M
    use_memcpy = 1;
10045
52.0M
#endif
10046
408M
    for (i = 0; i < seqlen; i++) {
10047
356M
        size_t add_sz;
10048
356M
        item = items[i];
10049
356M
        if (!PyUnicode_Check(item)) {
10050
0
            PyErr_Format(PyExc_TypeError,
10051
0
                         "sequence item %zd: expected str instance,"
10052
0
                         " %.80s found",
10053
0
                         i, Py_TYPE(item)->tp_name);
10054
0
            goto onError;
10055
0
        }
10056
356M
        add_sz = PyUnicode_GET_LENGTH(item);
10057
356M
        item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
10058
356M
        maxchar = Py_MAX(maxchar, item_maxchar);
10059
356M
        if (i != 0) {
10060
304M
            add_sz += seplen;
10061
304M
        }
10062
356M
        if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
10063
0
            PyErr_SetString(PyExc_OverflowError,
10064
0
                            "join() result is too long for a Python string");
10065
0
            goto onError;
10066
0
        }
10067
356M
        sz += add_sz;
10068
356M
        if (use_memcpy && last_obj != NULL) {
10069
288M
            if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10070
6.10M
                use_memcpy = 0;
10071
288M
        }
10072
356M
        last_obj = item;
10073
356M
    }
10074
10075
52.0M
    res = PyUnicode_New(sz, maxchar);
10076
52.0M
    if (res == NULL)
10077
0
        goto onError;
10078
10079
    /* Catenate everything. */
10080
#ifdef Py_DEBUG
10081
    use_memcpy = 0;
10082
#else
10083
52.0M
    if (use_memcpy) {
10084
45.9M
        res_data = PyUnicode_1BYTE_DATA(res);
10085
45.9M
        kind = PyUnicode_KIND(res);
10086
45.9M
        if (seplen != 0)
10087
19.3k
            sep_data = PyUnicode_1BYTE_DATA(sep);
10088
45.9M
    }
10089
52.0M
#endif
10090
52.0M
    if (use_memcpy) {
10091
303M
        for (i = 0; i < seqlen; ++i) {
10092
258M
            Py_ssize_t itemlen;
10093
258M
            item = items[i];
10094
10095
            /* Copy item, and maybe the separator. */
10096
258M
            if (i && seplen != 0) {
10097
26.1k
                memcpy(res_data,
10098
26.1k
                          sep_data,
10099
26.1k
                          kind * seplen);
10100
26.1k
                res_data += kind * seplen;
10101
26.1k
            }
10102
10103
258M
            itemlen = PyUnicode_GET_LENGTH(item);
10104
258M
            if (itemlen != 0) {
10105
224M
                memcpy(res_data,
10106
224M
                          PyUnicode_DATA(item),
10107
224M
                          kind * itemlen);
10108
224M
                res_data += kind * itemlen;
10109
224M
            }
10110
258M
        }
10111
45.9M
        assert(res_data == PyUnicode_1BYTE_DATA(res)
10112
45.9M
                           + kind * PyUnicode_GET_LENGTH(res));
10113
45.9M
    }
10114
6.10M
    else {
10115
104M
        for (i = 0, res_offset = 0; i < seqlen; ++i) {
10116
98.7M
            Py_ssize_t itemlen;
10117
98.7M
            item = items[i];
10118
10119
            /* Copy item, and maybe the separator. */
10120
98.7M
            if (i && seplen != 0) {
10121
63.6k
                _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10122
63.6k
                res_offset += seplen;
10123
63.6k
            }
10124
10125
98.7M
            itemlen = PyUnicode_GET_LENGTH(item);
10126
98.7M
            if (itemlen != 0) {
10127
96.3M
                _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
10128
96.3M
                res_offset += itemlen;
10129
96.3M
            }
10130
98.7M
        }
10131
6.10M
        assert(res_offset == PyUnicode_GET_LENGTH(res));
10132
6.10M
    }
10133
10134
52.0M
    Py_XDECREF(sep);
10135
52.0M
    assert(_PyUnicode_CheckConsistency(res, 1));
10136
52.0M
    return res;
10137
10138
0
  onError:
10139
0
    Py_XDECREF(sep);
10140
0
    Py_XDECREF(res);
10141
0
    return NULL;
10142
52.0M
}
10143
10144
void
10145
_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10146
                    Py_UCS4 fill_char)
10147
641
{
10148
641
    const int kind = PyUnicode_KIND(unicode);
10149
641
    void *data = PyUnicode_DATA(unicode);
10150
641
    assert(_PyUnicode_IsModifiable(unicode));
10151
641
    assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10152
641
    assert(start >= 0);
10153
641
    assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10154
641
    _PyUnicode_Fill(kind, data, fill_char, start, length);
10155
641
}
10156
10157
Py_ssize_t
10158
PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10159
               Py_UCS4 fill_char)
10160
641
{
10161
641
    Py_ssize_t maxlen;
10162
10163
641
    if (!PyUnicode_Check(unicode)) {
10164
0
        PyErr_BadInternalCall();
10165
0
        return -1;
10166
0
    }
10167
641
    if (unicode_check_modifiable(unicode))
10168
0
        return -1;
10169
10170
641
    if (start < 0) {
10171
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
10172
0
        return -1;
10173
0
    }
10174
641
    if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10175
0
        PyErr_SetString(PyExc_ValueError,
10176
0
                         "fill character is bigger than "
10177
0
                         "the string maximum character");
10178
0
        return -1;
10179
0
    }
10180
10181
641
    maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10182
641
    length = Py_MIN(maxlen, length);
10183
641
    if (length <= 0)
10184
0
        return 0;
10185
10186
641
    _PyUnicode_FastFill(unicode, start, length, fill_char);
10187
641
    return length;
10188
641
}
10189
10190
static PyObject *
10191
pad(PyObject *self,
10192
    Py_ssize_t left,
10193
    Py_ssize_t right,
10194
    Py_UCS4 fill)
10195
0
{
10196
0
    PyObject *u;
10197
0
    Py_UCS4 maxchar;
10198
0
    int kind;
10199
0
    void *data;
10200
10201
0
    if (left < 0)
10202
0
        left = 0;
10203
0
    if (right < 0)
10204
0
        right = 0;
10205
10206
0
    if (left == 0 && right == 0)
10207
0
        return unicode_result_unchanged(self);
10208
10209
0
    if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10210
0
        right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
10211
0
        PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10212
0
        return NULL;
10213
0
    }
10214
0
    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10215
0
    maxchar = Py_MAX(maxchar, fill);
10216
0
    u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
10217
0
    if (!u)
10218
0
        return NULL;
10219
10220
0
    kind = PyUnicode_KIND(u);
10221
0
    data = PyUnicode_DATA(u);
10222
0
    if (left)
10223
0
        _PyUnicode_Fill(kind, data, fill, 0, left);
10224
0
    if (right)
10225
0
        _PyUnicode_Fill(kind, data, fill,
10226
0
                        left + _PyUnicode_LENGTH(self), right);
10227
0
    _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
10228
0
    assert(_PyUnicode_CheckConsistency(u, 1));
10229
0
    return u;
10230
0
}
10231
10232
/* Split `string` into lines by dispatching to the *_splitlines helper
   that matches the string's kind (with a dedicated helper for ASCII
   1-byte strings).  `keepends` is forwarded to the helper unchanged.

   Returns whatever the helper returns, or NULL if ensure_unicode()
   rejects `string`. */
PyObject *
PyUnicode_Splitlines(PyObject *string, int keepends)
{
    if (ensure_unicode(string) < 0) {
        return NULL;
    }

    PyObject *lines;
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(string);

    switch (PyUnicode_KIND(string)) {
    case PyUnicode_1BYTE_KIND:
        if (PyUnicode_IS_ASCII(string)) {
            lines = asciilib_splitlines(
                string, PyUnicode_1BYTE_DATA(string), nchars, keepends);
        }
        else {
            lines = ucs1lib_splitlines(
                string, PyUnicode_1BYTE_DATA(string), nchars, keepends);
        }
        break;
    case PyUnicode_2BYTE_KIND:
        lines = ucs2lib_splitlines(
            string, PyUnicode_2BYTE_DATA(string), nchars, keepends);
        break;
    case PyUnicode_4BYTE_KIND:
        lines = ucs4lib_splitlines(
            string, PyUnicode_4BYTE_DATA(string), nchars, keepends);
        break;
    default:
        Py_UNREACHABLE();
    }
    return lines;
}
10266
10267
static PyObject *
10268
split(PyObject *self,
10269
      PyObject *substring,
10270
      Py_ssize_t maxcount)
10271
23.9M
{
10272
23.9M
    int kind1, kind2;
10273
23.9M
    const void *buf1, *buf2;
10274
23.9M
    Py_ssize_t len1, len2;
10275
23.9M
    PyObject* out;
10276
23.9M
    len1 = PyUnicode_GET_LENGTH(self);
10277
23.9M
    kind1 = PyUnicode_KIND(self);
10278
10279
23.9M
    if (substring == NULL) {
10280
174k
        if (maxcount < 0) {
10281
148k
            maxcount = (len1 - 1) / 2 + 1;
10282
148k
        }
10283
174k
        switch (kind1) {
10284
112k
        case PyUnicode_1BYTE_KIND:
10285
112k
            if (PyUnicode_IS_ASCII(self))
10286
86.3k
                return asciilib_split_whitespace(
10287
86.3k
                    self,  PyUnicode_1BYTE_DATA(self),
10288
86.3k
                    len1, maxcount
10289
86.3k
                    );
10290
26.2k
            else
10291
26.2k
                return ucs1lib_split_whitespace(
10292
26.2k
                    self,  PyUnicode_1BYTE_DATA(self),
10293
26.2k
                    len1, maxcount
10294
26.2k
                    );
10295
49.3k
        case PyUnicode_2BYTE_KIND:
10296
49.3k
            return ucs2lib_split_whitespace(
10297
49.3k
                self,  PyUnicode_2BYTE_DATA(self),
10298
49.3k
                len1, maxcount
10299
49.3k
                );
10300
12.0k
        case PyUnicode_4BYTE_KIND:
10301
12.0k
            return ucs4lib_split_whitespace(
10302
12.0k
                self,  PyUnicode_4BYTE_DATA(self),
10303
12.0k
                len1, maxcount
10304
12.0k
                );
10305
0
        default:
10306
0
            Py_UNREACHABLE();
10307
174k
        }
10308
174k
    }
10309
10310
23.7M
    kind2 = PyUnicode_KIND(substring);
10311
23.7M
    len2 = PyUnicode_GET_LENGTH(substring);
10312
23.7M
    if (maxcount < 0) {
10313
        // if len2 == 0, it will raise ValueError.
10314
15.6M
        maxcount = len2 == 0 ? 0 : (len1 / len2) + 1;
10315
        // handle expected overflow case: (Py_SSIZE_T_MAX / 1) + 1
10316
15.6M
        maxcount = maxcount < 0 ? len1 : maxcount;
10317
15.6M
    }
10318
23.7M
    if (kind1 < kind2 || len1 < len2) {
10319
2.72M
        out = PyList_New(1);
10320
2.72M
        if (out == NULL)
10321
0
            return NULL;
10322
2.72M
        PyList_SET_ITEM(out, 0, Py_NewRef(self));
10323
2.72M
        return out;
10324
2.72M
    }
10325
21.0M
    buf1 = PyUnicode_DATA(self);
10326
21.0M
    buf2 = PyUnicode_DATA(substring);
10327
21.0M
    if (kind2 != kind1) {
10328
215k
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
10329
215k
        if (!buf2)
10330
0
            return NULL;
10331
215k
    }
10332
10333
21.0M
    switch (kind1) {
10334
20.7M
    case PyUnicode_1BYTE_KIND:
10335
20.7M
        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10336
19.3M
            out = asciilib_split(
10337
19.3M
                self,  buf1, len1, buf2, len2, maxcount);
10338
1.43M
        else
10339
1.43M
            out = ucs1lib_split(
10340
1.43M
                self,  buf1, len1, buf2, len2, maxcount);
10341
20.7M
        break;
10342
177k
    case PyUnicode_2BYTE_KIND:
10343
177k
        out = ucs2lib_split(
10344
177k
            self,  buf1, len1, buf2, len2, maxcount);
10345
177k
        break;
10346
37.8k
    case PyUnicode_4BYTE_KIND:
10347
37.8k
        out = ucs4lib_split(
10348
37.8k
            self,  buf1, len1, buf2, len2, maxcount);
10349
37.8k
        break;
10350
0
    default:
10351
0
        out = NULL;
10352
21.0M
    }
10353
21.0M
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10354
21.0M
    if (kind2 != kind1)
10355
215k
        PyMem_Free((void *)buf2);
10356
21.0M
    return out;
10357
21.0M
}
10358
10359
static PyObject *
10360
rsplit(PyObject *self,
10361
       PyObject *substring,
10362
       Py_ssize_t maxcount)
10363
50
{
10364
50
    int kind1, kind2;
10365
50
    const void *buf1, *buf2;
10366
50
    Py_ssize_t len1, len2;
10367
50
    PyObject* out;
10368
10369
50
    len1 = PyUnicode_GET_LENGTH(self);
10370
50
    kind1 = PyUnicode_KIND(self);
10371
10372
50
    if (substring == NULL) {
10373
0
        if (maxcount < 0) {
10374
0
            maxcount = (len1 - 1) / 2 + 1;
10375
0
        }
10376
0
        switch (kind1) {
10377
0
        case PyUnicode_1BYTE_KIND:
10378
0
            if (PyUnicode_IS_ASCII(self))
10379
0
                return asciilib_rsplit_whitespace(
10380
0
                    self,  PyUnicode_1BYTE_DATA(self),
10381
0
                    len1, maxcount
10382
0
                    );
10383
0
            else
10384
0
                return ucs1lib_rsplit_whitespace(
10385
0
                    self,  PyUnicode_1BYTE_DATA(self),
10386
0
                    len1, maxcount
10387
0
                    );
10388
0
        case PyUnicode_2BYTE_KIND:
10389
0
            return ucs2lib_rsplit_whitespace(
10390
0
                self,  PyUnicode_2BYTE_DATA(self),
10391
0
                len1, maxcount
10392
0
                );
10393
0
        case PyUnicode_4BYTE_KIND:
10394
0
            return ucs4lib_rsplit_whitespace(
10395
0
                self,  PyUnicode_4BYTE_DATA(self),
10396
0
                len1, maxcount
10397
0
                );
10398
0
        default:
10399
0
            Py_UNREACHABLE();
10400
0
        }
10401
0
    }
10402
50
    kind2 = PyUnicode_KIND(substring);
10403
50
    len2 = PyUnicode_GET_LENGTH(substring);
10404
50
    if (maxcount < 0) {
10405
        // if len2 == 0, it will raise ValueError.
10406
0
        maxcount = len2 == 0 ? 0 : (len1 / len2) + 1;
10407
        // handle expected overflow case: (Py_SSIZE_T_MAX / 1) + 1
10408
0
        maxcount = maxcount < 0 ? len1 : maxcount;
10409
0
    }
10410
50
    if (kind1 < kind2 || len1 < len2) {
10411
0
        out = PyList_New(1);
10412
0
        if (out == NULL)
10413
0
            return NULL;
10414
0
        PyList_SET_ITEM(out, 0, Py_NewRef(self));
10415
0
        return out;
10416
0
    }
10417
50
    buf1 = PyUnicode_DATA(self);
10418
50
    buf2 = PyUnicode_DATA(substring);
10419
50
    if (kind2 != kind1) {
10420
0
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
10421
0
        if (!buf2)
10422
0
            return NULL;
10423
0
    }
10424
10425
50
    switch (kind1) {
10426
50
    case PyUnicode_1BYTE_KIND:
10427
50
        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10428
50
            out = asciilib_rsplit(
10429
50
                self,  buf1, len1, buf2, len2, maxcount);
10430
0
        else
10431
0
            out = ucs1lib_rsplit(
10432
0
                self,  buf1, len1, buf2, len2, maxcount);
10433
50
        break;
10434
0
    case PyUnicode_2BYTE_KIND:
10435
0
        out = ucs2lib_rsplit(
10436
0
            self,  buf1, len1, buf2, len2, maxcount);
10437
0
        break;
10438
0
    case PyUnicode_4BYTE_KIND:
10439
0
        out = ucs4lib_rsplit(
10440
0
            self,  buf1, len1, buf2, len2, maxcount);
10441
0
        break;
10442
0
    default:
10443
0
        out = NULL;
10444
50
    }
10445
50
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10446
50
    if (kind2 != kind1)
10447
0
        PyMem_Free((void *)buf2);
10448
50
    return out;
10449
50
}
10450
10451
static Py_ssize_t
10452
anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10453
            PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
10454
189M
{
10455
189M
    switch (kind) {
10456
24.5M
    case PyUnicode_1BYTE_KIND:
10457
24.5M
        if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10458
19.9M
            return asciilib_find(buf1, len1, buf2, len2, offset);
10459
4.62M
        else
10460
4.62M
            return ucs1lib_find(buf1, len1, buf2, len2, offset);
10461
73.6M
    case PyUnicode_2BYTE_KIND:
10462
73.6M
        return ucs2lib_find(buf1, len1, buf2, len2, offset);
10463
91.5M
    case PyUnicode_4BYTE_KIND:
10464
91.5M
        return ucs4lib_find(buf1, len1, buf2, len2, offset);
10465
189M
    }
10466
189M
    Py_UNREACHABLE();
10467
189M
}
10468
10469
static Py_ssize_t
10470
anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10471
             PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
10472
41.5M
{
10473
41.5M
    switch (kind) {
10474
34.9M
    case PyUnicode_1BYTE_KIND:
10475
34.9M
        return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10476
6.42M
    case PyUnicode_2BYTE_KIND:
10477
6.42M
        return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10478
139k
    case PyUnicode_4BYTE_KIND:
10479
139k
        return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10480
41.5M
    }
10481
41.5M
    Py_UNREACHABLE();
10482
41.5M
}
10483
10484
static void
10485
replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10486
                      Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10487
1.67M
{
10488
1.67M
    int kind = PyUnicode_KIND(u);
10489
1.67M
    void *data = PyUnicode_DATA(u);
10490
1.67M
    Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10491
1.67M
    if (kind == PyUnicode_1BYTE_KIND) {
10492
585k
        ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10493
585k
                                      (Py_UCS1 *)data + len,
10494
585k
                                      u1, u2, maxcount);
10495
585k
    }
10496
1.09M
    else if (kind == PyUnicode_2BYTE_KIND) {
10497
1.07M
        ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10498
1.07M
                                      (Py_UCS2 *)data + len,
10499
1.07M
                                      u1, u2, maxcount);
10500
1.07M
    }
10501
19.1k
    else {
10502
19.1k
        assert(kind == PyUnicode_4BYTE_KIND);
10503
19.1k
        ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10504
19.1k
                                      (Py_UCS4 *)data + len,
10505
19.1k
                                      u1, u2, maxcount);
10506
19.1k
    }
10507
1.67M
}
10508
10509
static PyObject *
10510
replace(PyObject *self, PyObject *str1,
10511
        PyObject *str2, Py_ssize_t maxcount)
10512
74.8M
{
10513
74.8M
    PyObject *u;
10514
74.8M
    const char *sbuf = PyUnicode_DATA(self);
10515
74.8M
    const void *buf1 = PyUnicode_DATA(str1);
10516
74.8M
    const void *buf2 = PyUnicode_DATA(str2);
10517
74.8M
    int srelease = 0, release1 = 0, release2 = 0;
10518
74.8M
    int skind = PyUnicode_KIND(self);
10519
74.8M
    int kind1 = PyUnicode_KIND(str1);
10520
74.8M
    int kind2 = PyUnicode_KIND(str2);
10521
74.8M
    Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10522
74.8M
    Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10523
74.8M
    Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
10524
74.8M
    int mayshrink;
10525
74.8M
    Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
10526
10527
74.8M
    if (slen < len1)
10528
25.9M
        goto nothing;
10529
10530
48.9M
    if (maxcount < 0)
10531
48.9M
        maxcount = PY_SSIZE_T_MAX;
10532
0
    else if (maxcount == 0)
10533
0
        goto nothing;
10534
10535
48.9M
    if (str1 == str2)
10536
0
        goto nothing;
10537
10538
48.9M
    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10539
48.9M
    maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10540
48.9M
    if (maxchar < maxchar_str1)
10541
        /* substring too wide to be present */
10542
0
        goto nothing;
10543
48.9M
    maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10544
    /* Replacing str1 with str2 may cause a maxchar reduction in the
10545
       result string. */
10546
48.9M
    mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
10547
48.9M
    maxchar = Py_MAX(maxchar, maxchar_str2);
10548
10549
48.9M
    if (len1 == len2) {
10550
        /* same length */
10551
7.45M
        if (len1 == 0)
10552
0
            goto nothing;
10553
7.45M
        if (len1 == 1) {
10554
            /* replace characters */
10555
7.45M
            Py_UCS4 u1, u2;
10556
7.45M
            Py_ssize_t pos;
10557
10558
7.45M
            u1 = PyUnicode_READ(kind1, buf1, 0);
10559
7.45M
            pos = findchar(sbuf, skind, slen, u1, 1);
10560
7.45M
            if (pos < 0)
10561
5.77M
                goto nothing;
10562
1.67M
            u2 = PyUnicode_READ(kind2, buf2, 0);
10563
1.67M
            u = PyUnicode_New(slen, maxchar);
10564
1.67M
            if (!u)
10565
0
                goto error;
10566
10567
1.67M
            _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10568
1.67M
            replace_1char_inplace(u, pos, u1, u2, maxcount);
10569
1.67M
        }
10570
0
        else {
10571
0
            int rkind = skind;
10572
0
            char *res;
10573
0
            Py_ssize_t i;
10574
10575
0
            if (kind1 < rkind) {
10576
                /* widen substring */
10577
0
                buf1 = unicode_askind(kind1, buf1, len1, rkind);
10578
0
                if (!buf1) goto error;
10579
0
                release1 = 1;
10580
0
            }
10581
0
            i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
10582
0
            if (i < 0)
10583
0
                goto nothing;
10584
0
            if (rkind > kind2) {
10585
                /* widen replacement */
10586
0
                buf2 = unicode_askind(kind2, buf2, len2, rkind);
10587
0
                if (!buf2) goto error;
10588
0
                release2 = 1;
10589
0
            }
10590
0
            else if (rkind < kind2) {
10591
                /* widen self and buf1 */
10592
0
                rkind = kind2;
10593
0
                if (release1) {
10594
0
                    assert(buf1 != PyUnicode_DATA(str1));
10595
0
                    PyMem_Free((void *)buf1);
10596
0
                    buf1 = PyUnicode_DATA(str1);
10597
0
                    release1 = 0;
10598
0
                }
10599
0
                sbuf = unicode_askind(skind, sbuf, slen, rkind);
10600
0
                if (!sbuf) goto error;
10601
0
                srelease = 1;
10602
0
                buf1 = unicode_askind(kind1, buf1, len1, rkind);
10603
0
                if (!buf1) goto error;
10604
0
                release1 = 1;
10605
0
            }
10606
0
            u = PyUnicode_New(slen, maxchar);
10607
0
            if (!u)
10608
0
                goto error;
10609
0
            assert(PyUnicode_KIND(u) == rkind);
10610
0
            res = PyUnicode_DATA(u);
10611
10612
0
            memcpy(res, sbuf, rkind * slen);
10613
            /* change everything in-place, starting with this one */
10614
0
            memcpy(res + rkind * i,
10615
0
                   buf2,
10616
0
                   rkind * len2);
10617
0
            i += len1;
10618
10619
0
            while ( --maxcount > 0) {
10620
0
                i = anylib_find(rkind, self,
10621
0
                                sbuf+rkind*i, slen-i,
10622
0
                                str1, buf1, len1, i);
10623
0
                if (i == -1)
10624
0
                    break;
10625
0
                memcpy(res + rkind * i,
10626
0
                       buf2,
10627
0
                       rkind * len2);
10628
0
                i += len1;
10629
0
            }
10630
0
        }
10631
7.45M
    }
10632
41.5M
    else {
10633
41.5M
        Py_ssize_t n, i, j, ires;
10634
41.5M
        Py_ssize_t new_size;
10635
41.5M
        int rkind = skind;
10636
41.5M
        char *res;
10637
10638
41.5M
        if (kind1 < rkind) {
10639
            /* widen substring */
10640
6.56M
            buf1 = unicode_askind(kind1, buf1, len1, rkind);
10641
6.56M
            if (!buf1) goto error;
10642
6.56M
            release1 = 1;
10643
6.56M
        }
10644
41.5M
        n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
10645
41.5M
        if (n == 0)
10646
36.5M
            goto nothing;
10647
5.01M
        if (kind2 < rkind) {
10648
            /* widen replacement */
10649
1.31M
            buf2 = unicode_askind(kind2, buf2, len2, rkind);
10650
1.31M
            if (!buf2) goto error;
10651
1.31M
            release2 = 1;
10652
1.31M
        }
10653
3.69M
        else if (kind2 > rkind) {
10654
            /* widen self and buf1 */
10655
0
            rkind = kind2;
10656
0
            sbuf = unicode_askind(skind, sbuf, slen, rkind);
10657
0
            if (!sbuf) goto error;
10658
0
            srelease = 1;
10659
0
            if (release1) {
10660
0
                assert(buf1 != PyUnicode_DATA(str1));
10661
0
                PyMem_Free((void *)buf1);
10662
0
                buf1 = PyUnicode_DATA(str1);
10663
0
                release1 = 0;
10664
0
            }
10665
0
            buf1 = unicode_askind(kind1, buf1, len1, rkind);
10666
0
            if (!buf1) goto error;
10667
0
            release1 = 1;
10668
0
        }
10669
        /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10670
           PyUnicode_GET_LENGTH(str1)); */
10671
5.01M
        if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
10672
0
                PyErr_SetString(PyExc_OverflowError,
10673
0
                                "replace string is too long");
10674
0
                goto error;
10675
0
        }
10676
5.01M
        new_size = slen + n * (len2 - len1);
10677
5.01M
        if (new_size == 0) {
10678
0
            u = _PyUnicode_GetEmpty();
10679
0
            goto done;
10680
0
        }
10681
5.01M
        if (new_size > (PY_SSIZE_T_MAX / rkind)) {
10682
0
            PyErr_SetString(PyExc_OverflowError,
10683
0
                            "replace string is too long");
10684
0
            goto error;
10685
0
        }
10686
5.01M
        u = PyUnicode_New(new_size, maxchar);
10687
5.01M
        if (!u)
10688
0
            goto error;
10689
5.01M
        assert(PyUnicode_KIND(u) == rkind);
10690
5.01M
        res = PyUnicode_DATA(u);
10691
5.01M
        ires = i = 0;
10692
5.01M
        if (len1 > 0) {
10693
194M
            while (n-- > 0) {
10694
                /* look for next match */
10695
189M
                j = anylib_find(rkind, self,
10696
189M
                                sbuf + rkind * i, slen-i,
10697
189M
                                str1, buf1, len1, i);
10698
189M
                if (j == -1)
10699
0
                    break;
10700
189M
                else if (j > i) {
10701
                    /* copy unchanged part [i:j] */
10702
21.9M
                    memcpy(res + rkind * ires,
10703
21.9M
                           sbuf + rkind * i,
10704
21.9M
                           rkind * (j-i));
10705
21.9M
                    ires += j - i;
10706
21.9M
                }
10707
                /* copy substitution string */
10708
189M
                if (len2 > 0) {
10709
189M
                    memcpy(res + rkind * ires,
10710
189M
                           buf2,
10711
189M
                           rkind * len2);
10712
189M
                    ires += len2;
10713
189M
                }
10714
189M
                i = j + len1;
10715
189M
            }
10716
5.01M
            if (i < slen)
10717
                /* copy tail [i:] */
10718
4.91M
                memcpy(res + rkind * ires,
10719
4.91M
                       sbuf + rkind * i,
10720
4.91M
                       rkind * (slen-i));
10721
5.01M
        }
10722
0
        else {
10723
            /* interleave */
10724
0
            while (n > 0) {
10725
0
                memcpy(res + rkind * ires,
10726
0
                       buf2,
10727
0
                       rkind * len2);
10728
0
                ires += len2;
10729
0
                if (--n <= 0)
10730
0
                    break;
10731
0
                memcpy(res + rkind * ires,
10732
0
                       sbuf + rkind * i,
10733
0
                       rkind);
10734
0
                ires++;
10735
0
                i++;
10736
0
            }
10737
0
            memcpy(res + rkind * ires,
10738
0
                   sbuf + rkind * i,
10739
0
                   rkind * (slen-i));
10740
0
        }
10741
5.01M
    }
10742
10743
6.68M
    if (mayshrink) {
10744
0
        unicode_adjust_maxchar(&u);
10745
0
        if (u == NULL)
10746
0
            goto error;
10747
0
    }
10748
10749
6.68M
  done:
10750
6.68M
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
10751
6.68M
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10752
6.68M
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10753
6.68M
    if (srelease)
10754
0
        PyMem_Free((void *)sbuf);
10755
6.68M
    if (release1)
10756
1.31M
        PyMem_Free((void *)buf1);
10757
6.68M
    if (release2)
10758
1.31M
        PyMem_Free((void *)buf2);
10759
6.68M
    assert(_PyUnicode_CheckConsistency(u, 1));
10760
6.68M
    return u;
10761
10762
68.2M
  nothing:
10763
    /* nothing to replace; return original string (when possible) */
10764
68.2M
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
10765
68.2M
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10766
68.2M
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10767
68.2M
    if (srelease)
10768
0
        PyMem_Free((void *)sbuf);
10769
68.2M
    if (release1)
10770
5.25M
        PyMem_Free((void *)buf1);
10771
68.2M
    if (release2)
10772
0
        PyMem_Free((void *)buf2);
10773
68.2M
    return unicode_result_unchanged(self);
10774
10775
0
  error:
10776
0
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
10777
0
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10778
0
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10779
0
    if (srelease)
10780
0
        PyMem_Free((void *)sbuf);
10781
0
    if (release1)
10782
0
        PyMem_Free((void *)buf1);
10783
0
    if (release2)
10784
0
        PyMem_Free((void *)buf2);
10785
0
    return NULL;
10786
6.68M
}
10787
10788
/* --- Unicode Object Methods --------------------------------------------- */
10789
10790
/*[clinic input]
10791
@permit_long_docstring_body
10792
str.title as unicode_title
10793
10794
Return a version of the string where each word is titlecased.
10795
10796
More specifically, words start with uppercased characters and all remaining
10797
cased characters have lower case.
10798
[clinic start generated code]*/
10799
10800
static PyObject *
unicode_title_impl(PyObject *self)
/*[clinic end generated code: output=c75ae03809574902 input=533ce0eb6a7f5d1b]*/
{
    /* All the work is done by the shared case-mapping driver, run with the
       do_title callback. */
    PyObject *titled = case_operation(self, do_title);
    return titled;
}
10806
10807
/*[clinic input]
10808
@permit_long_docstring_body
10809
str.capitalize as unicode_capitalize
10810
10811
Return a capitalized version of the string.
10812
10813
More specifically, make the first character have upper case and the rest lower
10814
case.
10815
[clinic start generated code]*/
10816
10817
static PyObject *
unicode_capitalize_impl(PyObject *self)
/*[clinic end generated code: output=e49a4c333cdb7667 input=a4a15ade41f6f9e9]*/
{
    /* Only a non-empty string goes through the case-mapping driver; an
       empty one is handed to unicode_result_unchanged(). */
    if (PyUnicode_GET_LENGTH(self) != 0) {
        return case_operation(self, do_capitalize);
    }
    return unicode_result_unchanged(self);
}
10825
10826
/*[clinic input]
10827
str.casefold as unicode_casefold
10828
10829
Return a version of the string suitable for caseless comparisons.
10830
[clinic start generated code]*/
10831
10832
static PyObject *
unicode_casefold_impl(PyObject *self)
/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
{
    /* Non-ASCII strings need the general case-mapping driver. */
    if (!PyUnicode_IS_ASCII(self)) {
        return case_operation(self, do_casefold);
    }
    /* ASCII shortcut.  NOTE(review): the second argument (1) presumably
       selects lowercasing -- confirm against ascii_upper_or_lower(). */
    return ascii_upper_or_lower(self, 1);
}
10840
10841
10842
/* Argument converter. Accepts a single Unicode character. */
10843
10844
static int
10845
convert_uc(PyObject *obj, void *addr)
10846
0
{
10847
0
    Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
10848
10849
0
    if (!PyUnicode_Check(obj)) {
10850
0
        PyErr_Format(PyExc_TypeError,
10851
0
                     "The fill character must be a unicode character, "
10852
0
                     "not %.100s", Py_TYPE(obj)->tp_name);
10853
0
        return 0;
10854
0
    }
10855
0
    if (PyUnicode_GET_LENGTH(obj) != 1) {
10856
0
        PyErr_SetString(PyExc_TypeError,
10857
0
                        "The fill character must be exactly one character long");
10858
0
        return 0;
10859
0
    }
10860
0
    *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
10861
0
    return 1;
10862
0
}
10863
10864
/*[clinic input]
10865
str.center as unicode_center
10866
10867
    width: Py_ssize_t
10868
    fillchar: Py_UCS4 = ' '
10869
    /
10870
10871
Return a centered string of length width.
10872
10873
Padding is done using the specified fill character (default is a space).
10874
[clinic start generated code]*/
10875
10876
static PyObject *
10877
unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10878
/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
10879
0
{
10880
0
    Py_ssize_t marg, left;
10881
10882
0
    if (PyUnicode_GET_LENGTH(self) >= width)
10883
0
        return unicode_result_unchanged(self);
10884
10885
0
    marg = width - PyUnicode_GET_LENGTH(self);
10886
0
    left = marg / 2 + (marg & width & 1);
10887
10888
0
    return pad(self, left, marg - left, fillchar);
10889
0
}
10890
10891
/* This function assumes that str1 and str2 are readied by the caller. */
10892
10893
static int
10894
unicode_compare(PyObject *str1, PyObject *str2)
10895
19.6M
{
10896
19.6M
#define COMPARE(TYPE1, TYPE2) \
10897
19.6M
    do { \
10898
18.5M
        TYPE1* p1 = (TYPE1 *)data1; \
10899
18.5M
        TYPE2* p2 = (TYPE2 *)data2; \
10900
18.5M
        TYPE1* end = p1 + len; \
10901
18.5M
        Py_UCS4 c1, c2; \
10902
18.5M
        for (; p1 != end; p1++, p2++) { \
10903
18.5M
            c1 = *p1; \
10904
18.5M
            c2 = *p2; \
10905
18.5M
            if (c1 != c2) \
10906
18.5M
                return (c1 < c2) ? -1 : 1; \
10907
18.5M
        } \
10908
18.5M
    } \
10909
18.5M
    while (0)
10910
10911
19.6M
    int kind1, kind2;
10912
19.6M
    const void *data1, *data2;
10913
19.6M
    Py_ssize_t len1, len2, len;
10914
10915
19.6M
    kind1 = PyUnicode_KIND(str1);
10916
19.6M
    kind2 = PyUnicode_KIND(str2);
10917
19.6M
    data1 = PyUnicode_DATA(str1);
10918
19.6M
    data2 = PyUnicode_DATA(str2);
10919
19.6M
    len1 = PyUnicode_GET_LENGTH(str1);
10920
19.6M
    len2 = PyUnicode_GET_LENGTH(str2);
10921
19.6M
    len = Py_MIN(len1, len2);
10922
10923
19.6M
    switch(kind1) {
10924
1.63M
    case PyUnicode_1BYTE_KIND:
10925
1.63M
    {
10926
1.63M
        switch(kind2) {
10927
79.2k
        case PyUnicode_1BYTE_KIND:
10928
79.2k
        {
10929
79.2k
            int cmp = memcmp(data1, data2, len);
10930
            /* normalize result of memcmp() into the range [-1; 1] */
10931
79.2k
            if (cmp < 0)
10932
56.1k
                return -1;
10933
23.1k
            if (cmp > 0)
10934
22.4k
                return 1;
10935
694
            break;
10936
23.1k
        }
10937
1.31M
        case PyUnicode_2BYTE_KIND:
10938
1.31M
            COMPARE(Py_UCS1, Py_UCS2);
10939
0
            break;
10940
235k
        case PyUnicode_4BYTE_KIND:
10941
235k
            COMPARE(Py_UCS1, Py_UCS4);
10942
0
            break;
10943
0
        default:
10944
0
            Py_UNREACHABLE();
10945
1.63M
        }
10946
694
        break;
10947
1.63M
    }
10948
16.3M
    case PyUnicode_2BYTE_KIND:
10949
16.3M
    {
10950
16.3M
        switch(kind2) {
10951
4.72k
        case PyUnicode_1BYTE_KIND:
10952
4.72k
            COMPARE(Py_UCS2, Py_UCS1);
10953
0
            break;
10954
14.7M
        case PyUnicode_2BYTE_KIND:
10955
14.7M
        {
10956
14.7M
            COMPARE(Py_UCS2, Py_UCS2);
10957
0
            break;
10958
14.7M
        }
10959
1.58M
        case PyUnicode_4BYTE_KIND:
10960
1.58M
            COMPARE(Py_UCS2, Py_UCS4);
10961
0
            break;
10962
0
        default:
10963
0
            Py_UNREACHABLE();
10964
16.3M
        }
10965
0
        break;
10966
16.3M
    }
10967
1.69M
    case PyUnicode_4BYTE_KIND:
10968
1.69M
    {
10969
1.69M
        switch(kind2) {
10970
2.33k
        case PyUnicode_1BYTE_KIND:
10971
2.33k
            COMPARE(Py_UCS4, Py_UCS1);
10972
0
            break;
10973
673k
        case PyUnicode_2BYTE_KIND:
10974
673k
            COMPARE(Py_UCS4, Py_UCS2);
10975
0
            break;
10976
1.01M
        case PyUnicode_4BYTE_KIND:
10977
1.01M
        {
10978
1.01M
#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10979
1.01M
            int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10980
            /* normalize result of wmemcmp() into the range [-1; 1] */
10981
1.01M
            if (cmp < 0)
10982
499k
                return -1;
10983
514k
            if (cmp > 0)
10984
514k
                return 1;
10985
#else
10986
            COMPARE(Py_UCS4, Py_UCS4);
10987
#endif
10988
0
            break;
10989
514k
        }
10990
0
        default:
10991
0
            Py_UNREACHABLE();
10992
1.69M
        }
10993
0
        break;
10994
1.69M
    }
10995
0
    default:
10996
0
        Py_UNREACHABLE();
10997
19.6M
    }
10998
10999
694
    if (len1 == len2)
11000
683
        return 0;
11001
11
    if (len1 < len2)
11002
11
        return -1;
11003
0
    else
11004
0
        return 1;
11005
11006
11
#undef COMPARE
11007
11
}
11008
11009
11010
int
11011
_PyUnicode_Equal(PyObject *str1, PyObject *str2)
11012
289M
{
11013
289M
    assert(PyUnicode_Check(str1));
11014
289M
    assert(PyUnicode_Check(str2));
11015
289M
    if (str1 == str2) {
11016
79.9M
        return 1;
11017
79.9M
    }
11018
209M
    return unicode_eq(str1, str2);
11019
289M
}
11020
11021
11022
int
11023
PyUnicode_Equal(PyObject *str1, PyObject *str2)
11024
0
{
11025
0
    if (!PyUnicode_Check(str1)) {
11026
0
        PyErr_Format(PyExc_TypeError,
11027
0
                     "first argument must be str, not %T", str1);
11028
0
        return -1;
11029
0
    }
11030
0
    if (!PyUnicode_Check(str2)) {
11031
0
        PyErr_Format(PyExc_TypeError,
11032
0
                     "second argument must be str, not %T", str2);
11033
0
        return -1;
11034
0
    }
11035
11036
0
    return _PyUnicode_Equal(str1, str2);
11037
0
}
11038
11039
11040
int
11041
PyUnicode_Compare(PyObject *left, PyObject *right)
11042
14.6k
{
11043
14.6k
    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11044
        /* a string is equal to itself */
11045
14.6k
        if (left == right)
11046
0
            return 0;
11047
11048
14.6k
        return unicode_compare(left, right);
11049
14.6k
    }
11050
0
    PyErr_Format(PyExc_TypeError,
11051
0
                 "Can't compare %.100s and %.100s",
11052
0
                 Py_TYPE(left)->tp_name,
11053
0
                 Py_TYPE(right)->tp_name);
11054
0
    return -1;
11055
14.6k
}
11056
11057
int
11058
PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11059
1.95M
{
11060
1.95M
    Py_ssize_t i;
11061
1.95M
    int kind;
11062
1.95M
    Py_UCS4 chr;
11063
11064
1.95M
    assert(_PyUnicode_CHECK(uni));
11065
1.95M
    kind = PyUnicode_KIND(uni);
11066
1.95M
    if (kind == PyUnicode_1BYTE_KIND) {
11067
1.95M
        const void *data = PyUnicode_1BYTE_DATA(uni);
11068
1.95M
        size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
11069
1.95M
        size_t len, len2 = strlen(str);
11070
1.95M
        int cmp;
11071
11072
1.95M
        len = Py_MIN(len1, len2);
11073
1.95M
        cmp = memcmp(data, str, len);
11074
1.95M
        if (cmp != 0) {
11075
1.43M
            if (cmp < 0)
11076
9.14k
                return -1;
11077
1.42M
            else
11078
1.42M
                return 1;
11079
1.43M
        }
11080
515k
        if (len1 > len2)
11081
100
            return 1; /* uni is longer */
11082
515k
        if (len1 < len2)
11083
782
            return -1; /* str is longer */
11084
515k
        return 0;
11085
515k
    }
11086
1.43k
    else {
11087
1.43k
        const void *data = PyUnicode_DATA(uni);
11088
        /* Compare Unicode string and source character set string */
11089
2.67k
        for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
11090
2.43k
            if (chr != (unsigned char)str[i])
11091
1.19k
                return (chr < (unsigned char)(str[i])) ? -1 : 1;
11092
        /* This check keeps Python strings that end in '\0' from comparing equal
11093
         to C strings identical up to that point. */
11094
242
        if (PyUnicode_GET_LENGTH(uni) != i || chr)
11095
242
            return 1; /* uni is longer */
11096
0
        if (str[i])
11097
0
            return -1; /* str is longer */
11098
0
        return 0;
11099
0
    }
11100
1.95M
}
11101
11102
int
11103
PyUnicode_EqualToUTF8(PyObject *unicode, const char *str)
11104
24
{
11105
24
    return PyUnicode_EqualToUTF8AndSize(unicode, str, strlen(str));
11106
24
}
11107
11108
int
11109
PyUnicode_EqualToUTF8AndSize(PyObject *unicode, const char *str, Py_ssize_t size)
11110
24
{
11111
24
    assert(_PyUnicode_CHECK(unicode));
11112
24
    assert(str);
11113
11114
24
    if (PyUnicode_IS_ASCII(unicode)) {
11115
24
        Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
11116
24
        return size == len &&
11117
0
            memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11118
24
    }
11119
0
    if (PyUnicode_UTF8(unicode) != NULL) {
11120
0
        Py_ssize_t len = PyUnicode_UTF8_LENGTH(unicode);
11121
0
        return size == len &&
11122
0
            memcmp(PyUnicode_UTF8(unicode), str, len) == 0;
11123
0
    }
11124
11125
0
    Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
11126
0
    if ((size_t)len >= (size_t)size || (size_t)len < (size_t)size / 4) {
11127
0
        return 0;
11128
0
    }
11129
0
    const unsigned char *s = (const unsigned char *)str;
11130
0
    const unsigned char *ends = s + (size_t)size;
11131
0
    int kind = PyUnicode_KIND(unicode);
11132
0
    const void *data = PyUnicode_DATA(unicode);
11133
    /* Compare Unicode string and UTF-8 string */
11134
0
    for (Py_ssize_t i = 0; i < len; i++) {
11135
0
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11136
0
        if (ch < 0x80) {
11137
0
            if (ends == s || s[0] != ch) {
11138
0
                return 0;
11139
0
            }
11140
0
            s += 1;
11141
0
        }
11142
0
        else if (ch < 0x800) {
11143
0
            if ((ends - s) < 2 ||
11144
0
                s[0] != (0xc0 | (ch >> 6)) ||
11145
0
                s[1] != (0x80 | (ch & 0x3f)))
11146
0
            {
11147
0
                return 0;
11148
0
            }
11149
0
            s += 2;
11150
0
        }
11151
0
        else if (ch < 0x10000) {
11152
0
            if (Py_UNICODE_IS_SURROGATE(ch) ||
11153
0
                (ends - s) < 3 ||
11154
0
                s[0] != (0xe0 | (ch >> 12)) ||
11155
0
                s[1] != (0x80 | ((ch >> 6) & 0x3f)) ||
11156
0
                s[2] != (0x80 | (ch & 0x3f)))
11157
0
            {
11158
0
                return 0;
11159
0
            }
11160
0
            s += 3;
11161
0
        }
11162
0
        else {
11163
0
            assert(ch <= MAX_UNICODE);
11164
0
            if ((ends - s) < 4 ||
11165
0
                s[0] != (0xf0 | (ch >> 18)) ||
11166
0
                s[1] != (0x80 | ((ch >> 12) & 0x3f)) ||
11167
0
                s[2] != (0x80 | ((ch >> 6) & 0x3f)) ||
11168
0
                s[3] != (0x80 | (ch & 0x3f)))
11169
0
            {
11170
0
                return 0;
11171
0
            }
11172
0
            s += 4;
11173
0
        }
11174
0
    }
11175
0
    return s == ends;
11176
0
}
11177
11178
int
11179
_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11180
7.27M
{
11181
7.27M
    size_t len;
11182
7.27M
    assert(_PyUnicode_CHECK(unicode));
11183
7.27M
    assert(str);
11184
#ifndef NDEBUG
11185
    for (const char *p = str; *p; p++) {
11186
        assert((unsigned char)*p < 128);
11187
    }
11188
#endif
11189
7.27M
    if (!PyUnicode_IS_ASCII(unicode))
11190
151k
        return 0;
11191
7.12M
    len = (size_t)PyUnicode_GET_LENGTH(unicode);
11192
7.12M
    return strlen(str) == len &&
11193
515k
           memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11194
7.27M
}
11195
11196
int
11197
_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11198
0
{
11199
0
    PyObject *right_uni;
11200
11201
0
    assert(_PyUnicode_CHECK(left));
11202
0
    assert(right->string);
11203
#ifndef NDEBUG
11204
    for (const char *p = right->string; *p; p++) {
11205
        assert((unsigned char)*p < 128);
11206
    }
11207
#endif
11208
11209
0
    if (!PyUnicode_IS_ASCII(left))
11210
0
        return 0;
11211
11212
0
    right_uni = _PyUnicode_FromId(right);       /* borrowed */
11213
0
    if (right_uni == NULL) {
11214
        /* memory error or bad data */
11215
0
        PyErr_Clear();
11216
0
        return _PyUnicode_EqualToASCIIString(left, right->string);
11217
0
    }
11218
11219
0
    if (left == right_uni)
11220
0
        return 1;
11221
11222
0
    assert(PyUnicode_CHECK_INTERNED(right_uni));
11223
0
    if (PyUnicode_CHECK_INTERNED(left)) {
11224
0
        return 0;
11225
0
    }
11226
11227
0
    Py_hash_t right_hash = PyUnicode_HASH(right_uni);
11228
0
    assert(right_hash != -1);
11229
0
    Py_hash_t hash = PyUnicode_HASH(left);
11230
0
    if (hash != -1 && hash != right_hash) {
11231
0
        return 0;
11232
0
    }
11233
11234
0
    return unicode_eq(left, right_uni);
11235
0
}
11236
11237
/* Rich comparison of two objects as strings.

   Returns NotImplemented if either operand is not a str.  Otherwise
   returns True or False for the comparison selected by 'op'.  When both
   operands are the same object and 'op' is not one of the six comparison
   codes, an exception is set via PyErr_BadArgument() and NULL is returned. */
PyObject *
PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
{
    if (!PyUnicode_Check(left) || !PyUnicode_Check(right)) {
        Py_RETURN_NOTIMPLEMENTED;
    }

    if (left == right) {
        /* a string is equal to itself */
        if (op == Py_EQ || op == Py_LE || op == Py_GE) {
            Py_RETURN_TRUE;
        }
        if (op == Py_NE || op == Py_LT || op == Py_GT) {
            Py_RETURN_FALSE;
        }
        PyErr_BadArgument();
        return NULL;
    }

    if (op == Py_EQ || op == Py_NE) {
        /* Equality only: flip the outcome of unicode_eq() for Py_NE. */
        const int negate = (op == Py_NE);
        int outcome = unicode_eq(left, right);
        outcome ^= negate;
        return PyBool_FromLong(outcome);
    }

    /* Ordering comparison: compare the result of unicode_compare() to 0. */
    int order = unicode_compare(left, right);
    Py_RETURN_RICHCOMPARE(order, 0, op);
}
11271
11272
int
11273
PyUnicode_Contains(PyObject *str, PyObject *substr)
11274
83.7M
{
11275
83.7M
    int kind1, kind2;
11276
83.7M
    const void *buf1, *buf2;
11277
83.7M
    Py_ssize_t len1, len2;
11278
83.7M
    int result;
11279
11280
83.7M
    if (!PyUnicode_Check(substr)) {
11281
0
        PyErr_Format(PyExc_TypeError,
11282
0
                     "'in <string>' requires string as left operand, not %.100s",
11283
0
                     Py_TYPE(substr)->tp_name);
11284
0
        return -1;
11285
0
    }
11286
83.7M
    if (ensure_unicode(str) < 0)
11287
0
        return -1;
11288
11289
83.7M
    kind1 = PyUnicode_KIND(str);
11290
83.7M
    kind2 = PyUnicode_KIND(substr);
11291
83.7M
    if (kind1 < kind2)
11292
4.01M
        return 0;
11293
79.7M
    len1 = PyUnicode_GET_LENGTH(str);
11294
79.7M
    len2 = PyUnicode_GET_LENGTH(substr);
11295
79.7M
    if (len1 < len2)
11296
22.0k
        return 0;
11297
79.7M
    buf1 = PyUnicode_DATA(str);
11298
79.7M
    buf2 = PyUnicode_DATA(substr);
11299
79.7M
    if (len2 == 1) {
11300
79.6M
        Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11301
79.6M
        result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
11302
79.6M
        return result;
11303
79.6M
    }
11304
48.3k
    if (kind2 != kind1) {
11305
17.0k
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
11306
17.0k
        if (!buf2)
11307
0
            return -1;
11308
17.0k
    }
11309
11310
48.3k
    switch (kind1) {
11311
31.3k
    case PyUnicode_1BYTE_KIND:
11312
31.3k
        result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11313
31.3k
        break;
11314
13.1k
    case PyUnicode_2BYTE_KIND:
11315
13.1k
        result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11316
13.1k
        break;
11317
3.90k
    case PyUnicode_4BYTE_KIND:
11318
3.90k
        result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11319
3.90k
        break;
11320
0
    default:
11321
0
        Py_UNREACHABLE();
11322
48.3k
    }
11323
11324
48.3k
    assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
11325
48.3k
    if (kind2 != kind1)
11326
17.0k
        PyMem_Free((void *)buf2);
11327
11328
48.3k
    return result;
11329
48.3k
}
11330
11331
/* Concat to string or Unicode object giving a new Unicode object. */
11332
11333
PyObject *
11334
PyUnicode_Concat(PyObject *left, PyObject *right)
11335
40.7M
{
11336
40.7M
    PyObject *result;
11337
40.7M
    Py_UCS4 maxchar, maxchar2;
11338
40.7M
    Py_ssize_t left_len, right_len, new_len;
11339
11340
40.7M
    if (ensure_unicode(left) < 0)
11341
0
        return NULL;
11342
11343
40.7M
    if (!PyUnicode_Check(right)) {
11344
0
        PyErr_Format(PyExc_TypeError,
11345
0
            "can only concatenate str (not \"%.200s\") to str",
11346
0
            Py_TYPE(right)->tp_name);
11347
0
        return NULL;
11348
0
    }
11349
11350
    /* Shortcuts */
11351
40.7M
    PyObject *empty = _PyUnicode_GetEmpty();  // Borrowed reference
11352
40.7M
    if (left == empty) {
11353
71.8k
        return PyUnicode_FromObject(right);
11354
71.8k
    }
11355
40.7M
    if (right == empty) {
11356
4.11M
        return PyUnicode_FromObject(left);
11357
4.11M
    }
11358
11359
36.6M
    left_len = PyUnicode_GET_LENGTH(left);
11360
36.6M
    right_len = PyUnicode_GET_LENGTH(right);
11361
36.6M
    if (left_len > PY_SSIZE_T_MAX - right_len) {
11362
0
        PyErr_SetString(PyExc_OverflowError,
11363
0
                        "strings are too large to concat");
11364
0
        return NULL;
11365
0
    }
11366
36.6M
    new_len = left_len + right_len;
11367
11368
36.6M
    maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11369
36.6M
    maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11370
36.6M
    maxchar = Py_MAX(maxchar, maxchar2);
11371
11372
    /* Concat the two Unicode strings */
11373
36.6M
    result = PyUnicode_New(new_len, maxchar);
11374
36.6M
    if (result == NULL)
11375
0
        return NULL;
11376
36.6M
    _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11377
36.6M
    _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11378
36.6M
    assert(_PyUnicode_CheckConsistency(result, 1));
11379
36.6M
    return result;
11380
36.6M
}
11381
11382
void
11383
PyUnicode_Append(PyObject **p_left, PyObject *right)
11384
1.23M
{
11385
1.23M
    PyObject *left, *res;
11386
1.23M
    Py_UCS4 maxchar, maxchar2;
11387
1.23M
    Py_ssize_t left_len, right_len, new_len;
11388
11389
1.23M
    if (p_left == NULL) {
11390
0
        if (!PyErr_Occurred())
11391
0
            PyErr_BadInternalCall();
11392
0
        return;
11393
0
    }
11394
1.23M
    left = *p_left;
11395
1.23M
    if (right == NULL || left == NULL
11396
1.23M
        || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
11397
0
        if (!PyErr_Occurred())
11398
0
            PyErr_BadInternalCall();
11399
0
        goto error;
11400
0
    }
11401
11402
    /* Shortcuts */
11403
1.23M
    PyObject *empty = _PyUnicode_GetEmpty();  // Borrowed reference
11404
1.23M
    if (left == empty) {
11405
423k
        Py_DECREF(left);
11406
423k
        *p_left = Py_NewRef(right);
11407
423k
        return;
11408
423k
    }
11409
807k
    if (right == empty) {
11410
0
        return;
11411
0
    }
11412
11413
807k
    left_len = PyUnicode_GET_LENGTH(left);
11414
807k
    right_len = PyUnicode_GET_LENGTH(right);
11415
807k
    if (left_len > PY_SSIZE_T_MAX - right_len) {
11416
0
        PyErr_SetString(PyExc_OverflowError,
11417
0
                        "strings are too large to concat");
11418
0
        goto error;
11419
0
    }
11420
807k
    new_len = left_len + right_len;
11421
11422
807k
    if (_PyUnicode_IsModifiable(left)
11423
807k
        && PyUnicode_CheckExact(right)
11424
807k
        && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
11425
        /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11426
           to change the structure size, but characters are stored just after
11427
           the structure, and so it requires to move all characters which is
11428
           not so different than duplicating the string. */
11429
763k
        && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11430
763k
    {
11431
        /* append inplace */
11432
763k
        if (unicode_resize(p_left, new_len) != 0)
11433
0
            goto error;
11434
11435
        /* copy 'right' into the newly allocated area of 'left' */
11436
763k
        _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
11437
763k
    }
11438
44.1k
    else {
11439
44.1k
        maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11440
44.1k
        maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11441
44.1k
        maxchar = Py_MAX(maxchar, maxchar2);
11442
11443
        /* Concat the two Unicode strings */
11444
44.1k
        res = PyUnicode_New(new_len, maxchar);
11445
44.1k
        if (res == NULL)
11446
0
            goto error;
11447
44.1k
        _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11448
44.1k
        _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
11449
44.1k
        Py_DECREF(left);
11450
44.1k
        *p_left = res;
11451
44.1k
    }
11452
807k
    assert(_PyUnicode_CheckConsistency(*p_left, 1));
11453
807k
    return;
11454
11455
0
error:
11456
0
    Py_CLEAR(*p_left);
11457
0
}
11458
11459
void
11460
PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11461
0
{
11462
0
    PyUnicode_Append(pleft, right);
11463
0
    Py_XDECREF(right);
11464
0
}
11465
11466
/*[clinic input]
11467
@permit_long_summary
11468
@text_signature "($self, sub[, start[, end]], /)"
11469
str.count as unicode_count -> Py_ssize_t
11470
11471
    self as str: self
11472
    sub as substr: unicode
11473
    start: slice_index(accept={int, NoneType}, c_default='0') = None
11474
    end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
11475
    /
11476
11477
Return the number of non-overlapping occurrences of substring sub in string S[start:end].
11478
11479
Optional arguments start and end are interpreted as in slice notation.
11480
[clinic start generated code]*/
11481
11482
static Py_ssize_t
11483
unicode_count_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
11484
                   Py_ssize_t end)
11485
/*[clinic end generated code: output=8fcc3aef0b18edbf input=8590716ee228b935]*/
11486
31.2M
{
11487
31.2M
    assert(PyUnicode_Check(str));
11488
31.2M
    assert(PyUnicode_Check(substr));
11489
11490
31.2M
    Py_ssize_t result;
11491
31.2M
    int kind1, kind2;
11492
31.2M
    const void *buf1 = NULL, *buf2 = NULL;
11493
31.2M
    Py_ssize_t len1, len2;
11494
11495
31.2M
    kind1 = PyUnicode_KIND(str);
11496
31.2M
    kind2 = PyUnicode_KIND(substr);
11497
31.2M
    if (kind1 < kind2)
11498
0
        return 0;
11499
11500
31.2M
    len1 = PyUnicode_GET_LENGTH(str);
11501
31.2M
    len2 = PyUnicode_GET_LENGTH(substr);
11502
31.2M
    ADJUST_INDICES(start, end, len1);
11503
31.2M
    if (end - start < len2)
11504
6.86M
        return 0;
11505
11506
24.4M
    buf1 = PyUnicode_DATA(str);
11507
24.4M
    buf2 = PyUnicode_DATA(substr);
11508
24.4M
    if (kind2 != kind1) {
11509
6.48M
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
11510
6.48M
        if (!buf2)
11511
0
            goto onError;
11512
6.48M
    }
11513
11514
    // We don't reuse `anylib_count` here because of the explicit casts.
11515
24.4M
    switch (kind1) {
11516
17.9M
    case PyUnicode_1BYTE_KIND:
11517
17.9M
        result = ucs1lib_count(
11518
17.9M
            ((const Py_UCS1*)buf1) + start, end - start,
11519
17.9M
            buf2, len2, PY_SSIZE_T_MAX
11520
17.9M
            );
11521
17.9M
        break;
11522
5.36M
    case PyUnicode_2BYTE_KIND:
11523
5.36M
        result = ucs2lib_count(
11524
5.36M
            ((const Py_UCS2*)buf1) + start, end - start,
11525
5.36M
            buf2, len2, PY_SSIZE_T_MAX
11526
5.36M
            );
11527
5.36M
        break;
11528
1.12M
    case PyUnicode_4BYTE_KIND:
11529
1.12M
        result = ucs4lib_count(
11530
1.12M
            ((const Py_UCS4*)buf1) + start, end - start,
11531
1.12M
            buf2, len2, PY_SSIZE_T_MAX
11532
1.12M
            );
11533
1.12M
        break;
11534
0
    default:
11535
0
        Py_UNREACHABLE();
11536
24.4M
    }
11537
11538
24.4M
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
11539
24.4M
    if (kind2 != kind1)
11540
6.48M
        PyMem_Free((void *)buf2);
11541
11542
24.4M
    return result;
11543
0
  onError:
11544
0
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
11545
0
    if (kind2 != kind1)
11546
0
        PyMem_Free((void *)buf2);
11547
0
    return -1;
11548
24.4M
}
11549
11550
/*[clinic input]
11551
str.encode as unicode_encode
11552
11553
    encoding: str(c_default="NULL") = 'utf-8'
11554
        The encoding in which to encode the string.
11555
    errors: str(c_default="NULL") = 'strict'
11556
        The error handling scheme to use for encoding errors.
11557
        The default is 'strict' meaning that encoding errors raise a
11558
        UnicodeEncodeError.  Other possible values are 'ignore', 'replace' and
11559
        'xmlcharrefreplace' as well as any other name registered with
11560
        codecs.register_error that can handle UnicodeEncodeErrors.
11561
11562
Encode the string using the codec registered for encoding.
11563
[clinic start generated code]*/
11564
11565
static PyObject *
unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
{
    /* Thin wrapper: 'encoding' and 'errors' are forwarded unchanged
       (their C default is NULL) to PyUnicode_AsEncodedString(), whose
       result is returned as is. */
    PyObject *encoded = PyUnicode_AsEncodedString(self, encoding, errors);
    return encoded;
}
11571
11572
/*[clinic input]
11573
str.expandtabs as unicode_expandtabs
11574
11575
    tabsize: int = 8
11576
11577
Return a copy where all tab characters are expanded using spaces.
11578
11579
If tabsize is not given, a tab size of 8 characters is assumed.
11580
[clinic start generated code]*/
11581
11582
static PyObject *
unicode_expandtabs_impl(PyObject *self, int tabsize)
/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
{
    /* Two passes over the input: the first computes the length of the
       result (and detects overflow), the second fills a new string.
       If the input holds no tab at all, the result of
       unicode_result_unchanged(self) is returned instead.  With
       tabsize <= 0 tab characters are dropped. */
    PyObject *expanded;
    void *dest_data;
    Py_ssize_t out_len, out_pos, column, pad;
    int has_tab = 0;

    const Py_ssize_t src_len = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *src_data = PyUnicode_DATA(self);

    /* First pass: determine size of output string */
    out_len = 0;
    column = 0;
    for (Py_ssize_t i = 0; i < src_len; i++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, src_data, i);
        if (ch != '\t') {
            if (out_len > PY_SSIZE_T_MAX - 1) {
                goto overflow;
            }
            column++;
            out_len++;
            /* A line break resets the column used for tab stops. */
            if (ch == '\n' || ch == '\r') {
                column = 0;
            }
            continue;
        }

        has_tab = 1;
        if (tabsize > 0) {
            /* Distance to the next tab stop. */
            pad = tabsize - (column % tabsize); /* cannot overflow */
            if (out_len > PY_SSIZE_T_MAX - pad) {
                goto overflow;
            }
            column += pad;
            out_len += pad;
        }
    }
    if (!has_tab) {
        return unicode_result_unchanged(self);
    }

    /* Second pass: create output string and fill it */
    expanded = PyUnicode_New(out_len, PyUnicode_MAX_CHAR_VALUE(self));
    if (expanded == NULL) {
        return NULL;
    }
    dest_data = PyUnicode_DATA(expanded);

    out_pos = 0;
    column = 0;
    for (Py_ssize_t i = 0; i < src_len; i++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, src_data, i);
        if (ch != '\t') {
            column++;
            PyUnicode_WRITE(kind, dest_data, out_pos, ch);
            out_pos++;
            if (ch == '\n' || ch == '\r') {
                column = 0;
            }
            continue;
        }

        if (tabsize > 0) {
            /* Replace the tab with spaces up to the next tab stop. */
            pad = tabsize - (column % tabsize);
            column += pad;
            _PyUnicode_Fill(kind, dest_data, ' ', out_pos, pad);
            out_pos += pad;
        }
    }
    assert (out_pos == PyUnicode_GET_LENGTH(expanded));
    return unicode_result(expanded);

  overflow:
    PyErr_SetString(PyExc_OverflowError, "new string is too long");
    return NULL;
}
11657
11658
/*[clinic input]
11659
@permit_long_summary
11660
str.find as unicode_find = str.count
11661
11662
Return the lowest index in S where substring sub is found, such that sub is contained within S[start:end].
11663
11664
Optional arguments start and end are interpreted as in slice notation.
11665
Return -1 on failure.
11666
[clinic start generated code]*/
11667
11668
static Py_ssize_t
11669
unicode_find_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
11670
                  Py_ssize_t end)
11671
/*[clinic end generated code: output=51dbe6255712e278 input=3a9d650fe4c24695]*/
11672
30.2M
{
11673
30.2M
    Py_ssize_t result = any_find_slice(str, substr, start, end, 1);
11674
30.2M
    if (result < 0) {
11675
6.95M
        return -1;
11676
6.95M
    }
11677
23.2M
    return result;
11678
30.2M
}
11679
11680
static PyObject *
11681
unicode_getitem(PyObject *self, Py_ssize_t index)
11682
52.8M
{
11683
52.8M
    const void *data;
11684
52.8M
    int kind;
11685
52.8M
    Py_UCS4 ch;
11686
11687
52.8M
    if (!PyUnicode_Check(self)) {
11688
0
        PyErr_BadArgument();
11689
0
        return NULL;
11690
0
    }
11691
52.8M
    if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11692
384
        PyErr_SetString(PyExc_IndexError, "string index out of range");
11693
384
        return NULL;
11694
384
    }
11695
52.8M
    kind = PyUnicode_KIND(self);
11696
52.8M
    data = PyUnicode_DATA(self);
11697
52.8M
    ch = PyUnicode_READ(kind, data, index);
11698
52.8M
    return unicode_char(ch);
11699
52.8M
}
11700
11701
/* Believe it or not, this produces the same value for ASCII strings
11702
   as bytes_hash(). */
11703
static Py_hash_t
11704
unicode_hash(PyObject *self)
11705
45.5M
{
11706
45.5M
    Py_uhash_t x;  /* Unsigned for defined overflow behavior. */
11707
11708
#ifdef Py_DEBUG
11709
    assert(_Py_HashSecret_Initialized);
11710
#endif
11711
45.5M
    Py_hash_t hash = PyUnicode_HASH(self);
11712
45.5M
    if (hash != -1) {
11713
301k
        return hash;
11714
301k
    }
11715
45.2M
    x = Py_HashBuffer(PyUnicode_DATA(self),
11716
45.2M
                      PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
11717
11718
45.2M
    PyUnicode_SET_HASH(self, x);
11719
45.2M
    return x;
11720
45.5M
}
11721
11722
/*[clinic input]
11723
@permit_long_summary
11724
str.index as unicode_index = str.count
11725
11726
Return the lowest index in S where substring sub is found, such that sub is contained within S[start:end].
11727
11728
Optional arguments start and end are interpreted as in slice notation.
11729
Raises ValueError when the substring is not found.
11730
[clinic start generated code]*/
11731
11732
static Py_ssize_t
11733
unicode_index_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
11734
                   Py_ssize_t end)
11735
/*[clinic end generated code: output=77558288837cdf40 input=ae5e48f69ed75b06]*/
11736
0
{
11737
0
    Py_ssize_t result = any_find_slice(str, substr, start, end, 1);
11738
0
    if (result == -1) {
11739
0
        PyErr_SetString(PyExc_ValueError, "substring not found");
11740
0
    }
11741
0
    else if (result < 0) {
11742
0
        return -1;
11743
0
    }
11744
0
    return result;
11745
0
}
11746
11747
/*[clinic input]
11748
str.isascii as unicode_isascii
11749
11750
Return True if all characters in the string are ASCII, False otherwise.
11751
11752
ASCII characters have code points in the range U+0000-U+007F.
11753
Empty string is ASCII too.
11754
[clinic start generated code]*/
11755
11756
static PyObject *
unicode_isascii_impl(PyObject *self)
/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
{
    /* No scan needed: the answer is read from the object via
       PyUnicode_IS_ASCII(). */
    const long is_ascii = PyUnicode_IS_ASCII(self);
    return PyBool_FromLong(is_ascii);
}
11762
11763
/*[clinic input]
11764
@permit_long_docstring_body
11765
str.islower as unicode_islower
11766
11767
Return True if the string is a lowercase string, False otherwise.
11768
11769
A string is lowercase if all cased characters in the string are lowercase and
11770
there is at least one cased character in the string.
11771
[clinic start generated code]*/
11772
11773
static PyObject *
unicode_islower_impl(PyObject *self)
/*[clinic end generated code: output=dbd41995bd005b81 input=c6fc0295241a1aaa]*/
{
    /* True iff no character is uppercase or titlecase and at least one
       character is lowercase. */
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (length == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
    }

    int seen_lower = 0;
    for (Py_ssize_t pos = 0; pos < length; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
            Py_RETURN_FALSE;
        }
        /* Once a lowercase character was seen, skip the lookup. */
        if (!seen_lower && Py_UNICODE_ISLOWER(ch)) {
            seen_lower = 1;
        }
    }
    return PyBool_FromLong(seen_lower);
}
11806
11807
/*[clinic input]
11808
@permit_long_docstring_body
11809
str.isupper as unicode_isupper
11810
11811
Return True if the string is an uppercase string, False otherwise.
11812
11813
A string is uppercase if all cased characters in the string are uppercase and
11814
there is at least one cased character in the string.
11815
[clinic start generated code]*/
11816
11817
static PyObject *
unicode_isupper_impl(PyObject *self)
/*[clinic end generated code: output=049209c8e7f15f59 input=8d5cb33e67efde72]*/
{
    /* True iff no character is lowercase or titlecase and at least one
       character is uppercase. */
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (length == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
    }

    int seen_upper = 0;
    for (Py_ssize_t pos = 0; pos < length; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) {
            Py_RETURN_FALSE;
        }
        /* Once an uppercase character was seen, skip the lookup. */
        if (!seen_upper && Py_UNICODE_ISUPPER(ch)) {
            seen_upper = 1;
        }
    }
    return PyBool_FromLong(seen_upper);
}
11850
11851
/*[clinic input]
11852
str.istitle as unicode_istitle
11853
11854
Return True if the string is a title-cased string, False otherwise.
11855
11856
In a title-cased string, upper- and title-case characters may only
11857
follow uncased characters and lowercase characters only cased ones.
11858
[clinic start generated code]*/
11859
11860
static PyObject *
unicode_istitle_impl(PyObject *self)
/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
{
    /* Walks the string while remembering whether the previous character
       was cased: an upper/titlecase character must not follow a cased
       one, a lowercase character must follow a cased one.  The result
       is True only if at least one cased character was seen. */
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (length == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong((Py_UNICODE_ISTITLE(only) != 0) ||
                               (Py_UNICODE_ISUPPER(only) != 0));
    }

    int any_cased = 0;
    int after_cased = 0;
    for (Py_ssize_t pos = 0; pos < length; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
            /* A word may not continue with an upper/titlecase letter. */
            if (after_cased) {
                Py_RETURN_FALSE;
            }
        }
        else if (Py_UNICODE_ISLOWER(ch)) {
            /* A word may not start with a lowercase letter. */
            if (!after_cased) {
                Py_RETURN_FALSE;
            }
        }
        else {
            /* Uncased character: the next cased one starts a new word. */
            after_cased = 0;
            continue;
        }
        after_cased = 1;
        any_cased = 1;
    }
    return PyBool_FromLong(any_cased);
}
11906
11907
/*[clinic input]
11908
@permit_long_docstring_body
11909
str.isspace as unicode_isspace
11910
11911
Return True if the string is a whitespace string, False otherwise.
11912
11913
A string is whitespace if all characters in the string are whitespace and there
11914
is at least one character in the string.
11915
[clinic start generated code]*/
11916
11917
static PyObject *
unicode_isspace_impl(PyObject *self)
/*[clinic end generated code: output=163a63bfa08ac2b9 input=44fe05e248c6e159]*/
{
    /* True iff the string is non-empty and every character satisfies
       Py_UNICODE_ISSPACE(). */
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (length == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
    }

    for (Py_ssize_t pos = 0; pos < length; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISSPACE(ch)) {
            Py_RETURN_FALSE;
        }
    }
    Py_RETURN_TRUE;
}
11945
11946
/*[clinic input]
11947
@permit_long_docstring_body
11948
str.isalpha as unicode_isalpha
11949
11950
Return True if the string is an alphabetic string, False otherwise.
11951
11952
A string is alphabetic if all characters in the string are alphabetic and there
11953
is at least one character in the string.
11954
[clinic start generated code]*/
11955
11956
static PyObject *
unicode_isalpha_impl(PyObject *self)
/*[clinic end generated code: output=cc81b9ac3883ec4f input=c233000624a56e0d]*/
{
    /* True iff the string is non-empty and every character satisfies
       Py_UNICODE_ISALPHA(). */
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (length == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
    }

    for (Py_ssize_t pos = 0; pos < length; pos++) {
        if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, pos))) {
            Py_RETURN_FALSE;
        }
    }
    Py_RETURN_TRUE;
}
11983
11984
/*[clinic input]
11985
@permit_long_docstring_body
11986
str.isalnum as unicode_isalnum
11987
11988
Return True if the string is an alpha-numeric string, False otherwise.
11989
11990
A string is alpha-numeric if all characters in the string are alpha-numeric and
11991
there is at least one character in the string.
11992
[clinic start generated code]*/
11993
11994
static PyObject *
unicode_isalnum_impl(PyObject *self)
/*[clinic end generated code: output=a5a23490ffc3660c input=5d63ba9c9bafdb6b]*/
{
    /* True iff the string is non-empty and every character satisfies
       Py_UNICODE_ISALNUM(). */
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (length == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISALNUM(only));
    }

    for (Py_ssize_t pos = 0; pos < length; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISALNUM(ch)) {
            Py_RETURN_FALSE;
        }
    }
    Py_RETURN_TRUE;
}
12023
12024
/*[clinic input]
12025
@permit_long_docstring_body
12026
str.isdecimal as unicode_isdecimal
12027
12028
Return True if the string is a decimal string, False otherwise.
12029
12030
A string is a decimal string if all characters in the string are decimal and
12031
there is at least one character in the string.
12032
[clinic start generated code]*/
12033
12034
static PyObject *
unicode_isdecimal_impl(PyObject *self)
/*[clinic end generated code: output=fb2dcdb62d3fc548 input=8e84a58b414935a3]*/
{
    /* True iff the string is non-empty and every character satisfies
       Py_UNICODE_ISDECIMAL(). */
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (length == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
    }

    for (Py_ssize_t pos = 0; pos < length; pos++) {
        if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, pos))) {
            Py_RETURN_FALSE;
        }
    }
    Py_RETURN_TRUE;
}
12061
12062
/*[clinic input]
12063
@permit_long_docstring_body
12064
str.isdigit as unicode_isdigit
12065
12066
Return True if the string is a digit string, False otherwise.
12067
12068
A string is a digit string if all characters in the string are digits and there
12069
is at least one character in the string.
12070
[clinic start generated code]*/
12071
12072
static PyObject *
unicode_isdigit_impl(PyObject *self)
/*[clinic end generated code: output=10a6985311da6858 input=99e284affb54d4a0]*/
{
    /* Implementation of str.isdigit(): True when the string has at least
       one character and all of them pass Py_UNICODE_ISDIGIT. */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *const data = PyUnicode_DATA(self);

    switch (nchars) {
    case 0:
        /* Empty strings are not digit strings. */
        Py_RETURN_FALSE;
    case 1:
        /* One-character shortcut: no loop needed. */
        return PyBool_FromLong(
            Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, 0)));
    default:
        break;
    }

    Py_ssize_t pos = 0;
    while (pos < nchars) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISDIGIT(ch)) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
12100
12101
/*[clinic input]
12102
@permit_long_docstring_body
12103
str.isnumeric as unicode_isnumeric
12104
12105
Return True if the string is a numeric string, False otherwise.
12106
12107
A string is numeric if all characters in the string are numeric and there is at
12108
least one character in the string.
12109
[clinic start generated code]*/
12110
12111
static PyObject *
unicode_isnumeric_impl(PyObject *self)
/*[clinic end generated code: output=9172a32d9013051a input=e9f5b6b8b29b0ee6]*/
{
    /* Implementation of str.isnumeric(): True when the string is non-empty
       and each character passes Py_UNICODE_ISNUMERIC, else False. */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *const data = PyUnicode_DATA(self);

    /* Nothing to test: an empty string is not numeric. */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for a lone character. */
    if (nchars == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISNUMERIC(only));
    }

    /* Reject as soon as one character fails the predicate. */
    for (Py_ssize_t pos = 0; pos < nchars; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISNUMERIC(ch)) {
            Py_RETURN_FALSE;
        }
    }
    Py_RETURN_TRUE;
}
12138
12139
Py_ssize_t
12140
_PyUnicode_ScanIdentifier(PyObject *self)
12141
13.3k
{
12142
13.3k
    Py_ssize_t i;
12143
13.3k
    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12144
13.3k
    if (len == 0) {
12145
        /* an empty string is not a valid identifier */
12146
0
        return 0;
12147
0
    }
12148
12149
13.3k
    int kind = PyUnicode_KIND(self);
12150
13.3k
    const void *data = PyUnicode_DATA(self);
12151
13.3k
    Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12152
    /* PEP 3131 says that the first character must be in
12153
       XID_Start and subsequent characters in XID_Continue,
12154
       and for the ASCII range, the 2.x rules apply (i.e
12155
       start with letters and underscore, continue with
12156
       letters, digits, underscore). However, given the current
12157
       definition of XID_Start and XID_Continue, it is sufficient
12158
       to check just for these, except that _ must be allowed
12159
       as starting an identifier.  */
12160
13.3k
    if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12161
447
        return 0;
12162
447
    }
12163
12164
47.0k
    for (i = 1; i < len; i++) {
12165
34.4k
        ch = PyUnicode_READ(kind, data, i);
12166
34.4k
        if (!_PyUnicode_IsXidContinue(ch)) {
12167
242
            return i;
12168
242
        }
12169
34.4k
    }
12170
12.6k
    return i;
12171
12.8k
}
12172
12173
int
12174
PyUnicode_IsIdentifier(PyObject *self)
12175
1.21k
{
12176
1.21k
    Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
12177
1.21k
    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12178
    /* an empty string is not a valid identifier */
12179
1.21k
    return len && i == len;
12180
1.21k
}
12181
12182
/*[clinic input]
12183
@permit_long_docstring_body
12184
str.isidentifier as unicode_isidentifier
12185
12186
Return True if the string is a valid Python identifier, False otherwise.
12187
12188
Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
12189
such as "def" or "class".
12190
[clinic start generated code]*/
12191
12192
static PyObject *
unicode_isidentifier_impl(PyObject *self)
/*[clinic end generated code: output=fe585a9666572905 input=86315dd889d7bd04]*/
{
    /* str.isidentifier(): wrap the C-level predicate in a Python bool. */
    const int is_identifier = PyUnicode_IsIdentifier(self);
    return PyBool_FromLong(is_identifier);
}
12198
12199
/*[clinic input]
12200
@permit_long_summary
12201
str.isprintable as unicode_isprintable
12202
12203
Return True if all characters in the string are printable, False otherwise.
12204
12205
A character is printable if repr() may use it in its output.
12206
[clinic start generated code]*/
12207
12208
static PyObject *
unicode_isprintable_impl(PyObject *self)
/*[clinic end generated code: output=3ab9626cd32dd1a0 input=18345ba847084ec5]*/
{
    /* Implementation of str.isprintable(): True unless some character
       fails Py_UNICODE_ISPRINTABLE.  There is no empty-string special
       case, so an empty string falls through the loop and yields True. */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *const data = PyUnicode_DATA(self);

    /* Shortcut for single character strings */
    if (nchars == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISPRINTABLE(only));
    }

    Py_ssize_t pos = 0;
    while (pos < nchars) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISPRINTABLE(ch)) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
12232
12233
/*[clinic input]
12234
@permit_long_docstring_body
12235
str.join as unicode_join
12236
12237
    iterable: object
12238
    /
12239
12240
Concatenate any number of strings.
12241
12242
The string whose method is called is inserted in between each given string.
12243
The result is returned as a new string.
12244
12245
Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12246
[clinic start generated code]*/
12247
12248
static PyObject *
unicode_join(PyObject *self, PyObject *iterable)
/*[clinic end generated code: output=6857e7cecfe7bf98 input=bac724ed412ef3f8]*/
{
    /* str.join(): delegate to PyUnicode_Join with `self` as the separator;
       its result (or NULL) is returned as is. */
    PyObject *joined = PyUnicode_Join(self, iterable);
    return joined;
}
12254
12255
static Py_ssize_t
12256
unicode_length(PyObject *self)
12257
39.7M
{
12258
39.7M
    return PyUnicode_GET_LENGTH(self);
12259
39.7M
}
12260
12261
/*[clinic input]
12262
str.ljust as unicode_ljust
12263
12264
    width: Py_ssize_t
12265
    fillchar: Py_UCS4 = ' '
12266
    /
12267
12268
Return a left-justified string of length width.
12269
12270
Padding is done using the specified fill character (default is a space).
12271
[clinic start generated code]*/
12272
12273
static PyObject *
12274
unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12275
/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
12276
0
{
12277
0
    if (PyUnicode_GET_LENGTH(self) >= width)
12278
0
        return unicode_result_unchanged(self);
12279
12280
0
    return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
12281
0
}
12282
12283
/*[clinic input]
12284
str.lower as unicode_lower
12285
12286
Return a copy of the string converted to lowercase.
12287
[clinic start generated code]*/
12288
12289
static PyObject *
unicode_lower_impl(PyObject *self)
/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
{
    /* str.lower(): non-ASCII strings go through the generic case
       machinery; ASCII strings take the dedicated helper. */
    if (!PyUnicode_IS_ASCII(self)) {
        return case_operation(self, do_lower);
    }
    /* NOTE(review): the literal 1 presumably selects lower-casing in
       ascii_upper_or_lower() -- confirm against that helper. */
    return ascii_upper_or_lower(self, 1);
}
12297
12298
61.5M
/* Strip modes accepted by the strip helpers.  The values double as
   indices into stripfuncnames[] below, so the two must stay in sync. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above.  Both the strings and the pointers are const:
   the table is never modified after initialisation. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Method name for strip mode `i`, used when formatting error messages.
   `i` must be LEFTSTRIP, RIGHTSTRIP or BOTHSTRIP; no range check is done. */
#define STRIPNAME(i) (stripfuncnames[i])
12306
12307
/* externally visible for str.strip(unicode) */
12308
/* Strip from `self` the characters that occur in `sepobj`.

   striptype selects the side(s): LEFTSTRIP, RIGHTSTRIP or BOTHSTRIP.
   Both arguments are read with the PyUnicode_* accessors without any
   type check here.  The result is whatever PyUnicode_Substring()
   returns for the surviving range. */
PyObject *
_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
{
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);
    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
    const Py_ssize_t seplen = PyUnicode_GET_LENGTH(sepobj);

    /* Bloom filter over the separator characters: a miss proves the
       character is not in sepobj, a hit still needs the exact lookup. */
    BLOOM_MASK sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
                                         PyUnicode_DATA(sepobj),
                                         seplen);

    /* `first` is the index of the first character to keep. */
    Py_ssize_t first = 0;
    if (striptype != RIGHTSTRIP) {
        for (; first < len; first++) {
            const Py_UCS4 ch = PyUnicode_READ(kind, data, first);
            if (!BLOOM(sepmask, ch)) {
                break;
            }
            if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0) {
                break;
            }
        }
    }

    /* `stop` is one past the last character to keep; it never moves
       below `first`. */
    Py_ssize_t stop = len;
    if (striptype != LEFTSTRIP) {
        while (stop > first) {
            const Py_UCS4 ch = PyUnicode_READ(kind, data, stop - 1);
            if (!BLOOM(sepmask, ch)) {
                break;
            }
            if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0) {
                break;
            }
            stop--;
        }
    }

    return PyUnicode_Substring(self, first, stop);
}
12354
12355
PyObject*
12356
PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12357
239M
{
12358
239M
    const unsigned char *data;
12359
239M
    int kind;
12360
239M
    Py_ssize_t length;
12361
12362
239M
    length = PyUnicode_GET_LENGTH(self);
12363
239M
    end = Py_MIN(end, length);
12364
12365
239M
    if (start == 0 && end == length)
12366
54.0M
        return unicode_result_unchanged(self);
12367
12368
185M
    if (start < 0 || end < 0) {
12369
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
12370
0
        return NULL;
12371
0
    }
12372
185M
    if (start >= length || end < start)
12373
177k
        _Py_RETURN_UNICODE_EMPTY();
12374
12375
184M
    length = end - start;
12376
184M
    if (PyUnicode_IS_ASCII(self)) {
12377
45.9M
        data = PyUnicode_1BYTE_DATA(self);
12378
45.9M
        return _PyUnicode_FromASCII((const char*)(data + start), length);
12379
45.9M
    }
12380
138M
    else {
12381
138M
        kind = PyUnicode_KIND(self);
12382
138M
        data = PyUnicode_1BYTE_DATA(self);
12383
138M
        return PyUnicode_FromKindAndData(kind,
12384
138M
                                         data + kind * start,
12385
138M
                                         length);
12386
138M
    }
12387
184M
}
12388
12389
/* Strip whitespace from `self` on the side(s) selected by striptype
   (LEFTSTRIP, RIGHTSTRIP or BOTHSTRIP) and return the remaining range
   via PyUnicode_Substring().

   ASCII strings are classified with the _Py_ascii_whitespace table,
   all others with Py_UNICODE_ISSPACE. */
static PyObject *
do_strip(PyObject *self, int striptype)
{
    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
    const int trim_left = (striptype != RIGHTSTRIP);
    const int trim_right = (striptype != LEFTSTRIP);

    /* Kept range is [first, stop). */
    Py_ssize_t first = 0;
    Py_ssize_t stop = len;

    if (PyUnicode_IS_ASCII(self)) {
        const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);

        if (trim_left) {
            while (first < len && _Py_ascii_whitespace[data[first]]) {
                first++;
            }
        }
        if (trim_right) {
            while (stop > first && _Py_ascii_whitespace[data[stop - 1]]) {
                stop--;
            }
        }
    }
    else {
        const int kind = PyUnicode_KIND(self);
        const void *data = PyUnicode_DATA(self);

        if (trim_left) {
            while (first < len
                   && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, first)))
            {
                first++;
            }
        }
        if (trim_right) {
            while (stop > first
                   && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, stop - 1)))
            {
                stop--;
            }
        }
    }

    return PyUnicode_Substring(self, first, stop);
}
12450
12451
12452
/* Shared back end of str.strip/lstrip/rstrip.

   sep == None strips whitespace; a str `sep` strips the characters it
   contains; any other object sets TypeError (naming the method through
   STRIPNAME) and returns NULL. */
static PyObject *
do_argstrip(PyObject *self, int striptype, PyObject *sep)
{
    if (sep == Py_None) {
        return do_strip(self, striptype);
    }

    if (!PyUnicode_Check(sep)) {
        PyErr_Format(PyExc_TypeError,
                     "%s arg must be None or str",
                     STRIPNAME(striptype));
        return NULL;
    }

    return _PyUnicode_XStrip(self, striptype, sep);
}
12468
12469
12470
/*[clinic input]
12471
@permit_long_summary
12472
str.strip as unicode_strip
12473
12474
    chars: object = None
12475
    /
12476
12477
Return a copy of the string with leading and trailing whitespace removed.
12478
12479
If chars is given and not None, remove characters in chars instead.
12480
[clinic start generated code]*/
12481
12482
static PyObject *
unicode_strip_impl(PyObject *self, PyObject *chars)
/*[clinic end generated code: output=ca19018454345d57 input=8bc6353450345fbd]*/
{
    /* str.strip(): strip both ends via the shared helper. */
    PyObject *stripped = do_argstrip(self, BOTHSTRIP, chars);
    return stripped;
}
12488
12489
12490
/*[clinic input]
12491
str.lstrip as unicode_lstrip
12492
12493
    chars: object = None
12494
    /
12495
12496
Return a copy of the string with leading whitespace removed.
12497
12498
If chars is given and not None, remove characters in chars instead.
12499
[clinic start generated code]*/
12500
12501
static PyObject *
unicode_lstrip_impl(PyObject *self, PyObject *chars)
/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
{
    /* str.lstrip(): strip the leading end via the shared helper. */
    PyObject *stripped = do_argstrip(self, LEFTSTRIP, chars);
    return stripped;
}
12507
12508
12509
/*[clinic input]
12510
str.rstrip as unicode_rstrip
12511
12512
    chars: object = None
12513
    /
12514
12515
Return a copy of the string with trailing whitespace removed.
12516
12517
If chars is given and not None, remove characters in chars instead.
12518
[clinic start generated code]*/
12519
12520
static PyObject *
unicode_rstrip_impl(PyObject *self, PyObject *chars)
/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
{
    /* str.rstrip(): strip the trailing end via the shared helper. */
    PyObject *stripped = do_argstrip(self, RIGHTSTRIP, chars);
    return stripped;
}
12526
12527
12528
static PyObject*
12529
unicode_repeat(PyObject *str, Py_ssize_t len)
12530
384k
{
12531
384k
    PyObject *u;
12532
384k
    Py_ssize_t nchars, n;
12533
12534
384k
    if (len < 1)
12535
35.4k
        _Py_RETURN_UNICODE_EMPTY();
12536
12537
    /* no repeat, return original string */
12538
348k
    if (len == 1)
12539
110k
        return unicode_result_unchanged(str);
12540
12541
238k
    if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
12542
0
        PyErr_SetString(PyExc_OverflowError,
12543
0
                        "repeated string is too long");
12544
0
        return NULL;
12545
0
    }
12546
238k
    nchars = len * PyUnicode_GET_LENGTH(str);
12547
12548
238k
    u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
12549
238k
    if (!u)
12550
0
        return NULL;
12551
238k
    assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
12552
12553
238k
    if (PyUnicode_GET_LENGTH(str) == 1) {
12554
235k
        int kind = PyUnicode_KIND(str);
12555
235k
        Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
12556
235k
        if (kind == PyUnicode_1BYTE_KIND) {
12557
235k
            void *to = PyUnicode_DATA(u);
12558
235k
            memset(to, (unsigned char)fill_char, len);
12559
235k
        }
12560
0
        else if (kind == PyUnicode_2BYTE_KIND) {
12561
0
            Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
12562
0
            for (n = 0; n < len; ++n)
12563
0
                ucs2[n] = fill_char;
12564
0
        } else {
12565
0
            Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12566
0
            assert(kind == PyUnicode_4BYTE_KIND);
12567
0
            for (n = 0; n < len; ++n)
12568
0
                ucs4[n] = fill_char;
12569
0
        }
12570
235k
    }
12571
2.23k
    else {
12572
2.23k
        Py_ssize_t char_size = PyUnicode_KIND(str);
12573
2.23k
        char *to = (char *) PyUnicode_DATA(u);
12574
2.23k
        _PyBytes_Repeat(to, nchars * char_size, PyUnicode_DATA(str),
12575
2.23k
            PyUnicode_GET_LENGTH(str) * char_size);
12576
2.23k
    }
12577
12578
238k
    assert(_PyUnicode_CheckConsistency(u, 1));
12579
238k
    return u;
12580
238k
}
12581
12582
PyObject *
12583
PyUnicode_Replace(PyObject *str,
12584
                  PyObject *substr,
12585
                  PyObject *replstr,
12586
                  Py_ssize_t maxcount)
12587
3
{
12588
3
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12589
3
            ensure_unicode(replstr) < 0)
12590
0
        return NULL;
12591
3
    return replace(str, substr, replstr, maxcount);
12592
3
}
12593
12594
/*[clinic input]
12595
@permit_long_docstring_body
12596
str.replace as unicode_replace
12597
12598
    old: unicode
12599
    new: unicode
12600
    /
12601
    count: Py_ssize_t = -1
12602
        Maximum number of occurrences to replace.
12603
        -1 (the default value) means replace all occurrences.
12604
12605
Return a copy with all occurrences of substring old replaced by new.
12606
12607
If the optional argument count is given, only the first count occurrences are
12608
replaced.
12609
[clinic start generated code]*/
12610
12611
static PyObject *
12612
unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12613
                     Py_ssize_t count)
12614
/*[clinic end generated code: output=b63f1a8b5eebf448 input=f27ca92ac46b65a1]*/
12615
74.8M
{
12616
74.8M
    return replace(self, old, new, count);
12617
74.8M
}
12618
12619
/*[clinic input]
12620
@permit_long_docstring_body
12621
str.removeprefix as unicode_removeprefix
12622
12623
    prefix: unicode
12624
    /
12625
12626
Return a str with the given prefix string removed if present.
12627
12628
If the string starts with the prefix string, return string[len(prefix):].
12629
Otherwise, return a copy of the original string.
12630
[clinic start generated code]*/
12631
12632
static PyObject *
unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
/*[clinic end generated code: output=f1e5945e9763bcb9 input=1989a856dbb813f1]*/
{
    /* str.removeprefix(): tailmatch() is called with direction -1 over
       the whole string.  -1 is passed on as NULL; zero returns the
       string unchanged; any other value returns everything after the
       first len(prefix) characters. */
    const int match = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);
    if (match == -1) {
        return NULL;
    }
    if (!match) {
        return unicode_result_unchanged(self);
    }

    const Py_ssize_t skip = PyUnicode_GET_LENGTH(prefix);
    return PyUnicode_Substring(self, skip, PyUnicode_GET_LENGTH(self));
}
12646
12647
/*[clinic input]
12648
str.removesuffix as unicode_removesuffix
12649
12650
    suffix: unicode
12651
    /
12652
12653
Return a str with the given suffix string removed if present.
12654
12655
If the string ends with the suffix string and that suffix is not empty,
12656
return string[:-len(suffix)]. Otherwise, return a copy of the original
12657
string.
12658
[clinic start generated code]*/
12659
12660
static PyObject *
unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
/*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
{
    /* str.removesuffix(): tailmatch() is called with direction +1 over
       the whole string.  -1 is passed on as NULL; zero returns the
       string unchanged; any other value returns everything before the
       last len(suffix) characters. */
    const int match = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);
    if (match == -1) {
        return NULL;
    }
    if (!match) {
        return unicode_result_unchanged(self);
    }

    const Py_ssize_t keep = PyUnicode_GET_LENGTH(self)
                            - PyUnicode_GET_LENGTH(suffix);
    return PyUnicode_Substring(self, 0, keep);
}
12674
12675
/* Build the repr() of a str object: the text wrapped in quotes, with
   characters escaped where needed.

   Works in two passes.  The first pass measures the output length,
   counts single and double quotes and tracks the largest character that
   is copied through unescaped.  The second pass allocates the result and
   fills it.

   Returns a new string, or NULL with OverflowError set when the output
   length does not fit in Py_ssize_t, or NULL when PyUnicode_New() fails. */
static PyObject *
unicode_repr(PyObject *unicode)
{
    Py_ssize_t isize = PyUnicode_GET_LENGTH(unicode);
    const void *idata = PyUnicode_DATA(unicode);

    /* Compute length of output, quote characters, and
       maximum character */
    Py_ssize_t osize = 0;
    Py_UCS4 maxch = 127;
    Py_ssize_t squote = 0;
    Py_ssize_t dquote = 0;
    int ikind = PyUnicode_KIND(unicode);
    for (Py_ssize_t i = 0; i < isize; i++) {
        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
        Py_ssize_t incr = 1;
        switch (ch) {
        case '\'': squote++; break;
        case '"':  dquote++; break;
        case '\\': case '\t': case '\r': case '\n':
            incr = 2;
            break;
        default:
            /* Fast-path ASCII */
            if (ch < ' ' || ch == 0x7f)
                incr = 4; /* \xHH */
            else if (ch < 0x7f)
                ;
            else if (Py_UNICODE_ISPRINTABLE(ch))
                maxch = (ch > maxch) ? ch : maxch;
            else if (ch < 0x100)
                incr = 4; /* \xHH */
            else if (ch < 0x10000)
                incr = 6; /* \uHHHH */
            else
                incr = 10; /* \UHHHHHHHH */
        }
        if (osize > PY_SSIZE_T_MAX - incr) {
            PyErr_SetString(PyExc_OverflowError,
                            "string is too long to generate repr");
            return NULL;
        }
        osize += incr;
    }

    Py_UCS4 quote = '\'';
    /* If nothing needs escaping the body can be copied verbatim. */
    int changed = (osize != isize);
    if (squote) {
        changed = 1;
        if (dquote) {
            /* Both squote and dquote present. Use squote,
               and escape them */
            /* One extra backslash per single quote; guard the addition
               the same way the loop above guards its increments. */
            if (osize > PY_SSIZE_T_MAX - squote) {
                PyErr_SetString(PyExc_OverflowError,
                                "string is too long to generate repr");
                return NULL;
            }
            osize += squote;
        }
        else
            quote = '"';
    }
    /* Room for the opening and closing quote, again overflow-checked. */
    if (osize > PY_SSIZE_T_MAX - 2) {
        PyErr_SetString(PyExc_OverflowError,
                        "string is too long to generate repr");
        return NULL;
    }
    osize += 2;   /* quotes */

    PyObject *repr = PyUnicode_New(osize, maxch);
    if (repr == NULL)
        return NULL;
    int okind = PyUnicode_KIND(repr);
    void *odata = PyUnicode_DATA(repr);

    if (!changed) {
        /* Nothing to escape: quote, verbatim copy, quote. */
        PyUnicode_WRITE(okind, odata, 0, quote);

        _PyUnicode_FastCopyCharacters(repr, 1,
                                      unicode, 0,
                                      isize);

        PyUnicode_WRITE(okind, odata, osize-1, quote);
    }
    else {
        /* Escaping needed: dispatch on the output kind.
           NOTE(review): the *_repr helpers presumably write the quoted,
           escaped text into odata -- confirm against their definition. */
        switch (okind) {
        case PyUnicode_1BYTE_KIND:
            ucs1lib_repr(unicode, quote, odata);
            break;
        case PyUnicode_2BYTE_KIND:
            ucs2lib_repr(unicode, quote, odata);
            break;
        default:
            assert(okind == PyUnicode_4BYTE_KIND);
            ucs4lib_repr(unicode, quote, odata);
        }
    }

    assert(_PyUnicode_CheckConsistency(repr, 1));
    return repr;
}
12765
12766
/*[clinic input]
12767
@permit_long_summary
12768
str.rfind as unicode_rfind = str.count
12769
12770
Return the highest index in S where substring sub is found, such that sub is contained within S[start:end].
12771
12772
Optional arguments start and end are interpreted as in slice notation.
12773
Return -1 on failure.
12774
[clinic start generated code]*/
12775
12776
static Py_ssize_t
12777
unicode_rfind_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
12778
                   Py_ssize_t end)
12779
/*[clinic end generated code: output=880b29f01dd014c8 input=7f7e97d5cd3299a2]*/
12780
10.9k
{
12781
10.9k
    Py_ssize_t result = any_find_slice(str, substr, start, end, -1);
12782
10.9k
    if (result < 0) {
12783
7.14k
        return -1;
12784
7.14k
    }
12785
3.77k
    return result;
12786
10.9k
}
12787
12788
/*[clinic input]
12789
@permit_long_summary
12790
str.rindex as unicode_rindex = str.count
12791
12792
Return the highest index in S where substring sub is found, such that sub is contained within S[start:end].
12793
12794
Optional arguments start and end are interpreted as in slice notation.
12795
Raises ValueError when the substring is not found.
12796
[clinic start generated code]*/
12797
12798
static Py_ssize_t
12799
unicode_rindex_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
12800
                    Py_ssize_t end)
12801
/*[clinic end generated code: output=5f3aef124c867fe1 input=0363a324740b3e62]*/
12802
135k
{
12803
135k
    Py_ssize_t result = any_find_slice(str, substr, start, end, -1);
12804
135k
    if (result == -1) {
12805
0
        PyErr_SetString(PyExc_ValueError, "substring not found");
12806
0
    }
12807
135k
    else if (result < 0) {
12808
0
        return -1;
12809
0
    }
12810
135k
    return result;
12811
135k
}
12812
12813
/*[clinic input]
12814
str.rjust as unicode_rjust
12815
12816
    width: Py_ssize_t
12817
    fillchar: Py_UCS4 = ' '
12818
    /
12819
12820
Return a right-justified string of length width.
12821
12822
Padding is done using the specified fill character (default is a space).
12823
[clinic start generated code]*/
12824
12825
static PyObject *
12826
unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12827
/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
12828
0
{
12829
0
    if (PyUnicode_GET_LENGTH(self) >= width)
12830
0
        return unicode_result_unchanged(self);
12831
12832
0
    return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
12833
0
}
12834
12835
PyObject *
12836
PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12837
0
{
12838
0
    if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
12839
0
        return NULL;
12840
12841
0
    return split(s, sep, maxsplit);
12842
0
}
12843
12844
/*[clinic input]
12845
@permit_long_summary
12846
str.split as unicode_split
12847
12848
    sep: object = None
12849
        The separator used to split the string.
12850
12851
        When set to None (the default value), will split on any whitespace
12852
        character (including \n \r \t \f and spaces) and will discard
12853
        empty strings from the result.
12854
    maxsplit: Py_ssize_t = -1
12855
        Maximum number of splits.
12856
        -1 (the default value) means no limit.
12857
12858
Return a list of the substrings in the string, using sep as the separator string.
12859
12860
Splitting starts at the front of the string and works to the end.
12861
12862
Note, str.split() is mainly useful for data that has been intentionally
12863
delimited.  With natural text that includes punctuation, consider using
12864
the regular expression module.
12865
12866
[clinic start generated code]*/
12867
12868
static PyObject *
12869
unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12870
/*[clinic end generated code: output=3a65b1db356948dc input=2c1fd08a78e038b8]*/
12871
23.9M
{
12872
23.9M
    if (sep == Py_None)
12873
174k
        return split(self, NULL, maxsplit);
12874
23.7M
    if (PyUnicode_Check(sep))
12875
23.7M
        return split(self, sep, maxsplit);
12876
12877
0
    PyErr_Format(PyExc_TypeError,
12878
0
                 "must be str or None, not %.100s",
12879
0
                 Py_TYPE(sep)->tp_name);
12880
0
    return NULL;
12881
23.7M
}
12882
12883
/* Partition `str_obj` around `sep_obj` and return the result built by
   the kind-specific *_partition helper.

   Both arguments are checked with ensure_unicode(); NULL is returned if
   a check fails or if widening the separator buffer fails.  When the
   separator has a wider kind or is longer than the string, the tuple
   (str_obj, "", "") is returned without calling a helper. */
PyObject *
PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
{
    if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0) {
        return NULL;
    }

    const int kind1 = PyUnicode_KIND(str_obj);
    const int kind2 = PyUnicode_KIND(sep_obj);
    const Py_ssize_t len1 = PyUnicode_GET_LENGTH(str_obj);
    const Py_ssize_t len2 = PyUnicode_GET_LENGTH(sep_obj);

    if (kind1 < kind2 || len1 < len2) {
        PyObject *empty = _PyUnicode_GetEmpty();  // Borrowed reference
        return PyTuple_Pack(3, str_obj, empty, empty);
    }

    const void *buf1 = PyUnicode_DATA(str_obj);
    const void *buf2 = PyUnicode_DATA(sep_obj);

    /* The helpers take both buffers in the string's kind, so a narrower
       separator is converted into a temporary buffer first. */
    const int converted = (kind2 != kind1);
    if (converted) {
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
        if (buf2 == NULL) {
            return NULL;
        }
    }

    PyObject *out;
    switch (kind1) {
    case PyUnicode_1BYTE_KIND:
        out = (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
            ? asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2)
            : ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
        break;
    case PyUnicode_2BYTE_KIND:
        out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
        break;
    case PyUnicode_4BYTE_KIND:
        out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
        break;
    default:
        Py_UNREACHABLE();
    }

    assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
    /* Release the temporary separator buffer, if one was made. */
    if (converted) {
        PyMem_Free((void *)buf2);
    }

    return out;
}
12933
12934
12935
/* Partition `str_obj` around `sep_obj` using the kind-specific
   *_rpartition helper and return its result.

   Both arguments are checked with ensure_unicode(); NULL is returned if
   a check fails or if widening the separator buffer fails.  When the
   separator has a wider kind or is longer than the string, the tuple
   ("", "", str_obj) is returned without calling a helper. */
PyObject *
PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
{
    if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0) {
        return NULL;
    }

    const int kind1 = PyUnicode_KIND(str_obj);
    const int kind2 = PyUnicode_KIND(sep_obj);
    const Py_ssize_t len1 = PyUnicode_GET_LENGTH(str_obj);
    const Py_ssize_t len2 = PyUnicode_GET_LENGTH(sep_obj);

    if (kind1 < kind2 || len1 < len2) {
        PyObject *empty = _PyUnicode_GetEmpty();  // Borrowed reference
        return PyTuple_Pack(3, empty, empty, str_obj);
    }

    const void *buf1 = PyUnicode_DATA(str_obj);
    const void *buf2 = PyUnicode_DATA(sep_obj);

    /* The helpers take both buffers in the string's kind, so a narrower
       separator is converted into a temporary buffer first. */
    const int converted = (kind2 != kind1);
    if (converted) {
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
        if (buf2 == NULL) {
            return NULL;
        }
    }

    PyObject *out;
    switch (kind1) {
    case PyUnicode_1BYTE_KIND:
        out = (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
            ? asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2)
            : ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
        break;
    case PyUnicode_2BYTE_KIND:
        out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
        break;
    case PyUnicode_4BYTE_KIND:
        out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
        break;
    default:
        Py_UNREACHABLE();
    }

    assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
    /* Release the temporary separator buffer, if one was made. */
    if (converted) {
        PyMem_Free((void *)buf2);
    }

    return out;
}
12985
12986
/*[clinic input]
12987
@permit_long_docstring_body
12988
str.partition as unicode_partition
12989
12990
    sep: object
12991
    /
12992
12993
Partition the string into three parts using the given separator.
12994
12995
This will search for the separator in the string.  If the separator is found,
12996
returns a 3-tuple containing the part before the separator, the separator
12997
itself, and the part after it.
12998
12999
If the separator is not found, returns a 3-tuple containing the original string
13000
and two empty strings.
13001
[clinic start generated code]*/
13002
13003
static PyObject *
unicode_partition(PyObject *self, PyObject *sep)
/*[clinic end generated code: output=e4ced7bd253ca3c4 input=4d854b520d7b0e97]*/
{
    /* str.partition(sep): hand both arguments unchanged to
       PyUnicode_Partition() and return whatever it produces
       (NULL is propagated as-is). */
    PyObject *parts = PyUnicode_Partition(self, sep);
    return parts;
}
13009
13010
/*[clinic input]
13011
@permit_long_docstring_body
13012
str.rpartition as unicode_rpartition = str.partition
13013
13014
Partition the string into three parts using the given separator.
13015
13016
This will search for the separator in the string, starting at the end. If
13017
the separator is found, returns a 3-tuple containing the part before the
13018
separator, the separator itself, and the part after it.
13019
13020
If the separator is not found, returns a 3-tuple containing two empty strings
13021
and the original string.
13022
[clinic start generated code]*/
13023
13024
static PyObject *
unicode_rpartition(PyObject *self, PyObject *sep)
/*[clinic end generated code: output=1aa13cf1156572aa input=a6adabe91e75b486]*/
{
    /* str.rpartition(sep): hand both arguments unchanged to
       PyUnicode_RPartition() and return whatever it produces
       (NULL is propagated as-is). */
    PyObject *parts = PyUnicode_RPartition(self, sep);
    return parts;
}
13030
13031
PyObject *
13032
PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
13033
0
{
13034
0
    if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
13035
0
        return NULL;
13036
13037
0
    return rsplit(s, sep, maxsplit);
13038
0
}
13039
13040
/*[clinic input]
13041
@permit_long_summary
13042
str.rsplit as unicode_rsplit = str.split
13043
13044
Return a list of the substrings in the string, using sep as the separator string.
13045
13046
Splitting starts at the end of the string and works to the front.
13047
[clinic start generated code]*/
13048
13049
static PyObject *
13050
unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13051
/*[clinic end generated code: output=c2b815c63bcabffc input=0f762e30d267fa83]*/
13052
50
{
13053
50
    if (sep == Py_None)
13054
0
        return rsplit(self, NULL, maxsplit);
13055
50
    if (PyUnicode_Check(sep))
13056
50
        return rsplit(self, sep, maxsplit);
13057
13058
0
    PyErr_Format(PyExc_TypeError,
13059
0
                 "must be str or None, not %.100s",
13060
0
                 Py_TYPE(sep)->tp_name);
13061
0
    return NULL;
13062
50
}
13063
13064
/*[clinic input]
13065
@permit_long_docstring_body
13066
str.splitlines as unicode_splitlines
13067
13068
    keepends: bool = False
13069
13070
Return a list of the lines in the string, breaking at line boundaries.
13071
13072
Line breaks are not included in the resulting list unless keepends is given and
13073
true.
13074
[clinic start generated code]*/
13075
13076
static PyObject *
unicode_splitlines_impl(PyObject *self, int keepends)
/*[clinic end generated code: output=f664dcdad153ec40 input=39eeafbfef61c827]*/
{
    /* str.splitlines(keepends): forwards to PyUnicode_Splitlines() and
       returns its result (NULL is propagated as-is). */
    PyObject *lines = PyUnicode_Splitlines(self, keepends);
    return lines;
}
13082
13083
static
13084
PyObject *unicode_str(PyObject *self)
13085
3.10M
{
13086
3.10M
    return unicode_result_unchanged(self);
13087
3.10M
}
13088
13089
/*[clinic input]
13090
@permit_long_summary
13091
str.swapcase as unicode_swapcase
13092
13093
Convert uppercase characters to lowercase and lowercase characters to uppercase.
13094
[clinic start generated code]*/
13095
13096
static PyObject *
unicode_swapcase_impl(PyObject *self)
/*[clinic end generated code: output=5d28966bf6d7b2af input=85bc39a9b4e8ee91]*/
{
    /* str.swapcase(): run the generic case_operation() driver with the
       do_swapcase callback and return its result. */
    PyObject *swapped = case_operation(self, do_swapcase);
    return swapped;
}
13102
13103
/*[clinic input]
13104
13105
@staticmethod
13106
str.maketrans as unicode_maketrans
13107
13108
  x: object
13109
13110
  y: unicode=NULL
13111
13112
  z: unicode=NULL
13113
13114
  /
13115
13116
Return a translation table usable for str.translate().
13117
13118
If there is only one argument, it must be a dictionary mapping Unicode
13119
ordinals (integers) or characters to Unicode ordinals, strings or None.
13120
Character keys will be then converted to ordinals.
13121
If there are two arguments, they must be strings of equal length, and
13122
in the resulting dictionary, each character in x will be mapped to the
13123
character at the same position in y. If there is a third argument, it
13124
must be a string, whose characters will be mapped to None in the result.
13125
[clinic start generated code]*/
13126
13127
static PyObject *
unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
{
    /* Builds and returns a new dict mapping integer ordinals to values.
       Two call shapes are handled:
         - y == NULL: x must be an exact dict whose entries are copied,
           with 1-character str keys replaced by their ordinal;
         - y != NULL: x and y are equal-length strings mapped position by
           position, and every character of the optional z maps to None.
       On any failure the partially built dict is released and NULL is
       returned with an exception set. */
    PyObject *table, *key, *value;
    Py_ssize_t pos, length;
    int x_kind, y_kind, z_kind;
    const void *x_data, *y_data, *z_data;
    int status;

    table = PyDict_New();
    if (table == NULL) {
        return NULL;
    }

    if (y == NULL) {
        /* Single-argument form: only an exact dict is accepted. */
        if (!PyDict_CheckExact(x)) {
            PyErr_SetString(PyExc_TypeError, "if you give only one argument "
                            "to maketrans it must be a dict");
            goto error;
        }

        /* key and value are the references handed out by PyDict_Next();
           they are never released here. */
        pos = 0;
        while (PyDict_Next(x, &pos, &key, &value)) {
            if (PyUnicode_Check(key)) {
                /* A str key must be a single character; it is stored under
                   that character's ordinal. */
                PyObject *ordinal;

                if (PyUnicode_GET_LENGTH(key) != 1) {
                    PyErr_SetString(PyExc_ValueError, "string keys in translate "
                                    "table must be of length 1");
                    goto error;
                }
                ordinal = PyLong_FromLong(
                    PyUnicode_READ(PyUnicode_KIND(key),
                                   PyUnicode_DATA(key), 0));
                if (ordinal == NULL) {
                    goto error;
                }
                status = PyDict_SetItem(table, ordinal, value);
                Py_DECREF(ordinal);
                if (status < 0) {
                    goto error;
                }
            }
            else if (PyLong_Check(key)) {
                /* Integer keys are copied over unchanged. */
                if (PyDict_SetItem(table, key, value) < 0) {
                    goto error;
                }
            }
            else {
                PyErr_SetString(PyExc_TypeError, "keys in translate table must "
                                "be strings or integers");
                goto error;
            }
        }
        return table;
    }

    /* Multi-argument form: x has to be a string as long as y. */
    if (!PyUnicode_Check(x)) {
        PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
                        "be a string if there is a second argument");
        goto error;
    }
    length = PyUnicode_GET_LENGTH(x);
    if (length != PyUnicode_GET_LENGTH(y)) {
        PyErr_SetString(PyExc_ValueError, "the first two maketrans "
                        "arguments must have equal length");
        goto error;
    }

    /* Map the ordinal of x[pos] to the ordinal of y[pos]. */
    x_kind = PyUnicode_KIND(x);
    x_data = PyUnicode_DATA(x);
    y_kind = PyUnicode_KIND(y);
    y_data = PyUnicode_DATA(y);
    for (pos = 0; pos < length; pos++) {
        key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, pos));
        if (key == NULL) {
            goto error;
        }
        value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, pos));
        if (value == NULL) {
            Py_DECREF(key);
            goto error;
        }
        status = PyDict_SetItem(table, key, value);
        Py_DECREF(key);
        Py_DECREF(value);
        if (status < 0) {
            goto error;
        }
    }

    /* Every character of z (when given) is mapped to None. */
    if (z != NULL) {
        z_kind = PyUnicode_KIND(z);
        z_data = PyUnicode_DATA(z);
        length = PyUnicode_GET_LENGTH(z);
        for (pos = 0; pos < length; pos++) {
            key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, pos));
            if (key == NULL) {
                goto error;
            }
            status = PyDict_SetItem(table, key, Py_None);
            Py_DECREF(key);
            if (status < 0) {
                goto error;
            }
        }
    }
    return table;

  error:
    Py_DECREF(table);
    return NULL;
}
13232
13233
/*[clinic input]
13234
@permit_long_docstring_body
13235
str.translate as unicode_translate
13236
13237
    table: object
13238
        Translation table, which must be a mapping of Unicode ordinals to
13239
        Unicode ordinals, strings, or None.
13240
    /
13241
13242
Replace each character in the string using the given translation table.
13243
13244
The table must implement lookup/indexing via __getitem__, for instance a
13245
dictionary or list.  If this operation raises LookupError, the character is
13246
left untouched.  Characters mapped to None are deleted.
13247
[clinic start generated code]*/
13248
13249
static PyObject *
unicode_translate(PyObject *self, PyObject *table)
/*[clinic end generated code: output=3cb448ff2fd96bf3 input=699e5fa0ebf9f5e9]*/
{
    /* str.translate(table): forwards to _PyUnicode_TranslateCharmap() with
       the error-handler name fixed to "ignore". */
    PyObject *translated = _PyUnicode_TranslateCharmap(self, table, "ignore");
    return translated;
}
13255
13256
/*[clinic input]
13257
str.upper as unicode_upper
13258
13259
Return a copy of the string converted to uppercase.
13260
[clinic start generated code]*/
13261
13262
static PyObject *
unicode_upper_impl(PyObject *self)
/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
{
    /* Non-ASCII strings go through the generic case_operation() driver. */
    if (!PyUnicode_IS_ASCII(self)) {
        return case_operation(self, do_upper);
    }
    /* ASCII strings use the dedicated helper; the 0 flag presumably selects
       upper-casing (helper not shown here -- confirm against its definition). */
    return ascii_upper_or_lower(self, 0);
}
13270
13271
/*[clinic input]
13272
@permit_long_summary
13273
str.zfill as unicode_zfill
13274
13275
    width: Py_ssize_t
13276
    /
13277
13278
Pad a numeric string with zeros on the left, to fill a field of the given width.
13279
13280
The string is never truncated.
13281
[clinic start generated code]*/
13282
13283
static PyObject *
13284
unicode_zfill_impl(PyObject *self, Py_ssize_t width)
13285
/*[clinic end generated code: output=e13fb6bdf8e3b9df input=25a4ee0ea3e58ce0]*/
13286
0
{
13287
0
    Py_ssize_t fill;
13288
0
    PyObject *u;
13289
0
    int kind;
13290
0
    const void *data;
13291
0
    Py_UCS4 chr;
13292
13293
0
    if (PyUnicode_GET_LENGTH(self) >= width)
13294
0
        return unicode_result_unchanged(self);
13295
13296
0
    fill = width - PyUnicode_GET_LENGTH(self);
13297
13298
0
    u = pad(self, fill, 0, '0');
13299
13300
0
    if (u == NULL)
13301
0
        return NULL;
13302
13303
0
    kind = PyUnicode_KIND(u);
13304
0
    data = PyUnicode_DATA(u);
13305
0
    chr = PyUnicode_READ(kind, data, fill);
13306
13307
0
    if (chr == '+' || chr == '-') {
13308
        /* move sign to beginning of string */
13309
0
        PyUnicode_WRITE(kind, data, 0, chr);
13310
0
        PyUnicode_WRITE(kind, data, fill, '0');
13311
0
    }
13312
13313
0
    assert(_PyUnicode_CheckConsistency(u, 1));
13314
0
    return u;
13315
0
}
13316
13317
/*[clinic input]
13318
@permit_long_summary
13319
@text_signature "($self, prefix[, start[, end]], /)"
13320
str.startswith as unicode_startswith
13321
13322
    prefix as subobj: object
13323
        A string or a tuple of strings to try.
13324
    start: slice_index(accept={int, NoneType}, c_default='0') = None
13325
        Optional start position. Default: start of the string.
13326
    end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
13327
        Optional stop position. Default: end of the string.
13328
    /
13329
13330
Return True if the string starts with the specified prefix, False otherwise.
13331
[clinic start generated code]*/
13332
13333
static PyObject *
13334
unicode_startswith_impl(PyObject *self, PyObject *subobj, Py_ssize_t start,
13335
                        Py_ssize_t end)
13336
/*[clinic end generated code: output=4bd7cfd0803051d4 input=766bdbd33df251dc]*/
13337
76.3M
{
13338
76.3M
    if (PyTuple_Check(subobj)) {
13339
8.92M
        Py_ssize_t i;
13340
32.3M
        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13341
23.4M
            PyObject *substring = PyTuple_GET_ITEM(subobj, i);
13342
23.4M
            if (!PyUnicode_Check(substring)) {
13343
0
                PyErr_Format(PyExc_TypeError,
13344
0
                             "tuple for startswith must only contain str, "
13345
0
                             "not %.100s",
13346
0
                             Py_TYPE(substring)->tp_name);
13347
0
                return NULL;
13348
0
            }
13349
23.4M
            int result = tailmatch(self, substring, start, end, -1);
13350
23.4M
            if (result < 0) {
13351
0
                return NULL;
13352
0
            }
13353
23.4M
            if (result) {
13354
30.8k
                Py_RETURN_TRUE;
13355
30.8k
            }
13356
23.4M
        }
13357
        /* nothing matched */
13358
8.92M
        Py_RETURN_FALSE;
13359
8.92M
    }
13360
67.3M
    if (!PyUnicode_Check(subobj)) {
13361
0
        PyErr_Format(PyExc_TypeError,
13362
0
                     "startswith first arg must be str or "
13363
0
                     "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13364
0
        return NULL;
13365
0
    }
13366
67.3M
    int result = tailmatch(self, subobj, start, end, -1);
13367
67.3M
    if (result < 0) {
13368
0
        return NULL;
13369
0
    }
13370
67.3M
    return PyBool_FromLong(result);
13371
67.3M
}
13372
13373
13374
/*[clinic input]
13375
@permit_long_summary
13376
@text_signature "($self, suffix[, start[, end]], /)"
13377
str.endswith as unicode_endswith
13378
13379
    suffix as subobj: object
13380
        A string or a tuple of strings to try.
13381
    start: slice_index(accept={int, NoneType}, c_default='0') = None
13382
        Optional start position. Default: start of the string.
13383
    end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
13384
        Optional stop position. Default: end of the string.
13385
    /
13386
13387
Return True if the string ends with the specified suffix, False otherwise.
13388
[clinic start generated code]*/
13389
13390
static PyObject *
13391
unicode_endswith_impl(PyObject *self, PyObject *subobj, Py_ssize_t start,
13392
                      Py_ssize_t end)
13393
/*[clinic end generated code: output=cce6f8ceb0102ca9 input=b66bf6d5547ba1aa]*/
13394
13.4M
{
13395
13.4M
    if (PyTuple_Check(subobj)) {
13396
210k
        Py_ssize_t i;
13397
405k
        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13398
371k
            PyObject *substring = PyTuple_GET_ITEM(subobj, i);
13399
371k
            if (!PyUnicode_Check(substring)) {
13400
0
                PyErr_Format(PyExc_TypeError,
13401
0
                             "tuple for endswith must only contain str, "
13402
0
                             "not %.100s",
13403
0
                             Py_TYPE(substring)->tp_name);
13404
0
                return NULL;
13405
0
            }
13406
371k
            int result = tailmatch(self, substring, start, end, +1);
13407
371k
            if (result < 0) {
13408
0
                return NULL;
13409
0
            }
13410
371k
            if (result) {
13411
176k
                Py_RETURN_TRUE;
13412
176k
            }
13413
371k
        }
13414
210k
        Py_RETURN_FALSE;
13415
210k
    }
13416
13.2M
    if (!PyUnicode_Check(subobj)) {
13417
0
        PyErr_Format(PyExc_TypeError,
13418
0
                     "endswith first arg must be str or "
13419
0
                     "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13420
0
        return NULL;
13421
0
    }
13422
13.2M
    int result = tailmatch(self, subobj, start, end, +1);
13423
13.2M
    if (result < 0) {
13424
0
        return NULL;
13425
0
    }
13426
13.2M
    return PyBool_FromLong(result);
13427
13.2M
}
13428
13429
13430
#include "stringlib/unicode_format.h"
13431
13432
PyDoc_STRVAR(format__doc__,
13433
             "format($self, /, *args, **kwargs)\n\
13434
--\n\
13435
\n\
13436
Return a formatted version of the string, using substitutions from args and kwargs.\n\
13437
The substitutions are identified by braces ('{' and '}').");
13438
13439
PyDoc_STRVAR(format_map__doc__,
13440
             "format_map($self, mapping, /)\n\
13441
--\n\
13442
\n\
13443
Return a formatted version of the string, using substitutions from mapping.\n\
13444
The substitutions are identified by braces ('{' and '}').");
13445
13446
/*[clinic input]
13447
str.__format__ as unicode___format__
13448
13449
    format_spec: unicode
13450
    /
13451
13452
Return a formatted version of the string as described by format_spec.
13453
[clinic start generated code]*/
13454
13455
static PyObject *
unicode___format___impl(PyObject *self, PyObject *format_spec)
/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
{
    /* Formats `self` into a fresh writer using the whole of `format_spec`
       (range [0, len(format_spec))) and returns the finished string. */
    _PyUnicodeWriter writer;
    _PyUnicodeWriter_Init(&writer);

    const Py_ssize_t spec_len = PyUnicode_GET_LENGTH(format_spec);
    if (_PyUnicode_FormatAdvancedWriter(&writer, self, format_spec,
                                        0, spec_len) == -1) {
        /* Formatting failed: release whatever the writer accumulated. */
        _PyUnicodeWriter_Dealloc(&writer);
        return NULL;
    }
    return _PyUnicodeWriter_Finish(&writer);
}
13472
13473
/*[clinic input]
13474
str.__sizeof__ as unicode_sizeof
13475
13476
Return the size of the string in memory, in bytes.
13477
[clinic start generated code]*/
13478
13479
static PyObject *
unicode_sizeof_impl(PyObject *self)
/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
{
    /* The size is the header struct for the object's layout, plus the
       character buffer ((length + 1) code units), plus the separately
       stored UTF-8 copy when one is present. */
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    Py_ssize_t size;

    if (PyUnicode_IS_COMPACT_ASCII(self)) {
        /* Compact ASCII: one byte per character plus one extra byte. */
        size = sizeof(PyASCIIObject) + length + 1;
    }
    else if (PyUnicode_IS_COMPACT(self)) {
        /* Compact non-ASCII: PyUnicode_KIND() is used as the unit width. */
        size = sizeof(PyCompactUnicodeObject) +
            (length + 1) * PyUnicode_KIND(self);
    }
    else {
        /* Two-block object: base struct, plus the character block only
           when a data pointer is actually set. */
        size = sizeof(PyUnicodeObject);
        if (_PyUnicode_DATA_ANY(self)) {
            size += (length + 1) * PyUnicode_KIND(self);
        }
    }

    if (_PyUnicode_HAS_UTF8_MEMORY(self)) {
        size += PyUnicode_UTF8_LENGTH(self) + 1;
    }

    return PyLong_FromSsize_t(size);
}
13507
13508
/* __getnewargs__: return a 1-tuple holding a copy of `v` made with
   _PyUnicode_Copy().  The "N" format unit hands the copy's reference over
   to the tuple.  Returns NULL if the copy cannot be created. */
static PyObject *
unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
{
    PyObject *copy = _PyUnicode_Copy(v);
    return (copy != NULL) ? Py_BuildValue("(N)", copy) : NULL;
}
13516
13517
/*
13518
This function searchs the longest common leading whitespace
13519
of all lines in the [src, end).
13520
It returns the length of the common leading whitespace and sets `output` to
13521
point to the beginning of the common leading whitespace if length > 0.
13522
*/
13523
static Py_ssize_t
13524
search_longest_common_leading_whitespace(
13525
    const char *const src,
13526
    const char *const end,
13527
    const char **output)
13528
0
{
13529
    // [_start, _start + _len)
13530
    // describes the current longest common leading whitespace
13531
0
    const char *_start = NULL;
13532
0
    Py_ssize_t _len = 0;
13533
13534
0
    for (const char *iter = src; iter < end; ++iter) {
13535
0
        const char *line_start = iter;
13536
0
        const char *leading_whitespace_end = NULL;
13537
13538
        // scan the whole line
13539
0
        while (iter < end && *iter != '\n') {
13540
0
            if (!leading_whitespace_end && *iter != ' ' && *iter != '\t') {
13541
                /* `iter` points to the first non-whitespace character
13542
                   in this line */
13543
0
                if (iter == line_start) {
13544
                    // some line has no indent, fast exit!
13545
0
                    return 0;
13546
0
                }
13547
0
                leading_whitespace_end = iter;
13548
0
            }
13549
0
            ++iter;
13550
0
        }
13551
13552
        // if this line has all white space, skip it
13553
0
        if (!leading_whitespace_end) {
13554
0
            continue;
13555
0
        }
13556
13557
0
        if (!_start) {
13558
            // update the first leading whitespace
13559
0
            _start = line_start;
13560
0
            _len = leading_whitespace_end - line_start;
13561
0
            assert(_len > 0);
13562
0
        }
13563
0
        else {
13564
            /* We then compare with the current longest leading whitespace.
13565
13566
               [line_start, leading_whitespace_end) is the leading
13567
               whitespace of this line,
13568
13569
               [_start, _start + _len) is the leading whitespace of the
13570
               current longest leading whitespace. */
13571
0
            Py_ssize_t new_len = 0;
13572
0
            const char *_iter = _start, *line_iter = line_start;
13573
13574
0
            while (_iter < _start + _len && line_iter < leading_whitespace_end
13575
0
                   && *_iter == *line_iter)
13576
0
            {
13577
0
                ++_iter;
13578
0
                ++line_iter;
13579
0
                ++new_len;
13580
0
            }
13581
13582
0
            _len = new_len;
13583
0
            if (_len == 0) {
13584
                // No common things now, fast exit!
13585
0
                return 0;
13586
0
            }
13587
0
        }
13588
0
    }
13589
13590
0
    assert(_len >= 0);
13591
0
    if (_len > 0) {
13592
0
        *output = _start;
13593
0
    }
13594
0
    return _len;
13595
0
}
13596
13597
/* Dedent a string.
13598
   Behaviour is expected to be an exact match of `textwrap.dedent`.
13599
   Return a new reference on success, NULL with exception set on error.
13600
   */
13601
PyObject *
13602
_PyUnicode_Dedent(PyObject *unicode)
13603
0
{
13604
0
    Py_ssize_t src_len = 0;
13605
0
    const char *src = PyUnicode_AsUTF8AndSize(unicode, &src_len);
13606
0
    if (!src) {
13607
0
        return NULL;
13608
0
    }
13609
0
    assert(src_len >= 0);
13610
0
    if (src_len == 0) {
13611
0
        return Py_NewRef(unicode);
13612
0
    }
13613
13614
0
    const char *const end = src + src_len;
13615
13616
    // [whitespace_start, whitespace_start + whitespace_len)
13617
    // describes the current longest common leading whitespace
13618
0
    const char *whitespace_start = NULL;
13619
0
    Py_ssize_t whitespace_len = search_longest_common_leading_whitespace(
13620
0
        src, end, &whitespace_start);
13621
13622
0
    if (whitespace_len == 0) {
13623
0
        return Py_NewRef(unicode);
13624
0
    }
13625
13626
    // now we should trigger a dedent
13627
0
    char *dest = PyMem_Malloc(src_len);
13628
0
    if (!dest) {
13629
0
        PyErr_NoMemory();
13630
0
        return NULL;
13631
0
    }
13632
0
    char *dest_iter = dest;
13633
13634
0
    for (const char *iter = src; iter < end; ++iter) {
13635
0
        const char *line_start = iter;
13636
0
        bool in_leading_space = true;
13637
13638
        // iterate over a line to find the end of a line
13639
0
        while (iter < end && *iter != '\n') {
13640
0
            if (in_leading_space && *iter != ' ' && *iter != '\t') {
13641
0
                in_leading_space = false;
13642
0
            }
13643
0
            ++iter;
13644
0
        }
13645
13646
        // invariant: *iter == '\n' or iter == end
13647
0
        bool append_newline = iter < end;
13648
13649
        // if this line has all white space, write '\n' and continue
13650
0
        if (in_leading_space && append_newline) {
13651
0
            *dest_iter++ = '\n';
13652
0
            continue;
13653
0
        }
13654
13655
        /* copy [new_line_start + whitespace_len, iter) to buffer, then
13656
            conditionally append '\n' */
13657
13658
0
        Py_ssize_t new_line_len = iter - line_start - whitespace_len;
13659
0
        assert(new_line_len >= 0);
13660
0
        memcpy(dest_iter, line_start + whitespace_len, new_line_len);
13661
13662
0
        dest_iter += new_line_len;
13663
13664
0
        if (append_newline) {
13665
0
            *dest_iter++ = '\n';
13666
0
        }
13667
0
    }
13668
13669
0
    PyObject *res = PyUnicode_FromStringAndSize(dest, dest_iter - dest);
13670
0
    PyMem_Free(dest);
13671
0
    return res;
13672
0
}
13673
13674
static PyMethodDef unicode_methods[] = {
13675
    UNICODE_ENCODE_METHODDEF
13676
    UNICODE_REPLACE_METHODDEF
13677
    UNICODE_SPLIT_METHODDEF
13678
    UNICODE_RSPLIT_METHODDEF
13679
    UNICODE_JOIN_METHODDEF
13680
    UNICODE_CAPITALIZE_METHODDEF
13681
    UNICODE_CASEFOLD_METHODDEF
13682
    UNICODE_TITLE_METHODDEF
13683
    UNICODE_CENTER_METHODDEF
13684
    UNICODE_COUNT_METHODDEF
13685
    UNICODE_EXPANDTABS_METHODDEF
13686
    UNICODE_FIND_METHODDEF
13687
    UNICODE_PARTITION_METHODDEF
13688
    UNICODE_INDEX_METHODDEF
13689
    UNICODE_LJUST_METHODDEF
13690
    UNICODE_LOWER_METHODDEF
13691
    UNICODE_LSTRIP_METHODDEF
13692
    UNICODE_RFIND_METHODDEF
13693
    UNICODE_RINDEX_METHODDEF
13694
    UNICODE_RJUST_METHODDEF
13695
    UNICODE_RSTRIP_METHODDEF
13696
    UNICODE_RPARTITION_METHODDEF
13697
    UNICODE_SPLITLINES_METHODDEF
13698
    UNICODE_STRIP_METHODDEF
13699
    UNICODE_SWAPCASE_METHODDEF
13700
    UNICODE_TRANSLATE_METHODDEF
13701
    UNICODE_UPPER_METHODDEF
13702
    UNICODE_STARTSWITH_METHODDEF
13703
    UNICODE_ENDSWITH_METHODDEF
13704
    UNICODE_REMOVEPREFIX_METHODDEF
13705
    UNICODE_REMOVESUFFIX_METHODDEF
13706
    UNICODE_ISASCII_METHODDEF
13707
    UNICODE_ISLOWER_METHODDEF
13708
    UNICODE_ISUPPER_METHODDEF
13709
    UNICODE_ISTITLE_METHODDEF
13710
    UNICODE_ISSPACE_METHODDEF
13711
    UNICODE_ISDECIMAL_METHODDEF
13712
    UNICODE_ISDIGIT_METHODDEF
13713
    UNICODE_ISNUMERIC_METHODDEF
13714
    UNICODE_ISALPHA_METHODDEF
13715
    UNICODE_ISALNUM_METHODDEF
13716
    UNICODE_ISIDENTIFIER_METHODDEF
13717
    UNICODE_ISPRINTABLE_METHODDEF
13718
    UNICODE_ZFILL_METHODDEF
13719
    {"format", _PyCFunction_CAST(do_string_format), METH_VARARGS | METH_KEYWORDS, format__doc__},
13720
    {"format_map", do_string_format_map, METH_O, format_map__doc__},
13721
    UNICODE___FORMAT___METHODDEF
13722
    UNICODE_MAKETRANS_METHODDEF
13723
    UNICODE_SIZEOF_METHODDEF
13724
    {"__getnewargs__",  unicode_getnewargs, METH_NOARGS},
13725
    {NULL, NULL}
13726
};
13727
13728
/* nb_remainder slot: `v % w`.  Formats through PyUnicode_Format() when the
   left operand is a str; otherwise returns NotImplemented. */
static PyObject *
unicode_mod(PyObject *v, PyObject *w)
{
    if (PyUnicode_Check(v)) {
        return PyUnicode_Format(v, w);
    }
    Py_RETURN_NOTIMPLEMENTED;
}
13735
13736
static PyNumberMethods unicode_as_number = {
13737
    0,              /*nb_add*/
13738
    0,              /*nb_subtract*/
13739
    0,              /*nb_multiply*/
13740
    unicode_mod,            /*nb_remainder*/
13741
};
13742
13743
static PySequenceMethods unicode_as_sequence = {
13744
    unicode_length,     /* sq_length */
13745
    PyUnicode_Concat,   /* sq_concat */
13746
    unicode_repeat,     /* sq_repeat */
13747
    unicode_getitem,    /* sq_item */
13748
    0,                  /* sq_slice */
13749
    0,                  /* sq_ass_item */
13750
    0,                  /* sq_ass_slice */
13751
    PyUnicode_Contains, /* sq_contains */
13752
};
13753
13754
/* mp_subscript slot: `self[item]` for an index-like object or a slice.

   Index-like items are converted with PyNumber_AsSsize_t(), negative
   values are offset by the string length, and the lookup is delegated to
   unicode_getitem().  Slices return the empty string, `self` itself, a
   contiguous substring, or (for step != 1) a freshly built string.
   Anything else raises TypeError. */
static PyObject*
unicode_subscript(PyObject* self, PyObject* item)
{
    if (_PyIndex_Check(item)) {
        Py_ssize_t index = PyNumber_AsSsize_t(item, PyExc_IndexError);
        if (index == -1 && PyErr_Occurred()) {
            return NULL;
        }
        if (index < 0) {
            index += PyUnicode_GET_LENGTH(self);
        }
        return unicode_getitem(self, index);
    }

    if (!PySlice_Check(item)) {
        PyErr_Format(PyExc_TypeError, "string indices must be integers, not '%.200s'",
                     Py_TYPE(item)->tp_name);
        return NULL;
    }

    Py_ssize_t start, stop, step;
    if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
        return NULL;
    }
    const Py_ssize_t self_len = PyUnicode_GET_LENGTH(self);
    const Py_ssize_t slicelength = PySlice_AdjustIndices(self_len,
                                                         &start, &stop, step);

    if (slicelength <= 0) {
        _Py_RETURN_UNICODE_EMPTY();
    }
    if (step == 1) {
        /* Contiguous slice: either the whole string or a plain substring. */
        if (start == 0 && slicelength == self_len) {
            return unicode_result_unchanged(self);
        }
        return PyUnicode_Substring(self, start, start + slicelength);
    }

    /* General case: step != 1, characters are picked one at a time. */
    const int src_kind = PyUnicode_KIND(self);
    const void *src_data = PyUnicode_DATA(self);
    Py_UCS4 max_char;

    if (PyUnicode_IS_ASCII(self)) {
        max_char = 127;
    }
    else {
        /* Find the largest selected character.  The scan stops early once
           it reaches the limit reported by kind_maxchar_limit() for the
           source kind. */
        const Py_UCS4 kind_limit = kind_maxchar_limit(src_kind);
        size_t pos = start;

        max_char = 0;
        for (Py_ssize_t n = 0; n < slicelength; n++, pos += step) {
            const Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, pos);
            if (ch > max_char) {
                max_char = ch;
                if (max_char >= kind_limit) {
                    break;
                }
            }
        }
    }

    PyObject *result = PyUnicode_New(slicelength, max_char);
    if (result == NULL) {
        return NULL;
    }
    const int dest_kind = PyUnicode_KIND(result);
    void *dest_data = PyUnicode_DATA(result);

    /* `pos` is unsigned; adding a negative step relies on modular
       arithmetic, and every position that is actually read is in range. */
    size_t pos = start;
    for (Py_ssize_t n = 0; n < slicelength; n++, pos += step) {
        const Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, pos);
        PyUnicode_WRITE(dest_kind, dest_data, n, ch);
    }
    assert(_PyUnicode_CheckConsistency(result, 1));
    return result;
}
13823
13824
static PyMappingMethods unicode_as_mapping = {
13825
    unicode_length,     /* mp_length */
13826
    unicode_subscript,  /* mp_subscript */
13827
    0,                  /* mp_ass_subscript */
13828
};
13829
13830
13831
static PyObject *
13832
unicode_subtype_new(PyTypeObject *type, PyObject *unicode);
13833
13834
/*[clinic input]
13835
@classmethod
13836
str.__new__ as unicode_new
13837
13838
    object as x: object = NULL
13839
    encoding: str = NULL
13840
    errors: str = NULL
13841
13842
[clinic start generated code]*/
13843
13844
static PyObject *
13845
unicode_new_impl(PyTypeObject *type, PyObject *x, const char *encoding,
13846
                 const char *errors)
13847
/*[clinic end generated code: output=fc72d4878b0b57e9 input=e81255e5676d174e]*/
13848
10.6M
{
13849
10.6M
    PyObject *unicode;
13850
10.6M
    if (x == NULL) {
13851
0
        unicode = _PyUnicode_GetEmpty();
13852
0
    }
13853
10.6M
    else if (encoding == NULL && errors == NULL) {
13854
10.6M
        unicode = PyObject_Str(x);
13855
10.6M
    }
13856
0
    else {
13857
0
        unicode = PyUnicode_FromEncodedObject(x, encoding, errors);
13858
0
    }
13859
13860
10.6M
    if (unicode != NULL && type != &PyUnicode_Type) {
13861
10.6M
        Py_SETREF(unicode, unicode_subtype_new(type, unicode));
13862
10.6M
    }
13863
10.6M
    return unicode;
13864
10.6M
}
13865
13866
static const char *
13867
arg_as_utf8(PyObject *obj, const char *name)
13868
820k
{
13869
820k
    if (!PyUnicode_Check(obj)) {
13870
0
        PyErr_Format(PyExc_TypeError,
13871
0
                     "str() argument '%s' must be str, not %T",
13872
0
                     name, obj);
13873
0
        return NULL;
13874
0
    }
13875
820k
    return _PyUnicode_AsUTF8NoNUL(obj);
13876
820k
}
13877
13878
/* Vectorcall entry point for str(...), used only for the exact str type.
 *
 * Calls that pass keyword arguments are repacked into a tuple and a dict and
 * forwarded to unicode_new().  Positional-only calls are handled directly:
 *   str()                         -> the empty string
 *   str(object)                   -> PyObject_Str(object)
 *   str(object, encoding[, errors]) -> PyUnicode_FromEncodedObject()
 * Returns NULL with an exception set on failure. */
static PyObject *
unicode_vectorcall(PyObject *type, PyObject *const *args,
                   size_t nargsf, PyObject *kwnames)
{
    assert(Py_Is(_PyType_CAST(type), &PyUnicode_Type));

    const Py_ssize_t npos = PyVectorcall_NARGS(nargsf);
    const int has_keywords =
        (kwnames != NULL && PyTuple_GET_SIZE(kwnames) != 0);

    if (has_keywords) {
        // Fallback to unicode_new()
        PyObject *pos_args = PyTuple_FromArray(args, npos);
        if (pos_args == NULL) {
            return NULL;
        }
        /* Keyword values follow the positional ones in `args`. */
        PyObject *kw_args = _PyStack_AsDict(args + npos, kwnames);
        PyObject *result = (kw_args != NULL)
            ? unicode_new(_PyType_CAST(type), pos_args, kw_args)
            : NULL;
        Py_DECREF(pos_args);
        Py_XDECREF(kw_args);
        return result;
    }

    if (!_PyArg_CheckPositional("str", npos, 0, 3)) {
        return NULL;
    }

    switch (npos) {
    case 0:
        return _PyUnicode_GetEmpty();
    case 1:
        return PyObject_Str(args[0]);
    default:
        break;
    }

    /* Two or three arguments: decode args[0]. */
    const char *encoding = arg_as_utf8(args[1], "encoding");
    if (encoding == NULL) {
        return NULL;
    }

    const char *errors = NULL;
    if (npos == 3) {
        errors = arg_as_utf8(args[2], "errors");
        if (errors == NULL) {
            return NULL;
        }
    }
    return PyUnicode_FromEncodedObject(args[0], encoding, errors);
}
13924
13925
static PyObject *
13926
unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
13927
10.6M
{
13928
10.6M
    PyObject *self;
13929
10.6M
    Py_ssize_t length, char_size;
13930
10.6M
    int share_utf8;
13931
10.6M
    int kind;
13932
10.6M
    void *data;
13933
13934
10.6M
    assert(PyType_IsSubtype(type, &PyUnicode_Type));
13935
10.6M
    assert(_PyUnicode_CHECK(unicode));
13936
13937
10.6M
    self = type->tp_alloc(type, 0);
13938
10.6M
    if (self == NULL) {
13939
0
        return NULL;
13940
0
    }
13941
10.6M
    kind = PyUnicode_KIND(unicode);
13942
10.6M
    length = PyUnicode_GET_LENGTH(unicode);
13943
13944
10.6M
    _PyUnicode_LENGTH(self) = length;
13945
#ifdef Py_DEBUG
13946
    _PyUnicode_HASH(self) = -1;
13947
#else
13948
10.6M
    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13949
10.6M
#endif
13950
10.6M
    _PyUnicode_STATE(self).interned = 0;
13951
10.6M
    _PyUnicode_STATE(self).kind = kind;
13952
10.6M
    _PyUnicode_STATE(self).compact = 0;
13953
10.6M
    _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
13954
10.6M
    _PyUnicode_STATE(self).statically_allocated = 0;
13955
10.6M
    PyUnicode_SET_UTF8_LENGTH(self, 0);
13956
10.6M
    PyUnicode_SET_UTF8(self, NULL);
13957
10.6M
    _PyUnicode_DATA_ANY(self) = NULL;
13958
13959
10.6M
    share_utf8 = 0;
13960
10.6M
    if (kind == PyUnicode_1BYTE_KIND) {
13961
9.38M
        char_size = 1;
13962
9.38M
        if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13963
9.34M
            share_utf8 = 1;
13964
9.38M
    }
13965
1.25M
    else if (kind == PyUnicode_2BYTE_KIND) {
13966
1.20M
        char_size = 2;
13967
1.20M
    }
13968
50.6k
    else {
13969
50.6k
        assert(kind == PyUnicode_4BYTE_KIND);
13970
50.6k
        char_size = 4;
13971
50.6k
    }
13972
13973
    /* Ensure we won't overflow the length. */
13974
10.6M
    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13975
0
        PyErr_NoMemory();
13976
0
        goto onError;
13977
0
    }
13978
10.6M
    data = PyMem_Malloc((length + 1) * char_size);
13979
10.6M
    if (data == NULL) {
13980
0
        PyErr_NoMemory();
13981
0
        goto onError;
13982
0
    }
13983
13984
10.6M
    _PyUnicode_DATA_ANY(self) = data;
13985
10.6M
    if (share_utf8) {
13986
9.34M
        PyUnicode_SET_UTF8_LENGTH(self, length);
13987
9.34M
        PyUnicode_SET_UTF8(self, data);
13988
9.34M
    }
13989
13990
10.6M
    memcpy(data, PyUnicode_DATA(unicode), kind * (length + 1));
13991
10.6M
    assert(_PyUnicode_CheckConsistency(self, 1));
13992
#ifdef Py_DEBUG
13993
    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13994
#endif
13995
10.6M
    return self;
13996
13997
0
onError:
13998
0
    Py_DECREF(self);
13999
0
    return NULL;
14000
10.6M
}
14001
14002
/* Deallocate `op` through unicode_dealloc().  `op` must be an exact str
   (not a subclass instance); this is only checked by the assertion. */
void
14003
_PyUnicode_ExactDealloc(PyObject *op)
14004
84.7M
{
14005
84.7M
    assert(PyUnicode_CheckExact(op));
14006
84.7M
    unicode_dealloc(op);
14007
84.7M
}
14008
14009
PyDoc_STRVAR(unicode_doc,
14010
"str(object='') -> str\n\
14011
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
14012
\n\
14013
Create a new string object from the given object. If encoding or\n\
14014
errors is specified, then the object must expose a data buffer\n\
14015
that will be decoded using the given encoding and error handler.\n\
14016
Otherwise, returns the result of object.__str__() (if defined)\n\
14017
or repr(object).\n\
14018
encoding defaults to 'utf-8'.\n\
14019
errors defaults to 'strict'.");
14020
14021
static PyObject *unicode_iter(PyObject *seq);
14022
14023
PyTypeObject PyUnicode_Type = {
14024
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
14025
    "str",                        /* tp_name */
14026
    sizeof(PyUnicodeObject),      /* tp_basicsize */
14027
    0,                            /* tp_itemsize */
14028
    /* Slots */
14029
    unicode_dealloc,              /* tp_dealloc */
14030
    0,                            /* tp_vectorcall_offset */
14031
    0,                            /* tp_getattr */
14032
    0,                            /* tp_setattr */
14033
    0,                            /* tp_as_async */
14034
    unicode_repr,                 /* tp_repr */
14035
    &unicode_as_number,           /* tp_as_number */
14036
    &unicode_as_sequence,         /* tp_as_sequence */
14037
    &unicode_as_mapping,          /* tp_as_mapping */
14038
    unicode_hash,                 /* tp_hash*/
14039
    0,                            /* tp_call*/
14040
    unicode_str,                  /* tp_str */
14041
    PyObject_GenericGetAttr,      /* tp_getattro */
14042
    0,                            /* tp_setattro */
14043
    0,                            /* tp_as_buffer */
14044
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
14045
        Py_TPFLAGS_UNICODE_SUBCLASS |
14046
        _Py_TPFLAGS_MATCH_SELF, /* tp_flags */
14047
    unicode_doc,                  /* tp_doc */
14048
    0,                            /* tp_traverse */
14049
    0,                            /* tp_clear */
14050
    PyUnicode_RichCompare,        /* tp_richcompare */
14051
    0,                            /* tp_weaklistoffset */
14052
    unicode_iter,                 /* tp_iter */
14053
    0,                            /* tp_iternext */
14054
    unicode_methods,              /* tp_methods */
14055
    0,                            /* tp_members */
14056
    0,                            /* tp_getset */
14057
    0,                            /* tp_base */
14058
    0,                            /* tp_dict */
14059
    0,                            /* tp_descr_get */
14060
    0,                            /* tp_descr_set */
14061
    0,                            /* tp_dictoffset */
14062
    0,                            /* tp_init */
14063
    0,                            /* tp_alloc */
14064
    unicode_new,                  /* tp_new */
14065
    PyObject_Free,                /* tp_free */
14066
    .tp_vectorcall = unicode_vectorcall,
14067
};
14068
14069
/* Initialize the Unicode implementation */
14070
14071
static void
14072
_init_global_state(void)
14073
22
{
14074
22
    static int initialized = 0;
14075
22
    if (initialized) {
14076
0
        return;
14077
0
    }
14078
22
    initialized = 1;
14079
14080
    /* initialize the linebreak bloom filter */
14081
22
    const Py_UCS2 linebreak[] = {
14082
22
        0x000A, /* LINE FEED */
14083
22
        0x000D, /* CARRIAGE RETURN */
14084
22
        0x001C, /* FILE SEPARATOR */
14085
22
        0x001D, /* GROUP SEPARATOR */
14086
22
        0x001E, /* RECORD SEPARATOR */
14087
22
        0x0085, /* NEXT LINE */
14088
22
        0x2028, /* LINE SEPARATOR */
14089
22
        0x2029, /* PARAGRAPH SEPARATOR */
14090
22
    };
14091
22
    bloom_linebreak = make_bloom_mask(
14092
22
        PyUnicode_2BYTE_KIND, linebreak,
14093
22
        Py_ARRAY_LENGTH(linebreak));
14094
22
}
14095
14096
void
14097
_PyUnicode_InitState(PyInterpreterState *interp)
14098
22
{
14099
22
    if (!_Py_IsMainInterpreter(interp)) {
14100
0
        return;
14101
0
    }
14102
22
    _init_global_state();
14103
22
}
14104
14105
14106
PyStatus
14107
_PyUnicode_InitGlobalObjects(PyInterpreterState *interp)
14108
22
{
14109
22
    if (_Py_IsMainInterpreter(interp)) {
14110
22
        PyStatus status = init_global_interned_strings(interp);
14111
22
        if (_PyStatus_EXCEPTION(status)) {
14112
0
            return status;
14113
0
        }
14114
22
    }
14115
22
    assert(INTERNED_STRINGS);
14116
14117
22
    if (init_interned_dict(interp)) {
14118
0
        PyErr_Clear();
14119
0
        return _PyStatus_ERR("failed to create interned dict");
14120
0
    }
14121
14122
22
    return _PyStatus_OK();
14123
22
}
14124
14125
14126
PyStatus
14127
_PyUnicode_InitTypes(PyInterpreterState *interp)
14128
22
{
14129
22
    if (_PyStaticType_InitBuiltin(interp, &EncodingMapType) < 0) {
14130
0
        goto error;
14131
0
    }
14132
22
    if (_PyStaticType_InitBuiltin(interp, &PyFieldNameIter_Type) < 0) {
14133
0
        goto error;
14134
0
    }
14135
22
    if (_PyStaticType_InitBuiltin(interp, &PyFormatterIter_Type) < 0) {
14136
0
        goto error;
14137
0
    }
14138
22
    return _PyStatus_OK();
14139
14140
0
error:
14141
0
    return _PyStatus_ERR("Can't initialize unicode types");
14142
22
}
14143
14144
static /* non-null */ PyObject*
14145
intern_static(PyInterpreterState *interp, PyObject *s /* stolen */)
14146
23.6k
{
14147
    // Note that this steals a reference to `s`, but in many cases that
14148
    // stolen ref is returned, requiring no decref/incref.
14149
14150
23.6k
    assert(s != NULL);
14151
23.6k
    assert(_PyUnicode_CHECK(s));
14152
23.6k
    assert(_PyUnicode_STATE(s).statically_allocated);
14153
23.6k
    assert(!PyUnicode_CHECK_INTERNED(s));
14154
14155
#ifdef Py_DEBUG
14156
    /* We must not add process-global interned string if there's already a
14157
     * per-interpreter interned_dict, which might contain duplicates.
14158
     */
14159
    PyObject *interned = get_interned_dict(interp);
14160
    assert(interned == NULL);
14161
#endif
14162
14163
    /* Look in the global cache first. */
14164
23.6k
    PyObject *r = (PyObject *)_Py_hashtable_get(INTERNED_STRINGS, s);
14165
    /* We should only init each string once */
14166
23.6k
    assert(r == NULL);
14167
    /* but just in case (for the non-debug build), handle this */
14168
23.6k
    if (r != NULL && r != s) {
14169
0
        assert(_PyUnicode_STATE(r).interned == SSTATE_INTERNED_IMMORTAL_STATIC);
14170
0
        assert(_PyUnicode_CHECK(r));
14171
0
        Py_DECREF(s);
14172
0
        return Py_NewRef(r);
14173
0
    }
14174
14175
23.6k
    if (_Py_hashtable_set(INTERNED_STRINGS, s, s) < -1) {
14176
0
        Py_FatalError("failed to intern static string");
14177
0
    }
14178
14179
23.6k
    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_IMMORTAL_STATIC;
14180
23.6k
    return s;
14181
23.6k
}
14182
14183
void
14184
_PyUnicode_InternStatic(PyInterpreterState *interp, PyObject **p)
14185
23.6k
{
14186
    // This should only be called as part of runtime initialization
14187
23.6k
    assert(!Py_IsInitialized());
14188
14189
23.6k
    *p = intern_static(interp, *p);
14190
23.6k
    assert(*p);
14191
23.6k
}
14192
14193
static void
14194
immortalize_interned(PyObject *s)
14195
130k
{
14196
130k
    assert(PyUnicode_CHECK_INTERNED(s) == SSTATE_INTERNED_MORTAL);
14197
130k
    assert(!_Py_IsImmortal(s));
14198
#ifdef Py_REF_DEBUG
14199
    /* The reference count value should be excluded from the RefTotal.
14200
       The decrements to these objects will not be registered so they
14201
       need to be accounted for in here. */
14202
    for (Py_ssize_t i = 0; i < Py_REFCNT(s); i++) {
14203
        _Py_DecRefTotal(_PyThreadState_GET());
14204
    }
14205
#endif
14206
130k
    FT_ATOMIC_STORE_UINT8_RELAXED(_PyUnicode_STATE(s).interned, SSTATE_INTERNED_IMMORTAL);
14207
130k
    _Py_SetImmortal(s);
14208
130k
}
14209
14210
static /* non-null */ PyObject*
14211
intern_common(PyInterpreterState *interp, PyObject *s /* stolen */,
14212
              bool immortalize)
14213
38.2M
{
14214
    // Note that this steals a reference to `s`, but in many cases that
14215
    // stolen ref is returned, requiring no decref/incref.
14216
14217
#ifdef Py_DEBUG
14218
    assert(s != NULL);
14219
    assert(_PyUnicode_CHECK(s));
14220
#else
14221
38.2M
    if (s == NULL || !PyUnicode_Check(s)) {
14222
0
        return s;
14223
0
    }
14224
38.2M
#endif
14225
14226
    /* If it's a subclass, we don't really know what putting
14227
       it in the interned dict might do. */
14228
38.2M
    if (!PyUnicode_CheckExact(s)) {
14229
0
        return s;
14230
0
    }
14231
14232
    /* Is it already interned? */
14233
38.2M
    switch (PyUnicode_CHECK_INTERNED(s)) {
14234
3.49M
        case SSTATE_NOT_INTERNED:
14235
            // no, go on
14236
3.49M
            break;
14237
23.1k
        case SSTATE_INTERNED_MORTAL:
14238
            // yes but we might need to make it immortal
14239
23.1k
            if (immortalize) {
14240
5.62k
                immortalize_interned(s);
14241
5.62k
            }
14242
23.1k
            return s;
14243
34.7M
        default:
14244
            // all done
14245
34.7M
            return s;
14246
38.2M
    }
14247
14248
    /* Statically allocated strings must be already interned. */
14249
38.2M
    assert(!_PyUnicode_STATE(s).statically_allocated);
14250
14251
#if Py_GIL_DISABLED
14252
    /* In the free-threaded build, all interned strings are immortal */
14253
    immortalize = 1;
14254
#endif
14255
14256
    /* If it's already immortal, intern it as such */
14257
3.49M
    if (_Py_IsImmortal(s)) {
14258
0
        immortalize = 1;
14259
0
    }
14260
14261
    /* if it's a short string, get the singleton */
14262
3.49M
    if (PyUnicode_GET_LENGTH(s) == 1 &&
14263
22.4k
                PyUnicode_KIND(s) == PyUnicode_1BYTE_KIND) {
14264
0
        PyObject *r = LATIN1(*(unsigned char*)PyUnicode_DATA(s));
14265
0
        assert(PyUnicode_CHECK_INTERNED(r));
14266
0
        Py_DECREF(s);
14267
0
        return r;
14268
0
    }
14269
#ifdef Py_DEBUG
14270
    assert(!unicode_is_singleton(s));
14271
#endif
14272
14273
    /* Look in the global cache now. */
14274
3.49M
    {
14275
3.49M
        PyObject *r = (PyObject *)_Py_hashtable_get(INTERNED_STRINGS, s);
14276
3.49M
        if (r != NULL) {
14277
346k
            assert(_PyUnicode_STATE(r).statically_allocated);
14278
346k
            assert(r != s);  // r must be statically_allocated; s is not
14279
346k
            Py_DECREF(s);
14280
346k
            return Py_NewRef(r);
14281
346k
        }
14282
3.49M
    }
14283
14284
    /* Do a setdefault on the per-interpreter cache. */
14285
3.15M
    PyObject *interned = get_interned_dict(interp);
14286
3.15M
    assert(interned != NULL);
14287
#ifdef Py_GIL_DISABLED
14288
#  define INTERN_MUTEX &_Py_INTERP_CACHED_OBJECT(interp, interned_mutex)
14289
#endif
14290
3.15M
    FT_MUTEX_LOCK(INTERN_MUTEX);
14291
3.15M
    PyObject *t;
14292
3.15M
    {
14293
3.15M
        int res = PyDict_SetDefaultRef(interned, s, s, &t);
14294
3.15M
        if (res < 0) {
14295
0
            PyErr_Clear();
14296
0
            FT_MUTEX_UNLOCK(INTERN_MUTEX);
14297
0
            return s;
14298
0
        }
14299
3.15M
        else if (res == 1) {
14300
            // value was already present (not inserted)
14301
2.34M
            Py_DECREF(s);
14302
2.34M
            if (immortalize &&
14303
652k
                    PyUnicode_CHECK_INTERNED(t) == SSTATE_INTERNED_MORTAL) {
14304
5.80k
                immortalize_interned(t);
14305
5.80k
            }
14306
2.34M
            FT_MUTEX_UNLOCK(INTERN_MUTEX);
14307
2.34M
            return t;
14308
2.34M
        }
14309
803k
        else {
14310
            // value was newly inserted
14311
803k
            assert (s == t);
14312
803k
            Py_DECREF(t);
14313
803k
        }
14314
3.15M
    }
14315
14316
    /* NOT_INTERNED -> INTERNED_MORTAL */
14317
14318
3.15M
    assert(_PyUnicode_STATE(s).interned == SSTATE_NOT_INTERNED);
14319
14320
803k
    if (!_Py_IsImmortal(s)) {
14321
        /* The two references in interned dict (key and value) are not counted.
14322
        unicode_dealloc() and _PyUnicode_ClearInterned() take care of this. */
14323
803k
        Py_DECREF(s);
14324
803k
        Py_DECREF(s);
14325
803k
    }
14326
803k
    FT_ATOMIC_STORE_UINT8_RELAXED(_PyUnicode_STATE(s).interned, SSTATE_INTERNED_MORTAL);
14327
14328
    /* INTERNED_MORTAL -> INTERNED_IMMORTAL (if needed) */
14329
14330
#ifdef Py_DEBUG
14331
    if (_Py_IsImmortal(s)) {
14332
        assert(immortalize);
14333
    }
14334
#endif
14335
803k
    if (immortalize) {
14336
118k
        immortalize_interned(s);
14337
118k
    }
14338
14339
803k
    FT_MUTEX_UNLOCK(INTERN_MUTEX);
14340
803k
    return s;
14341
3.15M
}
14342
14343
void
14344
_PyUnicode_InternImmortal(PyInterpreterState *interp, PyObject **p)
14345
2.90M
{
14346
2.90M
    *p = intern_common(interp, *p, 1);
14347
2.90M
    assert(*p);
14348
2.90M
}
14349
14350
void
14351
_PyUnicode_InternMortal(PyInterpreterState *interp, PyObject **p)
14352
35.3M
{
14353
35.3M
    *p = intern_common(interp, *p, 0);
14354
35.3M
    assert(*p);
14355
35.3M
}
14356
14357
14358
void
14359
_PyUnicode_InternInPlace(PyInterpreterState *interp, PyObject **p)
14360
0
{
14361
0
    _PyUnicode_InternImmortal(interp, p);
14362
0
    return;
14363
0
}
14364
14365
void
14366
PyUnicode_InternInPlace(PyObject **p)
14367
0
{
14368
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
14369
0
    _PyUnicode_InternMortal(interp, p);
14370
0
}
14371
14372
// Public-looking name kept for the stable ABI; user should not call this:
14373
PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
14374
void
14375
PyUnicode_InternImmortal(PyObject **p)
14376
0
{
14377
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
14378
0
    _PyUnicode_InternImmortal(interp, p);
14379
0
}
14380
14381
PyObject *
14382
PyUnicode_InternFromString(const char *cp)
14383
1.14M
{
14384
1.14M
    PyObject *s = PyUnicode_FromString(cp);
14385
1.14M
    if (s == NULL) {
14386
0
        return NULL;
14387
0
    }
14388
1.14M
    PyInterpreterState *interp = _PyInterpreterState_GET();
14389
1.14M
    _PyUnicode_InternMortal(interp, &s);
14390
1.14M
    return s;
14391
1.14M
}
14392
14393
14394
void
14395
_PyUnicode_ClearInterned(PyInterpreterState *interp)
14396
0
{
14397
0
    PyObject *interned = get_interned_dict(interp);
14398
0
    if (interned == NULL) {
14399
0
        return;
14400
0
    }
14401
0
    assert(PyDict_CheckExact(interned));
14402
14403
0
    if (has_shared_intern_dict(interp)) {
14404
        // the dict doesn't belong to this interpreter, skip the debug
14405
        // checks on it and just clear the pointer to it
14406
0
        clear_interned_dict(interp);
14407
0
        return;
14408
0
    }
14409
14410
#ifdef INTERNED_STATS
14411
    fprintf(stderr, "releasing %zd interned strings\n",
14412
            PyDict_GET_SIZE(interned));
14413
14414
    Py_ssize_t total_length = 0;
14415
#endif
14416
0
    Py_ssize_t pos = 0;
14417
0
    PyObject *s, *ignored_value;
14418
0
    while (PyDict_Next(interned, &pos, &s, &ignored_value)) {
14419
0
        int shared = 0;
14420
0
        switch (PyUnicode_CHECK_INTERNED(s)) {
14421
0
        case SSTATE_INTERNED_IMMORTAL:
14422
            /* Make immortal interned strings mortal again. */
14423
            // Skip the Immortal Instance check and restore
14424
            // the two references (key and value) ignored
14425
            // by PyUnicode_InternInPlace().
14426
0
            _Py_SetMortal(s, 2);
14427
#ifdef Py_REF_DEBUG
14428
            /* let's be pedantic with the ref total */
14429
            _Py_IncRefTotal(_PyThreadState_GET());
14430
            _Py_IncRefTotal(_PyThreadState_GET());
14431
#endif
14432
#ifdef INTERNED_STATS
14433
            total_length += PyUnicode_GET_LENGTH(s);
14434
#endif
14435
0
            break;
14436
0
        case SSTATE_INTERNED_IMMORTAL_STATIC:
14437
            /* It is shared between interpreters, so we should unmark it
14438
               only when this is the last interpreter in which it's
14439
               interned.  We immortalize all the statically initialized
14440
               strings during startup, so we can rely on the
14441
               main interpreter to be the last one. */
14442
0
            if (!_Py_IsMainInterpreter(interp)) {
14443
0
                shared = 1;
14444
0
            }
14445
0
            break;
14446
0
        case SSTATE_INTERNED_MORTAL:
14447
            // Restore 2 references held by the interned dict; these will
14448
            // be decref'd by clear_interned_dict's PyDict_Clear.
14449
0
            _Py_RefcntAdd(s, 2);
14450
#ifdef Py_REF_DEBUG
14451
            /* let's be pedantic with the ref total */
14452
            _Py_IncRefTotal(_PyThreadState_GET());
14453
            _Py_IncRefTotal(_PyThreadState_GET());
14454
#endif
14455
0
            break;
14456
0
        case SSTATE_NOT_INTERNED:
14457
0
            _Py_FALLTHROUGH;
14458
0
        default:
14459
0
            Py_UNREACHABLE();
14460
0
        }
14461
0
        if (!shared) {
14462
0
            FT_ATOMIC_STORE_UINT8_RELAXED(_PyUnicode_STATE(s).interned, SSTATE_NOT_INTERNED);
14463
0
        }
14464
0
    }
14465
#ifdef INTERNED_STATS
14466
    fprintf(stderr,
14467
            "total length of all interned strings: %zd characters\n",
14468
            total_length);
14469
#endif
14470
14471
0
    struct _Py_unicode_state *state = &interp->unicode;
14472
0
    struct _Py_unicode_ids *ids = &state->ids;
14473
0
    for (Py_ssize_t i=0; i < ids->size; i++) {
14474
0
        Py_XINCREF(ids->array[i]);
14475
0
    }
14476
0
    clear_interned_dict(interp);
14477
0
    if (_Py_IsMainInterpreter(interp)) {
14478
0
        clear_global_interned_strings();
14479
0
    }
14480
0
}
14481
14482
14483
/********************* Unicode Iterator **************************/
14484
14485
/* Instance layout shared by the str iterator types: the string being
   iterated (it_seq) and the index of the next character to yield
   (it_index). */
typedef struct {
14486
    PyObject_HEAD
14487
    Py_ssize_t it_index;
14488
    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
14489
} unicodeiterobject;
14490
14491
static void
14492
unicodeiter_dealloc(PyObject *op)
14493
1.68M
{
14494
1.68M
    unicodeiterobject *it = (unicodeiterobject *)op;
14495
1.68M
    _PyObject_GC_UNTRACK(it);
14496
1.68M
    Py_XDECREF(it->it_seq);
14497
1.68M
    PyObject_GC_Del(it);
14498
1.68M
}
14499
14500
static int
14501
unicodeiter_traverse(PyObject *op, visitproc visit, void *arg)
14502
1
{
14503
1
    unicodeiterobject *it = (unicodeiterobject *)op;
14504
1
    Py_VISIT(it->it_seq);
14505
1
    return 0;
14506
1
}
14507
14508
/* tp_iternext for str_iterator.
 *
 * Returns the next character as a str built by unicode_char(), or NULL
 * without setting an exception once the iterator is exhausted.  On
 * exhaustion the reference to the string is released and it_seq is set to
 * NULL, so later calls return NULL immediately. */
static PyObject *
unicodeiter_next(PyObject *op)
{
    unicodeiterobject *it = (unicodeiterobject *)op;
    assert(it != NULL);

    PyObject *str = it->it_seq;
    if (str == NULL) {
        return NULL;
    }
    assert(_PyUnicode_CHECK(str));

    if (it->it_index >= PyUnicode_GET_LENGTH(str)) {
        /* Exhausted: drop the string. */
        it->it_seq = NULL;
        Py_DECREF(str);
        return NULL;
    }

    const int kind = PyUnicode_KIND(str);
    const void *data = PyUnicode_DATA(str);
    const Py_UCS4 ch = PyUnicode_READ(kind, data, it->it_index);
    it->it_index++;
    return unicode_char(ch);
}
14532
14533
/* tp_iternext for str_ascii_iterator (compact ASCII strings only).
 *
 * Returns the next character as the matching entry of the static
 * single-character ASCII string table, or NULL without setting an exception
 * once exhausted.  On exhaustion the string reference is released and
 * it_seq is set to NULL. */
static PyObject *
unicode_ascii_iter_next(PyObject *op)
{
    unicodeiterobject *it = (unicodeiterobject *)op;
    assert(it != NULL);

    PyObject *str = it->it_seq;
    if (str == NULL) {
        return NULL;
    }
    assert(_PyUnicode_CHECK(str));
    assert(PyUnicode_IS_COMPACT_ASCII(str));

    if (it->it_index >= PyUnicode_GET_LENGTH(str)) {
        /* Exhausted: drop the string. */
        it->it_seq = NULL;
        Py_DECREF(str);
        return NULL;
    }

    /* The characters are read from directly after the PyASCIIObject
       header. */
    const void *data = ((void*)(_PyASCIIObject_CAST(str) + 1));
    const Py_UCS1 ch = (Py_UCS1)PyUnicode_READ(PyUnicode_1BYTE_KIND,
                                               data, it->it_index);
    it->it_index++;
    return (PyObject*)&_Py_SINGLETON(strings).ascii[ch];
}
14555
14556
/* __length_hint__: number of characters still to be yielded, or 0 once the
   iterator is exhausted.  Returned as a Python int. */
static PyObject *
unicodeiter_len(PyObject *op, PyObject *Py_UNUSED(ignored))
{
    unicodeiterobject *it = (unicodeiterobject *)op;
    const Py_ssize_t remaining = (it->it_seq != NULL)
        ? PyUnicode_GET_LENGTH(it->it_seq) - it->it_index
        : 0;
    return PyLong_FromSsize_t(remaining);
}
14565
14566
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14567
14568
/* __reduce__: pickling support.
 *
 * For a live iterator the result is (iter, (string,), index); for an
 * exhausted one it is (iter, (empty_string,)), where `iter` is the builtin
 * of that name. */
static PyObject *
unicodeiter_reduce(PyObject *op, PyObject *Py_UNUSED(ignored))
{
    unicodeiterobject *it = (unicodeiterobject *)op;
    PyObject *iter_builtin = _PyEval_GetBuiltin(&_Py_ID(iter));

    /* _PyEval_GetBuiltin can invoke arbitrary code,
     * call must be before access of iterator pointers.
     * see issue #101765 */

    if (it->it_seq == NULL) {
        PyObject *empty = _PyUnicode_GetEmpty();
        if (empty == NULL) {
            Py_XDECREF(iter_builtin);
            return NULL;
        }
        return Py_BuildValue("N(N)", iter_builtin, empty);
    }
    return Py_BuildValue("N(O)n", iter_builtin, it->it_seq, it->it_index);
}
14589
14590
PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14591
14592
/* __setstate__: unpickling support.
 *
 * `state` is converted to a Py_ssize_t index (NULL is returned if the
 * conversion fails).  For a live iterator the index is clamped to
 * [0, len(string)] and stored; an exhausted iterator is left unchanged.
 * Returns None. */
static PyObject *
unicodeiter_setstate(PyObject *op, PyObject *state)
{
    unicodeiterobject *it = (unicodeiterobject *)op;

    Py_ssize_t new_index = PyLong_AsSsize_t(state);
    if (new_index == -1 && PyErr_Occurred()) {
        return NULL;
    }

    if (it->it_seq == NULL) {
        Py_RETURN_NONE;
    }

    if (new_index < 0) {
        new_index = 0;
    }
    else if (new_index > PyUnicode_GET_LENGTH(it->it_seq)) {
        /* iterator truncated */
        new_index = PyUnicode_GET_LENGTH(it->it_seq);
    }
    it->it_index = new_index;
    Py_RETURN_NONE;
}
14608
14609
PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14610
14611
static PyMethodDef unicodeiter_methods[] = {
14612
    {"__length_hint__", unicodeiter_len, METH_NOARGS, length_hint_doc},
14613
    {"__reduce__",      unicodeiter_reduce, METH_NOARGS, reduce_doc},
14614
    {"__setstate__",    unicodeiter_setstate, METH_O, setstate_doc},
14615
    {NULL,      NULL}       /* sentinel */
14616
};
14617
14618
PyTypeObject PyUnicodeIter_Type = {
14619
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
14620
    "str_iterator",         /* tp_name */
14621
    sizeof(unicodeiterobject),      /* tp_basicsize */
14622
    0,                  /* tp_itemsize */
14623
    /* methods */
14624
    unicodeiter_dealloc,/* tp_dealloc */
14625
    0,                  /* tp_vectorcall_offset */
14626
    0,                  /* tp_getattr */
14627
    0,                  /* tp_setattr */
14628
    0,                  /* tp_as_async */
14629
    0,                  /* tp_repr */
14630
    0,                  /* tp_as_number */
14631
    0,                  /* tp_as_sequence */
14632
    0,                  /* tp_as_mapping */
14633
    0,                  /* tp_hash */
14634
    0,                  /* tp_call */
14635
    0,                  /* tp_str */
14636
    PyObject_GenericGetAttr,        /* tp_getattro */
14637
    0,                  /* tp_setattro */
14638
    0,                  /* tp_as_buffer */
14639
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14640
    0,                  /* tp_doc */
14641
    unicodeiter_traverse, /* tp_traverse */
14642
    0,                  /* tp_clear */
14643
    0,                  /* tp_richcompare */
14644
    0,                  /* tp_weaklistoffset */
14645
    PyObject_SelfIter,          /* tp_iter */
14646
    unicodeiter_next,   /* tp_iternext */
14647
    unicodeiter_methods,            /* tp_methods */
14648
    0,
14649
};
14650
14651
/* The `str_ascii_iterator` type.  It uses the same instance layout,
   dealloc, traverse and method table as PyUnicodeIter_Type; only
   tp_iternext differs (unicode_ascii_iter_next). */
PyTypeObject _PyUnicodeASCIIIter_Type = {
14652
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
14653
    .tp_name = "str_ascii_iterator",
14654
    .tp_basicsize = sizeof(unicodeiterobject),
14655
    .tp_dealloc = unicodeiter_dealloc,
14656
    .tp_getattro = PyObject_GenericGetAttr,
14657
    .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,
14658
    .tp_traverse = unicodeiter_traverse,
14659
    .tp_iter = PyObject_SelfIter,
14660
    .tp_iternext = unicode_ascii_iter_next,
14661
    .tp_methods = unicodeiter_methods,
14662
};
14663
14664
/* tp_iter for str: create an iterator positioned at index 0 over `seq`.
 *
 * Compact ASCII strings get a str_ascii_iterator, every other string a
 * str_iterator.  The iterator holds a new reference to `seq`.  Returns NULL
 * if `seq` is not a str (bad internal call) or if the allocation fails. */
static PyObject *
unicode_iter(PyObject *seq)
{
    if (!PyUnicode_Check(seq)) {
        PyErr_BadInternalCall();
        return NULL;
    }

    PyTypeObject *iter_type = PyUnicode_IS_COMPACT_ASCII(seq)
        ? &_PyUnicodeASCIIIter_Type
        : &PyUnicodeIter_Type;

    unicodeiterobject *it = PyObject_GC_New(unicodeiterobject, iter_type);
    if (it == NULL) {
        return NULL;
    }

    it->it_index = 0;
    it->it_seq = Py_NewRef(seq);
    _PyObject_GC_TRACK(it);
    return (PyObject *)it;
}
14686
14687
static int
14688
encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
14689
88
{
14690
88
    int res;
14691
88
    res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
14692
88
    if (res == -2) {
14693
0
        PyErr_Format(PyExc_RuntimeError, "cannot encode %s", name);
14694
0
        return -1;
14695
0
    }
14696
88
    if (res < 0) {
14697
0
        PyErr_NoMemory();
14698
0
        return -1;
14699
0
    }
14700
88
    return 0;
14701
88
}
14702
14703
14704
static int
14705
config_get_codec_name(wchar_t **config_encoding)
14706
44
{
14707
44
    char *encoding;
14708
44
    if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
14709
0
        return -1;
14710
0
    }
14711
14712
44
    PyObject *name_obj = NULL;
14713
44
    PyObject *codec = _PyCodec_Lookup(encoding);
14714
44
    PyMem_RawFree(encoding);
14715
14716
44
    if (!codec)
14717
0
        goto error;
14718
14719
44
    name_obj = PyObject_GetAttrString(codec, "name");
14720
44
    Py_CLEAR(codec);
14721
44
    if (!name_obj) {
14722
0
        goto error;
14723
0
    }
14724
14725
44
    wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
14726
44
    Py_DECREF(name_obj);
14727
44
    if (wname == NULL) {
14728
0
        goto error;
14729
0
    }
14730
14731
44
    wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
14732
44
    if (raw_wname == NULL) {
14733
0
        PyMem_Free(wname);
14734
0
        PyErr_NoMemory();
14735
0
        goto error;
14736
0
    }
14737
14738
44
    PyMem_RawFree(*config_encoding);
14739
44
    *config_encoding = raw_wname;
14740
14741
44
    PyMem_Free(wname);
14742
44
    return 0;
14743
14744
0
error:
14745
0
    Py_XDECREF(codec);
14746
0
    Py_XDECREF(name_obj);
14747
0
    return -1;
14748
44
}
14749
14750
14751
static PyStatus
14752
init_stdio_encoding(PyInterpreterState *interp)
14753
22
{
14754
    /* Update the stdio encoding to the normalized Python codec name. */
14755
22
    PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
14756
22
    if (config_get_codec_name(&config->stdio_encoding) < 0) {
14757
0
        return _PyStatus_ERR("failed to get the Python codec name "
14758
0
                             "of the stdio encoding");
14759
0
    }
14760
22
    return _PyStatus_OK();
14761
22
}
14762
14763
14764
static int
14765
init_fs_codec(PyInterpreterState *interp)
14766
22
{
14767
22
    const PyConfig *config = _PyInterpreterState_GetConfig(interp);
14768
14769
22
    _Py_error_handler error_handler;
14770
22
    error_handler = get_error_handler_wide(config->filesystem_errors);
14771
22
    if (error_handler == _Py_ERROR_UNKNOWN) {
14772
0
        PyErr_SetString(PyExc_RuntimeError, "unknown filesystem error handler");
14773
0
        return -1;
14774
0
    }
14775
14776
22
    char *encoding, *errors;
14777
22
    if (encode_wstr_utf8(config->filesystem_encoding,
14778
22
                         &encoding,
14779
22
                         "filesystem_encoding") < 0) {
14780
0
        return -1;
14781
0
    }
14782
14783
22
    if (encode_wstr_utf8(config->filesystem_errors,
14784
22
                         &errors,
14785
22
                         "filesystem_errors") < 0) {
14786
0
        PyMem_RawFree(encoding);
14787
0
        return -1;
14788
0
    }
14789
14790
22
    struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
14791
22
    PyMem_RawFree(fs_codec->encoding);
14792
22
    fs_codec->encoding = encoding;
14793
    /* encoding has been normalized by init_fs_encoding() */
14794
22
    fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
14795
22
    PyMem_RawFree(fs_codec->errors);
14796
22
    fs_codec->errors = errors;
14797
22
    fs_codec->error_handler = error_handler;
14798
14799
#ifdef _Py_FORCE_UTF8_FS_ENCODING
14800
    assert(fs_codec->utf8 == 1);
14801
#endif
14802
14803
    /* At this point, PyUnicode_EncodeFSDefault() and
14804
       PyUnicode_DecodeFSDefault() can now use the Python codec rather than
14805
       the C implementation of the filesystem encoding. */
14806
14807
    /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
14808
       global configuration variables. */
14809
22
    if (_Py_IsMainInterpreter(interp)) {
14810
14811
22
        if (_Py_SetFileSystemEncoding(fs_codec->encoding,
14812
22
                                      fs_codec->errors) < 0) {
14813
0
            PyErr_NoMemory();
14814
0
            return -1;
14815
0
        }
14816
22
    }
14817
22
    return 0;
14818
22
}
14819
14820
14821
static PyStatus
14822
init_fs_encoding(PyThreadState *tstate)
14823
22
{
14824
22
    PyInterpreterState *interp = tstate->interp;
14825
14826
    /* Update the filesystem encoding to the normalized Python codec name.
14827
       For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
14828
       (Python codec name). */
14829
22
    PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
14830
22
    if (config_get_codec_name(&config->filesystem_encoding) < 0) {
14831
0
        _Py_DumpPathConfig(tstate);
14832
0
        return _PyStatus_ERR("failed to get the Python codec "
14833
0
                             "of the filesystem encoding");
14834
0
    }
14835
14836
22
    if (init_fs_codec(interp) < 0) {
14837
0
        return _PyStatus_ERR("cannot initialize filesystem codec");
14838
0
    }
14839
22
    return _PyStatus_OK();
14840
22
}
14841
14842
14843
PyStatus
14844
_PyUnicode_InitEncodings(PyThreadState *tstate)
14845
22
{
14846
22
    PyStatus status = _PyCodec_InitRegistry(tstate->interp);
14847
22
    if (_PyStatus_EXCEPTION(status)) {
14848
0
        return status;
14849
0
    }
14850
22
    status = init_fs_encoding(tstate);
14851
22
    if (_PyStatus_EXCEPTION(status)) {
14852
0
        return status;
14853
0
    }
14854
14855
22
    return init_stdio_encoding(tstate->interp);
14856
22
}
14857
14858
14859
static void
14860
_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
14861
0
{
14862
0
    PyMem_RawFree(fs_codec->encoding);
14863
0
    fs_codec->encoding = NULL;
14864
0
    fs_codec->utf8 = 0;
14865
0
    PyMem_RawFree(fs_codec->errors);
14866
0
    fs_codec->errors = NULL;
14867
0
    fs_codec->error_handler = _Py_ERROR_UNKNOWN;
14868
0
}
14869
14870
14871
#ifdef MS_WINDOWS
14872
int
14873
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
14874
{
14875
    PyInterpreterState *interp = _PyInterpreterState_GET();
14876
    PyConfig *config = (PyConfig *)_PyInterpreterState_GetConfig(interp);
14877
14878
    /* Set the filesystem encoding to mbcs/replace (PEP 529) */
14879
    wchar_t *encoding = _PyMem_RawWcsdup(L"mbcs");
14880
    wchar_t *errors = _PyMem_RawWcsdup(L"replace");
14881
    if (encoding == NULL || errors == NULL) {
14882
        PyMem_RawFree(encoding);
14883
        PyMem_RawFree(errors);
14884
        PyErr_NoMemory();
14885
        return -1;
14886
    }
14887
14888
    PyMem_RawFree(config->filesystem_encoding);
14889
    config->filesystem_encoding = encoding;
14890
    PyMem_RawFree(config->filesystem_errors);
14891
    config->filesystem_errors = errors;
14892
14893
    return init_fs_codec(interp);
14894
}
14895
#endif
14896
14897
14898
#ifdef Py_DEBUG
14899
static inline int
14900
unicode_is_finalizing(void)
14901
{
14902
    return (get_interned_dict(_PyInterpreterState_Main()) == NULL);
14903
}
14904
#endif
14905
14906
14907
void
14908
_PyUnicode_FiniTypes(PyInterpreterState *interp)
14909
0
{
14910
0
    _PyStaticType_FiniBuiltin(interp, &EncodingMapType);
14911
0
    _PyStaticType_FiniBuiltin(interp, &PyFieldNameIter_Type);
14912
0
    _PyStaticType_FiniBuiltin(interp, &PyFormatterIter_Type);
14913
0
}
14914
14915
14916
void
14917
_PyUnicode_Fini(PyInterpreterState *interp)
14918
0
{
14919
0
    struct _Py_unicode_state *state = &interp->unicode;
14920
14921
0
    if (!has_shared_intern_dict(interp)) {
14922
        // _PyUnicode_ClearInterned() must be called before _PyUnicode_Fini()
14923
0
        assert(get_interned_dict(interp) == NULL);
14924
0
    }
14925
14926
0
    _PyUnicode_FiniEncodings(&state->fs_codec);
14927
14928
    // bpo-47182: force a unicodedata CAPI capsule re-import on
14929
    // subsequent initialization of interpreter.
14930
0
    interp->unicode.ucnhash_capi = NULL;
14931
14932
0
    unicode_clear_identifiers(state);
14933
0
}
14934
14935
/* A _string module, to export formatter_parser and formatter_field_name_split
14936
   to the string.Formatter class implemented in Python. */
14937
14938
static PyMethodDef _string_methods[] = {
14939
    {"formatter_field_name_split", formatter_field_name_split,
14940
     METH_O, PyDoc_STR("split the argument as a field name")},
14941
    {"formatter_parser", formatter_parser,
14942
     METH_O, PyDoc_STR("parse the argument as a format string")},
14943
    {NULL, NULL}
14944
};
14945
14946
static PyModuleDef_Slot module_slots[] = {
14947
    {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
14948
    {Py_mod_gil, Py_MOD_GIL_NOT_USED},
14949
    {0, NULL}
14950
};
14951
14952
static struct PyModuleDef _string_module = {
14953
    PyModuleDef_HEAD_INIT,
14954
    .m_name = "_string",
14955
    .m_doc = PyDoc_STR("string helper module"),
14956
    .m_size = 0,
14957
    .m_methods = _string_methods,
14958
    .m_slots = module_slots,
14959
};
14960
14961
PyMODINIT_FUNC
14962
PyInit__string(void)
14963
6
{
14964
6
    return PyModuleDef_Init(&_string_module);
14965
6
}
14966
14967
14968
#undef PyUnicode_KIND
14969
int PyUnicode_KIND(PyObject *op)
14970
0
{
14971
0
    if (!PyUnicode_Check(op)) {
14972
0
        PyErr_Format(PyExc_TypeError, "expect str, got %T", op);
14973
0
        return -1;
14974
0
    }
14975
0
    return _PyASCIIObject_CAST(op)->state.kind;
14976
0
}
14977
14978
#undef PyUnicode_DATA
14979
void* PyUnicode_DATA(PyObject *op)
14980
0
{
14981
0
    if (!PyUnicode_Check(op)) {
14982
0
        PyErr_Format(PyExc_TypeError, "expect str, got %T", op);
14983
0
        return NULL;
14984
0
    }
14985
0
    return _PyUnicode_DATA(op);
14986
0
}