Coverage Report

Created: 2025-11-02 06:30

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/cpython/Objects/unicodeobject.c
Line
Count
Source
1
/*
2
3
Unicode implementation based on original code by Fredrik Lundh,
4
modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6
Major speed upgrades to the method implementations at the Reykjavik
7
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9
Copyright (c) Corporation for National Research Initiatives.
10
11
--------------------------------------------------------------------
12
The original string type implementation is:
13
14
  Copyright (c) 1999 by Secret Labs AB
15
  Copyright (c) 1999 by Fredrik Lundh
16
17
By obtaining, using, and/or copying this software and/or its
18
associated documentation, you agree that you have read, understood,
19
and will comply with the following terms and conditions:
20
21
Permission to use, copy, modify, and distribute this software and its
22
associated documentation for any purpose and without fee is hereby
23
granted, provided that the above copyright notice appears in all
24
copies, and that both that copyright notice and this permission notice
25
appear in supporting documentation, and that the name of Secret Labs
26
AB or the author not be used in advertising or publicity pertaining to
27
distribution of the software without specific, written prior
28
permission.
29
30
SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32
FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37
--------------------------------------------------------------------
38
39
*/
40
41
#include "Python.h"
42
#include "pycore_abstract.h"      // _PyIndex_Check()
43
#include "pycore_bytes_methods.h" // _Py_bytes_lower()
44
#include "pycore_bytesobject.h"   // _PyBytes_Repeat()
45
#include "pycore_ceval.h"         // _PyEval_GetBuiltin()
46
#include "pycore_codecs.h"        // _PyCodec_Lookup()
47
#include "pycore_critical_section.h" // Py_*_CRITICAL_SECTION_SEQUENCE_FAST
48
#include "pycore_format.h"        // F_LJUST
49
#include "pycore_initconfig.h"    // _PyStatus_OK()
50
#include "pycore_interp.h"        // PyInterpreterState.fs_codec
51
#include "pycore_long.h"          // _PyLong_FormatWriter()
52
#include "pycore_object.h"        // _PyObject_GC_TRACK(), _Py_FatalRefcountError()
53
#include "pycore_pathconfig.h"    // _Py_DumpPathConfig()
54
#include "pycore_pyerrors.h"      // _PyUnicodeTranslateError_Create()
55
#include "pycore_pyhash.h"        // _Py_HashSecret_t
56
#include "pycore_pylifecycle.h"   // _Py_SetFileSystemEncoding()
57
#include "pycore_pystate.h"       // _PyInterpreterState_GET()
58
#include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
59
#include "pycore_unicodectype.h"  // _PyUnicode_IsXidStart
60
#include "pycore_unicodeobject.h" // struct _Py_unicode_state
61
#include "pycore_unicodeobject_generated.h"  // _PyUnicode_InitStaticStrings()
62
63
#include "stringlib/eq.h"         // unicode_eq()
64
#include <stddef.h>               // ptrdiff_t
65
66
#ifdef MS_WINDOWS
67
#include <windows.h>
68
#endif
69
70
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
71
#  include "pycore_fileutils.h"   // _Py_LocaleUsesNonUnicodeWchar()
72
#endif
73
74
/* Uncomment to display statistics on interned strings at exit
75
   in _PyUnicode_ClearInterned(). */
76
/* #define INTERNED_STATS 1 */
77
78
79
/*[clinic input]
80
class str "PyObject *" "&PyUnicode_Type"
81
[clinic start generated code]*/
82
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
83
84
/*[python input]
85
class Py_UCS4_converter(CConverter):
86
    type = 'Py_UCS4'
87
    converter = 'convert_uc'
88
89
    def converter_init(self):
90
        if self.default is not unspecified:
91
            self.c_default = ascii(self.default)
92
            if len(self.c_default) > 4 or self.c_default[0] != "'":
93
                self.c_default = hex(ord(self.default))
94
95
[python start generated code]*/
96
/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
97
98
/* --- Globals ------------------------------------------------------------
99
100
NOTE: In the interpreter's initialization phase, some globals are currently
101
      initialized dynamically as needed. In the process Unicode objects may
102
      be created before the Unicode type is ready.
103
104
*/
105
106
11.1M
#define MAX_UNICODE _Py_MAX_UNICODE
107
143M
#define ensure_unicode _PyUnicode_EnsureUnicode
108
109
#ifdef Py_DEBUG
110
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
111
#else
112
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
113
#endif
114
115
/* Read the `utf8` pointer stored in the compact-unicode header of `op`
   through FT_ATOMIC_LOAD_PTR_ACQUIRE.  The pointer is returned as-is. */
static inline char *
_PyUnicode_UTF8(PyObject *op)
{
    PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
    return FT_ATOMIC_LOAD_PTR_ACQUIRE(compact->utf8);
}
119
120
/* Return the UTF-8 buffer associated with `op`.

   For compact ASCII strings the buffer is the memory that immediately
   follows the PyASCIIObject header; for every other string it is whatever
   _PyUnicode_UTF8() reports. */
static inline char *
PyUnicode_UTF8(PyObject *op)
{
    assert(_PyUnicode_CHECK(op));
    if (!PyUnicode_IS_COMPACT_ASCII(op)) {
        return _PyUnicode_UTF8(op);
    }
    return (char *)(_PyASCIIObject_CAST(op) + 1);
}
130
131
/* Store `utf8` into the `utf8` field of the compact-unicode header of `op`
   through FT_ATOMIC_STORE_PTR_RELEASE. */
static inline void
PyUnicode_SET_UTF8(PyObject *op, char *utf8)
{
    PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
    FT_ATOMIC_STORE_PTR_RELEASE(compact->utf8, utf8);
}
135
136
/* Return the UTF-8 length recorded for `op`: the `length` field for compact
   ASCII strings, the `utf8_length` field of the compact header otherwise. */
static inline Py_ssize_t
PyUnicode_UTF8_LENGTH(PyObject *op)
{
    assert(_PyUnicode_CHECK(op));
    if (!PyUnicode_IS_COMPACT_ASCII(op)) {
        return _PyCompactUnicodeObject_CAST(op)->utf8_length;
    }
    return _PyASCIIObject_CAST(op)->length;
}
146
147
/* Overwrite the `utf8_length` field of the compact-unicode header of `op`. */
static inline void
PyUnicode_SET_UTF8_LENGTH(PyObject *op, Py_ssize_t length)
{
    PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
    compact->utf8_length = length;
}
151
152
/* Field accessors on the PyASCIIObject header of `op`.  Each expands to the
   struct member itself, so the result can be read or assigned. */
#define _PyUnicode_LENGTH(op) (_PyASCIIObject_CAST(op)->length)
#define _PyUnicode_STATE(op) (_PyASCIIObject_CAST(op)->state)
#define _PyUnicode_HASH(op) (_PyASCIIObject_CAST(op)->hash)
158
159
106M
#define PyUnicode_HASH PyUnstable_Unicode_GET_CACHED_HASH
160
161
/* Store `hash` into the `hash` field of the PyASCIIObject header of `op`
   through FT_ATOMIC_STORE_SSIZE_RELAXED. */
static inline void
PyUnicode_SET_HASH(PyObject *op, Py_hash_t hash)
{
    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
    FT_ATOMIC_STORE_SSIZE_RELAXED(ascii->hash, hash);
}
165
166
#define _PyUnicode_DATA_ANY(op)                         \
167
42.2M
    (_PyUnicodeObject_CAST(op)->data.any)
168
169
/* Return non-zero when the UTF-8 pointer of `op` is the very same address as
   its character data.  Must not be called on compact ASCII strings (this is
   asserted). */
static inline int
_PyUnicode_SHARE_UTF8(PyObject *op)
{
    assert(_PyUnicode_CHECK(op));
    assert(!PyUnicode_IS_COMPACT_ASCII(op));

    const void *data = PyUnicode_DATA(op);
    return _PyUnicode_UTF8(op) == data;
}
175
176
/* true if the Unicode object has an allocated UTF-8 memory block
177
   (not shared with other data) */
178
static inline int _PyUnicode_HAS_UTF8_MEMORY(PyObject *op)
179
526M
{
180
526M
    return (!PyUnicode_IS_COMPACT_ASCII(op)
181
169M
            && _PyUnicode_UTF8(op) != NULL
182
9.48M
            && _PyUnicode_UTF8(op) != PyUnicode_DATA(op));
183
526M
}
184
185
186
214M
#define LATIN1 _Py_LATIN1_CHR
187
188
/* Forward declaration */
189
static PyObject *
190
unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
191
                    const char *errors);
192
static PyObject *
193
unicode_decode_utf8(const char *s, Py_ssize_t size,
194
                    _Py_error_handler error_handler, const char *errors,
195
                    Py_ssize_t *consumed);
196
#ifdef Py_DEBUG
197
static inline int unicode_is_finalizing(void);
198
static int unicode_is_singleton(PyObject *unicode);
199
#endif
200
201
202
// Return a reference to the immortal empty string singleton.
203
PyObject*
204
_PyUnicode_GetEmpty(void)
205
117M
{
206
117M
    _Py_DECLARE_STR(empty, "");
207
117M
    return &_Py_STR(empty);
208
117M
}
209
210
/* This dictionary holds per-interpreter interned strings.
211
 * See InternalDocs/string_interning.md for details.
212
 */
213
static inline PyObject *get_interned_dict(PyInterpreterState *interp)
214
3.25M
{
215
3.25M
    return _Py_INTERP_CACHED_OBJECT(interp, interned_strings);
216
3.25M
}
217
218
/* This hashtable holds statically allocated interned strings.
219
 * See InternalDocs/string_interning.md for details.
220
 */
221
3.02M
#define INTERNED_STRINGS _PyRuntime.cached_objects.interned_strings
222
223
/* Get number of all interned strings for the current interpreter. */
224
Py_ssize_t
225
_PyUnicode_InternedSize(void)
226
0
{
227
0
    PyObject *dict = get_interned_dict(_PyInterpreterState_GET());
228
0
    return _Py_hashtable_len(INTERNED_STRINGS) + PyDict_GET_SIZE(dict);
229
0
}
230
231
/* Get number of immortal interned strings for the current interpreter. */
232
Py_ssize_t
233
_PyUnicode_InternedSize_Immortal(void)
234
0
{
235
0
    PyObject *dict = get_interned_dict(_PyInterpreterState_GET());
236
0
    PyObject *key, *value;
237
0
    Py_ssize_t pos = 0;
238
0
    Py_ssize_t count = 0;
239
240
    // It's tempting to keep a count and avoid a loop here. But, this function
241
    // is intended for refleak tests. It spends extra work to report the true
242
    // value, to help detect bugs in optimizations.
243
244
0
    while (PyDict_Next(dict, &pos, &key, &value)) {
245
0
        assert(PyUnicode_CHECK_INTERNED(key) != SSTATE_INTERNED_IMMORTAL_STATIC);
246
0
        if (PyUnicode_CHECK_INTERNED(key) == SSTATE_INTERNED_IMMORTAL) {
247
0
           count++;
248
0
       }
249
0
    }
250
0
    return _Py_hashtable_len(INTERNED_STRINGS) + count;
251
0
}
252
253
static Py_hash_t unicode_hash(PyObject *);
254
255
static Py_uhash_t
256
hashtable_unicode_hash(const void *key)
257
3.02M
{
258
3.02M
    return unicode_hash((PyObject *)key);
259
3.02M
}
260
261
static int
262
hashtable_unicode_compare(const void *key1, const void *key2)
263
290k
{
264
290k
    PyObject *obj1 = (PyObject *)key1;
265
290k
    PyObject *obj2 = (PyObject *)key2;
266
290k
    if (obj1 != NULL && obj2 != NULL) {
267
290k
        return unicode_eq(obj1, obj2);
268
290k
    }
269
0
    else {
270
0
        return obj1 == obj2;
271
0
    }
272
290k
}
273
274
/* Return true if this interpreter should share the main interpreter's
275
   intern_dict.  That's important for interpreters which load basic
276
   single-phase init extension modules (m_size == -1).  There could be interned
277
   immortal strings that are shared between interpreters, due to the
278
   PyDict_Update(mdict, m_copy) call in import_find_extension().
279
280
   It's not safe to deallocate those strings until all interpreters that
281
   potentially use them are freed.  By storing them in the main interpreter, we
282
   ensure they get freed after all other interpreters are freed.
283
*/
284
static bool
285
has_shared_intern_dict(PyInterpreterState *interp)
286
16
{
287
16
    PyInterpreterState *main_interp = _PyInterpreterState_Main();
288
16
    return interp != main_interp  && interp->feature_flags & Py_RTFLAGS_USE_MAIN_OBMALLOC;
289
16
}
290
291
static int
292
init_interned_dict(PyInterpreterState *interp)
293
16
{
294
16
    assert(get_interned_dict(interp) == NULL);
295
16
    PyObject *interned;
296
16
    if (has_shared_intern_dict(interp)) {
297
0
        interned = get_interned_dict(_PyInterpreterState_Main());
298
0
        Py_INCREF(interned);
299
0
    }
300
16
    else {
301
16
        interned = PyDict_New();
302
16
        if (interned == NULL) {
303
0
            return -1;
304
0
        }
305
16
    }
306
16
    _Py_INTERP_CACHED_OBJECT(interp, interned_strings) = interned;
307
16
    return 0;
308
16
}
309
310
static void
311
clear_interned_dict(PyInterpreterState *interp)
312
0
{
313
0
    PyObject *interned = get_interned_dict(interp);
314
0
    if (interned != NULL) {
315
0
        if (!has_shared_intern_dict(interp)) {
316
            // only clear if the dict belongs to this interpreter
317
0
            PyDict_Clear(interned);
318
0
        }
319
0
        Py_DECREF(interned);
320
0
        _Py_INTERP_CACHED_OBJECT(interp, interned_strings) = NULL;
321
0
    }
322
0
}
323
324
static PyStatus
325
init_global_interned_strings(PyInterpreterState *interp)
326
16
{
327
16
    assert(INTERNED_STRINGS == NULL);
328
16
    _Py_hashtable_allocator_t hashtable_alloc = {PyMem_RawMalloc, PyMem_RawFree};
329
330
16
    INTERNED_STRINGS = _Py_hashtable_new_full(
331
16
        hashtable_unicode_hash,
332
16
        hashtable_unicode_compare,
333
        // Objects stored here are immortal and statically allocated,
334
        // so we don't need key_destroy_func & value_destroy_func:
335
16
        NULL,
336
16
        NULL,
337
16
        &hashtable_alloc
338
16
    );
339
16
    if (INTERNED_STRINGS == NULL) {
340
0
        PyErr_Clear();
341
0
        return _PyStatus_ERR("failed to create global interned dict");
342
0
    }
343
344
    /* Intern statically allocated string identifiers, deepfreeze strings,
345
        * and one-byte latin-1 strings.
346
        * This must be done before any module initialization so that statically
347
        * allocated string identifiers are used instead of heap allocated strings.
348
        * Deepfreeze uses the interned identifiers if present to save space
349
        * else generates them and they are interned to speed up dict lookups.
350
    */
351
16
    _PyUnicode_InitStaticStrings(interp);
352
353
4.11k
    for (int i = 0; i < 256; i++) {
354
4.09k
        PyObject *s = LATIN1(i);
355
4.09k
        _PyUnicode_InternStatic(interp, &s);
356
4.09k
        assert(s == LATIN1(i));
357
4.09k
    }
358
#ifdef Py_DEBUG
359
    assert(_PyUnicode_CheckConsistency(&_Py_STR(empty), 1));
360
361
    for (int i = 0; i < 256; i++) {
362
        assert(_PyUnicode_CheckConsistency(LATIN1(i), 1));
363
    }
364
#endif
365
16
    return _PyStatus_OK();
366
16
}
367
368
static void clear_global_interned_strings(void)
369
0
{
370
0
    if (INTERNED_STRINGS != NULL) {
371
0
        _Py_hashtable_destroy(INTERNED_STRINGS);
372
0
        INTERNED_STRINGS = NULL;
373
0
    }
374
0
}
375
376
#define _Py_RETURN_UNICODE_EMPTY()   \
377
44.2M
    do {                             \
378
44.2M
        return _PyUnicode_GetEmpty();\
379
44.2M
    } while (0)
380
381
382
/* Fast detection of the most frequent whitespace characters.
   128-entry table indexed by code point; entries not listed are 0. */
const unsigned char _Py_ascii_whitespace[128] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1,     /* SPACE */
};
412
413
/* forward */
414
static PyObject* get_latin1_char(unsigned char ch);
415
416
417
static PyObject *
418
_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
419
static PyObject *
420
_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
421
static PyObject *
422
_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
423
424
static PyObject *
425
unicode_encode_call_errorhandler(const char *errors,
426
       PyObject **errorHandler,const char *encoding, const char *reason,
427
       PyObject *unicode, PyObject **exceptionObject,
428
       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
429
430
static void
431
raise_encode_exception(PyObject **exceptionObject,
432
                       const char *encoding,
433
                       PyObject *unicode,
434
                       Py_ssize_t startpos, Py_ssize_t endpos,
435
                       const char *reason);
436
437
/* Fast detection of the most frequent line break characters.
   128-entry table indexed by code point; entries not listed are 0. */
static const unsigned char ascii_linebreak[128] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
};
464
465
static int convert_uc(PyObject *obj, void *addr);
466
467
struct encoding_map;
468
#include "clinic/unicodeobject.c.h"
469
470
_Py_error_handler
471
_Py_GetErrorHandler(const char *errors)
472
581k
{
473
581k
    if (errors == NULL || strcmp(errors, "strict") == 0) {
474
223k
        return _Py_ERROR_STRICT;
475
223k
    }
476
358k
    if (strcmp(errors, "surrogateescape") == 0) {
477
179k
        return _Py_ERROR_SURROGATEESCAPE;
478
179k
    }
479
179k
    if (strcmp(errors, "replace") == 0) {
480
179k
        return _Py_ERROR_REPLACE;
481
179k
    }
482
0
    if (strcmp(errors, "ignore") == 0) {
483
0
        return _Py_ERROR_IGNORE;
484
0
    }
485
0
    if (strcmp(errors, "backslashreplace") == 0) {
486
0
        return _Py_ERROR_BACKSLASHREPLACE;
487
0
    }
488
0
    if (strcmp(errors, "surrogatepass") == 0) {
489
0
        return _Py_ERROR_SURROGATEPASS;
490
0
    }
491
0
    if (strcmp(errors, "xmlcharrefreplace") == 0) {
492
0
        return _Py_ERROR_XMLCHARREFREPLACE;
493
0
    }
494
0
    return _Py_ERROR_OTHER;
495
0
}
496
497
498
static _Py_error_handler
499
get_error_handler_wide(const wchar_t *errors)
500
5.56k
{
501
5.56k
    if (errors == NULL || wcscmp(errors, L"strict") == 0) {
502
0
        return _Py_ERROR_STRICT;
503
0
    }
504
5.56k
    if (wcscmp(errors, L"surrogateescape") == 0) {
505
5.56k
        return _Py_ERROR_SURROGATEESCAPE;
506
5.56k
    }
507
0
    if (wcscmp(errors, L"replace") == 0) {
508
0
        return _Py_ERROR_REPLACE;
509
0
    }
510
0
    if (wcscmp(errors, L"ignore") == 0) {
511
0
        return _Py_ERROR_IGNORE;
512
0
    }
513
0
    if (wcscmp(errors, L"backslashreplace") == 0) {
514
0
        return _Py_ERROR_BACKSLASHREPLACE;
515
0
    }
516
0
    if (wcscmp(errors, L"surrogatepass") == 0) {
517
0
        return _Py_ERROR_SURROGATEPASS;
518
0
    }
519
0
    if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
520
0
        return _Py_ERROR_XMLCHARREFREPLACE;
521
0
    }
522
0
    return _Py_ERROR_OTHER;
523
0
}
524
525
526
static inline int
527
unicode_check_encoding_errors(const char *encoding, const char *errors)
528
22.3M
{
529
22.3M
    if (encoding == NULL && errors == NULL) {
530
11.9M
        return 0;
531
11.9M
    }
532
533
10.4M
    PyInterpreterState *interp = _PyInterpreterState_GET();
534
10.4M
#ifndef Py_DEBUG
535
    /* In release mode, only check in development mode (-X dev) */
536
10.4M
    if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
537
10.4M
        return 0;
538
10.4M
    }
539
#else
540
    /* Always check in debug mode */
541
#endif
542
543
    /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
544
       codec registry is ready: before_PyUnicode_InitEncodings() is called. */
545
0
    if (!interp->unicode.fs_codec.encoding) {
546
0
        return 0;
547
0
    }
548
549
    /* Disable checks during Python finalization. For example, it allows to
550
       call _PyObject_Dump() during finalization for debugging purpose. */
551
0
    if (_PyInterpreterState_GetFinalizing(interp) != NULL) {
552
0
        return 0;
553
0
    }
554
555
0
    if (encoding != NULL
556
        // Fast path for the most common built-in encodings. Even if the codec
557
        // is cached, _PyCodec_Lookup() decodes the bytes string from UTF-8 to
558
        // create a temporary Unicode string (the key in the cache).
559
0
        && strcmp(encoding, "utf-8") != 0
560
0
        && strcmp(encoding, "utf8") != 0
561
0
        && strcmp(encoding, "ascii") != 0)
562
0
    {
563
0
        PyObject *handler = _PyCodec_Lookup(encoding);
564
0
        if (handler == NULL) {
565
0
            return -1;
566
0
        }
567
0
        Py_DECREF(handler);
568
0
    }
569
570
0
    if (errors != NULL
571
        // Fast path for the most common built-in error handlers.
572
0
        && strcmp(errors, "strict") != 0
573
0
        && strcmp(errors, "ignore") != 0
574
0
        && strcmp(errors, "replace") != 0
575
0
        && strcmp(errors, "surrogateescape") != 0
576
0
        && strcmp(errors, "surrogatepass") != 0)
577
0
    {
578
0
        PyObject *handler = PyCodec_LookupError(errors);
579
0
        if (handler == NULL) {
580
0
            return -1;
581
0
        }
582
0
        Py_DECREF(handler);
583
0
    }
584
0
    return 0;
585
0
}
586
587
588
int
589
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
590
0
{
591
0
#define CHECK(expr) \
592
0
    do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
593
594
0
    assert(op != NULL);
595
0
    CHECK(PyUnicode_Check(op));
596
597
0
    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
598
0
    int kind = ascii->state.kind;
599
600
0
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
601
0
        CHECK(kind == PyUnicode_1BYTE_KIND);
602
0
    }
603
0
    else {
604
0
        PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
605
0
        void *data;
606
607
0
        if (ascii->state.compact == 1) {
608
0
            data = compact + 1;
609
0
            CHECK(kind == PyUnicode_1BYTE_KIND
610
0
                                 || kind == PyUnicode_2BYTE_KIND
611
0
                                 || kind == PyUnicode_4BYTE_KIND);
612
0
            CHECK(ascii->state.ascii == 0);
613
0
            CHECK(_PyUnicode_UTF8(op) != data);
614
0
        }
615
0
        else {
616
0
            PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
617
618
0
            data = unicode->data.any;
619
0
            CHECK(kind == PyUnicode_1BYTE_KIND
620
0
                     || kind == PyUnicode_2BYTE_KIND
621
0
                     || kind == PyUnicode_4BYTE_KIND);
622
0
            CHECK(ascii->state.compact == 0);
623
0
            CHECK(data != NULL);
624
0
            if (ascii->state.ascii) {
625
0
                CHECK(_PyUnicode_UTF8(op) == data);
626
0
                CHECK(compact->utf8_length == ascii->length);
627
0
            }
628
0
            else {
629
0
                CHECK(_PyUnicode_UTF8(op) != data);
630
0
            }
631
0
        }
632
0
#ifndef Py_GIL_DISABLED
633
0
        if (_PyUnicode_UTF8(op) == NULL)
634
0
            CHECK(compact->utf8_length == 0);
635
0
#endif
636
0
    }
637
638
    /* check that the best kind is used: O(n) operation */
639
0
    if (check_content) {
640
0
        Py_ssize_t i;
641
0
        Py_UCS4 maxchar = 0;
642
0
        const void *data;
643
0
        Py_UCS4 ch;
644
645
0
        data = PyUnicode_DATA(ascii);
646
0
        for (i=0; i < ascii->length; i++)
647
0
        {
648
0
            ch = PyUnicode_READ(kind, data, i);
649
0
            if (ch > maxchar)
650
0
                maxchar = ch;
651
0
        }
652
0
        if (kind == PyUnicode_1BYTE_KIND) {
653
0
            if (ascii->state.ascii == 0) {
654
0
                CHECK(maxchar >= 128);
655
0
                CHECK(maxchar <= 255);
656
0
            }
657
0
            else
658
0
                CHECK(maxchar < 128);
659
0
        }
660
0
        else if (kind == PyUnicode_2BYTE_KIND) {
661
0
            CHECK(maxchar >= 0x100);
662
0
            CHECK(maxchar <= 0xFFFF);
663
0
        }
664
0
        else {
665
0
            CHECK(maxchar >= 0x10000);
666
0
            CHECK(maxchar <= MAX_UNICODE);
667
0
        }
668
0
        CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
669
0
    }
670
671
    /* Check interning state */
672
#ifdef Py_DEBUG
673
    // Note that we do not check `_Py_IsImmortal(op)`, since stable ABI
674
    // extensions can make immortal strings mortal (but with a high enough
675
    // refcount).
676
    // The other way is extremely unlikely (worth a potential failed assertion
677
    // in a debug build), so we do check `!_Py_IsImmortal(op)`.
678
    switch (PyUnicode_CHECK_INTERNED(op)) {
679
        case SSTATE_NOT_INTERNED:
680
            if (ascii->state.statically_allocated) {
681
                // This state is for two exceptions:
682
                // - strings are currently checked before they're interned
683
                // - the 256 one-latin1-character strings
684
                //   are static but use SSTATE_NOT_INTERNED
685
            }
686
            else {
687
                CHECK(!_Py_IsImmortal(op));
688
            }
689
            break;
690
        case SSTATE_INTERNED_MORTAL:
691
            CHECK(!ascii->state.statically_allocated);
692
            CHECK(!_Py_IsImmortal(op));
693
            break;
694
        case SSTATE_INTERNED_IMMORTAL:
695
            CHECK(!ascii->state.statically_allocated);
696
            break;
697
        case SSTATE_INTERNED_IMMORTAL_STATIC:
698
            CHECK(ascii->state.statically_allocated);
699
            break;
700
        default:
701
            Py_UNREACHABLE();
702
    }
703
#endif
704
705
0
    return 1;
706
707
0
#undef CHECK
708
0
}
709
710
/* Normalize a freshly built string before handing it to the caller.

   An empty string is replaced by the empty-string singleton and a
   one-character 1-byte-kind string by the matching LATIN1() singleton; in
   both cases the reference to `unicode` is released unless `unicode` already
   is that singleton.  Any other string is returned unchanged. */
PyObject *
_PyUnicode_Result(PyObject *unicode)
{
    assert(_PyUnicode_CHECK(unicode));

    PyObject *singleton = NULL;
    Py_ssize_t length = PyUnicode_GET_LENGTH(unicode);

    if (length == 0) {
        singleton = _PyUnicode_GetEmpty();
    }
    else if (length == 1 && PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
        Py_UCS1 ch = PyUnicode_1BYTE_DATA(unicode)[0];
        singleton = LATIN1(ch);
    }

    if (singleton != NULL) {
        if (unicode != singleton) {
            Py_DECREF(unicode);
        }
        return singleton;
    }

    assert(_PyUnicode_CheckConsistency(unicode, 1));
    return unicode;
}
740
551k
#define unicode_result _PyUnicode_Result
741
742
/* Return `unicode` itself (as a new reference) when it is an exact str
   instance; for a subtype instance return the result of _PyUnicode_Copy(),
   i.e. a genuine unicode string with the same value. */
static PyObject *
unicode_result_unchanged(PyObject *unicode)
{
    if (!PyUnicode_CheckExact(unicode)) {
        return _PyUnicode_Copy(unicode);
    }
    return Py_NewRef(unicode);
}
752
753
/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
754
   ASCII, Latin1, UTF-8, etc. */
755
static char*
756
backslashreplace(PyBytesWriter *writer, char *str,
757
                 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
758
0
{
759
0
    Py_ssize_t size, i;
760
0
    Py_UCS4 ch;
761
0
    int kind;
762
0
    const void *data;
763
764
0
    kind = PyUnicode_KIND(unicode);
765
0
    data = PyUnicode_DATA(unicode);
766
767
0
    size = 0;
768
    /* determine replacement size */
769
0
    for (i = collstart; i < collend; ++i) {
770
0
        Py_ssize_t incr;
771
772
0
        ch = PyUnicode_READ(kind, data, i);
773
0
        if (ch < 0x100)
774
0
            incr = 2+2;
775
0
        else if (ch < 0x10000)
776
0
            incr = 2+4;
777
0
        else {
778
0
            assert(ch <= MAX_UNICODE);
779
0
            incr = 2+8;
780
0
        }
781
0
        if (size > PY_SSIZE_T_MAX - incr) {
782
0
            PyErr_SetString(PyExc_OverflowError,
783
0
                            "encoded result is too long for a Python string");
784
0
            return NULL;
785
0
        }
786
0
        size += incr;
787
0
    }
788
789
0
    str = PyBytesWriter_GrowAndUpdatePointer(writer, size, str);
790
0
    if (str == NULL) {
791
0
        return NULL;
792
0
    }
793
794
    /* generate replacement */
795
0
    for (i = collstart; i < collend; ++i) {
796
0
        ch = PyUnicode_READ(kind, data, i);
797
0
        *str++ = '\\';
798
0
        if (ch >= 0x00010000) {
799
0
            *str++ = 'U';
800
0
            *str++ = Py_hexdigits[(ch>>28)&0xf];
801
0
            *str++ = Py_hexdigits[(ch>>24)&0xf];
802
0
            *str++ = Py_hexdigits[(ch>>20)&0xf];
803
0
            *str++ = Py_hexdigits[(ch>>16)&0xf];
804
0
            *str++ = Py_hexdigits[(ch>>12)&0xf];
805
0
            *str++ = Py_hexdigits[(ch>>8)&0xf];
806
0
        }
807
0
        else if (ch >= 0x100) {
808
0
            *str++ = 'u';
809
0
            *str++ = Py_hexdigits[(ch>>12)&0xf];
810
0
            *str++ = Py_hexdigits[(ch>>8)&0xf];
811
0
        }
812
0
        else
813
0
            *str++ = 'x';
814
0
        *str++ = Py_hexdigits[(ch>>4)&0xf];
815
0
        *str++ = Py_hexdigits[ch&0xf];
816
0
    }
817
0
    return str;
818
0
}
819
820
/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
821
   ASCII, Latin1, UTF-8, etc. */
822
static char*
823
xmlcharrefreplace(PyBytesWriter *writer, char *str,
824
                  PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
825
0
{
826
0
    Py_ssize_t size, i;
827
0
    Py_UCS4 ch;
828
0
    int kind;
829
0
    const void *data;
830
831
0
    kind = PyUnicode_KIND(unicode);
832
0
    data = PyUnicode_DATA(unicode);
833
834
0
    size = 0;
835
    /* determine replacement size */
836
0
    for (i = collstart; i < collend; ++i) {
837
0
        Py_ssize_t incr;
838
839
0
        ch = PyUnicode_READ(kind, data, i);
840
0
        if (ch < 10)
841
0
            incr = 2+1+1;
842
0
        else if (ch < 100)
843
0
            incr = 2+2+1;
844
0
        else if (ch < 1000)
845
0
            incr = 2+3+1;
846
0
        else if (ch < 10000)
847
0
            incr = 2+4+1;
848
0
        else if (ch < 100000)
849
0
            incr = 2+5+1;
850
0
        else if (ch < 1000000)
851
0
            incr = 2+6+1;
852
0
        else {
853
0
            assert(ch <= MAX_UNICODE);
854
0
            incr = 2+7+1;
855
0
        }
856
0
        if (size > PY_SSIZE_T_MAX - incr) {
857
0
            PyErr_SetString(PyExc_OverflowError,
858
0
                            "encoded result is too long for a Python string");
859
0
            return NULL;
860
0
        }
861
0
        size += incr;
862
0
    }
863
864
0
    str = PyBytesWriter_GrowAndUpdatePointer(writer, size, str);
865
0
    if (str == NULL) {
866
0
        return NULL;
867
0
    }
868
869
    /* generate replacement */
870
0
    for (i = collstart; i < collend; ++i) {
871
0
        size = sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
872
0
        if (size < 0) {
873
0
            return NULL;
874
0
        }
875
0
        str += size;
876
0
    }
877
0
    return str;
878
0
}
879
880
/* --- Bloom Filters ----------------------------------------------------- */
881
882
/* stuff to implement simple "bloom filters" for Unicode characters.
883
   to keep things simple, we use a single bitmask, using the least 5
884
   bits from each unicode character as the bit index. */
885
886
/* the linebreak mask is set up by _PyUnicode_Init() below */
887
888
#if LONG_BIT >= 128
889
#define BLOOM_WIDTH 128
890
#elif LONG_BIT >= 64
891
51.8M
#define BLOOM_WIDTH 64
892
#elif LONG_BIT >= 32
893
#define BLOOM_WIDTH 32
894
#else
895
#error "LONG_BIT is smaller than 32"
896
#endif
897
898
21.7M
#define BLOOM_MASK unsigned long
899
900
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
901
902
73.2M
/* Nonzero if the bit selected by the low bits of `ch` is set in `mask`.
   Both arguments are parenthesized so that compound expressions such as
   `a | b` expand with the intended precedence. */
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))
903
904
#define BLOOM_LINEBREAK(ch)                                             \
905
260M
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
906
260M
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
907
908
static inline BLOOM_MASK
909
make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
910
10.8M
{
911
10.8M
#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN)             \
912
10.8M
    do {                                               \
913
10.8M
        TYPE *data = (TYPE *)PTR;                      \
914
10.8M
        TYPE *end = data + LEN;                        \
915
10.8M
        Py_UCS4 ch;                                    \
916
23.4M
        for (; data != end; data++) {                  \
917
12.6M
            ch = *data;                                \
918
12.6M
            MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
919
12.6M
        }                                              \
920
10.8M
        break;                                         \
921
10.8M
    } while (0)
922
923
    /* calculate simple bloom-style bitmask for a given unicode string */
924
925
10.8M
    BLOOM_MASK mask;
926
927
10.8M
    mask = 0;
928
10.8M
    switch (kind) {
929
10.8M
    case PyUnicode_1BYTE_KIND:
930
10.8M
        BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
931
10.8M
        break;
932
16
    case PyUnicode_2BYTE_KIND:
933
16
        BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
934
16
        break;
935
0
    case PyUnicode_4BYTE_KIND:
936
0
        BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
937
0
        break;
938
0
    default:
939
0
        Py_UNREACHABLE();
940
10.8M
    }
941
10.8M
    return mask;
942
943
10.8M
#undef BLOOM_UPDATE
944
10.8M
}
945
946
/* Compilation of templated routines */
947
948
1.17M
#define STRINGLIB_GET_EMPTY() _PyUnicode_GetEmpty()
949
950
#include "stringlib/asciilib.h"
951
#include "stringlib/fastsearch.h"
952
#include "stringlib/partition.h"
953
#include "stringlib/split.h"
954
#include "stringlib/count.h"
955
#include "stringlib/find.h"
956
#include "stringlib/find_max_char.h"
957
#include "stringlib/undef.h"
958
959
#include "stringlib/ucs1lib.h"
960
#include "stringlib/fastsearch.h"
961
#include "stringlib/partition.h"
962
#include "stringlib/split.h"
963
#include "stringlib/count.h"
964
#include "stringlib/find.h"
965
#include "stringlib/replace.h"
966
#include "stringlib/repr.h"
967
#include "stringlib/find_max_char.h"
968
#include "stringlib/undef.h"
969
970
#include "stringlib/ucs2lib.h"
971
#include "stringlib/fastsearch.h"
972
#include "stringlib/partition.h"
973
#include "stringlib/split.h"
974
#include "stringlib/count.h"
975
#include "stringlib/find.h"
976
#include "stringlib/replace.h"
977
#include "stringlib/repr.h"
978
#include "stringlib/find_max_char.h"
979
#include "stringlib/undef.h"
980
981
#include "stringlib/ucs4lib.h"
982
#include "stringlib/fastsearch.h"
983
#include "stringlib/partition.h"
984
#include "stringlib/split.h"
985
#include "stringlib/count.h"
986
#include "stringlib/find.h"
987
#include "stringlib/replace.h"
988
#include "stringlib/repr.h"
989
#include "stringlib/find_max_char.h"
990
#include "stringlib/undef.h"
991
992
#undef STRINGLIB_GET_EMPTY
993
994
/* --- Unicode Object ----------------------------------------------------- */
995
996
static inline Py_ssize_t
997
findchar(const void *s, int kind,
998
         Py_ssize_t size, Py_UCS4 ch,
999
         int direction)
1000
114M
{
1001
114M
    switch (kind) {
1002
105M
    case PyUnicode_1BYTE_KIND:
1003
105M
        if ((Py_UCS1) ch != ch)
1004
3.51k
            return -1;
1005
105M
        if (direction > 0)
1006
105M
            return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
1007
7.58k
        else
1008
7.58k
            return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
1009
8.65M
    case PyUnicode_2BYTE_KIND:
1010
8.65M
        if ((Py_UCS2) ch != ch)
1011
0
            return -1;
1012
8.65M
        if (direction > 0)
1013
8.63M
            return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
1014
22.2k
        else
1015
22.2k
            return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
1016
1.23M
    case PyUnicode_4BYTE_KIND:
1017
1.23M
        if (direction > 0)
1018
1.14M
            return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
1019
89.1k
        else
1020
89.1k
            return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
1021
0
    default:
1022
0
        Py_UNREACHABLE();
1023
114M
    }
1024
114M
}
1025
1026
#ifdef Py_DEBUG
1027
/* Fill the data of a Unicode string with invalid characters to detect bugs
1028
   earlier.
1029
1030
   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
1031
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
1032
   invalid character in Unicode 6.0. */
1033
static void
1034
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
1035
{
1036
    int kind = PyUnicode_KIND(unicode);
1037
    Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
1038
    Py_ssize_t length = _PyUnicode_LENGTH(unicode);
1039
    if (length <= old_length)
1040
        return;
1041
    memset(data + old_length * kind, 0xff, (length - old_length) * kind);
1042
}
1043
#endif
1044
1045
static PyObject*
1046
resize_copy(PyObject *unicode, Py_ssize_t length)
1047
0
{
1048
0
    Py_ssize_t copy_length;
1049
0
    PyObject *copy;
1050
1051
0
    copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1052
0
    if (copy == NULL)
1053
0
        return NULL;
1054
1055
0
    copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
1056
0
    _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
1057
0
    return copy;
1058
0
}
1059
1060
PyObject*
1061
_PyUnicode_ResizeCompact(PyObject *unicode, Py_ssize_t length)
1062
61.4M
{
1063
61.4M
    Py_ssize_t char_size;
1064
61.4M
    Py_ssize_t struct_size;
1065
61.4M
    Py_ssize_t new_size;
1066
61.4M
    PyObject *new_unicode;
1067
#ifdef Py_DEBUG
1068
    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1069
#endif
1070
1071
61.4M
    if (!_PyUnicode_IsModifiable(unicode)) {
1072
0
        PyObject *copy = resize_copy(unicode, length);
1073
0
        if (copy == NULL) {
1074
0
            return NULL;
1075
0
        }
1076
0
        Py_DECREF(unicode);
1077
0
        return copy;
1078
0
    }
1079
61.4M
    assert(PyUnicode_IS_COMPACT(unicode));
1080
1081
61.4M
    char_size = PyUnicode_KIND(unicode);
1082
61.4M
    if (PyUnicode_IS_ASCII(unicode))
1083
52.3M
        struct_size = sizeof(PyASCIIObject);
1084
9.05M
    else
1085
9.05M
        struct_size = sizeof(PyCompactUnicodeObject);
1086
1087
61.4M
    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1088
0
        PyErr_NoMemory();
1089
0
        return NULL;
1090
0
    }
1091
61.4M
    new_size = (struct_size + (length + 1) * char_size);
1092
1093
61.4M
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1094
0
        PyMem_Free(_PyUnicode_UTF8(unicode));
1095
0
        PyUnicode_SET_UTF8_LENGTH(unicode, 0);
1096
0
        PyUnicode_SET_UTF8(unicode, NULL);
1097
0
    }
1098
#ifdef Py_TRACE_REFS
1099
    _Py_ForgetReference(unicode);
1100
#endif
1101
61.4M
    _PyReftracerTrack(unicode, PyRefTracer_DESTROY);
1102
1103
61.4M
    new_unicode = (PyObject *)PyObject_Realloc(unicode, new_size);
1104
61.4M
    if (new_unicode == NULL) {
1105
0
        _Py_NewReferenceNoTotal(unicode);
1106
0
        PyErr_NoMemory();
1107
0
        return NULL;
1108
0
    }
1109
61.4M
    unicode = new_unicode;
1110
61.4M
    _Py_NewReferenceNoTotal(unicode);
1111
1112
61.4M
    _PyUnicode_LENGTH(unicode) = length;
1113
#ifdef Py_DEBUG
1114
    unicode_fill_invalid(unicode, old_length);
1115
#endif
1116
61.4M
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1117
61.4M
                    length, 0);
1118
61.4M
    assert(_PyUnicode_CheckConsistency(unicode, 0));
1119
61.4M
    return unicode;
1120
61.4M
}
1121
1122
static int
1123
resize_inplace(PyObject *unicode, Py_ssize_t length)
1124
0
{
1125
0
    assert(!PyUnicode_IS_COMPACT(unicode));
1126
0
    assert(Py_REFCNT(unicode) == 1);
1127
1128
0
    Py_ssize_t new_size;
1129
0
    Py_ssize_t char_size;
1130
0
    int share_utf8;
1131
0
    void *data;
1132
#ifdef Py_DEBUG
1133
    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1134
#endif
1135
1136
0
    data = _PyUnicode_DATA_ANY(unicode);
1137
0
    char_size = PyUnicode_KIND(unicode);
1138
0
    share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
1139
1140
0
    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1141
0
        PyErr_NoMemory();
1142
0
        return -1;
1143
0
    }
1144
0
    new_size = (length + 1) * char_size;
1145
1146
0
    if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1147
0
    {
1148
0
        PyMem_Free(_PyUnicode_UTF8(unicode));
1149
0
        PyUnicode_SET_UTF8_LENGTH(unicode, 0);
1150
0
        PyUnicode_SET_UTF8(unicode, NULL);
1151
0
    }
1152
1153
0
    data = (PyObject *)PyObject_Realloc(data, new_size);
1154
0
    if (data == NULL) {
1155
0
        PyErr_NoMemory();
1156
0
        return -1;
1157
0
    }
1158
0
    _PyUnicode_DATA_ANY(unicode) = data;
1159
0
    if (share_utf8) {
1160
0
        PyUnicode_SET_UTF8_LENGTH(unicode, length);
1161
0
        PyUnicode_SET_UTF8(unicode, data);
1162
0
    }
1163
0
    _PyUnicode_LENGTH(unicode) = length;
1164
0
    PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
1165
#ifdef Py_DEBUG
1166
    unicode_fill_invalid(unicode, old_length);
1167
#endif
1168
1169
    /* check for integer overflow */
1170
0
    if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
1171
0
        PyErr_NoMemory();
1172
0
        return -1;
1173
0
    }
1174
0
    assert(_PyUnicode_CheckConsistency(unicode, 0));
1175
0
    return 0;
1176
0
}
1177
1178
static const char*
1179
unicode_kind_name(PyObject *unicode)
1180
0
{
1181
    /* don't check consistency: unicode_kind_name() is called from
1182
       _PyUnicode_Dump() */
1183
0
    if (!PyUnicode_IS_COMPACT(unicode))
1184
0
    {
1185
0
        switch (PyUnicode_KIND(unicode))
1186
0
        {
1187
0
        case PyUnicode_1BYTE_KIND:
1188
0
            if (PyUnicode_IS_ASCII(unicode))
1189
0
                return "legacy ascii";
1190
0
            else
1191
0
                return "legacy latin1";
1192
0
        case PyUnicode_2BYTE_KIND:
1193
0
            return "legacy UCS2";
1194
0
        case PyUnicode_4BYTE_KIND:
1195
0
            return "legacy UCS4";
1196
0
        default:
1197
0
            return "<legacy invalid kind>";
1198
0
        }
1199
0
    }
1200
0
    switch (PyUnicode_KIND(unicode)) {
1201
0
    case PyUnicode_1BYTE_KIND:
1202
0
        if (PyUnicode_IS_ASCII(unicode))
1203
0
            return "ascii";
1204
0
        else
1205
0
            return "latin1";
1206
0
    case PyUnicode_2BYTE_KIND:
1207
0
        return "UCS2";
1208
0
    case PyUnicode_4BYTE_KIND:
1209
0
        return "UCS4";
1210
0
    default:
1211
0
        return "<invalid compact kind>";
1212
0
    }
1213
0
}
1214
1215
#ifdef Py_DEBUG
1216
/* Functions wrapping macros for use in debugger */
1217
const char *_PyUnicode_utf8(void *unicode_raw){
1218
    PyObject *unicode = _PyObject_CAST(unicode_raw);
1219
    return PyUnicode_UTF8(unicode);
1220
}
1221
1222
const void *_PyUnicode_compact_data(void *unicode_raw) {
1223
    PyObject *unicode = _PyObject_CAST(unicode_raw);
1224
    return _PyUnicode_COMPACT_DATA(unicode);
1225
}
1226
const void *_PyUnicode_data(void *unicode_raw) {
1227
    PyObject *unicode = _PyObject_CAST(unicode_raw);
1228
    printf("obj %p\n", (void*)unicode);
1229
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1230
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1231
    printf("ascii op %p\n", (void*)(_PyASCIIObject_CAST(unicode) + 1));
1232
    printf("compact op %p\n", (void*)(_PyCompactUnicodeObject_CAST(unicode) + 1));
1233
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1234
    return PyUnicode_DATA(unicode);
1235
}
1236
1237
void
1238
_PyUnicode_Dump(PyObject *op)
1239
{
1240
    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
1241
    PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
1242
    PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
1243
    const void *data;
1244
1245
    if (ascii->state.compact)
1246
    {
1247
        if (ascii->state.ascii)
1248
            data = (ascii + 1);
1249
        else
1250
            data = (compact + 1);
1251
    }
1252
    else
1253
        data = unicode->data.any;
1254
    printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);
1255
1256
    if (!ascii->state.ascii) {
1257
        printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
1258
    }
1259
    printf(", data=%p\n", data);
1260
}
1261
#endif
1262
1263
1264
PyObject *
1265
PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1266
477M
{
1267
    /* Optimization for empty strings */
1268
477M
    if (size == 0) {
1269
23.2M
        return _PyUnicode_GetEmpty();
1270
23.2M
    }
1271
1272
454M
    PyObject *obj;
1273
454M
    PyCompactUnicodeObject *unicode;
1274
454M
    void *data;
1275
454M
    int kind;
1276
454M
    int is_ascii;
1277
454M
    Py_ssize_t char_size;
1278
454M
    Py_ssize_t struct_size;
1279
1280
454M
    is_ascii = 0;
1281
454M
    struct_size = sizeof(PyCompactUnicodeObject);
1282
454M
    if (maxchar < 128) {
1283
304M
        kind = PyUnicode_1BYTE_KIND;
1284
304M
        char_size = 1;
1285
304M
        is_ascii = 1;
1286
304M
        struct_size = sizeof(PyASCIIObject);
1287
304M
    }
1288
150M
    else if (maxchar < 256) {
1289
12.8M
        kind = PyUnicode_1BYTE_KIND;
1290
12.8M
        char_size = 1;
1291
12.8M
    }
1292
137M
    else if (maxchar < 65536) {
1293
131M
        kind = PyUnicode_2BYTE_KIND;
1294
131M
        char_size = 2;
1295
131M
    }
1296
5.63M
    else {
1297
5.63M
        if (maxchar > MAX_UNICODE) {
1298
0
            PyErr_SetString(PyExc_SystemError,
1299
0
                            "invalid maximum character passed to PyUnicode_New");
1300
0
            return NULL;
1301
0
        }
1302
5.63M
        kind = PyUnicode_4BYTE_KIND;
1303
5.63M
        char_size = 4;
1304
5.63M
    }
1305
1306
    /* Ensure we won't overflow the size. */
1307
454M
    if (size < 0) {
1308
0
        PyErr_SetString(PyExc_SystemError,
1309
0
                        "Negative size passed to PyUnicode_New");
1310
0
        return NULL;
1311
0
    }
1312
454M
    if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1313
0
        return PyErr_NoMemory();
1314
1315
    /* Duplicated allocation code from _PyObject_New() instead of a call to
1316
     * PyObject_New() so we are able to allocate space for the object and
1317
     * it's data buffer.
1318
     */
1319
454M
    obj = (PyObject *) PyObject_Malloc(struct_size + (size + 1) * char_size);
1320
454M
    if (obj == NULL) {
1321
0
        return PyErr_NoMemory();
1322
0
    }
1323
454M
    _PyObject_Init(obj, &PyUnicode_Type);
1324
1325
454M
    unicode = (PyCompactUnicodeObject *)obj;
1326
454M
    if (is_ascii)
1327
304M
        data = ((PyASCIIObject*)obj) + 1;
1328
150M
    else
1329
150M
        data = unicode + 1;
1330
454M
    _PyUnicode_LENGTH(unicode) = size;
1331
454M
    _PyUnicode_HASH(unicode) = -1;
1332
454M
    _PyUnicode_STATE(unicode).interned = 0;
1333
454M
    _PyUnicode_STATE(unicode).kind = kind;
1334
454M
    _PyUnicode_STATE(unicode).compact = 1;
1335
454M
    _PyUnicode_STATE(unicode).ascii = is_ascii;
1336
454M
    _PyUnicode_STATE(unicode).statically_allocated = 0;
1337
454M
    if (is_ascii) {
1338
304M
        ((char*)data)[size] = 0;
1339
304M
    }
1340
150M
    else if (kind == PyUnicode_1BYTE_KIND) {
1341
12.8M
        ((char*)data)[size] = 0;
1342
12.8M
        unicode->utf8 = NULL;
1343
12.8M
        unicode->utf8_length = 0;
1344
12.8M
    }
1345
137M
    else {
1346
137M
        unicode->utf8 = NULL;
1347
137M
        unicode->utf8_length = 0;
1348
137M
        if (kind == PyUnicode_2BYTE_KIND)
1349
131M
            ((Py_UCS2*)data)[size] = 0;
1350
5.63M
        else /* kind == PyUnicode_4BYTE_KIND */
1351
5.63M
            ((Py_UCS4*)data)[size] = 0;
1352
137M
    }
1353
#ifdef Py_DEBUG
1354
    unicode_fill_invalid((PyObject*)unicode, 0);
1355
#endif
1356
454M
    assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1357
454M
    return obj;
1358
454M
}
1359
1360
static int
1361
unicode_check_modifiable(PyObject *unicode)
1362
644
{
1363
644
    if (!_PyUnicode_IsModifiable(unicode)) {
1364
0
        PyErr_SetString(PyExc_SystemError,
1365
0
                        "Cannot modify a string currently used");
1366
0
        return -1;
1367
0
    }
1368
644
    return 0;
1369
644
}
1370
1371
static int
1372
_copy_characters(PyObject *to, Py_ssize_t to_start,
1373
                 PyObject *from, Py_ssize_t from_start,
1374
                 Py_ssize_t how_many, int check_maxchar)
1375
307M
{
1376
307M
    int from_kind, to_kind;
1377
307M
    const void *from_data;
1378
307M
    void *to_data;
1379
1380
307M
    assert(0 <= how_many);
1381
307M
    assert(0 <= from_start);
1382
307M
    assert(0 <= to_start);
1383
307M
    assert(PyUnicode_Check(from));
1384
307M
    assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1385
1386
307M
    assert(to == NULL || PyUnicode_Check(to));
1387
1388
307M
    if (how_many == 0) {
1389
254k
        return 0;
1390
254k
    }
1391
1392
307M
    assert(to != NULL);
1393
307M
    assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1394
1395
307M
    from_kind = PyUnicode_KIND(from);
1396
307M
    from_data = PyUnicode_DATA(from);
1397
307M
    to_kind = PyUnicode_KIND(to);
1398
307M
    to_data = PyUnicode_DATA(to);
1399
1400
#ifdef Py_DEBUG
1401
    if (!check_maxchar
1402
        && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1403
    {
1404
        Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1405
        Py_UCS4 ch;
1406
        Py_ssize_t i;
1407
        for (i=0; i < how_many; i++) {
1408
            ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1409
            assert(ch <= to_maxchar);
1410
        }
1411
    }
1412
#endif
1413
1414
307M
    if (from_kind == to_kind) {
1415
205M
        if (check_maxchar
1416
0
            && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1417
0
        {
1418
            /* Writing Latin-1 characters into an ASCII string requires to
1419
               check that all written characters are pure ASCII */
1420
0
            Py_UCS4 max_char;
1421
0
            max_char = ucs1lib_find_max_char(from_data,
1422
0
                                             (const Py_UCS1*)from_data + how_many);
1423
0
            if (max_char >= 128)
1424
0
                return -1;
1425
0
        }
1426
205M
        memcpy((char*)to_data + to_kind * to_start,
1427
205M
                  (const char*)from_data + from_kind * from_start,
1428
205M
                  to_kind * how_many);
1429
205M
    }
1430
102M
    else if (from_kind == PyUnicode_1BYTE_KIND
1431
100M
             && to_kind == PyUnicode_2BYTE_KIND)
1432
85.0M
    {
1433
85.0M
        _PyUnicode_CONVERT_BYTES(
1434
85.0M
            Py_UCS1, Py_UCS2,
1435
85.0M
            PyUnicode_1BYTE_DATA(from) + from_start,
1436
85.0M
            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1437
85.0M
            PyUnicode_2BYTE_DATA(to) + to_start
1438
85.0M
            );
1439
85.0M
    }
1440
17.4M
    else if (from_kind == PyUnicode_1BYTE_KIND
1441
15.4M
             && to_kind == PyUnicode_4BYTE_KIND)
1442
15.4M
    {
1443
15.4M
        _PyUnicode_CONVERT_BYTES(
1444
15.4M
            Py_UCS1, Py_UCS4,
1445
15.4M
            PyUnicode_1BYTE_DATA(from) + from_start,
1446
15.4M
            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1447
15.4M
            PyUnicode_4BYTE_DATA(to) + to_start
1448
15.4M
            );
1449
15.4M
    }
1450
1.94M
    else if (from_kind == PyUnicode_2BYTE_KIND
1451
1.92M
             && to_kind == PyUnicode_4BYTE_KIND)
1452
1.92M
    {
1453
1.92M
        _PyUnicode_CONVERT_BYTES(
1454
1.92M
            Py_UCS2, Py_UCS4,
1455
1.92M
            PyUnicode_2BYTE_DATA(from) + from_start,
1456
1.92M
            PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1457
1.92M
            PyUnicode_4BYTE_DATA(to) + to_start
1458
1.92M
            );
1459
1.92M
    }
1460
23.8k
    else {
1461
23.8k
        assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1462
1463
23.8k
        if (!check_maxchar) {
1464
23.8k
            if (from_kind == PyUnicode_2BYTE_KIND
1465
2.31k
                && to_kind == PyUnicode_1BYTE_KIND)
1466
2.31k
            {
1467
2.31k
                _PyUnicode_CONVERT_BYTES(
1468
2.31k
                    Py_UCS2, Py_UCS1,
1469
2.31k
                    PyUnicode_2BYTE_DATA(from) + from_start,
1470
2.31k
                    PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1471
2.31k
                    PyUnicode_1BYTE_DATA(to) + to_start
1472
2.31k
                    );
1473
2.31k
            }
1474
21.5k
            else if (from_kind == PyUnicode_4BYTE_KIND
1475
21.5k
                     && to_kind == PyUnicode_1BYTE_KIND)
1476
8.23k
            {
1477
8.23k
                _PyUnicode_CONVERT_BYTES(
1478
8.23k
                    Py_UCS4, Py_UCS1,
1479
8.23k
                    PyUnicode_4BYTE_DATA(from) + from_start,
1480
8.23k
                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1481
8.23k
                    PyUnicode_1BYTE_DATA(to) + to_start
1482
8.23k
                    );
1483
8.23k
            }
1484
13.2k
            else if (from_kind == PyUnicode_4BYTE_KIND
1485
13.2k
                     && to_kind == PyUnicode_2BYTE_KIND)
1486
13.2k
            {
1487
13.2k
                _PyUnicode_CONVERT_BYTES(
1488
13.2k
                    Py_UCS4, Py_UCS2,
1489
13.2k
                    PyUnicode_4BYTE_DATA(from) + from_start,
1490
13.2k
                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1491
13.2k
                    PyUnicode_2BYTE_DATA(to) + to_start
1492
13.2k
                    );
1493
13.2k
            }
1494
0
            else {
1495
0
                Py_UNREACHABLE();
1496
0
            }
1497
23.8k
        }
1498
0
        else {
1499
0
            const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1500
0
            Py_UCS4 ch;
1501
0
            Py_ssize_t i;
1502
1503
0
            for (i=0; i < how_many; i++) {
1504
0
                ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1505
0
                if (ch > to_maxchar)
1506
0
                    return -1;
1507
0
                PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1508
0
            }
1509
0
        }
1510
23.8k
    }
1511
307M
    return 0;
1512
307M
}
1513
1514
void
1515
_PyUnicode_FastCopyCharacters(
1516
    PyObject *to, Py_ssize_t to_start,
1517
    PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1518
307M
{
1519
307M
    (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1520
307M
}
1521
1522
Py_ssize_t
1523
PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1524
                         PyObject *from, Py_ssize_t from_start,
1525
                         Py_ssize_t how_many)
1526
0
{
1527
0
    int err;
1528
1529
0
    if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1530
0
        PyErr_BadInternalCall();
1531
0
        return -1;
1532
0
    }
1533
1534
0
    if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
1535
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
1536
0
        return -1;
1537
0
    }
1538
0
    if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
1539
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
1540
0
        return -1;
1541
0
    }
1542
0
    if (how_many < 0) {
1543
0
        PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1544
0
        return -1;
1545
0
    }
1546
0
    how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
1547
0
    if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1548
0
        PyErr_Format(PyExc_SystemError,
1549
0
                     "Cannot write %zi characters at %zi "
1550
0
                     "in a string of %zi characters",
1551
0
                     how_many, to_start, PyUnicode_GET_LENGTH(to));
1552
0
        return -1;
1553
0
    }
1554
1555
0
    if (how_many == 0)
1556
0
        return 0;
1557
1558
0
    if (unicode_check_modifiable(to))
1559
0
        return -1;
1560
1561
0
    err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1562
0
    if (err) {
1563
0
        PyErr_Format(PyExc_SystemError,
1564
0
                     "Cannot copy %s characters "
1565
0
                     "into a string of %s characters",
1566
0
                     unicode_kind_name(from),
1567
0
                     unicode_kind_name(to));
1568
0
        return -1;
1569
0
    }
1570
0
    return how_many;
1571
0
}
1572
1573
/* Find the maximum code point and count the number of surrogate pairs so a
1574
   correct string length can be computed before converting a string to UCS4.
1575
   This function counts single surrogates as a character and not as a pair.
1576
1577
   Return 0 on success, or -1 on error. */
1578
static int
1579
find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1580
                        Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1581
16.4k
{
1582
16.4k
    const wchar_t *iter;
1583
16.4k
    Py_UCS4 ch;
1584
1585
16.4k
    assert(num_surrogates != NULL && maxchar != NULL);
1586
16.4k
    *num_surrogates = 0;
1587
16.4k
    *maxchar = 0;
1588
1589
361k
    for (iter = begin; iter < end; ) {
1590
#if SIZEOF_WCHAR_T == 2
1591
        if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1592
            && (iter+1) < end
1593
            && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1594
        {
1595
            ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1596
            ++(*num_surrogates);
1597
            iter += 2;
1598
        }
1599
        else
1600
#endif
1601
345k
        {
1602
345k
            ch = *iter;
1603
345k
            iter++;
1604
345k
        }
1605
345k
        if (ch > *maxchar) {
1606
70.7k
            *maxchar = ch;
1607
70.7k
            if (*maxchar > MAX_UNICODE) {
1608
0
                PyErr_Format(PyExc_ValueError,
1609
0
                             "character U+%x is not in range [U+0000; U+%x]",
1610
0
                             ch, MAX_UNICODE);
1611
0
                return -1;
1612
0
            }
1613
70.7k
        }
1614
345k
    }
1615
16.4k
    return 0;
1616
16.4k
}
1617
1618
static void
1619
unicode_dealloc(PyObject *unicode)
1620
464M
{
1621
#ifdef Py_DEBUG
1622
    if (!unicode_is_finalizing() && unicode_is_singleton(unicode)) {
1623
        _Py_FatalRefcountError("deallocating an Unicode singleton");
1624
    }
1625
#endif
1626
464M
    if (_PyUnicode_STATE(unicode).statically_allocated) {
1627
        /* This should never get called, but we also don't want to SEGV if
1628
        * we accidentally decref an immortal string out of existence. Since
1629
        * the string is an immortal object, just re-set the reference count.
1630
        */
1631
#ifdef Py_DEBUG
1632
        Py_UNREACHABLE();
1633
#endif
1634
0
        _Py_SetImmortal(unicode);
1635
0
        return;
1636
0
    }
1637
464M
    switch (_PyUnicode_STATE(unicode).interned) {
1638
464M
        case SSTATE_NOT_INTERNED:
1639
464M
            break;
1640
556k
        case SSTATE_INTERNED_MORTAL:
1641
            /* Remove the object from the intern dict.
1642
             * Before doing so, we set the refcount to 2: the key and value
1643
             * in the interned_dict.
1644
             */
1645
556k
            assert(Py_REFCNT(unicode) == 0);
1646
556k
            Py_SET_REFCNT(unicode, 2);
1647
#ifdef Py_REF_DEBUG
1648
            /* let's be pedantic with the ref total */
1649
            _Py_IncRefTotal(_PyThreadState_GET());
1650
            _Py_IncRefTotal(_PyThreadState_GET());
1651
#endif
1652
556k
            PyInterpreterState *interp = _PyInterpreterState_GET();
1653
556k
            PyObject *interned = get_interned_dict(interp);
1654
556k
            assert(interned != NULL);
1655
556k
            PyObject *popped;
1656
556k
            int r = PyDict_Pop(interned, unicode, &popped);
1657
556k
            if (r == -1) {
1658
0
                PyErr_FormatUnraisable("Exception ignored while "
1659
0
                                       "removing an interned string %R",
1660
0
                                       unicode);
1661
                // We don't know what happened to the string. It's probably
1662
                // best to leak it:
1663
                // - if it was popped, there are no more references to it
1664
                //   so it can't cause trouble (except wasted memory)
1665
                // - if it wasn't popped, it'll remain interned
1666
0
                _Py_SetImmortal(unicode);
1667
0
                _PyUnicode_STATE(unicode).interned = SSTATE_INTERNED_IMMORTAL;
1668
0
                return;
1669
0
            }
1670
556k
            if (r == 0) {
1671
                // The interned string was not found in the interned_dict.
1672
#ifdef Py_DEBUG
1673
                Py_UNREACHABLE();
1674
#endif
1675
0
                _Py_SetImmortal(unicode);
1676
0
                return;
1677
0
            }
1678
            // Successfully popped.
1679
556k
            assert(popped == unicode);
1680
            // Only our `popped` reference should be left; remove it too.
1681
556k
            assert(Py_REFCNT(unicode) == 1);
1682
556k
            Py_SET_REFCNT(unicode, 0);
1683
#ifdef Py_REF_DEBUG
1684
            /* let's be pedantic with the ref total */
1685
            _Py_DecRefTotal(_PyThreadState_GET());
1686
#endif
1687
556k
            break;
1688
0
        default:
1689
            // As with `statically_allocated` above.
1690
#ifdef Py_REF_DEBUG
1691
            Py_UNREACHABLE();
1692
#endif
1693
0
            _Py_SetImmortal(unicode);
1694
0
            return;
1695
464M
    }
1696
464M
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1697
160k
        PyMem_Free(_PyUnicode_UTF8(unicode));
1698
160k
    }
1699
464M
    if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
1700
10.5M
        PyMem_Free(_PyUnicode_DATA_ANY(unicode));
1701
10.5M
    }
1702
1703
464M
    Py_TYPE(unicode)->tp_free(unicode);
1704
464M
}
1705
1706
#ifdef Py_DEBUG
1707
static int
1708
unicode_is_singleton(PyObject *unicode)
1709
{
1710
    if (unicode == &_Py_STR(empty)) {
1711
        return 1;
1712
    }
1713
1714
    PyASCIIObject *ascii = _PyASCIIObject_CAST(unicode);
1715
    if (ascii->length == 1) {
1716
        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1717
        if (ch < 256 && LATIN1(ch) == unicode) {
1718
            return 1;
1719
        }
1720
    }
1721
    return 0;
1722
}
1723
#endif
1724
1725
int
1726
_PyUnicode_IsModifiable(PyObject *unicode)
1727
62.4M
{
1728
62.4M
    assert(_PyUnicode_CHECK(unicode));
1729
62.4M
    if (!_PyObject_IsUniquelyReferenced(unicode))
1730
40.8k
        return 0;
1731
62.4M
    if (PyUnicode_HASH(unicode) != -1)
1732
0
        return 0;
1733
62.4M
    if (PyUnicode_CHECK_INTERNED(unicode))
1734
0
        return 0;
1735
62.4M
    if (!PyUnicode_CheckExact(unicode))
1736
0
        return 0;
1737
#ifdef Py_DEBUG
1738
    /* singleton refcount is greater than 1 */
1739
    assert(!unicode_is_singleton(unicode));
1740
#endif
1741
62.4M
    return 1;
1742
62.4M
}
1743
1744
static int
1745
unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1746
510k
{
1747
510k
    PyObject *unicode;
1748
510k
    Py_ssize_t old_length;
1749
1750
510k
    assert(p_unicode != NULL);
1751
510k
    unicode = *p_unicode;
1752
1753
510k
    assert(unicode != NULL);
1754
510k
    assert(PyUnicode_Check(unicode));
1755
510k
    assert(0 <= length);
1756
1757
510k
    old_length = PyUnicode_GET_LENGTH(unicode);
1758
510k
    if (old_length == length)
1759
0
        return 0;
1760
1761
510k
    if (length == 0) {
1762
0
        PyObject *empty = _PyUnicode_GetEmpty();
1763
0
        Py_SETREF(*p_unicode, empty);
1764
0
        return 0;
1765
0
    }
1766
1767
510k
    if (!_PyUnicode_IsModifiable(unicode)) {
1768
0
        PyObject *copy = resize_copy(unicode, length);
1769
0
        if (copy == NULL)
1770
0
            return -1;
1771
0
        Py_SETREF(*p_unicode, copy);
1772
0
        return 0;
1773
0
    }
1774
1775
510k
    if (PyUnicode_IS_COMPACT(unicode)) {
1776
510k
        PyObject *new_unicode = _PyUnicode_ResizeCompact(unicode, length);
1777
510k
        if (new_unicode == NULL)
1778
0
            return -1;
1779
510k
        *p_unicode = new_unicode;
1780
510k
        return 0;
1781
510k
    }
1782
0
    return resize_inplace(unicode, length);
1783
510k
}
1784
1785
int
1786
PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1787
0
{
1788
0
    PyObject *unicode;
1789
0
    if (p_unicode == NULL) {
1790
0
        PyErr_BadInternalCall();
1791
0
        return -1;
1792
0
    }
1793
0
    unicode = *p_unicode;
1794
0
    if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
1795
0
    {
1796
0
        PyErr_BadInternalCall();
1797
0
        return -1;
1798
0
    }
1799
0
    return unicode_resize(p_unicode, length);
1800
0
}
1801
1802
static PyObject*
1803
get_latin1_char(Py_UCS1 ch)
1804
214M
{
1805
214M
    PyObject *o = LATIN1(ch);
1806
214M
    return o;
1807
214M
}
1808
1809
static PyObject*
1810
unicode_char(Py_UCS4 ch)
1811
256M
{
1812
256M
    PyObject *unicode;
1813
1814
256M
    assert(ch <= MAX_UNICODE);
1815
1816
256M
    if (ch < 256) {
1817
170M
        return get_latin1_char(ch);
1818
170M
    }
1819
1820
85.5M
    unicode = PyUnicode_New(1, ch);
1821
85.5M
    if (unicode == NULL)
1822
0
        return NULL;
1823
1824
85.5M
    assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
1825
85.5M
    if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
1826
82.3M
        PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1827
82.3M
    } else {
1828
3.28M
        assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1829
3.28M
        PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1830
3.28M
    }
1831
85.5M
    assert(_PyUnicode_CheckConsistency(unicode, 1));
1832
85.5M
    return unicode;
1833
85.5M
}
1834
1835
1836
static inline void
1837
unicode_write_widechar(int kind, void *data,
1838
                       const wchar_t *u, Py_ssize_t size,
1839
                       Py_ssize_t num_surrogates)
1840
16.4k
{
1841
16.4k
    switch (kind) {
1842
16.4k
    case PyUnicode_1BYTE_KIND:
1843
16.4k
        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char, u, u + size, data);
1844
16.4k
        break;
1845
1846
0
    case PyUnicode_2BYTE_KIND:
1847
#if SIZEOF_WCHAR_T == 2
1848
        memcpy(data, u, size * 2);
1849
#else
1850
0
        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2, u, u + size, data);
1851
0
#endif
1852
0
        break;
1853
1854
0
    case PyUnicode_4BYTE_KIND:
1855
0
    {
1856
#if SIZEOF_WCHAR_T == 2
1857
        // Convert a 16-bits wchar_t representation to UCS4, this will decode
1858
        // surrogate pairs.
1859
        const wchar_t *end = u + size;
1860
        Py_UCS4 *ucs4_out = (Py_UCS4*)data;
1861
#  ifndef NDEBUG
1862
        Py_UCS4 *ucs4_end = (Py_UCS4*)data + (size - num_surrogates);
1863
#  endif
1864
        for (const wchar_t *iter = u; iter < end; ) {
1865
            assert(ucs4_out < ucs4_end);
1866
            if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1867
                && (iter+1) < end
1868
                && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1869
            {
1870
                *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1871
                iter += 2;
1872
            }
1873
            else {
1874
                *ucs4_out++ = *iter;
1875
                iter++;
1876
            }
1877
        }
1878
        assert(ucs4_out == ucs4_end);
1879
#else
1880
0
        assert(num_surrogates == 0);
1881
0
        memcpy(data, u, size * 4);
1882
0
#endif
1883
0
        break;
1884
0
    }
1885
0
    default:
1886
0
        Py_UNREACHABLE();
1887
16.4k
    }
1888
16.4k
}
1889
1890
1891
PyObject *
1892
PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
1893
16.4k
{
1894
16.4k
    PyObject *unicode;
1895
16.4k
    Py_UCS4 maxchar = 0;
1896
16.4k
    Py_ssize_t num_surrogates;
1897
1898
16.4k
    if (u == NULL && size != 0) {
1899
0
        PyErr_BadInternalCall();
1900
0
        return NULL;
1901
0
    }
1902
1903
16.4k
    if (size == -1) {
1904
576
        size = wcslen(u);
1905
576
    }
1906
1907
    /* If the Unicode data is known at construction time, we can apply
1908
       some optimizations which share commonly used objects. */
1909
1910
    /* Optimization for empty strings */
1911
16.4k
    if (size == 0)
1912
32
        _Py_RETURN_UNICODE_EMPTY();
1913
1914
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
1915
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
1916
       non-Unicode locales and hence needs conversion to UCS-4 first. */
1917
    if (_Py_LocaleUsesNonUnicodeWchar()) {
1918
        wchar_t* converted = _Py_DecodeNonUnicodeWchar(u, size);
1919
        if (!converted) {
1920
            return NULL;
1921
        }
1922
        PyObject *unicode = _PyUnicode_FromUCS4(converted, size);
1923
        PyMem_Free(converted);
1924
        return unicode;
1925
    }
1926
#endif
1927
1928
    /* Single character Unicode objects in the Latin-1 range are
1929
       shared when using this constructor */
1930
16.4k
    if (size == 1 && (Py_UCS4)*u < 256)
1931
0
        return get_latin1_char((unsigned char)*u);
1932
1933
    /* If not empty and not single character, copy the Unicode data
1934
       into the new object */
1935
16.4k
    if (find_maxchar_surrogates(u, u + size,
1936
16.4k
                                &maxchar, &num_surrogates) == -1)
1937
0
        return NULL;
1938
1939
16.4k
    unicode = PyUnicode_New(size - num_surrogates, maxchar);
1940
16.4k
    if (!unicode)
1941
0
        return NULL;
1942
1943
16.4k
    unicode_write_widechar(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1944
16.4k
                           u, size, num_surrogates);
1945
1946
16.4k
    return unicode_result(unicode);
1947
16.4k
}
1948
1949
1950
int
1951
PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *pub_writer,
1952
                              const wchar_t *str,
1953
                              Py_ssize_t size)
1954
0
{
1955
0
    _PyUnicodeWriter *writer = (_PyUnicodeWriter *)pub_writer;
1956
1957
0
    if (size < 0) {
1958
0
        size = wcslen(str);
1959
0
    }
1960
1961
0
    if (size == 0) {
1962
0
        return 0;
1963
0
    }
1964
1965
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
1966
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
1967
       non-Unicode locales and hence needs conversion to UCS-4 first. */
1968
    if (_Py_LocaleUsesNonUnicodeWchar()) {
1969
        wchar_t* converted = _Py_DecodeNonUnicodeWchar(str, size);
1970
        if (!converted) {
1971
            return -1;
1972
        }
1973
1974
        int res = PyUnicodeWriter_WriteUCS4(pub_writer, converted, size);
1975
        PyMem_Free(converted);
1976
        return res;
1977
    }
1978
#endif
1979
1980
0
    Py_UCS4 maxchar = 0;
1981
0
    Py_ssize_t num_surrogates;
1982
0
    if (find_maxchar_surrogates(str, str + size,
1983
0
                                &maxchar, &num_surrogates) == -1) {
1984
0
        return -1;
1985
0
    }
1986
1987
0
    if (_PyUnicodeWriter_Prepare(writer, size - num_surrogates, maxchar) < 0) {
1988
0
        return -1;
1989
0
    }
1990
1991
0
    int kind = writer->kind;
1992
0
    void *data = (Py_UCS1*)writer->data + writer->pos * kind;
1993
0
    unicode_write_widechar(kind, data, str, size, num_surrogates);
1994
1995
0
    writer->pos += size - num_surrogates;
1996
0
    return 0;
1997
0
}
1998
1999
2000
PyObject *
2001
PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
2002
612k
{
2003
612k
    if (size < 0) {
2004
0
        PyErr_SetString(PyExc_SystemError,
2005
0
                        "Negative size passed to PyUnicode_FromStringAndSize");
2006
0
        return NULL;
2007
0
    }
2008
612k
    if (u != NULL) {
2009
612k
        return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2010
612k
    }
2011
0
    if (size > 0) {
2012
0
        PyErr_SetString(PyExc_SystemError,
2013
0
            "NULL string with positive size with NULL passed to PyUnicode_FromStringAndSize");
2014
0
        return NULL;
2015
0
    }
2016
0
    return _PyUnicode_GetEmpty();
2017
0
}
2018
2019
PyObject *
2020
PyUnicode_FromString(const char *u)
2021
6.62M
{
2022
6.62M
    size_t size = strlen(u);
2023
6.62M
    if (size > PY_SSIZE_T_MAX) {
2024
0
        PyErr_SetString(PyExc_OverflowError, "input too long");
2025
0
        return NULL;
2026
0
    }
2027
6.62M
    return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
2028
6.62M
}
2029
2030
2031
PyObject *
2032
_PyUnicode_FromId(_Py_Identifier *id)
2033
0
{
2034
0
    PyMutex_Lock((PyMutex *)&id->mutex);
2035
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
2036
0
    struct _Py_unicode_ids *ids = &interp->unicode.ids;
2037
2038
0
    Py_ssize_t index = _Py_atomic_load_ssize(&id->index);
2039
0
    if (index < 0) {
2040
0
        struct _Py_unicode_runtime_ids *rt_ids = &interp->runtime->unicode_state.ids;
2041
2042
0
        PyMutex_Lock(&rt_ids->mutex);
2043
        // Check again to detect concurrent access. Another thread can have
2044
        // initialized the index while this thread waited for the lock.
2045
0
        index = _Py_atomic_load_ssize(&id->index);
2046
0
        if (index < 0) {
2047
0
            assert(rt_ids->next_index < PY_SSIZE_T_MAX);
2048
0
            index = rt_ids->next_index;
2049
0
            rt_ids->next_index++;
2050
0
            _Py_atomic_store_ssize(&id->index, index);
2051
0
        }
2052
0
        PyMutex_Unlock(&rt_ids->mutex);
2053
0
    }
2054
0
    assert(index >= 0);
2055
2056
0
    PyObject *obj;
2057
0
    if (index < ids->size) {
2058
0
        obj = ids->array[index];
2059
0
        if (obj) {
2060
            // Return a borrowed reference
2061
0
            goto end;
2062
0
        }
2063
0
    }
2064
2065
0
    obj = PyUnicode_DecodeUTF8Stateful(id->string, strlen(id->string),
2066
0
                                       NULL, NULL);
2067
0
    if (!obj) {
2068
0
        goto end;
2069
0
    }
2070
0
    _PyUnicode_InternImmortal(interp, &obj);
2071
2072
0
    if (index >= ids->size) {
2073
        // Overallocate to reduce the number of realloc
2074
0
        Py_ssize_t new_size = Py_MAX(index * 2, 16);
2075
0
        Py_ssize_t item_size = sizeof(ids->array[0]);
2076
0
        PyObject **new_array = PyMem_Realloc(ids->array, new_size * item_size);
2077
0
        if (new_array == NULL) {
2078
0
            PyErr_NoMemory();
2079
0
            obj = NULL;
2080
0
            goto end;
2081
0
        }
2082
0
        memset(&new_array[ids->size], 0, (new_size - ids->size) * item_size);
2083
0
        ids->array = new_array;
2084
0
        ids->size = new_size;
2085
0
    }
2086
2087
    // The array stores a strong reference
2088
0
    ids->array[index] = obj;
2089
2090
0
end:
2091
0
    PyMutex_Unlock((PyMutex *)&id->mutex);
2092
    // Return a borrowed reference
2093
0
    return obj;
2094
0
}
2095
2096
2097
static void
2098
unicode_clear_identifiers(struct _Py_unicode_state *state)
2099
0
{
2100
0
    struct _Py_unicode_ids *ids = &state->ids;
2101
0
    for (Py_ssize_t i=0; i < ids->size; i++) {
2102
0
        Py_XDECREF(ids->array[i]);
2103
0
    }
2104
0
    ids->size = 0;
2105
0
    PyMem_Free(ids->array);
2106
0
    ids->array = NULL;
2107
    // Don't reset _PyRuntime next_index: _Py_Identifier.id remains valid
2108
    // after Py_Finalize().
2109
0
}
2110
2111
2112
/* Internal function, doesn't check maximum character */
2113
2114
PyObject*
2115
_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
2116
89.2M
{
2117
89.2M
    const unsigned char *s = (const unsigned char *)buffer;
2118
89.2M
    PyObject *unicode;
2119
89.2M
    if (size == 1) {
2120
#ifdef Py_DEBUG
2121
        assert((unsigned char)s[0] < 128);
2122
#endif
2123
29.3M
        return get_latin1_char(s[0]);
2124
29.3M
    }
2125
59.8M
    unicode = PyUnicode_New(size, 127);
2126
59.8M
    if (!unicode)
2127
0
        return NULL;
2128
59.8M
    memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2129
59.8M
    assert(_PyUnicode_CheckConsistency(unicode, 1));
2130
59.8M
    return unicode;
2131
59.8M
}
2132
2133
static Py_UCS4
2134
kind_maxchar_limit(int kind)
2135
0
{
2136
0
    switch (kind) {
2137
0
    case PyUnicode_1BYTE_KIND:
2138
0
        return 0x80;
2139
0
    case PyUnicode_2BYTE_KIND:
2140
0
        return 0x100;
2141
0
    case PyUnicode_4BYTE_KIND:
2142
0
        return 0x10000;
2143
0
    default:
2144
0
        Py_UNREACHABLE();
2145
0
    }
2146
0
}
2147
2148
static PyObject*
2149
_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
2150
43.9M
{
2151
43.9M
    PyObject *res;
2152
43.9M
    unsigned char max_char;
2153
2154
43.9M
    if (size == 0) {
2155
5.30M
        _Py_RETURN_UNICODE_EMPTY();
2156
5.30M
    }
2157
43.9M
    assert(size > 0);
2158
38.6M
    if (size == 1) {
2159
12.5M
        return get_latin1_char(u[0]);
2160
12.5M
    }
2161
2162
26.0M
    max_char = ucs1lib_find_max_char(u, u + size);
2163
26.0M
    res = PyUnicode_New(size, max_char);
2164
26.0M
    if (!res)
2165
0
        return NULL;
2166
26.0M
    memcpy(PyUnicode_1BYTE_DATA(res), u, size);
2167
26.0M
    assert(_PyUnicode_CheckConsistency(res, 1));
2168
26.0M
    return res;
2169
26.0M
}
2170
2171
static PyObject*
2172
_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
2173
88.4M
{
2174
88.4M
    PyObject *res;
2175
88.4M
    Py_UCS2 max_char;
2176
2177
88.4M
    if (size == 0)
2178
11.3M
        _Py_RETURN_UNICODE_EMPTY();
2179
88.4M
    assert(size > 0);
2180
77.0M
    if (size == 1)
2181
51.1M
        return unicode_char(u[0]);
2182
2183
25.8M
    max_char = ucs2lib_find_max_char(u, u + size);
2184
25.8M
    res = PyUnicode_New(size, max_char);
2185
25.8M
    if (!res)
2186
0
        return NULL;
2187
25.8M
    if (max_char >= 256)
2188
16.1M
        memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
2189
9.69M
    else {
2190
9.69M
        _PyUnicode_CONVERT_BYTES(
2191
9.69M
            Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2192
9.69M
    }
2193
25.8M
    assert(_PyUnicode_CheckConsistency(res, 1));
2194
25.8M
    return res;
2195
25.8M
}
2196
2197
static PyObject*
2198
_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
2199
61.4M
{
2200
61.4M
    PyObject *res;
2201
61.4M
    Py_UCS4 max_char;
2202
2203
61.4M
    if (size == 0)
2204
7.29M
        _Py_RETURN_UNICODE_EMPTY();
2205
61.4M
    assert(size > 0);
2206
54.2M
    if (size == 1)
2207
37.0M
        return unicode_char(u[0]);
2208
2209
17.1M
    max_char = ucs4lib_find_max_char(u, u + size);
2210
17.1M
    res = PyUnicode_New(size, max_char);
2211
17.1M
    if (!res)
2212
0
        return NULL;
2213
17.1M
    if (max_char < 256)
2214
11.8M
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2215
17.1M
                                 PyUnicode_1BYTE_DATA(res));
2216
5.29M
    else if (max_char < 0x10000)
2217
3.72M
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2218
5.29M
                                 PyUnicode_2BYTE_DATA(res));
2219
1.57M
    else
2220
1.57M
        memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
2221
17.1M
    assert(_PyUnicode_CheckConsistency(res, 1));
2222
17.1M
    return res;
2223
17.1M
}
2224
2225
2226
int
2227
PyUnicodeWriter_WriteUCS4(PyUnicodeWriter *pub_writer,
2228
                          Py_UCS4 *str,
2229
                          Py_ssize_t size)
2230
0
{
2231
0
    _PyUnicodeWriter *writer = (_PyUnicodeWriter*)pub_writer;
2232
2233
0
    if (size < 0) {
2234
0
        PyErr_SetString(PyExc_ValueError,
2235
0
                        "size must be positive");
2236
0
        return -1;
2237
0
    }
2238
2239
0
    if (size == 0) {
2240
0
        return 0;
2241
0
    }
2242
2243
0
    Py_UCS4 max_char = ucs4lib_find_max_char(str, str + size);
2244
2245
0
    if (_PyUnicodeWriter_Prepare(writer, size, max_char) < 0) {
2246
0
        return -1;
2247
0
    }
2248
2249
0
    int kind = writer->kind;
2250
0
    void *data = (Py_UCS1*)writer->data + writer->pos * kind;
2251
0
    if (kind == PyUnicode_1BYTE_KIND) {
2252
0
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1,
2253
0
                                 str, str + size,
2254
0
                                 data);
2255
0
    }
2256
0
    else if (kind == PyUnicode_2BYTE_KIND) {
2257
0
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2,
2258
0
                                 str, str + size,
2259
0
                                 data);
2260
0
    }
2261
0
    else {
2262
0
        memcpy(data, str, size * sizeof(Py_UCS4));
2263
0
    }
2264
0
    writer->pos += size;
2265
2266
0
    return 0;
2267
0
}
2268
2269
2270
PyObject*
2271
PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2272
144M
{
2273
144M
    if (size < 0) {
2274
0
        PyErr_SetString(PyExc_ValueError, "size must be positive");
2275
0
        return NULL;
2276
0
    }
2277
144M
    switch (kind) {
2278
21.5M
    case PyUnicode_1BYTE_KIND:
2279
21.5M
        return _PyUnicode_FromUCS1(buffer, size);
2280
73.5M
    case PyUnicode_2BYTE_KIND:
2281
73.5M
        return _PyUnicode_FromUCS2(buffer, size);
2282
49.5M
    case PyUnicode_4BYTE_KIND:
2283
49.5M
        return _PyUnicode_FromUCS4(buffer, size);
2284
0
    default:
2285
0
        PyErr_SetString(PyExc_SystemError, "invalid kind");
2286
0
        return NULL;
2287
144M
    }
2288
144M
}
2289
2290
Py_UCS4
2291
_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2292
14.1M
{
2293
14.1M
    int kind;
2294
14.1M
    const void *startptr, *endptr;
2295
2296
14.1M
    assert(0 <= start);
2297
14.1M
    assert(end <= PyUnicode_GET_LENGTH(unicode));
2298
14.1M
    assert(start <= end);
2299
2300
14.1M
    if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2301
0
        return PyUnicode_MAX_CHAR_VALUE(unicode);
2302
2303
14.1M
    if (start == end)
2304
0
        return 127;
2305
2306
14.1M
    if (PyUnicode_IS_ASCII(unicode))
2307
14.1M
        return 127;
2308
2309
32.3k
    kind = PyUnicode_KIND(unicode);
2310
32.3k
    startptr = PyUnicode_DATA(unicode);
2311
32.3k
    endptr = (char *)startptr + end * kind;
2312
32.3k
    startptr = (char *)startptr + start * kind;
2313
32.3k
    switch(kind) {
2314
1.62k
    case PyUnicode_1BYTE_KIND:
2315
1.62k
        return ucs1lib_find_max_char(startptr, endptr);
2316
4.08k
    case PyUnicode_2BYTE_KIND:
2317
4.08k
        return ucs2lib_find_max_char(startptr, endptr);
2318
26.6k
    case PyUnicode_4BYTE_KIND:
2319
26.6k
        return ucs4lib_find_max_char(startptr, endptr);
2320
0
    default:
2321
0
        Py_UNREACHABLE();
2322
32.3k
    }
2323
32.3k
}
2324
2325
/* Ensure that a string uses the most efficient storage, if it is not the
2326
   case: create a new string with of the right kind. Write NULL into *p_unicode
2327
   on error. */
2328
static void
2329
unicode_adjust_maxchar(PyObject **p_unicode)
2330
0
{
2331
0
    PyObject *unicode, *copy;
2332
0
    Py_UCS4 max_char;
2333
0
    Py_ssize_t len;
2334
0
    int kind;
2335
2336
0
    assert(p_unicode != NULL);
2337
0
    unicode = *p_unicode;
2338
0
    if (PyUnicode_IS_ASCII(unicode))
2339
0
        return;
2340
2341
0
    len = PyUnicode_GET_LENGTH(unicode);
2342
0
    kind = PyUnicode_KIND(unicode);
2343
0
    if (kind == PyUnicode_1BYTE_KIND) {
2344
0
        const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2345
0
        max_char = ucs1lib_find_max_char(u, u + len);
2346
0
        if (max_char >= 128)
2347
0
            return;
2348
0
    }
2349
0
    else if (kind == PyUnicode_2BYTE_KIND) {
2350
0
        const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2351
0
        max_char = ucs2lib_find_max_char(u, u + len);
2352
0
        if (max_char >= 256)
2353
0
            return;
2354
0
    }
2355
0
    else if (kind == PyUnicode_4BYTE_KIND) {
2356
0
        const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2357
0
        max_char = ucs4lib_find_max_char(u, u + len);
2358
0
        if (max_char >= 0x10000)
2359
0
            return;
2360
0
    }
2361
0
    else
2362
0
        Py_UNREACHABLE();
2363
2364
0
    copy = PyUnicode_New(len, max_char);
2365
0
    if (copy != NULL)
2366
0
        _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2367
0
    Py_DECREF(unicode);
2368
0
    *p_unicode = copy;
2369
0
}
2370
2371
/* Return a new string holding the same characters as `unicode`.

   Raises via PyErr_BadInternalCall() and returns NULL if `unicode` is not
   a str; returns NULL if allocation fails. */
PyObject*
_PyUnicode_Copy(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadInternalCall();
        return NULL;
    }

    Py_ssize_t nchars = PyUnicode_GET_LENGTH(unicode);
    PyObject *dup = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(unicode));
    if (!dup) {
        return NULL;
    }
    assert(PyUnicode_KIND(dup) == PyUnicode_KIND(unicode));

    /* Both strings share a kind, so the raw character bytes can be
       copied directly. */
    memcpy(PyUnicode_DATA(dup), PyUnicode_DATA(unicode),
           nchars * PyUnicode_KIND(unicode));
    assert(_PyUnicode_CheckConsistency(dup, 1));
    return dup;
}
2393
2394
2395
/* Widen Unicode objects to larger buffers. Don't write terminating null
2396
   character. Return NULL on error. */
2397
2398
static void*
2399
unicode_askind(int skind, void const *data, Py_ssize_t len, int kind)
2400
12.5M
{
2401
12.5M
    void *result;
2402
2403
12.5M
    assert(skind < kind);
2404
12.5M
    switch (kind) {
2405
11.4M
    case PyUnicode_2BYTE_KIND:
2406
11.4M
        result = PyMem_New(Py_UCS2, len);
2407
11.4M
        if (!result)
2408
0
            return PyErr_NoMemory();
2409
11.4M
        assert(skind == PyUnicode_1BYTE_KIND);
2410
11.4M
        _PyUnicode_CONVERT_BYTES(
2411
11.4M
            Py_UCS1, Py_UCS2,
2412
11.4M
            (const Py_UCS1 *)data,
2413
11.4M
            ((const Py_UCS1 *)data) + len,
2414
11.4M
            result);
2415
11.4M
        return result;
2416
1.15M
    case PyUnicode_4BYTE_KIND:
2417
1.15M
        result = PyMem_New(Py_UCS4, len);
2418
1.15M
        if (!result)
2419
0
            return PyErr_NoMemory();
2420
1.15M
        if (skind == PyUnicode_2BYTE_KIND) {
2421
0
            _PyUnicode_CONVERT_BYTES(
2422
0
                Py_UCS2, Py_UCS4,
2423
0
                (const Py_UCS2 *)data,
2424
0
                ((const Py_UCS2 *)data) + len,
2425
0
                result);
2426
0
        }
2427
1.15M
        else {
2428
1.15M
            assert(skind == PyUnicode_1BYTE_KIND);
2429
1.15M
            _PyUnicode_CONVERT_BYTES(
2430
1.15M
                Py_UCS1, Py_UCS4,
2431
1.15M
                (const Py_UCS1 *)data,
2432
1.15M
                ((const Py_UCS1 *)data) + len,
2433
1.15M
                result);
2434
1.15M
        }
2435
1.15M
        return result;
2436
0
    default:
2437
0
        Py_UNREACHABLE();
2438
0
        return NULL;
2439
12.5M
    }
2440
12.5M
}
2441
2442
static Py_UCS4*
2443
as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2444
        int copy_null)
2445
75.4k
{
2446
75.4k
    int kind;
2447
75.4k
    const void *data;
2448
75.4k
    Py_ssize_t len, targetlen;
2449
75.4k
    kind = PyUnicode_KIND(string);
2450
75.4k
    data = PyUnicode_DATA(string);
2451
75.4k
    len = PyUnicode_GET_LENGTH(string);
2452
75.4k
    targetlen = len;
2453
75.4k
    if (copy_null)
2454
0
        targetlen++;
2455
75.4k
    if (!target) {
2456
0
        target = PyMem_New(Py_UCS4, targetlen);
2457
0
        if (!target) {
2458
0
            PyErr_NoMemory();
2459
0
            return NULL;
2460
0
        }
2461
0
    }
2462
75.4k
    else {
2463
75.4k
        if (targetsize < targetlen) {
2464
0
            PyErr_Format(PyExc_SystemError,
2465
0
                         "string is longer than the buffer");
2466
0
            if (copy_null && 0 < targetsize)
2467
0
                target[0] = 0;
2468
0
            return NULL;
2469
0
        }
2470
75.4k
    }
2471
75.4k
    if (kind == PyUnicode_1BYTE_KIND) {
2472
57.0k
        const Py_UCS1 *start = (const Py_UCS1 *) data;
2473
57.0k
        _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2474
57.0k
    }
2475
18.3k
    else if (kind == PyUnicode_2BYTE_KIND) {
2476
13.8k
        const Py_UCS2 *start = (const Py_UCS2 *) data;
2477
13.8k
        _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2478
13.8k
    }
2479
4.50k
    else if (kind == PyUnicode_4BYTE_KIND) {
2480
4.50k
        memcpy(target, data, len * sizeof(Py_UCS4));
2481
4.50k
    }
2482
0
    else {
2483
0
        Py_UNREACHABLE();
2484
0
    }
2485
75.4k
    if (copy_null)
2486
0
        target[len] = 0;
2487
75.4k
    return target;
2488
75.4k
}
2489
2490
Py_UCS4*
2491
PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2492
                 int copy_null)
2493
75.4k
{
2494
75.4k
    if (target == NULL || targetsize < 0) {
2495
0
        PyErr_BadInternalCall();
2496
0
        return NULL;
2497
0
    }
2498
75.4k
    return as_ucs4(string, target, targetsize, copy_null);
2499
75.4k
}
2500
2501
Py_UCS4*
2502
PyUnicode_AsUCS4Copy(PyObject *string)
2503
0
{
2504
0
    return as_ucs4(string, NULL, 0, 1);
2505
0
}
2506
2507
/* Maximum number of characters required for output of %jo or %jd or %p.
   We need at most ceil(log8(256)*sizeof(intmax_t)) digits,
   plus 1 for the sign, plus 2 for the 0x prefix (for %p),
   plus 1 for the terminal NUL. */
2511
#define MAX_INTMAX_CHARS (5 + (sizeof(intmax_t)*8-1) / 3)
2512
2513
static int
2514
unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2515
                             Py_ssize_t width, Py_ssize_t precision, int flags)
2516
27.0k
{
2517
27.0k
    Py_ssize_t length, fill, arglen;
2518
27.0k
    Py_UCS4 maxchar;
2519
2520
27.0k
    length = PyUnicode_GET_LENGTH(str);
2521
27.0k
    if ((precision == -1 || precision >= length)
2522
27.0k
        && width <= length)
2523
27.0k
        return _PyUnicodeWriter_WriteStr(writer, str);
2524
2525
48
    if (precision != -1)
2526
48
        length = Py_MIN(precision, length);
2527
2528
48
    arglen = Py_MAX(length, width);
2529
48
    if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2530
23
        maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2531
25
    else
2532
25
        maxchar = writer->maxchar;
2533
2534
48
    if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2535
0
        return -1;
2536
2537
48
    fill = Py_MAX(width - length, 0);
2538
48
    if (fill && !(flags & F_LJUST)) {
2539
0
        if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2540
0
            return -1;
2541
0
        writer->pos += fill;
2542
0
    }
2543
2544
48
    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2545
48
                                  str, 0, length);
2546
48
    writer->pos += length;
2547
2548
48
    if (fill && (flags & F_LJUST)) {
2549
0
        if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2550
0
            return -1;
2551
0
        writer->pos += fill;
2552
0
    }
2553
2554
48
    return 0;
2555
48
}
2556
2557
static int
2558
unicode_fromformat_write_utf8(_PyUnicodeWriter *writer, const char *str,
2559
                              Py_ssize_t width, Py_ssize_t precision, int flags)
2560
5.46M
{
2561
    /* UTF-8 */
2562
5.46M
    Py_ssize_t *pconsumed = NULL;
2563
5.46M
    Py_ssize_t length;
2564
5.46M
    if (precision == -1) {
2565
195k
        length = strlen(str);
2566
195k
    }
2567
5.27M
    else {
2568
5.27M
        length = 0;
2569
21.6M
        while (length < precision && str[length]) {
2570
16.4M
            length++;
2571
16.4M
        }
2572
5.27M
        if (length == precision) {
2573
            /* The input string is not NUL-terminated.  If it ends with an
2574
             * incomplete UTF-8 sequence, truncate the string just before it.
2575
             * Incomplete sequences in the middle and sequences which cannot
2576
             * be valid prefixes are still treated as errors and replaced
2577
             * with \xfffd. */
2578
1.88k
            pconsumed = &length;
2579
1.88k
        }
2580
5.27M
    }
2581
2582
5.46M
    if (width < 0) {
2583
5.46M
        return _PyUnicode_DecodeUTF8Writer(writer, str, length,
2584
5.46M
                                           _Py_ERROR_REPLACE, "replace", pconsumed);
2585
5.46M
    }
2586
2587
0
    PyObject *unicode = PyUnicode_DecodeUTF8Stateful(str, length,
2588
0
                                                     "replace", pconsumed);
2589
0
    if (unicode == NULL)
2590
0
        return -1;
2591
2592
0
    int res = unicode_fromformat_write_str(writer, unicode,
2593
0
                                           width, -1, flags);
2594
0
    Py_DECREF(unicode);
2595
0
    return res;
2596
0
}
2597
2598
static int
2599
unicode_fromformat_write_wcstr(_PyUnicodeWriter *writer, const wchar_t *str,
2600
                              Py_ssize_t width, Py_ssize_t precision, int flags)
2601
0
{
2602
0
    Py_ssize_t length;
2603
0
    if (precision == -1) {
2604
0
        length = wcslen(str);
2605
0
    }
2606
0
    else {
2607
0
        length = 0;
2608
0
        while (length < precision && str[length]) {
2609
0
            length++;
2610
0
        }
2611
0
    }
2612
2613
0
    if (width < 0) {
2614
0
        return PyUnicodeWriter_WriteWideChar((PyUnicodeWriter*)writer,
2615
0
                                             str, length);
2616
0
    }
2617
2618
0
    PyObject *unicode = PyUnicode_FromWideChar(str, length);
2619
0
    if (unicode == NULL)
2620
0
        return -1;
2621
2622
0
    int res = unicode_fromformat_write_str(writer, unicode, width, -1, flags);
2623
0
    Py_DECREF(unicode);
2624
0
    return res;
2625
0
}
2626
2627
0
#define F_LONG 1
2628
0
#define F_LONGLONG 2
2629
78.7k
#define F_SIZE 3
2630
0
#define F_PTRDIFF 4
2631
0
#define F_INTMAX 5
2632
2633
static const char*
2634
unicode_fromformat_arg(_PyUnicodeWriter *writer,
2635
                       const char *f, va_list *vargs)
2636
31.9M
{
2637
31.9M
    const char *p;
2638
31.9M
    Py_ssize_t len;
2639
31.9M
    int flags = 0;
2640
31.9M
    Py_ssize_t width;
2641
31.9M
    Py_ssize_t precision;
2642
2643
31.9M
    p = f;
2644
31.9M
    f++;
2645
31.9M
    if (*f == '%') {
2646
5.25M
        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
2647
0
            return NULL;
2648
5.25M
        f++;
2649
5.25M
        return f;
2650
5.25M
    }
2651
2652
    /* Parse flags. Example: "%-i" => flags=F_LJUST. */
2653
    /* Flags '+', ' ' and '#' are not particularly useful.
2654
     * They are not worth the implementation and maintenance costs.
2655
     * In addition, '#' should add "0" for "o" conversions for compatibility
2656
     * with printf, but it would confuse Python users. */
2657
26.7M
    while (1) {
2658
26.7M
        switch (*f++) {
2659
0
        case '-': flags |= F_LJUST; continue;
2660
2.16k
        case '0': flags |= F_ZERO; continue;
2661
0
        case '#': flags |= F_ALT; continue;
2662
26.7M
        }
2663
26.7M
        f--;
2664
26.7M
        break;
2665
26.7M
    }
2666
2667
    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2668
26.7M
    width = -1;
2669
26.7M
    if (*f == '*') {
2670
0
        width = va_arg(*vargs, int);
2671
0
        if (width < 0) {
2672
0
            flags |= F_LJUST;
2673
0
            width = -width;
2674
0
        }
2675
0
        f++;
2676
0
    }
2677
26.7M
    else if (Py_ISDIGIT((unsigned)*f)) {
2678
2.16k
        width = *f - '0';
2679
2.16k
        f++;
2680
2.16k
        while (Py_ISDIGIT((unsigned)*f)) {
2681
0
            if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2682
0
                PyErr_SetString(PyExc_ValueError,
2683
0
                                "width too big");
2684
0
                return NULL;
2685
0
            }
2686
0
            width = (width * 10) + (*f - '0');
2687
0
            f++;
2688
0
        }
2689
2.16k
    }
2690
26.7M
    precision = -1;
2691
26.7M
    if (*f == '.') {
2692
5.27M
        f++;
2693
5.27M
        if (*f == '*') {
2694
0
            precision = va_arg(*vargs, int);
2695
0
            if (precision < 0) {
2696
0
                precision = -2;
2697
0
            }
2698
0
            f++;
2699
0
        }
2700
5.27M
        else if (Py_ISDIGIT((unsigned)*f)) {
2701
5.27M
            precision = (*f - '0');
2702
5.27M
            f++;
2703
15.8M
            while (Py_ISDIGIT((unsigned)*f)) {
2704
10.5M
                if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2705
0
                    PyErr_SetString(PyExc_ValueError,
2706
0
                                    "precision too big");
2707
0
                    return NULL;
2708
0
                }
2709
10.5M
                precision = (precision * 10) + (*f - '0');
2710
10.5M
                f++;
2711
10.5M
            }
2712
5.27M
        }
2713
5.27M
    }
2714
2715
26.7M
    int sizemod = 0;
2716
26.7M
    if (*f == 'l') {
2717
0
        if (f[1] == 'l') {
2718
0
            sizemod = F_LONGLONG;
2719
0
            f += 2;
2720
0
        }
2721
0
        else {
2722
0
            sizemod = F_LONG;
2723
0
            ++f;
2724
0
        }
2725
0
    }
2726
26.7M
    else if (*f == 'z') {
2727
39.3k
        sizemod = F_SIZE;
2728
39.3k
        ++f;
2729
39.3k
    }
2730
26.6M
    else if (*f == 't') {
2731
0
        sizemod = F_PTRDIFF;
2732
0
        ++f;
2733
0
    }
2734
26.6M
    else if (*f == 'j') {
2735
0
        sizemod = F_INTMAX;
2736
0
        ++f;
2737
0
    }
2738
26.7M
    if (f[0] != '\0' && f[1] == '\0')
2739
5.34M
        writer->overallocate = 0;
2740
2741
26.7M
    switch (*f) {
2742
15.9M
    case 'd': case 'i': case 'o': case 'u': case 'x': case 'X':
2743
15.9M
        break;
2744
5.26M
    case 'c': case 'p':
2745
5.26M
        if (sizemod || width >= 0 || precision >= 0) goto invalid_format;
2746
5.26M
        break;
2747
5.46M
    case 's':
2748
5.46M
    case 'V':
2749
5.46M
        if (sizemod && sizemod != F_LONG) goto invalid_format;
2750
5.46M
        break;
2751
5.46M
    default:
2752
27.0k
        if (sizemod) goto invalid_format;
2753
27.0k
        break;
2754
26.7M
    }
2755
2756
26.7M
    switch (*f) {
2757
5.26M
    case 'c':
2758
5.26M
    {
2759
5.26M
        int ordinal = va_arg(*vargs, int);
2760
5.26M
        if (ordinal < 0 || ordinal > MAX_UNICODE) {
2761
0
            PyErr_SetString(PyExc_OverflowError,
2762
0
                            "character argument not in range(0x110000)");
2763
0
            return NULL;
2764
0
        }
2765
5.26M
        if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
2766
0
            return NULL;
2767
5.26M
        break;
2768
5.26M
    }
2769
2770
15.9M
    case 'd': case 'i':
2771
15.9M
    case 'o': case 'u': case 'x': case 'X':
2772
15.9M
    {
2773
15.9M
        char buffer[MAX_INTMAX_CHARS];
2774
2775
        // Fill buffer using sprinf, with one of many possible format
2776
        // strings, like "%llX" for `long long` in hexadecimal.
2777
        // The type/size is in `sizemod`; the format is in `*f`.
2778
2779
        // Use macros with nested switches to keep the sprintf format strings
2780
        // as compile-time literals, avoiding warnings and maybe allowing
2781
        // optimizations.
2782
2783
        // `SPRINT` macro does one sprintf
2784
        // Example usage: SPRINT("l", "X", unsigned long) expands to
2785
        // sprintf(buffer, "%" "l" "X", va_arg(*vargs, unsigned long))
2786
15.9M
        #define SPRINT(SIZE_SPEC, FMT_CHAR, TYPE) \
2787
15.9M
            sprintf(buffer, "%" SIZE_SPEC FMT_CHAR, va_arg(*vargs, TYPE))
2788
2789
        // One inner switch to handle all format variants
2790
15.9M
        #define DO_SPRINTS(SIZE_SPEC, SIGNED_TYPE, UNSIGNED_TYPE)             \
2791
15.9M
            switch (*f) {                                                     \
2792
0
                case 'o': len = SPRINT(SIZE_SPEC, "o", UNSIGNED_TYPE); break; \
2793
0
                case 'u': len = SPRINT(SIZE_SPEC, "u", UNSIGNED_TYPE); break; \
2794
1.57k
                case 'x': len = SPRINT(SIZE_SPEC, "x", UNSIGNED_TYPE); break; \
2795
1.13k
                case 'X': len = SPRINT(SIZE_SPEC, "X", UNSIGNED_TYPE); break; \
2796
15.9M
                default:  len = SPRINT(SIZE_SPEC, "d", SIGNED_TYPE); break;   \
2797
15.9M
            }
2798
2799
        // Outer switch to handle all the sizes/types
2800
15.9M
        switch (sizemod) {
2801
0
            case F_LONG:     DO_SPRINTS("l", long, unsigned long); break;
2802
0
            case F_LONGLONG: DO_SPRINTS("ll", long long, unsigned long long); break;
2803
39.3k
            case F_SIZE:     DO_SPRINTS("z", Py_ssize_t, size_t); break;
2804
0
            case F_PTRDIFF:  DO_SPRINTS("t", ptrdiff_t, ptrdiff_t); break;
2805
0
            case F_INTMAX:   DO_SPRINTS("j", intmax_t, uintmax_t); break;
2806
15.9M
            default:         DO_SPRINTS("", int, unsigned int); break;
2807
15.9M
        }
2808
15.9M
        #undef SPRINT
2809
15.9M
        #undef DO_SPRINTS
2810
2811
15.9M
        assert(len >= 0);
2812
2813
15.9M
        int sign = (buffer[0] == '-');
2814
15.9M
        len -= sign;
2815
2816
15.9M
        precision = Py_MAX(precision, len);
2817
15.9M
        width = Py_MAX(width, precision + sign);
2818
15.9M
        if ((flags & F_ZERO) && !(flags & F_LJUST)) {
2819
2.16k
            precision = width - sign;
2820
2.16k
        }
2821
2822
15.9M
        Py_ssize_t spacepad = Py_MAX(width - precision - sign, 0);
2823
15.9M
        Py_ssize_t zeropad = Py_MAX(precision - len, 0);
2824
2825
15.9M
        if (_PyUnicodeWriter_Prepare(writer, width, 127) == -1)
2826
0
            return NULL;
2827
2828
15.9M
        if (spacepad && !(flags & F_LJUST)) {
2829
0
            if (PyUnicode_Fill(writer->buffer, writer->pos, spacepad, ' ') == -1)
2830
0
                return NULL;
2831
0
            writer->pos += spacepad;
2832
0
        }
2833
2834
15.9M
        if (sign) {
2835
0
            if (_PyUnicodeWriter_WriteChar(writer, '-') == -1)
2836
0
                return NULL;
2837
0
        }
2838
2839
15.9M
        if (zeropad) {
2840
644
            if (PyUnicode_Fill(writer->buffer, writer->pos, zeropad, '0') == -1)
2841
0
                return NULL;
2842
644
            writer->pos += zeropad;
2843
644
        }
2844
2845
15.9M
        if (_PyUnicodeWriter_WriteASCIIString(writer, &buffer[sign], len) < 0)
2846
0
            return NULL;
2847
2848
15.9M
        if (spacepad && (flags & F_LJUST)) {
2849
0
            if (PyUnicode_Fill(writer->buffer, writer->pos, spacepad, ' ') == -1)
2850
0
                return NULL;
2851
0
            writer->pos += spacepad;
2852
0
        }
2853
15.9M
        break;
2854
15.9M
    }
2855
2856
15.9M
    case 'p':
2857
0
    {
2858
0
        char number[MAX_INTMAX_CHARS];
2859
2860
0
        len = sprintf(number, "%p", va_arg(*vargs, void*));
2861
0
        assert(len >= 0);
2862
2863
        /* %p is ill-defined:  ensure leading 0x. */
2864
0
        if (number[1] == 'X')
2865
0
            number[1] = 'x';
2866
0
        else if (number[1] != 'x') {
2867
0
            memmove(number + 2, number,
2868
0
                    strlen(number) + 1);
2869
0
            number[0] = '0';
2870
0
            number[1] = 'x';
2871
0
            len += 2;
2872
0
        }
2873
2874
0
        if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
2875
0
            return NULL;
2876
0
        break;
2877
0
    }
2878
2879
5.46M
    case 's':
2880
5.46M
    {
2881
5.46M
        if (sizemod) {
2882
0
            const wchar_t *s = va_arg(*vargs, const wchar_t*);
2883
0
            if (unicode_fromformat_write_wcstr(writer, s, width, precision, flags) < 0)
2884
0
                return NULL;
2885
0
        }
2886
5.46M
        else {
2887
            /* UTF-8 */
2888
5.46M
            const char *s = va_arg(*vargs, const char*);
2889
5.46M
            if (unicode_fromformat_write_utf8(writer, s, width, precision, flags) < 0)
2890
0
                return NULL;
2891
5.46M
        }
2892
5.46M
        break;
2893
5.46M
    }
2894
2895
5.46M
    case 'U':
2896
26.3k
    {
2897
26.3k
        PyObject *obj = va_arg(*vargs, PyObject *);
2898
26.3k
        assert(obj && _PyUnicode_CHECK(obj));
2899
2900
26.3k
        if (unicode_fromformat_write_str(writer, obj, width, precision, flags) == -1)
2901
0
            return NULL;
2902
26.3k
        break;
2903
26.3k
    }
2904
2905
26.3k
    case 'V':
2906
548
    {
2907
548
        PyObject *obj = va_arg(*vargs, PyObject *);
2908
548
        const char *str;
2909
548
        const wchar_t *wstr;
2910
548
        if (sizemod) {
2911
0
            wstr = va_arg(*vargs, const wchar_t*);
2912
0
        }
2913
548
        else {
2914
548
            str = va_arg(*vargs, const char *);
2915
548
        }
2916
548
        if (obj) {
2917
0
            assert(_PyUnicode_CHECK(obj));
2918
0
            if (unicode_fromformat_write_str(writer, obj, width, precision, flags) == -1)
2919
0
                return NULL;
2920
0
        }
2921
548
        else if (sizemod) {
2922
0
            assert(wstr != NULL);
2923
0
            if (unicode_fromformat_write_wcstr(writer, wstr, width, precision, flags) < 0)
2924
0
                return NULL;
2925
0
        }
2926
548
        else {
2927
548
            assert(str != NULL);
2928
548
            if (unicode_fromformat_write_utf8(writer, str, width, precision, flags) < 0)
2929
0
                return NULL;
2930
548
        }
2931
548
        break;
2932
548
    }
2933
2934
548
    case 'S':
2935
41
    {
2936
41
        PyObject *obj = va_arg(*vargs, PyObject *);
2937
41
        PyObject *str;
2938
41
        assert(obj);
2939
41
        str = PyObject_Str(obj);
2940
41
        if (!str)
2941
0
            return NULL;
2942
41
        if (unicode_fromformat_write_str(writer, str, width, precision, flags) == -1) {
2943
0
            Py_DECREF(str);
2944
0
            return NULL;
2945
0
        }
2946
41
        Py_DECREF(str);
2947
41
        break;
2948
41
    }
2949
2950
655
    case 'R':
2951
655
    {
2952
655
        PyObject *obj = va_arg(*vargs, PyObject *);
2953
655
        PyObject *repr;
2954
655
        assert(obj);
2955
655
        repr = PyObject_Repr(obj);
2956
655
        if (!repr)
2957
0
            return NULL;
2958
655
        if (unicode_fromformat_write_str(writer, repr, width, precision, flags) == -1) {
2959
0
            Py_DECREF(repr);
2960
0
            return NULL;
2961
0
        }
2962
655
        Py_DECREF(repr);
2963
655
        break;
2964
655
    }
2965
2966
0
    case 'A':
2967
0
    {
2968
0
        PyObject *obj = va_arg(*vargs, PyObject *);
2969
0
        PyObject *ascii;
2970
0
        assert(obj);
2971
0
        ascii = PyObject_ASCII(obj);
2972
0
        if (!ascii)
2973
0
            return NULL;
2974
0
        if (unicode_fromformat_write_str(writer, ascii, width, precision, flags) == -1) {
2975
0
            Py_DECREF(ascii);
2976
0
            return NULL;
2977
0
        }
2978
0
        Py_DECREF(ascii);
2979
0
        break;
2980
0
    }
2981
2982
0
    case 'T':
2983
0
    {
2984
0
        PyObject *obj = va_arg(*vargs, PyObject *);
2985
0
        PyTypeObject *type = (PyTypeObject *)Py_NewRef(Py_TYPE(obj));
2986
2987
0
        PyObject *type_name;
2988
0
        if (flags & F_ALT) {
2989
0
            type_name = _PyType_GetFullyQualifiedName(type, ':');
2990
0
        }
2991
0
        else {
2992
0
            type_name = PyType_GetFullyQualifiedName(type);
2993
0
        }
2994
0
        Py_DECREF(type);
2995
0
        if (!type_name) {
2996
0
            return NULL;
2997
0
        }
2998
2999
0
        if (unicode_fromformat_write_str(writer, type_name,
3000
0
                                         width, precision, flags) == -1) {
3001
0
            Py_DECREF(type_name);
3002
0
            return NULL;
3003
0
        }
3004
0
        Py_DECREF(type_name);
3005
0
        break;
3006
0
    }
3007
3008
0
    case 'N':
3009
0
    {
3010
0
        PyObject *type_raw = va_arg(*vargs, PyObject *);
3011
0
        assert(type_raw != NULL);
3012
3013
0
        if (!PyType_Check(type_raw)) {
3014
0
            PyErr_SetString(PyExc_TypeError, "%N argument must be a type");
3015
0
            return NULL;
3016
0
        }
3017
0
        PyTypeObject *type = (PyTypeObject*)type_raw;
3018
3019
0
        PyObject *type_name;
3020
0
        if (flags & F_ALT) {
3021
0
            type_name = _PyType_GetFullyQualifiedName(type, ':');
3022
0
        }
3023
0
        else {
3024
0
            type_name = PyType_GetFullyQualifiedName(type);
3025
0
        }
3026
0
        if (!type_name) {
3027
0
            return NULL;
3028
0
        }
3029
0
        if (unicode_fromformat_write_str(writer, type_name,
3030
0
                                         width, precision, flags) == -1) {
3031
0
            Py_DECREF(type_name);
3032
0
            return NULL;
3033
0
        }
3034
0
        Py_DECREF(type_name);
3035
0
        break;
3036
0
    }
3037
3038
0
    default:
3039
0
    invalid_format:
3040
0
        PyErr_Format(PyExc_SystemError, "invalid format string: %s", p);
3041
0
        return NULL;
3042
26.7M
    }
3043
3044
26.7M
    f++;
3045
26.7M
    return f;
3046
26.7M
}
3047
3048
static int
3049
unicode_from_format(_PyUnicodeWriter *writer, const char *format, va_list vargs)
3050
13.4M
{
3051
13.4M
    Py_ssize_t len = strlen(format);
3052
13.4M
    writer->min_length += len + 100;
3053
13.4M
    writer->overallocate = 1;
3054
3055
    // Copy varags to be able to pass a reference to a subfunction.
3056
13.4M
    va_list vargs2;
3057
13.4M
    va_copy(vargs2, vargs);
3058
3059
    // _PyUnicodeWriter_WriteASCIIString() below requires the format string
3060
    // to be encoded to ASCII.
3061
13.4M
    int is_ascii = (ucs1lib_find_max_char((Py_UCS1*)format, (Py_UCS1*)format + len) < 128);
3062
13.4M
    if (!is_ascii) {
3063
0
        Py_ssize_t i;
3064
0
        for (i=0; i < len && (unsigned char)format[i] <= 127; i++);
3065
0
        PyErr_Format(PyExc_ValueError,
3066
0
            "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3067
0
            "string, got a non-ASCII byte: 0x%02x",
3068
0
            (unsigned char)format[i]);
3069
0
        goto fail;
3070
0
    }
3071
3072
74.8M
    for (const char *f = format; *f; ) {
3073
61.4M
        if (*f == '%') {
3074
31.9M
            f = unicode_fromformat_arg(writer, f, &vargs2);
3075
31.9M
            if (f == NULL)
3076
0
                goto fail;
3077
31.9M
        }
3078
29.4M
        else {
3079
29.4M
            const char *p = strchr(f, '%');
3080
29.4M
            if (p != NULL) {
3081
21.4M
                len = p - f;
3082
21.4M
            }
3083
8.07M
            else {
3084
8.07M
                len = strlen(f);
3085
8.07M
                writer->overallocate = 0;
3086
8.07M
            }
3087
3088
29.4M
            if (_PyUnicodeWriter_WriteASCIIString(writer, f, len) < 0) {
3089
0
                goto fail;
3090
0
            }
3091
29.4M
            f += len;
3092
29.4M
        }
3093
61.4M
    }
3094
13.4M
    va_end(vargs2);
3095
13.4M
    return 0;
3096
3097
0
  fail:
3098
0
    va_end(vargs2);
3099
0
    return -1;
3100
13.4M
}
3101
3102
PyObject *
3103
PyUnicode_FromFormatV(const char *format, va_list vargs)
3104
13.4M
{
3105
13.4M
    _PyUnicodeWriter writer;
3106
13.4M
    _PyUnicodeWriter_Init(&writer);
3107
3108
13.4M
    if (unicode_from_format(&writer, format, vargs) < 0) {
3109
0
        _PyUnicodeWriter_Dealloc(&writer);
3110
0
        return NULL;
3111
0
    }
3112
13.4M
    return _PyUnicodeWriter_Finish(&writer);
3113
13.4M
}
3114
3115
/* Variadic front end of PyUnicode_FromFormatV(): packs the trailing
 * arguments into a va_list and returns whatever that function returns. */
PyObject *
PyUnicode_FromFormat(const char *format, ...)
{
    va_list ap;

    va_start(ap, format);
    PyObject *result = PyUnicode_FromFormatV(format, ap);
    va_end(ap);

    return result;
}
3126
3127
int
3128
PyUnicodeWriter_Format(PyUnicodeWriter *writer, const char *format, ...)
3129
0
{
3130
0
    va_list vargs;
3131
0
    va_start(vargs, format);
3132
0
    int res = _PyUnicodeWriter_FormatV(writer, format, vargs);
3133
0
    va_end(vargs);
3134
0
    return res;
3135
0
}
3136
3137
int
3138
_PyUnicodeWriter_FormatV(PyUnicodeWriter *writer, const char *format,
3139
                         va_list vargs)
3140
0
{
3141
0
    _PyUnicodeWriter *_writer = (_PyUnicodeWriter*)writer;
3142
0
    Py_ssize_t old_pos = _writer->pos;
3143
3144
0
    int res = unicode_from_format(_writer, format, vargs);
3145
3146
0
    if (res < 0) {
3147
0
        _writer->pos = old_pos;
3148
0
    }
3149
0
    return res;
3150
0
}
3151
3152
static Py_ssize_t
3153
unicode_get_widechar_size(PyObject *unicode)
3154
6.86k
{
3155
6.86k
    Py_ssize_t res;
3156
3157
6.86k
    assert(unicode != NULL);
3158
6.86k
    assert(_PyUnicode_CHECK(unicode));
3159
3160
6.86k
    res = _PyUnicode_LENGTH(unicode);
3161
#if SIZEOF_WCHAR_T == 2
3162
    if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3163
        const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3164
        const Py_UCS4 *end = s + res;
3165
        for (; s < end; ++s) {
3166
            if (*s > 0xFFFF) {
3167
                ++res;
3168
            }
3169
        }
3170
    }
3171
#endif
3172
6.86k
    return res;
3173
6.86k
}
3174
3175
static void
3176
unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3177
6.86k
{
3178
6.86k
    assert(unicode != NULL);
3179
6.86k
    assert(_PyUnicode_CHECK(unicode));
3180
3181
6.86k
    if (PyUnicode_KIND(unicode) == sizeof(wchar_t)) {
3182
0
        memcpy(w, PyUnicode_DATA(unicode), size * sizeof(wchar_t));
3183
0
        return;
3184
0
    }
3185
3186
6.86k
    if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3187
6.86k
        const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3188
588k
        for (; size--; ++s, ++w) {
3189
581k
            *w = *s;
3190
581k
        }
3191
6.86k
    }
3192
0
    else {
3193
0
#if SIZEOF_WCHAR_T == 4
3194
0
        assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3195
0
        const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3196
0
        for (; size--; ++s, ++w) {
3197
0
            *w = *s;
3198
0
        }
3199
#else
3200
        assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3201
        const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3202
        for (; size--; ++s, ++w) {
3203
            Py_UCS4 ch = *s;
3204
            if (ch > 0xFFFF) {
3205
                assert(ch <= MAX_UNICODE);
3206
                /* encode surrogate pair in this case */
3207
                *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3208
                if (!size--)
3209
                    break;
3210
                *w = Py_UNICODE_LOW_SURROGATE(ch);
3211
            }
3212
            else {
3213
                *w = ch;
3214
            }
3215
        }
3216
#endif
3217
0
    }
3218
6.86k
}
3219
3220
#ifdef HAVE_WCHAR_H
3221
3222
/* Convert a Unicode object to a wide character string.
3223
3224
   - If w is NULL: return the number of wide characters (including the null
3225
     character) required to convert the unicode object. Ignore size argument.
3226
3227
   - Otherwise: return the number of wide characters (excluding the null
3228
     character) written into w. Write at most size wide characters (including
3229
     the null character). */
3230
Py_ssize_t
3231
PyUnicode_AsWideChar(PyObject *unicode,
3232
                     wchar_t *w,
3233
                     Py_ssize_t size)
3234
5.61k
{
3235
5.61k
    Py_ssize_t res;
3236
3237
5.61k
    if (unicode == NULL) {
3238
0
        PyErr_BadInternalCall();
3239
0
        return -1;
3240
0
    }
3241
5.61k
    if (!PyUnicode_Check(unicode)) {
3242
0
        PyErr_BadArgument();
3243
0
        return -1;
3244
0
    }
3245
3246
5.61k
    res = unicode_get_widechar_size(unicode);
3247
5.61k
    if (w == NULL) {
3248
0
        return res + 1;
3249
0
    }
3250
3251
5.61k
    if (size > res) {
3252
5.61k
        size = res + 1;
3253
5.61k
    }
3254
0
    else {
3255
0
        res = size;
3256
0
    }
3257
5.61k
    unicode_copy_as_widechar(unicode, w, size);
3258
3259
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3260
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
3261
       non-Unicode locales and hence needs conversion first. */
3262
    if (_Py_LocaleUsesNonUnicodeWchar()) {
3263
        if (_Py_EncodeNonUnicodeWchar_InPlace(w, size) < 0) {
3264
            return -1;
3265
        }
3266
    }
3267
#endif
3268
3269
5.61k
    return res;
3270
5.61k
}
3271
3272
wchar_t*
3273
PyUnicode_AsWideCharString(PyObject *unicode,
3274
                           Py_ssize_t *size)
3275
1.25k
{
3276
1.25k
    wchar_t *buffer;
3277
1.25k
    Py_ssize_t buflen;
3278
3279
1.25k
    if (unicode == NULL) {
3280
0
        PyErr_BadInternalCall();
3281
0
        return NULL;
3282
0
    }
3283
1.25k
    if (!PyUnicode_Check(unicode)) {
3284
0
        PyErr_BadArgument();
3285
0
        return NULL;
3286
0
    }
3287
3288
1.25k
    buflen = unicode_get_widechar_size(unicode);
3289
1.25k
    buffer = (wchar_t *) PyMem_New(wchar_t, (buflen + 1));
3290
1.25k
    if (buffer == NULL) {
3291
0
        PyErr_NoMemory();
3292
0
        return NULL;
3293
0
    }
3294
1.25k
    unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3295
3296
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3297
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
3298
       non-Unicode locales and hence needs conversion first. */
3299
    if (_Py_LocaleUsesNonUnicodeWchar()) {
3300
        if (_Py_EncodeNonUnicodeWchar_InPlace(buffer, (buflen + 1)) < 0) {
3301
            return NULL;
3302
        }
3303
    }
3304
#endif
3305
3306
1.25k
    if (size != NULL) {
3307
808
        *size = buflen;
3308
808
    }
3309
448
    else if (wcslen(buffer) != (size_t)buflen) {
3310
0
        PyMem_Free(buffer);
3311
0
        PyErr_SetString(PyExc_ValueError,
3312
0
                        "embedded null character");
3313
0
        return NULL;
3314
0
    }
3315
1.25k
    return buffer;
3316
1.25k
}
3317
3318
#endif /* HAVE_WCHAR_H */
3319
3320
int
3321
_PyUnicode_WideCharString_Converter(PyObject *obj, void *ptr)
3322
0
{
3323
0
    wchar_t **p = (wchar_t **)ptr;
3324
0
    if (obj == NULL) {
3325
0
        PyMem_Free(*p);
3326
0
        *p = NULL;
3327
0
        return 1;
3328
0
    }
3329
0
    if (PyUnicode_Check(obj)) {
3330
0
        *p = PyUnicode_AsWideCharString(obj, NULL);
3331
0
        if (*p == NULL) {
3332
0
            return 0;
3333
0
        }
3334
0
        return Py_CLEANUP_SUPPORTED;
3335
0
    }
3336
0
    PyErr_Format(PyExc_TypeError,
3337
0
                 "argument must be str, not %.50s",
3338
0
                 Py_TYPE(obj)->tp_name);
3339
0
    return 0;
3340
0
}
3341
3342
int
3343
_PyUnicode_WideCharString_Opt_Converter(PyObject *obj, void *ptr)
3344
0
{
3345
0
    wchar_t **p = (wchar_t **)ptr;
3346
0
    if (obj == NULL) {
3347
0
        PyMem_Free(*p);
3348
0
        *p = NULL;
3349
0
        return 1;
3350
0
    }
3351
0
    if (obj == Py_None) {
3352
0
        *p = NULL;
3353
0
        return 1;
3354
0
    }
3355
0
    if (PyUnicode_Check(obj)) {
3356
0
        *p = PyUnicode_AsWideCharString(obj, NULL);
3357
0
        if (*p == NULL) {
3358
0
            return 0;
3359
0
        }
3360
0
        return Py_CLEANUP_SUPPORTED;
3361
0
    }
3362
0
    PyErr_Format(PyExc_TypeError,
3363
0
                 "argument must be str or None, not %.50s",
3364
0
                 Py_TYPE(obj)->tp_name);
3365
0
    return 0;
3366
0
}
3367
3368
PyObject *
3369
PyUnicode_FromOrdinal(int ordinal)
3370
194k
{
3371
194k
    if (ordinal < 0 || ordinal > MAX_UNICODE) {
3372
0
        PyErr_SetString(PyExc_ValueError,
3373
0
                        "chr() arg not in range(0x110000)");
3374
0
        return NULL;
3375
0
    }
3376
3377
194k
    return unicode_char((Py_UCS4)ordinal);
3378
194k
}
3379
3380
/* Return `obj` as an exact str object.
 *
 * - exact str: a new reference to `obj` itself.
 * - instance of a str subtype: the result of _PyUnicode_Copy().
 * - anything else: TypeError is raised and NULL is returned. */
PyObject *
PyUnicode_FromObject(PyObject *obj)
{
    /* XXX Perhaps we should make this API an alias of
       PyObject_Str() instead ?! */
    if (PyUnicode_CheckExact(obj)) {
        return Py_NewRef(obj);
    }

    if (!PyUnicode_Check(obj)) {
        PyErr_Format(PyExc_TypeError,
                     "Can't convert '%.100s' object to str implicitly",
                     Py_TYPE(obj)->tp_name);
        return NULL;
    }

    /* For a Unicode subtype that's not a Unicode object,
       return a true Unicode object with the same data. */
    return _PyUnicode_Copy(obj);
}
3398
3399
/* Decode a bytes object, or any object exporting a simple buffer, to str.
 *
 * obj       the object holding the encoded data; NULL is an internal error
 *           and a str object raises TypeError.
 * encoding  passed through to PyUnicode_Decode().
 * errors    passed through to PyUnicode_Decode().
 *
 * Empty input skips decoding: `encoding` and `errors` are checked with
 * unicode_check_encoding_errors() and the empty string is returned.
 * Returns NULL on failure. */
PyObject *
PyUnicode_FromEncodedObject(PyObject *obj,
                            const char *encoding,
                            const char *errors)
{
    if (obj == NULL) {
        PyErr_BadInternalCall();
        return NULL;
    }

    /* Decoding bytes objects is the most common case and should be fast */
    if (PyBytes_Check(obj)) {
        if (PyBytes_GET_SIZE(obj) != 0) {
            return PyUnicode_Decode(
                    PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
                    encoding, errors);
        }
        if (unicode_check_encoding_errors(encoding, errors) < 0) {
            return NULL;
        }
        _Py_RETURN_UNICODE_EMPTY();
    }

    if (PyUnicode_Check(obj)) {
        PyErr_SetString(PyExc_TypeError,
                        "decoding str is not supported");
        return NULL;
    }

    /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
    Py_buffer view;
    if (PyObject_GetBuffer(obj, &view, PyBUF_SIMPLE) < 0) {
        PyErr_Format(PyExc_TypeError,
                     "decoding to str: need a bytes-like object, %.80s found",
                     Py_TYPE(obj)->tp_name);
        return NULL;
    }

    if (view.len == 0) {
        /* The view is released before the arguments are checked. */
        PyBuffer_Release(&view);
        if (unicode_check_encoding_errors(encoding, errors) < 0) {
            return NULL;
        }
        _Py_RETURN_UNICODE_EMPTY();
    }

    PyObject *decoded = PyUnicode_Decode((char*) view.buf, view.len,
                                         encoding, errors);
    PyBuffer_Release(&view);
    return decoded;
}
3451
3452
/* Normalize an encoding name like encodings.normalize_encoding()
   but allow to convert to lowercase if *to_lower* is true.

   Alphanumeric characters and '.' are copied (lowercased if *to_lower* is
   true).  Every run of other characters between two copied characters
   becomes a single '_'; such runs at the start or at the end of the name
   are dropped.

   encoding   NUL-terminated name to normalize; must not be NULL.
   lower      output buffer; receives a NUL-terminated string on success.
   lower_len  size of the output buffer in bytes, terminator included.

   Return 1 on success, or 0 on error (encoding is longer than lower_len-1,
   which includes the case lower_len == 0).  On error the content of *lower*
   is not NUL-terminated. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len,
                       int to_lower)
{
    assert(encoding != NULL);

    /* A zero-length buffer cannot even hold the terminator; without this
       check `lower_len - 1` below would wrap around. */
    if (lower_len == 0) {
        return 0;
    }

    char *l = lower;
    /* Last slot of the buffer, reserved for the terminating NUL. */
    char *l_end = &lower[lower_len - 1];
    /* Set while at least one separator character is pending. */
    int punct = 0;

    for (const char *e = encoding; *e != 0; e++) {
        char c = *e;

        if (Py_ISALNUM(c) || c == '.') {
            /* Emit one '_' for the pending separators, unless nothing has
               been written yet. */
            if (punct && l != lower) {
                if (l == l_end) {
                    return 0;
                }
                *l++ = '_';
            }
            punct = 0;

            if (l == l_end) {
                return 0;
            }
            *l++ = to_lower ? Py_TOLOWER(c) : c;
        }
        else {
            punct = 1;
        }
    }
    *l = '\0';
    return 1;
}
3501
3502
PyObject *
3503
PyUnicode_Decode(const char *s,
3504
                 Py_ssize_t size,
3505
                 const char *encoding,
3506
                 const char *errors)
3507
5.10M
{
3508
5.10M
    PyObject *buffer = NULL, *unicode;
3509
5.10M
    Py_buffer info;
3510
5.10M
    char buflower[11];   /* strlen("iso-8859-1\0") == 11, longest shortcut */
3511
3512
5.10M
    if (unicode_check_encoding_errors(encoding, errors) < 0) {
3513
0
        return NULL;
3514
0
    }
3515
3516
5.10M
    if (size == 0) {
3517
0
        _Py_RETURN_UNICODE_EMPTY();
3518
0
    }
3519
3520
5.10M
    if (encoding == NULL) {
3521
38.4k
        return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3522
38.4k
    }
3523
3524
    /* Shortcuts for common default encodings */
3525
5.06M
    if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower), 1)) {
3526
5.05M
        char *lower = buflower;
3527
3528
        /* Fast paths */
3529
5.05M
        if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3530
822k
            lower += 3;
3531
822k
            if (*lower == '_') {
3532
                /* Match "utf8" and "utf_8" */
3533
822k
                lower++;
3534
822k
            }
3535
3536
822k
            if (lower[0] == '8' && lower[1] == 0) {
3537
822k
                return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3538
822k
            }
3539
821
            else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3540
106
                return PyUnicode_DecodeUTF16(s, size, errors, 0);
3541
106
            }
3542
715
            else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3543
100
                return PyUnicode_DecodeUTF32(s, size, errors, 0);
3544
100
            }
3545
822k
        }
3546
4.23M
        else {
3547
4.23M
            if (strcmp(lower, "ascii") == 0
3548
3.85M
                || strcmp(lower, "us_ascii") == 0) {
3549
532k
                return PyUnicode_DecodeASCII(s, size, errors);
3550
532k
            }
3551
    #ifdef MS_WINDOWS
3552
            else if (strcmp(lower, "mbcs") == 0) {
3553
                return PyUnicode_DecodeMBCS(s, size, errors);
3554
            }
3555
    #endif
3556
3.70M
            else if (strcmp(lower, "latin1") == 0
3557
3.70M
                     || strcmp(lower, "latin_1") == 0
3558
352k
                     || strcmp(lower, "iso_8859_1") == 0
3559
3.37M
                     || strcmp(lower, "iso8859_1") == 0) {
3560
3.37M
                return PyUnicode_DecodeLatin1(s, size, errors);
3561
3.37M
            }
3562
4.23M
        }
3563
5.05M
    }
3564
3565
    /* Decode via the codec registry */
3566
337k
    buffer = NULL;
3567
337k
    if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
3568
0
        goto onError;
3569
337k
    buffer = PyMemoryView_FromBuffer(&info);
3570
337k
    if (buffer == NULL)
3571
0
        goto onError;
3572
337k
    unicode = _PyCodec_DecodeText(buffer, encoding, errors);
3573
337k
    if (unicode == NULL)
3574
127k
        goto onError;
3575
210k
    if (!PyUnicode_Check(unicode)) {
3576
0
        PyErr_Format(PyExc_TypeError,
3577
0
                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
3578
0
                     "use codecs.decode() to decode to arbitrary types",
3579
0
                     encoding,
3580
0
                     Py_TYPE(unicode)->tp_name);
3581
0
        Py_DECREF(unicode);
3582
0
        goto onError;
3583
0
    }
3584
210k
    Py_DECREF(buffer);
3585
210k
    return unicode_result(unicode);
3586
3587
127k
  onError:
3588
127k
    Py_XDECREF(buffer);
3589
127k
    return NULL;
3590
210k
}
3591
3592
/* Run the decoder of `encoding` (default encoding when NULL) on the str
   object `unicode` through the codec registry and return whatever object
   the codec produces.  Raises via PyErr_BadArgument() and returns NULL if
   `unicode` is not a str. */
PyAPI_FUNC(PyObject *)
PyUnicode_AsDecodedObject(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    if (PyUnicode_Check(unicode)) {
        const char *codec_name =
            (encoding != NULL) ? encoding : PyUnicode_GetDefaultEncoding();

        /* Decode via the codec registry */
        return PyCodec_Decode(unicode, codec_name, errors);
    }

    PyErr_BadArgument();
    return NULL;
}
3608
3609
/* Run the decoder of `encoding` (default encoding when NULL) on the str
   object `unicode` through the codec registry, requiring the result to be
   a str as well.

   Returns a new reference to the decoded str, or NULL with an exception
   set: the PyErr_BadArgument() error if `unicode` is not a str, whatever
   the codec raised, or TypeError if the codec returned a non-str object. */
PyAPI_FUNC(PyObject *)
PyUnicode_AsDecodedUnicode(PyObject *unicode,
                           const char *encoding,
                           const char *errors)
{
    PyObject *v;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (encoding == NULL) {
        encoding = PyUnicode_GetDefaultEncoding();
    }

    /* Decode via the codec registry */
    v = PyCodec_Decode(unicode, encoding, errors);
    if (v == NULL) {
        return NULL;
    }
    if (!PyUnicode_Check(v)) {
        /* Report the type of the object the decoder actually returned (v).
           The input `unicode` is known to be a str at this point, so naming
           its type would make the message claim the decoder returned 'str'
           instead of 'str'. */
        PyErr_Format(PyExc_TypeError,
                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
                     "use codecs.decode() to decode to arbitrary types",
                     encoding,
                     Py_TYPE(v)->tp_name);
        Py_DECREF(v);
        return NULL;
    }
    return unicode_result(v);
}
3642
3643
/* Run the encoder of `encoding` (default encoding when NULL) on the str
   object `unicode` through the codec registry and return whatever object
   the codec produces.  Returns NULL with an exception set if `unicode` is
   not a str or the codec fails. */
PyAPI_FUNC(PyObject *)
PyUnicode_AsEncodedObject(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    const char *codec_name =
        (encoding != NULL) ? encoding : PyUnicode_GetDefaultEncoding();

    /* Encode via the codec registry; a NULL result already carries the
       codec's exception, so it can be passed straight through. */
    return PyCodec_Encode(unicode, codec_name, errors);
}
3667
3668
3669
static PyObject *
3670
unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
3671
                      int current_locale)
3672
408
{
3673
408
    Py_ssize_t wlen;
3674
408
    wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3675
408
    if (wstr == NULL) {
3676
0
        return NULL;
3677
0
    }
3678
3679
408
    if ((size_t)wlen != wcslen(wstr)) {
3680
0
        PyErr_SetString(PyExc_ValueError, "embedded null character");
3681
0
        PyMem_Free(wstr);
3682
0
        return NULL;
3683
0
    }
3684
3685
408
    char *str;
3686
408
    size_t error_pos;
3687
408
    const char *reason;
3688
408
    int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
3689
408
                                 current_locale, error_handler);
3690
408
    PyMem_Free(wstr);
3691
3692
408
    if (res != 0) {
3693
0
        if (res == -2) {
3694
0
            PyObject *exc;
3695
0
            exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3696
0
                    "locale", unicode,
3697
0
                    (Py_ssize_t)error_pos,
3698
0
                    (Py_ssize_t)(error_pos+1),
3699
0
                    reason);
3700
0
            if (exc != NULL) {
3701
0
                PyCodec_StrictErrors(exc);
3702
0
                Py_DECREF(exc);
3703
0
            }
3704
0
        }
3705
0
        else if (res == -3) {
3706
0
            PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3707
0
        }
3708
0
        else {
3709
0
            PyErr_NoMemory();
3710
0
        }
3711
0
        return NULL;
3712
0
    }
3713
3714
408
    PyObject *bytes = PyBytes_FromString(str);
3715
408
    PyMem_RawFree(str);
3716
408
    return bytes;
3717
408
}
3718
3719
/* Encode `unicode` with unicode_encode_locale(), passing 1 as its
   current_locale argument and the error handler resolved from `errors`. */
PyObject *
PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
{
    return unicode_encode_locale(unicode, _Py_GetErrorHandler(errors), 1);
}
3725
3726
/* Encode `unicode` using the interpreter's filesystem codec settings.

   The UTF-8 encoder is used when the filesystem codec is flagged as UTF-8.
   Otherwise, unless _Py_FORCE_UTF8_FS_ENCODING is defined, a configured
   encoding name is used through PyUnicode_AsEncodedString().  If neither
   applies, the error handler is taken from the interpreter configuration. */
PyObject *
PyUnicode_EncodeFSDefault(PyObject *unicode)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    struct _Py_unicode_fs_codec *codec = &interp->unicode.fs_codec;

    if (codec->utf8) {
        return unicode_encode_utf8(unicode,
                                   codec->error_handler,
                                   codec->errors);
    }

#ifndef _Py_FORCE_UTF8_FS_ENCODING
    if (codec->encoding) {
        return PyUnicode_AsEncodedString(unicode,
                                         codec->encoding,
                                         codec->errors);
    }
#endif

    /* Before _PyUnicode_InitEncodings() is called, the Python codec
       machinery is not ready and so cannot be used:
       use wcstombs() in this case. */
    const PyConfig *config = _PyInterpreterState_GetConfig(interp);
    const wchar_t *filesystem_errors = config->filesystem_errors;
    assert(filesystem_errors != NULL);
    _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
    assert(errors != _Py_ERROR_UNKNOWN);
#ifdef _Py_FORCE_UTF8_FS_ENCODING
    return unicode_encode_utf8(unicode, errors, NULL);
#else
    return unicode_encode_locale(unicode, errors, 0);
#endif
}
3759
3760
/* Encode the str object `unicode` to bytes using the codec named
   `encoding` and the error handler named `errors`.

   A NULL `encoding` selects UTF-8.  Common encodings are dispatched
   directly to their C encoders; all other names go through the codec
   registry.  A codec returning a bytearray triggers a RuntimeWarning and
   the result is converted to bytes; any other non-bytes result raises
   TypeError.  Returns a new bytes object, or NULL with an exception set. */
PyObject *
PyUnicode_AsEncodedString(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    /* Large enough for "iso_8859_1" plus its NUL terminator, which is the
       longest name handled by the shortcuts below. */
    char normalized[11];

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (unicode_check_encoding_errors(encoding, errors) < 0) {
        return NULL;
    }

    if (encoding == NULL) {
        return _PyUnicode_AsUTF8String(unicode, errors);
    }

    /* Shortcuts for common default encodings.  They only apply when the
       normalized name fits into the local buffer. */
    if (_Py_normalize_encoding(encoding, normalized, sizeof(normalized), 1)) {
        if (strncmp(normalized, "utf", 3) == 0) {
            const char *variant = normalized + 3;
            if (*variant == '_') {
                /* Accept both "utf8" and "utf_8" spellings. */
                variant++;
            }

            if (strcmp(variant, "8") == 0) {
                return _PyUnicode_AsUTF8String(unicode, errors);
            }
            if (strcmp(variant, "16") == 0) {
                return _PyUnicode_EncodeUTF16(unicode, errors, 0);
            }
            if (strcmp(variant, "32") == 0) {
                return _PyUnicode_EncodeUTF32(unicode, errors, 0);
            }
            /* Any other "utf*" name goes through the codec registry. */
        }
        else {
            if (strcmp(normalized, "ascii") == 0
                || strcmp(normalized, "us_ascii") == 0)
            {
                return _PyUnicode_AsASCIIString(unicode, errors);
            }
#ifdef MS_WINDOWS
            if (strcmp(normalized, "mbcs") == 0) {
                return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
            }
#endif
            if (strcmp(normalized, "latin1") == 0
                || strcmp(normalized, "latin_1") == 0
                || strcmp(normalized, "iso_8859_1") == 0
                || strcmp(normalized, "iso8859_1") == 0)
            {
                return _PyUnicode_AsLatin1String(unicode, errors);
            }
        }
    }

    /* Encode via the codec registry */
    PyObject *encoded = _PyCodec_EncodeText(unicode, encoding, errors);
    if (encoded == NULL) {
        return NULL;
    }

    /* The normal path */
    if (PyBytes_Check(encoded)) {
        return encoded;
    }

    if (!PyByteArray_Check(encoded)) {
        PyErr_Format(PyExc_TypeError,
                     "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
                     "use codecs.encode() to encode to arbitrary types",
                     encoding,
                     Py_TYPE(encoded)->tp_name);
        Py_DECREF(encoded);
        return NULL;
    }

    /* The codec returned a bytearray: warn, then convert to bytes.  The
       warning call may itself report failure, in which case we give up. */
    int warn_failed = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
        "encoder %s returned bytearray instead of bytes; "
        "use codecs.encode() to encode to arbitrary types",
        encoding);
    if (warn_failed) {
        Py_DECREF(encoded);
        return NULL;
    }

    PyObject *as_bytes = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(encoded),
                                                   PyByteArray_GET_SIZE(encoded));
    Py_DECREF(encoded);
    return as_bytes;
}
3859
3860
/* Run the encoder of `encoding` (default encoding when NULL) on the str
   object `unicode` through the codec registry, requiring the result to be
   a str.  Returns a new reference to that str, or NULL with an exception
   set (TypeError if the codec returned a non-str object). */
PyAPI_FUNC(PyObject *)
PyUnicode_AsEncodedUnicode(PyObject *unicode,
                           const char *encoding,
                           const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (encoding == NULL) {
        encoding = PyUnicode_GetDefaultEncoding();
    }

    /* Encode via the codec registry */
    PyObject *result = PyCodec_Encode(unicode, encoding, errors);
    if (result == NULL) {
        return NULL;
    }
    if (PyUnicode_Check(result)) {
        return result;
    }

    PyErr_Format(PyExc_TypeError,
                 "'%.400s' encoder returned '%.400s' instead of 'str'; "
                 "use codecs.encode() to encode to arbitrary types",
                 encoding,
                 Py_TYPE(result)->tp_name);
    Py_DECREF(result);
    return NULL;
}
3893
3894
static PyObject*
3895
unicode_decode_locale(const char *str, Py_ssize_t len,
3896
                      _Py_error_handler errors, int current_locale)
3897
15.7k
{
3898
15.7k
    if (str[len] != '\0' || (size_t)len != strlen(str))  {
3899
0
        PyErr_SetString(PyExc_ValueError, "embedded null byte");
3900
0
        return NULL;
3901
0
    }
3902
3903
15.7k
    wchar_t *wstr;
3904
15.7k
    size_t wlen;
3905
15.7k
    const char *reason;
3906
15.7k
    int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
3907
15.7k
                                 current_locale, errors);
3908
15.7k
    if (res != 0) {
3909
0
        if (res == -2) {
3910
0
            PyObject *exc;
3911
0
            exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3912
0
                                        "locale", str, len,
3913
0
                                        (Py_ssize_t)wlen,
3914
0
                                        (Py_ssize_t)(wlen + 1),
3915
0
                                        reason);
3916
0
            if (exc != NULL) {
3917
0
                PyCodec_StrictErrors(exc);
3918
0
                Py_DECREF(exc);
3919
0
            }
3920
0
        }
3921
0
        else if (res == -3) {
3922
0
            PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3923
0
        }
3924
0
        else {
3925
0
            PyErr_NoMemory();
3926
0
        }
3927
0
        return NULL;
3928
0
    }
3929
3930
15.7k
    PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3931
15.7k
    PyMem_RawFree(wstr);
3932
15.7k
    return unicode;
3933
15.7k
}
3934
3935
PyObject*
3936
PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3937
                              const char *errors)
3938
0
{
3939
0
    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3940
0
    return unicode_decode_locale(str, len, error_handler, 1);
3941
0
}
3942
3943
PyObject*
3944
PyUnicode_DecodeLocale(const char *str, const char *errors)
3945
10.5k
{
3946
10.5k
    Py_ssize_t size = (Py_ssize_t)strlen(str);
3947
10.5k
    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3948
10.5k
    return unicode_decode_locale(str, size, error_handler, 1);
3949
10.5k
}
3950
3951
3952
PyObject*
3953
0
PyUnicode_DecodeFSDefault(const char *s) {
3954
0
    Py_ssize_t size = (Py_ssize_t)strlen(s);
3955
0
    return PyUnicode_DecodeFSDefaultAndSize(s, size);
3956
0
}
3957
3958
PyObject*
3959
PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3960
6.74k
{
3961
6.74k
    PyInterpreterState *interp = _PyInterpreterState_GET();
3962
6.74k
    struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3963
6.74k
    if (fs_codec->utf8) {
3964
1.60k
        return unicode_decode_utf8(s, size,
3965
1.60k
                                   fs_codec->error_handler,
3966
1.60k
                                   fs_codec->errors,
3967
1.60k
                                   NULL);
3968
1.60k
    }
3969
5.13k
#ifndef _Py_FORCE_UTF8_FS_ENCODING
3970
5.13k
    else if (fs_codec->encoding) {
3971
0
        return PyUnicode_Decode(s, size,
3972
0
                                fs_codec->encoding,
3973
0
                                fs_codec->errors);
3974
0
    }
3975
5.13k
#endif
3976
5.13k
    else {
3977
        /* Before _PyUnicode_InitEncodings() is called, the Python codec
3978
           machinery is not ready and so cannot be used:
3979
           use mbstowcs() in this case. */
3980
5.13k
        const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3981
5.13k
        const wchar_t *filesystem_errors = config->filesystem_errors;
3982
5.13k
        assert(filesystem_errors != NULL);
3983
5.13k
        _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3984
5.13k
        assert(errors != _Py_ERROR_UNKNOWN);
3985
#ifdef _Py_FORCE_UTF8_FS_ENCODING
3986
        return unicode_decode_utf8(s, size, errors, NULL, NULL);
3987
#else
3988
5.13k
        return unicode_decode_locale(s, size, errors, 0);
3989
5.13k
#endif
3990
5.13k
    }
3991
6.74k
}
3992
3993
3994
int
3995
PyUnicode_FSConverter(PyObject* arg, void* addr)
3996
11.1k
{
3997
11.1k
    PyObject *path = NULL;
3998
11.1k
    PyObject *output = NULL;
3999
11.1k
    Py_ssize_t size;
4000
11.1k
    const char *data;
4001
11.1k
    if (arg == NULL) {
4002
0
        Py_DECREF(*(PyObject**)addr);
4003
0
        *(PyObject**)addr = NULL;
4004
0
        return 1;
4005
0
    }
4006
11.1k
    path = PyOS_FSPath(arg);
4007
11.1k
    if (path == NULL) {
4008
0
        return 0;
4009
0
    }
4010
11.1k
    if (PyBytes_Check(path)) {
4011
0
        output = path;
4012
0
    }
4013
11.1k
    else {  // PyOS_FSPath() guarantees its returned value is bytes or str.
4014
11.1k
        output = PyUnicode_EncodeFSDefault(path);
4015
11.1k
        Py_DECREF(path);
4016
11.1k
        if (!output) {
4017
0
            return 0;
4018
0
        }
4019
11.1k
        assert(PyBytes_Check(output));
4020
11.1k
    }
4021
4022
11.1k
    size = PyBytes_GET_SIZE(output);
4023
11.1k
    data = PyBytes_AS_STRING(output);
4024
11.1k
    if ((size_t)size != strlen(data)) {
4025
0
        PyErr_SetString(PyExc_ValueError, "embedded null byte");
4026
0
        Py_DECREF(output);
4027
0
        return 0;
4028
0
    }
4029
11.1k
    *(PyObject**)addr = output;
4030
11.1k
    return Py_CLEANUP_SUPPORTED;
4031
11.1k
}
4032
4033
4034
int
4035
PyUnicode_FSDecoder(PyObject* arg, void* addr)
4036
21.1k
{
4037
21.1k
    if (arg == NULL) {
4038
0
        Py_DECREF(*(PyObject**)addr);
4039
0
        *(PyObject**)addr = NULL;
4040
0
        return 1;
4041
0
    }
4042
4043
21.1k
    PyObject *path = PyOS_FSPath(arg);
4044
21.1k
    if (path == NULL) {
4045
0
        return 0;
4046
0
    }
4047
4048
21.1k
    PyObject *output = NULL;
4049
21.1k
    if (PyUnicode_Check(path)) {
4050
21.1k
        output = path;
4051
21.1k
    }
4052
0
    else if (PyBytes_Check(path)) {
4053
0
        output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path),
4054
0
                                                  PyBytes_GET_SIZE(path));
4055
0
        Py_DECREF(path);
4056
0
        if (!output) {
4057
0
            return 0;
4058
0
        }
4059
0
    }
4060
0
    else {
4061
0
        PyErr_Format(PyExc_TypeError,
4062
0
                     "path should be string, bytes, or os.PathLike, not %.200s",
4063
0
                     Py_TYPE(arg)->tp_name);
4064
0
        Py_DECREF(path);
4065
0
        return 0;
4066
0
    }
4067
4068
21.1k
    if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
4069
21.1k
                 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
4070
0
        PyErr_SetString(PyExc_ValueError, "embedded null character");
4071
0
        Py_DECREF(output);
4072
0
        return 0;
4073
0
    }
4074
21.1k
    *(PyObject**)addr = output;
4075
21.1k
    return Py_CLEANUP_SUPPORTED;
4076
21.1k
}
4077
4078
4079
static int unicode_fill_utf8(PyObject *unicode);
4080
4081
4082
static int
4083
unicode_ensure_utf8(PyObject *unicode)
4084
20.7M
{
4085
20.7M
    int err = 0;
4086
20.7M
    if (PyUnicode_UTF8(unicode) == NULL) {
4087
166k
        Py_BEGIN_CRITICAL_SECTION(unicode);
4088
166k
        if (PyUnicode_UTF8(unicode) == NULL) {
4089
166k
            err = unicode_fill_utf8(unicode);
4090
166k
        }
4091
166k
        Py_END_CRITICAL_SECTION();
4092
166k
    }
4093
20.7M
    return err;
4094
20.7M
}
4095
4096
const char *
4097
PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
4098
20.7M
{
4099
20.7M
    if (!PyUnicode_Check(unicode)) {
4100
0
        PyErr_BadArgument();
4101
0
        if (psize) {
4102
0
            *psize = -1;
4103
0
        }
4104
0
        return NULL;
4105
0
    }
4106
4107
20.7M
    if (unicode_ensure_utf8(unicode) == -1) {
4108
207
        if (psize) {
4109
207
            *psize = -1;
4110
207
        }
4111
207
        return NULL;
4112
207
    }
4113
4114
20.7M
    if (psize) {
4115
20.6M
        *psize = PyUnicode_UTF8_LENGTH(unicode);
4116
20.6M
    }
4117
20.7M
    return PyUnicode_UTF8(unicode);
4118
20.7M
}
4119
4120
const char *
4121
PyUnicode_AsUTF8(PyObject *unicode)
4122
65.0k
{
4123
65.0k
    return PyUnicode_AsUTF8AndSize(unicode, NULL);
4124
65.0k
}
4125
4126
const char *
4127
_PyUnicode_AsUTF8NoNUL(PyObject *unicode)
4128
843k
{
4129
843k
    Py_ssize_t size;
4130
843k
    const char *s = PyUnicode_AsUTF8AndSize(unicode, &size);
4131
843k
    if (s && strlen(s) != (size_t)size) {
4132
155
        PyErr_SetString(PyExc_ValueError, "embedded null character");
4133
155
        return NULL;
4134
155
    }
4135
843k
    return s;
4136
843k
}
4137
4138
/*
4139
PyUnicode_GetSize() has been deprecated since Python 3.3
4140
because it returned length of Py_UNICODE.
4141
4142
But this function is part of stable abi, because it doesn't
4143
include Py_UNICODE in signature and it was not excluded from
4144
stable ABI in PEP 384.
4145
*/
4146
PyAPI_FUNC(Py_ssize_t)
4147
PyUnicode_GetSize(PyObject *unicode)
4148
0
{
4149
0
    PyErr_SetString(PyExc_RuntimeError,
4150
0
                    "PyUnicode_GetSize has been removed.");
4151
0
    return -1;
4152
0
}
4153
4154
Py_ssize_t
4155
PyUnicode_GetLength(PyObject *unicode)
4156
18.8k
{
4157
18.8k
    if (!PyUnicode_Check(unicode)) {
4158
0
        PyErr_BadArgument();
4159
0
        return -1;
4160
0
    }
4161
18.8k
    return PyUnicode_GET_LENGTH(unicode);
4162
18.8k
}
4163
4164
Py_UCS4
4165
PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4166
22
{
4167
22
    const void *data;
4168
22
    int kind;
4169
4170
22
    if (!PyUnicode_Check(unicode)) {
4171
0
        PyErr_BadArgument();
4172
0
        return (Py_UCS4)-1;
4173
0
    }
4174
22
    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4175
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
4176
0
        return (Py_UCS4)-1;
4177
0
    }
4178
22
    data = PyUnicode_DATA(unicode);
4179
22
    kind = PyUnicode_KIND(unicode);
4180
22
    return PyUnicode_READ(kind, data, index);
4181
22
}
4182
4183
int
4184
PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4185
0
{
4186
0
    if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
4187
0
        PyErr_BadArgument();
4188
0
        return -1;
4189
0
    }
4190
0
    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4191
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
4192
0
        return -1;
4193
0
    }
4194
0
    if (unicode_check_modifiable(unicode))
4195
0
        return -1;
4196
0
    if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4197
0
        PyErr_SetString(PyExc_ValueError, "character out of range");
4198
0
        return -1;
4199
0
    }
4200
0
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4201
0
                    index, ch);
4202
0
    return 0;
4203
0
}
4204
4205
/* Return the name of the default encoding.  The value is the constant
   string "utf-8"; it is used in this file as the fallback when a caller
   passes a NULL encoding name. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    return "utf-8";
}
4210
4211
/* create or adjust a UnicodeDecodeError */
4212
static void
4213
make_decode_exception(PyObject **exceptionObject,
4214
                      const char *encoding,
4215
                      const char *input, Py_ssize_t length,
4216
                      Py_ssize_t startpos, Py_ssize_t endpos,
4217
                      const char *reason)
4218
276k
{
4219
276k
    if (*exceptionObject == NULL) {
4220
77.3k
        *exceptionObject = PyUnicodeDecodeError_Create(
4221
77.3k
            encoding, input, length, startpos, endpos, reason);
4222
77.3k
    }
4223
199k
    else {
4224
199k
        if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4225
0
            goto onError;
4226
199k
        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4227
0
            goto onError;
4228
199k
        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4229
0
            goto onError;
4230
199k
    }
4231
276k
    return;
4232
4233
276k
onError:
4234
0
    Py_CLEAR(*exceptionObject);
4235
0
}
4236
4237
#ifdef MS_WINDOWS
4238
static int
4239
widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4240
{
4241
    if (newsize > *size) {
4242
        wchar_t *newbuf = *buf;
4243
        if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4244
            PyErr_NoMemory();
4245
            return -1;
4246
        }
4247
        *buf = newbuf;
4248
    }
4249
    *size = newsize;
4250
    return 0;
4251
}
4252
4253
/* error handling callback helper:
4254
   build arguments, call the callback and check the arguments,
4255
   if no exception occurred, copy the replacement to the output
4256
   and adjust various state variables.
4257
   return 0 on success, -1 on error
4258
*/
4259
4260
static int
4261
unicode_decode_call_errorhandler_wchar(
4262
    const char *errors, PyObject **errorHandler,
4263
    const char *encoding, const char *reason,
4264
    const char **input, const char **inend, Py_ssize_t *startinpos,
4265
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4266
    wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
4267
{
4268
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4269
4270
    PyObject *restuple = NULL;
4271
    PyObject *repunicode = NULL;
4272
    Py_ssize_t outsize;
4273
    Py_ssize_t insize;
4274
    Py_ssize_t requiredsize;
4275
    Py_ssize_t newpos;
4276
    PyObject *inputobj = NULL;
4277
    Py_ssize_t repwlen;
4278
4279
    if (*errorHandler == NULL) {
4280
        *errorHandler = PyCodec_LookupError(errors);
4281
        if (*errorHandler == NULL)
4282
            goto onError;
4283
    }
4284
4285
    make_decode_exception(exceptionObject,
4286
        encoding,
4287
        *input, *inend - *input,
4288
        *startinpos, *endinpos,
4289
        reason);
4290
    if (*exceptionObject == NULL)
4291
        goto onError;
4292
4293
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
4294
    if (restuple == NULL)
4295
        goto onError;
4296
    if (!PyTuple_Check(restuple)) {
4297
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
4298
        goto onError;
4299
    }
4300
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4301
        goto onError;
4302
4303
    /* Copy back the bytes variables, which might have been modified by the
4304
       callback */
4305
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4306
    if (!inputobj)
4307
        goto onError;
4308
    *input = PyBytes_AS_STRING(inputobj);
4309
    insize = PyBytes_GET_SIZE(inputobj);
4310
    *inend = *input + insize;
4311
    /* we can DECREF safely, as the exception has another reference,
4312
       so the object won't go away. */
4313
    Py_DECREF(inputobj);
4314
4315
    if (newpos<0)
4316
        newpos = insize+newpos;
4317
    if (newpos<0 || newpos>insize) {
4318
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4319
        goto onError;
4320
    }
4321
4322
    repwlen = PyUnicode_AsWideChar(repunicode, NULL, 0);
4323
    if (repwlen < 0)
4324
        goto onError;
4325
    repwlen--;
4326
    /* need more space? (at least enough for what we
4327
       have+the replacement+the rest of the string (starting
4328
       at the new input position), so we won't have to check space
4329
       when there are no errors in the rest of the string) */
4330
    requiredsize = *outpos;
4331
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4332
        goto overflow;
4333
    requiredsize += repwlen;
4334
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4335
        goto overflow;
4336
    requiredsize += insize - newpos;
4337
    outsize = *bufsize;
4338
    if (requiredsize > outsize) {
4339
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
4340
            requiredsize = 2*outsize;
4341
        if (widechar_resize(buf, bufsize, requiredsize) < 0) {
4342
            goto onError;
4343
        }
4344
    }
4345
    PyUnicode_AsWideChar(repunicode, *buf + *outpos, repwlen);
4346
    *outpos += repwlen;
4347
    *endinpos = newpos;
4348
    *inptr = *input + newpos;
4349
4350
    /* we made it! */
4351
    Py_DECREF(restuple);
4352
    return 0;
4353
4354
  overflow:
4355
    PyErr_SetString(PyExc_OverflowError,
4356
                    "decoded result is too long for a Python string");
4357
4358
  onError:
4359
    Py_XDECREF(restuple);
4360
    return -1;
4361
}
4362
#endif   /* MS_WINDOWS */
4363
4364
static int
4365
unicode_decode_call_errorhandler_writer(
4366
    const char *errors, PyObject **errorHandler,
4367
    const char *encoding, const char *reason,
4368
    const char **input, const char **inend, Py_ssize_t *startinpos,
4369
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4370
    _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4371
276k
{
4372
276k
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4373
4374
276k
    PyObject *restuple = NULL;
4375
276k
    PyObject *repunicode = NULL;
4376
276k
    Py_ssize_t insize;
4377
276k
    Py_ssize_t newpos;
4378
276k
    Py_ssize_t replen;
4379
276k
    Py_ssize_t remain;
4380
276k
    PyObject *inputobj = NULL;
4381
276k
    int need_to_grow = 0;
4382
276k
    const char *new_inptr;
4383
4384
276k
    if (*errorHandler == NULL) {
4385
77.3k
        *errorHandler = PyCodec_LookupError(errors);
4386
77.3k
        if (*errorHandler == NULL)
4387
0
            goto onError;
4388
77.3k
    }
4389
4390
276k
    make_decode_exception(exceptionObject,
4391
276k
        encoding,
4392
276k
        *input, *inend - *input,
4393
276k
        *startinpos, *endinpos,
4394
276k
        reason);
4395
276k
    if (*exceptionObject == NULL)
4396
0
        goto onError;
4397
4398
276k
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
4399
276k
    if (restuple == NULL)
4400
43.2k
        goto onError;
4401
233k
    if (!PyTuple_Check(restuple)) {
4402
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
4403
0
        goto onError;
4404
0
    }
4405
233k
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4406
0
        goto onError;
4407
4408
    /* Copy back the bytes variables, which might have been modified by the
4409
       callback */
4410
233k
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4411
233k
    if (!inputobj)
4412
0
        goto onError;
4413
233k
    remain = *inend - *input - *endinpos;
4414
233k
    *input = PyBytes_AS_STRING(inputobj);
4415
233k
    insize = PyBytes_GET_SIZE(inputobj);
4416
233k
    *inend = *input + insize;
4417
    /* we can DECREF safely, as the exception has another reference,
4418
       so the object won't go away. */
4419
233k
    Py_DECREF(inputobj);
4420
4421
233k
    if (newpos<0)
4422
0
        newpos = insize+newpos;
4423
233k
    if (newpos<0 || newpos>insize) {
4424
0
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4425
0
        goto onError;
4426
0
    }
4427
4428
233k
    replen = PyUnicode_GET_LENGTH(repunicode);
4429
233k
    if (replen > 1) {
4430
16.1k
        writer->min_length += replen - 1;
4431
16.1k
        need_to_grow = 1;
4432
16.1k
    }
4433
233k
    new_inptr = *input + newpos;
4434
233k
    if (*inend - new_inptr > remain) {
4435
        /* We don't know the decoding algorithm here so we make the worst
4436
           assumption that one byte decodes to one unicode character.
4437
           If unfortunately one byte could decode to more unicode characters,
4438
           the decoder may write out-of-bound then.  Is it possible for the
4439
           algorithms using this function? */
4440
4.62k
        writer->min_length += *inend - new_inptr - remain;
4441
4.62k
        need_to_grow = 1;
4442
4.62k
    }
4443
233k
    if (need_to_grow) {
4444
16.3k
        writer->overallocate = 1;
4445
16.3k
        if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
4446
16.3k
                            PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4447
0
            goto onError;
4448
16.3k
    }
4449
233k
    if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
4450
0
        goto onError;
4451
4452
233k
    *endinpos = newpos;
4453
233k
    *inptr = new_inptr;
4454
4455
    /* we made it! */
4456
233k
    Py_DECREF(restuple);
4457
233k
    return 0;
4458
4459
43.2k
  onError:
4460
43.2k
    Py_XDECREF(restuple);
4461
43.2k
    return -1;
4462
233k
}
4463
4464
/* --- UTF-7 Codec -------------------------------------------------------- */
4465
4466
/* See RFC2152 for details.  We encode conservatively and decode liberally. */
4467
4468
/* Three simple macros defining base-64. */
4469
4470
/* Is c a base-64 character?  True for A-Z, a-z, 0-9, '+' and '/'.
   Note: the argument is evaluated several times. */

#define IS_BASE64(c)                        \
    (((c) >= 'A' && (c) <= 'Z')             \
     || ((c) >= 'a' && (c) <= 'z')          \
     || ((c) >= '0' && (c) <= '9')          \
     || (c) == '+'                          \
     || (c) == '/')
4477
4478
/* Map a base-64 alphabet character c to its 6-bit value (0..63).
   The caller is expected to have checked IS_BASE64(c): anything that is not
   A-Z, a-z, 0-9 or '+' is treated like '/' and yields 63.
   The argument is evaluated more than once. */

#define FROM_BASE64(c)                                                  \
    ((c) == '+' ? 62 :                                                  \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :                      \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ('A' <= (c) && (c) <= 'Z') ? (c) - 'A' : 63)
4485
4486
/* Base-64 alphabet character encoding the low 6 bits of n; any higher
   bits of n are ignored. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"   \
     "abcdefghijklmnopqrstuvwxyz"   \
     "0123456789+/"[(n) & 63])
4490
4491
/* DECODE_DIRECT: true when a byte met in a UTF-7 string stands for
 * itself.  Decoding is permissive: every ASCII byte qualifies except
 * '+', which opens a base-64 section. */

#define DECODE_DIRECT(c)                                \
    ((c) != '+' && (c) < 128)
4498
4499
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This table maps each ASCII code (0-127)
 * to its category:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static const
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4532
4533
/* ENCODE_DIRECT: true when the encoder may emit c as itself.  RFC 2152
 * lets applications choose whether Set O and whitespace are written
 * directly; this definition writes both directly, so only NUL,
 * non-ASCII characters and category 3 ("special") characters are
 * base-64 encoded.  The range checks come first so that utf7_category
 * is only indexed with 1..127. */

#define ENCODE_DIRECT(c) \
    ((c) > 0 && (c) < 128 && utf7_category[(c)] != 3)
4541
4542
PyObject *
4543
PyUnicode_DecodeUTF7(const char *s,
4544
                     Py_ssize_t size,
4545
                     const char *errors)
4546
0
{
4547
0
    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4548
0
}
4549
4550
/* The decoder.  The only state we preserve is our read position,
4551
 * i.e. how many characters we have consumed.  So if we end in the
4552
 * middle of a shift sequence we have to back off the read position
4553
 * and the output to the beginning of the sequence, otherwise we lose
4554
 * all the shift state (seen bits, number of bits seen, high
4555
 * surrogate). */
4556
4557
PyObject *
4558
PyUnicode_DecodeUTF7Stateful(const char *s,
4559
                             Py_ssize_t size,
4560
                             const char *errors,
4561
                             Py_ssize_t *consumed)
4562
27.5k
{
4563
27.5k
    const char *starts = s;
4564
27.5k
    Py_ssize_t startinpos;
4565
27.5k
    Py_ssize_t endinpos;
4566
27.5k
    const char *e;
4567
27.5k
    _PyUnicodeWriter writer;
4568
27.5k
    const char *errmsg = "";
4569
27.5k
    int inShift = 0;
4570
27.5k
    Py_ssize_t shiftOutStart;
4571
27.5k
    unsigned int base64bits = 0;
4572
27.5k
    unsigned long base64buffer = 0;
4573
27.5k
    Py_UCS4 surrogate = 0;
4574
27.5k
    PyObject *errorHandler = NULL;
4575
27.5k
    PyObject *exc = NULL;
4576
4577
27.5k
    if (size == 0) {
4578
0
        if (consumed)
4579
0
            *consumed = 0;
4580
0
        _Py_RETURN_UNICODE_EMPTY();
4581
0
    }
4582
4583
    /* Start off assuming it's all ASCII. Widen later as necessary. */
4584
27.5k
    _PyUnicodeWriter_Init(&writer);
4585
27.5k
    writer.min_length = size;
4586
4587
27.5k
    shiftOutStart = 0;
4588
27.5k
    e = s + size;
4589
4590
7.85M
    while (s < e) {
4591
7.84M
        Py_UCS4 ch;
4592
7.84M
      restart:
4593
7.84M
        ch = (unsigned char) *s;
4594
4595
7.84M
        if (inShift) { /* in a base-64 section */
4596
298k
            if (IS_BASE64(ch)) { /* consume a base-64 character */
4597
281k
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4598
281k
                base64bits += 6;
4599
281k
                s++;
4600
281k
                if (base64bits >= 16) {
4601
                    /* we have enough bits for a UTF-16 value */
4602
98.6k
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
4603
98.6k
                    base64bits -= 16;
4604
98.6k
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4605
98.6k
                    assert(outCh <= 0xffff);
4606
98.6k
                    if (surrogate) {
4607
                        /* expecting a second surrogate */
4608
8.73k
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4609
3.90k
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
4610
3.90k
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
4611
0
                                goto onError;
4612
3.90k
                            surrogate = 0;
4613
3.90k
                            continue;
4614
3.90k
                        }
4615
4.83k
                        else {
4616
4.83k
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4617
0
                                goto onError;
4618
4.83k
                            surrogate = 0;
4619
4.83k
                        }
4620
8.73k
                    }
4621
94.7k
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
4622
                        /* first surrogate */
4623
12.6k
                        surrogate = outCh;
4624
12.6k
                    }
4625
82.0k
                    else {
4626
82.0k
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
4627
0
                            goto onError;
4628
82.0k
                    }
4629
94.7k
                }
4630
281k
            }
4631
17.4k
            else { /* now leaving a base-64 section */
4632
17.4k
                inShift = 0;
4633
17.4k
                if (base64bits > 0) { /* left-over bits */
4634
13.9k
                    if (base64bits >= 6) {
4635
                        /* We've seen at least one base-64 character */
4636
7.61k
                        s++;
4637
7.61k
                        errmsg = "partial character in shift sequence";
4638
7.61k
                        goto utf7Error;
4639
7.61k
                    }
4640
6.32k
                    else {
4641
                        /* Some bits remain; they should be zero */
4642
6.32k
                        if (base64buffer != 0) {
4643
1.38k
                            s++;
4644
1.38k
                            errmsg = "non-zero padding bits in shift sequence";
4645
1.38k
                            goto utf7Error;
4646
1.38k
                        }
4647
6.32k
                    }
4648
13.9k
                }
4649
8.46k
                if (surrogate && DECODE_DIRECT(ch)) {
4650
2.86k
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4651
0
                        goto onError;
4652
2.86k
                }
4653
8.46k
                surrogate = 0;
4654
8.46k
                if (ch == '-') {
4655
                    /* '-' is absorbed; other terminating
4656
                       characters are preserved */
4657
2.15k
                    s++;
4658
2.15k
                }
4659
8.46k
            }
4660
298k
        }
4661
7.54M
        else if ( ch == '+' ) {
4662
27.0k
            startinpos = s-starts;
4663
27.0k
            s++; /* consume '+' */
4664
27.0k
            if (s < e && *s == '-') { /* '+-' encodes '+' */
4665
2.24k
                s++;
4666
2.24k
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
4667
0
                    goto onError;
4668
2.24k
            }
4669
24.7k
            else if (s < e && !IS_BASE64(*s)) {
4670
3.24k
                s++;
4671
3.24k
                errmsg = "ill-formed sequence";
4672
3.24k
                goto utf7Error;
4673
3.24k
            }
4674
21.5k
            else { /* begin base64-encoded section */
4675
21.5k
                inShift = 1;
4676
21.5k
                surrogate = 0;
4677
21.5k
                shiftOutStart = writer.pos;
4678
21.5k
                base64bits = 0;
4679
21.5k
                base64buffer = 0;
4680
21.5k
            }
4681
27.0k
        }
4682
7.51M
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
4683
7.42M
            s++;
4684
7.42M
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
4685
0
                goto onError;
4686
7.42M
        }
4687
89.8k
        else {
4688
89.8k
            startinpos = s-starts;
4689
89.8k
            s++;
4690
89.8k
            errmsg = "unexpected special character";
4691
89.8k
            goto utf7Error;
4692
89.8k
        }
4693
7.73M
        continue;
4694
7.73M
utf7Error:
4695
102k
        endinpos = s-starts;
4696
102k
        if (unicode_decode_call_errorhandler_writer(
4697
102k
                errors, &errorHandler,
4698
102k
                "utf7", errmsg,
4699
102k
                &starts, &e, &startinpos, &endinpos, &exc, &s,
4700
102k
                &writer))
4701
11.7k
            goto onError;
4702
102k
    }
4703
4704
    /* end of string */
4705
4706
15.7k
    if (inShift && !consumed) { /* in shift sequence, no more to follow */
4707
        /* if we're in an inconsistent state, that's an error */
4708
4.04k
        inShift = 0;
4709
4.04k
        if (surrogate ||
4710
3.43k
                (base64bits >= 6) ||
4711
2.69k
                (base64bits > 0 && base64buffer != 0)) {
4712
2.69k
            endinpos = size;
4713
2.69k
            if (unicode_decode_call_errorhandler_writer(
4714
2.69k
                    errors, &errorHandler,
4715
2.69k
                    "utf7", "unterminated shift sequence",
4716
2.69k
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
4717
2.69k
                    &writer))
4718
2.33k
                goto onError;
4719
368
            if (s < e)
4720
0
                goto restart;
4721
368
        }
4722
4.04k
    }
4723
4724
    /* return state */
4725
13.4k
    if (consumed) {
4726
0
        if (inShift) {
4727
0
            *consumed = startinpos;
4728
0
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
4729
0
                PyObject *result = PyUnicode_FromKindAndData(
4730
0
                        writer.kind, writer.data, shiftOutStart);
4731
0
                Py_XDECREF(errorHandler);
4732
0
                Py_XDECREF(exc);
4733
0
                _PyUnicodeWriter_Dealloc(&writer);
4734
0
                return result;
4735
0
            }
4736
0
            writer.pos = shiftOutStart; /* back off output */
4737
0
        }
4738
0
        else {
4739
0
            *consumed = s-starts;
4740
0
        }
4741
0
    }
4742
4743
13.4k
    Py_XDECREF(errorHandler);
4744
13.4k
    Py_XDECREF(exc);
4745
13.4k
    return _PyUnicodeWriter_Finish(&writer);
4746
4747
14.1k
  onError:
4748
14.1k
    Py_XDECREF(errorHandler);
4749
14.1k
    Py_XDECREF(exc);
4750
14.1k
    _PyUnicodeWriter_Dealloc(&writer);
4751
14.1k
    return NULL;
4752
13.4k
}
4753
4754
4755
/* Encode the str object `str` as UTF-7 and return a new bytes object, or
   NULL with an exception set.  `errors` is accepted but not used by this
   function.  ASCII characters for which ENCODE_DIRECT() is true are copied
   as they are, '+' becomes "+-", and everything else is written as
   base-64 encoded UTF-16 inside a "+...-" shift sequence. */
PyObject *
_PyUnicode_EncodeUTF7(PyObject *str,
                      const char *errors)
{
    Py_ssize_t len = PyUnicode_GET_LENGTH(str);
    if (len == 0) {
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
    }
    int kind = PyUnicode_KIND(str);
    const void *data = PyUnicode_DATA(str);

    /* The output buffer is sized at 8 bytes per input character.
       It might be possible to tighten this worst case */
    if (len > PY_SSIZE_T_MAX / 8) {
        return PyErr_NoMemory();
    }
    PyBytesWriter *writer = PyBytesWriter_Create(len * 8);
    if (writer == NULL) {
        return NULL;
    }

    int shifted = 0;            /* non-zero inside a "+..." section */
    unsigned int nbits = 0;     /* bits in "bits" not yet written out */
    unsigned long bits = 0;
    char *out = PyBytesWriter_GetData(writer);

    for (Py_ssize_t i = 0; i < len; ++i) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);

        if (!shifted) {
            if (ch == '+') {
                *out++ = '+';
                *out++ = '-';
                continue;
            }
            if (ENCODE_DIRECT(ch)) {
                *out++ = (char) ch;
                continue;
            }
            /* open a shift sequence, then encode ch below */
            *out++ = '+';
            shifted = 1;
        }
        else if (ENCODE_DIRECT(ch)) {
            /* shifting out */
            if (nbits) { /* output remaining bits */
                *out++ = TO_BASE64(bits << (6-nbits));
                bits = 0;
                nbits = 0;
            }
            shifted = 0;
            /* Characters not in the BASE64 set implicitly unshift the
               sequence so no '-' is required, except if the character is
               itself a '-' */
            if (IS_BASE64(ch) || ch == '-') {
                *out++ = '-';
            }
            *out++ = (char) ch;
            continue;
        }

        /* Base-64 encode ch as one UTF-16 code unit, or as a surrogate
           pair when it lies outside the BMP. */
        Py_UCS4 units[2];
        int nunits = 1;
        if (ch >= 0x10000) {
            assert(ch <= MAX_UNICODE);
            units[0] = Py_UNICODE_HIGH_SURROGATE(ch);
            units[1] = Py_UNICODE_LOW_SURROGATE(ch);
            nunits = 2;
        }
        else {
            units[0] = ch;
        }
        for (int k = 0; k < nunits; k++) {
            nbits += 16;
            bits = (bits << 16) | units[k];
            while (nbits >= 6) {
                *out++ = TO_BASE64(bits >> (nbits-6));
                nbits -= 6;
            }
        }
    }

    /* flush the left-over bits and close a still-open shift sequence */
    if (nbits) {
        *out++ = TO_BASE64(bits << (6-nbits));
    }
    if (shifted) {
        *out++ = '-';
    }
    return PyBytesWriter_FinishWithPointer(writer, out);
}
4844
4845
#undef IS_BASE64
4846
#undef FROM_BASE64
4847
#undef TO_BASE64
4848
#undef DECODE_DIRECT
4849
#undef ENCODE_DIRECT
4850
4851
/* --- UTF-8 Codec -------------------------------------------------------- */
4852
4853
PyObject *
4854
PyUnicode_DecodeUTF8(const char *s,
4855
                     Py_ssize_t size,
4856
                     const char *errors)
4857
2.22M
{
4858
2.22M
    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4859
2.22M
}
4860
4861
#include "stringlib/asciilib.h"
4862
#include "stringlib/codecs.h"
4863
#include "stringlib/undef.h"
4864
4865
#include "stringlib/ucs1lib.h"
4866
#include "stringlib/codecs.h"
4867
#include "stringlib/undef.h"
4868
4869
#include "stringlib/ucs2lib.h"
4870
#include "stringlib/codecs.h"
4871
#include "stringlib/undef.h"
4872
4873
#include "stringlib/ucs4lib.h"
4874
#include "stringlib/codecs.h"
4875
#include "stringlib/undef.h"
4876
4877
/* Per-byte bit patterns replicated across one C 'size_t':
     ASCII_CHAR_MASK  high bit of every byte; a non-zero AND with a word
                      means the word contains a non-ASCII, UTF8-encoded
                      char.
     VECTOR_0101      lowest bit of every byte.
     VECTOR_00FF      every other byte fully set.
   The two VECTOR_* masks are used to count codepoints in a UTF-8 string. */
#if SIZEOF_SIZE_T == 4
# define ASCII_CHAR_MASK 0x80808080U
# define VECTOR_0101     0x01010101U
# define VECTOR_00FF     0x00ff00ffU
#elif SIZEOF_SIZE_T == 8
# define ASCII_CHAR_MASK 0x8080808080808080ULL
# define VECTOR_0101     0x0101010101010101ULL
# define VECTOR_00FF     0x00ff00ff00ff00ffULL
#else
# error C 'size_t' size should be either 4 or 8!
#endif
4891
4892
/* ctz(v): number of trailing zero bits of v.  HAVE_CTZ tells the code below
   whether an implementation exists for the compiler in use.  Both
   intrinsics are documented as giving an undefined result for v == 0, so
   callers must pass a non-zero value. */
#if defined(__GNUC__) || defined(__clang__)
#define HAVE_CTZ 1
static inline unsigned int
ctz(size_t v)
{
    unsigned long long wide = (unsigned long long)v;
    return __builtin_ctzll(wide);
}
#elif defined(_MSC_VER)
#define HAVE_CTZ 1
static inline unsigned int
ctz(size_t v)
{
    unsigned long index;
#if SIZEOF_SIZE_T == 4
    _BitScanForward(&index, v);
#else
    _BitScanForward64(&index, v);
#endif /* SIZEOF_SIZE_T */
    return index;
}
#else
#define HAVE_CTZ 0
#endif
4915
4916
#if HAVE_CTZ && PY_LITTLE_ENDIAN
// Build a size_t from the bytes p[0]..p[size-1] without performing an
// unaligned word access and without reading past p[size-1].
// Byte i of the input lands in byte i of the result; bytes that are not
// loaded stay zero.  A size above SIZEOF_SIZE_T is treated like
// SIZEOF_SIZE_T (the "default" label).
static size_t
load_unaligned(const unsigned char *p, size_t size)
{
    union {
        unsigned char bytes[SIZEOF_SIZE_T];
        size_t value;
    } word;
    word.value = 0;
    // The byte-wise stores below assume little endian because:
    // * union is faster than bitwise or and shift.
    // * big endian machine is rare and hard to maintain.
    // Each case copies one byte and falls through to the smaller sizes.
    switch (size) {
    default:
#if SIZEOF_SIZE_T == 8
    case 8:
        word.bytes[7] = p[7];
        _Py_FALLTHROUGH;
    case 7:
        word.bytes[6] = p[6];
        _Py_FALLTHROUGH;
    case 6:
        word.bytes[5] = p[5];
        _Py_FALLTHROUGH;
    case 5:
        word.bytes[4] = p[4];
        _Py_FALLTHROUGH;
#endif
    case 4:
        word.bytes[3] = p[3];
        _Py_FALLTHROUGH;
    case 3:
        word.bytes[2] = p[2];
        _Py_FALLTHROUGH;
    case 2:
        word.bytes[1] = p[1];
        _Py_FALLTHROUGH;
    case 1:
        word.bytes[0] = p[0];
        break;
    case 0:
        break;
    }
    return word.value;
}
#endif
4963
4964
/*
4965
 * Find the first non-ASCII character in a byte sequence.
4966
 *
4967
 * This function scans a range of bytes from `start` to `end` and returns the
4968
 * index of the first byte that is not an ASCII character (i.e., has the most
4969
 * significant bit set). If all characters in the range are ASCII, it returns
4970
 * `end - start`.
4971
 */
4972
static Py_ssize_t
4973
find_first_nonascii(const unsigned char *start, const unsigned char *end)
4974
14.1M
{
4975
    // The search is done in `size_t` chunks.
4976
    // The start and end might not be aligned at `size_t` boundaries,
4977
    // so they're handled specially.
4978
4979
14.1M
    const unsigned char *p = start;
4980
4981
14.1M
    if (end - start >= SIZEOF_SIZE_T) {
4982
        // Avoid unaligned read.
4983
3.34M
#if PY_LITTLE_ENDIAN && HAVE_CTZ
4984
3.34M
        size_t u;
4985
3.34M
        memcpy(&u, p, sizeof(size_t));
4986
3.34M
        u &= ASCII_CHAR_MASK;
4987
3.34M
        if (u) {
4988
204k
            return (ctz(u) - 7) / 8;
4989
204k
        }
4990
3.14M
        p = _Py_ALIGN_DOWN(p + SIZEOF_SIZE_T, SIZEOF_SIZE_T);
4991
#else /* PY_LITTLE_ENDIAN && HAVE_CTZ */
4992
        const unsigned char *p2 = _Py_ALIGN_UP(p, SIZEOF_SIZE_T);
4993
        while (p < p2) {
4994
            if (*p & 0x80) {
4995
                return p - start;
4996
            }
4997
            p++;
4998
        }
4999
#endif
5000
5001
3.14M
        const unsigned char *e = end - SIZEOF_SIZE_T;
5002
108M
        while (p <= e) {
5003
106M
            size_t u = (*(const size_t *)p) & ASCII_CHAR_MASK;
5004
106M
            if (u) {
5005
185k
#if PY_LITTLE_ENDIAN && HAVE_CTZ
5006
185k
                return p - start + (ctz(u) - 7) / 8;
5007
#else
5008
                // big endian and minor compilers are difficult to test.
5009
                // fallback to per byte check.
5010
                break;
5011
#endif
5012
185k
            }
5013
105M
            p += SIZEOF_SIZE_T;
5014
105M
        }
5015
3.14M
    }
5016
13.7M
#if PY_LITTLE_ENDIAN && HAVE_CTZ
5017
14.1M
    assert((end - p) < SIZEOF_SIZE_T);
5018
    // we can not use *(const size_t*)p to avoid buffer overrun.
5019
13.7M
    size_t u = load_unaligned(p, end - p) & ASCII_CHAR_MASK;
5020
13.7M
    if (u) {
5021
173k
        return p - start + (ctz(u) - 7) / 8;
5022
173k
    }
5023
13.6M
    return end - start;
5024
#else
5025
    while (p < end) {
5026
        if (*p & 0x80) {
5027
            break;
5028
        }
5029
        p++;
5030
    }
5031
    return p - start;
5032
#endif
5033
13.7M
}
5034
5035
// Return 1 when the low byte of ch can start a UTF-8 sequence, i.e. it has
// the form 0xxxxxxx or 11xxxxxx, and 0 for a continuation byte (10xxxxxx).
// Only bits 6 and 7 of ch are examined.
static inline int
scalar_utf8_start_char(unsigned int ch)
{
    int high_bit_clear = (ch & 0x80) == 0;
    int second_bit_set = (ch & 0x40) != 0;
    return high_bit_clear || second_bit_set;
}
5041
5042
static inline size_t
5043
vector_utf8_start_chars(size_t v)
5044
317M
{
5045
317M
    return ((~v >> 7) | (v >> 6)) & VECTOR_0101;
5046
317M
}
5047
5048
5049
// Count the number of UTF-8 code points in a given byte sequence.
5050
static Py_ssize_t
5051
utf8_count_codepoints(const unsigned char *s, const unsigned char *end)
5052
260k
{
5053
260k
    Py_ssize_t len = 0;
5054
5055
260k
    if (end - s >= SIZEOF_SIZE_T) {
5056
195k
        while (!_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) {
5057
16.5k
            len += scalar_utf8_start_char(*s++);
5058
16.5k
        }
5059
5060
1.59M
        while (s + SIZEOF_SIZE_T <= end) {
5061
1.41M
            const unsigned char *e = end;
5062
1.41M
            if (e - s > SIZEOF_SIZE_T * 255) {
5063
1.23M
                e = s + SIZEOF_SIZE_T * 255;
5064
1.23M
            }
5065
1.41M
            Py_ssize_t vstart = 0;
5066
318M
            while (s + SIZEOF_SIZE_T <= e) {
5067
317M
                size_t v = *(size_t*)s;
5068
317M
                size_t vs = vector_utf8_start_chars(v);
5069
317M
                vstart += vs;
5070
317M
                s += SIZEOF_SIZE_T;
5071
317M
            }
5072
1.41M
            vstart = (vstart & VECTOR_00FF) + ((vstart >> 8) & VECTOR_00FF);
5073
1.41M
            vstart += vstart >> 16;
5074
1.41M
#if SIZEOF_SIZE_T == 8
5075
1.41M
            vstart += vstart >> 32;
5076
1.41M
#endif
5077
1.41M
            len += vstart & 0x7ff;
5078
1.41M
        }
5079
178k
    }
5080
961k
    while (s < end) {
5081
701k
        len += scalar_utf8_start_char(*s++);
5082
701k
    }
5083
260k
    return len;
5084
260k
}
5085
5086
static Py_ssize_t
5087
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
5088
6.01M
{
5089
6.01M
#if SIZEOF_SIZE_T <= SIZEOF_VOID_P
5090
6.01M
    if (_Py_IS_ALIGNED(start, ALIGNOF_SIZE_T)
5091
5.99M
        && _Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T))
5092
608k
    {
5093
        /* Fast path, see in STRINGLIB(utf8_decode) for
5094
           an explanation. */
5095
608k
        const char *p = start;
5096
608k
        Py_UCS1 *q = dest;
5097
1.48M
        while (p + SIZEOF_SIZE_T <= end) {
5098
1.01M
            size_t value = *(const size_t *) p;
5099
1.01M
            if (value & ASCII_CHAR_MASK)
5100
138k
                break;
5101
880k
            *((size_t *)q) = value;
5102
880k
            p += SIZEOF_SIZE_T;
5103
880k
            q += SIZEOF_SIZE_T;
5104
880k
        }
5105
2.49M
        while (p < end) {
5106
2.05M
            if ((unsigned char)*p & 0x80)
5107
161k
                break;
5108
1.89M
            *q++ = *p++;
5109
1.89M
        }
5110
608k
        return p - start;
5111
608k
    }
5112
5.40M
#endif
5113
5.40M
    Py_ssize_t pos = find_first_nonascii((const unsigned char*)start,
5114
5.40M
                                         (const unsigned char*)end);
5115
5.40M
    memcpy(dest, start, pos);
5116
5.40M
    return pos;
5117
6.01M
}
5118
5119
static int
5120
unicode_decode_utf8_impl(_PyUnicodeWriter *writer,
5121
                         const char *starts, const char *s, const char *end,
5122
                         _Py_error_handler error_handler,
5123
                         const char *errors,
5124
                         Py_ssize_t *consumed)
5125
564k
{
5126
564k
    Py_ssize_t startinpos, endinpos;
5127
564k
    const char *errmsg = "";
5128
564k
    PyObject *error_handler_obj = NULL;
5129
564k
    PyObject *exc = NULL;
5130
5131
148M
    while (s < end) {
5132
148M
        Py_UCS4 ch;
5133
148M
        int kind = writer->kind;
5134
5135
148M
        if (kind == PyUnicode_1BYTE_KIND) {
5136
531k
            if (PyUnicode_IS_ASCII(writer->buffer))
5137
302k
                ch = asciilib_utf8_decode(&s, end, writer->data, &writer->pos);
5138
228k
            else
5139
228k
                ch = ucs1lib_utf8_decode(&s, end, writer->data, &writer->pos);
5140
147M
        } else if (kind == PyUnicode_2BYTE_KIND) {
5141
71.1M
            ch = ucs2lib_utf8_decode(&s, end, writer->data, &writer->pos);
5142
76.4M
        } else {
5143
76.4M
            assert(kind == PyUnicode_4BYTE_KIND);
5144
76.4M
            ch = ucs4lib_utf8_decode(&s, end, writer->data, &writer->pos);
5145
76.4M
        }
5146
5147
148M
        switch (ch) {
5148
494k
        case 0:
5149
494k
            if (s == end || consumed)
5150
472k
                goto End;
5151
22.8k
            errmsg = "unexpected end of data";
5152
22.8k
            startinpos = s - starts;
5153
22.8k
            endinpos = end - starts;
5154
22.8k
            break;
5155
118M
        case 1:
5156
118M
            errmsg = "invalid start byte";
5157
118M
            startinpos = s - starts;
5158
118M
            endinpos = startinpos + 1;
5159
118M
            break;
5160
27.9M
        case 2:
5161
27.9M
            if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5162
0
                && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5163
0
            {
5164
                /* Truncated surrogate code in range D800-DFFF */
5165
0
                goto End;
5166
0
            }
5167
27.9M
            _Py_FALLTHROUGH;
5168
29.0M
        case 3:
5169
29.1M
        case 4:
5170
29.1M
            errmsg = "invalid continuation byte";
5171
29.1M
            startinpos = s - starts;
5172
29.1M
            endinpos = startinpos + ch - 1;
5173
29.1M
            break;
5174
302k
        default:
5175
            // ch doesn't fit into kind, so change the buffer kind to write
5176
            // the character
5177
302k
            if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
5178
0
                goto onError;
5179
302k
            continue;
5180
148M
        }
5181
5182
147M
        if (error_handler == _Py_ERROR_UNKNOWN)
5183
122k
            error_handler = _Py_GetErrorHandler(errors);
5184
5185
147M
        switch (error_handler) {
5186
0
        case _Py_ERROR_IGNORE:
5187
0
            s += (endinpos - startinpos);
5188
0
            break;
5189
5190
147M
        case _Py_ERROR_REPLACE:
5191
147M
            if (_PyUnicodeWriter_WriteCharInline(writer, 0xfffd) < 0)
5192
0
                goto onError;
5193
147M
            s += (endinpos - startinpos);
5194
147M
            break;
5195
5196
2.21k
        case _Py_ERROR_SURROGATEESCAPE:
5197
2.21k
        {
5198
2.21k
            Py_ssize_t i;
5199
5200
2.21k
            if (_PyUnicodeWriter_PrepareKind(writer, PyUnicode_2BYTE_KIND) < 0)
5201
0
                goto onError;
5202
4.70k
            for (i=startinpos; i<endinpos; i++) {
5203
2.49k
                ch = (Py_UCS4)(unsigned char)(starts[i]);
5204
2.49k
                PyUnicode_WRITE(writer->kind, writer->data, writer->pos,
5205
2.49k
                                ch + 0xdc00);
5206
2.49k
                writer->pos++;
5207
2.49k
            }
5208
2.21k
            s += (endinpos - startinpos);
5209
2.21k
            break;
5210
2.21k
        }
5211
5212
775
        default:
5213
775
            if (unicode_decode_call_errorhandler_writer(
5214
775
                    errors, &error_handler_obj,
5215
775
                    "utf-8", errmsg,
5216
775
                    &starts, &end, &startinpos, &endinpos, &exc, &s,
5217
775
                    writer)) {
5218
775
                goto onError;
5219
775
            }
5220
5221
0
            if (_PyUnicodeWriter_Prepare(writer, end - s, 127) < 0) {
5222
0
                return -1;
5223
0
            }
5224
147M
        }
5225
147M
    }
5226
5227
564k
End:
5228
564k
    if (consumed)
5229
975
        *consumed = s - starts;
5230
5231
564k
    Py_XDECREF(error_handler_obj);
5232
564k
    Py_XDECREF(exc);
5233
564k
    return 0;
5234
5235
775
onError:
5236
775
    Py_XDECREF(error_handler_obj);
5237
775
    Py_XDECREF(exc);
5238
775
    return -1;
5239
564k
}
5240
5241
5242
static PyObject *
5243
unicode_decode_utf8(const char *s, Py_ssize_t size,
5244
                    _Py_error_handler error_handler, const char *errors,
5245
                    Py_ssize_t *consumed)
5246
10.4M
{
5247
10.4M
    if (size == 0) {
5248
71.9k
        if (consumed) {
5249
0
            *consumed = 0;
5250
0
        }
5251
71.9k
        _Py_RETURN_UNICODE_EMPTY();
5252
71.9k
    }
5253
5254
    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
5255
10.3M
    if (size == 1 && (unsigned char)s[0] < 128) {
5256
1.62M
        if (consumed) {
5257
0
            *consumed = 1;
5258
0
        }
5259
1.62M
        return get_latin1_char((unsigned char)s[0]);
5260
1.62M
    }
5261
5262
    // I don't know this check is necessary or not. But there is a test
5263
    // case that requires size=PY_SSIZE_T_MAX cause MemoryError.
5264
8.76M
    if (PY_SSIZE_T_MAX - sizeof(PyCompactUnicodeObject) < (size_t)size) {
5265
0
        PyErr_NoMemory();
5266
0
        return NULL;
5267
0
    }
5268
5269
8.76M
    const char *starts = s;
5270
8.76M
    const char *end = s + size;
5271
5272
8.76M
    Py_ssize_t pos = find_first_nonascii((const unsigned char*)starts, (const unsigned char*)end);
5273
8.76M
    if (pos == size) {  // fast path: ASCII string.
5274
8.24M
        PyObject *u = PyUnicode_New(size, 127);
5275
8.24M
        if (u == NULL) {
5276
0
            return NULL;
5277
0
        }
5278
8.24M
        memcpy(PyUnicode_1BYTE_DATA(u), s, size);
5279
8.24M
        if (consumed) {
5280
0
            *consumed = size;
5281
0
        }
5282
8.24M
        return u;
5283
8.24M
    }
5284
5285
520k
    int maxchr = 127;
5286
520k
    Py_ssize_t maxsize = size;
5287
5288
520k
    unsigned char ch = (unsigned char)(s[pos]);
5289
    // error handler other than strict may remove/replace the invalid byte.
5290
    // consumed != NULL allows 1~3 bytes remainings.
5291
    // 0x80 <= ch < 0xc2 is invalid start byte that cause UnicodeDecodeError.
5292
    // otherwise: check the input and decide the maxchr and maxsize to reduce
5293
    // reallocation and copy.
5294
520k
    if (error_handler == _Py_ERROR_STRICT && !consumed && ch >= 0xc2) {
5295
        // we only calculate the number of codepoints and don't determine the exact maxchr.
5296
        // This is because writing fast and portable SIMD code to find maxchr is difficult.
5297
        // If reallocation occurs for a larger maxchar, knowing the exact number of codepoints
5298
        // means that it is no longer necessary to allocate several times the required amount
5299
        // of memory.
5300
260k
        maxsize = utf8_count_codepoints((const unsigned char *)s, (const unsigned char *)end);
5301
260k
        if (ch < 0xc4) { // latin1
5302
146k
            maxchr = 0xff;
5303
146k
        }
5304
113k
        else if (ch < 0xf0) { // ucs2
5305
103k
            maxchr = 0xffff;
5306
103k
        }
5307
9.86k
        else { // ucs4
5308
9.86k
            maxchr = 0x10ffff;
5309
9.86k
        }
5310
260k
    }
5311
520k
    PyObject *u = PyUnicode_New(maxsize, maxchr);
5312
520k
    if (!u) {
5313
0
        return NULL;
5314
0
    }
5315
5316
    // Use _PyUnicodeWriter after fast path is failed.
5317
520k
    _PyUnicodeWriter writer;
5318
520k
    _PyUnicodeWriter_InitWithBuffer(&writer, u);
5319
520k
    if (maxchr <= 255) {
5320
407k
        memcpy(PyUnicode_1BYTE_DATA(u), s, pos);
5321
407k
        s += pos;
5322
407k
        writer.pos = pos;
5323
407k
    }
5324
5325
520k
    if (unicode_decode_utf8_impl(&writer, starts, s, end,
5326
520k
                                 error_handler, errors,
5327
520k
                                 consumed) < 0) {
5328
775
        _PyUnicodeWriter_Dealloc(&writer);
5329
775
        return NULL;
5330
775
    }
5331
519k
    return _PyUnicodeWriter_Finish(&writer);
5332
520k
}
5333
5334
5335
// Used by PyUnicodeWriter_WriteUTF8() implementation
5336
int
5337
_PyUnicode_DecodeUTF8Writer(_PyUnicodeWriter *writer,
5338
                            const char *s, Py_ssize_t size,
5339
                            _Py_error_handler error_handler, const char *errors,
5340
                            Py_ssize_t *consumed)
5341
5.46M
{
5342
5.46M
    if (size == 0) {
5343
8.26k
        if (consumed) {
5344
0
            *consumed = 0;
5345
0
        }
5346
8.26k
        return 0;
5347
8.26k
    }
5348
5349
    // fast path: try ASCII string.
5350
5.46M
    if (_PyUnicodeWriter_Prepare(writer, size, 127) < 0) {
5351
0
        return -1;
5352
0
    }
5353
5354
5.46M
    const char *starts = s;
5355
5.46M
    const char *end = s + size;
5356
5.46M
    Py_ssize_t decoded = 0;
5357
5.46M
    Py_UCS1 *dest = (Py_UCS1*)writer->data + writer->pos * writer->kind;
5358
5.46M
    if (writer->kind == PyUnicode_1BYTE_KIND) {
5359
5.45M
        decoded = ascii_decode(s, end, dest);
5360
5.45M
        writer->pos += decoded;
5361
5362
5.45M
        if (decoded == size) {
5363
5.41M
            if (consumed) {
5364
914
                *consumed = size;
5365
914
            }
5366
5.41M
            return 0;
5367
5.41M
        }
5368
42.3k
        s += decoded;
5369
42.3k
    }
5370
5371
44.5k
    return unicode_decode_utf8_impl(writer, starts, s, end,
5372
44.5k
                                    error_handler, errors, consumed);
5373
5.46M
}
5374
5375
5376
PyObject *
5377
PyUnicode_DecodeUTF8Stateful(const char *s,
5378
                             Py_ssize_t size,
5379
                             const char *errors,
5380
                             Py_ssize_t *consumed)
5381
10.4M
{
5382
10.4M
    return unicode_decode_utf8(s, size,
5383
10.4M
                               errors ? _Py_ERROR_UNKNOWN : _Py_ERROR_STRICT,
5384
10.4M
                               errors, consumed);
5385
10.4M
}
5386
5387
5388
/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5389
   non-zero, use strict error handler otherwise.
5390
5391
   On success, write a pointer to a newly allocated wide character string into
5392
   *wstr (use PyMem_RawFree() to free the memory) and write the output length
5393
   (in number of wchar_t units) into *wlen (if wlen is set).
5394
5395
   On memory allocation failure, return -1.
5396
5397
   On decoding error (if surrogateescape is zero), return -2. If wlen is
5398
   non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5399
   is not NULL, write the decoding error message into *reason. */
5400
int
5401
_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
5402
                 const char **reason, _Py_error_handler errors)
5403
5.24k
{
5404
5.24k
    const char *orig_s = s;
5405
5.24k
    const char *e;
5406
5.24k
    wchar_t *unicode;
5407
5.24k
    Py_ssize_t outpos;
5408
5409
5.24k
    int surrogateescape = 0;
5410
5.24k
    int surrogatepass = 0;
5411
5.24k
    switch (errors)
5412
5.24k
    {
5413
0
    case _Py_ERROR_STRICT:
5414
0
        break;
5415
5.24k
    case _Py_ERROR_SURROGATEESCAPE:
5416
5.24k
        surrogateescape = 1;
5417
5.24k
        break;
5418
0
    case _Py_ERROR_SURROGATEPASS:
5419
0
        surrogatepass = 1;
5420
0
        break;
5421
0
    default:
5422
0
        return -3;
5423
5.24k
    }
5424
5425
    /* Note: size will always be longer than the resulting Unicode
5426
       character count */
5427
5.24k
    if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1 < size) {
5428
0
        return -1;
5429
0
    }
5430
5431
5.24k
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
5432
5.24k
    if (!unicode) {
5433
0
        return -1;
5434
0
    }
5435
5436
    /* Unpack UTF-8 encoded data */
5437
5.24k
    e = s + size;
5438
5.24k
    outpos = 0;
5439
5.24k
    while (s < e) {
5440
5.24k
        Py_UCS4 ch;
5441
5.24k
#if SIZEOF_WCHAR_T == 4
5442
5.24k
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
5443
#else
5444
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
5445
#endif
5446
5.24k
        if (ch > 0xFF) {
5447
0
#if SIZEOF_WCHAR_T == 4
5448
0
            Py_UNREACHABLE();
5449
#else
5450
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
5451
            /* write a surrogate pair */
5452
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5453
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5454
#endif
5455
0
        }
5456
5.24k
        else {
5457
5.24k
            if (!ch && s == e) {
5458
5.24k
                break;
5459
5.24k
            }
5460
5461
0
            if (surrogateescape) {
5462
0
                unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5463
0
            }
5464
0
            else {
5465
                /* Is it a valid three-byte code? */
5466
0
                if (surrogatepass
5467
0
                    && (e - s) >= 3
5468
0
                    && (s[0] & 0xf0) == 0xe0
5469
0
                    && (s[1] & 0xc0) == 0x80
5470
0
                    && (s[2] & 0xc0) == 0x80)
5471
0
                {
5472
0
                    ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5473
0
                    s += 3;
5474
0
                    unicode[outpos++] = ch;
5475
0
                }
5476
0
                else {
5477
0
                    PyMem_RawFree(unicode );
5478
0
                    if (reason != NULL) {
5479
0
                        switch (ch) {
5480
0
                        case 0:
5481
0
                            *reason = "unexpected end of data";
5482
0
                            break;
5483
0
                        case 1:
5484
0
                            *reason = "invalid start byte";
5485
0
                            break;
5486
                        /* 2, 3, 4 */
5487
0
                        default:
5488
0
                            *reason = "invalid continuation byte";
5489
0
                            break;
5490
0
                        }
5491
0
                    }
5492
0
                    if (wlen != NULL) {
5493
0
                        *wlen = s - orig_s;
5494
0
                    }
5495
0
                    return -2;
5496
0
                }
5497
0
            }
5498
0
        }
5499
5.24k
    }
5500
5.24k
    unicode[outpos] = L'\0';
5501
5.24k
    if (wlen) {
5502
5.24k
        *wlen = outpos;
5503
5.24k
    }
5504
5.24k
    *wstr = unicode;
5505
5.24k
    return 0;
5506
5.24k
}
5507
5508
5509
wchar_t*
5510
_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5511
                               size_t *wlen)
5512
0
{
5513
0
    wchar_t *wstr;
5514
0
    int res = _Py_DecodeUTF8Ex(arg, arglen,
5515
0
                               &wstr, wlen,
5516
0
                               NULL, _Py_ERROR_SURROGATEESCAPE);
5517
0
    if (res != 0) {
5518
        /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5519
0
        assert(res != -3);
5520
0
        if (wlen) {
5521
0
            *wlen = (size_t)res;
5522
0
        }
5523
0
        return NULL;
5524
0
    }
5525
0
    return wstr;
5526
0
}
5527
5528
5529
/* UTF-8 encoder.
5530
5531
   On success, return 0 and write the newly allocated character string (use
5532
   PyMem_Free() to free the memory) into *str.
5533
5534
   On encoding failure, return -2 and write the position of the invalid
5535
   surrogate character into *error_pos (if error_pos is set) and the decoding
5536
   error message into *reason (if reason is set).
5537
5538
   On memory allocation failure, return -1. */
5539
int
5540
_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
5541
                 const char **reason, int raw_malloc, _Py_error_handler errors)
5542
632
{
5543
632
    const Py_ssize_t max_char_size = 4;
5544
632
    Py_ssize_t len = wcslen(text);
5545
5546
632
    assert(len >= 0);
5547
5548
632
    int surrogateescape = 0;
5549
632
    int surrogatepass = 0;
5550
632
    switch (errors)
5551
632
    {
5552
64
    case _Py_ERROR_STRICT:
5553
64
        break;
5554
568
    case _Py_ERROR_SURROGATEESCAPE:
5555
568
        surrogateescape = 1;
5556
568
        break;
5557
0
    case _Py_ERROR_SURROGATEPASS:
5558
0
        surrogatepass = 1;
5559
0
        break;
5560
0
    default:
5561
0
        return -3;
5562
632
    }
5563
5564
632
    if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5565
0
        return -1;
5566
0
    }
5567
632
    char *bytes;
5568
632
    if (raw_malloc) {
5569
632
        bytes = PyMem_RawMalloc((len + 1) * max_char_size);
5570
632
    }
5571
0
    else {
5572
0
        bytes = PyMem_Malloc((len + 1) * max_char_size);
5573
0
    }
5574
632
    if (bytes == NULL) {
5575
0
        return -1;
5576
0
    }
5577
5578
632
    char *p = bytes;
5579
632
    Py_ssize_t i;
5580
41.6k
    for (i = 0; i < len; ) {
5581
41.0k
        Py_ssize_t ch_pos = i;
5582
41.0k
        Py_UCS4 ch = text[i];
5583
41.0k
        i++;
5584
#if Py_UNICODE_SIZE == 2
5585
        if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5586
            && i < len
5587
            && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5588
        {
5589
            ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5590
            i++;
5591
        }
5592
#endif
5593
5594
41.0k
        if (ch < 0x80) {
5595
            /* Encode ASCII */
5596
41.0k
            *p++ = (char) ch;
5597
5598
41.0k
        }
5599
0
        else if (ch < 0x0800) {
5600
            /* Encode Latin-1 */
5601
0
            *p++ = (char)(0xc0 | (ch >> 6));
5602
0
            *p++ = (char)(0x80 | (ch & 0x3f));
5603
0
        }
5604
0
        else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
5605
            /* surrogateescape error handler */
5606
0
            if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
5607
0
                if (error_pos != NULL) {
5608
0
                    *error_pos = (size_t)ch_pos;
5609
0
                }
5610
0
                if (reason != NULL) {
5611
0
                    *reason = "encoding error";
5612
0
                }
5613
0
                if (raw_malloc) {
5614
0
                    PyMem_RawFree(bytes);
5615
0
                }
5616
0
                else {
5617
0
                    PyMem_Free(bytes);
5618
0
                }
5619
0
                return -2;
5620
0
            }
5621
0
            *p++ = (char)(ch & 0xff);
5622
0
        }
5623
0
        else if (ch < 0x10000) {
5624
0
            *p++ = (char)(0xe0 | (ch >> 12));
5625
0
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5626
0
            *p++ = (char)(0x80 | (ch & 0x3f));
5627
0
        }
5628
0
        else {  /* ch >= 0x10000 */
5629
0
            assert(ch <= MAX_UNICODE);
5630
            /* Encode UCS4 Unicode ordinals */
5631
0
            *p++ = (char)(0xf0 | (ch >> 18));
5632
0
            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5633
0
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5634
0
            *p++ = (char)(0x80 | (ch & 0x3f));
5635
0
        }
5636
41.0k
    }
5637
632
    *p++ = '\0';
5638
5639
632
    size_t final_size = (p - bytes);
5640
632
    char *bytes2;
5641
632
    if (raw_malloc) {
5642
632
        bytes2 = PyMem_RawRealloc(bytes, final_size);
5643
632
    }
5644
0
    else {
5645
0
        bytes2 = PyMem_Realloc(bytes, final_size);
5646
0
    }
5647
632
    if (bytes2 == NULL) {
5648
0
        if (error_pos != NULL) {
5649
0
            *error_pos = (size_t)-1;
5650
0
        }
5651
0
        if (raw_malloc) {
5652
0
            PyMem_RawFree(bytes);
5653
0
        }
5654
0
        else {
5655
0
            PyMem_Free(bytes);
5656
0
        }
5657
0
        return -1;
5658
0
    }
5659
632
    *str = bytes2;
5660
632
    return 0;
5661
632
}
5662
5663
5664
/* Primary internal function which creates utf8 encoded bytes objects.
5665
5666
   Allocation strategy:  if the string is short, convert into a stack buffer
5667
   and allocate exactly as much space needed at the end.  Else allocate the
5668
   maximum possible needed (4 result bytes per Unicode character), and return
5669
   the excess memory at the end.
5670
*/
5671
static PyObject *
5672
unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5673
                    const char *errors)
5674
16.2M
{
5675
16.2M
    if (!PyUnicode_Check(unicode)) {
5676
0
        PyErr_BadArgument();
5677
0
        return NULL;
5678
0
    }
5679
5680
16.2M
    if (PyUnicode_UTF8(unicode))
5681
9.83M
        return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5682
9.83M
                                         PyUnicode_UTF8_LENGTH(unicode));
5683
5684
6.44M
    int kind = PyUnicode_KIND(unicode);
5685
6.44M
    const void *data = PyUnicode_DATA(unicode);
5686
6.44M
    Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5687
5688
6.44M
    PyBytesWriter *writer;
5689
6.44M
    char *end;
5690
5691
6.44M
    switch (kind) {
5692
0
    default:
5693
0
        Py_UNREACHABLE();
5694
4.95M
    case PyUnicode_1BYTE_KIND:
5695
        /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5696
4.95M
        assert(!PyUnicode_IS_ASCII(unicode));
5697
4.95M
        writer = ucs1lib_utf8_encoder(unicode, data, size,
5698
4.95M
                                      error_handler, errors, &end);
5699
4.95M
        break;
5700
1.42M
    case PyUnicode_2BYTE_KIND:
5701
1.42M
        writer = ucs2lib_utf8_encoder(unicode, data, size,
5702
1.42M
                                      error_handler, errors, &end);
5703
1.42M
        break;
5704
61.5k
    case PyUnicode_4BYTE_KIND:
5705
61.5k
        writer = ucs4lib_utf8_encoder(unicode, data, size,
5706
61.5k
                                      error_handler, errors, &end);
5707
61.5k
        break;
5708
6.44M
    }
5709
5710
6.44M
    if (writer == NULL) {
5711
170k
        PyBytesWriter_Discard(writer);
5712
170k
        return NULL;
5713
170k
    }
5714
6.27M
    return PyBytesWriter_FinishWithPointer(writer, end);
5715
6.44M
}
5716
5717
static int
5718
unicode_fill_utf8(PyObject *unicode)
5719
166k
{
5720
166k
    _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(unicode);
5721
    /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5722
166k
    assert(!PyUnicode_IS_ASCII(unicode));
5723
5724
166k
    int kind = PyUnicode_KIND(unicode);
5725
166k
    const void *data = PyUnicode_DATA(unicode);
5726
166k
    Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5727
5728
166k
    PyBytesWriter *writer;
5729
166k
    char *end;
5730
5731
166k
    switch (kind) {
5732
0
    default:
5733
0
        Py_UNREACHABLE();
5734
132k
    case PyUnicode_1BYTE_KIND:
5735
132k
        writer = ucs1lib_utf8_encoder(unicode, data, size,
5736
132k
                                      _Py_ERROR_STRICT, NULL, &end);
5737
132k
        break;
5738
28.0k
    case PyUnicode_2BYTE_KIND:
5739
28.0k
        writer = ucs2lib_utf8_encoder(unicode, data, size,
5740
28.0k
                                      _Py_ERROR_STRICT, NULL, &end);
5741
28.0k
        break;
5742
5.64k
    case PyUnicode_4BYTE_KIND:
5743
5.64k
        writer = ucs4lib_utf8_encoder(unicode, data, size,
5744
5.64k
                                      _Py_ERROR_STRICT, NULL, &end);
5745
5.64k
        break;
5746
166k
    }
5747
166k
    if (writer == NULL) {
5748
207
        return -1;
5749
207
    }
5750
5751
166k
    const char *start = PyBytesWriter_GetData(writer);
5752
166k
    Py_ssize_t len = end - start;
5753
5754
166k
    char *cache = PyMem_Malloc(len + 1);
5755
166k
    if (cache == NULL) {
5756
0
        PyBytesWriter_Discard(writer);
5757
0
        PyErr_NoMemory();
5758
0
        return -1;
5759
0
    }
5760
166k
    memcpy(cache, start, len);
5761
166k
    cache[len] = '\0';
5762
166k
    PyUnicode_SET_UTF8_LENGTH(unicode, len);
5763
166k
    PyUnicode_SET_UTF8(unicode, cache);
5764
166k
    PyBytesWriter_Discard(writer);
5765
166k
    return 0;
5766
166k
}
5767
5768
/* Encode `unicode` to UTF-8 using the error handler named by `errors`.

   The handler is passed to unicode_encode_utf8() as _Py_ERROR_UNKNOWN
   together with the `errors` string. */
PyObject *
_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
{
    const _Py_error_handler handler = _Py_ERROR_UNKNOWN;
    PyObject *encoded = unicode_encode_utf8(unicode, handler, errors);
    return encoded;
}
5773
5774
5775
/* Encode `unicode` to UTF-8 with no explicit error handler string
   (errors == NULL).  Same call that _PyUnicode_AsUTF8String(unicode, NULL)
   makes. */
PyObject *
PyUnicode_AsUTF8String(PyObject *unicode)
{
    return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, NULL);
}
5780
5781
/* --- UTF-32 Codec ------------------------------------------------------- */
5782
5783
PyObject *
5784
PyUnicode_DecodeUTF32(const char *s,
5785
                      Py_ssize_t size,
5786
                      const char *errors,
5787
                      int *byteorder)
5788
100
{
5789
100
    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5790
100
}
5791
5792
PyObject *
5793
PyUnicode_DecodeUTF32Stateful(const char *s,
5794
                              Py_ssize_t size,
5795
                              const char *errors,
5796
                              int *byteorder,
5797
                              Py_ssize_t *consumed)
5798
15.6k
{
5799
15.6k
    const char *starts = s;
5800
15.6k
    Py_ssize_t startinpos;
5801
15.6k
    Py_ssize_t endinpos;
5802
15.6k
    _PyUnicodeWriter writer;
5803
15.6k
    const unsigned char *q, *e;
5804
15.6k
    int le, bo = 0;       /* assume native ordering by default */
5805
15.6k
    const char *encoding;
5806
15.6k
    const char *errmsg = "";
5807
15.6k
    PyObject *errorHandler = NULL;
5808
15.6k
    PyObject *exc = NULL;
5809
5810
15.6k
    q = (const unsigned char *)s;
5811
15.6k
    e = q + size;
5812
5813
15.6k
    if (byteorder)
5814
15.5k
        bo = *byteorder;
5815
5816
    /* Check for BOM marks (U+FEFF) in the input and adjust current
5817
       byte order setting accordingly. In native mode, the leading BOM
5818
       mark is skipped, in all other modes, it is copied to the output
5819
       stream as-is (giving a ZWNBSP character). */
5820
15.6k
    if (bo == 0 && size >= 4) {
5821
12.9k
        Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5822
12.9k
        if (bom == 0x0000FEFF) {
5823
126
            bo = -1;
5824
126
            q += 4;
5825
126
        }
5826
12.8k
        else if (bom == 0xFFFE0000) {
5827
199
            bo = 1;
5828
199
            q += 4;
5829
199
        }
5830
12.9k
        if (byteorder)
5831
12.8k
            *byteorder = bo;
5832
12.9k
    }
5833
5834
15.6k
    if (q == e) {
5835
71
        if (consumed)
5836
0
            *consumed = size;
5837
71
        _Py_RETURN_UNICODE_EMPTY();
5838
71
    }
5839
5840
#ifdef WORDS_BIGENDIAN
5841
    le = bo < 0;
5842
#else
5843
15.6k
    le = bo <= 0;
5844
15.6k
#endif
5845
15.6k
    encoding = le ? "utf-32-le" : "utf-32-be";
5846
5847
15.6k
    _PyUnicodeWriter_Init(&writer);
5848
15.6k
    writer.min_length = (e - q + 3) / 4;
5849
15.6k
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
5850
0
        goto onError;
5851
5852
95.4k
    while (1) {
5853
95.4k
        Py_UCS4 ch = 0;
5854
95.4k
        Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
5855
5856
95.4k
        if (e - q >= 4) {
5857
83.4k
            int kind = writer.kind;
5858
83.4k
            void *data = writer.data;
5859
83.4k
            const unsigned char *last = e - 4;
5860
83.4k
            Py_ssize_t pos = writer.pos;
5861
83.4k
            if (le) {
5862
106k
                do {
5863
106k
                    ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5864
106k
                    if (ch > maxch)
5865
78.8k
                        break;
5866
27.8k
                    if (kind != PyUnicode_1BYTE_KIND &&
5867
8.96k
                        Py_UNICODE_IS_SURROGATE(ch))
5868
202
                        break;
5869
27.6k
                    PyUnicode_WRITE(kind, data, pos++, ch);
5870
27.6k
                    q += 4;
5871
27.6k
                } while (q <= last);
5872
80.1k
            }
5873
3.23k
            else {
5874
5.77k
                do {
5875
5.77k
                    ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5876
5.77k
                    if (ch > maxch)
5877
3.00k
                        break;
5878
2.76k
                    if (kind != PyUnicode_1BYTE_KIND &&
5879
2.17k
                        Py_UNICODE_IS_SURROGATE(ch))
5880
106
                        break;
5881
2.66k
                    PyUnicode_WRITE(kind, data, pos++, ch);
5882
2.66k
                    q += 4;
5883
2.66k
                } while (q <= last);
5884
3.23k
            }
5885
83.4k
            writer.pos = pos;
5886
83.4k
        }
5887
5888
95.4k
        if (Py_UNICODE_IS_SURROGATE(ch)) {
5889
312
            errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
5890
312
            startinpos = ((const char *)q) - starts;
5891
312
            endinpos = startinpos + 4;
5892
312
        }
5893
95.0k
        else if (ch <= maxch) {
5894
13.2k
            if (q == e || consumed)
5895
3.75k
                break;
5896
            /* remaining bytes at the end? (size should be divisible by 4) */
5897
9.51k
            errmsg = "truncated data";
5898
9.51k
            startinpos = ((const char *)q) - starts;
5899
9.51k
            endinpos = ((const char *)e) - starts;
5900
9.51k
        }
5901
81.8k
        else {
5902
81.8k
            if (ch < 0x110000) {
5903
4.08k
                if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5904
0
                    goto onError;
5905
4.08k
                q += 4;
5906
4.08k
                continue;
5907
4.08k
            }
5908
77.7k
            errmsg = "code point not in range(0x110000)";
5909
77.7k
            startinpos = ((const char *)q) - starts;
5910
77.7k
            endinpos = startinpos + 4;
5911
77.7k
        }
5912
5913
        /* The remaining input chars are ignored if the callback
5914
           chooses to skip the input */
5915
87.5k
        if (unicode_decode_call_errorhandler_writer(
5916
87.5k
                errors, &errorHandler,
5917
87.5k
                encoding, errmsg,
5918
87.5k
                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
5919
87.5k
                &writer))
5920
11.8k
            goto onError;
5921
87.5k
    }
5922
5923
3.75k
    if (consumed)
5924
0
        *consumed = (const char *)q-starts;
5925
5926
3.75k
    Py_XDECREF(errorHandler);
5927
3.75k
    Py_XDECREF(exc);
5928
3.75k
    return _PyUnicodeWriter_Finish(&writer);
5929
5930
11.8k
  onError:
5931
11.8k
    _PyUnicodeWriter_Dealloc(&writer);
5932
11.8k
    Py_XDECREF(errorHandler);
5933
11.8k
    Py_XDECREF(exc);
5934
11.8k
    return NULL;
5935
15.6k
}
5936
5937
/* Encode the str object `str` as UTF-32 and return a bytes object.

   byteorder: -1 encodes as "utf-32-le", 1 as "utf-32-be"; 0 encodes as
   "utf-32", i.e. a BOM (0xFEFF) is written first and the data follows in
   native order.

   Positions where the kind-specific encoder stops before the end of the
   string are passed to the error handler selected by `errors` with the
   message "surrogates not allowed".  A replacement must be either bytes
   whose length is a multiple of 4 or an ASCII string; anything else raises
   the encode exception.

   Return NULL on failure; PyErr_BadArgument() is called if `str` is not a
   str object. */
PyObject *
_PyUnicode_EncodeUTF32(PyObject *str,
                       const char *errors,
                       int byteorder)
{
    if (!PyUnicode_Check(str)) {
        PyErr_BadArgument();
        return NULL;
    }
    int kind = PyUnicode_KIND(str);
    const void *data = PyUnicode_DATA(str);
    Py_ssize_t len = PyUnicode_GET_LENGTH(str);

    /* One extra 4-byte unit is needed for the BOM when byteorder == 0. */
    const int bom_units = (byteorder == 0);
    if (len > PY_SSIZE_T_MAX / 4 - bom_units) {
        return PyErr_NoMemory();
    }
    Py_ssize_t nsize = len + bom_units;

#if PY_LITTLE_ENDIAN
    int native_ordering = byteorder <= 0;
#else
    int native_ordering = byteorder >= 0;
#endif

    if (kind == PyUnicode_1BYTE_KIND) {
        // gh-139156: Don't use PyBytesWriter API here since it has an overhead
        // on short strings
        PyObject *v = PyBytes_FromStringAndSize(NULL, nsize * 4);
        if (v == NULL) {
            return NULL;
        }

        /* output buffer is 4-bytes aligned */
        assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
        uint32_t *dst = (uint32_t *)PyBytes_AS_STRING(v);
        if (byteorder == 0) {
            *dst++ = 0xFEFF;
        }
        if (len > 0) {
            ucs1lib_utf32_encode((const Py_UCS1 *)data, len,
                                 &dst, native_ordering);
        }
        return v;
    }

    PyBytesWriter *writer = PyBytesWriter_Create(nsize * 4);
    if (writer == NULL) {
        return NULL;
    }

    /* output buffer is 4-bytes aligned */
    assert(_Py_IS_ALIGNED(PyBytesWriter_GetData(writer), 4));
    uint32_t *out = (uint32_t *)PyBytesWriter_GetData(writer);
    if (byteorder == 0) {
        *out++ = 0xFEFF;
    }
    if (len == 0) {
        return PyBytesWriter_Finish(writer);
    }

    /* Codec name reported to the error handler. */
    const char *encoding;
    switch (byteorder) {
    case -1:
        encoding = "utf-32-le";
        break;
    case 1:
        encoding = "utf-32-be";
        break;
    default:
        encoding = "utf-32";
        break;
    }

    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *rep = NULL;

    Py_ssize_t pos = 0;
    while (pos < len) {
        /* Encode as far as possible; the helper returns how many
           characters it handled. */
        if (kind == PyUnicode_2BYTE_KIND) {
            pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        else {
            assert(kind == PyUnicode_4BYTE_KIND);
            pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        if (pos == len) {
            break;
        }

        /* The encoder stopped at `pos`: ask the error handler. */
        Py_ssize_t newpos;
        rep = unicode_encode_call_errorhandler(
                errors, &errorHandler,
                encoding, "surrogates not allowed",
                str, &exc, pos, pos + 1, &newpos);
        if (rep == NULL) {
            goto error;
        }

        const int rep_is_bytes = PyBytes_Check(rep);
        Py_ssize_t repsize, moreunits;
        int rep_is_invalid;
        if (rep_is_bytes) {
            /* Raw bytes must be a whole number of 4-byte units. */
            repsize = PyBytes_GET_SIZE(rep);
            moreunits = repsize / 4;
            rep_is_invalid = (repsize & 3) != 0;
        }
        else {
            /* A string replacement must be pure ASCII. */
            assert(PyUnicode_Check(rep));
            moreunits = repsize = PyUnicode_GET_LENGTH(rep);
            rep_is_invalid = !PyUnicode_IS_ASCII(rep);
        }
        if (rep_is_invalid) {
            raise_encode_exception(&exc, encoding,
                                   str, pos, pos + 1,
                                   "surrogates not allowed");
            goto error;
        }
        moreunits += pos - newpos;
        pos = newpos;

        /* four bytes are reserved for each surrogate */
        if (moreunits > 0) {
            out = PyBytesWriter_GrowAndUpdatePointer(writer, 4 * moreunits, out);
            if (out == NULL) {
                goto error;
            }
        }

        if (rep_is_bytes) {
            memcpy(out, PyBytes_AS_STRING(rep), repsize);
            out += repsize / 4;
        }
        else {
            /* rep is unicode */
            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
            ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
                                 &out, native_ordering);
        }

        Py_CLEAR(rep);
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);

    /* Cut back to size actually needed. This is necessary for, for example,
       encoding of a string containing isolated surrogates and the 'ignore'
       handler is used. */
    return PyBytesWriter_FinishWithPointer(writer, out);

  error:
    Py_XDECREF(rep);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    PyBytesWriter_Discard(writer);
    return NULL;
}
6090
6091
/* Encode `unicode` as UTF-32 with no explicit error handler string
   (errors == NULL) and byteorder 0, i.e. native byte order preceded by a
   BOM. */
PyObject *
PyUnicode_AsUTF32String(PyObject *unicode)
{
    const char *const default_errors = NULL;
    const int native_with_bom = 0;
    return _PyUnicode_EncodeUTF32(unicode, default_errors, native_with_bom);
}
6096
6097
/* --- UTF-16 Codec ------------------------------------------------------- */
6098
6099
PyObject *
6100
PyUnicode_DecodeUTF16(const char *s,
6101
                      Py_ssize_t size,
6102
                      const char *errors,
6103
                      int *byteorder)
6104
106
{
6105
106
    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
6106
106
}
6107
6108
PyObject *
6109
PyUnicode_DecodeUTF16Stateful(const char *s,
6110
                              Py_ssize_t size,
6111
                              const char *errors,
6112
                              int *byteorder,
6113
                              Py_ssize_t *consumed)
6114
12.8k
{
6115
12.8k
    const char *starts = s;
6116
12.8k
    Py_ssize_t startinpos;
6117
12.8k
    Py_ssize_t endinpos;
6118
12.8k
    _PyUnicodeWriter writer;
6119
12.8k
    const unsigned char *q, *e;
6120
12.8k
    int bo = 0;       /* assume native ordering by default */
6121
12.8k
    int native_ordering;
6122
12.8k
    const char *errmsg = "";
6123
12.8k
    PyObject *errorHandler = NULL;
6124
12.8k
    PyObject *exc = NULL;
6125
12.8k
    const char *encoding;
6126
6127
12.8k
    q = (const unsigned char *)s;
6128
12.8k
    e = q + size;
6129
6130
12.8k
    if (byteorder)
6131
12.7k
        bo = *byteorder;
6132
6133
    /* Check for BOM marks (U+FEFF) in the input and adjust current
6134
       byte order setting accordingly. In native mode, the leading BOM
6135
       mark is skipped, in all other modes, it is copied to the output
6136
       stream as-is (giving a ZWNBSP character). */
6137
12.8k
    if (bo == 0 && size >= 2) {
6138
12.2k
        const Py_UCS4 bom = (q[1] << 8) | q[0];
6139
12.2k
        if (bom == 0xFEFF) {
6140
318
            q += 2;
6141
318
            bo = -1;
6142
318
        }
6143
11.8k
        else if (bom == 0xFFFE) {
6144
1.82k
            q += 2;
6145
1.82k
            bo = 1;
6146
1.82k
        }
6147
12.2k
        if (byteorder)
6148
12.0k
            *byteorder = bo;
6149
12.2k
    }
6150
6151
12.8k
    if (q == e) {
6152
70
        if (consumed)
6153
0
            *consumed = size;
6154
70
        _Py_RETURN_UNICODE_EMPTY();
6155
70
    }
6156
6157
12.8k
#if PY_LITTLE_ENDIAN
6158
12.8k
    native_ordering = bo <= 0;
6159
12.8k
    encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
6160
#else
6161
    native_ordering = bo >= 0;
6162
    encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
6163
#endif
6164
6165
    /* Note: size will always be longer than the resulting Unicode
6166
       character count normally.  Error handler will take care of
6167
       resizing when needed. */
6168
12.8k
    _PyUnicodeWriter_Init(&writer);
6169
12.8k
    writer.min_length = (e - q + 1) / 2;
6170
12.8k
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
6171
0
        goto onError;
6172
6173
49.0k
    while (1) {
6174
49.0k
        Py_UCS4 ch = 0;
6175
49.0k
        if (e - q >= 2) {
6176
42.4k
            int kind = writer.kind;
6177
42.4k
            if (kind == PyUnicode_1BYTE_KIND) {
6178
15.4k
                if (PyUnicode_IS_ASCII(writer.buffer))
6179
12.2k
                    ch = asciilib_utf16_decode(&q, e,
6180
12.2k
                            (Py_UCS1*)writer.data, &writer.pos,
6181
12.2k
                            native_ordering);
6182
3.18k
                else
6183
3.18k
                    ch = ucs1lib_utf16_decode(&q, e,
6184
3.18k
                            (Py_UCS1*)writer.data, &writer.pos,
6185
3.18k
                            native_ordering);
6186
27.0k
            } else if (kind == PyUnicode_2BYTE_KIND) {
6187
11.0k
                ch = ucs2lib_utf16_decode(&q, e,
6188
11.0k
                        (Py_UCS2*)writer.data, &writer.pos,
6189
11.0k
                        native_ordering);
6190
15.9k
            } else {
6191
15.9k
                assert(kind == PyUnicode_4BYTE_KIND);
6192
15.9k
                ch = ucs4lib_utf16_decode(&q, e,
6193
15.9k
                        (Py_UCS4*)writer.data, &writer.pos,
6194
15.9k
                        native_ordering);
6195
15.9k
            }
6196
42.4k
        }
6197
6198
49.0k
        switch (ch)
6199
49.0k
        {
6200
12.5k
        case 0:
6201
            /* remaining byte at the end? (size should be even) */
6202
12.5k
            if (q == e || consumed)
6203
7.98k
                goto End;
6204
4.61k
            errmsg = "truncated data";
6205
4.61k
            startinpos = ((const char *)q) - starts;
6206
4.61k
            endinpos = ((const char *)e) - starts;
6207
4.61k
            break;
6208
            /* The remaining input chars are ignored if the callback
6209
               chooses to skip the input */
6210
1.57k
        case 1:
6211
1.57k
            q -= 2;
6212
1.57k
            if (consumed)
6213
0
                goto End;
6214
1.57k
            errmsg = "unexpected end of data";
6215
1.57k
            startinpos = ((const char *)q) - starts;
6216
1.57k
            endinpos = ((const char *)e) - starts;
6217
1.57k
            break;
6218
13.8k
        case 2:
6219
13.8k
            errmsg = "illegal encoding";
6220
13.8k
            startinpos = ((const char *)q) - 2 - starts;
6221
13.8k
            endinpos = startinpos + 2;
6222
13.8k
            break;
6223
5.92k
        case 3:
6224
5.92k
            errmsg = "illegal UTF-16 surrogate";
6225
5.92k
            startinpos = ((const char *)q) - 4 - starts;
6226
5.92k
            endinpos = startinpos + 2;
6227
5.92k
            break;
6228
15.1k
        default:
6229
15.1k
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
6230
0
                goto onError;
6231
15.1k
            continue;
6232
49.0k
        }
6233
6234
25.9k
        if (unicode_decode_call_errorhandler_writer(
6235
25.9k
                errors,
6236
25.9k
                &errorHandler,
6237
25.9k
                encoding, errmsg,
6238
25.9k
                &starts,
6239
25.9k
                (const char **)&e,
6240
25.9k
                &startinpos,
6241
25.9k
                &endinpos,
6242
25.9k
                &exc,
6243
25.9k
                (const char **)&q,
6244
25.9k
                &writer))
6245
4.83k
            goto onError;
6246
25.9k
    }
6247
6248
7.98k
End:
6249
7.98k
    if (consumed)
6250
0
        *consumed = (const char *)q-starts;
6251
6252
7.98k
    Py_XDECREF(errorHandler);
6253
7.98k
    Py_XDECREF(exc);
6254
7.98k
    return _PyUnicodeWriter_Finish(&writer);
6255
6256
4.83k
  onError:
6257
4.83k
    _PyUnicodeWriter_Dealloc(&writer);
6258
4.83k
    Py_XDECREF(errorHandler);
6259
4.83k
    Py_XDECREF(exc);
6260
4.83k
    return NULL;
6261
12.8k
}
6262
6263
/* Encode the str object `str` as UTF-16 and return a new bytes object, or
   NULL with an exception set.

   byteorder
       < 0  little-endian output ("utf-16-le"), no BOM
       > 0  big-endian output ("utf-16-be"), no BOM
       == 0 native byte order ("utf-16"), preceded by a U+FEFF BOM
   errors
       Name of the error handler that is called whenever the per-kind
       encoder stops before the end of the string (reported with the reason
       "surrogates not allowed").  The handler's replacement must be either
       a bytes object of even length or an ASCII-only str; any other
       replacement raises the encode exception. */
PyObject *
_PyUnicode_EncodeUTF16(PyObject *str,
                       const char *errors,
                       int byteorder)
{
    if (!PyUnicode_Check(str)) {
        PyErr_BadArgument();
        return NULL;
    }

    const int kind = PyUnicode_KIND(str);
    const void *data = PyUnicode_DATA(str);
    const Py_ssize_t len = PyUnicode_GET_LENGTH(str);

    /* One extra code unit is needed for the BOM when no order is forced. */
    const Py_ssize_t bom_units = (byteorder == 0);

    /* Code points >= 0x10000 take two 16-bit units each; only 4-byte
       strings are scanned for them. */
    Py_ssize_t npairs = 0;
    if (kind == PyUnicode_4BYTE_KIND) {
        const Py_UCS4 *chars = (const Py_UCS4 *)data;
        for (Py_ssize_t i = 0; i < len; i++) {
            if (chars[i] >= 0x10000) {
                npairs++;
            }
        }
    }

    /* The byte size is twice the unit count: refuse anything that would
       overflow Py_ssize_t before computing it. */
    if (len > PY_SSIZE_T_MAX / 2 - npairs - bom_units) {
        return PyErr_NoMemory();
    }
    const Py_ssize_t nunits = len + npairs + bom_units;

#if PY_BIG_ENDIAN
    const int native_ordering = (byteorder >= 0);
#else
    const int native_ordering = (byteorder <= 0);
#endif

    if (kind == PyUnicode_1BYTE_KIND) {
        // gh-139156: Don't use PyBytesWriter API here since it has an overhead
        // on short strings
        PyObject *result = PyBytes_FromStringAndSize(NULL, nunits * 2);
        if (result == NULL) {
            return NULL;
        }

        /* the bytes payload must be 2-byte aligned to be written as units */
        assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(result), 2));
        unsigned short *dst = (unsigned short *)PyBytes_AS_STRING(result);
        if (byteorder == 0) {
            *dst++ = 0xFEFF;
        }
        if (len > 0) {
            ucs1lib_utf16_encode((const Py_UCS1 *)data, len,
                                 &dst, native_ordering);
        }
        return result;
    }

    PyBytesWriter *writer = PyBytesWriter_Create(nunits * 2);
    if (writer == NULL) {
        return NULL;
    }

    /* the writer buffer must be 2-byte aligned to be written as units */
    assert(_Py_IS_ALIGNED(PyBytesWriter_GetData(writer), 2));
    unsigned short *out = PyBytesWriter_GetData(writer);
    if (byteorder == 0) {
        *out++ = 0xFEFF;
    }
    if (len == 0) {
        return PyBytesWriter_Finish(writer);
    }

    /* Codec name reported in encode exceptions. */
    const char *encoding = (byteorder < 0) ? "utf-16-le"
                         : (byteorder > 0) ? "utf-16-be"
                         : "utf-16";

    PyObject *handler = NULL;
    PyObject *exc = NULL;
    PyObject *rep = NULL;

    Py_ssize_t pos = 0;
    while (pos < len) {
        /* Encode as far as possible; the helper returns how many
           characters it consumed. */
        if (kind == PyUnicode_2BYTE_KIND) {
            pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        else {
            assert(kind == PyUnicode_4BYTE_KIND);
            pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        if (pos == len) {
            break;
        }

        /* The encoder stopped at `pos`: ask the error handler. */
        Py_ssize_t newpos;
        rep = unicode_encode_call_errorhandler(
                errors, &handler,
                encoding, "surrogates not allowed",
                str, &exc, pos, pos + 1, &newpos);
        if (!rep) {
            goto error;
        }

        /* Validate the replacement and work out how many 16-bit units
           it contributes. */
        const int rep_is_bytes = PyBytes_Check(rep);
        Py_ssize_t repsize, moreunits;
        int rep_ok;
        if (rep_is_bytes) {
            repsize = PyBytes_GET_SIZE(rep);
            moreunits = repsize / 2;
            rep_ok = !(repsize & 1);
        }
        else {
            assert(PyUnicode_Check(rep));
            repsize = PyUnicode_GET_LENGTH(rep);
            moreunits = repsize;
            rep_ok = PyUnicode_IS_ASCII(rep);
        }
        if (!rep_ok) {
            raise_encode_exception(&exc, encoding,
                                   str, pos, pos + 1,
                                   "surrogates not allowed");
            goto error;
        }

        /* Input characters skipped by the handler already have space
           reserved; only grow by the difference. */
        moreunits += pos - newpos;
        pos = newpos;

        /* two bytes are reserved for each surrogate */
        if (moreunits > 0) {
            out = PyBytesWriter_GrowAndUpdatePointer(writer, 2 * moreunits, out);
            if (out == NULL) {
                goto error;
            }
        }

        if (rep_is_bytes) {
            /* bytes replacement is copied verbatim */
            memcpy(out, PyBytes_AS_STRING(rep), repsize);
            out += repsize / 2;
        }
        else {
            /* ASCII str replacement is encoded like a 1-byte string */
            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
            ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
                                 &out, native_ordering);
        }

        Py_CLEAR(rep);
    }

    Py_XDECREF(handler);
    Py_XDECREF(exc);

    /* Trim to what was actually written; e.g. the 'ignore' handler on
       isolated surrogates leaves the buffer partly unused. */
    return PyBytesWriter_FinishWithPointer(writer, out);

  error:
    Py_XDECREF(rep);
    Py_XDECREF(handler);
    Py_XDECREF(exc);
    PyBytesWriter_Discard(writer);
    return NULL;
}
6428
6429
/* Encode `unicode` as UTF-16 by calling _PyUnicode_EncodeUTF16() with no
   error-handler name and byteorder 0 (native byte order, BOM prepended).
   Returns a new bytes object, or NULL with an exception set. */
PyObject *
PyUnicode_AsUTF16String(PyObject *unicode)
{
    const char *const errors = NULL;
    const int byteorder = 0;

    return _PyUnicode_EncodeUTF16(unicode, errors, byteorder);
}
6434
6435
_PyUnicode_Name_CAPI *
6436
_PyUnicode_GetNameCAPI(void)
6437
1.60k
{
6438
1.60k
    PyInterpreterState *interp = _PyInterpreterState_GET();
6439
1.60k
    _PyUnicode_Name_CAPI *ucnhash_capi;
6440
6441
1.60k
    ucnhash_capi = _Py_atomic_load_ptr(&interp->unicode.ucnhash_capi);
6442
1.60k
    if (ucnhash_capi == NULL) {
6443
1
        ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6444
1
                PyUnicodeData_CAPSULE_NAME, 1);
6445
6446
        // It's fine if we overwrite the value here. It's always the same value.
6447
1
        _Py_atomic_store_ptr(&interp->unicode.ucnhash_capi, ucnhash_capi);
6448
1
    }
6449
1.60k
    return ucnhash_capi;
6450
1.60k
}
6451
6452
/* --- Unicode Escape Codec ----------------------------------------------- */
6453
6454
PyObject *
6455
_PyUnicode_DecodeUnicodeEscapeInternal2(const char *s,
6456
                               Py_ssize_t size,
6457
                               const char *errors,
6458
                               Py_ssize_t *consumed,
6459
                               int *first_invalid_escape_char,
6460
                               const char **first_invalid_escape_ptr)
6461
30.5k
{
6462
30.5k
    const char *starts = s;
6463
30.5k
    const char *initial_starts = starts;
6464
30.5k
    _PyUnicodeWriter writer;
6465
30.5k
    const char *end;
6466
30.5k
    PyObject *errorHandler = NULL;
6467
30.5k
    PyObject *exc = NULL;
6468
30.5k
    _PyUnicode_Name_CAPI *ucnhash_capi;
6469
6470
    // so we can remember if we've seen an invalid escape char or not
6471
30.5k
    *first_invalid_escape_char = -1;
6472
30.5k
    *first_invalid_escape_ptr = NULL;
6473
6474
30.5k
    if (size == 0) {
6475
1.83k
        if (consumed) {
6476
0
            *consumed = 0;
6477
0
        }
6478
1.83k
        _Py_RETURN_UNICODE_EMPTY();
6479
1.83k
    }
6480
    /* Escaped strings will always be longer than the resulting
6481
       Unicode string, so we start with size here and then reduce the
6482
       length after conversion to the true value.
6483
       (but if the error callback returns a long replacement string
6484
       we'll have to allocate more space) */
6485
28.7k
    _PyUnicodeWriter_Init(&writer);
6486
28.7k
    writer.min_length = size;
6487
28.7k
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6488
0
        goto onError;
6489
0
    }
6490
6491
28.7k
    end = s + size;
6492
171k
    while (s < end) {
6493
143k
        unsigned char c = (unsigned char) *s++;
6494
143k
        Py_UCS4 ch;
6495
143k
        int count;
6496
143k
        const char *message;
6497
6498
143k
#define WRITE_ASCII_CHAR(ch)                                                  \
6499
143k
            do {                                                              \
6500
15.0k
                assert(ch <= 127);                                            \
6501
15.0k
                assert(writer.pos < writer.size);                             \
6502
15.0k
                PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch);  \
6503
15.0k
            } while(0)
6504
6505
143k
#define WRITE_CHAR(ch)                                                        \
6506
143k
            do {                                                              \
6507
132k
                if (ch <= writer.maxchar) {                                   \
6508
117k
                    assert(writer.pos < writer.size);                         \
6509
117k
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6510
117k
                }                                                             \
6511
132k
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6512
0
                    goto onError;                                             \
6513
0
                }                                                             \
6514
132k
            } while(0)
6515
6516
        /* Non-escape characters are interpreted as Unicode ordinals */
6517
143k
        if (c != '\\') {
6518
92.3k
            WRITE_CHAR(c);
6519
92.3k
            continue;
6520
92.3k
        }
6521
6522
50.8k
        Py_ssize_t startinpos = s - starts - 1;
6523
        /* \ - Escapes */
6524
50.8k
        if (s >= end) {
6525
0
            message = "\\ at end of string";
6526
0
            goto incomplete;
6527
0
        }
6528
50.8k
        c = (unsigned char) *s++;
6529
6530
50.8k
        assert(writer.pos < writer.size);
6531
50.8k
        switch (c) {
6532
6533
            /* \x escapes */
6534
668
        case '\n': continue;
6535
1.41k
        case '\\': WRITE_ASCII_CHAR('\\'); continue;
6536
895
        case '\'': WRITE_ASCII_CHAR('\''); continue;
6537
1.18k
        case '\"': WRITE_ASCII_CHAR('\"'); continue;
6538
1.13k
        case 'b': WRITE_ASCII_CHAR('\b'); continue;
6539
        /* FF */
6540
729
        case 'f': WRITE_ASCII_CHAR('\014'); continue;
6541
766
        case 't': WRITE_ASCII_CHAR('\t'); continue;
6542
930
        case 'n': WRITE_ASCII_CHAR('\n'); continue;
6543
1.40k
        case 'r': WRITE_ASCII_CHAR('\r'); continue;
6544
        /* VT */
6545
831
        case 'v': WRITE_ASCII_CHAR('\013'); continue;
6546
        /* BEL, not classic C */
6547
717
        case 'a': WRITE_ASCII_CHAR('\007'); continue;
6548
6549
            /* \OOO (octal) escapes */
6550
3.85k
        case '0': case '1': case '2': case '3':
6551
6.82k
        case '4': case '5': case '6': case '7':
6552
6.82k
            ch = c - '0';
6553
6.82k
            if (s < end && '0' <= *s && *s <= '7') {
6554
2.65k
                ch = (ch<<3) + *s++ - '0';
6555
2.65k
                if (s < end && '0' <= *s && *s <= '7') {
6556
1.28k
                    ch = (ch<<3) + *s++ - '0';
6557
1.28k
                }
6558
2.65k
            }
6559
6.82k
            if (ch > 0377) {
6560
1.11k
                if (*first_invalid_escape_char == -1) {
6561
807
                    *first_invalid_escape_char = ch;
6562
807
                    if (starts == initial_starts) {
6563
                        /* Back up 3 chars, since we've already incremented s. */
6564
807
                        *first_invalid_escape_ptr = s - 3;
6565
807
                    }
6566
807
                }
6567
1.11k
            }
6568
6.82k
            WRITE_CHAR(ch);
6569
6.82k
            continue;
6570
6571
            /* hex escapes */
6572
            /* \xXX */
6573
6.82k
        case 'x':
6574
5.96k
            count = 2;
6575
5.96k
            message = "truncated \\xXX escape";
6576
5.96k
            goto hexescape;
6577
6578
            /* \uXXXX */
6579
9.19k
        case 'u':
6580
9.19k
            count = 4;
6581
9.19k
            message = "truncated \\uXXXX escape";
6582
9.19k
            goto hexescape;
6583
6584
            /* \UXXXXXXXX */
6585
11.5k
        case 'U':
6586
11.5k
            count = 8;
6587
11.5k
            message = "truncated \\UXXXXXXXX escape";
6588
26.7k
        hexescape:
6589
168k
            for (ch = 0; count; ++s, --count) {
6590
141k
                if (s >= end) {
6591
6
                    goto incomplete;
6592
6
                }
6593
141k
                c = (unsigned char)*s;
6594
141k
                ch <<= 4;
6595
141k
                if (c >= '0' && c <= '9') {
6596
109k
                    ch += c - '0';
6597
109k
                }
6598
32.2k
                else if (c >= 'a' && c <= 'f') {
6599
31.9k
                    ch += c - ('a' - 10);
6600
31.9k
                }
6601
247
                else if (c >= 'A' && c <= 'F') {
6602
240
                    ch += c - ('A' - 10);
6603
240
                }
6604
7
                else {
6605
7
                    goto error;
6606
7
                }
6607
141k
            }
6608
6609
            /* when we get here, ch is a 32-bit unicode character */
6610
26.7k
            if (ch > MAX_UNICODE) {
6611
1
                message = "illegal Unicode character";
6612
1
                goto error;
6613
1
            }
6614
6615
26.7k
            WRITE_CHAR(ch);
6616
26.7k
            continue;
6617
6618
            /* \N{name} */
6619
26.7k
        case 'N':
6620
1.60k
            ucnhash_capi = _PyUnicode_GetNameCAPI();
6621
1.60k
            if (ucnhash_capi == NULL) {
6622
0
                PyErr_SetString(
6623
0
                        PyExc_UnicodeError,
6624
0
                        "\\N escapes not supported (can't load unicodedata module)"
6625
0
                );
6626
0
                goto onError;
6627
0
            }
6628
6629
1.60k
            message = "malformed \\N character escape";
6630
1.60k
            if (s >= end) {
6631
4
                goto incomplete;
6632
4
            }
6633
1.59k
            if (*s == '{') {
6634
1.59k
                const char *start = ++s;
6635
1.59k
                size_t namelen;
6636
                /* look for the closing brace */
6637
20.0k
                while (s < end && *s != '}')
6638
18.4k
                    s++;
6639
1.59k
                if (s >= end) {
6640
16
                    goto incomplete;
6641
16
                }
6642
1.57k
                namelen = s - start;
6643
1.57k
                if (namelen) {
6644
                    /* found a name.  look it up in the unicode database */
6645
1.57k
                    s++;
6646
1.57k
                    ch = 0xffffffff; /* in case 'getcode' messes up */
6647
1.57k
                    if (namelen <= INT_MAX &&
6648
1.57k
                        ucnhash_capi->getcode(start, (int)namelen,
6649
1.57k
                                              &ch, 0)) {
6650
1.52k
                        assert(ch <= MAX_UNICODE);
6651
1.52k
                        WRITE_CHAR(ch);
6652
1.52k
                        continue;
6653
1.52k
                    }
6654
58
                    message = "unknown Unicode character name";
6655
58
                }
6656
1.57k
            }
6657
62
            goto error;
6658
6659
5.04k
        default:
6660
5.04k
            if (*first_invalid_escape_char == -1) {
6661
3.74k
                *first_invalid_escape_char = c;
6662
3.74k
                if (starts == initial_starts) {
6663
                    /* Back up one char, since we've already incremented s. */
6664
3.74k
                    *first_invalid_escape_ptr = s - 1;
6665
3.74k
                }
6666
3.74k
            }
6667
5.04k
            WRITE_ASCII_CHAR('\\');
6668
5.04k
            WRITE_CHAR(c);
6669
5.04k
            continue;
6670
50.8k
        }
6671
6672
26
      incomplete:
6673
26
        if (consumed) {
6674
0
            *consumed = startinpos;
6675
0
            break;
6676
0
        }
6677
96
      error:;
6678
96
        Py_ssize_t endinpos = s-starts;
6679
96
        writer.min_length = end - s + writer.pos;
6680
96
        if (unicode_decode_call_errorhandler_writer(
6681
96
                errors, &errorHandler,
6682
96
                "unicodeescape", message,
6683
96
                &starts, &end, &startinpos, &endinpos, &exc, &s,
6684
96
                &writer)) {
6685
96
            goto onError;
6686
96
        }
6687
96
        assert(end - s <= writer.size - writer.pos);
6688
6689
0
#undef WRITE_ASCII_CHAR
6690
0
#undef WRITE_CHAR
6691
0
    }
6692
6693
28.6k
    Py_XDECREF(errorHandler);
6694
28.6k
    Py_XDECREF(exc);
6695
28.6k
    return _PyUnicodeWriter_Finish(&writer);
6696
6697
96
  onError:
6698
96
    _PyUnicodeWriter_Dealloc(&writer);
6699
96
    Py_XDECREF(errorHandler);
6700
96
    Py_XDECREF(exc);
6701
96
    return NULL;
6702
28.7k
}
6703
6704
PyObject *
6705
_PyUnicode_DecodeUnicodeEscapeStateful(const char *s,
6706
                              Py_ssize_t size,
6707
                              const char *errors,
6708
                              Py_ssize_t *consumed)
6709
0
{
6710
0
    int first_invalid_escape_char;
6711
0
    const char *first_invalid_escape_ptr;
6712
0
    PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal2(s, size, errors,
6713
0
                                                      consumed,
6714
0
                                                      &first_invalid_escape_char,
6715
0
                                                      &first_invalid_escape_ptr);
6716
0
    if (result == NULL)
6717
0
        return NULL;
6718
0
    if (first_invalid_escape_char != -1) {
6719
0
        if (first_invalid_escape_char > 0xff) {
6720
0
            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6721
0
                                 "\"\\%o\" is an invalid octal escape sequence. "
6722
0
                                 "Such sequences will not work in the future. ",
6723
0
                                 first_invalid_escape_char) < 0)
6724
0
            {
6725
0
                Py_DECREF(result);
6726
0
                return NULL;
6727
0
            }
6728
0
        }
6729
0
        else {
6730
0
            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6731
0
                                 "\"\\%c\" is an invalid escape sequence. "
6732
0
                                 "Such sequences will not work in the future. ",
6733
0
                                 first_invalid_escape_char) < 0)
6734
0
            {
6735
0
                Py_DECREF(result);
6736
0
                return NULL;
6737
0
            }
6738
0
        }
6739
0
    }
6740
0
    return result;
6741
0
}
6742
6743
PyObject *
6744
PyUnicode_DecodeUnicodeEscape(const char *s,
6745
                              Py_ssize_t size,
6746
                              const char *errors)
6747
0
{
6748
0
    return _PyUnicode_DecodeUnicodeEscapeStateful(s, size, errors, NULL);
6749
0
}
6750
6751
/* Return a Unicode-Escape string version of the Unicode object. */
6752
6753
PyObject *
6754
PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
6755
308k
{
6756
308k
    if (!PyUnicode_Check(unicode)) {
6757
0
        PyErr_BadArgument();
6758
0
        return NULL;
6759
0
    }
6760
6761
308k
    Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
6762
308k
    if (len == 0) {
6763
0
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
6764
0
    }
6765
308k
    int kind = PyUnicode_KIND(unicode);
6766
308k
    const void *data = PyUnicode_DATA(unicode);
6767
6768
    /* Initial allocation is based on the longest-possible character
6769
     * escape.
6770
     *
6771
     * For UCS1 strings it's '\xxx', 4 bytes per source character.
6772
     * For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6773
     * For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character. */
6774
308k
    Py_ssize_t expandsize = kind * 2 + 2;
6775
308k
    if (len > PY_SSIZE_T_MAX / expandsize) {
6776
0
        return PyErr_NoMemory();
6777
0
    }
6778
6779
308k
    PyBytesWriter *writer = PyBytesWriter_Create(expandsize * len);
6780
308k
    if (writer == NULL) {
6781
0
        return NULL;
6782
0
    }
6783
308k
    char *p = PyBytesWriter_GetData(writer);
6784
6785
617k
    for (Py_ssize_t i = 0; i < len; i++) {
6786
308k
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6787
6788
        /* U+0000-U+00ff range */
6789
308k
        if (ch < 0x100) {
6790
302k
            if (ch >= ' ' && ch < 127) {
6791
23.9k
                if (ch != '\\') {
6792
                    /* Copy printable US ASCII as-is */
6793
0
                    *p++ = (char) ch;
6794
0
                }
6795
                /* Escape backslashes */
6796
23.9k
                else {
6797
23.9k
                    *p++ = '\\';
6798
23.9k
                    *p++ = '\\';
6799
23.9k
                }
6800
23.9k
            }
6801
6802
            /* Map special whitespace to '\t', \n', '\r' */
6803
278k
            else if (ch == '\t') {
6804
2.74k
                *p++ = '\\';
6805
2.74k
                *p++ = 't';
6806
2.74k
            }
6807
275k
            else if (ch == '\n') {
6808
4.11k
                *p++ = '\\';
6809
4.11k
                *p++ = 'n';
6810
4.11k
            }
6811
271k
            else if (ch == '\r') {
6812
518
                *p++ = '\\';
6813
518
                *p++ = 'r';
6814
518
            }
6815
6816
            /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6817
270k
            else {
6818
270k
                *p++ = '\\';
6819
270k
                *p++ = 'x';
6820
270k
                *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6821
270k
                *p++ = Py_hexdigits[ch & 0x000F];
6822
270k
            }
6823
302k
        }
6824
        /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
6825
6.82k
        else if (ch < 0x10000) {
6826
5.62k
            *p++ = '\\';
6827
5.62k
            *p++ = 'u';
6828
5.62k
            *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6829
5.62k
            *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6830
5.62k
            *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6831
5.62k
            *p++ = Py_hexdigits[ch & 0x000F];
6832
5.62k
        }
6833
        /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6834
1.20k
        else {
6835
6836
            /* Make sure that the first two digits are zero */
6837
1.20k
            assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6838
1.20k
            *p++ = '\\';
6839
1.20k
            *p++ = 'U';
6840
1.20k
            *p++ = '0';
6841
1.20k
            *p++ = '0';
6842
1.20k
            *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6843
1.20k
            *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6844
1.20k
            *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6845
1.20k
            *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6846
1.20k
            *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6847
1.20k
            *p++ = Py_hexdigits[ch & 0x0000000F];
6848
1.20k
        }
6849
308k
    }
6850
6851
308k
    return PyBytesWriter_FinishWithPointer(writer, p);
6852
308k
}
6853
6854
/* --- Raw Unicode Escape Codec ------------------------------------------- */
6855
6856
PyObject *
6857
_PyUnicode_DecodeRawUnicodeEscapeStateful(const char *s,
6858
                                          Py_ssize_t size,
6859
                                          const char *errors,
6860
                                          Py_ssize_t *consumed)
6861
0
{
6862
0
    const char *starts = s;
6863
0
    _PyUnicodeWriter writer;
6864
0
    const char *end;
6865
0
    PyObject *errorHandler = NULL;
6866
0
    PyObject *exc = NULL;
6867
6868
0
    if (size == 0) {
6869
0
        if (consumed) {
6870
0
            *consumed = 0;
6871
0
        }
6872
0
        _Py_RETURN_UNICODE_EMPTY();
6873
0
    }
6874
6875
    /* Escaped strings will always be longer than the resulting
6876
       Unicode string, so we start with size here and then reduce the
6877
       length after conversion to the true value. (But decoding error
6878
       handler might have to resize the string) */
6879
0
    _PyUnicodeWriter_Init(&writer);
6880
0
    writer.min_length = size;
6881
0
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6882
0
        goto onError;
6883
0
    }
6884
6885
0
    end = s + size;
6886
0
    while (s < end) {
6887
0
        unsigned char c = (unsigned char) *s++;
6888
0
        Py_UCS4 ch;
6889
0
        int count;
6890
0
        const char *message;
6891
6892
0
#define WRITE_CHAR(ch)                                                        \
6893
0
            do {                                                              \
6894
0
                if (ch <= writer.maxchar) {                                   \
6895
0
                    assert(writer.pos < writer.size);                         \
6896
0
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6897
0
                }                                                             \
6898
0
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6899
0
                    goto onError;                                             \
6900
0
                }                                                             \
6901
0
            } while(0)
6902
6903
        /* Non-escape characters are interpreted as Unicode ordinals */
6904
0
        if (c != '\\' || (s >= end && !consumed)) {
6905
0
            WRITE_CHAR(c);
6906
0
            continue;
6907
0
        }
6908
6909
0
        Py_ssize_t startinpos = s - starts - 1;
6910
        /* \ - Escapes */
6911
0
        if (s >= end) {
6912
0
            assert(consumed);
6913
            // Set message to silent compiler warning.
6914
            // Actually it is never used.
6915
0
            message = "\\ at end of string";
6916
0
            goto incomplete;
6917
0
        }
6918
6919
0
        c = (unsigned char) *s++;
6920
0
        if (c == 'u') {
6921
0
            count = 4;
6922
0
            message = "truncated \\uXXXX escape";
6923
0
        }
6924
0
        else if (c == 'U') {
6925
0
            count = 8;
6926
0
            message = "truncated \\UXXXXXXXX escape";
6927
0
        }
6928
0
        else {
6929
0
            assert(writer.pos < writer.size);
6930
0
            PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6931
0
            WRITE_CHAR(c);
6932
0
            continue;
6933
0
        }
6934
6935
        /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6936
0
        for (ch = 0; count; ++s, --count) {
6937
0
            if (s >= end) {
6938
0
                goto incomplete;
6939
0
            }
6940
0
            c = (unsigned char)*s;
6941
0
            ch <<= 4;
6942
0
            if (c >= '0' && c <= '9') {
6943
0
                ch += c - '0';
6944
0
            }
6945
0
            else if (c >= 'a' && c <= 'f') {
6946
0
                ch += c - ('a' - 10);
6947
0
            }
6948
0
            else if (c >= 'A' && c <= 'F') {
6949
0
                ch += c - ('A' - 10);
6950
0
            }
6951
0
            else {
6952
0
                goto error;
6953
0
            }
6954
0
        }
6955
0
        if (ch > MAX_UNICODE) {
6956
0
            message = "\\Uxxxxxxxx out of range";
6957
0
            goto error;
6958
0
        }
6959
0
        WRITE_CHAR(ch);
6960
0
        continue;
6961
6962
0
      incomplete:
6963
0
        if (consumed) {
6964
0
            *consumed = startinpos;
6965
0
            break;
6966
0
        }
6967
0
      error:;
6968
0
        Py_ssize_t endinpos = s-starts;
6969
0
        writer.min_length = end - s + writer.pos;
6970
0
        if (unicode_decode_call_errorhandler_writer(
6971
0
                errors, &errorHandler,
6972
0
                "rawunicodeescape", message,
6973
0
                &starts, &end, &startinpos, &endinpos, &exc, &s,
6974
0
                &writer)) {
6975
0
            goto onError;
6976
0
        }
6977
0
        assert(end - s <= writer.size - writer.pos);
6978
6979
0
#undef WRITE_CHAR
6980
0
    }
6981
0
    Py_XDECREF(errorHandler);
6982
0
    Py_XDECREF(exc);
6983
0
    return _PyUnicodeWriter_Finish(&writer);
6984
6985
0
  onError:
6986
0
    _PyUnicodeWriter_Dealloc(&writer);
6987
0
    Py_XDECREF(errorHandler);
6988
0
    Py_XDECREF(exc);
6989
0
    return NULL;
6990
0
}
6991
6992
PyObject *
6993
PyUnicode_DecodeRawUnicodeEscape(const char *s,
6994
                                 Py_ssize_t size,
6995
                                 const char *errors)
6996
0
{
6997
0
    return _PyUnicode_DecodeRawUnicodeEscapeStateful(s, size, errors, NULL);
6998
0
}
6999
7000
7001
/* Encode a str object with the raw-unicode-escape codec.

   - A non-str argument reports PyErr_BadArgument() and returns NULL.
   - An empty string yields the shared empty-bytes constant.
   - A 1-byte-kind string is copied verbatim.
   - Otherwise code points below 0x100 are copied as a single byte,
     code points below 0x10000 become \uHHHH and everything else becomes
     \U00HHHHHH (lower-case hex taken from Py_hexdigits). */
PyObject *
PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    const int kind = PyUnicode_KIND(unicode);
    const void *data = PyUnicode_DATA(unicode);
    const Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);

    if (len == 0) {
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
    }
    if (kind == PyUnicode_1BYTE_KIND) {
        return PyBytes_FromStringAndSize(data, len);
    }

    /* Worst case per input character: 6 output bytes for the 2-byte kind,
       10 output bytes for the 4-byte kind. */
    const Py_ssize_t max_per_char = 2 + 2 * kind;
    if (len > PY_SSIZE_T_MAX / max_per_char) {
        return PyErr_NoMemory();
    }

    PyBytesWriter *writer = PyBytesWriter_Create(max_per_char * len);
    if (writer == NULL) {
        return NULL;
    }
    char *out = PyBytesWriter_GetData(writer);

    Py_ssize_t index = 0;
    while (index < len) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, index);
        index++;

        if (ch >= 0x10000) {
            /* U+010000-U+10ffff: '\U00' followed by six hex digits */
            assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
            *out++ = '\\';
            *out++ = 'U';
            *out++ = '0';
            *out++ = '0';
            for (int shift = 20; shift >= 0; shift -= 4) {
                *out++ = Py_hexdigits[(ch >> shift) & 0xf];
            }
        }
        else if (ch >= 0x100) {
            /* U+0100-U+ffff: '\u' followed by four hex digits */
            *out++ = '\\';
            *out++ = 'u';
            for (int shift = 12; shift >= 0; shift -= 4) {
                *out++ = Py_hexdigits[(ch >> shift) & 0xf];
            }
        }
        else {
            /* U+0000-U+00ff: emitted unchanged as one byte */
            *out++ = (char)ch;
        }
    }

    return PyBytesWriter_FinishWithPointer(writer, out);
}
7065
7066
/* --- Latin-1 Codec ------------------------------------------------------ */
7067
7068
PyObject *
7069
PyUnicode_DecodeLatin1(const char *s,
7070
                       Py_ssize_t size,
7071
                       const char *errors)
7072
3.38M
{
7073
    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
7074
3.38M
    return _PyUnicode_FromUCS1((const unsigned char*)s, size);
7075
3.38M
}
7076
7077
/* create or adjust a UnicodeEncodeError */
7078
static void
7079
make_encode_exception(PyObject **exceptionObject,
7080
                      const char *encoding,
7081
                      PyObject *unicode,
7082
                      Py_ssize_t startpos, Py_ssize_t endpos,
7083
                      const char *reason)
7084
221k
{
7085
221k
    if (*exceptionObject == NULL) {
7086
221k
        *exceptionObject = PyObject_CallFunction(
7087
221k
            PyExc_UnicodeEncodeError, "sOnns",
7088
221k
            encoding, unicode, startpos, endpos, reason);
7089
221k
    }
7090
0
    else {
7091
0
        if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
7092
0
            goto onError;
7093
0
        if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
7094
0
            goto onError;
7095
0
        if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
7096
0
            goto onError;
7097
0
        return;
7098
0
      onError:
7099
0
        Py_CLEAR(*exceptionObject);
7100
0
    }
7101
221k
}
7102
7103
/* raises a UnicodeEncodeError */
7104
static void
7105
raise_encode_exception(PyObject **exceptionObject,
7106
                       const char *encoding,
7107
                       PyObject *unicode,
7108
                       Py_ssize_t startpos, Py_ssize_t endpos,
7109
                       const char *reason)
7110
40.5k
{
7111
40.5k
    make_encode_exception(exceptionObject,
7112
40.5k
                          encoding, unicode, startpos, endpos, reason);
7113
40.5k
    if (*exceptionObject != NULL)
7114
40.5k
        PyCodec_StrictErrors(*exceptionObject);
7115
40.5k
}
7116
7117
/* error handling callback helper:
7118
   build arguments, call the callback and check the arguments,
7119
   put the result into newpos and return the replacement string, which
7120
   has to be freed by the caller */
7121
static PyObject *
7122
unicode_encode_call_errorhandler(const char *errors,
7123
                                 PyObject **errorHandler,
7124
                                 const char *encoding, const char *reason,
7125
                                 PyObject *unicode, PyObject **exceptionObject,
7126
                                 Py_ssize_t startpos, Py_ssize_t endpos,
7127
                                 Py_ssize_t *newpos)
7128
180k
{
7129
180k
    static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
7130
180k
    Py_ssize_t len;
7131
180k
    PyObject *restuple;
7132
180k
    PyObject *resunicode;
7133
7134
180k
    if (*errorHandler == NULL) {
7135
180k
        *errorHandler = PyCodec_LookupError(errors);
7136
180k
        if (*errorHandler == NULL)
7137
0
            return NULL;
7138
180k
    }
7139
7140
180k
    len = PyUnicode_GET_LENGTH(unicode);
7141
7142
180k
    make_encode_exception(exceptionObject,
7143
180k
                          encoding, unicode, startpos, endpos, reason);
7144
180k
    if (*exceptionObject == NULL)
7145
0
        return NULL;
7146
7147
180k
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
7148
180k
    if (restuple == NULL)
7149
180k
        return NULL;
7150
0
    if (!PyTuple_Check(restuple)) {
7151
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
7152
0
        Py_DECREF(restuple);
7153
0
        return NULL;
7154
0
    }
7155
0
    if (!PyArg_ParseTuple(restuple, argparse,
7156
0
                          &resunicode, newpos)) {
7157
0
        Py_DECREF(restuple);
7158
0
        return NULL;
7159
0
    }
7160
0
    if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
7161
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
7162
0
        Py_DECREF(restuple);
7163
0
        return NULL;
7164
0
    }
7165
0
    if (*newpos<0)
7166
0
        *newpos = len + *newpos;
7167
0
    if (*newpos<0 || *newpos>len) {
7168
0
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7169
0
        Py_DECREF(restuple);
7170
0
        return NULL;
7171
0
    }
7172
0
    Py_INCREF(resunicode);
7173
0
    Py_DECREF(restuple);
7174
0
    return resunicode;
7175
0
}
7176
7177
static PyObject *
7178
unicode_encode_ucs1(PyObject *unicode,
7179
                    const char *errors,
7180
                    const Py_UCS4 limit)
7181
51.0k
{
7182
    /* input state */
7183
51.0k
    Py_ssize_t pos=0, size;
7184
51.0k
    int kind;
7185
51.0k
    const void *data;
7186
51.0k
    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
7187
51.0k
    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
7188
51.0k
    PyObject *error_handler_obj = NULL;
7189
51.0k
    PyObject *exc = NULL;
7190
51.0k
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
7191
51.0k
    PyObject *rep = NULL;
7192
7193
51.0k
    size = PyUnicode_GET_LENGTH(unicode);
7194
51.0k
    kind = PyUnicode_KIND(unicode);
7195
51.0k
    data = PyUnicode_DATA(unicode);
7196
    /* allocate enough for a simple encoding without
7197
       replacements, if we need more, we'll resize */
7198
51.0k
    if (size == 0)
7199
0
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
7200
7201
    /* output object */
7202
51.0k
    PyBytesWriter *writer = PyBytesWriter_Create(size);
7203
51.0k
    if (writer == NULL) {
7204
0
        return NULL;
7205
0
    }
7206
    /* pointer into the output */
7207
51.0k
    char *str = PyBytesWriter_GetData(writer);
7208
7209
3.37M
    while (pos < size) {
7210
3.37M
        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
7211
7212
        /* can we encode this? */
7213
3.37M
        if (ch < limit) {
7214
            /* no overflow check, because we know that the space is enough */
7215
3.32M
            *str++ = (char)ch;
7216
3.32M
            ++pos;
7217
3.32M
        }
7218
51.0k
        else {
7219
51.0k
            Py_ssize_t newpos, i;
7220
            /* startpos for collecting unencodable chars */
7221
51.0k
            Py_ssize_t collstart = pos;
7222
51.0k
            Py_ssize_t collend = collstart + 1;
7223
            /* find all unecodable characters */
7224
7225
351k
            while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
7226
300k
                ++collend;
7227
7228
            /* Only overallocate the buffer if it's not the last write */
7229
51.0k
            writer->overallocate = (collend < size);
7230
7231
            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
7232
51.0k
            if (error_handler == _Py_ERROR_UNKNOWN)
7233
51.0k
                error_handler = _Py_GetErrorHandler(errors);
7234
7235
51.0k
            switch (error_handler) {
7236
40.5k
            case _Py_ERROR_STRICT:
7237
40.5k
                raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
7238
40.5k
                goto onError;
7239
7240
0
            case _Py_ERROR_REPLACE:
7241
0
                memset(str, '?', collend - collstart);
7242
0
                str += (collend - collstart);
7243
0
                _Py_FALLTHROUGH;
7244
0
            case _Py_ERROR_IGNORE:
7245
0
                pos = collend;
7246
0
                break;
7247
7248
0
            case _Py_ERROR_BACKSLASHREPLACE:
7249
                /* subtract preallocated bytes */
7250
0
                writer->size -= (collend - collstart);
7251
0
                str = backslashreplace(writer, str,
7252
0
                                       unicode, collstart, collend);
7253
0
                if (str == NULL)
7254
0
                    goto onError;
7255
0
                pos = collend;
7256
0
                break;
7257
7258
0
            case _Py_ERROR_XMLCHARREFREPLACE:
7259
                /* subtract preallocated bytes */
7260
0
                writer->size -= (collend - collstart);
7261
0
                str = xmlcharrefreplace(writer, str,
7262
0
                                        unicode, collstart, collend);
7263
0
                if (str == NULL)
7264
0
                    goto onError;
7265
0
                pos = collend;
7266
0
                break;
7267
7268
10.5k
            case _Py_ERROR_SURROGATEESCAPE:
7269
10.5k
                for (i = collstart; i < collend; ++i) {
7270
10.5k
                    ch = PyUnicode_READ(kind, data, i);
7271
10.5k
                    if (ch < 0xdc80 || 0xdcff < ch) {
7272
                        /* Not a UTF-8b surrogate */
7273
10.5k
                        break;
7274
10.5k
                    }
7275
0
                    *str++ = (char)(ch - 0xdc00);
7276
0
                    ++pos;
7277
0
                }
7278
10.5k
                if (i >= collend)
7279
0
                    break;
7280
10.5k
                collstart = pos;
7281
10.5k
                assert(collstart != collend);
7282
10.5k
                _Py_FALLTHROUGH;
7283
7284
10.5k
            default:
7285
10.5k
                rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
7286
10.5k
                                                       encoding, reason, unicode, &exc,
7287
10.5k
                                                       collstart, collend, &newpos);
7288
10.5k
                if (rep == NULL)
7289
10.5k
                    goto onError;
7290
7291
0
                if (newpos < collstart) {
7292
0
                    writer->overallocate = 1;
7293
0
                    str = PyBytesWriter_GrowAndUpdatePointer(writer,
7294
0
                                                             collstart - newpos,
7295
0
                                                             str);
7296
0
                    if (str == NULL) {
7297
0
                        goto onError;
7298
0
                    }
7299
0
                }
7300
0
                else {
7301
                    /* subtract preallocated bytes */
7302
0
                    writer->size -= newpos - collstart;
7303
                    /* Only overallocate the buffer if it's not the last write */
7304
0
                    writer->overallocate = (newpos < size);
7305
0
                }
7306
7307
0
                char *rep_str;
7308
0
                Py_ssize_t rep_len;
7309
0
                if (PyBytes_Check(rep)) {
7310
                    /* Directly copy bytes result to output. */
7311
0
                    rep_str = PyBytes_AS_STRING(rep);
7312
0
                    rep_len = PyBytes_GET_SIZE(rep);
7313
0
                }
7314
0
                else {
7315
0
                    assert(PyUnicode_Check(rep));
7316
7317
0
                    if (limit == 256 ?
7318
0
                        PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
7319
0
                        !PyUnicode_IS_ASCII(rep))
7320
0
                    {
7321
                        /* Not all characters are smaller than limit */
7322
0
                        raise_encode_exception(&exc, encoding, unicode,
7323
0
                                               collstart, collend, reason);
7324
0
                        goto onError;
7325
0
                    }
7326
0
                    assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
7327
0
                    rep_str = PyUnicode_DATA(rep);
7328
0
                    rep_len = PyUnicode_GET_LENGTH(rep);
7329
0
                }
7330
7331
0
                str = PyBytesWriter_GrowAndUpdatePointer(writer, rep_len, str);
7332
0
                if (str == NULL) {
7333
0
                    goto onError;
7334
0
                }
7335
0
                memcpy(str, rep_str, rep_len);
7336
0
                str += rep_len;
7337
7338
0
                pos = newpos;
7339
0
                Py_CLEAR(rep);
7340
51.0k
            }
7341
7342
            /* If overallocation was disabled, ensure that it was the last
7343
               write. Otherwise, we missed an optimization */
7344
51.0k
            assert(writer->overallocate || pos == size);
7345
0
        }
7346
3.37M
    }
7347
7348
0
    Py_XDECREF(error_handler_obj);
7349
0
    Py_XDECREF(exc);
7350
0
    return PyBytesWriter_FinishWithPointer(writer, str);
7351
7352
51.0k
  onError:
7353
51.0k
    Py_XDECREF(rep);
7354
51.0k
    PyBytesWriter_Discard(writer);
7355
51.0k
    Py_XDECREF(error_handler_obj);
7356
51.0k
    Py_XDECREF(exc);
7357
51.0k
    return NULL;
7358
51.0k
}
7359
7360
/* Encode `unicode` as Latin-1 using the error handler named by `errors`.
   A non-str argument reports PyErr_BadArgument() and returns NULL. */
PyObject *
_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND) {
        /* Characters outside Latin-1 are present: the generic encoder
           applies `errors` (or raises). */
        return unicode_encode_ucs1(unicode, errors, 256);
    }

    /* Fast path: a one-byte string's storage already is its Latin-1
       encoding, so build the bytes object straight from it. */
    return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
                                     PyUnicode_GET_LENGTH(unicode));
}
7376
7377
/* Latin-1 encode `unicode`, passing NULL as the error-handler name. */
PyObject*
PyUnicode_AsLatin1String(PyObject *unicode)
{
    return _PyUnicode_AsLatin1String(unicode, NULL /* errors */);
}
7382
7383
/* --- 7-bit ASCII Codec -------------------------------------------------- */
7384
7385
PyObject *
7386
PyUnicode_DecodeASCII(const char *s,
7387
                      Py_ssize_t size,
7388
                      const char *errors)
7389
560k
{
7390
560k
    const char *starts = s;
7391
560k
    const char *e = s + size;
7392
560k
    PyObject *error_handler_obj = NULL;
7393
560k
    PyObject *exc = NULL;
7394
560k
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
7395
7396
560k
    if (size == 0)
7397
0
        _Py_RETURN_UNICODE_EMPTY();
7398
7399
    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
7400
560k
    if (size == 1 && (unsigned char)s[0] < 128) {
7401
7.35k
        return get_latin1_char((unsigned char)s[0]);
7402
7.35k
    }
7403
7404
    // Shortcut for simple case
7405
552k
    PyObject *u = PyUnicode_New(size, 127);
7406
552k
    if (u == NULL) {
7407
0
        return NULL;
7408
0
    }
7409
552k
    Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
7410
552k
    if (outpos == size) {
7411
391k
        return u;
7412
391k
    }
7413
7414
161k
    _PyUnicodeWriter writer;
7415
161k
    _PyUnicodeWriter_InitWithBuffer(&writer, u);
7416
161k
    writer.pos = outpos;
7417
7418
161k
    s += outpos;
7419
161k
    int kind = writer.kind;
7420
161k
    void *data = writer.data;
7421
161k
    Py_ssize_t startinpos, endinpos;
7422
7423
18.3M
    while (s < e) {
7424
18.1M
        unsigned char c = (unsigned char)*s;
7425
18.1M
        if (c < 128) {
7426
6.68M
            PyUnicode_WRITE(kind, data, writer.pos, c);
7427
6.68M
            writer.pos++;
7428
6.68M
            ++s;
7429
6.68M
            continue;
7430
6.68M
        }
7431
7432
        /* byte outsize range 0x00..0x7f: call the error handler */
7433
7434
11.4M
        if (error_handler == _Py_ERROR_UNKNOWN)
7435
161k
            error_handler = _Py_GetErrorHandler(errors);
7436
7437
11.4M
        switch (error_handler)
7438
11.4M
        {
7439
767k
        case _Py_ERROR_REPLACE:
7440
11.4M
        case _Py_ERROR_SURROGATEESCAPE:
7441
            /* Fast-path: the error handler only writes one character,
7442
               but we may switch to UCS2 at the first write */
7443
11.4M
            if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7444
0
                goto onError;
7445
11.4M
            kind = writer.kind;
7446
11.4M
            data = writer.data;
7447
7448
11.4M
            if (error_handler == _Py_ERROR_REPLACE)
7449
767k
                PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7450
10.6M
            else
7451
10.6M
                PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7452
11.4M
            writer.pos++;
7453
11.4M
            ++s;
7454
11.4M
            break;
7455
7456
0
        case _Py_ERROR_IGNORE:
7457
0
            ++s;
7458
0
            break;
7459
7460
11.5k
        default:
7461
11.5k
            startinpos = s-starts;
7462
11.5k
            endinpos = startinpos + 1;
7463
11.5k
            if (unicode_decode_call_errorhandler_writer(
7464
11.5k
                    errors, &error_handler_obj,
7465
11.5k
                    "ascii", "ordinal not in range(128)",
7466
11.5k
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
7467
11.5k
                    &writer))
7468
11.5k
                goto onError;
7469
0
            kind = writer.kind;
7470
0
            data = writer.data;
7471
11.4M
        }
7472
11.4M
    }
7473
149k
    Py_XDECREF(error_handler_obj);
7474
149k
    Py_XDECREF(exc);
7475
149k
    return _PyUnicodeWriter_Finish(&writer);
7476
7477
11.5k
  onError:
7478
11.5k
    _PyUnicodeWriter_Dealloc(&writer);
7479
11.5k
    Py_XDECREF(error_handler_obj);
7480
11.5k
    Py_XDECREF(exc);
7481
11.5k
    return NULL;
7482
161k
}
7483
7484
/* Encode `unicode` as ASCII using the error handler named by `errors`.
   A non-str argument reports PyErr_BadArgument() and returns NULL. */
PyObject *
_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (!PyUnicode_IS_ASCII(unicode)) {
        /* Non-ASCII characters are present: the generic encoder applies
           `errors` (or raises). */
        return unicode_encode_ucs1(unicode, errors, 128);
    }

    /* Fast path: an ASCII-only string's storage already is its encoding. */
    return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
                                     PyUnicode_GET_LENGTH(unicode));
}
7498
7499
/* ASCII encode `unicode`, passing NULL as the error-handler name. */
PyObject *
PyUnicode_AsASCIIString(PyObject *unicode)
{
    return _PyUnicode_AsASCIIString(unicode, NULL /* errors */);
}
7504
7505
#ifdef MS_WINDOWS
7506
7507
/* --- MBCS codecs for Windows -------------------------------------------- */
7508
7509
/* NEED_RETRY is defined when int is narrower than size_t; the code page
   decoder tests it to decide whether input larger than
   DECODING_CHUNK_SIZE has to be processed in several chunks. */
#if (SIZEOF_INT < SIZEOF_SIZE_T)
#  define NEED_RETRY
#endif

/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases and also avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows */
#define DECODING_CHUNK_SIZE (INT_MAX / 4)

/* Fallback definition for headers that do not provide this flag. */
#if !defined(WC_ERR_INVALID_CHARS)
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
7522
7523
static const char*
7524
code_page_name(UINT code_page, PyObject **obj)
7525
{
7526
    *obj = NULL;
7527
    if (code_page == CP_ACP)
7528
        return "mbcs";
7529
7530
    *obj = PyBytes_FromFormat("cp%u", code_page);
7531
    if (*obj == NULL)
7532
        return NULL;
7533
    return PyBytes_AS_STRING(*obj);
7534
}
7535
7536
static DWORD
7537
decode_code_page_flags(UINT code_page)
7538
{
7539
    if (code_page == CP_UTF7) {
7540
        /* The CP_UTF7 decoder only supports flags=0 */
7541
        return 0;
7542
    }
7543
    else
7544
        return MB_ERR_INVALID_CHARS;
7545
}
7546
7547
/*
7548
 * Decode a byte string from a Windows code page into unicode object in strict
7549
 * mode.
7550
 *
7551
 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7552
 * OSError and returns -1 on other error.
7553
 */
7554
static int
7555
decode_code_page_strict(UINT code_page,
7556
                        wchar_t **buf,
7557
                        Py_ssize_t *bufsize,
7558
                        const char *in,
7559
                        int insize)
7560
{
7561
    DWORD flags = MB_ERR_INVALID_CHARS;
7562
    wchar_t *out;
7563
    DWORD outsize;
7564
7565
    /* First get the size of the result */
7566
    assert(insize > 0);
7567
    while ((outsize = MultiByteToWideChar(code_page, flags,
7568
                                          in, insize, NULL, 0)) <= 0)
7569
    {
7570
        if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7571
            goto error;
7572
        }
7573
        /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7574
        flags = 0;
7575
    }
7576
7577
    /* Extend a wchar_t* buffer */
7578
    Py_ssize_t n = *bufsize;   /* Get the current length */
7579
    if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7580
        return -1;
7581
    }
7582
    out = *buf + n;
7583
7584
    /* Do the conversion */
7585
    outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7586
    if (outsize <= 0)
7587
        goto error;
7588
    return insize;
7589
7590
error:
7591
    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7592
        return -2;
7593
    PyErr_SetFromWindowsErr(0);
7594
    return -1;
7595
}
7596
7597
/*
7598
 * Decode a byte string from a code page into unicode object with an error
7599
 * handler.
7600
 *
7601
 * Returns consumed size if succeed, or raise an OSError or
7602
 * UnicodeDecodeError exception and returns -1 on error.
7603
 */
7604
static int
7605
decode_code_page_errors(UINT code_page,
7606
                        wchar_t **buf,
7607
                        Py_ssize_t *bufsize,
7608
                        const char *in, const int size,
7609
                        const char *errors, int final)
7610
{
7611
    const char *startin = in;
7612
    const char *endin = in + size;
7613
    DWORD flags = MB_ERR_INVALID_CHARS;
7614
    /* Ideally, we should get reason from FormatMessage. This is the Windows
7615
       2000 English version of the message. */
7616
    const char *reason = "No mapping for the Unicode character exists "
7617
                         "in the target code page.";
7618
    /* each step cannot decode more than 1 character, but a character can be
7619
       represented as a surrogate pair */
7620
    wchar_t buffer[2], *out;
7621
    int insize;
7622
    Py_ssize_t outsize;
7623
    PyObject *errorHandler = NULL;
7624
    PyObject *exc = NULL;
7625
    PyObject *encoding_obj = NULL;
7626
    const char *encoding;
7627
    DWORD err;
7628
    int ret = -1;
7629
7630
    assert(size > 0);
7631
7632
    encoding = code_page_name(code_page, &encoding_obj);
7633
    if (encoding == NULL)
7634
        return -1;
7635
7636
    if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
7637
        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7638
           UnicodeDecodeError. */
7639
        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7640
        if (exc != NULL) {
7641
            PyCodec_StrictErrors(exc);
7642
            Py_CLEAR(exc);
7643
        }
7644
        goto error;
7645
    }
7646
7647
    /* Extend a wchar_t* buffer */
7648
    Py_ssize_t n = *bufsize;   /* Get the current length */
7649
    if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7650
        PyErr_NoMemory();
7651
        goto error;
7652
    }
7653
    if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7654
        goto error;
7655
    }
7656
    out = *buf + n;
7657
7658
    /* Decode the byte string character per character */
7659
    while (in < endin)
7660
    {
7661
        /* Decode a character */
7662
        insize = 1;
7663
        do
7664
        {
7665
            outsize = MultiByteToWideChar(code_page, flags,
7666
                                          in, insize,
7667
                                          buffer, Py_ARRAY_LENGTH(buffer));
7668
            if (outsize > 0)
7669
                break;
7670
            err = GetLastError();
7671
            if (err == ERROR_INVALID_FLAGS && flags) {
7672
                /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7673
                flags = 0;
7674
                continue;
7675
            }
7676
            if (err != ERROR_NO_UNICODE_TRANSLATION
7677
                && err != ERROR_INSUFFICIENT_BUFFER)
7678
            {
7679
                PyErr_SetFromWindowsErr(err);
7680
                goto error;
7681
            }
7682
            insize++;
7683
        }
7684
        /* 4=maximum length of a UTF-8 sequence */
7685
        while (insize <= 4 && (in + insize) <= endin);
7686
7687
        if (outsize <= 0) {
7688
            Py_ssize_t startinpos, endinpos, outpos;
7689
7690
            /* last character in partial decode? */
7691
            if (in + insize >= endin && !final)
7692
                break;
7693
7694
            startinpos = in - startin;
7695
            endinpos = startinpos + 1;
7696
            outpos = out - *buf;
7697
            if (unicode_decode_call_errorhandler_wchar(
7698
                    errors, &errorHandler,
7699
                    encoding, reason,
7700
                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
7701
                    buf, bufsize, &outpos))
7702
            {
7703
                goto error;
7704
            }
7705
            out = *buf + outpos;
7706
        }
7707
        else {
7708
            in += insize;
7709
            memcpy(out, buffer, outsize * sizeof(wchar_t));
7710
            out += outsize;
7711
        }
7712
    }
7713
7714
    /* Shrink the buffer */
7715
    assert(out - *buf <= *bufsize);
7716
    *bufsize = out - *buf;
7717
    /* (in - startin) <= size and size is an int */
7718
    ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
7719
7720
error:
7721
    Py_XDECREF(encoding_obj);
7722
    Py_XDECREF(errorHandler);
7723
    Py_XDECREF(exc);
7724
    return ret;
7725
}
7726
7727
static PyObject *
7728
decode_code_page_stateful(int code_page,
7729
                          const char *s, Py_ssize_t size,
7730
                          const char *errors, Py_ssize_t *consumed)
7731
{
7732
    wchar_t *buf = NULL;
7733
    Py_ssize_t bufsize = 0;
7734
    int chunk_size, final, converted, done;
7735
7736
    if (code_page < 0) {
7737
        PyErr_SetString(PyExc_ValueError, "invalid code page number");
7738
        return NULL;
7739
    }
7740
    if (size < 0) {
7741
        PyErr_BadInternalCall();
7742
        return NULL;
7743
    }
7744
7745
    if (consumed)
7746
        *consumed = 0;
7747
7748
    do
7749
    {
7750
#ifdef NEED_RETRY
7751
        if (size > DECODING_CHUNK_SIZE) {
7752
            chunk_size = DECODING_CHUNK_SIZE;
7753
            final = 0;
7754
            done = 0;
7755
        }
7756
        else
7757
#endif
7758
        {
7759
            chunk_size = (int)size;
7760
            final = (consumed == NULL);
7761
            done = 1;
7762
        }
7763
7764
        if (chunk_size == 0 && done) {
7765
            if (buf != NULL)
7766
                break;
7767
            _Py_RETURN_UNICODE_EMPTY();
7768
        }
7769
7770
        converted = decode_code_page_strict(code_page, &buf, &bufsize,
7771
                                            s, chunk_size);
7772
        if (converted == -2)
7773
            converted = decode_code_page_errors(code_page, &buf, &bufsize,
7774
                                                s, chunk_size,
7775
                                                errors, final);
7776
        assert(converted != 0 || done);
7777
7778
        if (converted < 0) {
7779
            PyMem_Free(buf);
7780
            return NULL;
7781
        }
7782
7783
        if (consumed)
7784
            *consumed += converted;
7785
7786
        s += converted;
7787
        size -= converted;
7788
    } while (!done);
7789
7790
    PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7791
    PyMem_Free(buf);
7792
    return v;
7793
}
7794
7795
PyObject *
7796
PyUnicode_DecodeCodePageStateful(int code_page,
7797
                                 const char *s,
7798
                                 Py_ssize_t size,
7799
                                 const char *errors,
7800
                                 Py_ssize_t *consumed)
7801
{
7802
    return decode_code_page_stateful(code_page, s, size, errors, consumed);
7803
}
7804
7805
PyObject *
7806
PyUnicode_DecodeMBCSStateful(const char *s,
7807
                             Py_ssize_t size,
7808
                             const char *errors,
7809
                             Py_ssize_t *consumed)
7810
{
7811
    return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7812
}
7813
7814
PyObject *
7815
PyUnicode_DecodeMBCS(const char *s,
7816
                     Py_ssize_t size,
7817
                     const char *errors)
7818
{
7819
    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7820
}
7821
7822
static DWORD
7823
encode_code_page_flags(UINT code_page, const char *errors)
7824
{
7825
    if (code_page == CP_UTF8) {
7826
        return WC_ERR_INVALID_CHARS;
7827
    }
7828
    else if (code_page == CP_UTF7) {
7829
        /* CP_UTF7 only supports flags=0 */
7830
        return 0;
7831
    }
7832
    else {
7833
        if (errors != NULL && strcmp(errors, "replace") == 0)
7834
            return 0;
7835
        else
7836
            return WC_NO_BEST_FIT_CHARS;
7837
    }
7838
}
7839
7840
/*
7841
 * Encode a Unicode string to a Windows code page into a byte string in strict
7842
 * mode.
7843
 *
7844
 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7845
 * an OSError and returns -1 on other error.
7846
 */
7847
static int
7848
encode_code_page_strict(UINT code_page, PyBytesWriter **writer,
7849
                        PyObject *unicode, Py_ssize_t offset, int len,
7850
                        const char* errors)
7851
{
7852
    BOOL usedDefaultChar = FALSE;
7853
    BOOL *pusedDefaultChar = &usedDefaultChar;
7854
    int outsize;
7855
    wchar_t *p;
7856
    Py_ssize_t size;
7857
    const DWORD flags = encode_code_page_flags(code_page, NULL);
7858
    char *out;
7859
    /* Create a substring so that we can get the UTF-16 representation
7860
       of just the slice under consideration. */
7861
    PyObject *substring;
7862
    int ret = -1;
7863
7864
    assert(len > 0);
7865
7866
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
7867
        pusedDefaultChar = &usedDefaultChar;
7868
    else
7869
        pusedDefaultChar = NULL;
7870
7871
    substring = PyUnicode_Substring(unicode, offset, offset+len);
7872
    if (substring == NULL)
7873
        return -1;
7874
    p = PyUnicode_AsWideCharString(substring, &size);
7875
    Py_CLEAR(substring);
7876
    if (p == NULL) {
7877
        return -1;
7878
    }
7879
    assert(size <= INT_MAX);
7880
7881
    /* First get the size of the result */
7882
    outsize = WideCharToMultiByte(code_page, flags,
7883
                                  p, (int)size,
7884
                                  NULL, 0,
7885
                                  NULL, pusedDefaultChar);
7886
    if (outsize <= 0)
7887
        goto error;
7888
    /* If we used a default char, then we failed! */
7889
    if (pusedDefaultChar && *pusedDefaultChar) {
7890
        ret = -2;
7891
        goto done;
7892
    }
7893
7894
    if (*writer == NULL) {
7895
        /* Create string object */
7896
        *writer = PyBytesWriter_Create(outsize);
7897
        if (*writer == NULL) {
7898
            goto done;
7899
        }
7900
        out = PyBytesWriter_GetData(*writer);
7901
    }
7902
    else {
7903
        /* Extend string object */
7904
        Py_ssize_t n = PyBytesWriter_GetSize(*writer);
7905
        if (PyBytesWriter_Grow(*writer, outsize) < 0) {
7906
            goto done;
7907
        }
7908
        out = (char*)PyBytesWriter_GetData(*writer) + n;
7909
    }
7910
7911
    /* Do the conversion */
7912
    outsize = WideCharToMultiByte(code_page, flags,
7913
                                  p, (int)size,
7914
                                  out, outsize,
7915
                                  NULL, pusedDefaultChar);
7916
    if (outsize <= 0)
7917
        goto error;
7918
    if (pusedDefaultChar && *pusedDefaultChar) {
7919
        ret = -2;
7920
        goto done;
7921
    }
7922
    ret = 0;
7923
7924
done:
7925
    PyMem_Free(p);
7926
    return ret;
7927
7928
error:
7929
    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
7930
        ret = -2;
7931
        goto done;
7932
    }
7933
    PyErr_SetFromWindowsErr(0);
7934
    goto done;
7935
}
7936
7937
/*
7938
 * Encode a Unicode string to a Windows code page into a byte string using an
7939
 * error handler.
7940
 *
7941
 * Returns consumed characters if succeed, or raise an OSError and returns
7942
 * -1 on other error.
7943
 */
7944
static int
7945
encode_code_page_errors(UINT code_page, PyBytesWriter **writer,
7946
                        PyObject *unicode, Py_ssize_t unicode_offset,
7947
                        Py_ssize_t insize, const char* errors)
7948
{
7949
    const DWORD flags = encode_code_page_flags(code_page, errors);
7950
    Py_ssize_t pos = unicode_offset;
7951
    Py_ssize_t endin = unicode_offset + insize;
7952
    /* Ideally, we should get reason from FormatMessage. This is the Windows
7953
       2000 English version of the message. */
7954
    const char *reason = "invalid character";
7955
    /* 4=maximum length of a UTF-8 sequence */
7956
    char buffer[4];
7957
    BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7958
    Py_ssize_t outsize;
7959
    char *out;
7960
    PyObject *errorHandler = NULL;
7961
    PyObject *exc = NULL;
7962
    PyObject *encoding_obj = NULL;
7963
    const char *encoding;
7964
    Py_ssize_t newpos;
7965
    PyObject *rep;
7966
    int ret = -1;
7967
7968
    assert(insize > 0);
7969
7970
    encoding = code_page_name(code_page, &encoding_obj);
7971
    if (encoding == NULL)
7972
        return -1;
7973
7974
    if (errors == NULL || strcmp(errors, "strict") == 0) {
7975
        /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7976
           then we raise a UnicodeEncodeError. */
7977
        make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
7978
        if (exc != NULL) {
7979
            PyCodec_StrictErrors(exc);
7980
            Py_DECREF(exc);
7981
        }
7982
        Py_XDECREF(encoding_obj);
7983
        return -1;
7984
    }
7985
7986
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
7987
        pusedDefaultChar = &usedDefaultChar;
7988
    else
7989
        pusedDefaultChar = NULL;
7990
7991
    if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7992
        PyErr_NoMemory();
7993
        goto error;
7994
    }
7995
    outsize = insize * Py_ARRAY_LENGTH(buffer);
7996
7997
    if (*writer == NULL) {
7998
        /* Create string object */
7999
        *writer = PyBytesWriter_Create(outsize);
8000
        if (*writer == NULL) {
8001
            goto error;
8002
        }
8003
        out = PyBytesWriter_GetData(*writer);
8004
    }
8005
    else {
8006
        /* Extend string object */
8007
        Py_ssize_t n = PyBytesWriter_GetSize(*writer);
8008
        if (PyBytesWriter_Grow(*writer, outsize) < 0) {
8009
            goto error;
8010
        }
8011
        out = (char*)PyBytesWriter_GetData(*writer) + n;
8012
    }
8013
8014
    /* Encode the string character per character */
8015
    while (pos < endin)
8016
    {
8017
        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
8018
        wchar_t chars[2];
8019
        int charsize;
8020
        if (ch < 0x10000) {
8021
            chars[0] = (wchar_t)ch;
8022
            charsize = 1;
8023
        }
8024
        else {
8025
            chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
8026
            chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
8027
            charsize = 2;
8028
        }
8029
8030
        outsize = WideCharToMultiByte(code_page, flags,
8031
                                      chars, charsize,
8032
                                      buffer, Py_ARRAY_LENGTH(buffer),
8033
                                      NULL, pusedDefaultChar);
8034
        if (outsize > 0) {
8035
            if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
8036
            {
8037
                pos++;
8038
                memcpy(out, buffer, outsize);
8039
                out += outsize;
8040
                continue;
8041
            }
8042
        }
8043
        else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
8044
            PyErr_SetFromWindowsErr(0);
8045
            goto error;
8046
        }
8047
8048
        rep = unicode_encode_call_errorhandler(
8049
                  errors, &errorHandler, encoding, reason,
8050
                  unicode, &exc,
8051
                  pos, pos + 1, &newpos);
8052
        if (rep == NULL)
8053
            goto error;
8054
8055
        Py_ssize_t morebytes = pos - newpos;
8056
        if (PyBytes_Check(rep)) {
8057
            outsize = PyBytes_GET_SIZE(rep);
8058
            morebytes += outsize;
8059
            if (morebytes > 0) {
8060
                out = PyBytesWriter_GrowAndUpdatePointer(*writer, morebytes, out);
8061
                if (out == NULL) {
8062
                    Py_DECREF(rep);
8063
                    goto error;
8064
                }
8065
            }
8066
            memcpy(out, PyBytes_AS_STRING(rep), outsize);
8067
            out += outsize;
8068
        }
8069
        else {
8070
            Py_ssize_t i;
8071
            int kind;
8072
            const void *data;
8073
8074
            outsize = PyUnicode_GET_LENGTH(rep);
8075
            morebytes += outsize;
8076
            if (morebytes > 0) {
8077
                out = PyBytesWriter_GrowAndUpdatePointer(*writer, morebytes, out);
8078
                if (out == NULL) {
8079
                    Py_DECREF(rep);
8080
                    goto error;
8081
                }
8082
            }
8083
            kind = PyUnicode_KIND(rep);
8084
            data = PyUnicode_DATA(rep);
8085
            for (i=0; i < outsize; i++) {
8086
                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8087
                if (ch > 127) {
8088
                    raise_encode_exception(&exc,
8089
                        encoding, unicode,
8090
                        pos, pos + 1,
8091
                        "unable to encode error handler result to ASCII");
8092
                    Py_DECREF(rep);
8093
                    goto error;
8094
                }
8095
                *out = (unsigned char)ch;
8096
                out++;
8097
            }
8098
        }
8099
        pos = newpos;
8100
        Py_DECREF(rep);
8101
    }
8102
    /* write a NUL byte */
8103
    *out = 0;
8104
    outsize = out - (char*)PyBytesWriter_GetData(*writer);
8105
    assert(outsize <= PyBytesWriter_GetSize(*writer));
8106
    if (PyBytesWriter_Resize(*writer, outsize) < 0) {
8107
        goto error;
8108
    }
8109
    ret = 0;
8110
8111
error:
8112
    Py_XDECREF(encoding_obj);
8113
    Py_XDECREF(errorHandler);
8114
    Py_XDECREF(exc);
8115
    return ret;
8116
}
8117
8118
8119
/* Encode a str object to the given Windows code page.

   The string is processed in chunks: each chunk is first tried with
   encode_code_page_strict() and, when that reports an encode error (-2),
   redone with encode_code_page_errors().  Returns the result of
   PyBytesWriter_Finish() on success, or NULL on error. */
PyObject *
PyUnicode_EncodeCodePage(int code_page,
                         PyObject *unicode,
                         const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    Py_ssize_t remaining = PyUnicode_GET_LENGTH(unicode);

    if (code_page < 0) {
        PyErr_SetString(PyExc_ValueError, "invalid code page number");
        return NULL;
    }

    if (remaining == 0) {
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
    }

    PyBytesWriter *writer = NULL;
    Py_ssize_t offset = 0;
    for (;;) {
        int chunk_len;
        int last_chunk;

#ifdef NEED_RETRY
        if (remaining > DECODING_CHUNK_SIZE) {
            chunk_len = DECODING_CHUNK_SIZE;
            last_chunk = 0;
        }
        else
#endif
        {
            chunk_len = (int)remaining;
            last_chunk = 1;
        }

        int status = encode_code_page_strict(code_page, &writer,
                                             unicode, offset, chunk_len,
                                             errors);
        if (status == -2) {
            /* Strict mode hit an unencodable character: use the handler. */
            status = encode_code_page_errors(code_page, &writer,
                                             unicode, offset,
                                             chunk_len, errors);
        }
        if (status < 0) {
            PyBytesWriter_Discard(writer);
            return NULL;
        }

        offset += chunk_len;
        remaining -= chunk_len;
        if (last_chunk) {
            break;
        }
    }

    return PyBytesWriter_Finish(writer);
}
8177
8178
8179
/* Encode a str object with the CP_ACP code page and no error handler name
   (errors is passed as NULL to PyUnicode_EncodeCodePage()). */
PyObject *
PyUnicode_AsMBCSString(PyObject *unicode)
{
    const char *errors = NULL;
    return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
}
8184
8185
#undef NEED_RETRY
8186
8187
#endif /* MS_WINDOWS */
8188
8189
/* --- Character Mapping Codec -------------------------------------------- */
8190
8191
static int
8192
charmap_decode_string(const char *s,
8193
                      Py_ssize_t size,
8194
                      PyObject *mapping,
8195
                      const char *errors,
8196
                      _PyUnicodeWriter *writer)
8197
20.9k
{
8198
20.9k
    const char *starts = s;
8199
20.9k
    const char *e;
8200
20.9k
    Py_ssize_t startinpos, endinpos;
8201
20.9k
    PyObject *errorHandler = NULL, *exc = NULL;
8202
20.9k
    Py_ssize_t maplen;
8203
20.9k
    int mapkind;
8204
20.9k
    const void *mapdata;
8205
20.9k
    Py_UCS4 x;
8206
20.9k
    unsigned char ch;
8207
8208
20.9k
    maplen = PyUnicode_GET_LENGTH(mapping);
8209
20.9k
    mapdata = PyUnicode_DATA(mapping);
8210
20.9k
    mapkind = PyUnicode_KIND(mapping);
8211
8212
20.9k
    e = s + size;
8213
8214
20.9k
    if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
8215
        /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
8216
         * is disabled in encoding aliases, latin1 is preferred because
8217
         * its implementation is faster. */
8218
133
        const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
8219
133
        Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8220
133
        Py_UCS4 maxchar = writer->maxchar;
8221
8222
133
        assert (writer->kind == PyUnicode_1BYTE_KIND);
8223
2.21k
        while (s < e) {
8224
2.08k
            ch = *s;
8225
2.08k
            x = mapdata_ucs1[ch];
8226
2.08k
            if (x > maxchar) {
8227
122
                if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
8228
0
                    goto onError;
8229
122
                maxchar = writer->maxchar;
8230
122
                outdata = (Py_UCS1 *)writer->data;
8231
122
            }
8232
2.08k
            outdata[writer->pos] = x;
8233
2.08k
            writer->pos++;
8234
2.08k
            ++s;
8235
2.08k
        }
8236
133
        return 0;
8237
133
    }
8238
8239
87.7k
    while (s < e) {
8240
76.1k
        if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
8241
76.1k
            int outkind = writer->kind;
8242
76.1k
            const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
8243
76.1k
            if (outkind == PyUnicode_1BYTE_KIND) {
8244
40.3k
                Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8245
40.3k
                Py_UCS4 maxchar = writer->maxchar;
8246
158k
                while (s < e) {
8247
156k
                    ch = *s;
8248
156k
                    x = mapdata_ucs2[ch];
8249
156k
                    if (x > maxchar)
8250
38.5k
                        goto Error;
8251
117k
                    outdata[writer->pos] = x;
8252
117k
                    writer->pos++;
8253
117k
                    ++s;
8254
117k
                }
8255
1.87k
                break;
8256
40.3k
            }
8257
35.7k
            else if (outkind == PyUnicode_2BYTE_KIND) {
8258
35.7k
                Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
8259
1.21M
                while (s < e) {
8260
1.20M
                    ch = *s;
8261
1.20M
                    x = mapdata_ucs2[ch];
8262
1.20M
                    if (x == 0xFFFE)
8263
28.4k
                        goto Error;
8264
1.17M
                    outdata[writer->pos] = x;
8265
1.17M
                    writer->pos++;
8266
1.17M
                    ++s;
8267
1.17M
                }
8268
7.33k
                break;
8269
35.7k
            }
8270
76.1k
        }
8271
0
        ch = *s;
8272
8273
0
        if (ch < maplen)
8274
0
            x = PyUnicode_READ(mapkind, mapdata, ch);
8275
0
        else
8276
0
            x = 0xfffe; /* invalid value */
8277
66.9k
Error:
8278
66.9k
        if (x == 0xfffe)
8279
45.7k
        {
8280
            /* undefined mapping */
8281
45.7k
            startinpos = s-starts;
8282
45.7k
            endinpos = startinpos+1;
8283
45.7k
            if (unicode_decode_call_errorhandler_writer(
8284
45.7k
                    errors, &errorHandler,
8285
45.7k
                    "charmap", "character maps to <undefined>",
8286
45.7k
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
8287
45.7k
                    writer)) {
8288
18
                goto onError;
8289
18
            }
8290
45.7k
            continue;
8291
45.7k
        }
8292
8293
21.2k
        if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8294
0
            goto onError;
8295
21.2k
        ++s;
8296
21.2k
    }
8297
20.7k
    Py_XDECREF(errorHandler);
8298
20.7k
    Py_XDECREF(exc);
8299
20.7k
    return 0;
8300
8301
18
onError:
8302
18
    Py_XDECREF(errorHandler);
8303
18
    Py_XDECREF(exc);
8304
18
    return -1;
8305
20.8k
}
8306
8307
static int
8308
charmap_decode_mapping(const char *s,
8309
                       Py_ssize_t size,
8310
                       PyObject *mapping,
8311
                       const char *errors,
8312
                       _PyUnicodeWriter *writer)
8313
0
{
8314
0
    const char *starts = s;
8315
0
    const char *e;
8316
0
    Py_ssize_t startinpos, endinpos;
8317
0
    PyObject *errorHandler = NULL, *exc = NULL;
8318
0
    unsigned char ch;
8319
0
    PyObject *key, *item = NULL;
8320
8321
0
    e = s + size;
8322
8323
0
    while (s < e) {
8324
0
        ch = *s;
8325
8326
        /* Get mapping (char ordinal -> integer, Unicode char or None) */
8327
0
        key = PyLong_FromLong((long)ch);
8328
0
        if (key == NULL)
8329
0
            goto onError;
8330
8331
0
        int rc = PyMapping_GetOptionalItem(mapping, key, &item);
8332
0
        Py_DECREF(key);
8333
0
        if (rc == 0) {
8334
            /* No mapping found means: mapping is undefined. */
8335
0
            goto Undefined;
8336
0
        }
8337
0
        if (item == NULL) {
8338
0
            if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8339
                /* No mapping found means: mapping is undefined. */
8340
0
                PyErr_Clear();
8341
0
                goto Undefined;
8342
0
            } else
8343
0
                goto onError;
8344
0
        }
8345
8346
        /* Apply mapping */
8347
0
        if (item == Py_None)
8348
0
            goto Undefined;
8349
0
        if (PyLong_Check(item)) {
8350
0
            long value = PyLong_AsLong(item);
8351
0
            if (value == 0xFFFE)
8352
0
                goto Undefined;
8353
0
            if (value < 0 || value > MAX_UNICODE) {
8354
0
                PyErr_Format(PyExc_TypeError,
8355
0
                             "character mapping must be in range(0x%x)",
8356
0
                             (unsigned long)MAX_UNICODE + 1);
8357
0
                goto onError;
8358
0
            }
8359
8360
0
            if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8361
0
                goto onError;
8362
0
        }
8363
0
        else if (PyUnicode_Check(item)) {
8364
0
            if (PyUnicode_GET_LENGTH(item) == 1) {
8365
0
                Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8366
0
                if (value == 0xFFFE)
8367
0
                    goto Undefined;
8368
0
                if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8369
0
                    goto onError;
8370
0
            }
8371
0
            else {
8372
0
                writer->overallocate = 1;
8373
0
                if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8374
0
                    goto onError;
8375
0
            }
8376
0
        }
8377
0
        else {
8378
            /* wrong return value */
8379
0
            PyErr_SetString(PyExc_TypeError,
8380
0
                            "character mapping must return integer, None or str");
8381
0
            goto onError;
8382
0
        }
8383
0
        Py_CLEAR(item);
8384
0
        ++s;
8385
0
        continue;
8386
8387
0
Undefined:
8388
        /* undefined mapping */
8389
0
        Py_CLEAR(item);
8390
0
        startinpos = s-starts;
8391
0
        endinpos = startinpos+1;
8392
0
        if (unicode_decode_call_errorhandler_writer(
8393
0
                errors, &errorHandler,
8394
0
                "charmap", "character maps to <undefined>",
8395
0
                &starts, &e, &startinpos, &endinpos, &exc, &s,
8396
0
                writer)) {
8397
0
            goto onError;
8398
0
        }
8399
0
    }
8400
0
    Py_XDECREF(errorHandler);
8401
0
    Py_XDECREF(exc);
8402
0
    return 0;
8403
8404
0
onError:
8405
0
    Py_XDECREF(item);
8406
0
    Py_XDECREF(errorHandler);
8407
0
    Py_XDECREF(exc);
8408
0
    return -1;
8409
0
}
8410
8411
PyObject *
8412
PyUnicode_DecodeCharmap(const char *s,
8413
                        Py_ssize_t size,
8414
                        PyObject *mapping,
8415
                        const char *errors)
8416
20.9k
{
8417
20.9k
    _PyUnicodeWriter writer;
8418
8419
    /* Default to Latin-1 */
8420
20.9k
    if (mapping == NULL)
8421
0
        return PyUnicode_DecodeLatin1(s, size, errors);
8422
8423
20.9k
    if (size == 0)
8424
0
        _Py_RETURN_UNICODE_EMPTY();
8425
20.9k
    _PyUnicodeWriter_Init(&writer);
8426
20.9k
    writer.min_length = size;
8427
20.9k
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
8428
0
        goto onError;
8429
8430
20.9k
    if (PyUnicode_CheckExact(mapping)) {
8431
20.9k
        if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8432
18
            goto onError;
8433
20.9k
    }
8434
0
    else {
8435
0
        if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8436
0
            goto onError;
8437
0
    }
8438
20.9k
    return _PyUnicodeWriter_Finish(&writer);
8439
8440
18
  onError:
8441
18
    _PyUnicodeWriter_Dealloc(&writer);
8442
18
    return NULL;
8443
20.9k
}
8444
8445
/* Charmap encoding: the lookup table */
8446
8447
/*[clinic input]
8448
class EncodingMap "struct encoding_map *" "&EncodingMapType"
8449
[clinic start generated code]*/
8450
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=14e46bbb6c522d22]*/
8451
8452
/* Three-level lookup table built by PyUnicode_BuildEncodingMap() and read by
   encoding_map_lookup().  It maps a code point up to 0xFFFF to a byte value.
   The object is allocated with extra room after the struct, so level23
   really holds 16*count2 + 128*count3 bytes. */
struct encoding_map {
8453
    PyObject_HEAD
8454
    /* Indexed by (code point >> 11): number of the level-2 block, or 0xFF
       when no code point in that range is mapped. */
    unsigned char level1[32];
8455
    /* Number of level-2 blocks (16 bytes each) and of level-3 blocks
       (128 bytes each) stored in level23. */
    int count2, count3;
8456
    /* First the level-2 blocks, indexed by ((code point >> 7) & 0xF), whose
       entries are level-3 block numbers or 0xFF; then the level-3 blocks,
       indexed by (code point & 0x7F), whose entries are the byte value, with
       0 meaning unmapped. */
    unsigned char level23[1];
8457
};
8458
8459
/*[clinic input]
8460
EncodingMap.size
8461
8462
Return the size (in bytes) of this object.
8463
[clinic start generated code]*/
8464
8465
static PyObject *
8466
EncodingMap_size_impl(struct encoding_map *self)
8467
/*[clinic end generated code: output=c4c969e4c99342a4 input=004ff13f26bb5366]*/
8468
0
{
8469
0
    return PyLong_FromLong((sizeof(*self) - 1) + 16*self->count2 +
8470
0
                           128*self->count3);
8471
0
}
8472
8473
/* Method table referenced by EncodingMapType.tp_methods.  The single entry
   comes from the ENCODINGMAP_SIZE_METHODDEF macro (presumably generated by
   Argument Clinic for EncodingMap.size -- confirm in the clinic header);
   the all-NULL entry ends the table. */
static PyMethodDef encoding_map_methods[] = {
8474
    ENCODINGMAP_SIZE_METHODDEF
8475
    {NULL, NULL}
8476
};
8477
8478
/* Type object for struct encoding_map instances.  PyUnicode_BuildEncodingMap()
   initializes its results with this type, and charmapencode_output() checks
   for it with Py_IS_TYPE(). */
static PyTypeObject EncodingMapType = {
8479
    PyVarObject_HEAD_INIT(NULL, 0)
8480
    .tp_name = "EncodingMap",
8481
    /* Size of the fixed part only; the trie data follows it in memory. */
    .tp_basicsize = sizeof(struct encoding_map),
8482
    /* methods */
8483
    .tp_flags = Py_TPFLAGS_DEFAULT,
8484
    .tp_methods = encoding_map_methods,
8485
};
8486
8487
PyObject*
8488
PyUnicode_BuildEncodingMap(PyObject* string)
8489
112
{
8490
112
    PyObject *result;
8491
112
    struct encoding_map *mresult;
8492
112
    int i;
8493
112
    int need_dict = 0;
8494
112
    unsigned char level1[32];
8495
112
    unsigned char level2[512];
8496
112
    unsigned char *mlevel1, *mlevel2, *mlevel3;
8497
112
    int count2 = 0, count3 = 0;
8498
112
    int kind;
8499
112
    const void *data;
8500
112
    int length;
8501
112
    Py_UCS4 ch;
8502
8503
112
    if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
8504
0
        PyErr_BadArgument();
8505
0
        return NULL;
8506
0
    }
8507
112
    kind = PyUnicode_KIND(string);
8508
112
    data = PyUnicode_DATA(string);
8509
112
    length = (int)Py_MIN(PyUnicode_GET_LENGTH(string), 256);
8510
112
    memset(level1, 0xFF, sizeof level1);
8511
112
    memset(level2, 0xFF, sizeof level2);
8512
8513
    /* If there isn't a one-to-one mapping of NULL to \0,
8514
       or if there are non-BMP characters, we need to use
8515
       a mapping dictionary. */
8516
112
    if (PyUnicode_READ(kind, data, 0) != 0)
8517
0
        need_dict = 1;
8518
28.6k
    for (i = 1; i < length; i++) {
8519
28.5k
        int l1, l2;
8520
28.5k
        ch = PyUnicode_READ(kind, data, i);
8521
28.5k
        if (ch == 0 || ch > 0xFFFF) {
8522
0
            need_dict = 1;
8523
0
            break;
8524
0
        }
8525
28.5k
        if (ch == 0xFFFE)
8526
            /* unmapped character */
8527
702
            continue;
8528
27.8k
        l1 = ch >> 11;
8529
27.8k
        l2 = ch >> 7;
8530
27.8k
        if (level1[l1] == 0xFF)
8531
203
            level1[l1] = count2++;
8532
27.8k
        if (level2[l2] == 0xFF)
8533
609
            level2[l2] = count3++;
8534
27.8k
    }
8535
8536
112
    if (count2 >= 0xFF || count3 >= 0xFF)
8537
0
        need_dict = 1;
8538
8539
112
    if (need_dict) {
8540
0
        PyObject *result = PyDict_New();
8541
0
        if (!result)
8542
0
            return NULL;
8543
0
        for (i = 0; i < length; i++) {
8544
0
            Py_UCS4 c = PyUnicode_READ(kind, data, i);
8545
0
            PyObject *key = PyLong_FromLong(c);
8546
0
            if (key == NULL) {
8547
0
                Py_DECREF(result);
8548
0
                return NULL;
8549
0
            }
8550
0
            PyObject *value = PyLong_FromLong(i);
8551
0
            if (value == NULL) {
8552
0
                Py_DECREF(key);
8553
0
                Py_DECREF(result);
8554
0
                return NULL;
8555
0
            }
8556
0
            int rc = PyDict_SetItem(result, key, value);
8557
0
            Py_DECREF(key);
8558
0
            Py_DECREF(value);
8559
0
            if (rc < 0) {
8560
0
                Py_DECREF(result);
8561
0
                return NULL;
8562
0
            }
8563
0
        }
8564
0
        return result;
8565
0
    }
8566
8567
    /* Create a three-level trie */
8568
112
    result = PyObject_Malloc(sizeof(struct encoding_map) +
8569
112
                             16*count2 + 128*count3 - 1);
8570
112
    if (!result) {
8571
0
        return PyErr_NoMemory();
8572
0
    }
8573
8574
112
    _PyObject_Init(result, &EncodingMapType);
8575
112
    mresult = (struct encoding_map*)result;
8576
112
    mresult->count2 = count2;
8577
112
    mresult->count3 = count3;
8578
112
    mlevel1 = mresult->level1;
8579
112
    mlevel2 = mresult->level23;
8580
112
    mlevel3 = mresult->level23 + 16*count2;
8581
112
    memcpy(mlevel1, level1, 32);
8582
112
    memset(mlevel2, 0xFF, 16*count2);
8583
112
    memset(mlevel3, 0, 128*count3);
8584
112
    count3 = 0;
8585
28.6k
    for (i = 1; i < length; i++) {
8586
28.5k
        int o1, o2, o3, i2, i3;
8587
28.5k
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8588
28.5k
        if (ch == 0xFFFE)
8589
            /* unmapped character */
8590
702
            continue;
8591
27.8k
        o1 = ch>>11;
8592
27.8k
        o2 = (ch>>7) & 0xF;
8593
27.8k
        i2 = 16*mlevel1[o1] + o2;
8594
27.8k
        if (mlevel2[i2] == 0xFF)
8595
609
            mlevel2[i2] = count3++;
8596
27.8k
        o3 = ch & 0x7F;
8597
27.8k
        i3 = 128*mlevel2[i2] + o3;
8598
27.8k
        mlevel3[i3] = i;
8599
27.8k
    }
8600
112
    return result;
8601
112
}
8602
8603
static int
8604
encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
8605
0
{
8606
0
    struct encoding_map *map = (struct encoding_map*)mapping;
8607
0
    int l1 = c>>11;
8608
0
    int l2 = (c>>7) & 0xF;
8609
0
    int l3 = c & 0x7F;
8610
0
    int i;
8611
8612
0
    if (c > 0xFFFF)
8613
0
        return -1;
8614
0
    if (c == 0)
8615
0
        return 0;
8616
    /* level 1*/
8617
0
    i = map->level1[l1];
8618
0
    if (i == 0xFF) {
8619
0
        return -1;
8620
0
    }
8621
    /* level 2*/
8622
0
    i = map->level23[16*i+l2];
8623
0
    if (i == 0xFF) {
8624
0
        return -1;
8625
0
    }
8626
    /* level 3 */
8627
0
    i = map->level23[16*map->count2 + 128*i + l3];
8628
0
    if (i == 0) {
8629
0
        return -1;
8630
0
    }
8631
0
    return i;
8632
0
}
8633
8634
/* Lookup the character in the mapping.
8635
   On success, return PyLong, PyBytes or None (if the character can't be found).
8636
   If the result is PyLong, put its value in replace.
8637
   On error, return NULL.
8638
   */
8639
static PyObject *
8640
charmapencode_lookup(Py_UCS4 c, PyObject *mapping, unsigned char *replace)
8641
0
{
8642
0
    PyObject *w = PyLong_FromLong((long)c);
8643
0
    PyObject *x;
8644
8645
0
    if (w == NULL)
8646
0
        return NULL;
8647
0
    int rc = PyMapping_GetOptionalItem(mapping, w, &x);
8648
0
    Py_DECREF(w);
8649
0
    if (rc == 0) {
8650
        /* No mapping found means: mapping is undefined. */
8651
0
        Py_RETURN_NONE;
8652
0
    }
8653
0
    if (x == NULL) {
8654
0
        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8655
            /* No mapping found means: mapping is undefined. */
8656
0
            PyErr_Clear();
8657
0
            Py_RETURN_NONE;
8658
0
        } else
8659
0
            return NULL;
8660
0
    }
8661
0
    else if (x == Py_None)
8662
0
        return x;
8663
0
    else if (PyLong_Check(x)) {
8664
0
        long value = PyLong_AsLong(x);
8665
0
        if (value < 0 || value > 255) {
8666
0
            PyErr_SetString(PyExc_TypeError,
8667
0
                            "character mapping must be in range(256)");
8668
0
            Py_DECREF(x);
8669
0
            return NULL;
8670
0
        }
8671
0
        *replace = (unsigned char)value;
8672
0
        return x;
8673
0
    }
8674
0
    else if (PyBytes_Check(x))
8675
0
        return x;
8676
0
    else {
8677
        /* wrong return value */
8678
0
        PyErr_Format(PyExc_TypeError,
8679
0
                     "character mapping must return integer, bytes or None, not %.400s",
8680
0
                     Py_TYPE(x)->tp_name);
8681
0
        Py_DECREF(x);
8682
0
        return NULL;
8683
0
    }
8684
0
}
8685
8686
static int
8687
charmapencode_resize(PyBytesWriter *writer, Py_ssize_t *outpos, Py_ssize_t requiredsize)
8688
0
{
8689
0
    Py_ssize_t outsize = PyBytesWriter_GetSize(writer);
8690
    /* exponentially overallocate to minimize reallocations */
8691
0
    if (requiredsize < 2 * outsize)
8692
0
        requiredsize = 2 * outsize;
8693
0
    return PyBytesWriter_Resize(writer, requiredsize);
8694
0
}
8695
8696
/* Result of charmapencode_output():
     enc_SUCCESS    the encoded output was stored in the writer;
     enc_FAILED     the character has no mapping, nothing was written;
     enc_EXCEPTION  the lookup or a resize of the writer failed. */
typedef enum charmapencode_result {
8697
    enc_SUCCESS, enc_FAILED, enc_EXCEPTION
8698
} charmapencode_result;
8699
/* Look up the character, put the result in the output buffer and advance
8700
   *outpos. Resize the output bytes writer if not enough
8701
   space is available. Return enc_SUCCESS if the output was written,
8702
   enc_FAILED if the mapping was undefined
8703
   (in which case no character was written), or enc_EXCEPTION if the
8704
   lookup or a reallocation failed. */
8705
static charmapencode_result
8706
charmapencode_output(Py_UCS4 c, PyObject *mapping,
8707
                     PyBytesWriter *writer, Py_ssize_t *outpos)
8708
0
{
8709
0
    PyObject *rep;
8710
0
    unsigned char replace;
8711
0
    char *outstart;
8712
0
    Py_ssize_t outsize = _PyBytesWriter_GetSize(writer);
8713
8714
0
    if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8715
0
        int res = encoding_map_lookup(c, mapping);
8716
0
        Py_ssize_t requiredsize = *outpos+1;
8717
0
        if (res == -1) {
8718
0
            return enc_FAILED;
8719
0
        }
8720
8721
0
        if (outsize<requiredsize) {
8722
0
            if (charmapencode_resize(writer, outpos, requiredsize)) {
8723
0
                return enc_EXCEPTION;
8724
0
            }
8725
0
        }
8726
0
        outstart = _PyBytesWriter_GetData(writer);
8727
0
        outstart[(*outpos)++] = (char)res;
8728
0
        return enc_SUCCESS;
8729
0
    }
8730
8731
0
    rep = charmapencode_lookup(c, mapping, &replace);
8732
0
    if (rep==NULL)
8733
0
        return enc_EXCEPTION;
8734
0
    else if (rep==Py_None) {
8735
0
        Py_DECREF(rep);
8736
0
        return enc_FAILED;
8737
0
    } else {
8738
0
        if (PyLong_Check(rep)) {
8739
0
            Py_ssize_t requiredsize = *outpos+1;
8740
0
            if (outsize<requiredsize)
8741
0
                if (charmapencode_resize(writer, outpos, requiredsize)) {
8742
0
                    Py_DECREF(rep);
8743
0
                    return enc_EXCEPTION;
8744
0
                }
8745
0
            outstart = _PyBytesWriter_GetData(writer);
8746
0
            outstart[(*outpos)++] = (char)replace;
8747
0
        }
8748
0
        else {
8749
0
            const char *repchars = PyBytes_AS_STRING(rep);
8750
0
            Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8751
0
            Py_ssize_t requiredsize = *outpos+repsize;
8752
0
            if (outsize<requiredsize)
8753
0
                if (charmapencode_resize(writer, outpos, requiredsize)) {
8754
0
                    Py_DECREF(rep);
8755
0
                    return enc_EXCEPTION;
8756
0
                }
8757
0
            outstart = _PyBytesWriter_GetData(writer);
8758
0
            memcpy(outstart + *outpos, repchars, repsize);
8759
0
            *outpos += repsize;
8760
0
        }
8761
0
    }
8762
0
    Py_DECREF(rep);
8763
0
    return enc_SUCCESS;
8764
0
}
8765
8766
/* handle an error in _PyUnicode_EncodeCharmap()
8767
   Return 0 on success, -1 on error */
8768
static int
8769
charmap_encoding_error(
8770
    PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
8771
    PyObject **exceptionObject,
8772
    _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
8773
    PyBytesWriter *writer, Py_ssize_t *respos)
8774
0
{
8775
0
    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8776
0
    Py_ssize_t size, repsize;
8777
0
    Py_ssize_t newpos;
8778
0
    int kind;
8779
0
    const void *data;
8780
0
    Py_ssize_t index;
8781
    /* startpos for collecting unencodable chars */
8782
0
    Py_ssize_t collstartpos = *inpos;
8783
0
    Py_ssize_t collendpos = *inpos+1;
8784
0
    Py_ssize_t collpos;
8785
0
    const char *encoding = "charmap";
8786
0
    const char *reason = "character maps to <undefined>";
8787
0
    charmapencode_result x;
8788
0
    Py_UCS4 ch;
8789
0
    int val;
8790
8791
0
    size = PyUnicode_GET_LENGTH(unicode);
8792
    /* find all unencodable characters */
8793
0
    while (collendpos < size) {
8794
0
        PyObject *rep;
8795
0
        unsigned char replace;
8796
0
        if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8797
0
            ch = PyUnicode_READ_CHAR(unicode, collendpos);
8798
0
            val = encoding_map_lookup(ch, mapping);
8799
0
            if (val != -1)
8800
0
                break;
8801
0
            ++collendpos;
8802
0
            continue;
8803
0
        }
8804
8805
0
        ch = PyUnicode_READ_CHAR(unicode, collendpos);
8806
0
        rep = charmapencode_lookup(ch, mapping, &replace);
8807
0
        if (rep==NULL)
8808
0
            return -1;
8809
0
        else if (rep!=Py_None) {
8810
0
            Py_DECREF(rep);
8811
0
            break;
8812
0
        }
8813
0
        Py_DECREF(rep);
8814
0
        ++collendpos;
8815
0
    }
8816
    /* cache callback name lookup
8817
     * (if not done yet, i.e. it's the first error) */
8818
0
    if (*error_handler == _Py_ERROR_UNKNOWN)
8819
0
        *error_handler = _Py_GetErrorHandler(errors);
8820
8821
0
    switch (*error_handler) {
8822
0
    case _Py_ERROR_STRICT:
8823
0
        raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8824
0
        return -1;
8825
8826
0
    case _Py_ERROR_REPLACE:
8827
0
        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
8828
0
            x = charmapencode_output('?', mapping, writer, respos);
8829
0
            if (x==enc_EXCEPTION) {
8830
0
                return -1;
8831
0
            }
8832
0
            else if (x==enc_FAILED) {
8833
0
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8834
0
                return -1;
8835
0
            }
8836
0
        }
8837
0
        _Py_FALLTHROUGH;
8838
0
    case _Py_ERROR_IGNORE:
8839
0
        *inpos = collendpos;
8840
0
        break;
8841
8842
0
    case _Py_ERROR_XMLCHARREFREPLACE:
8843
        /* generate replacement (temporarily (mis)uses p) */
8844
0
        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
8845
0
            char buffer[2+29+1+1];
8846
0
            char *cp;
8847
0
            sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
8848
0
            for (cp = buffer; *cp; ++cp) {
8849
0
                x = charmapencode_output(*cp, mapping, writer, respos);
8850
0
                if (x==enc_EXCEPTION)
8851
0
                    return -1;
8852
0
                else if (x==enc_FAILED) {
8853
0
                    raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8854
0
                    return -1;
8855
0
                }
8856
0
            }
8857
0
        }
8858
0
        *inpos = collendpos;
8859
0
        break;
8860
8861
0
    default:
8862
0
        repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
8863
0
                                                      encoding, reason, unicode, exceptionObject,
8864
0
                                                      collstartpos, collendpos, &newpos);
8865
0
        if (repunicode == NULL)
8866
0
            return -1;
8867
0
        if (PyBytes_Check(repunicode)) {
8868
            /* Directly copy bytes result to output. */
8869
0
            Py_ssize_t outsize = PyBytesWriter_GetSize(writer);
8870
0
            Py_ssize_t requiredsize;
8871
0
            repsize = PyBytes_Size(repunicode);
8872
0
            requiredsize = *respos + repsize;
8873
0
            if (requiredsize > outsize)
8874
                /* Make room for all additional bytes. */
8875
0
                if (charmapencode_resize(writer, respos, requiredsize)) {
8876
0
                    Py_DECREF(repunicode);
8877
0
                    return -1;
8878
0
                }
8879
0
            memcpy((char*)PyBytesWriter_GetData(writer) + *respos,
8880
0
                   PyBytes_AsString(repunicode),  repsize);
8881
0
            *respos += repsize;
8882
0
            *inpos = newpos;
8883
0
            Py_DECREF(repunicode);
8884
0
            break;
8885
0
        }
8886
        /* generate replacement  */
8887
0
        repsize = PyUnicode_GET_LENGTH(repunicode);
8888
0
        data = PyUnicode_DATA(repunicode);
8889
0
        kind = PyUnicode_KIND(repunicode);
8890
0
        for (index = 0; index < repsize; index++) {
8891
0
            Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8892
0
            x = charmapencode_output(repch, mapping, writer, respos);
8893
0
            if (x==enc_EXCEPTION) {
8894
0
                Py_DECREF(repunicode);
8895
0
                return -1;
8896
0
            }
8897
0
            else if (x==enc_FAILED) {
8898
0
                Py_DECREF(repunicode);
8899
0
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8900
0
                return -1;
8901
0
            }
8902
0
        }
8903
0
        *inpos = newpos;
8904
0
        Py_DECREF(repunicode);
8905
0
    }
8906
0
    return 0;
8907
0
}
8908
8909
/* Encode `unicode` into a bytes object using the character `mapping`.

   A NULL mapping is delegated to unicode_encode_ucs1() with a limit of 256
   (Latin-1); an empty input yields the empty bytes constant.  Unencodable
   characters are passed to charmap_encoding_error() together with `errors`.

   Returns the object produced by PyBytesWriter_FinishWithSize(), or NULL
   on error. */
PyObject *
_PyUnicode_EncodeCharmap(PyObject *unicode,
                         PyObject *mapping,
                         const char *errors)
{
    /* Default to Latin-1 */
    if (mapping == NULL) {
        return unicode_encode_ucs1(unicode, errors, 256);
    }

    Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
    if (size == 0) {
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
    }
    const void *data = PyUnicode_DATA(unicode);
    int kind = PyUnicode_KIND(unicode);

    /* error handling state shared by every charmap_encoding_error() call */
    PyObject *error_handler_obj = NULL;
    PyObject *exc = NULL;
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;

    /* read position in `unicode` and write position in the output */
    Py_ssize_t inpos = 0;
    Py_ssize_t respos = 0;

    /* allocate enough for a simple encoding without
       replacements, if we need more, we'll resize */
    PyBytesWriter *writer = PyBytesWriter_Create(size);
    if (writer == NULL) {
        goto onError;
    }

    if (Py_IS_TYPE(mapping, &EncodingMapType)) {
        /* EncodingMap fast path: data pointer and capacity are cached and
           refreshed after anything that may have resized the writer */
        char *outstart = _PyBytesWriter_GetData(writer);
        Py_ssize_t outsize = _PyBytesWriter_GetSize(writer);

        while (inpos < size) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
            int res = encoding_map_lookup(ch, mapping);

            if (res == -1) {
                /* unencodable character: the handler advances inpos */
                if (charmap_encoding_error(unicode, &inpos, mapping,
                                           &exc,
                                           &error_handler, &error_handler_obj, errors,
                                           writer, &respos)) {
                    goto onError;
                }
                outstart = _PyBytesWriter_GetData(writer);
                outsize = _PyBytesWriter_GetSize(writer);
                continue;
            }

            Py_ssize_t requiredsize = respos + 1;
            if (outsize < requiredsize) {
                if (charmapencode_resize(writer, &respos, requiredsize)) {
                    goto onError;
                }
                outstart = _PyBytesWriter_GetData(writer);
                outsize = _PyBytesWriter_GetSize(writer);
            }
            outstart[respos++] = (char)res;

            /* done with this character => adjust input position */
            ++inpos;
        }
    }
    else {
        /* generic mapping: charmapencode_output() looks up and writes */
        while (inpos < size) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);

            switch (charmapencode_output(ch, mapping, writer, &respos)) {
            case enc_EXCEPTION:
                goto onError;

            case enc_FAILED:
                /* unencodable character: the handler advances inpos */
                if (charmap_encoding_error(unicode, &inpos, mapping,
                                           &exc,
                                           &error_handler, &error_handler_obj, errors,
                                           writer, &respos)) {
                    goto onError;
                }
                break;

            default:
                /* done with this character => adjust input position */
                ++inpos;
                break;
            }
        }
    }

    Py_XDECREF(exc);
    Py_XDECREF(error_handler_obj);

    /* Resize if we allocated too much */
    return PyBytesWriter_FinishWithSize(writer, respos);

  onError:
    PyBytesWriter_Discard(writer);
    Py_XDECREF(exc);
    Py_XDECREF(error_handler_obj);
    return NULL;
}
9017
9018
/* Encode `unicode` with `mapping` via _PyUnicode_EncodeCharmap(), passing
   NULL as the `errors` argument.  Reports a bad argument and returns NULL
   when `unicode` is not a str object or `mapping` is NULL. */
PyObject *
PyUnicode_AsCharmapString(PyObject *unicode,
                          PyObject *mapping)
{
    if (PyUnicode_Check(unicode) && mapping != NULL) {
        return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
    }

    PyErr_BadArgument();
    return NULL;
}
9028
9029
/* create or adjust a UnicodeTranslateError */
9030
static void
9031
make_translate_exception(PyObject **exceptionObject,
9032
                         PyObject *unicode,
9033
                         Py_ssize_t startpos, Py_ssize_t endpos,
9034
                         const char *reason)
9035
0
{
9036
0
    if (*exceptionObject == NULL) {
9037
0
        *exceptionObject = _PyUnicodeTranslateError_Create(
9038
0
            unicode, startpos, endpos, reason);
9039
0
    }
9040
0
    else {
9041
0
        if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
9042
0
            goto onError;
9043
0
        if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
9044
0
            goto onError;
9045
0
        if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
9046
0
            goto onError;
9047
0
        return;
9048
0
      onError:
9049
0
        Py_CLEAR(*exceptionObject);
9050
0
    }
9051
0
}
9052
9053
/* error handling callback helper:
9054
   build arguments, call the callback and check the arguments,
9055
   put the result into newpos and return the replacement string, which
9056
   has to be freed by the caller */
9057
static PyObject *
9058
unicode_translate_call_errorhandler(const char *errors,
9059
                                    PyObject **errorHandler,
9060
                                    const char *reason,
9061
                                    PyObject *unicode, PyObject **exceptionObject,
9062
                                    Py_ssize_t startpos, Py_ssize_t endpos,
9063
                                    Py_ssize_t *newpos)
9064
0
{
9065
0
    static const char *argparse = "Un;translating error handler must return (str, int) tuple";
9066
9067
0
    Py_ssize_t i_newpos;
9068
0
    PyObject *restuple;
9069
0
    PyObject *resunicode;
9070
9071
0
    if (*errorHandler == NULL) {
9072
0
        *errorHandler = PyCodec_LookupError(errors);
9073
0
        if (*errorHandler == NULL)
9074
0
            return NULL;
9075
0
    }
9076
9077
0
    make_translate_exception(exceptionObject,
9078
0
                             unicode, startpos, endpos, reason);
9079
0
    if (*exceptionObject == NULL)
9080
0
        return NULL;
9081
9082
0
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
9083
0
    if (restuple == NULL)
9084
0
        return NULL;
9085
0
    if (!PyTuple_Check(restuple)) {
9086
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
9087
0
        Py_DECREF(restuple);
9088
0
        return NULL;
9089
0
    }
9090
0
    if (!PyArg_ParseTuple(restuple, argparse,
9091
0
                          &resunicode, &i_newpos)) {
9092
0
        Py_DECREF(restuple);
9093
0
        return NULL;
9094
0
    }
9095
0
    if (i_newpos<0)
9096
0
        *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
9097
0
    else
9098
0
        *newpos = i_newpos;
9099
0
    if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
9100
0
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
9101
0
        Py_DECREF(restuple);
9102
0
        return NULL;
9103
0
    }
9104
0
    Py_INCREF(resunicode);
9105
0
    Py_DECREF(restuple);
9106
0
    return resunicode;
9107
0
}
9108
9109
/* Lookup the character ch in the mapping and put the result in result,
9110
   which must be decrefed by the caller.
9111
   The result can be PyLong, PyUnicode, None or NULL.
9112
   If the result is PyLong, put its value in replace.
9113
   Return 0 on success, -1 on error */
9114
static int
9115
charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result, Py_UCS4 *replace)
9116
338
{
9117
338
    PyObject *w = PyLong_FromLong((long)c);
9118
338
    PyObject *x;
9119
9120
338
    if (w == NULL)
9121
0
        return -1;
9122
338
    int rc = PyMapping_GetOptionalItem(mapping, w, &x);
9123
338
    Py_DECREF(w);
9124
338
    if (rc == 0) {
9125
        /* No mapping found means: use 1:1 mapping. */
9126
158
        *result = NULL;
9127
158
        return 0;
9128
158
    }
9129
180
    if (x == NULL) {
9130
0
        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
9131
            /* No mapping found means: use 1:1 mapping. */
9132
0
            PyErr_Clear();
9133
0
            *result = NULL;
9134
0
            return 0;
9135
0
        } else
9136
0
            return -1;
9137
0
    }
9138
180
    else if (x == Py_None) {
9139
0
        *result = x;
9140
0
        return 0;
9141
0
    }
9142
180
    else if (PyLong_Check(x)) {
9143
0
        long value = PyLong_AsLong(x);
9144
0
        if (value < 0 || value > MAX_UNICODE) {
9145
0
            PyErr_Format(PyExc_ValueError,
9146
0
                         "character mapping must be in range(0x%x)",
9147
0
                         MAX_UNICODE+1);
9148
0
            Py_DECREF(x);
9149
0
            return -1;
9150
0
        }
9151
0
        *result = x;
9152
0
        *replace = (Py_UCS4)value;
9153
0
        return 0;
9154
0
    }
9155
180
    else if (PyUnicode_Check(x)) {
9156
180
        *result = x;
9157
180
        return 0;
9158
180
    }
9159
0
    else {
9160
        /* wrong return value */
9161
0
        PyErr_SetString(PyExc_TypeError,
9162
0
                        "character mapping must return integer, None or str");
9163
0
        Py_DECREF(x);
9164
0
        return -1;
9165
0
    }
9166
180
}
9167
9168
/* lookup the character, write the result into the writer.
9169
   Return 1 if the result was written into the writer, return 0 if the mapping
9170
   was undefined, raise an exception return -1 on error. */
9171
static int
9172
charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
9173
                        _PyUnicodeWriter *writer)
9174
200
{
9175
200
    PyObject *item;
9176
200
    Py_UCS4 replace;
9177
9178
200
    if (charmaptranslate_lookup(ch, mapping, &item, &replace))
9179
0
        return -1;
9180
9181
200
    if (item == NULL) {
9182
        /* not found => default to 1:1 mapping */
9183
76
        if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
9184
0
            return -1;
9185
0
        }
9186
76
        return 1;
9187
76
    }
9188
9189
124
    if (item == Py_None) {
9190
0
        Py_DECREF(item);
9191
0
        return 0;
9192
0
    }
9193
9194
124
    if (PyLong_Check(item)) {
9195
0
        if (_PyUnicodeWriter_WriteCharInline(writer, replace) < 0) {
9196
0
            Py_DECREF(item);
9197
0
            return -1;
9198
0
        }
9199
0
        Py_DECREF(item);
9200
0
        return 1;
9201
0
    }
9202
9203
124
    if (!PyUnicode_Check(item)) {
9204
0
        Py_DECREF(item);
9205
0
        return -1;
9206
0
    }
9207
9208
124
    if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
9209
0
        Py_DECREF(item);
9210
0
        return -1;
9211
0
    }
9212
9213
124
    Py_DECREF(item);
9214
124
    return 1;
9215
124
}
9216
9217
static int
9218
unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
9219
                              Py_UCS1 *translate)
9220
138
{
9221
138
    PyObject *item = NULL;
9222
138
    Py_UCS4 replace;
9223
138
    int ret = 0;
9224
9225
138
    if (charmaptranslate_lookup(ch, mapping, &item, &replace)) {
9226
0
        return -1;
9227
0
    }
9228
9229
138
    if (item == Py_None) {
9230
        /* deletion */
9231
0
        translate[ch] = 0xfe;
9232
0
    }
9233
138
    else if (item == NULL) {
9234
        /* not found => default to 1:1 mapping */
9235
82
        translate[ch] = ch;
9236
82
        return 1;
9237
82
    }
9238
56
    else if (PyLong_Check(item)) {
9239
0
        if (replace > 127) {
9240
            /* invalid character or character outside ASCII:
9241
               skip the fast translate */
9242
0
            goto exit;
9243
0
        }
9244
0
        translate[ch] = (Py_UCS1)replace;
9245
0
    }
9246
56
    else if (PyUnicode_Check(item)) {
9247
56
        if (PyUnicode_GET_LENGTH(item) != 1)
9248
56
            goto exit;
9249
9250
0
        replace = PyUnicode_READ_CHAR(item, 0);
9251
0
        if (replace > 127)
9252
0
            goto exit;
9253
0
        translate[ch] = (Py_UCS1)replace;
9254
0
    }
9255
0
    else {
9256
        /* not None, NULL, long or unicode */
9257
0
        goto exit;
9258
0
    }
9259
0
    ret = 1;
9260
9261
56
  exit:
9262
56
    Py_DECREF(item);
9263
56
    return ret;
9264
0
}
9265
9266
/* Fast path for ascii => ascii translation. Return 1 if the whole string
9267
   was translated into writer, return 0 if the input string was partially
9268
   translated into writer, raise an exception and return -1 on error. */
9269
static int
9270
unicode_fast_translate(PyObject *input, PyObject *mapping,
9271
                       _PyUnicodeWriter *writer, int ignore,
9272
                       Py_ssize_t *input_pos)
9273
104
{
9274
104
    Py_UCS1 ascii_table[128], ch, ch2;
9275
104
    Py_ssize_t len;
9276
104
    const Py_UCS1 *in, *end;
9277
104
    Py_UCS1 *out;
9278
104
    int res = 0;
9279
9280
104
    len = PyUnicode_GET_LENGTH(input);
9281
9282
104
    memset(ascii_table, 0xff, 128);
9283
9284
104
    in = PyUnicode_1BYTE_DATA(input);
9285
104
    end = in + len;
9286
9287
104
    assert(PyUnicode_IS_ASCII(writer->buffer));
9288
104
    assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9289
104
    out = PyUnicode_1BYTE_DATA(writer->buffer);
9290
9291
200
    for (; in < end; in++) {
9292
152
        ch = *in;
9293
152
        ch2 = ascii_table[ch];
9294
152
        if (ch2 == 0xff) {
9295
138
            int translate = unicode_fast_translate_lookup(mapping, ch,
9296
138
                                                          ascii_table);
9297
138
            if (translate < 0)
9298
0
                return -1;
9299
138
            if (translate == 0)
9300
56
                goto exit;
9301
82
            ch2 = ascii_table[ch];
9302
82
        }
9303
96
        if (ch2 == 0xfe) {
9304
0
            if (ignore)
9305
0
                continue;
9306
0
            goto exit;
9307
0
        }
9308
96
        assert(ch2 < 128);
9309
96
        *out = ch2;
9310
96
        out++;
9311
96
    }
9312
48
    res = 1;
9313
9314
104
exit:
9315
104
    writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
9316
104
    *input_pos = in - PyUnicode_1BYTE_DATA(input);
9317
104
    return res;
9318
48
}
9319
9320
/* Translate `input` character by character through `mapping`.

   ASCII input first goes through unicode_fast_translate(); whatever that
   leaves unprocessed (and all non-ASCII input) is handled by the generic
   loop.  Runs of characters whose mapping entry is None are skipped when
   `errors` is "ignore" and otherwise passed to the error handler named by
   `errors`.

   Returns the translated string, or NULL on error. */
static PyObject *
_PyUnicode_TranslateCharmap(PyObject *input,
                            PyObject *mapping,
                            const char *errors)
{
    if (mapping == NULL) {
        PyErr_BadArgument();
        return NULL;
    }

    /* input object */
    const void *data = PyUnicode_DATA(input);
    const int kind = PyUnicode_KIND(input);
    const Py_ssize_t size = PyUnicode_GET_LENGTH(input);

    if (size == 0) {
        return PyUnicode_FromObject(input);
    }

    /* error handler state */
    const char *reason = "character maps to <undefined>";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    const int ignore = (errors != NULL && strcmp(errors, "ignore") == 0);

    /* index of the next input character to translate */
    Py_ssize_t i = 0;

    /* allocate enough for a simple 1:1 translation without
       replacements, if we need more, we'll resize */
    _PyUnicodeWriter writer;
    _PyUnicodeWriter_Init(&writer);
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1) {
        goto onError;
    }

    if (PyUnicode_IS_ASCII(input)) {
        int res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
        if (res < 0) {
            _PyUnicodeWriter_Dealloc(&writer);
            return NULL;
        }
        if (res == 1) {
            return _PyUnicodeWriter_Finish(&writer);
        }
        /* res == 0: continue below from the position stored in i */
    }

    while (i < size) {
        /* try to translate it */
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
        int translate = charmaptranslate_output(ch, mapping, &writer);
        if (translate < 0) {
            goto onError;
        }
        if (translate != 0) {
            /* it worked => adjust input pointer */
            ++i;
            continue;
        }

        /* untranslatable character: [collstart, collend) collects the run
           of all directly following untranslatable characters */
        Py_ssize_t collstart = i;
        Py_ssize_t collend = i + 1;
        while (collend < size) {
            PyObject *x;
            Py_UCS4 replace;

            ch = PyUnicode_READ(kind, data, collend);
            if (charmaptranslate_lookup(ch, mapping, &x, &replace)) {
                goto onError;
            }
            int untranslatable = (x == Py_None);
            Py_XDECREF(x);
            if (!untranslatable) {
                break;
            }
            ++collend;
        }

        if (ignore) {
            i = collend;
            continue;
        }

        Py_ssize_t newpos;
        PyObject *repunicode = unicode_translate_call_errorhandler(
            errors, &errorHandler,
            reason, input, &exc,
            collstart, collend, &newpos);
        if (repunicode == NULL) {
            goto onError;
        }
        int written = _PyUnicodeWriter_WriteStr(&writer, repunicode);
        Py_DECREF(repunicode);
        if (written < 0) {
            goto onError;
        }
        i = newpos;
    }

    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return NULL;
}
9436
9437
/* Translate `str` through `mapping` with _PyUnicode_TranslateCharmap()
   after ensure_unicode() has accepted `str`; returns NULL when that check
   fails. */
PyObject *
PyUnicode_Translate(PyObject *str,
                    PyObject *mapping,
                    const char *errors)
{
    return (ensure_unicode(str) < 0)
        ? NULL
        : _PyUnicode_TranslateCharmap(str, mapping, errors);
}
9446
9447
/* Build an ASCII copy of `unicode` in which whitespace characters become
   ' ' and decimal digit characters become '0'..'9'.

   An ASCII input is returned unchanged (as a new reference).  Otherwise
   the first character that is neither below 127, whitespace nor a decimal
   digit is written as '?' and the result is truncated right after it.

   Returns NULL on error (non-str argument or failed allocation). */
PyObject *
_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadInternalCall();
        return NULL;
    }
    if (PyUnicode_IS_ASCII(unicode)) {
        /* If the string is already ASCII, just return the same string */
        return Py_NewRef(unicode);
    }

    const Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
    PyObject *result = PyUnicode_New(len, 127);
    if (result == NULL) {
        return NULL;
    }

    Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
    const int kind = PyUnicode_KIND(unicode);
    const void *data = PyUnicode_DATA(unicode);

    for (Py_ssize_t i = 0; i < len; ++i) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);

        /* NOTE(review): U+007F is not covered by this test, so it goes
           through the whitespace/decimal checks below; confirm that
           `< 127` rather than `< 128` is intended. */
        if (ch < 127) {
            out[i] = (Py_UCS1)ch;
            continue;
        }
        if (Py_UNICODE_ISSPACE(ch)) {
            out[i] = ' ';
            continue;
        }

        int decimal = Py_UNICODE_TODECIMAL(ch);
        if (decimal < 0) {
            /* not convertible: mark it, terminate and shorten the result */
            out[i] = '?';
            out[i+1] = '\0';
            _PyUnicode_LENGTH(result) = i + 1;
            break;
        }
        out[i] = '0' + decimal;
    }

    assert(_PyUnicode_CheckConsistency(result, 1));
    return result;
}
9492
9493
/* --- Helpers ------------------------------------------------------------ */
9494
9495
/* Helper macro to fix up start/end slice values against a length `len`:
   `end` is clipped to `len`, negative values are taken relative to `len`,
   and anything still negative afterwards becomes 0.

   `start` and `end` must be modifiable lvalues; they are updated in place.
   All three arguments are evaluated more than once, so they must not have
   side effects.  Every use is parenthesized so that an expression passed
   as `len` (e.g. `a ? b : c`) keeps its meaning. */
#define ADJUST_INDICES(start, end, len) \
    do {                                \
        if ((end) > (len)) {            \
            (end) = (len);              \
        }                               \
        else if ((end) < 0) {           \
            (end) += (len);             \
            if ((end) < 0) {            \
                (end) = 0;              \
            }                           \
        }                               \
        if ((start) < 0) {              \
            (start) += (len);           \
            if ((start) < 0) {          \
                (start) = 0;            \
            }                           \
        }                               \
    } while (0)
9514
9515
static Py_ssize_t
9516
any_find_slice(PyObject* s1, PyObject* s2,
9517
               Py_ssize_t start,
9518
               Py_ssize_t end,
9519
               int direction)
9520
32.8M
{
9521
32.8M
    int kind1, kind2;
9522
32.8M
    const void *buf1, *buf2;
9523
32.8M
    Py_ssize_t len1, len2, result;
9524
9525
32.8M
    kind1 = PyUnicode_KIND(s1);
9526
32.8M
    kind2 = PyUnicode_KIND(s2);
9527
32.8M
    if (kind1 < kind2)
9528
0
        return -1;
9529
9530
32.8M
    len1 = PyUnicode_GET_LENGTH(s1);
9531
32.8M
    len2 = PyUnicode_GET_LENGTH(s2);
9532
32.8M
    ADJUST_INDICES(start, end, len1);
9533
32.8M
    if (end - start < len2)
9534
4.70M
        return -1;
9535
9536
28.1M
    buf1 = PyUnicode_DATA(s1);
9537
28.1M
    buf2 = PyUnicode_DATA(s2);
9538
28.1M
    if (len2 == 1) {
9539
28.1M
        Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9540
28.1M
        result = findchar((const char *)buf1 + kind1*start,
9541
28.1M
                          kind1, end - start, ch, direction);
9542
28.1M
        if (result == -1)
9543
3.71M
            return -1;
9544
24.4M
        else
9545
24.4M
            return start + result;
9546
28.1M
    }
9547
9548
32.9k
    if (kind2 != kind1) {
9549
20.1k
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
9550
20.1k
        if (!buf2)
9551
0
            return -2;
9552
20.1k
    }
9553
9554
32.9k
    if (direction > 0) {
9555
32.9k
        switch (kind1) {
9556
12.7k
        case PyUnicode_1BYTE_KIND:
9557
12.7k
            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9558
6.98k
                result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9559
5.81k
            else
9560
5.81k
                result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9561
12.7k
            break;
9562
9.04k
        case PyUnicode_2BYTE_KIND:
9563
9.04k
            result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9564
9.04k
            break;
9565
11.1k
        case PyUnicode_4BYTE_KIND:
9566
11.1k
            result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9567
11.1k
            break;
9568
0
        default:
9569
0
            Py_UNREACHABLE();
9570
32.9k
        }
9571
32.9k
    }
9572
0
    else {
9573
0
        switch (kind1) {
9574
0
        case PyUnicode_1BYTE_KIND:
9575
0
            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9576
0
                result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9577
0
            else
9578
0
                result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9579
0
            break;
9580
0
        case PyUnicode_2BYTE_KIND:
9581
0
            result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9582
0
            break;
9583
0
        case PyUnicode_4BYTE_KIND:
9584
0
            result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9585
0
            break;
9586
0
        default:
9587
0
            Py_UNREACHABLE();
9588
0
        }
9589
0
    }
9590
9591
32.9k
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
9592
32.9k
    if (kind2 != kind1)
9593
20.1k
        PyMem_Free((void *)buf2);
9594
9595
32.9k
    return result;
9596
32.9k
}
9597
9598
9599
Py_ssize_t
9600
PyUnicode_Count(PyObject *str,
9601
                PyObject *substr,
9602
                Py_ssize_t start,
9603
                Py_ssize_t end)
9604
0
{
9605
0
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9606
0
        return -1;
9607
9608
0
    return unicode_count_impl(str, substr, start, end);
9609
0
}
9610
9611
Py_ssize_t
9612
PyUnicode_Find(PyObject *str,
9613
               PyObject *substr,
9614
               Py_ssize_t start,
9615
               Py_ssize_t end,
9616
               int direction)
9617
0
{
9618
0
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9619
0
        return -2;
9620
9621
0
    return any_find_slice(str, substr, start, end, direction);
9622
0
}
9623
9624
Py_ssize_t
9625
PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9626
                   Py_ssize_t start, Py_ssize_t end,
9627
                   int direction)
9628
524k
{
9629
524k
    int kind;
9630
524k
    Py_ssize_t len, result;
9631
524k
    len = PyUnicode_GET_LENGTH(str);
9632
524k
    ADJUST_INDICES(start, end, len);
9633
524k
    if (end - start < 1)
9634
0
        return -1;
9635
524k
    kind = PyUnicode_KIND(str);
9636
524k
    result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9637
524k
                      kind, end-start, ch, direction);
9638
524k
    if (result == -1)
9639
56.2k
        return -1;
9640
468k
    else
9641
468k
        return start + result;
9642
524k
}
9643
9644
static int
9645
tailmatch(PyObject *self,
9646
          PyObject *substring,
9647
          Py_ssize_t start,
9648
          Py_ssize_t end,
9649
          int direction)
9650
118M
{
9651
118M
    int kind_self;
9652
118M
    int kind_sub;
9653
118M
    const void *data_self;
9654
118M
    const void *data_sub;
9655
118M
    Py_ssize_t offset;
9656
118M
    Py_ssize_t i;
9657
118M
    Py_ssize_t end_sub;
9658
9659
118M
    ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9660
118M
    end -= PyUnicode_GET_LENGTH(substring);
9661
118M
    if (end < start)
9662
10.9M
        return 0;
9663
9664
107M
    if (PyUnicode_GET_LENGTH(substring) == 0)
9665
0
        return 1;
9666
9667
107M
    kind_self = PyUnicode_KIND(self);
9668
107M
    data_self = PyUnicode_DATA(self);
9669
107M
    kind_sub = PyUnicode_KIND(substring);
9670
107M
    data_sub = PyUnicode_DATA(substring);
9671
107M
    end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9672
9673
107M
    if (direction > 0)
9674
7.55M
        offset = end;
9675
99.6M
    else
9676
99.6M
        offset = start;
9677
9678
107M
    if (PyUnicode_READ(kind_self, data_self, offset) ==
9679
107M
        PyUnicode_READ(kind_sub, data_sub, 0) &&
9680
49.8M
        PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9681
49.8M
        PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9682
        /* If both are of the same kind, memcmp is sufficient */
9683
15.6M
        if (kind_self == kind_sub) {
9684
11.1M
            return ! memcmp((char *)data_self +
9685
11.1M
                                (offset * PyUnicode_KIND(substring)),
9686
11.1M
                            data_sub,
9687
11.1M
                            PyUnicode_GET_LENGTH(substring) *
9688
11.1M
                                PyUnicode_KIND(substring));
9689
11.1M
        }
9690
        /* otherwise we have to compare each character by first accessing it */
9691
4.43M
        else {
9692
            /* We do not need to compare 0 and len(substring)-1 because
9693
               the if statement above ensured already that they are equal
9694
               when we end up here. */
9695
4.48M
            for (i = 1; i < end_sub; ++i) {
9696
60.6k
                if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9697
60.6k
                    PyUnicode_READ(kind_sub, data_sub, i))
9698
3.34k
                    return 0;
9699
60.6k
            }
9700
4.42M
            return 1;
9701
4.43M
        }
9702
15.6M
    }
9703
9704
91.6M
    return 0;
9705
107M
}
9706
9707
Py_ssize_t
9708
PyUnicode_Tailmatch(PyObject *str,
9709
                    PyObject *substr,
9710
                    Py_ssize_t start,
9711
                    Py_ssize_t end,
9712
                    int direction)
9713
0
{
9714
0
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9715
0
        return -1;
9716
9717
0
    return tailmatch(str, substr, start, end, direction);
9718
0
}
9719
9720
/* Build a new string of the same length as `self` (allocated with a maximum
   character of 127) and fill it via _Py_bytes_lower() when `lower` is
   non-zero, or _Py_bytes_upper() otherwise.

   Returns the new object, or NULL if PyUnicode_New() fails.
   NOTE(review): the data of `self` is handed to the byte-oriented helpers
   as-is, so callers are presumably expected to pass ASCII-only strings. */
static PyObject *
ascii_upper_or_lower(PyObject *self, int lower)
{
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const char *src = PyUnicode_DATA(self);
    char *dst;
    PyObject *converted = PyUnicode_New(nchars, 127);

    if (converted == NULL) {
        return NULL;
    }

    dst = PyUnicode_DATA(converted);
    if (lower) {
        _Py_bytes_lower(dst, src, nchars);
    }
    else {
        _Py_bytes_upper(dst, src, nchars);
    }
    return converted;
}
9738
9739
static Py_UCS4
9740
handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
9741
95.3k
{
9742
95.3k
    Py_ssize_t j;
9743
95.3k
    int final_sigma;
9744
95.3k
    Py_UCS4 c = 0;   /* initialize to prevent gcc warning */
9745
    /* U+03A3 is in the Final_Sigma context when, it is found like this:
9746
9747
     \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9748
9749
    where ! is a negation and \p{xxx} is a character with property xxx.
9750
    */
9751
222k
    for (j = i - 1; j >= 0; j--) {
9752
220k
        c = PyUnicode_READ(kind, data, j);
9753
220k
        if (!_PyUnicode_IsCaseIgnorable(c))
9754
93.6k
            break;
9755
220k
    }
9756
95.3k
    final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9757
95.3k
    if (final_sigma) {
9758
166k
        for (j = i + 1; j < length; j++) {
9759
160k
            c = PyUnicode_READ(kind, data, j);
9760
160k
            if (!_PyUnicode_IsCaseIgnorable(c))
9761
61.0k
                break;
9762
160k
        }
9763
66.6k
        final_sigma = j == length || !_PyUnicode_IsCased(c);
9764
66.6k
    }
9765
95.3k
    return (final_sigma) ? 0x3C2 : 0x3C3;
9766
95.3k
}
9767
9768
static int
9769
lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
9770
           Py_UCS4 c, Py_UCS4 *mapped)
9771
75.2M
{
9772
    /* Obscure special case. */
9773
75.2M
    if (c == 0x3A3) {
9774
95.3k
        mapped[0] = handle_capital_sigma(kind, data, length, i);
9775
95.3k
        return 1;
9776
95.3k
    }
9777
75.1M
    return _PyUnicode_ToLowerFull(c, mapped);
9778
75.2M
}
9779
9780
static Py_ssize_t
9781
do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9782
0
{
9783
0
    Py_ssize_t i, k = 0;
9784
0
    int n_res, j;
9785
0
    Py_UCS4 c, mapped[3];
9786
9787
0
    c = PyUnicode_READ(kind, data, 0);
9788
0
    n_res = _PyUnicode_ToTitleFull(c, mapped);
9789
0
    for (j = 0; j < n_res; j++) {
9790
0
        *maxchar = Py_MAX(*maxchar, mapped[j]);
9791
0
        res[k++] = mapped[j];
9792
0
    }
9793
0
    for (i = 1; i < length; i++) {
9794
0
        c = PyUnicode_READ(kind, data, i);
9795
0
        n_res = lower_ucs4(kind, data, length, i, c, mapped);
9796
0
        for (j = 0; j < n_res; j++) {
9797
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9798
0
            res[k++] = mapped[j];
9799
0
        }
9800
0
    }
9801
0
    return k;
9802
0
}
9803
9804
static Py_ssize_t
9805
0
do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9806
0
    Py_ssize_t i, k = 0;
9807
9808
0
    for (i = 0; i < length; i++) {
9809
0
        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9810
0
        int n_res, j;
9811
0
        if (Py_UNICODE_ISUPPER(c)) {
9812
0
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9813
0
        }
9814
0
        else if (Py_UNICODE_ISLOWER(c)) {
9815
0
            n_res = _PyUnicode_ToUpperFull(c, mapped);
9816
0
        }
9817
0
        else {
9818
0
            n_res = 1;
9819
0
            mapped[0] = c;
9820
0
        }
9821
0
        for (j = 0; j < n_res; j++) {
9822
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9823
0
            res[k++] = mapped[j];
9824
0
        }
9825
0
    }
9826
0
    return k;
9827
0
}
9828
9829
static Py_ssize_t
9830
do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
9831
                  Py_UCS4 *maxchar, int lower)
9832
7.30M
{
9833
7.30M
    Py_ssize_t i, k = 0;
9834
9835
82.5M
    for (i = 0; i < length; i++) {
9836
75.2M
        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9837
75.2M
        int n_res, j;
9838
75.2M
        if (lower)
9839
75.2M
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9840
0
        else
9841
0
            n_res = _PyUnicode_ToUpperFull(c, mapped);
9842
150M
        for (j = 0; j < n_res; j++) {
9843
75.2M
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9844
75.2M
            res[k++] = mapped[j];
9845
75.2M
        }
9846
75.2M
    }
9847
7.30M
    return k;
9848
7.30M
}
9849
9850
static Py_ssize_t
9851
do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9852
0
{
9853
0
    return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9854
0
}
9855
9856
static Py_ssize_t
9857
do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9858
7.30M
{
9859
7.30M
    return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9860
7.30M
}
9861
9862
static Py_ssize_t
9863
do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9864
0
{
9865
0
    Py_ssize_t i, k = 0;
9866
9867
0
    for (i = 0; i < length; i++) {
9868
0
        Py_UCS4 c = PyUnicode_READ(kind, data, i);
9869
0
        Py_UCS4 mapped[3];
9870
0
        int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9871
0
        for (j = 0; j < n_res; j++) {
9872
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9873
0
            res[k++] = mapped[j];
9874
0
        }
9875
0
    }
9876
0
    return k;
9877
0
}
9878
9879
static Py_ssize_t
9880
do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9881
0
{
9882
0
    Py_ssize_t i, k = 0;
9883
0
    int previous_is_cased;
9884
9885
0
    previous_is_cased = 0;
9886
0
    for (i = 0; i < length; i++) {
9887
0
        const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9888
0
        Py_UCS4 mapped[3];
9889
0
        int n_res, j;
9890
9891
0
        if (previous_is_cased)
9892
0
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9893
0
        else
9894
0
            n_res = _PyUnicode_ToTitleFull(c, mapped);
9895
9896
0
        for (j = 0; j < n_res; j++) {
9897
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9898
0
            res[k++] = mapped[j];
9899
0
        }
9900
9901
0
        previous_is_cased = _PyUnicode_IsCased(c);
9902
0
    }
9903
0
    return k;
9904
0
}
9905
9906
static PyObject *
9907
case_operation(PyObject *self,
9908
               Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9909
7.30M
{
9910
7.30M
    PyObject *res = NULL;
9911
7.30M
    Py_ssize_t length, newlength = 0;
9912
7.30M
    int kind, outkind;
9913
7.30M
    const void *data;
9914
7.30M
    void *outdata;
9915
7.30M
    Py_UCS4 maxchar = 0, *tmp, *tmpend;
9916
9917
7.30M
    kind = PyUnicode_KIND(self);
9918
7.30M
    data = PyUnicode_DATA(self);
9919
7.30M
    length = PyUnicode_GET_LENGTH(self);
9920
7.30M
    if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
9921
0
        PyErr_SetString(PyExc_OverflowError, "string is too long");
9922
0
        return NULL;
9923
0
    }
9924
7.30M
    tmp = PyMem_Malloc(sizeof(Py_UCS4) * 3 * length);
9925
7.30M
    if (tmp == NULL)
9926
0
        return PyErr_NoMemory();
9927
7.30M
    newlength = perform(kind, data, length, tmp, &maxchar);
9928
7.30M
    res = PyUnicode_New(newlength, maxchar);
9929
7.30M
    if (res == NULL)
9930
0
        goto leave;
9931
7.30M
    tmpend = tmp + newlength;
9932
7.30M
    outdata = PyUnicode_DATA(res);
9933
7.30M
    outkind = PyUnicode_KIND(res);
9934
7.30M
    switch (outkind) {
9935
221k
    case PyUnicode_1BYTE_KIND:
9936
221k
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9937
221k
        break;
9938
7.04M
    case PyUnicode_2BYTE_KIND:
9939
7.04M
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9940
7.04M
        break;
9941
42.2k
    case PyUnicode_4BYTE_KIND:
9942
42.2k
        memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9943
42.2k
        break;
9944
0
    default:
9945
0
        Py_UNREACHABLE();
9946
7.30M
    }
9947
7.30M
  leave:
9948
7.30M
    PyMem_Free(tmp);
9949
7.30M
    return res;
9950
7.30M
}
9951
9952
/* Join the items of `seq` with `separator`.

   `seq` is first converted with PySequence_Fast() (error message
   "can only join an iterable"); NULL is returned if that fails.  The item
   array is then handed to _PyUnicode_JoinArray() while the
   Py_BEGIN_CRITICAL_SECTION_SEQUENCE_FAST / Py_END_... pair is active for
   `seq`.  The temporary fast sequence is released before returning the
   result of _PyUnicode_JoinArray(). */
PyObject *
PyUnicode_Join(PyObject *separator, PyObject *seq)
{
    PyObject *joined;
    PyObject *fast = PySequence_Fast(seq, "can only join an iterable");

    if (fast == NULL) {
        return NULL;
    }

    Py_BEGIN_CRITICAL_SECTION_SEQUENCE_FAST(seq);

    joined = _PyUnicode_JoinArray(separator,
                                  PySequence_Fast_ITEMS(fast),
                                  PySequence_Fast_GET_SIZE(fast));

    Py_END_CRITICAL_SECTION_SEQUENCE_FAST();

    Py_DECREF(fast);
    return joined;
}
9976
9977
PyObject *
9978
_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
9979
76.0M
{
9980
76.0M
    PyObject *res = NULL; /* the result */
9981
76.0M
    PyObject *sep = NULL;
9982
76.0M
    Py_ssize_t seplen;
9983
76.0M
    PyObject *item;
9984
76.0M
    Py_ssize_t sz, i, res_offset;
9985
76.0M
    Py_UCS4 maxchar;
9986
76.0M
    Py_UCS4 item_maxchar;
9987
76.0M
    int use_memcpy;
9988
76.0M
    unsigned char *res_data = NULL, *sep_data = NULL;
9989
76.0M
    PyObject *last_obj;
9990
76.0M
    int kind = 0;
9991
9992
    /* If empty sequence, return u"". */
9993
76.0M
    if (seqlen == 0) {
9994
5.98M
        _Py_RETURN_UNICODE_EMPTY();
9995
5.98M
    }
9996
9997
    /* If singleton sequence with an exact Unicode, return that. */
9998
70.0M
    last_obj = NULL;
9999
70.0M
    if (seqlen == 1) {
10000
6.98M
        if (PyUnicode_CheckExact(items[0])) {
10001
5.39M
            res = items[0];
10002
5.39M
            return Py_NewRef(res);
10003
5.39M
        }
10004
1.59M
        seplen = 0;
10005
1.59M
        maxchar = 0;
10006
1.59M
    }
10007
63.0M
    else {
10008
        /* Set up sep and seplen */
10009
63.0M
        if (separator == NULL) {
10010
            /* fall back to a blank space separator */
10011
0
            sep = PyUnicode_FromOrdinal(' ');
10012
0
            if (!sep)
10013
0
                goto onError;
10014
0
            seplen = 1;
10015
0
            maxchar = 32;
10016
0
        }
10017
63.0M
        else {
10018
63.0M
            if (!PyUnicode_Check(separator)) {
10019
0
                PyErr_Format(PyExc_TypeError,
10020
0
                             "separator: expected str instance,"
10021
0
                             " %.80s found",
10022
0
                             Py_TYPE(separator)->tp_name);
10023
0
                goto onError;
10024
0
            }
10025
63.0M
            sep = separator;
10026
63.0M
            seplen = PyUnicode_GET_LENGTH(separator);
10027
63.0M
            maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10028
            /* inc refcount to keep this code path symmetric with the
10029
               above case of a blank separator */
10030
63.0M
            Py_INCREF(sep);
10031
63.0M
        }
10032
63.0M
        last_obj = sep;
10033
63.0M
    }
10034
10035
    /* There are at least two things to join, or else we have a subclass
10036
     * of str in the sequence.
10037
     * Do a pre-pass to figure out the total amount of space we'll
10038
     * need (sz), and see whether all argument are strings.
10039
     */
10040
64.6M
    sz = 0;
10041
#ifdef Py_DEBUG
10042
    use_memcpy = 0;
10043
#else
10044
64.6M
    use_memcpy = 1;
10045
64.6M
#endif
10046
469M
    for (i = 0; i < seqlen; i++) {
10047
404M
        size_t add_sz;
10048
404M
        item = items[i];
10049
404M
        if (!PyUnicode_Check(item)) {
10050
0
            PyErr_Format(PyExc_TypeError,
10051
0
                         "sequence item %zd: expected str instance,"
10052
0
                         " %.80s found",
10053
0
                         i, Py_TYPE(item)->tp_name);
10054
0
            goto onError;
10055
0
        }
10056
404M
        add_sz = PyUnicode_GET_LENGTH(item);
10057
404M
        item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
10058
404M
        maxchar = Py_MAX(maxchar, item_maxchar);
10059
404M
        if (i != 0) {
10060
340M
            add_sz += seplen;
10061
340M
        }
10062
404M
        if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
10063
0
            PyErr_SetString(PyExc_OverflowError,
10064
0
                            "join() result is too long for a Python string");
10065
0
            goto onError;
10066
0
        }
10067
404M
        sz += add_sz;
10068
404M
        if (use_memcpy && last_obj != NULL) {
10069
338M
            if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10070
6.05M
                use_memcpy = 0;
10071
338M
        }
10072
404M
        last_obj = item;
10073
404M
    }
10074
10075
64.6M
    res = PyUnicode_New(sz, maxchar);
10076
64.6M
    if (res == NULL)
10077
0
        goto onError;
10078
10079
    /* Catenate everything. */
10080
#ifdef Py_DEBUG
10081
    use_memcpy = 0;
10082
#else
10083
64.6M
    if (use_memcpy) {
10084
58.5M
        res_data = PyUnicode_1BYTE_DATA(res);
10085
58.5M
        kind = PyUnicode_KIND(res);
10086
58.5M
        if (seplen != 0)
10087
16.9k
            sep_data = PyUnicode_1BYTE_DATA(sep);
10088
58.5M
    }
10089
64.6M
#endif
10090
64.6M
    if (use_memcpy) {
10091
366M
        for (i = 0; i < seqlen; ++i) {
10092
307M
            Py_ssize_t itemlen;
10093
307M
            item = items[i];
10094
10095
            /* Copy item, and maybe the separator. */
10096
307M
            if (i && seplen != 0) {
10097
23.3k
                memcpy(res_data,
10098
23.3k
                          sep_data,
10099
23.3k
                          kind * seplen);
10100
23.3k
                res_data += kind * seplen;
10101
23.3k
            }
10102
10103
307M
            itemlen = PyUnicode_GET_LENGTH(item);
10104
307M
            if (itemlen != 0) {
10105
265M
                memcpy(res_data,
10106
265M
                          PyUnicode_DATA(item),
10107
265M
                          kind * itemlen);
10108
265M
                res_data += kind * itemlen;
10109
265M
            }
10110
307M
        }
10111
58.5M
        assert(res_data == PyUnicode_1BYTE_DATA(res)
10112
58.5M
                           + kind * PyUnicode_GET_LENGTH(res));
10113
58.5M
    }
10114
6.05M
    else {
10115
103M
        for (i = 0, res_offset = 0; i < seqlen; ++i) {
10116
97.2M
            Py_ssize_t itemlen;
10117
97.2M
            item = items[i];
10118
10119
            /* Copy item, and maybe the separator. */
10120
97.2M
            if (i && seplen != 0) {
10121
60.8k
                _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10122
60.8k
                res_offset += seplen;
10123
60.8k
            }
10124
10125
97.2M
            itemlen = PyUnicode_GET_LENGTH(item);
10126
97.2M
            if (itemlen != 0) {
10127
94.7M
                _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
10128
94.7M
                res_offset += itemlen;
10129
94.7M
            }
10130
97.2M
        }
10131
6.05M
        assert(res_offset == PyUnicode_GET_LENGTH(res));
10132
6.05M
    }
10133
10134
64.6M
    Py_XDECREF(sep);
10135
64.6M
    assert(_PyUnicode_CheckConsistency(res, 1));
10136
64.6M
    return res;
10137
10138
0
  onError:
10139
0
    Py_XDECREF(sep);
10140
0
    Py_XDECREF(res);
10141
0
    return NULL;
10142
64.6M
}
10143
10144
void
10145
_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10146
                    Py_UCS4 fill_char)
10147
644
{
10148
644
    const int kind = PyUnicode_KIND(unicode);
10149
644
    void *data = PyUnicode_DATA(unicode);
10150
644
    assert(_PyUnicode_IsModifiable(unicode));
10151
644
    assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10152
644
    assert(start >= 0);
10153
644
    assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10154
644
    _PyUnicode_Fill(kind, data, fill_char, start, length);
10155
644
}
10156
10157
Py_ssize_t
10158
PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10159
               Py_UCS4 fill_char)
10160
644
{
10161
644
    Py_ssize_t maxlen;
10162
10163
644
    if (!PyUnicode_Check(unicode)) {
10164
0
        PyErr_BadInternalCall();
10165
0
        return -1;
10166
0
    }
10167
644
    if (unicode_check_modifiable(unicode))
10168
0
        return -1;
10169
10170
644
    if (start < 0) {
10171
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
10172
0
        return -1;
10173
0
    }
10174
644
    if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10175
0
        PyErr_SetString(PyExc_ValueError,
10176
0
                         "fill character is bigger than "
10177
0
                         "the string maximum character");
10178
0
        return -1;
10179
0
    }
10180
10181
644
    maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10182
644
    length = Py_MIN(maxlen, length);
10183
644
    if (length <= 0)
10184
0
        return 0;
10185
10186
644
    _PyUnicode_FastFill(unicode, start, length, fill_char);
10187
644
    return length;
10188
644
}
10189
10190
static PyObject *
10191
pad(PyObject *self,
10192
    Py_ssize_t left,
10193
    Py_ssize_t right,
10194
    Py_UCS4 fill)
10195
0
{
10196
0
    PyObject *u;
10197
0
    Py_UCS4 maxchar;
10198
0
    int kind;
10199
0
    void *data;
10200
10201
0
    if (left < 0)
10202
0
        left = 0;
10203
0
    if (right < 0)
10204
0
        right = 0;
10205
10206
0
    if (left == 0 && right == 0)
10207
0
        return unicode_result_unchanged(self);
10208
10209
0
    if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10210
0
        right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
10211
0
        PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10212
0
        return NULL;
10213
0
    }
10214
0
    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10215
0
    maxchar = Py_MAX(maxchar, fill);
10216
0
    u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
10217
0
    if (!u)
10218
0
        return NULL;
10219
10220
0
    kind = PyUnicode_KIND(u);
10221
0
    data = PyUnicode_DATA(u);
10222
0
    if (left)
10223
0
        _PyUnicode_Fill(kind, data, fill, 0, left);
10224
0
    if (right)
10225
0
        _PyUnicode_Fill(kind, data, fill,
10226
0
                        left + _PyUnicode_LENGTH(self), right);
10227
0
    _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
10228
0
    assert(_PyUnicode_CheckConsistency(u, 1));
10229
0
    return u;
10230
0
}
10231
10232
/* Split `string` into lines by dispatching to the stringlib *_splitlines()
   helper that matches its storage kind (the asciilib variant for ASCII-only
   1-byte strings).  `keepends` is passed through unchanged.

   Returns the helper's result, or NULL if ensure_unicode() fails. */
PyObject *
PyUnicode_Splitlines(PyObject *string, int keepends)
{
    PyObject *lines;
    Py_ssize_t nchars;

    if (ensure_unicode(string) < 0) {
        return NULL;
    }

    nchars = PyUnicode_GET_LENGTH(string);
    switch (PyUnicode_KIND(string)) {
    case PyUnicode_1BYTE_KIND:
        lines = PyUnicode_IS_ASCII(string)
            ? asciilib_splitlines(string, PyUnicode_1BYTE_DATA(string),
                                  nchars, keepends)
            : ucs1lib_splitlines(string, PyUnicode_1BYTE_DATA(string),
                                 nchars, keepends);
        break;
    case PyUnicode_2BYTE_KIND:
        lines = ucs2lib_splitlines(string, PyUnicode_2BYTE_DATA(string),
                                   nchars, keepends);
        break;
    case PyUnicode_4BYTE_KIND:
        lines = ucs4lib_splitlines(string, PyUnicode_4BYTE_DATA(string),
                                   nchars, keepends);
        break;
    default:
        Py_UNREACHABLE();
    }
    return lines;
}
10266
10267
static PyObject *
10268
split(PyObject *self,
10269
      PyObject *substring,
10270
      Py_ssize_t maxcount)
10271
22.9M
{
10272
22.9M
    int kind1, kind2;
10273
22.9M
    const void *buf1, *buf2;
10274
22.9M
    Py_ssize_t len1, len2;
10275
22.9M
    PyObject* out;
10276
22.9M
    len1 = PyUnicode_GET_LENGTH(self);
10277
22.9M
    kind1 = PyUnicode_KIND(self);
10278
10279
22.9M
    if (substring == NULL) {
10280
159k
        if (maxcount < 0) {
10281
134k
            maxcount = (len1 - 1) / 2 + 1;
10282
134k
        }
10283
159k
        switch (kind1) {
10284
98.3k
        case PyUnicode_1BYTE_KIND:
10285
98.3k
            if (PyUnicode_IS_ASCII(self))
10286
72.9k
                return asciilib_split_whitespace(
10287
72.9k
                    self,  PyUnicode_1BYTE_DATA(self),
10288
72.9k
                    len1, maxcount
10289
72.9k
                    );
10290
25.4k
            else
10291
25.4k
                return ucs1lib_split_whitespace(
10292
25.4k
                    self,  PyUnicode_1BYTE_DATA(self),
10293
25.4k
                    len1, maxcount
10294
25.4k
                    );
10295
48.4k
        case PyUnicode_2BYTE_KIND:
10296
48.4k
            return ucs2lib_split_whitespace(
10297
48.4k
                self,  PyUnicode_2BYTE_DATA(self),
10298
48.4k
                len1, maxcount
10299
48.4k
                );
10300
13.1k
        case PyUnicode_4BYTE_KIND:
10301
13.1k
            return ucs4lib_split_whitespace(
10302
13.1k
                self,  PyUnicode_4BYTE_DATA(self),
10303
13.1k
                len1, maxcount
10304
13.1k
                );
10305
0
        default:
10306
0
            Py_UNREACHABLE();
10307
159k
        }
10308
159k
    }
10309
10310
22.7M
    kind2 = PyUnicode_KIND(substring);
10311
22.7M
    len2 = PyUnicode_GET_LENGTH(substring);
10312
22.7M
    if (maxcount < 0) {
10313
        // if len2 == 0, it will raise ValueError.
10314
13.3M
        maxcount = len2 == 0 ? 0 : (len1 / len2) + 1;
10315
        // handle expected overflow case: (Py_SSIZE_T_MAX / 1) + 1
10316
13.3M
        maxcount = maxcount < 0 ? len1 : maxcount;
10317
13.3M
    }
10318
22.7M
    if (kind1 < kind2 || len1 < len2) {
10319
4.41M
        out = PyList_New(1);
10320
4.41M
        if (out == NULL)
10321
0
            return NULL;
10322
4.41M
        PyList_SET_ITEM(out, 0, Py_NewRef(self));
10323
4.41M
        return out;
10324
4.41M
    }
10325
18.3M
    buf1 = PyUnicode_DATA(self);
10326
18.3M
    buf2 = PyUnicode_DATA(substring);
10327
18.3M
    if (kind2 != kind1) {
10328
216k
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
10329
216k
        if (!buf2)
10330
0
            return NULL;
10331
216k
    }
10332
10333
18.3M
    switch (kind1) {
10334
18.1M
    case PyUnicode_1BYTE_KIND:
10335
18.1M
        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10336
16.9M
            out = asciilib_split(
10337
16.9M
                self,  buf1, len1, buf2, len2, maxcount);
10338
1.22M
        else
10339
1.22M
            out = ucs1lib_split(
10340
1.22M
                self,  buf1, len1, buf2, len2, maxcount);
10341
18.1M
        break;
10342
175k
    case PyUnicode_2BYTE_KIND:
10343
175k
        out = ucs2lib_split(
10344
175k
            self,  buf1, len1, buf2, len2, maxcount);
10345
175k
        break;
10346
41.4k
    case PyUnicode_4BYTE_KIND:
10347
41.4k
        out = ucs4lib_split(
10348
41.4k
            self,  buf1, len1, buf2, len2, maxcount);
10349
41.4k
        break;
10350
0
    default:
10351
0
        out = NULL;
10352
18.3M
    }
10353
18.3M
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10354
18.3M
    if (kind2 != kind1)
10355
216k
        PyMem_Free((void *)buf2);
10356
18.3M
    return out;
10357
18.3M
}
10358
10359
static PyObject *
10360
rsplit(PyObject *self,
10361
       PyObject *substring,
10362
       Py_ssize_t maxcount)
10363
50
{
10364
50
    int kind1, kind2;
10365
50
    const void *buf1, *buf2;
10366
50
    Py_ssize_t len1, len2;
10367
50
    PyObject* out;
10368
10369
50
    len1 = PyUnicode_GET_LENGTH(self);
10370
50
    kind1 = PyUnicode_KIND(self);
10371
10372
50
    if (substring == NULL) {
10373
0
        if (maxcount < 0) {
10374
0
            maxcount = (len1 - 1) / 2 + 1;
10375
0
        }
10376
0
        switch (kind1) {
10377
0
        case PyUnicode_1BYTE_KIND:
10378
0
            if (PyUnicode_IS_ASCII(self))
10379
0
                return asciilib_rsplit_whitespace(
10380
0
                    self,  PyUnicode_1BYTE_DATA(self),
10381
0
                    len1, maxcount
10382
0
                    );
10383
0
            else
10384
0
                return ucs1lib_rsplit_whitespace(
10385
0
                    self,  PyUnicode_1BYTE_DATA(self),
10386
0
                    len1, maxcount
10387
0
                    );
10388
0
        case PyUnicode_2BYTE_KIND:
10389
0
            return ucs2lib_rsplit_whitespace(
10390
0
                self,  PyUnicode_2BYTE_DATA(self),
10391
0
                len1, maxcount
10392
0
                );
10393
0
        case PyUnicode_4BYTE_KIND:
10394
0
            return ucs4lib_rsplit_whitespace(
10395
0
                self,  PyUnicode_4BYTE_DATA(self),
10396
0
                len1, maxcount
10397
0
                );
10398
0
        default:
10399
0
            Py_UNREACHABLE();
10400
0
        }
10401
0
    }
10402
50
    kind2 = PyUnicode_KIND(substring);
10403
50
    len2 = PyUnicode_GET_LENGTH(substring);
10404
50
    if (maxcount < 0) {
10405
        // if len2 == 0, it will raise ValueError.
10406
0
        maxcount = len2 == 0 ? 0 : (len1 / len2) + 1;
10407
        // handle expected overflow case: (Py_SSIZE_T_MAX / 1) + 1
10408
0
        maxcount = maxcount < 0 ? len1 : maxcount;
10409
0
    }
10410
50
    if (kind1 < kind2 || len1 < len2) {
10411
0
        out = PyList_New(1);
10412
0
        if (out == NULL)
10413
0
            return NULL;
10414
0
        PyList_SET_ITEM(out, 0, Py_NewRef(self));
10415
0
        return out;
10416
0
    }
10417
50
    buf1 = PyUnicode_DATA(self);
10418
50
    buf2 = PyUnicode_DATA(substring);
10419
50
    if (kind2 != kind1) {
10420
0
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
10421
0
        if (!buf2)
10422
0
            return NULL;
10423
0
    }
10424
10425
50
    switch (kind1) {
10426
50
    case PyUnicode_1BYTE_KIND:
10427
50
        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10428
50
            out = asciilib_rsplit(
10429
50
                self,  buf1, len1, buf2, len2, maxcount);
10430
0
        else
10431
0
            out = ucs1lib_rsplit(
10432
0
                self,  buf1, len1, buf2, len2, maxcount);
10433
50
        break;
10434
0
    case PyUnicode_2BYTE_KIND:
10435
0
        out = ucs2lib_rsplit(
10436
0
            self,  buf1, len1, buf2, len2, maxcount);
10437
0
        break;
10438
0
    case PyUnicode_4BYTE_KIND:
10439
0
        out = ucs4lib_rsplit(
10440
0
            self,  buf1, len1, buf2, len2, maxcount);
10441
0
        break;
10442
0
    default:
10443
0
        out = NULL;
10444
50
    }
10445
50
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10446
50
    if (kind2 != kind1)
10447
0
        PyMem_Free((void *)buf2);
10448
50
    return out;
10449
50
}
10450
10451
static Py_ssize_t
10452
anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10453
            PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
10454
152M
{
10455
152M
    switch (kind) {
10456
25.6M
    case PyUnicode_1BYTE_KIND:
10457
25.6M
        if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10458
21.5M
            return asciilib_find(buf1, len1, buf2, len2, offset);
10459
4.03M
        else
10460
4.03M
            return ucs1lib_find(buf1, len1, buf2, len2, offset);
10461
59.6M
    case PyUnicode_2BYTE_KIND:
10462
59.6M
        return ucs2lib_find(buf1, len1, buf2, len2, offset);
10463
67.3M
    case PyUnicode_4BYTE_KIND:
10464
67.3M
        return ucs4lib_find(buf1, len1, buf2, len2, offset);
10465
152M
    }
10466
152M
    Py_UNREACHABLE();
10467
152M
}
10468
10469
static Py_ssize_t
10470
anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10471
             PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
10472
50.6M
{
10473
50.6M
    switch (kind) {
10474
44.0M
    case PyUnicode_1BYTE_KIND:
10475
44.0M
        return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10476
6.50M
    case PyUnicode_2BYTE_KIND:
10477
6.50M
        return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10478
132k
    case PyUnicode_4BYTE_KIND:
10479
132k
        return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10480
50.6M
    }
10481
50.6M
    Py_UNREACHABLE();
10482
50.6M
}
10483
10484
static void
10485
replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10486
                      Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10487
2.13M
{
10488
2.13M
    int kind = PyUnicode_KIND(u);
10489
2.13M
    void *data = PyUnicode_DATA(u);
10490
2.13M
    Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10491
2.13M
    if (kind == PyUnicode_1BYTE_KIND) {
10492
939k
        ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10493
939k
                                      (Py_UCS1 *)data + len,
10494
939k
                                      u1, u2, maxcount);
10495
939k
    }
10496
1.19M
    else if (kind == PyUnicode_2BYTE_KIND) {
10497
1.18M
        ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10498
1.18M
                                      (Py_UCS2 *)data + len,
10499
1.18M
                                      u1, u2, maxcount);
10500
1.18M
    }
10501
17.2k
    else {
10502
17.2k
        assert(kind == PyUnicode_4BYTE_KIND);
10503
17.2k
        ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10504
17.2k
                                      (Py_UCS4 *)data + len,
10505
17.2k
                                      u1, u2, maxcount);
10506
17.2k
    }
10507
2.13M
}
10508
10509
static PyObject *
10510
replace(PyObject *self, PyObject *str1,
10511
        PyObject *str2, Py_ssize_t maxcount)
10512
97.3M
{
10513
97.3M
    PyObject *u;
10514
97.3M
    const char *sbuf = PyUnicode_DATA(self);
10515
97.3M
    const void *buf1 = PyUnicode_DATA(str1);
10516
97.3M
    const void *buf2 = PyUnicode_DATA(str2);
10517
97.3M
    int srelease = 0, release1 = 0, release2 = 0;
10518
97.3M
    int skind = PyUnicode_KIND(self);
10519
97.3M
    int kind1 = PyUnicode_KIND(str1);
10520
97.3M
    int kind2 = PyUnicode_KIND(str2);
10521
97.3M
    Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10522
97.3M
    Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10523
97.3M
    Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
10524
97.3M
    int mayshrink;
10525
97.3M
    Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
10526
10527
97.3M
    if (slen < len1)
10528
37.2M
        goto nothing;
10529
10530
60.0M
    if (maxcount < 0)
10531
60.0M
        maxcount = PY_SSIZE_T_MAX;
10532
0
    else if (maxcount == 0)
10533
0
        goto nothing;
10534
10535
60.0M
    if (str1 == str2)
10536
0
        goto nothing;
10537
10538
60.0M
    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10539
60.0M
    maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10540
60.0M
    if (maxchar < maxchar_str1)
10541
        /* substring too wide to be present */
10542
0
        goto nothing;
10543
60.0M
    maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10544
    /* Replacing str1 with str2 may cause a maxchar reduction in the
10545
       result string. */
10546
60.0M
    mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
10547
60.0M
    maxchar = Py_MAX(maxchar, maxchar_str2);
10548
10549
60.0M
    if (len1 == len2) {
10550
        /* same length */
10551
9.38M
        if (len1 == 0)
10552
0
            goto nothing;
10553
9.38M
        if (len1 == 1) {
10554
            /* replace characters */
10555
9.38M
            Py_UCS4 u1, u2;
10556
9.38M
            Py_ssize_t pos;
10557
10558
9.38M
            u1 = PyUnicode_READ(kind1, buf1, 0);
10559
9.38M
            pos = findchar(sbuf, skind, slen, u1, 1);
10560
9.38M
            if (pos < 0)
10561
7.25M
                goto nothing;
10562
2.13M
            u2 = PyUnicode_READ(kind2, buf2, 0);
10563
2.13M
            u = PyUnicode_New(slen, maxchar);
10564
2.13M
            if (!u)
10565
0
                goto error;
10566
10567
2.13M
            _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10568
2.13M
            replace_1char_inplace(u, pos, u1, u2, maxcount);
10569
2.13M
        }
10570
0
        else {
10571
0
            int rkind = skind;
10572
0
            char *res;
10573
0
            Py_ssize_t i;
10574
10575
0
            if (kind1 < rkind) {
10576
                /* widen substring */
10577
0
                buf1 = unicode_askind(kind1, buf1, len1, rkind);
10578
0
                if (!buf1) goto error;
10579
0
                release1 = 1;
10580
0
            }
10581
0
            i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
10582
0
            if (i < 0)
10583
0
                goto nothing;
10584
0
            if (rkind > kind2) {
10585
                /* widen replacement */
10586
0
                buf2 = unicode_askind(kind2, buf2, len2, rkind);
10587
0
                if (!buf2) goto error;
10588
0
                release2 = 1;
10589
0
            }
10590
0
            else if (rkind < kind2) {
10591
                /* widen self and buf1 */
10592
0
                rkind = kind2;
10593
0
                if (release1) {
10594
0
                    assert(buf1 != PyUnicode_DATA(str1));
10595
0
                    PyMem_Free((void *)buf1);
10596
0
                    buf1 = PyUnicode_DATA(str1);
10597
0
                    release1 = 0;
10598
0
                }
10599
0
                sbuf = unicode_askind(skind, sbuf, slen, rkind);
10600
0
                if (!sbuf) goto error;
10601
0
                srelease = 1;
10602
0
                buf1 = unicode_askind(kind1, buf1, len1, rkind);
10603
0
                if (!buf1) goto error;
10604
0
                release1 = 1;
10605
0
            }
10606
0
            u = PyUnicode_New(slen, maxchar);
10607
0
            if (!u)
10608
0
                goto error;
10609
0
            assert(PyUnicode_KIND(u) == rkind);
10610
0
            res = PyUnicode_DATA(u);
10611
10612
0
            memcpy(res, sbuf, rkind * slen);
10613
            /* change everything in-place, starting with this one */
10614
0
            memcpy(res + rkind * i,
10615
0
                   buf2,
10616
0
                   rkind * len2);
10617
0
            i += len1;
10618
10619
0
            while ( --maxcount > 0) {
10620
0
                i = anylib_find(rkind, self,
10621
0
                                sbuf+rkind*i, slen-i,
10622
0
                                str1, buf1, len1, i);
10623
0
                if (i == -1)
10624
0
                    break;
10625
0
                memcpy(res + rkind * i,
10626
0
                       buf2,
10627
0
                       rkind * len2);
10628
0
                i += len1;
10629
0
            }
10630
0
        }
10631
9.38M
    }
10632
50.6M
    else {
10633
50.6M
        Py_ssize_t n, i, j, ires;
10634
50.6M
        Py_ssize_t new_size;
10635
50.6M
        int rkind = skind;
10636
50.6M
        char *res;
10637
10638
50.6M
        if (kind1 < rkind) {
10639
            /* widen substring */
10640
6.64M
            buf1 = unicode_askind(kind1, buf1, len1, rkind);
10641
6.64M
            if (!buf1) goto error;
10642
6.64M
            release1 = 1;
10643
6.64M
        }
10644
50.6M
        n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
10645
50.6M
        if (n == 0)
10646
44.6M
            goto nothing;
10647
5.97M
        if (kind2 < rkind) {
10648
            /* widen replacement */
10649
1.39M
            buf2 = unicode_askind(kind2, buf2, len2, rkind);
10650
1.39M
            if (!buf2) goto error;
10651
1.39M
            release2 = 1;
10652
1.39M
        }
10653
4.57M
        else if (kind2 > rkind) {
10654
            /* widen self and buf1 */
10655
0
            rkind = kind2;
10656
0
            sbuf = unicode_askind(skind, sbuf, slen, rkind);
10657
0
            if (!sbuf) goto error;
10658
0
            srelease = 1;
10659
0
            if (release1) {
10660
0
                assert(buf1 != PyUnicode_DATA(str1));
10661
0
                PyMem_Free((void *)buf1);
10662
0
                buf1 = PyUnicode_DATA(str1);
10663
0
                release1 = 0;
10664
0
            }
10665
0
            buf1 = unicode_askind(kind1, buf1, len1, rkind);
10666
0
            if (!buf1) goto error;
10667
0
            release1 = 1;
10668
0
        }
10669
        /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10670
           PyUnicode_GET_LENGTH(str1)); */
10671
5.97M
        if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
10672
0
                PyErr_SetString(PyExc_OverflowError,
10673
0
                                "replace string is too long");
10674
0
                goto error;
10675
0
        }
10676
5.97M
        new_size = slen + n * (len2 - len1);
10677
5.97M
        if (new_size == 0) {
10678
0
            u = _PyUnicode_GetEmpty();
10679
0
            goto done;
10680
0
        }
10681
5.97M
        if (new_size > (PY_SSIZE_T_MAX / rkind)) {
10682
0
            PyErr_SetString(PyExc_OverflowError,
10683
0
                            "replace string is too long");
10684
0
            goto error;
10685
0
        }
10686
5.97M
        u = PyUnicode_New(new_size, maxchar);
10687
5.97M
        if (!u)
10688
0
            goto error;
10689
5.97M
        assert(PyUnicode_KIND(u) == rkind);
10690
5.97M
        res = PyUnicode_DATA(u);
10691
5.97M
        ires = i = 0;
10692
5.97M
        if (len1 > 0) {
10693
158M
            while (n-- > 0) {
10694
                /* look for next match */
10695
152M
                j = anylib_find(rkind, self,
10696
152M
                                sbuf + rkind * i, slen-i,
10697
152M
                                str1, buf1, len1, i);
10698
152M
                if (j == -1)
10699
0
                    break;
10700
152M
                else if (j > i) {
10701
                    /* copy unchanged part [i:j] */
10702
28.1M
                    memcpy(res + rkind * ires,
10703
28.1M
                           sbuf + rkind * i,
10704
28.1M
                           rkind * (j-i));
10705
28.1M
                    ires += j - i;
10706
28.1M
                }
10707
                /* copy substitution string */
10708
152M
                if (len2 > 0) {
10709
152M
                    memcpy(res + rkind * ires,
10710
152M
                           buf2,
10711
152M
                           rkind * len2);
10712
152M
                    ires += len2;
10713
152M
                }
10714
152M
                i = j + len1;
10715
152M
            }
10716
5.97M
            if (i < slen)
10717
                /* copy tail [i:] */
10718
5.78M
                memcpy(res + rkind * ires,
10719
5.78M
                       sbuf + rkind * i,
10720
5.78M
                       rkind * (slen-i));
10721
5.97M
        }
10722
0
        else {
10723
            /* interleave */
10724
0
            while (n > 0) {
10725
0
                memcpy(res + rkind * ires,
10726
0
                       buf2,
10727
0
                       rkind * len2);
10728
0
                ires += len2;
10729
0
                if (--n <= 0)
10730
0
                    break;
10731
0
                memcpy(res + rkind * ires,
10732
0
                       sbuf + rkind * i,
10733
0
                       rkind);
10734
0
                ires++;
10735
0
                i++;
10736
0
            }
10737
0
            memcpy(res + rkind * ires,
10738
0
                   sbuf + rkind * i,
10739
0
                   rkind * (slen-i));
10740
0
        }
10741
5.97M
    }
10742
10743
8.10M
    if (mayshrink) {
10744
0
        unicode_adjust_maxchar(&u);
10745
0
        if (u == NULL)
10746
0
            goto error;
10747
0
    }
10748
10749
8.10M
  done:
10750
8.10M
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
10751
8.10M
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10752
8.10M
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10753
8.10M
    if (srelease)
10754
0
        PyMem_Free((void *)sbuf);
10755
8.10M
    if (release1)
10756
1.39M
        PyMem_Free((void *)buf1);
10757
8.10M
    if (release2)
10758
1.39M
        PyMem_Free((void *)buf2);
10759
8.10M
    assert(_PyUnicode_CheckConsistency(u, 1));
10760
8.10M
    return u;
10761
10762
89.1M
  nothing:
10763
    /* nothing to replace; return original string (when possible) */
10764
89.1M
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
10765
89.1M
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10766
89.1M
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10767
89.1M
    if (srelease)
10768
0
        PyMem_Free((void *)sbuf);
10769
89.1M
    if (release1)
10770
5.24M
        PyMem_Free((void *)buf1);
10771
89.1M
    if (release2)
10772
0
        PyMem_Free((void *)buf2);
10773
89.1M
    return unicode_result_unchanged(self);
10774
10775
0
  error:
10776
0
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
10777
0
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10778
0
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10779
0
    if (srelease)
10780
0
        PyMem_Free((void *)sbuf);
10781
0
    if (release1)
10782
0
        PyMem_Free((void *)buf1);
10783
0
    if (release2)
10784
0
        PyMem_Free((void *)buf2);
10785
0
    return NULL;
10786
8.10M
}
10787
10788
/* --- Unicode Object Methods --------------------------------------------- */
10789
10790
/*[clinic input]
10791
@permit_long_docstring_body
10792
str.title as unicode_title
10793
10794
Return a version of the string where each word is titlecased.
10795
10796
More specifically, words start with uppercased characters and all remaining
10797
cased characters have lower case.
10798
[clinic start generated code]*/
10799
10800
static PyObject *
unicode_title_impl(PyObject *self)
/*[clinic end generated code: output=c75ae03809574902 input=533ce0eb6a7f5d1b]*/
{
    /* Title-casing is delegated entirely to case_operation() with the
       do_title callback; its result (possibly NULL) is returned as is. */
    PyObject *titled = case_operation(self, do_title);
    return titled;
}
10806
10807
/*[clinic input]
10808
@permit_long_docstring_body
10809
str.capitalize as unicode_capitalize
10810
10811
Return a capitalized version of the string.
10812
10813
More specifically, make the first character have upper case and the rest lower
10814
case.
10815
[clinic start generated code]*/
10816
10817
static PyObject *
unicode_capitalize_impl(PyObject *self)
/*[clinic end generated code: output=e49a4c333cdb7667 input=a4a15ade41f6f9e9]*/
{
    /* Only a non-empty string goes through case_operation(); an empty one
       is returned via unicode_result_unchanged(). */
    if (PyUnicode_GET_LENGTH(self) != 0) {
        return case_operation(self, do_capitalize);
    }
    return unicode_result_unchanged(self);
}
10825
10826
/*[clinic input]
10827
str.casefold as unicode_casefold
10828
10829
Return a version of the string suitable for caseless comparisons.
10830
[clinic start generated code]*/
10831
10832
static PyObject *
unicode_casefold_impl(PyObject *self)
/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
{
    /* Non-ASCII strings need the generic case machinery. */
    if (!PyUnicode_IS_ASCII(self)) {
        return case_operation(self, do_casefold);
    }
    /* ASCII strings take the ascii_upper_or_lower() shortcut
       (NOTE(review): the flag 1 presumably selects lowercasing -- confirm). */
    return ascii_upper_or_lower(self, 1);
}
10840
10841
10842
/* Argument converter. Accepts a single Unicode character. */
10843
10844
static int
10845
convert_uc(PyObject *obj, void *addr)
10846
0
{
10847
0
    Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
10848
10849
0
    if (!PyUnicode_Check(obj)) {
10850
0
        PyErr_Format(PyExc_TypeError,
10851
0
                     "The fill character must be a unicode character, "
10852
0
                     "not %.100s", Py_TYPE(obj)->tp_name);
10853
0
        return 0;
10854
0
    }
10855
0
    if (PyUnicode_GET_LENGTH(obj) != 1) {
10856
0
        PyErr_SetString(PyExc_TypeError,
10857
0
                        "The fill character must be exactly one character long");
10858
0
        return 0;
10859
0
    }
10860
0
    *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
10861
0
    return 1;
10862
0
}
10863
10864
/*[clinic input]
10865
str.center as unicode_center
10866
10867
    width: Py_ssize_t
10868
    fillchar: Py_UCS4 = ' '
10869
    /
10870
10871
Return a centered string of length width.
10872
10873
Padding is done using the specified fill character (default is a space).
10874
[clinic start generated code]*/
10875
10876
static PyObject *
10877
unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10878
/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
10879
0
{
10880
0
    Py_ssize_t marg, left;
10881
10882
0
    if (PyUnicode_GET_LENGTH(self) >= width)
10883
0
        return unicode_result_unchanged(self);
10884
10885
0
    marg = width - PyUnicode_GET_LENGTH(self);
10886
0
    left = marg / 2 + (marg & width & 1);
10887
10888
0
    return pad(self, left, marg - left, fillchar);
10889
0
}
10890
10891
/* This function assumes that str1 and str2 are readied by the caller. */
10892
10893
static int
10894
unicode_compare(PyObject *str1, PyObject *str2)
10895
16.6M
{
10896
16.6M
#define COMPARE(TYPE1, TYPE2) \
10897
16.6M
    do { \
10898
15.6M
        TYPE1* p1 = (TYPE1 *)data1; \
10899
15.6M
        TYPE2* p2 = (TYPE2 *)data2; \
10900
15.6M
        TYPE1* end = p1 + len; \
10901
15.6M
        Py_UCS4 c1, c2; \
10902
15.6M
        for (; p1 != end; p1++, p2++) { \
10903
15.6M
            c1 = *p1; \
10904
15.6M
            c2 = *p2; \
10905
15.6M
            if (c1 != c2) \
10906
15.6M
                return (c1 < c2) ? -1 : 1; \
10907
15.6M
        } \
10908
15.6M
    } \
10909
15.6M
    while (0)
10910
10911
16.6M
    int kind1, kind2;
10912
16.6M
    const void *data1, *data2;
10913
16.6M
    Py_ssize_t len1, len2, len;
10914
10915
16.6M
    kind1 = PyUnicode_KIND(str1);
10916
16.6M
    kind2 = PyUnicode_KIND(str2);
10917
16.6M
    data1 = PyUnicode_DATA(str1);
10918
16.6M
    data2 = PyUnicode_DATA(str2);
10919
16.6M
    len1 = PyUnicode_GET_LENGTH(str1);
10920
16.6M
    len2 = PyUnicode_GET_LENGTH(str2);
10921
16.6M
    len = Py_MIN(len1, len2);
10922
10923
16.6M
    switch(kind1) {
10924
1.44M
    case PyUnicode_1BYTE_KIND:
10925
1.44M
    {
10926
1.44M
        switch(kind2) {
10927
64.2k
        case PyUnicode_1BYTE_KIND:
10928
64.2k
        {
10929
64.2k
            int cmp = memcmp(data1, data2, len);
10930
            /* normalize result of memcmp() into the range [-1; 1] */
10931
64.2k
            if (cmp < 0)
10932
43.0k
                return -1;
10933
21.1k
            if (cmp > 0)
10934
20.6k
                return 1;
10935
548
            break;
10936
21.1k
        }
10937
1.08M
        case PyUnicode_2BYTE_KIND:
10938
1.08M
            COMPARE(Py_UCS1, Py_UCS2);
10939
0
            break;
10940
293k
        case PyUnicode_4BYTE_KIND:
10941
293k
            COMPARE(Py_UCS1, Py_UCS4);
10942
0
            break;
10943
0
        default:
10944
0
            Py_UNREACHABLE();
10945
1.44M
        }
10946
548
        break;
10947
1.44M
    }
10948
13.6M
    case PyUnicode_2BYTE_KIND:
10949
13.6M
    {
10950
13.6M
        switch(kind2) {
10951
4.22k
        case PyUnicode_1BYTE_KIND:
10952
4.22k
            COMPARE(Py_UCS2, Py_UCS1);
10953
0
            break;
10954
11.7M
        case PyUnicode_2BYTE_KIND:
10955
11.7M
        {
10956
11.7M
            COMPARE(Py_UCS2, Py_UCS2);
10957
0
            break;
10958
11.7M
        }
10959
1.90M
        case PyUnicode_4BYTE_KIND:
10960
1.90M
            COMPARE(Py_UCS2, Py_UCS4);
10961
0
            break;
10962
0
        default:
10963
0
            Py_UNREACHABLE();
10964
13.6M
        }
10965
0
        break;
10966
13.6M
    }
10967
1.51M
    case PyUnicode_4BYTE_KIND:
10968
1.51M
    {
10969
1.51M
        switch(kind2) {
10970
2.81k
        case PyUnicode_1BYTE_KIND:
10971
2.81k
            COMPARE(Py_UCS4, Py_UCS1);
10972
0
            break;
10973
569k
        case PyUnicode_2BYTE_KIND:
10974
569k
            COMPARE(Py_UCS4, Py_UCS2);
10975
0
            break;
10976
937k
        case PyUnicode_4BYTE_KIND:
10977
937k
        {
10978
937k
#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10979
937k
            int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10980
            /* normalize result of wmemcmp() into the range [-1; 1] */
10981
937k
            if (cmp < 0)
10982
464k
                return -1;
10983
472k
            if (cmp > 0)
10984
472k
                return 1;
10985
#else
10986
            COMPARE(Py_UCS4, Py_UCS4);
10987
#endif
10988
0
            break;
10989
472k
        }
10990
0
        default:
10991
0
            Py_UNREACHABLE();
10992
1.51M
        }
10993
0
        break;
10994
1.51M
    }
10995
0
    default:
10996
0
        Py_UNREACHABLE();
10997
16.6M
    }
10998
10999
548
    if (len1 == len2)
11000
545
        return 0;
11001
3
    if (len1 < len2)
11002
3
        return -1;
11003
0
    else
11004
0
        return 1;
11005
11006
3
#undef COMPARE
11007
3
}
11008
11009
11010
int
11011
_PyUnicode_Equal(PyObject *str1, PyObject *str2)
11012
264M
{
11013
264M
    assert(PyUnicode_Check(str1));
11014
264M
    assert(PyUnicode_Check(str2));
11015
264M
    if (str1 == str2) {
11016
69.8M
        return 1;
11017
69.8M
    }
11018
194M
    return unicode_eq(str1, str2);
11019
264M
}
11020
11021
11022
int
11023
PyUnicode_Equal(PyObject *str1, PyObject *str2)
11024
0
{
11025
0
    if (!PyUnicode_Check(str1)) {
11026
0
        PyErr_Format(PyExc_TypeError,
11027
0
                     "first argument must be str, not %T", str1);
11028
0
        return -1;
11029
0
    }
11030
0
    if (!PyUnicode_Check(str2)) {
11031
0
        PyErr_Format(PyExc_TypeError,
11032
0
                     "second argument must be str, not %T", str2);
11033
0
        return -1;
11034
0
    }
11035
11036
0
    return _PyUnicode_Equal(str1, str2);
11037
0
}
11038
11039
11040
int
11041
PyUnicode_Compare(PyObject *left, PyObject *right)
11042
7.14k
{
11043
7.14k
    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11044
        /* a string is equal to itself */
11045
7.14k
        if (left == right)
11046
0
            return 0;
11047
11048
7.14k
        return unicode_compare(left, right);
11049
7.14k
    }
11050
0
    PyErr_Format(PyExc_TypeError,
11051
0
                 "Can't compare %.100s and %.100s",
11052
0
                 Py_TYPE(left)->tp_name,
11053
0
                 Py_TYPE(right)->tp_name);
11054
0
    return -1;
11055
7.14k
}
11056
11057
int
11058
PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11059
1.84M
{
11060
1.84M
    Py_ssize_t i;
11061
1.84M
    int kind;
11062
1.84M
    Py_UCS4 chr;
11063
11064
1.84M
    assert(_PyUnicode_CHECK(uni));
11065
1.84M
    kind = PyUnicode_KIND(uni);
11066
1.84M
    if (kind == PyUnicode_1BYTE_KIND) {
11067
1.84M
        const void *data = PyUnicode_1BYTE_DATA(uni);
11068
1.84M
        size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
11069
1.84M
        size_t len, len2 = strlen(str);
11070
1.84M
        int cmp;
11071
11072
1.84M
        len = Py_MIN(len1, len2);
11073
1.84M
        cmp = memcmp(data, str, len);
11074
1.84M
        if (cmp != 0) {
11075
1.34M
            if (cmp < 0)
11076
8.26k
                return -1;
11077
1.34M
            else
11078
1.34M
                return 1;
11079
1.34M
        }
11080
493k
        if (len1 > len2)
11081
71
            return 1; /* uni is longer */
11082
493k
        if (len1 < len2)
11083
781
            return -1; /* str is longer */
11084
492k
        return 0;
11085
493k
    }
11086
1.43k
    else {
11087
1.43k
        const void *data = PyUnicode_DATA(uni);
11088
        /* Compare Unicode string and source character set string */
11089
2.69k
        for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
11090
2.44k
            if (chr != (unsigned char)str[i])
11091
1.18k
                return (chr < (unsigned char)(str[i])) ? -1 : 1;
11092
        /* This check keeps Python strings that end in '\0' from comparing equal
11093
         to C strings identical up to that point. */
11094
246
        if (PyUnicode_GET_LENGTH(uni) != i || chr)
11095
246
            return 1; /* uni is longer */
11096
0
        if (str[i])
11097
0
            return -1; /* str is longer */
11098
0
        return 0;
11099
0
    }
11100
1.84M
}
11101
11102
int
11103
PyUnicode_EqualToUTF8(PyObject *unicode, const char *str)
11104
18
{
11105
18
    return PyUnicode_EqualToUTF8AndSize(unicode, str, strlen(str));
11106
18
}
11107
11108
int
11109
PyUnicode_EqualToUTF8AndSize(PyObject *unicode, const char *str, Py_ssize_t size)
11110
18
{
11111
18
    assert(_PyUnicode_CHECK(unicode));
11112
18
    assert(str);
11113
11114
18
    if (PyUnicode_IS_ASCII(unicode)) {
11115
18
        Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
11116
18
        return size == len &&
11117
0
            memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11118
18
    }
11119
0
    if (PyUnicode_UTF8(unicode) != NULL) {
11120
0
        Py_ssize_t len = PyUnicode_UTF8_LENGTH(unicode);
11121
0
        return size == len &&
11122
0
            memcmp(PyUnicode_UTF8(unicode), str, len) == 0;
11123
0
    }
11124
11125
0
    Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
11126
0
    if ((size_t)len >= (size_t)size || (size_t)len < (size_t)size / 4) {
11127
0
        return 0;
11128
0
    }
11129
0
    const unsigned char *s = (const unsigned char *)str;
11130
0
    const unsigned char *ends = s + (size_t)size;
11131
0
    int kind = PyUnicode_KIND(unicode);
11132
0
    const void *data = PyUnicode_DATA(unicode);
11133
    /* Compare Unicode string and UTF-8 string */
11134
0
    for (Py_ssize_t i = 0; i < len; i++) {
11135
0
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11136
0
        if (ch < 0x80) {
11137
0
            if (ends == s || s[0] != ch) {
11138
0
                return 0;
11139
0
            }
11140
0
            s += 1;
11141
0
        }
11142
0
        else if (ch < 0x800) {
11143
0
            if ((ends - s) < 2 ||
11144
0
                s[0] != (0xc0 | (ch >> 6)) ||
11145
0
                s[1] != (0x80 | (ch & 0x3f)))
11146
0
            {
11147
0
                return 0;
11148
0
            }
11149
0
            s += 2;
11150
0
        }
11151
0
        else if (ch < 0x10000) {
11152
0
            if (Py_UNICODE_IS_SURROGATE(ch) ||
11153
0
                (ends - s) < 3 ||
11154
0
                s[0] != (0xe0 | (ch >> 12)) ||
11155
0
                s[1] != (0x80 | ((ch >> 6) & 0x3f)) ||
11156
0
                s[2] != (0x80 | (ch & 0x3f)))
11157
0
            {
11158
0
                return 0;
11159
0
            }
11160
0
            s += 3;
11161
0
        }
11162
0
        else {
11163
0
            assert(ch <= MAX_UNICODE);
11164
0
            if ((ends - s) < 4 ||
11165
0
                s[0] != (0xf0 | (ch >> 18)) ||
11166
0
                s[1] != (0x80 | ((ch >> 12) & 0x3f)) ||
11167
0
                s[2] != (0x80 | ((ch >> 6) & 0x3f)) ||
11168
0
                s[3] != (0x80 | (ch & 0x3f)))
11169
0
            {
11170
0
                return 0;
11171
0
            }
11172
0
            s += 4;
11173
0
        }
11174
0
    }
11175
0
    return s == ends;
11176
0
}
11177
11178
int
11179
_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11180
6.83M
{
11181
6.83M
    size_t len;
11182
6.83M
    assert(_PyUnicode_CHECK(unicode));
11183
6.83M
    assert(str);
11184
#ifndef NDEBUG
11185
    for (const char *p = str; *p; p++) {
11186
        assert((unsigned char)*p < 128);
11187
    }
11188
#endif
11189
6.83M
    if (!PyUnicode_IS_ASCII(unicode))
11190
149k
        return 0;
11191
6.68M
    len = (size_t)PyUnicode_GET_LENGTH(unicode);
11192
6.68M
    return strlen(str) == len &&
11193
451k
           memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11194
6.83M
}
11195
11196
int
11197
_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11198
0
{
11199
0
    PyObject *right_uni;
11200
11201
0
    assert(_PyUnicode_CHECK(left));
11202
0
    assert(right->string);
11203
#ifndef NDEBUG
11204
    for (const char *p = right->string; *p; p++) {
11205
        assert((unsigned char)*p < 128);
11206
    }
11207
#endif
11208
11209
0
    if (!PyUnicode_IS_ASCII(left))
11210
0
        return 0;
11211
11212
0
    right_uni = _PyUnicode_FromId(right);       /* borrowed */
11213
0
    if (right_uni == NULL) {
11214
        /* memory error or bad data */
11215
0
        PyErr_Clear();
11216
0
        return _PyUnicode_EqualToASCIIString(left, right->string);
11217
0
    }
11218
11219
0
    if (left == right_uni)
11220
0
        return 1;
11221
11222
0
    assert(PyUnicode_CHECK_INTERNED(right_uni));
11223
0
    if (PyUnicode_CHECK_INTERNED(left)) {
11224
0
        return 0;
11225
0
    }
11226
11227
0
    Py_hash_t right_hash = PyUnicode_HASH(right_uni);
11228
0
    assert(right_hash != -1);
11229
0
    Py_hash_t hash = PyUnicode_HASH(left);
11230
0
    if (hash != -1 && hash != right_hash) {
11231
0
        return 0;
11232
0
    }
11233
11234
0
    return unicode_eq(left, right_uni);
11235
0
}
11236
11237
PyObject *
PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
{
    /* Rich comparison of two str objects.
       Returns NotImplemented when either operand is not a str,
       NULL (after PyErr_BadArgument) for an unknown op on identical
       operands, and otherwise the result of the comparison. */
    if (!PyUnicode_Check(left) || !PyUnicode_Check(right)) {
        Py_RETURN_NOTIMPLEMENTED;
    }

    if (left == right) {
        /* a string is equal to itself */
        if (op == Py_EQ || op == Py_LE || op == Py_GE) {
            Py_RETURN_TRUE;
        }
        if (op == Py_NE || op == Py_LT || op == Py_GT) {
            Py_RETURN_FALSE;
        }
        PyErr_BadArgument();
        return NULL;
    }

    if (op == Py_EQ || op == Py_NE) {
        /* Equality test; the result is flipped for Py_NE. */
        int equal = unicode_eq(left, right);
        equal ^= (op == Py_NE);
        return PyBool_FromLong(equal);
    }

    /* Ordering comparison: compare the three-way result against 0. */
    int order = unicode_compare(left, right);
    Py_RETURN_RICHCOMPARE(order, 0, op);
}
11271
11272
int
11273
PyUnicode_Contains(PyObject *str, PyObject *substr)
11274
80.7M
{
11275
80.7M
    int kind1, kind2;
11276
80.7M
    const void *buf1, *buf2;
11277
80.7M
    Py_ssize_t len1, len2;
11278
80.7M
    int result;
11279
11280
80.7M
    if (!PyUnicode_Check(substr)) {
11281
0
        PyErr_Format(PyExc_TypeError,
11282
0
                     "'in <string>' requires string as left operand, not %.100s",
11283
0
                     Py_TYPE(substr)->tp_name);
11284
0
        return -1;
11285
0
    }
11286
80.7M
    if (ensure_unicode(str) < 0)
11287
0
        return -1;
11288
11289
80.7M
    kind1 = PyUnicode_KIND(str);
11290
80.7M
    kind2 = PyUnicode_KIND(substr);
11291
80.7M
    if (kind1 < kind2)
11292
3.78M
        return 0;
11293
76.9M
    len1 = PyUnicode_GET_LENGTH(str);
11294
76.9M
    len2 = PyUnicode_GET_LENGTH(substr);
11295
76.9M
    if (len1 < len2)
11296
19.1k
        return 0;
11297
76.9M
    buf1 = PyUnicode_DATA(str);
11298
76.9M
    buf2 = PyUnicode_DATA(substr);
11299
76.9M
    if (len2 == 1) {
11300
76.8M
        Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11301
76.8M
        result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
11302
76.8M
        return result;
11303
76.8M
    }
11304
33.1k
    if (kind2 != kind1) {
11305
16.6k
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
11306
16.6k
        if (!buf2)
11307
0
            return -1;
11308
16.6k
    }
11309
11310
33.1k
    switch (kind1) {
11311
16.5k
    case PyUnicode_1BYTE_KIND:
11312
16.5k
        result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11313
16.5k
        break;
11314
12.8k
    case PyUnicode_2BYTE_KIND:
11315
12.8k
        result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11316
12.8k
        break;
11317
3.76k
    case PyUnicode_4BYTE_KIND:
11318
3.76k
        result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11319
3.76k
        break;
11320
0
    default:
11321
0
        Py_UNREACHABLE();
11322
33.1k
    }
11323
11324
33.1k
    assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
11325
33.1k
    if (kind2 != kind1)
11326
16.6k
        PyMem_Free((void *)buf2);
11327
11328
33.1k
    return result;
11329
33.1k
}
11330
11331
/* Concat to string or Unicode object giving a new Unicode object. */
11332
11333
PyObject *
PyUnicode_Concat(PyObject *left, PyObject *right)
{
    /* Build a new str holding left followed by right.
       Returns NULL on failure: TypeError when right is not a str,
       OverflowError when the combined length would exceed
       PY_SSIZE_T_MAX. */
    if (ensure_unicode(left) < 0) {
        return NULL;
    }
    if (!PyUnicode_Check(right)) {
        PyErr_Format(PyExc_TypeError,
            "can only concatenate str (not \"%.200s\") to str",
            Py_TYPE(right)->tp_name);
        return NULL;
    }

    /* Shortcuts: when one side is the empty-string singleton, the
       result comes from the other side alone. */
    PyObject *empty = _PyUnicode_GetEmpty();  // Borrowed reference
    if (left == empty) {
        return PyUnicode_FromObject(right);
    }
    if (right == empty) {
        return PyUnicode_FromObject(left);
    }

    /* Check for overflow before adding the two lengths. */
    Py_ssize_t llen = PyUnicode_GET_LENGTH(left);
    Py_ssize_t rlen = PyUnicode_GET_LENGTH(right);
    if (llen > PY_SSIZE_T_MAX - rlen) {
        PyErr_SetString(PyExc_OverflowError,
                        "strings are too large to concat");
        return NULL;
    }
    Py_ssize_t total = llen + rlen;

    /* The result is sized for the larger of the two max-char values. */
    Py_UCS4 lmax = PyUnicode_MAX_CHAR_VALUE(left);
    Py_UCS4 rmax = PyUnicode_MAX_CHAR_VALUE(right);
    Py_UCS4 widest = Py_MAX(lmax, rmax);

    /* Concat the two Unicode strings */
    PyObject *joined = PyUnicode_New(total, widest);
    if (joined == NULL) {
        return NULL;
    }
    _PyUnicode_FastCopyCharacters(joined, 0, left, 0, llen);
    _PyUnicode_FastCopyCharacters(joined, llen, right, 0, rlen);
    assert(_PyUnicode_CheckConsistency(joined, 1));
    return joined;
}
11381
11382
void
11383
PyUnicode_Append(PyObject **p_left, PyObject *right)
11384
963k
{
11385
963k
    PyObject *left, *res;
11386
963k
    Py_UCS4 maxchar, maxchar2;
11387
963k
    Py_ssize_t left_len, right_len, new_len;
11388
11389
963k
    if (p_left == NULL) {
11390
0
        if (!PyErr_Occurred())
11391
0
            PyErr_BadInternalCall();
11392
0
        return;
11393
0
    }
11394
963k
    left = *p_left;
11395
963k
    if (right == NULL || left == NULL
11396
963k
        || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
11397
0
        if (!PyErr_Occurred())
11398
0
            PyErr_BadInternalCall();
11399
0
        goto error;
11400
0
    }
11401
11402
    /* Shortcuts */
11403
963k
    PyObject *empty = _PyUnicode_GetEmpty();  // Borrowed reference
11404
963k
    if (left == empty) {
11405
411k
        Py_DECREF(left);
11406
411k
        *p_left = Py_NewRef(right);
11407
411k
        return;
11408
411k
    }
11409
552k
    if (right == empty) {
11410
0
        return;
11411
0
    }
11412
11413
552k
    left_len = PyUnicode_GET_LENGTH(left);
11414
552k
    right_len = PyUnicode_GET_LENGTH(right);
11415
552k
    if (left_len > PY_SSIZE_T_MAX - right_len) {
11416
0
        PyErr_SetString(PyExc_OverflowError,
11417
0
                        "strings are too large to concat");
11418
0
        goto error;
11419
0
    }
11420
552k
    new_len = left_len + right_len;
11421
11422
552k
    if (_PyUnicode_IsModifiable(left)
11423
552k
        && PyUnicode_CheckExact(right)
11424
552k
        && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
11425
        /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11426
           to change the structure size, but characters are stored just after
11427
           the structure, and so it requires to move all characters which is
11428
           not so different than duplicating the string. */
11429
510k
        && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11430
510k
    {
11431
        /* append inplace */
11432
510k
        if (unicode_resize(p_left, new_len) != 0)
11433
0
            goto error;
11434
11435
        /* copy 'right' into the newly allocated area of 'left' */
11436
510k
        _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
11437
510k
    }
11438
41.9k
    else {
11439
41.9k
        maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11440
41.9k
        maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11441
41.9k
        maxchar = Py_MAX(maxchar, maxchar2);
11442
11443
        /* Concat the two Unicode strings */
11444
41.9k
        res = PyUnicode_New(new_len, maxchar);
11445
41.9k
        if (res == NULL)
11446
0
            goto error;
11447
41.9k
        _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11448
41.9k
        _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
11449
41.9k
        Py_DECREF(left);
11450
41.9k
        *p_left = res;
11451
41.9k
    }
11452
552k
    assert(_PyUnicode_CheckConsistency(*p_left, 1));
11453
552k
    return;
11454
11455
0
error:
11456
0
    Py_CLEAR(*p_left);
11457
0
}
11458
11459
void
11460
PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11461
0
{
11462
0
    PyUnicode_Append(pleft, right);
11463
0
    Py_XDECREF(right);
11464
0
}
11465
11466
/*[clinic input]
11467
@permit_long_summary
11468
@text_signature "($self, sub[, start[, end]], /)"
11469
str.count as unicode_count -> Py_ssize_t
11470
11471
    self as str: self
11472
    sub as substr: unicode
11473
    start: slice_index(accept={int, NoneType}, c_default='0') = None
11474
    end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
11475
    /
11476
11477
Return the number of non-overlapping occurrences of substring sub in string S[start:end].
11478
11479
Optional arguments start and end are interpreted as in slice notation.
11480
[clinic start generated code]*/
11481
11482
static Py_ssize_t
11483
unicode_count_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
11484
                   Py_ssize_t end)
11485
/*[clinic end generated code: output=8fcc3aef0b18edbf input=8590716ee228b935]*/
11486
31.1M
{
11487
31.1M
    assert(PyUnicode_Check(str));
11488
31.1M
    assert(PyUnicode_Check(substr));
11489
11490
31.1M
    Py_ssize_t result;
11491
31.1M
    int kind1, kind2;
11492
31.1M
    const void *buf1 = NULL, *buf2 = NULL;
11493
31.1M
    Py_ssize_t len1, len2;
11494
11495
31.1M
    kind1 = PyUnicode_KIND(str);
11496
31.1M
    kind2 = PyUnicode_KIND(substr);
11497
31.1M
    if (kind1 < kind2)
11498
0
        return 0;
11499
11500
31.1M
    len1 = PyUnicode_GET_LENGTH(str);
11501
31.1M
    len2 = PyUnicode_GET_LENGTH(substr);
11502
31.1M
    ADJUST_INDICES(start, end, len1);
11503
31.1M
    if (end - start < len2)
11504
10.4M
        return 0;
11505
11506
20.7M
    buf1 = PyUnicode_DATA(str);
11507
20.7M
    buf2 = PyUnicode_DATA(substr);
11508
20.7M
    if (kind2 != kind1) {
11509
4.20M
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
11510
4.20M
        if (!buf2)
11511
0
            goto onError;
11512
4.20M
    }
11513
11514
    // We don't reuse `anylib_count` here because of the explicit casts.
11515
20.7M
    switch (kind1) {
11516
16.5M
    case PyUnicode_1BYTE_KIND:
11517
16.5M
        result = ucs1lib_count(
11518
16.5M
            ((const Py_UCS1*)buf1) + start, end - start,
11519
16.5M
            buf2, len2, PY_SSIZE_T_MAX
11520
16.5M
            );
11521
16.5M
        break;
11522
3.27M
    case PyUnicode_2BYTE_KIND:
11523
3.27M
        result = ucs2lib_count(
11524
3.27M
            ((const Py_UCS2*)buf1) + start, end - start,
11525
3.27M
            buf2, len2, PY_SSIZE_T_MAX
11526
3.27M
            );
11527
3.27M
        break;
11528
928k
    case PyUnicode_4BYTE_KIND:
11529
928k
        result = ucs4lib_count(
11530
928k
            ((const Py_UCS4*)buf1) + start, end - start,
11531
928k
            buf2, len2, PY_SSIZE_T_MAX
11532
928k
            );
11533
928k
        break;
11534
0
    default:
11535
0
        Py_UNREACHABLE();
11536
20.7M
    }
11537
11538
20.7M
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
11539
20.7M
    if (kind2 != kind1)
11540
4.20M
        PyMem_Free((void *)buf2);
11541
11542
20.7M
    return result;
11543
0
  onError:
11544
0
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
11545
0
    if (kind2 != kind1)
11546
0
        PyMem_Free((void *)buf2);
11547
0
    return -1;
11548
20.7M
}
11549
11550
/*[clinic input]
11551
str.encode as unicode_encode
11552
11553
    encoding: str(c_default="NULL") = 'utf-8'
11554
        The encoding in which to encode the string.
11555
    errors: str(c_default="NULL") = 'strict'
11556
        The error handling scheme to use for encoding errors.
11557
        The default is 'strict' meaning that encoding errors raise a
11558
        UnicodeEncodeError.  Other possible values are 'ignore', 'replace' and
11559
        'xmlcharrefreplace' as well as any other name registered with
11560
        codecs.register_error that can handle UnicodeEncodeErrors.
11561
11562
Encode the string using the codec registered for encoding.
11563
[clinic start generated code]*/
11564
11565
static PyObject *
unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
{
    /* str.encode(): forwards all three arguments unchanged to
       PyUnicode_AsEncodedString() and returns its result. */
    PyObject *encoded = PyUnicode_AsEncodedString(self, encoding, errors);
    return encoded;
}
11571
11572
/*[clinic input]
11573
str.expandtabs as unicode_expandtabs
11574
11575
    tabsize: int = 8
11576
11577
Return a copy where all tab characters are expanded using spaces.
11578
11579
If tabsize is not given, a tab size of 8 characters is assumed.
11580
[clinic start generated code]*/
11581
11582
static PyObject *
unicode_expandtabs_impl(PyObject *self, int tabsize)
/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
{
    /* str.expandtabs(): replace each tab with spaces up to the next
       multiple of tabsize, tracking the column since the last '\n' or
       '\r'.  With tabsize <= 0 tabs are simply dropped.  A string with
       no tab is returned via unicode_result_unchanged(). */
    PyObject *expanded;
    void *dest;
    Py_ssize_t pos, out_len, column, pad;
    Py_UCS4 ch;

    const Py_ssize_t src_len = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *src = PyUnicode_DATA(self);

    /* First pass: determine size of output string */
    int saw_tab = 0;
    out_len = 0;
    column = 0;
    for (pos = 0; pos < src_len; pos++) {
        ch = PyUnicode_READ(kind, src, pos);
        if (ch != '\t') {
            if (out_len > PY_SSIZE_T_MAX - 1) {
                goto overflow;
            }
            column++;
            out_len++;
            if (ch == '\n' || ch == '\r') {
                column = 0;
            }
            continue;
        }

        saw_tab = 1;
        if (tabsize > 0) {
            pad = tabsize - (column % tabsize); /* cannot overflow */
            if (out_len > PY_SSIZE_T_MAX - pad) {
                goto overflow;
            }
            column += pad;
            out_len += pad;
        }
    }
    if (!saw_tab) {
        return unicode_result_unchanged(self);
    }

    /* Second pass: create output string and fill it */
    expanded = PyUnicode_New(out_len, PyUnicode_MAX_CHAR_VALUE(self));
    if (expanded == NULL) {
        return NULL;
    }
    dest = PyUnicode_DATA(expanded);

    out_len = 0;
    column = 0;
    for (pos = 0; pos < src_len; pos++) {
        ch = PyUnicode_READ(kind, src, pos);
        if (ch != '\t') {
            column++;
            PyUnicode_WRITE(kind, dest, out_len, ch);
            out_len++;
            if (ch == '\n' || ch == '\r') {
                column = 0;
            }
            continue;
        }

        if (tabsize > 0) {
            pad = tabsize - (column % tabsize);
            column += pad;
            _PyUnicode_Fill(kind, dest, ' ', out_len, pad);
            out_len += pad;
        }
    }
    assert (out_len == PyUnicode_GET_LENGTH(expanded));
    return unicode_result(expanded);

  overflow:
    PyErr_SetString(PyExc_OverflowError, "new string is too long");
    return NULL;
}
11657
11658
/*[clinic input]
11659
@permit_long_summary
11660
str.find as unicode_find = str.count
11661
11662
Return the lowest index in S where substring sub is found, such that sub is contained within S[start:end].
11663
11664
Optional arguments start and end are interpreted as in slice notation.
11665
Return -1 on failure.
11666
[clinic start generated code]*/
11667
11668
static Py_ssize_t
11669
unicode_find_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
11670
                  Py_ssize_t end)
11671
/*[clinic end generated code: output=51dbe6255712e278 input=3a9d650fe4c24695]*/
11672
32.7M
{
11673
32.7M
    Py_ssize_t result = any_find_slice(str, substr, start, end, 1);
11674
32.7M
    if (result < 0) {
11675
8.41M
        return -1;
11676
8.41M
    }
11677
24.3M
    return result;
11678
32.7M
}
11679
11680
static PyObject *
11681
unicode_getitem(PyObject *self, Py_ssize_t index)
11682
48.7M
{
11683
48.7M
    const void *data;
11684
48.7M
    int kind;
11685
48.7M
    Py_UCS4 ch;
11686
11687
48.7M
    if (!PyUnicode_Check(self)) {
11688
0
        PyErr_BadArgument();
11689
0
        return NULL;
11690
0
    }
11691
48.7M
    if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11692
351
        PyErr_SetString(PyExc_IndexError, "string index out of range");
11693
351
        return NULL;
11694
351
    }
11695
48.7M
    kind = PyUnicode_KIND(self);
11696
48.7M
    data = PyUnicode_DATA(self);
11697
48.7M
    ch = PyUnicode_READ(kind, data, index);
11698
48.7M
    return unicode_char(ch);
11699
48.7M
}
11700
11701
/* Believe it or not, this produces the same value for ASCII strings
11702
   as bytes_hash(). */
11703
static Py_hash_t
11704
unicode_hash(PyObject *self)
11705
44.1M
{
11706
44.1M
    Py_uhash_t x;  /* Unsigned for defined overflow behavior. */
11707
11708
#ifdef Py_DEBUG
11709
    assert(_Py_HashSecret_Initialized);
11710
#endif
11711
44.1M
    Py_hash_t hash = PyUnicode_HASH(self);
11712
44.1M
    if (hash != -1) {
11713
254k
        return hash;
11714
254k
    }
11715
43.8M
    x = Py_HashBuffer(PyUnicode_DATA(self),
11716
43.8M
                      PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
11717
11718
43.8M
    PyUnicode_SET_HASH(self, x);
11719
43.8M
    return x;
11720
44.1M
}
11721
11722
/*[clinic input]
11723
@permit_long_summary
11724
str.index as unicode_index = str.count
11725
11726
Return the lowest index in S where substring sub is found, such that sub is contained within S[start:end].
11727
11728
Optional arguments start and end are interpreted as in slice notation.
11729
Raises ValueError when the substring is not found.
11730
[clinic start generated code]*/
11731
11732
static Py_ssize_t
11733
unicode_index_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
11734
                   Py_ssize_t end)
11735
/*[clinic end generated code: output=77558288837cdf40 input=ae5e48f69ed75b06]*/
11736
0
{
11737
0
    Py_ssize_t result = any_find_slice(str, substr, start, end, 1);
11738
0
    if (result == -1) {
11739
0
        PyErr_SetString(PyExc_ValueError, "substring not found");
11740
0
    }
11741
0
    else if (result < 0) {
11742
0
        return -1;
11743
0
    }
11744
0
    return result;
11745
0
}
11746
11747
/*[clinic input]
11748
str.isascii as unicode_isascii
11749
11750
Return True if all characters in the string are ASCII, False otherwise.
11751
11752
ASCII characters have code points in the range U+0000-U+007F.
11753
Empty string is ASCII too.
11754
[clinic start generated code]*/
11755
11756
static PyObject *
unicode_isascii_impl(PyObject *self)
/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
{
    /* str.isascii(): reports the object's ASCII flag as a bool;
       the characters themselves are not scanned. */
    long ascii_flag = PyUnicode_IS_ASCII(self);
    return PyBool_FromLong(ascii_flag);
}
11762
11763
/*[clinic input]
11764
@permit_long_docstring_body
11765
str.islower as unicode_islower
11766
11767
Return True if the string is a lowercase string, False otherwise.
11768
11769
A string is lowercase if all cased characters in the string are lowercase and
11770
there is at least one cased character in the string.
11771
[clinic start generated code]*/
11772
11773
static PyObject *
unicode_islower_impl(PyObject *self)
/*[clinic end generated code: output=dbd41995bd005b81 input=c6fc0295241a1aaa]*/
{
    /* str.islower(): False as soon as an uppercase or titlecase
       character is seen; otherwise True only if at least one
       lowercase character was found. */
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (length == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
    }

    int seen_lower = 0;
    for (Py_ssize_t pos = 0; pos < length; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
            Py_RETURN_FALSE;
        }
        /* Stop calling the lowercase test once one has been found. */
        if (!seen_lower && Py_UNICODE_ISLOWER(ch)) {
            seen_lower = 1;
        }
    }
    return PyBool_FromLong(seen_lower);
}
11806
11807
/*[clinic input]
11808
@permit_long_docstring_body
11809
str.isupper as unicode_isupper
11810
11811
Return True if the string is an uppercase string, False otherwise.
11812
11813
A string is uppercase if all cased characters in the string are uppercase and
11814
there is at least one cased character in the string.
11815
[clinic start generated code]*/
11816
11817
static PyObject *
unicode_isupper_impl(PyObject *self)
/*[clinic end generated code: output=049209c8e7f15f59 input=8d5cb33e67efde72]*/
{
    /* str.isupper(): False as soon as a lowercase or titlecase
       character is seen; otherwise True only if at least one
       uppercase character was found. */
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (length == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
    }

    int seen_upper = 0;
    for (Py_ssize_t pos = 0; pos < length; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) {
            Py_RETURN_FALSE;
        }
        /* Stop calling the uppercase test once one has been found. */
        if (!seen_upper && Py_UNICODE_ISUPPER(ch)) {
            seen_upper = 1;
        }
    }
    return PyBool_FromLong(seen_upper);
}
11850
11851
/*[clinic input]
11852
str.istitle as unicode_istitle
11853
11854
Return True if the string is a title-cased string, False otherwise.
11855
11856
In a title-cased string, upper- and title-case characters may only
11857
follow uncased characters and lowercase characters only cased ones.
11858
[clinic start generated code]*/
11859
11860
static PyObject *
unicode_istitle_impl(PyObject *self)
/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
{
    /* str.istitle(): an upper/titlecase character must not follow a
       cased character, and a lowercase character must follow one.
       The result is True only if at least one cased character
       occurred. */
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (length == 1) {
        Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong((Py_UNICODE_ISTITLE(only) != 0) ||
                               (Py_UNICODE_ISUPPER(only) != 0));
    }

    int any_cased = 0;
    int prev_cased = 0;
    for (Py_ssize_t pos = 0; pos < length; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
            /* upper/titlecase directly after a cased character */
            if (prev_cased) {
                Py_RETURN_FALSE;
            }
        }
        else if (Py_UNICODE_ISLOWER(ch)) {
            /* lowercase with no cased character before it */
            if (!prev_cased) {
                Py_RETURN_FALSE;
            }
        }
        else {
            /* uncased character: the next cased one starts afresh */
            prev_cased = 0;
            continue;
        }
        prev_cased = 1;
        any_cased = 1;
    }
    return PyBool_FromLong(any_cased);
}
11906
11907
/*[clinic input]
11908
@permit_long_docstring_body
11909
str.isspace as unicode_isspace
11910
11911
Return True if the string is a whitespace string, False otherwise.
11912
11913
A string is whitespace if all characters in the string are whitespace and there
11914
is at least one character in the string.
11915
[clinic start generated code]*/
11916
11917
static PyObject *
unicode_isspace_impl(PyObject *self)
/*[clinic end generated code: output=163a63bfa08ac2b9 input=44fe05e248c6e159]*/
{
    /* str.isspace(): True when the string is non-empty and every
       character passes Py_UNICODE_ISSPACE. */
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (length == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
    }

    Py_ssize_t pos = 0;
    while (pos < length) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISSPACE(ch)) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
11945
11946
/*[clinic input]
11947
@permit_long_docstring_body
11948
str.isalpha as unicode_isalpha
11949
11950
Return True if the string is an alphabetic string, False otherwise.
11951
11952
A string is alphabetic if all characters in the string are alphabetic and there
11953
is at least one character in the string.
11954
[clinic start generated code]*/
11955
11956
static PyObject *
unicode_isalpha_impl(PyObject *self)
/*[clinic end generated code: output=cc81b9ac3883ec4f input=c233000624a56e0d]*/
{
    /* str.isalpha(): True when the string is non-empty and every
       character passes Py_UNICODE_ISALPHA. */
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (length == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
    }

    Py_ssize_t pos = 0;
    while (pos < length) {
        if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, pos))) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
11983
11984
/*[clinic input]
11985
@permit_long_docstring_body
11986
str.isalnum as unicode_isalnum
11987
11988
Return True if the string is an alpha-numeric string, False otherwise.
11989
11990
A string is alpha-numeric if all characters in the string are alpha-numeric and
11991
there is at least one character in the string.
11992
[clinic start generated code]*/
11993
11994
static PyObject *
unicode_isalnum_impl(PyObject *self)
/*[clinic end generated code: output=a5a23490ffc3660c input=5d63ba9c9bafdb6b]*/
{
    /* str.isalnum(): True when the string is non-empty and every
       character passes Py_UNICODE_ISALNUM. */
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (length == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISALNUM(only));
    }

    Py_ssize_t pos = 0;
    while (pos < length) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISALNUM(ch)) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
12023
12024
/*[clinic input]
12025
@permit_long_docstring_body
12026
str.isdecimal as unicode_isdecimal
12027
12028
Return True if the string is a decimal string, False otherwise.
12029
12030
A string is a decimal string if all characters in the string are decimal and
12031
there is at least one character in the string.
12032
[clinic start generated code]*/
12033
12034
static PyObject *
unicode_isdecimal_impl(PyObject *self)
/*[clinic end generated code: output=fb2dcdb62d3fc548 input=8e84a58b414935a3]*/
{
    /* str.isdecimal(): True when the string is non-empty and every
       character passes Py_UNICODE_ISDECIMAL. */
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (length == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
    }

    Py_ssize_t pos = 0;
    while (pos < length) {
        if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, pos))) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
12061
12062
/*[clinic input]
12063
@permit_long_docstring_body
12064
str.isdigit as unicode_isdigit
12065
12066
Return True if the string is a digit string, False otherwise.
12067
12068
A string is a digit string if all characters in the string are digits and there
12069
is at least one character in the string.
12070
[clinic start generated code]*/
12071
12072
static PyObject *
unicode_isdigit_impl(PyObject *self)
/*[clinic end generated code: output=10a6985311da6858 input=99e284affb54d4a0]*/
{
    /* True only for a non-empty string in which every character satisfies
       Py_UNICODE_ISDIGIT. */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);
    Py_ssize_t pos = 0;

    /* Fast path: a single character needs exactly one classification. */
    if (nchars == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, 0)));
    }

    /* No characters at all: not a digit string. */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }

    while (pos < nchars) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISDIGIT(ch)) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
12100
12101
/*[clinic input]
12102
@permit_long_docstring_body
12103
str.isnumeric as unicode_isnumeric
12104
12105
Return True if the string is a numeric string, False otherwise.
12106
12107
A string is numeric if all characters in the string are numeric and there is at
12108
least one character in the string.
12109
[clinic start generated code]*/
12110
12111
static PyObject *
unicode_isnumeric_impl(PyObject *self)
/*[clinic end generated code: output=9172a32d9013051a input=e9f5b6b8b29b0ee6]*/
{
    /* True only for a non-empty string in which every character satisfies
       Py_UNICODE_ISNUMERIC. */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    switch (nchars) {
    case 0:
        /* The empty string is not numeric. */
        Py_RETURN_FALSE;
    case 1: {
        /* Single character: the result is that character's classification. */
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISNUMERIC(only));
    }
    default:
        break;
    }

    for (Py_ssize_t pos = 0; pos < nchars; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISNUMERIC(ch)) {
            Py_RETURN_FALSE;
        }
    }
    Py_RETURN_TRUE;
}
12138
12139
Py_ssize_t
12140
_PyUnicode_ScanIdentifier(PyObject *self)
12141
12.8k
{
12142
12.8k
    Py_ssize_t i;
12143
12.8k
    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12144
12.8k
    if (len == 0) {
12145
        /* an empty string is not a valid identifier */
12146
0
        return 0;
12147
0
    }
12148
12149
12.8k
    int kind = PyUnicode_KIND(self);
12150
12.8k
    const void *data = PyUnicode_DATA(self);
12151
12.8k
    Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12152
    /* PEP 3131 says that the first character must be in
12153
       XID_Start and subsequent characters in XID_Continue,
12154
       and for the ASCII range, the 2.x rules apply (i.e
12155
       start with letters and underscore, continue with
12156
       letters, digits, underscore). However, given the current
12157
       definition of XID_Start and XID_Continue, it is sufficient
12158
       to check just for these, except that _ must be allowed
12159
       as starting an identifier.  */
12160
12.8k
    if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12161
471
        return 0;
12162
471
    }
12163
12164
44.0k
    for (i = 1; i < len; i++) {
12165
31.9k
        ch = PyUnicode_READ(kind, data, i);
12166
31.9k
        if (!_PyUnicode_IsXidContinue(ch)) {
12167
240
            return i;
12168
240
        }
12169
31.9k
    }
12170
12.1k
    return i;
12171
12.4k
}
12172
12173
int
12174
PyUnicode_IsIdentifier(PyObject *self)
12175
958
{
12176
958
    Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
12177
958
    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12178
    /* an empty string is not a valid identifier */
12179
958
    return len && i == len;
12180
958
}
12181
12182
/*[clinic input]
12183
@permit_long_docstring_body
12184
str.isidentifier as unicode_isidentifier
12185
12186
Return True if the string is a valid Python identifier, False otherwise.
12187
12188
Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
12189
such as "def" or "class".
12190
[clinic start generated code]*/
12191
12192
static PyObject *
unicode_isidentifier_impl(PyObject *self)
/*[clinic end generated code: output=fe585a9666572905 input=86315dd889d7bd04]*/
{
    /* Convert the int result of PyUnicode_IsIdentifier() to a bool object. */
    const int is_ident = PyUnicode_IsIdentifier(self);
    return PyBool_FromLong(is_ident);
}
12198
12199
/*[clinic input]
12200
@permit_long_summary
12201
str.isprintable as unicode_isprintable
12202
12203
Return True if all characters in the string are printable, False otherwise.
12204
12205
A character is printable if repr() may use it in its output.
12206
[clinic start generated code]*/
12207
12208
static PyObject *
unicode_isprintable_impl(PyObject *self)
/*[clinic end generated code: output=3ab9626cd32dd1a0 input=18345ba847084ec5]*/
{
    /* True when no character fails Py_UNICODE_ISPRINTABLE.  There is no
       special case for the empty string, so it yields True. */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Fast path for one-character strings. */
    if (nchars == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISPRINTABLE(only));
    }

    for (Py_ssize_t pos = 0; pos < nchars; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISPRINTABLE(ch)) {
            Py_RETURN_FALSE;
        }
    }
    Py_RETURN_TRUE;
}
12232
12233
/*[clinic input]
12234
@permit_long_docstring_body
12235
str.join as unicode_join
12236
12237
    iterable: object
12238
    /
12239
12240
Concatenate any number of strings.
12241
12242
The string whose method is called is inserted in between each given string.
12243
The result is returned as a new string.
12244
12245
Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12246
[clinic start generated code]*/
12247
12248
static PyObject *
unicode_join(PyObject *self, PyObject *iterable)
/*[clinic end generated code: output=6857e7cecfe7bf98 input=bac724ed412ef3f8]*/
{
    /* str.join() is a direct forward to the public PyUnicode_Join(). */
    PyObject *joined = PyUnicode_Join(self, iterable);
    return joined;
}
12254
12255
static Py_ssize_t
12256
unicode_length(PyObject *self)
12257
43.9M
{
12258
43.9M
    return PyUnicode_GET_LENGTH(self);
12259
43.9M
}
12260
12261
/*[clinic input]
12262
str.ljust as unicode_ljust
12263
12264
    width: Py_ssize_t
12265
    fillchar: Py_UCS4 = ' '
12266
    /
12267
12268
Return a left-justified string of length width.
12269
12270
Padding is done using the specified fill character (default is a space).
12271
[clinic start generated code]*/
12272
12273
static PyObject *
12274
unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12275
/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
12276
0
{
12277
0
    if (PyUnicode_GET_LENGTH(self) >= width)
12278
0
        return unicode_result_unchanged(self);
12279
12280
0
    return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
12281
0
}
12282
12283
/*[clinic input]
12284
str.lower as unicode_lower
12285
12286
Return a copy of the string converted to lowercase.
12287
[clinic start generated code]*/
12288
12289
static PyObject *
unicode_lower_impl(PyObject *self)
/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
{
    /* Non-ASCII strings take the general case_operation() route with
       do_lower; ASCII strings use the dedicated ASCII helper. */
    if (!PyUnicode_IS_ASCII(self)) {
        return case_operation(self, do_lower);
    }
    return ascii_upper_or_lower(self, 1);
}
12297
12298
65.3M
/* Strip selector passed as `striptype`: used by str.lstrip(). */
#define LEFTSTRIP 0
12299
88.8M
/* Strip selector passed as `striptype`: used by str.rstrip(). */
#define RIGHTSTRIP 1
12300
35.0M
/* Strip selector passed as `striptype`: used by str.strip(). */
#define BOTHSTRIP 2
12301
12302
/* Method names indexed by the selectors above:
   LEFTSTRIP -> "lstrip", RIGHTSTRIP -> "rstrip", BOTHSTRIP -> "strip". */
12303
static const char *stripfuncnames[] = {"lstrip", "rstrip", "strip"};
12304
12305
0
/* Method name for strip selector i; the index is not range-checked. */
#define STRIPNAME(i) (stripfuncnames[i])
12306
12307
/* externally visible for str.strip(unicode) */
12308
/* Strip from self every leading and/or trailing character that occurs in
   sepobj.  striptype selects the end(s): LEFTSTRIP, RIGHTSTRIP or
   BOTHSTRIP.  The surviving slice is returned via PyUnicode_Substring(). */
PyObject *
_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
{
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);
    const Py_ssize_t total = PyUnicode_GET_LENGTH(self);
    const Py_ssize_t seplen = PyUnicode_GET_LENGTH(sepobj);

    /* The bloom mask gives a cheap rejection test; PyUnicode_FindChar() is
       only consulted when the mask says the character may be in sepobj. */
    const BLOOM_MASK sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
                                               PyUnicode_DATA(sepobj),
                                               seplen);

    Py_ssize_t first = 0;      /* index of the first kept character */
    Py_ssize_t stop = total;   /* one past the last kept character */

    if (striptype != RIGHTSTRIP) {
        for (; first < total; first++) {
            const Py_UCS4 ch = PyUnicode_READ(kind, data, first);
            if (!BLOOM(sepmask, ch)) {
                break;
            }
            if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0) {
                break;
            }
        }
    }

    if (striptype != LEFTSTRIP) {
        for (; stop > first; stop--) {
            const Py_UCS4 ch = PyUnicode_READ(kind, data, stop - 1);
            if (!BLOOM(sepmask, ch)) {
                break;
            }
            if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0) {
                break;
            }
        }
    }

    return PyUnicode_Substring(self, first, stop);
}
12354
12355
PyObject*
12356
PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12357
223M
{
12358
223M
    const unsigned char *data;
12359
223M
    int kind;
12360
223M
    Py_ssize_t length;
12361
12362
223M
    length = PyUnicode_GET_LENGTH(self);
12363
223M
    end = Py_MIN(end, length);
12364
12365
223M
    if (start == 0 && end == length)
12366
56.2M
        return unicode_result_unchanged(self);
12367
12368
167M
    if (start < 0 || end < 0) {
12369
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
12370
0
        return NULL;
12371
0
    }
12372
167M
    if (start >= length || end < start)
12373
173k
        _Py_RETURN_UNICODE_EMPTY();
12374
12375
166M
    length = end - start;
12376
166M
    if (PyUnicode_IS_ASCII(self)) {
12377
43.8M
        data = PyUnicode_1BYTE_DATA(self);
12378
43.8M
        return _PyUnicode_FromASCII((const char*)(data + start), length);
12379
43.8M
    }
12380
123M
    else {
12381
123M
        kind = PyUnicode_KIND(self);
12382
123M
        data = PyUnicode_1BYTE_DATA(self);
12383
123M
        return PyUnicode_FromKindAndData(kind,
12384
123M
                                         data + kind * start,
12385
123M
                                         length);
12386
123M
    }
12387
166M
}
12388
12389
/* Strip whitespace from the end(s) of self selected by striptype
   (LEFTSTRIP, RIGHTSTRIP or BOTHSTRIP) and return the remaining slice via
   PyUnicode_Substring(). */
static PyObject *
do_strip(PyObject *self, int striptype)
{
    const Py_ssize_t total = PyUnicode_GET_LENGTH(self);
    const int trim_left = (striptype != RIGHTSTRIP);
    const int trim_right = (striptype != LEFTSTRIP);
    Py_ssize_t first = 0;      /* index of the first kept character */
    Py_ssize_t stop = total;   /* one past the last kept character */

    if (PyUnicode_IS_ASCII(self)) {
        /* ASCII strings: look each byte up in _Py_ascii_whitespace. */
        const Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(self);

        if (trim_left) {
            while (first < total && _Py_ascii_whitespace[bytes[first]]) {
                first++;
            }
        }
        if (trim_right) {
            while (stop > first && _Py_ascii_whitespace[bytes[stop - 1]]) {
                stop--;
            }
        }
    }
    else {
        /* Other strings: classify each code point with Py_UNICODE_ISSPACE. */
        const int kind = PyUnicode_KIND(self);
        const void *data = PyUnicode_DATA(self);

        if (trim_left) {
            for (; first < total; first++) {
                const Py_UCS4 ch = PyUnicode_READ(kind, data, first);
                if (!Py_UNICODE_ISSPACE(ch)) {
                    break;
                }
            }
        }
        if (trim_right) {
            for (; stop > first; stop--) {
                const Py_UCS4 ch = PyUnicode_READ(kind, data, stop - 1);
                if (!Py_UNICODE_ISSPACE(ch)) {
                    break;
                }
            }
        }
    }

    return PyUnicode_Substring(self, first, stop);
}
12450
12451
12452
/* Shared body of str.strip/lstrip/rstrip.  sep == None strips whitespace,
   a str strips the characters it contains, and any other type raises
   TypeError naming the calling method. */
static PyObject *
do_argstrip(PyObject *self, int striptype, PyObject *sep)
{
    if (sep == Py_None) {
        return do_strip(self, striptype);
    }

    if (!PyUnicode_Check(sep)) {
        PyErr_Format(PyExc_TypeError,
                     "%s arg must be None or str",
                     STRIPNAME(striptype));
        return NULL;
    }

    return _PyUnicode_XStrip(self, striptype, sep);
}
12468
12469
12470
/*[clinic input]
12471
@permit_long_summary
12472
str.strip as unicode_strip
12473
12474
    chars: object = None
12475
    /
12476
12477
Return a copy of the string with leading and trailing whitespace removed.
12478
12479
If chars is given and not None, remove characters in chars instead.
12480
[clinic start generated code]*/
12481
12482
static PyObject *
unicode_strip_impl(PyObject *self, PyObject *chars)
/*[clinic end generated code: output=ca19018454345d57 input=8bc6353450345fbd]*/
{
    /* Both ends are stripped; the work is done by do_argstrip(). */
    PyObject *stripped = do_argstrip(self, BOTHSTRIP, chars);
    return stripped;
}
12488
12489
12490
/*[clinic input]
12491
str.lstrip as unicode_lstrip
12492
12493
    chars: object = None
12494
    /
12495
12496
Return a copy of the string with leading whitespace removed.
12497
12498
If chars is given and not None, remove characters in chars instead.
12499
[clinic start generated code]*/
12500
12501
static PyObject *
unicode_lstrip_impl(PyObject *self, PyObject *chars)
/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
{
    /* Only the leading end is stripped; the work is done by do_argstrip(). */
    PyObject *stripped = do_argstrip(self, LEFTSTRIP, chars);
    return stripped;
}
12507
12508
12509
/*[clinic input]
12510
str.rstrip as unicode_rstrip
12511
12512
    chars: object = None
12513
    /
12514
12515
Return a copy of the string with trailing whitespace removed.
12516
12517
If chars is given and not None, remove characters in chars instead.
12518
[clinic start generated code]*/
12519
12520
static PyObject *
unicode_rstrip_impl(PyObject *self, PyObject *chars)
/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
{
    /* Only the trailing end is stripped; the work is done by do_argstrip(). */
    PyObject *stripped = do_argstrip(self, RIGHTSTRIP, chars);
    return stripped;
}
12526
12527
12528
static PyObject*
12529
unicode_repeat(PyObject *str, Py_ssize_t len)
12530
373k
{
12531
373k
    PyObject *u;
12532
373k
    Py_ssize_t nchars, n;
12533
12534
373k
    if (len < 1)
12535
31.4k
        _Py_RETURN_UNICODE_EMPTY();
12536
12537
    /* no repeat, return original string */
12538
342k
    if (len == 1)
12539
112k
        return unicode_result_unchanged(str);
12540
12541
229k
    if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
12542
0
        PyErr_SetString(PyExc_OverflowError,
12543
0
                        "repeated string is too long");
12544
0
        return NULL;
12545
0
    }
12546
229k
    nchars = len * PyUnicode_GET_LENGTH(str);
12547
12548
229k
    u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
12549
229k
    if (!u)
12550
0
        return NULL;
12551
229k
    assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
12552
12553
229k
    if (PyUnicode_GET_LENGTH(str) == 1) {
12554
227k
        int kind = PyUnicode_KIND(str);
12555
227k
        Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
12556
227k
        if (kind == PyUnicode_1BYTE_KIND) {
12557
227k
            void *to = PyUnicode_DATA(u);
12558
227k
            memset(to, (unsigned char)fill_char, len);
12559
227k
        }
12560
0
        else if (kind == PyUnicode_2BYTE_KIND) {
12561
0
            Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
12562
0
            for (n = 0; n < len; ++n)
12563
0
                ucs2[n] = fill_char;
12564
0
        } else {
12565
0
            Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12566
0
            assert(kind == PyUnicode_4BYTE_KIND);
12567
0
            for (n = 0; n < len; ++n)
12568
0
                ucs4[n] = fill_char;
12569
0
        }
12570
227k
    }
12571
2.06k
    else {
12572
2.06k
        Py_ssize_t char_size = PyUnicode_KIND(str);
12573
2.06k
        char *to = (char *) PyUnicode_DATA(u);
12574
2.06k
        _PyBytes_Repeat(to, nchars * char_size, PyUnicode_DATA(str),
12575
2.06k
            PyUnicode_GET_LENGTH(str) * char_size);
12576
2.06k
    }
12577
12578
229k
    assert(_PyUnicode_CheckConsistency(u, 1));
12579
229k
    return u;
12580
229k
}
12581
12582
PyObject *
12583
PyUnicode_Replace(PyObject *str,
12584
                  PyObject *substr,
12585
                  PyObject *replstr,
12586
                  Py_ssize_t maxcount)
12587
2
{
12588
2
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12589
2
            ensure_unicode(replstr) < 0)
12590
0
        return NULL;
12591
2
    return replace(str, substr, replstr, maxcount);
12592
2
}
12593
12594
/*[clinic input]
12595
@permit_long_docstring_body
12596
str.replace as unicode_replace
12597
12598
    old: unicode
12599
    new: unicode
12600
    /
12601
    count: Py_ssize_t = -1
12602
        Maximum number of occurrences to replace.
12603
        -1 (the default value) means replace all occurrences.
12604
12605
Return a copy with all occurrences of substring old replaced by new.
12606
12607
If the optional argument count is given, only the first count occurrences are
12608
replaced.
12609
[clinic start generated code]*/
12610
12611
static PyObject *
12612
unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12613
                     Py_ssize_t count)
12614
/*[clinic end generated code: output=b63f1a8b5eebf448 input=f27ca92ac46b65a1]*/
12615
97.3M
{
12616
97.3M
    return replace(self, old, new, count);
12617
97.3M
}
12618
12619
/*[clinic input]
12620
@permit_long_docstring_body
12621
str.removeprefix as unicode_removeprefix
12622
12623
    prefix: unicode
12624
    /
12625
12626
Return a str with the given prefix string removed if present.
12627
12628
If the string starts with the prefix string, return string[len(prefix):].
12629
Otherwise, return a copy of the original string.
12630
[clinic start generated code]*/
12631
12632
static PyObject *
unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
/*[clinic end generated code: output=f1e5945e9763bcb9 input=1989a856dbb813f1]*/
{
    /* tailmatch() is called over the whole string with direction -1;
       a result of -1 means failure. */
    const int found = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);
    if (found == -1) {
        return NULL;
    }
    if (!found) {
        return unicode_result_unchanged(self);
    }

    /* Matched: keep everything after the prefix. */
    const Py_ssize_t skip = PyUnicode_GET_LENGTH(prefix);
    const Py_ssize_t total = PyUnicode_GET_LENGTH(self);
    return PyUnicode_Substring(self, skip, total);
}
12646
12647
/*[clinic input]
12648
str.removesuffix as unicode_removesuffix
12649
12650
    suffix: unicode
12651
    /
12652
12653
Return a str with the given suffix string removed if present.
12654
12655
If the string ends with the suffix string and that suffix is not empty,
12656
return string[:-len(suffix)]. Otherwise, return a copy of the original
12657
string.
12658
[clinic start generated code]*/
12659
12660
static PyObject *
unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
/*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
{
    /* tailmatch() is called over the whole string with direction +1;
       a result of -1 means failure. */
    const int found = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);
    if (found == -1) {
        return NULL;
    }
    if (!found) {
        return unicode_result_unchanged(self);
    }

    /* Matched: keep everything before the suffix. */
    const Py_ssize_t keep = PyUnicode_GET_LENGTH(self)
                            - PyUnicode_GET_LENGTH(suffix);
    return PyUnicode_Substring(self, 0, keep);
}
12674
12675
/* Build the repr() of a str object.

   A first pass over the input computes the output length, counts single
   and double quotes, and tracks the largest printable non-ASCII character
   so the result can be allocated in one piece.  If nothing needs escaping
   the text is copied between two quote characters; otherwise the escaped
   text is written by the ucs1lib/ucs2lib/ucs4lib _repr helper that matches
   the kind of the result.

   Returns a new str object.  On failure returns NULL with an exception
   set: OverflowError when the output length does not fit in Py_ssize_t,
   or whatever PyUnicode_New() raised. */
static PyObject *
unicode_repr(PyObject *unicode)
{
    Py_ssize_t isize = PyUnicode_GET_LENGTH(unicode);
    const void *idata = PyUnicode_DATA(unicode);

    /* Compute length of output, quote characters, and
       maximum character */
    Py_ssize_t osize = 0;
    Py_UCS4 maxch = 127;
    Py_ssize_t squote = 0;
    Py_ssize_t dquote = 0;
    int ikind = PyUnicode_KIND(unicode);
    for (Py_ssize_t i = 0; i < isize; i++) {
        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
        Py_ssize_t incr = 1;
        switch (ch) {
        case '\'': squote++; break;
        case '"':  dquote++; break;
        case '\\': case '\t': case '\r': case '\n':
            incr = 2;
            break;
        default:
            /* Fast-path ASCII */
            if (ch < ' ' || ch == 0x7f)
                incr = 4; /* \xHH */
            else if (ch < 0x7f)
                ;
            else if (Py_UNICODE_ISPRINTABLE(ch))
                maxch = (ch > maxch) ? ch : maxch;
            else if (ch < 0x100)
                incr = 4; /* \xHH */
            else if (ch < 0x10000)
                incr = 6; /* \uHHHH */
            else
                incr = 10; /* \UHHHHHHHH */
        }
        if (osize > PY_SSIZE_T_MAX - incr) {
            goto too_long;
        }
        osize += incr;
    }

    Py_UCS4 quote = '\'';
    int changed = (osize != isize);
    Py_ssize_t escaped_quotes = 0;
    if (squote) {
        changed = 1;
        if (dquote) {
            /* Both squote and dquote present. Use squote,
               and escape them: one extra backslash per single quote. */
            escaped_quotes = squote;
        }
        else {
            quote = '"';
        }
    }

    /* Add the quote escapes and the two delimiting quotes, guarding the
       signed addition.  The loop leaves osize <= PY_SSIZE_T_MAX, so the
       right-hand side is at least -2 and cannot overflow itself. */
    if (escaped_quotes > PY_SSIZE_T_MAX - 2 - osize) {
        goto too_long;
    }
    osize += escaped_quotes + 2;

    PyObject *repr = PyUnicode_New(osize, maxch);
    if (repr == NULL)
        return NULL;
    int okind = PyUnicode_KIND(repr);
    void *odata = PyUnicode_DATA(repr);

    if (!changed) {
        /* Nothing to escape: quote + verbatim copy + quote. */
        PyUnicode_WRITE(okind, odata, 0, quote);

        _PyUnicode_FastCopyCharacters(repr, 1,
                                      unicode, 0,
                                      isize);

        PyUnicode_WRITE(okind, odata, osize-1, quote);
    }
    else {
        switch (okind) {
        case PyUnicode_1BYTE_KIND:
            ucs1lib_repr(unicode, quote, odata);
            break;
        case PyUnicode_2BYTE_KIND:
            ucs2lib_repr(unicode, quote, odata);
            break;
        default:
            assert(okind == PyUnicode_4BYTE_KIND);
            ucs4lib_repr(unicode, quote, odata);
        }
    }

    assert(_PyUnicode_CheckConsistency(repr, 1));
    return repr;

too_long:
    PyErr_SetString(PyExc_OverflowError,
                    "string is too long to generate repr");
    return NULL;
}
12765
12766
/*[clinic input]
12767
@permit_long_summary
12768
str.rfind as unicode_rfind = str.count
12769
12770
Return the highest index in S where substring sub is found, such that sub is contained within S[start:end].
12771
12772
Optional arguments start and end are interpreted as in slice notation.
12773
Return -1 on failure.
12774
[clinic start generated code]*/
12775
12776
static Py_ssize_t
12777
unicode_rfind_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
12778
                   Py_ssize_t end)
12779
/*[clinic end generated code: output=880b29f01dd014c8 input=7f7e97d5cd3299a2]*/
12780
9.85k
{
12781
9.85k
    Py_ssize_t result = any_find_slice(str, substr, start, end, -1);
12782
9.85k
    if (result < 0) {
12783
6.55k
        return -1;
12784
6.55k
    }
12785
3.30k
    return result;
12786
9.85k
}
12787
12788
/*[clinic input]
12789
@permit_long_summary
12790
str.rindex as unicode_rindex = str.count
12791
12792
Return the highest index in S where substring sub is found, such that sub is contained within S[start:end].
12793
12794
Optional arguments start and end are interpreted as in slice notation.
12795
Raises ValueError when the substring is not found.
12796
[clinic start generated code]*/
12797
12798
static Py_ssize_t
12799
unicode_rindex_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
12800
                    Py_ssize_t end)
12801
/*[clinic end generated code: output=5f3aef124c867fe1 input=0363a324740b3e62]*/
12802
110k
{
12803
110k
    Py_ssize_t result = any_find_slice(str, substr, start, end, -1);
12804
110k
    if (result == -1) {
12805
0
        PyErr_SetString(PyExc_ValueError, "substring not found");
12806
0
    }
12807
110k
    else if (result < 0) {
12808
0
        return -1;
12809
0
    }
12810
110k
    return result;
12811
110k
}
12812
12813
/*[clinic input]
12814
str.rjust as unicode_rjust
12815
12816
    width: Py_ssize_t
12817
    fillchar: Py_UCS4 = ' '
12818
    /
12819
12820
Return a right-justified string of length width.
12821
12822
Padding is done using the specified fill character (default is a space).
12823
[clinic start generated code]*/
12824
12825
static PyObject *
12826
unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12827
/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
12828
0
{
12829
0
    if (PyUnicode_GET_LENGTH(self) >= width)
12830
0
        return unicode_result_unchanged(self);
12831
12832
0
    return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
12833
0
}
12834
12835
PyObject *
12836
PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12837
0
{
12838
0
    if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
12839
0
        return NULL;
12840
12841
0
    return split(s, sep, maxsplit);
12842
0
}
12843
12844
/*[clinic input]
12845
@permit_long_summary
12846
str.split as unicode_split
12847
12848
    sep: object = None
12849
        The separator used to split the string.
12850
12851
        When set to None (the default value), will split on any whitespace
12852
        character (including \n \r \t \f and spaces) and will discard
12853
        empty strings from the result.
12854
    maxsplit: Py_ssize_t = -1
12855
        Maximum number of splits.
12856
        -1 (the default value) means no limit.
12857
12858
Return a list of the substrings in the string, using sep as the separator string.
12859
12860
Splitting starts at the front of the string and works to the end.
12861
12862
Note, str.split() is mainly useful for data that has been intentionally
12863
delimited.  With natural text that includes punctuation, consider using
12864
the regular expression module.
12865
12866
[clinic start generated code]*/
12867
12868
static PyObject *
12869
unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12870
/*[clinic end generated code: output=3a65b1db356948dc input=2c1fd08a78e038b8]*/
12871
22.9M
{
12872
22.9M
    if (sep == Py_None)
12873
159k
        return split(self, NULL, maxsplit);
12874
22.7M
    if (PyUnicode_Check(sep))
12875
22.7M
        return split(self, sep, maxsplit);
12876
12877
0
    PyErr_Format(PyExc_TypeError,
12878
0
                 "must be str or None, not %.100s",
12879
0
                 Py_TYPE(sep)->tp_name);
12880
0
    return NULL;
12881
22.7M
}
12882
12883
/* Partition str_obj around sep_obj using the *_partition helper that
   matches the kind of str_obj.

   Both arguments must pass ensure_unicode().  When the separator has a
   larger kind or is longer than the string, the result is the tuple
   (str_obj, "", "") without searching.  Returns NULL on failure. */
PyObject *
PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
{
    if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0) {
        return NULL;
    }

    const int str_kind = PyUnicode_KIND(str_obj);
    const int sep_kind = PyUnicode_KIND(sep_obj);
    const Py_ssize_t str_len = PyUnicode_GET_LENGTH(str_obj);
    const Py_ssize_t sep_len = PyUnicode_GET_LENGTH(sep_obj);

    if (str_kind < sep_kind || str_len < sep_len) {
        PyObject *empty = _PyUnicode_GetEmpty();  // Borrowed reference
        return PyTuple_Pack(3, str_obj, empty, empty);
    }

    const void *str_buf = PyUnicode_DATA(str_obj);
    const void *sep_buf = PyUnicode_DATA(sep_obj);

    /* When the kinds differ, the separator is converted to the string's
       kind; that temporary buffer is freed below. */
    const int converted = (sep_kind != str_kind);
    if (converted) {
        sep_buf = unicode_askind(sep_kind, sep_buf, sep_len, str_kind);
        if (sep_buf == NULL) {
            return NULL;
        }
    }

    PyObject *result;
    if (str_kind == PyUnicode_1BYTE_KIND) {
        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) {
            result = asciilib_partition(str_obj, str_buf, str_len,
                                        sep_obj, sep_buf, sep_len);
        }
        else {
            result = ucs1lib_partition(str_obj, str_buf, str_len,
                                       sep_obj, sep_buf, sep_len);
        }
    }
    else if (str_kind == PyUnicode_2BYTE_KIND) {
        result = ucs2lib_partition(str_obj, str_buf, str_len,
                                   sep_obj, sep_buf, sep_len);
    }
    else if (str_kind == PyUnicode_4BYTE_KIND) {
        result = ucs4lib_partition(str_obj, str_buf, str_len,
                                   sep_obj, sep_buf, sep_len);
    }
    else {
        Py_UNREACHABLE();
    }

    assert((sep_kind == str_kind) == (sep_buf == PyUnicode_DATA(sep_obj)));
    if (converted) {
        PyMem_Free((void *)sep_buf);
    }

    return result;
}
12933
12934
12935
/* Partition str_obj around sep_obj using the *_rpartition helper that
   matches the kind of str_obj.

   Both arguments must pass ensure_unicode().  When the separator has a
   larger kind or is longer than the string, the result is the tuple
   ("", "", str_obj) without searching.  Returns NULL on failure. */
PyObject *
PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
{
    if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0) {
        return NULL;
    }

    const int str_kind = PyUnicode_KIND(str_obj);
    const int sep_kind = PyUnicode_KIND(sep_obj);
    const Py_ssize_t str_len = PyUnicode_GET_LENGTH(str_obj);
    const Py_ssize_t sep_len = PyUnicode_GET_LENGTH(sep_obj);

    if (str_kind < sep_kind || str_len < sep_len) {
        PyObject *empty = _PyUnicode_GetEmpty();  // Borrowed reference
        return PyTuple_Pack(3, empty, empty, str_obj);
    }

    const void *str_buf = PyUnicode_DATA(str_obj);
    const void *sep_buf = PyUnicode_DATA(sep_obj);

    /* When the kinds differ, the separator is converted to the string's
       kind; that temporary buffer is freed below. */
    const int converted = (sep_kind != str_kind);
    if (converted) {
        sep_buf = unicode_askind(sep_kind, sep_buf, sep_len, str_kind);
        if (sep_buf == NULL) {
            return NULL;
        }
    }

    PyObject *result;
    if (str_kind == PyUnicode_1BYTE_KIND) {
        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) {
            result = asciilib_rpartition(str_obj, str_buf, str_len,
                                         sep_obj, sep_buf, sep_len);
        }
        else {
            result = ucs1lib_rpartition(str_obj, str_buf, str_len,
                                        sep_obj, sep_buf, sep_len);
        }
    }
    else if (str_kind == PyUnicode_2BYTE_KIND) {
        result = ucs2lib_rpartition(str_obj, str_buf, str_len,
                                    sep_obj, sep_buf, sep_len);
    }
    else if (str_kind == PyUnicode_4BYTE_KIND) {
        result = ucs4lib_rpartition(str_obj, str_buf, str_len,
                                    sep_obj, sep_buf, sep_len);
    }
    else {
        Py_UNREACHABLE();
    }

    assert((sep_kind == str_kind) == (sep_buf == PyUnicode_DATA(sep_obj)));
    if (converted) {
        PyMem_Free((void *)sep_buf);
    }

    return result;
}
12985
12986
/*[clinic input]
12987
@permit_long_docstring_body
12988
str.partition as unicode_partition
12989
12990
    sep: object
12991
    /
12992
12993
Partition the string into three parts using the given separator.
12994
12995
This will search for the separator in the string.  If the separator is found,
12996
returns a 3-tuple containing the part before the separator, the separator
12997
itself, and the part after it.
12998
12999
If the separator is not found, returns a 3-tuple containing the original string
13000
and two empty strings.
13001
[clinic start generated code]*/
13002
13003
static PyObject *
unicode_partition(PyObject *self, PyObject *sep)
/*[clinic end generated code: output=e4ced7bd253ca3c4 input=4d854b520d7b0e97]*/
{
    /* Method entry point for str.partition(): the work is delegated
       unchanged to PyUnicode_Partition(), whose result (or NULL) is
       handed straight back to the caller. */
    PyObject *parts = PyUnicode_Partition(self, sep);
    return parts;
}
13009
13010
/*[clinic input]
13011
@permit_long_docstring_body
13012
str.rpartition as unicode_rpartition = str.partition
13013
13014
Partition the string into three parts using the given separator.
13015
13016
This will search for the separator in the string, starting at the end. If
13017
the separator is found, returns a 3-tuple containing the part before the
13018
separator, the separator itself, and the part after it.
13019
13020
If the separator is not found, returns a 3-tuple containing two empty strings
13021
and the original string.
13022
[clinic start generated code]*/
13023
13024
static PyObject *
unicode_rpartition(PyObject *self, PyObject *sep)
/*[clinic end generated code: output=1aa13cf1156572aa input=a6adabe91e75b486]*/
{
    /* Method entry point for str.rpartition(): the work is delegated
       unchanged to PyUnicode_RPartition(), whose result (or NULL) is
       handed straight back to the caller. */
    PyObject *parts = PyUnicode_RPartition(self, sep);
    return parts;
}
13030
13031
PyObject *
13032
PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
13033
0
{
13034
0
    if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
13035
0
        return NULL;
13036
13037
0
    return rsplit(s, sep, maxsplit);
13038
0
}
13039
13040
/*[clinic input]
13041
@permit_long_summary
13042
str.rsplit as unicode_rsplit = str.split
13043
13044
Return a list of the substrings in the string, using sep as the separator string.
13045
13046
Splitting starts at the end of the string and works to the front.
13047
[clinic start generated code]*/
13048
13049
static PyObject *
13050
unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13051
/*[clinic end generated code: output=c2b815c63bcabffc input=0f762e30d267fa83]*/
13052
50
{
13053
50
    if (sep == Py_None)
13054
0
        return rsplit(self, NULL, maxsplit);
13055
50
    if (PyUnicode_Check(sep))
13056
50
        return rsplit(self, sep, maxsplit);
13057
13058
0
    PyErr_Format(PyExc_TypeError,
13059
0
                 "must be str or None, not %.100s",
13060
0
                 Py_TYPE(sep)->tp_name);
13061
0
    return NULL;
13062
50
}
13063
13064
/*[clinic input]
13065
@permit_long_docstring_body
13066
str.splitlines as unicode_splitlines
13067
13068
    keepends: bool = False
13069
13070
Return a list of the lines in the string, breaking at line boundaries.
13071
13072
Line breaks are not included in the resulting list unless keepends is given and
13073
true.
13074
[clinic start generated code]*/
13075
13076
static PyObject *
unicode_splitlines_impl(PyObject *self, int keepends)
/*[clinic end generated code: output=f664dcdad153ec40 input=39eeafbfef61c827]*/
{
    /* Method entry point for str.splitlines(): forwards both arguments to
       PyUnicode_Splitlines() and returns its result as-is. */
    PyObject *lines = PyUnicode_Splitlines(self, keepends);
    return lines;
}
13082
13083
static
13084
PyObject *unicode_str(PyObject *self)
13085
3.19M
{
13086
3.19M
    return unicode_result_unchanged(self);
13087
3.19M
}
13088
13089
/*[clinic input]
13090
@permit_long_summary
13091
str.swapcase as unicode_swapcase
13092
13093
Convert uppercase characters to lowercase and lowercase characters to uppercase.
13094
[clinic start generated code]*/
13095
13096
static PyObject *
unicode_swapcase_impl(PyObject *self)
/*[clinic end generated code: output=5d28966bf6d7b2af input=85bc39a9b4e8ee91]*/
{
    /* Method entry point for str.swapcase(): runs the shared
       case_operation() driver with do_swapcase as the per-string
       conversion callback. */
    PyObject *swapped = case_operation(self, do_swapcase);
    return swapped;
}
13102
13103
/*[clinic input]
13104
13105
@staticmethod
13106
str.maketrans as unicode_maketrans
13107
13108
  x: object
13109
13110
  y: unicode=NULL
13111
13112
  z: unicode=NULL
13113
13114
  /
13115
13116
Return a translation table usable for str.translate().
13117
13118
If there is only one argument, it must be a dictionary mapping Unicode
13119
ordinals (integers) or characters to Unicode ordinals, strings or None.
13120
Character keys will be then converted to ordinals.
13121
If there are two arguments, they must be strings of equal length, and
13122
in the resulting dictionary, each character in x will be mapped to the
13123
character at the same position in y. If there is a third argument, it
13124
must be a string, whose characters will be mapped to None in the result.
13125
[clinic start generated code]*/
13126
13127
static PyObject *
unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
{
    /* Build and return a new dict usable as a translation table.

       One-argument form (y == NULL): x must be an exact dict; its entries
       are copied, with length-1 string keys replaced by their ordinal.
       Two/three-argument form: x and y must be equal-length strings and
       each character of x is mapped to the character of y at the same
       index; every character of z (when given) is mapped to None.

       Returns NULL with an exception set on any failure. */
    PyObject *table = PyDict_New();
    if (table == NULL) {
        return NULL;
    }

    if (y == NULL) {
        /* x must be a dict */
        if (!PyDict_CheckExact(x)) {
            PyErr_SetString(PyExc_TypeError, "if you give only one argument "
                            "to maketrans it must be a dict");
            goto fail;
        }

        /* Walk x; `src_key` and `src_value` are handed out by
           PyDict_Next() and are never released here. */
        Py_ssize_t pos = 0;
        PyObject *src_key, *src_value;
        while (PyDict_Next(x, &pos, &src_key, &src_value)) {
            if (PyUnicode_Check(src_key)) {
                /* string key: store it under its code point instead */
                if (PyUnicode_GET_LENGTH(src_key) != 1) {
                    PyErr_SetString(PyExc_ValueError, "string keys in translate "
                                    "table must be of length 1");
                    goto fail;
                }
                int key_kind = PyUnicode_KIND(src_key);
                const void *key_data = PyUnicode_DATA(src_key);
                PyObject *ordinal =
                    PyLong_FromLong(PyUnicode_READ(key_kind, key_data, 0));
                if (ordinal == NULL) {
                    goto fail;
                }
                int status = PyDict_SetItem(table, ordinal, src_value);
                Py_DECREF(ordinal);
                if (status < 0) {
                    goto fail;
                }
            }
            else if (PyLong_Check(src_key)) {
                /* integer key: copied over untouched */
                if (PyDict_SetItem(table, src_key, src_value) < 0) {
                    goto fail;
                }
            }
            else {
                PyErr_SetString(PyExc_TypeError, "keys in translate table must "
                                "be strings or integers");
                goto fail;
            }
        }
    }
    else {
        /* x must be a string too, of equal length */
        if (!PyUnicode_Check(x)) {
            PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
                            "be a string if there is a second argument");
            goto fail;
        }
        if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
            PyErr_SetString(PyExc_ValueError, "the first two maketrans "
                            "arguments must have equal length");
            goto fail;
        }

        /* x[idx] -> y[idx], both stored as integer ordinals */
        int from_kind = PyUnicode_KIND(x);
        int to_kind = PyUnicode_KIND(y);
        const void *from_data = PyUnicode_DATA(x);
        const void *to_data = PyUnicode_DATA(y);
        Py_ssize_t idx;
        for (idx = 0; idx < PyUnicode_GET_LENGTH(x); idx++) {
            PyObject *from_ord =
                PyLong_FromLong(PyUnicode_READ(from_kind, from_data, idx));
            if (from_ord == NULL) {
                goto fail;
            }
            PyObject *to_ord =
                PyLong_FromLong(PyUnicode_READ(to_kind, to_data, idx));
            if (to_ord == NULL) {
                Py_DECREF(from_ord);
                goto fail;
            }
            int status = PyDict_SetItem(table, from_ord, to_ord);
            Py_DECREF(from_ord);
            Py_DECREF(to_ord);
            if (status < 0) {
                goto fail;
            }
        }

        /* z[idx] -> None (deletion entries) */
        if (z != NULL) {
            int del_kind = PyUnicode_KIND(z);
            const void *del_data = PyUnicode_DATA(z);
            for (idx = 0; idx < PyUnicode_GET_LENGTH(z); idx++) {
                PyObject *del_ord =
                    PyLong_FromLong(PyUnicode_READ(del_kind, del_data, idx));
                if (del_ord == NULL) {
                    goto fail;
                }
                int status = PyDict_SetItem(table, del_ord, Py_None);
                Py_DECREF(del_ord);
                if (status < 0) {
                    goto fail;
                }
            }
        }
    }
    return table;

  fail:
    /* Drop the partially filled table; the exception is already set. */
    Py_DECREF(table);
    return NULL;
}
13232
13233
/*[clinic input]
13234
@permit_long_docstring_body
13235
str.translate as unicode_translate
13236
13237
    table: object
13238
        Translation table, which must be a mapping of Unicode ordinals to
13239
        Unicode ordinals, strings, or None.
13240
    /
13241
13242
Replace each character in the string using the given translation table.
13243
13244
The table must implement lookup/indexing via __getitem__, for instance a
13245
dictionary or list.  If this operation raises LookupError, the character is
13246
left untouched.  Characters mapped to None are deleted.
13247
[clinic start generated code]*/
13248
13249
static PyObject *
unicode_translate(PyObject *self, PyObject *table)
/*[clinic end generated code: output=3cb448ff2fd96bf3 input=699e5fa0ebf9f5e9]*/
{
    /* Method entry point for str.translate(): delegates to
       _PyUnicode_TranslateCharmap(), always passing "ignore" as the
       error-handler name. */
    PyObject *translated = _PyUnicode_TranslateCharmap(self, table, "ignore");
    return translated;
}
13255
13256
/*[clinic input]
13257
str.upper as unicode_upper
13258
13259
Return a copy of the string converted to uppercase.
13260
[clinic start generated code]*/
13261
13262
static PyObject *
unicode_upper_impl(PyObject *self)
/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
{
    /* Non-ASCII strings go through the general case_operation() driver
       with do_upper; ASCII strings use ascii_upper_or_lower(), called
       with 0 as its second argument. */
    if (!PyUnicode_IS_ASCII(self)) {
        return case_operation(self, do_upper);
    }
    return ascii_upper_or_lower(self, 0);
}
13270
13271
/*[clinic input]
13272
@permit_long_summary
13273
str.zfill as unicode_zfill
13274
13275
    width: Py_ssize_t
13276
    /
13277
13278
Pad a numeric string with zeros on the left, to fill a field of the given width.
13279
13280
The string is never truncated.
13281
[clinic start generated code]*/
13282
13283
static PyObject *
13284
unicode_zfill_impl(PyObject *self, Py_ssize_t width)
13285
/*[clinic end generated code: output=e13fb6bdf8e3b9df input=25a4ee0ea3e58ce0]*/
13286
0
{
13287
0
    Py_ssize_t fill;
13288
0
    PyObject *u;
13289
0
    int kind;
13290
0
    const void *data;
13291
0
    Py_UCS4 chr;
13292
13293
0
    if (PyUnicode_GET_LENGTH(self) >= width)
13294
0
        return unicode_result_unchanged(self);
13295
13296
0
    fill = width - PyUnicode_GET_LENGTH(self);
13297
13298
0
    u = pad(self, fill, 0, '0');
13299
13300
0
    if (u == NULL)
13301
0
        return NULL;
13302
13303
0
    kind = PyUnicode_KIND(u);
13304
0
    data = PyUnicode_DATA(u);
13305
0
    chr = PyUnicode_READ(kind, data, fill);
13306
13307
0
    if (chr == '+' || chr == '-') {
13308
        /* move sign to beginning of string */
13309
0
        PyUnicode_WRITE(kind, data, 0, chr);
13310
0
        PyUnicode_WRITE(kind, data, fill, '0');
13311
0
    }
13312
13313
0
    assert(_PyUnicode_CheckConsistency(u, 1));
13314
0
    return u;
13315
0
}
13316
13317
/*[clinic input]
13318
@permit_long_summary
13319
@text_signature "($self, prefix[, start[, end]], /)"
13320
str.startswith as unicode_startswith
13321
13322
    prefix as subobj: object
13323
        A string or a tuple of strings to try.
13324
    start: slice_index(accept={int, NoneType}, c_default='0') = None
13325
        Optional start position. Default: start of the string.
13326
    end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
13327
        Optional stop position. Default: end of the string.
13328
    /
13329
13330
Return True if the string starts with the specified prefix, False otherwise.
13331
[clinic start generated code]*/
13332
13333
static PyObject *
13334
unicode_startswith_impl(PyObject *self, PyObject *subobj, Py_ssize_t start,
13335
                        Py_ssize_t end)
13336
/*[clinic end generated code: output=4bd7cfd0803051d4 input=766bdbd33df251dc]*/
13337
85.5M
{
13338
85.5M
    if (PyTuple_Check(subobj)) {
13339
11.5M
        Py_ssize_t i;
13340
41.9M
        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13341
30.4M
            PyObject *substring = PyTuple_GET_ITEM(subobj, i);
13342
30.4M
            if (!PyUnicode_Check(substring)) {
13343
0
                PyErr_Format(PyExc_TypeError,
13344
0
                             "tuple for startswith must only contain str, "
13345
0
                             "not %.100s",
13346
0
                             Py_TYPE(substring)->tp_name);
13347
0
                return NULL;
13348
0
            }
13349
30.4M
            int result = tailmatch(self, substring, start, end, -1);
13350
30.4M
            if (result < 0) {
13351
0
                return NULL;
13352
0
            }
13353
30.4M
            if (result) {
13354
41.0k
                Py_RETURN_TRUE;
13355
41.0k
            }
13356
30.4M
        }
13357
        /* nothing matched */
13358
11.5M
        Py_RETURN_FALSE;
13359
11.5M
    }
13360
73.9M
    if (!PyUnicode_Check(subobj)) {
13361
0
        PyErr_Format(PyExc_TypeError,
13362
0
                     "startswith first arg must be str or "
13363
0
                     "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13364
0
        return NULL;
13365
0
    }
13366
73.9M
    int result = tailmatch(self, subobj, start, end, -1);
13367
73.9M
    if (result < 0) {
13368
0
        return NULL;
13369
0
    }
13370
73.9M
    return PyBool_FromLong(result);
13371
73.9M
}
13372
13373
13374
/*[clinic input]
13375
@permit_long_summary
13376
@text_signature "($self, suffix[, start[, end]], /)"
13377
str.endswith as unicode_endswith
13378
13379
    suffix as subobj: object
13380
        A string or a tuple of strings to try.
13381
    start: slice_index(accept={int, NoneType}, c_default='0') = None
13382
        Optional start position. Default: start of the string.
13383
    end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
13384
        Optional stop position. Default: end of the string.
13385
    /
13386
13387
Return True if the string ends with the specified suffix, False otherwise.
13388
[clinic start generated code]*/
13389
13390
static PyObject *
13391
unicode_endswith_impl(PyObject *self, PyObject *subobj, Py_ssize_t start,
13392
                      Py_ssize_t end)
13393
/*[clinic end generated code: output=cce6f8ceb0102ca9 input=b66bf6d5547ba1aa]*/
13394
13.7M
{
13395
13.7M
    if (PyTuple_Check(subobj)) {
13396
190k
        Py_ssize_t i;
13397
364k
        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13398
336k
            PyObject *substring = PyTuple_GET_ITEM(subobj, i);
13399
336k
            if (!PyUnicode_Check(substring)) {
13400
0
                PyErr_Format(PyExc_TypeError,
13401
0
                             "tuple for endswith must only contain str, "
13402
0
                             "not %.100s",
13403
0
                             Py_TYPE(substring)->tp_name);
13404
0
                return NULL;
13405
0
            }
13406
336k
            int result = tailmatch(self, substring, start, end, +1);
13407
336k
            if (result < 0) {
13408
0
                return NULL;
13409
0
            }
13410
336k
            if (result) {
13411
162k
                Py_RETURN_TRUE;
13412
162k
            }
13413
336k
        }
13414
190k
        Py_RETURN_FALSE;
13415
190k
    }
13416
13.5M
    if (!PyUnicode_Check(subobj)) {
13417
0
        PyErr_Format(PyExc_TypeError,
13418
0
                     "endswith first arg must be str or "
13419
0
                     "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13420
0
        return NULL;
13421
0
    }
13422
13.5M
    int result = tailmatch(self, subobj, start, end, +1);
13423
13.5M
    if (result < 0) {
13424
0
        return NULL;
13425
0
    }
13426
13.5M
    return PyBool_FromLong(result);
13427
13.5M
}
13428
13429
13430
#include "stringlib/unicode_format.h"
13431
13432
PyDoc_STRVAR(format__doc__,
13433
             "format($self, /, *args, **kwargs)\n\
13434
--\n\
13435
\n\
13436
Return a formatted version of the string, using substitutions from args and kwargs.\n\
13437
The substitutions are identified by braces ('{' and '}').");
13438
13439
PyDoc_STRVAR(format_map__doc__,
13440
             "format_map($self, mapping, /)\n\
13441
--\n\
13442
\n\
13443
Return a formatted version of the string, using substitutions from mapping.\n\
13444
The substitutions are identified by braces ('{' and '}').");
13445
13446
/*[clinic input]
13447
str.__format__ as unicode___format__
13448
13449
    format_spec: unicode
13450
    /
13451
13452
Return a formatted version of the string as described by format_spec.
13453
[clinic start generated code]*/
13454
13455
static PyObject *
unicode___format___impl(PyObject *self, PyObject *format_spec)
/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
{
    /* Format `self` into a fresh writer using the whole of `format_spec`
       (index 0 up to its length), then turn the writer into the result
       string.  On a -1 status the writer is released and NULL returned. */
    _PyUnicodeWriter writer;
    _PyUnicodeWriter_Init(&writer);

    const Py_ssize_t spec_end = PyUnicode_GET_LENGTH(format_spec);
    int status = _PyUnicode_FormatAdvancedWriter(&writer, self, format_spec,
                                                 0, spec_end);
    if (status == -1) {
        _PyUnicodeWriter_Dealloc(&writer);
        return NULL;
    }
    return _PyUnicodeWriter_Finish(&writer);
}
13472
13473
/*[clinic input]
13474
str.__sizeof__ as unicode_sizeof
13475
13476
Return the size of the string in memory, in bytes.
13477
[clinic start generated code]*/
13478
13479
static PyObject *
unicode_sizeof_impl(PyObject *self)
/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
{
    /* Character slots charged for the string's data: its length plus one
       extra slot. */
    const Py_ssize_t slots = PyUnicode_GET_LENGTH(self) + 1;
    Py_ssize_t size;

    if (PyUnicode_IS_COMPACT_ASCII(self)) {
        /* compact ASCII: base struct followed by one byte per slot */
        size = sizeof(PyASCIIObject) + slots;
    }
    else if (PyUnicode_IS_COMPACT(self)) {
        /* other compact strings: base struct plus slots scaled by the
           string's kind */
        size = sizeof(PyCompactUnicodeObject) + slots * PyUnicode_KIND(self);
    }
    else {
        /* two-block object: the base object, and the character block
           only if there is one */
        size = sizeof(PyUnicodeObject);
        if (_PyUnicode_DATA_ANY(self)) {
            size += slots * PyUnicode_KIND(self);
        }
    }

    /* Add the UTF-8 buffer (its length plus one) when
       _PyUnicode_HAS_UTF8_MEMORY() says the object has one. */
    if (_PyUnicode_HAS_UTF8_MEMORY(self)) {
        size += PyUnicode_UTF8_LENGTH(self) + 1;
    }

    return PyLong_FromSsize_t(size);
}
13507
13508
/* __getnewargs__ implementation: returns a 1-tuple holding the result of
   _PyUnicode_Copy(v), or NULL if the copy could not be made.  The "(N)"
   format hands the copy's reference over to the tuple. */
static PyObject *
unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
{
    PyObject *duplicate = _PyUnicode_Copy(v);
    if (duplicate == NULL) {
        return NULL;
    }
    return Py_BuildValue("(N)", duplicate);
}
13516
13517
/*
13518
This function searchs the longest common leading whitespace
13519
of all lines in the [src, end).
13520
It returns the length of the common leading whitespace and sets `output` to
13521
point to the beginning of the common leading whitespace if length > 0.
13522
*/
13523
static Py_ssize_t
13524
search_longest_common_leading_whitespace(
13525
    const char *const src,
13526
    const char *const end,
13527
    const char **output)
13528
0
{
13529
    // [_start, _start + _len)
13530
    // describes the current longest common leading whitespace
13531
0
    const char *_start = NULL;
13532
0
    Py_ssize_t _len = 0;
13533
13534
0
    for (const char *iter = src; iter < end; ++iter) {
13535
0
        const char *line_start = iter;
13536
0
        const char *leading_whitespace_end = NULL;
13537
13538
        // scan the whole line
13539
0
        while (iter < end && *iter != '\n') {
13540
0
            if (!leading_whitespace_end && *iter != ' ' && *iter != '\t') {
13541
                /* `iter` points to the first non-whitespace character
13542
                   in this line */
13543
0
                if (iter == line_start) {
13544
                    // some line has no indent, fast exit!
13545
0
                    return 0;
13546
0
                }
13547
0
                leading_whitespace_end = iter;
13548
0
            }
13549
0
            ++iter;
13550
0
        }
13551
13552
        // if this line has all white space, skip it
13553
0
        if (!leading_whitespace_end) {
13554
0
            continue;
13555
0
        }
13556
13557
0
        if (!_start) {
13558
            // update the first leading whitespace
13559
0
            _start = line_start;
13560
0
            _len = leading_whitespace_end - line_start;
13561
0
            assert(_len > 0);
13562
0
        }
13563
0
        else {
13564
            /* We then compare with the current longest leading whitespace.
13565
13566
               [line_start, leading_whitespace_end) is the leading
13567
               whitespace of this line,
13568
13569
               [_start, _start + _len) is the leading whitespace of the
13570
               current longest leading whitespace. */
13571
0
            Py_ssize_t new_len = 0;
13572
0
            const char *_iter = _start, *line_iter = line_start;
13573
13574
0
            while (_iter < _start + _len && line_iter < leading_whitespace_end
13575
0
                   && *_iter == *line_iter)
13576
0
            {
13577
0
                ++_iter;
13578
0
                ++line_iter;
13579
0
                ++new_len;
13580
0
            }
13581
13582
0
            _len = new_len;
13583
0
            if (_len == 0) {
13584
                // No common things now, fast exit!
13585
0
                return 0;
13586
0
            }
13587
0
        }
13588
0
    }
13589
13590
0
    assert(_len >= 0);
13591
0
    if (_len > 0) {
13592
0
        *output = _start;
13593
0
    }
13594
0
    return _len;
13595
0
}
13596
13597
/* Dedent a string.
13598
   Behaviour is expected to be an exact match of `textwrap.dedent`.
13599
   Return a new reference on success, NULL with exception set on error.
13600
   */
13601
PyObject *
13602
_PyUnicode_Dedent(PyObject *unicode)
13603
0
{
13604
0
    Py_ssize_t src_len = 0;
13605
0
    const char *src = PyUnicode_AsUTF8AndSize(unicode, &src_len);
13606
0
    if (!src) {
13607
0
        return NULL;
13608
0
    }
13609
0
    assert(src_len >= 0);
13610
0
    if (src_len == 0) {
13611
0
        return Py_NewRef(unicode);
13612
0
    }
13613
13614
0
    const char *const end = src + src_len;
13615
13616
    // [whitespace_start, whitespace_start + whitespace_len)
13617
    // describes the current longest common leading whitespace
13618
0
    const char *whitespace_start = NULL;
13619
0
    Py_ssize_t whitespace_len = search_longest_common_leading_whitespace(
13620
0
        src, end, &whitespace_start);
13621
13622
0
    if (whitespace_len == 0) {
13623
0
        return Py_NewRef(unicode);
13624
0
    }
13625
13626
    // now we should trigger a dedent
13627
0
    char *dest = PyMem_Malloc(src_len);
13628
0
    if (!dest) {
13629
0
        PyErr_NoMemory();
13630
0
        return NULL;
13631
0
    }
13632
0
    char *dest_iter = dest;
13633
13634
0
    for (const char *iter = src; iter < end; ++iter) {
13635
0
        const char *line_start = iter;
13636
0
        bool in_leading_space = true;
13637
13638
        // iterate over a line to find the end of a line
13639
0
        while (iter < end && *iter != '\n') {
13640
0
            if (in_leading_space && *iter != ' ' && *iter != '\t') {
13641
0
                in_leading_space = false;
13642
0
            }
13643
0
            ++iter;
13644
0
        }
13645
13646
        // invariant: *iter == '\n' or iter == end
13647
0
        bool append_newline = iter < end;
13648
13649
        // if this line has all white space, write '\n' and continue
13650
0
        if (in_leading_space && append_newline) {
13651
0
            *dest_iter++ = '\n';
13652
0
            continue;
13653
0
        }
13654
13655
        /* copy [new_line_start + whitespace_len, iter) to buffer, then
13656
            conditionally append '\n' */
13657
13658
0
        Py_ssize_t new_line_len = iter - line_start - whitespace_len;
13659
0
        assert(new_line_len >= 0);
13660
0
        memcpy(dest_iter, line_start + whitespace_len, new_line_len);
13661
13662
0
        dest_iter += new_line_len;
13663
13664
0
        if (append_newline) {
13665
0
            *dest_iter++ = '\n';
13666
0
        }
13667
0
    }
13668
13669
0
    PyObject *res = PyUnicode_FromStringAndSize(dest, dest_iter - dest);
13670
0
    PyMem_Free(dest);
13671
0
    return res;
13672
0
}
13673
13674
static PyMethodDef unicode_methods[] = {
13675
    UNICODE_ENCODE_METHODDEF
13676
    UNICODE_REPLACE_METHODDEF
13677
    UNICODE_SPLIT_METHODDEF
13678
    UNICODE_RSPLIT_METHODDEF
13679
    UNICODE_JOIN_METHODDEF
13680
    UNICODE_CAPITALIZE_METHODDEF
13681
    UNICODE_CASEFOLD_METHODDEF
13682
    UNICODE_TITLE_METHODDEF
13683
    UNICODE_CENTER_METHODDEF
13684
    UNICODE_COUNT_METHODDEF
13685
    UNICODE_EXPANDTABS_METHODDEF
13686
    UNICODE_FIND_METHODDEF
13687
    UNICODE_PARTITION_METHODDEF
13688
    UNICODE_INDEX_METHODDEF
13689
    UNICODE_LJUST_METHODDEF
13690
    UNICODE_LOWER_METHODDEF
13691
    UNICODE_LSTRIP_METHODDEF
13692
    UNICODE_RFIND_METHODDEF
13693
    UNICODE_RINDEX_METHODDEF
13694
    UNICODE_RJUST_METHODDEF
13695
    UNICODE_RSTRIP_METHODDEF
13696
    UNICODE_RPARTITION_METHODDEF
13697
    UNICODE_SPLITLINES_METHODDEF
13698
    UNICODE_STRIP_METHODDEF
13699
    UNICODE_SWAPCASE_METHODDEF
13700
    UNICODE_TRANSLATE_METHODDEF
13701
    UNICODE_UPPER_METHODDEF
13702
    UNICODE_STARTSWITH_METHODDEF
13703
    UNICODE_ENDSWITH_METHODDEF
13704
    UNICODE_REMOVEPREFIX_METHODDEF
13705
    UNICODE_REMOVESUFFIX_METHODDEF
13706
    UNICODE_ISASCII_METHODDEF
13707
    UNICODE_ISLOWER_METHODDEF
13708
    UNICODE_ISUPPER_METHODDEF
13709
    UNICODE_ISTITLE_METHODDEF
13710
    UNICODE_ISSPACE_METHODDEF
13711
    UNICODE_ISDECIMAL_METHODDEF
13712
    UNICODE_ISDIGIT_METHODDEF
13713
    UNICODE_ISNUMERIC_METHODDEF
13714
    UNICODE_ISALPHA_METHODDEF
13715
    UNICODE_ISALNUM_METHODDEF
13716
    UNICODE_ISIDENTIFIER_METHODDEF
13717
    UNICODE_ISPRINTABLE_METHODDEF
13718
    UNICODE_ZFILL_METHODDEF
13719
    {"format", _PyCFunction_CAST(do_string_format), METH_VARARGS | METH_KEYWORDS, format__doc__},
13720
    {"format_map", do_string_format_map, METH_O, format_map__doc__},
13721
    UNICODE___FORMAT___METHODDEF
13722
    UNICODE_MAKETRANS_METHODDEF
13723
    UNICODE_SIZEOF_METHODDEF
13724
    {"__getnewargs__",  unicode_getnewargs, METH_NOARGS},
13725
    {NULL, NULL}
13726
};
13727
13728
/* nb_remainder slot (the `%` operator): formats `v` with `w` through
   PyUnicode_Format() when `v` is a str, and answers NotImplemented for
   any other left operand. */
static PyObject *
unicode_mod(PyObject *v, PyObject *w)
{
    if (PyUnicode_Check(v)) {
        return PyUnicode_Format(v, w);
    }
    Py_RETURN_NOTIMPLEMENTED;
}
13735
13736
static PyNumberMethods unicode_as_number = {
13737
    0,              /*nb_add*/
13738
    0,              /*nb_subtract*/
13739
    0,              /*nb_multiply*/
13740
    unicode_mod,            /*nb_remainder*/
13741
};
13742
13743
static PySequenceMethods unicode_as_sequence = {
13744
    unicode_length,     /* sq_length */
13745
    PyUnicode_Concat,   /* sq_concat */
13746
    unicode_repeat,     /* sq_repeat */
13747
    unicode_getitem,    /* sq_item */
13748
    0,                  /* sq_slice */
13749
    0,                  /* sq_ass_item */
13750
    0,                  /* sq_ass_slice */
13751
    PyUnicode_Contains, /* sq_contains */
13752
};
13753
13754
/* mp_subscript slot: implements self[item] for an index-like object or a
   slice.  Any other kind of `item` raises TypeError.  Returns NULL with
   an exception set on failure. */
static PyObject*
unicode_subscript(PyObject* self, PyObject* item)
{
    /* --- integer-like index --- */
    if (_PyIndex_Check(item)) {
        Py_ssize_t index = PyNumber_AsSsize_t(item, PyExc_IndexError);
        if (index == -1 && PyErr_Occurred()) {
            return NULL;
        }
        if (index < 0) {
            /* negative indices are offset by the string length */
            index += PyUnicode_GET_LENGTH(self);
        }
        return unicode_getitem(self, index);
    }

    /* --- neither index nor slice --- */
    if (!PySlice_Check(item)) {
        PyErr_Format(PyExc_TypeError, "string indices must be integers, not '%.200s'",
                     Py_TYPE(item)->tp_name);
        return NULL;
    }

    /* --- slice --- */
    Py_ssize_t start, stop, step;
    if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
        return NULL;
    }
    const Py_ssize_t slicelength =
        PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self), &start, &stop, step);

    if (slicelength <= 0) {
        _Py_RETURN_UNICODE_EMPTY();
    }
    if (step == 1) {
        /* contiguous slices: either the whole string or a substring */
        if (start == 0 && slicelength == PyUnicode_GET_LENGTH(self)) {
            return unicode_result_unchanged(self);
        }
        return PyUnicode_Substring(self, start, start + slicelength);
    }

    /* General case: step != 1.  First determine the maximum character to
       pass to PyUnicode_New(), then copy the selected characters. */
    const int src_kind = PyUnicode_KIND(self);
    const void *src_data = PyUnicode_DATA(self);
    Py_UCS4 max_char;

    if (PyUnicode_IS_ASCII(self)) {
        max_char = 127;
    }
    else {
        /* Scan the selected characters for the largest one; stop early
           once it reaches kind_maxchar_limit() for the source kind. */
        const Py_UCS4 kind_limit = kind_maxchar_limit(src_kind);
        max_char = 0;
        /* The position is kept unsigned, so adding a negative step
           relies on modular arithmetic, as in the copy loop below. */
        size_t pos = start;
        for (Py_ssize_t n = 0; n < slicelength; pos += step, n++) {
            Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, pos);
            if (ch > max_char) {
                max_char = ch;
                if (max_char >= kind_limit) {
                    break;
                }
            }
        }
    }

    PyObject *result = PyUnicode_New(slicelength, max_char);
    if (result == NULL) {
        return NULL;
    }
    const int dest_kind = PyUnicode_KIND(result);
    void *dest_data = PyUnicode_DATA(result);

    size_t pos = start;
    for (Py_ssize_t n = 0; n < slicelength; pos += step, n++) {
        Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, pos);
        PyUnicode_WRITE(dest_kind, dest_data, n, ch);
    }
    assert(_PyUnicode_CheckConsistency(result, 1));
    return result;
}
13823
13824
static PyMappingMethods unicode_as_mapping = {
13825
    unicode_length,     /* mp_length */
13826
    unicode_subscript,  /* mp_subscript */
13827
    0,                  /* mp_ass_subscript */
13828
};
13829
13830
13831
static PyObject *
13832
unicode_subtype_new(PyTypeObject *type, PyObject *unicode);
13833
13834
/*[clinic input]
13835
@classmethod
13836
str.__new__ as unicode_new
13837
13838
    object as x: object = NULL
13839
    encoding: str = NULL
13840
    errors: str = NULL
13841
13842
[clinic start generated code]*/
13843
13844
static PyObject *
13845
unicode_new_impl(PyTypeObject *type, PyObject *x, const char *encoding,
13846
                 const char *errors)
13847
/*[clinic end generated code: output=fc72d4878b0b57e9 input=e81255e5676d174e]*/
13848
10.5M
{
13849
10.5M
    PyObject *unicode;
13850
10.5M
    if (x == NULL) {
13851
0
        unicode = _PyUnicode_GetEmpty();
13852
0
    }
13853
10.5M
    else if (encoding == NULL && errors == NULL) {
13854
10.5M
        unicode = PyObject_Str(x);
13855
10.5M
    }
13856
0
    else {
13857
0
        unicode = PyUnicode_FromEncodedObject(x, encoding, errors);
13858
0
    }
13859
13860
10.5M
    if (unicode != NULL && type != &PyUnicode_Type) {
13861
10.5M
        Py_SETREF(unicode, unicode_subtype_new(type, unicode));
13862
10.5M
    }
13863
10.5M
    return unicode;
13864
10.5M
}
13865
13866
static const char *
13867
arg_as_utf8(PyObject *obj, const char *name)
13868
842k
{
13869
842k
    if (!PyUnicode_Check(obj)) {
13870
0
        PyErr_Format(PyExc_TypeError,
13871
0
                     "str() argument '%s' must be str, not %T",
13872
0
                     name, obj);
13873
0
        return NULL;
13874
0
    }
13875
842k
    return _PyUnicode_AsUTF8NoNUL(obj);
13876
842k
}
13877
13878
/* Vectorcall entry point for str(...).

   Calls with keyword arguments are repackaged into a tuple and dict and
   forwarded to unicode_new().  Purely positional calls (0 to 3 arguments)
   are handled directly:
     str()                          -> empty string
     str(object)                    -> PyObject_Str(object)
     str(object, encoding[, errors]) -> PyUnicode_FromEncodedObject() */
static PyObject *
unicode_vectorcall(PyObject *type, PyObject *const *args,
                   size_t nargsf, PyObject *kwnames)
{
    assert(Py_Is(_PyType_CAST(type), &PyUnicode_Type));

    const Py_ssize_t nargs = PyVectorcall_NARGS(nargsf);

    if (kwnames != NULL && PyTuple_GET_SIZE(kwnames) != 0) {
        // Keywords present: fall back to unicode_new()
        PyObject *posargs = PyTuple_FromArray(args, nargs);
        if (posargs == NULL) {
            return NULL;
        }
        PyObject *kwargs = _PyStack_AsDict(args + nargs, kwnames);
        if (kwargs == NULL) {
            Py_DECREF(posargs);
            return NULL;
        }
        PyObject *result = unicode_new(_PyType_CAST(type), posargs, kwargs);
        Py_DECREF(posargs);
        Py_DECREF(kwargs);
        return result;
    }

    if (!_PyArg_CheckPositional("str", nargs, 0, 3)) {
        return NULL;
    }

    switch (nargs) {
    case 0:
        return _PyUnicode_GetEmpty();
    case 1:
        return PyObject_Str(args[0]);
    default:
        break;
    }

    /* Two or three positional arguments: decode args[0]. */
    PyObject *object = args[0];

    const char *encoding = arg_as_utf8(args[1], "encoding");
    if (encoding == NULL) {
        return NULL;
    }

    const char *errors = NULL;
    if (nargs == 3) {
        errors = arg_as_utf8(args[2], "errors");
        if (errors == NULL) {
            return NULL;
        }
    }

    return PyUnicode_FromEncodedObject(object, encoding, errors);
}
13924
13925
static PyObject *
13926
unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
13927
10.5M
{
13928
10.5M
    PyObject *self;
13929
10.5M
    Py_ssize_t length, char_size;
13930
10.5M
    int share_utf8;
13931
10.5M
    int kind;
13932
10.5M
    void *data;
13933
13934
10.5M
    assert(PyType_IsSubtype(type, &PyUnicode_Type));
13935
10.5M
    assert(_PyUnicode_CHECK(unicode));
13936
13937
10.5M
    self = type->tp_alloc(type, 0);
13938
10.5M
    if (self == NULL) {
13939
0
        return NULL;
13940
0
    }
13941
10.5M
    kind = PyUnicode_KIND(unicode);
13942
10.5M
    length = PyUnicode_GET_LENGTH(unicode);
13943
13944
10.5M
    _PyUnicode_LENGTH(self) = length;
13945
#ifdef Py_DEBUG
13946
    _PyUnicode_HASH(self) = -1;
13947
#else
13948
10.5M
    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13949
10.5M
#endif
13950
10.5M
    _PyUnicode_STATE(self).interned = 0;
13951
10.5M
    _PyUnicode_STATE(self).kind = kind;
13952
10.5M
    _PyUnicode_STATE(self).compact = 0;
13953
10.5M
    _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
13954
10.5M
    _PyUnicode_STATE(self).statically_allocated = 0;
13955
10.5M
    PyUnicode_SET_UTF8_LENGTH(self, 0);
13956
10.5M
    PyUnicode_SET_UTF8(self, NULL);
13957
10.5M
    _PyUnicode_DATA_ANY(self) = NULL;
13958
13959
10.5M
    share_utf8 = 0;
13960
10.5M
    if (kind == PyUnicode_1BYTE_KIND) {
13961
9.36M
        char_size = 1;
13962
9.36M
        if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13963
9.32M
            share_utf8 = 1;
13964
9.36M
    }
13965
1.19M
    else if (kind == PyUnicode_2BYTE_KIND) {
13966
1.14M
        char_size = 2;
13967
1.14M
    }
13968
48.7k
    else {
13969
48.7k
        assert(kind == PyUnicode_4BYTE_KIND);
13970
48.7k
        char_size = 4;
13971
48.7k
    }
13972
13973
    /* Ensure we won't overflow the length. */
13974
10.5M
    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13975
0
        PyErr_NoMemory();
13976
0
        goto onError;
13977
0
    }
13978
10.5M
    data = PyMem_Malloc((length + 1) * char_size);
13979
10.5M
    if (data == NULL) {
13980
0
        PyErr_NoMemory();
13981
0
        goto onError;
13982
0
    }
13983
13984
10.5M
    _PyUnicode_DATA_ANY(self) = data;
13985
10.5M
    if (share_utf8) {
13986
9.32M
        PyUnicode_SET_UTF8_LENGTH(self, length);
13987
9.32M
        PyUnicode_SET_UTF8(self, data);
13988
9.32M
    }
13989
13990
10.5M
    memcpy(data, PyUnicode_DATA(unicode), kind * (length + 1));
13991
10.5M
    assert(_PyUnicode_CheckConsistency(self, 1));
13992
#ifdef Py_DEBUG
13993
    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13994
#endif
13995
10.5M
    return self;
13996
13997
0
onError:
13998
0
    Py_DECREF(self);
13999
0
    return NULL;
14000
10.5M
}
14001
14002
void
14003
_PyUnicode_ExactDealloc(PyObject *op)
14004
87.9M
{
14005
87.9M
    assert(PyUnicode_CheckExact(op));
14006
87.9M
    unicode_dealloc(op);
14007
87.9M
}
14008
14009
PyDoc_STRVAR(unicode_doc,
14010
"str(object='') -> str\n\
14011
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
14012
\n\
14013
Create a new string object from the given object. If encoding or\n\
14014
errors is specified, then the object must expose a data buffer\n\
14015
that will be decoded using the given encoding and error handler.\n\
14016
Otherwise, returns the result of object.__str__() (if defined)\n\
14017
or repr(object).\n\
14018
encoding defaults to 'utf-8'.\n\
14019
errors defaults to 'strict'.");
14020
14021
static PyObject *unicode_iter(PyObject *seq);
14022
14023
PyTypeObject PyUnicode_Type = {
14024
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
14025
    "str",                        /* tp_name */
14026
    sizeof(PyUnicodeObject),      /* tp_basicsize */
14027
    0,                            /* tp_itemsize */
14028
    /* Slots */
14029
    unicode_dealloc,              /* tp_dealloc */
14030
    0,                            /* tp_vectorcall_offset */
14031
    0,                            /* tp_getattr */
14032
    0,                            /* tp_setattr */
14033
    0,                            /* tp_as_async */
14034
    unicode_repr,                 /* tp_repr */
14035
    &unicode_as_number,           /* tp_as_number */
14036
    &unicode_as_sequence,         /* tp_as_sequence */
14037
    &unicode_as_mapping,          /* tp_as_mapping */
14038
    unicode_hash,                 /* tp_hash*/
14039
    0,                            /* tp_call*/
14040
    unicode_str,                  /* tp_str */
14041
    PyObject_GenericGetAttr,      /* tp_getattro */
14042
    0,                            /* tp_setattro */
14043
    0,                            /* tp_as_buffer */
14044
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
14045
        Py_TPFLAGS_UNICODE_SUBCLASS |
14046
        _Py_TPFLAGS_MATCH_SELF, /* tp_flags */
14047
    unicode_doc,                  /* tp_doc */
14048
    0,                            /* tp_traverse */
14049
    0,                            /* tp_clear */
14050
    PyUnicode_RichCompare,        /* tp_richcompare */
14051
    0,                            /* tp_weaklistoffset */
14052
    unicode_iter,                 /* tp_iter */
14053
    0,                            /* tp_iternext */
14054
    unicode_methods,              /* tp_methods */
14055
    0,                            /* tp_members */
14056
    0,                            /* tp_getset */
14057
    0,                            /* tp_base */
14058
    0,                            /* tp_dict */
14059
    0,                            /* tp_descr_get */
14060
    0,                            /* tp_descr_set */
14061
    0,                            /* tp_dictoffset */
14062
    0,                            /* tp_init */
14063
    0,                            /* tp_alloc */
14064
    unicode_new,                  /* tp_new */
14065
    PyObject_Free,                /* tp_free */
14066
    .tp_vectorcall = unicode_vectorcall,
14067
};
14068
14069
/* Initialize the Unicode implementation */
14070
14071
static void
14072
_init_global_state(void)
14073
16
{
14074
16
    static int initialized = 0;
14075
16
    if (initialized) {
14076
0
        return;
14077
0
    }
14078
16
    initialized = 1;
14079
14080
    /* initialize the linebreak bloom filter */
14081
16
    const Py_UCS2 linebreak[] = {
14082
16
        0x000A, /* LINE FEED */
14083
16
        0x000D, /* CARRIAGE RETURN */
14084
16
        0x001C, /* FILE SEPARATOR */
14085
16
        0x001D, /* GROUP SEPARATOR */
14086
16
        0x001E, /* RECORD SEPARATOR */
14087
16
        0x0085, /* NEXT LINE */
14088
16
        0x2028, /* LINE SEPARATOR */
14089
16
        0x2029, /* PARAGRAPH SEPARATOR */
14090
16
    };
14091
16
    bloom_linebreak = make_bloom_mask(
14092
16
        PyUnicode_2BYTE_KIND, linebreak,
14093
16
        Py_ARRAY_LENGTH(linebreak));
14094
16
}
14095
14096
void
14097
_PyUnicode_InitState(PyInterpreterState *interp)
14098
16
{
14099
16
    if (!_Py_IsMainInterpreter(interp)) {
14100
0
        return;
14101
0
    }
14102
16
    _init_global_state();
14103
16
}
14104
14105
14106
PyStatus
14107
_PyUnicode_InitGlobalObjects(PyInterpreterState *interp)
14108
16
{
14109
16
    if (_Py_IsMainInterpreter(interp)) {
14110
16
        PyStatus status = init_global_interned_strings(interp);
14111
16
        if (_PyStatus_EXCEPTION(status)) {
14112
0
            return status;
14113
0
        }
14114
16
    }
14115
16
    assert(INTERNED_STRINGS);
14116
14117
16
    if (init_interned_dict(interp)) {
14118
0
        PyErr_Clear();
14119
0
        return _PyStatus_ERR("failed to create interned dict");
14120
0
    }
14121
14122
16
    return _PyStatus_OK();
14123
16
}
14124
14125
14126
PyStatus
14127
_PyUnicode_InitTypes(PyInterpreterState *interp)
14128
16
{
14129
16
    if (_PyStaticType_InitBuiltin(interp, &EncodingMapType) < 0) {
14130
0
        goto error;
14131
0
    }
14132
16
    if (_PyStaticType_InitBuiltin(interp, &PyFieldNameIter_Type) < 0) {
14133
0
        goto error;
14134
0
    }
14135
16
    if (_PyStaticType_InitBuiltin(interp, &PyFormatterIter_Type) < 0) {
14136
0
        goto error;
14137
0
    }
14138
16
    return _PyStatus_OK();
14139
14140
0
error:
14141
0
    return _PyStatus_ERR("Can't initialize unicode types");
14142
16
}
14143
14144
static /* non-null */ PyObject*
14145
intern_static(PyInterpreterState *interp, PyObject *s /* stolen */)
14146
17.2k
{
14147
    // Note that this steals a reference to `s`, but in many cases that
14148
    // stolen ref is returned, requiring no decref/incref.
14149
14150
17.2k
    assert(s != NULL);
14151
17.2k
    assert(_PyUnicode_CHECK(s));
14152
17.2k
    assert(_PyUnicode_STATE(s).statically_allocated);
14153
17.2k
    assert(!PyUnicode_CHECK_INTERNED(s));
14154
14155
#ifdef Py_DEBUG
14156
    /* We must not add process-global interned string if there's already a
14157
     * per-interpreter interned_dict, which might contain duplicates.
14158
     */
14159
    PyObject *interned = get_interned_dict(interp);
14160
    assert(interned == NULL);
14161
#endif
14162
14163
    /* Look in the global cache first. */
14164
17.2k
    PyObject *r = (PyObject *)_Py_hashtable_get(INTERNED_STRINGS, s);
14165
    /* We should only init each string once */
14166
17.2k
    assert(r == NULL);
14167
    /* but just in case (for the non-debug build), handle this */
14168
17.2k
    if (r != NULL && r != s) {
14169
0
        assert(_PyUnicode_STATE(r).interned == SSTATE_INTERNED_IMMORTAL_STATIC);
14170
0
        assert(_PyUnicode_CHECK(r));
14171
0
        Py_DECREF(s);
14172
0
        return Py_NewRef(r);
14173
0
    }
14174
14175
17.2k
    if (_Py_hashtable_set(INTERNED_STRINGS, s, s) < -1) {
14176
0
        Py_FatalError("failed to intern static string");
14177
0
    }
14178
14179
17.2k
    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_IMMORTAL_STATIC;
14180
17.2k
    return s;
14181
17.2k
}
14182
14183
void
14184
_PyUnicode_InternStatic(PyInterpreterState *interp, PyObject **p)
14185
17.2k
{
14186
    // This should only be called as part of runtime initialization
14187
17.2k
    assert(!Py_IsInitialized());
14188
14189
17.2k
    *p = intern_static(interp, *p);
14190
17.2k
    assert(*p);
14191
17.2k
}
14192
14193
static void
14194
immortalize_interned(PyObject *s)
14195
100k
{
14196
100k
    assert(PyUnicode_CHECK_INTERNED(s) == SSTATE_INTERNED_MORTAL);
14197
100k
    assert(!_Py_IsImmortal(s));
14198
#ifdef Py_REF_DEBUG
14199
    /* The reference count value should be excluded from the RefTotal.
14200
       The decrements to these objects will not be registered so they
14201
       need to be accounted for in here. */
14202
    for (Py_ssize_t i = 0; i < Py_REFCNT(s); i++) {
14203
        _Py_DecRefTotal(_PyThreadState_GET());
14204
    }
14205
#endif
14206
100k
    FT_ATOMIC_STORE_UINT8_RELAXED(_PyUnicode_STATE(s).interned, SSTATE_INTERNED_IMMORTAL);
14207
100k
    _Py_SetImmortal(s);
14208
100k
}
14209
14210
static /* non-null */ PyObject*
14211
intern_common(PyInterpreterState *interp, PyObject *s /* stolen */,
14212
              bool immortalize)
14213
34.1M
{
14214
    // Note that this steals a reference to `s`, but in many cases that
14215
    // stolen ref is returned, requiring no decref/incref.
14216
14217
#ifdef Py_DEBUG
14218
    assert(s != NULL);
14219
    assert(_PyUnicode_CHECK(s));
14220
#else
14221
34.1M
    if (s == NULL || !PyUnicode_Check(s)) {
14222
0
        return s;
14223
0
    }
14224
34.1M
#endif
14225
14226
    /* If it's a subclass, we don't really know what putting
14227
       it in the interned dict might do. */
14228
34.1M
    if (!PyUnicode_CheckExact(s)) {
14229
0
        return s;
14230
0
    }
14231
14232
    /* Is it already interned? */
14233
34.1M
    switch (PyUnicode_CHECK_INTERNED(s)) {
14234
2.98M
        case SSTATE_NOT_INTERNED:
14235
            // no, go on
14236
2.98M
            break;
14237
19.3k
        case SSTATE_INTERNED_MORTAL:
14238
            // yes but we might need to make it immortal
14239
19.3k
            if (immortalize) {
14240
5.34k
                immortalize_interned(s);
14241
5.34k
            }
14242
19.3k
            return s;
14243
31.1M
        default:
14244
            // all done
14245
31.1M
            return s;
14246
34.1M
    }
14247
14248
    /* Statically allocated strings must be already interned. */
14249
34.1M
    assert(!_PyUnicode_STATE(s).statically_allocated);
14250
14251
#if Py_GIL_DISABLED
14252
    /* In the free-threaded build, all interned strings are immortal */
14253
    immortalize = 1;
14254
#endif
14255
14256
    /* If it's already immortal, intern it as such */
14257
2.98M
    if (_Py_IsImmortal(s)) {
14258
0
        immortalize = 1;
14259
0
    }
14260
14261
    /* if it's a short string, get the singleton */
14262
2.98M
    if (PyUnicode_GET_LENGTH(s) == 1 &&
14263
22.4k
                PyUnicode_KIND(s) == PyUnicode_1BYTE_KIND) {
14264
0
        PyObject *r = LATIN1(*(unsigned char*)PyUnicode_DATA(s));
14265
0
        assert(PyUnicode_CHECK_INTERNED(r));
14266
0
        Py_DECREF(s);
14267
0
        return r;
14268
0
    }
14269
#ifdef Py_DEBUG
14270
    assert(!unicode_is_singleton(s));
14271
#endif
14272
14273
    /* Look in the global cache now. */
14274
2.98M
    {
14275
2.98M
        PyObject *r = (PyObject *)_Py_hashtable_get(INTERNED_STRINGS, s);
14276
2.98M
        if (r != NULL) {
14277
290k
            assert(_PyUnicode_STATE(r).statically_allocated);
14278
290k
            assert(r != s);  // r must be statically_allocated; s is not
14279
290k
            Py_DECREF(s);
14280
290k
            return Py_NewRef(r);
14281
290k
        }
14282
2.98M
    }
14283
14284
    /* Do a setdefault on the per-interpreter cache. */
14285
2.69M
    PyObject *interned = get_interned_dict(interp);
14286
2.69M
    assert(interned != NULL);
14287
#ifdef Py_GIL_DISABLED
14288
#  define INTERN_MUTEX &_Py_INTERP_CACHED_OBJECT(interp, interned_mutex)
14289
#endif
14290
2.69M
    FT_MUTEX_LOCK(INTERN_MUTEX);
14291
2.69M
    PyObject *t;
14292
2.69M
    {
14293
2.69M
        int res = PyDict_SetDefaultRef(interned, s, s, &t);
14294
2.69M
        if (res < 0) {
14295
0
            PyErr_Clear();
14296
0
            FT_MUTEX_UNLOCK(INTERN_MUTEX);
14297
0
            return s;
14298
0
        }
14299
2.69M
        else if (res == 1) {
14300
            // value was already present (not inserted)
14301
2.02M
            Py_DECREF(s);
14302
2.02M
            if (immortalize &&
14303
595k
                    PyUnicode_CHECK_INTERNED(t) == SSTATE_INTERNED_MORTAL) {
14304
4.21k
                immortalize_interned(t);
14305
4.21k
            }
14306
2.02M
            FT_MUTEX_UNLOCK(INTERN_MUTEX);
14307
2.02M
            return t;
14308
2.02M
        }
14309
672k
        else {
14310
            // value was newly inserted
14311
672k
            assert (s == t);
14312
672k
            Py_DECREF(t);
14313
672k
        }
14314
2.69M
    }
14315
14316
    /* NOT_INTERNED -> INTERNED_MORTAL */
14317
14318
2.69M
    assert(_PyUnicode_STATE(s).interned == SSTATE_NOT_INTERNED);
14319
14320
672k
    if (!_Py_IsImmortal(s)) {
14321
        /* The two references in interned dict (key and value) are not counted.
14322
        unicode_dealloc() and _PyUnicode_ClearInterned() take care of this. */
14323
672k
        Py_DECREF(s);
14324
672k
        Py_DECREF(s);
14325
672k
    }
14326
672k
    FT_ATOMIC_STORE_UINT8_RELAXED(_PyUnicode_STATE(s).interned, SSTATE_INTERNED_MORTAL);
14327
14328
    /* INTERNED_MORTAL -> INTERNED_IMMORTAL (if needed) */
14329
14330
#ifdef Py_DEBUG
14331
    if (_Py_IsImmortal(s)) {
14332
        assert(immortalize);
14333
    }
14334
#endif
14335
672k
    if (immortalize) {
14336
90.4k
        immortalize_interned(s);
14337
90.4k
    }
14338
14339
672k
    FT_MUTEX_UNLOCK(INTERN_MUTEX);
14340
672k
    return s;
14341
2.69M
}
14342
14343
void
14344
_PyUnicode_InternImmortal(PyInterpreterState *interp, PyObject **p)
14345
2.63M
{
14346
2.63M
    *p = intern_common(interp, *p, 1);
14347
2.63M
    assert(*p);
14348
2.63M
}
14349
14350
void
14351
_PyUnicode_InternMortal(PyInterpreterState *interp, PyObject **p)
14352
31.5M
{
14353
31.5M
    *p = intern_common(interp, *p, 0);
14354
31.5M
    assert(*p);
14355
31.5M
}
14356
14357
14358
void
14359
_PyUnicode_InternInPlace(PyInterpreterState *interp, PyObject **p)
14360
0
{
14361
0
    _PyUnicode_InternImmortal(interp, p);
14362
0
    return;
14363
0
}
14364
14365
void
14366
PyUnicode_InternInPlace(PyObject **p)
14367
0
{
14368
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
14369
0
    _PyUnicode_InternMortal(interp, p);
14370
0
}
14371
14372
// Public-looking name kept for the stable ABI; user should not call this:
14373
PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
14374
void
14375
PyUnicode_InternImmortal(PyObject **p)
14376
0
{
14377
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
14378
0
    _PyUnicode_InternImmortal(interp, p);
14379
0
}
14380
14381
PyObject *
14382
PyUnicode_InternFromString(const char *cp)
14383
1.17M
{
14384
1.17M
    PyObject *s = PyUnicode_FromString(cp);
14385
1.17M
    if (s == NULL) {
14386
0
        return NULL;
14387
0
    }
14388
1.17M
    PyInterpreterState *interp = _PyInterpreterState_GET();
14389
1.17M
    _PyUnicode_InternMortal(interp, &s);
14390
1.17M
    return s;
14391
1.17M
}
14392
14393
14394
void
14395
_PyUnicode_ClearInterned(PyInterpreterState *interp)
14396
0
{
14397
0
    PyObject *interned = get_interned_dict(interp);
14398
0
    if (interned == NULL) {
14399
0
        return;
14400
0
    }
14401
0
    assert(PyDict_CheckExact(interned));
14402
14403
0
    if (has_shared_intern_dict(interp)) {
14404
        // the dict doesn't belong to this interpreter, skip the debug
14405
        // checks on it and just clear the pointer to it
14406
0
        clear_interned_dict(interp);
14407
0
        return;
14408
0
    }
14409
14410
#ifdef INTERNED_STATS
14411
    fprintf(stderr, "releasing %zd interned strings\n",
14412
            PyDict_GET_SIZE(interned));
14413
14414
    Py_ssize_t total_length = 0;
14415
#endif
14416
0
    Py_ssize_t pos = 0;
14417
0
    PyObject *s, *ignored_value;
14418
0
    while (PyDict_Next(interned, &pos, &s, &ignored_value)) {
14419
0
        int shared = 0;
14420
0
        switch (PyUnicode_CHECK_INTERNED(s)) {
14421
0
        case SSTATE_INTERNED_IMMORTAL:
14422
            /* Make immortal interned strings mortal again. */
14423
            // Skip the Immortal Instance check and restore
14424
            // the two references (key and value) ignored
14425
            // by PyUnicode_InternInPlace().
14426
0
            _Py_SetMortal(s, 2);
14427
#ifdef Py_REF_DEBUG
14428
            /* let's be pedantic with the ref total */
14429
            _Py_IncRefTotal(_PyThreadState_GET());
14430
            _Py_IncRefTotal(_PyThreadState_GET());
14431
#endif
14432
#ifdef INTERNED_STATS
14433
            total_length += PyUnicode_GET_LENGTH(s);
14434
#endif
14435
0
            break;
14436
0
        case SSTATE_INTERNED_IMMORTAL_STATIC:
14437
            /* It is shared between interpreters, so we should unmark it
14438
               only when this is the last interpreter in which it's
14439
               interned.  We immortalize all the statically initialized
14440
               strings during startup, so we can rely on the
14441
               main interpreter to be the last one. */
14442
0
            if (!_Py_IsMainInterpreter(interp)) {
14443
0
                shared = 1;
14444
0
            }
14445
0
            break;
14446
0
        case SSTATE_INTERNED_MORTAL:
14447
            // Restore 2 references held by the interned dict; these will
14448
            // be decref'd by clear_interned_dict's PyDict_Clear.
14449
0
            _Py_RefcntAdd(s, 2);
14450
#ifdef Py_REF_DEBUG
14451
            /* let's be pedantic with the ref total */
14452
            _Py_IncRefTotal(_PyThreadState_GET());
14453
            _Py_IncRefTotal(_PyThreadState_GET());
14454
#endif
14455
0
            break;
14456
0
        case SSTATE_NOT_INTERNED:
14457
0
            _Py_FALLTHROUGH;
14458
0
        default:
14459
0
            Py_UNREACHABLE();
14460
0
        }
14461
0
        if (!shared) {
14462
0
            FT_ATOMIC_STORE_UINT8_RELAXED(_PyUnicode_STATE(s).interned, SSTATE_NOT_INTERNED);
14463
0
        }
14464
0
    }
14465
#ifdef INTERNED_STATS
14466
    fprintf(stderr,
14467
            "total length of all interned strings: %zd characters\n",
14468
            total_length);
14469
#endif
14470
14471
0
    struct _Py_unicode_state *state = &interp->unicode;
14472
0
    struct _Py_unicode_ids *ids = &state->ids;
14473
0
    for (Py_ssize_t i=0; i < ids->size; i++) {
14474
0
        Py_XINCREF(ids->array[i]);
14475
0
    }
14476
0
    clear_interned_dict(interp);
14477
0
    if (_Py_IsMainInterpreter(interp)) {
14478
0
        clear_global_interned_strings();
14479
0
    }
14480
0
}
14481
14482
14483
/********************* Unicode Iterator **************************/
14484
14485
typedef struct {
14486
    PyObject_HEAD
14487
    Py_ssize_t it_index;
14488
    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
14489
} unicodeiterobject;
14490
14491
static void
14492
unicodeiter_dealloc(PyObject *op)
14493
1.94M
{
14494
1.94M
    unicodeiterobject *it = (unicodeiterobject *)op;
14495
1.94M
    _PyObject_GC_UNTRACK(it);
14496
1.94M
    Py_XDECREF(it->it_seq);
14497
1.94M
    PyObject_GC_Del(it);
14498
1.94M
}
14499
14500
static int
14501
unicodeiter_traverse(PyObject *op, visitproc visit, void *arg)
14502
2
{
14503
2
    unicodeiterobject *it = (unicodeiterobject *)op;
14504
2
    Py_VISIT(it->it_seq);
14505
2
    return 0;
14506
2
}
14507
14508
/* tp_iternext for the general str iterator.

   Returns the next character as a string built by unicode_char().  At the
   end of the string the iterator drops its reference to the sequence and
   returns NULL; every later call returns NULL immediately. */
static PyObject *
unicodeiter_next(PyObject *op)
{
    unicodeiterobject *it = (unicodeiterobject *)op;
    assert(it != NULL);

    PyObject *seq = it->it_seq;
    if (seq == NULL) {
        return NULL;
    }
    assert(_PyUnicode_CHECK(seq));

    if (it->it_index >= PyUnicode_GET_LENGTH(seq)) {
        /* Exhausted: release the string. */
        it->it_seq = NULL;
        Py_DECREF(seq);
        return NULL;
    }

    int kind = PyUnicode_KIND(seq);
    const void *data = PyUnicode_DATA(seq);
    Py_UCS4 ch = PyUnicode_READ(kind, data, it->it_index);
    it->it_index++;
    return unicode_char(ch);
}
14532
14533
/* tp_iternext for the compact-ASCII str iterator.

   Reads the next byte from the data stored directly after the
   PyASCIIObject header and returns the matching entry of the ASCII
   single-character string table.  At the end of the string the iterator
   drops its reference to the sequence and returns NULL. */
static PyObject *
unicode_ascii_iter_next(PyObject *op)
{
    unicodeiterobject *it = (unicodeiterobject *)op;
    assert(it != NULL);

    PyObject *seq = it->it_seq;
    if (seq == NULL) {
        return NULL;
    }
    assert(_PyUnicode_CHECK(seq));
    assert(PyUnicode_IS_COMPACT_ASCII(seq));

    if (it->it_index >= PyUnicode_GET_LENGTH(seq)) {
        /* Exhausted: release the string. */
        it->it_seq = NULL;
        Py_DECREF(seq);
        return NULL;
    }

    const void *data = ((void*)(_PyASCIIObject_CAST(seq) + 1));
    Py_UCS1 ch = (Py_UCS1)PyUnicode_READ(PyUnicode_1BYTE_KIND,
                                         data, it->it_index);
    it->it_index++;
    return (PyObject*)&_Py_SINGLETON(strings).ascii[ch];
}
14555
14556
/* __length_hint__: number of characters left to yield, or 0 once the
   iterator has been exhausted. */
static PyObject *
unicodeiter_len(PyObject *op, PyObject *Py_UNUSED(ignored))
{
    unicodeiterobject *it = (unicodeiterobject *)op;
    Py_ssize_t remaining = (it->it_seq != NULL)
        ? PyUnicode_GET_LENGTH(it->it_seq) - it->it_index
        : 0;
    return PyLong_FromSsize_t(remaining);
}
14565
14566
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14567
14568
/* __reduce__: return (iter, (seq,), index) for a live iterator, or
   (iter, ('',)) for an exhausted one. */
static PyObject *
unicodeiter_reduce(PyObject *op, PyObject *Py_UNUSED(ignored))
{
    unicodeiterobject *it = (unicodeiterobject *)op;

    /* _PyEval_GetBuiltin can invoke arbitrary code, so this call must
     * happen before the iterator's fields are read.
     * see issue #101765 */
    PyObject *iter_builtin = _PyEval_GetBuiltin(&_Py_ID(iter));

    if (it->it_seq == NULL) {
        /* Exhausted: pickle as an iterator over the empty string. */
        PyObject *empty = _PyUnicode_GetEmpty();
        if (empty == NULL) {
            Py_XDECREF(iter_builtin);
            return NULL;
        }
        return Py_BuildValue("N(N)", iter_builtin, empty);
    }

    return Py_BuildValue("N(O)n", iter_builtin, it->it_seq, it->it_index);
}
14589
14590
PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14591
14592
/* __setstate__: restore the iterator position from an integer `state`.

   The index is clamped to the range [0, len(seq)].  On an exhausted
   iterator the value is validated but otherwise ignored.  Returns None,
   or NULL if `state` cannot be converted to Py_ssize_t. */
static PyObject *
unicodeiter_setstate(PyObject *op, PyObject *state)
{
    unicodeiterobject *it = (unicodeiterobject *)op;

    Py_ssize_t index = PyLong_AsSsize_t(state);
    if (index == -1 && PyErr_Occurred()) {
        return NULL;
    }

    if (it->it_seq != NULL) {
        if (index < 0) {
            index = 0;
        }
        else if (index > PyUnicode_GET_LENGTH(it->it_seq)) {
            /* iterator truncated */
            index = PyUnicode_GET_LENGTH(it->it_seq);
        }
        it->it_index = index;
    }
    Py_RETURN_NONE;
}
14608
14609
PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14610
14611
static PyMethodDef unicodeiter_methods[] = {
14612
    {"__length_hint__", unicodeiter_len, METH_NOARGS, length_hint_doc},
14613
    {"__reduce__",      unicodeiter_reduce, METH_NOARGS, reduce_doc},
14614
    {"__setstate__",    unicodeiter_setstate, METH_O, setstate_doc},
14615
    {NULL,      NULL}       /* sentinel */
14616
};
14617
14618
PyTypeObject PyUnicodeIter_Type = {
14619
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
14620
    "str_iterator",         /* tp_name */
14621
    sizeof(unicodeiterobject),      /* tp_basicsize */
14622
    0,                  /* tp_itemsize */
14623
    /* methods */
14624
    unicodeiter_dealloc,/* tp_dealloc */
14625
    0,                  /* tp_vectorcall_offset */
14626
    0,                  /* tp_getattr */
14627
    0,                  /* tp_setattr */
14628
    0,                  /* tp_as_async */
14629
    0,                  /* tp_repr */
14630
    0,                  /* tp_as_number */
14631
    0,                  /* tp_as_sequence */
14632
    0,                  /* tp_as_mapping */
14633
    0,                  /* tp_hash */
14634
    0,                  /* tp_call */
14635
    0,                  /* tp_str */
14636
    PyObject_GenericGetAttr,        /* tp_getattro */
14637
    0,                  /* tp_setattro */
14638
    0,                  /* tp_as_buffer */
14639
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14640
    0,                  /* tp_doc */
14641
    unicodeiter_traverse, /* tp_traverse */
14642
    0,                  /* tp_clear */
14643
    0,                  /* tp_richcompare */
14644
    0,                  /* tp_weaklistoffset */
14645
    PyObject_SelfIter,          /* tp_iter */
14646
    unicodeiter_next,   /* tp_iternext */
14647
    unicodeiter_methods,            /* tp_methods */
14648
    0,
14649
};
14650
14651
PyTypeObject _PyUnicodeASCIIIter_Type = {
14652
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
14653
    .tp_name = "str_ascii_iterator",
14654
    .tp_basicsize = sizeof(unicodeiterobject),
14655
    .tp_dealloc = unicodeiter_dealloc,
14656
    .tp_getattro = PyObject_GenericGetAttr,
14657
    .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,
14658
    .tp_traverse = unicodeiter_traverse,
14659
    .tp_iter = PyObject_SelfIter,
14660
    .tp_iternext = unicode_ascii_iter_next,
14661
    .tp_methods = unicodeiter_methods,
14662
};
14663
14664
/* tp_iter for str: create an iterator over `seq`.

   Compact ASCII strings get the specialised ASCII iterator type, all
   other strings the general one.  Returns NULL with an exception set if
   `seq` is not a str or the allocation fails. */
static PyObject *
unicode_iter(PyObject *seq)
{
    if (!PyUnicode_Check(seq)) {
        PyErr_BadInternalCall();
        return NULL;
    }

    PyTypeObject *iter_type = PyUnicode_IS_COMPACT_ASCII(seq)
        ? &_PyUnicodeASCIIIter_Type
        : &PyUnicodeIter_Type;

    unicodeiterobject *it = PyObject_GC_New(unicodeiterobject, iter_type);
    if (it == NULL) {
        return NULL;
    }

    it->it_index = 0;
    it->it_seq = Py_NewRef(seq);
    _PyObject_GC_TRACK(it);
    return (PyObject *)it;
}
14686
14687
static int
14688
encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
14689
64
{
14690
64
    int res;
14691
64
    res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
14692
64
    if (res == -2) {
14693
0
        PyErr_Format(PyExc_RuntimeError, "cannot encode %s", name);
14694
0
        return -1;
14695
0
    }
14696
64
    if (res < 0) {
14697
0
        PyErr_NoMemory();
14698
0
        return -1;
14699
0
    }
14700
64
    return 0;
14701
64
}
14702
14703
14704
static int
14705
config_get_codec_name(wchar_t **config_encoding)
14706
32
{
14707
32
    char *encoding;
14708
32
    if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
14709
0
        return -1;
14710
0
    }
14711
14712
32
    PyObject *name_obj = NULL;
14713
32
    PyObject *codec = _PyCodec_Lookup(encoding);
14714
32
    PyMem_RawFree(encoding);
14715
14716
32
    if (!codec)
14717
0
        goto error;
14718
14719
32
    name_obj = PyObject_GetAttrString(codec, "name");
14720
32
    Py_CLEAR(codec);
14721
32
    if (!name_obj) {
14722
0
        goto error;
14723
0
    }
14724
14725
32
    wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
14726
32
    Py_DECREF(name_obj);
14727
32
    if (wname == NULL) {
14728
0
        goto error;
14729
0
    }
14730
14731
32
    wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
14732
32
    if (raw_wname == NULL) {
14733
0
        PyMem_Free(wname);
14734
0
        PyErr_NoMemory();
14735
0
        goto error;
14736
0
    }
14737
14738
32
    PyMem_RawFree(*config_encoding);
14739
32
    *config_encoding = raw_wname;
14740
14741
32
    PyMem_Free(wname);
14742
32
    return 0;
14743
14744
0
error:
14745
0
    Py_XDECREF(codec);
14746
0
    Py_XDECREF(name_obj);
14747
0
    return -1;
14748
32
}
14749
14750
14751
static PyStatus
14752
init_stdio_encoding(PyInterpreterState *interp)
14753
16
{
14754
    /* Update the stdio encoding to the normalized Python codec name. */
14755
16
    PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
14756
16
    if (config_get_codec_name(&config->stdio_encoding) < 0) {
14757
0
        return _PyStatus_ERR("failed to get the Python codec name "
14758
0
                             "of the stdio encoding");
14759
0
    }
14760
16
    return _PyStatus_OK();
14761
16
}
14762
14763
14764
static int
14765
init_fs_codec(PyInterpreterState *interp)
14766
16
{
14767
16
    const PyConfig *config = _PyInterpreterState_GetConfig(interp);
14768
14769
16
    _Py_error_handler error_handler;
14770
16
    error_handler = get_error_handler_wide(config->filesystem_errors);
14771
16
    if (error_handler == _Py_ERROR_UNKNOWN) {
14772
0
        PyErr_SetString(PyExc_RuntimeError, "unknown filesystem error handler");
14773
0
        return -1;
14774
0
    }
14775
14776
16
    char *encoding, *errors;
14777
16
    if (encode_wstr_utf8(config->filesystem_encoding,
14778
16
                         &encoding,
14779
16
                         "filesystem_encoding") < 0) {
14780
0
        return -1;
14781
0
    }
14782
14783
16
    if (encode_wstr_utf8(config->filesystem_errors,
14784
16
                         &errors,
14785
16
                         "filesystem_errors") < 0) {
14786
0
        PyMem_RawFree(encoding);
14787
0
        return -1;
14788
0
    }
14789
14790
16
    struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
14791
16
    PyMem_RawFree(fs_codec->encoding);
14792
16
    fs_codec->encoding = encoding;
14793
    /* encoding has been normalized by init_fs_encoding() */
14794
16
    fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
14795
16
    PyMem_RawFree(fs_codec->errors);
14796
16
    fs_codec->errors = errors;
14797
16
    fs_codec->error_handler = error_handler;
14798
14799
#ifdef _Py_FORCE_UTF8_FS_ENCODING
14800
    assert(fs_codec->utf8 == 1);
14801
#endif
14802
14803
    /* At this point, PyUnicode_EncodeFSDefault() and
14804
       PyUnicode_DecodeFSDefault() can now use the Python codec rather than
14805
       the C implementation of the filesystem encoding. */
14806
14807
    /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
14808
       global configuration variables. */
14809
16
    if (_Py_IsMainInterpreter(interp)) {
14810
14811
16
        if (_Py_SetFileSystemEncoding(fs_codec->encoding,
14812
16
                                      fs_codec->errors) < 0) {
14813
0
            PyErr_NoMemory();
14814
0
            return -1;
14815
0
        }
14816
16
    }
14817
16
    return 0;
14818
16
}
14819
14820
14821
static PyStatus
14822
init_fs_encoding(PyThreadState *tstate)
14823
16
{
14824
16
    PyInterpreterState *interp = tstate->interp;
14825
14826
    /* Update the filesystem encoding to the normalized Python codec name.
14827
       For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
14828
       (Python codec name). */
14829
16
    PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
14830
16
    if (config_get_codec_name(&config->filesystem_encoding) < 0) {
14831
0
        _Py_DumpPathConfig(tstate);
14832
0
        return _PyStatus_ERR("failed to get the Python codec "
14833
0
                             "of the filesystem encoding");
14834
0
    }
14835
14836
16
    if (init_fs_codec(interp) < 0) {
14837
0
        return _PyStatus_ERR("cannot initialize filesystem codec");
14838
0
    }
14839
16
    return _PyStatus_OK();
14840
16
}
14841
14842
14843
PyStatus
14844
_PyUnicode_InitEncodings(PyThreadState *tstate)
14845
16
{
14846
16
    PyStatus status = _PyCodec_InitRegistry(tstate->interp);
14847
16
    if (_PyStatus_EXCEPTION(status)) {
14848
0
        return status;
14849
0
    }
14850
16
    status = init_fs_encoding(tstate);
14851
16
    if (_PyStatus_EXCEPTION(status)) {
14852
0
        return status;
14853
0
    }
14854
14855
16
    return init_stdio_encoding(tstate->interp);
14856
16
}
14857
14858
14859
static void
14860
_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
14861
0
{
14862
0
    PyMem_RawFree(fs_codec->encoding);
14863
0
    fs_codec->encoding = NULL;
14864
0
    fs_codec->utf8 = 0;
14865
0
    PyMem_RawFree(fs_codec->errors);
14866
0
    fs_codec->errors = NULL;
14867
0
    fs_codec->error_handler = _Py_ERROR_UNKNOWN;
14868
0
}
14869
14870
14871
#ifdef MS_WINDOWS
14872
int
14873
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
14874
{
14875
    PyInterpreterState *interp = _PyInterpreterState_GET();
14876
    PyConfig *config = (PyConfig *)_PyInterpreterState_GetConfig(interp);
14877
14878
    /* Set the filesystem encoding to mbcs/replace (PEP 529) */
14879
    wchar_t *encoding = _PyMem_RawWcsdup(L"mbcs");
14880
    wchar_t *errors = _PyMem_RawWcsdup(L"replace");
14881
    if (encoding == NULL || errors == NULL) {
14882
        PyMem_RawFree(encoding);
14883
        PyMem_RawFree(errors);
14884
        PyErr_NoMemory();
14885
        return -1;
14886
    }
14887
14888
    PyMem_RawFree(config->filesystem_encoding);
14889
    config->filesystem_encoding = encoding;
14890
    PyMem_RawFree(config->filesystem_errors);
14891
    config->filesystem_errors = errors;
14892
14893
    return init_fs_codec(interp);
14894
}
14895
#endif
14896
14897
14898
#ifdef Py_DEBUG
14899
static inline int
14900
unicode_is_finalizing(void)
14901
{
14902
    return (get_interned_dict(_PyInterpreterState_Main()) == NULL);
14903
}
14904
#endif
14905
14906
14907
void
14908
_PyUnicode_FiniTypes(PyInterpreterState *interp)
14909
0
{
14910
0
    _PyStaticType_FiniBuiltin(interp, &EncodingMapType);
14911
0
    _PyStaticType_FiniBuiltin(interp, &PyFieldNameIter_Type);
14912
0
    _PyStaticType_FiniBuiltin(interp, &PyFormatterIter_Type);
14913
0
}
14914
14915
14916
void
14917
_PyUnicode_Fini(PyInterpreterState *interp)
14918
0
{
14919
0
    struct _Py_unicode_state *state = &interp->unicode;
14920
14921
0
    if (!has_shared_intern_dict(interp)) {
14922
        // _PyUnicode_ClearInterned() must be called before _PyUnicode_Fini()
14923
0
        assert(get_interned_dict(interp) == NULL);
14924
0
    }
14925
14926
0
    _PyUnicode_FiniEncodings(&state->fs_codec);
14927
14928
    // bpo-47182: force a unicodedata CAPI capsule re-import on
14929
    // subsequent initialization of interpreter.
14930
0
    interp->unicode.ucnhash_capi = NULL;
14931
14932
0
    unicode_clear_identifiers(state);
14933
0
}
14934
14935
/* A _string module, to export formatter_parser and formatter_field_name_split
14936
   to the string.Formatter class implemented in Python. */
14937
14938
static PyMethodDef _string_methods[] = {
14939
    {"formatter_field_name_split", formatter_field_name_split,
14940
     METH_O, PyDoc_STR("split the argument as a field name")},
14941
    {"formatter_parser", formatter_parser,
14942
     METH_O, PyDoc_STR("parse the argument as a format string")},
14943
    {NULL, NULL}
14944
};
14945
14946
static PyModuleDef_Slot module_slots[] = {
14947
    {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
14948
    {Py_mod_gil, Py_MOD_GIL_NOT_USED},
14949
    {0, NULL}
14950
};
14951
14952
static struct PyModuleDef _string_module = {
14953
    PyModuleDef_HEAD_INIT,
14954
    .m_name = "_string",
14955
    .m_doc = PyDoc_STR("string helper module"),
14956
    .m_size = 0,
14957
    .m_methods = _string_methods,
14958
    .m_slots = module_slots,
14959
};
14960
14961
PyMODINIT_FUNC
14962
PyInit__string(void)
14963
6
{
14964
6
    return PyModuleDef_Init(&_string_module);
14965
6
}
14966
14967
14968
#undef PyUnicode_KIND
14969
/* Function version of PyUnicode_KIND: return the `kind` field of the string
   state of `op`.

   If `op` is not a str, set TypeError and return -1. */
int PyUnicode_KIND(PyObject *op)
{
    if (PyUnicode_Check(op)) {
        return _PyASCIIObject_CAST(op)->state.kind;
    }

    PyErr_Format(PyExc_TypeError, "expect str, got %T", op);
    return -1;
}
14977
14978
#undef PyUnicode_DATA
14979
/* Function version of PyUnicode_DATA: return the result of
   _PyUnicode_DATA() for `op`.

   If `op` is not a str, set TypeError and return NULL. */
void* PyUnicode_DATA(PyObject *op)
{
    if (PyUnicode_Check(op)) {
        return _PyUnicode_DATA(op);
    }

    PyErr_Format(PyExc_TypeError, "expect str, got %T", op);
    return NULL;
}