Coverage Report

Created: 2025-11-24 06:11

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/cpython/Objects/unicodeobject.c
Line
Count
Source
1
/*
2
3
Unicode implementation based on original code by Fredrik Lundh,
4
modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6
Major speed upgrades to the method implementations at the Reykjavik
7
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9
Copyright (c) Corporation for National Research Initiatives.
10
11
--------------------------------------------------------------------
12
The original string type implementation is:
13
14
  Copyright (c) 1999 by Secret Labs AB
15
  Copyright (c) 1999 by Fredrik Lundh
16
17
By obtaining, using, and/or copying this software and/or its
18
associated documentation, you agree that you have read, understood,
19
and will comply with the following terms and conditions:
20
21
Permission to use, copy, modify, and distribute this software and its
22
associated documentation for any purpose and without fee is hereby
23
granted, provided that the above copyright notice appears in all
24
copies, and that both that copyright notice and this permission notice
25
appear in supporting documentation, and that the name of Secret Labs
26
AB or the author not be used in advertising or publicity pertaining to
27
distribution of the software without specific, written prior
28
permission.
29
30
SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32
FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37
--------------------------------------------------------------------
38
39
*/
40
41
#include "Python.h"
42
#include "pycore_abstract.h"      // _PyIndex_Check()
43
#include "pycore_bytes_methods.h" // _Py_bytes_lower()
44
#include "pycore_bytesobject.h"   // _PyBytes_Repeat()
45
#include "pycore_ceval.h"         // _PyEval_GetBuiltin()
46
#include "pycore_codecs.h"        // _PyCodec_Lookup()
47
#include "pycore_critical_section.h" // Py_*_CRITICAL_SECTION_SEQUENCE_FAST
48
#include "pycore_format.h"        // F_LJUST
49
#include "pycore_initconfig.h"    // _PyStatus_OK()
50
#include "pycore_interp.h"        // PyInterpreterState.fs_codec
51
#include "pycore_long.h"          // _PyLong_FormatWriter()
52
#include "pycore_object.h"        // _PyObject_GC_TRACK(), _Py_FatalRefcountError()
53
#include "pycore_pathconfig.h"    // _Py_DumpPathConfig()
54
#include "pycore_pyerrors.h"      // _PyUnicodeTranslateError_Create()
55
#include "pycore_pyhash.h"        // _Py_HashSecret_t
56
#include "pycore_pylifecycle.h"   // _Py_SetFileSystemEncoding()
57
#include "pycore_pystate.h"       // _PyInterpreterState_GET()
58
#include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
59
#include "pycore_unicodectype.h"  // _PyUnicode_IsXidStart
60
#include "pycore_unicodeobject.h" // struct _Py_unicode_state
61
#include "pycore_unicodeobject_generated.h"  // _PyUnicode_InitStaticStrings()
62
63
#include "stringlib/eq.h"         // unicode_eq()
64
#include <stddef.h>               // ptrdiff_t
65
66
#ifdef MS_WINDOWS
67
#include <windows.h>
68
#endif
69
70
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
71
#  include "pycore_fileutils.h"   // _Py_LocaleUsesNonUnicodeWchar()
72
#endif
73
74
/* Uncomment to display statistics on interned strings at exit
75
   in _PyUnicode_ClearInterned(). */
76
/* #define INTERNED_STATS 1 */
77
78
79
/*[clinic input]
80
class str "PyObject *" "&PyUnicode_Type"
81
[clinic start generated code]*/
82
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
83
84
/*[python input]
85
class Py_UCS4_converter(CConverter):
86
    type = 'Py_UCS4'
87
    converter = 'convert_uc'
88
89
    def converter_init(self):
90
        if self.default is not unspecified:
91
            self.c_default = ascii(self.default)
92
            if len(self.c_default) > 4 or self.c_default[0] != "'":
93
                self.c_default = hex(ord(self.default))
94
95
[python start generated code]*/
96
/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
97
98
/* --- Globals ------------------------------------------------------------
99
100
NOTE: In the interpreter's initialization phase, some globals are currently
101
      initialized dynamically as needed. In the process Unicode objects may
102
      be created before the Unicode type is ready.
103
104
*/
105
106
11.6M
#define MAX_UNICODE _Py_MAX_UNICODE
107
303M
#define ensure_unicode _PyUnicode_EnsureUnicode
108
109
#ifdef Py_DEBUG
110
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
111
#else
112
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
113
#endif
114
115
/* Read the `utf8` pointer stored in the compact Unicode header of `op`,
   going through FT_ATOMIC_LOAD_PTR_ACQUIRE. */
static inline char *
_PyUnicode_UTF8(PyObject *op)
{
    PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
    return FT_ATOMIC_LOAD_PTR_ACQUIRE(compact->utf8);
}
119
120
/* Return the UTF-8 buffer of `op`.
   For a compact ASCII string this is the memory immediately following the
   PyASCIIObject header; otherwise it is the stored `utf8` pointer. */
static inline char *
PyUnicode_UTF8(PyObject *op)
{
    assert(_PyUnicode_CHECK(op));
    if (!PyUnicode_IS_COMPACT_ASCII(op)) {
        return _PyUnicode_UTF8(op);
    }
    return (char *)(_PyASCIIObject_CAST(op) + 1);
}
130
131
/* Store `utf8` into the compact Unicode header of `op`, going through
   FT_ATOMIC_STORE_PTR_RELEASE. */
static inline void
PyUnicode_SET_UTF8(PyObject *op, char *utf8)
{
    PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
    FT_ATOMIC_STORE_PTR_RELEASE(compact->utf8, utf8);
}
135
136
/* Return the length recorded for the UTF-8 form of `op`:
   the `length` field for a compact ASCII string, the `utf8_length` field
   of the compact header otherwise. */
static inline Py_ssize_t
PyUnicode_UTF8_LENGTH(PyObject *op)
{
    assert(_PyUnicode_CHECK(op));
    if (!PyUnicode_IS_COMPACT_ASCII(op)) {
        return _PyCompactUnicodeObject_CAST(op)->utf8_length;
    }
    return _PyASCIIObject_CAST(op)->length;
}
146
147
/* Set the `utf8_length` field in the compact Unicode header of `op`. */
static inline void
PyUnicode_SET_UTF8_LENGTH(PyObject *op, Py_ssize_t length)
{
    PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
    compact->utf8_length = length;
}
151
152
#define _PyUnicode_LENGTH(op)                           \
153
636M
    (_PyASCIIObject_CAST(op)->length)
154
#define _PyUnicode_STATE(op)                            \
155
3.97G
    (_PyASCIIObject_CAST(op)->state)
156
#define _PyUnicode_HASH(op)                             \
157
581M
    (_PyASCIIObject_CAST(op)->hash)
158
159
1.12G
#define PyUnicode_HASH PyUnstable_Unicode_GET_CACHED_HASH
160
161
/* Store `hash` in the `hash` field of `op`, going through
   FT_ATOMIC_STORE_SSIZE_RELAXED. */
static inline void
PyUnicode_SET_HASH(PyObject *op, Py_hash_t hash)
{
    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
    FT_ATOMIC_STORE_SSIZE_RELAXED(ascii->hash, hash);
}
165
166
#define _PyUnicode_DATA_ANY(op)                         \
167
66.5M
    (_PyUnicodeObject_CAST(op)->data.any)
168
169
/* Return non-zero if the stored UTF-8 pointer of `op` is the same address
   as its character data.  Must not be called on compact ASCII strings. */
static inline int
_PyUnicode_SHARE_UTF8(PyObject *op)
{
    assert(_PyUnicode_CHECK(op));
    assert(!PyUnicode_IS_COMPACT_ASCII(op));

    char *utf8 = _PyUnicode_UTF8(op);
    return utf8 == PyUnicode_DATA(op);
}
175
176
/* true if the Unicode object has an allocated UTF-8 memory block
177
   (not shared with other data) */
178
static inline int _PyUnicode_HAS_UTF8_MEMORY(PyObject *op)
179
635M
{
180
635M
    return (!PyUnicode_IS_COMPACT_ASCII(op)
181
199M
            && _PyUnicode_UTF8(op) != NULL
182
14.8M
            && _PyUnicode_UTF8(op) != PyUnicode_DATA(op));
183
635M
}
184
185
186
252M
#define LATIN1 _Py_LATIN1_CHR
187
188
/* Forward declaration */
189
static PyObject *
190
unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
191
                    const char *errors);
192
static PyObject *
193
unicode_decode_utf8(const char *s, Py_ssize_t size,
194
                    _Py_error_handler error_handler, const char *errors,
195
                    Py_ssize_t *consumed);
196
#ifdef Py_DEBUG
197
static inline int unicode_is_finalizing(void);
198
static int unicode_is_singleton(PyObject *unicode);
199
#endif
200
201
202
// Return a reference to the immortal empty string singleton.
203
PyObject*
204
_PyUnicode_GetEmpty(void)
205
132M
{
206
132M
    _Py_DECLARE_STR(empty, "");
207
132M
    return &_Py_STR(empty);
208
132M
}
209
210
/* This dictionary holds per-interpreter interned strings.
211
 * See InternalDocs/string_interning.md for details.
212
 */
213
static inline PyObject *get_interned_dict(PyInterpreterState *interp)
214
4.52M
{
215
4.52M
    return _Py_INTERP_CACHED_OBJECT(interp, interned_strings);
216
4.52M
}
217
218
/* This hashtable holds statically allocated interned strings.
219
 * See InternalDocs/string_interning.md for details.
220
 */
221
4.73M
#define INTERNED_STRINGS _PyRuntime.cached_objects.interned_strings
222
223
/* Get number of all interned strings for the current interpreter. */
224
Py_ssize_t
225
_PyUnicode_InternedSize(void)
226
0
{
227
0
    PyObject *dict = get_interned_dict(_PyInterpreterState_GET());
228
0
    return _Py_hashtable_len(INTERNED_STRINGS) + PyDict_GET_SIZE(dict);
229
0
}
230
231
/* Get number of immortal interned strings for the current interpreter. */
232
Py_ssize_t
233
_PyUnicode_InternedSize_Immortal(void)
234
0
{
235
0
    PyObject *dict = get_interned_dict(_PyInterpreterState_GET());
236
0
    PyObject *key, *value;
237
0
    Py_ssize_t pos = 0;
238
0
    Py_ssize_t count = 0;
239
240
    // It's tempting to keep a count and avoid a loop here. But, this function
241
    // is intended for refleak tests. It spends extra work to report the true
242
    // value, to help detect bugs in optimizations.
243
244
0
    while (PyDict_Next(dict, &pos, &key, &value)) {
245
0
        assert(PyUnicode_CHECK_INTERNED(key) != SSTATE_INTERNED_IMMORTAL_STATIC);
246
0
        if (PyUnicode_CHECK_INTERNED(key) == SSTATE_INTERNED_IMMORTAL) {
247
0
           count++;
248
0
       }
249
0
    }
250
0
    return _Py_hashtable_len(INTERNED_STRINGS) + count;
251
0
}
252
253
static Py_hash_t unicode_hash(PyObject *);
254
255
static Py_uhash_t
256
hashtable_unicode_hash(const void *key)
257
4.73M
{
258
4.73M
    return unicode_hash((PyObject *)key);
259
4.73M
}
260
261
static int
262
hashtable_unicode_compare(const void *key1, const void *key2)
263
619k
{
264
619k
    PyObject *obj1 = (PyObject *)key1;
265
619k
    PyObject *obj2 = (PyObject *)key2;
266
619k
    if (obj1 != NULL && obj2 != NULL) {
267
619k
        return unicode_eq(obj1, obj2);
268
619k
    }
269
0
    else {
270
0
        return obj1 == obj2;
271
0
    }
272
619k
}
273
274
/* Return true if this interpreter should share the main interpreter's
275
   intern_dict.  That's important for interpreters which load basic
276
   single-phase init extension modules (m_size == -1).  There could be interned
277
   immortal strings that are shared between interpreters, due to the
278
   PyDict_Update(mdict, m_copy) call in import_find_extension().
279
280
   It's not safe to deallocate those strings until all interpreters that
281
   potentially use them are freed.  By storing them in the main interpreter, we
282
   ensure they get freed after all other interpreters are freed.
283
*/
284
static bool
285
has_shared_intern_dict(PyInterpreterState *interp)
286
28
{
287
28
    PyInterpreterState *main_interp = _PyInterpreterState_Main();
288
28
    return interp != main_interp  && interp->feature_flags & Py_RTFLAGS_USE_MAIN_OBMALLOC;
289
28
}
290
291
static int
292
init_interned_dict(PyInterpreterState *interp)
293
28
{
294
28
    assert(get_interned_dict(interp) == NULL);
295
28
    PyObject *interned;
296
28
    if (has_shared_intern_dict(interp)) {
297
0
        interned = get_interned_dict(_PyInterpreterState_Main());
298
0
        Py_INCREF(interned);
299
0
    }
300
28
    else {
301
28
        interned = PyDict_New();
302
28
        if (interned == NULL) {
303
0
            return -1;
304
0
        }
305
28
    }
306
28
    _Py_INTERP_CACHED_OBJECT(interp, interned_strings) = interned;
307
28
    return 0;
308
28
}
309
310
static void
311
clear_interned_dict(PyInterpreterState *interp)
312
0
{
313
0
    PyObject *interned = get_interned_dict(interp);
314
0
    if (interned != NULL) {
315
0
        if (!has_shared_intern_dict(interp)) {
316
            // only clear if the dict belongs to this interpreter
317
0
            PyDict_Clear(interned);
318
0
        }
319
0
        Py_DECREF(interned);
320
0
        _Py_INTERP_CACHED_OBJECT(interp, interned_strings) = NULL;
321
0
    }
322
0
}
323
324
static PyStatus
325
init_global_interned_strings(PyInterpreterState *interp)
326
28
{
327
28
    assert(INTERNED_STRINGS == NULL);
328
28
    _Py_hashtable_allocator_t hashtable_alloc = {PyMem_RawMalloc, PyMem_RawFree};
329
330
28
    INTERNED_STRINGS = _Py_hashtable_new_full(
331
28
        hashtable_unicode_hash,
332
28
        hashtable_unicode_compare,
333
        // Objects stored here are immortal and statically allocated,
334
        // so we don't need key_destroy_func & value_destroy_func:
335
28
        NULL,
336
28
        NULL,
337
28
        &hashtable_alloc
338
28
    );
339
28
    if (INTERNED_STRINGS == NULL) {
340
0
        PyErr_Clear();
341
0
        return _PyStatus_ERR("failed to create global interned dict");
342
0
    }
343
344
    /* Intern statically allocated string identifiers, deepfreeze strings,
345
        * and one-byte latin-1 strings.
346
        * This must be done before any module initialization so that statically
347
        * allocated string identifiers are used instead of heap allocated strings.
348
        * Deepfreeze uses the interned identifiers if present to save space
349
        * else generates them and they are interned to speed up dict lookups.
350
    */
351
28
    _PyUnicode_InitStaticStrings(interp);
352
353
7.19k
    for (int i = 0; i < 256; i++) {
354
7.16k
        PyObject *s = LATIN1(i);
355
7.16k
        _PyUnicode_InternStatic(interp, &s);
356
7.16k
        assert(s == LATIN1(i));
357
7.16k
    }
358
#ifdef Py_DEBUG
359
    assert(_PyUnicode_CheckConsistency(&_Py_STR(empty), 1));
360
361
    for (int i = 0; i < 256; i++) {
362
        assert(_PyUnicode_CheckConsistency(LATIN1(i), 1));
363
    }
364
#endif
365
28
    return _PyStatus_OK();
366
28
}
367
368
static void clear_global_interned_strings(void)
369
0
{
370
0
    if (INTERNED_STRINGS != NULL) {
371
0
        _Py_hashtable_destroy(INTERNED_STRINGS);
372
0
        INTERNED_STRINGS = NULL;
373
0
    }
374
0
}
375
376
#define _Py_RETURN_UNICODE_EMPTY()   \
377
50.3M
    do {                             \
378
50.3M
        return _PyUnicode_GetEmpty();\
379
50.3M
    } while (0)
380
381
382
/* Fast detection of the most frequent whitespace characters.
   128-entry table indexed by ASCII code point: 1 for whitespace, 0 for
   everything else (entries not listed below are zero-initialized). */
const unsigned char _Py_ascii_whitespace[] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1,     /* SPACE */
    [0x7F] = 0      /* last entry: fixes the table size at 128 */
};
412
413
/* forward */
414
static PyObject* get_latin1_char(unsigned char ch);
415
416
417
static PyObject *
418
_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
419
static PyObject *
420
_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
421
static PyObject *
422
_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
423
424
static PyObject *
425
unicode_encode_call_errorhandler(const char *errors,
426
       PyObject **errorHandler,const char *encoding, const char *reason,
427
       PyObject *unicode, PyObject **exceptionObject,
428
       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
429
430
static void
431
raise_encode_exception(PyObject **exceptionObject,
432
                       const char *encoding,
433
                       PyObject *unicode,
434
                       Py_ssize_t startpos, Py_ssize_t endpos,
435
                       const char *reason);
436
437
/* Fast detection of ASCII line break characters.
   128-entry table indexed by ASCII code point: 1 for a line break, 0 for
   everything else (entries not listed below are zero-initialized). */
static const unsigned char ascii_linebreak[] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x7F] = 0      /* last entry: fixes the table size at 128 */
};
464
465
static int convert_uc(PyObject *obj, void *addr);
466
467
struct encoding_map;
468
#include "clinic/unicodeobject.c.h"
469
470
_Py_error_handler
471
_Py_GetErrorHandler(const char *errors)
472
1.81M
{
473
1.81M
    if (errors == NULL || strcmp(errors, "strict") == 0) {
474
1.07M
        return _Py_ERROR_STRICT;
475
1.07M
    }
476
743k
    if (strcmp(errors, "surrogateescape") == 0) {
477
537k
        return _Py_ERROR_SURROGATEESCAPE;
478
537k
    }
479
206k
    if (strcmp(errors, "replace") == 0) {
480
206k
        return _Py_ERROR_REPLACE;
481
206k
    }
482
0
    if (strcmp(errors, "ignore") == 0) {
483
0
        return _Py_ERROR_IGNORE;
484
0
    }
485
0
    if (strcmp(errors, "backslashreplace") == 0) {
486
0
        return _Py_ERROR_BACKSLASHREPLACE;
487
0
    }
488
0
    if (strcmp(errors, "surrogatepass") == 0) {
489
0
        return _Py_ERROR_SURROGATEPASS;
490
0
    }
491
0
    if (strcmp(errors, "xmlcharrefreplace") == 0) {
492
0
        return _Py_ERROR_XMLCHARREFREPLACE;
493
0
    }
494
0
    return _Py_ERROR_OTHER;
495
0
}
496
497
498
static _Py_error_handler
499
get_error_handler_wide(const wchar_t *errors)
500
9.71k
{
501
9.71k
    if (errors == NULL || wcscmp(errors, L"strict") == 0) {
502
0
        return _Py_ERROR_STRICT;
503
0
    }
504
9.71k
    if (wcscmp(errors, L"surrogateescape") == 0) {
505
9.71k
        return _Py_ERROR_SURROGATEESCAPE;
506
9.71k
    }
507
0
    if (wcscmp(errors, L"replace") == 0) {
508
0
        return _Py_ERROR_REPLACE;
509
0
    }
510
0
    if (wcscmp(errors, L"ignore") == 0) {
511
0
        return _Py_ERROR_IGNORE;
512
0
    }
513
0
    if (wcscmp(errors, L"backslashreplace") == 0) {
514
0
        return _Py_ERROR_BACKSLASHREPLACE;
515
0
    }
516
0
    if (wcscmp(errors, L"surrogatepass") == 0) {
517
0
        return _Py_ERROR_SURROGATEPASS;
518
0
    }
519
0
    if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
520
0
        return _Py_ERROR_XMLCHARREFREPLACE;
521
0
    }
522
0
    return _Py_ERROR_OTHER;
523
0
}
524
525
526
static inline int
527
unicode_check_encoding_errors(const char *encoding, const char *errors)
528
27.5M
{
529
27.5M
    if (encoding == NULL && errors == NULL) {
530
13.1M
        return 0;
531
13.1M
    }
532
533
14.3M
    PyInterpreterState *interp = _PyInterpreterState_GET();
534
14.3M
#ifndef Py_DEBUG
535
    /* In release mode, only check in development mode (-X dev) */
536
14.3M
    if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
537
14.3M
        return 0;
538
14.3M
    }
539
#else
540
    /* Always check in debug mode */
541
#endif
542
543
    /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
544
       codec registry is ready: before_PyUnicode_InitEncodings() is called. */
545
0
    if (!interp->unicode.fs_codec.encoding) {
546
0
        return 0;
547
0
    }
548
549
    /* Disable checks during Python finalization. For example, it allows to
550
     * call PyUnstable_Object_Dump() during finalization for debugging purpose.
551
     */
552
0
    if (_PyInterpreterState_GetFinalizing(interp) != NULL) {
553
0
        return 0;
554
0
    }
555
556
0
    if (encoding != NULL
557
        // Fast path for the most common built-in encodings. Even if the codec
558
        // is cached, _PyCodec_Lookup() decodes the bytes string from UTF-8 to
559
        // create a temporary Unicode string (the key in the cache).
560
0
        && strcmp(encoding, "utf-8") != 0
561
0
        && strcmp(encoding, "utf8") != 0
562
0
        && strcmp(encoding, "ascii") != 0)
563
0
    {
564
0
        PyObject *handler = _PyCodec_Lookup(encoding);
565
0
        if (handler == NULL) {
566
0
            return -1;
567
0
        }
568
0
        Py_DECREF(handler);
569
0
    }
570
571
0
    if (errors != NULL
572
        // Fast path for the most common built-in error handlers.
573
0
        && strcmp(errors, "strict") != 0
574
0
        && strcmp(errors, "ignore") != 0
575
0
        && strcmp(errors, "replace") != 0
576
0
        && strcmp(errors, "surrogateescape") != 0
577
0
        && strcmp(errors, "surrogatepass") != 0)
578
0
    {
579
0
        PyObject *handler = PyCodec_LookupError(errors);
580
0
        if (handler == NULL) {
581
0
            return -1;
582
0
        }
583
0
        Py_DECREF(handler);
584
0
    }
585
0
    return 0;
586
0
}
587
588
589
int
590
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
591
0
{
592
0
#define CHECK(expr) \
593
0
    do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
594
595
0
    assert(op != NULL);
596
0
    CHECK(PyUnicode_Check(op));
597
598
0
    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
599
0
    int kind = ascii->state.kind;
600
601
0
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
602
0
        CHECK(kind == PyUnicode_1BYTE_KIND);
603
0
    }
604
0
    else {
605
0
        PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
606
0
        void *data;
607
608
0
        if (ascii->state.compact == 1) {
609
0
            data = compact + 1;
610
0
            CHECK(kind == PyUnicode_1BYTE_KIND
611
0
                                 || kind == PyUnicode_2BYTE_KIND
612
0
                                 || kind == PyUnicode_4BYTE_KIND);
613
0
            CHECK(ascii->state.ascii == 0);
614
0
            CHECK(_PyUnicode_UTF8(op) != data);
615
0
        }
616
0
        else {
617
0
            PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
618
619
0
            data = unicode->data.any;
620
0
            CHECK(kind == PyUnicode_1BYTE_KIND
621
0
                     || kind == PyUnicode_2BYTE_KIND
622
0
                     || kind == PyUnicode_4BYTE_KIND);
623
0
            CHECK(ascii->state.compact == 0);
624
0
            CHECK(data != NULL);
625
0
            if (ascii->state.ascii) {
626
0
                CHECK(_PyUnicode_UTF8(op) == data);
627
0
                CHECK(compact->utf8_length == ascii->length);
628
0
            }
629
0
            else {
630
0
                CHECK(_PyUnicode_UTF8(op) != data);
631
0
            }
632
0
        }
633
0
#ifndef Py_GIL_DISABLED
634
0
        if (_PyUnicode_UTF8(op) == NULL)
635
0
            CHECK(compact->utf8_length == 0);
636
0
#endif
637
0
    }
638
639
    /* check that the best kind is used: O(n) operation */
640
0
    if (check_content) {
641
0
        Py_ssize_t i;
642
0
        Py_UCS4 maxchar = 0;
643
0
        const void *data;
644
0
        Py_UCS4 ch;
645
646
0
        data = PyUnicode_DATA(ascii);
647
0
        for (i=0; i < ascii->length; i++)
648
0
        {
649
0
            ch = PyUnicode_READ(kind, data, i);
650
0
            if (ch > maxchar)
651
0
                maxchar = ch;
652
0
        }
653
0
        if (kind == PyUnicode_1BYTE_KIND) {
654
0
            if (ascii->state.ascii == 0) {
655
0
                CHECK(maxchar >= 128);
656
0
                CHECK(maxchar <= 255);
657
0
            }
658
0
            else
659
0
                CHECK(maxchar < 128);
660
0
        }
661
0
        else if (kind == PyUnicode_2BYTE_KIND) {
662
0
            CHECK(maxchar >= 0x100);
663
0
            CHECK(maxchar <= 0xFFFF);
664
0
        }
665
0
        else {
666
0
            CHECK(maxchar >= 0x10000);
667
0
            CHECK(maxchar <= MAX_UNICODE);
668
0
        }
669
0
        CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
670
0
    }
671
672
    /* Check interning state */
673
#ifdef Py_DEBUG
674
    // Note that we do not check `_Py_IsImmortal(op)`, since stable ABI
675
    // extensions can make immortal strings mortal (but with a high enough
676
    // refcount).
677
    // The other way is extremely unlikely (worth a potential failed assertion
678
    // in a debug build), so we do check `!_Py_IsImmortal(op)`.
679
    switch (PyUnicode_CHECK_INTERNED(op)) {
680
        case SSTATE_NOT_INTERNED:
681
            if (ascii->state.statically_allocated) {
682
                // This state is for two exceptions:
683
                // - strings are currently checked before they're interned
684
                // - the 256 one-latin1-character strings
685
                //   are static but use SSTATE_NOT_INTERNED
686
            }
687
            else {
688
                CHECK(!_Py_IsImmortal(op));
689
            }
690
            break;
691
        case SSTATE_INTERNED_MORTAL:
692
            CHECK(!ascii->state.statically_allocated);
693
            CHECK(!_Py_IsImmortal(op));
694
            break;
695
        case SSTATE_INTERNED_IMMORTAL:
696
            CHECK(!ascii->state.statically_allocated);
697
            break;
698
        case SSTATE_INTERNED_IMMORTAL_STATIC:
699
            CHECK(ascii->state.statically_allocated);
700
            break;
701
        default:
702
            Py_UNREACHABLE();
703
    }
704
#endif
705
706
0
    return 1;
707
708
0
#undef CHECK
709
0
}
710
711
/* Normalize a freshly built string before handing it to the caller.
   A zero-length string is replaced by the empty string singleton and a
   one-character 1-byte-kind string by the matching LATIN1() object; in both
   cases the reference to `unicode` is released unless it already is that
   object.  Any other string is returned unchanged. */
PyObject*
_PyUnicode_Result(PyObject *unicode)
{
    assert(_PyUnicode_CHECK(unicode));

    Py_ssize_t length = PyUnicode_GET_LENGTH(unicode);

    if (length == 0) {
        PyObject *empty = _PyUnicode_GetEmpty();
        if (unicode != empty) {
            Py_DECREF(unicode);
        }
        return empty;
    }

    if (length == 1 && PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
        Py_UCS1 ch = PyUnicode_1BYTE_DATA(unicode)[0];
        PyObject *latin1_char = LATIN1(ch);
        if (unicode != latin1_char) {
            Py_DECREF(unicode);
        }
        return latin1_char;
    }

    assert(_PyUnicode_CheckConsistency(unicode, 1));
    return unicode;
}
741
847k
#define unicode_result _PyUnicode_Result
742
743
/* Return `unicode` as the result of an operation that left it unchanged:
   a new reference to the object itself when it is an exact str, or a copy
   made by _PyUnicode_Copy() when it is an instance of a subtype. */
static PyObject*
unicode_result_unchanged(PyObject *unicode)
{
    if (!PyUnicode_CheckExact(unicode)) {
        /* Subtype -- return genuine unicode string with the same value. */
        return _PyUnicode_Copy(unicode);
    }
    return Py_NewRef(unicode);
}
753
754
/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
755
   ASCII, Latin1, UTF-8, etc. */
756
static char*
757
backslashreplace(PyBytesWriter *writer, char *str,
758
                 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
759
0
{
760
0
    Py_ssize_t size, i;
761
0
    Py_UCS4 ch;
762
0
    int kind;
763
0
    const void *data;
764
765
0
    kind = PyUnicode_KIND(unicode);
766
0
    data = PyUnicode_DATA(unicode);
767
768
0
    size = 0;
769
    /* determine replacement size */
770
0
    for (i = collstart; i < collend; ++i) {
771
0
        Py_ssize_t incr;
772
773
0
        ch = PyUnicode_READ(kind, data, i);
774
0
        if (ch < 0x100)
775
0
            incr = 2+2;
776
0
        else if (ch < 0x10000)
777
0
            incr = 2+4;
778
0
        else {
779
0
            assert(ch <= MAX_UNICODE);
780
0
            incr = 2+8;
781
0
        }
782
0
        if (size > PY_SSIZE_T_MAX - incr) {
783
0
            PyErr_SetString(PyExc_OverflowError,
784
0
                            "encoded result is too long for a Python string");
785
0
            return NULL;
786
0
        }
787
0
        size += incr;
788
0
    }
789
790
0
    str = PyBytesWriter_GrowAndUpdatePointer(writer, size, str);
791
0
    if (str == NULL) {
792
0
        return NULL;
793
0
    }
794
795
    /* generate replacement */
796
0
    for (i = collstart; i < collend; ++i) {
797
0
        ch = PyUnicode_READ(kind, data, i);
798
0
        *str++ = '\\';
799
0
        if (ch >= 0x00010000) {
800
0
            *str++ = 'U';
801
0
            *str++ = Py_hexdigits[(ch>>28)&0xf];
802
0
            *str++ = Py_hexdigits[(ch>>24)&0xf];
803
0
            *str++ = Py_hexdigits[(ch>>20)&0xf];
804
0
            *str++ = Py_hexdigits[(ch>>16)&0xf];
805
0
            *str++ = Py_hexdigits[(ch>>12)&0xf];
806
0
            *str++ = Py_hexdigits[(ch>>8)&0xf];
807
0
        }
808
0
        else if (ch >= 0x100) {
809
0
            *str++ = 'u';
810
0
            *str++ = Py_hexdigits[(ch>>12)&0xf];
811
0
            *str++ = Py_hexdigits[(ch>>8)&0xf];
812
0
        }
813
0
        else
814
0
            *str++ = 'x';
815
0
        *str++ = Py_hexdigits[(ch>>4)&0xf];
816
0
        *str++ = Py_hexdigits[ch&0xf];
817
0
    }
818
0
    return str;
819
0
}
820
821
/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
822
   ASCII, Latin1, UTF-8, etc. */
823
static char*
824
xmlcharrefreplace(PyBytesWriter *writer, char *str,
825
                  PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
826
0
{
827
0
    Py_ssize_t size, i;
828
0
    Py_UCS4 ch;
829
0
    int kind;
830
0
    const void *data;
831
832
0
    kind = PyUnicode_KIND(unicode);
833
0
    data = PyUnicode_DATA(unicode);
834
835
0
    size = 0;
836
    /* determine replacement size */
837
0
    for (i = collstart; i < collend; ++i) {
838
0
        Py_ssize_t incr;
839
840
0
        ch = PyUnicode_READ(kind, data, i);
841
0
        if (ch < 10)
842
0
            incr = 2+1+1;
843
0
        else if (ch < 100)
844
0
            incr = 2+2+1;
845
0
        else if (ch < 1000)
846
0
            incr = 2+3+1;
847
0
        else if (ch < 10000)
848
0
            incr = 2+4+1;
849
0
        else if (ch < 100000)
850
0
            incr = 2+5+1;
851
0
        else if (ch < 1000000)
852
0
            incr = 2+6+1;
853
0
        else {
854
0
            assert(ch <= MAX_UNICODE);
855
0
            incr = 2+7+1;
856
0
        }
857
0
        if (size > PY_SSIZE_T_MAX - incr) {
858
0
            PyErr_SetString(PyExc_OverflowError,
859
0
                            "encoded result is too long for a Python string");
860
0
            return NULL;
861
0
        }
862
0
        size += incr;
863
0
    }
864
865
0
    str = PyBytesWriter_GrowAndUpdatePointer(writer, size, str);
866
0
    if (str == NULL) {
867
0
        return NULL;
868
0
    }
869
870
    /* generate replacement */
871
0
    for (i = collstart; i < collend; ++i) {
872
0
        size = sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
873
0
        if (size < 0) {
874
0
            return NULL;
875
0
        }
876
0
        str += size;
877
0
    }
878
0
    return str;
879
0
}
880
881
/* --- Bloom Filters ----------------------------------------------------- */
882
883
/* stuff to implement simple "bloom filters" for Unicode characters.
884
   to keep things simple, we use a single bitmask, using the low log2(BLOOM_WIDTH)
885
   bits from each Unicode character as the bit index. */
886
887
/* the linebreak mask is set up by _PyUnicode_Init() below */
888
889
#if LONG_BIT >= 128
890
#define BLOOM_WIDTH 128
891
#elif LONG_BIT >= 64
892
50.7M
#define BLOOM_WIDTH 64
893
#elif LONG_BIT >= 32
894
#define BLOOM_WIDTH 32
895
#else
896
#error "LONG_BIT is smaller than 32"
897
#endif
898
899
18.8M
#define BLOOM_MASK unsigned long
900
901
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
902
903
73.5M
#define BLOOM(mask, ch)     ((mask &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
904
905
#define BLOOM_LINEBREAK(ch)                                             \
906
268M
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
907
268M
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
908
909
static inline BLOOM_MASK
910
make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
911
9.41M
{
912
9.41M
#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN)             \
913
9.41M
    do {                                               \
914
9.41M
        TYPE *data = (TYPE *)PTR;                      \
915
9.41M
        TYPE *end = data + LEN;                        \
916
9.41M
        Py_UCS4 ch;                                    \
917
20.6M
        for (; data != end; data++) {                  \
918
11.2M
            ch = *data;                                \
919
11.2M
            MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
920
11.2M
        }                                              \
921
9.41M
        break;                                         \
922
9.41M
    } while (0)
923
924
    /* calculate simple bloom-style bitmask for a given unicode string */
925
926
9.41M
    BLOOM_MASK mask;
927
928
9.41M
    mask = 0;
929
9.41M
    switch (kind) {
930
9.41M
    case PyUnicode_1BYTE_KIND:
931
9.41M
        BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
932
9.41M
        break;
933
28
    case PyUnicode_2BYTE_KIND:
934
28
        BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
935
28
        break;
936
0
    case PyUnicode_4BYTE_KIND:
937
0
        BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
938
0
        break;
939
0
    default:
940
0
        Py_UNREACHABLE();
941
9.41M
    }
942
9.41M
    return mask;
943
944
9.41M
#undef BLOOM_UPDATE
945
9.41M
}
946
947
/* Compilation of templated routines */
948
949
1.46M
#define STRINGLIB_GET_EMPTY() _PyUnicode_GetEmpty()
950
951
#include "stringlib/asciilib.h"
952
#include "stringlib/fastsearch.h"
953
#include "stringlib/partition.h"
954
#include "stringlib/split.h"
955
#include "stringlib/count.h"
956
#include "stringlib/find.h"
957
#include "stringlib/find_max_char.h"
958
#include "stringlib/undef.h"
959
960
#include "stringlib/ucs1lib.h"
961
#include "stringlib/fastsearch.h"
962
#include "stringlib/partition.h"
963
#include "stringlib/split.h"
964
#include "stringlib/count.h"
965
#include "stringlib/find.h"
966
#include "stringlib/replace.h"
967
#include "stringlib/repr.h"
968
#include "stringlib/find_max_char.h"
969
#include "stringlib/undef.h"
970
971
#include "stringlib/ucs2lib.h"
972
#include "stringlib/fastsearch.h"
973
#include "stringlib/partition.h"
974
#include "stringlib/split.h"
975
#include "stringlib/count.h"
976
#include "stringlib/find.h"
977
#include "stringlib/replace.h"
978
#include "stringlib/repr.h"
979
#include "stringlib/find_max_char.h"
980
#include "stringlib/undef.h"
981
982
#include "stringlib/ucs4lib.h"
983
#include "stringlib/fastsearch.h"
984
#include "stringlib/partition.h"
985
#include "stringlib/split.h"
986
#include "stringlib/count.h"
987
#include "stringlib/find.h"
988
#include "stringlib/replace.h"
989
#include "stringlib/repr.h"
990
#include "stringlib/find_max_char.h"
991
#include "stringlib/undef.h"
992
993
#undef STRINGLIB_GET_EMPTY
994
995
/* --- Unicode Object ----------------------------------------------------- */
996
997
static inline Py_ssize_t
998
findchar(const void *s, int kind,
999
         Py_ssize_t size, Py_UCS4 ch,
1000
         int direction)
1001
249M
{
1002
249M
    switch (kind) {
1003
237M
    case PyUnicode_1BYTE_KIND:
1004
237M
        if ((Py_UCS1) ch != ch)
1005
3.42k
            return -1;
1006
237M
        if (direction > 0)
1007
237M
            return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
1008
59.9k
        else
1009
59.9k
            return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
1010
10.1M
    case PyUnicode_2BYTE_KIND:
1011
10.1M
        if ((Py_UCS2) ch != ch)
1012
0
            return -1;
1013
10.1M
        if (direction > 0)
1014
9.89M
            return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
1015
218k
        else
1016
218k
            return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
1017
1.36M
    case PyUnicode_4BYTE_KIND:
1018
1.36M
        if (direction > 0)
1019
1.25M
            return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
1020
103k
        else
1021
103k
            return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
1022
0
    default:
1023
0
        Py_UNREACHABLE();
1024
249M
    }
1025
249M
}
1026
1027
#ifdef Py_DEBUG
1028
/* Fill the data of a Unicode string with invalid characters to detect bugs
1029
   earlier.
1030
1031
   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
1032
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
1033
   invalid character in Unicode 6.0. */
1034
static void
1035
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
1036
{
1037
    int kind = PyUnicode_KIND(unicode);
1038
    Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
1039
    Py_ssize_t length = _PyUnicode_LENGTH(unicode);
1040
    if (length <= old_length)
1041
        return;
1042
    memset(data + old_length * kind, 0xff, (length - old_length) * kind);
1043
}
1044
#endif
1045
1046
static PyObject*
1047
resize_copy(PyObject *unicode, Py_ssize_t length)
1048
0
{
1049
0
    Py_ssize_t copy_length;
1050
0
    PyObject *copy;
1051
1052
0
    copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1053
0
    if (copy == NULL)
1054
0
        return NULL;
1055
1056
0
    copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
1057
0
    _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
1058
0
    return copy;
1059
0
}
1060
1061
PyObject*
1062
_PyUnicode_ResizeCompact(PyObject *unicode, Py_ssize_t length)
1063
70.9M
{
1064
70.9M
    Py_ssize_t char_size;
1065
70.9M
    Py_ssize_t struct_size;
1066
70.9M
    Py_ssize_t new_size;
1067
70.9M
    PyObject *new_unicode;
1068
#ifdef Py_DEBUG
1069
    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1070
#endif
1071
1072
70.9M
    if (!_PyUnicode_IsModifiable(unicode)) {
1073
0
        PyObject *copy = resize_copy(unicode, length);
1074
0
        if (copy == NULL) {
1075
0
            return NULL;
1076
0
        }
1077
0
        Py_DECREF(unicode);
1078
0
        return copy;
1079
0
    }
1080
70.9M
    assert(PyUnicode_IS_COMPACT(unicode));
1081
1082
70.9M
    char_size = PyUnicode_KIND(unicode);
1083
70.9M
    if (PyUnicode_IS_ASCII(unicode))
1084
60.3M
        struct_size = sizeof(PyASCIIObject);
1085
10.6M
    else
1086
10.6M
        struct_size = sizeof(PyCompactUnicodeObject);
1087
1088
70.9M
    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1089
0
        PyErr_NoMemory();
1090
0
        return NULL;
1091
0
    }
1092
70.9M
    new_size = (struct_size + (length + 1) * char_size);
1093
1094
70.9M
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1095
0
        PyMem_Free(_PyUnicode_UTF8(unicode));
1096
0
        PyUnicode_SET_UTF8_LENGTH(unicode, 0);
1097
0
        PyUnicode_SET_UTF8(unicode, NULL);
1098
0
    }
1099
#ifdef Py_TRACE_REFS
1100
    _Py_ForgetReference(unicode);
1101
#endif
1102
70.9M
    _PyReftracerTrack(unicode, PyRefTracer_DESTROY);
1103
1104
70.9M
    new_unicode = (PyObject *)PyObject_Realloc(unicode, new_size);
1105
70.9M
    if (new_unicode == NULL) {
1106
0
        _Py_NewReferenceNoTotal(unicode);
1107
0
        PyErr_NoMemory();
1108
0
        return NULL;
1109
0
    }
1110
70.9M
    unicode = new_unicode;
1111
70.9M
    _Py_NewReferenceNoTotal(unicode);
1112
1113
70.9M
    _PyUnicode_LENGTH(unicode) = length;
1114
#ifdef Py_DEBUG
1115
    unicode_fill_invalid(unicode, old_length);
1116
#endif
1117
70.9M
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1118
70.9M
                    length, 0);
1119
70.9M
    assert(_PyUnicode_CheckConsistency(unicode, 0));
1120
70.9M
    return unicode;
1121
70.9M
}
1122
1123
static int
1124
resize_inplace(PyObject *unicode, Py_ssize_t length)
1125
0
{
1126
0
    assert(!PyUnicode_IS_COMPACT(unicode));
1127
0
    assert(Py_REFCNT(unicode) == 1);
1128
1129
0
    Py_ssize_t new_size;
1130
0
    Py_ssize_t char_size;
1131
0
    int share_utf8;
1132
0
    void *data;
1133
#ifdef Py_DEBUG
1134
    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1135
#endif
1136
1137
0
    data = _PyUnicode_DATA_ANY(unicode);
1138
0
    char_size = PyUnicode_KIND(unicode);
1139
0
    share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
1140
1141
0
    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1142
0
        PyErr_NoMemory();
1143
0
        return -1;
1144
0
    }
1145
0
    new_size = (length + 1) * char_size;
1146
1147
0
    if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1148
0
    {
1149
0
        PyMem_Free(_PyUnicode_UTF8(unicode));
1150
0
        PyUnicode_SET_UTF8_LENGTH(unicode, 0);
1151
0
        PyUnicode_SET_UTF8(unicode, NULL);
1152
0
    }
1153
1154
0
    data = (PyObject *)PyObject_Realloc(data, new_size);
1155
0
    if (data == NULL) {
1156
0
        PyErr_NoMemory();
1157
0
        return -1;
1158
0
    }
1159
0
    _PyUnicode_DATA_ANY(unicode) = data;
1160
0
    if (share_utf8) {
1161
0
        PyUnicode_SET_UTF8_LENGTH(unicode, length);
1162
0
        PyUnicode_SET_UTF8(unicode, data);
1163
0
    }
1164
0
    _PyUnicode_LENGTH(unicode) = length;
1165
0
    PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
1166
#ifdef Py_DEBUG
1167
    unicode_fill_invalid(unicode, old_length);
1168
#endif
1169
1170
    /* check for integer overflow */
1171
0
    if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
1172
0
        PyErr_NoMemory();
1173
0
        return -1;
1174
0
    }
1175
0
    assert(_PyUnicode_CheckConsistency(unicode, 0));
1176
0
    return 0;
1177
0
}
1178
1179
static const char*
1180
unicode_kind_name(PyObject *unicode)
1181
0
{
1182
    /* don't check consistency: unicode_kind_name() is called from
1183
       _PyUnicode_Dump() */
1184
0
    if (!PyUnicode_IS_COMPACT(unicode))
1185
0
    {
1186
0
        switch (PyUnicode_KIND(unicode))
1187
0
        {
1188
0
        case PyUnicode_1BYTE_KIND:
1189
0
            if (PyUnicode_IS_ASCII(unicode))
1190
0
                return "legacy ascii";
1191
0
            else
1192
0
                return "legacy latin1";
1193
0
        case PyUnicode_2BYTE_KIND:
1194
0
            return "legacy UCS2";
1195
0
        case PyUnicode_4BYTE_KIND:
1196
0
            return "legacy UCS4";
1197
0
        default:
1198
0
            return "<legacy invalid kind>";
1199
0
        }
1200
0
    }
1201
0
    switch (PyUnicode_KIND(unicode)) {
1202
0
    case PyUnicode_1BYTE_KIND:
1203
0
        if (PyUnicode_IS_ASCII(unicode))
1204
0
            return "ascii";
1205
0
        else
1206
0
            return "latin1";
1207
0
    case PyUnicode_2BYTE_KIND:
1208
0
        return "UCS2";
1209
0
    case PyUnicode_4BYTE_KIND:
1210
0
        return "UCS4";
1211
0
    default:
1212
0
        return "<invalid compact kind>";
1213
0
    }
1214
0
}
1215
1216
#ifdef Py_DEBUG
1217
/* Functions wrapping macros for use in debugger */
1218
/* Function form of the PyUnicode_UTF8() macro; takes an untyped pointer to
   the string object. */
const char *_PyUnicode_utf8(void *unicode_raw){
    return PyUnicode_UTF8(_PyObject_CAST(unicode_raw));
}
1222
1223
/* Function form of the _PyUnicode_COMPACT_DATA() macro; takes an untyped
   pointer to the string object. */
const void *_PyUnicode_compact_data(void *unicode_raw) {
    return _PyUnicode_COMPACT_DATA(_PyObject_CAST(unicode_raw));
}
1227
const void *_PyUnicode_data(void *unicode_raw) {
1228
    PyObject *unicode = _PyObject_CAST(unicode_raw);
1229
    printf("obj %p\n", (void*)unicode);
1230
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1231
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1232
    printf("ascii op %p\n", (void*)(_PyASCIIObject_CAST(unicode) + 1));
1233
    printf("compact op %p\n", (void*)(_PyCompactUnicodeObject_CAST(unicode) + 1));
1234
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1235
    return PyUnicode_DATA(unicode);
1236
}
1237
1238
void
1239
_PyUnicode_Dump(PyObject *op)
1240
{
1241
    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
1242
    PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
1243
    PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
1244
    const void *data;
1245
1246
    if (ascii->state.compact)
1247
    {
1248
        if (ascii->state.ascii)
1249
            data = (ascii + 1);
1250
        else
1251
            data = (compact + 1);
1252
    }
1253
    else
1254
        data = unicode->data.any;
1255
    printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);
1256
1257
    if (!ascii->state.ascii) {
1258
        printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
1259
    }
1260
    printf(", data=%p\n", data);
1261
}
1262
#endif
1263
1264
1265
PyObject *
1266
PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1267
579M
{
1268
    /* Optimization for empty strings */
1269
579M
    if (size == 0) {
1270
30.5M
        return _PyUnicode_GetEmpty();
1271
30.5M
    }
1272
1273
548M
    PyObject *obj;
1274
548M
    PyCompactUnicodeObject *unicode;
1275
548M
    void *data;
1276
548M
    int kind;
1277
548M
    int is_ascii;
1278
548M
    Py_ssize_t char_size;
1279
548M
    Py_ssize_t struct_size;
1280
1281
548M
    is_ascii = 0;
1282
548M
    struct_size = sizeof(PyCompactUnicodeObject);
1283
548M
    if (maxchar < 128) {
1284
375M
        kind = PyUnicode_1BYTE_KIND;
1285
375M
        char_size = 1;
1286
375M
        is_ascii = 1;
1287
375M
        struct_size = sizeof(PyASCIIObject);
1288
375M
    }
1289
172M
    else if (maxchar < 256) {
1290
13.6M
        kind = PyUnicode_1BYTE_KIND;
1291
13.6M
        char_size = 1;
1292
13.6M
    }
1293
159M
    else if (maxchar < 65536) {
1294
154M
        kind = PyUnicode_2BYTE_KIND;
1295
154M
        char_size = 2;
1296
154M
    }
1297
4.98M
    else {
1298
4.98M
        if (maxchar > MAX_UNICODE) {
1299
0
            PyErr_SetString(PyExc_SystemError,
1300
0
                            "invalid maximum character passed to PyUnicode_New");
1301
0
            return NULL;
1302
0
        }
1303
4.98M
        kind = PyUnicode_4BYTE_KIND;
1304
4.98M
        char_size = 4;
1305
4.98M
    }
1306
1307
    /* Ensure we won't overflow the size. */
1308
548M
    if (size < 0) {
1309
0
        PyErr_SetString(PyExc_SystemError,
1310
0
                        "Negative size passed to PyUnicode_New");
1311
0
        return NULL;
1312
0
    }
1313
548M
    if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1314
0
        return PyErr_NoMemory();
1315
1316
    /* Duplicated allocation code from _PyObject_New() instead of a call to
1317
     * PyObject_New() so we are able to allocate space for the object and
1318
     * it's data buffer.
1319
     */
1320
548M
    obj = (PyObject *) PyObject_Malloc(struct_size + (size + 1) * char_size);
1321
548M
    if (obj == NULL) {
1322
0
        return PyErr_NoMemory();
1323
0
    }
1324
548M
    _PyObject_Init(obj, &PyUnicode_Type);
1325
1326
548M
    unicode = (PyCompactUnicodeObject *)obj;
1327
548M
    if (is_ascii)
1328
375M
        data = ((PyASCIIObject*)obj) + 1;
1329
172M
    else
1330
172M
        data = unicode + 1;
1331
548M
    _PyUnicode_LENGTH(unicode) = size;
1332
548M
    _PyUnicode_HASH(unicode) = -1;
1333
548M
    _PyUnicode_STATE(unicode).interned = 0;
1334
548M
    _PyUnicode_STATE(unicode).kind = kind;
1335
548M
    _PyUnicode_STATE(unicode).compact = 1;
1336
548M
    _PyUnicode_STATE(unicode).ascii = is_ascii;
1337
548M
    _PyUnicode_STATE(unicode).statically_allocated = 0;
1338
548M
    if (is_ascii) {
1339
375M
        ((char*)data)[size] = 0;
1340
375M
    }
1341
172M
    else if (kind == PyUnicode_1BYTE_KIND) {
1342
13.6M
        ((char*)data)[size] = 0;
1343
13.6M
        unicode->utf8 = NULL;
1344
13.6M
        unicode->utf8_length = 0;
1345
13.6M
    }
1346
159M
    else {
1347
159M
        unicode->utf8 = NULL;
1348
159M
        unicode->utf8_length = 0;
1349
159M
        if (kind == PyUnicode_2BYTE_KIND)
1350
154M
            ((Py_UCS2*)data)[size] = 0;
1351
4.98M
        else /* kind == PyUnicode_4BYTE_KIND */
1352
4.98M
            ((Py_UCS4*)data)[size] = 0;
1353
159M
    }
1354
#ifdef Py_DEBUG
1355
    unicode_fill_invalid((PyObject*)unicode, 0);
1356
#endif
1357
548M
    assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1358
548M
    return obj;
1359
548M
}
1360
1361
static int
1362
unicode_check_modifiable(PyObject *unicode)
1363
617
{
1364
617
    if (!_PyUnicode_IsModifiable(unicode)) {
1365
0
        PyErr_SetString(PyExc_SystemError,
1366
0
                        "Cannot modify a string currently used");
1367
0
        return -1;
1368
0
    }
1369
617
    return 0;
1370
617
}
1371
1372
static int
1373
_copy_characters(PyObject *to, Py_ssize_t to_start,
1374
                 PyObject *from, Py_ssize_t from_start,
1375
                 Py_ssize_t how_many, int check_maxchar)
1376
352M
{
1377
352M
    int from_kind, to_kind;
1378
352M
    const void *from_data;
1379
352M
    void *to_data;
1380
1381
352M
    assert(0 <= how_many);
1382
352M
    assert(0 <= from_start);
1383
352M
    assert(0 <= to_start);
1384
352M
    assert(PyUnicode_Check(from));
1385
352M
    assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1386
1387
352M
    assert(to == NULL || PyUnicode_Check(to));
1388
1389
352M
    if (how_many == 0) {
1390
436k
        return 0;
1391
436k
    }
1392
1393
352M
    assert(to != NULL);
1394
352M
    assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1395
1396
352M
    from_kind = PyUnicode_KIND(from);
1397
352M
    from_data = PyUnicode_DATA(from);
1398
352M
    to_kind = PyUnicode_KIND(to);
1399
352M
    to_data = PyUnicode_DATA(to);
1400
1401
#ifdef Py_DEBUG
1402
    if (!check_maxchar
1403
        && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1404
    {
1405
        Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1406
        Py_UCS4 ch;
1407
        Py_ssize_t i;
1408
        for (i=0; i < how_many; i++) {
1409
            ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1410
            assert(ch <= to_maxchar);
1411
        }
1412
    }
1413
#endif
1414
1415
352M
    if (from_kind == to_kind) {
1416
246M
        if (check_maxchar
1417
0
            && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1418
0
        {
1419
            /* Writing Latin-1 characters into an ASCII string requires to
1420
               check that all written characters are pure ASCII */
1421
0
            Py_UCS4 max_char;
1422
0
            max_char = ucs1lib_find_max_char(from_data,
1423
0
                                             (const Py_UCS1*)from_data + how_many);
1424
0
            if (max_char >= 128)
1425
0
                return -1;
1426
0
        }
1427
246M
        memcpy((char*)to_data + to_kind * to_start,
1428
246M
                  (const char*)from_data + from_kind * from_start,
1429
246M
                  to_kind * how_many);
1430
246M
    }
1431
105M
    else if (from_kind == PyUnicode_1BYTE_KIND
1432
104M
             && to_kind == PyUnicode_2BYTE_KIND)
1433
87.4M
    {
1434
87.4M
        _PyUnicode_CONVERT_BYTES(
1435
87.4M
            Py_UCS1, Py_UCS2,
1436
87.4M
            PyUnicode_1BYTE_DATA(from) + from_start,
1437
87.4M
            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1438
87.4M
            PyUnicode_2BYTE_DATA(to) + to_start
1439
87.4M
            );
1440
87.4M
    }
1441
18.4M
    else if (from_kind == PyUnicode_1BYTE_KIND
1442
16.6M
             && to_kind == PyUnicode_4BYTE_KIND)
1443
16.6M
    {
1444
16.6M
        _PyUnicode_CONVERT_BYTES(
1445
16.6M
            Py_UCS1, Py_UCS4,
1446
16.6M
            PyUnicode_1BYTE_DATA(from) + from_start,
1447
16.6M
            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1448
16.6M
            PyUnicode_4BYTE_DATA(to) + to_start
1449
16.6M
            );
1450
16.6M
    }
1451
1.85M
    else if (from_kind == PyUnicode_2BYTE_KIND
1452
1.83M
             && to_kind == PyUnicode_4BYTE_KIND)
1453
1.82M
    {
1454
1.82M
        _PyUnicode_CONVERT_BYTES(
1455
1.82M
            Py_UCS2, Py_UCS4,
1456
1.82M
            PyUnicode_2BYTE_DATA(from) + from_start,
1457
1.82M
            PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1458
1.82M
            PyUnicode_4BYTE_DATA(to) + to_start
1459
1.82M
            );
1460
1.82M
    }
1461
24.1k
    else {
1462
24.1k
        assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1463
1464
24.1k
        if (!check_maxchar) {
1465
24.1k
            if (from_kind == PyUnicode_2BYTE_KIND
1466
2.51k
                && to_kind == PyUnicode_1BYTE_KIND)
1467
2.51k
            {
1468
2.51k
                _PyUnicode_CONVERT_BYTES(
1469
2.51k
                    Py_UCS2, Py_UCS1,
1470
2.51k
                    PyUnicode_2BYTE_DATA(from) + from_start,
1471
2.51k
                    PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1472
2.51k
                    PyUnicode_1BYTE_DATA(to) + to_start
1473
2.51k
                    );
1474
2.51k
            }
1475
21.6k
            else if (from_kind == PyUnicode_4BYTE_KIND
1476
21.6k
                     && to_kind == PyUnicode_1BYTE_KIND)
1477
10.1k
            {
1478
10.1k
                _PyUnicode_CONVERT_BYTES(
1479
10.1k
                    Py_UCS4, Py_UCS1,
1480
10.1k
                    PyUnicode_4BYTE_DATA(from) + from_start,
1481
10.1k
                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1482
10.1k
                    PyUnicode_1BYTE_DATA(to) + to_start
1483
10.1k
                    );
1484
10.1k
            }
1485
11.4k
            else if (from_kind == PyUnicode_4BYTE_KIND
1486
11.4k
                     && to_kind == PyUnicode_2BYTE_KIND)
1487
11.4k
            {
1488
11.4k
                _PyUnicode_CONVERT_BYTES(
1489
11.4k
                    Py_UCS4, Py_UCS2,
1490
11.4k
                    PyUnicode_4BYTE_DATA(from) + from_start,
1491
11.4k
                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1492
11.4k
                    PyUnicode_2BYTE_DATA(to) + to_start
1493
11.4k
                    );
1494
11.4k
            }
1495
0
            else {
1496
0
                Py_UNREACHABLE();
1497
0
            }
1498
24.1k
        }
1499
0
        else {
1500
0
            const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1501
0
            Py_UCS4 ch;
1502
0
            Py_ssize_t i;
1503
1504
0
            for (i=0; i < how_many; i++) {
1505
0
                ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1506
0
                if (ch > to_maxchar)
1507
0
                    return -1;
1508
0
                PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1509
0
            }
1510
0
        }
1511
24.1k
    }
1512
352M
    return 0;
1513
352M
}
1514
1515
void
1516
_PyUnicode_FastCopyCharacters(
1517
    PyObject *to, Py_ssize_t to_start,
1518
    PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1519
352M
{
1520
352M
    (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1521
352M
}
1522
1523
Py_ssize_t
1524
PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1525
                         PyObject *from, Py_ssize_t from_start,
1526
                         Py_ssize_t how_many)
1527
0
{
1528
0
    int err;
1529
1530
0
    if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1531
0
        PyErr_BadInternalCall();
1532
0
        return -1;
1533
0
    }
1534
1535
0
    if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
1536
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
1537
0
        return -1;
1538
0
    }
1539
0
    if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
1540
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
1541
0
        return -1;
1542
0
    }
1543
0
    if (how_many < 0) {
1544
0
        PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1545
0
        return -1;
1546
0
    }
1547
0
    how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
1548
0
    if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1549
0
        PyErr_Format(PyExc_SystemError,
1550
0
                     "Cannot write %zi characters at %zi "
1551
0
                     "in a string of %zi characters",
1552
0
                     how_many, to_start, PyUnicode_GET_LENGTH(to));
1553
0
        return -1;
1554
0
    }
1555
1556
0
    if (how_many == 0)
1557
0
        return 0;
1558
1559
0
    if (unicode_check_modifiable(to))
1560
0
        return -1;
1561
1562
0
    err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1563
0
    if (err) {
1564
0
        PyErr_Format(PyExc_SystemError,
1565
0
                     "Cannot copy %s characters "
1566
0
                     "into a string of %s characters",
1567
0
                     unicode_kind_name(from),
1568
0
                     unicode_kind_name(to));
1569
0
        return -1;
1570
0
    }
1571
0
    return how_many;
1572
0
}
1573
1574
/* Find the maximum code point and count the number of surrogate pairs so a
1575
   correct string length can be computed before converting a string to UCS4.
1576
   This function counts single surrogates as a character and not as a pair.
1577
1578
   Return 0 on success, or -1 on error. */
1579
static int
1580
find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1581
                        Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1582
373k
{
1583
373k
    const wchar_t *iter;
1584
373k
    Py_UCS4 ch;
1585
1586
373k
    assert(num_surrogates != NULL && maxchar != NULL);
1587
373k
    *num_surrogates = 0;
1588
373k
    *maxchar = 0;
1589
1590
8.74M
    for (iter = begin; iter < end; ) {
1591
#if SIZEOF_WCHAR_T == 2
1592
        if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1593
            && (iter+1) < end
1594
            && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1595
        {
1596
            ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1597
            ++(*num_surrogates);
1598
            iter += 2;
1599
        }
1600
        else
1601
#endif
1602
8.37M
        {
1603
8.37M
            ch = *iter;
1604
8.37M
            iter++;
1605
8.37M
        }
1606
8.37M
        if (ch > *maxchar) {
1607
1.49M
            *maxchar = ch;
1608
1.49M
            if (*maxchar > MAX_UNICODE) {
1609
0
                PyErr_Format(PyExc_ValueError,
1610
0
                             "character U+%x is not in range [U+0000; U+%x]",
1611
0
                             ch, MAX_UNICODE);
1612
0
                return -1;
1613
0
            }
1614
1.49M
        }
1615
8.37M
    }
1616
373k
    return 0;
1617
373k
}
1618
1619
static void
1620
unicode_dealloc(PyObject *unicode)
1621
564M
{
1622
#ifdef Py_DEBUG
1623
    if (!unicode_is_finalizing() && unicode_is_singleton(unicode)) {
1624
        _Py_FatalRefcountError("deallocating an Unicode singleton");
1625
    }
1626
#endif
1627
564M
    if (_PyUnicode_STATE(unicode).statically_allocated) {
1628
        /* This should never get called, but we also don't want to SEGV if
1629
        * we accidentally decref an immortal string out of existence. Since
1630
        * the string is an immortal object, just re-set the reference count.
1631
        */
1632
#ifdef Py_DEBUG
1633
        Py_UNREACHABLE();
1634
#endif
1635
0
        _Py_SetImmortal(unicode);
1636
0
        return;
1637
0
    }
1638
564M
    switch (_PyUnicode_STATE(unicode).interned) {
1639
564M
        case SSTATE_NOT_INTERNED:
1640
564M
            break;
1641
476k
        case SSTATE_INTERNED_MORTAL:
1642
            /* Remove the object from the intern dict.
1643
             * Before doing so, we set the refcount to 2: the key and value
1644
             * in the interned_dict.
1645
             */
1646
476k
            assert(Py_REFCNT(unicode) == 0);
1647
476k
            Py_SET_REFCNT(unicode, 2);
1648
#ifdef Py_REF_DEBUG
1649
            /* let's be pedantic with the ref total */
1650
            _Py_IncRefTotal(_PyThreadState_GET());
1651
            _Py_IncRefTotal(_PyThreadState_GET());
1652
#endif
1653
476k
            PyInterpreterState *interp = _PyInterpreterState_GET();
1654
476k
            PyObject *interned = get_interned_dict(interp);
1655
476k
            assert(interned != NULL);
1656
476k
            PyObject *popped;
1657
476k
            int r = PyDict_Pop(interned, unicode, &popped);
1658
476k
            if (r == -1) {
1659
0
                PyErr_FormatUnraisable("Exception ignored while "
1660
0
                                       "removing an interned string %R",
1661
0
                                       unicode);
1662
                // We don't know what happened to the string. It's probably
1663
                // best to leak it:
1664
                // - if it was popped, there are no more references to it
1665
                //   so it can't cause trouble (except wasted memory)
1666
                // - if it wasn't popped, it'll remain interned
1667
0
                _Py_SetImmortal(unicode);
1668
0
                _PyUnicode_STATE(unicode).interned = SSTATE_INTERNED_IMMORTAL;
1669
0
                return;
1670
0
            }
1671
476k
            if (r == 0) {
1672
                // The interned string was not found in the interned_dict.
1673
#ifdef Py_DEBUG
1674
                Py_UNREACHABLE();
1675
#endif
1676
0
                _Py_SetImmortal(unicode);
1677
0
                return;
1678
0
            }
1679
            // Successfully popped.
1680
476k
            assert(popped == unicode);
1681
            // Only our `popped` reference should be left; remove it too.
1682
476k
            assert(Py_REFCNT(unicode) == 1);
1683
476k
            Py_SET_REFCNT(unicode, 0);
1684
#ifdef Py_REF_DEBUG
1685
            /* let's be pedantic with the ref total */
1686
            _Py_DecRefTotal(_PyThreadState_GET());
1687
#endif
1688
476k
            break;
1689
0
        default:
1690
            // As with `statically_allocated` above.
1691
#ifdef Py_REF_DEBUG
1692
            Py_UNREACHABLE();
1693
#endif
1694
0
            _Py_SetImmortal(unicode);
1695
0
            return;
1696
564M
    }
1697
564M
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1698
165k
        PyMem_Free(_PyUnicode_UTF8(unicode));
1699
165k
    }
1700
564M
    if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
1701
16.6M
        PyMem_Free(_PyUnicode_DATA_ANY(unicode));
1702
16.6M
    }
1703
1704
564M
    Py_TYPE(unicode)->tp_free(unicode);
1705
564M
}
1706
1707
#ifdef Py_DEBUG
1708
static int
1709
unicode_is_singleton(PyObject *unicode)
1710
{
1711
    if (unicode == &_Py_STR(empty)) {
1712
        return 1;
1713
    }
1714
1715
    PyASCIIObject *ascii = _PyASCIIObject_CAST(unicode);
1716
    if (ascii->length == 1) {
1717
        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1718
        if (ch < 256 && LATIN1(ch) == unicode) {
1719
            return 1;
1720
        }
1721
    }
1722
    return 0;
1723
}
1724
#endif
1725
1726
int
1727
_PyUnicode_IsModifiable(PyObject *unicode)
1728
75.9M
{
1729
75.9M
    assert(_PyUnicode_CHECK(unicode));
1730
75.9M
    if (!_PyObject_IsUniquelyReferenced(unicode))
1731
1.48M
        return 0;
1732
74.4M
    if (PyUnicode_HASH(unicode) != -1)
1733
0
        return 0;
1734
74.4M
    if (PyUnicode_CHECK_INTERNED(unicode))
1735
0
        return 0;
1736
74.4M
    if (!PyUnicode_CheckExact(unicode))
1737
0
        return 0;
1738
#ifdef Py_DEBUG
1739
    /* singleton refcount is greater than 1 */
1740
    assert(!unicode_is_singleton(unicode));
1741
#endif
1742
74.4M
    return 1;
1743
74.4M
}
1744
1745
static int
1746
unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1747
1.72M
{
1748
1.72M
    PyObject *unicode;
1749
1.72M
    Py_ssize_t old_length;
1750
1751
1.72M
    assert(p_unicode != NULL);
1752
1.72M
    unicode = *p_unicode;
1753
1754
1.72M
    assert(unicode != NULL);
1755
1.72M
    assert(PyUnicode_Check(unicode));
1756
1.72M
    assert(0 <= length);
1757
1758
1.72M
    old_length = PyUnicode_GET_LENGTH(unicode);
1759
1.72M
    if (old_length == length)
1760
0
        return 0;
1761
1762
1.72M
    if (length == 0) {
1763
0
        PyObject *empty = _PyUnicode_GetEmpty();
1764
0
        Py_SETREF(*p_unicode, empty);
1765
0
        return 0;
1766
0
    }
1767
1768
1.72M
    if (!_PyUnicode_IsModifiable(unicode)) {
1769
0
        PyObject *copy = resize_copy(unicode, length);
1770
0
        if (copy == NULL)
1771
0
            return -1;
1772
0
        Py_SETREF(*p_unicode, copy);
1773
0
        return 0;
1774
0
    }
1775
1776
1.72M
    if (PyUnicode_IS_COMPACT(unicode)) {
1777
1.72M
        PyObject *new_unicode = _PyUnicode_ResizeCompact(unicode, length);
1778
1.72M
        if (new_unicode == NULL)
1779
0
            return -1;
1780
1.72M
        *p_unicode = new_unicode;
1781
1.72M
        return 0;
1782
1.72M
    }
1783
0
    return resize_inplace(unicode, length);
1784
1.72M
}
1785
1786
int
1787
PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1788
0
{
1789
0
    PyObject *unicode;
1790
0
    if (p_unicode == NULL) {
1791
0
        PyErr_BadInternalCall();
1792
0
        return -1;
1793
0
    }
1794
0
    unicode = *p_unicode;
1795
0
    if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
1796
0
    {
1797
0
        PyErr_BadInternalCall();
1798
0
        return -1;
1799
0
    }
1800
0
    return unicode_resize(p_unicode, length);
1801
0
}
1802
1803
static PyObject*
1804
get_latin1_char(Py_UCS1 ch)
1805
252M
{
1806
252M
    PyObject *o = LATIN1(ch);
1807
252M
    return o;
1808
252M
}
1809
1810
static PyObject*
1811
unicode_char(Py_UCS4 ch)
1812
304M
{
1813
304M
    PyObject *unicode;
1814
1815
304M
    assert(ch <= MAX_UNICODE);
1816
1817
304M
    if (ch < 256) {
1818
200M
        return get_latin1_char(ch);
1819
200M
    }
1820
1821
103M
    unicode = PyUnicode_New(1, ch);
1822
103M
    if (unicode == NULL)
1823
0
        return NULL;
1824
1825
103M
    assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
1826
103M
    if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
1827
100M
        PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1828
100M
    } else {
1829
2.79M
        assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1830
2.79M
        PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1831
2.79M
    }
1832
103M
    assert(_PyUnicode_CheckConsistency(unicode, 1));
1833
103M
    return unicode;
1834
103M
}
1835
1836
1837
static inline void
1838
unicode_write_widechar(int kind, void *data,
1839
                       const wchar_t *u, Py_ssize_t size,
1840
                       Py_ssize_t num_surrogates)
1841
373k
{
1842
373k
    switch (kind) {
1843
344k
    case PyUnicode_1BYTE_KIND:
1844
344k
        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char, u, u + size, data);
1845
344k
        break;
1846
1847
28.7k
    case PyUnicode_2BYTE_KIND:
1848
#if SIZEOF_WCHAR_T == 2
1849
        memcpy(data, u, size * 2);
1850
#else
1851
28.7k
        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2, u, u + size, data);
1852
28.7k
#endif
1853
28.7k
        break;
1854
1855
495
    case PyUnicode_4BYTE_KIND:
1856
495
    {
1857
#if SIZEOF_WCHAR_T == 2
1858
        // Convert a 16-bits wchar_t representation to UCS4, this will decode
1859
        // surrogate pairs.
1860
        const wchar_t *end = u + size;
1861
        Py_UCS4 *ucs4_out = (Py_UCS4*)data;
1862
#  ifndef NDEBUG
1863
        Py_UCS4 *ucs4_end = (Py_UCS4*)data + (size - num_surrogates);
1864
#  endif
1865
        for (const wchar_t *iter = u; iter < end; ) {
1866
            assert(ucs4_out < ucs4_end);
1867
            if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1868
                && (iter+1) < end
1869
                && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1870
            {
1871
                *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1872
                iter += 2;
1873
            }
1874
            else {
1875
                *ucs4_out++ = *iter;
1876
                iter++;
1877
            }
1878
        }
1879
        assert(ucs4_out == ucs4_end);
1880
#else
1881
495
        assert(num_surrogates == 0);
1882
495
        memcpy(data, u, size * 4);
1883
495
#endif
1884
495
        break;
1885
0
    }
1886
0
    default:
1887
0
        Py_UNREACHABLE();
1888
373k
    }
1889
373k
}
1890
1891
1892
PyObject *
1893
PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
1894
695k
{
1895
695k
    PyObject *unicode;
1896
695k
    Py_UCS4 maxchar = 0;
1897
695k
    Py_ssize_t num_surrogates;
1898
1899
695k
    if (u == NULL && size != 0) {
1900
0
        PyErr_BadInternalCall();
1901
0
        return NULL;
1902
0
    }
1903
1904
695k
    if (size == -1) {
1905
1.00k
        size = wcslen(u);
1906
1.00k
    }
1907
1908
    /* If the Unicode data is known at construction time, we can apply
1909
       some optimizations which share commonly used objects. */
1910
1911
    /* Optimization for empty strings */
1912
695k
    if (size == 0)
1913
275k
        _Py_RETURN_UNICODE_EMPTY();
1914
1915
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
1916
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
1917
       non-Unicode locales and hence needs conversion to UCS-4 first. */
1918
    if (_Py_LocaleUsesNonUnicodeWchar()) {
1919
        wchar_t* converted = _Py_DecodeNonUnicodeWchar(u, size);
1920
        if (!converted) {
1921
            return NULL;
1922
        }
1923
        PyObject *unicode = _PyUnicode_FromUCS4(converted, size);
1924
        PyMem_Free(converted);
1925
        return unicode;
1926
    }
1927
#endif
1928
1929
    /* Single character Unicode objects in the Latin-1 range are
1930
       shared when using this constructor */
1931
419k
    if (size == 1 && (Py_UCS4)*u < 256)
1932
45.9k
        return get_latin1_char((unsigned char)*u);
1933
1934
    /* If not empty and not single character, copy the Unicode data
1935
       into the new object */
1936
373k
    if (find_maxchar_surrogates(u, u + size,
1937
373k
                                &maxchar, &num_surrogates) == -1)
1938
0
        return NULL;
1939
1940
373k
    unicode = PyUnicode_New(size - num_surrogates, maxchar);
1941
373k
    if (!unicode)
1942
0
        return NULL;
1943
1944
373k
    unicode_write_widechar(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1945
373k
                           u, size, num_surrogates);
1946
1947
373k
    return unicode_result(unicode);
1948
373k
}
1949
1950
1951
int
1952
PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *pub_writer,
1953
                              const wchar_t *str,
1954
                              Py_ssize_t size)
1955
0
{
1956
0
    _PyUnicodeWriter *writer = (_PyUnicodeWriter *)pub_writer;
1957
1958
0
    if (size < 0) {
1959
0
        size = wcslen(str);
1960
0
    }
1961
1962
0
    if (size == 0) {
1963
0
        return 0;
1964
0
    }
1965
1966
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
1967
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
1968
       non-Unicode locales and hence needs conversion to UCS-4 first. */
1969
    if (_Py_LocaleUsesNonUnicodeWchar()) {
1970
        wchar_t* converted = _Py_DecodeNonUnicodeWchar(str, size);
1971
        if (!converted) {
1972
            return -1;
1973
        }
1974
1975
        int res = PyUnicodeWriter_WriteUCS4(pub_writer, converted, size);
1976
        PyMem_Free(converted);
1977
        return res;
1978
    }
1979
#endif
1980
1981
0
    Py_UCS4 maxchar = 0;
1982
0
    Py_ssize_t num_surrogates;
1983
0
    if (find_maxchar_surrogates(str, str + size,
1984
0
                                &maxchar, &num_surrogates) == -1) {
1985
0
        return -1;
1986
0
    }
1987
1988
0
    if (_PyUnicodeWriter_Prepare(writer, size - num_surrogates, maxchar) < 0) {
1989
0
        return -1;
1990
0
    }
1991
1992
0
    int kind = writer->kind;
1993
0
    void *data = (Py_UCS1*)writer->data + writer->pos * kind;
1994
0
    unicode_write_widechar(kind, data, str, size, num_surrogates);
1995
1996
0
    writer->pos += size - num_surrogates;
1997
0
    return 0;
1998
0
}
1999
2000
2001
PyObject *
2002
PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
2003
5.30M
{
2004
5.30M
    if (size < 0) {
2005
0
        PyErr_SetString(PyExc_SystemError,
2006
0
                        "Negative size passed to PyUnicode_FromStringAndSize");
2007
0
        return NULL;
2008
0
    }
2009
5.30M
    if (u != NULL) {
2010
5.30M
        return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2011
5.30M
    }
2012
0
    if (size > 0) {
2013
0
        PyErr_SetString(PyExc_SystemError,
2014
0
            "NULL string with positive size with NULL passed to PyUnicode_FromStringAndSize");
2015
0
        return NULL;
2016
0
    }
2017
0
    return _PyUnicode_GetEmpty();
2018
0
}
2019
2020
PyObject *
2021
PyUnicode_FromString(const char *u)
2022
9.04M
{
2023
9.04M
    size_t size = strlen(u);
2024
9.04M
    if (size > PY_SSIZE_T_MAX) {
2025
0
        PyErr_SetString(PyExc_OverflowError, "input too long");
2026
0
        return NULL;
2027
0
    }
2028
9.04M
    return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
2029
9.04M
}
2030
2031
2032
PyObject *
2033
_PyUnicode_FromId(_Py_Identifier *id)
2034
0
{
2035
0
    PyMutex_Lock((PyMutex *)&id->mutex);
2036
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
2037
0
    struct _Py_unicode_ids *ids = &interp->unicode.ids;
2038
2039
0
    Py_ssize_t index = _Py_atomic_load_ssize(&id->index);
2040
0
    if (index < 0) {
2041
0
        struct _Py_unicode_runtime_ids *rt_ids = &interp->runtime->unicode_state.ids;
2042
2043
0
        PyMutex_Lock(&rt_ids->mutex);
2044
        // Check again to detect concurrent access. Another thread can have
2045
        // initialized the index while this thread waited for the lock.
2046
0
        index = _Py_atomic_load_ssize(&id->index);
2047
0
        if (index < 0) {
2048
0
            assert(rt_ids->next_index < PY_SSIZE_T_MAX);
2049
0
            index = rt_ids->next_index;
2050
0
            rt_ids->next_index++;
2051
0
            _Py_atomic_store_ssize(&id->index, index);
2052
0
        }
2053
0
        PyMutex_Unlock(&rt_ids->mutex);
2054
0
    }
2055
0
    assert(index >= 0);
2056
2057
0
    PyObject *obj;
2058
0
    if (index < ids->size) {
2059
0
        obj = ids->array[index];
2060
0
        if (obj) {
2061
            // Return a borrowed reference
2062
0
            goto end;
2063
0
        }
2064
0
    }
2065
2066
0
    obj = PyUnicode_DecodeUTF8Stateful(id->string, strlen(id->string),
2067
0
                                       NULL, NULL);
2068
0
    if (!obj) {
2069
0
        goto end;
2070
0
    }
2071
0
    _PyUnicode_InternImmortal(interp, &obj);
2072
2073
0
    if (index >= ids->size) {
2074
        // Overallocate to reduce the number of realloc
2075
0
        Py_ssize_t new_size = Py_MAX(index * 2, 16);
2076
0
        Py_ssize_t item_size = sizeof(ids->array[0]);
2077
0
        PyObject **new_array = PyMem_Realloc(ids->array, new_size * item_size);
2078
0
        if (new_array == NULL) {
2079
0
            PyErr_NoMemory();
2080
0
            obj = NULL;
2081
0
            goto end;
2082
0
        }
2083
0
        memset(&new_array[ids->size], 0, (new_size - ids->size) * item_size);
2084
0
        ids->array = new_array;
2085
0
        ids->size = new_size;
2086
0
    }
2087
2088
    // The array stores a strong reference
2089
0
    ids->array[index] = obj;
2090
2091
0
end:
2092
0
    PyMutex_Unlock((PyMutex *)&id->mutex);
2093
    // Return a borrowed reference
2094
0
    return obj;
2095
0
}
2096
2097
2098
static void
2099
unicode_clear_identifiers(struct _Py_unicode_state *state)
2100
0
{
2101
0
    struct _Py_unicode_ids *ids = &state->ids;
2102
0
    for (Py_ssize_t i=0; i < ids->size; i++) {
2103
0
        Py_XDECREF(ids->array[i]);
2104
0
    }
2105
0
    ids->size = 0;
2106
0
    PyMem_Free(ids->array);
2107
0
    ids->array = NULL;
2108
    // Don't reset _PyRuntime next_index: _Py_Identifier.id remains valid
2109
    // after Py_Finalize().
2110
0
}
2111
2112
2113
/* Internal function, doesn't check maximum character */
2114
2115
PyObject*
2116
_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
2117
113M
{
2118
113M
    const unsigned char *s = (const unsigned char *)buffer;
2119
113M
    PyObject *unicode;
2120
113M
    if (size == 1) {
2121
#ifdef Py_DEBUG
2122
        assert((unsigned char)s[0] < 128);
2123
#endif
2124
36.4M
        return get_latin1_char(s[0]);
2125
36.4M
    }
2126
76.8M
    unicode = PyUnicode_New(size, 127);
2127
76.8M
    if (!unicode)
2128
0
        return NULL;
2129
76.8M
    memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2130
76.8M
    assert(_PyUnicode_CheckConsistency(unicode, 1));
2131
76.8M
    return unicode;
2132
76.8M
}
2133
2134
static Py_UCS4
2135
kind_maxchar_limit(int kind)
2136
0
{
2137
0
    switch (kind) {
2138
0
    case PyUnicode_1BYTE_KIND:
2139
0
        return 0x80;
2140
0
    case PyUnicode_2BYTE_KIND:
2141
0
        return 0x100;
2142
0
    case PyUnicode_4BYTE_KIND:
2143
0
        return 0x10000;
2144
0
    default:
2145
0
        Py_UNREACHABLE();
2146
0
    }
2147
0
}
2148
2149
static PyObject*
2150
_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
2151
46.4M
{
2152
46.4M
    PyObject *res;
2153
46.4M
    unsigned char max_char;
2154
2155
46.4M
    if (size == 0) {
2156
5.83M
        _Py_RETURN_UNICODE_EMPTY();
2157
5.83M
    }
2158
46.4M
    assert(size > 0);
2159
40.6M
    if (size == 1) {
2160
12.1M
        return get_latin1_char(u[0]);
2161
12.1M
    }
2162
2163
28.4M
    max_char = ucs1lib_find_max_char(u, u + size);
2164
28.4M
    res = PyUnicode_New(size, max_char);
2165
28.4M
    if (!res)
2166
0
        return NULL;
2167
28.4M
    memcpy(PyUnicode_1BYTE_DATA(res), u, size);
2168
28.4M
    assert(_PyUnicode_CheckConsistency(res, 1));
2169
28.4M
    return res;
2170
28.4M
}
2171
2172
static PyObject*
2173
_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
2174
99.5M
{
2175
99.5M
    PyObject *res;
2176
99.5M
    Py_UCS2 max_char;
2177
2178
99.5M
    if (size == 0)
2179
14.7M
        _Py_RETURN_UNICODE_EMPTY();
2180
99.5M
    assert(size > 0);
2181
84.8M
    if (size == 1)
2182
53.5M
        return unicode_char(u[0]);
2183
2184
31.2M
    max_char = ucs2lib_find_max_char(u, u + size);
2185
31.2M
    res = PyUnicode_New(size, max_char);
2186
31.2M
    if (!res)
2187
0
        return NULL;
2188
31.2M
    if (max_char >= 256)
2189
17.9M
        memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
2190
13.3M
    else {
2191
13.3M
        _PyUnicode_CONVERT_BYTES(
2192
13.3M
            Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2193
13.3M
    }
2194
31.2M
    assert(_PyUnicode_CheckConsistency(res, 1));
2195
31.2M
    return res;
2196
31.2M
}
2197
2198
static PyObject*
2199
_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
2200
78.5M
{
2201
78.5M
    PyObject *res;
2202
78.5M
    Py_UCS4 max_char;
2203
2204
78.5M
    if (size == 0)
2205
6.43M
        _Py_RETURN_UNICODE_EMPTY();
2206
78.5M
    assert(size > 0);
2207
72.1M
    if (size == 1)
2208
49.5M
        return unicode_char(u[0]);
2209
2210
22.5M
    max_char = ucs4lib_find_max_char(u, u + size);
2211
22.5M
    res = PyUnicode_New(size, max_char);
2212
22.5M
    if (!res)
2213
0
        return NULL;
2214
22.5M
    if (max_char < 256)
2215
16.8M
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2216
22.5M
                                 PyUnicode_1BYTE_DATA(res));
2217
5.69M
    else if (max_char < 0x10000)
2218
4.32M
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2219
5.69M
                                 PyUnicode_2BYTE_DATA(res));
2220
1.37M
    else
2221
1.37M
        memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
2222
22.5M
    assert(_PyUnicode_CheckConsistency(res, 1));
2223
22.5M
    return res;
2224
22.5M
}
2225
2226
2227
int
2228
PyUnicodeWriter_WriteUCS4(PyUnicodeWriter *pub_writer,
2229
                          Py_UCS4 *str,
2230
                          Py_ssize_t size)
2231
0
{
2232
0
    _PyUnicodeWriter *writer = (_PyUnicodeWriter*)pub_writer;
2233
2234
0
    if (size < 0) {
2235
0
        PyErr_SetString(PyExc_ValueError,
2236
0
                        "size must be positive");
2237
0
        return -1;
2238
0
    }
2239
2240
0
    if (size == 0) {
2241
0
        return 0;
2242
0
    }
2243
2244
0
    Py_UCS4 max_char = ucs4lib_find_max_char(str, str + size);
2245
2246
0
    if (_PyUnicodeWriter_Prepare(writer, size, max_char) < 0) {
2247
0
        return -1;
2248
0
    }
2249
2250
0
    int kind = writer->kind;
2251
0
    void *data = (Py_UCS1*)writer->data + writer->pos * kind;
2252
0
    if (kind == PyUnicode_1BYTE_KIND) {
2253
0
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1,
2254
0
                                 str, str + size,
2255
0
                                 data);
2256
0
    }
2257
0
    else if (kind == PyUnicode_2BYTE_KIND) {
2258
0
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2,
2259
0
                                 str, str + size,
2260
0
                                 data);
2261
0
    }
2262
0
    else {
2263
0
        memcpy(data, str, size * sizeof(Py_UCS4));
2264
0
    }
2265
0
    writer->pos += size;
2266
2267
0
    return 0;
2268
0
}
2269
2270
2271
PyObject*
2272
PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2273
171M
{
2274
171M
    if (size < 0) {
2275
0
        PyErr_SetString(PyExc_ValueError, "size must be positive");
2276
0
        return NULL;
2277
0
    }
2278
171M
    switch (kind) {
2279
21.9M
    case PyUnicode_1BYTE_KIND:
2280
21.9M
        return _PyUnicode_FromUCS1(buffer, size);
2281
81.1M
    case PyUnicode_2BYTE_KIND:
2282
81.1M
        return _PyUnicode_FromUCS2(buffer, size);
2283
67.9M
    case PyUnicode_4BYTE_KIND:
2284
67.9M
        return _PyUnicode_FromUCS4(buffer, size);
2285
0
    default:
2286
0
        PyErr_SetString(PyExc_SystemError, "invalid kind");
2287
0
        return NULL;
2288
171M
    }
2289
171M
}
2290
2291
Py_UCS4
2292
_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2293
20.8M
{
2294
20.8M
    int kind;
2295
20.8M
    const void *startptr, *endptr;
2296
2297
20.8M
    assert(0 <= start);
2298
20.8M
    assert(end <= PyUnicode_GET_LENGTH(unicode));
2299
20.8M
    assert(start <= end);
2300
2301
20.8M
    if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2302
0
        return PyUnicode_MAX_CHAR_VALUE(unicode);
2303
2304
20.8M
    if (start == end)
2305
0
        return 127;
2306
2307
20.8M
    if (PyUnicode_IS_ASCII(unicode))
2308
20.8M
        return 127;
2309
2310
32.8k
    kind = PyUnicode_KIND(unicode);
2311
32.8k
    startptr = PyUnicode_DATA(unicode);
2312
32.8k
    endptr = (char *)startptr + end * kind;
2313
32.8k
    startptr = (char *)startptr + start * kind;
2314
32.8k
    switch(kind) {
2315
1.56k
    case PyUnicode_1BYTE_KIND:
2316
1.56k
        return ucs1lib_find_max_char(startptr, endptr);
2317
5.50k
    case PyUnicode_2BYTE_KIND:
2318
5.50k
        return ucs2lib_find_max_char(startptr, endptr);
2319
25.7k
    case PyUnicode_4BYTE_KIND:
2320
25.7k
        return ucs4lib_find_max_char(startptr, endptr);
2321
0
    default:
2322
0
        Py_UNREACHABLE();
2323
32.8k
    }
2324
32.8k
}
2325
2326
/* Ensure that a string uses the most efficient storage, if it is not the
2327
   case: create a new string with of the right kind. Write NULL into *p_unicode
2328
   on error. */
2329
static void
2330
unicode_adjust_maxchar(PyObject **p_unicode)
2331
0
{
2332
0
    PyObject *unicode, *copy;
2333
0
    Py_UCS4 max_char;
2334
0
    Py_ssize_t len;
2335
0
    int kind;
2336
2337
0
    assert(p_unicode != NULL);
2338
0
    unicode = *p_unicode;
2339
0
    if (PyUnicode_IS_ASCII(unicode))
2340
0
        return;
2341
2342
0
    len = PyUnicode_GET_LENGTH(unicode);
2343
0
    kind = PyUnicode_KIND(unicode);
2344
0
    if (kind == PyUnicode_1BYTE_KIND) {
2345
0
        const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2346
0
        max_char = ucs1lib_find_max_char(u, u + len);
2347
0
        if (max_char >= 128)
2348
0
            return;
2349
0
    }
2350
0
    else if (kind == PyUnicode_2BYTE_KIND) {
2351
0
        const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2352
0
        max_char = ucs2lib_find_max_char(u, u + len);
2353
0
        if (max_char >= 256)
2354
0
            return;
2355
0
    }
2356
0
    else if (kind == PyUnicode_4BYTE_KIND) {
2357
0
        const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2358
0
        max_char = ucs4lib_find_max_char(u, u + len);
2359
0
        if (max_char >= 0x10000)
2360
0
            return;
2361
0
    }
2362
0
    else
2363
0
        Py_UNREACHABLE();
2364
2365
0
    copy = PyUnicode_New(len, max_char);
2366
0
    if (copy != NULL)
2367
0
        _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2368
0
    Py_DECREF(unicode);
2369
0
    *p_unicode = copy;
2370
0
}
2371
2372
/* Return a new str object with the same length, maximum character value
   and character data as `unicode`.

   Raises via PyErr_BadInternalCall() and returns NULL if `unicode` is
   not a str; returns NULL if the allocation fails. */
PyObject*
_PyUnicode_Copy(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadInternalCall();
        return NULL;
    }

    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(unicode);
    PyObject *duplicate = PyUnicode_New(nchars,
                                        PyUnicode_MAX_CHAR_VALUE(unicode));
    if (duplicate == NULL) {
        return NULL;
    }
    assert(PyUnicode_KIND(duplicate) == PyUnicode_KIND(unicode));

    /* Both objects share the same kind, so the raw bytes can be copied;
       the byte count is the length times the kind. */
    memcpy(PyUnicode_DATA(duplicate), PyUnicode_DATA(unicode),
           nchars * PyUnicode_KIND(unicode));
    assert(_PyUnicode_CheckConsistency(duplicate, 1));
    return duplicate;
}
2394
2395
2396
/* Widen Unicode objects to larger buffers. Don't write terminating null
2397
   character. Return NULL on error. */
2398
2399
static void*
2400
unicode_askind(int skind, void const *data, Py_ssize_t len, int kind)
2401
13.7M
{
2402
13.7M
    void *result;
2403
2404
13.7M
    assert(skind < kind);
2405
13.7M
    switch (kind) {
2406
12.4M
    case PyUnicode_2BYTE_KIND:
2407
12.4M
        result = PyMem_New(Py_UCS2, len);
2408
12.4M
        if (!result)
2409
0
            return PyErr_NoMemory();
2410
12.4M
        assert(skind == PyUnicode_1BYTE_KIND);
2411
12.4M
        _PyUnicode_CONVERT_BYTES(
2412
12.4M
            Py_UCS1, Py_UCS2,
2413
12.4M
            (const Py_UCS1 *)data,
2414
12.4M
            ((const Py_UCS1 *)data) + len,
2415
12.4M
            result);
2416
12.4M
        return result;
2417
1.28M
    case PyUnicode_4BYTE_KIND:
2418
1.28M
        result = PyMem_New(Py_UCS4, len);
2419
1.28M
        if (!result)
2420
0
            return PyErr_NoMemory();
2421
1.28M
        if (skind == PyUnicode_2BYTE_KIND) {
2422
0
            _PyUnicode_CONVERT_BYTES(
2423
0
                Py_UCS2, Py_UCS4,
2424
0
                (const Py_UCS2 *)data,
2425
0
                ((const Py_UCS2 *)data) + len,
2426
0
                result);
2427
0
        }
2428
1.28M
        else {
2429
1.28M
            assert(skind == PyUnicode_1BYTE_KIND);
2430
1.28M
            _PyUnicode_CONVERT_BYTES(
2431
1.28M
                Py_UCS1, Py_UCS4,
2432
1.28M
                (const Py_UCS1 *)data,
2433
1.28M
                ((const Py_UCS1 *)data) + len,
2434
1.28M
                result);
2435
1.28M
        }
2436
1.28M
        return result;
2437
0
    default:
2438
0
        Py_UNREACHABLE();
2439
0
        return NULL;
2440
13.7M
    }
2441
13.7M
}
2442
2443
static Py_UCS4*
2444
as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2445
        int copy_null)
2446
80.1k
{
2447
80.1k
    int kind;
2448
80.1k
    const void *data;
2449
80.1k
    Py_ssize_t len, targetlen;
2450
80.1k
    kind = PyUnicode_KIND(string);
2451
80.1k
    data = PyUnicode_DATA(string);
2452
80.1k
    len = PyUnicode_GET_LENGTH(string);
2453
80.1k
    targetlen = len;
2454
80.1k
    if (copy_null)
2455
0
        targetlen++;
2456
80.1k
    if (!target) {
2457
0
        target = PyMem_New(Py_UCS4, targetlen);
2458
0
        if (!target) {
2459
0
            PyErr_NoMemory();
2460
0
            return NULL;
2461
0
        }
2462
0
    }
2463
80.1k
    else {
2464
80.1k
        if (targetsize < targetlen) {
2465
0
            PyErr_Format(PyExc_SystemError,
2466
0
                         "string is longer than the buffer");
2467
0
            if (copy_null && 0 < targetsize)
2468
0
                target[0] = 0;
2469
0
            return NULL;
2470
0
        }
2471
80.1k
    }
2472
80.1k
    if (kind == PyUnicode_1BYTE_KIND) {
2473
58.0k
        const Py_UCS1 *start = (const Py_UCS1 *) data;
2474
58.0k
        _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2475
58.0k
    }
2476
22.1k
    else if (kind == PyUnicode_2BYTE_KIND) {
2477
16.0k
        const Py_UCS2 *start = (const Py_UCS2 *) data;
2478
16.0k
        _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2479
16.0k
    }
2480
6.10k
    else if (kind == PyUnicode_4BYTE_KIND) {
2481
6.10k
        memcpy(target, data, len * sizeof(Py_UCS4));
2482
6.10k
    }
2483
0
    else {
2484
0
        Py_UNREACHABLE();
2485
0
    }
2486
80.1k
    if (copy_null)
2487
0
        target[len] = 0;
2488
80.1k
    return target;
2489
80.1k
}
2490
2491
Py_UCS4*
2492
PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2493
                 int copy_null)
2494
80.1k
{
2495
80.1k
    if (target == NULL || targetsize < 0) {
2496
0
        PyErr_BadInternalCall();
2497
0
        return NULL;
2498
0
    }
2499
80.1k
    return as_ucs4(string, target, targetsize, copy_null);
2500
80.1k
}
2501
2502
Py_UCS4*
2503
PyUnicode_AsUCS4Copy(PyObject *string)
2504
0
{
2505
0
    return as_ucs4(string, NULL, 0, 1);
2506
0
}
2507
2508
/* Maximum number of characters required for the output of %jo, %jd or %p.
   We need at most ceil(log8(256)*sizeof(intmax_t)) digits (octal is the
   longest form: 8 bits per byte at 3 bits per digit),
   plus 1 for the sign, plus 2 for the 0x prefix (for %p),
   plus 1 for the terminal NUL. */
2512
#define MAX_INTMAX_CHARS (5 + (sizeof(intmax_t)*8-1) / 3)
2513
2514
static int
2515
unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2516
                             Py_ssize_t width, Py_ssize_t precision, int flags)
2517
7.53M
{
2518
7.53M
    Py_ssize_t length, fill, arglen;
2519
7.53M
    Py_UCS4 maxchar;
2520
2521
7.53M
    length = PyUnicode_GET_LENGTH(str);
2522
7.53M
    if ((precision == -1 || precision >= length)
2523
7.53M
        && width <= length)
2524
7.53M
        return _PyUnicodeWriter_WriteStr(writer, str);
2525
2526
51
    if (precision != -1)
2527
51
        length = Py_MIN(precision, length);
2528
2529
51
    arglen = Py_MAX(length, width);
2530
51
    if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2531
25
        maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2532
26
    else
2533
26
        maxchar = writer->maxchar;
2534
2535
51
    if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2536
0
        return -1;
2537
2538
51
    fill = Py_MAX(width - length, 0);
2539
51
    if (fill && !(flags & F_LJUST)) {
2540
0
        if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2541
0
            return -1;
2542
0
        writer->pos += fill;
2543
0
    }
2544
2545
51
    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2546
51
                                  str, 0, length);
2547
51
    writer->pos += length;
2548
2549
51
    if (fill && (flags & F_LJUST)) {
2550
0
        if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2551
0
            return -1;
2552
0
        writer->pos += fill;
2553
0
    }
2554
2555
51
    return 0;
2556
51
}
2557
2558
static int
2559
unicode_fromformat_write_utf8(_PyUnicodeWriter *writer, const char *str,
2560
                              Py_ssize_t width, Py_ssize_t precision, int flags)
2561
5.17M
{
2562
    /* UTF-8 */
2563
5.17M
    Py_ssize_t *pconsumed = NULL;
2564
5.17M
    Py_ssize_t length;
2565
5.17M
    if (precision == -1) {
2566
848k
        length = strlen(str);
2567
848k
    }
2568
4.32M
    else {
2569
4.32M
        length = 0;
2570
18.0M
        while (length < precision && str[length]) {
2571
13.6M
            length++;
2572
13.6M
        }
2573
4.32M
        if (length == precision) {
2574
            /* The input string is not NUL-terminated.  If it ends with an
2575
             * incomplete UTF-8 sequence, truncate the string just before it.
2576
             * Incomplete sequences in the middle and sequences which cannot
2577
             * be valid prefixes are still treated as errors and replaced
2578
             * with \xfffd. */
2579
1.98k
            pconsumed = &length;
2580
1.98k
        }
2581
4.32M
    }
2582
2583
5.17M
    if (width < 0) {
2584
5.17M
        return _PyUnicode_DecodeUTF8Writer(writer, str, length,
2585
5.17M
                                           _Py_ERROR_REPLACE, "replace", pconsumed);
2586
5.17M
    }
2587
2588
0
    PyObject *unicode = PyUnicode_DecodeUTF8Stateful(str, length,
2589
0
                                                     "replace", pconsumed);
2590
0
    if (unicode == NULL)
2591
0
        return -1;
2592
2593
0
    int res = unicode_fromformat_write_str(writer, unicode,
2594
0
                                           width, -1, flags);
2595
0
    Py_DECREF(unicode);
2596
0
    return res;
2597
0
}
2598
2599
static int
2600
unicode_fromformat_write_wcstr(_PyUnicodeWriter *writer, const wchar_t *str,
2601
                              Py_ssize_t width, Py_ssize_t precision, int flags)
2602
0
{
2603
0
    Py_ssize_t length;
2604
0
    if (precision == -1) {
2605
0
        length = wcslen(str);
2606
0
    }
2607
0
    else {
2608
0
        length = 0;
2609
0
        while (length < precision && str[length]) {
2610
0
            length++;
2611
0
        }
2612
0
    }
2613
2614
0
    if (width < 0) {
2615
0
        return PyUnicodeWriter_WriteWideChar((PyUnicodeWriter*)writer,
2616
0
                                             str, length);
2617
0
    }
2618
2619
0
    PyObject *unicode = PyUnicode_FromWideChar(str, length);
2620
0
    if (unicode == NULL)
2621
0
        return -1;
2622
2623
0
    int res = unicode_fromformat_write_str(writer, unicode, width, -1, flags);
2624
0
    Py_DECREF(unicode);
2625
0
    return res;
2626
0
}
2627
2628
0
/* Size-modifier codes for the format parser below.
   NOTE(review): the names suggest the C length modifiers "l", "ll", "z",
   "t" and "j" respectively -- confirm against unicode_fromformat_arg(). */
#define F_LONG 1
2629
0
#define F_LONGLONG 2
2630
105k
#define F_SIZE 3
2631
0
#define F_PTRDIFF 4
2632
0
#define F_INTMAX 5
2633
2634
static const char*
2635
unicode_fromformat_arg(_PyUnicodeWriter *writer,
2636
                       const char *f, va_list *vargs)
2637
39.6M
{
2638
39.6M
    const char *p;
2639
39.6M
    Py_ssize_t len;
2640
39.6M
    int flags = 0;
2641
39.6M
    Py_ssize_t width;
2642
39.6M
    Py_ssize_t precision;
2643
2644
39.6M
    p = f;
2645
39.6M
    f++;
2646
39.6M
    if (*f == '%') {
2647
4.30M
        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
2648
0
            return NULL;
2649
4.30M
        f++;
2650
4.30M
        return f;
2651
4.30M
    }
2652
2653
    /* Parse flags. Example: "%-i" => flags=F_LJUST. */
2654
    /* Flags '+', ' ' and '#' are not particularly useful.
2655
     * They are not worth the implementation and maintenance costs.
2656
     * In addition, '#' should add "0" for "o" conversions for compatibility
2657
     * with printf, but it would confuse Python users. */
2658
35.3M
    while (1) {
2659
35.3M
        switch (*f++) {
2660
0
        case '-': flags |= F_LJUST; continue;
2661
1.82k
        case '0': flags |= F_ZERO; continue;
2662
0
        case '#': flags |= F_ALT; continue;
2663
35.3M
        }
2664
35.3M
        f--;
2665
35.3M
        break;
2666
35.3M
    }
2667
2668
    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2669
35.3M
    width = -1;
2670
35.3M
    if (*f == '*') {
2671
0
        width = va_arg(*vargs, int);
2672
0
        if (width < 0) {
2673
0
            flags |= F_LJUST;
2674
0
            width = -width;
2675
0
        }
2676
0
        f++;
2677
0
    }
2678
35.3M
    else if (Py_ISDIGIT((unsigned)*f)) {
2679
1.82k
        width = *f - '0';
2680
1.82k
        f++;
2681
1.82k
        while (Py_ISDIGIT((unsigned)*f)) {
2682
0
            if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2683
0
                PyErr_SetString(PyExc_ValueError,
2684
0
                                "width too big");
2685
0
                return NULL;
2686
0
            }
2687
0
            width = (width * 10) + (*f - '0');
2688
0
            f++;
2689
0
        }
2690
1.82k
    }
2691
35.3M
    precision = -1;
2692
35.3M
    if (*f == '.') {
2693
10.1M
        f++;
2694
10.1M
        if (*f == '*') {
2695
0
            precision = va_arg(*vargs, int);
2696
0
            if (precision < 0) {
2697
0
                precision = -2;
2698
0
            }
2699
0
            f++;
2700
0
        }
2701
10.1M
        else if (Py_ISDIGIT((unsigned)*f)) {
2702
10.1M
            precision = (*f - '0');
2703
10.1M
            f++;
2704
30.3M
            while (Py_ISDIGIT((unsigned)*f)) {
2705
20.2M
                if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2706
0
                    PyErr_SetString(PyExc_ValueError,
2707
0
                                    "precision too big");
2708
0
                    return NULL;
2709
0
                }
2710
20.2M
                precision = (precision * 10) + (*f - '0');
2711
20.2M
                f++;
2712
20.2M
            }
2713
10.1M
        }
2714
10.1M
    }
2715
2716
35.3M
    int sizemod = 0;
2717
35.3M
    if (*f == 'l') {
2718
0
        if (f[1] == 'l') {
2719
0
            sizemod = F_LONGLONG;
2720
0
            f += 2;
2721
0
        }
2722
0
        else {
2723
0
            sizemod = F_LONG;
2724
0
            ++f;
2725
0
        }
2726
0
    }
2727
35.3M
    else if (*f == 'z') {
2728
52.6k
        sizemod = F_SIZE;
2729
52.6k
        ++f;
2730
52.6k
    }
2731
35.2M
    else if (*f == 't') {
2732
0
        sizemod = F_PTRDIFF;
2733
0
        ++f;
2734
0
    }
2735
35.2M
    else if (*f == 'j') {
2736
0
        sizemod = F_INTMAX;
2737
0
        ++f;
2738
0
    }
2739
35.3M
    if (f[0] != '\0' && f[1] == '\0')
2740
10.7M
        writer->overallocate = 0;
2741
2742
35.3M
    switch (*f) {
2743
17.7M
    case 'd': case 'i': case 'o': case 'u': case 'x': case 'X':
2744
17.7M
        break;
2745
4.87M
    case 'c': case 'p':
2746
4.87M
        if (sizemod || width >= 0 || precision >= 0) goto invalid_format;
2747
4.87M
        break;
2748
5.17M
    case 's':
2749
5.17M
    case 'V':
2750
5.17M
        if (sizemod && sizemod != F_LONG) goto invalid_format;
2751
5.17M
        break;
2752
7.53M
    default:
2753
7.53M
        if (sizemod) goto invalid_format;
2754
7.53M
        break;
2755
35.3M
    }
2756
2757
35.3M
    switch (*f) {
2758
4.87M
    case 'c':
2759
4.87M
    {
2760
4.87M
        int ordinal = va_arg(*vargs, int);
2761
4.87M
        if (ordinal < 0 || ordinal > MAX_UNICODE) {
2762
0
            PyErr_SetString(PyExc_OverflowError,
2763
0
                            "character argument not in range(0x110000)");
2764
0
            return NULL;
2765
0
        }
2766
4.87M
        if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
2767
0
            return NULL;
2768
4.87M
        break;
2769
4.87M
    }
2770
2771
17.7M
    case 'd': case 'i':
2772
17.7M
    case 'o': case 'u': case 'x': case 'X':
2773
17.7M
    {
2774
17.7M
        char buffer[MAX_INTMAX_CHARS];
2775
2776
        // Fill buffer using sprinf, with one of many possible format
2777
        // strings, like "%llX" for `long long` in hexadecimal.
2778
        // The type/size is in `sizemod`; the format is in `*f`.
2779
2780
        // Use macros with nested switches to keep the sprintf format strings
2781
        // as compile-time literals, avoiding warnings and maybe allowing
2782
        // optimizations.
2783
2784
        // `SPRINT` macro does one sprintf
2785
        // Example usage: SPRINT("l", "X", unsigned long) expands to
2786
        // sprintf(buffer, "%" "l" "X", va_arg(*vargs, unsigned long))
2787
17.7M
        #define SPRINT(SIZE_SPEC, FMT_CHAR, TYPE) \
2788
17.7M
            sprintf(buffer, "%" SIZE_SPEC FMT_CHAR, va_arg(*vargs, TYPE))
2789
2790
        // One inner switch to handle all format variants
2791
17.7M
        #define DO_SPRINTS(SIZE_SPEC, SIGNED_TYPE, UNSIGNED_TYPE)             \
2792
17.7M
            switch (*f) {                                                     \
2793
0
                case 'o': len = SPRINT(SIZE_SPEC, "o", UNSIGNED_TYPE); break; \
2794
9.09k
                case 'u': len = SPRINT(SIZE_SPEC, "u", UNSIGNED_TYPE); break; \
2795
1.37k
                case 'x': len = SPRINT(SIZE_SPEC, "x", UNSIGNED_TYPE); break; \
2796
1.02k
                case 'X': len = SPRINT(SIZE_SPEC, "X", UNSIGNED_TYPE); break; \
2797
17.7M
                default:  len = SPRINT(SIZE_SPEC, "d", SIGNED_TYPE); break;   \
2798
17.7M
            }
2799
2800
        // Outer switch to handle all the sizes/types
2801
17.7M
        switch (sizemod) {
2802
0
            case F_LONG:     DO_SPRINTS("l", long, unsigned long); break;
2803
0
            case F_LONGLONG: DO_SPRINTS("ll", long long, unsigned long long); break;
2804
52.6k
            case F_SIZE:     DO_SPRINTS("z", Py_ssize_t, size_t); break;
2805
0
            case F_PTRDIFF:  DO_SPRINTS("t", ptrdiff_t, ptrdiff_t); break;
2806
0
            case F_INTMAX:   DO_SPRINTS("j", intmax_t, uintmax_t); break;
2807
17.6M
            default:         DO_SPRINTS("", int, unsigned int); break;
2808
17.7M
        }
2809
17.7M
        #undef SPRINT
2810
17.7M
        #undef DO_SPRINTS
2811
2812
17.7M
        assert(len >= 0);
2813
2814
17.7M
        int sign = (buffer[0] == '-');
2815
17.7M
        len -= sign;
2816
2817
17.7M
        precision = Py_MAX(precision, len);
2818
17.7M
        width = Py_MAX(width, precision + sign);
2819
17.7M
        if ((flags & F_ZERO) && !(flags & F_LJUST)) {
2820
1.82k
            precision = width - sign;
2821
1.82k
        }
2822
2823
17.7M
        Py_ssize_t spacepad = Py_MAX(width - precision - sign, 0);
2824
17.7M
        Py_ssize_t zeropad = Py_MAX(precision - len, 0);
2825
2826
17.7M
        if (_PyUnicodeWriter_Prepare(writer, width, 127) == -1)
2827
0
            return NULL;
2828
2829
17.7M
        if (spacepad && !(flags & F_LJUST)) {
2830
0
            if (PyUnicode_Fill(writer->buffer, writer->pos, spacepad, ' ') == -1)
2831
0
                return NULL;
2832
0
            writer->pos += spacepad;
2833
0
        }
2834
2835
17.7M
        if (sign) {
2836
979
            if (_PyUnicodeWriter_WriteChar(writer, '-') == -1)
2837
0
                return NULL;
2838
979
        }
2839
2840
17.7M
        if (zeropad) {
2841
617
            if (PyUnicode_Fill(writer->buffer, writer->pos, zeropad, '0') == -1)
2842
0
                return NULL;
2843
617
            writer->pos += zeropad;
2844
617
        }
2845
2846
17.7M
        if (_PyUnicodeWriter_WriteASCIIString(writer, &buffer[sign], len) < 0)
2847
0
            return NULL;
2848
2849
17.7M
        if (spacepad && (flags & F_LJUST)) {
2850
0
            if (PyUnicode_Fill(writer->buffer, writer->pos, spacepad, ' ') == -1)
2851
0
                return NULL;
2852
0
            writer->pos += spacepad;
2853
0
        }
2854
17.7M
        break;
2855
17.7M
    }
2856
2857
17.7M
    case 'p':
2858
0
    {
2859
0
        char number[MAX_INTMAX_CHARS];
2860
2861
0
        len = sprintf(number, "%p", va_arg(*vargs, void*));
2862
0
        assert(len >= 0);
2863
2864
        /* %p is ill-defined:  ensure leading 0x. */
2865
0
        if (number[1] == 'X')
2866
0
            number[1] = 'x';
2867
0
        else if (number[1] != 'x') {
2868
0
            memmove(number + 2, number,
2869
0
                    strlen(number) + 1);
2870
0
            number[0] = '0';
2871
0
            number[1] = 'x';
2872
0
            len += 2;
2873
0
        }
2874
2875
0
        if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
2876
0
            return NULL;
2877
0
        break;
2878
0
    }
2879
2880
5.17M
    case 's':
2881
5.17M
    {
2882
5.17M
        if (sizemod) {
2883
0
            const wchar_t *s = va_arg(*vargs, const wchar_t*);
2884
0
            if (unicode_fromformat_write_wcstr(writer, s, width, precision, flags) < 0)
2885
0
                return NULL;
2886
0
        }
2887
5.17M
        else {
2888
            /* UTF-8 */
2889
5.17M
            const char *s = va_arg(*vargs, const char*);
2890
5.17M
            if (unicode_fromformat_write_utf8(writer, s, width, precision, flags) < 0)
2891
0
                return NULL;
2892
5.17M
        }
2893
5.17M
        break;
2894
5.17M
    }
2895
2896
5.17M
    case 'U':
2897
1.16M
    {
2898
1.16M
        PyObject *obj = va_arg(*vargs, PyObject *);
2899
1.16M
        assert(obj && _PyUnicode_CHECK(obj));
2900
2901
1.16M
        if (unicode_fromformat_write_str(writer, obj, width, precision, flags) == -1)
2902
0
            return NULL;
2903
1.16M
        break;
2904
1.16M
    }
2905
2906
1.16M
    case 'V':
2907
563
    {
2908
563
        PyObject *obj = va_arg(*vargs, PyObject *);
2909
563
        const char *str;
2910
563
        const wchar_t *wstr;
2911
563
        if (sizemod) {
2912
0
            wstr = va_arg(*vargs, const wchar_t*);
2913
0
        }
2914
563
        else {
2915
563
            str = va_arg(*vargs, const char *);
2916
563
        }
2917
563
        if (obj) {
2918
0
            assert(_PyUnicode_CHECK(obj));
2919
0
            if (unicode_fromformat_write_str(writer, obj, width, precision, flags) == -1)
2920
0
                return NULL;
2921
0
        }
2922
563
        else if (sizemod) {
2923
0
            assert(wstr != NULL);
2924
0
            if (unicode_fromformat_write_wcstr(writer, wstr, width, precision, flags) < 0)
2925
0
                return NULL;
2926
0
        }
2927
563
        else {
2928
563
            assert(str != NULL);
2929
563
            if (unicode_fromformat_write_utf8(writer, str, width, precision, flags) < 0)
2930
0
                return NULL;
2931
563
        }
2932
563
        break;
2933
563
    }
2934
2935
1.69k
    case 'S':
2936
1.69k
    {
2937
1.69k
        PyObject *obj = va_arg(*vargs, PyObject *);
2938
1.69k
        PyObject *str;
2939
1.69k
        assert(obj);
2940
1.69k
        str = PyObject_Str(obj);
2941
1.69k
        if (!str)
2942
0
            return NULL;
2943
1.69k
        if (unicode_fromformat_write_str(writer, str, width, precision, flags) == -1) {
2944
0
            Py_DECREF(str);
2945
0
            return NULL;
2946
0
        }
2947
1.69k
        Py_DECREF(str);
2948
1.69k
        break;
2949
1.69k
    }
2950
2951
5.81M
    case 'R':
2952
5.81M
    {
2953
5.81M
        PyObject *obj = va_arg(*vargs, PyObject *);
2954
5.81M
        PyObject *repr;
2955
5.81M
        assert(obj);
2956
5.81M
        repr = PyObject_Repr(obj);
2957
5.81M
        if (!repr)
2958
0
            return NULL;
2959
5.81M
        if (unicode_fromformat_write_str(writer, repr, width, precision, flags) == -1) {
2960
0
            Py_DECREF(repr);
2961
0
            return NULL;
2962
0
        }
2963
5.81M
        Py_DECREF(repr);
2964
5.81M
        break;
2965
5.81M
    }
2966
2967
0
    case 'A':
2968
0
    {
2969
0
        PyObject *obj = va_arg(*vargs, PyObject *);
2970
0
        PyObject *ascii;
2971
0
        assert(obj);
2972
0
        ascii = PyObject_ASCII(obj);
2973
0
        if (!ascii)
2974
0
            return NULL;
2975
0
        if (unicode_fromformat_write_str(writer, ascii, width, precision, flags) == -1) {
2976
0
            Py_DECREF(ascii);
2977
0
            return NULL;
2978
0
        }
2979
0
        Py_DECREF(ascii);
2980
0
        break;
2981
0
    }
2982
2983
565k
    case 'T':
2984
565k
    {
2985
565k
        PyObject *obj = va_arg(*vargs, PyObject *);
2986
565k
        PyTypeObject *type = (PyTypeObject *)Py_NewRef(Py_TYPE(obj));
2987
2988
565k
        PyObject *type_name;
2989
565k
        if (flags & F_ALT) {
2990
0
            type_name = _PyType_GetFullyQualifiedName(type, ':');
2991
0
        }
2992
565k
        else {
2993
565k
            type_name = PyType_GetFullyQualifiedName(type);
2994
565k
        }
2995
565k
        Py_DECREF(type);
2996
565k
        if (!type_name) {
2997
0
            return NULL;
2998
0
        }
2999
3000
565k
        if (unicode_fromformat_write_str(writer, type_name,
3001
565k
                                         width, precision, flags) == -1) {
3002
0
            Py_DECREF(type_name);
3003
0
            return NULL;
3004
0
        }
3005
565k
        Py_DECREF(type_name);
3006
565k
        break;
3007
565k
    }
3008
3009
0
    case 'N':
3010
0
    {
3011
0
        PyObject *type_raw = va_arg(*vargs, PyObject *);
3012
0
        assert(type_raw != NULL);
3013
3014
0
        if (!PyType_Check(type_raw)) {
3015
0
            PyErr_SetString(PyExc_TypeError, "%N argument must be a type");
3016
0
            return NULL;
3017
0
        }
3018
0
        PyTypeObject *type = (PyTypeObject*)type_raw;
3019
3020
0
        PyObject *type_name;
3021
0
        if (flags & F_ALT) {
3022
0
            type_name = _PyType_GetFullyQualifiedName(type, ':');
3023
0
        }
3024
0
        else {
3025
0
            type_name = PyType_GetFullyQualifiedName(type);
3026
0
        }
3027
0
        if (!type_name) {
3028
0
            return NULL;
3029
0
        }
3030
0
        if (unicode_fromformat_write_str(writer, type_name,
3031
0
                                         width, precision, flags) == -1) {
3032
0
            Py_DECREF(type_name);
3033
0
            return NULL;
3034
0
        }
3035
0
        Py_DECREF(type_name);
3036
0
        break;
3037
0
    }
3038
3039
0
    default:
3040
0
    invalid_format:
3041
0
        PyErr_Format(PyExc_SystemError, "invalid format string: %s", p);
3042
0
        return NULL;
3043
35.3M
    }
3044
3045
35.3M
    f++;
3046
35.3M
    return f;
3047
35.3M
}
3048
3049
static int
3050
unicode_from_format(_PyUnicodeWriter *writer, const char *format, va_list vargs)
3051
17.4M
{
3052
17.4M
    Py_ssize_t len = strlen(format);
3053
17.4M
    writer->min_length += len + 100;
3054
17.4M
    writer->overallocate = 1;
3055
3056
    // Copy varags to be able to pass a reference to a subfunction.
3057
17.4M
    va_list vargs2;
3058
17.4M
    va_copy(vargs2, vargs);
3059
3060
    // _PyUnicodeWriter_WriteASCIIString() below requires the format string
3061
    // to be encoded to ASCII.
3062
17.4M
    int is_ascii = (ucs1lib_find_max_char((Py_UCS1*)format, (Py_UCS1*)format + len) < 128);
3063
17.4M
    if (!is_ascii) {
3064
0
        Py_ssize_t i;
3065
0
        for (i=0; i < len && (unsigned char)format[i] <= 127; i++);
3066
0
        PyErr_Format(PyExc_ValueError,
3067
0
            "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3068
0
            "string, got a non-ASCII byte: 0x%02x",
3069
0
            (unsigned char)format[i]);
3070
0
        goto fail;
3071
0
    }
3072
3073
93.0M
    for (const char *f = format; *f; ) {
3074
75.5M
        if (*f == '%') {
3075
39.6M
            f = unicode_fromformat_arg(writer, f, &vargs2);
3076
39.6M
            if (f == NULL)
3077
0
                goto fail;
3078
39.6M
        }
3079
35.9M
        else {
3080
35.9M
            const char *p = strchr(f, '%');
3081
35.9M
            if (p != NULL) {
3082
29.2M
                len = p - f;
3083
29.2M
            }
3084
6.68M
            else {
3085
6.68M
                len = strlen(f);
3086
6.68M
                writer->overallocate = 0;
3087
6.68M
            }
3088
3089
35.9M
            if (_PyUnicodeWriter_WriteASCIIString(writer, f, len) < 0) {
3090
0
                goto fail;
3091
0
            }
3092
35.9M
            f += len;
3093
35.9M
        }
3094
75.5M
    }
3095
17.4M
    va_end(vargs2);
3096
17.4M
    return 0;
3097
3098
0
  fail:
3099
0
    va_end(vargs2);
3100
0
    return -1;
3101
17.4M
}
3102
3103
PyObject *
3104
PyUnicode_FromFormatV(const char *format, va_list vargs)
3105
17.4M
{
3106
17.4M
    _PyUnicodeWriter writer;
3107
17.4M
    _PyUnicodeWriter_Init(&writer);
3108
3109
17.4M
    if (unicode_from_format(&writer, format, vargs) < 0) {
3110
0
        _PyUnicodeWriter_Dealloc(&writer);
3111
0
        return NULL;
3112
0
    }
3113
17.4M
    return _PyUnicodeWriter_Finish(&writer);
3114
17.4M
}
3115
3116
/* Variadic front end of PyUnicode_FromFormatV(). */
PyObject *
PyUnicode_FromFormat(const char *format, ...)
{
    va_list ap;

    va_start(ap, format);
    PyObject *result = PyUnicode_FromFormatV(format, ap);
    va_end(ap);

    return result;
}
3127
3128
int
3129
PyUnicodeWriter_Format(PyUnicodeWriter *writer, const char *format, ...)
3130
0
{
3131
0
    va_list vargs;
3132
0
    va_start(vargs, format);
3133
0
    int res = _PyUnicodeWriter_FormatV(writer, format, vargs);
3134
0
    va_end(vargs);
3135
0
    return res;
3136
0
}
3137
3138
int
3139
_PyUnicodeWriter_FormatV(PyUnicodeWriter *writer, const char *format,
3140
                         va_list vargs)
3141
0
{
3142
0
    _PyUnicodeWriter *_writer = (_PyUnicodeWriter*)writer;
3143
0
    Py_ssize_t old_pos = _writer->pos;
3144
3145
0
    int res = unicode_from_format(_writer, format, vargs);
3146
3147
0
    if (res < 0) {
3148
0
        _writer->pos = old_pos;
3149
0
    }
3150
0
    return res;
3151
0
}
3152
3153
static Py_ssize_t
3154
unicode_get_widechar_size(PyObject *unicode)
3155
156k
{
3156
156k
    Py_ssize_t res;
3157
3158
156k
    assert(unicode != NULL);
3159
156k
    assert(_PyUnicode_CHECK(unicode));
3160
3161
156k
    res = _PyUnicode_LENGTH(unicode);
3162
#if SIZEOF_WCHAR_T == 2
3163
    if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3164
        const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3165
        const Py_UCS4 *end = s + res;
3166
        for (; s < end; ++s) {
3167
            if (*s > 0xFFFF) {
3168
                ++res;
3169
            }
3170
        }
3171
    }
3172
#endif
3173
156k
    return res;
3174
156k
}
3175
3176
static void
3177
unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3178
156k
{
3179
156k
    assert(unicode != NULL);
3180
156k
    assert(_PyUnicode_CHECK(unicode));
3181
3182
156k
    if (PyUnicode_KIND(unicode) == sizeof(wchar_t)) {
3183
495
        memcpy(w, PyUnicode_DATA(unicode), size * sizeof(wchar_t));
3184
495
        return;
3185
495
    }
3186
3187
156k
    if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3188
127k
        const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3189
1.84M
        for (; size--; ++s, ++w) {
3190
1.72M
            *w = *s;
3191
1.72M
        }
3192
127k
    }
3193
28.7k
    else {
3194
28.7k
#if SIZEOF_WCHAR_T == 4
3195
28.7k
        assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3196
28.7k
        const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3197
1.39M
        for (; size--; ++s, ++w) {
3198
1.37M
            *w = *s;
3199
1.37M
        }
3200
#else
3201
        assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3202
        const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3203
        for (; size--; ++s, ++w) {
3204
            Py_UCS4 ch = *s;
3205
            if (ch > 0xFFFF) {
3206
                assert(ch <= MAX_UNICODE);
3207
                /* encode surrogate pair in this case */
3208
                *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3209
                if (!size--)
3210
                    break;
3211
                *w = Py_UNICODE_LOW_SURROGATE(ch);
3212
            }
3213
            else {
3214
                *w = ch;
3215
            }
3216
        }
3217
#endif
3218
28.7k
    }
3219
156k
}
3220
3221
#ifdef HAVE_WCHAR_H
3222
3223
/* Convert a Unicode object to a wide character string.
3224
3225
   - If w is NULL: return the number of wide characters (including the null
3226
     character) required to convert the unicode object. Ignore size argument.
3227
3228
   - Otherwise: return the number of wide characters (excluding the null
3229
     character) written into w. Write at most size wide characters (including
3230
     the null character). */
3231
Py_ssize_t
3232
PyUnicode_AsWideChar(PyObject *unicode,
3233
                     wchar_t *w,
3234
                     Py_ssize_t size)
3235
5.70k
{
3236
5.70k
    Py_ssize_t res;
3237
3238
5.70k
    if (unicode == NULL) {
3239
0
        PyErr_BadInternalCall();
3240
0
        return -1;
3241
0
    }
3242
5.70k
    if (!PyUnicode_Check(unicode)) {
3243
0
        PyErr_BadArgument();
3244
0
        return -1;
3245
0
    }
3246
3247
5.70k
    res = unicode_get_widechar_size(unicode);
3248
5.70k
    if (w == NULL) {
3249
0
        return res + 1;
3250
0
    }
3251
3252
5.70k
    if (size > res) {
3253
5.70k
        size = res + 1;
3254
5.70k
    }
3255
0
    else {
3256
0
        res = size;
3257
0
    }
3258
5.70k
    unicode_copy_as_widechar(unicode, w, size);
3259
3260
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3261
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
3262
       non-Unicode locales and hence needs conversion first. */
3263
    if (_Py_LocaleUsesNonUnicodeWchar()) {
3264
        if (_Py_EncodeNonUnicodeWchar_InPlace(w, size) < 0) {
3265
            return -1;
3266
        }
3267
    }
3268
#endif
3269
3270
5.70k
    return res;
3271
5.70k
}
3272
3273
wchar_t*
3274
PyUnicode_AsWideCharString(PyObject *unicode,
3275
                           Py_ssize_t *size)
3276
151k
{
3277
151k
    wchar_t *buffer;
3278
151k
    Py_ssize_t buflen;
3279
3280
151k
    if (unicode == NULL) {
3281
0
        PyErr_BadInternalCall();
3282
0
        return NULL;
3283
0
    }
3284
151k
    if (!PyUnicode_Check(unicode)) {
3285
0
        PyErr_BadArgument();
3286
0
        return NULL;
3287
0
    }
3288
3289
151k
    buflen = unicode_get_widechar_size(unicode);
3290
151k
    buffer = (wchar_t *) PyMem_New(wchar_t, (buflen + 1));
3291
151k
    if (buffer == NULL) {
3292
0
        PyErr_NoMemory();
3293
0
        return NULL;
3294
0
    }
3295
151k
    unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3296
3297
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3298
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
3299
       non-Unicode locales and hence needs conversion first. */
3300
    if (_Py_LocaleUsesNonUnicodeWchar()) {
3301
        if (_Py_EncodeNonUnicodeWchar_InPlace(buffer, (buflen + 1)) < 0) {
3302
            return NULL;
3303
        }
3304
    }
3305
#endif
3306
3307
151k
    if (size != NULL) {
3308
150k
        *size = buflen;
3309
150k
    }
3310
784
    else if (wcslen(buffer) != (size_t)buflen) {
3311
0
        PyMem_Free(buffer);
3312
0
        PyErr_SetString(PyExc_ValueError,
3313
0
                        "embedded null character");
3314
0
        return NULL;
3315
0
    }
3316
151k
    return buffer;
3317
151k
}
3318
3319
#endif /* HAVE_WCHAR_H */
3320
3321
int
3322
_PyUnicode_WideCharString_Converter(PyObject *obj, void *ptr)
3323
0
{
3324
0
    wchar_t **p = (wchar_t **)ptr;
3325
0
    if (obj == NULL) {
3326
0
        PyMem_Free(*p);
3327
0
        *p = NULL;
3328
0
        return 1;
3329
0
    }
3330
0
    if (PyUnicode_Check(obj)) {
3331
0
        *p = PyUnicode_AsWideCharString(obj, NULL);
3332
0
        if (*p == NULL) {
3333
0
            return 0;
3334
0
        }
3335
0
        return Py_CLEANUP_SUPPORTED;
3336
0
    }
3337
0
    PyErr_Format(PyExc_TypeError,
3338
0
                 "argument must be str, not %.50s",
3339
0
                 Py_TYPE(obj)->tp_name);
3340
0
    return 0;
3341
0
}
3342
3343
int
3344
_PyUnicode_WideCharString_Opt_Converter(PyObject *obj, void *ptr)
3345
0
{
3346
0
    wchar_t **p = (wchar_t **)ptr;
3347
0
    if (obj == NULL) {
3348
0
        PyMem_Free(*p);
3349
0
        *p = NULL;
3350
0
        return 1;
3351
0
    }
3352
0
    if (obj == Py_None) {
3353
0
        *p = NULL;
3354
0
        return 1;
3355
0
    }
3356
0
    if (PyUnicode_Check(obj)) {
3357
0
        *p = PyUnicode_AsWideCharString(obj, NULL);
3358
0
        if (*p == NULL) {
3359
0
            return 0;
3360
0
        }
3361
0
        return Py_CLEANUP_SUPPORTED;
3362
0
    }
3363
0
    PyErr_Format(PyExc_TypeError,
3364
0
                 "argument must be str or None, not %.50s",
3365
0
                 Py_TYPE(obj)->tp_name);
3366
0
    return 0;
3367
0
}
3368
3369
PyObject *
3370
PyUnicode_FromOrdinal(int ordinal)
3371
309k
{
3372
309k
    if (ordinal < 0 || ordinal > MAX_UNICODE) {
3373
28
        PyErr_SetString(PyExc_ValueError,
3374
28
                        "chr() arg not in range(0x110000)");
3375
28
        return NULL;
3376
28
    }
3377
3378
309k
    return unicode_char((Py_UCS4)ordinal);
3379
309k
}
3380
3381
/* Return `obj` as an exact str object.

   - exact str: a new reference to `obj` itself.
   - instance of a str subtype: a true str object with the same data.
   - anything else: TypeError, NULL is returned.

   XXX Perhaps this API should be an alias of PyObject_Str() instead ?! */
PyObject *
PyUnicode_FromObject(PyObject *obj)
{
    if (PyUnicode_CheckExact(obj)) {
        return Py_NewRef(obj);
    }

    if (!PyUnicode_Check(obj)) {
        PyErr_Format(PyExc_TypeError,
                     "Can't convert '%.100s' object to str implicitly",
                     Py_TYPE(obj)->tp_name);
        return NULL;
    }

    return _PyUnicode_Copy(obj);
}
3399
3400
/* Decode the bytes or bytes-like object `obj` with `encoding` and `errors`
   and return the resulting str.

   - bytes objects are decoded directly from their internal buffer;
   - str objects are rejected with TypeError;
   - any other object must support the buffer protocol (PEP 3118),
     otherwise TypeError is raised.

   Empty input returns the empty string after unicode_check_encoding_errors()
   has accepted `encoding` and `errors`.  Return NULL on failure. */
PyObject *
PyUnicode_FromEncodedObject(PyObject *obj,
                            const char *encoding,
                            const char *errors)
{
    if (obj == NULL) {
        PyErr_BadInternalCall();
        return NULL;
    }

    /* Decoding bytes objects is the most common case and should be fast */
    if (PyBytes_Check(obj)) {
        if (PyBytes_GET_SIZE(obj) != 0) {
            return PyUnicode_Decode(
                    PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
                    encoding, errors);
        }
        if (unicode_check_encoding_errors(encoding, errors) < 0) {
            return NULL;
        }
        _Py_RETURN_UNICODE_EMPTY();
    }

    if (PyUnicode_Check(obj)) {
        PyErr_SetString(PyExc_TypeError,
                        "decoding str is not supported");
        return NULL;
    }

    /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
    Py_buffer view;
    if (PyObject_GetBuffer(obj, &view, PyBUF_SIMPLE) < 0) {
        PyErr_Format(PyExc_TypeError,
                     "decoding to str: need a bytes-like object, %.80s found",
                     Py_TYPE(obj)->tp_name);
        return NULL;
    }

    if (view.len == 0) {
        /* Nothing to decode: the view is not needed any more. */
        PyBuffer_Release(&view);
        if (unicode_check_encoding_errors(encoding, errors) < 0) {
            return NULL;
        }
        _Py_RETURN_UNICODE_EMPTY();
    }

    PyObject *decoded = PyUnicode_Decode((char*) view.buf, view.len,
                                         encoding, errors);
    PyBuffer_Release(&view);
    return decoded;
}
3452
3453
/* Normalize an encoding name like encodings.normalize_encoding()
   but allow to convert to lowercase if *to_lower* is true.

   Alphanumeric characters and '.' are copied to *lower* (lowercased when
   *to_lower* is true).  Every run of other characters is replaced by a
   single '_', except at the very beginning where it is dropped; a trailing
   run is dropped as well.  The result is always null-terminated.

   *lower_len* is the size of the *lower* buffer in bytes, including room for
   the terminating null character.

   Return 1 on success, or 0 on error (encoding is longer than lower_len-1,
   or lower_len is 0). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len,
                       int to_lower)
{
    const char *e;
    char *l;
    char *l_end;
    int punct;

    assert(encoding != NULL);

    if (lower_len == 0) {
        /* No room even for the terminating null character; computing
           &lower[lower_len - 1] below would point before the buffer. */
        return 0;
    }

    e = encoding;
    l = lower;
    /* Last writable slot, reserved for the terminating null character. */
    l_end = &lower[lower_len - 1];
    /* Set while skipping a run of punctuation that has not been emitted. */
    punct = 0;
    while (1) {
        char c = *e;
        if (c == 0) {
            break;
        }

        if (Py_ISALNUM(c) || c == '.') {
            /* Emit one '_' for the preceding punctuation run, unless
               nothing has been written yet. */
            if (punct && l != lower) {
                if (l == l_end) {
                    return 0;
                }
                *l++ = '_';
            }
            punct = 0;

            if (l == l_end) {
                return 0;
            }
            *l++ = to_lower ? Py_TOLOWER(c) : c;
        }
        else {
            punct = 1;
        }

        e++;
    }
    *l = '\0';
    return 1;
}
3502
3503
PyObject *
3504
PyUnicode_Decode(const char *s,
3505
                 Py_ssize_t size,
3506
                 const char *encoding,
3507
                 const char *errors)
3508
8.98M
{
3509
8.98M
    PyObject *buffer = NULL, *unicode;
3510
8.98M
    Py_buffer info;
3511
8.98M
    char buflower[11];   /* strlen("iso-8859-1\0") == 11, longest shortcut */
3512
3513
8.98M
    if (unicode_check_encoding_errors(encoding, errors) < 0) {
3514
0
        return NULL;
3515
0
    }
3516
3517
8.98M
    if (size == 0) {
3518
0
        _Py_RETURN_UNICODE_EMPTY();
3519
0
    }
3520
3521
8.98M
    if (encoding == NULL) {
3522
44.7k
        return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3523
44.7k
    }
3524
3525
    /* Shortcuts for common default encodings */
3526
8.94M
    if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower), 1)) {
3527
8.94M
        char *lower = buflower;
3528
3529
        /* Fast paths */
3530
8.94M
        if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3531
975k
            lower += 3;
3532
975k
            if (*lower == '_') {
3533
                /* Match "utf8" and "utf_8" */
3534
974k
                lower++;
3535
974k
            }
3536
3537
975k
            if (lower[0] == '8' && lower[1] == 0) {
3538
974k
                return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3539
974k
            }
3540
460
            else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3541
92
                return PyUnicode_DecodeUTF16(s, size, errors, 0);
3542
92
            }
3543
368
            else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3544
89
                return PyUnicode_DecodeUTF32(s, size, errors, 0);
3545
89
            }
3546
975k
        }
3547
7.96M
        else {
3548
7.96M
            if (strcmp(lower, "ascii") == 0
3549
4.40M
                || strcmp(lower, "us_ascii") == 0) {
3550
4.40M
                return PyUnicode_DecodeASCII(s, size, errors);
3551
4.40M
            }
3552
    #ifdef MS_WINDOWS
3553
            else if (strcmp(lower, "mbcs") == 0) {
3554
                return PyUnicode_DecodeMBCS(s, size, errors);
3555
            }
3556
    #endif
3557
3.56M
            else if (strcmp(lower, "latin1") == 0
3558
3.56M
                     || strcmp(lower, "latin_1") == 0
3559
368k
                     || strcmp(lower, "iso_8859_1") == 0
3560
3.21M
                     || strcmp(lower, "iso8859_1") == 0) {
3561
3.21M
                return PyUnicode_DecodeLatin1(s, size, errors);
3562
3.21M
            }
3563
7.96M
        }
3564
8.94M
    }
3565
3566
    /* Decode via the codec registry */
3567
348k
    buffer = NULL;
3568
348k
    if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
3569
0
        goto onError;
3570
348k
    buffer = PyMemoryView_FromBuffer(&info);
3571
348k
    if (buffer == NULL)
3572
0
        goto onError;
3573
348k
    unicode = _PyCodec_DecodeText(buffer, encoding, errors);
3574
348k
    if (unicode == NULL)
3575
93.0k
        goto onError;
3576
255k
    if (!PyUnicode_Check(unicode)) {
3577
0
        PyErr_Format(PyExc_TypeError,
3578
0
                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
3579
0
                     "use codecs.decode() to decode to arbitrary types",
3580
0
                     encoding,
3581
0
                     Py_TYPE(unicode)->tp_name);
3582
0
        Py_DECREF(unicode);
3583
0
        goto onError;
3584
0
    }
3585
255k
    Py_DECREF(buffer);
3586
255k
    return unicode_result(unicode);
3587
3588
93.0k
  onError:
3589
93.0k
    Py_XDECREF(buffer);
3590
93.0k
    return NULL;
3591
255k
}
3592
3593
/* Run the str object `unicode` through the decoder of `encoding` found in the
   codec registry and return whatever object the decoder produces.  A NULL
   encoding selects PyUnicode_GetDefaultEncoding().  Returns NULL on failure
   (a non-str argument is reported via PyErr_BadArgument()). */
PyAPI_FUNC(PyObject *)
PyUnicode_AsDecodedObject(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    if (PyUnicode_Check(unicode)) {
        const char *codec = encoding;
        if (codec == NULL) {
            codec = PyUnicode_GetDefaultEncoding();
        }
        /* Decode via the codec registry */
        return PyCodec_Decode(unicode, codec, errors);
    }

    PyErr_BadArgument();
    return NULL;
}
3609
3610
/* Run the str object `unicode` through the decoder of `encoding` found in the
   codec registry and require the result to be a str.

   A NULL encoding selects PyUnicode_GetDefaultEncoding().  Returns the
   decoded str, or NULL on failure: a non-str argument is reported via
   PyErr_BadArgument(), and a decoder that returns a non-str object raises
   TypeError naming the type the decoder actually returned. */
PyAPI_FUNC(PyObject *)
PyUnicode_AsDecodedUnicode(PyObject *unicode,
                           const char *encoding,
                           const char *errors)
{
    PyObject *v;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (encoding == NULL) {
        encoding = PyUnicode_GetDefaultEncoding();
    }

    /* Decode via the codec registry */
    v = PyCodec_Decode(unicode, encoding, errors);
    if (v == NULL) {
        return NULL;
    }
    if (!PyUnicode_Check(v)) {
        /* Report the type of the decoder's result (v).  The input object is
           known to be a str at this point, so naming its type here would
           always produce the misleading "returned 'str' instead of 'str'". */
        PyErr_Format(PyExc_TypeError,
                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
                     "use codecs.decode() to decode to arbitrary types",
                     encoding,
                     Py_TYPE(v)->tp_name);
        Py_DECREF(v);
        return NULL;
    }
    return unicode_result(v);
}
3643
3644
/* Run the str object `unicode` through the encoder of `encoding` found in the
   codec registry and return whatever object the encoder produces.  A NULL
   encoding selects PyUnicode_GetDefaultEncoding().  Returns NULL on failure
   (a non-str argument is reported via PyErr_BadArgument()). */
PyAPI_FUNC(PyObject *)
PyUnicode_AsEncodedObject(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    const char *codec = encoding;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (codec == NULL) {
        codec = PyUnicode_GetDefaultEncoding();
    }

    /* Encode via the codec registry; a NULL result is passed through. */
    return PyCodec_Encode(unicode, codec, errors);
}
3668
3669
3670
static PyObject *
3671
unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
3672
                      int current_locale)
3673
696
{
3674
696
    Py_ssize_t wlen;
3675
696
    wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3676
696
    if (wstr == NULL) {
3677
0
        return NULL;
3678
0
    }
3679
3680
696
    if ((size_t)wlen != wcslen(wstr)) {
3681
0
        PyErr_SetString(PyExc_ValueError, "embedded null character");
3682
0
        PyMem_Free(wstr);
3683
0
        return NULL;
3684
0
    }
3685
3686
696
    char *str;
3687
696
    size_t error_pos;
3688
696
    const char *reason;
3689
696
    int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
3690
696
                                 current_locale, error_handler);
3691
696
    PyMem_Free(wstr);
3692
3693
696
    if (res != 0) {
3694
0
        if (res == -2) {
3695
0
            PyObject *exc;
3696
0
            exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3697
0
                    "locale", unicode,
3698
0
                    (Py_ssize_t)error_pos,
3699
0
                    (Py_ssize_t)(error_pos+1),
3700
0
                    reason);
3701
0
            if (exc != NULL) {
3702
0
                PyCodec_StrictErrors(exc);
3703
0
                Py_DECREF(exc);
3704
0
            }
3705
0
        }
3706
0
        else if (res == -3) {
3707
0
            PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3708
0
        }
3709
0
        else {
3710
0
            PyErr_NoMemory();
3711
0
        }
3712
0
        return NULL;
3713
0
    }
3714
3715
696
    PyObject *bytes = PyBytes_FromString(str);
3716
696
    PyMem_RawFree(str);
3717
696
    return bytes;
3718
696
}
3719
3720
/* Encode `unicode` with unicode_encode_locale(), passing 1 for its
   current_locale argument.  `errors` is the error handler name; it is
   translated to a _Py_error_handler value first. */
PyObject *
PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
{
    _Py_error_handler handler = _Py_GetErrorHandler(errors);

    return unicode_encode_locale(unicode, handler, 1);
}
3726
3727
/* Encode `unicode` to bytes using the interpreter's filesystem codec.

   Three cases, tried in order:
     1. the filesystem codec is UTF-8: use the built-in UTF-8 encoder;
     2. a filesystem encoding name is configured: use the regular codec path
        (compiled out when _Py_FORCE_UTF8_FS_ENCODING is defined);
     3. neither is set yet: fall back to a codec-free encoder. */
PyObject *
PyUnicode_EncodeFSDefault(PyObject *unicode)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    struct _Py_unicode_fs_codec *codec = &interp->unicode.fs_codec;

    if (codec->utf8) {
        return unicode_encode_utf8(unicode,
                                   codec->error_handler,
                                   codec->errors);
    }
#ifndef _Py_FORCE_UTF8_FS_ENCODING
    if (codec->encoding) {
        return PyUnicode_AsEncodedString(unicode,
                                         codec->encoding,
                                         codec->errors);
    }
#endif

    /* Before _PyUnicode_InitEncodings() is called, the Python codec
       machinery is not ready and so cannot be used:
       use wcstombs() in this case. */
    const PyConfig *config = _PyInterpreterState_GetConfig(interp);
    const wchar_t *fs_errors = config->filesystem_errors;
    assert(fs_errors != NULL);
    _Py_error_handler handler = get_error_handler_wide(fs_errors);
    assert(handler != _Py_ERROR_UNKNOWN);
#ifdef _Py_FORCE_UTF8_FS_ENCODING
    return unicode_encode_utf8(unicode, handler, NULL);
#else
    return unicode_encode_locale(unicode, handler, 0);
#endif
}
3760
3761
/* Encode the str `unicode` to a bytes object with the codec named `encoding`
   and the error handler named `errors`.

   A NULL encoding selects UTF-8.  Well-known encoding names (UTF-8/16/32,
   ASCII, Latin-1 and, on Windows, "mbcs") are routed straight to the built-in
   encoders; everything else goes through the codec registry.  A registry
   encoder that returns a bytearray triggers a RuntimeWarning and the result
   is copied into bytes; any other non-bytes result raises TypeError.
   Returns NULL on failure. */
PyObject *
PyUnicode_AsEncodedString(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    /* Holds the longest shortcut name ("iso_8859_1", 10 chars) plus NUL. */
    char normalized[11];
    PyObject *encoded;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (unicode_check_encoding_errors(encoding, errors) < 0) {
        return NULL;
    }
    if (encoding == NULL) {
        return _PyUnicode_AsUTF8String(unicode, errors);
    }

    /* Shortcuts for common default encodings.  Normalization fails (and the
       shortcuts are skipped) when the name does not fit in the buffer. */
    if (_Py_normalize_encoding(encoding, normalized, sizeof(normalized), 1)) {
        const char *name = normalized;

        if (name[0] == 'u' && name[1] == 't' && name[2] == 'f') {
            /* Step over "utf" and an optional separator, so that both
               "utf8" and "utf_8" are recognized. */
            name += 3;
            if (*name == '_') {
                name++;
            }

            if (strcmp(name, "8") == 0) {
                return _PyUnicode_AsUTF8String(unicode, errors);
            }
            if (strcmp(name, "16") == 0) {
                return _PyUnicode_EncodeUTF16(unicode, errors, 0);
            }
            if (strcmp(name, "32") == 0) {
                return _PyUnicode_EncodeUTF32(unicode, errors, 0);
            }
        }
        else {
            if (strcmp(name, "ascii") == 0
                || strcmp(name, "us_ascii") == 0)
            {
                return _PyUnicode_AsASCIIString(unicode, errors);
            }
#ifdef MS_WINDOWS
            if (strcmp(name, "mbcs") == 0) {
                return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
            }
#endif
            if (strcmp(name, "latin1") == 0
                || strcmp(name, "latin_1") == 0
                || strcmp(name, "iso_8859_1") == 0
                || strcmp(name, "iso8859_1") == 0)
            {
                return _PyUnicode_AsLatin1String(unicode, errors);
            }
        }
    }

    /* Encode via the codec registry */
    encoded = _PyCodec_EncodeText(unicode, encoding, errors);
    if (encoded == NULL) {
        return NULL;
    }

    /* The normal path */
    if (PyBytes_Check(encoded)) {
        return encoded;
    }

    /* Anything that is neither bytes nor bytearray is rejected. */
    if (!PyByteArray_Check(encoded)) {
        PyErr_Format(PyExc_TypeError,
                     "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
                     "use codecs.encode() to encode to arbitrary types",
                     encoding,
                     Py_TYPE(encoded)->tp_name);
        Py_DECREF(encoded);
        return NULL;
    }

    /* The codec returned a bytearray: warn, then convert to bytes.  A
       non-zero warning result (e.g. warning turned into an error) aborts. */
    if (PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
            "encoder %s returned bytearray instead of bytes; "
            "use codecs.encode() to encode to arbitrary types",
            encoding))
    {
        Py_DECREF(encoded);
        return NULL;
    }

    PyObject *copy = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(encoded),
                                               PyByteArray_GET_SIZE(encoded));
    Py_DECREF(encoded);
    return copy;
}
3860
3861
/* Run the str object `unicode` through the encoder of `encoding` found in the
   codec registry and require the result to be a str (str-to-str codecs).

   A NULL encoding selects PyUnicode_GetDefaultEncoding().  Returns the
   encoder's result, or NULL on failure: a non-str argument is reported via
   PyErr_BadArgument(), a non-str result raises TypeError. */
PyAPI_FUNC(PyObject *)
PyUnicode_AsEncodedUnicode(PyObject *unicode,
                           const char *encoding,
                           const char *errors)
{
    PyObject *result;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (encoding == NULL) {
        encoding = PyUnicode_GetDefaultEncoding();
    }

    /* Encode via the codec registry */
    result = PyCodec_Encode(unicode, encoding, errors);
    if (result == NULL) {
        return NULL;
    }
    if (PyUnicode_Check(result)) {
        return result;
    }

    PyErr_Format(PyExc_TypeError,
                 "'%.400s' encoder returned '%.400s' instead of 'str'; "
                 "use codecs.encode() to encode to arbitrary types",
                 encoding,
                 Py_TYPE(result)->tp_name);
    Py_DECREF(result);
    return NULL;
}
3894
3895
static PyObject*
3896
unicode_decode_locale(const char *str, Py_ssize_t len,
3897
                      _Py_error_handler errors, int current_locale)
3898
262k
{
3899
262k
    if (str[len] != '\0' || (size_t)len != strlen(str))  {
3900
0
        PyErr_SetString(PyExc_ValueError, "embedded null byte");
3901
0
        return NULL;
3902
0
    }
3903
3904
262k
    wchar_t *wstr;
3905
262k
    size_t wlen;
3906
262k
    const char *reason;
3907
262k
    int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
3908
262k
                                 current_locale, errors);
3909
262k
    if (res != 0) {
3910
0
        if (res == -2) {
3911
0
            PyObject *exc;
3912
0
            exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3913
0
                                        "locale", str, len,
3914
0
                                        (Py_ssize_t)wlen,
3915
0
                                        (Py_ssize_t)(wlen + 1),
3916
0
                                        reason);
3917
0
            if (exc != NULL) {
3918
0
                PyCodec_StrictErrors(exc);
3919
0
                Py_DECREF(exc);
3920
0
            }
3921
0
        }
3922
0
        else if (res == -3) {
3923
0
            PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3924
0
        }
3925
0
        else {
3926
0
            PyErr_NoMemory();
3927
0
        }
3928
0
        return NULL;
3929
0
    }
3930
3931
262k
    PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3932
262k
    PyMem_RawFree(wstr);
3933
262k
    return unicode;
3934
262k
}
3935
3936
PyObject*
3937
PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3938
                              const char *errors)
3939
0
{
3940
0
    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3941
0
    return unicode_decode_locale(str, len, error_handler, 1);
3942
0
}
3943
3944
PyObject*
3945
PyUnicode_DecodeLocale(const char *str, const char *errors)
3946
253k
{
3947
253k
    Py_ssize_t size = (Py_ssize_t)strlen(str);
3948
253k
    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3949
253k
    return unicode_decode_locale(str, size, error_handler, 1);
3950
253k
}
3951
3952
3953
PyObject*
3954
22
PyUnicode_DecodeFSDefault(const char *s) {
3955
22
    Py_ssize_t size = (Py_ssize_t)strlen(s);
3956
22
    return PyUnicode_DecodeFSDefaultAndSize(s, size);
3957
22
}
3958
3959
PyObject*
3960
PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3961
147k
{
3962
147k
    PyInterpreterState *interp = _PyInterpreterState_GET();
3963
147k
    struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3964
147k
    if (fs_codec->utf8) {
3965
138k
        return unicode_decode_utf8(s, size,
3966
138k
                                   fs_codec->error_handler,
3967
138k
                                   fs_codec->errors,
3968
138k
                                   NULL);
3969
138k
    }
3970
8.98k
#ifndef _Py_FORCE_UTF8_FS_ENCODING
3971
8.98k
    else if (fs_codec->encoding) {
3972
0
        return PyUnicode_Decode(s, size,
3973
0
                                fs_codec->encoding,
3974
0
                                fs_codec->errors);
3975
0
    }
3976
8.98k
#endif
3977
8.98k
    else {
3978
        /* Before _PyUnicode_InitEncodings() is called, the Python codec
3979
           machinery is not ready and so cannot be used:
3980
           use mbstowcs() in this case. */
3981
8.98k
        const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3982
8.98k
        const wchar_t *filesystem_errors = config->filesystem_errors;
3983
8.98k
        assert(filesystem_errors != NULL);
3984
8.98k
        _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3985
8.98k
        assert(errors != _Py_ERROR_UNKNOWN);
3986
#ifdef _Py_FORCE_UTF8_FS_ENCODING
3987
        return unicode_decode_utf8(s, size, errors, NULL, NULL);
3988
#else
3989
8.98k
        return unicode_decode_locale(s, size, errors, 0);
3990
8.98k
#endif
3991
8.98k
    }
3992
147k
}
3993
3994
3995
int
3996
PyUnicode_FSConverter(PyObject* arg, void* addr)
3997
152k
{
3998
152k
    PyObject *path = NULL;
3999
152k
    PyObject *output = NULL;
4000
152k
    Py_ssize_t size;
4001
152k
    const char *data;
4002
152k
    if (arg == NULL) {
4003
0
        Py_DECREF(*(PyObject**)addr);
4004
0
        *(PyObject**)addr = NULL;
4005
0
        return 1;
4006
0
    }
4007
152k
    path = PyOS_FSPath(arg);
4008
152k
    if (path == NULL) {
4009
0
        return 0;
4010
0
    }
4011
152k
    if (PyBytes_Check(path)) {
4012
0
        output = path;
4013
0
    }
4014
152k
    else {  // PyOS_FSPath() guarantees its returned value is bytes or str.
4015
152k
        output = PyUnicode_EncodeFSDefault(path);
4016
152k
        Py_DECREF(path);
4017
152k
        if (!output) {
4018
0
            return 0;
4019
0
        }
4020
152k
        assert(PyBytes_Check(output));
4021
152k
    }
4022
4023
152k
    size = PyBytes_GET_SIZE(output);
4024
152k
    data = PyBytes_AS_STRING(output);
4025
152k
    if ((size_t)size != strlen(data)) {
4026
0
        PyErr_SetString(PyExc_ValueError, "embedded null byte");
4027
0
        Py_DECREF(output);
4028
0
        return 0;
4029
0
    }
4030
152k
    *(PyObject**)addr = output;
4031
152k
    return Py_CLEANUP_SUPPORTED;
4032
152k
}
4033
4034
4035
int
4036
PyUnicode_FSDecoder(PyObject* arg, void* addr)
4037
20.5k
{
4038
20.5k
    if (arg == NULL) {
4039
0
        Py_DECREF(*(PyObject**)addr);
4040
0
        *(PyObject**)addr = NULL;
4041
0
        return 1;
4042
0
    }
4043
4044
20.5k
    PyObject *path = PyOS_FSPath(arg);
4045
20.5k
    if (path == NULL) {
4046
0
        return 0;
4047
0
    }
4048
4049
20.5k
    PyObject *output = NULL;
4050
20.5k
    if (PyUnicode_Check(path)) {
4051
20.5k
        output = path;
4052
20.5k
    }
4053
0
    else if (PyBytes_Check(path)) {
4054
0
        output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path),
4055
0
                                                  PyBytes_GET_SIZE(path));
4056
0
        Py_DECREF(path);
4057
0
        if (!output) {
4058
0
            return 0;
4059
0
        }
4060
0
    }
4061
0
    else {
4062
0
        PyErr_Format(PyExc_TypeError,
4063
0
                     "path should be string, bytes, or os.PathLike, not %.200s",
4064
0
                     Py_TYPE(arg)->tp_name);
4065
0
        Py_DECREF(path);
4066
0
        return 0;
4067
0
    }
4068
4069
20.5k
    if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
4070
20.5k
                 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
4071
0
        PyErr_SetString(PyExc_ValueError, "embedded null character");
4072
0
        Py_DECREF(output);
4073
0
        return 0;
4074
0
    }
4075
20.5k
    *(PyObject**)addr = output;
4076
20.5k
    return Py_CLEANUP_SUPPORTED;
4077
20.5k
}
4078
4079
4080
static int unicode_fill_utf8(PyObject *unicode);
4081
4082
4083
/* Make sure the cached UTF-8 representation of `unicode` is available.

   Returns 0 when the cache is already populated; otherwise returns the
   result of unicode_fill_utf8() (PyUnicode_AsUTF8AndSize() treats -1 as
   failure). */
static int
unicode_ensure_utf8(PyObject *unicode)
{
    int err = 0;
    /* Cheap test first, so the critical section is only entered when the
       cache looks empty. */
    if (PyUnicode_UTF8(unicode) == NULL) {
        Py_BEGIN_CRITICAL_SECTION(unicode);
        /* Test again inside the critical section -- presumably another
           thread may have filled the cache in the meantime. */
        if (PyUnicode_UTF8(unicode) == NULL) {
            err = unicode_fill_utf8(unicode);
        }
        Py_END_CRITICAL_SECTION();
    }
    return err;
}
4096
4097
const char *
4098
PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
4099
32.1M
{
4100
32.1M
    if (!PyUnicode_Check(unicode)) {
4101
0
        PyErr_BadArgument();
4102
0
        if (psize) {
4103
0
            *psize = -1;
4104
0
        }
4105
0
        return NULL;
4106
0
    }
4107
4108
32.1M
    if (unicode_ensure_utf8(unicode) == -1) {
4109
224
        if (psize) {
4110
224
            *psize = -1;
4111
224
        }
4112
224
        return NULL;
4113
224
    }
4114
4115
32.1M
    if (psize) {
4116
32.0M
        *psize = PyUnicode_UTF8_LENGTH(unicode);
4117
32.0M
    }
4118
32.1M
    return PyUnicode_UTF8(unicode);
4119
32.1M
}
4120
4121
const char *
4122
PyUnicode_AsUTF8(PyObject *unicode)
4123
79.5k
{
4124
79.5k
    return PyUnicode_AsUTF8AndSize(unicode, NULL);
4125
79.5k
}
4126
4127
const char *
4128
_PyUnicode_AsUTF8NoNUL(PyObject *unicode)
4129
898k
{
4130
898k
    Py_ssize_t size;
4131
898k
    const char *s = PyUnicode_AsUTF8AndSize(unicode, &size);
4132
898k
    if (s && strlen(s) != (size_t)size) {
4133
0
        PyErr_SetString(PyExc_ValueError, "embedded null character");
4134
0
        return NULL;
4135
0
    }
4136
898k
    return s;
4137
898k
}
4138
4139
/*
4140
PyUnicode_GetSize() has been deprecated since Python 3.3
4141
because it returned length of Py_UNICODE.
4142
4143
But this function is part of stable abi, because it doesn't
4144
include Py_UNICODE in signature and it was not excluded from
4145
stable ABI in PEP 384.
4146
*/
4147
PyAPI_FUNC(Py_ssize_t)
4148
PyUnicode_GetSize(PyObject *unicode)
4149
0
{
4150
0
    PyErr_SetString(PyExc_RuntimeError,
4151
0
                    "PyUnicode_GetSize has been removed.");
4152
0
    return -1;
4153
0
}
4154
4155
Py_ssize_t
4156
PyUnicode_GetLength(PyObject *unicode)
4157
24.3k
{
4158
24.3k
    if (!PyUnicode_Check(unicode)) {
4159
0
        PyErr_BadArgument();
4160
0
        return -1;
4161
0
    }
4162
24.3k
    return PyUnicode_GET_LENGTH(unicode);
4163
24.3k
}
4164
4165
Py_UCS4
4166
PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4167
19
{
4168
19
    const void *data;
4169
19
    int kind;
4170
4171
19
    if (!PyUnicode_Check(unicode)) {
4172
0
        PyErr_BadArgument();
4173
0
        return (Py_UCS4)-1;
4174
0
    }
4175
19
    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4176
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
4177
0
        return (Py_UCS4)-1;
4178
0
    }
4179
19
    data = PyUnicode_DATA(unicode);
4180
19
    kind = PyUnicode_KIND(unicode);
4181
19
    return PyUnicode_READ(kind, data, index);
4182
19
}
4183
4184
int
4185
PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4186
0
{
4187
0
    if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
4188
0
        PyErr_BadArgument();
4189
0
        return -1;
4190
0
    }
4191
0
    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4192
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
4193
0
        return -1;
4194
0
    }
4195
0
    if (unicode_check_modifiable(unicode))
4196
0
        return -1;
4197
0
    if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4198
0
        PyErr_SetString(PyExc_ValueError, "character out of range");
4199
0
        return -1;
4200
0
    }
4201
0
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4202
0
                    index, ch);
4203
0
    return 0;
4204
0
}
4205
4206
/* Return the name of the default encoding, which is always "utf-8".  The
   returned string has static storage and must not be freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4211
4212
/* create or adjust a UnicodeDecodeError */
4213
static void
4214
make_decode_exception(PyObject **exceptionObject,
4215
                      const char *encoding,
4216
                      const char *input, Py_ssize_t length,
4217
                      Py_ssize_t startpos, Py_ssize_t endpos,
4218
                      const char *reason)
4219
1.14M
{
4220
1.14M
    if (*exceptionObject == NULL) {
4221
949k
        *exceptionObject = PyUnicodeDecodeError_Create(
4222
949k
            encoding, input, length, startpos, endpos, reason);
4223
949k
    }
4224
199k
    else {
4225
199k
        if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4226
0
            goto onError;
4227
199k
        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4228
0
            goto onError;
4229
199k
        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4230
0
            goto onError;
4231
199k
    }
4232
1.14M
    return;
4233
4234
1.14M
onError:
4235
0
    Py_CLEAR(*exceptionObject);
4236
0
}
4237
4238
#ifdef MS_WINDOWS
4239
static int
4240
widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4241
{
4242
    if (newsize > *size) {
4243
        wchar_t *newbuf = *buf;
4244
        if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4245
            PyErr_NoMemory();
4246
            return -1;
4247
        }
4248
        *buf = newbuf;
4249
    }
4250
    *size = newsize;
4251
    return 0;
4252
}
4253
4254
/* Error handling callback helper for decoders that write into a wchar_t
   buffer:
   build the arguments, call the error callback and check its result; if no
   exception occurred, copy the replacement string to the output buffer
   (growing it when needed) and adjust the input/output state variables.

   errors / errorHandler: handler name and cached handler object; the handler
       is looked up on first use and stored in *errorHandler.
   encoding, reason, *startinpos, *endinpos: used to create or update the
       UnicodeDecodeError in *exceptionObject.
   *input, *inend: current input range; re-read from the exception object
       after the callback returns.
   *endinpos, *inptr: updated to the position returned by the callback.
   *buf, *bufsize, *outpos: output buffer, its size and the write position.

   Returns 0 on success, -1 on error.
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
{
    /* PyArg_ParseTuple() format; the text after "Un;" doubles as the error
       message used below via &argparse[3]. */
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    Py_ssize_t repwlen;

    /* Look up the error handler lazily, on the first error only. */
    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        /* Skip the "Un;" prefix to get the bare message. */
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position counts from the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    /* Ask for the required buffer size only (NULL output buffer); the value
       is decremented below, presumably because it counts the trailing null
       character. */
    repwlen = PyUnicode_AsWideChar(repunicode, NULL, 0);
    if (repwlen < 0)
        goto onError;
    repwlen--;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    /* Each addition is checked against PY_SSIZE_T_MAX before it is done. */
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    outsize = *bufsize;
    if (requiredsize > outsize) {
        /* Grow to at least twice the current size when that is safe. */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (widechar_resize(buf, bufsize, requiredsize) < 0) {
            goto onError;
        }
    }
    PyUnicode_AsWideChar(repunicode, *buf + *outpos, repwlen);
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_DECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");
    /* falls through to the common error exit */

  onError:
    Py_XDECREF(restuple);
    return -1;
}
4363
#endif   /* MS_WINDOWS */
4364
4365
/* Error handling callback helper for decoders that write through a
   _PyUnicodeWriter: build the arguments, call the error callback and check
   its result; if no exception occurred, write the replacement string to
   `writer` and adjust the input state variables.

   errors / errorHandler: handler name and cached handler object; the handler
       is looked up on first use and stored in *errorHandler.
   encoding, reason, *startinpos, *endinpos: used to create or update the
       UnicodeDecodeError in *exceptionObject.
   *input, *inend: current input range; re-read from the exception object
       after the callback returns.
   *endinpos, *inptr: updated to the position returned by the callback.

   Returns 0 on success, -1 on error. */
static int
unicode_decode_call_errorhandler_writer(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
{
    /* PyArg_ParseTuple() format; the text after "Un;" doubles as the error
       message used below via &argparse[3]. */
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t insize;
    Py_ssize_t newpos;
    Py_ssize_t replen;
    Py_ssize_t remain;
    PyObject *inputobj = NULL;
    int need_to_grow = 0;
    const char *new_inptr;

    /* Look up the error handler lazily, on the first error only. */
    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        /* Skip the "Un;" prefix to get the bare message. */
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    /* Number of input bytes that followed the error range in the input as it
       was before the callback ran; must be computed before *input and *inend
       are overwritten. */
    remain = *inend - *input - *endinpos;
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position counts from the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    /* A replacement longer than one character needs extra output room. */
    replen = PyUnicode_GET_LENGTH(repunicode);
    if (replen > 1) {
        writer->min_length += replen - 1;
        need_to_grow = 1;
    }
    new_inptr = *input + newpos;
    /* More input left to decode than before the callback (the handler moved
       the position backwards or replaced the input object): reserve room for
       the additional bytes. */
    if (*inend - new_inptr > remain) {
        /* We don't know the decoding algorithm here so we make the worst
           assumption that one byte decodes to one unicode character.
           If unfortunately one byte could decode to more unicode characters,
           the decoder may write out-of-bound then.  Is it possible for the
           algorithms using this function? */
        writer->min_length += *inend - new_inptr - remain;
        need_to_grow = 1;
    }
    if (need_to_grow) {
        writer->overallocate = 1;
        if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
                            PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
            goto onError;
    }
    if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
        goto onError;

    *endinpos = newpos;
    *inptr = new_inptr;

    /* we made it! */
    Py_DECREF(restuple);
    return 0;

  onError:
    Py_XDECREF(restuple);
    return -1;
}
4464
4465
/* --- UTF-7 Codec -------------------------------------------------------- */
4466
4467
/* See RFC2152 for details.  We encode conservatively and decode liberally. */
4468
4469
/* Three simple macros defining base-64. */
4470
4471
/* Is c a base-64 character? */
4472
4473
/* Non-zero iff c is one of the 64 characters of the base-64 alphabet
   (A-Z, a-z, 0-9, '+', '/').  The argument is evaluated more than once. */
#define IS_BASE64(c)                        \
    ((c) == '+' || (c) == '/' ||            \
     ((c) >= '0' && (c) <= '9') ||          \
     ((c) >= 'a' && (c) <= 'z') ||          \
     ((c) >= 'A' && (c) <= 'Z'))
4478
4479
/* given that c is a base-64 character, what is its base-64 value? */
4480
4481
/* Map a base-64 character to its 6-bit value (A-Z -> 0..25, a-z -> 26..51,
   0-9 -> 52..61, '+' -> 62, anything else -> 63).  The caller must have
   checked IS_BASE64(c) first.  The argument is evaluated more than once. */
#define FROM_BASE64(c)                                  \
    (((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     ((c) == '+') ? 62 : 63)
4486
4487
/* What is the base-64 character of the bottom 6 bits of n? */
4488
4489
/* Base-64 alphabet character for the low 6 bits of n; higher bits of n
   are ignored. */
#define TO_BASE64(n)                    \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"       \
     "abcdefghijklmnopqrstuvwxyz"       \
     "0123456789+/"[(n) & 0x3f])
4491
4492
/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
4493
 * decoded as itself.  We are permissive on decoding; the only ASCII
4494
 * byte not decoding to itself is the + which begins a base64
4495
 * string. */
4496
4497
/* Non-zero iff byte c decodes to itself: any value up to 127 except '+'. */
#define DECODE_DIRECT(c)                                \
    ((c) != '+' && (c) <= 127)
4499
4500
/* The UTF-7 encoder treats ASCII characters differently according to
4501
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
4502
 * the above).  See RFC2152.  This array identifies these different
4503
 * sets:
4504
 * 0 : "Set D"
4505
 *     alphanumeric and '(),-./:?
4506
 * 1 : "Set O"
4507
 *     !"#$%&*;<=>@[]^_`{|}
4508
 * 2 : "whitespace"
4509
 *     ht nl cr sp
4510
 * 3 : special (must be base64 encoded)
4511
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
4512
 */
4513
4514
/* Category (0 = Set D, 1 = Set O, 2 = whitespace, 3 = special) of each
   ASCII code 0..127.  The table is only ever read, so it is const. */
static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4533
4534
/* ENCODE_DIRECT: this character should be encoded as itself.  The
4535
 * answer depends on whether we are encoding set O as itself, and also
4536
 * on whether we are encoding whitespace as itself.  RFC 2152 makes it
4537
 * clear that the answers to these questions vary between
4538
 * applications, so this code needs to be flexible.  */
4539
4540
/* Non-zero iff c is a non-NUL ASCII character whose utf7_category entry
   is anything other than 3 ("special"). */
#define ENCODE_DIRECT(c) \
    ((c) > 0 && (c) < 128 && utf7_category[(c)] != 3)
4542
4543
PyObject *
4544
PyUnicode_DecodeUTF7(const char *s,
4545
                     Py_ssize_t size,
4546
                     const char *errors)
4547
0
{
4548
0
    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4549
0
}
4550
4551
/* The decoder.  The only state we preserve is our read position,
4552
 * i.e. how many characters we have consumed.  So if we end in the
4553
 * middle of a shift sequence we have to back off the read position
4554
 * and the output to the beginning of the sequence, otherwise we lose
4555
 * all the shift state (seen bits, number of bits seen, high
4556
 * surrogate). */
4557
4558
/* Decode `size` bytes of UTF-7 starting at `s`.
 *
 *   errors    name of the error handler, forwarded unchanged to
 *             unicode_decode_call_errorhandler_writer().
 *   consumed  if NULL, a shift sequence still open at the end of the input
 *             is checked and may be reported as "unterminated shift
 *             sequence".  If non-NULL, no such check is made: the output is
 *             backed off to where the open shift sequence began and
 *             *consumed receives the input offset of its '+' (or the offset
 *             reached, when no shift sequence is open).
 *
 * Returns the decoded string object, or NULL after jumping to onError.
 */
PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    /* Offsets (relative to starts) handed to the error handler.  startinpos
       is assigned when a '+' or an unexpected byte is seen. */
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    const char *e;
    _PyUnicodeWriter writer;
    const char *errmsg = "";
    /* Shift state: inside a base-64 section, writer position where that
       section started, number of buffered bits, and the bits themselves. */
    int inShift = 0;
    Py_ssize_t shiftOutStart;
    unsigned int base64bits = 0;
    unsigned long base64buffer = 0;
    /* Pending high surrogate waiting for its low half, or 0. */
    Py_UCS4 surrogate = 0;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* Start off assuming it's all ASCII. Widen later as necessary. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;

    shiftOutStart = 0;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch;
        /* Re-entry point used after the end-of-input error handler has
           moved s back inside the input (see "goto restart" below). */
      restart:
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    assert(outCh <= 0xffff);
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
                                goto onError;
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* Not a low surrogate: emit the pending high
                               surrogate on its own, then handle outCh
                               below like any other unit. */
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                                goto onError;
                            surrogate = 0;
                        }
                    }
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
                            goto onError;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        s++;
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            s++;
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                /* A pending high surrogate is written out only when the
                   terminating byte is one that decodes directly; otherwise
                   it is dropped by the reset below. */
                if (surrogate && DECODE_DIRECT(ch)) {
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                        goto onError;
                }
                surrogate = 0;
                if (ch == '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    s++;
                }
            }
        }
        else if ( ch == '+' ) {
            /* Remember where the shift sequence starts: used for error
               reporting and as *consumed if the input ends inside it. */
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
                    goto onError;
            }
            else if (s < e && !IS_BASE64(*s)) {
                s++;
                errmsg = "ill-formed sequence";
                goto utf7Error;
            }
            else { /* begin base64-encoded section */
                /* Also reached when '+' is the last input byte (s == e). */
                inShift = 1;
                surrogate = 0;
                shiftOutStart = writer.pos;
                base64bits = 0;
                base64buffer = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            s++;
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
        }
        else {
            /* Byte above 127 outside a shift sequence. */
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
utf7Error:
        /* Common error exit: report [startinpos, endinpos) to the handler.
           The handler receives the addresses of starts, e and s, so all
           three may be changed by the call. */
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        inShift = 0;
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            endinpos = size;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
            /* The handler may have repositioned s before the end of the
               input; if so, resume decoding inside the loop. */
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            *consumed = startinpos;
            /* NOTE(review): presumably characters written inside the
               unfinished shift sequence may have widened the writer, so the
               result is rebuilt from only the first shiftOutStart
               characters instead of truncating in place -- confirm. */
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
                PyObject *result = PyUnicode_FromKindAndData(
                        writer.kind, writer.data, shiftOutStart);
                Py_XDECREF(errorHandler);
                Py_XDECREF(exc);
                _PyUnicodeWriter_Dealloc(&writer);
                return result;
            }
            writer.pos = shiftOutStart; /* back off output */
        }
        else {
            *consumed = s-starts;
        }
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    /* Failure: release the handler objects and the partial output. */
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
4754
4755
4756
/* Encode the string object `str` as UTF-7 and return the result of
 * PyBytesWriter_FinishWithPointer().
 *
 * Characters accepted by ENCODE_DIRECT() (other than '+') are copied
 * through; '+' outside a shift sequence becomes "+-"; everything else is
 * emitted as UTF-16 code units packed into base-64 inside a "+...-"
 * sequence.  `errors` is not used by this function.
 */
PyObject *
_PyUnicode_EncodeUTF7(PyObject *str,
                      const char *errors)
{
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(str);
    if (nchars == 0) {
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
    }
    const int src_kind = PyUnicode_KIND(str);
    const void *src = PyUnicode_DATA(str);

    /* The output buffer is sized at 8 bytes per input character.
       It might be possible to tighten this worst case. */
    if (nchars > PY_SSIZE_T_MAX / 8) {
        return PyErr_NoMemory();
    }
    PyBytesWriter *writer = PyBytesWriter_Create(nchars * 8);
    if (writer == NULL) {
        return NULL;
    }

    int shifted = 0;                /* inside a "+..." base-64 section */
    unsigned int pending_bits = 0;  /* bits in bitbuf not yet emitted */
    unsigned long bitbuf = 0;
    char *out = PyBytesWriter_GetData(writer);

    for (Py_ssize_t i = 0; i < nchars; ++i) {
        Py_UCS4 ch = PyUnicode_READ(src_kind, src, i);

        if (shifted) {
            if (ENCODE_DIRECT(ch)) {
                /* Shift out: flush left-over bits, padded on the right. */
                if (pending_bits) {
                    *out++ = TO_BASE64(bitbuf << (6-pending_bits));
                    bitbuf = 0;
                    pending_bits = 0;
                }
                shifted = 0;
                /* A character outside the base-64 set ends the sequence by
                   itself; an explicit '-' is needed only before a base-64
                   character or a literal '-'. */
                if (IS_BASE64(ch) || ch == '-') {
                    *out++ = '-';
                }
                *out++ = (char) ch;
                continue;
            }
            /* otherwise fall through and base-64 encode ch */
        }
        else if (ch == '+') {
            *out++ = '+';
            *out++ = '-';
            continue;
        }
        else if (ENCODE_DIRECT(ch)) {
            *out++ = (char) ch;
            continue;
        }
        else {
            /* open a shift sequence, then base-64 encode ch */
            *out++ = '+';
            shifted = 1;
        }

        if (ch >= 0x10000) {
            assert(ch <= MAX_UNICODE);

            /* Non-BMP: push the high surrogate first... */
            pending_bits += 16;
            bitbuf = (bitbuf << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
            while (pending_bits >= 6) {
                *out++ = TO_BASE64(bitbuf >> (pending_bits-6));
                pending_bits -= 6;
            }
            /* ...and let the common code below push the low surrogate. */
            ch = Py_UNICODE_LOW_SURROGATE(ch);
        }
        pending_bits += 16;
        bitbuf = (bitbuf << 16) | ch;
        while (pending_bits >= 6) {
            *out++ = TO_BASE64(bitbuf >> (pending_bits-6));
            pending_bits -= 6;
        }
    }

    if (pending_bits) {
        *out++ = TO_BASE64(bitbuf << (6-pending_bits));
    }
    if (shifted) {
        *out++ = '-';
    }
    return PyBytesWriter_FinishWithPointer(writer, out);
}
4845
4846
#undef IS_BASE64
4847
#undef FROM_BASE64
4848
#undef TO_BASE64
4849
#undef DECODE_DIRECT
4850
#undef ENCODE_DIRECT
4851
4852
/* --- UTF-8 Codec -------------------------------------------------------- */
4853
4854
PyObject *
4855
PyUnicode_DecodeUTF8(const char *s,
4856
                     Py_ssize_t size,
4857
                     const char *errors)
4858
6.38M
{
4859
6.38M
    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4860
6.38M
}
4861
4862
#include "stringlib/asciilib.h"
4863
#include "stringlib/codecs.h"
4864
#include "stringlib/undef.h"
4865
4866
#include "stringlib/ucs1lib.h"
4867
#include "stringlib/codecs.h"
4868
#include "stringlib/undef.h"
4869
4870
#include "stringlib/ucs2lib.h"
4871
#include "stringlib/codecs.h"
4872
#include "stringlib/undef.h"
4873
4874
#include "stringlib/ucs4lib.h"
4875
#include "stringlib/codecs.h"
4876
#include "stringlib/undef.h"
4877
4878
/* Word-sized constants for the UTF-8 fast paths, one set per size_t width:
 *   ASCII_CHAR_MASK  high bit of every byte; non-zero when ANDed with a
 *                    word that contains a non-ASCII byte.
 *   VECTOR_0101      low bit of every byte (per-byte counters).
 *   VECTOR_00FF      every other byte; used when folding per-byte counters
 *                    while counting code points.
 */
#if (SIZEOF_SIZE_T == 4)
# define ASCII_CHAR_MASK 0x80808080U
# define VECTOR_0101     0x01010101U
# define VECTOR_00FF     0x00ff00ffU
#elif (SIZEOF_SIZE_T == 8)
# define ASCII_CHAR_MASK 0x8080808080808080ULL
# define VECTOR_0101     0x0101010101010101ULL
# define VECTOR_00FF     0x00ff00ff00ff00ffULL
#else
# error C 'size_t' size should be either 4 or 8!
#endif
4892
4893
/* ctz(v): number of trailing zero bits in v.  HAVE_CTZ says whether an
   implementation is available for the current compiler. */
#if defined(__GNUC__) || defined(__clang__)
#define HAVE_CTZ 1
static inline unsigned int
ctz(size_t v)
{
    unsigned long long wide = (unsigned long long)v;

    return __builtin_ctzll(wide);
}
#elif defined(_MSC_VER)
#define HAVE_CTZ 1
static inline unsigned int
ctz(size_t v)
{
    unsigned long bit_index;

#if SIZEOF_SIZE_T == 4
    _BitScanForward(&bit_index, v);
#else
    _BitScanForward64(&bit_index, v);
#endif /* SIZEOF_SIZE_T */
    return bit_index;
}
#else
#define HAVE_CTZ 0
#endif
4916
4917
#if HAVE_CTZ && PY_LITTLE_ENDIAN
// Build a size_t from the bytes p[0]..p[size-1].  Only those bytes are
// read (no unaligned word access, no read past p[size-1]); the remaining
// bytes of the result are zero.  A size larger than SIZEOF_SIZE_T is
// treated as SIZEOF_SIZE_T.
static size_t
load_unaligned(const unsigned char *p, size_t size)
{
    union {
        size_t s;
        unsigned char b[SIZEOF_SIZE_T];
    } word = { .s = 0 };

    // The byte-wise fill below assumes little endian because:
    // * union is faster than bitwise or and shift.
    // * big endian machine is rare and hard to maintain.
    // Each case copies one byte and falls through to the next lower one.
    switch (size) {
    default:
#if SIZEOF_SIZE_T == 8
    case 8:
        word.b[7] = p[7];
        _Py_FALLTHROUGH;
    case 7:
        word.b[6] = p[6];
        _Py_FALLTHROUGH;
    case 6:
        word.b[5] = p[5];
        _Py_FALLTHROUGH;
    case 5:
        word.b[4] = p[4];
        _Py_FALLTHROUGH;
#endif
    case 4:
        word.b[3] = p[3];
        _Py_FALLTHROUGH;
    case 3:
        word.b[2] = p[2];
        _Py_FALLTHROUGH;
    case 2:
        word.b[1] = p[1];
        _Py_FALLTHROUGH;
    case 1:
        word.b[0] = p[0];
        break;
    case 0:
        break;
    }
    return word.s;
}
#endif
4964
4965
/*
4966
 * Find the first non-ASCII character in a byte sequence.
4967
 *
4968
 * This function scans a range of bytes from `start` to `end` and returns the
4969
 * index of the first byte that is not an ASCII character (i.e., has the most
4970
 * significant bit set). If all characters in the range are ASCII, it returns
4971
 * `end - start`.
4972
 */
4973
static Py_ssize_t
4974
find_first_nonascii(const unsigned char *start, const unsigned char *end)
4975
24.0M
{
4976
    // The search is done in `size_t` chunks.
4977
    // The start and end might not be aligned at `size_t` boundaries,
4978
    // so they're handled specially.
4979
4980
24.0M
    const unsigned char *p = start;
4981
4982
24.0M
    if (end - start >= SIZEOF_SIZE_T) {
4983
        // Avoid unaligned read.
4984
8.92M
#if PY_LITTLE_ENDIAN && HAVE_CTZ
4985
8.92M
        size_t u;
4986
8.92M
        memcpy(&u, p, sizeof(size_t));
4987
8.92M
        u &= ASCII_CHAR_MASK;
4988
8.92M
        if (u) {
4989
442k
            return (ctz(u) - 7) / 8;
4990
442k
        }
4991
8.48M
        p = _Py_ALIGN_DOWN(p + SIZEOF_SIZE_T, SIZEOF_SIZE_T);
4992
#else /* PY_LITTLE_ENDIAN && HAVE_CTZ */
4993
        const unsigned char *p2 = _Py_ALIGN_UP(p, SIZEOF_SIZE_T);
4994
        while (p < p2) {
4995
            if (*p & 0x80) {
4996
                return p - start;
4997
            }
4998
            p++;
4999
        }
5000
#endif
5001
5002
8.48M
        const unsigned char *e = end - SIZEOF_SIZE_T;
5003
128M
        while (p <= e) {
5004
120M
            size_t u = (*(const size_t *)p) & ASCII_CHAR_MASK;
5005
120M
            if (u) {
5006
274k
#if PY_LITTLE_ENDIAN && HAVE_CTZ
5007
274k
                return p - start + (ctz(u) - 7) / 8;
5008
#else
5009
                // big endian and minor compilers are difficult to test.
5010
                // fallback to per byte check.
5011
                break;
5012
#endif
5013
274k
            }
5014
120M
            p += SIZEOF_SIZE_T;
5015
120M
        }
5016
8.48M
    }
5017
23.3M
#if PY_LITTLE_ENDIAN && HAVE_CTZ
5018
24.0M
    assert((end - p) < SIZEOF_SIZE_T);
5019
    // we can not use *(const size_t*)p to avoid buffer overrun.
5020
23.3M
    size_t u = load_unaligned(p, end - p) & ASCII_CHAR_MASK;
5021
23.3M
    if (u) {
5022
224k
        return p - start + (ctz(u) - 7) / 8;
5023
224k
    }
5024
23.1M
    return end - start;
5025
#else
5026
    while (p < end) {
5027
        if (*p & 0x80) {
5028
            break;
5029
        }
5030
        p++;
5031
    }
5032
    return p - start;
5033
#endif
5034
23.3M
}
5035
5036
/* Return 1 if byte `ch` starts a UTF-8 sequence (0xxxxxxx or 11xxxxxx),
   and 0 if it is a continuation byte (10xxxxxx). */
static inline int
scalar_utf8_start_char(unsigned int ch)
{
    unsigned int top_bit_clear = (~ch >> 7) & 1;
    unsigned int second_bit_set = (ch >> 6) & 1;

    return (int)(top_bit_clear | second_bit_set);
}
5042
5043
static inline size_t
5044
vector_utf8_start_chars(size_t v)
5045
288M
{
5046
288M
    return ((~v >> 7) | (v >> 6)) & VECTOR_0101;
5047
288M
}
5048
5049
5050
// Count the number of UTF-8 code points in a given byte sequence.
5051
static Py_ssize_t
5052
utf8_count_codepoints(const unsigned char *s, const unsigned char *end)
5053
393k
{
5054
393k
    Py_ssize_t len = 0;
5055
5056
393k
    if (end - s >= SIZEOF_SIZE_T) {
5057
324k
        while (!_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) {
5058
17.5k
            len += scalar_utf8_start_char(*s++);
5059
17.5k
        }
5060
5061
1.73M
        while (s + SIZEOF_SIZE_T <= end) {
5062
1.43M
            const unsigned char *e = end;
5063
1.43M
            if (e - s > SIZEOF_SIZE_T * 255) {
5064
1.12M
                e = s + SIZEOF_SIZE_T * 255;
5065
1.12M
            }
5066
1.43M
            Py_ssize_t vstart = 0;
5067
289M
            while (s + SIZEOF_SIZE_T <= e) {
5068
288M
                size_t v = *(size_t*)s;
5069
288M
                size_t vs = vector_utf8_start_chars(v);
5070
288M
                vstart += vs;
5071
288M
                s += SIZEOF_SIZE_T;
5072
288M
            }
5073
1.43M
            vstart = (vstart & VECTOR_00FF) + ((vstart >> 8) & VECTOR_00FF);
5074
1.43M
            vstart += vstart >> 16;
5075
1.43M
#if SIZEOF_SIZE_T == 8
5076
1.43M
            vstart += vstart >> 32;
5077
1.43M
#endif
5078
1.43M
            len += vstart & 0x7ff;
5079
1.43M
        }
5080
307k
    }
5081
1.33M
    while (s < end) {
5082
942k
        len += scalar_utf8_start_char(*s++);
5083
942k
    }
5084
393k
    return len;
5085
393k
}
5086
5087
static Py_ssize_t
5088
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
5089
9.58M
{
5090
9.58M
#if SIZEOF_SIZE_T <= SIZEOF_VOID_P
5091
9.58M
    if (_Py_IS_ALIGNED(start, ALIGNOF_SIZE_T)
5092
9.48M
        && _Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T))
5093
4.43M
    {
5094
        /* Fast path, see in STRINGLIB(utf8_decode) for
5095
           an explanation. */
5096
4.43M
        const char *p = start;
5097
4.43M
        Py_UCS1 *q = dest;
5098
8.30M
        while (p + SIZEOF_SIZE_T <= end) {
5099
4.84M
            size_t value = *(const size_t *) p;
5100
4.84M
            if (value & ASCII_CHAR_MASK)
5101
987k
                break;
5102
3.86M
            *((size_t *)q) = value;
5103
3.86M
            p += SIZEOF_SIZE_T;
5104
3.86M
            q += SIZEOF_SIZE_T;
5105
3.86M
        }
5106
7.40M
        while (p < end) {
5107
3.98M
            if ((unsigned char)*p & 0x80)
5108
1.01M
                break;
5109
2.96M
            *q++ = *p++;
5110
2.96M
        }
5111
4.43M
        return p - start;
5112
4.43M
    }
5113
5.14M
#endif
5114
5.14M
    Py_ssize_t pos = find_first_nonascii((const unsigned char*)start,
5115
5.14M
                                         (const unsigned char*)end);
5116
5.14M
    memcpy(dest, start, pos);
5117
5.14M
    return pos;
5118
9.58M
}
5119
5120
static int
5121
unicode_decode_utf8_impl(_PyUnicodeWriter *writer,
5122
                         const char *starts, const char *s, const char *end,
5123
                         _Py_error_handler error_handler,
5124
                         const char *errors,
5125
                         Py_ssize_t *consumed)
5126
944k
{
5127
944k
    Py_ssize_t startinpos, endinpos;
5128
944k
    const char *errmsg = "";
5129
944k
    PyObject *error_handler_obj = NULL;
5130
944k
    PyObject *exc = NULL;
5131
5132
193M
    while (s < end) {
5133
193M
        Py_UCS4 ch;
5134
193M
        int kind = writer->kind;
5135
5136
193M
        if (kind == PyUnicode_1BYTE_KIND) {
5137
942k
            if (PyUnicode_IS_ASCII(writer->buffer))
5138
548k
                ch = asciilib_utf8_decode(&s, end, writer->data, &writer->pos);
5139
394k
            else
5140
394k
                ch = ucs1lib_utf8_decode(&s, end, writer->data, &writer->pos);
5141
192M
        } else if (kind == PyUnicode_2BYTE_KIND) {
5142
91.9M
            ch = ucs2lib_utf8_decode(&s, end, writer->data, &writer->pos);
5143
100M
        } else {
5144
100M
            assert(kind == PyUnicode_4BYTE_KIND);
5145
100M
            ch = ucs4lib_utf8_decode(&s, end, writer->data, &writer->pos);
5146
100M
        }
5147
5148
193M
        switch (ch) {
5149
736k
        case 0:
5150
736k
            if (s == end || consumed)
5151
708k
                goto End;
5152
28.4k
            errmsg = "unexpected end of data";
5153
28.4k
            startinpos = s - starts;
5154
28.4k
            endinpos = end - starts;
5155
28.4k
            break;
5156
145M
        case 1:
5157
145M
            errmsg = "invalid start byte";
5158
145M
            startinpos = s - starts;
5159
145M
            endinpos = startinpos + 1;
5160
145M
            break;
5161
45.1M
        case 2:
5162
45.1M
            if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5163
0
                && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5164
0
            {
5165
                /* Truncated surrogate code in range D800-DFFF */
5166
0
                goto End;
5167
0
            }
5168
45.1M
            _Py_FALLTHROUGH;
5169
46.6M
        case 3:
5170
46.7M
        case 4:
5171
46.7M
            errmsg = "invalid continuation byte";
5172
46.7M
            startinpos = s - starts;
5173
46.7M
            endinpos = startinpos + ch - 1;
5174
46.7M
            break;
5175
452k
        default:
5176
            // ch doesn't fit into kind, so change the buffer kind to write
5177
            // the character
5178
452k
            if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
5179
0
                goto onError;
5180
452k
            continue;
5181
193M
        }
5182
5183
192M
        if (error_handler == _Py_ERROR_UNKNOWN)
5184
245k
            error_handler = _Py_GetErrorHandler(errors);
5185
5186
192M
        switch (error_handler) {
5187
0
        case _Py_ERROR_IGNORE:
5188
0
            s += (endinpos - startinpos);
5189
0
            break;
5190
5191
183M
        case _Py_ERROR_REPLACE:
5192
183M
            if (_PyUnicodeWriter_WriteCharInline(writer, 0xfffd) < 0)
5193
0
                goto onError;
5194
183M
            s += (endinpos - startinpos);
5195
183M
            break;
5196
5197
8.57M
        case _Py_ERROR_SURROGATEESCAPE:
5198
8.57M
        {
5199
8.57M
            Py_ssize_t i;
5200
5201
8.57M
            if (_PyUnicodeWriter_PrepareKind(writer, PyUnicode_2BYTE_KIND) < 0)
5202
0
                goto onError;
5203
17.1M
            for (i=startinpos; i<endinpos; i++) {
5204
8.57M
                ch = (Py_UCS4)(unsigned char)(starts[i]);
5205
8.57M
                PyUnicode_WRITE(writer->kind, writer->data, writer->pos,
5206
8.57M
                                ch + 0xdc00);
5207
8.57M
                writer->pos++;
5208
8.57M
            }
5209
8.57M
            s += (endinpos - startinpos);
5210
8.57M
            break;
5211
8.57M
        }
5212
5213
1.16k
        default:
5214
1.16k
            if (unicode_decode_call_errorhandler_writer(
5215
1.16k
                    errors, &error_handler_obj,
5216
1.16k
                    "utf-8", errmsg,
5217
1.16k
                    &starts, &end, &startinpos, &endinpos, &exc, &s,
5218
1.16k
                    writer)) {
5219
1.16k
                goto onError;
5220
1.16k
            }
5221
5222
0
            if (_PyUnicodeWriter_Prepare(writer, end - s, 127) < 0) {
5223
0
                return -1;
5224
0
            }
5225
192M
        }
5226
192M
    }
5227
5228
943k
End:
5229
943k
    if (consumed)
5230
1.13k
        *consumed = s - starts;
5231
5232
943k
    Py_XDECREF(error_handler_obj);
5233
943k
    Py_XDECREF(exc);
5234
943k
    return 0;
5235
5236
1.16k
onError:
5237
1.16k
    Py_XDECREF(error_handler_obj);
5238
1.16k
    Py_XDECREF(exc);
5239
1.16k
    return -1;
5240
944k
}
5241
5242
5243
static PyObject *
5244
unicode_decode_utf8(const char *s, Py_ssize_t size,
5245
                    _Py_error_handler error_handler, const char *errors,
5246
                    Py_ssize_t *consumed)
5247
22.0M
{
5248
22.0M
    if (size == 0) {
5249
65.5k
        if (consumed) {
5250
0
            *consumed = 0;
5251
0
        }
5252
65.5k
        _Py_RETURN_UNICODE_EMPTY();
5253
65.5k
    }
5254
5255
    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
5256
21.9M
    if (size == 1 && (unsigned char)s[0] < 128) {
5257
3.04M
        if (consumed) {
5258
0
            *consumed = 1;
5259
0
        }
5260
3.04M
        return get_latin1_char((unsigned char)s[0]);
5261
3.04M
    }
5262
5263
    // I don't know this check is necessary or not. But there is a test
5264
    // case that requires size=PY_SSIZE_T_MAX cause MemoryError.
5265
18.9M
    if (PY_SSIZE_T_MAX - sizeof(PyCompactUnicodeObject) < (size_t)size) {
5266
0
        PyErr_NoMemory();
5267
0
        return NULL;
5268
0
    }
5269
5270
18.9M
    const char *starts = s;
5271
18.9M
    const char *end = s + size;
5272
5273
18.9M
    Py_ssize_t pos = find_first_nonascii((const unsigned char*)starts, (const unsigned char*)end);
5274
18.9M
    if (pos == size) {  // fast path: ASCII string.
5275
18.0M
        PyObject *u = PyUnicode_New(size, 127);
5276
18.0M
        if (u == NULL) {
5277
0
            return NULL;
5278
0
        }
5279
18.0M
        memcpy(PyUnicode_1BYTE_DATA(u), s, size);
5280
18.0M
        if (consumed) {
5281
0
            *consumed = size;
5282
0
        }
5283
18.0M
        return u;
5284
18.0M
    }
5285
5286
894k
    int maxchr = 127;
5287
894k
    Py_ssize_t maxsize = size;
5288
5289
894k
    unsigned char ch = (unsigned char)(s[pos]);
5290
    // error handler other than strict may remove/replace the invalid byte.
5291
    // consumed != NULL allows 1~3 bytes remainings.
5292
    // 0x80 <= ch < 0xc2 is invalid start byte that cause UnicodeDecodeError.
5293
    // otherwise: check the input and decide the maxchr and maxsize to reduce
5294
    // reallocation and copy.
5295
894k
    if (error_handler == _Py_ERROR_STRICT && !consumed && ch >= 0xc2) {
5296
        // we only calculate the number of codepoints and don't determine the exact maxchr.
5297
        // This is because writing fast and portable SIMD code to find maxchr is difficult.
5298
        // If reallocation occurs for a larger maxchar, knowing the exact number of codepoints
5299
        // means that it is no longer necessary to allocate several times the required amount
5300
        // of memory.
5301
393k
        maxsize = utf8_count_codepoints((const unsigned char *)s, (const unsigned char *)end);
5302
393k
        if (ch < 0xc4) { // latin1
5303
271k
            maxchr = 0xff;
5304
271k
        }
5305
121k
        else if (ch < 0xf0) { // ucs2
5306
111k
            maxchr = 0xffff;
5307
111k
        }
5308
10.8k
        else { // ucs4
5309
10.8k
            maxchr = 0x10ffff;
5310
10.8k
        }
5311
393k
    }
5312
894k
    PyObject *u = PyUnicode_New(maxsize, maxchr);
5313
894k
    if (!u) {
5314
0
        return NULL;
5315
0
    }
5316
5317
    // Use _PyUnicodeWriter after fast path is failed.
5318
894k
    _PyUnicodeWriter writer;
5319
894k
    _PyUnicodeWriter_InitWithBuffer(&writer, u);
5320
894k
    if (maxchr <= 255) {
5321
772k
        memcpy(PyUnicode_1BYTE_DATA(u), s, pos);
5322
772k
        s += pos;
5323
772k
        writer.pos = pos;
5324
772k
    }
5325
5326
894k
    if (unicode_decode_utf8_impl(&writer, starts, s, end,
5327
894k
                                 error_handler, errors,
5328
894k
                                 consumed) < 0) {
5329
1.16k
        _PyUnicodeWriter_Dealloc(&writer);
5330
1.16k
        return NULL;
5331
1.16k
    }
5332
893k
    return _PyUnicodeWriter_Finish(&writer);
5333
894k
}
5334
5335
5336
// Used by PyUnicodeWriter_WriteUTF8() implementation
5337
int
5338
_PyUnicode_DecodeUTF8Writer(_PyUnicodeWriter *writer,
5339
                            const char *s, Py_ssize_t size,
5340
                            _Py_error_handler error_handler, const char *errors,
5341
                            Py_ssize_t *consumed)
5342
5.17M
{
5343
5.17M
    if (size == 0) {
5344
9.21k
        if (consumed) {
5345
0
            *consumed = 0;
5346
0
        }
5347
9.21k
        return 0;
5348
9.21k
    }
5349
5350
    // fast path: try ASCII string.
5351
5.16M
    if (_PyUnicodeWriter_Prepare(writer, size, 127) < 0) {
5352
0
        return -1;
5353
0
    }
5354
5355
5.16M
    const char *starts = s;
5356
5.16M
    const char *end = s + size;
5357
5.16M
    Py_ssize_t decoded = 0;
5358
5.16M
    Py_UCS1 *dest = (Py_UCS1*)writer->data + writer->pos * writer->kind;
5359
5.16M
    if (writer->kind == PyUnicode_1BYTE_KIND) {
5360
5.16M
        decoded = ascii_decode(s, end, dest);
5361
5.16M
        writer->pos += decoded;
5362
5363
5.16M
        if (decoded == size) {
5364
5.11M
            if (consumed) {
5365
854
                *consumed = size;
5366
854
            }
5367
5.11M
            return 0;
5368
5.11M
        }
5369
47.2k
        s += decoded;
5370
47.2k
    }
5371
5372
49.9k
    return unicode_decode_utf8_impl(writer, starts, s, end,
5373
49.9k
                                    error_handler, errors, consumed);
5374
5.16M
}
5375
5376
5377
PyObject *
5378
PyUnicode_DecodeUTF8Stateful(const char *s,
5379
                             Py_ssize_t size,
5380
                             const char *errors,
5381
                             Py_ssize_t *consumed)
5382
21.8M
{
5383
21.8M
    return unicode_decode_utf8(s, size,
5384
21.8M
                               errors ? _Py_ERROR_UNKNOWN : _Py_ERROR_STRICT,
5385
21.8M
                               errors, consumed);
5386
21.8M
}
5387
5388
5389
/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5390
   non-zero, use strict error handler otherwise.
5391
5392
   On success, write a pointer to a newly allocated wide character string into
5393
   *wstr (use PyMem_RawFree() to free the memory) and write the output length
5394
   (in number of wchar_t units) into *wlen (if wlen is set).
5395
5396
   On memory allocation failure, return -1.
5397
5398
   On decoding error (if surrogateescape is zero), return -2. If wlen is
5399
   non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5400
   is not NULL, write the decoding error message into *reason. */
5401
int
5402
_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
5403
                 const char **reason, _Py_error_handler errors)
5404
9.18k
{
5405
9.18k
    const char *orig_s = s;
5406
9.18k
    const char *e;
5407
9.18k
    wchar_t *unicode;
5408
9.18k
    Py_ssize_t outpos;
5409
5410
9.18k
    int surrogateescape = 0;
5411
9.18k
    int surrogatepass = 0;
5412
9.18k
    switch (errors)
5413
9.18k
    {
5414
0
    case _Py_ERROR_STRICT:
5415
0
        break;
5416
9.18k
    case _Py_ERROR_SURROGATEESCAPE:
5417
9.18k
        surrogateescape = 1;
5418
9.18k
        break;
5419
0
    case _Py_ERROR_SURROGATEPASS:
5420
0
        surrogatepass = 1;
5421
0
        break;
5422
0
    default:
5423
0
        return -3;
5424
9.18k
    }
5425
5426
    /* Note: size will always be longer than the resulting Unicode
5427
       character count */
5428
9.18k
    if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1 < size) {
5429
0
        return -1;
5430
0
    }
5431
5432
9.18k
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
5433
9.18k
    if (!unicode) {
5434
0
        return -1;
5435
0
    }
5436
5437
    /* Unpack UTF-8 encoded data */
5438
9.18k
    e = s + size;
5439
9.18k
    outpos = 0;
5440
9.18k
    while (s < e) {
5441
9.18k
        Py_UCS4 ch;
5442
9.18k
#if SIZEOF_WCHAR_T == 4
5443
9.18k
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
5444
#else
5445
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
5446
#endif
5447
9.18k
        if (ch > 0xFF) {
5448
0
#if SIZEOF_WCHAR_T == 4
5449
0
            Py_UNREACHABLE();
5450
#else
5451
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
5452
            /* write a surrogate pair */
5453
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5454
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5455
#endif
5456
0
        }
5457
9.18k
        else {
5458
9.18k
            if (!ch && s == e) {
5459
9.18k
                break;
5460
9.18k
            }
5461
5462
0
            if (surrogateescape) {
5463
0
                unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5464
0
            }
5465
0
            else {
5466
                /* Is it a valid three-byte code? */
5467
0
                if (surrogatepass
5468
0
                    && (e - s) >= 3
5469
0
                    && (s[0] & 0xf0) == 0xe0
5470
0
                    && (s[1] & 0xc0) == 0x80
5471
0
                    && (s[2] & 0xc0) == 0x80)
5472
0
                {
5473
0
                    ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5474
0
                    s += 3;
5475
0
                    unicode[outpos++] = ch;
5476
0
                }
5477
0
                else {
5478
0
                    PyMem_RawFree(unicode );
5479
0
                    if (reason != NULL) {
5480
0
                        switch (ch) {
5481
0
                        case 0:
5482
0
                            *reason = "unexpected end of data";
5483
0
                            break;
5484
0
                        case 1:
5485
0
                            *reason = "invalid start byte";
5486
0
                            break;
5487
                        /* 2, 3, 4 */
5488
0
                        default:
5489
0
                            *reason = "invalid continuation byte";
5490
0
                            break;
5491
0
                        }
5492
0
                    }
5493
0
                    if (wlen != NULL) {
5494
0
                        *wlen = s - orig_s;
5495
0
                    }
5496
0
                    return -2;
5497
0
                }
5498
0
            }
5499
0
        }
5500
9.18k
    }
5501
9.18k
    unicode[outpos] = L'\0';
5502
9.18k
    if (wlen) {
5503
9.18k
        *wlen = outpos;
5504
9.18k
    }
5505
9.18k
    *wstr = unicode;
5506
9.18k
    return 0;
5507
9.18k
}
5508
5509
5510
wchar_t*
5511
_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5512
                               size_t *wlen)
5513
0
{
5514
0
    wchar_t *wstr;
5515
0
    int res = _Py_DecodeUTF8Ex(arg, arglen,
5516
0
                               &wstr, wlen,
5517
0
                               NULL, _Py_ERROR_SURROGATEESCAPE);
5518
0
    if (res != 0) {
5519
        /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5520
0
        assert(res != -3);
5521
0
        if (wlen) {
5522
0
            *wlen = (size_t)res;
5523
0
        }
5524
0
        return NULL;
5525
0
    }
5526
0
    return wstr;
5527
0
}
5528
5529
5530
/* UTF-8 encoder.
5531
5532
   On success, return 0 and write the newly allocated character string (use
5533
   PyMem_Free() to free the memory) into *str.
5534
5535
   On encoding failure, return -2 and write the position of the invalid
5536
   surrogate character into *error_pos (if error_pos is set) and the decoding
5537
   error message into *reason (if reason is set).
5538
5539
   On memory allocation failure, return -1. */
5540
int
5541
_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
5542
                 const char **reason, int raw_malloc, _Py_error_handler errors)
5543
1.08k
{
5544
1.08k
    const Py_ssize_t max_char_size = 4;
5545
1.08k
    Py_ssize_t len = wcslen(text);
5546
5547
1.08k
    assert(len >= 0);
5548
5549
1.08k
    int surrogateescape = 0;
5550
1.08k
    int surrogatepass = 0;
5551
1.08k
    switch (errors)
5552
1.08k
    {
5553
112
    case _Py_ERROR_STRICT:
5554
112
        break;
5555
976
    case _Py_ERROR_SURROGATEESCAPE:
5556
976
        surrogateescape = 1;
5557
976
        break;
5558
0
    case _Py_ERROR_SURROGATEPASS:
5559
0
        surrogatepass = 1;
5560
0
        break;
5561
0
    default:
5562
0
        return -3;
5563
1.08k
    }
5564
5565
1.08k
    if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5566
0
        return -1;
5567
0
    }
5568
1.08k
    char *bytes;
5569
1.08k
    if (raw_malloc) {
5570
1.08k
        bytes = PyMem_RawMalloc((len + 1) * max_char_size);
5571
1.08k
    }
5572
0
    else {
5573
0
        bytes = PyMem_Malloc((len + 1) * max_char_size);
5574
0
    }
5575
1.08k
    if (bytes == NULL) {
5576
0
        return -1;
5577
0
    }
5578
5579
1.08k
    char *p = bytes;
5580
1.08k
    Py_ssize_t i;
5581
70.8k
    for (i = 0; i < len; ) {
5582
69.7k
        Py_ssize_t ch_pos = i;
5583
69.7k
        Py_UCS4 ch = text[i];
5584
69.7k
        i++;
5585
#if Py_UNICODE_SIZE == 2
5586
        if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5587
            && i < len
5588
            && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5589
        {
5590
            ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5591
            i++;
5592
        }
5593
#endif
5594
5595
69.7k
        if (ch < 0x80) {
5596
            /* Encode ASCII */
5597
69.7k
            *p++ = (char) ch;
5598
5599
69.7k
        }
5600
0
        else if (ch < 0x0800) {
5601
            /* Encode Latin-1 */
5602
0
            *p++ = (char)(0xc0 | (ch >> 6));
5603
0
            *p++ = (char)(0x80 | (ch & 0x3f));
5604
0
        }
5605
0
        else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
5606
            /* surrogateescape error handler */
5607
0
            if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
5608
0
                if (error_pos != NULL) {
5609
0
                    *error_pos = (size_t)ch_pos;
5610
0
                }
5611
0
                if (reason != NULL) {
5612
0
                    *reason = "encoding error";
5613
0
                }
5614
0
                if (raw_malloc) {
5615
0
                    PyMem_RawFree(bytes);
5616
0
                }
5617
0
                else {
5618
0
                    PyMem_Free(bytes);
5619
0
                }
5620
0
                return -2;
5621
0
            }
5622
0
            *p++ = (char)(ch & 0xff);
5623
0
        }
5624
0
        else if (ch < 0x10000) {
5625
0
            *p++ = (char)(0xe0 | (ch >> 12));
5626
0
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5627
0
            *p++ = (char)(0x80 | (ch & 0x3f));
5628
0
        }
5629
0
        else {  /* ch >= 0x10000 */
5630
0
            assert(ch <= MAX_UNICODE);
5631
            /* Encode UCS4 Unicode ordinals */
5632
0
            *p++ = (char)(0xf0 | (ch >> 18));
5633
0
            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5634
0
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5635
0
            *p++ = (char)(0x80 | (ch & 0x3f));
5636
0
        }
5637
69.7k
    }
5638
1.08k
    *p++ = '\0';
5639
5640
1.08k
    size_t final_size = (p - bytes);
5641
1.08k
    char *bytes2;
5642
1.08k
    if (raw_malloc) {
5643
1.08k
        bytes2 = PyMem_RawRealloc(bytes, final_size);
5644
1.08k
    }
5645
0
    else {
5646
0
        bytes2 = PyMem_Realloc(bytes, final_size);
5647
0
    }
5648
1.08k
    if (bytes2 == NULL) {
5649
0
        if (error_pos != NULL) {
5650
0
            *error_pos = (size_t)-1;
5651
0
        }
5652
0
        if (raw_malloc) {
5653
0
            PyMem_RawFree(bytes);
5654
0
        }
5655
0
        else {
5656
0
            PyMem_Free(bytes);
5657
0
        }
5658
0
        return -1;
5659
0
    }
5660
1.08k
    *str = bytes2;
5661
1.08k
    return 0;
5662
1.08k
}
5663
5664
5665
/* Primary internal function which creates utf8 encoded bytes objects.
5666
5667
   Allocation strategy:  if the string is short, convert into a stack buffer
5668
   and allocate exactly as much space needed at the end.  Else allocate the
5669
   maximum possible needed (4 result bytes per Unicode character), and return
5670
   the excess memory at the end.
5671
*/
5672
static PyObject *
5673
unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5674
                    const char *errors)
5675
18.1M
{
5676
18.1M
    if (!PyUnicode_Check(unicode)) {
5677
0
        PyErr_BadArgument();
5678
0
        return NULL;
5679
0
    }
5680
5681
18.1M
    if (PyUnicode_UTF8(unicode))
5682
10.4M
        return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5683
10.4M
                                         PyUnicode_UTF8_LENGTH(unicode));
5684
5685
7.77M
    int kind = PyUnicode_KIND(unicode);
5686
7.77M
    const void *data = PyUnicode_DATA(unicode);
5687
7.77M
    Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5688
5689
7.77M
    PyBytesWriter *writer;
5690
7.77M
    char *end;
5691
5692
7.77M
    switch (kind) {
5693
0
    default:
5694
0
        Py_UNREACHABLE();
5695
5.86M
    case PyUnicode_1BYTE_KIND:
5696
        /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5697
5.86M
        assert(!PyUnicode_IS_ASCII(unicode));
5698
5.86M
        writer = ucs1lib_utf8_encoder(unicode, data, size,
5699
5.86M
                                      error_handler, errors, &end);
5700
5.86M
        break;
5701
1.86M
    case PyUnicode_2BYTE_KIND:
5702
1.86M
        writer = ucs2lib_utf8_encoder(unicode, data, size,
5703
1.86M
                                      error_handler, errors, &end);
5704
1.86M
        break;
5705
57.2k
    case PyUnicode_4BYTE_KIND:
5706
57.2k
        writer = ucs4lib_utf8_encoder(unicode, data, size,
5707
57.2k
                                      error_handler, errors, &end);
5708
57.2k
        break;
5709
7.77M
    }
5710
5711
7.77M
    if (writer == NULL) {
5712
187k
        PyBytesWriter_Discard(writer);
5713
187k
        return NULL;
5714
187k
    }
5715
7.59M
    return PyBytesWriter_FinishWithPointer(writer, end);
5716
7.77M
}
5717
5718
static int
5719
unicode_fill_utf8(PyObject *unicode)
5720
171k
{
5721
171k
    _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(unicode);
5722
    /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5723
171k
    assert(!PyUnicode_IS_ASCII(unicode));
5724
5725
171k
    int kind = PyUnicode_KIND(unicode);
5726
171k
    const void *data = PyUnicode_DATA(unicode);
5727
171k
    Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5728
5729
171k
    PyBytesWriter *writer;
5730
171k
    char *end;
5731
5732
171k
    switch (kind) {
5733
0
    default:
5734
0
        Py_UNREACHABLE();
5735
137k
    case PyUnicode_1BYTE_KIND:
5736
137k
        writer = ucs1lib_utf8_encoder(unicode, data, size,
5737
137k
                                      _Py_ERROR_STRICT, NULL, &end);
5738
137k
        break;
5739
28.6k
    case PyUnicode_2BYTE_KIND:
5740
28.6k
        writer = ucs2lib_utf8_encoder(unicode, data, size,
5741
28.6k
                                      _Py_ERROR_STRICT, NULL, &end);
5742
28.6k
        break;
5743
5.94k
    case PyUnicode_4BYTE_KIND:
5744
5.94k
        writer = ucs4lib_utf8_encoder(unicode, data, size,
5745
5.94k
                                      _Py_ERROR_STRICT, NULL, &end);
5746
5.94k
        break;
5747
171k
    }
5748
171k
    if (writer == NULL) {
5749
224
        return -1;
5750
224
    }
5751
5752
171k
    const char *start = PyBytesWriter_GetData(writer);
5753
171k
    Py_ssize_t len = end - start;
5754
5755
171k
    char *cache = PyMem_Malloc(len + 1);
5756
171k
    if (cache == NULL) {
5757
0
        PyBytesWriter_Discard(writer);
5758
0
        PyErr_NoMemory();
5759
0
        return -1;
5760
0
    }
5761
171k
    memcpy(cache, start, len);
5762
171k
    cache[len] = '\0';
5763
171k
    PyUnicode_SET_UTF8_LENGTH(unicode, len);
5764
171k
    PyUnicode_SET_UTF8(unicode, cache);
5765
171k
    PyBytesWriter_Discard(writer);
5766
171k
    return 0;
5767
171k
}
5768
5769
/* Encode 'unicode' to a UTF-8 bytes object through unicode_encode_utf8(),
   passing _Py_ERROR_UNKNOWN as the handler code together with the caller's
   'errors' string. */
PyObject *
_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
{
    const _Py_error_handler handler = _Py_ERROR_UNKNOWN;
    return unicode_encode_utf8(unicode, handler, errors);
}
5774
5775
5776
/* Encode 'unicode' to a UTF-8 bytes object by calling
   _PyUnicode_AsUTF8String() with a NULL 'errors' argument. */
PyObject *
PyUnicode_AsUTF8String(PyObject *unicode)
{
    const char *errors = NULL;
    return _PyUnicode_AsUTF8String(unicode, errors);
}
5781
5782
/* --- UTF-32 Codec ------------------------------------------------------- */
5783
5784
PyObject *
5785
PyUnicode_DecodeUTF32(const char *s,
5786
                      Py_ssize_t size,
5787
                      const char *errors,
5788
                      int *byteorder)
5789
89
{
5790
89
    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5791
89
}
5792
5793
PyObject *
5794
PyUnicode_DecodeUTF32Stateful(const char *s,
5795
                              Py_ssize_t size,
5796
                              const char *errors,
5797
                              int *byteorder,
5798
                              Py_ssize_t *consumed)
5799
42.6k
{
5800
42.6k
    const char *starts = s;
5801
42.6k
    Py_ssize_t startinpos;
5802
42.6k
    Py_ssize_t endinpos;
5803
42.6k
    _PyUnicodeWriter writer;
5804
42.6k
    const unsigned char *q, *e;
5805
42.6k
    int le, bo = 0;       /* assume native ordering by default */
5806
42.6k
    const char *encoding;
5807
42.6k
    const char *errmsg = "";
5808
42.6k
    PyObject *errorHandler = NULL;
5809
42.6k
    PyObject *exc = NULL;
5810
5811
42.6k
    q = (const unsigned char *)s;
5812
42.6k
    e = q + size;
5813
5814
42.6k
    if (byteorder)
5815
42.5k
        bo = *byteorder;
5816
5817
    /* Check for BOM marks (U+FEFF) in the input and adjust current
5818
       byte order setting accordingly. In native mode, the leading BOM
5819
       mark is skipped, in all other modes, it is copied to the output
5820
       stream as-is (giving a ZWNBSP character). */
5821
42.6k
    if (bo == 0 && size >= 4) {
5822
40.3k
        Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5823
40.3k
        if (bom == 0x0000FEFF) {
5824
147
            bo = -1;
5825
147
            q += 4;
5826
147
        }
5827
40.2k
        else if (bom == 0xFFFE0000) {
5828
240
            bo = 1;
5829
240
            q += 4;
5830
240
        }
5831
40.3k
        if (byteorder)
5832
40.2k
            *byteorder = bo;
5833
40.3k
    }
5834
5835
42.6k
    if (q == e) {
5836
100
        if (consumed)
5837
0
            *consumed = size;
5838
100
        _Py_RETURN_UNICODE_EMPTY();
5839
100
    }
5840
5841
#ifdef WORDS_BIGENDIAN
5842
    le = bo < 0;
5843
#else
5844
42.5k
    le = bo <= 0;
5845
42.5k
#endif
5846
42.5k
    encoding = le ? "utf-32-le" : "utf-32-be";
5847
5848
42.5k
    _PyUnicodeWriter_Init(&writer);
5849
42.5k
    writer.min_length = (e - q + 3) / 4;
5850
42.5k
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
5851
0
        goto onError;
5852
5853
129k
    while (1) {
5854
129k
        Py_UCS4 ch = 0;
5855
129k
        Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
5856
5857
129k
        if (e - q >= 4) {
5858
104k
            int kind = writer.kind;
5859
104k
            void *data = writer.data;
5860
104k
            const unsigned char *last = e - 4;
5861
104k
            Py_ssize_t pos = writer.pos;
5862
104k
            if (le) {
5863
123k
                do {
5864
123k
                    ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5865
123k
                    if (ch > maxch)
5866
100k
                        break;
5867
23.3k
                    if (kind != PyUnicode_1BYTE_KIND &&
5868
7.50k
                        Py_UNICODE_IS_SURROGATE(ch))
5869
179
                        break;
5870
23.1k
                    PyUnicode_WRITE(kind, data, pos++, ch);
5871
23.1k
                    q += 4;
5872
23.1k
                } while (q <= last);
5873
101k
            }
5874
3.26k
            else {
5875
6.26k
                do {
5876
6.26k
                    ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5877
6.26k
                    if (ch > maxch)
5878
3.03k
                        break;
5879
3.23k
                    if (kind != PyUnicode_1BYTE_KIND &&
5880
2.65k
                        Py_UNICODE_IS_SURROGATE(ch))
5881
99
                        break;
5882
3.13k
                    PyUnicode_WRITE(kind, data, pos++, ch);
5883
3.13k
                    q += 4;
5884
3.13k
                } while (q <= last);
5885
3.26k
            }
5886
104k
            writer.pos = pos;
5887
104k
        }
5888
5889
129k
        if (Py_UNICODE_IS_SURROGATE(ch)) {
5890
280
            errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
5891
280
            startinpos = ((const char *)q) - starts;
5892
280
            endinpos = startinpos + 4;
5893
280
        }
5894
129k
        else if (ch <= maxch) {
5895
26.6k
            if (q == e || consumed)
5896
3.77k
                break;
5897
            /* remaining bytes at the end? (size should be divisible by 4) */
5898
22.8k
            errmsg = "truncated data";
5899
22.8k
            startinpos = ((const char *)q) - starts;
5900
22.8k
            endinpos = ((const char *)e) - starts;
5901
22.8k
        }
5902
103k
        else {
5903
103k
            if (ch < 0x110000) {
5904
4.13k
                if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5905
0
                    goto onError;
5906
4.13k
                q += 4;
5907
4.13k
                continue;
5908
4.13k
            }
5909
98.9k
            errmsg = "code point not in range(0x110000)";
5910
98.9k
            startinpos = ((const char *)q) - starts;
5911
98.9k
            endinpos = startinpos + 4;
5912
98.9k
        }
5913
5914
        /* The remaining input chars are ignored if the callback
5915
           chooses to skip the input */
5916
122k
        if (unicode_decode_call_errorhandler_writer(
5917
122k
                errors, &errorHandler,
5918
122k
                encoding, errmsg,
5919
122k
                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
5920
122k
                &writer))
5921
38.7k
            goto onError;
5922
122k
    }
5923
5924
3.77k
    if (consumed)
5925
0
        *consumed = (const char *)q-starts;
5926
5927
3.77k
    Py_XDECREF(errorHandler);
5928
3.77k
    Py_XDECREF(exc);
5929
3.77k
    return _PyUnicodeWriter_Finish(&writer);
5930
5931
38.7k
  onError:
5932
38.7k
    _PyUnicodeWriter_Dealloc(&writer);
5933
38.7k
    Py_XDECREF(errorHandler);
5934
38.7k
    Py_XDECREF(exc);
5935
38.7k
    return NULL;
5936
42.5k
}
5937
5938
/* Encode 'str' to a UTF-32 bytes object.

   byteorder: -1 selects "utf-32-le", 1 selects "utf-32-be"; 0 selects
     "utf-32", which prepends a BOM (0xFEFF) and uses native byte order.
   errors: error handler name used when the string contains characters the
     per-kind encoder stops on (reported as "surrogates not allowed").

   Return a new bytes object, or NULL on error: PyErr_BadArgument() if
   'str' is not a str, PyErr_NoMemory() if the output size would overflow,
   or whatever the allocation / error-handler failure left set. */
PyObject *
_PyUnicode_EncodeUTF32(PyObject *str,
                       const char *errors,
                       int byteorder)
{
    if (!PyUnicode_Check(str)) {
        PyErr_BadArgument();
        return NULL;
    }

    const int kind = PyUnicode_KIND(str);
    const void *data = PyUnicode_DATA(str);
    const Py_ssize_t len = PyUnicode_GET_LENGTH(str);
    /* One extra 32-bit unit is needed for the BOM when byteorder is 0. */
    const int bom_units = (byteorder == 0);

    if (len > PY_SSIZE_T_MAX / 4 - bom_units) {
        return PyErr_NoMemory();
    }
    const Py_ssize_t nsize = len + bom_units;

#if PY_LITTLE_ENDIAN
    const int native_ordering = byteorder <= 0;
#else
    const int native_ordering = byteorder >= 0;
#endif

    if (kind == PyUnicode_1BYTE_KIND) {
        // gh-139156: Don't use PyBytesWriter API here since it has an overhead
        // on short strings
        PyObject *result = PyBytes_FromStringAndSize(NULL, nsize * 4);
        if (result == NULL) {
            return NULL;
        }

        /* output buffer is 4-bytes aligned */
        assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(result), 4));
        uint32_t *dst = (uint32_t *)PyBytes_AS_STRING(result);
        if (bom_units) {
            *dst++ = 0xFEFF;
        }
        if (len > 0) {
            ucs1lib_utf32_encode((const Py_UCS1 *)data, len,
                                 &dst, native_ordering);
        }
        return result;
    }

    PyBytesWriter *writer = PyBytesWriter_Create(nsize * 4);
    if (writer == NULL) {
        return NULL;
    }

    /* output buffer is 4-bytes aligned */
    assert(_Py_IS_ALIGNED(PyBytesWriter_GetData(writer), 4));
    uint32_t *out = (uint32_t *)PyBytesWriter_GetData(writer);
    if (bom_units) {
        *out++ = 0xFEFF;
    }
    if (len == 0) {
        return PyBytesWriter_Finish(writer);
    }

    /* Codec name reported to the error handler. */
    const char *encoding = (byteorder == -1) ? "utf-32-le"
                         : (byteorder == 1) ? "utf-32-be"
                         : "utf-32";

    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *rep = NULL;

    Py_ssize_t pos = 0;
    while (pos < len) {
        /* Encode as far as the per-kind encoder gets; it reports how many
           characters it handled. */
        if (kind == PyUnicode_2BYTE_KIND) {
            pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        else {
            assert(kind == PyUnicode_4BYTE_KIND);
            pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        if (pos == len) {
            break;
        }

        /* The encoder stopped before the end: ask the error handler for a
           replacement of the character at 'pos'. */
        Py_ssize_t newpos;
        rep = unicode_encode_call_errorhandler(
                errors, &errorHandler,
                encoding, "surrogates not allowed",
                str, &exc, pos, pos + 1, &newpos);
        if (!rep) {
            goto error;
        }

        /* A bytes replacement must be a whole number of 4-byte units; a
           str replacement must be ASCII. */
        Py_ssize_t repsize, moreunits;
        const int rep_is_bytes = PyBytes_Check(rep);
        int rep_usable;
        if (rep_is_bytes) {
            repsize = PyBytes_GET_SIZE(rep);
            rep_usable = ((repsize & 3) == 0);
            moreunits = repsize / 4;
        }
        else {
            assert(PyUnicode_Check(rep));
            repsize = PyUnicode_GET_LENGTH(rep);
            moreunits = repsize;
            rep_usable = (PyUnicode_IS_ASCII(rep) != 0);
        }
        if (!rep_usable) {
            raise_encode_exception(&exc, encoding,
                                   str, pos, pos + 1,
                                   "surrogates not allowed");
            goto error;
        }
        moreunits += pos - newpos;
        pos = newpos;

        /* four bytes are reserved for each surrogate */
        if (moreunits > 0) {
            out = PyBytesWriter_GrowAndUpdatePointer(writer, 4 * moreunits, out);
            if (out == NULL) {
                goto error;
            }
        }

        if (rep_is_bytes) {
            memcpy(out, PyBytes_AS_STRING(rep), repsize);
            out += repsize / 4;
        }
        else {
            /* rep is unicode */
            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
            ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
                                 &out, native_ordering);
        }

        Py_CLEAR(rep);
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);

    /* Cut back to size actually needed. This is necessary for, for example,
       encoding of a string containing isolated surrogates and the 'ignore'
       handler is used. */
    return PyBytesWriter_FinishWithPointer(writer, out);

  error:
    Py_XDECREF(rep);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    PyBytesWriter_Discard(writer);
    return NULL;
}
6091
6092
/* Encode 'unicode' as UTF-32 by calling _PyUnicode_EncodeUTF32() with a
   NULL 'errors' argument and byteorder 0 (BOM written, native byte
   order). */
PyObject *
PyUnicode_AsUTF32String(PyObject *unicode)
{
    const char *errors = NULL;
    const int byteorder = 0;
    return _PyUnicode_EncodeUTF32(unicode, errors, byteorder);
}
6097
6098
/* --- UTF-16 Codec ------------------------------------------------------- */
6099
6100
PyObject *
6101
PyUnicode_DecodeUTF16(const char *s,
6102
                      Py_ssize_t size,
6103
                      const char *errors,
6104
                      int *byteorder)
6105
92
{
6106
92
    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
6107
92
}
6108
6109
PyObject *
6110
PyUnicode_DecodeUTF16Stateful(const char *s,
6111
                              Py_ssize_t size,
6112
                              const char *errors,
6113
                              int *byteorder,
6114
                              Py_ssize_t *consumed)
6115
13.5k
{
6116
13.5k
    const char *starts = s;
6117
13.5k
    Py_ssize_t startinpos;
6118
13.5k
    Py_ssize_t endinpos;
6119
13.5k
    _PyUnicodeWriter writer;
6120
13.5k
    const unsigned char *q, *e;
6121
13.5k
    int bo = 0;       /* assume native ordering by default */
6122
13.5k
    int native_ordering;
6123
13.5k
    const char *errmsg = "";
6124
13.5k
    PyObject *errorHandler = NULL;
6125
13.5k
    PyObject *exc = NULL;
6126
13.5k
    const char *encoding;
6127
6128
13.5k
    q = (const unsigned char *)s;
6129
13.5k
    e = q + size;
6130
6131
13.5k
    if (byteorder)
6132
13.4k
        bo = *byteorder;
6133
6134
    /* Check for BOM marks (U+FEFF) in the input and adjust current
6135
       byte order setting accordingly. In native mode, the leading BOM
6136
       mark is skipped, in all other modes, it is copied to the output
6137
       stream as-is (giving a ZWNBSP character). */
6138
13.5k
    if (bo == 0 && size >= 2) {
6139
12.8k
        const Py_UCS4 bom = (q[1] << 8) | q[0];
6140
12.8k
        if (bom == 0xFEFF) {
6141
382
            q += 2;
6142
382
            bo = -1;
6143
382
        }
6144
12.4k
        else if (bom == 0xFFFE) {
6145
2.08k
            q += 2;
6146
2.08k
            bo = 1;
6147
2.08k
        }
6148
12.8k
        if (byteorder)
6149
12.7k
            *byteorder = bo;
6150
12.8k
    }
6151
6152
13.5k
    if (q == e) {
6153
72
        if (consumed)
6154
0
            *consumed = size;
6155
72
        _Py_RETURN_UNICODE_EMPTY();
6156
72
    }
6157
6158
13.4k
#if PY_LITTLE_ENDIAN
6159
13.4k
    native_ordering = bo <= 0;
6160
13.4k
    encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
6161
#else
6162
    native_ordering = bo >= 0;
6163
    encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
6164
#endif
6165
6166
    /* Note: size will always be longer than the resulting Unicode
6167
       character count normally.  Error handler will take care of
6168
       resizing when needed. */
6169
13.4k
    _PyUnicodeWriter_Init(&writer);
6170
13.4k
    writer.min_length = (e - q + 1) / 2;
6171
13.4k
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
6172
0
        goto onError;
6173
6174
53.3k
    while (1) {
6175
53.3k
        Py_UCS4 ch = 0;
6176
53.3k
        if (e - q >= 2) {
6177
46.1k
            int kind = writer.kind;
6178
46.1k
            if (kind == PyUnicode_1BYTE_KIND) {
6179
16.1k
                if (PyUnicode_IS_ASCII(writer.buffer))
6180
12.9k
                    ch = asciilib_utf16_decode(&q, e,
6181
12.9k
                            (Py_UCS1*)writer.data, &writer.pos,
6182
12.9k
                            native_ordering);
6183
3.27k
                else
6184
3.27k
                    ch = ucs1lib_utf16_decode(&q, e,
6185
3.27k
                            (Py_UCS1*)writer.data, &writer.pos,
6186
3.27k
                            native_ordering);
6187
29.9k
            } else if (kind == PyUnicode_2BYTE_KIND) {
6188
11.6k
                ch = ucs2lib_utf16_decode(&q, e,
6189
11.6k
                        (Py_UCS2*)writer.data, &writer.pos,
6190
11.6k
                        native_ordering);
6191
18.3k
            } else {
6192
18.3k
                assert(kind == PyUnicode_4BYTE_KIND);
6193
18.3k
                ch = ucs4lib_utf16_decode(&q, e,
6194
18.3k
                        (Py_UCS4*)writer.data, &writer.pos,
6195
18.3k
                        native_ordering);
6196
18.3k
            }
6197
46.1k
        }
6198
6199
53.3k
        switch (ch)
6200
53.3k
        {
6201
13.6k
        case 0:
6202
            /* remaining byte at the end? (size should be even) */
6203
13.6k
            if (q == e || consumed)
6204
8.79k
                goto End;
6205
4.84k
            errmsg = "truncated data";
6206
4.84k
            startinpos = ((const char *)q) - starts;
6207
4.84k
            endinpos = ((const char *)e) - starts;
6208
4.84k
            break;
6209
            /* The remaining input chars are ignored if the callback
6210
               chooses to skip the input */
6211
1.62k
        case 1:
6212
1.62k
            q -= 2;
6213
1.62k
            if (consumed)
6214
0
                goto End;
6215
1.62k
            errmsg = "unexpected end of data";
6216
1.62k
            startinpos = ((const char *)q) - starts;
6217
1.62k
            endinpos = ((const char *)e) - starts;
6218
1.62k
            break;
6219
16.3k
        case 2:
6220
16.3k
            errmsg = "illegal encoding";
6221
16.3k
            startinpos = ((const char *)q) - 2 - starts;
6222
16.3k
            endinpos = startinpos + 2;
6223
16.3k
            break;
6224
6.09k
        case 3:
6225
6.09k
            errmsg = "illegal UTF-16 surrogate";
6226
6.09k
            startinpos = ((const char *)q) - 4 - starts;
6227
6.09k
            endinpos = startinpos + 2;
6228
6.09k
            break;
6229
15.6k
        default:
6230
15.6k
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
6231
0
                goto onError;
6232
15.6k
            continue;
6233
53.3k
        }
6234
6235
28.9k
        if (unicode_decode_call_errorhandler_writer(
6236
28.9k
                errors,
6237
28.9k
                &errorHandler,
6238
28.9k
                encoding, errmsg,
6239
28.9k
                &starts,
6240
28.9k
                (const char **)&e,
6241
28.9k
                &startinpos,
6242
28.9k
                &endinpos,
6243
28.9k
                &exc,
6244
28.9k
                (const char **)&q,
6245
28.9k
                &writer))
6246
4.66k
            goto onError;
6247
28.9k
    }
6248
6249
8.79k
End:
6250
8.79k
    if (consumed)
6251
0
        *consumed = (const char *)q-starts;
6252
6253
8.79k
    Py_XDECREF(errorHandler);
6254
8.79k
    Py_XDECREF(exc);
6255
8.79k
    return _PyUnicodeWriter_Finish(&writer);
6256
6257
4.66k
  onError:
6258
4.66k
    _PyUnicodeWriter_Dealloc(&writer);
6259
4.66k
    Py_XDECREF(errorHandler);
6260
4.66k
    Py_XDECREF(exc);
6261
4.66k
    return NULL;
6262
13.4k
}
6263
6264
/* Encode the str object `str` as UTF-16 and return a new bytes object.

   byteorder < 0 selects little-endian output, byteorder > 0 big-endian
   output, and byteorder == 0 native byte order preceded by a BOM (U+FEFF).
   `errors` names the error handler invoked for code points the stringlib
   encoders refuse (reported as "surrogates not allowed").

   Returns NULL with an exception set on failure. */
PyObject *
_PyUnicode_EncodeUTF16(PyObject *str,
                       const char *errors,
                       int byteorder)
{
    if (!PyUnicode_Check(str)) {
        PyErr_BadArgument();
        return NULL;
    }

    const Py_ssize_t len = PyUnicode_GET_LENGTH(str);
    const int kind = PyUnicode_KIND(str);
    const void *data = PyUnicode_DATA(str);
    /* One extra 16-bit unit is needed for the BOM. */
    const int with_bom = (byteorder == 0);

    /* Code points >= 0x10000 take two 16-bit units; they are only counted
       for 4-byte strings. */
    Py_ssize_t pairs = 0;
    if (kind == PyUnicode_4BYTE_KIND) {
        const Py_UCS4 *chars = (const Py_UCS4 *)data;
        for (Py_ssize_t i = 0; i < len; i++) {
            if (chars[i] >= 0x10000) {
                pairs++;
            }
        }
    }

    /* The byte size is twice the unit count: refuse if that would overflow. */
    if (len > PY_SSIZE_T_MAX / 2 - pairs - with_bom) {
        return PyErr_NoMemory();
    }
    const Py_ssize_t nunits = len + pairs + with_bom;

#if PY_BIG_ENDIAN
    const int native_ordering = (byteorder >= 0);
#else
    const int native_ordering = (byteorder <= 0);
#endif

    if (kind == PyUnicode_1BYTE_KIND) {
        // gh-139156: Don't use PyBytesWriter API here since it has an overhead
        // on short strings
        PyObject *result = PyBytes_FromStringAndSize(NULL, nunits * 2);
        if (result == NULL) {
            return NULL;
        }

        /* The bytes payload must be 2-byte aligned for the 16-bit stores. */
        assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(result), 2));
        unsigned short *dst = (unsigned short *)PyBytes_AS_STRING(result);
        if (with_bom) {
            *dst++ = 0xFEFF;
        }
        if (len > 0) {
            ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &dst,
                                 native_ordering);
        }
        return result;
    }

    PyBytesWriter *writer = PyBytesWriter_Create(nunits * 2);
    if (writer == NULL) {
        return NULL;
    }

    /* The writer buffer must be 2-byte aligned as well. */
    assert(_Py_IS_ALIGNED(PyBytesWriter_GetData(writer), 2));
    unsigned short *out = PyBytesWriter_GetData(writer);
    if (with_bom) {
        *out++ = 0xFEFF;
    }
    if (len == 0) {
        return PyBytesWriter_Finish(writer);
    }

    /* Codec name reported to the error handler. */
    const char *encoding = (byteorder < 0) ? "utf-16-le"
                         : (byteorder > 0) ? "utf-16-be"
                         : "utf-16";

    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *rep = NULL;
    Py_ssize_t pos = 0;

    while (pos < len) {
        /* Advance by whatever the stringlib encoder reports; if that does
           not reach the end, the character at `pos` needs the error
           handler. */
        Py_ssize_t step;
        if (kind == PyUnicode_2BYTE_KIND) {
            step = ucs2lib_utf16_encode((const Py_UCS2 *)data + pos,
                                        len - pos, &out, native_ordering);
        }
        else {
            assert(kind == PyUnicode_4BYTE_KIND);
            step = ucs4lib_utf16_encode((const Py_UCS4 *)data + pos,
                                        len - pos, &out, native_ordering);
        }
        pos += step;
        if (pos == len) {
            break;
        }

        Py_ssize_t newpos;
        rep = unicode_encode_call_errorhandler(
                errors, &errorHandler,
                encoding, "surrogates not allowed",
                str, &exc, pos, pos + 1, &newpos);
        if (rep == NULL) {
            goto error;
        }

        /* A bytes replacement must hold whole 16-bit units; a str
           replacement must be pure ASCII. */
        const int rep_is_bytes = PyBytes_Check(rep);
        Py_ssize_t repsize;
        Py_ssize_t moreunits;
        int rep_ok;
        if (rep_is_bytes) {
            repsize = PyBytes_GET_SIZE(rep);
            rep_ok = !(repsize & 1);
            moreunits = repsize / 2;
        }
        else {
            assert(PyUnicode_Check(rep));
            repsize = PyUnicode_GET_LENGTH(rep);
            rep_ok = PyUnicode_IS_ASCII(rep);
            moreunits = repsize;
        }
        if (!rep_ok) {
            raise_encode_exception(&exc, encoding,
                                   str, pos, pos + 1,
                                   "surrogates not allowed");
            goto error;
        }

        /* Units already reserved for the skipped input characters offset
           the size of the replacement. */
        moreunits += pos - newpos;
        pos = newpos;

        if (moreunits > 0) {
            out = PyBytesWriter_GrowAndUpdatePointer(writer, 2 * moreunits,
                                                     out);
            if (out == NULL) {
                goto error;
            }
        }

        if (rep_is_bytes) {
            memcpy(out, PyBytes_AS_STRING(rep), repsize);
            out += repsize / 2;
        }
        else {
            /* ASCII str replacement: widen each character to one unit. */
            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
            ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
                                 &out, native_ordering);
        }

        Py_CLEAR(rep);
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);

    /* Trim to the bytes actually written: an error handler such as
       'ignore' can make the output shorter than the initial estimate. */
    return PyBytesWriter_FinishWithPointer(writer, out);

  error:
    Py_XDECREF(rep);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    PyBytesWriter_Discard(writer);
    return NULL;
}
6429
6430
/* Encode `unicode` as UTF-16 with the default error handling (errors ==
   NULL) and byteorder 0, i.e. native byte order with a leading BOM.
   Returns a new bytes object, or NULL with an exception set. */
PyObject *
PyUnicode_AsUTF16String(PyObject *unicode)
{
    const char *errors = NULL;
    const int byteorder = 0;

    return _PyUnicode_EncodeUTF16(unicode, errors, byteorder);
}
6435
6436
_PyUnicode_Name_CAPI *
6437
_PyUnicode_GetNameCAPI(void)
6438
1.56k
{
6439
1.56k
    PyInterpreterState *interp = _PyInterpreterState_GET();
6440
1.56k
    _PyUnicode_Name_CAPI *ucnhash_capi;
6441
6442
1.56k
    ucnhash_capi = _Py_atomic_load_ptr(&interp->unicode.ucnhash_capi);
6443
1.56k
    if (ucnhash_capi == NULL) {
6444
1
        ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6445
1
                PyUnicodeData_CAPSULE_NAME, 1);
6446
6447
        // It's fine if we overwrite the value here. It's always the same value.
6448
1
        _Py_atomic_store_ptr(&interp->unicode.ucnhash_capi, ucnhash_capi);
6449
1
    }
6450
1.56k
    return ucnhash_capi;
6451
1.56k
}
6452
6453
/* --- Unicode Escape Codec ----------------------------------------------- */
6454
6455
PyObject *
6456
_PyUnicode_DecodeUnicodeEscapeInternal2(const char *s,
6457
                               Py_ssize_t size,
6458
                               const char *errors,
6459
                               Py_ssize_t *consumed,
6460
                               int *first_invalid_escape_char,
6461
                               const char **first_invalid_escape_ptr)
6462
30.6k
{
6463
30.6k
    const char *starts = s;
6464
30.6k
    const char *initial_starts = starts;
6465
30.6k
    _PyUnicodeWriter writer;
6466
30.6k
    const char *end;
6467
30.6k
    PyObject *errorHandler = NULL;
6468
30.6k
    PyObject *exc = NULL;
6469
30.6k
    _PyUnicode_Name_CAPI *ucnhash_capi;
6470
6471
    // so we can remember if we've seen an invalid escape char or not
6472
30.6k
    *first_invalid_escape_char = -1;
6473
30.6k
    *first_invalid_escape_ptr = NULL;
6474
6475
30.6k
    if (size == 0) {
6476
1.82k
        if (consumed) {
6477
0
            *consumed = 0;
6478
0
        }
6479
1.82k
        _Py_RETURN_UNICODE_EMPTY();
6480
1.82k
    }
6481
    /* Escaped strings will always be longer than the resulting
6482
       Unicode string, so we start with size here and then reduce the
6483
       length after conversion to the true value.
6484
       (but if the error callback returns a long replacement string
6485
       we'll have to allocate more space) */
6486
28.8k
    _PyUnicodeWriter_Init(&writer);
6487
28.8k
    writer.min_length = size;
6488
28.8k
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6489
0
        goto onError;
6490
0
    }
6491
6492
28.8k
    end = s + size;
6493
172k
    while (s < end) {
6494
144k
        unsigned char c = (unsigned char) *s++;
6495
144k
        Py_UCS4 ch;
6496
144k
        int count;
6497
144k
        const char *message;
6498
6499
144k
#define WRITE_ASCII_CHAR(ch)                                                  \
6500
144k
            do {                                                              \
6501
15.1k
                assert(ch <= 127);                                            \
6502
15.1k
                assert(writer.pos < writer.size);                             \
6503
15.1k
                PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch);  \
6504
15.1k
            } while(0)
6505
6506
144k
#define WRITE_CHAR(ch)                                                        \
6507
144k
            do {                                                              \
6508
133k
                if (ch <= writer.maxchar) {                                   \
6509
118k
                    assert(writer.pos < writer.size);                         \
6510
118k
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6511
118k
                }                                                             \
6512
133k
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6513
0
                    goto onError;                                             \
6514
0
                }                                                             \
6515
133k
            } while(0)
6516
6517
        /* Non-escape characters are interpreted as Unicode ordinals */
6518
144k
        if (c != '\\') {
6519
93.8k
            WRITE_CHAR(c);
6520
93.8k
            continue;
6521
93.8k
        }
6522
6523
50.1k
        Py_ssize_t startinpos = s - starts - 1;
6524
        /* \ - Escapes */
6525
50.1k
        if (s >= end) {
6526
0
            message = "\\ at end of string";
6527
0
            goto incomplete;
6528
0
        }
6529
50.1k
        c = (unsigned char) *s++;
6530
6531
50.1k
        assert(writer.pos < writer.size);
6532
50.1k
        switch (c) {
6533
6534
            /* \x escapes */
6535
671
        case '\n': continue;
6536
1.47k
        case '\\': WRITE_ASCII_CHAR('\\'); continue;
6537
898
        case '\'': WRITE_ASCII_CHAR('\''); continue;
6538
1.16k
        case '\"': WRITE_ASCII_CHAR('\"'); continue;
6539
1.02k
        case 'b': WRITE_ASCII_CHAR('\b'); continue;
6540
        /* FF */
6541
734
        case 'f': WRITE_ASCII_CHAR('\014'); continue;
6542
902
        case 't': WRITE_ASCII_CHAR('\t'); continue;
6543
981
        case 'n': WRITE_ASCII_CHAR('\n'); continue;
6544
1.41k
        case 'r': WRITE_ASCII_CHAR('\r'); continue;
6545
        /* VT */
6546
827
        case 'v': WRITE_ASCII_CHAR('\013'); continue;
6547
        /* BEL, not classic C */
6548
698
        case 'a': WRITE_ASCII_CHAR('\007'); continue;
6549
6550
            /* \OOO (octal) escapes */
6551
3.77k
        case '0': case '1': case '2': case '3':
6552
6.73k
        case '4': case '5': case '6': case '7':
6553
6.73k
            ch = c - '0';
6554
6.73k
            if (s < end && '0' <= *s && *s <= '7') {
6555
2.48k
                ch = (ch<<3) + *s++ - '0';
6556
2.48k
                if (s < end && '0' <= *s && *s <= '7') {
6557
1.25k
                    ch = (ch<<3) + *s++ - '0';
6558
1.25k
                }
6559
2.48k
            }
6560
6.73k
            if (ch > 0377) {
6561
1.07k
                if (*first_invalid_escape_char == -1) {
6562
750
                    *first_invalid_escape_char = ch;
6563
750
                    if (starts == initial_starts) {
6564
                        /* Back up 3 chars, since we've already incremented s. */
6565
750
                        *first_invalid_escape_ptr = s - 3;
6566
750
                    }
6567
750
                }
6568
1.07k
            }
6569
6.73k
            WRITE_CHAR(ch);
6570
6.73k
            continue;
6571
6572
            /* hex escapes */
6573
            /* \xXX */
6574
6.73k
        case 'x':
6575
5.99k
            count = 2;
6576
5.99k
            message = "truncated \\xXX escape";
6577
5.99k
            goto hexescape;
6578
6579
            /* \uXXXX */
6580
9.24k
        case 'u':
6581
9.24k
            count = 4;
6582
9.24k
            message = "truncated \\uXXXX escape";
6583
9.24k
            goto hexescape;
6584
6585
            /* \UXXXXXXXX */
6586
10.7k
        case 'U':
6587
10.7k
            count = 8;
6588
10.7k
            message = "truncated \\UXXXXXXXX escape";
6589
26.0k
        hexescape:
6590
161k
            for (ch = 0; count; ++s, --count) {
6591
135k
                if (s >= end) {
6592
6
                    goto incomplete;
6593
6
                }
6594
135k
                c = (unsigned char)*s;
6595
135k
                ch <<= 4;
6596
135k
                if (c >= '0' && c <= '9') {
6597
104k
                    ch += c - '0';
6598
104k
                }
6599
31.2k
                else if (c >= 'a' && c <= 'f') {
6600
30.9k
                    ch += c - ('a' - 10);
6601
30.9k
                }
6602
252
                else if (c >= 'A' && c <= 'F') {
6603
245
                    ch += c - ('A' - 10);
6604
245
                }
6605
7
                else {
6606
7
                    goto error;
6607
7
                }
6608
135k
            }
6609
6610
            /* when we get here, ch is a 32-bit unicode character */
6611
26.0k
            if (ch > MAX_UNICODE) {
6612
1
                message = "illegal Unicode character";
6613
1
                goto error;
6614
1
            }
6615
6616
26.0k
            WRITE_CHAR(ch);
6617
26.0k
            continue;
6618
6619
            /* \N{name} */
6620
26.0k
        case 'N':
6621
1.56k
            ucnhash_capi = _PyUnicode_GetNameCAPI();
6622
1.56k
            if (ucnhash_capi == NULL) {
6623
0
                PyErr_SetString(
6624
0
                        PyExc_UnicodeError,
6625
0
                        "\\N escapes not supported (can't load unicodedata module)"
6626
0
                );
6627
0
                goto onError;
6628
0
            }
6629
6630
1.56k
            message = "malformed \\N character escape";
6631
1.56k
            if (s >= end) {
6632
4
                goto incomplete;
6633
4
            }
6634
1.55k
            if (*s == '{') {
6635
1.55k
                const char *start = ++s;
6636
1.55k
                size_t namelen;
6637
                /* look for the closing brace */
6638
20.3k
                while (s < end && *s != '}')
6639
18.7k
                    s++;
6640
1.55k
                if (s >= end) {
6641
11
                    goto incomplete;
6642
11
                }
6643
1.54k
                namelen = s - start;
6644
1.54k
                if (namelen) {
6645
                    /* found a name.  look it up in the unicode database */
6646
1.54k
                    s++;
6647
1.54k
                    ch = 0xffffffff; /* in case 'getcode' messes up */
6648
1.54k
                    if (namelen <= INT_MAX &&
6649
1.54k
                        ucnhash_capi->getcode(start, (int)namelen,
6650
1.54k
                                              &ch, 0)) {
6651
1.47k
                        assert(ch <= MAX_UNICODE);
6652
1.47k
                        WRITE_CHAR(ch);
6653
1.47k
                        continue;
6654
1.47k
                    }
6655
66
                    message = "unknown Unicode character name";
6656
66
                }
6657
1.54k
            }
6658
71
            goto error;
6659
6660
5.02k
        default:
6661
5.02k
            if (*first_invalid_escape_char == -1) {
6662
3.73k
                *first_invalid_escape_char = c;
6663
3.73k
                if (starts == initial_starts) {
6664
                    /* Back up one char, since we've already incremented s. */
6665
3.73k
                    *first_invalid_escape_ptr = s - 1;
6666
3.73k
                }
6667
3.73k
            }
6668
5.02k
            WRITE_ASCII_CHAR('\\');
6669
5.02k
            WRITE_CHAR(c);
6670
5.02k
            continue;
6671
50.1k
        }
6672
6673
21
      incomplete:
6674
21
        if (consumed) {
6675
0
            *consumed = startinpos;
6676
0
            break;
6677
0
        }
6678
100
      error:;
6679
100
        Py_ssize_t endinpos = s-starts;
6680
100
        writer.min_length = end - s + writer.pos;
6681
100
        if (unicode_decode_call_errorhandler_writer(
6682
100
                errors, &errorHandler,
6683
100
                "unicodeescape", message,
6684
100
                &starts, &end, &startinpos, &endinpos, &exc, &s,
6685
100
                &writer)) {
6686
100
            goto onError;
6687
100
        }
6688
100
        assert(end - s <= writer.size - writer.pos);
6689
6690
0
#undef WRITE_ASCII_CHAR
6691
0
#undef WRITE_CHAR
6692
0
    }
6693
6694
28.7k
    Py_XDECREF(errorHandler);
6695
28.7k
    Py_XDECREF(exc);
6696
28.7k
    return _PyUnicodeWriter_Finish(&writer);
6697
6698
100
  onError:
6699
100
    _PyUnicodeWriter_Dealloc(&writer);
6700
100
    Py_XDECREF(errorHandler);
6701
100
    Py_XDECREF(exc);
6702
100
    return NULL;
6703
28.8k
}
6704
6705
PyObject *
6706
_PyUnicode_DecodeUnicodeEscapeStateful(const char *s,
6707
                              Py_ssize_t size,
6708
                              const char *errors,
6709
                              Py_ssize_t *consumed)
6710
0
{
6711
0
    int first_invalid_escape_char;
6712
0
    const char *first_invalid_escape_ptr;
6713
0
    PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal2(s, size, errors,
6714
0
                                                      consumed,
6715
0
                                                      &first_invalid_escape_char,
6716
0
                                                      &first_invalid_escape_ptr);
6717
0
    if (result == NULL)
6718
0
        return NULL;
6719
0
    if (first_invalid_escape_char != -1) {
6720
0
        if (first_invalid_escape_char > 0xff) {
6721
0
            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6722
0
                                 "\"\\%o\" is an invalid octal escape sequence. "
6723
0
                                 "Such sequences will not work in the future. ",
6724
0
                                 first_invalid_escape_char) < 0)
6725
0
            {
6726
0
                Py_DECREF(result);
6727
0
                return NULL;
6728
0
            }
6729
0
        }
6730
0
        else {
6731
0
            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6732
0
                                 "\"\\%c\" is an invalid escape sequence. "
6733
0
                                 "Such sequences will not work in the future. ",
6734
0
                                 first_invalid_escape_char) < 0)
6735
0
            {
6736
0
                Py_DECREF(result);
6737
0
                return NULL;
6738
0
            }
6739
0
        }
6740
0
    }
6741
0
    return result;
6742
0
}
6743
6744
PyObject *
6745
PyUnicode_DecodeUnicodeEscape(const char *s,
6746
                              Py_ssize_t size,
6747
                              const char *errors)
6748
0
{
6749
0
    return _PyUnicode_DecodeUnicodeEscapeStateful(s, size, errors, NULL);
6750
0
}
6751
6752
/* Return a Unicode-Escape string version of the Unicode object. */
6753
6754
PyObject *
6755
PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
6756
311k
{
6757
311k
    if (!PyUnicode_Check(unicode)) {
6758
0
        PyErr_BadArgument();
6759
0
        return NULL;
6760
0
    }
6761
6762
311k
    Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
6763
311k
    if (len == 0) {
6764
0
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
6765
0
    }
6766
311k
    int kind = PyUnicode_KIND(unicode);
6767
311k
    const void *data = PyUnicode_DATA(unicode);
6768
6769
    /* Initial allocation is based on the longest-possible character
6770
     * escape.
6771
     *
6772
     * For UCS1 strings it's '\xxx', 4 bytes per source character.
6773
     * For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6774
     * For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character. */
6775
311k
    Py_ssize_t expandsize = kind * 2 + 2;
6776
311k
    if (len > PY_SSIZE_T_MAX / expandsize) {
6777
0
        return PyErr_NoMemory();
6778
0
    }
6779
6780
311k
    PyBytesWriter *writer = PyBytesWriter_Create(expandsize * len);
6781
311k
    if (writer == NULL) {
6782
0
        return NULL;
6783
0
    }
6784
311k
    char *p = PyBytesWriter_GetData(writer);
6785
6786
622k
    for (Py_ssize_t i = 0; i < len; i++) {
6787
311k
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6788
6789
        /* U+0000-U+00ff range */
6790
311k
        if (ch < 0x100) {
6791
304k
            if (ch >= ' ' && ch < 127) {
6792
23.8k
                if (ch != '\\') {
6793
                    /* Copy printable US ASCII as-is */
6794
0
                    *p++ = (char) ch;
6795
0
                }
6796
                /* Escape backslashes */
6797
23.8k
                else {
6798
23.8k
                    *p++ = '\\';
6799
23.8k
                    *p++ = '\\';
6800
23.8k
                }
6801
23.8k
            }
6802
6803
            /* Map special whitespace to '\t', \n', '\r' */
6804
280k
            else if (ch == '\t') {
6805
2.81k
                *p++ = '\\';
6806
2.81k
                *p++ = 't';
6807
2.81k
            }
6808
278k
            else if (ch == '\n') {
6809
4.27k
                *p++ = '\\';
6810
4.27k
                *p++ = 'n';
6811
4.27k
            }
6812
273k
            else if (ch == '\r') {
6813
508
                *p++ = '\\';
6814
508
                *p++ = 'r';
6815
508
            }
6816
6817
            /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6818
273k
            else {
6819
273k
                *p++ = '\\';
6820
273k
                *p++ = 'x';
6821
273k
                *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6822
273k
                *p++ = Py_hexdigits[ch & 0x000F];
6823
273k
            }
6824
304k
        }
6825
        /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
6826
6.67k
        else if (ch < 0x10000) {
6827
5.56k
            *p++ = '\\';
6828
5.56k
            *p++ = 'u';
6829
5.56k
            *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6830
5.56k
            *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6831
5.56k
            *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6832
5.56k
            *p++ = Py_hexdigits[ch & 0x000F];
6833
5.56k
        }
6834
        /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6835
1.10k
        else {
6836
6837
            /* Make sure that the first two digits are zero */
6838
1.10k
            assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6839
1.10k
            *p++ = '\\';
6840
1.10k
            *p++ = 'U';
6841
1.10k
            *p++ = '0';
6842
1.10k
            *p++ = '0';
6843
1.10k
            *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6844
1.10k
            *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6845
1.10k
            *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6846
1.10k
            *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6847
1.10k
            *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6848
1.10k
            *p++ = Py_hexdigits[ch & 0x0000000F];
6849
1.10k
        }
6850
311k
    }
6851
6852
311k
    return PyBytesWriter_FinishWithPointer(writer, p);
6853
311k
}
6854
6855
/* --- Raw Unicode Escape Codec ------------------------------------------- */
6856
6857
PyObject *
6858
_PyUnicode_DecodeRawUnicodeEscapeStateful(const char *s,
6859
                                          Py_ssize_t size,
6860
                                          const char *errors,
6861
                                          Py_ssize_t *consumed)
6862
0
{
6863
0
    const char *starts = s;
6864
0
    _PyUnicodeWriter writer;
6865
0
    const char *end;
6866
0
    PyObject *errorHandler = NULL;
6867
0
    PyObject *exc = NULL;
6868
6869
0
    if (size == 0) {
6870
0
        if (consumed) {
6871
0
            *consumed = 0;
6872
0
        }
6873
0
        _Py_RETURN_UNICODE_EMPTY();
6874
0
    }
6875
6876
    /* Escaped strings will always be longer than the resulting
6877
       Unicode string, so we start with size here and then reduce the
6878
       length after conversion to the true value. (But decoding error
6879
       handler might have to resize the string) */
6880
0
    _PyUnicodeWriter_Init(&writer);
6881
0
    writer.min_length = size;
6882
0
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6883
0
        goto onError;
6884
0
    }
6885
6886
0
    end = s + size;
6887
0
    while (s < end) {
6888
0
        unsigned char c = (unsigned char) *s++;
6889
0
        Py_UCS4 ch;
6890
0
        int count;
6891
0
        const char *message;
6892
6893
0
#define WRITE_CHAR(ch)                                                        \
6894
0
            do {                                                              \
6895
0
                if (ch <= writer.maxchar) {                                   \
6896
0
                    assert(writer.pos < writer.size);                         \
6897
0
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6898
0
                }                                                             \
6899
0
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6900
0
                    goto onError;                                             \
6901
0
                }                                                             \
6902
0
            } while(0)
6903
6904
        /* Non-escape characters are interpreted as Unicode ordinals */
6905
0
        if (c != '\\' || (s >= end && !consumed)) {
6906
0
            WRITE_CHAR(c);
6907
0
            continue;
6908
0
        }
6909
6910
0
        Py_ssize_t startinpos = s - starts - 1;
6911
        /* \ - Escapes */
6912
0
        if (s >= end) {
6913
0
            assert(consumed);
6914
            // Set message to silent compiler warning.
6915
            // Actually it is never used.
6916
0
            message = "\\ at end of string";
6917
0
            goto incomplete;
6918
0
        }
6919
6920
0
        c = (unsigned char) *s++;
6921
0
        if (c == 'u') {
6922
0
            count = 4;
6923
0
            message = "truncated \\uXXXX escape";
6924
0
        }
6925
0
        else if (c == 'U') {
6926
0
            count = 8;
6927
0
            message = "truncated \\UXXXXXXXX escape";
6928
0
        }
6929
0
        else {
6930
0
            assert(writer.pos < writer.size);
6931
0
            PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6932
0
            WRITE_CHAR(c);
6933
0
            continue;
6934
0
        }
6935
6936
        /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6937
0
        for (ch = 0; count; ++s, --count) {
6938
0
            if (s >= end) {
6939
0
                goto incomplete;
6940
0
            }
6941
0
            c = (unsigned char)*s;
6942
0
            ch <<= 4;
6943
0
            if (c >= '0' && c <= '9') {
6944
0
                ch += c - '0';
6945
0
            }
6946
0
            else if (c >= 'a' && c <= 'f') {
6947
0
                ch += c - ('a' - 10);
6948
0
            }
6949
0
            else if (c >= 'A' && c <= 'F') {
6950
0
                ch += c - ('A' - 10);
6951
0
            }
6952
0
            else {
6953
0
                goto error;
6954
0
            }
6955
0
        }
6956
0
        if (ch > MAX_UNICODE) {
6957
0
            message = "\\Uxxxxxxxx out of range";
6958
0
            goto error;
6959
0
        }
6960
0
        WRITE_CHAR(ch);
6961
0
        continue;
6962
6963
0
      incomplete:
6964
0
        if (consumed) {
6965
0
            *consumed = startinpos;
6966
0
            break;
6967
0
        }
6968
0
      error:;
6969
0
        Py_ssize_t endinpos = s-starts;
6970
0
        writer.min_length = end - s + writer.pos;
6971
0
        if (unicode_decode_call_errorhandler_writer(
6972
0
                errors, &errorHandler,
6973
0
                "rawunicodeescape", message,
6974
0
                &starts, &end, &startinpos, &endinpos, &exc, &s,
6975
0
                &writer)) {
6976
0
            goto onError;
6977
0
        }
6978
0
        assert(end - s <= writer.size - writer.pos);
6979
6980
0
#undef WRITE_CHAR
6981
0
    }
6982
0
    Py_XDECREF(errorHandler);
6983
0
    Py_XDECREF(exc);
6984
0
    return _PyUnicodeWriter_Finish(&writer);
6985
6986
0
  onError:
6987
0
    _PyUnicodeWriter_Dealloc(&writer);
6988
0
    Py_XDECREF(errorHandler);
6989
0
    Py_XDECREF(exc);
6990
0
    return NULL;
6991
0
}
6992
6993
PyObject *
6994
PyUnicode_DecodeRawUnicodeEscape(const char *s,
6995
                                 Py_ssize_t size,
6996
                                 const char *errors)
6997
0
{
6998
0
    return _PyUnicode_DecodeRawUnicodeEscapeStateful(s, size, errors, NULL);
6999
0
}
7000
7001
7002
/* Encode a str object with the raw-unicode-escape codec.

   Code points below U+0100 are copied as a single byte, U+0100..U+FFFF
   become '\uHHHH' and anything above becomes '\U00HHHHHH'.  Returns the
   resulting bytes object, or NULL on failure. */
PyObject *
PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    const int kind = PyUnicode_KIND(unicode);
    const void *data = PyUnicode_DATA(unicode);
    const Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);

    if (len == 0) {
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
    }
    if (kind == PyUnicode_1BYTE_KIND) {
        /* Every character already is one output byte: copy the buffer. */
        return PyBytes_FromStringAndSize(data, len);
    }

    /* Worst-case output per character: 6 bytes ('\uHHHH') for a 2-byte
       string, 10 bytes ('\U00HHHHHH') for a 4-byte string. */
    const Py_ssize_t max_per_char = 2 * kind + 2;
    if (len > PY_SSIZE_T_MAX / max_per_char) {
        return PyErr_NoMemory();
    }

    PyBytesWriter *writer = PyBytesWriter_Create(max_per_char * len);
    if (writer == NULL) {
        return NULL;
    }
    char *out = PyBytesWriter_GetData(writer);

    Py_ssize_t idx = 0;
    while (idx < len) {
        const Py_UCS4 cp = PyUnicode_READ(kind, data, idx);
        idx++;

        if (cp < 0x100) {
            /* U+0000-U+00ff: emitted unchanged */
            *out++ = (char) cp;
            continue;
        }

        int shift;  /* bit offset of the most significant hex digit */
        *out++ = '\\';
        if (cp < 0x10000) {
            /* U+0100-U+ffff: four hex digits after '\u' */
            *out++ = 'u';
            shift = 12;
        }
        else {
            /* U+010000-U+10ffff: '\U00' followed by six hex digits */
            assert(cp <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
            *out++ = 'U';
            *out++ = '0';
            *out++ = '0';
            shift = 20;
        }
        for (; shift >= 0; shift -= 4) {
            *out++ = Py_hexdigits[(cp >> shift) & 0xf];
        }
    }

    return PyBytesWriter_FinishWithPointer(writer, out);
}
7066
7067
/* --- Latin-1 Codec ------------------------------------------------------ */
7068
7069
PyObject *
7070
PyUnicode_DecodeLatin1(const char *s,
7071
                       Py_ssize_t size,
7072
                       const char *errors)
7073
3.22M
{
7074
    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
7075
3.22M
    return _PyUnicode_FromUCS1((const unsigned char*)s, size);
7076
3.22M
}
7077
7078
/* create or adjust a UnicodeEncodeError */
7079
static void
7080
make_encode_exception(PyObject **exceptionObject,
7081
                      const char *encoding,
7082
                      PyObject *unicode,
7083
                      Py_ssize_t startpos, Py_ssize_t endpos,
7084
                      const char *reason)
7085
235k
{
7086
235k
    if (*exceptionObject == NULL) {
7087
235k
        *exceptionObject = PyObject_CallFunction(
7088
235k
            PyExc_UnicodeEncodeError, "sOnns",
7089
235k
            encoding, unicode, startpos, endpos, reason);
7090
235k
    }
7091
0
    else {
7092
0
        if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
7093
0
            goto onError;
7094
0
        if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
7095
0
            goto onError;
7096
0
        if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
7097
0
            goto onError;
7098
0
        return;
7099
0
      onError:
7100
0
        Py_CLEAR(*exceptionObject);
7101
0
    }
7102
235k
}
7103
7104
/* raises a UnicodeEncodeError */
7105
static void
7106
raise_encode_exception(PyObject **exceptionObject,
7107
                       const char *encoding,
7108
                       PyObject *unicode,
7109
                       Py_ssize_t startpos, Py_ssize_t endpos,
7110
                       const char *reason)
7111
37.6k
{
7112
37.6k
    make_encode_exception(exceptionObject,
7113
37.6k
                          encoding, unicode, startpos, endpos, reason);
7114
37.6k
    if (*exceptionObject != NULL)
7115
37.6k
        PyCodec_StrictErrors(*exceptionObject);
7116
37.6k
}
7117
7118
/* error handling callback helper:
7119
   build arguments, call the callback and check the arguments,
7120
   put the result into newpos and return the replacement string, which
7121
   has to be freed by the caller */
7122
static PyObject *
7123
unicode_encode_call_errorhandler(const char *errors,
7124
                                 PyObject **errorHandler,
7125
                                 const char *encoding, const char *reason,
7126
                                 PyObject *unicode, PyObject **exceptionObject,
7127
                                 Py_ssize_t startpos, Py_ssize_t endpos,
7128
                                 Py_ssize_t *newpos)
7129
197k
{
7130
197k
    static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
7131
197k
    Py_ssize_t len;
7132
197k
    PyObject *restuple;
7133
197k
    PyObject *resunicode;
7134
7135
197k
    if (*errorHandler == NULL) {
7136
197k
        *errorHandler = PyCodec_LookupError(errors);
7137
197k
        if (*errorHandler == NULL)
7138
0
            return NULL;
7139
197k
    }
7140
7141
197k
    len = PyUnicode_GET_LENGTH(unicode);
7142
7143
197k
    make_encode_exception(exceptionObject,
7144
197k
                          encoding, unicode, startpos, endpos, reason);
7145
197k
    if (*exceptionObject == NULL)
7146
0
        return NULL;
7147
7148
197k
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
7149
197k
    if (restuple == NULL)
7150
197k
        return NULL;
7151
0
    if (!PyTuple_Check(restuple)) {
7152
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
7153
0
        Py_DECREF(restuple);
7154
0
        return NULL;
7155
0
    }
7156
0
    if (!PyArg_ParseTuple(restuple, argparse,
7157
0
                          &resunicode, newpos)) {
7158
0
        Py_DECREF(restuple);
7159
0
        return NULL;
7160
0
    }
7161
0
    if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
7162
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
7163
0
        Py_DECREF(restuple);
7164
0
        return NULL;
7165
0
    }
7166
0
    if (*newpos<0)
7167
0
        *newpos = len + *newpos;
7168
0
    if (*newpos<0 || *newpos>len) {
7169
0
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7170
0
        Py_DECREF(restuple);
7171
0
        return NULL;
7172
0
    }
7173
0
    Py_INCREF(resunicode);
7174
0
    Py_DECREF(restuple);
7175
0
    return resunicode;
7176
0
}
7177
7178
static PyObject *
7179
unicode_encode_ucs1(PyObject *unicode,
7180
                    const char *errors,
7181
                    const Py_UCS4 limit)
7182
48.2k
{
7183
    /* input state */
7184
48.2k
    Py_ssize_t pos=0, size;
7185
48.2k
    int kind;
7186
48.2k
    const void *data;
7187
48.2k
    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
7188
48.2k
    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
7189
48.2k
    PyObject *error_handler_obj = NULL;
7190
48.2k
    PyObject *exc = NULL;
7191
48.2k
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
7192
48.2k
    PyObject *rep = NULL;
7193
7194
48.2k
    size = PyUnicode_GET_LENGTH(unicode);
7195
48.2k
    kind = PyUnicode_KIND(unicode);
7196
48.2k
    data = PyUnicode_DATA(unicode);
7197
    /* allocate enough for a simple encoding without
7198
       replacements, if we need more, we'll resize */
7199
48.2k
    if (size == 0)
7200
0
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
7201
7202
    /* output object */
7203
48.2k
    PyBytesWriter *writer = PyBytesWriter_Create(size);
7204
48.2k
    if (writer == NULL) {
7205
0
        return NULL;
7206
0
    }
7207
    /* pointer into the output */
7208
48.2k
    char *str = PyBytesWriter_GetData(writer);
7209
7210
4.88M
    while (pos < size) {
7211
4.88M
        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
7212
7213
        /* can we encode this? */
7214
4.88M
        if (ch < limit) {
7215
            /* no overflow check, because we know that the space is enough */
7216
4.84M
            *str++ = (char)ch;
7217
4.84M
            ++pos;
7218
4.84M
        }
7219
48.2k
        else {
7220
48.2k
            Py_ssize_t newpos, i;
7221
            /* startpos for collecting unencodable chars */
7222
48.2k
            Py_ssize_t collstart = pos;
7223
48.2k
            Py_ssize_t collend = collstart + 1;
7224
            /* find all unecodable characters */
7225
7226
332k
            while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
7227
284k
                ++collend;
7228
7229
            /* Only overallocate the buffer if it's not the last write */
7230
48.2k
            writer->overallocate = (collend < size);
7231
7232
            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
7233
48.2k
            if (error_handler == _Py_ERROR_UNKNOWN)
7234
48.2k
                error_handler = _Py_GetErrorHandler(errors);
7235
7236
48.2k
            switch (error_handler) {
7237
37.6k
            case _Py_ERROR_STRICT:
7238
37.6k
                raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
7239
37.6k
                goto onError;
7240
7241
0
            case _Py_ERROR_REPLACE:
7242
0
                memset(str, '?', collend - collstart);
7243
0
                str += (collend - collstart);
7244
0
                _Py_FALLTHROUGH;
7245
0
            case _Py_ERROR_IGNORE:
7246
0
                pos = collend;
7247
0
                break;
7248
7249
0
            case _Py_ERROR_BACKSLASHREPLACE:
7250
                /* subtract preallocated bytes */
7251
0
                writer->size -= (collend - collstart);
7252
0
                str = backslashreplace(writer, str,
7253
0
                                       unicode, collstart, collend);
7254
0
                if (str == NULL)
7255
0
                    goto onError;
7256
0
                pos = collend;
7257
0
                break;
7258
7259
0
            case _Py_ERROR_XMLCHARREFREPLACE:
7260
                /* subtract preallocated bytes */
7261
0
                writer->size -= (collend - collstart);
7262
0
                str = xmlcharrefreplace(writer, str,
7263
0
                                        unicode, collstart, collend);
7264
0
                if (str == NULL)
7265
0
                    goto onError;
7266
0
                pos = collend;
7267
0
                break;
7268
7269
10.6k
            case _Py_ERROR_SURROGATEESCAPE:
7270
10.6k
                for (i = collstart; i < collend; ++i) {
7271
10.6k
                    ch = PyUnicode_READ(kind, data, i);
7272
10.6k
                    if (ch < 0xdc80 || 0xdcff < ch) {
7273
                        /* Not a UTF-8b surrogate */
7274
10.6k
                        break;
7275
10.6k
                    }
7276
0
                    *str++ = (char)(ch - 0xdc00);
7277
0
                    ++pos;
7278
0
                }
7279
10.6k
                if (i >= collend)
7280
0
                    break;
7281
10.6k
                collstart = pos;
7282
10.6k
                assert(collstart != collend);
7283
10.6k
                _Py_FALLTHROUGH;
7284
7285
10.6k
            default:
7286
10.6k
                rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
7287
10.6k
                                                       encoding, reason, unicode, &exc,
7288
10.6k
                                                       collstart, collend, &newpos);
7289
10.6k
                if (rep == NULL)
7290
10.6k
                    goto onError;
7291
7292
0
                if (newpos < collstart) {
7293
0
                    writer->overallocate = 1;
7294
0
                    str = PyBytesWriter_GrowAndUpdatePointer(writer,
7295
0
                                                             collstart - newpos,
7296
0
                                                             str);
7297
0
                    if (str == NULL) {
7298
0
                        goto onError;
7299
0
                    }
7300
0
                }
7301
0
                else {
7302
                    /* subtract preallocated bytes */
7303
0
                    writer->size -= newpos - collstart;
7304
                    /* Only overallocate the buffer if it's not the last write */
7305
0
                    writer->overallocate = (newpos < size);
7306
0
                }
7307
7308
0
                char *rep_str;
7309
0
                Py_ssize_t rep_len;
7310
0
                if (PyBytes_Check(rep)) {
7311
                    /* Directly copy bytes result to output. */
7312
0
                    rep_str = PyBytes_AS_STRING(rep);
7313
0
                    rep_len = PyBytes_GET_SIZE(rep);
7314
0
                }
7315
0
                else {
7316
0
                    assert(PyUnicode_Check(rep));
7317
7318
0
                    if (limit == 256 ?
7319
0
                        PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
7320
0
                        !PyUnicode_IS_ASCII(rep))
7321
0
                    {
7322
                        /* Not all characters are smaller than limit */
7323
0
                        raise_encode_exception(&exc, encoding, unicode,
7324
0
                                               collstart, collend, reason);
7325
0
                        goto onError;
7326
0
                    }
7327
0
                    assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
7328
0
                    rep_str = PyUnicode_DATA(rep);
7329
0
                    rep_len = PyUnicode_GET_LENGTH(rep);
7330
0
                }
7331
7332
0
                str = PyBytesWriter_GrowAndUpdatePointer(writer, rep_len, str);
7333
0
                if (str == NULL) {
7334
0
                    goto onError;
7335
0
                }
7336
0
                memcpy(str, rep_str, rep_len);
7337
0
                str += rep_len;
7338
7339
0
                pos = newpos;
7340
0
                Py_CLEAR(rep);
7341
48.2k
            }
7342
7343
            /* If overallocation was disabled, ensure that it was the last
7344
               write. Otherwise, we missed an optimization */
7345
48.2k
            assert(writer->overallocate || pos == size);
7346
0
        }
7347
4.88M
    }
7348
7349
0
    Py_XDECREF(error_handler_obj);
7350
0
    Py_XDECREF(exc);
7351
0
    return PyBytesWriter_FinishWithPointer(writer, str);
7352
7353
48.2k
  onError:
7354
48.2k
    Py_XDECREF(rep);
7355
48.2k
    PyBytesWriter_Discard(writer);
7356
48.2k
    Py_XDECREF(error_handler_obj);
7357
48.2k
    Py_XDECREF(exc);
7358
48.2k
    return NULL;
7359
48.2k
}
7360
7361
/* Encode `unicode` to Latin-1 using the error handler named by `errors`.
   Sets an error via PyErr_BadArgument() and returns NULL if `unicode` is
   not a str. */
PyObject *
_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND) {
        /* Characters outside Latin-1 are present: the generic encoder
           deals with them according to `errors`. */
        return unicode_encode_ucs1(unicode, errors, 256);
    }

    /* Fast path: a one-byte string's buffer already is its Latin-1
       encoding. */
    return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
                                     PyUnicode_GET_LENGTH(unicode));
}
7377
7378
/* Encode `unicode` to Latin-1, passing NULL as the error handler name. */
PyObject*
PyUnicode_AsLatin1String(PyObject *unicode)
{
    const char *default_errors = NULL;

    return _PyUnicode_AsLatin1String(unicode, default_errors);
}
7383
7384
/* --- 7-bit ASCII Codec -------------------------------------------------- */
7385
7386
PyObject *
7387
PyUnicode_DecodeASCII(const char *s,
7388
                      Py_ssize_t size,
7389
                      const char *errors)
7390
4.43M
{
7391
4.43M
    const char *starts = s;
7392
4.43M
    const char *e = s + size;
7393
4.43M
    PyObject *error_handler_obj = NULL;
7394
4.43M
    PyObject *exc = NULL;
7395
4.43M
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
7396
7397
4.43M
    if (size == 0)
7398
0
        _Py_RETURN_UNICODE_EMPTY();
7399
7400
    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
7401
4.43M
    if (size == 1 && (unsigned char)s[0] < 128) {
7402
15.9k
        return get_latin1_char((unsigned char)s[0]);
7403
15.9k
    }
7404
7405
    // Shortcut for simple case
7406
4.41M
    PyObject *u = PyUnicode_New(size, 127);
7407
4.41M
    if (u == NULL) {
7408
0
        return NULL;
7409
0
    }
7410
4.41M
    Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
7411
4.41M
    if (outpos == size) {
7412
3.40M
        return u;
7413
3.40M
    }
7414
7415
1.01M
    _PyUnicodeWriter writer;
7416
1.01M
    _PyUnicodeWriter_InitWithBuffer(&writer, u);
7417
1.01M
    writer.pos = outpos;
7418
7419
1.01M
    s += outpos;
7420
1.01M
    int kind = writer.kind;
7421
1.01M
    void *data = writer.data;
7422
1.01M
    Py_ssize_t startinpos, endinpos;
7423
7424
19.7M
    while (s < e) {
7425
19.6M
        unsigned char c = (unsigned char)*s;
7426
19.6M
        if (c < 128) {
7427
6.97M
            PyUnicode_WRITE(kind, data, writer.pos, c);
7428
6.97M
            writer.pos++;
7429
6.97M
            ++s;
7430
6.97M
            continue;
7431
6.97M
        }
7432
7433
        /* byte outsize range 0x00..0x7f: call the error handler */
7434
7435
12.6M
        if (error_handler == _Py_ERROR_UNKNOWN)
7436
1.01M
            error_handler = _Py_GetErrorHandler(errors);
7437
7438
12.6M
        switch (error_handler)
7439
12.6M
        {
7440
814k
        case _Py_ERROR_REPLACE:
7441
11.7M
        case _Py_ERROR_SURROGATEESCAPE:
7442
            /* Fast-path: the error handler only writes one character,
7443
               but we may switch to UCS2 at the first write */
7444
11.7M
            if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7445
0
                goto onError;
7446
11.7M
            kind = writer.kind;
7447
11.7M
            data = writer.data;
7448
7449
11.7M
            if (error_handler == _Py_ERROR_REPLACE)
7450
814k
                PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7451
10.9M
            else
7452
10.9M
                PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7453
11.7M
            writer.pos++;
7454
11.7M
            ++s;
7455
11.7M
            break;
7456
7457
0
        case _Py_ERROR_IGNORE:
7458
0
            ++s;
7459
0
            break;
7460
7461
849k
        default:
7462
849k
            startinpos = s-starts;
7463
849k
            endinpos = startinpos + 1;
7464
849k
            if (unicode_decode_call_errorhandler_writer(
7465
849k
                    errors, &error_handler_obj,
7466
849k
                    "ascii", "ordinal not in range(128)",
7467
849k
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
7468
849k
                    &writer))
7469
849k
                goto onError;
7470
0
            kind = writer.kind;
7471
0
            data = writer.data;
7472
12.6M
        }
7473
12.6M
    }
7474
166k
    Py_XDECREF(error_handler_obj);
7475
166k
    Py_XDECREF(exc);
7476
166k
    return _PyUnicodeWriter_Finish(&writer);
7477
7478
849k
  onError:
7479
849k
    _PyUnicodeWriter_Dealloc(&writer);
7480
849k
    Py_XDECREF(error_handler_obj);
7481
849k
    Py_XDECREF(exc);
7482
849k
    return NULL;
7483
1.01M
}
7484
7485
/* Encode `unicode` to ASCII using the error handler named by `errors`.
   Sets an error via PyErr_BadArgument() and returns NULL if `unicode` is
   not a str. */
PyObject *
_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (!PyUnicode_IS_ASCII(unicode)) {
        /* Non-ASCII characters present: the generic encoder handles them
           according to `errors`. */
        return unicode_encode_ucs1(unicode, errors, 128);
    }

    /* Fast path: an ASCII-only string's buffer already is its encoding. */
    return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
                                     PyUnicode_GET_LENGTH(unicode));
}
7499
7500
/* Encode `unicode` to ASCII, passing NULL as the error handler name. */
PyObject *
PyUnicode_AsASCIIString(PyObject *unicode)
{
    const char *default_errors = NULL;

    return _PyUnicode_AsASCIIString(unicode, default_errors);
}
7505
7506
#ifdef MS_WINDOWS
7507
7508
/* --- MBCS codecs for Windows -------------------------------------------- */
7509
7510
/* NEED_RETRY is defined when int is narrower than size_t.  The code-page
   decoder in this file (decode_code_page_stateful) then processes inputs
   larger than DECODING_CHUNK_SIZE in several chunks. */
#if SIZEOF_INT < SIZEOF_SIZE_T
7511
#define NEED_RETRY
7512
#endif
7513
7514
/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
7515
   transcoding from UTF-16), but INT_MAX / 4 performs better in
7516
   both cases also and avoids partial characters overrunning the
7517
   length limit in MultiByteToWideChar on Windows */
7518
#define DECODING_CHUNK_SIZE (INT_MAX/4)
7519
7520
/* Fallback definition, used only when the included headers do not already
   define WC_ERR_INVALID_CHARS. */
#ifndef WC_ERR_INVALID_CHARS
7521
#  define WC_ERR_INVALID_CHARS 0x0080
7522
#endif
7523
7524
static const char*
7525
code_page_name(UINT code_page, PyObject **obj)
7526
{
7527
    *obj = NULL;
7528
    if (code_page == CP_ACP)
7529
        return "mbcs";
7530
7531
    *obj = PyBytes_FromFormat("cp%u", code_page);
7532
    if (*obj == NULL)
7533
        return NULL;
7534
    return PyBytes_AS_STRING(*obj);
7535
}
7536
7537
static DWORD
7538
decode_code_page_flags(UINT code_page)
7539
{
7540
    if (code_page == CP_UTF7) {
7541
        /* The CP_UTF7 decoder only supports flags=0 */
7542
        return 0;
7543
    }
7544
    else
7545
        return MB_ERR_INVALID_CHARS;
7546
}
7547
7548
/*
7549
 * Decode a byte string from a Windows code page into unicode object in strict
7550
 * mode.
7551
 *
7552
 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7553
 * OSError and returns -1 on other error.
7554
 */
7555
static int
7556
decode_code_page_strict(UINT code_page,
7557
                        wchar_t **buf,
7558
                        Py_ssize_t *bufsize,
7559
                        const char *in,
7560
                        int insize)
7561
{
7562
    DWORD flags = MB_ERR_INVALID_CHARS;
7563
    wchar_t *out;
7564
    DWORD outsize;
7565
7566
    /* First get the size of the result */
7567
    assert(insize > 0);
7568
    while ((outsize = MultiByteToWideChar(code_page, flags,
7569
                                          in, insize, NULL, 0)) <= 0)
7570
    {
7571
        if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7572
            goto error;
7573
        }
7574
        /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7575
        flags = 0;
7576
    }
7577
7578
    /* Extend a wchar_t* buffer */
7579
    Py_ssize_t n = *bufsize;   /* Get the current length */
7580
    if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7581
        return -1;
7582
    }
7583
    out = *buf + n;
7584
7585
    /* Do the conversion */
7586
    outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7587
    if (outsize <= 0)
7588
        goto error;
7589
    return insize;
7590
7591
error:
7592
    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7593
        return -2;
7594
    PyErr_SetFromWindowsErr(0);
7595
    return -1;
7596
}
7597
7598
/*
7599
 * Decode a byte string from a code page into unicode object with an error
7600
 * handler.
7601
 *
7602
 * Returns consumed size if succeed, or raise an OSError or
7603
 * UnicodeDecodeError exception and returns -1 on error.
7604
 */
7605
static int
7606
decode_code_page_errors(UINT code_page,
7607
                        wchar_t **buf,
7608
                        Py_ssize_t *bufsize,
7609
                        const char *in, const int size,
7610
                        const char *errors, int final)
7611
{
7612
    const char *startin = in;
7613
    const char *endin = in + size;
7614
    DWORD flags = MB_ERR_INVALID_CHARS;
7615
    /* Ideally, we should get reason from FormatMessage. This is the Windows
7616
       2000 English version of the message. */
7617
    const char *reason = "No mapping for the Unicode character exists "
7618
                         "in the target code page.";
7619
    /* each step cannot decode more than 1 character, but a character can be
7620
       represented as a surrogate pair */
7621
    wchar_t buffer[2], *out;
7622
    int insize;
7623
    Py_ssize_t outsize;
7624
    PyObject *errorHandler = NULL;
7625
    PyObject *exc = NULL;
7626
    PyObject *encoding_obj = NULL;
7627
    const char *encoding;
7628
    DWORD err;
7629
    int ret = -1;
7630
7631
    assert(size > 0);
7632
7633
    encoding = code_page_name(code_page, &encoding_obj);
7634
    if (encoding == NULL)
7635
        return -1;
7636
7637
    if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
7638
        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7639
           UnicodeDecodeError. */
7640
        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7641
        if (exc != NULL) {
7642
            PyCodec_StrictErrors(exc);
7643
            Py_CLEAR(exc);
7644
        }
7645
        goto error;
7646
    }
7647
7648
    /* Extend a wchar_t* buffer */
7649
    Py_ssize_t n = *bufsize;   /* Get the current length */
7650
    if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7651
        PyErr_NoMemory();
7652
        goto error;
7653
    }
7654
    if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7655
        goto error;
7656
    }
7657
    out = *buf + n;
7658
7659
    /* Decode the byte string character per character */
7660
    while (in < endin)
7661
    {
7662
        /* Decode a character */
7663
        insize = 1;
7664
        do
7665
        {
7666
            outsize = MultiByteToWideChar(code_page, flags,
7667
                                          in, insize,
7668
                                          buffer, Py_ARRAY_LENGTH(buffer));
7669
            if (outsize > 0)
7670
                break;
7671
            err = GetLastError();
7672
            if (err == ERROR_INVALID_FLAGS && flags) {
7673
                /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7674
                flags = 0;
7675
                continue;
7676
            }
7677
            if (err != ERROR_NO_UNICODE_TRANSLATION
7678
                && err != ERROR_INSUFFICIENT_BUFFER)
7679
            {
7680
                PyErr_SetFromWindowsErr(err);
7681
                goto error;
7682
            }
7683
            insize++;
7684
        }
7685
        /* 4=maximum length of a UTF-8 sequence */
7686
        while (insize <= 4 && (in + insize) <= endin);
7687
7688
        if (outsize <= 0) {
7689
            Py_ssize_t startinpos, endinpos, outpos;
7690
7691
            /* last character in partial decode? */
7692
            if (in + insize >= endin && !final)
7693
                break;
7694
7695
            startinpos = in - startin;
7696
            endinpos = startinpos + 1;
7697
            outpos = out - *buf;
7698
            if (unicode_decode_call_errorhandler_wchar(
7699
                    errors, &errorHandler,
7700
                    encoding, reason,
7701
                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
7702
                    buf, bufsize, &outpos))
7703
            {
7704
                goto error;
7705
            }
7706
            out = *buf + outpos;
7707
        }
7708
        else {
7709
            in += insize;
7710
            memcpy(out, buffer, outsize * sizeof(wchar_t));
7711
            out += outsize;
7712
        }
7713
    }
7714
7715
    /* Shrink the buffer */
7716
    assert(out - *buf <= *bufsize);
7717
    *bufsize = out - *buf;
7718
    /* (in - startin) <= size and size is an int */
7719
    ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
7720
7721
error:
7722
    Py_XDECREF(encoding_obj);
7723
    Py_XDECREF(errorHandler);
7724
    Py_XDECREF(exc);
7725
    return ret;
7726
}
7727
7728
static PyObject *
7729
decode_code_page_stateful(int code_page,
7730
                          const char *s, Py_ssize_t size,
7731
                          const char *errors, Py_ssize_t *consumed)
7732
{
7733
    wchar_t *buf = NULL;
7734
    Py_ssize_t bufsize = 0;
7735
    int chunk_size, final, converted, done;
7736
7737
    if (code_page < 0) {
7738
        PyErr_SetString(PyExc_ValueError, "invalid code page number");
7739
        return NULL;
7740
    }
7741
    if (size < 0) {
7742
        PyErr_BadInternalCall();
7743
        return NULL;
7744
    }
7745
7746
    if (consumed)
7747
        *consumed = 0;
7748
7749
    do
7750
    {
7751
#ifdef NEED_RETRY
7752
        if (size > DECODING_CHUNK_SIZE) {
7753
            chunk_size = DECODING_CHUNK_SIZE;
7754
            final = 0;
7755
            done = 0;
7756
        }
7757
        else
7758
#endif
7759
        {
7760
            chunk_size = (int)size;
7761
            final = (consumed == NULL);
7762
            done = 1;
7763
        }
7764
7765
        if (chunk_size == 0 && done) {
7766
            if (buf != NULL)
7767
                break;
7768
            _Py_RETURN_UNICODE_EMPTY();
7769
        }
7770
7771
        converted = decode_code_page_strict(code_page, &buf, &bufsize,
7772
                                            s, chunk_size);
7773
        if (converted == -2)
7774
            converted = decode_code_page_errors(code_page, &buf, &bufsize,
7775
                                                s, chunk_size,
7776
                                                errors, final);
7777
        assert(converted != 0 || done);
7778
7779
        if (converted < 0) {
7780
            PyMem_Free(buf);
7781
            return NULL;
7782
        }
7783
7784
        if (consumed)
7785
            *consumed += converted;
7786
7787
        s += converted;
7788
        size -= converted;
7789
    } while (!done);
7790
7791
    PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7792
    PyMem_Free(buf);
7793
    return v;
7794
}
7795
7796
PyObject *
7797
PyUnicode_DecodeCodePageStateful(int code_page,
7798
                                 const char *s,
7799
                                 Py_ssize_t size,
7800
                                 const char *errors,
7801
                                 Py_ssize_t *consumed)
7802
{
7803
    return decode_code_page_stateful(code_page, s, size, errors, consumed);
7804
}
7805
7806
PyObject *
7807
PyUnicode_DecodeMBCSStateful(const char *s,
7808
                             Py_ssize_t size,
7809
                             const char *errors,
7810
                             Py_ssize_t *consumed)
7811
{
7812
    return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7813
}
7814
7815
PyObject *
7816
PyUnicode_DecodeMBCS(const char *s,
7817
                     Py_ssize_t size,
7818
                     const char *errors)
7819
{
7820
    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7821
}
7822
7823
static DWORD
7824
encode_code_page_flags(UINT code_page, const char *errors)
7825
{
7826
    if (code_page == CP_UTF8) {
7827
        return WC_ERR_INVALID_CHARS;
7828
    }
7829
    else if (code_page == CP_UTF7) {
7830
        /* CP_UTF7 only supports flags=0 */
7831
        return 0;
7832
    }
7833
    else {
7834
        if (errors != NULL && strcmp(errors, "replace") == 0)
7835
            return 0;
7836
        else
7837
            return WC_NO_BEST_FIT_CHARS;
7838
    }
7839
}
7840
7841
/*
7842
 * Encode a Unicode string to a Windows code page into a byte string in strict
7843
 * mode.
7844
 *
7845
 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7846
 * an OSError and returns -1 on other error.
7847
 */
7848
static int
7849
encode_code_page_strict(UINT code_page, PyBytesWriter **writer,
7850
                        PyObject *unicode, Py_ssize_t offset, int len,
7851
                        const char* errors)
7852
{
7853
    BOOL usedDefaultChar = FALSE;
7854
    BOOL *pusedDefaultChar = &usedDefaultChar;
7855
    int outsize;
7856
    wchar_t *p;
7857
    Py_ssize_t size;
7858
    const DWORD flags = encode_code_page_flags(code_page, NULL);
7859
    char *out;
7860
    /* Create a substring so that we can get the UTF-16 representation
7861
       of just the slice under consideration. */
7862
    PyObject *substring;
7863
    int ret = -1;
7864
7865
    assert(len > 0);
7866
7867
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
7868
        pusedDefaultChar = &usedDefaultChar;
7869
    else
7870
        pusedDefaultChar = NULL;
7871
7872
    substring = PyUnicode_Substring(unicode, offset, offset+len);
7873
    if (substring == NULL)
7874
        return -1;
7875
    p = PyUnicode_AsWideCharString(substring, &size);
7876
    Py_CLEAR(substring);
7877
    if (p == NULL) {
7878
        return -1;
7879
    }
7880
    assert(size <= INT_MAX);
7881
7882
    /* First get the size of the result */
7883
    outsize = WideCharToMultiByte(code_page, flags,
7884
                                  p, (int)size,
7885
                                  NULL, 0,
7886
                                  NULL, pusedDefaultChar);
7887
    if (outsize <= 0)
7888
        goto error;
7889
    /* If we used a default char, then we failed! */
7890
    if (pusedDefaultChar && *pusedDefaultChar) {
7891
        ret = -2;
7892
        goto done;
7893
    }
7894
7895
    if (*writer == NULL) {
7896
        /* Create string object */
7897
        *writer = PyBytesWriter_Create(outsize);
7898
        if (*writer == NULL) {
7899
            goto done;
7900
        }
7901
        out = PyBytesWriter_GetData(*writer);
7902
    }
7903
    else {
7904
        /* Extend string object */
7905
        Py_ssize_t n = PyBytesWriter_GetSize(*writer);
7906
        if (PyBytesWriter_Grow(*writer, outsize) < 0) {
7907
            goto done;
7908
        }
7909
        out = (char*)PyBytesWriter_GetData(*writer) + n;
7910
    }
7911
7912
    /* Do the conversion */
7913
    outsize = WideCharToMultiByte(code_page, flags,
7914
                                  p, (int)size,
7915
                                  out, outsize,
7916
                                  NULL, pusedDefaultChar);
7917
    if (outsize <= 0)
7918
        goto error;
7919
    if (pusedDefaultChar && *pusedDefaultChar) {
7920
        ret = -2;
7921
        goto done;
7922
    }
7923
    ret = 0;
7924
7925
done:
7926
    PyMem_Free(p);
7927
    return ret;
7928
7929
error:
7930
    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
7931
        ret = -2;
7932
        goto done;
7933
    }
7934
    PyErr_SetFromWindowsErr(0);
7935
    goto done;
7936
}
7937
7938
/*
7939
 * Encode a Unicode string to a Windows code page into a byte string using an
7940
 * error handler.
7941
 *
7942
 * Returns consumed characters if succeed, or raise an OSError and returns
7943
 * -1 on other error.
7944
 */
7945
static int
7946
encode_code_page_errors(UINT code_page, PyBytesWriter **writer,
7947
                        PyObject *unicode, Py_ssize_t unicode_offset,
7948
                        Py_ssize_t insize, const char* errors)
7949
{
7950
    const DWORD flags = encode_code_page_flags(code_page, errors);
7951
    Py_ssize_t pos = unicode_offset;
7952
    Py_ssize_t endin = unicode_offset + insize;
7953
    /* Ideally, we should get reason from FormatMessage. This is the Windows
7954
       2000 English version of the message. */
7955
    const char *reason = "invalid character";
7956
    /* 4=maximum length of a UTF-8 sequence */
7957
    char buffer[4];
7958
    BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7959
    Py_ssize_t outsize;
7960
    char *out;
7961
    PyObject *errorHandler = NULL;
7962
    PyObject *exc = NULL;
7963
    PyObject *encoding_obj = NULL;
7964
    const char *encoding;
7965
    Py_ssize_t newpos;
7966
    PyObject *rep;
7967
    int ret = -1;
7968
7969
    assert(insize > 0);
7970
7971
    encoding = code_page_name(code_page, &encoding_obj);
7972
    if (encoding == NULL)
7973
        return -1;
7974
7975
    if (errors == NULL || strcmp(errors, "strict") == 0) {
7976
        /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7977
           then we raise a UnicodeEncodeError. */
7978
        make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
7979
        if (exc != NULL) {
7980
            PyCodec_StrictErrors(exc);
7981
            Py_DECREF(exc);
7982
        }
7983
        Py_XDECREF(encoding_obj);
7984
        return -1;
7985
    }
7986
7987
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
7988
        pusedDefaultChar = &usedDefaultChar;
7989
    else
7990
        pusedDefaultChar = NULL;
7991
7992
    if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7993
        PyErr_NoMemory();
7994
        goto error;
7995
    }
7996
    outsize = insize * Py_ARRAY_LENGTH(buffer);
7997
7998
    if (*writer == NULL) {
7999
        /* Create string object */
8000
        *writer = PyBytesWriter_Create(outsize);
8001
        if (*writer == NULL) {
8002
            goto error;
8003
        }
8004
        out = PyBytesWriter_GetData(*writer);
8005
    }
8006
    else {
8007
        /* Extend string object */
8008
        Py_ssize_t n = PyBytesWriter_GetSize(*writer);
8009
        if (PyBytesWriter_Grow(*writer, outsize) < 0) {
8010
            goto error;
8011
        }
8012
        out = (char*)PyBytesWriter_GetData(*writer) + n;
8013
    }
8014
8015
    /* Encode the string character per character */
8016
    while (pos < endin)
8017
    {
8018
        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
8019
        wchar_t chars[2];
8020
        int charsize;
8021
        if (ch < 0x10000) {
8022
            chars[0] = (wchar_t)ch;
8023
            charsize = 1;
8024
        }
8025
        else {
8026
            chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
8027
            chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
8028
            charsize = 2;
8029
        }
8030
8031
        outsize = WideCharToMultiByte(code_page, flags,
8032
                                      chars, charsize,
8033
                                      buffer, Py_ARRAY_LENGTH(buffer),
8034
                                      NULL, pusedDefaultChar);
8035
        if (outsize > 0) {
8036
            if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
8037
            {
8038
                pos++;
8039
                memcpy(out, buffer, outsize);
8040
                out += outsize;
8041
                continue;
8042
            }
8043
        }
8044
        else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
8045
            PyErr_SetFromWindowsErr(0);
8046
            goto error;
8047
        }
8048
8049
        rep = unicode_encode_call_errorhandler(
8050
                  errors, &errorHandler, encoding, reason,
8051
                  unicode, &exc,
8052
                  pos, pos + 1, &newpos);
8053
        if (rep == NULL)
8054
            goto error;
8055
8056
        Py_ssize_t morebytes = pos - newpos;
8057
        if (PyBytes_Check(rep)) {
8058
            outsize = PyBytes_GET_SIZE(rep);
8059
            morebytes += outsize;
8060
            if (morebytes > 0) {
8061
                out = PyBytesWriter_GrowAndUpdatePointer(*writer, morebytes, out);
8062
                if (out == NULL) {
8063
                    Py_DECREF(rep);
8064
                    goto error;
8065
                }
8066
            }
8067
            memcpy(out, PyBytes_AS_STRING(rep), outsize);
8068
            out += outsize;
8069
        }
8070
        else {
8071
            Py_ssize_t i;
8072
            int kind;
8073
            const void *data;
8074
8075
            outsize = PyUnicode_GET_LENGTH(rep);
8076
            morebytes += outsize;
8077
            if (morebytes > 0) {
8078
                out = PyBytesWriter_GrowAndUpdatePointer(*writer, morebytes, out);
8079
                if (out == NULL) {
8080
                    Py_DECREF(rep);
8081
                    goto error;
8082
                }
8083
            }
8084
            kind = PyUnicode_KIND(rep);
8085
            data = PyUnicode_DATA(rep);
8086
            for (i=0; i < outsize; i++) {
8087
                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8088
                if (ch > 127) {
8089
                    raise_encode_exception(&exc,
8090
                        encoding, unicode,
8091
                        pos, pos + 1,
8092
                        "unable to encode error handler result to ASCII");
8093
                    Py_DECREF(rep);
8094
                    goto error;
8095
                }
8096
                *out = (unsigned char)ch;
8097
                out++;
8098
            }
8099
        }
8100
        pos = newpos;
8101
        Py_DECREF(rep);
8102
    }
8103
    /* write a NUL byte */
8104
    *out = 0;
8105
    outsize = out - (char*)PyBytesWriter_GetData(*writer);
8106
    assert(outsize <= PyBytesWriter_GetSize(*writer));
8107
    if (PyBytesWriter_Resize(*writer, outsize) < 0) {
8108
        goto error;
8109
    }
8110
    ret = 0;
8111
8112
error:
8113
    Py_XDECREF(encoding_obj);
8114
    Py_XDECREF(errorHandler);
8115
    Py_XDECREF(exc);
8116
    return ret;
8117
}
8118
8119
8120
/* Encode `unicode` to the Windows code page `code_page` and return the
   resulting bytes object, or NULL with an exception set.

   Each chunk is first tried with encode_code_page_strict(); only when that
   reports an encode error (-2) is the chunk handed to
   encode_code_page_errors() with the caller's `errors` handler. */
PyObject *
PyUnicode_EncodeCodePage(int code_page,
                         PyObject *unicode,
                         const char *errors)
{
    PyBytesWriter *writer = NULL;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    Py_ssize_t remaining = PyUnicode_GET_LENGTH(unicode);

    if (code_page < 0) {
        PyErr_SetString(PyExc_ValueError, "invalid code page number");
        return NULL;
    }

    if (remaining == 0) {
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
    }

    Py_ssize_t offset = 0;
    for (;;) {
        int chunk_len;
        int last_chunk;

#ifdef NEED_RETRY
        if (remaining > DECODING_CHUNK_SIZE) {
            chunk_len = DECODING_CHUNK_SIZE;
            last_chunk = 0;
        }
        else
#endif
        {
            chunk_len = (int)remaining;
            last_chunk = 1;
        }

        int rc = encode_code_page_strict(code_page, &writer,
                                         unicode, offset, chunk_len,
                                         errors);
        if (rc == -2) {
            rc = encode_code_page_errors(code_page, &writer,
                                         unicode, offset,
                                         chunk_len, errors);
        }
        if (rc < 0) {
            PyBytesWriter_Discard(writer);
            return NULL;
        }

        if (last_chunk) {
            break;
        }
        offset += chunk_len;
        remaining -= chunk_len;
    }

    return PyBytesWriter_Finish(writer);
}
8178
8179
8180
/* Public wrapper: PyUnicode_EncodeCodePage() with the code page fixed to
   CP_ACP and a NULL error handler name. */
PyObject *
PyUnicode_AsMBCSString(PyObject *unicode)
{
    const char *no_errors = NULL;
    return PyUnicode_EncodeCodePage(CP_ACP, unicode, no_errors);
}
8185
8186
#undef NEED_RETRY
8187
8188
#endif /* MS_WINDOWS */
8189
8190
/* --- Character Mapping Codec -------------------------------------------- */
8191
8192
static int
8193
charmap_decode_string(const char *s,
8194
                      Py_ssize_t size,
8195
                      PyObject *mapping,
8196
                      const char *errors,
8197
                      _PyUnicodeWriter *writer)
8198
59.1k
{
8199
59.1k
    const char *starts = s;
8200
59.1k
    const char *e;
8201
59.1k
    Py_ssize_t startinpos, endinpos;
8202
59.1k
    PyObject *errorHandler = NULL, *exc = NULL;
8203
59.1k
    Py_ssize_t maplen;
8204
59.1k
    int mapkind;
8205
59.1k
    const void *mapdata;
8206
59.1k
    Py_UCS4 x;
8207
59.1k
    unsigned char ch;
8208
8209
59.1k
    maplen = PyUnicode_GET_LENGTH(mapping);
8210
59.1k
    mapdata = PyUnicode_DATA(mapping);
8211
59.1k
    mapkind = PyUnicode_KIND(mapping);
8212
8213
59.1k
    e = s + size;
8214
8215
59.1k
    if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
8216
        /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
8217
         * is disabled in encoding aliases, latin1 is preferred because
8218
         * its implementation is faster. */
8219
126
        const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
8220
126
        Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8221
126
        Py_UCS4 maxchar = writer->maxchar;
8222
8223
126
        assert (writer->kind == PyUnicode_1BYTE_KIND);
8224
2.18k
        while (s < e) {
8225
2.06k
            ch = *s;
8226
2.06k
            x = mapdata_ucs1[ch];
8227
2.06k
            if (x > maxchar) {
8228
116
                if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
8229
0
                    goto onError;
8230
116
                maxchar = writer->maxchar;
8231
116
                outdata = (Py_UCS1 *)writer->data;
8232
116
            }
8233
2.06k
            outdata[writer->pos] = x;
8234
2.06k
            writer->pos++;
8235
2.06k
            ++s;
8236
2.06k
        }
8237
126
        return 0;
8238
126
    }
8239
8240
155k
    while (s < e) {
8241
142k
        if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
8242
142k
            int outkind = writer->kind;
8243
142k
            const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
8244
142k
            if (outkind == PyUnicode_1BYTE_KIND) {
8245
92.0k
                Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8246
92.0k
                Py_UCS4 maxchar = writer->maxchar;
8247
1.51M
                while (s < e) {
8248
1.49M
                    ch = *s;
8249
1.49M
                    x = mapdata_ucs2[ch];
8250
1.49M
                    if (x > maxchar)
8251
69.2k
                        goto Error;
8252
1.42M
                    outdata[writer->pos] = x;
8253
1.42M
                    writer->pos++;
8254
1.42M
                    ++s;
8255
1.42M
                }
8256
22.8k
                break;
8257
92.0k
            }
8258
49.9k
            else if (outkind == PyUnicode_2BYTE_KIND) {
8259
49.9k
                Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
8260
7.53M
                while (s < e) {
8261
7.51M
                    ch = *s;
8262
7.51M
                    x = mapdata_ucs2[ch];
8263
7.51M
                    if (x == 0xFFFE)
8264
27.3k
                        goto Error;
8265
7.48M
                    outdata[writer->pos] = x;
8266
7.48M
                    writer->pos++;
8267
7.48M
                    ++s;
8268
7.48M
                }
8269
22.5k
                break;
8270
49.9k
            }
8271
142k
        }
8272
0
        ch = *s;
8273
8274
0
        if (ch < maplen)
8275
0
            x = PyUnicode_READ(mapkind, mapdata, ch);
8276
0
        else
8277
0
            x = 0xfffe; /* invalid value */
8278
96.6k
Error:
8279
96.6k
        if (x == 0xfffe)
8280
43.8k
        {
8281
            /* undefined mapping */
8282
43.8k
            startinpos = s-starts;
8283
43.8k
            endinpos = startinpos+1;
8284
43.8k
            if (unicode_decode_call_errorhandler_writer(
8285
43.8k
                    errors, &errorHandler,
8286
43.8k
                    "charmap", "character maps to <undefined>",
8287
43.8k
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
8288
43.8k
                    writer)) {
8289
15
                goto onError;
8290
15
            }
8291
43.8k
            continue;
8292
43.8k
        }
8293
8294
52.7k
        if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8295
0
            goto onError;
8296
52.7k
        ++s;
8297
52.7k
    }
8298
58.9k
    Py_XDECREF(errorHandler);
8299
58.9k
    Py_XDECREF(exc);
8300
58.9k
    return 0;
8301
8302
15
onError:
8303
15
    Py_XDECREF(errorHandler);
8304
15
    Py_XDECREF(exc);
8305
15
    return -1;
8306
58.9k
}
8307
8308
static int
8309
charmap_decode_mapping(const char *s,
8310
                       Py_ssize_t size,
8311
                       PyObject *mapping,
8312
                       const char *errors,
8313
                       _PyUnicodeWriter *writer)
8314
0
{
8315
0
    const char *starts = s;
8316
0
    const char *e;
8317
0
    Py_ssize_t startinpos, endinpos;
8318
0
    PyObject *errorHandler = NULL, *exc = NULL;
8319
0
    unsigned char ch;
8320
0
    PyObject *key, *item = NULL;
8321
8322
0
    e = s + size;
8323
8324
0
    while (s < e) {
8325
0
        ch = *s;
8326
8327
        /* Get mapping (char ordinal -> integer, Unicode char or None) */
8328
0
        key = PyLong_FromLong((long)ch);
8329
0
        if (key == NULL)
8330
0
            goto onError;
8331
8332
0
        int rc = PyMapping_GetOptionalItem(mapping, key, &item);
8333
0
        Py_DECREF(key);
8334
0
        if (rc == 0) {
8335
            /* No mapping found means: mapping is undefined. */
8336
0
            goto Undefined;
8337
0
        }
8338
0
        if (item == NULL) {
8339
0
            if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8340
                /* No mapping found means: mapping is undefined. */
8341
0
                PyErr_Clear();
8342
0
                goto Undefined;
8343
0
            } else
8344
0
                goto onError;
8345
0
        }
8346
8347
        /* Apply mapping */
8348
0
        if (item == Py_None)
8349
0
            goto Undefined;
8350
0
        if (PyLong_Check(item)) {
8351
0
            long value = PyLong_AsLong(item);
8352
0
            if (value == 0xFFFE)
8353
0
                goto Undefined;
8354
0
            if (value < 0 || value > MAX_UNICODE) {
8355
0
                PyErr_Format(PyExc_TypeError,
8356
0
                             "character mapping must be in range(0x%x)",
8357
0
                             (unsigned long)MAX_UNICODE + 1);
8358
0
                goto onError;
8359
0
            }
8360
8361
0
            if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8362
0
                goto onError;
8363
0
        }
8364
0
        else if (PyUnicode_Check(item)) {
8365
0
            if (PyUnicode_GET_LENGTH(item) == 1) {
8366
0
                Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8367
0
                if (value == 0xFFFE)
8368
0
                    goto Undefined;
8369
0
                if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8370
0
                    goto onError;
8371
0
            }
8372
0
            else {
8373
0
                writer->overallocate = 1;
8374
0
                if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8375
0
                    goto onError;
8376
0
            }
8377
0
        }
8378
0
        else {
8379
            /* wrong return value */
8380
0
            PyErr_SetString(PyExc_TypeError,
8381
0
                            "character mapping must return integer, None or str");
8382
0
            goto onError;
8383
0
        }
8384
0
        Py_CLEAR(item);
8385
0
        ++s;
8386
0
        continue;
8387
8388
0
Undefined:
8389
        /* undefined mapping */
8390
0
        Py_CLEAR(item);
8391
0
        startinpos = s-starts;
8392
0
        endinpos = startinpos+1;
8393
0
        if (unicode_decode_call_errorhandler_writer(
8394
0
                errors, &errorHandler,
8395
0
                "charmap", "character maps to <undefined>",
8396
0
                &starts, &e, &startinpos, &endinpos, &exc, &s,
8397
0
                writer)) {
8398
0
            goto onError;
8399
0
        }
8400
0
    }
8401
0
    Py_XDECREF(errorHandler);
8402
0
    Py_XDECREF(exc);
8403
0
    return 0;
8404
8405
0
onError:
8406
0
    Py_XDECREF(item);
8407
0
    Py_XDECREF(errorHandler);
8408
0
    Py_XDECREF(exc);
8409
0
    return -1;
8410
0
}
8411
8412
PyObject *
8413
PyUnicode_DecodeCharmap(const char *s,
8414
                        Py_ssize_t size,
8415
                        PyObject *mapping,
8416
                        const char *errors)
8417
59.1k
{
8418
59.1k
    _PyUnicodeWriter writer;
8419
8420
    /* Default to Latin-1 */
8421
59.1k
    if (mapping == NULL)
8422
0
        return PyUnicode_DecodeLatin1(s, size, errors);
8423
8424
59.1k
    if (size == 0)
8425
0
        _Py_RETURN_UNICODE_EMPTY();
8426
59.1k
    _PyUnicodeWriter_Init(&writer);
8427
59.1k
    writer.min_length = size;
8428
59.1k
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
8429
0
        goto onError;
8430
8431
59.1k
    if (PyUnicode_CheckExact(mapping)) {
8432
59.1k
        if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8433
15
            goto onError;
8434
59.1k
    }
8435
0
    else {
8436
0
        if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8437
0
            goto onError;
8438
0
    }
8439
59.1k
    return _PyUnicodeWriter_Finish(&writer);
8440
8441
15
  onError:
8442
15
    _PyUnicodeWriter_Dealloc(&writer);
8443
15
    return NULL;
8444
59.1k
}
8445
8446
/* Charmap encoding: the lookup table */
8447
8448
/*[clinic input]
8449
class EncodingMap "struct encoding_map *" "&EncodingMapType"
8450
[clinic start generated code]*/
8451
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=14e46bbb6c522d22]*/
8452
8453
struct encoding_map {
8454
    PyObject_HEAD
8455
    unsigned char level1[32];
8456
    int count2, count3;
8457
    unsigned char level23[1];
8458
};
8459
8460
/*[clinic input]
8461
EncodingMap.size
8462
8463
Return the size (in bytes) of this object.
8464
[clinic start generated code]*/
8465
8466
static PyObject *
8467
EncodingMap_size_impl(struct encoding_map *self)
8468
/*[clinic end generated code: output=c4c969e4c99342a4 input=004ff13f26bb5366]*/
8469
0
{
8470
0
    return PyLong_FromLong((sizeof(*self) - 1) + 16*self->count2 +
8471
0
                           128*self->count3);
8472
0
}
8473
8474
static PyMethodDef encoding_map_methods[] = {
8475
    ENCODINGMAP_SIZE_METHODDEF
8476
    {NULL, NULL}
8477
};
8478
8479
static PyTypeObject EncodingMapType = {
8480
    PyVarObject_HEAD_INIT(NULL, 0)
8481
    .tp_name = "EncodingMap",
8482
    .tp_basicsize = sizeof(struct encoding_map),
8483
    /* methods */
8484
    .tp_flags = Py_TPFLAGS_DEFAULT,
8485
    .tp_methods = encoding_map_methods,
8486
};
8487
8488
/* Build an encoding map from a decoding table.

   `string` must be a non-empty str; only its first 256 characters are used
   and the character at index i is taken to encode to byte i.  Returns a new
   EncodingMap trie object when the table is suitable for one, otherwise a
   dict mapping code point -> byte value.  Returns NULL on error. */
PyObject*
PyUnicode_BuildEncodingMap(PyObject* string)
{
    unsigned char level1[32];
    unsigned char level2[512];
    int count2 = 0, count3 = 0;
    int need_dict = 0;

    if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
        PyErr_BadArgument();
        return NULL;
    }
    const int kind = PyUnicode_KIND(string);
    const void *data = PyUnicode_DATA(string);
    const int length = (int)Py_MIN(PyUnicode_GET_LENGTH(string), 256);
    memset(level1, 0xFF, sizeof level1);
    memset(level2, 0xFF, sizeof level2);

    /* If there isn't a one-to-one mapping of NULL to \0,
       or if there are non-BMP characters, we need to use
       a mapping dictionary. */
    if (PyUnicode_READ(kind, data, 0) != 0) {
        need_dict = 1;
    }
    /* Pass 1: count how many level-2 and level-3 blocks the trie needs. */
    for (int i = 1; i < length; i++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
        if (ch == 0 || ch > 0xFFFF) {
            need_dict = 1;
            break;
        }
        if (ch == 0xFFFE) {
            /* unmapped character */
            continue;
        }
        const int l1 = ch >> 11;
        const int l2 = ch >> 7;
        if (level1[l1] == 0xFF) {
            level1[l1] = count2++;
        }
        if (level2[l2] == 0xFF) {
            level2[l2] = count3++;
        }
    }

    /* Block numbers are stored in single bytes and 0xFF is the "unused"
       marker, so 0xFF or more blocks cannot be represented. */
    if (count2 >= 0xFF || count3 >= 0xFF) {
        need_dict = 1;
    }

    if (need_dict) {
        PyObject *dict = PyDict_New();
        if (dict == NULL) {
            return NULL;
        }
        for (int i = 0; i < length; i++) {
            /* dict[code point] = byte value; the value is only created
               once the key exists. */
            PyObject *key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
            PyObject *value = (key != NULL) ? PyLong_FromLong(i) : NULL;
            int rc = (value != NULL) ? PyDict_SetItem(dict, key, value) : -1;
            Py_XDECREF(key);
            Py_XDECREF(value);
            if (rc < 0) {
                Py_DECREF(dict);
                return NULL;
            }
        }
        return dict;
    }

    /* Create a three-level trie */
    PyObject *result = PyObject_Malloc(sizeof(struct encoding_map) +
                                       16*count2 + 128*count3 - 1);
    if (result == NULL) {
        return PyErr_NoMemory();
    }

    _PyObject_Init(result, &EncodingMapType);
    struct encoding_map *trie = (struct encoding_map*)result;
    trie->count2 = count2;
    trie->count3 = count3;
    unsigned char *tlevel1 = trie->level1;
    unsigned char *tlevel2 = trie->level23;
    unsigned char *tlevel3 = trie->level23 + 16*count2;
    memcpy(tlevel1, level1, 32);
    memset(tlevel2, 0xFF, 16*count2);
    memset(tlevel3, 0, 128*count3);

    /* Pass 2: hand out level-3 blocks again from zero and record the byte
       value of every mapped character. */
    count3 = 0;
    for (int i = 1; i < length; i++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
        if (ch == 0xFFFE) {
            /* unmapped character */
            continue;
        }
        const int o1 = ch>>11;
        const int o2 = (ch>>7) & 0xF;
        const int i2 = 16*tlevel1[o1] + o2;
        if (tlevel2[i2] == 0xFF) {
            tlevel2[i2] = count3++;
        }
        const int o3 = ch & 0x7F;
        const int i3 = 128*tlevel2[i2] + o3;
        tlevel3[i3] = i;
    }
    return result;
}
8603
8604
static int
8605
encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
8606
0
{
8607
0
    struct encoding_map *map = (struct encoding_map*)mapping;
8608
0
    int l1 = c>>11;
8609
0
    int l2 = (c>>7) & 0xF;
8610
0
    int l3 = c & 0x7F;
8611
0
    int i;
8612
8613
0
    if (c > 0xFFFF)
8614
0
        return -1;
8615
0
    if (c == 0)
8616
0
        return 0;
8617
    /* level 1*/
8618
0
    i = map->level1[l1];
8619
0
    if (i == 0xFF) {
8620
0
        return -1;
8621
0
    }
8622
    /* level 2*/
8623
0
    i = map->level23[16*i+l2];
8624
0
    if (i == 0xFF) {
8625
0
        return -1;
8626
0
    }
8627
    /* level 3 */
8628
0
    i = map->level23[16*map->count2 + 128*i + l3];
8629
0
    if (i == 0) {
8630
0
        return -1;
8631
0
    }
8632
0
    return i;
8633
0
}
8634
8635
/* Lookup the character in the mapping.
8636
   On success, return PyLong, PyBytes or None (if the character can't be found).
8637
   If the result is PyLong, put its value in replace.
8638
   On error, return NULL.
8639
   */
8640
static PyObject *
8641
charmapencode_lookup(Py_UCS4 c, PyObject *mapping, unsigned char *replace)
8642
0
{
8643
0
    PyObject *w = PyLong_FromLong((long)c);
8644
0
    PyObject *x;
8645
8646
0
    if (w == NULL)
8647
0
        return NULL;
8648
0
    int rc = PyMapping_GetOptionalItem(mapping, w, &x);
8649
0
    Py_DECREF(w);
8650
0
    if (rc == 0) {
8651
        /* No mapping found means: mapping is undefined. */
8652
0
        Py_RETURN_NONE;
8653
0
    }
8654
0
    if (x == NULL) {
8655
0
        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8656
            /* No mapping found means: mapping is undefined. */
8657
0
            PyErr_Clear();
8658
0
            Py_RETURN_NONE;
8659
0
        } else
8660
0
            return NULL;
8661
0
    }
8662
0
    else if (x == Py_None)
8663
0
        return x;
8664
0
    else if (PyLong_Check(x)) {
8665
0
        long value = PyLong_AsLong(x);
8666
0
        if (value < 0 || value > 255) {
8667
0
            PyErr_SetString(PyExc_TypeError,
8668
0
                            "character mapping must be in range(256)");
8669
0
            Py_DECREF(x);
8670
0
            return NULL;
8671
0
        }
8672
0
        *replace = (unsigned char)value;
8673
0
        return x;
8674
0
    }
8675
0
    else if (PyBytes_Check(x))
8676
0
        return x;
8677
0
    else {
8678
        /* wrong return value */
8679
0
        PyErr_Format(PyExc_TypeError,
8680
0
                     "character mapping must return integer, bytes or None, not %.400s",
8681
0
                     Py_TYPE(x)->tp_name);
8682
0
        Py_DECREF(x);
8683
0
        return NULL;
8684
0
    }
8685
0
}
8686
8687
static int
8688
charmapencode_resize(PyBytesWriter *writer, Py_ssize_t *outpos, Py_ssize_t requiredsize)
8689
0
{
8690
0
    Py_ssize_t outsize = PyBytesWriter_GetSize(writer);
8691
    /* exponentially overallocate to minimize reallocations */
8692
0
    if (requiredsize < 2 * outsize)
8693
0
        requiredsize = 2 * outsize;
8694
0
    return PyBytesWriter_Resize(writer, requiredsize);
8695
0
}
8696
8697
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS = 0,    /* the encoded byte(s) were written to the output */
    enc_FAILED,         /* the character has no mapping; nothing written */
    enc_EXCEPTION       /* an error occurred (lookup or resize failed) */
} charmapencode_result;
8700
/* Look up the character, put the result in the output buffer and adjust
   *outpos. Resize the output bytes writer if not enough space is
   available. Return enc_SUCCESS if the character was written, enc_FAILED
   if the mapping was undefined (in which case no character was written),
   or enc_EXCEPTION if an error occurred (for example when the lookup or
   the reallocation failed). */
8706
static charmapencode_result
8707
charmapencode_output(Py_UCS4 c, PyObject *mapping,
8708
                     PyBytesWriter *writer, Py_ssize_t *outpos)
8709
0
{
8710
0
    PyObject *rep;
8711
0
    unsigned char replace;
8712
0
    char *outstart;
8713
0
    Py_ssize_t outsize = _PyBytesWriter_GetSize(writer);
8714
8715
0
    if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8716
0
        int res = encoding_map_lookup(c, mapping);
8717
0
        Py_ssize_t requiredsize = *outpos+1;
8718
0
        if (res == -1) {
8719
0
            return enc_FAILED;
8720
0
        }
8721
8722
0
        if (outsize<requiredsize) {
8723
0
            if (charmapencode_resize(writer, outpos, requiredsize)) {
8724
0
                return enc_EXCEPTION;
8725
0
            }
8726
0
        }
8727
0
        outstart = _PyBytesWriter_GetData(writer);
8728
0
        outstart[(*outpos)++] = (char)res;
8729
0
        return enc_SUCCESS;
8730
0
    }
8731
8732
0
    rep = charmapencode_lookup(c, mapping, &replace);
8733
0
    if (rep==NULL)
8734
0
        return enc_EXCEPTION;
8735
0
    else if (rep==Py_None) {
8736
0
        Py_DECREF(rep);
8737
0
        return enc_FAILED;
8738
0
    } else {
8739
0
        if (PyLong_Check(rep)) {
8740
0
            Py_ssize_t requiredsize = *outpos+1;
8741
0
            if (outsize<requiredsize)
8742
0
                if (charmapencode_resize(writer, outpos, requiredsize)) {
8743
0
                    Py_DECREF(rep);
8744
0
                    return enc_EXCEPTION;
8745
0
                }
8746
0
            outstart = _PyBytesWriter_GetData(writer);
8747
0
            outstart[(*outpos)++] = (char)replace;
8748
0
        }
8749
0
        else {
8750
0
            const char *repchars = PyBytes_AS_STRING(rep);
8751
0
            Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8752
0
            Py_ssize_t requiredsize = *outpos+repsize;
8753
0
            if (outsize<requiredsize)
8754
0
                if (charmapencode_resize(writer, outpos, requiredsize)) {
8755
0
                    Py_DECREF(rep);
8756
0
                    return enc_EXCEPTION;
8757
0
                }
8758
0
            outstart = _PyBytesWriter_GetData(writer);
8759
0
            memcpy(outstart + *outpos, repchars, repsize);
8760
0
            *outpos += repsize;
8761
0
        }
8762
0
    }
8763
0
    Py_DECREF(rep);
8764
0
    return enc_SUCCESS;
8765
0
}
8766
8767
/* handle an error in _PyUnicode_EncodeCharmap()
8768
   Return 0 on success, -1 on error */
8769
static int
8770
charmap_encoding_error(
8771
    PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
8772
    PyObject **exceptionObject,
8773
    _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
8774
    PyBytesWriter *writer, Py_ssize_t *respos)
8775
0
{
8776
0
    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8777
0
    Py_ssize_t size, repsize;
8778
0
    Py_ssize_t newpos;
8779
0
    int kind;
8780
0
    const void *data;
8781
0
    Py_ssize_t index;
8782
    /* startpos for collecting unencodable chars */
8783
0
    Py_ssize_t collstartpos = *inpos;
8784
0
    Py_ssize_t collendpos = *inpos+1;
8785
0
    Py_ssize_t collpos;
8786
0
    const char *encoding = "charmap";
8787
0
    const char *reason = "character maps to <undefined>";
8788
0
    charmapencode_result x;
8789
0
    Py_UCS4 ch;
8790
0
    int val;
8791
8792
0
    size = PyUnicode_GET_LENGTH(unicode);
8793
    /* find all unencodable characters */
8794
0
    while (collendpos < size) {
8795
0
        PyObject *rep;
8796
0
        unsigned char replace;
8797
0
        if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8798
0
            ch = PyUnicode_READ_CHAR(unicode, collendpos);
8799
0
            val = encoding_map_lookup(ch, mapping);
8800
0
            if (val != -1)
8801
0
                break;
8802
0
            ++collendpos;
8803
0
            continue;
8804
0
        }
8805
8806
0
        ch = PyUnicode_READ_CHAR(unicode, collendpos);
8807
0
        rep = charmapencode_lookup(ch, mapping, &replace);
8808
0
        if (rep==NULL)
8809
0
            return -1;
8810
0
        else if (rep!=Py_None) {
8811
0
            Py_DECREF(rep);
8812
0
            break;
8813
0
        }
8814
0
        Py_DECREF(rep);
8815
0
        ++collendpos;
8816
0
    }
8817
    /* cache callback name lookup
8818
     * (if not done yet, i.e. it's the first error) */
8819
0
    if (*error_handler == _Py_ERROR_UNKNOWN)
8820
0
        *error_handler = _Py_GetErrorHandler(errors);
8821
8822
0
    switch (*error_handler) {
8823
0
    case _Py_ERROR_STRICT:
8824
0
        raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8825
0
        return -1;
8826
8827
0
    case _Py_ERROR_REPLACE:
8828
0
        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
8829
0
            x = charmapencode_output('?', mapping, writer, respos);
8830
0
            if (x==enc_EXCEPTION) {
8831
0
                return -1;
8832
0
            }
8833
0
            else if (x==enc_FAILED) {
8834
0
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8835
0
                return -1;
8836
0
            }
8837
0
        }
8838
0
        _Py_FALLTHROUGH;
8839
0
    case _Py_ERROR_IGNORE:
8840
0
        *inpos = collendpos;
8841
0
        break;
8842
8843
0
    case _Py_ERROR_XMLCHARREFREPLACE:
8844
        /* generate replacement (temporarily (mis)uses p) */
8845
0
        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
8846
0
            char buffer[2+29+1+1];
8847
0
            char *cp;
8848
0
            sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
8849
0
            for (cp = buffer; *cp; ++cp) {
8850
0
                x = charmapencode_output(*cp, mapping, writer, respos);
8851
0
                if (x==enc_EXCEPTION)
8852
0
                    return -1;
8853
0
                else if (x==enc_FAILED) {
8854
0
                    raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8855
0
                    return -1;
8856
0
                }
8857
0
            }
8858
0
        }
8859
0
        *inpos = collendpos;
8860
0
        break;
8861
8862
0
    default:
8863
0
        repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
8864
0
                                                      encoding, reason, unicode, exceptionObject,
8865
0
                                                      collstartpos, collendpos, &newpos);
8866
0
        if (repunicode == NULL)
8867
0
            return -1;
8868
0
        if (PyBytes_Check(repunicode)) {
8869
            /* Directly copy bytes result to output. */
8870
0
            Py_ssize_t outsize = PyBytesWriter_GetSize(writer);
8871
0
            Py_ssize_t requiredsize;
8872
0
            repsize = PyBytes_Size(repunicode);
8873
0
            requiredsize = *respos + repsize;
8874
0
            if (requiredsize > outsize)
8875
                /* Make room for all additional bytes. */
8876
0
                if (charmapencode_resize(writer, respos, requiredsize)) {
8877
0
                    Py_DECREF(repunicode);
8878
0
                    return -1;
8879
0
                }
8880
0
            memcpy((char*)PyBytesWriter_GetData(writer) + *respos,
8881
0
                   PyBytes_AsString(repunicode),  repsize);
8882
0
            *respos += repsize;
8883
0
            *inpos = newpos;
8884
0
            Py_DECREF(repunicode);
8885
0
            break;
8886
0
        }
8887
        /* generate replacement  */
8888
0
        repsize = PyUnicode_GET_LENGTH(repunicode);
8889
0
        data = PyUnicode_DATA(repunicode);
8890
0
        kind = PyUnicode_KIND(repunicode);
8891
0
        for (index = 0; index < repsize; index++) {
8892
0
            Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8893
0
            x = charmapencode_output(repch, mapping, writer, respos);
8894
0
            if (x==enc_EXCEPTION) {
8895
0
                Py_DECREF(repunicode);
8896
0
                return -1;
8897
0
            }
8898
0
            else if (x==enc_FAILED) {
8899
0
                Py_DECREF(repunicode);
8900
0
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8901
0
                return -1;
8902
0
            }
8903
0
        }
8904
0
        *inpos = newpos;
8905
0
        Py_DECREF(repunicode);
8906
0
    }
8907
0
    return 0;
8908
0
}
8909
8910
/* Encode unicode to a bytes object using the character mapping `mapping`.

   A NULL mapping selects Latin-1 encoding.  Characters the mapping cannot
   encode are passed to charmap_encoding_error(), which applies the error
   handler named by errors.

   Returns the encoded bytes object, or NULL on error. */
PyObject *
_PyUnicode_EncodeCharmap(PyObject *unicode,
                         PyObject *mapping,
                         const char *errors)
{
    /* Default to Latin-1 */
    if (mapping == NULL) {
        return unicode_encode_ucs1(unicode, errors, 256);
    }

    const Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
    if (size == 0) {
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
    }
    const int kind = PyUnicode_KIND(unicode);
    const void *data = PyUnicode_DATA(unicode);

    /* error handling state shared across charmap_encoding_error() calls */
    PyObject *exc = NULL;
    PyObject *error_handler_obj = NULL;
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;

    /* current input and output positions */
    Py_ssize_t inpos = 0;
    Py_ssize_t respos = 0;

    /* Start with room for one byte per character; the writer is resized
       when replacements need more. */
    PyBytesWriter *writer = PyBytesWriter_Create(size);
    if (writer == NULL) {
        goto onError;
    }

    if (Py_IS_TYPE(mapping, &EncodingMapType)) {
        /* The data pointer and capacity are cached locally and refreshed
           after every operation that may resize the writer. */
        char *outstart = _PyBytesWriter_GetData(writer);
        Py_ssize_t outsize = _PyBytesWriter_GetSize(writer);

        while (inpos < size) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
            int res = encoding_map_lookup(ch, mapping);

            if (res == -1) {
                /* unencodable character: the handler moves inpos itself */
                if (charmap_encoding_error(unicode, &inpos, mapping,
                                           &exc,
                                           &error_handler, &error_handler_obj, errors,
                                           writer, &respos)) {
                    goto onError;
                }
                outstart = _PyBytesWriter_GetData(writer);
                outsize = _PyBytesWriter_GetSize(writer);
                continue;
            }

            Py_ssize_t requiredsize = respos + 1;
            if (outsize < requiredsize) {
                if (charmapencode_resize(writer, &respos, requiredsize)) {
                    goto onError;
                }
                outstart = _PyBytesWriter_GetData(writer);
                outsize = _PyBytesWriter_GetSize(writer);
            }
            outstart[respos++] = (char)res;

            /* done with this character => adjust input position */
            ++inpos;
        }
    }
    else {
        while (inpos < size) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
            charmapencode_result x = charmapencode_output(ch, mapping, writer, &respos);

            if (x == enc_EXCEPTION) {
                goto onError;
            }
            if (x != enc_FAILED) {
                /* done with this character => adjust input position */
                ++inpos;
                continue;
            }
            /* unencodable character: the handler moves inpos itself */
            if (charmap_encoding_error(unicode, &inpos, mapping,
                                       &exc,
                                       &error_handler, &error_handler_obj, errors,
                                       writer, &respos)) {
                goto onError;
            }
        }
    }

    Py_XDECREF(exc);
    Py_XDECREF(error_handler_obj);

    /* Finish with the number of bytes actually written. */
    return PyBytesWriter_FinishWithSize(writer, respos);

  onError:
    PyBytesWriter_Discard(writer);
    Py_XDECREF(exc);
    Py_XDECREF(error_handler_obj);
    return NULL;
}
9018
9019
/* Encode a str object with the given character mapping, with a NULL
   errors argument.  Both a non-str `unicode` and a NULL `mapping` are
   rejected via PyErr_BadArgument() and yield NULL. */
PyObject *
PyUnicode_AsCharmapString(PyObject *unicode,
                          PyObject *mapping)
{
    if (PyUnicode_Check(unicode) && mapping != NULL) {
        return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
    }
    PyErr_BadArgument();
    return NULL;
}
9029
9030
/* create or adjust a UnicodeTranslateError */
9031
static void
9032
make_translate_exception(PyObject **exceptionObject,
9033
                         PyObject *unicode,
9034
                         Py_ssize_t startpos, Py_ssize_t endpos,
9035
                         const char *reason)
9036
0
{
9037
0
    if (*exceptionObject == NULL) {
9038
0
        *exceptionObject = _PyUnicodeTranslateError_Create(
9039
0
            unicode, startpos, endpos, reason);
9040
0
    }
9041
0
    else {
9042
0
        if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
9043
0
            goto onError;
9044
0
        if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
9045
0
            goto onError;
9046
0
        if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
9047
0
            goto onError;
9048
0
        return;
9049
0
      onError:
9050
0
        Py_CLEAR(*exceptionObject);
9051
0
    }
9052
0
}
9053
9054
/* error handling callback helper:
9055
   build arguments, call the callback and check the arguments,
9056
   put the result into newpos and return the replacement string, which
9057
   has to be freed by the caller */
9058
static PyObject *
9059
unicode_translate_call_errorhandler(const char *errors,
9060
                                    PyObject **errorHandler,
9061
                                    const char *reason,
9062
                                    PyObject *unicode, PyObject **exceptionObject,
9063
                                    Py_ssize_t startpos, Py_ssize_t endpos,
9064
                                    Py_ssize_t *newpos)
9065
0
{
9066
0
    static const char *argparse = "Un;translating error handler must return (str, int) tuple";
9067
9068
0
    Py_ssize_t i_newpos;
9069
0
    PyObject *restuple;
9070
0
    PyObject *resunicode;
9071
9072
0
    if (*errorHandler == NULL) {
9073
0
        *errorHandler = PyCodec_LookupError(errors);
9074
0
        if (*errorHandler == NULL)
9075
0
            return NULL;
9076
0
    }
9077
9078
0
    make_translate_exception(exceptionObject,
9079
0
                             unicode, startpos, endpos, reason);
9080
0
    if (*exceptionObject == NULL)
9081
0
        return NULL;
9082
9083
0
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
9084
0
    if (restuple == NULL)
9085
0
        return NULL;
9086
0
    if (!PyTuple_Check(restuple)) {
9087
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
9088
0
        Py_DECREF(restuple);
9089
0
        return NULL;
9090
0
    }
9091
0
    if (!PyArg_ParseTuple(restuple, argparse,
9092
0
                          &resunicode, &i_newpos)) {
9093
0
        Py_DECREF(restuple);
9094
0
        return NULL;
9095
0
    }
9096
0
    if (i_newpos<0)
9097
0
        *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
9098
0
    else
9099
0
        *newpos = i_newpos;
9100
0
    if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
9101
0
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
9102
0
        Py_DECREF(restuple);
9103
0
        return NULL;
9104
0
    }
9105
0
    Py_INCREF(resunicode);
9106
0
    Py_DECREF(restuple);
9107
0
    return resunicode;
9108
0
}
9109
9110
/* Lookup the character ch in the mapping and put the result in result,
9111
   which must be decrefed by the caller.
9112
   The result can be PyLong, PyUnicode, None or NULL.
9113
   If the result is PyLong, put its value in replace.
9114
   Return 0 on success, -1 on error */
9115
static int
9116
charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result, Py_UCS4 *replace)
9117
15.6k
{
9118
15.6k
    PyObject *w = PyLong_FromLong((long)c);
9119
15.6k
    PyObject *x;
9120
9121
15.6k
    if (w == NULL)
9122
0
        return -1;
9123
15.6k
    int rc = PyMapping_GetOptionalItem(mapping, w, &x);
9124
15.6k
    Py_DECREF(w);
9125
15.6k
    if (rc == 0) {
9126
        /* No mapping found means: use 1:1 mapping. */
9127
5.28k
        *result = NULL;
9128
5.28k
        return 0;
9129
5.28k
    }
9130
10.4k
    if (x == NULL) {
9131
0
        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
9132
            /* No mapping found means: use 1:1 mapping. */
9133
0
            PyErr_Clear();
9134
0
            *result = NULL;
9135
0
            return 0;
9136
0
        } else
9137
0
            return -1;
9138
0
    }
9139
10.4k
    else if (x == Py_None) {
9140
0
        *result = x;
9141
0
        return 0;
9142
0
    }
9143
10.4k
    else if (PyLong_Check(x)) {
9144
0
        long value = PyLong_AsLong(x);
9145
0
        if (value < 0 || value > MAX_UNICODE) {
9146
0
            PyErr_Format(PyExc_ValueError,
9147
0
                         "character mapping must be in range(0x%x)",
9148
0
                         MAX_UNICODE+1);
9149
0
            Py_DECREF(x);
9150
0
            return -1;
9151
0
        }
9152
0
        *result = x;
9153
0
        *replace = (Py_UCS4)value;
9154
0
        return 0;
9155
0
    }
9156
10.4k
    else if (PyUnicode_Check(x)) {
9157
10.4k
        *result = x;
9158
10.4k
        return 0;
9159
10.4k
    }
9160
0
    else {
9161
        /* wrong return value */
9162
0
        PyErr_SetString(PyExc_TypeError,
9163
0
                        "character mapping must return integer, None or str");
9164
0
        Py_DECREF(x);
9165
0
        return -1;
9166
0
    }
9167
10.4k
}
9168
9169
/* lookup the character, write the result into the writer.
9170
   Return 1 if the result was written into the writer, return 0 if the mapping
9171
   was undefined, raise an exception return -1 on error. */
9172
static int
9173
charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
9174
                        _PyUnicodeWriter *writer)
9175
5.32k
{
9176
5.32k
    PyObject *item;
9177
5.32k
    Py_UCS4 replace;
9178
9179
5.32k
    if (charmaptranslate_lookup(ch, mapping, &item, &replace))
9180
0
        return -1;
9181
9182
5.32k
    if (item == NULL) {
9183
        /* not found => default to 1:1 mapping */
9184
85
        if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
9185
0
            return -1;
9186
0
        }
9187
85
        return 1;
9188
85
    }
9189
9190
5.23k
    if (item == Py_None) {
9191
0
        Py_DECREF(item);
9192
0
        return 0;
9193
0
    }
9194
9195
5.23k
    if (PyLong_Check(item)) {
9196
0
        if (_PyUnicodeWriter_WriteCharInline(writer, replace) < 0) {
9197
0
            Py_DECREF(item);
9198
0
            return -1;
9199
0
        }
9200
0
        Py_DECREF(item);
9201
0
        return 1;
9202
0
    }
9203
9204
5.23k
    if (!PyUnicode_Check(item)) {
9205
0
        Py_DECREF(item);
9206
0
        return -1;
9207
0
    }
9208
9209
5.23k
    if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
9210
0
        Py_DECREF(item);
9211
0
        return -1;
9212
0
    }
9213
9214
5.23k
    Py_DECREF(item);
9215
5.23k
    return 1;
9216
5.23k
}
9217
9218
static int
9219
unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
9220
                              Py_UCS1 *translate)
9221
10.3k
{
9222
10.3k
    PyObject *item = NULL;
9223
10.3k
    Py_UCS4 replace;
9224
10.3k
    int ret = 0;
9225
9226
10.3k
    if (charmaptranslate_lookup(ch, mapping, &item, &replace)) {
9227
0
        return -1;
9228
0
    }
9229
9230
10.3k
    if (item == Py_None) {
9231
        /* deletion */
9232
0
        translate[ch] = 0xfe;
9233
0
    }
9234
10.3k
    else if (item == NULL) {
9235
        /* not found => default to 1:1 mapping */
9236
5.20k
        translate[ch] = ch;
9237
5.20k
        return 1;
9238
5.20k
    }
9239
5.16k
    else if (PyLong_Check(item)) {
9240
0
        if (replace > 127) {
9241
            /* invalid character or character outside ASCII:
9242
               skip the fast translate */
9243
0
            goto exit;
9244
0
        }
9245
0
        translate[ch] = (Py_UCS1)replace;
9246
0
    }
9247
5.16k
    else if (PyUnicode_Check(item)) {
9248
5.16k
        if (PyUnicode_GET_LENGTH(item) != 1)
9249
5.16k
            goto exit;
9250
9251
0
        replace = PyUnicode_READ_CHAR(item, 0);
9252
0
        if (replace > 127)
9253
0
            goto exit;
9254
0
        translate[ch] = (Py_UCS1)replace;
9255
0
    }
9256
0
    else {
9257
        /* not None, NULL, long or unicode */
9258
0
        goto exit;
9259
0
    }
9260
0
    ret = 1;
9261
9262
5.16k
  exit:
9263
5.16k
    Py_DECREF(item);
9264
5.16k
    return ret;
9265
0
}
9266
9267
/* Fast path for ascii => ascii translation. Return 1 if the whole string
9268
   was translated into writer, return 0 if the input string was partially
9269
   translated into writer, raise an exception and return -1 on error. */
9270
static int
9271
unicode_fast_translate(PyObject *input, PyObject *mapping,
9272
                       _PyUnicodeWriter *writer, int ignore,
9273
                       Py_ssize_t *input_pos)
9274
10.3k
{
9275
10.3k
    Py_UCS1 ascii_table[128], ch, ch2;
9276
10.3k
    Py_ssize_t len;
9277
10.3k
    const Py_UCS1 *in, *end;
9278
10.3k
    Py_UCS1 *out;
9279
10.3k
    int res = 0;
9280
9281
10.3k
    len = PyUnicode_GET_LENGTH(input);
9282
9283
10.3k
    memset(ascii_table, 0xff, 128);
9284
9285
10.3k
    in = PyUnicode_1BYTE_DATA(input);
9286
10.3k
    end = in + len;
9287
9288
10.3k
    assert(PyUnicode_IS_ASCII(writer->buffer));
9289
10.3k
    assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9290
10.3k
    out = PyUnicode_1BYTE_DATA(writer->buffer);
9291
9292
15.5k
    for (; in < end; in++) {
9293
10.3k
        ch = *in;
9294
10.3k
        ch2 = ascii_table[ch];
9295
10.3k
        if (ch2 == 0xff) {
9296
10.3k
            int translate = unicode_fast_translate_lookup(mapping, ch,
9297
10.3k
                                                          ascii_table);
9298
10.3k
            if (translate < 0)
9299
0
                return -1;
9300
10.3k
            if (translate == 0)
9301
5.16k
                goto exit;
9302
5.20k
            ch2 = ascii_table[ch];
9303
5.20k
        }
9304
5.22k
        if (ch2 == 0xfe) {
9305
0
            if (ignore)
9306
0
                continue;
9307
0
            goto exit;
9308
0
        }
9309
5.22k
        assert(ch2 < 128);
9310
5.22k
        *out = ch2;
9311
5.22k
        out++;
9312
5.22k
    }
9313
5.15k
    res = 1;
9314
9315
10.3k
exit:
9316
10.3k
    writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
9317
10.3k
    *input_pos = in - PyUnicode_1BYTE_DATA(input);
9318
10.3k
    return res;
9319
5.15k
}
9320
9321
/* Translate the string `input` through `mapping` and return the result
   as a new string, or NULL on error.

   ASCII input first goes through unicode_fast_translate(); whatever it
   leaves untranslated is handled character by character.  A run of
   characters that map to None is skipped when errors is "ignore", and
   otherwise handed to the error handler selected by `errors`. */
static PyObject *
_PyUnicode_TranslateCharmap(PyObject *input,
                            PyObject *mapping,
                            const char *errors)
{
    const char *reason = "character maps to <undefined>";
    /* error handler state, created on first use */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    /* output buffer */
    _PyUnicodeWriter writer;
    Py_ssize_t i;
    int ignore;

    if (mapping == NULL) {
        PyErr_BadArgument();
        return NULL;
    }

    const int kind = PyUnicode_KIND(input);
    const void *data = PyUnicode_DATA(input);
    const Py_ssize_t size = PyUnicode_GET_LENGTH(input);
    if (size == 0) {
        return PyUnicode_FromObject(input);
    }

    /* allocate enough for a simple 1:1 translation without
       replacements, if we need more, we'll resize */
    _PyUnicodeWriter_Init(&writer);
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1) {
        goto onError;
    }

    ignore = (errors != NULL && strcmp(errors, "ignore") == 0);

    if (!PyUnicode_IS_ASCII(input)) {
        i = 0;
    }
    else {
        /* the fast path stores in i the input index at which it stopped */
        int res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
        if (res < 0) {
            _PyUnicodeWriter_Dealloc(&writer);
            return NULL;
        }
        if (res == 1) {
            return _PyUnicodeWriter_Finish(&writer);
        }
    }

    while (i < size) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
        int translate = charmaptranslate_output(ch, mapping, &writer);
        if (translate < 0) {
            goto onError;
        }
        if (translate != 0) {
            /* it worked => adjust input pointer */
            ++i;
            continue;
        }

        /* untranslatable character: collect the whole run
           [collstart, collend) of characters mapped to None */
        Py_ssize_t collstart = i;
        Py_ssize_t collend = i + 1;
        for (; collend < size; ++collend) {
            PyObject *x;
            Py_UCS4 replace;
            ch = PyUnicode_READ(kind, data, collend);
            if (charmaptranslate_lookup(ch, mapping, &x, &replace)) {
                goto onError;
            }
            /* only the identity of x is needed below */
            Py_XDECREF(x);
            if (x != Py_None) {
                break;
            }
        }

        if (ignore) {
            i = collend;
            continue;
        }

        Py_ssize_t newpos;
        PyObject *repunicode = unicode_translate_call_errorhandler(
            errors, &errorHandler,
            reason, input, &exc,
            collstart, collend, &newpos);
        if (repunicode == NULL) {
            goto onError;
        }
        int failed = (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0);
        Py_DECREF(repunicode);
        if (failed) {
            goto onError;
        }
        i = newpos;
    }

    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return NULL;
}
9437
9438
/* Translate `str` through `mapping` using the error handler named by
   `errors`.  Returns NULL when ensure_unicode() rejects `str`; otherwise
   returns whatever _PyUnicode_TranslateCharmap() produces. */
PyObject *
PyUnicode_Translate(PyObject *str,
                    PyObject *mapping,
                    const char *errors)
{
    if (ensure_unicode(str) < 0) {
        return NULL;
    }

    PyObject *translated = _PyUnicode_TranslateCharmap(str, mapping, errors);
    return translated;
}
9447
9448
/* Return an ASCII string derived from `unicode`.

   An ASCII input is returned unchanged, as a new reference.  Otherwise
   each character below 127 is copied, each space character becomes ' ',
   and each character with a decimal value becomes the matching ASCII
   digit.  The first character that fits none of these is replaced by '?'
   and the result is cut off right after it.

   Returns NULL with an exception set if `unicode` is not a str or the
   result cannot be allocated. */
PyObject *
_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadInternalCall();
        return NULL;
    }
    if (PyUnicode_IS_ASCII(unicode)) {
        /* If the string is already ASCII, just return the same string */
        return Py_NewRef(unicode);
    }

    const Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
    PyObject *result = PyUnicode_New(len, 127);
    if (result == NULL) {
        return NULL;
    }

    const int kind = PyUnicode_KIND(unicode);
    const void *data = PyUnicode_DATA(unicode);
    Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);

    for (Py_ssize_t i = 0; i < len; ++i) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);

        if (ch < 127) {
            out[i] = ch;
            continue;
        }
        if (Py_UNICODE_ISSPACE(ch)) {
            out[i] = ' ';
            continue;
        }

        int decimal = Py_UNICODE_TODECIMAL(ch);
        if (decimal >= 0) {
            out[i] = '0' + decimal;
            continue;
        }

        /* not convertible: mark the spot and truncate the result here */
        out[i] = '?';
        out[i+1] = '\0';
        _PyUnicode_LENGTH(result) = i + 1;
        break;
    }

    assert(_PyUnicode_CheckConsistency(result, 1));
    return result;
}
9493
9494
/* --- Helpers ------------------------------------------------------------ */
9495
9496
/* helper macro to fixup start/end slice values.

   `end` is clamped to `len`; a negative `start` or `end` has `len` added
   to it and is then floored at 0.  `start` and `end` must be modifiable
   lvalues and are updated in place.  Every argument is evaluated more
   than once, so arguments must not have side effects.  The parameters
   are parenthesized so that expression arguments (for instance a
   conditional expression passed as `len`) are parsed as intended. */
#define ADJUST_INDICES(start, end, len) \
    do {                                \
        if ((end) > (len)) {            \
            (end) = (len);              \
        }                               \
        else if ((end) < 0) {           \
            (end) += (len);             \
            if ((end) < 0) {            \
                (end) = 0;              \
            }                           \
        }                               \
        if ((start) < 0) {              \
            (start) += (len);           \
            if ((start) < 0) {          \
                (start) = 0;            \
            }                           \
        }                               \
    } while (0)
9515
9516
static Py_ssize_t
9517
any_find_slice(PyObject* s1, PyObject* s2,
9518
               Py_ssize_t start,
9519
               Py_ssize_t end,
9520
               int direction)
9521
28.4M
{
9522
28.4M
    int kind1, kind2;
9523
28.4M
    const void *buf1, *buf2;
9524
28.4M
    Py_ssize_t len1, len2, result;
9525
9526
28.4M
    kind1 = PyUnicode_KIND(s1);
9527
28.4M
    kind2 = PyUnicode_KIND(s2);
9528
28.4M
    if (kind1 < kind2)
9529
0
        return -1;
9530
9531
28.4M
    len1 = PyUnicode_GET_LENGTH(s1);
9532
28.4M
    len2 = PyUnicode_GET_LENGTH(s2);
9533
28.4M
    ADJUST_INDICES(start, end, len1);
9534
28.4M
    if (end - start < len2)
9535
2.75M
        return -1;
9536
9537
25.6M
    buf1 = PyUnicode_DATA(s1);
9538
25.6M
    buf2 = PyUnicode_DATA(s2);
9539
25.6M
    if (len2 == 1) {
9540
25.6M
        Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9541
25.6M
        result = findchar((const char *)buf1 + kind1*start,
9542
25.6M
                          kind1, end - start, ch, direction);
9543
25.6M
        if (result == -1)
9544
3.70M
            return -1;
9545
21.8M
        else
9546
21.8M
            return start + result;
9547
25.6M
    }
9548
9549
66.0k
    if (kind2 != kind1) {
9550
52.9k
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
9551
52.9k
        if (!buf2)
9552
0
            return -2;
9553
52.9k
    }
9554
9555
66.0k
    if (direction > 0) {
9556
66.0k
        switch (kind1) {
9557
13.0k
        case PyUnicode_1BYTE_KIND:
9558
13.0k
            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9559
5.69k
                result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9560
7.33k
            else
9561
7.33k
                result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9562
13.0k
            break;
9563
35.9k
        case PyUnicode_2BYTE_KIND:
9564
35.9k
            result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9565
35.9k
            break;
9566
16.9k
        case PyUnicode_4BYTE_KIND:
9567
16.9k
            result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9568
16.9k
            break;
9569
0
        default:
9570
0
            Py_UNREACHABLE();
9571
66.0k
        }
9572
66.0k
    }
9573
0
    else {
9574
0
        switch (kind1) {
9575
0
        case PyUnicode_1BYTE_KIND:
9576
0
            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9577
0
                result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9578
0
            else
9579
0
                result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9580
0
            break;
9581
0
        case PyUnicode_2BYTE_KIND:
9582
0
            result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9583
0
            break;
9584
0
        case PyUnicode_4BYTE_KIND:
9585
0
            result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9586
0
            break;
9587
0
        default:
9588
0
            Py_UNREACHABLE();
9589
0
        }
9590
0
    }
9591
9592
66.0k
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
9593
66.0k
    if (kind2 != kind1)
9594
52.9k
        PyMem_Free((void *)buf2);
9595
9596
66.0k
    return result;
9597
66.0k
}
9598
9599
9600
Py_ssize_t
9601
PyUnicode_Count(PyObject *str,
9602
                PyObject *substr,
9603
                Py_ssize_t start,
9604
                Py_ssize_t end)
9605
0
{
9606
0
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9607
0
        return -1;
9608
9609
0
    return unicode_count_impl(str, substr, start, end);
9610
0
}
9611
9612
Py_ssize_t
9613
PyUnicode_Find(PyObject *str,
9614
               PyObject *substr,
9615
               Py_ssize_t start,
9616
               Py_ssize_t end,
9617
               int direction)
9618
0
{
9619
0
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9620
0
        return -2;
9621
9622
0
    return any_find_slice(str, substr, start, end, direction);
9623
0
}
9624
9625
Py_ssize_t
9626
PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9627
                   Py_ssize_t start, Py_ssize_t end,
9628
                   int direction)
9629
2.76M
{
9630
2.76M
    int kind;
9631
2.76M
    Py_ssize_t len, result;
9632
2.76M
    len = PyUnicode_GET_LENGTH(str);
9633
2.76M
    ADJUST_INDICES(start, end, len);
9634
2.76M
    if (end - start < 1)
9635
0
        return -1;
9636
2.76M
    kind = PyUnicode_KIND(str);
9637
2.76M
    result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9638
2.76M
                      kind, end-start, ch, direction);
9639
2.76M
    if (result == -1)
9640
1.87M
        return -1;
9641
889k
    else
9642
889k
        return start + result;
9643
2.76M
}
9644
9645
static int
9646
tailmatch(PyObject *self,
9647
          PyObject *substring,
9648
          Py_ssize_t start,
9649
          Py_ssize_t end,
9650
          int direction)
9651
102M
{
9652
102M
    int kind_self;
9653
102M
    int kind_sub;
9654
102M
    const void *data_self;
9655
102M
    const void *data_sub;
9656
102M
    Py_ssize_t offset;
9657
102M
    Py_ssize_t i;
9658
102M
    Py_ssize_t end_sub;
9659
9660
102M
    ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9661
102M
    end -= PyUnicode_GET_LENGTH(substring);
9662
102M
    if (end < start)
9663
9.87M
        return 0;
9664
9665
92.2M
    if (PyUnicode_GET_LENGTH(substring) == 0)
9666
0
        return 1;
9667
9668
92.2M
    kind_self = PyUnicode_KIND(self);
9669
92.2M
    data_self = PyUnicode_DATA(self);
9670
92.2M
    kind_sub = PyUnicode_KIND(substring);
9671
92.2M
    data_sub = PyUnicode_DATA(substring);
9672
92.2M
    end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9673
9674
92.2M
    if (direction > 0)
9675
7.53M
        offset = end;
9676
84.7M
    else
9677
84.7M
        offset = start;
9678
9679
92.2M
    if (PyUnicode_READ(kind_self, data_self, offset) ==
9680
92.2M
        PyUnicode_READ(kind_sub, data_sub, 0) &&
9681
45.3M
        PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9682
45.3M
        PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9683
        /* If both are of the same kind, memcmp is sufficient */
9684
16.3M
        if (kind_self == kind_sub) {
9685
9.68M
            return ! memcmp((char *)data_self +
9686
9.68M
                                (offset * PyUnicode_KIND(substring)),
9687
9.68M
                            data_sub,
9688
9.68M
                            PyUnicode_GET_LENGTH(substring) *
9689
9.68M
                                PyUnicode_KIND(substring));
9690
9.68M
        }
9691
        /* otherwise we have to compare each character by first accessing it */
9692
6.68M
        else {
9693
            /* We do not need to compare 0 and len(substring)-1 because
9694
               the if statement above ensured already that they are equal
9695
               when we end up here. */
9696
6.75M
            for (i = 1; i < end_sub; ++i) {
9697
76.3k
                if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9698
76.3k
                    PyUnicode_READ(kind_sub, data_sub, i))
9699
4.72k
                    return 0;
9700
76.3k
            }
9701
6.68M
            return 1;
9702
6.68M
        }
9703
16.3M
    }
9704
9705
75.9M
    return 0;
9706
92.2M
}
9707
9708
Py_ssize_t
9709
PyUnicode_Tailmatch(PyObject *str,
9710
                    PyObject *substr,
9711
                    Py_ssize_t start,
9712
                    Py_ssize_t end,
9713
                    int direction)
9714
0
{
9715
0
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9716
0
        return -1;
9717
9718
0
    return tailmatch(str, substr, start, end, direction);
9719
0
}
9720
9721
/* Build a new string of the same length as `self` (created with a maximum
 * character of 127) and fill it through _Py_bytes_lower() when `lower` is
 * non-zero, or _Py_bytes_upper() otherwise.
 *
 * Returns the new string, or NULL if PyUnicode_New() fails.
 * NOTE(review): the data of `self` is handed to the byte helpers as plain
 * chars, so callers presumably only pass ASCII strings -- confirm.
 */
static PyObject *
ascii_upper_or_lower(PyObject *self, int lower)
{
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const char *src = PyUnicode_DATA(self);
    PyObject *converted = PyUnicode_New(nchars, 127);
    char *dst;

    if (converted == NULL) {
        return NULL;
    }

    dst = PyUnicode_DATA(converted);
    if (!lower) {
        _Py_bytes_upper(dst, src, nchars);
    }
    else {
        _Py_bytes_lower(dst, src, nchars);
    }
    return converted;
}
9739
9740
static Py_UCS4
9741
handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
9742
155k
{
9743
155k
    Py_ssize_t j;
9744
155k
    int final_sigma;
9745
155k
    Py_UCS4 c = 0;   /* initialize to prevent gcc warning */
9746
    /* U+03A3 is in the Final_Sigma context when, it is found like this:
9747
9748
     \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9749
9750
    where ! is a negation and \p{xxx} is a character with property xxx.
9751
    */
9752
309k
    for (j = i - 1; j >= 0; j--) {
9753
306k
        c = PyUnicode_READ(kind, data, j);
9754
306k
        if (!_PyUnicode_IsCaseIgnorable(c))
9755
152k
            break;
9756
306k
    }
9757
155k
    final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9758
155k
    if (final_sigma) {
9759
239k
        for (j = i + 1; j < length; j++) {
9760
232k
            c = PyUnicode_READ(kind, data, j);
9761
232k
            if (!_PyUnicode_IsCaseIgnorable(c))
9762
105k
                break;
9763
232k
        }
9764
112k
        final_sigma = j == length || !_PyUnicode_IsCased(c);
9765
112k
    }
9766
155k
    return (final_sigma) ? 0x3C2 : 0x3C3;
9767
155k
}
9768
9769
static int
9770
lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
9771
           Py_UCS4 c, Py_UCS4 *mapped)
9772
86.0M
{
9773
    /* Obscure special case. */
9774
86.0M
    if (c == 0x3A3) {
9775
155k
        mapped[0] = handle_capital_sigma(kind, data, length, i);
9776
155k
        return 1;
9777
155k
    }
9778
85.8M
    return _PyUnicode_ToLowerFull(c, mapped);
9779
86.0M
}
9780
9781
static Py_ssize_t
9782
do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9783
0
{
9784
0
    Py_ssize_t i, k = 0;
9785
0
    int n_res, j;
9786
0
    Py_UCS4 c, mapped[3];
9787
9788
0
    c = PyUnicode_READ(kind, data, 0);
9789
0
    n_res = _PyUnicode_ToTitleFull(c, mapped);
9790
0
    for (j = 0; j < n_res; j++) {
9791
0
        *maxchar = Py_MAX(*maxchar, mapped[j]);
9792
0
        res[k++] = mapped[j];
9793
0
    }
9794
0
    for (i = 1; i < length; i++) {
9795
0
        c = PyUnicode_READ(kind, data, i);
9796
0
        n_res = lower_ucs4(kind, data, length, i, c, mapped);
9797
0
        for (j = 0; j < n_res; j++) {
9798
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9799
0
            res[k++] = mapped[j];
9800
0
        }
9801
0
    }
9802
0
    return k;
9803
0
}
9804
9805
static Py_ssize_t
9806
0
do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9807
0
    Py_ssize_t i, k = 0;
9808
9809
0
    for (i = 0; i < length; i++) {
9810
0
        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9811
0
        int n_res, j;
9812
0
        if (Py_UNICODE_ISUPPER(c)) {
9813
0
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9814
0
        }
9815
0
        else if (Py_UNICODE_ISLOWER(c)) {
9816
0
            n_res = _PyUnicode_ToUpperFull(c, mapped);
9817
0
        }
9818
0
        else {
9819
0
            n_res = 1;
9820
0
            mapped[0] = c;
9821
0
        }
9822
0
        for (j = 0; j < n_res; j++) {
9823
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9824
0
            res[k++] = mapped[j];
9825
0
        }
9826
0
    }
9827
0
    return k;
9828
0
}
9829
9830
static Py_ssize_t
9831
do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
9832
                  Py_UCS4 *maxchar, int lower)
9833
7.91M
{
9834
7.91M
    Py_ssize_t i, k = 0;
9835
9836
93.9M
    for (i = 0; i < length; i++) {
9837
86.0M
        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9838
86.0M
        int n_res, j;
9839
86.0M
        if (lower)
9840
86.0M
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9841
0
        else
9842
0
            n_res = _PyUnicode_ToUpperFull(c, mapped);
9843
172M
        for (j = 0; j < n_res; j++) {
9844
86.0M
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9845
86.0M
            res[k++] = mapped[j];
9846
86.0M
        }
9847
86.0M
    }
9848
7.91M
    return k;
9849
7.91M
}
9850
9851
static Py_ssize_t
9852
do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9853
0
{
9854
0
    return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9855
0
}
9856
9857
static Py_ssize_t
9858
do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9859
7.91M
{
9860
7.91M
    return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9861
7.91M
}
9862
9863
static Py_ssize_t
9864
do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9865
0
{
9866
0
    Py_ssize_t i, k = 0;
9867
9868
0
    for (i = 0; i < length; i++) {
9869
0
        Py_UCS4 c = PyUnicode_READ(kind, data, i);
9870
0
        Py_UCS4 mapped[3];
9871
0
        int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9872
0
        for (j = 0; j < n_res; j++) {
9873
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9874
0
            res[k++] = mapped[j];
9875
0
        }
9876
0
    }
9877
0
    return k;
9878
0
}
9879
9880
static Py_ssize_t
9881
do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9882
0
{
9883
0
    Py_ssize_t i, k = 0;
9884
0
    int previous_is_cased;
9885
9886
0
    previous_is_cased = 0;
9887
0
    for (i = 0; i < length; i++) {
9888
0
        const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9889
0
        Py_UCS4 mapped[3];
9890
0
        int n_res, j;
9891
9892
0
        if (previous_is_cased)
9893
0
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9894
0
        else
9895
0
            n_res = _PyUnicode_ToTitleFull(c, mapped);
9896
9897
0
        for (j = 0; j < n_res; j++) {
9898
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9899
0
            res[k++] = mapped[j];
9900
0
        }
9901
9902
0
        previous_is_cased = _PyUnicode_IsCased(c);
9903
0
    }
9904
0
    return k;
9905
0
}
9906
9907
static PyObject *
9908
case_operation(PyObject *self,
9909
               Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9910
7.91M
{
9911
7.91M
    PyObject *res = NULL;
9912
7.91M
    Py_ssize_t length, newlength = 0;
9913
7.91M
    int kind, outkind;
9914
7.91M
    const void *data;
9915
7.91M
    void *outdata;
9916
7.91M
    Py_UCS4 maxchar = 0, *tmp, *tmpend;
9917
9918
7.91M
    kind = PyUnicode_KIND(self);
9919
7.91M
    data = PyUnicode_DATA(self);
9920
7.91M
    length = PyUnicode_GET_LENGTH(self);
9921
7.91M
    if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
9922
0
        PyErr_SetString(PyExc_OverflowError, "string is too long");
9923
0
        return NULL;
9924
0
    }
9925
7.91M
    tmp = PyMem_Malloc(sizeof(Py_UCS4) * 3 * length);
9926
7.91M
    if (tmp == NULL)
9927
0
        return PyErr_NoMemory();
9928
7.91M
    newlength = perform(kind, data, length, tmp, &maxchar);
9929
7.91M
    res = PyUnicode_New(newlength, maxchar);
9930
7.91M
    if (res == NULL)
9931
0
        goto leave;
9932
7.91M
    tmpend = tmp + newlength;
9933
7.91M
    outdata = PyUnicode_DATA(res);
9934
7.91M
    outkind = PyUnicode_KIND(res);
9935
7.91M
    switch (outkind) {
9936
215k
    case PyUnicode_1BYTE_KIND:
9937
215k
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9938
215k
        break;
9939
7.65M
    case PyUnicode_2BYTE_KIND:
9940
7.65M
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9941
7.65M
        break;
9942
46.9k
    case PyUnicode_4BYTE_KIND:
9943
46.9k
        memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9944
46.9k
        break;
9945
0
    default:
9946
0
        Py_UNREACHABLE();
9947
7.91M
    }
9948
7.91M
  leave:
9949
7.91M
    PyMem_Free(tmp);
9950
7.91M
    return res;
9951
7.91M
}
9952
9953
/* Join the items of `seq` using `separator`.
 *
 * `seq` is first converted with PySequence_Fast(); if that fails NULL is
 * returned (the message passed to it is "can only join an iterable").
 * The items are then handed to _PyUnicode_JoinArray(), whose result is
 * returned as is.
 */
PyObject *
PyUnicode_Join(PyObject *separator, PyObject *seq)
{
    PyObject *joined;
    PyObject *fast = PySequence_Fast(seq, "can only join an iterable");

    if (fast == NULL) {
        return NULL;
    }

    /* NOTE(review): presumably this protects the sequence while its item
       array is being read -- confirm against the macro's definition. */
    Py_BEGIN_CRITICAL_SECTION_SEQUENCE_FAST(seq);

    joined = _PyUnicode_JoinArray(separator,
                                  PySequence_Fast_ITEMS(fast),
                                  PySequence_Fast_GET_SIZE(fast));

    Py_END_CRITICAL_SECTION_SEQUENCE_FAST();

    Py_DECREF(fast);
    return joined;
}
9977
9978
PyObject *
9979
_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
9980
74.5M
{
9981
74.5M
    PyObject *res = NULL; /* the result */
9982
74.5M
    PyObject *sep = NULL;
9983
74.5M
    Py_ssize_t seplen;
9984
74.5M
    PyObject *item;
9985
74.5M
    Py_ssize_t sz, i, res_offset;
9986
74.5M
    Py_UCS4 maxchar;
9987
74.5M
    Py_UCS4 item_maxchar;
9988
74.5M
    int use_memcpy;
9989
74.5M
    unsigned char *res_data = NULL, *sep_data = NULL;
9990
74.5M
    PyObject *last_obj;
9991
74.5M
    int kind = 0;
9992
9993
    /* If empty sequence, return u"". */
9994
74.5M
    if (seqlen == 0) {
9995
7.11M
        _Py_RETURN_UNICODE_EMPTY();
9996
7.11M
    }
9997
9998
    /* If singleton sequence with an exact Unicode, return that. */
9999
67.4M
    last_obj = NULL;
10000
67.4M
    if (seqlen == 1) {
10001
12.2M
        if (PyUnicode_CheckExact(items[0])) {
10002
10.7M
            res = items[0];
10003
10.7M
            return Py_NewRef(res);
10004
10.7M
        }
10005
1.49M
        seplen = 0;
10006
1.49M
        maxchar = 0;
10007
1.49M
    }
10008
55.1M
    else {
10009
        /* Set up sep and seplen */
10010
55.1M
        if (separator == NULL) {
10011
            /* fall back to a blank space separator */
10012
0
            sep = PyUnicode_FromOrdinal(' ');
10013
0
            if (!sep)
10014
0
                goto onError;
10015
0
            seplen = 1;
10016
0
            maxchar = 32;
10017
0
        }
10018
55.1M
        else {
10019
55.1M
            if (!PyUnicode_Check(separator)) {
10020
0
                PyErr_Format(PyExc_TypeError,
10021
0
                             "separator: expected str instance,"
10022
0
                             " %.80s found",
10023
0
                             Py_TYPE(separator)->tp_name);
10024
0
                goto onError;
10025
0
            }
10026
55.1M
            sep = separator;
10027
55.1M
            seplen = PyUnicode_GET_LENGTH(separator);
10028
55.1M
            maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10029
            /* inc refcount to keep this code path symmetric with the
10030
               above case of a blank separator */
10031
55.1M
            Py_INCREF(sep);
10032
55.1M
        }
10033
55.1M
        last_obj = sep;
10034
55.1M
    }
10035
10036
    /* There are at least two things to join, or else we have a subclass
10037
     * of str in the sequence.
10038
     * Do a pre-pass to figure out the total amount of space we'll
10039
     * need (sz), and see whether all argument are strings.
10040
     */
10041
56.6M
    sz = 0;
10042
#ifdef Py_DEBUG
10043
    use_memcpy = 0;
10044
#else
10045
56.6M
    use_memcpy = 1;
10046
56.6M
#endif
10047
428M
    for (i = 0; i < seqlen; i++) {
10048
371M
        size_t add_sz;
10049
371M
        item = items[i];
10050
371M
        if (!PyUnicode_Check(item)) {
10051
0
            PyErr_Format(PyExc_TypeError,
10052
0
                         "sequence item %zd: expected str instance,"
10053
0
                         " %.80s found",
10054
0
                         i, Py_TYPE(item)->tp_name);
10055
0
            goto onError;
10056
0
        }
10057
371M
        add_sz = PyUnicode_GET_LENGTH(item);
10058
371M
        item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
10059
371M
        maxchar = Py_MAX(maxchar, item_maxchar);
10060
371M
        if (i != 0) {
10061
314M
            add_sz += seplen;
10062
314M
        }
10063
371M
        if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
10064
0
            PyErr_SetString(PyExc_OverflowError,
10065
0
                            "join() result is too long for a Python string");
10066
0
            goto onError;
10067
0
        }
10068
371M
        sz += add_sz;
10069
371M
        if (use_memcpy && last_obj != NULL) {
10070
303M
            if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10071
6.13M
                use_memcpy = 0;
10072
303M
        }
10073
371M
        last_obj = item;
10074
371M
    }
10075
10076
56.6M
    res = PyUnicode_New(sz, maxchar);
10077
56.6M
    if (res == NULL)
10078
0
        goto onError;
10079
10080
    /* Catenate everything. */
10081
#ifdef Py_DEBUG
10082
    use_memcpy = 0;
10083
#else
10084
56.6M
    if (use_memcpy) {
10085
50.5M
        res_data = PyUnicode_1BYTE_DATA(res);
10086
50.5M
        kind = PyUnicode_KIND(res);
10087
50.5M
        if (seplen != 0)
10088
149k
            sep_data = PyUnicode_1BYTE_DATA(sep);
10089
50.5M
    }
10090
56.6M
#endif
10091
56.6M
    if (use_memcpy) {
10092
323M
        for (i = 0; i < seqlen; ++i) {
10093
272M
            Py_ssize_t itemlen;
10094
272M
            item = items[i];
10095
10096
            /* Copy item, and maybe the separator. */
10097
272M
            if (i && seplen != 0) {
10098
671k
                memcpy(res_data,
10099
671k
                          sep_data,
10100
671k
                          kind * seplen);
10101
671k
                res_data += kind * seplen;
10102
671k
            }
10103
10104
272M
            itemlen = PyUnicode_GET_LENGTH(item);
10105
272M
            if (itemlen != 0) {
10106
237M
                memcpy(res_data,
10107
237M
                          PyUnicode_DATA(item),
10108
237M
                          kind * itemlen);
10109
237M
                res_data += kind * itemlen;
10110
237M
            }
10111
272M
        }
10112
50.5M
        assert(res_data == PyUnicode_1BYTE_DATA(res)
10113
50.5M
                           + kind * PyUnicode_GET_LENGTH(res));
10114
50.5M
    }
10115
6.13M
    else {
10116
105M
        for (i = 0, res_offset = 0; i < seqlen; ++i) {
10117
98.8M
            Py_ssize_t itemlen;
10118
98.8M
            item = items[i];
10119
10120
            /* Copy item, and maybe the separator. */
10121
98.8M
            if (i && seplen != 0) {
10122
718k
                _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10123
718k
                res_offset += seplen;
10124
718k
            }
10125
10126
98.8M
            itemlen = PyUnicode_GET_LENGTH(item);
10127
98.8M
            if (itemlen != 0) {
10128
96.0M
                _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
10129
96.0M
                res_offset += itemlen;
10130
96.0M
            }
10131
98.8M
        }
10132
6.13M
        assert(res_offset == PyUnicode_GET_LENGTH(res));
10133
6.13M
    }
10134
10135
56.6M
    Py_XDECREF(sep);
10136
56.6M
    assert(_PyUnicode_CheckConsistency(res, 1));
10137
56.6M
    return res;
10138
10139
0
  onError:
10140
0
    Py_XDECREF(sep);
10141
0
    Py_XDECREF(res);
10142
0
    return NULL;
10143
56.6M
}
10144
10145
void
10146
_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10147
                    Py_UCS4 fill_char)
10148
617
{
10149
617
    const int kind = PyUnicode_KIND(unicode);
10150
617
    void *data = PyUnicode_DATA(unicode);
10151
617
    assert(_PyUnicode_IsModifiable(unicode));
10152
617
    assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10153
617
    assert(start >= 0);
10154
617
    assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10155
617
    _PyUnicode_Fill(kind, data, fill_char, start, length);
10156
617
}
10157
10158
Py_ssize_t
10159
PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10160
               Py_UCS4 fill_char)
10161
617
{
10162
617
    Py_ssize_t maxlen;
10163
10164
617
    if (!PyUnicode_Check(unicode)) {
10165
0
        PyErr_BadInternalCall();
10166
0
        return -1;
10167
0
    }
10168
617
    if (unicode_check_modifiable(unicode))
10169
0
        return -1;
10170
10171
617
    if (start < 0) {
10172
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
10173
0
        return -1;
10174
0
    }
10175
617
    if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10176
0
        PyErr_SetString(PyExc_ValueError,
10177
0
                         "fill character is bigger than "
10178
0
                         "the string maximum character");
10179
0
        return -1;
10180
0
    }
10181
10182
617
    maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10183
617
    length = Py_MIN(maxlen, length);
10184
617
    if (length <= 0)
10185
0
        return 0;
10186
10187
617
    _PyUnicode_FastFill(unicode, start, length, fill_char);
10188
617
    return length;
10189
617
}
10190
10191
static PyObject *
10192
pad(PyObject *self,
10193
    Py_ssize_t left,
10194
    Py_ssize_t right,
10195
    Py_UCS4 fill)
10196
0
{
10197
0
    PyObject *u;
10198
0
    Py_UCS4 maxchar;
10199
0
    int kind;
10200
0
    void *data;
10201
10202
0
    if (left < 0)
10203
0
        left = 0;
10204
0
    if (right < 0)
10205
0
        right = 0;
10206
10207
0
    if (left == 0 && right == 0)
10208
0
        return unicode_result_unchanged(self);
10209
10210
0
    if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10211
0
        right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
10212
0
        PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10213
0
        return NULL;
10214
0
    }
10215
0
    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10216
0
    maxchar = Py_MAX(maxchar, fill);
10217
0
    u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
10218
0
    if (!u)
10219
0
        return NULL;
10220
10221
0
    kind = PyUnicode_KIND(u);
10222
0
    data = PyUnicode_DATA(u);
10223
0
    if (left)
10224
0
        _PyUnicode_Fill(kind, data, fill, 0, left);
10225
0
    if (right)
10226
0
        _PyUnicode_Fill(kind, data, fill,
10227
0
                        left + _PyUnicode_LENGTH(self), right);
10228
0
    _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
10229
0
    assert(_PyUnicode_CheckConsistency(u, 1));
10230
0
    return u;
10231
0
}
10232
10233
/* Split `string` into lines by dispatching to the *_splitlines routine
 * that matches its kind (the ASCII variant is used for 1-byte strings for
 * which PyUnicode_IS_ASCII() holds).  `keepends` is passed through.
 *
 * Returns the result of the selected routine, or NULL when
 * ensure_unicode() rejects `string`.
 */
PyObject *
PyUnicode_Splitlines(PyObject *string, int keepends)
{
    PyObject *lines;
    Py_ssize_t length;

    if (ensure_unicode(string) < 0) {
        return NULL;
    }
    length = PyUnicode_GET_LENGTH(string);

    switch (PyUnicode_KIND(string)) {
    case PyUnicode_1BYTE_KIND:
        if (!PyUnicode_IS_ASCII(string)) {
            lines = ucs1lib_splitlines(
                string, PyUnicode_1BYTE_DATA(string), length, keepends);
        }
        else {
            lines = asciilib_splitlines(
                string, PyUnicode_1BYTE_DATA(string), length, keepends);
        }
        break;
    case PyUnicode_2BYTE_KIND:
        lines = ucs2lib_splitlines(
            string, PyUnicode_2BYTE_DATA(string), length, keepends);
        break;
    case PyUnicode_4BYTE_KIND:
        lines = ucs4lib_splitlines(
            string, PyUnicode_4BYTE_DATA(string), length, keepends);
        break;
    default:
        Py_UNREACHABLE();
    }
    return lines;
}
10267
10268
static PyObject *
10269
split(PyObject *self,
10270
      PyObject *substring,
10271
      Py_ssize_t maxcount)
10272
23.2M
{
10273
23.2M
    int kind1, kind2;
10274
23.2M
    const void *buf1, *buf2;
10275
23.2M
    Py_ssize_t len1, len2;
10276
23.2M
    PyObject* out;
10277
23.2M
    len1 = PyUnicode_GET_LENGTH(self);
10278
23.2M
    kind1 = PyUnicode_KIND(self);
10279
10280
23.2M
    if (substring == NULL) {
10281
181k
        if (maxcount < 0) {
10282
156k
            maxcount = (len1 - 1) / 2 + 1;
10283
156k
        }
10284
181k
        switch (kind1) {
10285
119k
        case PyUnicode_1BYTE_KIND:
10286
119k
            if (PyUnicode_IS_ASCII(self))
10287
93.7k
                return asciilib_split_whitespace(
10288
93.7k
                    self,  PyUnicode_1BYTE_DATA(self),
10289
93.7k
                    len1, maxcount
10290
93.7k
                    );
10291
25.5k
            else
10292
25.5k
                return ucs1lib_split_whitespace(
10293
25.5k
                    self,  PyUnicode_1BYTE_DATA(self),
10294
25.5k
                    len1, maxcount
10295
25.5k
                    );
10296
51.3k
        case PyUnicode_2BYTE_KIND:
10297
51.3k
            return ucs2lib_split_whitespace(
10298
51.3k
                self,  PyUnicode_2BYTE_DATA(self),
10299
51.3k
                len1, maxcount
10300
51.3k
                );
10301
11.0k
        case PyUnicode_4BYTE_KIND:
10302
11.0k
            return ucs4lib_split_whitespace(
10303
11.0k
                self,  PyUnicode_4BYTE_DATA(self),
10304
11.0k
                len1, maxcount
10305
11.0k
                );
10306
0
        default:
10307
0
            Py_UNREACHABLE();
10308
181k
        }
10309
181k
    }
10310
10311
23.0M
    kind2 = PyUnicode_KIND(substring);
10312
23.0M
    len2 = PyUnicode_GET_LENGTH(substring);
10313
23.0M
    if (maxcount < 0) {
10314
        // if len2 == 0, it will raise ValueError.
10315
15.5M
        maxcount = len2 == 0 ? 0 : (len1 / len2) + 1;
10316
        // handle expected overflow case: (Py_SSIZE_T_MAX / 1) + 1
10317
15.5M
        maxcount = maxcount < 0 ? len1 : maxcount;
10318
15.5M
    }
10319
23.0M
    if (kind1 < kind2 || len1 < len2) {
10320
2.45M
        out = PyList_New(1);
10321
2.45M
        if (out == NULL)
10322
0
            return NULL;
10323
2.45M
        PyList_SET_ITEM(out, 0, Py_NewRef(self));
10324
2.45M
        return out;
10325
2.45M
    }
10326
20.5M
    buf1 = PyUnicode_DATA(self);
10327
20.5M
    buf2 = PyUnicode_DATA(substring);
10328
20.5M
    if (kind2 != kind1) {
10329
250k
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
10330
250k
        if (!buf2)
10331
0
            return NULL;
10332
250k
    }
10333
10334
20.5M
    switch (kind1) {
10335
20.3M
    case PyUnicode_1BYTE_KIND:
10336
20.3M
        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10337
19.1M
            out = asciilib_split(
10338
19.1M
                self,  buf1, len1, buf2, len2, maxcount);
10339
1.20M
        else
10340
1.20M
            out = ucs1lib_split(
10341
1.20M
                self,  buf1, len1, buf2, len2, maxcount);
10342
20.3M
        break;
10343
216k
    case PyUnicode_2BYTE_KIND:
10344
216k
        out = ucs2lib_split(
10345
216k
            self,  buf1, len1, buf2, len2, maxcount);
10346
216k
        break;
10347
34.8k
    case PyUnicode_4BYTE_KIND:
10348
34.8k
        out = ucs4lib_split(
10349
34.8k
            self,  buf1, len1, buf2, len2, maxcount);
10350
34.8k
        break;
10351
0
    default:
10352
0
        out = NULL;
10353
20.5M
    }
10354
20.5M
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10355
20.5M
    if (kind2 != kind1)
10356
250k
        PyMem_Free((void *)buf2);
10357
20.5M
    return out;
10358
20.5M
}
10359
10360
static PyObject *
10361
rsplit(PyObject *self,
10362
       PyObject *substring,
10363
       Py_ssize_t maxcount)
10364
50
{
10365
50
    int kind1, kind2;
10366
50
    const void *buf1, *buf2;
10367
50
    Py_ssize_t len1, len2;
10368
50
    PyObject* out;
10369
10370
50
    len1 = PyUnicode_GET_LENGTH(self);
10371
50
    kind1 = PyUnicode_KIND(self);
10372
10373
50
    if (substring == NULL) {
10374
0
        if (maxcount < 0) {
10375
0
            maxcount = (len1 - 1) / 2 + 1;
10376
0
        }
10377
0
        switch (kind1) {
10378
0
        case PyUnicode_1BYTE_KIND:
10379
0
            if (PyUnicode_IS_ASCII(self))
10380
0
                return asciilib_rsplit_whitespace(
10381
0
                    self,  PyUnicode_1BYTE_DATA(self),
10382
0
                    len1, maxcount
10383
0
                    );
10384
0
            else
10385
0
                return ucs1lib_rsplit_whitespace(
10386
0
                    self,  PyUnicode_1BYTE_DATA(self),
10387
0
                    len1, maxcount
10388
0
                    );
10389
0
        case PyUnicode_2BYTE_KIND:
10390
0
            return ucs2lib_rsplit_whitespace(
10391
0
                self,  PyUnicode_2BYTE_DATA(self),
10392
0
                len1, maxcount
10393
0
                );
10394
0
        case PyUnicode_4BYTE_KIND:
10395
0
            return ucs4lib_rsplit_whitespace(
10396
0
                self,  PyUnicode_4BYTE_DATA(self),
10397
0
                len1, maxcount
10398
0
                );
10399
0
        default:
10400
0
            Py_UNREACHABLE();
10401
0
        }
10402
0
    }
10403
50
    kind2 = PyUnicode_KIND(substring);
10404
50
    len2 = PyUnicode_GET_LENGTH(substring);
10405
50
    if (maxcount < 0) {
10406
        // if len2 == 0, it will raise ValueError.
10407
0
        maxcount = len2 == 0 ? 0 : (len1 / len2) + 1;
10408
        // handle expected overflow case: (Py_SSIZE_T_MAX / 1) + 1
10409
0
        maxcount = maxcount < 0 ? len1 : maxcount;
10410
0
    }
10411
50
    if (kind1 < kind2 || len1 < len2) {
10412
0
        out = PyList_New(1);
10413
0
        if (out == NULL)
10414
0
            return NULL;
10415
0
        PyList_SET_ITEM(out, 0, Py_NewRef(self));
10416
0
        return out;
10417
0
    }
10418
50
    buf1 = PyUnicode_DATA(self);
10419
50
    buf2 = PyUnicode_DATA(substring);
10420
50
    if (kind2 != kind1) {
10421
0
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
10422
0
        if (!buf2)
10423
0
            return NULL;
10424
0
    }
10425
10426
50
    switch (kind1) {
10427
50
    case PyUnicode_1BYTE_KIND:
10428
50
        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10429
50
            out = asciilib_rsplit(
10430
50
                self,  buf1, len1, buf2, len2, maxcount);
10431
0
        else
10432
0
            out = ucs1lib_rsplit(
10433
0
                self,  buf1, len1, buf2, len2, maxcount);
10434
50
        break;
10435
0
    case PyUnicode_2BYTE_KIND:
10436
0
        out = ucs2lib_rsplit(
10437
0
            self,  buf1, len1, buf2, len2, maxcount);
10438
0
        break;
10439
0
    case PyUnicode_4BYTE_KIND:
10440
0
        out = ucs4lib_rsplit(
10441
0
            self,  buf1, len1, buf2, len2, maxcount);
10442
0
        break;
10443
0
    default:
10444
0
        out = NULL;
10445
50
    }
10446
50
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10447
50
    if (kind2 != kind1)
10448
0
        PyMem_Free((void *)buf2);
10449
50
    return out;
10450
50
}
10451
10452
static Py_ssize_t
10453
anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10454
            PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
10455
171M
{
10456
171M
    switch (kind) {
10457
29.4M
    case PyUnicode_1BYTE_KIND:
10458
29.4M
        if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10459
25.3M
            return asciilib_find(buf1, len1, buf2, len2, offset);
10460
4.08M
        else
10461
4.08M
            return ucs1lib_find(buf1, len1, buf2, len2, offset);
10462
76.3M
    case PyUnicode_2BYTE_KIND:
10463
76.3M
        return ucs2lib_find(buf1, len1, buf2, len2, offset);
10464
65.6M
    case PyUnicode_4BYTE_KIND:
10465
65.6M
        return ucs4lib_find(buf1, len1, buf2, len2, offset);
10466
171M
    }
10467
171M
    Py_UNREACHABLE();
10468
171M
}
10469
10470
static Py_ssize_t
10471
anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10472
             PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
10473
41.9M
{
10474
41.9M
    switch (kind) {
10475
35.8M
    case PyUnicode_1BYTE_KIND:
10476
35.8M
        return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10477
5.97M
    case PyUnicode_2BYTE_KIND:
10478
5.97M
        return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10479
128k
    case PyUnicode_4BYTE_KIND:
10480
128k
        return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10481
41.9M
    }
10482
41.9M
    Py_UNREACHABLE();
10483
41.9M
}
10484
10485
static void
10486
replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10487
                      Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10488
1.89M
{
10489
1.89M
    int kind = PyUnicode_KIND(u);
10490
1.89M
    void *data = PyUnicode_DATA(u);
10491
1.89M
    Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10492
1.89M
    if (kind == PyUnicode_1BYTE_KIND) {
10493
805k
        ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10494
805k
                                      (Py_UCS1 *)data + len,
10495
805k
                                      u1, u2, maxcount);
10496
805k
    }
10497
1.08M
    else if (kind == PyUnicode_2BYTE_KIND) {
10498
1.07M
        ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10499
1.07M
                                      (Py_UCS2 *)data + len,
10500
1.07M
                                      u1, u2, maxcount);
10501
1.07M
    }
10502
17.1k
    else {
10503
17.1k
        assert(kind == PyUnicode_4BYTE_KIND);
10504
17.1k
        ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10505
17.1k
                                      (Py_UCS4 *)data + len,
10506
17.1k
                                      u1, u2, maxcount);
10507
17.1k
    }
10508
1.89M
}
10509
10510
static PyObject *
10511
replace(PyObject *self, PyObject *str1,
10512
        PyObject *str2, Py_ssize_t maxcount)
10513
76.6M
{
10514
76.6M
    PyObject *u;
10515
76.6M
    const char *sbuf = PyUnicode_DATA(self);
10516
76.6M
    const void *buf1 = PyUnicode_DATA(str1);
10517
76.6M
    const void *buf2 = PyUnicode_DATA(str2);
10518
76.6M
    int srelease = 0, release1 = 0, release2 = 0;
10519
76.6M
    int skind = PyUnicode_KIND(self);
10520
76.6M
    int kind1 = PyUnicode_KIND(str1);
10521
76.6M
    int kind2 = PyUnicode_KIND(str2);
10522
76.6M
    Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10523
76.6M
    Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10524
76.6M
    Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
10525
76.6M
    int mayshrink;
10526
76.6M
    Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
10527
10528
76.6M
    if (slen < len1)
10529
27.5M
        goto nothing;
10530
10531
49.1M
    if (maxcount < 0)
10532
49.1M
        maxcount = PY_SSIZE_T_MAX;
10533
0
    else if (maxcount == 0)
10534
0
        goto nothing;
10535
10536
49.1M
    if (str1 == str2)
10537
3.53k
        goto nothing;
10538
10539
49.1M
    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10540
49.1M
    maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10541
49.1M
    if (maxchar < maxchar_str1)
10542
        /* substring too wide to be present */
10543
0
        goto nothing;
10544
49.1M
    maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10545
    /* Replacing str1 with str2 may cause a maxchar reduction in the
10546
       result string. */
10547
49.1M
    mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
10548
49.1M
    maxchar = Py_MAX(maxchar, maxchar_str2);
10549
10550
49.1M
    if (len1 == len2) {
10551
        /* same length */
10552
7.25M
        if (len1 == 0)
10553
0
            goto nothing;
10554
7.25M
        if (len1 == 1) {
10555
            /* replace characters */
10556
7.25M
            Py_UCS4 u1, u2;
10557
7.25M
            Py_ssize_t pos;
10558
10559
7.25M
            u1 = PyUnicode_READ(kind1, buf1, 0);
10560
7.25M
            pos = findchar(sbuf, skind, slen, u1, 1);
10561
7.25M
            if (pos < 0)
10562
5.35M
                goto nothing;
10563
1.89M
            u2 = PyUnicode_READ(kind2, buf2, 0);
10564
1.89M
            u = PyUnicode_New(slen, maxchar);
10565
1.89M
            if (!u)
10566
0
                goto error;
10567
10568
1.89M
            _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10569
1.89M
            replace_1char_inplace(u, pos, u1, u2, maxcount);
10570
1.89M
        }
10571
0
        else {
10572
0
            int rkind = skind;
10573
0
            char *res;
10574
0
            Py_ssize_t i;
10575
10576
0
            if (kind1 < rkind) {
10577
                /* widen substring */
10578
0
                buf1 = unicode_askind(kind1, buf1, len1, rkind);
10579
0
                if (!buf1) goto error;
10580
0
                release1 = 1;
10581
0
            }
10582
0
            i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
10583
0
            if (i < 0)
10584
0
                goto nothing;
10585
0
            if (rkind > kind2) {
10586
                /* widen replacement */
10587
0
                buf2 = unicode_askind(kind2, buf2, len2, rkind);
10588
0
                if (!buf2) goto error;
10589
0
                release2 = 1;
10590
0
            }
10591
0
            else if (rkind < kind2) {
10592
                /* widen self and buf1 */
10593
0
                rkind = kind2;
10594
0
                if (release1) {
10595
0
                    assert(buf1 != PyUnicode_DATA(str1));
10596
0
                    PyMem_Free((void *)buf1);
10597
0
                    buf1 = PyUnicode_DATA(str1);
10598
0
                    release1 = 0;
10599
0
                }
10600
0
                sbuf = unicode_askind(skind, sbuf, slen, rkind);
10601
0
                if (!sbuf) goto error;
10602
0
                srelease = 1;
10603
0
                buf1 = unicode_askind(kind1, buf1, len1, rkind);
10604
0
                if (!buf1) goto error;
10605
0
                release1 = 1;
10606
0
            }
10607
0
            u = PyUnicode_New(slen, maxchar);
10608
0
            if (!u)
10609
0
                goto error;
10610
0
            assert(PyUnicode_KIND(u) == rkind);
10611
0
            res = PyUnicode_DATA(u);
10612
10613
0
            memcpy(res, sbuf, rkind * slen);
10614
            /* change everything in-place, starting with this one */
10615
0
            memcpy(res + rkind * i,
10616
0
                   buf2,
10617
0
                   rkind * len2);
10618
0
            i += len1;
10619
10620
0
            while ( --maxcount > 0) {
10621
0
                i = anylib_find(rkind, self,
10622
0
                                sbuf+rkind*i, slen-i,
10623
0
                                str1, buf1, len1, i);
10624
0
                if (i == -1)
10625
0
                    break;
10626
0
                memcpy(res + rkind * i,
10627
0
                       buf2,
10628
0
                       rkind * len2);
10629
0
                i += len1;
10630
0
            }
10631
0
        }
10632
7.25M
    }
10633
41.9M
    else {
10634
41.9M
        Py_ssize_t n, i, j, ires;
10635
41.9M
        Py_ssize_t new_size;
10636
41.9M
        int rkind = skind;
10637
41.9M
        char *res;
10638
10639
41.9M
        if (kind1 < rkind) {
10640
            /* widen substring */
10641
6.09M
            buf1 = unicode_askind(kind1, buf1, len1, rkind);
10642
6.09M
            if (!buf1) goto error;
10643
6.09M
            release1 = 1;
10644
6.09M
        }
10645
41.9M
        n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
10646
41.9M
        if (n == 0)
10647
36.8M
            goto nothing;
10648
5.06M
        if (kind2 < rkind) {
10649
            /* widen replacement */
10650
1.26M
            buf2 = unicode_askind(kind2, buf2, len2, rkind);
10651
1.26M
            if (!buf2) goto error;
10652
1.26M
            release2 = 1;
10653
1.26M
        }
10654
3.80M
        else if (kind2 > rkind) {
10655
            /* widen self and buf1 */
10656
0
            rkind = kind2;
10657
0
            sbuf = unicode_askind(skind, sbuf, slen, rkind);
10658
0
            if (!sbuf) goto error;
10659
0
            srelease = 1;
10660
0
            if (release1) {
10661
0
                assert(buf1 != PyUnicode_DATA(str1));
10662
0
                PyMem_Free((void *)buf1);
10663
0
                buf1 = PyUnicode_DATA(str1);
10664
0
                release1 = 0;
10665
0
            }
10666
0
            buf1 = unicode_askind(kind1, buf1, len1, rkind);
10667
0
            if (!buf1) goto error;
10668
0
            release1 = 1;
10669
0
        }
10670
        /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10671
           PyUnicode_GET_LENGTH(str1)); */
10672
5.06M
        if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
10673
0
                PyErr_SetString(PyExc_OverflowError,
10674
0
                                "replace string is too long");
10675
0
                goto error;
10676
0
        }
10677
5.06M
        new_size = slen + n * (len2 - len1);
10678
5.06M
        if (new_size == 0) {
10679
0
            u = _PyUnicode_GetEmpty();
10680
0
            goto done;
10681
0
        }
10682
5.06M
        if (new_size > (PY_SSIZE_T_MAX / rkind)) {
10683
0
            PyErr_SetString(PyExc_OverflowError,
10684
0
                            "replace string is too long");
10685
0
            goto error;
10686
0
        }
10687
5.06M
        u = PyUnicode_New(new_size, maxchar);
10688
5.06M
        if (!u)
10689
0
            goto error;
10690
5.06M
        assert(PyUnicode_KIND(u) == rkind);
10691
5.06M
        res = PyUnicode_DATA(u);
10692
5.06M
        ires = i = 0;
10693
5.06M
        if (len1 > 0) {
10694
176M
            while (n-- > 0) {
10695
                /* look for next match */
10696
171M
                j = anylib_find(rkind, self,
10697
171M
                                sbuf + rkind * i, slen-i,
10698
171M
                                str1, buf1, len1, i);
10699
171M
                if (j == -1)
10700
0
                    break;
10701
171M
                else if (j > i) {
10702
                    /* copy unchanged part [i:j] */
10703
23.8M
                    memcpy(res + rkind * ires,
10704
23.8M
                           sbuf + rkind * i,
10705
23.8M
                           rkind * (j-i));
10706
23.8M
                    ires += j - i;
10707
23.8M
                }
10708
                /* copy substitution string */
10709
171M
                if (len2 > 0) {
10710
171M
                    memcpy(res + rkind * ires,
10711
171M
                           buf2,
10712
171M
                           rkind * len2);
10713
171M
                    ires += len2;
10714
171M
                }
10715
171M
                i = j + len1;
10716
171M
            }
10717
5.06M
            if (i < slen)
10718
                /* copy tail [i:] */
10719
4.97M
                memcpy(res + rkind * ires,
10720
4.97M
                       sbuf + rkind * i,
10721
4.97M
                       rkind * (slen-i));
10722
5.06M
        }
10723
0
        else {
10724
            /* interleave */
10725
0
            while (n > 0) {
10726
0
                memcpy(res + rkind * ires,
10727
0
                       buf2,
10728
0
                       rkind * len2);
10729
0
                ires += len2;
10730
0
                if (--n <= 0)
10731
0
                    break;
10732
0
                memcpy(res + rkind * ires,
10733
0
                       sbuf + rkind * i,
10734
0
                       rkind);
10735
0
                ires++;
10736
0
                i++;
10737
0
            }
10738
0
            memcpy(res + rkind * ires,
10739
0
                   sbuf + rkind * i,
10740
0
                   rkind * (slen-i));
10741
0
        }
10742
5.06M
    }
10743
10744
6.96M
    if (mayshrink) {
10745
0
        unicode_adjust_maxchar(&u);
10746
0
        if (u == NULL)
10747
0
            goto error;
10748
0
    }
10749
10750
6.96M
  done:
10751
6.96M
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
10752
6.96M
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10753
6.96M
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10754
6.96M
    if (srelease)
10755
0
        PyMem_Free((void *)sbuf);
10756
6.96M
    if (release1)
10757
1.26M
        PyMem_Free((void *)buf1);
10758
6.96M
    if (release2)
10759
1.26M
        PyMem_Free((void *)buf2);
10760
6.96M
    assert(_PyUnicode_CheckConsistency(u, 1));
10761
6.96M
    return u;
10762
10763
69.7M
  nothing:
10764
    /* nothing to replace; return original string (when possible) */
10765
69.7M
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
10766
69.7M
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10767
69.7M
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10768
69.7M
    if (srelease)
10769
0
        PyMem_Free((void *)sbuf);
10770
69.7M
    if (release1)
10771
4.83M
        PyMem_Free((void *)buf1);
10772
69.7M
    if (release2)
10773
0
        PyMem_Free((void *)buf2);
10774
69.7M
    return unicode_result_unchanged(self);
10775
10776
0
  error:
10777
0
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
10778
0
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10779
0
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10780
0
    if (srelease)
10781
0
        PyMem_Free((void *)sbuf);
10782
0
    if (release1)
10783
0
        PyMem_Free((void *)buf1);
10784
0
    if (release2)
10785
0
        PyMem_Free((void *)buf2);
10786
0
    return NULL;
10787
6.96M
}
10788
10789
/* --- Unicode Object Methods --------------------------------------------- */
10790
10791
/*[clinic input]
10792
@permit_long_docstring_body
10793
str.title as unicode_title
10794
10795
Return a version of the string where each word is titlecased.
10796
10797
More specifically, words start with uppercased characters and all remaining
10798
cased characters have lower case.
10799
[clinic start generated code]*/
10800
10801
static PyObject *
unicode_title_impl(PyObject *self)
/*[clinic end generated code: output=c75ae03809574902 input=533ce0eb6a7f5d1b]*/
{
    /* title-casing is delegated to case_operation() with the do_title
       callback */
    PyObject *titled = case_operation(self, do_title);
    return titled;
}
10807
10808
/*[clinic input]
10809
@permit_long_docstring_body
10810
str.capitalize as unicode_capitalize
10811
10812
Return a capitalized version of the string.
10813
10814
More specifically, make the first character have upper case and the rest lower
10815
case.
10816
[clinic start generated code]*/
10817
10818
static PyObject *
unicode_capitalize_impl(PyObject *self)
/*[clinic end generated code: output=e49a4c333cdb7667 input=a4a15ade41f6f9e9]*/
{
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);

    if (length != 0) {
        return case_operation(self, do_capitalize);
    }
    /* an empty string is returned as it is */
    return unicode_result_unchanged(self);
}
10826
10827
/*[clinic input]
10828
str.casefold as unicode_casefold
10829
10830
Return a version of the string suitable for caseless comparisons.
10831
[clinic start generated code]*/
10832
10833
static PyObject *
unicode_casefold_impl(PyObject *self)
/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
{
    if (!PyUnicode_IS_ASCII(self)) {
        /* general path through the do_casefold callback */
        return case_operation(self, do_casefold);
    }
    /* ASCII strings take the dedicated helper; the meaning of the flag value
       1 is defined by ascii_upper_or_lower() (presumably "lower" -- confirm
       there) */
    return ascii_upper_or_lower(self, 1);
}
10841
10842
10843
/* Argument converter. Accepts a single Unicode character. */
10844
10845
static int
10846
convert_uc(PyObject *obj, void *addr)
10847
0
{
10848
0
    Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
10849
10850
0
    if (!PyUnicode_Check(obj)) {
10851
0
        PyErr_Format(PyExc_TypeError,
10852
0
                     "The fill character must be a unicode character, "
10853
0
                     "not %.100s", Py_TYPE(obj)->tp_name);
10854
0
        return 0;
10855
0
    }
10856
0
    if (PyUnicode_GET_LENGTH(obj) != 1) {
10857
0
        PyErr_SetString(PyExc_TypeError,
10858
0
                        "The fill character must be exactly one character long");
10859
0
        return 0;
10860
0
    }
10861
0
    *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
10862
0
    return 1;
10863
0
}
10864
10865
/*[clinic input]
10866
str.center as unicode_center
10867
10868
    width: Py_ssize_t
10869
    fillchar: Py_UCS4 = ' '
10870
    /
10871
10872
Return a centered string of length width.
10873
10874
Padding is done using the specified fill character (default is a space).
10875
[clinic start generated code]*/
10876
10877
static PyObject *
10878
unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10879
/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
10880
0
{
10881
0
    Py_ssize_t marg, left;
10882
10883
0
    if (PyUnicode_GET_LENGTH(self) >= width)
10884
0
        return unicode_result_unchanged(self);
10885
10886
0
    marg = width - PyUnicode_GET_LENGTH(self);
10887
0
    left = marg / 2 + (marg & width & 1);
10888
10889
0
    return pad(self, left, marg - left, fillchar);
10890
0
}
10891
10892
/* This function assumes that str1 and str2 are readied by the caller. */
10893
10894
static int
10895
unicode_compare(PyObject *str1, PyObject *str2)
10896
15.0M
{
10897
15.0M
#define COMPARE(TYPE1, TYPE2) \
10898
15.0M
    do { \
10899
14.0M
        TYPE1* p1 = (TYPE1 *)data1; \
10900
14.0M
        TYPE2* p2 = (TYPE2 *)data2; \
10901
14.0M
        TYPE1* end = p1 + len; \
10902
14.0M
        Py_UCS4 c1, c2; \
10903
14.0M
        for (; p1 != end; p1++, p2++) { \
10904
14.0M
            c1 = *p1; \
10905
14.0M
            c2 = *p2; \
10906
14.0M
            if (c1 != c2) \
10907
14.0M
                return (c1 < c2) ? -1 : 1; \
10908
14.0M
        } \
10909
14.0M
    } \
10910
14.0M
    while (0)
10911
10912
15.0M
    int kind1, kind2;
10913
15.0M
    const void *data1, *data2;
10914
15.0M
    Py_ssize_t len1, len2, len;
10915
10916
15.0M
    kind1 = PyUnicode_KIND(str1);
10917
15.0M
    kind2 = PyUnicode_KIND(str2);
10918
15.0M
    data1 = PyUnicode_DATA(str1);
10919
15.0M
    data2 = PyUnicode_DATA(str2);
10920
15.0M
    len1 = PyUnicode_GET_LENGTH(str1);
10921
15.0M
    len2 = PyUnicode_GET_LENGTH(str2);
10922
15.0M
    len = Py_MIN(len1, len2);
10923
10924
15.0M
    switch(kind1) {
10925
1.41M
    case PyUnicode_1BYTE_KIND:
10926
1.41M
    {
10927
1.41M
        switch(kind2) {
10928
247k
        case PyUnicode_1BYTE_KIND:
10929
247k
        {
10930
247k
            int cmp = memcmp(data1, data2, len);
10931
            /* normalize result of memcmp() into the range [-1; 1] */
10932
247k
            if (cmp < 0)
10933
219k
                return -1;
10934
28.5k
            if (cmp > 0)
10935
22.3k
                return 1;
10936
6.23k
            break;
10937
28.5k
        }
10938
992k
        case PyUnicode_2BYTE_KIND:
10939
992k
            COMPARE(Py_UCS1, Py_UCS2);
10940
0
            break;
10941
175k
        case PyUnicode_4BYTE_KIND:
10942
175k
            COMPARE(Py_UCS1, Py_UCS4);
10943
0
            break;
10944
0
        default:
10945
0
            Py_UNREACHABLE();
10946
1.41M
        }
10947
6.23k
        break;
10948
1.41M
    }
10949
12.3M
    case PyUnicode_2BYTE_KIND:
10950
12.3M
    {
10951
12.3M
        switch(kind2) {
10952
3.62k
        case PyUnicode_1BYTE_KIND:
10953
3.62k
            COMPARE(Py_UCS2, Py_UCS1);
10954
0
            break;
10955
11.0M
        case PyUnicode_2BYTE_KIND:
10956
11.0M
        {
10957
11.0M
            COMPARE(Py_UCS2, Py_UCS2);
10958
0
            break;
10959
11.0M
        }
10960
1.32M
        case PyUnicode_4BYTE_KIND:
10961
1.32M
            COMPARE(Py_UCS2, Py_UCS4);
10962
0
            break;
10963
0
        default:
10964
0
            Py_UNREACHABLE();
10965
12.3M
        }
10966
0
        break;
10967
12.3M
    }
10968
1.24M
    case PyUnicode_4BYTE_KIND:
10969
1.24M
    {
10970
1.24M
        switch(kind2) {
10971
730
        case PyUnicode_1BYTE_KIND:
10972
730
            COMPARE(Py_UCS4, Py_UCS1);
10973
0
            break;
10974
479k
        case PyUnicode_2BYTE_KIND:
10975
479k
            COMPARE(Py_UCS4, Py_UCS2);
10976
0
            break;
10977
766k
        case PyUnicode_4BYTE_KIND:
10978
766k
        {
10979
766k
#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10980
766k
            int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10981
            /* normalize result of wmemcmp() into the range [-1; 1] */
10982
766k
            if (cmp < 0)
10983
374k
                return -1;
10984
392k
            if (cmp > 0)
10985
392k
                return 1;
10986
#else
10987
            COMPARE(Py_UCS4, Py_UCS4);
10988
#endif
10989
0
            break;
10990
392k
        }
10991
0
        default:
10992
0
            Py_UNREACHABLE();
10993
1.24M
        }
10994
0
        break;
10995
1.24M
    }
10996
0
    default:
10997
0
        Py_UNREACHABLE();
10998
15.0M
    }
10999
11000
6.23k
    if (len1 == len2)
11001
6.22k
        return 0;
11002
11
    if (len1 < len2)
11003
11
        return -1;
11004
0
    else
11005
0
        return 1;
11006
11007
11
#undef COMPARE
11008
11
}
11009
11010
11011
int
11012
_PyUnicode_Equal(PyObject *str1, PyObject *str2)
11013
614M
{
11014
614M
    assert(PyUnicode_Check(str1));
11015
614M
    assert(PyUnicode_Check(str2));
11016
614M
    if (str1 == str2) {
11017
98.3M
        return 1;
11018
98.3M
    }
11019
516M
    return unicode_eq(str1, str2);
11020
614M
}
11021
11022
11023
int
11024
PyUnicode_Equal(PyObject *str1, PyObject *str2)
11025
0
{
11026
0
    if (!PyUnicode_Check(str1)) {
11027
0
        PyErr_Format(PyExc_TypeError,
11028
0
                     "first argument must be str, not %T", str1);
11029
0
        return -1;
11030
0
    }
11031
0
    if (!PyUnicode_Check(str2)) {
11032
0
        PyErr_Format(PyExc_TypeError,
11033
0
                     "second argument must be str, not %T", str2);
11034
0
        return -1;
11035
0
    }
11036
11037
0
    return _PyUnicode_Equal(str1, str2);
11038
0
}
11039
11040
11041
int
11042
PyUnicode_Compare(PyObject *left, PyObject *right)
11043
181k
{
11044
181k
    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11045
        /* a string is equal to itself */
11046
181k
        if (left == right)
11047
0
            return 0;
11048
11049
181k
        return unicode_compare(left, right);
11050
181k
    }
11051
0
    PyErr_Format(PyExc_TypeError,
11052
0
                 "Can't compare %.100s and %.100s",
11053
0
                 Py_TYPE(left)->tp_name,
11054
0
                 Py_TYPE(right)->tp_name);
11055
0
    return -1;
11056
181k
}
11057
11058
int
11059
PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11060
3.97M
{
11061
3.97M
    Py_ssize_t i;
11062
3.97M
    int kind;
11063
3.97M
    Py_UCS4 chr;
11064
11065
3.97M
    assert(_PyUnicode_CHECK(uni));
11066
3.97M
    kind = PyUnicode_KIND(uni);
11067
3.97M
    if (kind == PyUnicode_1BYTE_KIND) {
11068
3.97M
        const void *data = PyUnicode_1BYTE_DATA(uni);
11069
3.97M
        size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
11070
3.97M
        size_t len, len2 = strlen(str);
11071
3.97M
        int cmp;
11072
11073
3.97M
        len = Py_MIN(len1, len2);
11074
3.97M
        cmp = memcmp(data, str, len);
11075
3.97M
        if (cmp != 0) {
11076
3.50M
            if (cmp < 0)
11077
8.46k
                return -1;
11078
3.49M
            else
11079
3.49M
                return 1;
11080
3.50M
        }
11081
468k
        if (len1 > len2)
11082
70
            return 1; /* uni is longer */
11083
468k
        if (len1 < len2)
11084
742
            return -1; /* str is longer */
11085
468k
        return 0;
11086
468k
    }
11087
1.43k
    else {
11088
1.43k
        const void *data = PyUnicode_DATA(uni);
11089
        /* Compare Unicode string and source character set string */
11090
2.67k
        for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
11091
2.43k
            if (chr != (unsigned char)str[i])
11092
1.19k
                return (chr < (unsigned char)(str[i])) ? -1 : 1;
11093
        /* This check keeps Python strings that end in '\0' from comparing equal
11094
         to C strings identical up to that point. */
11095
240
        if (PyUnicode_GET_LENGTH(uni) != i || chr)
11096
240
            return 1; /* uni is longer */
11097
0
        if (str[i])
11098
0
            return -1; /* str is longer */
11099
0
        return 0;
11100
0
    }
11101
3.97M
}
11102
11103
int
11104
PyUnicode_EqualToUTF8(PyObject *unicode, const char *str)
11105
30
{
11106
30
    return PyUnicode_EqualToUTF8AndSize(unicode, str, strlen(str));
11107
30
}
11108
11109
int
11110
PyUnicode_EqualToUTF8AndSize(PyObject *unicode, const char *str, Py_ssize_t size)
11111
30
{
11112
30
    assert(_PyUnicode_CHECK(unicode));
11113
30
    assert(str);
11114
11115
30
    if (PyUnicode_IS_ASCII(unicode)) {
11116
30
        Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
11117
30
        return size == len &&
11118
0
            memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11119
30
    }
11120
0
    if (PyUnicode_UTF8(unicode) != NULL) {
11121
0
        Py_ssize_t len = PyUnicode_UTF8_LENGTH(unicode);
11122
0
        return size == len &&
11123
0
            memcmp(PyUnicode_UTF8(unicode), str, len) == 0;
11124
0
    }
11125
11126
0
    Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
11127
0
    if ((size_t)len >= (size_t)size || (size_t)len < (size_t)size / 4) {
11128
0
        return 0;
11129
0
    }
11130
0
    const unsigned char *s = (const unsigned char *)str;
11131
0
    const unsigned char *ends = s + (size_t)size;
11132
0
    int kind = PyUnicode_KIND(unicode);
11133
0
    const void *data = PyUnicode_DATA(unicode);
11134
    /* Compare Unicode string and UTF-8 string */
11135
0
    for (Py_ssize_t i = 0; i < len; i++) {
11136
0
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11137
0
        if (ch < 0x80) {
11138
0
            if (ends == s || s[0] != ch) {
11139
0
                return 0;
11140
0
            }
11141
0
            s += 1;
11142
0
        }
11143
0
        else if (ch < 0x800) {
11144
0
            if ((ends - s) < 2 ||
11145
0
                s[0] != (0xc0 | (ch >> 6)) ||
11146
0
                s[1] != (0x80 | (ch & 0x3f)))
11147
0
            {
11148
0
                return 0;
11149
0
            }
11150
0
            s += 2;
11151
0
        }
11152
0
        else if (ch < 0x10000) {
11153
0
            if (Py_UNICODE_IS_SURROGATE(ch) ||
11154
0
                (ends - s) < 3 ||
11155
0
                s[0] != (0xe0 | (ch >> 12)) ||
11156
0
                s[1] != (0x80 | ((ch >> 6) & 0x3f)) ||
11157
0
                s[2] != (0x80 | (ch & 0x3f)))
11158
0
            {
11159
0
                return 0;
11160
0
            }
11161
0
            s += 3;
11162
0
        }
11163
0
        else {
11164
0
            assert(ch <= MAX_UNICODE);
11165
0
            if ((ends - s) < 4 ||
11166
0
                s[0] != (0xf0 | (ch >> 18)) ||
11167
0
                s[1] != (0x80 | ((ch >> 12) & 0x3f)) ||
11168
0
                s[2] != (0x80 | ((ch >> 6) & 0x3f)) ||
11169
0
                s[3] != (0x80 | (ch & 0x3f)))
11170
0
            {
11171
0
                return 0;
11172
0
            }
11173
0
            s += 4;
11174
0
        }
11175
0
    }
11176
0
    return s == ends;
11177
0
}
11178
11179
int
11180
_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11181
8.95M
{
11182
8.95M
    size_t len;
11183
8.95M
    assert(_PyUnicode_CHECK(unicode));
11184
8.95M
    assert(str);
11185
#ifndef NDEBUG
11186
    for (const char *p = str; *p; p++) {
11187
        assert((unsigned char)*p < 128);
11188
    }
11189
#endif
11190
8.95M
    if (!PyUnicode_IS_ASCII(unicode))
11191
149k
        return 0;
11192
8.80M
    len = (size_t)PyUnicode_GET_LENGTH(unicode);
11193
8.80M
    return strlen(str) == len &&
11194
771k
           memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11195
8.95M
}
11196
11197
int
11198
_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11199
0
{
11200
0
    PyObject *right_uni;
11201
11202
0
    assert(_PyUnicode_CHECK(left));
11203
0
    assert(right->string);
11204
#ifndef NDEBUG
11205
    for (const char *p = right->string; *p; p++) {
11206
        assert((unsigned char)*p < 128);
11207
    }
11208
#endif
11209
11210
0
    if (!PyUnicode_IS_ASCII(left))
11211
0
        return 0;
11212
11213
0
    right_uni = _PyUnicode_FromId(right);       /* borrowed */
11214
0
    if (right_uni == NULL) {
11215
        /* memory error or bad data */
11216
0
        PyErr_Clear();
11217
0
        return _PyUnicode_EqualToASCIIString(left, right->string);
11218
0
    }
11219
11220
0
    if (left == right_uni)
11221
0
        return 1;
11222
11223
0
    assert(PyUnicode_CHECK_INTERNED(right_uni));
11224
0
    if (PyUnicode_CHECK_INTERNED(left)) {
11225
0
        return 0;
11226
0
    }
11227
11228
0
    Py_hash_t right_hash = PyUnicode_HASH(right_uni);
11229
0
    assert(right_hash != -1);
11230
0
    Py_hash_t hash = PyUnicode_HASH(left);
11231
0
    if (hash != -1 && hash != right_hash) {
11232
0
        return 0;
11233
0
    }
11234
11235
0
    return unicode_eq(left, right_uni);
11236
0
}
11237
11238
PyObject *
11239
PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
11240
47.5M
{
11241
47.5M
    int result;
11242
11243
47.5M
    if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11244
95.1k
        Py_RETURN_NOTIMPLEMENTED;
11245
11246
47.4M
    if (left == right) {
11247
2.59k
        switch (op) {
11248
2.44k
        case Py_EQ:
11249
2.44k
        case Py_LE:
11250
2.44k
        case Py_GE:
11251
            /* a string is equal to itself */
11252
2.44k
            Py_RETURN_TRUE;
11253
157
        case Py_NE:
11254
157
        case Py_LT:
11255
157
        case Py_GT:
11256
157
            Py_RETURN_FALSE;
11257
0
        default:
11258
0
            PyErr_BadArgument();
11259
0
            return NULL;
11260
2.59k
        }
11261
2.59k
    }
11262
47.4M
    else if (op == Py_EQ || op == Py_NE) {
11263
32.6M
        result = unicode_eq(left, right);
11264
32.6M
        result ^= (op == Py_NE);
11265
32.6M
        return PyBool_FromLong(result);
11266
32.6M
    }
11267
14.8M
    else {
11268
14.8M
        result = unicode_compare(left, right);
11269
14.8M
        Py_RETURN_RICHCOMPARE(result, 0, op);
11270
14.8M
    }
11271
47.4M
}
11272
11273
int
11274
PyUnicode_Contains(PyObject *str, PyObject *substr)
11275
239M
{
11276
239M
    int kind1, kind2;
11277
239M
    const void *buf1, *buf2;
11278
239M
    Py_ssize_t len1, len2;
11279
239M
    int result;
11280
11281
239M
    if (!PyUnicode_Check(substr)) {
11282
0
        PyErr_Format(PyExc_TypeError,
11283
0
                     "'in <string>' requires string as left operand, not %.100s",
11284
0
                     Py_TYPE(substr)->tp_name);
11285
0
        return -1;
11286
0
    }
11287
239M
    if (ensure_unicode(str) < 0)
11288
0
        return -1;
11289
11290
239M
    kind1 = PyUnicode_KIND(str);
11291
239M
    kind2 = PyUnicode_KIND(substr);
11292
239M
    if (kind1 < kind2)
11293
15.8M
        return 0;
11294
223M
    len1 = PyUnicode_GET_LENGTH(str);
11295
223M
    len2 = PyUnicode_GET_LENGTH(substr);
11296
223M
    if (len1 < len2)
11297
596k
        return 0;
11298
223M
    buf1 = PyUnicode_DATA(str);
11299
223M
    buf2 = PyUnicode_DATA(substr);
11300
223M
    if (len2 == 1) {
11301
213M
        Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11302
213M
        result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
11303
213M
        return result;
11304
213M
    }
11305
9.65M
    if (kind2 != kind1) {
11306
17.1k
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
11307
17.1k
        if (!buf2)
11308
0
            return -1;
11309
17.1k
    }
11310
11311
9.65M
    switch (kind1) {
11312
9.63M
    case PyUnicode_1BYTE_KIND:
11313
9.63M
        result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11314
9.63M
        break;
11315
13.2k
    case PyUnicode_2BYTE_KIND:
11316
13.2k
        result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11317
13.2k
        break;
11318
3.91k
    case PyUnicode_4BYTE_KIND:
11319
3.91k
        result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11320
3.91k
        break;
11321
0
    default:
11322
0
        Py_UNREACHABLE();
11323
9.65M
    }
11324
11325
9.65M
    assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
11326
9.65M
    if (kind2 != kind1)
11327
17.1k
        PyMem_Free((void *)buf2);
11328
11329
9.65M
    return result;
11330
9.65M
}
11331
11332
/* Concat to string or Unicode object giving a new Unicode object. */
11333
11334
PyObject *
11335
PyUnicode_Concat(PyObject *left, PyObject *right)
11336
46.7M
{
11337
46.7M
    PyObject *result;
11338
46.7M
    Py_UCS4 maxchar, maxchar2;
11339
46.7M
    Py_ssize_t left_len, right_len, new_len;
11340
11341
46.7M
    if (ensure_unicode(left) < 0)
11342
0
        return NULL;
11343
11344
46.7M
    if (!PyUnicode_Check(right)) {
11345
0
        PyErr_Format(PyExc_TypeError,
11346
0
            "can only concatenate str (not \"%.200s\") to str",
11347
0
            Py_TYPE(right)->tp_name);
11348
0
        return NULL;
11349
0
    }
11350
11351
    /* Shortcuts */
11352
46.7M
    PyObject *empty = _PyUnicode_GetEmpty();  // Borrowed reference
11353
46.7M
    if (left == empty) {
11354
73.9k
        return PyUnicode_FromObject(right);
11355
73.9k
    }
11356
46.6M
    if (right == empty) {
11357
4.12M
        return PyUnicode_FromObject(left);
11358
4.12M
    }
11359
11360
42.5M
    left_len = PyUnicode_GET_LENGTH(left);
11361
42.5M
    right_len = PyUnicode_GET_LENGTH(right);
11362
42.5M
    if (left_len > PY_SSIZE_T_MAX - right_len) {
11363
0
        PyErr_SetString(PyExc_OverflowError,
11364
0
                        "strings are too large to concat");
11365
0
        return NULL;
11366
0
    }
11367
42.5M
    new_len = left_len + right_len;
11368
11369
42.5M
    maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11370
42.5M
    maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11371
42.5M
    maxchar = Py_MAX(maxchar, maxchar2);
11372
11373
    /* Concat the two Unicode strings */
11374
42.5M
    result = PyUnicode_New(new_len, maxchar);
11375
42.5M
    if (result == NULL)
11376
0
        return NULL;
11377
42.5M
    _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11378
42.5M
    _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11379
42.5M
    assert(_PyUnicode_CheckConsistency(result, 1));
11380
42.5M
    return result;
11381
42.5M
}
11382
11383
void
11384
PyUnicode_Append(PyObject **p_left, PyObject *right)
11385
3.70M
{
11386
3.70M
    PyObject *left, *res;
11387
3.70M
    Py_UCS4 maxchar, maxchar2;
11388
3.70M
    Py_ssize_t left_len, right_len, new_len;
11389
11390
3.70M
    if (p_left == NULL) {
11391
0
        if (!PyErr_Occurred())
11392
0
            PyErr_BadInternalCall();
11393
0
        return;
11394
0
    }
11395
3.70M
    left = *p_left;
11396
3.70M
    if (right == NULL || left == NULL
11397
3.70M
        || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
11398
0
        if (!PyErr_Occurred())
11399
0
            PyErr_BadInternalCall();
11400
0
        goto error;
11401
0
    }
11402
11403
    /* Shortcuts */
11404
3.70M
    PyObject *empty = _PyUnicode_GetEmpty();  // Borrowed reference
11405
3.70M
    if (left == empty) {
11406
474k
        Py_DECREF(left);
11407
474k
        *p_left = Py_NewRef(right);
11408
474k
        return;
11409
474k
    }
11410
3.22M
    if (right == empty) {
11411
11.8k
        return;
11412
11.8k
    }
11413
11414
3.21M
    left_len = PyUnicode_GET_LENGTH(left);
11415
3.21M
    right_len = PyUnicode_GET_LENGTH(right);
11416
3.21M
    if (left_len > PY_SSIZE_T_MAX - right_len) {
11417
0
        PyErr_SetString(PyExc_OverflowError,
11418
0
                        "strings are too large to concat");
11419
0
        goto error;
11420
0
    }
11421
3.21M
    new_len = left_len + right_len;
11422
11423
3.21M
    if (_PyUnicode_IsModifiable(left)
11424
3.21M
        && PyUnicode_CheckExact(right)
11425
3.21M
        && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
11426
        /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11427
           to change the structure size, but characters are stored just after
11428
           the structure, and so it requires to move all characters which is
11429
           not so different than duplicating the string. */
11430
1.72M
        && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11431
1.72M
    {
11432
        /* append inplace */
11433
1.72M
        if (unicode_resize(p_left, new_len) != 0)
11434
0
            goto error;
11435
11436
        /* copy 'right' into the newly allocated area of 'left' */
11437
1.72M
        _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
11438
1.72M
    }
11439
1.49M
    else {
11440
1.49M
        maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11441
1.49M
        maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11442
1.49M
        maxchar = Py_MAX(maxchar, maxchar2);
11443
11444
        /* Concat the two Unicode strings */
11445
1.49M
        res = PyUnicode_New(new_len, maxchar);
11446
1.49M
        if (res == NULL)
11447
0
            goto error;
11448
1.49M
        _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11449
1.49M
        _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
11450
1.49M
        Py_DECREF(left);
11451
1.49M
        *p_left = res;
11452
1.49M
    }
11453
3.21M
    assert(_PyUnicode_CheckConsistency(*p_left, 1));
11454
3.21M
    return;
11455
11456
0
error:
11457
0
    Py_CLEAR(*p_left);
11458
0
}
11459
11460
void
11461
PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11462
0
{
11463
0
    PyUnicode_Append(pleft, right);
11464
0
    Py_XDECREF(right);
11465
0
}
11466
11467
/*[clinic input]
11468
@permit_long_summary
11469
@text_signature "($self, sub[, start[, end]], /)"
11470
str.count as unicode_count -> Py_ssize_t
11471
11472
    self as str: self
11473
    sub as substr: unicode
11474
    start: slice_index(accept={int, NoneType}, c_default='0') = None
11475
    end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
11476
    /
11477
11478
Return the number of non-overlapping occurrences of substring sub in string S[start:end].
11479
11480
Optional arguments start and end are interpreted as in slice notation.
11481
[clinic start generated code]*/
11482
11483
static Py_ssize_t
11484
unicode_count_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
11485
                   Py_ssize_t end)
11486
/*[clinic end generated code: output=8fcc3aef0b18edbf input=8590716ee228b935]*/
11487
29.4M
{
11488
29.4M
    assert(PyUnicode_Check(str));
11489
29.4M
    assert(PyUnicode_Check(substr));
11490
11491
29.4M
    Py_ssize_t result;
11492
29.4M
    int kind1, kind2;
11493
29.4M
    const void *buf1 = NULL, *buf2 = NULL;
11494
29.4M
    Py_ssize_t len1, len2;
11495
11496
29.4M
    kind1 = PyUnicode_KIND(str);
11497
29.4M
    kind2 = PyUnicode_KIND(substr);
11498
29.4M
    if (kind1 < kind2)
11499
0
        return 0;
11500
11501
29.4M
    len1 = PyUnicode_GET_LENGTH(str);
11502
29.4M
    len2 = PyUnicode_GET_LENGTH(substr);
11503
29.4M
    ADJUST_INDICES(start, end, len1);
11504
29.4M
    if (end - start < len2)
11505
6.38M
        return 0;
11506
11507
23.0M
    buf1 = PyUnicode_DATA(str);
11508
23.0M
    buf2 = PyUnicode_DATA(substr);
11509
23.0M
    if (kind2 != kind1) {
11510
5.99M
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
11511
5.99M
        if (!buf2)
11512
0
            goto onError;
11513
5.99M
    }
11514
11515
    // We don't reuse `anylib_count` here because of the explicit casts.
11516
23.0M
    switch (kind1) {
11517
17.0M
    case PyUnicode_1BYTE_KIND:
11518
17.0M
        result = ucs1lib_count(
11519
17.0M
            ((const Py_UCS1*)buf1) + start, end - start,
11520
17.0M
            buf2, len2, PY_SSIZE_T_MAX
11521
17.0M
            );
11522
17.0M
        break;
11523
4.93M
    case PyUnicode_2BYTE_KIND:
11524
4.93M
        result = ucs2lib_count(
11525
4.93M
            ((const Py_UCS2*)buf1) + start, end - start,
11526
4.93M
            buf2, len2, PY_SSIZE_T_MAX
11527
4.93M
            );
11528
4.93M
        break;
11529
1.06M
    case PyUnicode_4BYTE_KIND:
11530
1.06M
        result = ucs4lib_count(
11531
1.06M
            ((const Py_UCS4*)buf1) + start, end - start,
11532
1.06M
            buf2, len2, PY_SSIZE_T_MAX
11533
1.06M
            );
11534
1.06M
        break;
11535
0
    default:
11536
0
        Py_UNREACHABLE();
11537
23.0M
    }
11538
11539
23.0M
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
11540
23.0M
    if (kind2 != kind1)
11541
5.99M
        PyMem_Free((void *)buf2);
11542
11543
23.0M
    return result;
11544
0
  onError:
11545
0
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
11546
0
    if (kind2 != kind1)
11547
0
        PyMem_Free((void *)buf2);
11548
0
    return -1;
11549
23.0M
}
11550
11551
/*[clinic input]
11552
str.encode as unicode_encode
11553
11554
    encoding: str(c_default="NULL") = 'utf-8'
11555
        The encoding in which to encode the string.
11556
    errors: str(c_default="NULL") = 'strict'
11557
        The error handling scheme to use for encoding errors.
11558
        The default is 'strict' meaning that encoding errors raise a
11559
        UnicodeEncodeError.  Other possible values are 'ignore', 'replace' and
11560
        'xmlcharrefreplace' as well as any other name registered with
11561
        codecs.register_error that can handle UnicodeEncodeErrors.
11562
11563
Encode the string using the codec registered for encoding.
11564
[clinic start generated code]*/
11565
11566
static PyObject *
11567
unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
11568
/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
11569
17.7M
{
11570
17.7M
    return PyUnicode_AsEncodedString(self, encoding, errors);
11571
17.7M
}
11572
11573
/*[clinic input]
11574
str.expandtabs as unicode_expandtabs
11575
11576
    tabsize: int = 8
11577
11578
Return a copy where all tab characters are expanded using spaces.
11579
11580
If tabsize is not given, a tab size of 8 characters is assumed.
11581
[clinic start generated code]*/
11582
11583
static PyObject *
11584
unicode_expandtabs_impl(PyObject *self, int tabsize)
11585
/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
11586
7.21M
{
11587
7.21M
    Py_ssize_t i, j, line_pos, src_len, incr;
11588
7.21M
    Py_UCS4 ch;
11589
7.21M
    PyObject *u;
11590
7.21M
    const void *src_data;
11591
7.21M
    void *dest_data;
11592
7.21M
    int kind;
11593
7.21M
    int found;
11594
11595
    /* First pass: determine size of output string */
11596
7.21M
    src_len = PyUnicode_GET_LENGTH(self);
11597
7.21M
    i = j = line_pos = 0;
11598
7.21M
    kind = PyUnicode_KIND(self);
11599
7.21M
    src_data = PyUnicode_DATA(self);
11600
7.21M
    found = 0;
11601
151M
    for (; i < src_len; i++) {
11602
144M
        ch = PyUnicode_READ(kind, src_data, i);
11603
144M
        if (ch == '\t') {
11604
12.5M
            found = 1;
11605
12.5M
            if (tabsize > 0) {
11606
12.5M
                incr = tabsize - (line_pos % tabsize); /* cannot overflow */
11607
12.5M
                if (j > PY_SSIZE_T_MAX - incr)
11608
0
                    goto overflow;
11609
12.5M
                line_pos += incr;
11610
12.5M
                j += incr;
11611
12.5M
            }
11612
12.5M
        }
11613
131M
        else {
11614
131M
            if (j > PY_SSIZE_T_MAX - 1)
11615
0
                goto overflow;
11616
131M
            line_pos++;
11617
131M
            j++;
11618
131M
            if (ch == '\n' || ch == '\r')
11619
15.7k
                line_pos = 0;
11620
131M
        }
11621
144M
    }
11622
7.21M
    if (!found)
11623
6.99M
        return unicode_result_unchanged(self);
11624
11625
    /* Second pass: create output string and fill it */
11626
218k
    u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
11627
218k
    if (!u)
11628
0
        return NULL;
11629
218k
    dest_data = PyUnicode_DATA(u);
11630
11631
218k
    i = j = line_pos = 0;
11632
11633
32.9M
    for (; i < src_len; i++) {
11634
32.6M
        ch = PyUnicode_READ(kind, src_data, i);
11635
32.6M
        if (ch == '\t') {
11636
12.5M
            if (tabsize > 0) {
11637
12.5M
                incr = tabsize - (line_pos % tabsize);
11638
12.5M
                line_pos += incr;
11639
12.5M
                _PyUnicode_Fill(kind, dest_data, ' ', j, incr);
11640
12.5M
                j += incr;
11641
12.5M
            }
11642
12.5M
        }
11643
20.1M
        else {
11644
20.1M
            line_pos++;
11645
20.1M
            PyUnicode_WRITE(kind, dest_data, j, ch);
11646
20.1M
            j++;
11647
20.1M
            if (ch == '\n' || ch == '\r')
11648
0
                line_pos = 0;
11649
20.1M
        }
11650
32.6M
    }
11651
218k
    assert (j == PyUnicode_GET_LENGTH(u));
11652
218k
    return unicode_result(u);
11653
11654
0
  overflow:
11655
0
    PyErr_SetString(PyExc_OverflowError, "new string is too long");
11656
0
    return NULL;
11657
218k
}
11658
11659
/*[clinic input]
11660
@permit_long_summary
11661
str.find as unicode_find = str.count
11662
11663
Return the lowest index in S where substring sub is found, such that sub is contained within S[start:end].
11664
11665
Optional arguments start and end are interpreted as in slice notation.
11666
Return -1 on failure.
11667
[clinic start generated code]*/
11668
11669
static Py_ssize_t
11670
unicode_find_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
11671
                  Py_ssize_t end)
11672
/*[clinic end generated code: output=51dbe6255712e278 input=3a9d650fe4c24695]*/
11673
28.0M
{
11674
28.0M
    Py_ssize_t result = any_find_slice(str, substr, start, end, 1);
11675
28.0M
    if (result < 0) {
11676
6.45M
        return -1;
11677
6.45M
    }
11678
21.5M
    return result;
11679
28.0M
}
11680
11681
static PyObject *
11682
unicode_getitem(PyObject *self, Py_ssize_t index)
11683
66.9M
{
11684
66.9M
    const void *data;
11685
66.9M
    int kind;
11686
66.9M
    Py_UCS4 ch;
11687
11688
66.9M
    if (!PyUnicode_Check(self)) {
11689
0
        PyErr_BadArgument();
11690
0
        return NULL;
11691
0
    }
11692
66.9M
    if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11693
8.01k
        PyErr_SetString(PyExc_IndexError, "string index out of range");
11694
8.01k
        return NULL;
11695
8.01k
    }
11696
66.9M
    kind = PyUnicode_KIND(self);
11697
66.9M
    data = PyUnicode_DATA(self);
11698
66.9M
    ch = PyUnicode_READ(kind, data, index);
11699
66.9M
    return unicode_char(ch);
11700
66.9M
}
11701
11702
/* Believe it or not, this produces the same value for ASCII strings
11703
   as bytes_hash(). */
11704
static Py_hash_t
11705
unicode_hash(PyObject *self)
11706
1.04G
{
11707
1.04G
    Py_uhash_t x;  /* Unsigned for defined overflow behavior. */
11708
11709
#ifdef Py_DEBUG
11710
    assert(_Py_HashSecret_Initialized);
11711
#endif
11712
1.04G
    Py_hash_t hash = PyUnicode_HASH(self);
11713
1.04G
    if (hash != -1) {
11714
984M
        return hash;
11715
984M
    }
11716
61.8M
    x = Py_HashBuffer(PyUnicode_DATA(self),
11717
61.8M
                      PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
11718
11719
61.8M
    PyUnicode_SET_HASH(self, x);
11720
61.8M
    return x;
11721
1.04G
}
11722
11723
/*[clinic input]
11724
@permit_long_summary
11725
str.index as unicode_index = str.count
11726
11727
Return the lowest index in S where substring sub is found, such that sub is contained within S[start:end].
11728
11729
Optional arguments start and end are interpreted as in slice notation.
11730
Raises ValueError when the substring is not found.
11731
[clinic start generated code]*/
11732
11733
static Py_ssize_t
11734
unicode_index_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
11735
                   Py_ssize_t end)
11736
/*[clinic end generated code: output=77558288837cdf40 input=ae5e48f69ed75b06]*/
11737
13.9k
{
11738
13.9k
    Py_ssize_t result = any_find_slice(str, substr, start, end, 1);
11739
13.9k
    if (result == -1) {
11740
649
        PyErr_SetString(PyExc_ValueError, "substring not found");
11741
649
    }
11742
13.3k
    else if (result < 0) {
11743
0
        return -1;
11744
0
    }
11745
13.9k
    return result;
11746
13.9k
}
11747
11748
/*[clinic input]
11749
str.isascii as unicode_isascii
11750
11751
Return True if all characters in the string are ASCII, False otherwise.
11752
11753
ASCII characters have code points in the range U+0000-U+007F.
11754
Empty string is ASCII too.
11755
[clinic start generated code]*/
11756
11757
static PyObject *
11758
unicode_isascii_impl(PyObject *self)
11759
/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11760
8.60k
{
11761
8.60k
    return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11762
8.60k
}
11763
11764
/*[clinic input]
11765
@permit_long_docstring_body
11766
str.islower as unicode_islower
11767
11768
Return True if the string is a lowercase string, False otherwise.
11769
11770
A string is lowercase if all cased characters in the string are lowercase and
11771
there is at least one cased character in the string.
11772
[clinic start generated code]*/
11773
11774
static PyObject *
11775
unicode_islower_impl(PyObject *self)
11776
/*[clinic end generated code: output=dbd41995bd005b81 input=c6fc0295241a1aaa]*/
11777
0
{
11778
0
    Py_ssize_t i, length;
11779
0
    int kind;
11780
0
    const void *data;
11781
0
    int cased;
11782
11783
0
    length = PyUnicode_GET_LENGTH(self);
11784
0
    kind = PyUnicode_KIND(self);
11785
0
    data = PyUnicode_DATA(self);
11786
11787
    /* Shortcut for single character strings */
11788
0
    if (length == 1)
11789
0
        return PyBool_FromLong(
11790
0
            Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
11791
11792
    /* Special case for empty strings */
11793
0
    if (length == 0)
11794
0
        Py_RETURN_FALSE;
11795
11796
0
    cased = 0;
11797
0
    for (i = 0; i < length; i++) {
11798
0
        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11799
11800
0
        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11801
0
            Py_RETURN_FALSE;
11802
0
        else if (!cased && Py_UNICODE_ISLOWER(ch))
11803
0
            cased = 1;
11804
0
    }
11805
0
    return PyBool_FromLong(cased);
11806
0
}
11807
11808
/*[clinic input]
11809
@permit_long_docstring_body
11810
str.isupper as unicode_isupper
11811
11812
Return True if the string is an uppercase string, False otherwise.
11813
11814
A string is uppercase if all cased characters in the string are uppercase and
11815
there is at least one cased character in the string.
11816
[clinic start generated code]*/
11817
11818
static PyObject *
11819
unicode_isupper_impl(PyObject *self)
11820
/*[clinic end generated code: output=049209c8e7f15f59 input=8d5cb33e67efde72]*/
11821
7.00k
{
11822
7.00k
    Py_ssize_t i, length;
11823
7.00k
    int kind;
11824
7.00k
    const void *data;
11825
7.00k
    int cased;
11826
11827
7.00k
    length = PyUnicode_GET_LENGTH(self);
11828
7.00k
    kind = PyUnicode_KIND(self);
11829
7.00k
    data = PyUnicode_DATA(self);
11830
11831
    /* Shortcut for single character strings */
11832
7.00k
    if (length == 1)
11833
0
        return PyBool_FromLong(
11834
0
            Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
11835
11836
    /* Special case for empty strings */
11837
7.00k
    if (length == 0)
11838
0
        Py_RETURN_FALSE;
11839
11840
7.00k
    cased = 0;
11841
89.3k
    for (i = 0; i < length; i++) {
11842
83.2k
        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11843
11844
83.2k
        if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11845
840
            Py_RETURN_FALSE;
11846
82.3k
        else if (!cased && Py_UNICODE_ISUPPER(ch))
11847
6.24k
            cased = 1;
11848
83.2k
    }
11849
6.16k
    return PyBool_FromLong(cased);
11850
7.00k
}
11851
11852
/*[clinic input]
11853
str.istitle as unicode_istitle
11854
11855
Return True if the string is a title-cased string, False otherwise.
11856
11857
In a title-cased string, upper- and title-case characters may only
11858
follow uncased characters and lowercase characters only cased ones.
11859
[clinic start generated code]*/
11860
11861
static PyObject *
11862
unicode_istitle_impl(PyObject *self)
11863
/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
11864
0
{
11865
0
    Py_ssize_t i, length;
11866
0
    int kind;
11867
0
    const void *data;
11868
0
    int cased, previous_is_cased;
11869
11870
0
    length = PyUnicode_GET_LENGTH(self);
11871
0
    kind = PyUnicode_KIND(self);
11872
0
    data = PyUnicode_DATA(self);
11873
11874
    /* Shortcut for single character strings */
11875
0
    if (length == 1) {
11876
0
        Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11877
0
        return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11878
0
                               (Py_UNICODE_ISUPPER(ch) != 0));
11879
0
    }
11880
11881
    /* Special case for empty strings */
11882
0
    if (length == 0)
11883
0
        Py_RETURN_FALSE;
11884
11885
0
    cased = 0;
11886
0
    previous_is_cased = 0;
11887
0
    for (i = 0; i < length; i++) {
11888
0
        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11889
11890
0
        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11891
0
            if (previous_is_cased)
11892
0
                Py_RETURN_FALSE;
11893
0
            previous_is_cased = 1;
11894
0
            cased = 1;
11895
0
        }
11896
0
        else if (Py_UNICODE_ISLOWER(ch)) {
11897
0
            if (!previous_is_cased)
11898
0
                Py_RETURN_FALSE;
11899
0
            previous_is_cased = 1;
11900
0
            cased = 1;
11901
0
        }
11902
0
        else
11903
0
            previous_is_cased = 0;
11904
0
    }
11905
0
    return PyBool_FromLong(cased);
11906
0
}
11907
11908
/*[clinic input]
11909
@permit_long_docstring_body
11910
str.isspace as unicode_isspace
11911
11912
Return True if the string is a whitespace string, False otherwise.
11913
11914
A string is whitespace if all characters in the string are whitespace and there
11915
is at least one character in the string.
11916
[clinic start generated code]*/
11917
11918
static PyObject *
11919
unicode_isspace_impl(PyObject *self)
11920
/*[clinic end generated code: output=163a63bfa08ac2b9 input=44fe05e248c6e159]*/
11921
23.3M
{
11922
23.3M
    Py_ssize_t i, length;
11923
23.3M
    int kind;
11924
23.3M
    const void *data;
11925
11926
23.3M
    length = PyUnicode_GET_LENGTH(self);
11927
23.3M
    kind = PyUnicode_KIND(self);
11928
23.3M
    data = PyUnicode_DATA(self);
11929
11930
    /* Shortcut for single character strings */
11931
23.3M
    if (length == 1)
11932
23.3M
        return PyBool_FromLong(
11933
23.3M
            Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
11934
11935
    /* Special case for empty strings */
11936
0
    if (length == 0)
11937
0
        Py_RETURN_FALSE;
11938
11939
0
    for (i = 0; i < length; i++) {
11940
0
        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11941
0
        if (!Py_UNICODE_ISSPACE(ch))
11942
0
            Py_RETURN_FALSE;
11943
0
    }
11944
0
    Py_RETURN_TRUE;
11945
0
}
11946
11947
/*[clinic input]
11948
@permit_long_docstring_body
11949
str.isalpha as unicode_isalpha
11950
11951
Return True if the string is an alphabetic string, False otherwise.
11952
11953
A string is alphabetic if all characters in the string are alphabetic and there
11954
is at least one character in the string.
11955
[clinic start generated code]*/
11956
11957
static PyObject *
11958
unicode_isalpha_impl(PyObject *self)
11959
/*[clinic end generated code: output=cc81b9ac3883ec4f input=c233000624a56e0d]*/
11960
7
{
11961
7
    Py_ssize_t i, length;
11962
7
    int kind;
11963
7
    const void *data;
11964
11965
7
    length = PyUnicode_GET_LENGTH(self);
11966
7
    kind = PyUnicode_KIND(self);
11967
7
    data = PyUnicode_DATA(self);
11968
11969
    /* Shortcut for single character strings */
11970
7
    if (length == 1)
11971
5
        return PyBool_FromLong(
11972
5
            Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
11973
11974
    /* Special case for empty strings */
11975
2
    if (length == 0)
11976
0
        Py_RETURN_FALSE;
11977
11978
2
    for (i = 0; i < length; i++) {
11979
2
        if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
11980
2
            Py_RETURN_FALSE;
11981
2
    }
11982
2
    Py_RETURN_TRUE;
11983
2
}
11984
11985
/*[clinic input]
11986
@permit_long_docstring_body
11987
str.isalnum as unicode_isalnum
11988
11989
Return True if the string is an alpha-numeric string, False otherwise.
11990
11991
A string is alpha-numeric if all characters in the string are alpha-numeric and
11992
there is at least one character in the string.
11993
[clinic start generated code]*/
11994
11995
static PyObject *
11996
unicode_isalnum_impl(PyObject *self)
11997
/*[clinic end generated code: output=a5a23490ffc3660c input=5d63ba9c9bafdb6b]*/
11998
0
{
11999
0
    int kind;
12000
0
    const void *data;
12001
0
    Py_ssize_t len, i;
12002
12003
0
    kind = PyUnicode_KIND(self);
12004
0
    data = PyUnicode_DATA(self);
12005
0
    len = PyUnicode_GET_LENGTH(self);
12006
12007
    /* Shortcut for single character strings */
12008
0
    if (len == 1) {
12009
0
        const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12010
0
        return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12011
0
    }
12012
12013
    /* Special case for empty strings */
12014
0
    if (len == 0)
12015
0
        Py_RETURN_FALSE;
12016
12017
0
    for (i = 0; i < len; i++) {
12018
0
        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12019
0
        if (!Py_UNICODE_ISALNUM(ch))
12020
0
            Py_RETURN_FALSE;
12021
0
    }
12022
0
    Py_RETURN_TRUE;
12023
0
}
12024
12025
/*[clinic input]
12026
@permit_long_docstring_body
12027
str.isdecimal as unicode_isdecimal
12028
12029
Return True if the string is a decimal string, False otherwise.
12030
12031
A string is a decimal string if all characters in the string are decimal and
12032
there is at least one character in the string.
12033
[clinic start generated code]*/
12034
12035
static PyObject *
12036
unicode_isdecimal_impl(PyObject *self)
12037
/*[clinic end generated code: output=fb2dcdb62d3fc548 input=8e84a58b414935a3]*/
12038
416
{
12039
416
    Py_ssize_t i, length;
12040
416
    int kind;
12041
416
    const void *data;
12042
12043
416
    length = PyUnicode_GET_LENGTH(self);
12044
416
    kind = PyUnicode_KIND(self);
12045
416
    data = PyUnicode_DATA(self);
12046
12047
    /* Shortcut for single character strings */
12048
416
    if (length == 1)
12049
12
        return PyBool_FromLong(
12050
12
            Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
12051
12052
    /* Special case for empty strings */
12053
404
    if (length == 0)
12054
0
        Py_RETURN_FALSE;
12055
12056
926
    for (i = 0; i < length; i++) {
12057
876
        if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
12058
354
            Py_RETURN_FALSE;
12059
876
    }
12060
404
    Py_RETURN_TRUE;
12061
404
}
12062
12063
/*[clinic input]
12064
@permit_long_docstring_body
12065
str.isdigit as unicode_isdigit
12066
12067
Return True if the string is a digit string, False otherwise.
12068
12069
A string is a digit string if all characters in the string are digits and there
12070
is at least one character in the string.
12071
[clinic start generated code]*/
12072
12073
static PyObject *
unicode_isdigit_impl(PyObject *self)
/*[clinic end generated code: output=10a6985311da6858 input=99e284affb54d4a0]*/
{
    /* str.isdigit(): True only for a non-empty string in which every
       character satisfies Py_UNICODE_ISDIGIT(). */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* The empty string never qualifies. */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }

    /* One character: answer directly without entering the loop. */
    if (nchars == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISDIGIT(only));
    }

    /* Stop at the first character that is not a digit. */
    for (Py_ssize_t pos = 0; pos < nchars; pos++) {
        if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, pos))) {
            Py_RETURN_FALSE;
        }
    }
    Py_RETURN_TRUE;
}
12101
12102
/*[clinic input]
12103
@permit_long_docstring_body
12104
str.isnumeric as unicode_isnumeric
12105
12106
Return True if the string is a numeric string, False otherwise.
12107
12108
A string is numeric if all characters in the string are numeric and there is at
12109
least one character in the string.
12110
[clinic start generated code]*/
12111
12112
static PyObject *
unicode_isnumeric_impl(PyObject *self)
/*[clinic end generated code: output=9172a32d9013051a input=e9f5b6b8b29b0ee6]*/
{
    /* str.isnumeric(): True only for a non-empty string in which every
       character satisfies Py_UNICODE_ISNUMERIC(). */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* The empty string never qualifies. */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }

    /* One character: answer directly without entering the loop. */
    if (nchars == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
    }

    /* Stop at the first character that is not numeric. */
    for (Py_ssize_t pos = 0; pos < nchars; pos++) {
        if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, pos))) {
            Py_RETURN_FALSE;
        }
    }
    Py_RETURN_TRUE;
}
12139
12140
Py_ssize_t
12141
_PyUnicode_ScanIdentifier(PyObject *self)
12142
13.3k
{
12143
13.3k
    Py_ssize_t i;
12144
13.3k
    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12145
13.3k
    if (len == 0) {
12146
        /* an empty string is not a valid identifier */
12147
0
        return 0;
12148
0
    }
12149
12150
13.3k
    int kind = PyUnicode_KIND(self);
12151
13.3k
    const void *data = PyUnicode_DATA(self);
12152
13.3k
    Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12153
    /* PEP 3131 says that the first character must be in
12154
       XID_Start and subsequent characters in XID_Continue,
12155
       and for the ASCII range, the 2.x rules apply (i.e
12156
       start with letters and underscore, continue with
12157
       letters, digits, underscore). However, given the current
12158
       definition of XID_Start and XID_Continue, it is sufficient
12159
       to check just for these, except that _ must be allowed
12160
       as starting an identifier.  */
12161
13.3k
    if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12162
674
        return 0;
12163
674
    }
12164
12165
52.3k
    for (i = 1; i < len; i++) {
12166
39.9k
        ch = PyUnicode_READ(kind, data, i);
12167
39.9k
        if (!_PyUnicode_IsXidContinue(ch)) {
12168
302
            return i;
12169
302
        }
12170
39.9k
    }
12171
12.3k
    return i;
12172
12.6k
}
12173
12174
int
12175
PyUnicode_IsIdentifier(PyObject *self)
12176
1.91k
{
12177
1.91k
    Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
12178
1.91k
    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12179
    /* an empty string is not a valid identifier */
12180
1.91k
    return len && i == len;
12181
1.91k
}
12182
12183
/*[clinic input]
12184
@permit_long_docstring_body
12185
str.isidentifier as unicode_isidentifier
12186
12187
Return True if the string is a valid Python identifier, False otherwise.
12188
12189
Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
12190
such as "def" or "class".
12191
[clinic start generated code]*/
12192
12193
static PyObject *
unicode_isidentifier_impl(PyObject *self)
/*[clinic end generated code: output=fe585a9666572905 input=86315dd889d7bd04]*/
{
    /* str.isidentifier(): wrap the C-level predicate in a Python bool. */
    const int is_ident = PyUnicode_IsIdentifier(self);
    return PyBool_FromLong(is_ident);
}
12199
12200
/*[clinic input]
12201
@permit_long_summary
12202
str.isprintable as unicode_isprintable
12203
12204
Return True if all characters in the string are printable, False otherwise.
12205
12206
A character is printable if repr() may use it in its output.
12207
[clinic start generated code]*/
12208
12209
static PyObject *
unicode_isprintable_impl(PyObject *self)
/*[clinic end generated code: output=3ab9626cd32dd1a0 input=18345ba847084ec5]*/
{
    /* str.isprintable(): True when every character satisfies
       Py_UNICODE_ISPRINTABLE().  Unlike the other is*() predicates there
       is no empty-string special case, so "" yields True. */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* One character: answer directly without entering the loop. */
    if (nchars == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
    }

    /* Stop at the first non-printable character. */
    for (Py_ssize_t pos = 0; pos < nchars; pos++) {
        if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, pos))) {
            Py_RETURN_FALSE;
        }
    }
    Py_RETURN_TRUE;
}
12233
12234
/*[clinic input]
12235
@permit_long_docstring_body
12236
str.join as unicode_join
12237
12238
    iterable: object
12239
    /
12240
12241
Concatenate any number of strings.
12242
12243
The string whose method is called is inserted in between each given string.
12244
The result is returned as a new string.
12245
12246
Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12247
[clinic start generated code]*/
12248
12249
static PyObject *
unicode_join(PyObject *self, PyObject *iterable)
/*[clinic end generated code: output=6857e7cecfe7bf98 input=bac724ed412ef3f8]*/
{
    /* str.join(): self is the separator; the work is done by
       PyUnicode_Join(), whose result (or NULL) is passed straight back. */
    PyObject *joined = PyUnicode_Join(self, iterable);
    return joined;
}
12255
12256
static Py_ssize_t
12257
unicode_length(PyObject *self)
12258
40.6M
{
12259
40.6M
    return PyUnicode_GET_LENGTH(self);
12260
40.6M
}
12261
12262
/*[clinic input]
12263
str.ljust as unicode_ljust
12264
12265
    width: Py_ssize_t
12266
    fillchar: Py_UCS4 = ' '
12267
    /
12268
12269
Return a left-justified string of length width.
12270
12271
Padding is done using the specified fill character (default is a space).
12272
[clinic start generated code]*/
12273
12274
static PyObject *
12275
unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12276
/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
12277
0
{
12278
0
    if (PyUnicode_GET_LENGTH(self) >= width)
12279
0
        return unicode_result_unchanged(self);
12280
12281
0
    return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
12282
0
}
12283
12284
/*[clinic input]
12285
str.lower as unicode_lower
12286
12287
Return a copy of the string converted to lowercase.
12288
[clinic start generated code]*/
12289
12290
static PyObject *
unicode_lower_impl(PyObject *self)
/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
{
    /* str.lower(): non-ASCII strings go through the generic case
       machinery; ASCII strings use the dedicated ASCII helper. */
    if (!PyUnicode_IS_ASCII(self)) {
        return case_operation(self, do_lower);
    }
    return ascii_upper_or_lower(self, 1);
}
12298
12299
81.7M
/* Strip modes.  The values double as indexes into stripfuncnames[], so
   the two must be kept in step. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names indexed by the strip modes above.  Both the strings and the
   table of pointers are const: nothing may modify this table at runtime. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Name of the str method implementing strip mode i (for error messages). */
#define STRIPNAME(i) (stripfuncnames[i])
12307
12308
/* externally visible for str.strip(unicode) */
12309
PyObject *
_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
{
    /* Strip from self every leading and/or trailing character (as selected
       by striptype) that occurs in sepobj, and return the remaining slice.
       A bloom mask over sepobj rejects most non-members cheaply; only
       characters that pass it are confirmed with PyUnicode_FindChar(). */
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);
    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
    const Py_ssize_t seplen = PyUnicode_GET_LENGTH(sepobj);
    const BLOOM_MASK sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
                                               PyUnicode_DATA(sepobj),
                                               seplen);

    /* Index of the first character to keep. */
    Py_ssize_t first = 0;
    if (striptype != RIGHTSTRIP) {
        for (; first < len; first++) {
            const Py_UCS4 ch = PyUnicode_READ(kind, data, first);
            if (!BLOOM(sepmask, ch)) {
                break;
            }
            if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0) {
                break;
            }
        }
    }

    /* One past the last character to keep; never moves below first. */
    Py_ssize_t stop = len;
    if (striptype != LEFTSTRIP) {
        while (stop > first) {
            const Py_UCS4 ch = PyUnicode_READ(kind, data, stop - 1);
            if (!BLOOM(sepmask, ch)) {
                break;
            }
            if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0) {
                break;
            }
            stop--;
        }
    }

    return PyUnicode_Substring(self, first, stop);
}
12355
12356
PyObject*
12357
PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12358
279M
{
12359
279M
    const unsigned char *data;
12360
279M
    int kind;
12361
279M
    Py_ssize_t length;
12362
12363
279M
    length = PyUnicode_GET_LENGTH(self);
12364
279M
    end = Py_MIN(end, length);
12365
12366
279M
    if (start == 0 && end == length)
12367
77.7M
        return unicode_result_unchanged(self);
12368
12369
201M
    if (start < 0 || end < 0) {
12370
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
12371
0
        return NULL;
12372
0
    }
12373
201M
    if (start >= length || end < start)
12374
5.00M
        _Py_RETURN_UNICODE_EMPTY();
12375
12376
196M
    length = end - start;
12377
196M
    if (PyUnicode_IS_ASCII(self)) {
12378
57.8M
        data = PyUnicode_1BYTE_DATA(self);
12379
57.8M
        return _PyUnicode_FromASCII((const char*)(data + start), length);
12380
57.8M
    }
12381
138M
    else {
12382
138M
        kind = PyUnicode_KIND(self);
12383
138M
        data = PyUnicode_1BYTE_DATA(self);
12384
138M
        return PyUnicode_FromKindAndData(kind,
12385
138M
                                         data + kind * start,
12386
138M
                                         length);
12387
138M
    }
12388
196M
}
12389
12390
static PyObject *
do_strip(PyObject *self, int striptype)
{
    /* Strip whitespace from the side(s) of self selected by striptype and
       return the remaining slice.  ASCII strings consult the
       _Py_ascii_whitespace table directly; all other strings use
       Py_UNICODE_ISSPACE(). */
    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
    const int strip_left = (striptype != RIGHTSTRIP);
    const int strip_right = (striptype != LEFTSTRIP);

    Py_ssize_t first = 0;   /* index of the first character to keep */
    Py_ssize_t stop = len;  /* one past the last character to keep */

    if (PyUnicode_IS_ASCII(self)) {
        const Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(self);

        if (strip_left) {
            while (first < len && _Py_ascii_whitespace[bytes[first]]) {
                first++;
            }
        }
        if (strip_right) {
            while (stop > first && _Py_ascii_whitespace[bytes[stop - 1]]) {
                stop--;
            }
        }
    }
    else {
        const int kind = PyUnicode_KIND(self);
        const void *data = PyUnicode_DATA(self);

        if (strip_left) {
            for (; first < len; first++) {
                const Py_UCS4 ch = PyUnicode_READ(kind, data, first);
                if (!Py_UNICODE_ISSPACE(ch)) {
                    break;
                }
            }
        }
        if (strip_right) {
            for (; stop > first; stop--) {
                const Py_UCS4 ch = PyUnicode_READ(kind, data, stop - 1);
                if (!Py_UNICODE_ISSPACE(ch)) {
                    break;
                }
            }
        }
    }

    return PyUnicode_Substring(self, first, stop);
}
12451
12452
12453
static PyObject *
do_argstrip(PyObject *self, int striptype, PyObject *sep)
{
    /* Shared body of str.strip/lstrip/rstrip: dispatch on the optional
       chars argument. */

    /* No character set given: strip whitespace. */
    if (sep == Py_None) {
        return do_strip(self, striptype);
    }

    /* Anything other than None must be a str. */
    if (!PyUnicode_Check(sep)) {
        PyErr_Format(PyExc_TypeError,
                     "%s arg must be None or str",
                     STRIPNAME(striptype));
        return NULL;
    }

    return _PyUnicode_XStrip(self, striptype, sep);
}
12469
12470
12471
/*[clinic input]
12472
@permit_long_summary
12473
str.strip as unicode_strip
12474
12475
    chars: object = None
12476
    /
12477
12478
Return a copy of the string with leading and trailing whitespace removed.
12479
12480
If chars is given and not None, remove characters in chars instead.
12481
[clinic start generated code]*/
12482
12483
static PyObject *
unicode_strip_impl(PyObject *self, PyObject *chars)
/*[clinic end generated code: output=ca19018454345d57 input=8bc6353450345fbd]*/
{
    /* str.strip(): strip on both sides. */
    PyObject *stripped = do_argstrip(self, BOTHSTRIP, chars);
    return stripped;
}
12489
12490
12491
/*[clinic input]
12492
str.lstrip as unicode_lstrip
12493
12494
    chars: object = None
12495
    /
12496
12497
Return a copy of the string with leading whitespace removed.
12498
12499
If chars is given and not None, remove characters in chars instead.
12500
[clinic start generated code]*/
12501
12502
static PyObject *
unicode_lstrip_impl(PyObject *self, PyObject *chars)
/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
{
    /* str.lstrip(): strip on the left side only. */
    PyObject *stripped = do_argstrip(self, LEFTSTRIP, chars);
    return stripped;
}
12508
12509
12510
/*[clinic input]
12511
str.rstrip as unicode_rstrip
12512
12513
    chars: object = None
12514
    /
12515
12516
Return a copy of the string with trailing whitespace removed.
12517
12518
If chars is given and not None, remove characters in chars instead.
12519
[clinic start generated code]*/
12520
12521
static PyObject *
unicode_rstrip_impl(PyObject *self, PyObject *chars)
/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
{
    /* str.rstrip(): strip on the right side only. */
    PyObject *stripped = do_argstrip(self, RIGHTSTRIP, chars);
    return stripped;
}
12527
12528
12529
static PyObject*
12530
unicode_repeat(PyObject *str, Py_ssize_t len)
12531
682k
{
12532
682k
    PyObject *u;
12533
682k
    Py_ssize_t nchars, n;
12534
12535
682k
    if (len < 1)
12536
32.4k
        _Py_RETURN_UNICODE_EMPTY();
12537
12538
    /* no repeat, return original string */
12539
649k
    if (len == 1)
12540
133k
        return unicode_result_unchanged(str);
12541
12542
516k
    if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
12543
0
        PyErr_SetString(PyExc_OverflowError,
12544
0
                        "repeated string is too long");
12545
0
        return NULL;
12546
0
    }
12547
516k
    nchars = len * PyUnicode_GET_LENGTH(str);
12548
12549
516k
    u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
12550
516k
    if (!u)
12551
0
        return NULL;
12552
516k
    assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
12553
12554
516k
    if (PyUnicode_GET_LENGTH(str) == 1) {
12555
513k
        int kind = PyUnicode_KIND(str);
12556
513k
        Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
12557
513k
        if (kind == PyUnicode_1BYTE_KIND) {
12558
513k
            void *to = PyUnicode_DATA(u);
12559
513k
            memset(to, (unsigned char)fill_char, len);
12560
513k
        }
12561
0
        else if (kind == PyUnicode_2BYTE_KIND) {
12562
0
            Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
12563
0
            for (n = 0; n < len; ++n)
12564
0
                ucs2[n] = fill_char;
12565
0
        } else {
12566
0
            Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12567
0
            assert(kind == PyUnicode_4BYTE_KIND);
12568
0
            for (n = 0; n < len; ++n)
12569
0
                ucs4[n] = fill_char;
12570
0
        }
12571
513k
    }
12572
2.36k
    else {
12573
2.36k
        Py_ssize_t char_size = PyUnicode_KIND(str);
12574
2.36k
        char *to = (char *) PyUnicode_DATA(u);
12575
2.36k
        _PyBytes_Repeat(to, nchars * char_size, PyUnicode_DATA(str),
12576
2.36k
            PyUnicode_GET_LENGTH(str) * char_size);
12577
2.36k
    }
12578
12579
516k
    assert(_PyUnicode_CheckConsistency(u, 1));
12580
516k
    return u;
12581
516k
}
12582
12583
PyObject *
12584
PyUnicode_Replace(PyObject *str,
12585
                  PyObject *substr,
12586
                  PyObject *replstr,
12587
                  Py_ssize_t maxcount)
12588
3
{
12589
3
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12590
3
            ensure_unicode(replstr) < 0)
12591
0
        return NULL;
12592
3
    return replace(str, substr, replstr, maxcount);
12593
3
}
12594
12595
/*[clinic input]
12596
@permit_long_docstring_body
12597
str.replace as unicode_replace
12598
12599
    old: unicode
12600
    new: unicode
12601
    /
12602
    count: Py_ssize_t = -1
12603
        Maximum number of occurrences to replace.
12604
        -1 (the default value) means replace all occurrences.
12605
12606
Return a copy with all occurrences of substring old replaced by new.
12607
12608
If the optional argument count is given, only the first count occurrences are
12609
replaced.
12610
[clinic start generated code]*/
12611
12612
static PyObject *
12613
unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12614
                     Py_ssize_t count)
12615
/*[clinic end generated code: output=b63f1a8b5eebf448 input=f27ca92ac46b65a1]*/
12616
76.6M
{
12617
76.6M
    return replace(self, old, new, count);
12618
76.6M
}
12619
12620
/*[clinic input]
12621
@permit_long_docstring_body
12622
str.removeprefix as unicode_removeprefix
12623
12624
    prefix: unicode
12625
    /
12626
12627
Return a str with the given prefix string removed if present.
12628
12629
If the string starts with the prefix string, return string[len(prefix):].
12630
Otherwise, return a copy of the original string.
12631
[clinic start generated code]*/
12632
12633
static PyObject *
unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
/*[clinic end generated code: output=f1e5945e9763bcb9 input=1989a856dbb813f1]*/
{
    /* str.removeprefix(): drop prefix from the front of self when self
       starts with it.  tailmatch() yields -1 on error, otherwise a
       truth value. */
    const int matched = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);
    if (matched == -1) {
        return NULL;
    }
    if (!matched) {
        return unicode_result_unchanged(self);
    }

    /* Keep everything after the prefix. */
    const Py_ssize_t from = PyUnicode_GET_LENGTH(prefix);
    const Py_ssize_t to = PyUnicode_GET_LENGTH(self);
    return PyUnicode_Substring(self, from, to);
}
12647
12648
/*[clinic input]
12649
str.removesuffix as unicode_removesuffix
12650
12651
    suffix: unicode
12652
    /
12653
12654
Return a str with the given suffix string removed if present.
12655
12656
If the string ends with the suffix string and that suffix is not empty,
12657
return string[:-len(suffix)]. Otherwise, return a copy of the original
12658
string.
12659
[clinic start generated code]*/
12660
12661
static PyObject *
unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
/*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
{
    /* str.removesuffix(): drop suffix from the end of self when self
       ends with it.  tailmatch() yields -1 on error, otherwise a
       truth value. */
    const int matched = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);
    if (matched == -1) {
        return NULL;
    }
    if (!matched) {
        return unicode_result_unchanged(self);
    }

    /* Keep everything before the suffix. */
    const Py_ssize_t keep = PyUnicode_GET_LENGTH(self)
                            - PyUnicode_GET_LENGTH(suffix);
    return PyUnicode_Substring(self, 0, keep);
}
12675
12676
static PyObject *
unicode_repr(PyObject *unicode)
{
    /* Build the repr() of a str in two passes.

       Pass 1 walks the input to compute the exact output length, the
       widest character the output must hold, and how many single and
       double quotes occur (which decides the enclosing quote character).
       Pass 2 allocates the result and fills it in, either by a plain copy
       when nothing needs escaping or through the kind-specific
       ucsNlib_repr() helper.

       Returns a new reference, or NULL with an exception set:
       OverflowError when the output length would exceed PY_SSIZE_T_MAX,
       or whatever PyUnicode_New() raised. */
    Py_ssize_t isize = PyUnicode_GET_LENGTH(unicode);
    const void *idata = PyUnicode_DATA(unicode);

    /* Compute length of output, quote characters, and
       maximum character */
    Py_ssize_t osize = 0;
    Py_UCS4 maxch = 127;
    Py_ssize_t squote = 0;
    Py_ssize_t dquote = 0;
    int ikind = PyUnicode_KIND(unicode);
    for (Py_ssize_t i = 0; i < isize; i++) {
        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
        Py_ssize_t incr = 1;
        switch (ch) {
        case '\'': squote++; break;
        case '"':  dquote++; break;
        case '\\': case '\t': case '\r': case '\n':
            incr = 2;
            break;
        default:
            /* Fast-path ASCII */
            if (ch < ' ' || ch == 0x7f)
                incr = 4; /* \xHH */
            else if (ch < 0x7f)
                ;
            else if (Py_UNICODE_ISPRINTABLE(ch))
                maxch = (ch > maxch) ? ch : maxch;
            else if (ch < 0x100)
                incr = 4; /* \xHH */
            else if (ch < 0x10000)
                incr = 6; /* \uHHHH */
            else
                incr = 10; /* \UHHHHHHHH */
        }
        if (osize > PY_SSIZE_T_MAX - incr) {
            PyErr_SetString(PyExc_OverflowError,
                            "string is too long to generate repr");
            return NULL;
        }
        osize += incr;
    }

    Py_UCS4 quote = '\'';
    int changed = (osize != isize);
    /* Number of quote characters that will need a backslash. */
    Py_ssize_t escaped_quotes = 0;
    if (squote) {
        changed = 1;
        if (dquote)
            /* Both squote and dquote present. Use squote,
               and escape them */
            escaped_quotes = squote;
        else
            quote = '"';
    }

    /* Add room for the quote-escaping backslashes and the two enclosing
       quotes.  Check first so these signed additions cannot overflow,
       matching the per-character check in the loop above. */
    if (osize > PY_SSIZE_T_MAX - 2
        || escaped_quotes > PY_SSIZE_T_MAX - 2 - osize)
    {
        PyErr_SetString(PyExc_OverflowError,
                        "string is too long to generate repr");
        return NULL;
    }
    osize += escaped_quotes + 2;

    PyObject *repr = PyUnicode_New(osize, maxch);
    if (repr == NULL)
        return NULL;
    int okind = PyUnicode_KIND(repr);
    void *odata = PyUnicode_DATA(repr);

    if (!changed) {
        /* Nothing to escape: quote, verbatim copy, quote. */
        PyUnicode_WRITE(okind, odata, 0, quote);

        _PyUnicode_FastCopyCharacters(repr, 1,
                                      unicode, 0,
                                      isize);

        PyUnicode_WRITE(okind, odata, osize-1, quote);
    }
    else {
        /* Dispatch on the output kind; the helpers are defined elsewhere
           and presumably write the quoted, escaped text into odata. */
        switch (okind) {
        case PyUnicode_1BYTE_KIND:
            ucs1lib_repr(unicode, quote, odata);
            break;
        case PyUnicode_2BYTE_KIND:
            ucs2lib_repr(unicode, quote, odata);
            break;
        default:
            assert(okind == PyUnicode_4BYTE_KIND);
            ucs4lib_repr(unicode, quote, odata);
        }
    }

    assert(_PyUnicode_CheckConsistency(repr, 1));
    return repr;
}
12766
12767
/*[clinic input]
12768
@permit_long_summary
12769
str.rfind as unicode_rfind = str.count
12770
12771
Return the highest index in S where substring sub is found, such that sub is contained within S[start:end].
12772
12773
Optional arguments start and end are interpreted as in slice notation.
12774
Return -1 on failure.
12775
[clinic start generated code]*/
12776
12777
static Py_ssize_t
12778
unicode_rfind_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
12779
                   Py_ssize_t end)
12780
/*[clinic end generated code: output=880b29f01dd014c8 input=7f7e97d5cd3299a2]*/
12781
255k
{
12782
255k
    Py_ssize_t result = any_find_slice(str, substr, start, end, -1);
12783
255k
    if (result < 0) {
12784
8.00k
        return -1;
12785
8.00k
    }
12786
247k
    return result;
12787
255k
}
12788
12789
/*[clinic input]
12790
@permit_long_summary
12791
str.rindex as unicode_rindex = str.count
12792
12793
Return the highest index in S where substring sub is found, such that sub is contained within S[start:end].
12794
12795
Optional arguments start and end are interpreted as in slice notation.
12796
Raises ValueError when the substring is not found.
12797
[clinic start generated code]*/
12798
12799
static Py_ssize_t
12800
unicode_rindex_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
12801
                    Py_ssize_t end)
12802
/*[clinic end generated code: output=5f3aef124c867fe1 input=0363a324740b3e62]*/
12803
126k
{
12804
126k
    Py_ssize_t result = any_find_slice(str, substr, start, end, -1);
12805
126k
    if (result == -1) {
12806
0
        PyErr_SetString(PyExc_ValueError, "substring not found");
12807
0
    }
12808
126k
    else if (result < 0) {
12809
0
        return -1;
12810
0
    }
12811
126k
    return result;
12812
126k
}
12813
12814
/*[clinic input]
12815
str.rjust as unicode_rjust
12816
12817
    width: Py_ssize_t
12818
    fillchar: Py_UCS4 = ' '
12819
    /
12820
12821
Return a right-justified string of length width.
12822
12823
Padding is done using the specified fill character (default is a space).
12824
[clinic start generated code]*/
12825
12826
static PyObject *
12827
unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12828
/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
12829
0
{
12830
0
    if (PyUnicode_GET_LENGTH(self) >= width)
12831
0
        return unicode_result_unchanged(self);
12832
12833
0
    return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
12834
0
}
12835
12836
PyObject *
12837
PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12838
0
{
12839
0
    if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
12840
0
        return NULL;
12841
12842
0
    return split(s, sep, maxsplit);
12843
0
}
12844
12845
/*[clinic input]
12846
@permit_long_summary
12847
str.split as unicode_split
12848
12849
    sep: object = None
12850
        The separator used to split the string.
12851
12852
        When set to None (the default value), will split on any whitespace
12853
        character (including \n \r \t \f and spaces) and will discard
12854
        empty strings from the result.
12855
    maxsplit: Py_ssize_t = -1
12856
        Maximum number of splits.
12857
        -1 (the default value) means no limit.
12858
12859
Return a list of the substrings in the string, using sep as the separator string.
12860
12861
Splitting starts at the front of the string and works to the end.
12862
12863
Note, str.split() is mainly useful for data that has been intentionally
12864
delimited.  With natural text that includes punctuation, consider using
12865
the regular expression module.
12866
12867
[clinic start generated code]*/
12868
12869
static PyObject *
12870
unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12871
/*[clinic end generated code: output=3a65b1db356948dc input=2c1fd08a78e038b8]*/
12872
23.2M
{
12873
23.2M
    if (sep == Py_None)
12874
181k
        return split(self, NULL, maxsplit);
12875
23.0M
    if (PyUnicode_Check(sep))
12876
23.0M
        return split(self, sep, maxsplit);
12877
12878
0
    PyErr_Format(PyExc_TypeError,
12879
0
                 "must be str or None, not %.100s",
12880
0
                 Py_TYPE(sep)->tp_name);
12881
0
    return NULL;
12882
23.0M
}
12883
12884
PyObject *
PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
{
    /* Partition str_obj around sep_obj by dispatching to the
       kind-specific *_partition() helper.  When the separator is wider
       or longer than the string, the result is built directly as
       (str_obj, "", ""). */
    if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0) {
        return NULL;
    }

    const int str_kind = PyUnicode_KIND(str_obj);
    const int sep_kind = PyUnicode_KIND(sep_obj);
    const Py_ssize_t str_len = PyUnicode_GET_LENGTH(str_obj);
    const Py_ssize_t sep_len = PyUnicode_GET_LENGTH(sep_obj);

    if (str_kind < sep_kind || str_len < sep_len) {
        PyObject *empty = _PyUnicode_GetEmpty();  // Borrowed reference
        return PyTuple_Pack(3, str_obj, empty, empty);
    }

    const void *str_buf = PyUnicode_DATA(str_obj);
    const void *sep_buf = PyUnicode_DATA(sep_obj);

    /* The helpers take both buffers in the same kind, so a narrower
       separator is first converted into a temporary buffer of the
       string's kind. */
    const int converted = (sep_kind != str_kind);
    if (converted) {
        sep_buf = unicode_askind(sep_kind, sep_buf, sep_len, str_kind);
        if (sep_buf == NULL) {
            return NULL;
        }
    }

    PyObject *result;
    switch (str_kind) {
    case PyUnicode_1BYTE_KIND:
        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) {
            result = asciilib_partition(str_obj, str_buf, str_len,
                                        sep_obj, sep_buf, sep_len);
        }
        else {
            result = ucs1lib_partition(str_obj, str_buf, str_len,
                                       sep_obj, sep_buf, sep_len);
        }
        break;
    case PyUnicode_2BYTE_KIND:
        result = ucs2lib_partition(str_obj, str_buf, str_len,
                                   sep_obj, sep_buf, sep_len);
        break;
    case PyUnicode_4BYTE_KIND:
        result = ucs4lib_partition(str_obj, str_buf, str_len,
                                   sep_obj, sep_buf, sep_len);
        break;
    default:
        Py_UNREACHABLE();
    }

    assert((sep_kind == str_kind) == (sep_buf == PyUnicode_DATA(sep_obj)));
    /* Release the temporary buffer made for the converted separator. */
    if (converted) {
        PyMem_Free((void *)sep_buf);
    }

    return result;
}
12934
12935
12936
PyObject *
PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
{
    /* Partition str_obj around sep_obj by dispatching to the
       kind-specific *_rpartition() helper.  When the separator is wider
       or longer than the string, the result is built directly as
       ("", "", str_obj). */
    if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0) {
        return NULL;
    }

    const int str_kind = PyUnicode_KIND(str_obj);
    const int sep_kind = PyUnicode_KIND(sep_obj);
    const Py_ssize_t str_len = PyUnicode_GET_LENGTH(str_obj);
    const Py_ssize_t sep_len = PyUnicode_GET_LENGTH(sep_obj);

    if (str_kind < sep_kind || str_len < sep_len) {
        PyObject *empty = _PyUnicode_GetEmpty();  // Borrowed reference
        return PyTuple_Pack(3, empty, empty, str_obj);
    }

    const void *str_buf = PyUnicode_DATA(str_obj);
    const void *sep_buf = PyUnicode_DATA(sep_obj);

    /* The helpers take both buffers in the same kind, so a narrower
       separator is first converted into a temporary buffer of the
       string's kind. */
    const int converted = (sep_kind != str_kind);
    if (converted) {
        sep_buf = unicode_askind(sep_kind, sep_buf, sep_len, str_kind);
        if (sep_buf == NULL) {
            return NULL;
        }
    }

    PyObject *result;
    switch (str_kind) {
    case PyUnicode_1BYTE_KIND:
        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) {
            result = asciilib_rpartition(str_obj, str_buf, str_len,
                                         sep_obj, sep_buf, sep_len);
        }
        else {
            result = ucs1lib_rpartition(str_obj, str_buf, str_len,
                                        sep_obj, sep_buf, sep_len);
        }
        break;
    case PyUnicode_2BYTE_KIND:
        result = ucs2lib_rpartition(str_obj, str_buf, str_len,
                                    sep_obj, sep_buf, sep_len);
        break;
    case PyUnicode_4BYTE_KIND:
        result = ucs4lib_rpartition(str_obj, str_buf, str_len,
                                    sep_obj, sep_buf, sep_len);
        break;
    default:
        Py_UNREACHABLE();
    }

    assert((sep_kind == str_kind) == (sep_buf == PyUnicode_DATA(sep_obj)));
    /* Release the temporary buffer made for the converted separator. */
    if (converted) {
        PyMem_Free((void *)sep_buf);
    }

    return result;
}
12986
12987
/*[clinic input]
12988
@permit_long_docstring_body
12989
str.partition as unicode_partition
12990
12991
    sep: object
12992
    /
12993
12994
Partition the string into three parts using the given separator.
12995
12996
This will search for the separator in the string.  If the separator is found,
12997
returns a 3-tuple containing the part before the separator, the separator
12998
itself, and the part after it.
12999
13000
If the separator is not found, returns a 3-tuple containing the original string
13001
and two empty strings.
13002
[clinic start generated code]*/
13003
13004
static PyObject *
unicode_partition(PyObject *self, PyObject *sep)
/*[clinic end generated code: output=e4ced7bd253ca3c4 input=4d854b520d7b0e97]*/
{
    /* str.partition(sep): the whole job is delegated to the C API entry
       point; its result (or NULL) is returned unchanged. */
    PyObject *parts = PyUnicode_Partition(self, sep);
    return parts;
}
13010
13011
/*[clinic input]
13012
@permit_long_docstring_body
13013
str.rpartition as unicode_rpartition = str.partition
13014
13015
Partition the string into three parts using the given separator.
13016
13017
This will search for the separator in the string, starting at the end. If
13018
the separator is found, returns a 3-tuple containing the part before the
13019
separator, the separator itself, and the part after it.
13020
13021
If the separator is not found, returns a 3-tuple containing two empty strings
13022
and the original string.
13023
[clinic start generated code]*/
13024
13025
static PyObject *
unicode_rpartition(PyObject *self, PyObject *sep)
/*[clinic end generated code: output=1aa13cf1156572aa input=a6adabe91e75b486]*/
{
    /* str.rpartition(sep): the whole job is delegated to the C API entry
       point; its result (or NULL) is returned unchanged. */
    PyObject *parts = PyUnicode_RPartition(self, sep);
    return parts;
}
13031
13032
PyObject *
13033
PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
13034
0
{
13035
0
    if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
13036
0
        return NULL;
13037
13038
0
    return rsplit(s, sep, maxsplit);
13039
0
}
13040
13041
/*[clinic input]
13042
@permit_long_summary
13043
str.rsplit as unicode_rsplit = str.split
13044
13045
Return a list of the substrings in the string, using sep as the separator string.
13046
13047
Splitting starts at the end of the string and works to the front.
13048
[clinic start generated code]*/
13049
13050
static PyObject *
13051
unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13052
/*[clinic end generated code: output=c2b815c63bcabffc input=0f762e30d267fa83]*/
13053
50
{
13054
50
    if (sep == Py_None)
13055
0
        return rsplit(self, NULL, maxsplit);
13056
50
    if (PyUnicode_Check(sep))
13057
50
        return rsplit(self, sep, maxsplit);
13058
13059
0
    PyErr_Format(PyExc_TypeError,
13060
0
                 "must be str or None, not %.100s",
13061
0
                 Py_TYPE(sep)->tp_name);
13062
0
    return NULL;
13063
50
}
13064
13065
/*[clinic input]
13066
@permit_long_docstring_body
13067
str.splitlines as unicode_splitlines
13068
13069
    keepends: bool = False
13070
13071
Return a list of the lines in the string, breaking at line boundaries.
13072
13073
Line breaks are not included in the resulting list unless keepends is given and
13074
true.
13075
[clinic start generated code]*/
13076
13077
static PyObject *
unicode_splitlines_impl(PyObject *self, int keepends)
/*[clinic end generated code: output=f664dcdad153ec40 input=39eeafbfef61c827]*/
{
    /* str.splitlines(keepends): delegated to the C API entry point; its
       result (or NULL) is returned unchanged. */
    PyObject *lines = PyUnicode_Splitlines(self, keepends);
    return lines;
}
13083
13084
static
13085
PyObject *unicode_str(PyObject *self)
13086
3.32M
{
13087
3.32M
    return unicode_result_unchanged(self);
13088
3.32M
}
13089
13090
/*[clinic input]
13091
@permit_long_summary
13092
str.swapcase as unicode_swapcase
13093
13094
Convert uppercase characters to lowercase and lowercase characters to uppercase.
13095
[clinic start generated code]*/
13096
13097
static PyObject *
unicode_swapcase_impl(PyObject *self)
/*[clinic end generated code: output=5d28966bf6d7b2af input=85bc39a9b4e8ee91]*/
{
    /* str.swapcase(): run the shared case_operation() driver with the
       do_swapcase callback. */
    PyObject *swapped = case_operation(self, do_swapcase);
    return swapped;
}
13103
13104
/*[clinic input]
13105
13106
@staticmethod
13107
str.maketrans as unicode_maketrans
13108
13109
  x: object
13110
13111
  y: unicode=NULL
13112
13113
  z: unicode=NULL
13114
13115
  /
13116
13117
Return a translation table usable for str.translate().
13118
13119
If there is only one argument, it must be a dictionary mapping Unicode
13120
ordinals (integers) or characters to Unicode ordinals, strings or None.
13121
Character keys will be then converted to ordinals.
13122
If there are two arguments, they must be strings of equal length, and
13123
in the resulting dictionary, each character in x will be mapped to the
13124
character at the same position in y. If there is a third argument, it
13125
must be a string, whose characters will be mapped to None in the result.
13126
[clinic start generated code]*/
13127
13128
static PyObject *
unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
{
    /* Build and return a new dict mapping integer ordinals to the values
       described by the arguments.  On any failure the partially filled
       dict is released and NULL is returned with an exception set. */
    PyObject *table = PyDict_New();
    if (table == NULL) {
        return NULL;
    }

    if (y == NULL) {
        /* One-argument form: x must be exactly a dict.  Its entries are
           copied, with length-1 string keys replaced by their ordinal. */
        PyObject *key, *value;
        Py_ssize_t pos = 0;

        if (!PyDict_CheckExact(x)) {
            PyErr_SetString(PyExc_TypeError, "if you give only one argument "
                            "to maketrans it must be a dict");
            goto error;
        }

        while (PyDict_Next(x, &pos, &key, &value)) {
            if (PyUnicode_Check(key)) {
                if (PyUnicode_GET_LENGTH(key) != 1) {
                    PyErr_SetString(PyExc_ValueError, "string keys in translate "
                                    "table must be of length 1");
                    goto error;
                }
                PyObject *ordinal = PyLong_FromLong(
                    PyUnicode_READ(PyUnicode_KIND(key), PyUnicode_DATA(key), 0));
                if (ordinal == NULL) {
                    goto error;
                }
                int rc = PyDict_SetItem(table, ordinal, value);
                Py_DECREF(ordinal);
                if (rc < 0) {
                    goto error;
                }
            }
            else if (PyLong_Check(key)) {
                /* Integer keys are stored as they are. */
                if (PyDict_SetItem(table, key, value) < 0) {
                    goto error;
                }
            }
            else {
                PyErr_SetString(PyExc_TypeError, "keys in translate table must "
                                "be strings or integers");
                goto error;
            }
        }
    }
    else {
        /* Two/three-argument form: x and y are equal-length strings that
           are paired up character by character. */
        if (!PyUnicode_Check(x)) {
            PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
                            "be a string if there is a second argument");
            goto error;
        }
        const Py_ssize_t x_len = PyUnicode_GET_LENGTH(x);
        if (x_len != PyUnicode_GET_LENGTH(y)) {
            PyErr_SetString(PyExc_ValueError, "the first two maketrans "
                            "arguments must have equal length");
            goto error;
        }

        const int x_kind = PyUnicode_KIND(x);
        const int y_kind = PyUnicode_KIND(y);
        const void *x_data = PyUnicode_DATA(x);
        const void *y_data = PyUnicode_DATA(y);

        for (Py_ssize_t idx = 0; idx < x_len; idx++) {
            PyObject *from = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, idx));
            if (from == NULL) {
                goto error;
            }
            PyObject *to = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, idx));
            if (to == NULL) {
                Py_DECREF(from);
                goto error;
            }
            int rc = PyDict_SetItem(table, from, to);
            Py_DECREF(from);
            Py_DECREF(to);
            if (rc < 0) {
                goto error;
            }
        }

        if (z != NULL) {
            /* Every character of z is mapped to None. */
            const int z_kind = PyUnicode_KIND(z);
            const void *z_data = PyUnicode_DATA(z);
            const Py_ssize_t z_len = PyUnicode_GET_LENGTH(z);

            for (Py_ssize_t idx = 0; idx < z_len; idx++) {
                PyObject *ordinal = PyLong_FromLong(
                    PyUnicode_READ(z_kind, z_data, idx));
                if (ordinal == NULL) {
                    goto error;
                }
                int rc = PyDict_SetItem(table, ordinal, Py_None);
                Py_DECREF(ordinal);
                if (rc < 0) {
                    goto error;
                }
            }
        }
    }
    return table;

  error:
    Py_DECREF(table);
    return NULL;
}
13233
13234
/*[clinic input]
13235
@permit_long_docstring_body
13236
str.translate as unicode_translate
13237
13238
    table: object
13239
        Translation table, which must be a mapping of Unicode ordinals to
13240
        Unicode ordinals, strings, or None.
13241
    /
13242
13243
Replace each character in the string using the given translation table.
13244
13245
The table must implement lookup/indexing via __getitem__, for instance a
13246
dictionary or list.  If this operation raises LookupError, the character is
13247
left untouched.  Characters mapped to None are deleted.
13248
[clinic start generated code]*/
13249
13250
static PyObject *
unicode_translate(PyObject *self, PyObject *table)
/*[clinic end generated code: output=3cb448ff2fd96bf3 input=699e5fa0ebf9f5e9]*/
{
    /* str.translate(table): delegated to _PyUnicode_TranslateCharmap() with
       "ignore" as its third argument (presumably the error-handler name --
       confirm against that function's signature). */
    PyObject *translated = _PyUnicode_TranslateCharmap(self, table, "ignore");
    return translated;
}
13256
13257
/*[clinic input]
13258
str.upper as unicode_upper
13259
13260
Return a copy of the string converted to uppercase.
13261
[clinic start generated code]*/
13262
13263
static PyObject *
unicode_upper_impl(PyObject *self)
/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
{
    /* Non-ASCII strings go through the generic case_operation() driver
       with the do_upper callback. */
    if (!PyUnicode_IS_ASCII(self)) {
        return case_operation(self, do_upper);
    }
    /* ASCII strings take the dedicated helper; the second argument 0
       presumably selects "upper" rather than "lower" -- confirm against
       ascii_upper_or_lower(). */
    return ascii_upper_or_lower(self, 0);
}
13271
13272
/*[clinic input]
13273
@permit_long_summary
13274
str.zfill as unicode_zfill
13275
13276
    width: Py_ssize_t
13277
    /
13278
13279
Pad a numeric string with zeros on the left, to fill a field of the given width.
13280
13281
The string is never truncated.
13282
[clinic start generated code]*/
13283
13284
static PyObject *
13285
unicode_zfill_impl(PyObject *self, Py_ssize_t width)
13286
/*[clinic end generated code: output=e13fb6bdf8e3b9df input=25a4ee0ea3e58ce0]*/
13287
0
{
13288
0
    Py_ssize_t fill;
13289
0
    PyObject *u;
13290
0
    int kind;
13291
0
    const void *data;
13292
0
    Py_UCS4 chr;
13293
13294
0
    if (PyUnicode_GET_LENGTH(self) >= width)
13295
0
        return unicode_result_unchanged(self);
13296
13297
0
    fill = width - PyUnicode_GET_LENGTH(self);
13298
13299
0
    u = pad(self, fill, 0, '0');
13300
13301
0
    if (u == NULL)
13302
0
        return NULL;
13303
13304
0
    kind = PyUnicode_KIND(u);
13305
0
    data = PyUnicode_DATA(u);
13306
0
    chr = PyUnicode_READ(kind, data, fill);
13307
13308
0
    if (chr == '+' || chr == '-') {
13309
        /* move sign to beginning of string */
13310
0
        PyUnicode_WRITE(kind, data, 0, chr);
13311
0
        PyUnicode_WRITE(kind, data, fill, '0');
13312
0
    }
13313
13314
0
    assert(_PyUnicode_CheckConsistency(u, 1));
13315
0
    return u;
13316
0
}
13317
13318
/*[clinic input]
13319
@permit_long_summary
13320
@text_signature "($self, prefix[, start[, end]], /)"
13321
str.startswith as unicode_startswith
13322
13323
    prefix as subobj: object
13324
        A string or a tuple of strings to try.
13325
    start: slice_index(accept={int, NoneType}, c_default='0') = None
13326
        Optional start position. Default: start of the string.
13327
    end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
13328
        Optional stop position. Default: end of the string.
13329
    /
13330
13331
Return True if the string starts with the specified prefix, False otherwise.
13332
[clinic start generated code]*/
13333
13334
static PyObject *
13335
unicode_startswith_impl(PyObject *self, PyObject *subobj, Py_ssize_t start,
13336
                        Py_ssize_t end)
13337
/*[clinic end generated code: output=4bd7cfd0803051d4 input=766bdbd33df251dc]*/
13338
74.2M
{
13339
74.2M
    if (PyTuple_Check(subobj)) {
13340
9.26M
        Py_ssize_t i;
13341
33.5M
        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13342
24.3M
            PyObject *substring = PyTuple_GET_ITEM(subobj, i);
13343
24.3M
            if (!PyUnicode_Check(substring)) {
13344
0
                PyErr_Format(PyExc_TypeError,
13345
0
                             "tuple for startswith must only contain str, "
13346
0
                             "not %.100s",
13347
0
                             Py_TYPE(substring)->tp_name);
13348
0
                return NULL;
13349
0
            }
13350
24.3M
            int result = tailmatch(self, substring, start, end, -1);
13351
24.3M
            if (result < 0) {
13352
0
                return NULL;
13353
0
            }
13354
24.3M
            if (result) {
13355
40.2k
                Py_RETURN_TRUE;
13356
40.2k
            }
13357
24.3M
        }
13358
        /* nothing matched */
13359
9.26M
        Py_RETURN_FALSE;
13360
9.26M
    }
13361
64.9M
    if (!PyUnicode_Check(subobj)) {
13362
0
        PyErr_Format(PyExc_TypeError,
13363
0
                     "startswith first arg must be str or "
13364
0
                     "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13365
0
        return NULL;
13366
0
    }
13367
64.9M
    int result = tailmatch(self, subobj, start, end, -1);
13368
64.9M
    if (result < 0) {
13369
0
        return NULL;
13370
0
    }
13371
64.9M
    return PyBool_FromLong(result);
13372
64.9M
}
13373
13374
13375
/*[clinic input]
13376
@permit_long_summary
13377
@text_signature "($self, suffix[, start[, end]], /)"
13378
str.endswith as unicode_endswith
13379
13380
    suffix as subobj: object
13381
        A string or a tuple of strings to try.
13382
    start: slice_index(accept={int, NoneType}, c_default='0') = None
13383
        Optional start position. Default: start of the string.
13384
    end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
13385
        Optional stop position. Default: end of the string.
13386
    /
13387
13388
Return True if the string ends with the specified suffix, False otherwise.
13389
[clinic start generated code]*/
13390
13391
static PyObject *
13392
unicode_endswith_impl(PyObject *self, PyObject *subobj, Py_ssize_t start,
13393
                      Py_ssize_t end)
13394
/*[clinic end generated code: output=cce6f8ceb0102ca9 input=b66bf6d5547ba1aa]*/
13395
12.7M
{
13396
12.7M
    if (PyTuple_Check(subobj)) {
13397
208k
        Py_ssize_t i;
13398
407k
        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13399
370k
            PyObject *substring = PyTuple_GET_ITEM(subobj, i);
13400
370k
            if (!PyUnicode_Check(substring)) {
13401
0
                PyErr_Format(PyExc_TypeError,
13402
0
                             "tuple for endswith must only contain str, "
13403
0
                             "not %.100s",
13404
0
                             Py_TYPE(substring)->tp_name);
13405
0
                return NULL;
13406
0
            }
13407
370k
            int result = tailmatch(self, substring, start, end, +1);
13408
370k
            if (result < 0) {
13409
0
                return NULL;
13410
0
            }
13411
370k
            if (result) {
13412
170k
                Py_RETURN_TRUE;
13413
170k
            }
13414
370k
        }
13415
208k
        Py_RETURN_FALSE;
13416
208k
    }
13417
12.5M
    if (!PyUnicode_Check(subobj)) {
13418
0
        PyErr_Format(PyExc_TypeError,
13419
0
                     "endswith first arg must be str or "
13420
0
                     "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13421
0
        return NULL;
13422
0
    }
13423
12.5M
    int result = tailmatch(self, subobj, start, end, +1);
13424
12.5M
    if (result < 0) {
13425
0
        return NULL;
13426
0
    }
13427
12.5M
    return PyBool_FromLong(result);
13428
12.5M
}
13429
13430
13431
#include "stringlib/unicode_format.h"
13432
13433
PyDoc_STRVAR(format__doc__,
13434
             "format($self, /, *args, **kwargs)\n\
13435
--\n\
13436
\n\
13437
Return a formatted version of the string, using substitutions from args and kwargs.\n\
13438
The substitutions are identified by braces ('{' and '}').");
13439
13440
PyDoc_STRVAR(format_map__doc__,
13441
             "format_map($self, mapping, /)\n\
13442
--\n\
13443
\n\
13444
Return a formatted version of the string, using substitutions from mapping.\n\
13445
The substitutions are identified by braces ('{' and '}').");
13446
13447
/*[clinic input]
13448
str.__format__ as unicode___format__
13449
13450
    format_spec: unicode
13451
    /
13452
13453
Return a formatted version of the string as described by format_spec.
13454
[clinic start generated code]*/
13455
13456
static PyObject *
unicode___format___impl(PyObject *self, PyObject *format_spec)
/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
{
    _PyUnicodeWriter writer;
    _PyUnicodeWriter_Init(&writer);

    /* Format `self` into the writer using the whole of format_spec,
       i.e. the range [0, len(format_spec)). */
    const Py_ssize_t spec_end = PyUnicode_GET_LENGTH(format_spec);
    int status = _PyUnicode_FormatAdvancedWriter(&writer,
                                                 self, format_spec,
                                                 0, spec_end);
    if (status == -1) {
        /* Formatting failed: release whatever the writer accumulated. */
        _PyUnicodeWriter_Dealloc(&writer);
        return NULL;
    }

    return _PyUnicodeWriter_Finish(&writer);
}
13473
13474
/*[clinic input]
13475
str.__sizeof__ as unicode_sizeof
13476
13477
Return the size of the string in memory, in bytes.
13478
[clinic start generated code]*/
13479
13480
static PyObject *
unicode_sizeof_impl(PyObject *self)
/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
{
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    Py_ssize_t nbytes;

    if (PyUnicode_IS_COMPACT_ASCII(self)) {
        /* Compact ASCII: header plus one byte per character, plus one
           extra byte. */
        nbytes = sizeof(PyASCIIObject) + length + 1;
    }
    else if (PyUnicode_IS_COMPACT(self)) {
        /* Compact non-ASCII: header plus (length + 1) code units of
           PyUnicode_KIND(self) bytes each. */
        nbytes = sizeof(PyCompactUnicodeObject) +
            (length + 1) * PyUnicode_KIND(self);
    }
    else {
        /* Two-block object: the base struct, plus the separate character
           block when one is present. */
        nbytes = sizeof(PyUnicodeObject);
        if (_PyUnicode_DATA_ANY(self)) {
            nbytes += (length + 1) * PyUnicode_KIND(self);
        }
    }

    /* Add the separately held UTF-8 buffer, when the object has one. */
    if (_PyUnicode_HAS_UTF8_MEMORY(self)) {
        nbytes += PyUnicode_UTF8_LENGTH(self) + 1;
    }

    return PyLong_FromSsize_t(nbytes);
}
13508
13509
/* __getnewargs__: return a 1-tuple holding a copy of `v` made by
   _PyUnicode_Copy(). */
static PyObject *
unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
{
    PyObject *duplicate = _PyUnicode_Copy(v);
    if (duplicate == NULL) {
        return NULL;
    }
    /* "N" hands our reference to `duplicate` over to the new tuple. */
    return Py_BuildValue("(N)", duplicate);
}
13517
13518
/*
13519
This function searchs the longest common leading whitespace
13520
of all lines in the [src, end).
13521
It returns the length of the common leading whitespace and sets `output` to
13522
point to the beginning of the common leading whitespace if length > 0.
13523
*/
13524
static Py_ssize_t
13525
search_longest_common_leading_whitespace(
13526
    const char *const src,
13527
    const char *const end,
13528
    const char **output)
13529
0
{
13530
    // [_start, _start + _len)
13531
    // describes the current longest common leading whitespace
13532
0
    const char *_start = NULL;
13533
0
    Py_ssize_t _len = 0;
13534
13535
0
    for (const char *iter = src; iter < end; ++iter) {
13536
0
        const char *line_start = iter;
13537
0
        const char *leading_whitespace_end = NULL;
13538
13539
        // scan the whole line
13540
0
        while (iter < end && *iter != '\n') {
13541
0
            if (!leading_whitespace_end && *iter != ' ' && *iter != '\t') {
13542
                /* `iter` points to the first non-whitespace character
13543
                   in this line */
13544
0
                if (iter == line_start) {
13545
                    // some line has no indent, fast exit!
13546
0
                    return 0;
13547
0
                }
13548
0
                leading_whitespace_end = iter;
13549
0
            }
13550
0
            ++iter;
13551
0
        }
13552
13553
        // if this line has all white space, skip it
13554
0
        if (!leading_whitespace_end) {
13555
0
            continue;
13556
0
        }
13557
13558
0
        if (!_start) {
13559
            // update the first leading whitespace
13560
0
            _start = line_start;
13561
0
            _len = leading_whitespace_end - line_start;
13562
0
            assert(_len > 0);
13563
0
        }
13564
0
        else {
13565
            /* We then compare with the current longest leading whitespace.
13566
13567
               [line_start, leading_whitespace_end) is the leading
13568
               whitespace of this line,
13569
13570
               [_start, _start + _len) is the leading whitespace of the
13571
               current longest leading whitespace. */
13572
0
            Py_ssize_t new_len = 0;
13573
0
            const char *_iter = _start, *line_iter = line_start;
13574
13575
0
            while (_iter < _start + _len && line_iter < leading_whitespace_end
13576
0
                   && *_iter == *line_iter)
13577
0
            {
13578
0
                ++_iter;
13579
0
                ++line_iter;
13580
0
                ++new_len;
13581
0
            }
13582
13583
0
            _len = new_len;
13584
0
            if (_len == 0) {
13585
                // No common things now, fast exit!
13586
0
                return 0;
13587
0
            }
13588
0
        }
13589
0
    }
13590
13591
0
    assert(_len >= 0);
13592
0
    if (_len > 0) {
13593
0
        *output = _start;
13594
0
    }
13595
0
    return _len;
13596
0
}
13597
13598
/* Dedent a string.
13599
   Behaviour is expected to be an exact match of `textwrap.dedent`.
13600
   Return a new reference on success, NULL with exception set on error.
13601
   */
13602
PyObject *
13603
_PyUnicode_Dedent(PyObject *unicode)
13604
0
{
13605
0
    Py_ssize_t src_len = 0;
13606
0
    const char *src = PyUnicode_AsUTF8AndSize(unicode, &src_len);
13607
0
    if (!src) {
13608
0
        return NULL;
13609
0
    }
13610
0
    assert(src_len >= 0);
13611
0
    if (src_len == 0) {
13612
0
        return Py_NewRef(unicode);
13613
0
    }
13614
13615
0
    const char *const end = src + src_len;
13616
13617
    // [whitespace_start, whitespace_start + whitespace_len)
13618
    // describes the current longest common leading whitespace
13619
0
    const char *whitespace_start = NULL;
13620
0
    Py_ssize_t whitespace_len = search_longest_common_leading_whitespace(
13621
0
        src, end, &whitespace_start);
13622
13623
0
    if (whitespace_len == 0) {
13624
0
        return Py_NewRef(unicode);
13625
0
    }
13626
13627
    // now we should trigger a dedent
13628
0
    char *dest = PyMem_Malloc(src_len);
13629
0
    if (!dest) {
13630
0
        PyErr_NoMemory();
13631
0
        return NULL;
13632
0
    }
13633
0
    char *dest_iter = dest;
13634
13635
0
    for (const char *iter = src; iter < end; ++iter) {
13636
0
        const char *line_start = iter;
13637
0
        bool in_leading_space = true;
13638
13639
        // iterate over a line to find the end of a line
13640
0
        while (iter < end && *iter != '\n') {
13641
0
            if (in_leading_space && *iter != ' ' && *iter != '\t') {
13642
0
                in_leading_space = false;
13643
0
            }
13644
0
            ++iter;
13645
0
        }
13646
13647
        // invariant: *iter == '\n' or iter == end
13648
0
        bool append_newline = iter < end;
13649
13650
        // if this line has all white space, write '\n' and continue
13651
0
        if (in_leading_space && append_newline) {
13652
0
            *dest_iter++ = '\n';
13653
0
            continue;
13654
0
        }
13655
13656
        /* copy [new_line_start + whitespace_len, iter) to buffer, then
13657
            conditionally append '\n' */
13658
13659
0
        Py_ssize_t new_line_len = iter - line_start - whitespace_len;
13660
0
        assert(new_line_len >= 0);
13661
0
        memcpy(dest_iter, line_start + whitespace_len, new_line_len);
13662
13663
0
        dest_iter += new_line_len;
13664
13665
0
        if (append_newline) {
13666
0
            *dest_iter++ = '\n';
13667
0
        }
13668
0
    }
13669
13670
0
    PyObject *res = PyUnicode_FromStringAndSize(dest, dest_iter - dest);
13671
0
    PyMem_Free(dest);
13672
0
    return res;
13673
0
}
13674
13675
static PyMethodDef unicode_methods[] = {
13676
    UNICODE_ENCODE_METHODDEF
13677
    UNICODE_REPLACE_METHODDEF
13678
    UNICODE_SPLIT_METHODDEF
13679
    UNICODE_RSPLIT_METHODDEF
13680
    UNICODE_JOIN_METHODDEF
13681
    UNICODE_CAPITALIZE_METHODDEF
13682
    UNICODE_CASEFOLD_METHODDEF
13683
    UNICODE_TITLE_METHODDEF
13684
    UNICODE_CENTER_METHODDEF
13685
    UNICODE_COUNT_METHODDEF
13686
    UNICODE_EXPANDTABS_METHODDEF
13687
    UNICODE_FIND_METHODDEF
13688
    UNICODE_PARTITION_METHODDEF
13689
    UNICODE_INDEX_METHODDEF
13690
    UNICODE_LJUST_METHODDEF
13691
    UNICODE_LOWER_METHODDEF
13692
    UNICODE_LSTRIP_METHODDEF
13693
    UNICODE_RFIND_METHODDEF
13694
    UNICODE_RINDEX_METHODDEF
13695
    UNICODE_RJUST_METHODDEF
13696
    UNICODE_RSTRIP_METHODDEF
13697
    UNICODE_RPARTITION_METHODDEF
13698
    UNICODE_SPLITLINES_METHODDEF
13699
    UNICODE_STRIP_METHODDEF
13700
    UNICODE_SWAPCASE_METHODDEF
13701
    UNICODE_TRANSLATE_METHODDEF
13702
    UNICODE_UPPER_METHODDEF
13703
    UNICODE_STARTSWITH_METHODDEF
13704
    UNICODE_ENDSWITH_METHODDEF
13705
    UNICODE_REMOVEPREFIX_METHODDEF
13706
    UNICODE_REMOVESUFFIX_METHODDEF
13707
    UNICODE_ISASCII_METHODDEF
13708
    UNICODE_ISLOWER_METHODDEF
13709
    UNICODE_ISUPPER_METHODDEF
13710
    UNICODE_ISTITLE_METHODDEF
13711
    UNICODE_ISSPACE_METHODDEF
13712
    UNICODE_ISDECIMAL_METHODDEF
13713
    UNICODE_ISDIGIT_METHODDEF
13714
    UNICODE_ISNUMERIC_METHODDEF
13715
    UNICODE_ISALPHA_METHODDEF
13716
    UNICODE_ISALNUM_METHODDEF
13717
    UNICODE_ISIDENTIFIER_METHODDEF
13718
    UNICODE_ISPRINTABLE_METHODDEF
13719
    UNICODE_ZFILL_METHODDEF
13720
    {"format", _PyCFunction_CAST(do_string_format), METH_VARARGS | METH_KEYWORDS, format__doc__},
13721
    {"format_map", do_string_format_map, METH_O, format_map__doc__},
13722
    UNICODE___FORMAT___METHODDEF
13723
    UNICODE_MAKETRANS_METHODDEF
13724
    UNICODE_SIZEOF_METHODDEF
13725
    {"__getnewargs__",  unicode_getnewargs, METH_NOARGS},
13726
    {NULL, NULL}
13727
};
13728
13729
/* nb_remainder implementation (`v % w`): format `v` with `w` when `v` is
   a str, otherwise answer NotImplemented. */
static PyObject *
unicode_mod(PyObject *v, PyObject *w)
{
    if (PyUnicode_Check(v)) {
        return PyUnicode_Format(v, w);
    }
    Py_RETURN_NOTIMPLEMENTED;
}
13736
13737
static PyNumberMethods unicode_as_number = {
13738
    0,              /*nb_add*/
13739
    0,              /*nb_subtract*/
13740
    0,              /*nb_multiply*/
13741
    unicode_mod,            /*nb_remainder*/
13742
};
13743
13744
static PySequenceMethods unicode_as_sequence = {
13745
    unicode_length,     /* sq_length */
13746
    PyUnicode_Concat,   /* sq_concat */
13747
    unicode_repeat,     /* sq_repeat */
13748
    unicode_getitem,    /* sq_item */
13749
    0,                  /* sq_slice */
13750
    0,                  /* sq_ass_item */
13751
    0,                  /* sq_ass_slice */
13752
    PyUnicode_Contains, /* sq_contains */
13753
};
13754
13755
/* mp_subscript implementation: `self[item]` where item is an index-like
   object or a slice.  Any other item type raises TypeError. */
static PyObject*
unicode_subscript(PyObject* self, PyObject* item)
{
    if (_PyIndex_Check(item)) {
        Py_ssize_t index = PyNumber_AsSsize_t(item, PyExc_IndexError);
        if (index == -1 && PyErr_Occurred()) {
            return NULL;
        }
        /* Negative indices count from the end. */
        if (index < 0) {
            index += PyUnicode_GET_LENGTH(self);
        }
        return unicode_getitem(self, index);
    }

    if (!PySlice_Check(item)) {
        PyErr_Format(PyExc_TypeError, "string indices must be integers, not '%.200s'",
                     Py_TYPE(item)->tp_name);
        return NULL;
    }

    Py_ssize_t start, stop, step;
    if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
        return NULL;
    }
    const Py_ssize_t slicelength = PySlice_AdjustIndices(
        PyUnicode_GET_LENGTH(self), &start, &stop, step);

    if (slicelength <= 0) {
        _Py_RETURN_UNICODE_EMPTY();
    }
    if (step == 1) {
        /* Contiguous slice: either the whole string or a plain substring. */
        if (start == 0 && slicelength == PyUnicode_GET_LENGTH(self)) {
            return unicode_result_unchanged(self);
        }
        return PyUnicode_Substring(self, start, start + slicelength);
    }

    /* General case: a stepped slice, copied character by character. */
    const int src_kind = PyUnicode_KIND(self);
    const void *src_data = PyUnicode_DATA(self);

    /* First pass: find the largest selected character so the result can
       be allocated for it. */
    Py_UCS4 max_char;
    if (PyUnicode_IS_ASCII(self)) {
        max_char = 127;
    }
    else {
        const Py_UCS4 kind_limit = kind_maxchar_limit(src_kind);
        size_t pos = start;
        max_char = 0;
        for (Py_ssize_t n = 0; n < slicelength; n++, pos += step) {
            Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, pos);
            if (ch > max_char) {
                max_char = ch;
                /* Reached kind_limit: stop scanning early. */
                if (max_char >= kind_limit) {
                    break;
                }
            }
        }
    }

    PyObject *result = PyUnicode_New(slicelength, max_char);
    if (result == NULL) {
        return NULL;
    }
    const int dest_kind = PyUnicode_KIND(result);
    void *dest_data = PyUnicode_DATA(result);

    /* Second pass: copy the selected characters.  `pos` is unsigned, so a
       negative step moves it backwards through modular arithmetic. */
    size_t pos = start;
    for (Py_ssize_t n = 0; n < slicelength; n++, pos += step) {
        Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, pos);
        PyUnicode_WRITE(dest_kind, dest_data, n, ch);
    }
    assert(_PyUnicode_CheckConsistency(result, 1));
    return result;
}
13824
13825
static PyMappingMethods unicode_as_mapping = {
13826
    unicode_length,     /* mp_length */
13827
    unicode_subscript,  /* mp_subscript */
13828
    0,                  /* mp_ass_subscript */
13829
};
13830
13831
13832
static PyObject *
13833
unicode_subtype_new(PyTypeObject *type, PyObject *unicode);
13834
13835
/*[clinic input]
13836
@classmethod
13837
str.__new__ as unicode_new
13838
13839
    object as x: object = NULL
13840
    encoding: str = NULL
13841
    errors: str = NULL
13842
13843
[clinic start generated code]*/
13844
13845
static PyObject *
13846
unicode_new_impl(PyTypeObject *type, PyObject *x, const char *encoding,
13847
                 const char *errors)
13848
/*[clinic end generated code: output=fc72d4878b0b57e9 input=e81255e5676d174e]*/
13849
16.6M
{
13850
16.6M
    PyObject *unicode;
13851
16.6M
    if (x == NULL) {
13852
0
        unicode = _PyUnicode_GetEmpty();
13853
0
    }
13854
16.6M
    else if (encoding == NULL && errors == NULL) {
13855
16.6M
        unicode = PyObject_Str(x);
13856
16.6M
    }
13857
0
    else {
13858
0
        unicode = PyUnicode_FromEncodedObject(x, encoding, errors);
13859
0
    }
13860
13861
16.6M
    if (unicode != NULL && type != &PyUnicode_Type) {
13862
16.6M
        Py_SETREF(unicode, unicode_subtype_new(type, unicode));
13863
16.6M
    }
13864
16.6M
    return unicode;
13865
16.6M
}
13866
13867
static const char *
13868
arg_as_utf8(PyObject *obj, const char *name)
13869
897k
{
13870
897k
    if (!PyUnicode_Check(obj)) {
13871
0
        PyErr_Format(PyExc_TypeError,
13872
0
                     "str() argument '%s' must be str, not %T",
13873
0
                     name, obj);
13874
0
        return NULL;
13875
0
    }
13876
897k
    return _PyUnicode_AsUTF8NoNUL(obj);
13877
897k
}
13878
13879
/* Vectorcall entry point for str(...).  Calls with keyword arguments are
   repackaged and forwarded to unicode_new(); positional-only calls are
   handled directly here. */
static PyObject *
unicode_vectorcall(PyObject *type, PyObject *const *args,
                   size_t nargsf, PyObject *kwnames)
{
    assert(Py_Is(_PyType_CAST(type), &PyUnicode_Type));

    Py_ssize_t nargs = PyVectorcall_NARGS(nargsf);

    if (kwnames != NULL && PyTuple_GET_SIZE(kwnames) != 0) {
        // Fallback to unicode_new(): build (args tuple, kwargs dict).
        PyObject *positional = PyTuple_FromArray(args, nargs);
        if (positional == NULL) {
            return NULL;
        }
        PyObject *keywords = _PyStack_AsDict(args + nargs, kwnames);
        if (keywords == NULL) {
            Py_DECREF(positional);
            return NULL;
        }
        PyObject *result = unicode_new(_PyType_CAST(type), positional, keywords);
        Py_DECREF(positional);
        Py_DECREF(keywords);
        return result;
    }

    if (!_PyArg_CheckPositional("str", nargs, 0, 3)) {
        return NULL;
    }

    switch (nargs) {
    case 0:
        /* str() */
        return _PyUnicode_GetEmpty();
    case 1:
        /* str(object) */
        return PyObject_Str(args[0]);
    default:
        /* str(object, encoding[, errors]) handled below */
        break;
    }

    const char *encoding = arg_as_utf8(args[1], "encoding");
    if (encoding == NULL) {
        return NULL;
    }

    const char *errors = NULL;
    if (nargs == 3) {
        errors = arg_as_utf8(args[2], "errors");
        if (errors == NULL) {
            return NULL;
        }
    }

    return PyUnicode_FromEncodedObject(args[0], encoding, errors);
}
13925
13926
static PyObject *
13927
unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
13928
16.6M
{
13929
16.6M
    PyObject *self;
13930
16.6M
    Py_ssize_t length, char_size;
13931
16.6M
    int share_utf8;
13932
16.6M
    int kind;
13933
16.6M
    void *data;
13934
13935
16.6M
    assert(PyType_IsSubtype(type, &PyUnicode_Type));
13936
16.6M
    assert(_PyUnicode_CHECK(unicode));
13937
13938
16.6M
    self = type->tp_alloc(type, 0);
13939
16.6M
    if (self == NULL) {
13940
0
        return NULL;
13941
0
    }
13942
16.6M
    kind = PyUnicode_KIND(unicode);
13943
16.6M
    length = PyUnicode_GET_LENGTH(unicode);
13944
13945
16.6M
    _PyUnicode_LENGTH(self) = length;
13946
#ifdef Py_DEBUG
13947
    _PyUnicode_HASH(self) = -1;
13948
#else
13949
16.6M
    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13950
16.6M
#endif
13951
16.6M
    _PyUnicode_STATE(self).interned = 0;
13952
16.6M
    _PyUnicode_STATE(self).kind = kind;
13953
16.6M
    _PyUnicode_STATE(self).compact = 0;
13954
16.6M
    _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
13955
16.6M
    _PyUnicode_STATE(self).statically_allocated = 0;
13956
16.6M
    PyUnicode_SET_UTF8_LENGTH(self, 0);
13957
16.6M
    PyUnicode_SET_UTF8(self, NULL);
13958
16.6M
    _PyUnicode_DATA_ANY(self) = NULL;
13959
13960
16.6M
    share_utf8 = 0;
13961
16.6M
    if (kind == PyUnicode_1BYTE_KIND) {
13962
14.7M
        char_size = 1;
13963
14.7M
        if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13964
14.6M
            share_utf8 = 1;
13965
14.7M
    }
13966
1.90M
    else if (kind == PyUnicode_2BYTE_KIND) {
13967
1.83M
        char_size = 2;
13968
1.83M
    }
13969
65.1k
    else {
13970
65.1k
        assert(kind == PyUnicode_4BYTE_KIND);
13971
65.1k
        char_size = 4;
13972
65.1k
    }
13973
13974
    /* Ensure we won't overflow the length. */
13975
16.6M
    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13976
0
        PyErr_NoMemory();
13977
0
        goto onError;
13978
0
    }
13979
16.6M
    data = PyMem_Malloc((length + 1) * char_size);
13980
16.6M
    if (data == NULL) {
13981
0
        PyErr_NoMemory();
13982
0
        goto onError;
13983
0
    }
13984
13985
16.6M
    _PyUnicode_DATA_ANY(self) = data;
13986
16.6M
    if (share_utf8) {
13987
14.6M
        PyUnicode_SET_UTF8_LENGTH(self, length);
13988
14.6M
        PyUnicode_SET_UTF8(self, data);
13989
14.6M
    }
13990
13991
16.6M
    memcpy(data, PyUnicode_DATA(unicode), kind * (length + 1));
13992
16.6M
    assert(_PyUnicode_CheckConsistency(self, 1));
13993
#ifdef Py_DEBUG
13994
    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13995
#endif
13996
16.6M
    return self;
13997
13998
0
onError:
13999
0
    Py_DECREF(self);
14000
0
    return NULL;
14001
16.6M
}
14002
14003
void
14004
_PyUnicode_ExactDealloc(PyObject *op)
14005
94.4M
{
14006
94.4M
    assert(PyUnicode_CheckExact(op));
14007
94.4M
    unicode_dealloc(op);
14008
94.4M
}
14009
14010
PyDoc_STRVAR(unicode_doc,
14011
"str(object='') -> str\n\
14012
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
14013
\n\
14014
Create a new string object from the given object. If encoding or\n\
14015
errors is specified, then the object must expose a data buffer\n\
14016
that will be decoded using the given encoding and error handler.\n\
14017
Otherwise, returns the result of object.__str__() (if defined)\n\
14018
or repr(object).\n\
14019
encoding defaults to 'utf-8'.\n\
14020
errors defaults to 'strict'.");
14021
14022
static PyObject *unicode_iter(PyObject *seq);
14023
14024
PyTypeObject PyUnicode_Type = {
14025
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
14026
    "str",                        /* tp_name */
14027
    sizeof(PyUnicodeObject),      /* tp_basicsize */
14028
    0,                            /* tp_itemsize */
14029
    /* Slots */
14030
    unicode_dealloc,              /* tp_dealloc */
14031
    0,                            /* tp_vectorcall_offset */
14032
    0,                            /* tp_getattr */
14033
    0,                            /* tp_setattr */
14034
    0,                            /* tp_as_async */
14035
    unicode_repr,                 /* tp_repr */
14036
    &unicode_as_number,           /* tp_as_number */
14037
    &unicode_as_sequence,         /* tp_as_sequence */
14038
    &unicode_as_mapping,          /* tp_as_mapping */
14039
    unicode_hash,                 /* tp_hash*/
14040
    0,                            /* tp_call*/
14041
    unicode_str,                  /* tp_str */
14042
    PyObject_GenericGetAttr,      /* tp_getattro */
14043
    0,                            /* tp_setattro */
14044
    0,                            /* tp_as_buffer */
14045
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
14046
        Py_TPFLAGS_UNICODE_SUBCLASS |
14047
        _Py_TPFLAGS_MATCH_SELF, /* tp_flags */
14048
    unicode_doc,                  /* tp_doc */
14049
    0,                            /* tp_traverse */
14050
    0,                            /* tp_clear */
14051
    PyUnicode_RichCompare,        /* tp_richcompare */
14052
    0,                            /* tp_weaklistoffset */
14053
    unicode_iter,                 /* tp_iter */
14054
    0,                            /* tp_iternext */
14055
    unicode_methods,              /* tp_methods */
14056
    0,                            /* tp_members */
14057
    0,                            /* tp_getset */
14058
    0,                            /* tp_base */
14059
    0,                            /* tp_dict */
14060
    0,                            /* tp_descr_get */
14061
    0,                            /* tp_descr_set */
14062
    0,                            /* tp_dictoffset */
14063
    0,                            /* tp_init */
14064
    0,                            /* tp_alloc */
14065
    unicode_new,                  /* tp_new */
14066
    PyObject_Free,                /* tp_free */
14067
    .tp_vectorcall = unicode_vectorcall,
14068
};
14069
14070
/* Initialize the Unicode implementation */
14071
14072
static void
14073
_init_global_state(void)
14074
28
{
14075
28
    static int initialized = 0;
14076
28
    if (initialized) {
14077
0
        return;
14078
0
    }
14079
28
    initialized = 1;
14080
14081
    /* initialize the linebreak bloom filter */
14082
28
    const Py_UCS2 linebreak[] = {
14083
28
        0x000A, /* LINE FEED */
14084
28
        0x000D, /* CARRIAGE RETURN */
14085
28
        0x001C, /* FILE SEPARATOR */
14086
28
        0x001D, /* GROUP SEPARATOR */
14087
28
        0x001E, /* RECORD SEPARATOR */
14088
28
        0x0085, /* NEXT LINE */
14089
28
        0x2028, /* LINE SEPARATOR */
14090
28
        0x2029, /* PARAGRAPH SEPARATOR */
14091
28
    };
14092
28
    bloom_linebreak = make_bloom_mask(
14093
28
        PyUnicode_2BYTE_KIND, linebreak,
14094
28
        Py_ARRAY_LENGTH(linebreak));
14095
28
}
14096
14097
void
14098
_PyUnicode_InitState(PyInterpreterState *interp)
14099
28
{
14100
28
    if (!_Py_IsMainInterpreter(interp)) {
14101
0
        return;
14102
0
    }
14103
28
    _init_global_state();
14104
28
}
14105
14106
14107
PyStatus
14108
_PyUnicode_InitGlobalObjects(PyInterpreterState *interp)
14109
28
{
14110
28
    if (_Py_IsMainInterpreter(interp)) {
14111
28
        PyStatus status = init_global_interned_strings(interp);
14112
28
        if (_PyStatus_EXCEPTION(status)) {
14113
0
            return status;
14114
0
        }
14115
28
    }
14116
28
    assert(INTERNED_STRINGS);
14117
14118
28
    if (init_interned_dict(interp)) {
14119
0
        PyErr_Clear();
14120
0
        return _PyStatus_ERR("failed to create interned dict");
14121
0
    }
14122
14123
28
    return _PyStatus_OK();
14124
28
}
14125
14126
14127
PyStatus
14128
_PyUnicode_InitTypes(PyInterpreterState *interp)
14129
28
{
14130
28
    if (_PyStaticType_InitBuiltin(interp, &EncodingMapType) < 0) {
14131
0
        goto error;
14132
0
    }
14133
28
    if (_PyStaticType_InitBuiltin(interp, &PyFieldNameIter_Type) < 0) {
14134
0
        goto error;
14135
0
    }
14136
28
    if (_PyStaticType_InitBuiltin(interp, &PyFormatterIter_Type) < 0) {
14137
0
        goto error;
14138
0
    }
14139
28
    return _PyStatus_OK();
14140
14141
0
error:
14142
0
    return _PyStatus_ERR("Can't initialize unicode types");
14143
28
}
14144
14145
static /* non-null */ PyObject*
14146
intern_static(PyInterpreterState *interp, PyObject *s /* stolen */)
14147
30.2k
{
14148
    // Note that this steals a reference to `s`, but in many cases that
14149
    // stolen ref is returned, requiring no decref/incref.
14150
14151
30.2k
    assert(s != NULL);
14152
30.2k
    assert(_PyUnicode_CHECK(s));
14153
30.2k
    assert(_PyUnicode_STATE(s).statically_allocated);
14154
30.2k
    assert(!PyUnicode_CHECK_INTERNED(s));
14155
14156
#ifdef Py_DEBUG
14157
    /* We must not add process-global interned string if there's already a
14158
     * per-interpreter interned_dict, which might contain duplicates.
14159
     */
14160
    PyObject *interned = get_interned_dict(interp);
14161
    assert(interned == NULL);
14162
#endif
14163
14164
    /* Look in the global cache first. */
14165
30.2k
    PyObject *r = (PyObject *)_Py_hashtable_get(INTERNED_STRINGS, s);
14166
    /* We should only init each string once */
14167
30.2k
    assert(r == NULL);
14168
    /* but just in case (for the non-debug build), handle this */
14169
30.2k
    if (r != NULL && r != s) {
14170
0
        assert(_PyUnicode_STATE(r).interned == SSTATE_INTERNED_IMMORTAL_STATIC);
14171
0
        assert(_PyUnicode_CHECK(r));
14172
0
        Py_DECREF(s);
14173
0
        return Py_NewRef(r);
14174
0
    }
14175
14176
30.2k
    if (_Py_hashtable_set(INTERNED_STRINGS, s, s) < -1) {
14177
0
        Py_FatalError("failed to intern static string");
14178
0
    }
14179
14180
30.2k
    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_IMMORTAL_STATIC;
14181
30.2k
    return s;
14182
30.2k
}
14183
14184
void
14185
_PyUnicode_InternStatic(PyInterpreterState *interp, PyObject **p)
14186
30.2k
{
14187
    // This should only be called as part of runtime initialization
14188
30.2k
    assert(!Py_IsInitialized());
14189
14190
30.2k
    *p = intern_static(interp, *p);
14191
30.2k
    assert(*p);
14192
30.2k
}
14193
14194
static void
14195
immortalize_interned(PyObject *s)
14196
165k
{
14197
165k
    assert(PyUnicode_CHECK_INTERNED(s) == SSTATE_INTERNED_MORTAL);
14198
165k
    assert(!_Py_IsImmortal(s));
14199
#ifdef Py_REF_DEBUG
14200
    /* The reference count value should be excluded from the RefTotal.
14201
       The decrements to these objects will not be registered so they
14202
       need to be accounted for in here. */
14203
    for (Py_ssize_t i = 0; i < Py_REFCNT(s); i++) {
14204
        _Py_DecRefTotal(_PyThreadState_GET());
14205
    }
14206
#endif
14207
165k
    FT_ATOMIC_STORE_UINT8_RELAXED(_PyUnicode_STATE(s).interned, SSTATE_INTERNED_IMMORTAL);
14208
165k
    _Py_SetImmortal(s);
14209
165k
}
14210
14211
static /* non-null */ PyObject*
14212
intern_common(PyInterpreterState *interp, PyObject *s /* stolen */,
14213
              bool immortalize)
14214
79.7M
{
14215
    // Note that this steals a reference to `s`, but in many cases that
14216
    // stolen ref is returned, requiring no decref/incref.
14217
14218
#ifdef Py_DEBUG
14219
    assert(s != NULL);
14220
    assert(_PyUnicode_CHECK(s));
14221
#else
14222
79.7M
    if (s == NULL || !PyUnicode_Check(s)) {
14223
0
        return s;
14224
0
    }
14225
79.7M
#endif
14226
14227
    /* If it's a subclass, we don't really know what putting
14228
       it in the interned dict might do. */
14229
79.7M
    if (!PyUnicode_CheckExact(s)) {
14230
0
        return s;
14231
0
    }
14232
14233
    /* Is it already interned? */
14234
79.7M
    switch (PyUnicode_CHECK_INTERNED(s)) {
14235
4.67M
        case SSTATE_NOT_INTERNED:
14236
            // no, go on
14237
4.67M
            break;
14238
26.2k
        case SSTATE_INTERNED_MORTAL:
14239
            // yes but we might need to make it immortal
14240
26.2k
            if (immortalize) {
14241
5.42k
                immortalize_interned(s);
14242
5.42k
            }
14243
26.2k
            return s;
14244
75.1M
        default:
14245
            // all done
14246
75.1M
            return s;
14247
79.7M
    }
14248
14249
    /* Statically allocated strings must be already interned. */
14250
79.7M
    assert(!_PyUnicode_STATE(s).statically_allocated);
14251
14252
#if Py_GIL_DISABLED
14253
    /* In the free-threaded build, all interned strings are immortal */
14254
    immortalize = 1;
14255
#endif
14256
14257
    /* If it's already immortal, intern it as such */
14258
4.67M
    if (_Py_IsImmortal(s)) {
14259
0
        immortalize = 1;
14260
0
    }
14261
14262
    /* if it's a short string, get the singleton */
14263
4.67M
    if (PyUnicode_GET_LENGTH(s) == 1 &&
14264
21.8k
                PyUnicode_KIND(s) == PyUnicode_1BYTE_KIND) {
14265
0
        PyObject *r = LATIN1(*(unsigned char*)PyUnicode_DATA(s));
14266
0
        assert(PyUnicode_CHECK_INTERNED(r));
14267
0
        Py_DECREF(s);
14268
0
        return r;
14269
0
    }
14270
#ifdef Py_DEBUG
14271
    assert(!unicode_is_singleton(s));
14272
#endif
14273
14274
    /* Look in the global cache now. */
14275
4.67M
    {
14276
4.67M
        PyObject *r = (PyObject *)_Py_hashtable_get(INTERNED_STRINGS, s);
14277
4.67M
        if (r != NULL) {
14278
618k
            assert(_PyUnicode_STATE(r).statically_allocated);
14279
618k
            assert(r != s);  // r must be statically_allocated; s is not
14280
618k
            Py_DECREF(s);
14281
618k
            return Py_NewRef(r);
14282
618k
        }
14283
4.67M
    }
14284
14285
    /* Do a setdefault on the per-interpreter cache. */
14286
4.05M
    PyObject *interned = get_interned_dict(interp);
14287
4.05M
    assert(interned != NULL);
14288
#ifdef Py_GIL_DISABLED
14289
#  define INTERN_MUTEX &_Py_INTERP_CACHED_OBJECT(interp, interned_mutex)
14290
#endif
14291
4.05M
    FT_MUTEX_LOCK(INTERN_MUTEX);
14292
4.05M
    PyObject *t;
14293
4.05M
    {
14294
4.05M
        int res = PyDict_SetDefaultRef(interned, s, s, &t);
14295
4.05M
        if (res < 0) {
14296
0
            PyErr_Clear();
14297
0
            FT_MUTEX_UNLOCK(INTERN_MUTEX);
14298
0
            return s;
14299
0
        }
14300
4.05M
        else if (res == 1) {
14301
            // value was already present (not inserted)
14302
3.38M
            Py_DECREF(s);
14303
3.38M
            if (immortalize &&
14304
1.27M
                    PyUnicode_CHECK_INTERNED(t) == SSTATE_INTERNED_MORTAL) {
14305
7.38k
                immortalize_interned(t);
14306
7.38k
            }
14307
3.38M
            FT_MUTEX_UNLOCK(INTERN_MUTEX);
14308
3.38M
            return t;
14309
3.38M
        }
14310
666k
        else {
14311
            // value was newly inserted
14312
666k
            assert (s == t);
14313
666k
            Py_DECREF(t);
14314
666k
        }
14315
4.05M
    }
14316
14317
    /* NOT_INTERNED -> INTERNED_MORTAL */
14318
14319
4.05M
    assert(_PyUnicode_STATE(s).interned == SSTATE_NOT_INTERNED);
14320
14321
666k
    if (!_Py_IsImmortal(s)) {
14322
        /* The two references in interned dict (key and value) are not counted.
14323
        unicode_dealloc() and _PyUnicode_ClearInterned() take care of this. */
14324
666k
        Py_DECREF(s);
14325
666k
        Py_DECREF(s);
14326
666k
    }
14327
666k
    FT_ATOMIC_STORE_UINT8_RELAXED(_PyUnicode_STATE(s).interned, SSTATE_INTERNED_MORTAL);
14328
14329
    /* INTERNED_MORTAL -> INTERNED_IMMORTAL (if needed) */
14330
14331
#ifdef Py_DEBUG
14332
    if (_Py_IsImmortal(s)) {
14333
        assert(immortalize);
14334
    }
14335
#endif
14336
666k
    if (immortalize) {
14337
152k
        immortalize_interned(s);
14338
152k
    }
14339
14340
666k
    FT_MUTEX_UNLOCK(INTERN_MUTEX);
14341
666k
    return s;
14342
4.05M
}
14343
14344
void
14345
_PyUnicode_InternImmortal(PyInterpreterState *interp, PyObject **p)
14346
5.09M
{
14347
5.09M
    *p = intern_common(interp, *p, 1);
14348
5.09M
    assert(*p);
14349
5.09M
}
14350
14351
void
14352
_PyUnicode_InternMortal(PyInterpreterState *interp, PyObject **p)
14353
74.7M
{
14354
74.7M
    *p = intern_common(interp, *p, 0);
14355
74.7M
    assert(*p);
14356
74.7M
}
14357
14358
14359
void
14360
_PyUnicode_InternInPlace(PyInterpreterState *interp, PyObject **p)
14361
0
{
14362
0
    _PyUnicode_InternImmortal(interp, p);
14363
0
    return;
14364
0
}
14365
14366
void
14367
PyUnicode_InternInPlace(PyObject **p)
14368
0
{
14369
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
14370
0
    _PyUnicode_InternMortal(interp, p);
14371
0
}
14372
14373
// Public-looking name kept for the stable ABI; user should not call this:
14374
PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
14375
void
14376
PyUnicode_InternImmortal(PyObject **p)
14377
0
{
14378
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
14379
0
    _PyUnicode_InternImmortal(interp, p);
14380
0
}
14381
14382
PyObject *
14383
PyUnicode_InternFromString(const char *cp)
14384
1.34M
{
14385
1.34M
    PyObject *s = PyUnicode_FromString(cp);
14386
1.34M
    if (s == NULL) {
14387
0
        return NULL;
14388
0
    }
14389
1.34M
    PyInterpreterState *interp = _PyInterpreterState_GET();
14390
1.34M
    _PyUnicode_InternMortal(interp, &s);
14391
1.34M
    return s;
14392
1.34M
}
14393
14394
14395
void
14396
_PyUnicode_ClearInterned(PyInterpreterState *interp)
14397
0
{
14398
0
    PyObject *interned = get_interned_dict(interp);
14399
0
    if (interned == NULL) {
14400
0
        return;
14401
0
    }
14402
0
    assert(PyDict_CheckExact(interned));
14403
14404
0
    if (has_shared_intern_dict(interp)) {
14405
        // the dict doesn't belong to this interpreter, skip the debug
14406
        // checks on it and just clear the pointer to it
14407
0
        clear_interned_dict(interp);
14408
0
        return;
14409
0
    }
14410
14411
#ifdef INTERNED_STATS
14412
    fprintf(stderr, "releasing %zd interned strings\n",
14413
            PyDict_GET_SIZE(interned));
14414
14415
    Py_ssize_t total_length = 0;
14416
#endif
14417
0
    Py_ssize_t pos = 0;
14418
0
    PyObject *s, *ignored_value;
14419
0
    while (PyDict_Next(interned, &pos, &s, &ignored_value)) {
14420
0
        int shared = 0;
14421
0
        switch (PyUnicode_CHECK_INTERNED(s)) {
14422
0
        case SSTATE_INTERNED_IMMORTAL:
14423
            /* Make immortal interned strings mortal again. */
14424
            // Skip the Immortal Instance check and restore
14425
            // the two references (key and value) ignored
14426
            // by PyUnicode_InternInPlace().
14427
0
            _Py_SetMortal(s, 2);
14428
#ifdef Py_REF_DEBUG
14429
            /* let's be pedantic with the ref total */
14430
            _Py_IncRefTotal(_PyThreadState_GET());
14431
            _Py_IncRefTotal(_PyThreadState_GET());
14432
#endif
14433
#ifdef INTERNED_STATS
14434
            total_length += PyUnicode_GET_LENGTH(s);
14435
#endif
14436
0
            break;
14437
0
        case SSTATE_INTERNED_IMMORTAL_STATIC:
14438
            /* It is shared between interpreters, so we should unmark it
14439
               only when this is the last interpreter in which it's
14440
               interned.  We immortalize all the statically initialized
14441
               strings during startup, so we can rely on the
14442
               main interpreter to be the last one. */
14443
0
            if (!_Py_IsMainInterpreter(interp)) {
14444
0
                shared = 1;
14445
0
            }
14446
0
            break;
14447
0
        case SSTATE_INTERNED_MORTAL:
14448
            // Restore 2 references held by the interned dict; these will
14449
            // be decref'd by clear_interned_dict's PyDict_Clear.
14450
0
            _Py_RefcntAdd(s, 2);
14451
#ifdef Py_REF_DEBUG
14452
            /* let's be pedantic with the ref total */
14453
            _Py_IncRefTotal(_PyThreadState_GET());
14454
            _Py_IncRefTotal(_PyThreadState_GET());
14455
#endif
14456
0
            break;
14457
0
        case SSTATE_NOT_INTERNED:
14458
0
            _Py_FALLTHROUGH;
14459
0
        default:
14460
0
            Py_UNREACHABLE();
14461
0
        }
14462
0
        if (!shared) {
14463
0
            FT_ATOMIC_STORE_UINT8_RELAXED(_PyUnicode_STATE(s).interned, SSTATE_NOT_INTERNED);
14464
0
        }
14465
0
    }
14466
#ifdef INTERNED_STATS
14467
    fprintf(stderr,
14468
            "total length of all interned strings: %zd characters\n",
14469
            total_length);
14470
#endif
14471
14472
0
    struct _Py_unicode_state *state = &interp->unicode;
14473
0
    struct _Py_unicode_ids *ids = &state->ids;
14474
0
    for (Py_ssize_t i=0; i < ids->size; i++) {
14475
0
        Py_XINCREF(ids->array[i]);
14476
0
    }
14477
0
    clear_interned_dict(interp);
14478
0
    if (_Py_IsMainInterpreter(interp)) {
14479
0
        clear_global_interned_strings();
14480
0
    }
14481
0
}
14482
14483
14484
/********************* Unicode Iterator **************************/
14485
14486
typedef struct {
14487
    PyObject_HEAD
14488
    Py_ssize_t it_index;
14489
    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
14490
} unicodeiterobject;
14491
14492
static void
14493
unicodeiter_dealloc(PyObject *op)
14494
1.81M
{
14495
1.81M
    unicodeiterobject *it = (unicodeiterobject *)op;
14496
1.81M
    _PyObject_GC_UNTRACK(it);
14497
1.81M
    Py_XDECREF(it->it_seq);
14498
1.81M
    PyObject_GC_Del(it);
14499
1.81M
}
14500
14501
static int
14502
unicodeiter_traverse(PyObject *op, visitproc visit, void *arg)
14503
3
{
14504
3
    unicodeiterobject *it = (unicodeiterobject *)op;
14505
3
    Py_VISIT(it->it_seq);
14506
3
    return 0;
14507
3
}
14508
14509
/* tp_iternext for the generic str iterator: return the next character as
   a string, or NULL (with no exception set here) when exhausted.  On
   exhaustion the reference to the string is dropped. */
static PyObject *
unicodeiter_next(PyObject *op)
{
    unicodeiterobject *it = (unicodeiterobject *)op;
    assert(it != NULL);

    PyObject *str = it->it_seq;
    if (str == NULL) {
        /* already exhausted */
        return NULL;
    }
    assert(_PyUnicode_CHECK(str));

    if (it->it_index >= PyUnicode_GET_LENGTH(str)) {
        /* Reached the end: clear the field first, then release. */
        it->it_seq = NULL;
        Py_DECREF(str);
        return NULL;
    }

    int kind = PyUnicode_KIND(str);
    const void *data = PyUnicode_DATA(str);
    Py_UCS4 ch = PyUnicode_READ(kind, data, it->it_index);
    it->it_index++;
    return unicode_char(ch);
}
14533
14534
/* tp_iternext for the compact-ASCII str iterator: return the cached
   single-character string for the next character, or NULL when exhausted.
   On exhaustion the reference to the string is dropped. */
static PyObject *
unicode_ascii_iter_next(PyObject *op)
{
    unicodeiterobject *it = (unicodeiterobject *)op;
    assert(it != NULL);

    PyObject *str = it->it_seq;
    if (str == NULL) {
        /* already exhausted */
        return NULL;
    }
    assert(_PyUnicode_CHECK(str));
    assert(PyUnicode_IS_COMPACT_ASCII(str));

    if (it->it_index >= PyUnicode_GET_LENGTH(str)) {
        it->it_seq = NULL;
        Py_DECREF(str);
        return NULL;
    }

    /* The characters are read from directly after the PyASCIIObject
       header. */
    const void *data = ((void*)(_PyASCIIObject_CAST(str) + 1));
    Py_UCS1 ch = (Py_UCS1)PyUnicode_READ(PyUnicode_1BYTE_KIND,
                                         data, it->it_index);
    it->it_index++;
    return (PyObject*)&_Py_SINGLETON(strings).ascii[ch];
}
14556
14557
/* __length_hint__: number of characters left to iterate, 0 once the
   iterator is exhausted. */
static PyObject *
unicodeiter_len(PyObject *op, PyObject *Py_UNUSED(ignored))
{
    unicodeiterobject *it = (unicodeiterobject *)op;
    Py_ssize_t remaining;

    if (it->it_seq) {
        remaining = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
    }
    else {
        remaining = 0;
    }
    return PyLong_FromSsize_t(remaining);
}
14566
14567
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14568
14569
/* __reduce__: return (iter, (string,), index) for a live iterator, or
   (iter, ('',)) for an exhausted one. */
static PyObject *
unicodeiter_reduce(PyObject *op, PyObject *Py_UNUSED(ignored))
{
    unicodeiterobject *it = (unicodeiterobject *)op;

    /* _PyEval_GetBuiltin can invoke arbitrary code, so it must be called
     * before any iterator field is read.
     * see issue #101765 */
    PyObject *iter_func = _PyEval_GetBuiltin(&_Py_ID(iter));

    if (it->it_seq == NULL) {
        PyObject *empty = _PyUnicode_GetEmpty();
        if (empty == NULL) {
            Py_XDECREF(iter_func);
            return NULL;
        }
        return Py_BuildValue("N(N)", iter_func, empty);
    }

    return Py_BuildValue("N(O)n", iter_func, it->it_seq, it->it_index);
}
14590
14591
PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14592
14593
/* __setstate__: restore the iteration index from `state`, clamped to
   [0, len(string)].  Has no effect on an exhausted iterator.  Returns
   None, or NULL if `state` cannot be converted to Py_ssize_t. */
static PyObject *
unicodeiter_setstate(PyObject *op, PyObject *state)
{
    unicodeiterobject *it = (unicodeiterobject *)op;

    Py_ssize_t new_index = PyLong_AsSsize_t(state);
    if (new_index == -1 && PyErr_Occurred()) {
        return NULL;
    }

    if (it->it_seq != NULL) {
        if (new_index < 0) {
            new_index = 0;
        }
        else if (new_index > PyUnicode_GET_LENGTH(it->it_seq)) {
            /* iterator truncated */
            new_index = PyUnicode_GET_LENGTH(it->it_seq);
        }
        it->it_index = new_index;
    }
    Py_RETURN_NONE;
}
14609
14610
PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14611
14612
static PyMethodDef unicodeiter_methods[] = {
14613
    {"__length_hint__", unicodeiter_len, METH_NOARGS, length_hint_doc},
14614
    {"__reduce__",      unicodeiter_reduce, METH_NOARGS, reduce_doc},
14615
    {"__setstate__",    unicodeiter_setstate, METH_O, setstate_doc},
14616
    {NULL,      NULL}       /* sentinel */
14617
};
14618
14619
PyTypeObject PyUnicodeIter_Type = {
14620
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
14621
    "str_iterator",         /* tp_name */
14622
    sizeof(unicodeiterobject),      /* tp_basicsize */
14623
    0,                  /* tp_itemsize */
14624
    /* methods */
14625
    unicodeiter_dealloc,/* tp_dealloc */
14626
    0,                  /* tp_vectorcall_offset */
14627
    0,                  /* tp_getattr */
14628
    0,                  /* tp_setattr */
14629
    0,                  /* tp_as_async */
14630
    0,                  /* tp_repr */
14631
    0,                  /* tp_as_number */
14632
    0,                  /* tp_as_sequence */
14633
    0,                  /* tp_as_mapping */
14634
    0,                  /* tp_hash */
14635
    0,                  /* tp_call */
14636
    0,                  /* tp_str */
14637
    PyObject_GenericGetAttr,        /* tp_getattro */
14638
    0,                  /* tp_setattro */
14639
    0,                  /* tp_as_buffer */
14640
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14641
    0,                  /* tp_doc */
14642
    unicodeiter_traverse, /* tp_traverse */
14643
    0,                  /* tp_clear */
14644
    0,                  /* tp_richcompare */
14645
    0,                  /* tp_weaklistoffset */
14646
    PyObject_SelfIter,          /* tp_iter */
14647
    unicodeiter_next,   /* tp_iternext */
14648
    unicodeiter_methods,            /* tp_methods */
14649
    0,
14650
};
14651
14652
PyTypeObject _PyUnicodeASCIIIter_Type = {
14653
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
14654
    .tp_name = "str_ascii_iterator",
14655
    .tp_basicsize = sizeof(unicodeiterobject),
14656
    .tp_dealloc = unicodeiter_dealloc,
14657
    .tp_getattro = PyObject_GenericGetAttr,
14658
    .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,
14659
    .tp_traverse = unicodeiter_traverse,
14660
    .tp_iter = PyObject_SelfIter,
14661
    .tp_iternext = unicode_ascii_iter_next,
14662
    .tp_methods = unicodeiter_methods,
14663
};
14664
14665
/* tp_iter for str: create an iterator positioned at index 0.  Compact
   ASCII strings get the ASCII iterator type, all others the generic one. */
static PyObject *
unicode_iter(PyObject *seq)
{
    if (!PyUnicode_Check(seq)) {
        PyErr_BadInternalCall();
        return NULL;
    }

    PyTypeObject *iter_type = PyUnicode_IS_COMPACT_ASCII(seq)
        ? &_PyUnicodeASCIIIter_Type
        : &PyUnicodeIter_Type;

    unicodeiterobject *iter = PyObject_GC_New(unicodeiterobject, iter_type);
    if (iter == NULL) {
        return NULL;
    }

    iter->it_index = 0;
    iter->it_seq = Py_NewRef(seq);
    _PyObject_GC_TRACK(iter);
    return (PyObject *)iter;
}
14687
14688
static int
14689
encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
14690
112
{
14691
112
    int res;
14692
112
    res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
14693
112
    if (res == -2) {
14694
0
        PyErr_Format(PyExc_RuntimeError, "cannot encode %s", name);
14695
0
        return -1;
14696
0
    }
14697
112
    if (res < 0) {
14698
0
        PyErr_NoMemory();
14699
0
        return -1;
14700
0
    }
14701
112
    return 0;
14702
112
}
14703
14704
14705
static int
14706
config_get_codec_name(wchar_t **config_encoding)
14707
56
{
14708
56
    char *encoding;
14709
56
    if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
14710
0
        return -1;
14711
0
    }
14712
14713
56
    PyObject *name_obj = NULL;
14714
56
    PyObject *codec = _PyCodec_Lookup(encoding);
14715
56
    PyMem_RawFree(encoding);
14716
14717
56
    if (!codec)
14718
0
        goto error;
14719
14720
56
    name_obj = PyObject_GetAttrString(codec, "name");
14721
56
    Py_CLEAR(codec);
14722
56
    if (!name_obj) {
14723
0
        goto error;
14724
0
    }
14725
14726
56
    wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
14727
56
    Py_DECREF(name_obj);
14728
56
    if (wname == NULL) {
14729
0
        goto error;
14730
0
    }
14731
14732
56
    wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
14733
56
    if (raw_wname == NULL) {
14734
0
        PyMem_Free(wname);
14735
0
        PyErr_NoMemory();
14736
0
        goto error;
14737
0
    }
14738
14739
56
    PyMem_RawFree(*config_encoding);
14740
56
    *config_encoding = raw_wname;
14741
14742
56
    PyMem_Free(wname);
14743
56
    return 0;
14744
14745
0
error:
14746
0
    Py_XDECREF(codec);
14747
0
    Py_XDECREF(name_obj);
14748
0
    return -1;
14749
56
}
14750
14751
14752
static PyStatus
14753
init_stdio_encoding(PyInterpreterState *interp)
14754
28
{
14755
    /* Update the stdio encoding to the normalized Python codec name. */
14756
28
    PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
14757
28
    if (config_get_codec_name(&config->stdio_encoding) < 0) {
14758
0
        return _PyStatus_ERR("failed to get the Python codec name "
14759
0
                             "of the stdio encoding");
14760
0
    }
14761
28
    return _PyStatus_OK();
14762
28
}
14763
14764
14765
static int
14766
init_fs_codec(PyInterpreterState *interp)
14767
28
{
14768
28
    const PyConfig *config = _PyInterpreterState_GetConfig(interp);
14769
14770
28
    _Py_error_handler error_handler;
14771
28
    error_handler = get_error_handler_wide(config->filesystem_errors);
14772
28
    if (error_handler == _Py_ERROR_UNKNOWN) {
14773
0
        PyErr_SetString(PyExc_RuntimeError, "unknown filesystem error handler");
14774
0
        return -1;
14775
0
    }
14776
14777
28
    char *encoding, *errors;
14778
28
    if (encode_wstr_utf8(config->filesystem_encoding,
14779
28
                         &encoding,
14780
28
                         "filesystem_encoding") < 0) {
14781
0
        return -1;
14782
0
    }
14783
14784
28
    if (encode_wstr_utf8(config->filesystem_errors,
14785
28
                         &errors,
14786
28
                         "filesystem_errors") < 0) {
14787
0
        PyMem_RawFree(encoding);
14788
0
        return -1;
14789
0
    }
14790
14791
28
    struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
14792
28
    PyMem_RawFree(fs_codec->encoding);
14793
28
    fs_codec->encoding = encoding;
14794
    /* encoding has been normalized by init_fs_encoding() */
14795
28
    fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
14796
28
    PyMem_RawFree(fs_codec->errors);
14797
28
    fs_codec->errors = errors;
14798
28
    fs_codec->error_handler = error_handler;
14799
14800
#ifdef _Py_FORCE_UTF8_FS_ENCODING
14801
    assert(fs_codec->utf8 == 1);
14802
#endif
14803
14804
    /* At this point, PyUnicode_EncodeFSDefault() and
14805
       PyUnicode_DecodeFSDefault() can now use the Python codec rather than
14806
       the C implementation of the filesystem encoding. */
14807
14808
    /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
14809
       global configuration variables. */
14810
28
    if (_Py_IsMainInterpreter(interp)) {
14811
14812
28
        if (_Py_SetFileSystemEncoding(fs_codec->encoding,
14813
28
                                      fs_codec->errors) < 0) {
14814
0
            PyErr_NoMemory();
14815
0
            return -1;
14816
0
        }
14817
28
    }
14818
28
    return 0;
14819
28
}
14820
14821
14822
static PyStatus
14823
init_fs_encoding(PyThreadState *tstate)
14824
28
{
14825
28
    PyInterpreterState *interp = tstate->interp;
14826
14827
    /* Update the filesystem encoding to the normalized Python codec name.
14828
       For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
14829
       (Python codec name). */
14830
28
    PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
14831
28
    if (config_get_codec_name(&config->filesystem_encoding) < 0) {
14832
0
        _Py_DumpPathConfig(tstate);
14833
0
        return _PyStatus_ERR("failed to get the Python codec "
14834
0
                             "of the filesystem encoding");
14835
0
    }
14836
14837
28
    if (init_fs_codec(interp) < 0) {
14838
0
        return _PyStatus_ERR("cannot initialize filesystem codec");
14839
0
    }
14840
28
    return _PyStatus_OK();
14841
28
}
14842
14843
14844
PyStatus
14845
_PyUnicode_InitEncodings(PyThreadState *tstate)
14846
28
{
14847
28
    PyStatus status = _PyCodec_InitRegistry(tstate->interp);
14848
28
    if (_PyStatus_EXCEPTION(status)) {
14849
0
        return status;
14850
0
    }
14851
28
    status = init_fs_encoding(tstate);
14852
28
    if (_PyStatus_EXCEPTION(status)) {
14853
0
        return status;
14854
0
    }
14855
14856
28
    return init_stdio_encoding(tstate->interp);
14857
28
}
14858
14859
14860
static void
14861
_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
14862
0
{
14863
0
    PyMem_RawFree(fs_codec->encoding);
14864
0
    fs_codec->encoding = NULL;
14865
0
    fs_codec->utf8 = 0;
14866
0
    PyMem_RawFree(fs_codec->errors);
14867
0
    fs_codec->errors = NULL;
14868
0
    fs_codec->error_handler = _Py_ERROR_UNKNOWN;
14869
0
}
14870
14871
14872
#ifdef MS_WINDOWS
/* Switch the filesystem encoding of the current interpreter to
   mbcs/replace (PEP 529) and re-initialize the filesystem codec.

   Returns the result of init_fs_codec(), or -1 with MemoryError set if
   either of the new configuration strings cannot be allocated; in that
   case the configuration is left unchanged. */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    PyConfig *config = (PyConfig *)_PyInterpreterState_GetConfig(interp);

    /* Allocate both replacements first so that the configuration is only
       modified when nothing can fail anymore. */
    wchar_t *new_encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *new_errors = _PyMem_RawWcsdup(L"replace");
    if (new_encoding != NULL && new_errors != NULL) {
        PyMem_RawFree(config->filesystem_encoding);
        config->filesystem_encoding = new_encoding;
        PyMem_RawFree(config->filesystem_errors);
        config->filesystem_errors = new_errors;

        return init_fs_codec(interp);
    }

    PyMem_RawFree(new_encoding);
    PyMem_RawFree(new_errors);
    PyErr_NoMemory();
    return -1;
}
#endif
14897
14898
14899
#ifdef Py_DEBUG
/* Debug-build helper: return non-zero when the main interpreter has no
   interned dict (get_interned_dict() returns NULL), which this function
   treats as "finalizing". */
static inline int
unicode_is_finalizing(void)
{
    PyInterpreterState *main_interp = _PyInterpreterState_Main();
    return get_interned_dict(main_interp) == NULL;
}
#endif
14906
14907
14908
void
14909
_PyUnicode_FiniTypes(PyInterpreterState *interp)
14910
0
{
14911
0
    _PyStaticType_FiniBuiltin(interp, &EncodingMapType);
14912
0
    _PyStaticType_FiniBuiltin(interp, &PyFieldNameIter_Type);
14913
0
    _PyStaticType_FiniBuiltin(interp, &PyFormatterIter_Type);
14914
0
}
14915
14916
14917
void
14918
_PyUnicode_Fini(PyInterpreterState *interp)
14919
0
{
14920
0
    struct _Py_unicode_state *state = &interp->unicode;
14921
14922
0
    if (!has_shared_intern_dict(interp)) {
14923
        // _PyUnicode_ClearInterned() must be called before _PyUnicode_Fini()
14924
0
        assert(get_interned_dict(interp) == NULL);
14925
0
    }
14926
14927
0
    _PyUnicode_FiniEncodings(&state->fs_codec);
14928
14929
    // bpo-47182: force a unicodedata CAPI capsule re-import on
14930
    // subsequent initialization of interpreter.
14931
0
    interp->unicode.ucnhash_capi = NULL;
14932
14933
0
    unicode_clear_identifiers(state);
14934
0
}
14935
14936
/* A _string module, to export formatter_parser and formatter_field_name_split
14937
   to the string.Formatter class implemented in Python. */
14938
14939
static PyMethodDef _string_methods[] = {
14940
    {"formatter_field_name_split", formatter_field_name_split,
14941
     METH_O, PyDoc_STR("split the argument as a field name")},
14942
    {"formatter_parser", formatter_parser,
14943
     METH_O, PyDoc_STR("parse the argument as a format string")},
14944
    {NULL, NULL}
14945
};
14946
14947
static PyModuleDef_Slot module_slots[] = {
14948
    {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
14949
    {Py_mod_gil, Py_MOD_GIL_NOT_USED},
14950
    {0, NULL}
14951
};
14952
14953
static struct PyModuleDef _string_module = {
14954
    PyModuleDef_HEAD_INIT,
14955
    .m_name = "_string",
14956
    .m_doc = PyDoc_STR("string helper module"),
14957
    .m_size = 0,
14958
    .m_methods = _string_methods,
14959
    .m_slots = module_slots,
14960
};
14961
14962
PyMODINIT_FUNC
14963
PyInit__string(void)
14964
6
{
14965
6
    return PyModuleDef_Init(&_string_module);
14966
6
}
14967
14968
14969
#undef PyUnicode_KIND
14970
int PyUnicode_KIND(PyObject *op)
14971
0
{
14972
0
    if (!PyUnicode_Check(op)) {
14973
0
        PyErr_Format(PyExc_TypeError, "expect str, got %T", op);
14974
0
        return -1;
14975
0
    }
14976
0
    return _PyASCIIObject_CAST(op)->state.kind;
14977
0
}
14978
14979
#undef PyUnicode_DATA
14980
void* PyUnicode_DATA(PyObject *op)
14981
0
{
14982
0
    if (!PyUnicode_Check(op)) {
14983
0
        PyErr_Format(PyExc_TypeError, "expect str, got %T", op);
14984
0
        return NULL;
14985
0
    }
14986
0
    return _PyUnicode_DATA(op);
14987
0
}