Coverage Report

Created: 2026-06-09 06:31

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/cpython3/Objects/unicodeobject.c
Line
Count
Source
1
/*
2
3
Unicode implementation based on original code by Fredrik Lundh,
4
modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6
Major speed upgrades to the method implementations at the Reykjavik
7
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9
Copyright (c) Corporation for National Research Initiatives.
10
11
--------------------------------------------------------------------
12
The original string type implementation is:
13
14
  Copyright (c) 1999 by Secret Labs AB
15
  Copyright (c) 1999 by Fredrik Lundh
16
17
By obtaining, using, and/or copying this software and/or its
18
associated documentation, you agree that you have read, understood,
19
and will comply with the following terms and conditions:
20
21
Permission to use, copy, modify, and distribute this software and its
22
associated documentation for any purpose and without fee is hereby
23
granted, provided that the above copyright notice appears in all
24
copies, and that both that copyright notice and this permission notice
25
appear in supporting documentation, and that the name of Secret Labs
26
AB or the author not be used in advertising or publicity pertaining to
27
distribution of the software without specific, written prior
28
permission.
29
30
SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32
FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37
--------------------------------------------------------------------
38
39
*/
40
41
#include "Python.h"
42
#include "pycore_abstract.h"      // _PyIndex_Check()
43
#include "pycore_bytes_methods.h" // _Py_bytes_lower()
44
#include "pycore_bytesobject.h"   // _PyBytes_RepeatBuffer()
45
#include "pycore_ceval.h"         // _PyEval_GetBuiltin()
46
#include "pycore_codecs.h"        // _PyCodec_Lookup()
47
#include "pycore_critical_section.h" // Py_*_CRITICAL_SECTION_SEQUENCE_FAST
48
#include "pycore_format.h"        // F_LJUST
49
#include "pycore_initconfig.h"    // _PyStatus_OK()
50
#include "pycore_interp.h"        // PyInterpreterState.fs_codec
51
#include "pycore_long.h"          // _PyLong_FormatWriter()
52
#include "pycore_object.h"        // _PyObject_GC_TRACK(), _Py_FatalRefcountError()
53
#include "pycore_pathconfig.h"    // _Py_DumpPathConfig()
54
#include "pycore_pyerrors.h"      // _PyUnicodeTranslateError_Create()
55
#include "pycore_pyhash.h"        // _Py_HashSecret_t
56
#include "pycore_pylifecycle.h"   // _Py_SetFileSystemEncoding()
57
#include "pycore_pystate.h"       // _PyInterpreterState_GET()
58
#include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
59
#include "pycore_unicodectype.h"  // _PyUnicode_IsXidStart
60
#include "pycore_unicodeobject.h" // struct _Py_unicode_state
61
#include "pycore_unicodeobject_generated.h"  // _PyUnicode_InitStaticStrings()
62
63
#include "stringlib/eq.h"         // unicode_eq()
64
#include <stddef.h>               // ptrdiff_t
65
66
#ifdef MS_WINDOWS
67
#include <windows.h>
68
#endif
69
70
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
71
#  include "pycore_fileutils.h"   // _Py_LocaleUsesNonUnicodeWchar()
72
#endif
73
74
/* Uncomment to display statistics on interned strings at exit
75
   in _PyUnicode_ClearInterned(). */
76
/* #define INTERNED_STATS 1 */
77
78
79
/*[clinic input]
80
class str "PyObject *" "&PyUnicode_Type"
81
[clinic start generated code]*/
82
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
83
84
/*[python input]
85
class Py_UCS4_converter(CConverter):
86
    type = 'Py_UCS4'
87
    converter = 'convert_uc'
88
89
    def c_default_init(self):
90
        import libclinic
91
        self.c_default = libclinic.c_unichar_repr(self.default)
92
93
[python start generated code]*/
94
/*[python end generated code: output=da39a3ee5e6b4b0d input=22f057b68fd9a65a]*/
95
96
/* --- Globals ------------------------------------------------------------
97
98
NOTE: In the interpreter's initialization phase, some globals are currently
99
      initialized dynamically as needed. In the process Unicode objects may
100
      be created before the Unicode type is ready.
101
102
*/
103
104
1.06M
#define MAX_UNICODE _Py_MAX_UNICODE
105
133k
#define ensure_unicode _PyUnicode_EnsureUnicode
106
107
#ifdef Py_DEBUG
108
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
109
#else
110
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
111
#endif
112
113
/* Read the utf8 pointer stored in the object, viewed as a
   PyCompactUnicodeObject, through FT_ATOMIC_LOAD_PTR_ACQUIRE. */
static inline char *
_PyUnicode_UTF8(PyObject *op)
{
    PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
    return FT_ATOMIC_LOAD_PTR_ACQUIRE(compact->utf8);
}
117
118
/* Return the UTF-8 representation pointer of a string object.

   For a compact ASCII string this is the memory placed directly after the
   PyASCIIObject header; for every other string it is whatever
   _PyUnicode_UTF8() reads from the object. */
static inline char *
PyUnicode_UTF8(PyObject *op)
{
    assert(_PyUnicode_CHECK(op));
    if (!PyUnicode_IS_COMPACT_ASCII(op)) {
        return _PyUnicode_UTF8(op);
    }
    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
    return (char *)(ascii + 1);
}
128
129
/* Store `utf8` into the object's utf8 field (object viewed as a
   PyCompactUnicodeObject) through FT_ATOMIC_STORE_PTR_RELEASE. */
static inline void
PyUnicode_SET_UTF8(PyObject *op, char *utf8)
{
    PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
    FT_ATOMIC_STORE_PTR_RELEASE(compact->utf8, utf8);
}
133
134
/* Return the length recorded for the UTF-8 representation.

   Compact ASCII strings report the `length` field of the PyASCIIObject
   header; all other strings report the `utf8_length` field of the
   PyCompactUnicodeObject header. */
static inline Py_ssize_t
PyUnicode_UTF8_LENGTH(PyObject *op)
{
    assert(_PyUnicode_CHECK(op));
    if (!PyUnicode_IS_COMPACT_ASCII(op)) {
        return _PyCompactUnicodeObject_CAST(op)->utf8_length;
    }
    return _PyASCIIObject_CAST(op)->length;
}
144
145
/* Record `length` in the utf8_length field of the object, viewed as a
   PyCompactUnicodeObject. */
static inline void
PyUnicode_SET_UTF8_LENGTH(PyObject *op, Py_ssize_t length)
{
    PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
    compact->utf8_length = length;
}
149
150
#define _PyUnicode_LENGTH(op)                           \
151
12.5M
    (_PyASCIIObject_CAST(op)->length)
152
#define _PyUnicode_STATE(op)                            \
153
86.3M
    (_PyASCIIObject_CAST(op)->state)
154
#define _PyUnicode_HASH(op)                             \
155
12.3M
    (_PyASCIIObject_CAST(op)->hash)
156
157
2.68M
#define PyUnicode_HASH PyUnstable_Unicode_GET_CACHED_HASH
158
159
/* Store `hash` in the hash field of the PyASCIIObject header through
   FT_ATOMIC_STORE_SSIZE_RELAXED. */
static inline void
PyUnicode_SET_HASH(PyObject *op, Py_hash_t hash)
{
    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
    FT_ATOMIC_STORE_SSIZE_RELAXED(ascii->hash, hash);
}
163
164
#define _PyUnicode_DATA_ANY(op)                         \
165
70
    (_PyUnicodeObject_CAST(op)->data.any)
166
167
/* Return non-zero when the object's utf8 pointer is the same address as
   its character data. Must not be called on a compact ASCII string
   (asserted). */
static inline int
_PyUnicode_SHARE_UTF8(PyObject *op)
{
    assert(_PyUnicode_CHECK(op));
    assert(!PyUnicode_IS_COMPACT_ASCII(op));

    const char *utf8 = _PyUnicode_UTF8(op);
    const void *data = PyUnicode_DATA(op);
    return utf8 == data;
}
173
174
/* true if the Unicode object has an allocated UTF-8 memory block
175
   (not shared with other data) */
176
static inline int _PyUnicode_HAS_UTF8_MEMORY(PyObject *op)
177
12.3M
{
178
12.3M
    return (!PyUnicode_IS_COMPACT_ASCII(op)
179
8.52M
            && _PyUnicode_UTF8(op) != NULL
180
2.75k
            && _PyUnicode_UTF8(op) != PyUnicode_DATA(op));
181
12.3M
}
182
183
184
4.30M
#define LATIN1 _Py_LATIN1_CHR
185
186
/* Forward declaration */
187
static PyObject *
188
unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
189
                    const char *errors);
190
static PyObject *
191
unicode_decode_utf8(const char *s, Py_ssize_t size,
192
                    _Py_error_handler error_handler, const char *errors,
193
                    Py_ssize_t *consumed);
194
#ifdef Py_DEBUG
195
static inline int unicode_is_finalizing(void);
196
static int unicode_is_singleton(PyObject *unicode);
197
#endif
198
199
200
// Return a reference to the immortal empty string singleton.
201
PyObject*
202
_PyUnicode_GetEmpty(void)
203
38.7M
{
204
38.7M
    _Py_DECLARE_STR(empty, "");
205
38.7M
    return &_Py_STR(empty);
206
38.7M
}
207
208
/* This dictionary holds per-interpreter interned strings.
209
 * See InternalDocs/string_interning.md for details.
210
 */
211
static inline PyObject *get_interned_dict(PyInterpreterState *interp)
212
1.55M
{
213
1.55M
    return _Py_INTERP_CACHED_OBJECT(interp, interned_strings);
214
1.55M
}
215
216
/* This hashtable holds statically allocated interned strings.
217
 * See InternalDocs/string_interning.md for details.
218
 */
219
1.55M
#define INTERNED_STRINGS _PyRuntime.cached_objects.interned_strings
220
221
/* Get number of all interned strings for the current interpreter. */
222
Py_ssize_t
223
_PyUnicode_InternedSize(void)
224
0
{
225
0
    PyObject *dict = get_interned_dict(_PyInterpreterState_GET());
226
0
    return _Py_hashtable_len(INTERNED_STRINGS) + PyDict_GET_SIZE(dict);
227
0
}
228
229
/* Get number of immortal interned strings for the current interpreter. */
230
Py_ssize_t
231
_PyUnicode_InternedSize_Immortal(void)
232
0
{
233
0
    PyObject *dict = get_interned_dict(_PyInterpreterState_GET());
234
0
    PyObject *key, *value;
235
0
    Py_ssize_t pos = 0;
236
0
    Py_ssize_t count = 0;
237
238
    // It's tempting to keep a count and avoid a loop here. But, this function
239
    // is intended for refleak tests. It spends extra work to report the true
240
    // value, to help detect bugs in optimizations.
241
242
0
    while (PyDict_Next(dict, &pos, &key, &value)) {
243
0
        assert(PyUnicode_CHECK_INTERNED(key) != SSTATE_INTERNED_IMMORTAL_STATIC);
244
0
        if (PyUnicode_CHECK_INTERNED(key) == SSTATE_INTERNED_IMMORTAL) {
245
0
           count++;
246
0
       }
247
0
    }
248
0
    return _Py_hashtable_len(INTERNED_STRINGS) + count;
249
0
}
250
251
static Py_hash_t unicode_hash(PyObject *);
252
253
static Py_uhash_t
254
hashtable_unicode_hash(const void *key)
255
1.61M
{
256
1.61M
    return unicode_hash((PyObject *)key);
257
1.61M
}
258
259
static int
260
hashtable_unicode_compare(const void *key1, const void *key2)
261
94.9k
{
262
94.9k
    PyObject *obj1 = (PyObject *)key1;
263
94.9k
    PyObject *obj2 = (PyObject *)key2;
264
94.9k
    if (obj1 != NULL && obj2 != NULL) {
265
94.9k
        return unicode_eq(obj1, obj2);
266
94.9k
    }
267
0
    else {
268
0
        return obj1 == obj2;
269
0
    }
270
94.9k
}
271
272
/* Return true if this interpreter should share the main interpreter's
273
   intern_dict.  That's important for interpreters which load basic
274
   single-phase init extension modules (m_size == -1).  There could be interned
275
   immortal strings that are shared between interpreters, due to the
276
   PyDict_Update(mdict, m_copy) call in import_find_extension().
277
278
   It's not safe to deallocate those strings until all interpreters that
279
   potentially use them are freed.  By storing them in the main interpreter, we
280
   ensure they get freed after all other interpreters are freed.
281
*/
282
static bool
283
has_shared_intern_dict(PyInterpreterState *interp)
284
19
{
285
19
    PyInterpreterState *main_interp = _PyInterpreterState_Main();
286
19
    return interp != main_interp  && interp->feature_flags & Py_RTFLAGS_USE_MAIN_OBMALLOC;
287
19
}
288
289
static int
290
init_interned_dict(PyInterpreterState *interp)
291
19
{
292
19
    assert(get_interned_dict(interp) == NULL);
293
19
    PyObject *interned;
294
19
    if (has_shared_intern_dict(interp)) {
295
0
        interned = get_interned_dict(_PyInterpreterState_Main());
296
0
        Py_INCREF(interned);
297
0
    }
298
19
    else {
299
19
        interned = PyDict_New();
300
19
        if (interned == NULL) {
301
0
            return -1;
302
0
        }
303
19
    }
304
19
    _Py_INTERP_CACHED_OBJECT(interp, interned_strings) = interned;
305
19
    return 0;
306
19
}
307
308
static void
309
clear_interned_dict(PyInterpreterState *interp)
310
0
{
311
0
    PyObject *interned = get_interned_dict(interp);
312
0
    if (interned != NULL) {
313
0
        if (!has_shared_intern_dict(interp)) {
314
            // only clear if the dict belongs to this interpreter
315
0
            PyDict_Clear(interned);
316
0
        }
317
0
        Py_DECREF(interned);
318
0
        _Py_INTERP_CACHED_OBJECT(interp, interned_strings) = NULL;
319
0
    }
320
0
}
321
322
static PyStatus
323
init_global_interned_strings(PyInterpreterState *interp)
324
19
{
325
19
    assert(INTERNED_STRINGS == NULL);
326
19
    _Py_hashtable_allocator_t hashtable_alloc = {PyMem_RawMalloc, PyMem_RawFree};
327
328
19
    INTERNED_STRINGS = _Py_hashtable_new_full(
329
19
        hashtable_unicode_hash,
330
19
        hashtable_unicode_compare,
331
        // Objects stored here are immortal and statically allocated,
332
        // so we don't need key_destroy_func & value_destroy_func:
333
19
        NULL,
334
19
        NULL,
335
19
        &hashtable_alloc
336
19
    );
337
19
    if (INTERNED_STRINGS == NULL) {
338
0
        PyErr_Clear();
339
0
        return _PyStatus_ERR("failed to create global interned dict");
340
0
    }
341
342
    /* Intern statically allocated string identifiers, deepfreeze strings,
343
        * and one-byte latin-1 strings.
344
        * This must be done before any module initialization so that statically
345
        * allocated string identifiers are used instead of heap allocated strings.
346
        * Deepfreeze uses the interned identifiers if present to save space
347
        * else generates them and they are interned to speed up dict lookups.
348
    */
349
19
    _PyUnicode_InitStaticStrings(interp);
350
351
4.88k
    for (int i = 0; i < 256; i++) {
352
4.86k
        PyObject *s = LATIN1(i);
353
4.86k
        _PyUnicode_InternStatic(interp, &s);
354
4.86k
        assert(s == LATIN1(i));
355
4.86k
    }
356
#ifdef Py_DEBUG
357
    assert(_PyUnicode_CheckConsistency(&_Py_STR(empty), 1));
358
359
    for (int i = 0; i < 256; i++) {
360
        assert(_PyUnicode_CheckConsistency(LATIN1(i), 1));
361
    }
362
#endif
363
19
    return _PyStatus_OK();
364
19
}
365
366
static void clear_global_interned_strings(void)
367
0
{
368
0
    if (INTERNED_STRINGS != NULL) {
369
0
        _Py_hashtable_destroy(INTERNED_STRINGS);
370
0
        INTERNED_STRINGS = NULL;
371
0
    }
372
0
}
373
374
#define _Py_RETURN_UNICODE_EMPTY()   \
375
26.4M
    do {                             \
376
26.4M
        return _PyUnicode_GetEmpty();\
377
26.4M
    } while (0)
378
379
380
/* Fast detection of the most frequent whitespace characters */
381
/* 128-entry lookup table indexed by an ASCII code point: 1 for the
   whitespace characters listed below, 0 (implicit) for everything else. */
const unsigned char _Py_ascii_whitespace[] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1,     /* SPACE */
    [0x7F] = 0      /* last index: keeps the table at 128 entries */
};
410
411
/* forward */
412
static PyObject* get_latin1_char(unsigned char ch);
413
414
415
static PyObject *
416
_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
417
static PyObject *
418
_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
419
static PyObject *
420
_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
421
422
static PyObject *
423
unicode_encode_call_errorhandler(const char *errors,
424
       PyObject **errorHandler,const char *encoding, const char *reason,
425
       PyObject *unicode, PyObject **exceptionObject,
426
       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
427
428
static void
429
raise_encode_exception(PyObject **exceptionObject,
430
                       const char *encoding,
431
                       PyObject *unicode,
432
                       Py_ssize_t startpos, Py_ssize_t endpos,
433
                       const char *reason);
434
435
/* Same for linebreaks */
436
/* 128-entry lookup table indexed by an ASCII code point: 1 for the line
   break characters listed below, 0 (implicit) for everything else. */
static const unsigned char ascii_linebreak[] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x7F] = 0      /* last index: keeps the table at 128 entries */
};
462
463
static int convert_uc(PyObject *obj, void *addr);
464
465
struct encoding_map;
466
#include "clinic/unicodeobject.c.h"
467
468
_Py_error_handler
469
_Py_GetErrorHandler(const char *errors)
470
9.39k
{
471
9.39k
    if (errors == NULL || strcmp(errors, "strict") == 0) {
472
2.45k
        return _Py_ERROR_STRICT;
473
2.45k
    }
474
6.94k
    if (strcmp(errors, "surrogateescape") == 0) {
475
4.43k
        return _Py_ERROR_SURROGATEESCAPE;
476
4.43k
    }
477
2.50k
    if (strcmp(errors, "replace") == 0) {
478
1.87k
        return _Py_ERROR_REPLACE;
479
1.87k
    }
480
626
    if (strcmp(errors, "ignore") == 0) {
481
0
        return _Py_ERROR_IGNORE;
482
0
    }
483
626
    if (strcmp(errors, "backslashreplace") == 0) {
484
0
        return _Py_ERROR_BACKSLASHREPLACE;
485
0
    }
486
626
    if (strcmp(errors, "surrogatepass") == 0) {
487
626
        return _Py_ERROR_SURROGATEPASS;
488
626
    }
489
0
    if (strcmp(errors, "xmlcharrefreplace") == 0) {
490
0
        return _Py_ERROR_XMLCHARREFREPLACE;
491
0
    }
492
0
    return _Py_ERROR_OTHER;
493
0
}
494
495
496
static _Py_error_handler
497
get_error_handler_wide(const wchar_t *errors)
498
38
{
499
38
    if (errors == NULL || wcscmp(errors, L"strict") == 0) {
500
0
        return _Py_ERROR_STRICT;
501
0
    }
502
38
    if (wcscmp(errors, L"surrogateescape") == 0) {
503
38
        return _Py_ERROR_SURROGATEESCAPE;
504
38
    }
505
0
    if (wcscmp(errors, L"replace") == 0) {
506
0
        return _Py_ERROR_REPLACE;
507
0
    }
508
0
    if (wcscmp(errors, L"ignore") == 0) {
509
0
        return _Py_ERROR_IGNORE;
510
0
    }
511
0
    if (wcscmp(errors, L"backslashreplace") == 0) {
512
0
        return _Py_ERROR_BACKSLASHREPLACE;
513
0
    }
514
0
    if (wcscmp(errors, L"surrogatepass") == 0) {
515
0
        return _Py_ERROR_SURROGATEPASS;
516
0
    }
517
0
    if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
518
0
        return _Py_ERROR_XMLCHARREFREPLACE;
519
0
    }
520
0
    return _Py_ERROR_OTHER;
521
0
}
522
523
524
static inline int
525
unicode_check_encoding_errors(const char *encoding, const char *errors)
526
29.9k
{
527
29.9k
    if (encoding == NULL && errors == NULL) {
528
0
        return 0;
529
0
    }
530
531
29.9k
    PyInterpreterState *interp = _PyInterpreterState_GET();
532
29.9k
#ifndef Py_DEBUG
533
    /* In release mode, only check in development mode (-X dev) */
534
29.9k
    if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
535
29.9k
        return 0;
536
29.9k
    }
537
#else
538
    /* Always check in debug mode */
539
#endif
540
541
    /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
542
       codec registry is ready: before_PyUnicode_InitEncodings() is called. */
543
0
    if (!interp->unicode.fs_codec.encoding) {
544
0
        return 0;
545
0
    }
546
547
    /* Disable checks during Python finalization. For example, it allows to
548
     * call PyObject_Dump() during finalization for debugging purpose.
549
     */
550
0
    if (_PyInterpreterState_GetFinalizing(interp) != NULL) {
551
0
        return 0;
552
0
    }
553
554
0
    if (encoding != NULL
555
        // Fast path for the most common built-in encodings. Even if the codec
556
        // is cached, _PyCodec_Lookup() decodes the bytes string from UTF-8 to
557
        // create a temporary Unicode string (the key in the cache).
558
0
        && strcmp(encoding, "utf-8") != 0
559
0
        && strcmp(encoding, "utf8") != 0
560
0
        && strcmp(encoding, "ascii") != 0)
561
0
    {
562
0
        PyObject *handler = _PyCodec_Lookup(encoding);
563
0
        if (handler == NULL) {
564
0
            return -1;
565
0
        }
566
0
        Py_DECREF(handler);
567
0
    }
568
569
0
    if (errors != NULL
570
        // Fast path for the most common built-in error handlers.
571
0
        && strcmp(errors, "strict") != 0
572
0
        && strcmp(errors, "ignore") != 0
573
0
        && strcmp(errors, "replace") != 0
574
0
        && strcmp(errors, "surrogateescape") != 0
575
0
        && strcmp(errors, "surrogatepass") != 0)
576
0
    {
577
0
        PyObject *handler = PyCodec_LookupError(errors);
578
0
        if (handler == NULL) {
579
0
            return -1;
580
0
        }
581
0
        Py_DECREF(handler);
582
0
    }
583
0
    return 0;
584
0
}
585
586
587
/* Verify the internal invariants of the string object `op`.

   Every failed condition is reported through _PyObject_ASSERT_FAILED_MSG()
   with the stringified expression. With a non-zero `check_content` the
   characters are scanned as well (O(n)) to verify that the smallest
   suitable kind is used and that the data ends with a zero character.
   The interning state is only checked when Py_DEBUG is defined.

   Returns 1 when control reaches the end of the function. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
/* CHECK(expr): report `expr` through _PyObject_ASSERT_FAILED_MSG() when it
   is false. CHECK_IF_GIL / CHECK_IF_FT perform the check only in builds
   without / with Py_GIL_DISABLED respectively; in the other build flavour
   the expression is merely evaluated and discarded. */
#define CHECK(expr) \
    do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
#ifdef Py_GIL_DISABLED
# define CHECK_IF_GIL(expr) (void)(expr)
# define CHECK_IF_FT(expr) CHECK(expr)
#else
# define CHECK_IF_GIL(expr) CHECK(expr)
# define CHECK_IF_FT(expr) (void)(expr)
#endif


    assert(op != NULL);
    CHECK(PyUnicode_Check(op));

    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
    int kind = ascii->state.kind;

    /* Compact ASCII: the only requirement is the 1-byte kind. */
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        CHECK(kind == PyUnicode_1BYTE_KIND);
    }
    else {
        PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
        void *data;

        if (ascii->state.compact == 1) {
            /* Compact non-ASCII: the character data directly follows the
               PyCompactUnicodeObject header and must not double as the
               UTF-8 buffer. */
            data = compact + 1;
            CHECK(kind == PyUnicode_1BYTE_KIND
                                 || kind == PyUnicode_2BYTE_KIND
                                 || kind == PyUnicode_4BYTE_KIND);
            CHECK(ascii->state.ascii == 0);
            CHECK(_PyUnicode_UTF8(op) != data);
        }
        else {
            /* Non-compact: the character data is reached through the
               data.any pointer of the PyUnicodeObject. */
            PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);

            data = unicode->data.any;
            CHECK(kind == PyUnicode_1BYTE_KIND
                     || kind == PyUnicode_2BYTE_KIND
                     || kind == PyUnicode_4BYTE_KIND);
            CHECK(ascii->state.compact == 0);
            CHECK(data != NULL);
            if (ascii->state.ascii) {
                /* ASCII content: the UTF-8 pointer is the data itself and
                   both lengths agree. */
                CHECK(_PyUnicode_UTF8(op) == data);
                CHECK(compact->utf8_length == ascii->length);
            }
            else {
                CHECK(_PyUnicode_UTF8(op) != data);
            }
        }
#ifndef Py_GIL_DISABLED
        /* Without a UTF-8 buffer the recorded UTF-8 length must be 0
           (this check is compiled out when Py_GIL_DISABLED is defined). */
        if (_PyUnicode_UTF8(op) == NULL)
            CHECK(compact->utf8_length == 0);
#endif
    }

    /* check that the best kind is used: O(n) operation */
    if (check_content) {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        const void *data;
        Py_UCS4 ch;

        /* Find the largest code point stored in the string. */
        data = PyUnicode_DATA(ascii);
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        /* The largest code point must fall in the range that requires
           exactly this kind (and, for 1-byte strings, match the ascii
           flag). */
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                CHECK(maxchar >= 128);
                CHECK(maxchar <= 255);
            }
            else
                CHECK(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            CHECK(maxchar >= 0x100);
            CHECK(maxchar <= 0xFFFF);
        }
        else {
            CHECK(maxchar >= 0x10000);
            CHECK(maxchar <= MAX_UNICODE);
        }
        /* The character at index `length` must be the zero terminator. */
        CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
    }

    /* Check interning state */
#ifdef Py_DEBUG
    // Note that we do not check `_Py_IsImmortal(op)` in the GIL-enabled build
    // since stable ABI extensions can make immortal strings mortal (but with a
    // high enough refcount).
    switch (PyUnicode_CHECK_INTERNED(op)) {
        case SSTATE_NOT_INTERNED:
            if (ascii->state.statically_allocated) {
                // This state is for two exceptions:
                // - strings are currently checked before they're interned
                // - the 256 one-latin1-character strings
                //   are static but use SSTATE_NOT_INTERNED
            }
            else {
                CHECK_IF_GIL(!_Py_IsImmortal(op));
            }
            break;
        case SSTATE_INTERNED_MORTAL:
            CHECK(!ascii->state.statically_allocated);
            CHECK_IF_GIL(!_Py_IsImmortal(op));
            break;
        case SSTATE_INTERNED_IMMORTAL:
            CHECK(!ascii->state.statically_allocated);
            CHECK_IF_FT(_Py_IsImmortal(op));
            break;
        case SSTATE_INTERNED_IMMORTAL_STATIC:
            CHECK(ascii->state.statically_allocated);
            CHECK_IF_FT(_Py_IsImmortal(op));
            break;
        default:
            Py_UNREACHABLE();
    }
#endif

    return 1;

/* NOTE(review): only CHECK is undefined here; CHECK_IF_GIL and CHECK_IF_FT
   stay defined for the rest of the translation unit. */
#undef CHECK
}
716
717
/* Normalize a freshly built string before handing it to the caller.

   An empty string is replaced by the empty-string singleton and a single
   character string of 1-byte kind by the matching LATIN1() singleton; in
   both cases the reference to `unicode` is released unless it already is
   that singleton. Any other string is returned unchanged. */
PyObject *
_PyUnicode_Result(PyObject *unicode)
{
    assert(_PyUnicode_CHECK(unicode));

    PyObject *singleton = NULL;
    Py_ssize_t length = PyUnicode_GET_LENGTH(unicode);

    if (length == 0) {
        singleton = _PyUnicode_GetEmpty();
    }
    else if (length == 1 && PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
        const Py_UCS1 *chars = PyUnicode_1BYTE_DATA(unicode);
        Py_UCS1 ch = chars[0];
        singleton = LATIN1(ch);
    }

    if (singleton != NULL) {
        if (singleton != unicode) {
            Py_DECREF(unicode);
        }
        return singleton;
    }

    assert(_PyUnicode_CheckConsistency(unicode, 1));
    return unicode;
}
747
9.39k
#define unicode_result _PyUnicode_Result
748
749
/* Return a string with the same value as `unicode`: a new reference to
   the object itself when it is an exact str, otherwise a copy made by
   _PyUnicode_Copy(). */
static PyObject *
unicode_result_unchanged(PyObject *unicode)
{
    if (!PyUnicode_CheckExact(unicode)) {
        /* Subtype -- return genuine unicode string with the same value. */
        return _PyUnicode_Copy(unicode);
    }
    return Py_NewRef(unicode);
}
759
760
/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
761
   ASCII, Latin1, UTF-8, etc. */
762
static char*
763
backslashreplace(PyBytesWriter *writer, char *str,
764
                 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
765
0
{
766
0
    Py_ssize_t size, i;
767
0
    Py_UCS4 ch;
768
0
    int kind;
769
0
    const void *data;
770
771
0
    kind = PyUnicode_KIND(unicode);
772
0
    data = PyUnicode_DATA(unicode);
773
774
0
    size = 0;
775
    /* determine replacement size */
776
0
    for (i = collstart; i < collend; ++i) {
777
0
        Py_ssize_t incr;
778
779
0
        ch = PyUnicode_READ(kind, data, i);
780
0
        if (ch < 0x100)
781
0
            incr = 2+2;
782
0
        else if (ch < 0x10000)
783
0
            incr = 2+4;
784
0
        else {
785
0
            assert(ch <= MAX_UNICODE);
786
0
            incr = 2+8;
787
0
        }
788
0
        if (size > PY_SSIZE_T_MAX - incr) {
789
0
            PyErr_SetString(PyExc_OverflowError,
790
0
                            "encoded result is too long for a Python string");
791
0
            return NULL;
792
0
        }
793
0
        size += incr;
794
0
    }
795
796
0
    str = PyBytesWriter_GrowAndUpdatePointer(writer, size, str);
797
0
    if (str == NULL) {
798
0
        return NULL;
799
0
    }
800
801
    /* generate replacement */
802
0
    for (i = collstart; i < collend; ++i) {
803
0
        ch = PyUnicode_READ(kind, data, i);
804
0
        *str++ = '\\';
805
0
        if (ch >= 0x00010000) {
806
0
            *str++ = 'U';
807
0
            *str++ = Py_hexdigits[(ch>>28)&0xf];
808
0
            *str++ = Py_hexdigits[(ch>>24)&0xf];
809
0
            *str++ = Py_hexdigits[(ch>>20)&0xf];
810
0
            *str++ = Py_hexdigits[(ch>>16)&0xf];
811
0
            *str++ = Py_hexdigits[(ch>>12)&0xf];
812
0
            *str++ = Py_hexdigits[(ch>>8)&0xf];
813
0
        }
814
0
        else if (ch >= 0x100) {
815
0
            *str++ = 'u';
816
0
            *str++ = Py_hexdigits[(ch>>12)&0xf];
817
0
            *str++ = Py_hexdigits[(ch>>8)&0xf];
818
0
        }
819
0
        else
820
0
            *str++ = 'x';
821
0
        *str++ = Py_hexdigits[(ch>>4)&0xf];
822
0
        *str++ = Py_hexdigits[ch&0xf];
823
0
    }
824
0
    return str;
825
0
}
826
827
/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
828
   ASCII, Latin1, UTF-8, etc. */
829
static char*
830
xmlcharrefreplace(PyBytesWriter *writer, char *str,
831
                  PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
832
0
{
833
0
    Py_ssize_t size, i;
834
0
    Py_UCS4 ch;
835
0
    int kind;
836
0
    const void *data;
837
838
0
    kind = PyUnicode_KIND(unicode);
839
0
    data = PyUnicode_DATA(unicode);
840
841
0
    size = 0;
842
    /* determine replacement size */
843
0
    for (i = collstart; i < collend; ++i) {
844
0
        Py_ssize_t incr;
845
846
0
        ch = PyUnicode_READ(kind, data, i);
847
0
        if (ch < 10)
848
0
            incr = 2+1+1;
849
0
        else if (ch < 100)
850
0
            incr = 2+2+1;
851
0
        else if (ch < 1000)
852
0
            incr = 2+3+1;
853
0
        else if (ch < 10000)
854
0
            incr = 2+4+1;
855
0
        else if (ch < 100000)
856
0
            incr = 2+5+1;
857
0
        else if (ch < 1000000)
858
0
            incr = 2+6+1;
859
0
        else {
860
0
            assert(ch <= MAX_UNICODE);
861
0
            incr = 2+7+1;
862
0
        }
863
0
        if (size > PY_SSIZE_T_MAX - incr) {
864
0
            PyErr_SetString(PyExc_OverflowError,
865
0
                            "encoded result is too long for a Python string");
866
0
            return NULL;
867
0
        }
868
0
        size += incr;
869
0
    }
870
871
0
    str = PyBytesWriter_GrowAndUpdatePointer(writer, size, str);
872
0
    if (str == NULL) {
873
0
        return NULL;
874
0
    }
875
876
    /* generate replacement */
877
0
    for (i = collstart; i < collend; ++i) {
878
0
        size = sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
879
0
        if (size < 0) {
880
0
            return NULL;
881
0
        }
882
0
        str += size;
883
0
    }
884
0
    return str;
885
0
}
886
887
/* --- Bloom Filters ----------------------------------------------------- */
888
889
/* stuff to implement simple "bloom filters" for Unicode characters.
890
   to keep things simple, we use a single bitmask, using the least 5
891
   bits from each Unicode character as the bit index. */
892
893
/* the linebreak mask is set up by _PyUnicode_Init() below */
894
895
#if LONG_BIT >= 128
896
#define BLOOM_WIDTH 128
897
#elif LONG_BIT >= 64
898
9.85k
#define BLOOM_WIDTH 64
899
#elif LONG_BIT >= 32
900
#define BLOOM_WIDTH 32
901
#else
902
#error "LONG_BIT is smaller than 32"
903
#endif
904
905
9.66k
#define BLOOM_MASK unsigned long
906
907
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
908
909
4.86k
#define BLOOM(mask, ch)     ((mask &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
910
911
#define BLOOM_LINEBREAK(ch)                                             \
912
0
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
913
0
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
914
915
static inline BLOOM_MASK
916
make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
917
4.84k
{
918
4.84k
#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN)             \
919
4.84k
    do {                                               \
920
4.84k
        TYPE *data = (TYPE *)PTR;                      \
921
4.84k
        TYPE *end = data + LEN;                        \
922
4.84k
        Py_UCS4 ch;                                    \
923
9.83k
        for (; data != end; data++) {                  \
924
4.99k
            ch = *data;                                \
925
4.99k
            MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
926
4.99k
        }                                              \
927
4.84k
        break;                                         \
928
4.84k
    } while (0)
929
930
    /* calculate simple bloom-style bitmask for a given unicode string */
931
932
4.84k
    BLOOM_MASK mask;
933
934
4.84k
    mask = 0;
935
4.84k
    switch (kind) {
936
4.82k
    case PyUnicode_1BYTE_KIND:
937
4.82k
        BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
938
4.82k
        break;
939
19
    case PyUnicode_2BYTE_KIND:
940
19
        BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
941
19
        break;
942
0
    case PyUnicode_4BYTE_KIND:
943
0
        BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
944
0
        break;
945
0
    default:
946
0
        Py_UNREACHABLE();
947
4.84k
    }
948
4.84k
    return mask;
949
950
4.84k
#undef BLOOM_UPDATE
951
4.84k
}
952
953
/* Compilation of templated routines */
954
955
1.43k
#define STRINGLIB_GET_EMPTY() _PyUnicode_GetEmpty()
956
957
#include "stringlib/asciilib.h"
958
#include "stringlib/fastsearch.h"
959
#include "stringlib/partition.h"
960
#include "stringlib/split.h"
961
#include "stringlib/count.h"
962
#include "stringlib/find.h"
963
#include "stringlib/find_max_char.h"
964
#include "stringlib/undef.h"
965
966
#include "stringlib/ucs1lib.h"
967
#include "stringlib/fastsearch.h"
968
#include "stringlib/partition.h"
969
#include "stringlib/split.h"
970
#include "stringlib/count.h"
971
#include "stringlib/find.h"
972
#include "stringlib/replace.h"
973
#include "stringlib/repr.h"
974
#include "stringlib/find_max_char.h"
975
#include "stringlib/undef.h"
976
977
#include "stringlib/ucs2lib.h"
978
#include "stringlib/fastsearch.h"
979
#include "stringlib/partition.h"
980
#include "stringlib/split.h"
981
#include "stringlib/count.h"
982
#include "stringlib/find.h"
983
#include "stringlib/replace.h"
984
#include "stringlib/repr.h"
985
#include "stringlib/find_max_char.h"
986
#include "stringlib/undef.h"
987
988
#include "stringlib/ucs4lib.h"
989
#include "stringlib/fastsearch.h"
990
#include "stringlib/partition.h"
991
#include "stringlib/split.h"
992
#include "stringlib/count.h"
993
#include "stringlib/find.h"
994
#include "stringlib/replace.h"
995
#include "stringlib/repr.h"
996
#include "stringlib/find_max_char.h"
997
#include "stringlib/undef.h"
998
999
#undef STRINGLIB_GET_EMPTY
1000
1001
/* --- Unicode Object ----------------------------------------------------- */
1002
1003
static inline Py_ssize_t
1004
findchar(const void *s, int kind,
1005
         Py_ssize_t size, Py_UCS4 ch,
1006
         int direction)
1007
265k
{
1008
265k
    switch (kind) {
1009
250k
    case PyUnicode_1BYTE_KIND:
1010
250k
        if ((Py_UCS1) ch != ch)
1011
1.78k
            return -1;
1012
248k
        if (direction > 0)
1013
245k
            return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
1014
3.34k
        else
1015
3.34k
            return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
1016
14.2k
    case PyUnicode_2BYTE_KIND:
1017
14.2k
        if ((Py_UCS2) ch != ch)
1018
0
            return -1;
1019
14.2k
        if (direction > 0)
1020
13.3k
            return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
1021
998
        else
1022
998
            return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
1023
1.36k
    case PyUnicode_4BYTE_KIND:
1024
1.36k
        if (direction > 0)
1025
9
            return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
1026
1.35k
        else
1027
1.35k
            return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
1028
0
    default:
1029
0
        Py_UNREACHABLE();
1030
265k
    }
1031
265k
}
1032
1033
#ifdef Py_DEBUG
1034
/* Fill the data of a Unicode string with invalid characters to detect bugs
1035
   earlier.
1036
1037
   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
1038
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
1039
   invalid character in Unicode 6.0. */
1040
static void
1041
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
1042
{
1043
    int kind = PyUnicode_KIND(unicode);
1044
    Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
1045
    Py_ssize_t length = _PyUnicode_LENGTH(unicode);
1046
    if (length <= old_length)
1047
        return;
1048
    memset(data + old_length * kind, 0xff, (length - old_length) * kind);
1049
}
1050
#endif
1051
1052
static PyObject*
1053
resize_copy(PyObject *unicode, Py_ssize_t length)
1054
0
{
1055
0
    Py_ssize_t copy_length;
1056
0
    PyObject *copy;
1057
1058
0
    copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1059
0
    if (copy == NULL)
1060
0
        return NULL;
1061
1062
0
    copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
1063
0
    _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
1064
0
    return copy;
1065
0
}
1066
1067
PyObject*
1068
_PyUnicode_ResizeCompact(PyObject *unicode, Py_ssize_t length)
1069
136k
{
1070
136k
    Py_ssize_t char_size;
1071
136k
    Py_ssize_t struct_size;
1072
136k
    Py_ssize_t new_size;
1073
136k
    PyObject *new_unicode;
1074
#ifdef Py_DEBUG
1075
    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1076
#endif
1077
1078
136k
    if (!_PyUnicode_IsModifiable(unicode)) {
1079
0
        PyObject *copy = resize_copy(unicode, length);
1080
0
        if (copy == NULL) {
1081
0
            return NULL;
1082
0
        }
1083
0
        Py_DECREF(unicode);
1084
0
        return copy;
1085
0
    }
1086
136k
    assert(PyUnicode_IS_COMPACT(unicode));
1087
1088
136k
    char_size = PyUnicode_KIND(unicode);
1089
136k
    if (PyUnicode_IS_ASCII(unicode))
1090
96.2k
        struct_size = sizeof(PyASCIIObject);
1091
40.2k
    else
1092
40.2k
        struct_size = sizeof(PyCompactUnicodeObject);
1093
1094
136k
    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1095
0
        PyErr_NoMemory();
1096
0
        return NULL;
1097
0
    }
1098
136k
    new_size = (struct_size + (length + 1) * char_size);
1099
1100
136k
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1101
0
        PyMem_Free(_PyUnicode_UTF8(unicode));
1102
0
        PyUnicode_SET_UTF8_LENGTH(unicode, 0);
1103
0
        PyUnicode_SET_UTF8(unicode, NULL);
1104
0
    }
1105
#ifdef Py_TRACE_REFS
1106
    _Py_ForgetReference(unicode);
1107
#endif
1108
136k
    _PyReftracerTrack(unicode, PyRefTracer_DESTROY);
1109
1110
136k
    new_unicode = (PyObject *)PyObject_Realloc(unicode, new_size);
1111
136k
    if (new_unicode == NULL) {
1112
0
        _Py_NewReferenceNoTotal(unicode);
1113
0
        PyErr_NoMemory();
1114
0
        return NULL;
1115
0
    }
1116
136k
    unicode = new_unicode;
1117
136k
    _Py_NewReferenceNoTotal(unicode);
1118
1119
136k
    _PyUnicode_LENGTH(unicode) = length;
1120
#ifdef Py_DEBUG
1121
    unicode_fill_invalid(unicode, old_length);
1122
#endif
1123
136k
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1124
136k
                    length, 0);
1125
136k
    assert(_PyUnicode_CheckConsistency(unicode, 0));
1126
136k
    return unicode;
1127
136k
}
1128
1129
static int
1130
resize_inplace(PyObject *unicode, Py_ssize_t length)
1131
0
{
1132
0
    assert(!PyUnicode_IS_COMPACT(unicode));
1133
0
    assert(Py_REFCNT(unicode) == 1);
1134
1135
0
    Py_ssize_t new_size;
1136
0
    Py_ssize_t char_size;
1137
0
    int share_utf8;
1138
0
    void *data;
1139
#ifdef Py_DEBUG
1140
    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1141
#endif
1142
1143
0
    data = _PyUnicode_DATA_ANY(unicode);
1144
0
    char_size = PyUnicode_KIND(unicode);
1145
0
    share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
1146
1147
0
    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1148
0
        PyErr_NoMemory();
1149
0
        return -1;
1150
0
    }
1151
0
    new_size = (length + 1) * char_size;
1152
1153
0
    if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1154
0
    {
1155
0
        PyMem_Free(_PyUnicode_UTF8(unicode));
1156
0
        PyUnicode_SET_UTF8_LENGTH(unicode, 0);
1157
0
        PyUnicode_SET_UTF8(unicode, NULL);
1158
0
    }
1159
1160
0
    data = (PyObject *)PyObject_Realloc(data, new_size);
1161
0
    if (data == NULL) {
1162
0
        PyErr_NoMemory();
1163
0
        return -1;
1164
0
    }
1165
0
    _PyUnicode_DATA_ANY(unicode) = data;
1166
0
    if (share_utf8) {
1167
0
        PyUnicode_SET_UTF8_LENGTH(unicode, length);
1168
0
        PyUnicode_SET_UTF8(unicode, data);
1169
0
    }
1170
0
    _PyUnicode_LENGTH(unicode) = length;
1171
0
    PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
1172
#ifdef Py_DEBUG
1173
    unicode_fill_invalid(unicode, old_length);
1174
#endif
1175
1176
    /* check for integer overflow */
1177
0
    if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
1178
0
        PyErr_NoMemory();
1179
0
        return -1;
1180
0
    }
1181
0
    assert(_PyUnicode_CheckConsistency(unicode, 0));
1182
0
    return 0;
1183
0
}
1184
1185
static const char*
1186
unicode_kind_name(PyObject *unicode)
1187
0
{
1188
    /* don't check consistency: unicode_kind_name() is called from
1189
       _PyUnicode_Dump() */
1190
0
    if (!PyUnicode_IS_COMPACT(unicode))
1191
0
    {
1192
0
        switch (PyUnicode_KIND(unicode))
1193
0
        {
1194
0
        case PyUnicode_1BYTE_KIND:
1195
0
            if (PyUnicode_IS_ASCII(unicode))
1196
0
                return "legacy ascii";
1197
0
            else
1198
0
                return "legacy latin1";
1199
0
        case PyUnicode_2BYTE_KIND:
1200
0
            return "legacy UCS2";
1201
0
        case PyUnicode_4BYTE_KIND:
1202
0
            return "legacy UCS4";
1203
0
        default:
1204
0
            return "<legacy invalid kind>";
1205
0
        }
1206
0
    }
1207
0
    switch (PyUnicode_KIND(unicode)) {
1208
0
    case PyUnicode_1BYTE_KIND:
1209
0
        if (PyUnicode_IS_ASCII(unicode))
1210
0
            return "ascii";
1211
0
        else
1212
0
            return "latin1";
1213
0
    case PyUnicode_2BYTE_KIND:
1214
0
        return "UCS2";
1215
0
    case PyUnicode_4BYTE_KIND:
1216
0
        return "UCS4";
1217
0
    default:
1218
0
        return "<invalid compact kind>";
1219
0
    }
1220
0
}
1221
1222
#ifdef Py_DEBUG
1223
/* Functions wrapping macros for use in debugger */
1224
/* Function form of PyUnicode_UTF8(), callable from a debugger. */
const char *
_PyUnicode_utf8(void *unicode_raw)
{
    return PyUnicode_UTF8(_PyObject_CAST(unicode_raw));
}
1228
1229
/* Function form of _PyUnicode_COMPACT_DATA(), callable from a debugger. */
const void *
_PyUnicode_compact_data(void *unicode_raw)
{
    return _PyUnicode_COMPACT_DATA(_PyObject_CAST(unicode_raw));
}
1233
const void *_PyUnicode_data(void *unicode_raw) {
1234
    PyObject *unicode = _PyObject_CAST(unicode_raw);
1235
    printf("obj %p\n", (void*)unicode);
1236
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1237
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1238
    printf("ascii op %p\n", (void*)(_PyASCIIObject_CAST(unicode) + 1));
1239
    printf("compact op %p\n", (void*)(_PyCompactUnicodeObject_CAST(unicode) + 1));
1240
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1241
    return PyUnicode_DATA(unicode);
1242
}
1243
1244
void
1245
_PyUnicode_Dump(PyObject *op)
1246
{
1247
    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
1248
    PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
1249
    PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
1250
    const void *data;
1251
1252
    if (ascii->state.compact)
1253
    {
1254
        if (ascii->state.ascii)
1255
            data = (ascii + 1);
1256
        else
1257
            data = (compact + 1);
1258
    }
1259
    else
1260
        data = unicode->data.any;
1261
    printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);
1262
1263
    if (!ascii->state.ascii) {
1264
        printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
1265
    }
1266
    printf(", data=%p\n", data);
1267
}
1268
#endif
1269
1270
1271
PyObject *
1272
PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1273
24.4M
{
1274
    /* Optimization for empty strings */
1275
24.4M
    if (size == 0) {
1276
12.1M
        return _PyUnicode_GetEmpty();
1277
12.1M
    }
1278
1279
12.3M
    PyObject *obj;
1280
12.3M
    PyCompactUnicodeObject *unicode;
1281
12.3M
    void *data;
1282
12.3M
    int kind;
1283
12.3M
    int is_ascii;
1284
12.3M
    Py_ssize_t char_size;
1285
12.3M
    Py_ssize_t struct_size;
1286
1287
12.3M
    is_ascii = 0;
1288
12.3M
    struct_size = sizeof(PyCompactUnicodeObject);
1289
12.3M
    if (maxchar < 128) {
1290
3.88M
        kind = PyUnicode_1BYTE_KIND;
1291
3.88M
        char_size = 1;
1292
3.88M
        is_ascii = 1;
1293
3.88M
        struct_size = sizeof(PyASCIIObject);
1294
3.88M
    }
1295
8.48M
    else if (maxchar < 256) {
1296
208k
        kind = PyUnicode_1BYTE_KIND;
1297
208k
        char_size = 1;
1298
208k
    }
1299
8.27M
    else if (maxchar < 65536) {
1300
7.58M
        kind = PyUnicode_2BYTE_KIND;
1301
7.58M
        char_size = 2;
1302
7.58M
    }
1303
690k
    else {
1304
690k
        if (maxchar > MAX_UNICODE) {
1305
0
            PyErr_SetString(PyExc_SystemError,
1306
0
                            "invalid maximum character passed to PyUnicode_New");
1307
0
            return NULL;
1308
0
        }
1309
690k
        kind = PyUnicode_4BYTE_KIND;
1310
690k
        char_size = 4;
1311
690k
    }
1312
1313
    /* Ensure we won't overflow the size. */
1314
12.3M
    if (size < 0) {
1315
0
        PyErr_SetString(PyExc_SystemError,
1316
0
                        "Negative size passed to PyUnicode_New");
1317
0
        return NULL;
1318
0
    }
1319
12.3M
    if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1320
0
        return PyErr_NoMemory();
1321
1322
    /* Duplicated allocation code from _PyObject_New() instead of a call to
1323
     * PyObject_New() so we are able to allocate space for the object and
1324
     * it's data buffer.
1325
     */
1326
12.3M
    obj = (PyObject *) PyObject_Malloc(struct_size + (size + 1) * char_size);
1327
12.3M
    if (obj == NULL) {
1328
0
        return PyErr_NoMemory();
1329
0
    }
1330
12.3M
    _PyObject_Init(obj, &PyUnicode_Type);
1331
1332
12.3M
    unicode = (PyCompactUnicodeObject *)obj;
1333
12.3M
    if (is_ascii)
1334
3.88M
        data = ((PyASCIIObject*)obj) + 1;
1335
8.48M
    else
1336
8.48M
        data = unicode + 1;
1337
12.3M
    _PyUnicode_LENGTH(unicode) = size;
1338
12.3M
    _PyUnicode_HASH(unicode) = -1;
1339
12.3M
    _PyUnicode_STATE(unicode).interned = 0;
1340
12.3M
    _PyUnicode_STATE(unicode).kind = kind;
1341
12.3M
    _PyUnicode_STATE(unicode).compact = 1;
1342
12.3M
    _PyUnicode_STATE(unicode).ascii = is_ascii;
1343
12.3M
    _PyUnicode_STATE(unicode).statically_allocated = 0;
1344
12.3M
    if (is_ascii) {
1345
3.88M
        ((char*)data)[size] = 0;
1346
3.88M
    }
1347
8.48M
    else if (kind == PyUnicode_1BYTE_KIND) {
1348
208k
        ((char*)data)[size] = 0;
1349
208k
        unicode->utf8 = NULL;
1350
208k
        unicode->utf8_length = 0;
1351
208k
    }
1352
8.27M
    else {
1353
8.27M
        unicode->utf8 = NULL;
1354
8.27M
        unicode->utf8_length = 0;
1355
8.27M
        if (kind == PyUnicode_2BYTE_KIND)
1356
7.58M
            ((Py_UCS2*)data)[size] = 0;
1357
690k
        else /* kind == PyUnicode_4BYTE_KIND */
1358
690k
            ((Py_UCS4*)data)[size] = 0;
1359
8.27M
    }
1360
#ifdef Py_DEBUG
1361
    unicode_fill_invalid((PyObject*)unicode, 0);
1362
#endif
1363
12.3M
    assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1364
12.3M
    return obj;
1365
12.3M
}
1366
1367
static int
1368
unicode_check_modifiable(PyObject *unicode)
1369
585
{
1370
585
    if (!_PyUnicode_IsModifiable(unicode)) {
1371
0
        PyErr_SetString(PyExc_SystemError,
1372
0
                        "Cannot modify a string currently used");
1373
0
        return -1;
1374
0
    }
1375
585
    return 0;
1376
585
}
1377
1378
static int
1379
_copy_characters(PyObject *to, Py_ssize_t to_start,
1380
                 PyObject *from, Py_ssize_t from_start,
1381
                 Py_ssize_t how_many, int check_maxchar)
1382
781k
{
1383
781k
    int from_kind, to_kind;
1384
781k
    const void *from_data;
1385
781k
    void *to_data;
1386
1387
781k
    assert(0 <= how_many);
1388
781k
    assert(0 <= from_start);
1389
781k
    assert(0 <= to_start);
1390
781k
    assert(PyUnicode_Check(from));
1391
781k
    assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1392
1393
781k
    assert(to == NULL || PyUnicode_Check(to));
1394
1395
781k
    if (how_many == 0) {
1396
3.92k
        return 0;
1397
3.92k
    }
1398
1399
781k
    assert(to != NULL);
1400
777k
    assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1401
1402
777k
    from_kind = PyUnicode_KIND(from);
1403
777k
    from_data = PyUnicode_DATA(from);
1404
777k
    to_kind = PyUnicode_KIND(to);
1405
777k
    to_data = PyUnicode_DATA(to);
1406
1407
#ifdef Py_DEBUG
1408
    if (!check_maxchar
1409
        && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1410
    {
1411
        Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1412
        Py_UCS4 ch;
1413
        Py_ssize_t i;
1414
        for (i=0; i < how_many; i++) {
1415
            ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1416
            assert(ch <= to_maxchar);
1417
        }
1418
    }
1419
#endif
1420
1421
777k
    if (from_kind == to_kind) {
1422
477k
        if (check_maxchar
1423
0
            && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1424
0
        {
1425
            /* Writing Latin-1 characters into an ASCII string requires to
1426
               check that all written characters are pure ASCII */
1427
0
            Py_UCS4 max_char;
1428
0
            max_char = ucs1lib_find_max_char(from_data,
1429
0
                                             (const Py_UCS1*)from_data + how_many);
1430
0
            if (max_char >= 128)
1431
0
                return -1;
1432
0
        }
1433
477k
        memcpy((char*)to_data + to_kind * to_start,
1434
477k
                  (const char*)from_data + from_kind * from_start,
1435
477k
                  to_kind * how_many);
1436
477k
    }
1437
299k
    else if (from_kind == PyUnicode_1BYTE_KIND
1438
151k
             && to_kind == PyUnicode_2BYTE_KIND)
1439
108k
    {
1440
108k
        _PyUnicode_CONVERT_BYTES(
1441
108k
            Py_UCS1, Py_UCS2,
1442
108k
            PyUnicode_1BYTE_DATA(from) + from_start,
1443
108k
            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1444
108k
            PyUnicode_2BYTE_DATA(to) + to_start
1445
108k
            );
1446
108k
    }
1447
191k
    else if (from_kind == PyUnicode_1BYTE_KIND
1448
43.3k
             && to_kind == PyUnicode_4BYTE_KIND)
1449
43.3k
    {
1450
43.3k
        _PyUnicode_CONVERT_BYTES(
1451
43.3k
            Py_UCS1, Py_UCS4,
1452
43.3k
            PyUnicode_1BYTE_DATA(from) + from_start,
1453
43.3k
            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1454
43.3k
            PyUnicode_4BYTE_DATA(to) + to_start
1455
43.3k
            );
1456
43.3k
    }
1457
148k
    else if (from_kind == PyUnicode_2BYTE_KIND
1458
109k
             && to_kind == PyUnicode_4BYTE_KIND)
1459
101k
    {
1460
101k
        _PyUnicode_CONVERT_BYTES(
1461
101k
            Py_UCS2, Py_UCS4,
1462
101k
            PyUnicode_2BYTE_DATA(from) + from_start,
1463
101k
            PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1464
101k
            PyUnicode_4BYTE_DATA(to) + to_start
1465
101k
            );
1466
101k
    }
1467
46.6k
    else {
1468
46.6k
        assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1469
1470
46.6k
        if (!check_maxchar) {
1471
46.6k
            if (from_kind == PyUnicode_2BYTE_KIND
1472
7.44k
                && to_kind == PyUnicode_1BYTE_KIND)
1473
7.44k
            {
1474
7.44k
                _PyUnicode_CONVERT_BYTES(
1475
7.44k
                    Py_UCS2, Py_UCS1,
1476
7.44k
                    PyUnicode_2BYTE_DATA(from) + from_start,
1477
7.44k
                    PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1478
7.44k
                    PyUnicode_1BYTE_DATA(to) + to_start
1479
7.44k
                    );
1480
7.44k
            }
1481
39.2k
            else if (from_kind == PyUnicode_4BYTE_KIND
1482
39.2k
                     && to_kind == PyUnicode_1BYTE_KIND)
1483
25.6k
            {
1484
25.6k
                _PyUnicode_CONVERT_BYTES(
1485
25.6k
                    Py_UCS4, Py_UCS1,
1486
25.6k
                    PyUnicode_4BYTE_DATA(from) + from_start,
1487
25.6k
                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1488
25.6k
                    PyUnicode_1BYTE_DATA(to) + to_start
1489
25.6k
                    );
1490
25.6k
            }
1491
13.6k
            else if (from_kind == PyUnicode_4BYTE_KIND
1492
13.6k
                     && to_kind == PyUnicode_2BYTE_KIND)
1493
13.6k
            {
1494
13.6k
                _PyUnicode_CONVERT_BYTES(
1495
13.6k
                    Py_UCS4, Py_UCS2,
1496
13.6k
                    PyUnicode_4BYTE_DATA(from) + from_start,
1497
13.6k
                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1498
13.6k
                    PyUnicode_2BYTE_DATA(to) + to_start
1499
13.6k
                    );
1500
13.6k
            }
1501
0
            else {
1502
0
                Py_UNREACHABLE();
1503
0
            }
1504
46.6k
        }
1505
0
        else {
1506
0
            const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1507
0
            Py_UCS4 ch;
1508
0
            Py_ssize_t i;
1509
1510
0
            for (i=0; i < how_many; i++) {
1511
0
                ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1512
0
                if (ch > to_maxchar)
1513
0
                    return -1;
1514
0
                PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1515
0
            }
1516
0
        }
1517
46.6k
    }
1518
777k
    return 0;
1519
777k
}
1520
1521
void
1522
_PyUnicode_FastCopyCharacters(
1523
    PyObject *to, Py_ssize_t to_start,
1524
    PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1525
781k
{
1526
781k
    (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1527
781k
}
1528
1529
Py_ssize_t
1530
PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1531
                         PyObject *from, Py_ssize_t from_start,
1532
                         Py_ssize_t how_many)
1533
0
{
1534
0
    int err;
1535
1536
0
    if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1537
0
        PyErr_BadInternalCall();
1538
0
        return -1;
1539
0
    }
1540
1541
0
    if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
1542
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
1543
0
        return -1;
1544
0
    }
1545
0
    if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
1546
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
1547
0
        return -1;
1548
0
    }
1549
0
    if (how_many < 0) {
1550
0
        PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1551
0
        return -1;
1552
0
    }
1553
0
    how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
1554
0
    if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1555
0
        PyErr_Format(PyExc_SystemError,
1556
0
                     "Cannot write %zi characters at %zi "
1557
0
                     "in a string of %zi characters",
1558
0
                     how_many, to_start, PyUnicode_GET_LENGTH(to));
1559
0
        return -1;
1560
0
    }
1561
1562
0
    if (how_many == 0)
1563
0
        return 0;
1564
1565
0
    if (unicode_check_modifiable(to))
1566
0
        return -1;
1567
1568
0
    err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1569
0
    if (err) {
1570
0
        PyErr_Format(PyExc_SystemError,
1571
0
                     "Cannot copy %s characters "
1572
0
                     "into a string of %s characters",
1573
0
                     unicode_kind_name(from),
1574
0
                     unicode_kind_name(to));
1575
0
        return -1;
1576
0
    }
1577
0
    return how_many;
1578
0
}
1579
1580
/* Find the maximum code point and count the number of surrogate pairs so a
1581
   correct string length can be computed before converting a string to UCS4.
1582
   This function counts single surrogates as a character and not as a pair.
1583
1584
   Return 0 on success, or -1 on error. */
1585
static int
1586
find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1587
                        Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1588
5.23k
{
1589
5.23k
    const wchar_t *iter;
1590
5.23k
    Py_UCS4 ch;
1591
1592
5.23k
    assert(num_surrogates != NULL && maxchar != NULL);
1593
5.23k
    *num_surrogates = 0;
1594
5.23k
    *maxchar = 0;
1595
1596
152k
    for (iter = begin; iter < end; ) {
1597
#if SIZEOF_WCHAR_T == 2
1598
        if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1599
            && (iter+1) < end
1600
            && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1601
        {
1602
            ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1603
            ++(*num_surrogates);
1604
            iter += 2;
1605
        }
1606
        else
1607
#endif
1608
147k
        {
1609
147k
            ch = *iter;
1610
147k
            iter++;
1611
147k
        }
1612
147k
        if (ch > *maxchar) {
1613
22.4k
            *maxchar = ch;
1614
22.4k
            if (*maxchar > MAX_UNICODE) {
1615
0
                PyErr_Format(PyExc_ValueError,
1616
0
                             "character U+%x is not in range [U+0000; U+%x]",
1617
0
                             ch, MAX_UNICODE);
1618
0
                return -1;
1619
0
            }
1620
22.4k
        }
1621
147k
    }
1622
5.23k
    return 0;
1623
5.23k
}
1624
1625
static void
1626
unicode_dealloc(PyObject *unicode)
1627
12.2M
{
1628
#ifdef Py_DEBUG
1629
    if (!unicode_is_finalizing() && unicode_is_singleton(unicode)) {
1630
        _Py_FatalRefcountError("deallocating an Unicode singleton");
1631
    }
1632
#endif
1633
24.4M
    if (_PyUnicode_STATE(unicode).statically_allocated) {
1634
        /* This should never get called, but we also don't want to SEGV if
1635
        * we accidentally decref an immortal string out of existence. Since
1636
        * the string is an immortal object, just re-set the reference count.
1637
        */
1638
#ifdef Py_DEBUG
1639
        Py_UNREACHABLE();
1640
#endif
1641
0
        _Py_SetImmortal(unicode);
1642
0
        return;
1643
0
    }
1644
12.2M
    switch (_PyUnicode_STATE(unicode).interned) {
1645
12.0M
        case SSTATE_NOT_INTERNED:
1646
12.0M
            break;
1647
136k
        case SSTATE_INTERNED_MORTAL:
1648
            /* Remove the object from the intern dict.
1649
             * Before doing so, we set the refcount to 2: the key and value
1650
             * in the interned_dict.
1651
             */
1652
136k
            assert(Py_REFCNT(unicode) == 0);
1653
136k
            Py_SET_REFCNT(unicode, 2);
1654
#ifdef Py_REF_DEBUG
1655
            /* let's be pedantic with the ref total */
1656
            _Py_IncRefTotal(_PyThreadState_GET());
1657
            _Py_IncRefTotal(_PyThreadState_GET());
1658
#endif
1659
136k
            PyInterpreterState *interp = _PyInterpreterState_GET();
1660
136k
            PyObject *interned = get_interned_dict(interp);
1661
136k
            assert(interned != NULL);
1662
136k
            PyObject *popped;
1663
136k
            int r = PyDict_Pop(interned, unicode, &popped);
1664
136k
            if (r == -1) {
1665
0
                PyErr_FormatUnraisable("Exception ignored while "
1666
0
                                       "removing an interned string %R",
1667
0
                                       unicode);
1668
                // We don't know what happened to the string. It's probably
1669
                // best to leak it:
1670
                // - if it was popped, there are no more references to it
1671
                //   so it can't cause trouble (except wasted memory)
1672
                // - if it wasn't popped, it'll remain interned
1673
0
                _Py_SetImmortal(unicode);
1674
0
                _PyUnicode_STATE(unicode).interned = SSTATE_INTERNED_IMMORTAL;
1675
0
                return;
1676
0
            }
1677
136k
            if (r == 0) {
1678
                // The interned string was not found in the interned_dict.
1679
#ifdef Py_DEBUG
1680
                Py_UNREACHABLE();
1681
#endif
1682
0
                _Py_SetImmortal(unicode);
1683
0
                return;
1684
0
            }
1685
            // Successfully popped.
1686
136k
            assert(popped == unicode);
1687
            // Only our `popped` reference should be left; remove it too.
1688
136k
            assert(Py_REFCNT(unicode) == 1);
1689
136k
            Py_SET_REFCNT(unicode, 0);
1690
#ifdef Py_REF_DEBUG
1691
            /* let's be pedantic with the ref total */
1692
            _Py_DecRefTotal(_PyThreadState_GET());
1693
#endif
1694
136k
            break;
1695
0
        default:
1696
            // As with `statically_allocated` above.
1697
#ifdef Py_REF_DEBUG
1698
            Py_UNREACHABLE();
1699
#endif
1700
0
            _Py_SetImmortal(unicode);
1701
0
            return;
1702
12.2M
    }
1703
12.2M
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1704
2.75k
        PyMem_Free(_PyUnicode_UTF8(unicode));
1705
2.75k
    }
1706
12.2M
    if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
1707
0
        PyMem_Free(_PyUnicode_DATA_ANY(unicode));
1708
0
    }
1709
1710
12.2M
    Py_TYPE(unicode)->tp_free(unicode);
1711
12.2M
}
1712
1713
#ifdef Py_DEBUG
1714
static int
1715
unicode_is_singleton(PyObject *unicode)
1716
{
1717
    if (unicode == &_Py_STR(empty)) {
1718
        return 1;
1719
    }
1720
1721
    PyASCIIObject *ascii = _PyASCIIObject_CAST(unicode);
1722
    if (ascii->length == 1) {
1723
        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1724
        if (ch < 256 && LATIN1(ch) == unicode) {
1725
            return 1;
1726
        }
1727
    }
1728
    return 0;
1729
}
1730
#endif
1731
1732
int
1733
_PyUnicode_IsModifiable(PyObject *unicode)
1734
148k
{
1735
148k
    assert(_PyUnicode_CHECK(unicode));
1736
148k
    if (!_PyObject_IsUniquelyReferenced(unicode))
1737
346
        return 0;
1738
147k
    if (PyUnicode_HASH(unicode) != -1)
1739
0
        return 0;
1740
147k
    if (PyUnicode_CHECK_INTERNED(unicode))
1741
0
        return 0;
1742
147k
    if (!PyUnicode_CheckExact(unicode))
1743
0
        return 0;
1744
#ifdef Py_DEBUG
1745
    /* singleton refcount is greater than 1 */
1746
    assert(!unicode_is_singleton(unicode));
1747
#endif
1748
147k
    return 1;
1749
147k
}
1750
1751
static int
1752
unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1753
4.98k
{
1754
4.98k
    PyObject *unicode;
1755
4.98k
    Py_ssize_t old_length;
1756
1757
4.98k
    assert(p_unicode != NULL);
1758
4.98k
    unicode = *p_unicode;
1759
1760
4.98k
    assert(unicode != NULL);
1761
4.98k
    assert(PyUnicode_Check(unicode));
1762
4.98k
    assert(0 <= length);
1763
1764
4.98k
    old_length = PyUnicode_GET_LENGTH(unicode);
1765
4.98k
    if (old_length == length)
1766
0
        return 0;
1767
1768
4.98k
    if (length == 0) {
1769
0
        PyObject *empty = _PyUnicode_GetEmpty();
1770
0
        Py_SETREF(*p_unicode, empty);
1771
0
        return 0;
1772
0
    }
1773
1774
4.98k
    if (!_PyUnicode_IsModifiable(unicode)) {
1775
0
        PyObject *copy = resize_copy(unicode, length);
1776
0
        if (copy == NULL)
1777
0
            return -1;
1778
0
        Py_SETREF(*p_unicode, copy);
1779
0
        return 0;
1780
0
    }
1781
1782
4.98k
    if (PyUnicode_IS_COMPACT(unicode)) {
1783
4.98k
        PyObject *new_unicode = _PyUnicode_ResizeCompact(unicode, length);
1784
4.98k
        if (new_unicode == NULL)
1785
0
            return -1;
1786
4.98k
        *p_unicode = new_unicode;
1787
4.98k
        return 0;
1788
4.98k
    }
1789
0
    return resize_inplace(unicode, length);
1790
4.98k
}
1791
1792
int
1793
PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1794
0
{
1795
0
    PyObject *unicode;
1796
0
    if (p_unicode == NULL) {
1797
0
        PyErr_BadInternalCall();
1798
0
        return -1;
1799
0
    }
1800
0
    unicode = *p_unicode;
1801
0
    if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
1802
0
    {
1803
0
        PyErr_BadInternalCall();
1804
0
        return -1;
1805
0
    }
1806
0
    return unicode_resize(p_unicode, length);
1807
0
}
1808
1809
static PyObject*
1810
get_latin1_char(Py_UCS1 ch)
1811
4.18M
{
1812
4.18M
    PyObject *o = LATIN1(ch);
1813
4.18M
    return o;
1814
4.18M
}
1815
1816
static PyObject*
1817
unicode_char(Py_UCS4 ch)
1818
10.9M
{
1819
10.9M
    PyObject *unicode;
1820
1821
10.9M
    assert(ch <= MAX_UNICODE);
1822
1823
10.9M
    if (ch < 256) {
1824
3.26M
        return get_latin1_char(ch);
1825
3.26M
    }
1826
1827
7.67M
    unicode = PyUnicode_New(1, ch);
1828
7.67M
    if (unicode == NULL)
1829
0
        return NULL;
1830
1831
7.67M
    assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
1832
15.3M
    if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
1833
7.06M
        PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1834
7.06M
    } else {
1835
613k
        assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1836
613k
        PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1837
613k
    }
1838
7.67M
    assert(_PyUnicode_CheckConsistency(unicode, 1));
1839
7.67M
    return unicode;
1840
7.67M
}
1841
1842
1843
static inline void
1844
unicode_write_widechar(int kind, void *data,
1845
                       const wchar_t *u, Py_ssize_t size,
1846
                       Py_ssize_t num_surrogates)
1847
5.23k
{
1848
5.23k
    switch (kind) {
1849
5.23k
    case PyUnicode_1BYTE_KIND:
1850
5.23k
        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char, u, u + size, data);
1851
5.23k
        break;
1852
1853
0
    case PyUnicode_2BYTE_KIND:
1854
#if SIZEOF_WCHAR_T == 2
1855
        memcpy(data, u, size * 2);
1856
#else
1857
0
        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2, u, u + size, data);
1858
0
#endif
1859
0
        break;
1860
1861
0
    case PyUnicode_4BYTE_KIND:
1862
0
    {
1863
#if SIZEOF_WCHAR_T == 2
1864
        // Convert a 16-bits wchar_t representation to UCS4, this will decode
1865
        // surrogate pairs.
1866
        const wchar_t *end = u + size;
1867
        Py_UCS4 *ucs4_out = (Py_UCS4*)data;
1868
#  ifndef NDEBUG
1869
        Py_UCS4 *ucs4_end = (Py_UCS4*)data + (size - num_surrogates);
1870
#  endif
1871
        for (const wchar_t *iter = u; iter < end; ) {
1872
            assert(ucs4_out < ucs4_end);
1873
            if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1874
                && (iter+1) < end
1875
                && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1876
            {
1877
                *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1878
                iter += 2;
1879
            }
1880
            else {
1881
                *ucs4_out++ = *iter;
1882
                iter++;
1883
            }
1884
        }
1885
        assert(ucs4_out == ucs4_end);
1886
#else
1887
0
        assert(num_surrogates == 0);
1888
0
        memcpy(data, u, size * 4);
1889
0
#endif
1890
0
        break;
1891
0
    }
1892
0
    default:
1893
0
        Py_UNREACHABLE();
1894
5.23k
    }
1895
5.23k
}
1896
1897
1898
PyObject *
1899
PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
1900
5.27k
{
1901
5.27k
    PyObject *unicode;
1902
5.27k
    Py_UCS4 maxchar = 0;
1903
5.27k
    Py_ssize_t num_surrogates;
1904
1905
5.27k
    if (u == NULL && size != 0) {
1906
0
        PyErr_BadInternalCall();
1907
0
        return NULL;
1908
0
    }
1909
1910
5.27k
    if (size == -1) {
1911
798
        size = wcslen(u);
1912
798
    }
1913
1914
    /* If the Unicode data is known at construction time, we can apply
1915
       some optimizations which share commonly used objects. */
1916
1917
    /* Optimization for empty strings */
1918
5.27k
    if (size == 0)
1919
38
        _Py_RETURN_UNICODE_EMPTY();
1920
1921
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
1922
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
1923
       non-Unicode locales and hence needs conversion to UCS-4 first. */
1924
    if (_Py_LocaleUsesNonUnicodeWchar()) {
1925
        wchar_t* converted = _Py_DecodeNonUnicodeWchar(u, size);
1926
        if (!converted) {
1927
            return NULL;
1928
        }
1929
        PyObject *unicode = _PyUnicode_FromUCS4(converted, size);
1930
        PyMem_Free(converted);
1931
        return unicode;
1932
    }
1933
#endif
1934
1935
    /* Single character Unicode objects in the Latin-1 range are
1936
       shared when using this constructor */
1937
5.23k
    if (size == 1 && (Py_UCS4)*u < 256)
1938
0
        return get_latin1_char((unsigned char)*u);
1939
1940
    /* If not empty and not single character, copy the Unicode data
1941
       into the new object */
1942
5.23k
    if (find_maxchar_surrogates(u, u + size,
1943
5.23k
                                &maxchar, &num_surrogates) == -1)
1944
0
        return NULL;
1945
1946
5.23k
    unicode = PyUnicode_New(size - num_surrogates, maxchar);
1947
5.23k
    if (!unicode)
1948
0
        return NULL;
1949
1950
5.23k
    unicode_write_widechar(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1951
5.23k
                           u, size, num_surrogates);
1952
1953
5.23k
    return unicode_result(unicode);
1954
5.23k
}
1955
1956
1957
int
1958
PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *pub_writer,
1959
                              const wchar_t *str,
1960
                              Py_ssize_t size)
1961
0
{
1962
0
    _PyUnicodeWriter *writer = (_PyUnicodeWriter *)pub_writer;
1963
1964
0
    if (size < 0) {
1965
0
        size = wcslen(str);
1966
0
    }
1967
1968
0
    if (size == 0) {
1969
0
        return 0;
1970
0
    }
1971
1972
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
1973
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
1974
       non-Unicode locales and hence needs conversion to UCS-4 first. */
1975
    if (_Py_LocaleUsesNonUnicodeWchar()) {
1976
        wchar_t* converted = _Py_DecodeNonUnicodeWchar(str, size);
1977
        if (!converted) {
1978
            return -1;
1979
        }
1980
1981
        int res = PyUnicodeWriter_WriteUCS4(pub_writer, converted, size);
1982
        PyMem_Free(converted);
1983
        return res;
1984
    }
1985
#endif
1986
1987
0
    Py_UCS4 maxchar = 0;
1988
0
    Py_ssize_t num_surrogates;
1989
0
    if (find_maxchar_surrogates(str, str + size,
1990
0
                                &maxchar, &num_surrogates) == -1) {
1991
0
        return -1;
1992
0
    }
1993
1994
0
    if (_PyUnicodeWriter_Prepare(writer, size - num_surrogates, maxchar) < 0) {
1995
0
        return -1;
1996
0
    }
1997
1998
0
    int kind = writer->kind;
1999
0
    void *data = (Py_UCS1*)writer->data + writer->pos * kind;
2000
0
    unicode_write_widechar(kind, data, str, size, num_surrogates);
2001
2002
0
    writer->pos += size - num_surrogates;
2003
0
    return 0;
2004
0
}
2005
2006
2007
PyObject *
2008
PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
2009
46.4k
{
2010
46.4k
    if (size < 0) {
2011
0
        PyErr_SetString(PyExc_SystemError,
2012
0
                        "Negative size passed to PyUnicode_FromStringAndSize");
2013
0
        return NULL;
2014
0
    }
2015
46.4k
    if (u != NULL) {
2016
46.4k
        return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2017
46.4k
    }
2018
0
    if (size > 0) {
2019
0
        PyErr_SetString(PyExc_SystemError,
2020
0
            "NULL string with positive size with NULL passed to PyUnicode_FromStringAndSize");
2021
0
        return NULL;
2022
0
    }
2023
0
    return _PyUnicode_GetEmpty();
2024
0
}
2025
2026
PyObject *
2027
PyUnicode_FromString(const char *u)
2028
1.20M
{
2029
1.20M
    size_t size = strlen(u);
2030
1.20M
    if (size > PY_SSIZE_T_MAX) {
2031
0
        PyErr_SetString(PyExc_OverflowError, "input too long");
2032
0
        return NULL;
2033
0
    }
2034
1.20M
    return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
2035
1.20M
}
2036
2037
2038
PyObject *
2039
_PyUnicode_FromId(_Py_Identifier *id)
2040
0
{
2041
0
    PyMutex_Lock((PyMutex *)&id->mutex);
2042
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
2043
0
    struct _Py_unicode_ids *ids = &interp->unicode.ids;
2044
2045
0
    Py_ssize_t index = _Py_atomic_load_ssize(&id->index);
2046
0
    if (index < 0) {
2047
0
        struct _Py_unicode_runtime_ids *rt_ids = &interp->runtime->unicode_state.ids;
2048
2049
0
        PyMutex_Lock(&rt_ids->mutex);
2050
        // Check again to detect concurrent access. Another thread can have
2051
        // initialized the index while this thread waited for the lock.
2052
0
        index = _Py_atomic_load_ssize(&id->index);
2053
0
        if (index < 0) {
2054
0
            assert(rt_ids->next_index < PY_SSIZE_T_MAX);
2055
0
            index = rt_ids->next_index;
2056
0
            rt_ids->next_index++;
2057
0
            _Py_atomic_store_ssize(&id->index, index);
2058
0
        }
2059
0
        PyMutex_Unlock(&rt_ids->mutex);
2060
0
    }
2061
0
    assert(index >= 0);
2062
2063
0
    PyObject *obj;
2064
0
    if (index < ids->size) {
2065
0
        obj = ids->array[index];
2066
0
        if (obj) {
2067
            // Return a borrowed reference
2068
0
            goto end;
2069
0
        }
2070
0
    }
2071
2072
0
    obj = PyUnicode_DecodeUTF8Stateful(id->string, strlen(id->string),
2073
0
                                       NULL, NULL);
2074
0
    if (!obj) {
2075
0
        goto end;
2076
0
    }
2077
0
    _PyUnicode_InternImmortal(interp, &obj);
2078
2079
0
    if (index >= ids->size) {
2080
        // Overallocate to reduce the number of realloc
2081
0
        Py_ssize_t new_size = Py_MAX(index * 2, 16);
2082
0
        Py_ssize_t item_size = sizeof(ids->array[0]);
2083
0
        PyObject **new_array = PyMem_Realloc(ids->array, new_size * item_size);
2084
0
        if (new_array == NULL) {
2085
0
            PyErr_NoMemory();
2086
0
            obj = NULL;
2087
0
            goto end;
2088
0
        }
2089
0
        memset(&new_array[ids->size], 0, (new_size - ids->size) * item_size);
2090
0
        ids->array = new_array;
2091
0
        ids->size = new_size;
2092
0
    }
2093
2094
    // The array stores a strong reference
2095
0
    ids->array[index] = obj;
2096
2097
0
end:
2098
0
    PyMutex_Unlock((PyMutex *)&id->mutex);
2099
    // Return a borrowed reference
2100
0
    return obj;
2101
0
}
2102
2103
2104
static void
2105
unicode_clear_identifiers(struct _Py_unicode_state *state)
2106
0
{
2107
0
    struct _Py_unicode_ids *ids = &state->ids;
2108
0
    for (Py_ssize_t i=0; i < ids->size; i++) {
2109
0
        Py_XDECREF(ids->array[i]);
2110
0
    }
2111
0
    ids->size = 0;
2112
0
    PyMem_Free(ids->array);
2113
0
    ids->array = NULL;
2114
    // Don't reset _PyRuntime next_index: _Py_Identifier.id remains valid
2115
    // after Py_Finalize().
2116
0
}
2117
2118
2119
/* Internal function, doesn't check maximum character */
2120
2121
PyObject*
2122
_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
2123
12.2M
{
2124
12.2M
    const unsigned char *s = (const unsigned char *)buffer;
2125
12.2M
    PyObject *unicode;
2126
12.2M
    if (size == 1) {
2127
#ifdef Py_DEBUG
2128
        assert((unsigned char)s[0] < 128);
2129
#endif
2130
14.3k
        return get_latin1_char(s[0]);
2131
14.3k
    }
2132
12.2M
    unicode = PyUnicode_New(size, 127);
2133
12.2M
    if (!unicode)
2134
0
        return NULL;
2135
12.2M
    memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2136
12.2M
    assert(_PyUnicode_CheckConsistency(unicode, 1));
2137
12.2M
    return unicode;
2138
12.2M
}
2139
2140
static Py_UCS4
2141
kind_maxchar_limit(int kind)
2142
18
{
2143
18
    switch (kind) {
2144
0
    case PyUnicode_1BYTE_KIND:
2145
0
        return 0x80;
2146
11
    case PyUnicode_2BYTE_KIND:
2147
11
        return 0x100;
2148
7
    case PyUnicode_4BYTE_KIND:
2149
7
        return 0x10000;
2150
0
    default:
2151
0
        Py_UNREACHABLE();
2152
18
    }
2153
18
}
2154
2155
static PyObject*
2156
_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
2157
8.95M
{
2158
8.95M
    PyObject *res;
2159
8.95M
    unsigned char max_char;
2160
2161
8.95M
    if (size == 0) {
2162
8.80M
        _Py_RETURN_UNICODE_EMPTY();
2163
8.80M
    }
2164
8.95M
    assert(size > 0);
2165
152k
    if (size == 1) {
2166
6.81k
        return get_latin1_char(u[0]);
2167
6.81k
    }
2168
2169
145k
    max_char = ucs1lib_find_max_char(u, u + size);
2170
145k
    res = PyUnicode_New(size, max_char);
2171
145k
    if (!res)
2172
0
        return NULL;
2173
145k
    memcpy(PyUnicode_1BYTE_DATA(res), u, size);
2174
145k
    assert(_PyUnicode_CheckConsistency(res, 1));
2175
145k
    return res;
2176
145k
}
2177
2178
static PyObject*
2179
_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
2180
8.29M
{
2181
8.29M
    PyObject *res;
2182
8.29M
    Py_UCS2 max_char;
2183
2184
8.29M
    if (size == 0)
2185
7.49M
        _Py_RETURN_UNICODE_EMPTY();
2186
8.29M
    assert(size > 0);
2187
803k
    if (size == 1)
2188
52.1k
        return unicode_char(u[0]);
2189
2190
751k
    max_char = ucs2lib_find_max_char(u, u + size);
2191
751k
    res = PyUnicode_New(size, max_char);
2192
751k
    if (!res)
2193
0
        return NULL;
2194
751k
    if (max_char >= 256)
2195
73.7k
        memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
2196
677k
    else {
2197
677k
        _PyUnicode_CONVERT_BYTES(
2198
677k
            Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2199
677k
    }
2200
751k
    assert(_PyUnicode_CheckConsistency(res, 1));
2201
751k
    return res;
2202
751k
}
2203
2204
static PyObject*
2205
_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
2206
11.0M
{
2207
11.0M
    PyObject *res;
2208
11.0M
    Py_UCS4 max_char;
2209
2210
11.0M
    if (size == 0)
2211
10.1M
        _Py_RETURN_UNICODE_EMPTY();
2212
11.0M
    assert(size > 0);
2213
868k
    if (size == 1)
2214
212k
        return unicode_char(u[0]);
2215
2216
656k
    max_char = ucs4lib_find_max_char(u, u + size);
2217
656k
    res = PyUnicode_New(size, max_char);
2218
656k
    if (!res)
2219
0
        return NULL;
2220
656k
    if (max_char < 256)
2221
586k
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2222
656k
                                 PyUnicode_1BYTE_DATA(res));
2223
69.8k
    else if (max_char < 0x10000)
2224
55.9k
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2225
69.8k
                                 PyUnicode_2BYTE_DATA(res));
2226
13.9k
    else
2227
13.9k
        memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
2228
656k
    assert(_PyUnicode_CheckConsistency(res, 1));
2229
656k
    return res;
2230
656k
}
2231
2232
2233
int
2234
PyUnicodeWriter_WriteUCS4(PyUnicodeWriter *pub_writer,
2235
                          const Py_UCS4 *str,
2236
                          Py_ssize_t size)
2237
0
{
2238
0
    _PyUnicodeWriter *writer = (_PyUnicodeWriter*)pub_writer;
2239
2240
0
    if (size < 0) {
2241
0
        PyErr_SetString(PyExc_ValueError,
2242
0
                        "size must be positive");
2243
0
        return -1;
2244
0
    }
2245
2246
0
    if (size == 0) {
2247
0
        return 0;
2248
0
    }
2249
2250
0
    Py_UCS4 max_char = ucs4lib_find_max_char(str, str + size);
2251
2252
0
    if (_PyUnicodeWriter_Prepare(writer, size, max_char) < 0) {
2253
0
        return -1;
2254
0
    }
2255
2256
0
    int kind = writer->kind;
2257
0
    void *data = (Py_UCS1*)writer->data + writer->pos * kind;
2258
0
    if (kind == PyUnicode_1BYTE_KIND) {
2259
0
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1,
2260
0
                                 str, str + size,
2261
0
                                 data);
2262
0
    }
2263
0
    else if (kind == PyUnicode_2BYTE_KIND) {
2264
0
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2,
2265
0
                                 str, str + size,
2266
0
                                 data);
2267
0
    }
2268
0
    else {
2269
0
        memcpy(data, str, size * sizeof(Py_UCS4));
2270
0
    }
2271
0
    writer->pos += size;
2272
2273
0
    return 0;
2274
0
}
2275
2276
2277
PyObject*
2278
PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2279
2.25M
{
2280
2.25M
    if (size < 0) {
2281
0
        PyErr_SetString(PyExc_ValueError, "size must be positive");
2282
0
        return NULL;
2283
0
    }
2284
2.25M
    switch (kind) {
2285
130k
    case PyUnicode_1BYTE_KIND:
2286
130k
        return _PyUnicode_FromUCS1(buffer, size);
2287
201k
    case PyUnicode_2BYTE_KIND:
2288
201k
        return _PyUnicode_FromUCS2(buffer, size);
2289
1.91M
    case PyUnicode_4BYTE_KIND:
2290
1.91M
        return _PyUnicode_FromUCS4(buffer, size);
2291
0
    default:
2292
0
        PyErr_SetString(PyExc_SystemError, "invalid kind");
2293
0
        return NULL;
2294
2.25M
    }
2295
2.25M
}
2296
2297
Py_UCS4
2298
_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2299
72.8k
{
2300
72.8k
    int kind;
2301
72.8k
    const void *startptr, *endptr;
2302
2303
72.8k
    assert(0 <= start);
2304
72.8k
    assert(end <= PyUnicode_GET_LENGTH(unicode));
2305
72.8k
    assert(start <= end);
2306
2307
72.8k
    if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2308
0
        return PyUnicode_MAX_CHAR_VALUE(unicode);
2309
2310
72.8k
    if (start == end)
2311
0
        return 127;
2312
2313
72.8k
    if (PyUnicode_IS_ASCII(unicode))
2314
4.58k
        return 127;
2315
2316
68.2k
    kind = PyUnicode_KIND(unicode);
2317
68.2k
    startptr = PyUnicode_DATA(unicode);
2318
68.2k
    endptr = (char *)startptr + end * kind;
2319
68.2k
    startptr = (char *)startptr + start * kind;
2320
68.2k
    switch(kind) {
2321
7.41k
    case PyUnicode_1BYTE_KIND:
2322
7.41k
        return ucs1lib_find_max_char(startptr, endptr);
2323
19.3k
    case PyUnicode_2BYTE_KIND:
2324
19.3k
        return ucs2lib_find_max_char(startptr, endptr);
2325
41.4k
    case PyUnicode_4BYTE_KIND:
2326
41.4k
        return ucs4lib_find_max_char(startptr, endptr);
2327
0
    default:
2328
0
        Py_UNREACHABLE();
2329
68.2k
    }
2330
68.2k
}
2331
2332
/* Ensure that a string uses the most efficient storage, if it is not the
2333
   case: create a new string with of the right kind. Write NULL into *p_unicode
2334
   on error. */
2335
static void
2336
unicode_adjust_maxchar(PyObject **p_unicode)
2337
0
{
2338
0
    PyObject *unicode, *copy;
2339
0
    Py_UCS4 max_char;
2340
0
    Py_ssize_t len;
2341
0
    int kind;
2342
2343
0
    assert(p_unicode != NULL);
2344
0
    unicode = *p_unicode;
2345
0
    if (PyUnicode_IS_ASCII(unicode))
2346
0
        return;
2347
2348
0
    len = PyUnicode_GET_LENGTH(unicode);
2349
0
    kind = PyUnicode_KIND(unicode);
2350
0
    if (kind == PyUnicode_1BYTE_KIND) {
2351
0
        const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2352
0
        max_char = ucs1lib_find_max_char(u, u + len);
2353
0
        if (max_char >= 128)
2354
0
            return;
2355
0
    }
2356
0
    else if (kind == PyUnicode_2BYTE_KIND) {
2357
0
        const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2358
0
        max_char = ucs2lib_find_max_char(u, u + len);
2359
0
        if (max_char >= 256)
2360
0
            return;
2361
0
    }
2362
0
    else if (kind == PyUnicode_4BYTE_KIND) {
2363
0
        const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2364
0
        max_char = ucs4lib_find_max_char(u, u + len);
2365
0
        if (max_char >= 0x10000)
2366
0
            return;
2367
0
    }
2368
0
    else
2369
0
        Py_UNREACHABLE();
2370
2371
0
    copy = PyUnicode_New(len, max_char);
2372
0
    if (copy != NULL)
2373
0
        _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2374
0
    Py_DECREF(unicode);
2375
0
    *p_unicode = copy;
2376
0
}
2377
2378
/* Return a new string with the same length, maximum character value and
   character data as `unicode`.  Raises a bad-internal-call error and
   returns NULL if `unicode` is not a str; returns NULL if the allocation
   fails. */
PyObject*
_PyUnicode_Copy(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadInternalCall();
        return NULL;
    }

    Py_ssize_t nchars = PyUnicode_GET_LENGTH(unicode);
    PyObject *dup = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(unicode));
    if (dup == NULL) {
        return NULL;
    }
    assert(PyUnicode_KIND(dup) == PyUnicode_KIND(unicode));

    /* Same kind on both sides, so the raw bytes can be copied; the kind
       value is used as the per-character byte count. */
    memcpy(PyUnicode_DATA(dup), PyUnicode_DATA(unicode),
           nchars * PyUnicode_KIND(unicode));
    assert(_PyUnicode_CheckConsistency(dup, 1));
    return dup;
}
2400
2401
2402
/* Widen Unicode objects to larger buffers. Don't write terminating null
2403
   character. Return NULL on error. */
2404
2405
static void*
2406
unicode_askind(int skind, void const *data, Py_ssize_t len, int kind)
2407
3.59k
{
2408
3.59k
    void *result;
2409
2410
3.59k
    assert(skind < kind);
2411
3.59k
    switch (kind) {
2412
1.66k
    case PyUnicode_2BYTE_KIND:
2413
1.66k
        result = PyMem_New(Py_UCS2, len);
2414
1.66k
        if (!result)
2415
0
            return PyErr_NoMemory();
2416
1.66k
        assert(skind == PyUnicode_1BYTE_KIND);
2417
1.66k
        _PyUnicode_CONVERT_BYTES(
2418
1.66k
            Py_UCS1, Py_UCS2,
2419
1.66k
            (const Py_UCS1 *)data,
2420
1.66k
            ((const Py_UCS1 *)data) + len,
2421
1.66k
            result);
2422
1.66k
        return result;
2423
1.92k
    case PyUnicode_4BYTE_KIND:
2424
1.92k
        result = PyMem_New(Py_UCS4, len);
2425
1.92k
        if (!result)
2426
0
            return PyErr_NoMemory();
2427
1.92k
        if (skind == PyUnicode_2BYTE_KIND) {
2428
0
            _PyUnicode_CONVERT_BYTES(
2429
0
                Py_UCS2, Py_UCS4,
2430
0
                (const Py_UCS2 *)data,
2431
0
                ((const Py_UCS2 *)data) + len,
2432
0
                result);
2433
0
        }
2434
1.92k
        else {
2435
1.92k
            assert(skind == PyUnicode_1BYTE_KIND);
2436
1.92k
            _PyUnicode_CONVERT_BYTES(
2437
1.92k
                Py_UCS1, Py_UCS4,
2438
1.92k
                (const Py_UCS1 *)data,
2439
1.92k
                ((const Py_UCS1 *)data) + len,
2440
1.92k
                result);
2441
1.92k
        }
2442
1.92k
        return result;
2443
0
    default:
2444
0
        Py_UNREACHABLE();
2445
0
        return NULL;
2446
3.59k
    }
2447
3.59k
}
2448
2449
static Py_UCS4*
2450
as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2451
        int copy_null)
2452
0
{
2453
0
    int kind;
2454
0
    const void *data;
2455
0
    Py_ssize_t len, targetlen;
2456
0
    kind = PyUnicode_KIND(string);
2457
0
    data = PyUnicode_DATA(string);
2458
0
    len = PyUnicode_GET_LENGTH(string);
2459
0
    targetlen = len;
2460
0
    if (copy_null)
2461
0
        targetlen++;
2462
0
    if (!target) {
2463
0
        target = PyMem_New(Py_UCS4, targetlen);
2464
0
        if (!target) {
2465
0
            PyErr_NoMemory();
2466
0
            return NULL;
2467
0
        }
2468
0
    }
2469
0
    else {
2470
0
        if (targetsize < targetlen) {
2471
0
            PyErr_Format(PyExc_SystemError,
2472
0
                         "string is longer than the buffer");
2473
0
            if (copy_null && 0 < targetsize)
2474
0
                target[0] = 0;
2475
0
            return NULL;
2476
0
        }
2477
0
    }
2478
0
    if (kind == PyUnicode_1BYTE_KIND) {
2479
0
        const Py_UCS1 *start = (const Py_UCS1 *) data;
2480
0
        _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2481
0
    }
2482
0
    else if (kind == PyUnicode_2BYTE_KIND) {
2483
0
        const Py_UCS2 *start = (const Py_UCS2 *) data;
2484
0
        _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2485
0
    }
2486
0
    else if (kind == PyUnicode_4BYTE_KIND) {
2487
0
        memcpy(target, data, len * sizeof(Py_UCS4));
2488
0
    }
2489
0
    else {
2490
0
        Py_UNREACHABLE();
2491
0
    }
2492
0
    if (copy_null)
2493
0
        target[len] = 0;
2494
0
    return target;
2495
0
}
2496
2497
Py_UCS4*
2498
PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2499
                 int copy_null)
2500
0
{
2501
0
    if (target == NULL || targetsize < 0) {
2502
0
        PyErr_BadInternalCall();
2503
0
        return NULL;
2504
0
    }
2505
0
    return as_ucs4(string, target, targetsize, copy_null);
2506
0
}
2507
2508
Py_UCS4*
2509
PyUnicode_AsUCS4Copy(PyObject *string)
2510
0
{
2511
0
    return as_ucs4(string, NULL, 0, 1);
2512
0
}
2513
2514
/* Maximum number of characters required for the output of %jo, %jd or
   %p.  That is at most ceil(log8(256)*sizeof(intmax_t)) digits, plus:
     - 1 for the sign,
     - 2 for the 0x prefix (for %p),
     - 1 for the terminating NUL. */
#define MAX_INTMAX_CHARS (5 + (sizeof(intmax_t) * 8 - 1) / 3)
2519
2520
static int
2521
unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2522
                             Py_ssize_t width, Py_ssize_t precision, int flags)
2523
18.5k
{
2524
18.5k
    Py_ssize_t length, fill, arglen;
2525
18.5k
    Py_UCS4 maxchar;
2526
2527
18.5k
    length = PyUnicode_GET_LENGTH(str);
2528
18.5k
    if ((precision == -1 || precision >= length)
2529
17.0k
        && width <= length)
2530
17.0k
        return _PyUnicodeWriter_WriteStr(writer, str);
2531
2532
1.48k
    if (precision != -1)
2533
1.48k
        length = Py_MIN(precision, length);
2534
2535
1.48k
    arglen = Py_MAX(length, width);
2536
1.48k
    if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2537
348
        maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2538
1.13k
    else
2539
1.13k
        maxchar = writer->maxchar;
2540
2541
1.48k
    if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2542
0
        return -1;
2543
2544
1.48k
    fill = Py_MAX(width - length, 0);
2545
1.48k
    if (fill && !(flags & F_LJUST)) {
2546
0
        if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2547
0
            return -1;
2548
0
        writer->pos += fill;
2549
0
    }
2550
2551
1.48k
    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2552
1.48k
                                  str, 0, length);
2553
1.48k
    writer->pos += length;
2554
2555
1.48k
    if (fill && (flags & F_LJUST)) {
2556
0
        if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2557
0
            return -1;
2558
0
        writer->pos += fill;
2559
0
    }
2560
2561
1.48k
    return 0;
2562
1.48k
}
2563
2564
static int
2565
unicode_fromformat_write_utf8(_PyUnicodeWriter *writer, const char *str,
2566
                              Py_ssize_t width, Py_ssize_t precision, int flags)
2567
55.7k
{
2568
    /* UTF-8 */
2569
55.7k
    Py_ssize_t *pconsumed = NULL;
2570
55.7k
    Py_ssize_t length;
2571
55.7k
    if (precision == -1) {
2572
38.5k
        length = strlen(str);
2573
38.5k
    }
2574
17.2k
    else {
2575
17.2k
        length = 0;
2576
117k
        while (length < precision && str[length]) {
2577
100k
            length++;
2578
100k
        }
2579
17.2k
        if (length == precision) {
2580
            /* The input string is not NUL-terminated.  If it ends with an
2581
             * incomplete UTF-8 sequence, truncate the string just before it.
2582
             * Incomplete sequences in the middle and sequences which cannot
2583
             * be valid prefixes are still treated as errors and replaced
2584
             * with \xfffd. */
2585
1.11k
            pconsumed = &length;
2586
1.11k
        }
2587
17.2k
    }
2588
2589
55.7k
    if (width < 0) {
2590
55.7k
        return _PyUnicode_DecodeUTF8Writer(writer, str, length,
2591
55.7k
                                           _Py_ERROR_REPLACE, "replace", pconsumed);
2592
55.7k
    }
2593
2594
0
    PyObject *unicode = PyUnicode_DecodeUTF8Stateful(str, length,
2595
0
                                                     "replace", pconsumed);
2596
0
    if (unicode == NULL)
2597
0
        return -1;
2598
2599
0
    int res = unicode_fromformat_write_str(writer, unicode,
2600
0
                                           width, -1, flags);
2601
0
    Py_DECREF(unicode);
2602
0
    return res;
2603
0
}
2604
2605
static int
2606
unicode_fromformat_write_wcstr(_PyUnicodeWriter *writer, const wchar_t *str,
2607
                              Py_ssize_t width, Py_ssize_t precision, int flags)
2608
0
{
2609
0
    Py_ssize_t length;
2610
0
    if (precision == -1) {
2611
0
        length = wcslen(str);
2612
0
    }
2613
0
    else {
2614
0
        length = 0;
2615
0
        while (length < precision && str[length]) {
2616
0
            length++;
2617
0
        }
2618
0
    }
2619
2620
0
    if (width < 0) {
2621
0
        return PyUnicodeWriter_WriteWideChar((PyUnicodeWriter*)writer,
2622
0
                                             str, length);
2623
0
    }
2624
2625
0
    PyObject *unicode = PyUnicode_FromWideChar(str, length);
2626
0
    if (unicode == NULL)
2627
0
        return -1;
2628
2629
0
    int res = unicode_fromformat_write_str(writer, unicode, width, -1, flags);
2630
0
    Py_DECREF(unicode);
2631
0
    return res;
2632
0
}
2633
2634
0
/* Small integer codes, one per size category.  NOTE(review): presumably
   these identify the length modifier parsed from a format specification
   (l, ll, z, t, j) -- confirm against unicode_fromformat_arg(). */
#define F_LONG      1
#define F_LONGLONG  2
#define F_SIZE      3
#define F_PTRDIFF   4
#define F_INTMAX    5
2639
2640
static const char*
2641
unicode_fromformat_arg(_PyUnicodeWriter *writer,
2642
                       const char *f, va_list *vargs)
2643
95.7k
{
2644
95.7k
    const char *p;
2645
95.7k
    Py_ssize_t len;
2646
95.7k
    int flags = 0;
2647
95.7k
    Py_ssize_t width;
2648
95.7k
    Py_ssize_t precision;
2649
2650
95.7k
    p = f;
2651
95.7k
    f++;
2652
95.7k
    if (*f == '%') {
2653
0
        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
2654
0
            return NULL;
2655
0
        f++;
2656
0
        return f;
2657
0
    }
2658
2659
    /* Parse flags. Example: "%-i" => flags=F_LJUST. */
2660
    /* Flags '+', ' ' and '#' are not particularly useful.
2661
     * They are not worth the implementation and maintenance costs.
2662
     * In addition, '#' should add "0" for "o" conversions for compatibility
2663
     * with printf, but it would confuse Python users. */
2664
97.3k
    while (1) {
2665
97.3k
        switch (*f++) {
2666
0
        case '-': flags |= F_LJUST; continue;
2667
1.63k
        case '0': flags |= F_ZERO; continue;
2668
0
        case '#': flags |= F_ALT; continue;
2669
97.3k
        }
2670
95.7k
        f--;
2671
95.7k
        break;
2672
97.3k
    }
2673
2674
    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2675
95.7k
    width = -1;
2676
95.7k
    if (*f == '*') {
2677
0
        width = va_arg(*vargs, int);
2678
0
        if (width < 0) {
2679
0
            flags |= F_LJUST;
2680
0
            width = -width;
2681
0
        }
2682
0
        f++;
2683
0
    }
2684
95.7k
    else if (Py_ISDIGIT((unsigned)*f)) {
2685
1.63k
        width = *f - '0';
2686
1.63k
        f++;
2687
1.63k
        while (Py_ISDIGIT((unsigned)*f)) {
2688
0
            if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2689
0
                PyErr_SetString(PyExc_ValueError,
2690
0
                                "width too big");
2691
0
                return NULL;
2692
0
            }
2693
0
            width = (width * 10) + (*f - '0');
2694
0
            f++;
2695
0
        }
2696
1.63k
    }
2697
95.7k
    precision = -1;
2698
95.7k
    if (*f == '.') {
2699
22.1k
        f++;
2700
22.1k
        if (*f == '*') {
2701
0
            precision = va_arg(*vargs, int);
2702
0
            if (precision < 0) {
2703
0
                precision = -2;
2704
0
            }
2705
0
            f++;
2706
0
        }
2707
22.1k
        else if (Py_ISDIGIT((unsigned)*f)) {
2708
22.1k
            precision = (*f - '0');
2709
22.1k
            f++;
2710
63.7k
            while (Py_ISDIGIT((unsigned)*f)) {
2711
41.5k
                if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2712
0
                    PyErr_SetString(PyExc_ValueError,
2713
0
                                    "precision too big");
2714
0
                    return NULL;
2715
0
                }
2716
41.5k
                precision = (precision * 10) + (*f - '0');
2717
41.5k
                f++;
2718
41.5k
            }
2719
22.1k
        }
2720
22.1k
    }
2721
2722
95.7k
    int sizemod = 0;
2723
95.7k
    if (*f == 'l') {
2724
0
        if (f[1] == 'l') {
2725
0
            sizemod = F_LONGLONG;
2726
0
            f += 2;
2727
0
        }
2728
0
        else {
2729
0
            sizemod = F_LONG;
2730
0
            ++f;
2731
0
        }
2732
0
    }
2733
95.7k
    else if (*f == 'z') {
2734
2.33k
        sizemod = F_SIZE;
2735
2.33k
        ++f;
2736
2.33k
    }
2737
93.3k
    else if (*f == 't') {
2738
0
        sizemod = F_PTRDIFF;
2739
0
        ++f;
2740
0
    }
2741
93.3k
    else if (*f == 'j') {
2742
0
        sizemod = F_INTMAX;
2743
0
        ++f;
2744
0
    }
2745
95.7k
    if (f[0] != '\0' && f[1] == '\0')
2746
16.8k
        writer->overallocate = 0;
2747
2748
95.7k
    switch (*f) {
2749
11.4k
    case 'd': case 'i': case 'o': case 'u': case 'x': case 'X':
2750
11.4k
        break;
2751
10.0k
    case 'c': case 'p':
2752
10.0k
        if (sizemod || width >= 0 || precision >= 0) goto invalid_format;
2753
10.0k
        break;
2754
55.3k
    case 's':
2755
55.7k
    case 'V':
2756
55.7k
        if (sizemod && sizemod != F_LONG) goto invalid_format;
2757
55.7k
        break;
2758
55.7k
    default:
2759
18.5k
        if (sizemod) goto invalid_format;
2760
18.5k
        break;
2761
95.7k
    }
2762
2763
95.7k
    switch (*f) {
2764
10.0k
    case 'c':
2765
10.0k
    {
2766
10.0k
        int ordinal = va_arg(*vargs, int);
2767
10.0k
        if (ordinal < 0 || ordinal > MAX_UNICODE) {
2768
0
            PyErr_SetString(PyExc_OverflowError,
2769
0
                            "character argument not in range(0x110000)");
2770
0
            return NULL;
2771
0
        }
2772
10.0k
        if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
2773
0
            return NULL;
2774
10.0k
        break;
2775
10.0k
    }
2776
2777
10.0k
    case 'd': case 'i':
2778
11.4k
    case 'o': case 'u': case 'x': case 'X':
2779
11.4k
    {
2780
11.4k
        char buffer[MAX_INTMAX_CHARS];
2781
2782
        // Fill buffer using sprinf, with one of many possible format
2783
        // strings, like "%llX" for `long long` in hexadecimal.
2784
        // The type/size is in `sizemod`; the format is in `*f`.
2785
2786
        // Use macros with nested switches to keep the sprintf format strings
2787
        // as compile-time literals, avoiding warnings and maybe allowing
2788
        // optimizations.
2789
2790
        // `SPRINT` macro does one sprintf
2791
        // Example usage: SPRINT("l", "X", unsigned long) expands to
2792
        // sprintf(buffer, "%" "l" "X", va_arg(*vargs, unsigned long))
2793
11.4k
        #define SPRINT(SIZE_SPEC, FMT_CHAR, TYPE) \
2794
11.4k
            sprintf(buffer, "%" SIZE_SPEC FMT_CHAR, va_arg(*vargs, TYPE))
2795
2796
        // One inner switch to handle all format variants
2797
11.4k
        #define DO_SPRINTS(SIZE_SPEC, SIGNED_TYPE, UNSIGNED_TYPE)             \
2798
11.4k
            switch (*f) {                                                     \
2799
0
                case 'o': len = SPRINT(SIZE_SPEC, "o", UNSIGNED_TYPE); break; \
2800
0
                case 'u': len = SPRINT(SIZE_SPEC, "u", UNSIGNED_TYPE); break; \
2801
1.18k
                case 'x': len = SPRINT(SIZE_SPEC, "x", UNSIGNED_TYPE); break; \
2802
789
                case 'X': len = SPRINT(SIZE_SPEC, "X", UNSIGNED_TYPE); break; \
2803
9.44k
                default:  len = SPRINT(SIZE_SPEC, "d", SIGNED_TYPE); break;   \
2804
11.4k
            }
2805
2806
        // Outer switch to handle all the sizes/types
2807
11.4k
        switch (sizemod) {
2808
0
            case F_LONG:     DO_SPRINTS("l", long, unsigned long); break;
2809
0
            case F_LONGLONG: DO_SPRINTS("ll", long long, unsigned long long); break;
2810
2.33k
            case F_SIZE:     DO_SPRINTS("z", Py_ssize_t, size_t); break;
2811
0
            case F_PTRDIFF:  DO_SPRINTS("t", ptrdiff_t, ptrdiff_t); break;
2812
0
            case F_INTMAX:   DO_SPRINTS("j", intmax_t, uintmax_t); break;
2813
9.08k
            default:         DO_SPRINTS("", int, unsigned int); break;
2814
11.4k
        }
2815
11.4k
        #undef SPRINT
2816
11.4k
        #undef DO_SPRINTS
2817
2818
11.4k
        assert(len >= 0);
2819
2820
11.4k
        int sign = (buffer[0] == '-');
2821
11.4k
        len -= sign;
2822
2823
11.4k
        precision = Py_MAX(precision, len);
2824
11.4k
        width = Py_MAX(width, precision + sign);
2825
11.4k
        if ((flags & F_ZERO) && !(flags & F_LJUST)) {
2826
1.63k
            precision = width - sign;
2827
1.63k
        }
2828
2829
11.4k
        Py_ssize_t spacepad = Py_MAX(width - precision - sign, 0);
2830
11.4k
        Py_ssize_t zeropad = Py_MAX(precision - len, 0);
2831
2832
11.4k
        if (_PyUnicodeWriter_Prepare(writer, width, 127) == -1)
2833
0
            return NULL;
2834
2835
11.4k
        if (spacepad && !(flags & F_LJUST)) {
2836
0
            if (PyUnicode_Fill(writer->buffer, writer->pos, spacepad, ' ') == -1)
2837
0
                return NULL;
2838
0
            writer->pos += spacepad;
2839
0
        }
2840
2841
11.4k
        if (sign) {
2842
0
            if (_PyUnicodeWriter_WriteChar(writer, '-') == -1)
2843
0
                return NULL;
2844
0
        }
2845
2846
11.4k
        if (zeropad) {
2847
585
            if (PyUnicode_Fill(writer->buffer, writer->pos, zeropad, '0') == -1)
2848
0
                return NULL;
2849
585
            writer->pos += zeropad;
2850
585
        }
2851
2852
11.4k
        if (_PyUnicodeWriter_WriteASCIIString(writer, &buffer[sign], len) < 0)
2853
0
            return NULL;
2854
2855
11.4k
        if (spacepad && (flags & F_LJUST)) {
2856
0
            if (PyUnicode_Fill(writer->buffer, writer->pos, spacepad, ' ') == -1)
2857
0
                return NULL;
2858
0
            writer->pos += spacepad;
2859
0
        }
2860
11.4k
        break;
2861
11.4k
    }
2862
2863
11.4k
    case 'p':
2864
0
    {
2865
0
        char number[MAX_INTMAX_CHARS];
2866
2867
0
        len = sprintf(number, "%p", va_arg(*vargs, void*));
2868
0
        assert(len >= 0);
2869
2870
        /* %p is ill-defined:  ensure leading 0x. */
2871
0
        if (number[1] == 'X')
2872
0
            number[1] = 'x';
2873
0
        else if (number[1] != 'x') {
2874
0
            memmove(number + 2, number,
2875
0
                    strlen(number) + 1);
2876
0
            number[0] = '0';
2877
0
            number[1] = 'x';
2878
0
            len += 2;
2879
0
        }
2880
2881
0
        if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
2882
0
            return NULL;
2883
0
        break;
2884
0
    }
2885
2886
55.3k
    case 's':
2887
55.3k
    {
2888
55.3k
        if (sizemod) {
2889
0
            const wchar_t *s = va_arg(*vargs, const wchar_t*);
2890
0
            if (unicode_fromformat_write_wcstr(writer, s, width, precision, flags) < 0)
2891
0
                return NULL;
2892
0
        }
2893
55.3k
        else {
2894
            /* UTF-8 */
2895
55.3k
            const char *s = va_arg(*vargs, const char*);
2896
55.3k
            if (unicode_fromformat_write_utf8(writer, s, width, precision, flags) < 0)
2897
0
                return NULL;
2898
55.3k
        }
2899
55.3k
        break;
2900
55.3k
    }
2901
2902
55.3k
    case 'U':
2903
11.8k
    {
2904
11.8k
        PyObject *obj = va_arg(*vargs, PyObject *);
2905
11.8k
        assert(obj && _PyUnicode_CHECK(obj));
2906
2907
11.8k
        if (unicode_fromformat_write_str(writer, obj, width, precision, flags) == -1)
2908
0
            return NULL;
2909
11.8k
        break;
2910
11.8k
    }
2911
2912
11.8k
    case 'V':
2913
338
    {
2914
338
        PyObject *obj = va_arg(*vargs, PyObject *);
2915
338
        const char *str;
2916
338
        const wchar_t *wstr;
2917
338
        if (sizemod) {
2918
0
            wstr = va_arg(*vargs, const wchar_t*);
2919
0
        }
2920
338
        else {
2921
338
            str = va_arg(*vargs, const char *);
2922
338
        }
2923
338
        if (obj) {
2924
0
            assert(_PyUnicode_CHECK(obj));
2925
0
            if (unicode_fromformat_write_str(writer, obj, width, precision, flags) == -1)
2926
0
                return NULL;
2927
0
        }
2928
338
        else if (sizemod) {
2929
0
            assert(wstr != NULL);
2930
0
            if (unicode_fromformat_write_wcstr(writer, wstr, width, precision, flags) < 0)
2931
0
                return NULL;
2932
0
        }
2933
338
        else {
2934
338
            assert(str != NULL);
2935
338
            if (unicode_fromformat_write_utf8(writer, str, width, precision, flags) < 0)
2936
0
                return NULL;
2937
338
        }
2938
338
        break;
2939
338
    }
2940
2941
338
    case 'S':
2942
58
    {
2943
58
        PyObject *obj = va_arg(*vargs, PyObject *);
2944
58
        PyObject *str;
2945
58
        assert(obj);
2946
58
        str = PyObject_Str(obj);
2947
58
        if (!str)
2948
0
            return NULL;
2949
58
        if (unicode_fromformat_write_str(writer, str, width, precision, flags) == -1) {
2950
0
            Py_DECREF(str);
2951
0
            return NULL;
2952
0
        }
2953
58
        Py_DECREF(str);
2954
58
        break;
2955
58
    }
2956
2957
6.62k
    case 'R':
2958
6.62k
    {
2959
6.62k
        PyObject *obj = va_arg(*vargs, PyObject *);
2960
6.62k
        PyObject *repr;
2961
6.62k
        assert(obj);
2962
6.62k
        repr = PyObject_Repr(obj);
2963
6.62k
        if (!repr)
2964
0
            return NULL;
2965
6.62k
        if (unicode_fromformat_write_str(writer, repr, width, precision, flags) == -1) {
2966
0
            Py_DECREF(repr);
2967
0
            return NULL;
2968
0
        }
2969
6.62k
        Py_DECREF(repr);
2970
6.62k
        break;
2971
6.62k
    }
2972
2973
0
    case 'A':
2974
0
    {
2975
0
        PyObject *obj = va_arg(*vargs, PyObject *);
2976
0
        PyObject *ascii;
2977
0
        assert(obj);
2978
0
        ascii = PyObject_ASCII(obj);
2979
0
        if (!ascii)
2980
0
            return NULL;
2981
0
        if (unicode_fromformat_write_str(writer, ascii, width, precision, flags) == -1) {
2982
0
            Py_DECREF(ascii);
2983
0
            return NULL;
2984
0
        }
2985
0
        Py_DECREF(ascii);
2986
0
        break;
2987
0
    }
2988
2989
0
    case 'T':
2990
0
    {
2991
0
        PyObject *obj = va_arg(*vargs, PyObject *);
2992
0
        PyTypeObject *type = (PyTypeObject *)Py_NewRef(Py_TYPE(obj));
2993
2994
0
        PyObject *type_name;
2995
0
        if (flags & F_ALT) {
2996
0
            type_name = _PyType_GetFullyQualifiedName(type, ':');
2997
0
        }
2998
0
        else {
2999
0
            type_name = PyType_GetFullyQualifiedName(type);
3000
0
        }
3001
0
        Py_DECREF(type);
3002
0
        if (!type_name) {
3003
0
            return NULL;
3004
0
        }
3005
3006
0
        if (unicode_fromformat_write_str(writer, type_name,
3007
0
                                         width, precision, flags) == -1) {
3008
0
            Py_DECREF(type_name);
3009
0
            return NULL;
3010
0
        }
3011
0
        Py_DECREF(type_name);
3012
0
        break;
3013
0
    }
3014
3015
0
    case 'N':
3016
0
    {
3017
0
        PyObject *type_raw = va_arg(*vargs, PyObject *);
3018
0
        assert(type_raw != NULL);
3019
3020
0
        if (!PyType_Check(type_raw)) {
3021
0
            PyErr_SetString(PyExc_TypeError, "%N argument must be a type");
3022
0
            return NULL;
3023
0
        }
3024
0
        PyTypeObject *type = (PyTypeObject*)type_raw;
3025
3026
0
        PyObject *type_name;
3027
0
        if (flags & F_ALT) {
3028
0
            type_name = _PyType_GetFullyQualifiedName(type, ':');
3029
0
        }
3030
0
        else {
3031
0
            type_name = PyType_GetFullyQualifiedName(type);
3032
0
        }
3033
0
        if (!type_name) {
3034
0
            return NULL;
3035
0
        }
3036
0
        if (unicode_fromformat_write_str(writer, type_name,
3037
0
                                         width, precision, flags) == -1) {
3038
0
            Py_DECREF(type_name);
3039
0
            return NULL;
3040
0
        }
3041
0
        Py_DECREF(type_name);
3042
0
        break;
3043
0
    }
3044
3045
0
    default:
3046
0
    invalid_format:
3047
0
        PyErr_Format(PyExc_SystemError, "invalid format string: %s", p);
3048
0
        return NULL;
3049
95.7k
    }
3050
3051
95.7k
    f++;
3052
95.7k
    return f;
3053
95.7k
}
3054
3055
static int
3056
unicode_from_format(_PyUnicodeWriter *writer, const char *format, va_list vargs)
3057
63.4k
{
3058
63.4k
    Py_ssize_t len = strlen(format);
3059
63.4k
    writer->min_length += len + 100;
3060
63.4k
    writer->overallocate = 1;
3061
3062
    // Copy varags to be able to pass a reference to a subfunction.
3063
63.4k
    va_list vargs2;
3064
63.4k
    va_copy(vargs2, vargs);
3065
3066
    // _PyUnicodeWriter_WriteASCIIString() below requires the format string
3067
    // to be encoded to ASCII.
3068
63.4k
    int is_ascii = (ucs1lib_find_max_char((Py_UCS1*)format, (Py_UCS1*)format + len) < 128);
3069
63.4k
    if (!is_ascii) {
3070
0
        Py_ssize_t i;
3071
0
        for (i=0; i < len && (unsigned char)format[i] <= 127; i++);
3072
0
        PyErr_Format(PyExc_ValueError,
3073
0
            "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3074
0
            "string, got a non-ASCII byte: 0x%02x",
3075
0
            (unsigned char)format[i]);
3076
0
        goto fail;
3077
0
    }
3078
3079
287k
    for (const char *f = format; *f; ) {
3080
223k
        if (*f == '%') {
3081
95.7k
            f = unicode_fromformat_arg(writer, f, &vargs2);
3082
95.7k
            if (f == NULL)
3083
0
                goto fail;
3084
95.7k
        }
3085
127k
        else {
3086
127k
            const char *p = strchr(f, '%');
3087
127k
            if (p != NULL) {
3088
81.3k
                len = p - f;
3089
81.3k
            }
3090
46.6k
            else {
3091
46.6k
                len = strlen(f);
3092
46.6k
                writer->overallocate = 0;
3093
46.6k
            }
3094
3095
127k
            if (_PyUnicodeWriter_WriteASCIIString(writer, f, len) < 0) {
3096
0
                goto fail;
3097
0
            }
3098
127k
            f += len;
3099
127k
        }
3100
223k
    }
3101
63.4k
    va_end(vargs2);
3102
63.4k
    return 0;
3103
3104
0
  fail:
3105
0
    va_end(vargs2);
3106
0
    return -1;
3107
63.4k
}
3108
3109
PyObject *
3110
PyUnicode_FromFormatV(const char *format, va_list vargs)
3111
63.4k
{
3112
63.4k
    _PyUnicodeWriter writer;
3113
63.4k
    _PyUnicodeWriter_Init(&writer);
3114
3115
63.4k
    if (unicode_from_format(&writer, format, vargs) < 0) {
3116
0
        _PyUnicodeWriter_Dealloc(&writer);
3117
0
        return NULL;
3118
0
    }
3119
63.4k
    return _PyUnicodeWriter_Finish(&writer);
3120
63.4k
}
3121
3122
/* Variadic front end of PyUnicode_FromFormatV(). */
PyObject *
PyUnicode_FromFormat(const char *format, ...)
{
    va_list ap;

    va_start(ap, format);
    PyObject *result = PyUnicode_FromFormatV(format, ap);
    va_end(ap);

    return result;
}
3133
3134
int
3135
PyUnicodeWriter_Format(PyUnicodeWriter *writer, const char *format, ...)
3136
0
{
3137
0
    va_list vargs;
3138
0
    va_start(vargs, format);
3139
0
    int res = _PyUnicodeWriter_FormatV(writer, format, vargs);
3140
0
    va_end(vargs);
3141
0
    return res;
3142
0
}
3143
3144
int
3145
_PyUnicodeWriter_FormatV(PyUnicodeWriter *writer, const char *format,
3146
                         va_list vargs)
3147
0
{
3148
0
    _PyUnicodeWriter *_writer = (_PyUnicodeWriter*)writer;
3149
0
    Py_ssize_t old_pos = _writer->pos;
3150
3151
0
    int res = unicode_from_format(_writer, format, vargs);
3152
3153
0
    if (res < 0) {
3154
0
        _writer->pos = old_pos;
3155
0
    }
3156
0
    return res;
3157
0
}
3158
3159
static Py_ssize_t
3160
unicode_get_widechar_size(PyObject *unicode)
3161
1.37k
{
3162
1.37k
    Py_ssize_t res;
3163
3164
1.37k
    assert(unicode != NULL);
3165
1.37k
    assert(_PyUnicode_CHECK(unicode));
3166
3167
1.37k
    res = _PyUnicode_LENGTH(unicode);
3168
#if SIZEOF_WCHAR_T == 2
3169
    if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3170
        const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3171
        const Py_UCS4 *end = s + res;
3172
        for (; s < end; ++s) {
3173
            if (*s > 0xFFFF) {
3174
                ++res;
3175
            }
3176
        }
3177
    }
3178
#endif
3179
0
    return res;
3180
1.37k
}
3181
3182
static void
3183
unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3184
1.37k
{
3185
1.37k
    assert(unicode != NULL);
3186
1.37k
    assert(_PyUnicode_CHECK(unicode));
3187
3188
2.74k
    if (PyUnicode_KIND(unicode) == sizeof(wchar_t)) {
3189
0
        memcpy(w, PyUnicode_DATA(unicode), size * sizeof(wchar_t));
3190
0
        return;
3191
0
    }
3192
3193
2.74k
    if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3194
1.37k
        const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3195
56.7k
        for (; size--; ++s, ++w) {
3196
55.4k
            *w = *s;
3197
55.4k
        }
3198
1.37k
    }
3199
0
    else {
3200
0
#if SIZEOF_WCHAR_T == 4
3201
0
        assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3202
0
        const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3203
0
        for (; size--; ++s, ++w) {
3204
0
            *w = *s;
3205
0
        }
3206
#else
3207
        assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3208
        const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3209
        for (; size--; ++s, ++w) {
3210
            Py_UCS4 ch = *s;
3211
            if (ch > 0xFFFF) {
3212
                assert(ch <= MAX_UNICODE);
3213
                /* encode surrogate pair in this case */
3214
                *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3215
                if (!size--)
3216
                    break;
3217
                *w = Py_UNICODE_LOW_SURROGATE(ch);
3218
            }
3219
            else {
3220
                *w = ch;
3221
            }
3222
        }
3223
#endif
3224
0
    }
3225
1.37k
}
3226
3227
#ifdef HAVE_WCHAR_H
3228
3229
/* Convert a Unicode object to a wide character string.
3230
3231
   - If w is NULL: return the number of wide characters (including the null
3232
     character) required to convert the unicode object. Ignore size argument.
3233
3234
   - Otherwise: return the number of wide characters (excluding the null
3235
     character) written into w. Write at most size wide characters (including
3236
     the null character). */
3237
Py_ssize_t
3238
PyUnicode_AsWideChar(PyObject *unicode,
3239
                     wchar_t *w,
3240
                     Py_ssize_t size)
3241
78
{
3242
78
    Py_ssize_t res;
3243
3244
78
    if (unicode == NULL) {
3245
0
        PyErr_BadInternalCall();
3246
0
        return -1;
3247
0
    }
3248
78
    if (!PyUnicode_Check(unicode)) {
3249
0
        PyErr_BadArgument();
3250
0
        return -1;
3251
0
    }
3252
3253
78
    res = unicode_get_widechar_size(unicode);
3254
78
    if (w == NULL) {
3255
0
        return res + 1;
3256
0
    }
3257
3258
78
    if (size > res) {
3259
78
        size = res + 1;
3260
78
    }
3261
0
    else {
3262
0
        res = size;
3263
0
    }
3264
78
    unicode_copy_as_widechar(unicode, w, size);
3265
3266
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3267
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
3268
       non-Unicode locales and hence needs conversion first. */
3269
    if (_Py_LocaleUsesNonUnicodeWchar()) {
3270
        if (_Py_EncodeNonUnicodeWchar_InPlace(w, size) < 0) {
3271
            return -1;
3272
        }
3273
    }
3274
#endif
3275
3276
78
    return res;
3277
78
}
3278
3279
wchar_t*
3280
PyUnicode_AsWideCharString(PyObject *unicode,
3281
                           Py_ssize_t *size)
3282
1.29k
{
3283
1.29k
    wchar_t *buffer;
3284
1.29k
    Py_ssize_t buflen;
3285
3286
1.29k
    if (unicode == NULL) {
3287
0
        PyErr_BadInternalCall();
3288
0
        return NULL;
3289
0
    }
3290
1.29k
    if (!PyUnicode_Check(unicode)) {
3291
0
        PyErr_BadArgument();
3292
0
        return NULL;
3293
0
    }
3294
3295
1.29k
    buflen = unicode_get_widechar_size(unicode);
3296
1.29k
    buffer = (wchar_t *) PyMem_New(wchar_t, (buflen + 1));
3297
1.29k
    if (buffer == NULL) {
3298
0
        PyErr_NoMemory();
3299
0
        return NULL;
3300
0
    }
3301
1.29k
    unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3302
3303
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3304
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
3305
       non-Unicode locales and hence needs conversion first. */
3306
    if (_Py_LocaleUsesNonUnicodeWchar()) {
3307
        if (_Py_EncodeNonUnicodeWchar_InPlace(buffer, (buflen + 1)) < 0) {
3308
            return NULL;
3309
        }
3310
    }
3311
#endif
3312
3313
1.29k
    if (size != NULL) {
3314
608
        *size = buflen;
3315
608
    }
3316
684
    else if (wcslen(buffer) != (size_t)buflen) {
3317
0
        PyMem_Free(buffer);
3318
0
        PyErr_SetString(PyExc_ValueError,
3319
0
                        "embedded null character");
3320
0
        return NULL;
3321
0
    }
3322
1.29k
    return buffer;
3323
1.29k
}
3324
3325
#endif /* HAVE_WCHAR_H */
3326
3327
int
3328
_PyUnicode_WideCharString_Converter(PyObject *obj, void *ptr)
3329
0
{
3330
0
    wchar_t **p = (wchar_t **)ptr;
3331
0
    if (obj == NULL) {
3332
0
        PyMem_Free(*p);
3333
0
        *p = NULL;
3334
0
        return 1;
3335
0
    }
3336
0
    if (PyUnicode_Check(obj)) {
3337
0
        *p = PyUnicode_AsWideCharString(obj, NULL);
3338
0
        if (*p == NULL) {
3339
0
            return 0;
3340
0
        }
3341
0
        return Py_CLEANUP_SUPPORTED;
3342
0
    }
3343
0
    PyErr_Format(PyExc_TypeError,
3344
0
                 "argument must be str, not %.50s",
3345
0
                 Py_TYPE(obj)->tp_name);
3346
0
    return 0;
3347
0
}
3348
3349
int
3350
_PyUnicode_WideCharString_Opt_Converter(PyObject *obj, void *ptr)
3351
0
{
3352
0
    wchar_t **p = (wchar_t **)ptr;
3353
0
    if (obj == NULL) {
3354
0
        PyMem_Free(*p);
3355
0
        *p = NULL;
3356
0
        return 1;
3357
0
    }
3358
0
    if (obj == Py_None) {
3359
0
        *p = NULL;
3360
0
        return 1;
3361
0
    }
3362
0
    if (PyUnicode_Check(obj)) {
3363
0
        *p = PyUnicode_AsWideCharString(obj, NULL);
3364
0
        if (*p == NULL) {
3365
0
            return 0;
3366
0
        }
3367
0
        return Py_CLEANUP_SUPPORTED;
3368
0
    }
3369
0
    PyErr_Format(PyExc_TypeError,
3370
0
                 "argument must be str or None, not %.50s",
3371
0
                 Py_TYPE(obj)->tp_name);
3372
0
    return 0;
3373
0
}
3374
3375
PyObject *
3376
PyUnicode_FromOrdinal(int ordinal)
3377
169k
{
3378
169k
    if (ordinal < 0 || ordinal > MAX_UNICODE) {
3379
0
        PyErr_SetString(PyExc_ValueError,
3380
0
                        "chr() arg not in range(0x110000)");
3381
0
        return NULL;
3382
0
    }
3383
3384
169k
    return unicode_char((Py_UCS4)ordinal);
3385
169k
}
3386
3387
/* Return `obj` as an exact str object.
 *
 * - exact str: a new reference to `obj` itself;
 * - str subclass instance: the result of _PyUnicode_Copy(), a true
 *   Unicode object with the same data;
 * - anything else: TypeError.
 *
 * XXX Perhaps we should make this API an alias of PyObject_Str()
 * instead ?!
 */
PyObject *
PyUnicode_FromObject(PyObject *obj)
{
    if (PyUnicode_CheckExact(obj)) {
        return Py_NewRef(obj);
    }

    if (!PyUnicode_Check(obj)) {
        PyErr_Format(PyExc_TypeError,
                     "Can't convert '%.100s' object to str implicitly",
                     Py_TYPE(obj)->tp_name);
        return NULL;
    }

    /* For a Unicode subtype that's not a Unicode object,
       return a true Unicode object with the same data. */
    return _PyUnicode_Copy(obj);
}
3405
3406
/* Decode the bytes-like object `obj` to a str object using `encoding` and
 * `errors`.
 *
 * bytes objects take a fast path; other objects are read through the
 * buffer protocol.  Passing a str raises TypeError.  An empty input gives
 * the empty string, after `encoding` and `errors` have been checked by
 * unicode_check_encoding_errors().
 *
 * Return NULL with an exception set on error.
 */
PyObject *
PyUnicode_FromEncodedObject(PyObject *obj,
                            const char *encoding,
                            const char *errors)
{
    if (obj == NULL) {
        PyErr_BadInternalCall();
        return NULL;
    }

    /* Decoding bytes objects is the most common case and should be fast */
    if (PyBytes_Check(obj)) {
        if (PyBytes_GET_SIZE(obj) != 0) {
            return PyUnicode_Decode(
                    PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
                    encoding, errors);
        }
        if (unicode_check_encoding_errors(encoding, errors) < 0) {
            return NULL;
        }
        _Py_RETURN_UNICODE_EMPTY();
    }

    if (PyUnicode_Check(obj)) {
        PyErr_SetString(PyExc_TypeError,
                        "decoding str is not supported");
        return NULL;
    }

    /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
    Py_buffer view;
    if (PyObject_GetBuffer(obj, &view, PyBUF_SIMPLE) < 0) {
        PyErr_Format(PyExc_TypeError,
                     "decoding to str: need a bytes-like object, %.80s found",
                     Py_TYPE(obj)->tp_name);
        return NULL;
    }

    if (view.len == 0) {
        PyBuffer_Release(&view);
        if (unicode_check_encoding_errors(encoding, errors) < 0) {
            return NULL;
        }
        _Py_RETURN_UNICODE_EMPTY();
    }

    PyObject *decoded = PyUnicode_Decode((char*) view.buf, view.len,
                                         encoding, errors);
    PyBuffer_Release(&view);
    return decoded;
}
3458
3459
/* Normalize an encoding name like encodings.normalize_encoding(),
   optionally converting it to lowercase when *to_lower* is true.

   Alphanumeric characters and '.' are copied to *lower*; every run of
   other characters becomes a single '_', except at the very beginning
   (dropped) and at the end (dropped).

   Return 1 on success, or 0 on error (the normalized name does not fit in
   lower_len-1 characters). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len,
                       int to_lower)
{
    assert(encoding != NULL);

    char *out = lower;
    /* Last usable slot: it is reserved for the terminating null. */
    char *const out_end = &lower[lower_len - 1];
    int pending_sep = 0;

    for (const char *src = encoding; *src != 0; src++) {
        const char c = *src;

        if (!(Py_ISALNUM(c) || c == '.')) {
            /* Punctuation: remember it, emit '_' lazily. */
            pending_sep = 1;
            continue;
        }

        if (pending_sep && out != lower) {
            if (out == out_end) {
                return 0;
            }
            *out++ = '_';
        }
        pending_sep = 0;

        if (out == out_end) {
            return 0;
        }
        *out++ = to_lower ? Py_TOLOWER(c) : c;
    }

    *out = '\0';
    return 1;
}
3508
3509
PyObject *
3510
PyUnicode_Decode(const char *s,
3511
                 Py_ssize_t size,
3512
                 const char *encoding,
3513
                 const char *errors)
3514
21.0k
{
3515
21.0k
    PyObject *buffer = NULL, *unicode;
3516
21.0k
    Py_buffer info;
3517
21.0k
    char buflower[11];   /* strlen("iso-8859-1\0") == 11, longest shortcut */
3518
3519
21.0k
    if (unicode_check_encoding_errors(encoding, errors) < 0) {
3520
0
        return NULL;
3521
0
    }
3522
3523
21.0k
    if (size == 0) {
3524
1
        _Py_RETURN_UNICODE_EMPTY();
3525
1
    }
3526
3527
21.0k
    if (encoding == NULL) {
3528
0
        return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3529
0
    }
3530
3531
    /* Shortcuts for common default encodings */
3532
21.0k
    if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower), 1)) {
3533
21.0k
        char *lower = buflower;
3534
3535
        /* Fast paths */
3536
21.0k
        if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3537
12.2k
            lower += 3;
3538
12.2k
            if (*lower == '_') {
3539
                /* Match "utf8" and "utf_8" */
3540
12.1k
                lower++;
3541
12.1k
            }
3542
3543
12.2k
            if (lower[0] == '8' && lower[1] == 0) {
3544
10.5k
                return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3545
10.5k
            }
3546
1.65k
            else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3547
77
                return PyUnicode_DecodeUTF16(s, size, errors, 0);
3548
77
            }
3549
1.57k
            else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3550
14
                return PyUnicode_DecodeUTF32(s, size, errors, 0);
3551
14
            }
3552
12.2k
        }
3553
8.85k
        else {
3554
8.85k
            if (strcmp(lower, "ascii") == 0
3555
4.85k
                || strcmp(lower, "us_ascii") == 0) {
3556
4.85k
                return PyUnicode_DecodeASCII(s, size, errors);
3557
4.85k
            }
3558
    #ifdef MS_WINDOWS
3559
            else if (strcmp(lower, "mbcs") == 0) {
3560
                return PyUnicode_DecodeMBCS(s, size, errors);
3561
            }
3562
    #endif
3563
4.00k
            else if (strcmp(lower, "latin1") == 0
3564
3.97k
                     || strcmp(lower, "latin_1") == 0
3565
3.96k
                     || strcmp(lower, "iso_8859_1") == 0
3566
3.95k
                     || strcmp(lower, "iso8859_1") == 0) {
3567
48
                return PyUnicode_DecodeLatin1(s, size, errors);
3568
48
            }
3569
8.85k
        }
3570
21.0k
    }
3571
3572
    /* Decode via the codec registry */
3573
5.55k
    buffer = NULL;
3574
5.55k
    if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
3575
0
        goto onError;
3576
5.55k
    buffer = PyMemoryView_FromBuffer(&info);
3577
5.55k
    if (buffer == NULL)
3578
0
        goto onError;
3579
5.55k
    unicode = _PyCodec_DecodeText(buffer, encoding, errors);
3580
5.55k
    if (unicode == NULL)
3581
1.60k
        goto onError;
3582
3.95k
    if (!PyUnicode_Check(unicode)) {
3583
0
        PyErr_Format(PyExc_TypeError,
3584
0
                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
3585
0
                     "use codecs.decode() to decode to arbitrary types",
3586
0
                     encoding,
3587
0
                     Py_TYPE(unicode)->tp_name);
3588
0
        Py_DECREF(unicode);
3589
0
        goto onError;
3590
0
    }
3591
3.95k
    Py_DECREF(buffer);
3592
3.95k
    return unicode_result(unicode);
3593
3594
1.60k
  onError:
3595
1.60k
    Py_XDECREF(buffer);
3596
1.60k
    return NULL;
3597
3.95k
}
3598
3599
/* Run the str object `unicode` through the decoder of the codec named
   `encoding` (the default encoding when NULL) with error handler `errors`.

   The result is whatever the codec returns; its type is not checked here.
   Returns NULL with an exception set if `unicode` is not a str. */
PyAPI_FUNC(PyObject *)
PyUnicode_AsDecodedObject(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    if (PyUnicode_Check(unicode)) {
        const char *codec_name = encoding;
        if (codec_name == NULL) {
            codec_name = PyUnicode_GetDefaultEncoding();
        }
        /* Decode via the codec registry */
        return PyCodec_Decode(unicode, codec_name, errors);
    }

    PyErr_BadArgument();
    return NULL;
}
3615
3616
/* Run the str object `unicode` through the decoder of the codec named
   `encoding` (the default encoding when NULL) with error handler `errors`,
   and require the result to be a str.

   Returns the decoded str, or NULL with an exception set when `unicode`
   is not a str, the codec fails, or the codec returns a non-str object
   (TypeError). */
PyAPI_FUNC(PyObject *)
PyUnicode_AsDecodedUnicode(PyObject *unicode,
                           const char *encoding,
                           const char *errors)
{
    PyObject *v;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (encoding == NULL)
        encoding = PyUnicode_GetDefaultEncoding();

    /* Decode via the codec registry */
    v = PyCodec_Decode(unicode, encoding, errors);
    if (v == NULL)
        return NULL;

    if (!PyUnicode_Check(v)) {
        /* The message must name the type of the decoder's result `v`;
           `unicode` is the input, which is known to be a str here. */
        PyErr_Format(PyExc_TypeError,
                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
                     "use codecs.decode() to decode to arbitrary types",
                     encoding,
                     Py_TYPE(v)->tp_name);
        Py_DECREF(v);
        return NULL;
    }
    return unicode_result(v);
}
3649
3650
/* Run the str object `unicode` through the encoder of the codec named
   `encoding` (the default encoding when NULL) with error handler `errors`.

   The codec's result is returned as-is, without a type check.
   Returns NULL with an exception set if `unicode` is not a str. */
PyAPI_FUNC(PyObject *)
PyUnicode_AsEncodedObject(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    const char *codec_name =
        (encoding != NULL) ? encoding : PyUnicode_GetDefaultEncoding();

    /* Encode via the codec registry; NULL from the codec is passed on. */
    return PyCodec_Encode(unicode, codec_name, errors);
}
3674
3675
3676
static PyObject *
3677
unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
3678
                      int current_locale)
3679
0
{
3680
0
    Py_ssize_t wlen;
3681
0
    wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3682
0
    if (wstr == NULL) {
3683
0
        return NULL;
3684
0
    }
3685
3686
0
    if ((size_t)wlen != wcslen(wstr)) {
3687
0
        PyErr_SetString(PyExc_ValueError, "embedded null character");
3688
0
        PyMem_Free(wstr);
3689
0
        return NULL;
3690
0
    }
3691
3692
0
    char *str;
3693
0
    size_t error_pos;
3694
0
    const char *reason;
3695
0
    int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
3696
0
                                 current_locale, error_handler);
3697
0
    PyMem_Free(wstr);
3698
3699
0
    if (res != 0) {
3700
0
        if (res == -2) {
3701
0
            PyObject *exc;
3702
0
            exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3703
0
                    "locale", unicode,
3704
0
                    (Py_ssize_t)error_pos,
3705
0
                    (Py_ssize_t)(error_pos+1),
3706
0
                    reason);
3707
0
            if (exc != NULL) {
3708
0
                PyCodec_StrictErrors(exc);
3709
0
                Py_DECREF(exc);
3710
0
            }
3711
0
        }
3712
0
        else if (res == -3) {
3713
0
            PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3714
0
        }
3715
0
        else {
3716
0
            PyErr_NoMemory();
3717
0
        }
3718
0
        return NULL;
3719
0
    }
3720
3721
0
    PyObject *bytes = PyBytes_FromString(str);
3722
0
    PyMem_RawFree(str);
3723
0
    return bytes;
3724
0
}
3725
3726
/* Encode `unicode` with unicode_encode_locale(), passing current_locale=1
   and the error handler that `errors` names. */
PyObject *
PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
{
    return unicode_encode_locale(unicode, _Py_GetErrorHandler(errors), 1);
}
3732
3733
/* Encode `unicode` to bytes using the interpreter's filesystem codec.

   Three cases, tried in order:
     1. the filesystem codec is UTF-8: use the UTF-8 encoder directly;
     2. a filesystem encoding name is configured (only compiled in when
        _Py_FORCE_UTF8_FS_ENCODING is not defined): use that codec;
     3. otherwise encode without the codec machinery (see below). */
PyObject *
PyUnicode_EncodeFSDefault(PyObject *unicode)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    struct _Py_unicode_fs_codec *codec = &interp->unicode.fs_codec;

    if (codec->utf8) {
        return unicode_encode_utf8(unicode,
                                   codec->error_handler,
                                   codec->errors);
    }

#ifndef _Py_FORCE_UTF8_FS_ENCODING
    if (codec->encoding) {
        return PyUnicode_AsEncodedString(unicode,
                                         codec->encoding,
                                         codec->errors);
    }
#endif

    /* Before _PyUnicode_InitEncodings() is called, the Python codec
       machinery is not ready and so cannot be used:
       use wcstombs() in this case. */
    const PyConfig *config = _PyInterpreterState_GetConfig(interp);
    const wchar_t *fs_errors = config->filesystem_errors;
    assert(fs_errors != NULL);

    _Py_error_handler handler = get_error_handler_wide(fs_errors);
    assert(handler != _Py_ERROR_UNKNOWN);

#ifdef _Py_FORCE_UTF8_FS_ENCODING
    return unicode_encode_utf8(unicode, handler, NULL);
#else
    return unicode_encode_locale(unicode, handler, 0);
#endif
}
3766
3767
/* Encode the str object `unicode` to bytes using the codec named
   `encoding` and the error handler named `errors`.

   A NULL `encoding` selects UTF-8.  A handful of common encoding names
   are dispatched directly to their C encoders; everything else goes
   through the codec registry.  A codec that returns a bytearray triggers
   a RuntimeWarning and its result is copied into a bytes object; any
   other non-bytes result is a TypeError.

   Returns a bytes object, or NULL on failure. */
PyObject *
PyUnicode_AsEncodedString(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    /* Holds the normalized encoding name.  11 bytes is enough for the
       longest shortcut name ("iso_8859_1", 10 characters) plus the NUL. */
    char normalized[11];
    PyObject *encoded;
    PyObject *copy;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (unicode_check_encoding_errors(encoding, errors) < 0) {
        return NULL;
    }
    if (encoding == NULL) {
        return _PyUnicode_AsUTF8String(unicode, errors);
    }

    /* Fast paths for common default encodings.  They are only considered
       when the normalized name fits into the small buffer. */
    if (_Py_normalize_encoding(encoding, normalized, sizeof(normalized), 1)) {
        if (strncmp(normalized, "utf", 3) == 0) {
            const char *tail = normalized + 3;
            if (*tail == '_') {
                /* Accept both "utf8" and "utf_8" spellings. */
                tail++;
            }

            if (strcmp(tail, "8") == 0) {
                return _PyUnicode_AsUTF8String(unicode, errors);
            }
            if (strcmp(tail, "16") == 0) {
                return _PyUnicode_EncodeUTF16(unicode, errors, 0);
            }
            if (strcmp(tail, "32") == 0) {
                return _PyUnicode_EncodeUTF32(unicode, errors, 0);
            }
            /* Any other "utf*" name is left to the codec registry. */
        }
        else if (strcmp(normalized, "ascii") == 0
                 || strcmp(normalized, "us_ascii") == 0) {
            return _PyUnicode_AsASCIIString(unicode, errors);
        }
#ifdef MS_WINDOWS
        else if (strcmp(normalized, "mbcs") == 0) {
            return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
        }
#endif
        else if (strcmp(normalized, "latin1") == 0
                 || strcmp(normalized, "latin_1") == 0
                 || strcmp(normalized, "iso_8859_1") == 0
                 || strcmp(normalized, "iso8859_1") == 0) {
            return _PyUnicode_AsLatin1String(unicode, errors);
        }
    }

    /* Slow path: encode via the codec registry. */
    encoded = _PyCodec_EncodeText(unicode, encoding, errors);
    if (encoded == NULL) {
        return NULL;
    }

    /* The normal case: the codec produced bytes. */
    if (PyBytes_Check(encoded)) {
        return encoded;
    }

    if (!PyByteArray_Check(encoded)) {
        /* Neither bytes nor bytearray: reject the result. */
        PyErr_Format(PyExc_TypeError,
                     "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
                     "use codecs.encode() to encode to arbitrary types",
                     encoding,
                     Py_TYPE(encoded)->tp_name);
        Py_DECREF(encoded);
        return NULL;
    }

    /* The codec returned a bytearray: warn, then convert it to bytes.
       A non-zero return from the warning call aborts the conversion. */
    if (PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
            "encoder %s returned bytearray instead of bytes; "
            "use codecs.encode() to encode to arbitrary types",
            encoding)) {
        Py_DECREF(encoded);
        return NULL;
    }

    copy = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(encoded),
                                     PyByteArray_GET_SIZE(encoded));
    Py_DECREF(encoded);
    return copy;
}
3866
3867
/* Run the str object `unicode` through the encoder of the codec named
   `encoding` (the default encoding when NULL) with error handler `errors`,
   and require the result to be a str.

   Returns the codec's str result, or NULL with an exception set when
   `unicode` is not a str, the codec fails, or the codec returns a
   non-str object (TypeError). */
PyAPI_FUNC(PyObject *)
PyUnicode_AsEncodedUnicode(PyObject *unicode,
                           const char *encoding,
                           const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    const char *codec_name =
        (encoding != NULL) ? encoding : PyUnicode_GetDefaultEncoding();

    /* Encode via the codec registry */
    PyObject *result = PyCodec_Encode(unicode, codec_name, errors);
    if (result == NULL) {
        return NULL;
    }
    if (PyUnicode_Check(result)) {
        return result;
    }

    PyErr_Format(PyExc_TypeError,
                 "'%.400s' encoder returned '%.400s' instead of 'str'; "
                 "use codecs.encode() to encode to arbitrary types",
                 codec_name,
                 Py_TYPE(result)->tp_name);
    Py_DECREF(result);
    return NULL;
}
3900
3901
static PyObject*
3902
unicode_decode_locale(const char *str, Py_ssize_t len,
3903
                      _Py_error_handler errors, int current_locale)
3904
4.28k
{
3905
4.28k
    if (str[len] != '\0' || (size_t)len != strlen(str))  {
3906
0
        PyErr_SetString(PyExc_ValueError, "embedded null byte");
3907
0
        return NULL;
3908
0
    }
3909
3910
4.28k
    wchar_t *wstr;
3911
4.28k
    size_t wlen;
3912
4.28k
    const char *reason;
3913
4.28k
    int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
3914
4.28k
                                 current_locale, errors);
3915
4.28k
    if (res != 0) {
3916
0
        if (res == -2) {
3917
0
            PyObject *exc;
3918
0
            exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3919
0
                                        "locale", str, len,
3920
0
                                        (Py_ssize_t)wlen,
3921
0
                                        (Py_ssize_t)(wlen + 1),
3922
0
                                        reason);
3923
0
            if (exc != NULL) {
3924
0
                PyCodec_StrictErrors(exc);
3925
0
                Py_DECREF(exc);
3926
0
            }
3927
0
        }
3928
0
        else if (res == -3) {
3929
0
            PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3930
0
        }
3931
0
        else {
3932
0
            PyErr_NoMemory();
3933
0
        }
3934
0
        return NULL;
3935
0
    }
3936
3937
4.28k
    PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3938
4.28k
    PyMem_RawFree(wstr);
3939
4.28k
    return unicode;
3940
4.28k
}
3941
3942
PyObject*
3943
PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3944
                              const char *errors)
3945
0
{
3946
0
    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3947
0
    return unicode_decode_locale(str, len, error_handler, 1);
3948
0
}
3949
3950
PyObject*
3951
PyUnicode_DecodeLocale(const char *str, const char *errors)
3952
4.26k
{
3953
4.26k
    Py_ssize_t size = (Py_ssize_t)strlen(str);
3954
4.26k
    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3955
4.26k
    return unicode_decode_locale(str, size, error_handler, 1);
3956
4.26k
}
3957
3958
3959
PyObject*
3960
15.1k
PyUnicode_DecodeFSDefault(const char *s) {
3961
15.1k
    Py_ssize_t size = (Py_ssize_t)strlen(s);
3962
15.1k
    return PyUnicode_DecodeFSDefaultAndSize(s, size);
3963
15.1k
}
3964
3965
PyObject*
3966
PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3967
20.7k
{
3968
20.7k
    PyInterpreterState *interp = _PyInterpreterState_GET();
3969
20.7k
    struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3970
20.7k
    if (fs_codec->utf8) {
3971
20.6k
        return unicode_decode_utf8(s, size,
3972
20.6k
                                   fs_codec->error_handler,
3973
20.6k
                                   fs_codec->errors,
3974
20.6k
                                   NULL);
3975
20.6k
    }
3976
19
#ifndef _Py_FORCE_UTF8_FS_ENCODING
3977
19
    else if (fs_codec->encoding) {
3978
0
        return PyUnicode_Decode(s, size,
3979
0
                                fs_codec->encoding,
3980
0
                                fs_codec->errors);
3981
0
    }
3982
19
#endif
3983
19
    else {
3984
        /* Before _PyUnicode_InitEncodings() is called, the Python codec
3985
           machinery is not ready and so cannot be used:
3986
           use mbstowcs() in this case. */
3987
19
        const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3988
19
        const wchar_t *filesystem_errors = config->filesystem_errors;
3989
19
        assert(filesystem_errors != NULL);
3990
19
        _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3991
19
        assert(errors != _Py_ERROR_UNKNOWN);
3992
#ifdef _Py_FORCE_UTF8_FS_ENCODING
3993
        return unicode_decode_utf8(s, size, errors, NULL, NULL);
3994
#else
3995
19
        return unicode_decode_locale(s, size, errors, 0);
3996
19
#endif
3997
19
    }
3998
20.7k
}
3999
4000
4001
int
4002
PyUnicode_FSConverter(PyObject* arg, void* addr)
4003
3.68k
{
4004
3.68k
    PyObject *path = NULL;
4005
3.68k
    PyObject *output = NULL;
4006
3.68k
    Py_ssize_t size;
4007
3.68k
    const char *data;
4008
3.68k
    if (arg == NULL) {
4009
0
        Py_DECREF(*(PyObject**)addr);
4010
0
        *(PyObject**)addr = NULL;
4011
0
        return 1;
4012
0
    }
4013
3.68k
    path = PyOS_FSPath(arg);
4014
3.68k
    if (path == NULL) {
4015
0
        return 0;
4016
0
    }
4017
3.68k
    if (PyBytes_Check(path)) {
4018
0
        output = path;
4019
0
    }
4020
3.68k
    else {  // PyOS_FSPath() guarantees its returned value is bytes or str.
4021
3.68k
        output = PyUnicode_EncodeFSDefault(path);
4022
3.68k
        Py_DECREF(path);
4023
3.68k
        if (!output) {
4024
0
            return 0;
4025
0
        }
4026
3.68k
        assert(PyBytes_Check(output));
4027
3.68k
    }
4028
4029
3.68k
    size = PyBytes_GET_SIZE(output);
4030
3.68k
    data = PyBytes_AS_STRING(output);
4031
3.68k
    if ((size_t)size != strlen(data)) {
4032
0
        PyErr_SetString(PyExc_ValueError, "embedded null byte");
4033
0
        Py_DECREF(output);
4034
0
        return 0;
4035
0
    }
4036
3.68k
    *(PyObject**)addr = output;
4037
3.68k
    return Py_CLEANUP_SUPPORTED;
4038
3.68k
}
4039
4040
4041
int
4042
PyUnicode_FSDecoder(PyObject* arg, void* addr)
4043
20
{
4044
20
    if (arg == NULL) {
4045
0
        Py_DECREF(*(PyObject**)addr);
4046
0
        *(PyObject**)addr = NULL;
4047
0
        return 1;
4048
0
    }
4049
4050
20
    PyObject *path = PyOS_FSPath(arg);
4051
20
    if (path == NULL) {
4052
0
        return 0;
4053
0
    }
4054
4055
20
    PyObject *output = NULL;
4056
20
    if (PyUnicode_Check(path)) {
4057
20
        output = path;
4058
20
    }
4059
0
    else if (PyBytes_Check(path)) {
4060
0
        output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path),
4061
0
                                                  PyBytes_GET_SIZE(path));
4062
0
        Py_DECREF(path);
4063
0
        if (!output) {
4064
0
            return 0;
4065
0
        }
4066
0
    }
4067
0
    else {
4068
0
        PyErr_Format(PyExc_TypeError,
4069
0
                     "path should be string, bytes, or os.PathLike, not %.200s",
4070
0
                     Py_TYPE(arg)->tp_name);
4071
0
        Py_DECREF(path);
4072
0
        return 0;
4073
0
    }
4074
4075
20
    if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
4076
20
                 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
4077
0
        PyErr_SetString(PyExc_ValueError, "embedded null character");
4078
0
        Py_DECREF(output);
4079
0
        return 0;
4080
0
    }
4081
20
    *(PyObject**)addr = output;
4082
20
    return Py_CLEANUP_SUPPORTED;
4083
20
}
4084
4085
4086
static int unicode_fill_utf8(PyObject *unicode);
4087
4088
4089
static int
4090
unicode_ensure_utf8(PyObject *unicode)
4091
273k
{
4092
273k
    int err = 0;
4093
273k
    if (PyUnicode_UTF8(unicode) == NULL) {
4094
2.75k
        Py_BEGIN_CRITICAL_SECTION(unicode);
4095
2.75k
        if (PyUnicode_UTF8(unicode) == NULL) {
4096
2.75k
            err = unicode_fill_utf8(unicode);
4097
2.75k
        }
4098
2.75k
        Py_END_CRITICAL_SECTION();
4099
2.75k
    }
4100
273k
    return err;
4101
273k
}
4102
4103
const char *
4104
PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
4105
273k
{
4106
273k
    if (!PyUnicode_Check(unicode)) {
4107
0
        PyErr_BadArgument();
4108
0
        if (psize) {
4109
0
            *psize = -1;
4110
0
        }
4111
0
        return NULL;
4112
0
    }
4113
4114
273k
    if (unicode_ensure_utf8(unicode) == -1) {
4115
0
        if (psize) {
4116
0
            *psize = -1;
4117
0
        }
4118
0
        return NULL;
4119
0
    }
4120
4121
273k
    if (psize) {
4122
115k
        *psize = PyUnicode_UTF8_LENGTH(unicode);
4123
115k
    }
4124
273k
    return PyUnicode_UTF8(unicode);
4125
273k
}
4126
4127
const char *
4128
PyUnicode_AsUTF8(PyObject *unicode)
4129
157k
{
4130
157k
    return PyUnicode_AsUTF8AndSize(unicode, NULL);
4131
157k
}
4132
4133
const char *
4134
_PyUnicode_AsUTF8NoNUL(PyObject *unicode)
4135
11.8k
{
4136
11.8k
    Py_ssize_t size;
4137
11.8k
    const char *s = PyUnicode_AsUTF8AndSize(unicode, &size);
4138
11.8k
    if (s && strlen(s) != (size_t)size) {
4139
0
        PyErr_SetString(PyExc_ValueError, "embedded null character");
4140
0
        return NULL;
4141
0
    }
4142
11.8k
    return s;
4143
11.8k
}
4144
4145
/*
4146
PyUnicode_GetSize() has been deprecated since Python 3.3
4147
because it returned length of Py_UNICODE.
4148
4149
But this function is part of stable abi, because it doesn't
4150
include Py_UNICODE in signature and it was not excluded from
4151
stable ABI in PEP 384.
4152
*/
4153
PyAPI_FUNC(Py_ssize_t)
4154
PyUnicode_GetSize(PyObject *unicode)
4155
0
{
4156
0
    PyErr_SetString(PyExc_RuntimeError,
4157
0
                    "PyUnicode_GetSize has been removed.");
4158
0
    return -1;
4159
0
}
4160
4161
Py_ssize_t
4162
PyUnicode_GetLength(PyObject *unicode)
4163
559
{
4164
559
    if (!PyUnicode_Check(unicode)) {
4165
0
        PyErr_BadArgument();
4166
0
        return -1;
4167
0
    }
4168
559
    return PyUnicode_GET_LENGTH(unicode);
4169
559
}
4170
4171
Py_UCS4
4172
PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4173
7
{
4174
7
    const void *data;
4175
7
    int kind;
4176
4177
7
    if (!PyUnicode_Check(unicode)) {
4178
0
        PyErr_BadArgument();
4179
0
        return (Py_UCS4)-1;
4180
0
    }
4181
7
    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4182
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
4183
0
        return (Py_UCS4)-1;
4184
0
    }
4185
7
    data = PyUnicode_DATA(unicode);
4186
7
    kind = PyUnicode_KIND(unicode);
4187
7
    return PyUnicode_READ(kind, data, index);
4188
7
}
4189
4190
int
4191
PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4192
0
{
4193
0
    if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
4194
0
        PyErr_BadArgument();
4195
0
        return -1;
4196
0
    }
4197
0
    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4198
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
4199
0
        return -1;
4200
0
    }
4201
0
    if (unicode_check_modifiable(unicode))
4202
0
        return -1;
4203
0
    if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4204
0
        PyErr_SetString(PyExc_ValueError, "character out of range");
4205
0
        return -1;
4206
0
    }
4207
0
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4208
0
                    index, ch);
4209
0
    return 0;
4210
0
}
4211
4212
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    const char *default_encoding = "utf-8";
    return default_encoding;
}
4217
4218
/* create or adjust a UnicodeDecodeError */
4219
static void
4220
make_decode_exception(PyObject **exceptionObject,
4221
                      const char *encoding,
4222
                      const char *input, Py_ssize_t length,
4223
                      Py_ssize_t startpos, Py_ssize_t endpos,
4224
                      const char *reason)
4225
121k
{
4226
121k
    if (*exceptionObject == NULL) {
4227
4.29k
        *exceptionObject = PyUnicodeDecodeError_Create(
4228
4.29k
            encoding, input, length, startpos, endpos, reason);
4229
4.29k
    }
4230
117k
    else {
4231
117k
        if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4232
0
            goto onError;
4233
117k
        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4234
0
            goto onError;
4235
117k
        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4236
0
            goto onError;
4237
117k
    }
4238
121k
    return;
4239
4240
121k
onError:
4241
0
    Py_CLEAR(*exceptionObject);
4242
0
}
4243
4244
#ifdef MS_WINDOWS
4245
static int
4246
widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4247
{
4248
    if (newsize > *size) {
4249
        wchar_t *newbuf = *buf;
4250
        if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4251
            PyErr_NoMemory();
4252
            return -1;
4253
        }
4254
        *buf = newbuf;
4255
    }
4256
    *size = newsize;
4257
    return 0;
4258
}
4259
4260
/* error handling callback helper:
4261
   build arguments, call the callback and check the arguments,
4262
   if no exception occurred, copy the replacement to the output
4263
   and adjust various state variables.
4264
   return 0 on success, -1 on error
4265
*/
4266
4267
static int
4268
unicode_decode_call_errorhandler_wchar(
4269
    const char *errors, PyObject **errorHandler,
4270
    const char *encoding, const char *reason,
4271
    const char **input, const char **inend, Py_ssize_t *startinpos,
4272
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4273
    wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
4274
{
4275
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4276
4277
    PyObject *restuple = NULL;
4278
    PyObject *repunicode = NULL;
4279
    Py_ssize_t outsize;
4280
    Py_ssize_t insize;
4281
    Py_ssize_t requiredsize;
4282
    Py_ssize_t newpos;
4283
    PyObject *inputobj = NULL;
4284
    Py_ssize_t repwlen;
4285
4286
    if (*errorHandler == NULL) {
4287
        *errorHandler = PyCodec_LookupError(errors);
4288
        if (*errorHandler == NULL)
4289
            goto onError;
4290
    }
4291
4292
    make_decode_exception(exceptionObject,
4293
        encoding,
4294
        *input, *inend - *input,
4295
        *startinpos, *endinpos,
4296
        reason);
4297
    if (*exceptionObject == NULL)
4298
        goto onError;
4299
4300
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
4301
    if (restuple == NULL)
4302
        goto onError;
4303
    if (!PyTuple_Check(restuple)) {
4304
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
4305
        goto onError;
4306
    }
4307
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4308
        goto onError;
4309
4310
    /* Copy back the bytes variables, which might have been modified by the
4311
       callback */
4312
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4313
    if (!inputobj)
4314
        goto onError;
4315
    *input = PyBytes_AS_STRING(inputobj);
4316
    insize = PyBytes_GET_SIZE(inputobj);
4317
    *inend = *input + insize;
4318
    /* we can DECREF safely, as the exception has another reference,
4319
       so the object won't go away. */
4320
    Py_DECREF(inputobj);
4321
4322
    if (newpos<0)
4323
        newpos = insize+newpos;
4324
    if (newpos<0 || newpos>insize) {
4325
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4326
        goto onError;
4327
    }
4328
4329
    repwlen = PyUnicode_AsWideChar(repunicode, NULL, 0);
4330
    if (repwlen < 0)
4331
        goto onError;
4332
    repwlen--;
4333
    /* need more space? (at least enough for what we
4334
       have+the replacement+the rest of the string (starting
4335
       at the new input position), so we won't have to check space
4336
       when there are no errors in the rest of the string) */
4337
    requiredsize = *outpos;
4338
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4339
        goto overflow;
4340
    requiredsize += repwlen;
4341
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4342
        goto overflow;
4343
    requiredsize += insize - newpos;
4344
    outsize = *bufsize;
4345
    if (requiredsize > outsize) {
4346
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
4347
            requiredsize = 2*outsize;
4348
        if (widechar_resize(buf, bufsize, requiredsize) < 0) {
4349
            goto onError;
4350
        }
4351
    }
4352
    PyUnicode_AsWideChar(repunicode, *buf + *outpos, repwlen);
4353
    *outpos += repwlen;
4354
    *endinpos = newpos;
4355
    *inptr = *input + newpos;
4356
4357
    /* we made it! */
4358
    Py_DECREF(restuple);
4359
    return 0;
4360
4361
  overflow:
4362
    PyErr_SetString(PyExc_OverflowError,
4363
                    "decoded result is too long for a Python string");
4364
4365
  onError:
4366
    Py_XDECREF(restuple);
4367
    return -1;
4368
}
4369
#endif   /* MS_WINDOWS */
4370
4371
static int
4372
unicode_decode_call_errorhandler_writer(
4373
    const char *errors, PyObject **errorHandler,
4374
    const char *encoding, const char *reason,
4375
    const char **input, const char **inend, Py_ssize_t *startinpos,
4376
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4377
    _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4378
121k
{
4379
121k
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4380
4381
121k
    PyObject *restuple = NULL;
4382
121k
    PyObject *repunicode = NULL;
4383
121k
    Py_ssize_t insize;
4384
121k
    Py_ssize_t newpos;
4385
121k
    Py_ssize_t replen;
4386
121k
    Py_ssize_t remain;
4387
121k
    PyObject *inputobj = NULL;
4388
121k
    int need_to_grow = 0;
4389
121k
    const char *new_inptr;
4390
4391
121k
    if (*errorHandler == NULL) {
4392
4.29k
        *errorHandler = PyCodec_LookupError(errors);
4393
4.29k
        if (*errorHandler == NULL)
4394
0
            goto onError;
4395
4.29k
    }
4396
4397
121k
    make_decode_exception(exceptionObject,
4398
121k
        encoding,
4399
121k
        *input, *inend - *input,
4400
121k
        *startinpos, *endinpos,
4401
121k
        reason);
4402
121k
    if (*exceptionObject == NULL)
4403
0
        goto onError;
4404
4405
121k
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
4406
121k
    if (restuple == NULL)
4407
3.87k
        goto onError;
4408
117k
    if (!PyTuple_Check(restuple)) {
4409
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
4410
0
        goto onError;
4411
0
    }
4412
117k
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4413
0
        goto onError;
4414
4415
    /* Copy back the bytes variables, which might have been modified by the
4416
       callback */
4417
117k
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4418
117k
    if (!inputobj)
4419
0
        goto onError;
4420
117k
    remain = *inend - *input - *endinpos;
4421
117k
    *input = PyBytes_AS_STRING(inputobj);
4422
117k
    insize = PyBytes_GET_SIZE(inputobj);
4423
117k
    *inend = *input + insize;
4424
    /* we can DECREF safely, as the exception has another reference,
4425
       so the object won't go away. */
4426
117k
    Py_DECREF(inputobj);
4427
4428
117k
    if (newpos<0)
4429
0
        newpos = insize+newpos;
4430
117k
    if (newpos<0 || newpos>insize) {
4431
0
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4432
0
        goto onError;
4433
0
    }
4434
4435
117k
    replen = PyUnicode_GET_LENGTH(repunicode);
4436
117k
    if (replen > 1) {
4437
0
        writer->min_length += replen - 1;
4438
0
        need_to_grow = 1;
4439
0
    }
4440
117k
    new_inptr = *input + newpos;
4441
117k
    if (*inend - new_inptr > remain) {
4442
        /* We don't know the decoding algorithm here so we make the worst
4443
           assumption that one byte decodes to one unicode character.
4444
           If unfortunately one byte could decode to more unicode characters,
4445
           the decoder may write out-of-bound then.  Is it possible for the
4446
           algorithms using this function? */
4447
63
        writer->min_length += *inend - new_inptr - remain;
4448
63
        need_to_grow = 1;
4449
63
    }
4450
117k
    if (need_to_grow) {
4451
63
        writer->overallocate = 1;
4452
63
        if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
4453
63
                            PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4454
0
            goto onError;
4455
63
    }
4456
117k
    if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
4457
0
        goto onError;
4458
4459
117k
    *endinpos = newpos;
4460
117k
    *inptr = new_inptr;
4461
4462
    /* we made it! */
4463
117k
    Py_DECREF(restuple);
4464
117k
    return 0;
4465
4466
3.87k
  onError:
4467
3.87k
    Py_XDECREF(restuple);
4468
3.87k
    return -1;
4469
117k
}
4470
4471
/* --- UTF-7 Codec -------------------------------------------------------- */
4472
4473
/* See RFC2152 for details.  We encode conservatively and decode liberally. */
4474
4475
/* Base-64 helpers used by the UTF-7 codec.  Each macro may evaluate its
   argument more than once, so pass only side-effect-free expressions. */

/* True when c belongs to the base-64 alphabet (A-Z, a-z, 0-9, '+', '/'). */
#define IS_BASE64(c)                        \
    (((c) >= 'A' && (c) <= 'Z')             \
     || ((c) >= 'a' && (c) <= 'z')          \
     || ((c) >= '0' && (c) <= '9')          \
     || (c) == '+'                          \
     || (c) == '/')

/* Map a base-64 character to its 6-bit value.  The result is only
   meaningful when IS_BASE64(c) holds: any other input yields 63. */
#define FROM_BASE64(c)                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A'             \
     : ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26      \
     : ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52      \
     : (c) == '+' ? 62                                  \
     : 63)

/* Base-64 character encoding the low 6 bits of n. */
#define TO_BASE64(n)                        \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"           \
     "abcdefghijklmnopqrstuvwxyz"           \
     "0123456789+/"[(n) & 0x3f])
4497
4498
/* DECODE_DIRECT: true when a byte found in UTF-7 input outside a shift
 * sequence stands for itself.  Decoding is deliberately permissive:
 * every ASCII byte qualifies except '+', which opens a base-64
 * section. */

#define DECODE_DIRECT(c) \
    ((c) <= 127 && (c) != '+')
4505
4506
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, whitespace, or special (i.e. none of
 * the above).  See RFC 2152.  This table, indexed by ASCII code,
 * identifies these sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is never written to, so it is declared const.
 */

static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4539
4540
/* ENCODE_DIRECT: true when the encoder may emit the character as
 * itself.  RFC 2152 leaves it to the application whether Set O and
 * whitespace are written directly; this implementation writes every
 * non-NUL ASCII character directly unless utf7_category marks it as
 * special (category 3).  The range checks run first so the table is
 * only indexed with values 1..127. */

#define ENCODE_DIRECT(c) \
    ((c) > 0 && (c) < 128 && utf7_category[(c)] != 3)
4548
4549
PyObject *
4550
PyUnicode_DecodeUTF7(const char *s,
4551
                     Py_ssize_t size,
4552
                     const char *errors)
4553
0
{
4554
0
    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4555
0
}
4556
4557
/* The decoder.  The only state we preserve is our read position,
4558
 * i.e. how many characters we have consumed.  So if we end in the
4559
 * middle of a shift sequence we have to back off the read position
4560
 * and the output to the beginning of the sequence, otherwise we lose
4561
 * all the shift state (seen bits, number of bits seen, high
4562
 * surrogate). */
4563
4564
PyObject *
4565
PyUnicode_DecodeUTF7Stateful(const char *s,
4566
                             Py_ssize_t size,
4567
                             const char *errors,
4568
                             Py_ssize_t *consumed)
4569
64
{
4570
64
    const char *starts = s;
4571
64
    Py_ssize_t startinpos;
4572
64
    Py_ssize_t endinpos;
4573
64
    const char *e;
4574
64
    _PyUnicodeWriter writer;
4575
64
    const char *errmsg = "";
4576
64
    int inShift = 0;
4577
64
    Py_ssize_t shiftOutStart;
4578
64
    unsigned int base64bits = 0;
4579
64
    unsigned long base64buffer = 0;
4580
64
    Py_UCS4 surrogate = 0;
4581
64
    PyObject *errorHandler = NULL;
4582
64
    PyObject *exc = NULL;
4583
4584
64
    if (size == 0) {
4585
0
        if (consumed)
4586
0
            *consumed = 0;
4587
0
        _Py_RETURN_UNICODE_EMPTY();
4588
0
    }
4589
4590
    /* Start off assuming it's all ASCII. Widen later as necessary. */
4591
64
    _PyUnicodeWriter_Init(&writer);
4592
64
    writer.min_length = size;
4593
4594
64
    shiftOutStart = 0;
4595
64
    e = s + size;
4596
4597
13.6k
    while (s < e) {
4598
13.6k
        Py_UCS4 ch;
4599
13.6k
      restart:
4600
13.6k
        ch = (unsigned char) *s;
4601
4602
13.6k
        if (inShift) { /* in a base-64 section */
4603
1.65k
            if (IS_BASE64(ch)) { /* consume a base-64 character */
4604
1.53k
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4605
1.53k
                base64bits += 6;
4606
1.53k
                s++;
4607
1.53k
                if (base64bits >= 16) {
4608
                    /* we have enough bits for a UTF-16 value */
4609
555
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
4610
555
                    base64bits -= 16;
4611
555
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4612
555
                    assert(outCh <= 0xffff);
4613
555
                    if (surrogate) {
4614
                        /* expecting a second surrogate */
4615
50
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4616
23
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
4617
23
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
4618
0
                                goto onError;
4619
23
                            surrogate = 0;
4620
23
                            continue;
4621
23
                        }
4622
27
                        else {
4623
27
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4624
0
                                goto onError;
4625
27
                            surrogate = 0;
4626
27
                        }
4627
50
                    }
4628
532
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
4629
                        /* first surrogate */
4630
85
                        surrogate = outCh;
4631
85
                    }
4632
447
                    else {
4633
447
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
4634
0
                            goto onError;
4635
447
                    }
4636
532
                }
4637
1.53k
            }
4638
126
            else { /* now leaving a base-64 section */
4639
126
                inShift = 0;
4640
126
                if (base64bits > 0) { /* left-over bits */
4641
82
                    if (base64bits >= 6) {
4642
                        /* We've seen at least one base-64 character */
4643
17
                        s++;
4644
17
                        errmsg = "partial character in shift sequence";
4645
17
                        goto utf7Error;
4646
17
                    }
4647
65
                    else {
4648
                        /* Some bits remain; they should be zero */
4649
65
                        if (base64buffer != 0) {
4650
8
                            s++;
4651
8
                            errmsg = "non-zero padding bits in shift sequence";
4652
8
                            goto utf7Error;
4653
8
                        }
4654
65
                    }
4655
82
                }
4656
101
                if (surrogate && DECODE_DIRECT(ch)) {
4657
32
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4658
0
                        goto onError;
4659
32
                }
4660
101
                surrogate = 0;
4661
101
                if (ch == '-') {
4662
                    /* '-' is absorbed; other terminating
4663
                       characters are preserved */
4664
9
                    s++;
4665
9
                }
4666
101
            }
4667
1.65k
        }
4668
11.9k
        else if ( ch == '+' ) {
4669
485
            startinpos = s-starts;
4670
485
            s++; /* consume '+' */
4671
485
            if (s < e && *s == '-') { /* '+-' encodes '+' */
4672
354
                s++;
4673
354
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
4674
0
                    goto onError;
4675
354
            }
4676
131
            else if (s < e && !IS_BASE64(*s)) {
4677
5
                s++;
4678
5
                errmsg = "ill-formed sequence";
4679
5
                goto utf7Error;
4680
5
            }
4681
126
            else { /* begin base64-encoded section */
4682
126
                inShift = 1;
4683
126
                surrogate = 0;
4684
126
                shiftOutStart = writer.pos;
4685
126
                base64bits = 0;
4686
126
                base64buffer = 0;
4687
126
            }
4688
485
        }
4689
11.4k
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
4690
11.4k
            s++;
4691
11.4k
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
4692
0
                goto onError;
4693
11.4k
        }
4694
14
        else {
4695
14
            startinpos = s-starts;
4696
14
            s++;
4697
14
            errmsg = "unexpected special character";
4698
14
            goto utf7Error;
4699
14
        }
4700
13.5k
        continue;
4701
13.5k
utf7Error:
4702
44
        endinpos = s-starts;
4703
44
        if (unicode_decode_call_errorhandler_writer(
4704
44
                errors, &errorHandler,
4705
44
                "utf7", errmsg,
4706
44
                &starts, &e, &startinpos, &endinpos, &exc, &s,
4707
44
                &writer))
4708
44
            goto onError;
4709
44
    }
4710
4711
    /* end of string */
4712
4713
20
    if (inShift && !consumed) { /* in shift sequence, no more to follow */
4714
        /* if we're in an inconsistent state, that's an error */
4715
0
        inShift = 0;
4716
0
        if (surrogate ||
4717
0
                (base64bits >= 6) ||
4718
0
                (base64bits > 0 && base64buffer != 0)) {
4719
0
            endinpos = size;
4720
0
            if (unicode_decode_call_errorhandler_writer(
4721
0
                    errors, &errorHandler,
4722
0
                    "utf7", "unterminated shift sequence",
4723
0
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
4724
0
                    &writer))
4725
0
                goto onError;
4726
0
            if (s < e)
4727
0
                goto restart;
4728
0
        }
4729
0
    }
4730
4731
    /* return state */
4732
20
    if (consumed) {
4733
0
        if (inShift) {
4734
0
            *consumed = startinpos;
4735
0
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
4736
0
                PyObject *result = PyUnicode_FromKindAndData(
4737
0
                        writer.kind, writer.data, shiftOutStart);
4738
0
                Py_XDECREF(errorHandler);
4739
0
                Py_XDECREF(exc);
4740
0
                _PyUnicodeWriter_Dealloc(&writer);
4741
0
                return result;
4742
0
            }
4743
0
            writer.pos = shiftOutStart; /* back off output */
4744
0
        }
4745
0
        else {
4746
0
            *consumed = s-starts;
4747
0
        }
4748
0
    }
4749
4750
20
    Py_XDECREF(errorHandler);
4751
20
    Py_XDECREF(exc);
4752
20
    return _PyUnicodeWriter_Finish(&writer);
4753
4754
44
  onError:
4755
44
    Py_XDECREF(errorHandler);
4756
44
    Py_XDECREF(exc);
4757
44
    _PyUnicodeWriter_Dealloc(&writer);
4758
44
    return NULL;
4759
20
}
4760
4761
4762
/* Encode the string object `str` as UTF-7 and return the result of
   PyBytesWriter_FinishWithPointer().  An empty input yields the
   empty-bytes constant.  `errors` is accepted but not used here. */
PyObject *
_PyUnicode_EncodeUTF7(PyObject *str,
                      const char *errors)
{
    Py_ssize_t len = PyUnicode_GET_LENGTH(str);
    if (len == 0) {
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
    }
    int kind = PyUnicode_KIND(str);
    const void *data = PyUnicode_DATA(str);

    /* The output buffer is sized at 8 bytes per character; refuse
       lengths for which that product would overflow. */
    if (len > PY_SSIZE_T_MAX / 8) {
        return PyErr_NoMemory();
    }
    PyBytesWriter *writer = PyBytesWriter_Create(len * 8);
    if (writer == NULL) {
        return NULL;
    }

    int in_shift = 0;           /* inside a base-64 section? */
    unsigned int nbits = 0;     /* bits in bitbuf not yet written */
    unsigned long bitbuf = 0;
    char *out = PyBytesWriter_GetData(writer);

    for (Py_ssize_t i = 0; i < len; ++i) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);

        if (in_shift) {
            if (ENCODE_DIRECT(ch)) {
                /* leave the base-64 section, flushing leftover bits */
                if (nbits) {
                    *out++ = TO_BASE64(bitbuf << (6 - nbits));
                    bitbuf = 0;
                    nbits = 0;
                }
                in_shift = 0;
                /* A character outside the base-64 alphabet ends the
                   section implicitly; an explicit '-' is needed only
                   before a base-64 character or a literal '-'. */
                if (IS_BASE64(ch) || ch == '-') {
                    *out++ = '-';
                }
                *out++ = (char) ch;
                continue;
            }
            /* otherwise keep feeding the open base-64 section */
        }
        else {
            if (ch == '+') {
                /* a literal '+' is written as "+-" */
                *out++ = '+';
                *out++ = '-';
                continue;
            }
            if (ENCODE_DIRECT(ch)) {
                *out++ = (char) ch;
                continue;
            }
            /* open a base-64 section */
            *out++ = '+';
            in_shift = 1;
        }

        /* base-64 encode ch as one or two UTF-16 code units */
        if (ch >= 0x10000) {
            assert(ch <= MAX_UNICODE);

            /* high surrogate first */
            nbits += 16;
            bitbuf = (bitbuf << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
            for (; nbits >= 6; nbits -= 6) {
                *out++ = TO_BASE64(bitbuf >> (nbits - 6));
            }
            /* the low surrogate goes through the common path below */
            ch = Py_UNICODE_LOW_SURROGATE(ch);
        }
        nbits += 16;
        bitbuf = (bitbuf << 16) | ch;
        for (; nbits >= 6; nbits -= 6) {
            *out++ = TO_BASE64(bitbuf >> (nbits - 6));
        }
    }

    /* flush the remaining bits and close an open section */
    if (nbits) {
        *out++ = TO_BASE64(bitbuf << (6 - nbits));
    }
    if (in_shift) {
        *out++ = '-';
    }
    return PyBytesWriter_FinishWithPointer(writer, out);
}
4851
4852
#undef IS_BASE64
4853
#undef FROM_BASE64
4854
#undef TO_BASE64
4855
#undef DECODE_DIRECT
4856
#undef ENCODE_DIRECT
4857
4858
/* --- UTF-8 Codec -------------------------------------------------------- */
4859
4860
PyObject *
4861
PyUnicode_DecodeUTF8(const char *s,
4862
                     Py_ssize_t size,
4863
                     const char *errors)
4864
1.96M
{
4865
1.96M
    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4866
1.96M
}
4867
4868
#include "stringlib/asciilib.h"
4869
#include "stringlib/codecs.h"
4870
#include "stringlib/undef.h"
4871
4872
#include "stringlib/ucs1lib.h"
4873
#include "stringlib/codecs.h"
4874
#include "stringlib/undef.h"
4875
4876
#include "stringlib/ucs2lib.h"
4877
#include "stringlib/codecs.h"
4878
#include "stringlib/undef.h"
4879
4880
#include "stringlib/ucs4lib.h"
4881
#include "stringlib/codecs.h"
4882
#include "stringlib/undef.h"
4883
4884
#if (SIZEOF_SIZE_T == 8)
4885
/* Mask to quickly check whether a C 'size_t' contains a
4886
   non-ASCII, UTF8-encoded char. */
4887
58.0M
# define ASCII_CHAR_MASK 0x8080808080808080ULL
4888
// used to count codepoints in UTF-8 string.
4889
17.4M
# define VECTOR_0101     0x0101010101010101ULL
4890
320k
# define VECTOR_00FF     0x00ff00ff00ff00ffULL
4891
#elif (SIZEOF_SIZE_T == 4)
4892
# define ASCII_CHAR_MASK 0x80808080U
4893
# define VECTOR_0101     0x01010101U
4894
# define VECTOR_00FF     0x00ff00ffU
4895
#else
4896
# error C 'size_t' size should be either 4 or 8!
4897
#endif
4898
4899
#if (defined(__clang__) || defined(__GNUC__))
4900
#define HAVE_CTZ 1
4901
// Number of trailing zero bits in v (GCC/Clang builtin).
// The builtin's result is undefined for v == 0, so callers must pass a
// non-zero value.
static inline unsigned int
ctz(size_t v)
{
    unsigned long long wide = (unsigned long long)v;
    return __builtin_ctzll(wide);
}
4906
#elif defined(_MSC_VER)
4907
#define HAVE_CTZ 1
4908
static inline unsigned int
4909
ctz(size_t v)
4910
{
4911
    unsigned long pos;
4912
#if SIZEOF_SIZE_T == 4
4913
    _BitScanForward(&pos, v);
4914
#else
4915
    _BitScanForward64(&pos, v);
4916
#endif /* SIZEOF_SIZE_T */
4917
    return pos;
4918
}
4919
#else
4920
#define HAVE_CTZ 0
4921
#endif
4922
4923
#if HAVE_CTZ && PY_LITTLE_ENDIAN
4924
// Load p[0]..p[size-1] into the low-address bytes of a size_t without
// performing an unaligned word access and without reading past
// p[size-1].  Bytes that are not loaded stay zero.  At most
// sizeof(size_t) bytes are read, even when `size` is larger.
static size_t
load_unaligned(const unsigned char *p, size_t size)
{
    union {
        size_t s;
        unsigned char b[sizeof(size_t)];
    } word;
    word.s = 0;

    size_t count = size;
    if (count > sizeof(size_t)) {
        count = sizeof(size_t);
    }
    // Filling the union bytewise places p[i] at byte offset i; the
    // caller relies on a little-endian layout when interpreting the
    // returned value.
    for (size_t i = 0; i < count; i++) {
        word.b[i] = p[i];
    }
    return word.s;
}
4969
#endif
4970
4971
/*
4972
 * Find the first non-ASCII character in a byte sequence.
4973
 *
4974
 * This function scans a range of bytes from `start` to `end` and returns the
4975
 * index of the first byte that is not an ASCII character (i.e., has the most
4976
 * significant bit set). If all characters in the range are ASCII, it returns
4977
 * `end - start`.
4978
 */
4979
static Py_ssize_t
4980
find_first_nonascii(const unsigned char *start, const unsigned char *end)
4981
2.45M
{
4982
    // The search is done in `size_t` chunks.
4983
    // The start and end might not be aligned at `size_t` boundaries,
4984
    // so they're handled specially.
4985
4986
2.45M
    const unsigned char *p = start;
4987
4988
2.45M
    if (end - start >= SIZEOF_SIZE_T) {
4989
        // Avoid unaligned read.
4990
770k
#if PY_LITTLE_ENDIAN && HAVE_CTZ
4991
770k
        size_t u;
4992
770k
        memcpy(&u, p, sizeof(size_t));
4993
770k
        u &= ASCII_CHAR_MASK;
4994
770k
        if (u) {
4995
95.4k
            return (ctz(u) - 7) / 8;
4996
95.4k
        }
4997
674k
        p = _Py_ALIGN_DOWN(p + SIZEOF_SIZE_T, SIZEOF_SIZE_T);
4998
#else /* PY_LITTLE_ENDIAN && HAVE_CTZ */
4999
        const unsigned char *p2 = _Py_ALIGN_UP(p, SIZEOF_SIZE_T);
5000
        while (p < p2) {
5001
            if (*p & 0x80) {
5002
                return p - start;
5003
            }
5004
            p++;
5005
        }
5006
#endif
5007
5008
674k
        const unsigned char *e = end - SIZEOF_SIZE_T;
5009
53.1M
        while (p <= e) {
5010
52.5M
            size_t u = (*(const size_t *)p) & ASCII_CHAR_MASK;
5011
52.5M
            if (u) {
5012
13.0k
#if PY_LITTLE_ENDIAN && HAVE_CTZ
5013
13.0k
                return p - start + (ctz(u) - 7) / 8;
5014
#else
5015
                // big endian and minor compilers are difficult to test.
5016
                // fallback to per byte check.
5017
                break;
5018
#endif
5019
13.0k
            }
5020
52.4M
            p += SIZEOF_SIZE_T;
5021
52.4M
        }
5022
674k
    }
5023
2.34M
#if PY_LITTLE_ENDIAN && HAVE_CTZ
5024
2.45M
    assert((end - p) < SIZEOF_SIZE_T);
5025
    // we can not use *(const size_t*)p to avoid buffer overrun.
5026
2.34M
    size_t u = load_unaligned(p, end - p) & ASCII_CHAR_MASK;
5027
2.34M
    if (u) {
5028
256k
        return p - start + (ctz(u) - 7) / 8;
5029
256k
    }
5030
2.08M
    return end - start;
5031
#else
5032
    while (p < end) {
5033
        if (*p & 0x80) {
5034
            break;
5035
        }
5036
        p++;
5037
    }
5038
    return p - start;
5039
#endif
5040
2.34M
}
5041
5042
// Return 1 when `ch` can start a UTF-8 sequence and 0 when it is a
// continuation byte (10xxxxxx).
static inline int
scalar_utf8_start_char(unsigned int ch)
{
    // A start byte has bit 7 clear (0xxxxxxx) or bit 6 set (11xxxxxx).
    // After the shifts, bit 0 of each operand carries that test.
    unsigned int bit7_clear = ~ch >> 7;
    unsigned int bit6_set = ch >> 6;
    return (bit7_clear | bit6_set) & 1;
}
5048
5049
static inline size_t
5050
vector_utf8_start_chars(size_t v)
5051
17.4M
{
5052
17.4M
    return ((~v >> 7) | (v >> 6)) & VECTOR_0101;
5053
17.4M
}
5054
5055
5056
// Count the number of UTF-8 code points in a given byte sequence.
5057
static Py_ssize_t
5058
utf8_count_codepoints(const unsigned char *s, const unsigned char *end)
5059
353k
{
5060
353k
    Py_ssize_t len = 0;
5061
5062
353k
    if (end - s >= SIZEOF_SIZE_T) {
5063
213k
        while (!_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) {
5064
109k
            len += scalar_utf8_start_char(*s++);
5065
109k
        }
5066
5067
264k
        while (s + SIZEOF_SIZE_T <= end) {
5068
160k
            const unsigned char *e = end;
5069
160k
            if (e - s > SIZEOF_SIZE_T * 255) {
5070
66.8k
                e = s + SIZEOF_SIZE_T * 255;
5071
66.8k
            }
5072
160k
            Py_ssize_t vstart = 0;
5073
17.6M
            while (s + SIZEOF_SIZE_T <= e) {
5074
17.4M
                size_t v = *(size_t*)s;
5075
17.4M
                size_t vs = vector_utf8_start_chars(v);
5076
17.4M
                vstart += vs;
5077
17.4M
                s += SIZEOF_SIZE_T;
5078
17.4M
            }
5079
160k
            vstart = (vstart & VECTOR_00FF) + ((vstart >> 8) & VECTOR_00FF);
5080
160k
            vstart += vstart >> 16;
5081
160k
#if SIZEOF_SIZE_T == 8
5082
160k
            vstart += vstart >> 32;
5083
160k
#endif
5084
160k
            len += vstart & 0x7ff;
5085
160k
        }
5086
104k
    }
5087
1.40M
    while (s < end) {
5088
1.04M
        len += scalar_utf8_start_char(*s++);
5089
1.04M
    }
5090
353k
    return len;
5091
353k
}
5092
5093
static Py_ssize_t
5094
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
5095
91.5k
{
5096
91.5k
#if SIZEOF_SIZE_T <= SIZEOF_VOID_P
5097
91.5k
    if (_Py_IS_ALIGNED(start, ALIGNOF_SIZE_T)
5098
13.3k
        && _Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T))
5099
9.48k
    {
5100
        /* Fast path, see in STRINGLIB(utf8_decode) for
5101
           an explanation. */
5102
9.48k
        const char *p = start;
5103
9.48k
        Py_UCS1 *q = dest;
5104
2.46M
        while (p + SIZEOF_SIZE_T <= end) {
5105
2.45M
            size_t value = *(const size_t *) p;
5106
2.45M
            if (value & ASCII_CHAR_MASK)
5107
211
                break;
5108
2.45M
            *((size_t *)q) = value;
5109
2.45M
            p += SIZEOF_SIZE_T;
5110
2.45M
            q += SIZEOF_SIZE_T;
5111
2.45M
        }
5112
38.1k
        while (p < end) {
5113
28.9k
            if ((unsigned char)*p & 0x80)
5114
276
                break;
5115
28.7k
            *q++ = *p++;
5116
28.7k
        }
5117
9.48k
        return p - start;
5118
9.48k
    }
5119
82.0k
#endif
5120
82.0k
    Py_ssize_t pos = find_first_nonascii((const unsigned char*)start,
5121
82.0k
                                         (const unsigned char*)end);
5122
82.0k
    memcpy(dest, start, pos);
5123
82.0k
    return pos;
5124
91.5k
}
5125
5126
static int
5127
unicode_decode_utf8_impl(_PyUnicodeWriter *writer,
5128
                         const char *starts, const char *s, const char *end,
5129
                         _Py_error_handler error_handler,
5130
                         const char *errors,
5131
                         Py_ssize_t *consumed)
5132
365k
{
5133
365k
    Py_ssize_t startinpos, endinpos;
5134
365k
    const char *errmsg = "";
5135
365k
    PyObject *error_handler_obj = NULL;
5136
365k
    PyObject *exc = NULL;
5137
5138
491k
    while (s < end) {
5139
485k
        Py_UCS4 ch;
5140
485k
        int kind = writer->kind;
5141
5142
485k
        if (kind == PyUnicode_1BYTE_KIND) {
5143
172k
            if (PyUnicode_IS_ASCII(writer->buffer))
5144
11.1k
                ch = asciilib_utf8_decode(&s, end, writer->data, &writer->pos);
5145
160k
            else
5146
160k
                ch = ucs1lib_utf8_decode(&s, end, writer->data, &writer->pos);
5147
313k
        } else if (kind == PyUnicode_2BYTE_KIND) {
5148
209k
            ch = ucs2lib_utf8_decode(&s, end, writer->data, &writer->pos);
5149
209k
        } else {
5150
104k
            assert(kind == PyUnicode_4BYTE_KIND);
5151
104k
            ch = ucs4lib_utf8_decode(&s, end, writer->data, &writer->pos);
5152
104k
        }
5153
5154
485k
        switch (ch) {
5155
358k
        case 0:
5156
358k
            if (s == end || consumed)
5157
356k
                goto End;
5158
1.83k
            errmsg = "unexpected end of data";
5159
1.83k
            startinpos = s - starts;
5160
1.83k
            endinpos = end - starts;
5161
1.83k
            break;
5162
40.4k
        case 1:
5163
40.4k
            errmsg = "invalid start byte";
5164
40.4k
            startinpos = s - starts;
5165
40.4k
            endinpos = startinpos + 1;
5166
40.4k
            break;
5167
57.5k
        case 2:
5168
57.5k
            if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5169
8
                && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5170
3
            {
5171
                /* Truncated surrogate code in range D800-DFFF */
5172
3
                goto End;
5173
3
            }
5174
57.5k
            _Py_FALLTHROUGH;
5175
63.4k
        case 3:
5176
65.1k
        case 4:
5177
65.1k
            errmsg = "invalid continuation byte";
5178
65.1k
            startinpos = s - starts;
5179
65.1k
            endinpos = startinpos + ch - 1;
5180
65.1k
            break;
5181
21.8k
        default:
5182
            // ch doesn't fit into kind, so change the buffer kind to write
5183
            // the character
5184
21.8k
            if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
5185
0
                goto onError;
5186
21.8k
            continue;
5187
485k
        }
5188
5189
107k
        if (error_handler == _Py_ERROR_UNKNOWN)
5190
2.50k
            error_handler = _Py_GetErrorHandler(errors);
5191
5192
107k
        switch (error_handler) {
5193
0
        case _Py_ERROR_IGNORE:
5194
0
            s += (endinpos - startinpos);
5195
0
            break;
5196
5197
78.1k
        case _Py_ERROR_REPLACE:
5198
78.1k
            if (_PyUnicodeWriter_WriteCharInline(writer, 0xfffd) < 0)
5199
0
                goto onError;
5200
78.1k
            s += (endinpos - startinpos);
5201
78.1k
            break;
5202
5203
0
        case _Py_ERROR_SURROGATEESCAPE:
5204
0
        {
5205
0
            Py_ssize_t i;
5206
5207
0
            if (_PyUnicodeWriter_PrepareKind(writer, PyUnicode_2BYTE_KIND) < 0)
5208
0
                goto onError;
5209
0
            for (i=startinpos; i<endinpos; i++) {
5210
0
                ch = (Py_UCS4)(unsigned char)(starts[i]);
5211
0
                PyUnicode_WRITE(writer->kind, writer->data, writer->pos,
5212
0
                                ch + 0xdc00);
5213
0
                writer->pos++;
5214
0
            }
5215
0
            s += (endinpos - startinpos);
5216
0
            break;
5217
0
        }
5218
5219
29.1k
        default:
5220
29.1k
            if (unicode_decode_call_errorhandler_writer(
5221
29.1k
                    errors, &error_handler_obj,
5222
29.1k
                    "utf-8", errmsg,
5223
29.1k
                    &starts, &end, &startinpos, &endinpos, &exc, &s,
5224
29.1k
                    writer)) {
5225
3.15k
                goto onError;
5226
3.15k
            }
5227
5228
26.0k
            if (_PyUnicodeWriter_Prepare(writer, end - s, 127) < 0) {
5229
0
                goto onError;
5230
0
            }
5231
107k
        }
5232
107k
    }
5233
5234
362k
End:
5235
362k
    if (consumed)
5236
53
        *consumed = s - starts;
5237
5238
362k
    Py_XDECREF(error_handler_obj);
5239
362k
    Py_XDECREF(exc);
5240
362k
    return 0;
5241
5242
3.15k
onError:
5243
3.15k
    Py_XDECREF(error_handler_obj);
5244
3.15k
    Py_XDECREF(exc);
5245
3.15k
    return -1;
5246
365k
}
5247
5248
5249
static PyObject *
5250
unicode_decode_utf8(const char *s, Py_ssize_t size,
5251
                    _Py_error_handler error_handler, const char *errors,
5252
                    Py_ssize_t *consumed)
5253
3.28M
{
5254
3.28M
    if (size == 0) {
5255
16.8k
        if (consumed) {
5256
0
            *consumed = 0;
5257
0
        }
5258
16.8k
        _Py_RETURN_UNICODE_EMPTY();
5259
16.8k
    }
5260
5261
    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
5262
3.26M
    if (size == 1 && (unsigned char)s[0] < 128) {
5263
896k
        if (consumed) {
5264
0
            *consumed = 1;
5265
0
        }
5266
896k
        return get_latin1_char((unsigned char)s[0]);
5267
896k
    }
5268
5269
    // I don't know this check is necessary or not. But there is a test
5270
    // case that requires size=PY_SSIZE_T_MAX cause MemoryError.
5271
2.36M
    if (PY_SSIZE_T_MAX - sizeof(PyCompactUnicodeObject) < (size_t)size) {
5272
0
        PyErr_NoMemory();
5273
0
        return NULL;
5274
0
    }
5275
5276
2.36M
    const char *starts = s;
5277
2.36M
    const char *end = s + size;
5278
5279
2.36M
    Py_ssize_t pos = find_first_nonascii((const unsigned char*)starts, (const unsigned char*)end);
5280
2.36M
    if (pos == size) {  // fast path: ASCII string.
5281
2.00M
        PyObject *u = PyUnicode_New(size, 127);
5282
2.00M
        if (u == NULL) {
5283
0
            return NULL;
5284
0
        }
5285
2.00M
        memcpy(PyUnicode_1BYTE_DATA(u), s, size);
5286
2.00M
        if (consumed) {
5287
0
            *consumed = size;
5288
0
        }
5289
2.00M
        return u;
5290
2.00M
    }
5291
5292
363k
    int maxchr = 127;
5293
363k
    Py_ssize_t maxsize = size;
5294
5295
363k
    unsigned char ch = (unsigned char)(s[pos]);
5296
    // error handler other than strict may remove/replace the invalid byte.
5297
    // consumed != NULL allows 1~3 bytes remainings.
5298
    // 0x80 <= ch < 0xc2 is invalid start byte that cause UnicodeDecodeError.
5299
    // otherwise: check the input and decide the maxchr and maxsize to reduce
5300
    // reallocation and copy.
5301
363k
    if (error_handler == _Py_ERROR_STRICT && !consumed && ch >= 0xc2) {
5302
        // we only calculate the number of codepoints and don't determine the exact maxchr.
5303
        // This is because writing fast and portable SIMD code to find maxchr is difficult.
5304
        // If reallocation occurs for a larger maxchar, knowing the exact number of codepoints
5305
        // means that it is no longer necessary to allocate several times the required amount
5306
        // of memory.
5307
353k
        maxsize = utf8_count_codepoints((const unsigned char *)s, (const unsigned char *)end);
5308
353k
        if (ch < 0xc4) { // latin1
5309
159k
            maxchr = 0xff;
5310
159k
        }
5311
194k
        else if (ch < 0xf0) { // ucs2
5312
161k
            maxchr = 0xffff;
5313
161k
        }
5314
33.2k
        else { // ucs4
5315
33.2k
            maxchr = 0x10ffff;
5316
33.2k
        }
5317
353k
    }
5318
363k
    PyObject *u = PyUnicode_New(maxsize, maxchr);
5319
363k
    if (!u) {
5320
0
        return NULL;
5321
0
    }
5322
5323
    // Use _PyUnicodeWriter after fast path is failed.
5324
363k
    _PyUnicodeWriter writer;
5325
363k
    _PyUnicodeWriter_InitWithBuffer(&writer, u);
5326
363k
    if (maxchr <= 255) {
5327
168k
        memcpy(PyUnicode_1BYTE_DATA(u), s, pos);
5328
168k
        s += pos;
5329
168k
        writer.pos = pos;
5330
168k
    }
5331
5332
363k
    if (unicode_decode_utf8_impl(&writer, starts, s, end,
5333
363k
                                 error_handler, errors,
5334
363k
                                 consumed) < 0) {
5335
3.15k
        _PyUnicodeWriter_Dealloc(&writer);
5336
3.15k
        return NULL;
5337
3.15k
    }
5338
360k
    return _PyUnicodeWriter_Finish(&writer);
5339
363k
}
5340
5341
5342
// Used by PyUnicodeWriter_WriteUTF8() implementation
5343
int
5344
_PyUnicode_DecodeUTF8Writer(_PyUnicodeWriter *writer,
5345
                            const char *s, Py_ssize_t size,
5346
                            _Py_error_handler error_handler, const char *errors,
5347
                            Py_ssize_t *consumed)
5348
94.5k
{
5349
94.5k
    if (size == 0) {
5350
10.4k
        if (consumed) {
5351
0
            *consumed = 0;
5352
0
        }
5353
10.4k
        return 0;
5354
10.4k
    }
5355
5356
    // fast path: try ASCII string.
5357
84.1k
    if (_PyUnicodeWriter_Prepare(writer, size, 127) < 0) {
5358
0
        return -1;
5359
0
    }
5360
5361
84.1k
    const char *starts = s;
5362
84.1k
    const char *end = s + size;
5363
84.1k
    Py_ssize_t decoded = 0;
5364
84.1k
    Py_UCS1 *dest = (Py_UCS1*)writer->data + writer->pos * writer->kind;
5365
84.1k
    if (writer->kind == PyUnicode_1BYTE_KIND) {
5366
83.2k
        decoded = ascii_decode(s, end, dest);
5367
83.2k
        writer->pos += decoded;
5368
5369
83.2k
        if (decoded == size) {
5370
81.6k
            if (consumed) {
5371
1.06k
                *consumed = size;
5372
1.06k
            }
5373
81.6k
            return 0;
5374
81.6k
        }
5375
1.60k
        s += decoded;
5376
1.60k
    }
5377
5378
2.46k
    return unicode_decode_utf8_impl(writer, starts, s, end,
5379
2.46k
                                    error_handler, errors, consumed);
5380
84.1k
}
5381
5382
5383
PyObject *
5384
PyUnicode_DecodeUTF8Stateful(const char *s,
5385
                             Py_ssize_t size,
5386
                             const char *errors,
5387
                             Py_ssize_t *consumed)
5388
3.26M
{
5389
3.26M
    return unicode_decode_utf8(s, size,
5390
3.26M
                               errors ? _Py_ERROR_UNKNOWN : _Py_ERROR_STRICT,
5391
3.26M
                               errors, consumed);
5392
3.26M
}
5393
5394
5395
/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5396
   non-zero, use strict error handler otherwise.
5397
5398
   On success, write a pointer to a newly allocated wide character string into
5399
   *wstr (use PyMem_RawFree() to free the memory) and write the output length
5400
   (in number of wchar_t units) into *wlen (if wlen is set).
5401
5402
   On memory allocation failure, return -1.
5403
5404
   On decoding error (if surrogateescape is zero), return -2. If wlen is
5405
   non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5406
   is not NULL, write the decoding error message into *reason. */
5407
int
5408
_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
5409
                 const char **reason, _Py_error_handler errors)
5410
133
{
5411
133
    const char *orig_s = s;
5412
133
    const char *e;
5413
133
    wchar_t *unicode;
5414
133
    Py_ssize_t outpos;
5415
5416
133
    int surrogateescape = 0;
5417
133
    int surrogatepass = 0;
5418
133
    switch (errors)
5419
133
    {
5420
0
    case _Py_ERROR_STRICT:
5421
0
        break;
5422
133
    case _Py_ERROR_SURROGATEESCAPE:
5423
133
        surrogateescape = 1;
5424
133
        break;
5425
0
    case _Py_ERROR_SURROGATEPASS:
5426
0
        surrogatepass = 1;
5427
0
        break;
5428
0
    default:
5429
0
        return -3;
5430
133
    }
5431
5432
    /* Note: size will always be longer than the resulting Unicode
5433
       character count */
5434
133
    if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1 < size) {
5435
0
        return -1;
5436
0
    }
5437
5438
133
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
5439
133
    if (!unicode) {
5440
0
        return -1;
5441
0
    }
5442
5443
    /* Unpack UTF-8 encoded data */
5444
133
    e = s + size;
5445
133
    outpos = 0;
5446
133
    while (s < e) {
5447
133
        Py_UCS4 ch;
5448
133
#if SIZEOF_WCHAR_T == 4
5449
133
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
5450
#else
5451
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
5452
#endif
5453
133
        if (ch > 0xFF) {
5454
0
#if SIZEOF_WCHAR_T == 4
5455
0
            Py_UNREACHABLE();
5456
#else
5457
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
5458
            /* write a surrogate pair */
5459
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5460
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5461
#endif
5462
0
        }
5463
133
        else {
5464
133
            if (!ch && s == e) {
5465
133
                break;
5466
133
            }
5467
5468
0
            if (surrogateescape) {
5469
0
                unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5470
0
            }
5471
0
            else {
5472
                /* Is it a valid three-byte code? */
5473
0
                if (surrogatepass
5474
0
                    && (e - s) >= 3
5475
0
                    && (s[0] & 0xf0) == 0xe0
5476
0
                    && (s[1] & 0xc0) == 0x80
5477
0
                    && (s[2] & 0xc0) == 0x80)
5478
0
                {
5479
0
                    ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5480
0
                    s += 3;
5481
0
                    unicode[outpos++] = ch;
5482
0
                }
5483
0
                else {
5484
0
                    PyMem_RawFree(unicode );
5485
0
                    if (reason != NULL) {
5486
0
                        switch (ch) {
5487
0
                        case 0:
5488
0
                            *reason = "unexpected end of data";
5489
0
                            break;
5490
0
                        case 1:
5491
0
                            *reason = "invalid start byte";
5492
0
                            break;
5493
                        /* 2, 3, 4 */
5494
0
                        default:
5495
0
                            *reason = "invalid continuation byte";
5496
0
                            break;
5497
0
                        }
5498
0
                    }
5499
0
                    if (wlen != NULL) {
5500
0
                        *wlen = s - orig_s;
5501
0
                    }
5502
0
                    return -2;
5503
0
                }
5504
0
            }
5505
0
        }
5506
133
    }
5507
133
    unicode[outpos] = L'\0';
5508
133
    if (wlen) {
5509
133
        *wlen = outpos;
5510
133
    }
5511
133
    *wstr = unicode;
5512
133
    return 0;
5513
133
}
5514
5515
5516
wchar_t*
5517
_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5518
                               size_t *wlen)
5519
0
{
5520
0
    wchar_t *wstr;
5521
0
    int res = _Py_DecodeUTF8Ex(arg, arglen,
5522
0
                               &wstr, wlen,
5523
0
                               NULL, _Py_ERROR_SURROGATEESCAPE);
5524
0
    if (res != 0) {
5525
        /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5526
0
        assert(res != -3);
5527
0
        if (wlen) {
5528
0
            *wlen = (size_t)res;
5529
0
        }
5530
0
        return NULL;
5531
0
    }
5532
0
    return wstr;
5533
0
}
5534
5535
5536
/* UTF-8 encoder.
5537
5538
   On success, return 0 and write the newly allocated character string (use
5539
   PyMem_Free() to free the memory) into *str.
5540
5541
   On encoding failure, return -2 and write the position of the invalid
5542
   surrogate character into *error_pos (if error_pos is set) and the decoding
5543
   error message into *reason (if reason is set).
5544
5545
   On memory allocation failure, return -1. */
5546
int
5547
_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
5548
                 const char **reason, int raw_malloc, _Py_error_handler errors)
5549
380
{
5550
380
    const Py_ssize_t max_char_size = 4;
5551
380
    Py_ssize_t len = wcslen(text);
5552
5553
380
    assert(len >= 0);
5554
5555
380
    int surrogateescape = 0;
5556
380
    int surrogatepass = 0;
5557
380
    switch (errors)
5558
380
    {
5559
76
    case _Py_ERROR_STRICT:
5560
76
        break;
5561
304
    case _Py_ERROR_SURROGATEESCAPE:
5562
304
        surrogateescape = 1;
5563
304
        break;
5564
0
    case _Py_ERROR_SURROGATEPASS:
5565
0
        surrogatepass = 1;
5566
0
        break;
5567
0
    default:
5568
0
        return -3;
5569
380
    }
5570
5571
380
    if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5572
0
        return -1;
5573
0
    }
5574
380
    char *bytes;
5575
380
    if (raw_malloc) {
5576
380
        bytes = PyMem_RawMalloc((len + 1) * max_char_size);
5577
380
    }
5578
0
    else {
5579
0
        bytes = PyMem_Malloc((len + 1) * max_char_size);
5580
0
    }
5581
380
    if (bytes == NULL) {
5582
0
        return -1;
5583
0
    }
5584
5585
380
    char *p = bytes;
5586
380
    Py_ssize_t i;
5587
17.4k
    for (i = 0; i < len; ) {
5588
17.0k
        Py_ssize_t ch_pos = i;
5589
17.0k
        Py_UCS4 ch = text[i];
5590
17.0k
        i++;
5591
17.0k
        if (sizeof(wchar_t) == 2
5592
0
            && Py_UNICODE_IS_HIGH_SURROGATE(ch)
5593
0
            && i < len
5594
0
            && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5595
0
        {
5596
0
            ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5597
0
            i++;
5598
0
        }
5599
5600
17.0k
        if (ch < 0x80) {
5601
            /* Encode ASCII */
5602
17.0k
            *p++ = (char) ch;
5603
5604
17.0k
        }
5605
0
        else if (ch < 0x0800) {
5606
            /* Encode Latin-1 */
5607
0
            *p++ = (char)(0xc0 | (ch >> 6));
5608
0
            *p++ = (char)(0x80 | (ch & 0x3f));
5609
0
        }
5610
0
        else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
5611
            /* surrogateescape error handler */
5612
0
            if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
5613
0
                if (error_pos != NULL) {
5614
0
                    *error_pos = (size_t)ch_pos;
5615
0
                }
5616
0
                if (reason != NULL) {
5617
0
                    *reason = "encoding error";
5618
0
                }
5619
0
                if (raw_malloc) {
5620
0
                    PyMem_RawFree(bytes);
5621
0
                }
5622
0
                else {
5623
0
                    PyMem_Free(bytes);
5624
0
                }
5625
0
                return -2;
5626
0
            }
5627
0
            *p++ = (char)(ch & 0xff);
5628
0
        }
5629
0
        else if (ch < 0x10000) {
5630
0
            *p++ = (char)(0xe0 | (ch >> 12));
5631
0
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5632
0
            *p++ = (char)(0x80 | (ch & 0x3f));
5633
0
        }
5634
0
        else {  /* ch >= 0x10000 */
5635
0
            assert(ch <= MAX_UNICODE);
5636
            /* Encode UCS4 Unicode ordinals */
5637
0
            *p++ = (char)(0xf0 | (ch >> 18));
5638
0
            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5639
0
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5640
0
            *p++ = (char)(0x80 | (ch & 0x3f));
5641
0
        }
5642
17.0k
    }
5643
380
    *p++ = '\0';
5644
5645
380
    size_t final_size = (p - bytes);
5646
380
    char *bytes2;
5647
380
    if (raw_malloc) {
5648
380
        bytes2 = PyMem_RawRealloc(bytes, final_size);
5649
380
    }
5650
0
    else {
5651
0
        bytes2 = PyMem_Realloc(bytes, final_size);
5652
0
    }
5653
380
    if (bytes2 == NULL) {
5654
0
        if (error_pos != NULL) {
5655
0
            *error_pos = (size_t)-1;
5656
0
        }
5657
0
        if (raw_malloc) {
5658
0
            PyMem_RawFree(bytes);
5659
0
        }
5660
0
        else {
5661
0
            PyMem_Free(bytes);
5662
0
        }
5663
0
        return -1;
5664
0
    }
5665
380
    *str = bytes2;
5666
380
    return 0;
5667
380
}
5668
5669
5670
/* Primary internal function which creates utf8 encoded bytes objects.
5671
5672
   Allocation strategy:  if the string is short, convert into a stack buffer
5673
   and allocate exactly as much space needed at the end.  Else allocate the
5674
   maximum possible needed (4 result bytes per Unicode character), and return
5675
   the excess memory at the end.
5676
*/
5677
static PyObject *
5678
unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5679
                    const char *errors)
5680
6.51k
{
5681
6.51k
    if (!PyUnicode_Check(unicode)) {
5682
0
        PyErr_BadArgument();
5683
0
        return NULL;
5684
0
    }
5685
5686
6.51k
    if (PyUnicode_UTF8(unicode))
5687
4.96k
        return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5688
4.96k
                                         PyUnicode_UTF8_LENGTH(unicode));
5689
5690
1.55k
    int kind = PyUnicode_KIND(unicode);
5691
1.55k
    const void *data = PyUnicode_DATA(unicode);
5692
1.55k
    Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5693
5694
1.55k
    PyBytesWriter *writer;
5695
1.55k
    char *end;
5696
5697
1.55k
    switch (kind) {
5698
0
    default:
5699
0
        Py_UNREACHABLE();
5700
405
    case PyUnicode_1BYTE_KIND:
5701
        /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5702
405
        assert(!PyUnicode_IS_ASCII(unicode));
5703
405
        writer = ucs1lib_utf8_encoder(unicode, data, size,
5704
405
                                      error_handler, errors, &end);
5705
405
        break;
5706
1.04k
    case PyUnicode_2BYTE_KIND:
5707
1.04k
        writer = ucs2lib_utf8_encoder(unicode, data, size,
5708
1.04k
                                      error_handler, errors, &end);
5709
1.04k
        break;
5710
111
    case PyUnicode_4BYTE_KIND:
5711
111
        writer = ucs4lib_utf8_encoder(unicode, data, size,
5712
111
                                      error_handler, errors, &end);
5713
111
        break;
5714
1.55k
    }
5715
5716
1.55k
    if (writer == NULL) {
5717
7
        PyBytesWriter_Discard(writer);
5718
7
        return NULL;
5719
7
    }
5720
1.55k
    return PyBytesWriter_FinishWithPointer(writer, end);
5721
1.55k
}
5722
5723
static int
5724
unicode_fill_utf8(PyObject *unicode)
5725
2.75k
{
5726
2.75k
    _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(unicode);
5727
    /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5728
2.75k
    assert(!PyUnicode_IS_ASCII(unicode));
5729
5730
2.75k
    int kind = PyUnicode_KIND(unicode);
5731
2.75k
    const void *data = PyUnicode_DATA(unicode);
5732
2.75k
    Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5733
5734
2.75k
    PyBytesWriter *writer;
5735
2.75k
    char *end;
5736
5737
2.75k
    switch (kind) {
5738
0
    default:
5739
0
        Py_UNREACHABLE();
5740
276
    case PyUnicode_1BYTE_KIND:
5741
276
        writer = ucs1lib_utf8_encoder(unicode, data, size,
5742
276
                                      _Py_ERROR_STRICT, NULL, &end);
5743
276
        break;
5744
1.70k
    case PyUnicode_2BYTE_KIND:
5745
1.70k
        writer = ucs2lib_utf8_encoder(unicode, data, size,
5746
1.70k
                                      _Py_ERROR_STRICT, NULL, &end);
5747
1.70k
        break;
5748
773
    case PyUnicode_4BYTE_KIND:
5749
773
        writer = ucs4lib_utf8_encoder(unicode, data, size,
5750
773
                                      _Py_ERROR_STRICT, NULL, &end);
5751
773
        break;
5752
2.75k
    }
5753
2.75k
    if (writer == NULL) {
5754
0
        return -1;
5755
0
    }
5756
5757
2.75k
    const char *start = PyBytesWriter_GetData(writer);
5758
2.75k
    Py_ssize_t len = end - start;
5759
5760
2.75k
    char *cache = PyMem_Malloc(len + 1);
5761
2.75k
    if (cache == NULL) {
5762
0
        PyBytesWriter_Discard(writer);
5763
0
        PyErr_NoMemory();
5764
0
        return -1;
5765
0
    }
5766
2.75k
    memcpy(cache, start, len);
5767
2.75k
    cache[len] = '\0';
5768
2.75k
    PyUnicode_SET_UTF8_LENGTH(unicode, len);
5769
2.75k
    PyUnicode_SET_UTF8(unicode, cache);
5770
2.75k
    PyBytesWriter_Discard(writer);
5771
2.75k
    return 0;
5772
2.75k
}
5773
5774
/* Encode `unicode` to UTF-8 using the error handler named by `errors`.
   The handler is passed on as _Py_ERROR_UNKNOWN together with the `errors`
   string; the result is whatever unicode_encode_utf8() returns. */
PyObject *
_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
{
    PyObject *encoded = unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
    return encoded;
}
5779
5780
5781
/* Encode `unicode` to UTF-8 with no explicit error handler name (a NULL
   `errors` is passed to _PyUnicode_AsUTF8String()). */
PyObject *
PyUnicode_AsUTF8String(PyObject *unicode)
{
    const char *errors = NULL;
    return _PyUnicode_AsUTF8String(unicode, errors);
}
5786
5787
/* --- UTF-32 Codec ------------------------------------------------------- */
5788
5789
PyObject *
5790
PyUnicode_DecodeUTF32(const char *s,
5791
                      Py_ssize_t size,
5792
                      const char *errors,
5793
                      int *byteorder)
5794
14
{
5795
14
    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5796
14
}
5797
5798
PyObject *
5799
PyUnicode_DecodeUTF32Stateful(const char *s,
5800
                              Py_ssize_t size,
5801
                              const char *errors,
5802
                              int *byteorder,
5803
                              Py_ssize_t *consumed)
5804
407
{
5805
407
    const char *starts = s;
5806
407
    Py_ssize_t startinpos;
5807
407
    Py_ssize_t endinpos;
5808
407
    _PyUnicodeWriter writer;
5809
407
    const unsigned char *q, *e;
5810
407
    int le, bo = 0;       /* assume native ordering by default */
5811
407
    const char *encoding;
5812
407
    const char *errmsg = "";
5813
407
    PyObject *errorHandler = NULL;
5814
407
    PyObject *exc = NULL;
5815
5816
407
    q = (const unsigned char *)s;
5817
407
    e = q + size;
5818
5819
407
    if (byteorder)
5820
393
        bo = *byteorder;
5821
5822
    /* Check for BOM marks (U+FEFF) in the input and adjust current
5823
       byte order setting accordingly. In native mode, the leading BOM
5824
       mark is skipped, in all other modes, it is copied to the output
5825
       stream as-is (giving a ZWNBSP character). */
5826
407
    if (bo == 0 && size >= 4) {
5827
17
        Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5828
17
        if (bom == 0x0000FEFF) {
5829
3
            bo = -1;
5830
3
            q += 4;
5831
3
        }
5832
14
        else if (bom == 0xFFFE0000) {
5833
9
            bo = 1;
5834
9
            q += 4;
5835
9
        }
5836
17
        if (byteorder)
5837
3
            *byteorder = bo;
5838
17
    }
5839
5840
407
    if (q == e) {
5841
2
        if (consumed)
5842
0
            *consumed = size;
5843
2
        _Py_RETURN_UNICODE_EMPTY();
5844
2
    }
5845
5846
#ifdef WORDS_BIGENDIAN
5847
    le = bo < 0;
5848
#else
5849
405
    le = bo <= 0;
5850
405
#endif
5851
405
    encoding = le ? "utf-32-le" : "utf-32-be";
5852
5853
405
    _PyUnicodeWriter_Init(&writer);
5854
405
    writer.min_length = (e - q + 3) / 4;
5855
405
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
5856
0
        goto onError;
5857
5858
2.88k
    while (1) {
5859
2.88k
        Py_UCS4 ch = 0;
5860
2.88k
        Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
5861
5862
2.88k
        if (e - q >= 4) {
5863
2.76k
            int kind = writer.kind;
5864
2.76k
            void *data = writer.data;
5865
2.76k
            const unsigned char *last = e - 4;
5866
2.76k
            Py_ssize_t pos = writer.pos;
5867
2.76k
            if (le) {
5868
249k
                do {
5869
249k
                    ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5870
249k
                    if (ch > maxch)
5871
272
                        break;
5872
249k
                    if (kind != PyUnicode_1BYTE_KIND &&
5873
223k
                        Py_UNICODE_IS_SURROGATE(ch))
5874
413
                        break;
5875
248k
                    PyUnicode_WRITE(kind, data, pos++, ch);
5876
248k
                    q += 4;
5877
248k
                } while (q <= last);
5878
743
            }
5879
2.01k
            else {
5880
46.2k
                do {
5881
46.2k
                    ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5882
46.2k
                    if (ch > maxch)
5883
316
                        break;
5884
45.9k
                    if (kind != PyUnicode_1BYTE_KIND &&
5885
41.8k
                        Py_UNICODE_IS_SURROGATE(ch))
5886
1.65k
                        break;
5887
44.2k
                    PyUnicode_WRITE(kind, data, pos++, ch);
5888
44.2k
                    q += 4;
5889
44.2k
                } while (q <= last);
5890
2.01k
            }
5891
2.76k
            writer.pos = pos;
5892
2.76k
        }
5893
5894
2.88k
        if (Py_UNICODE_IS_SURROGATE(ch)) {
5895
2.11k
            errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
5896
2.11k
            startinpos = ((const char *)q) - starts;
5897
2.11k
            endinpos = startinpos + 4;
5898
2.11k
        }
5899
762
        else if (ch <= maxch) {
5900
229
            if (q == e || consumed)
5901
206
                break;
5902
            /* remaining bytes at the end? (size should be divisible by 4) */
5903
23
            errmsg = "truncated data";
5904
23
            startinpos = ((const char *)q) - starts;
5905
23
            endinpos = ((const char *)e) - starts;
5906
23
        }
5907
533
        else {
5908
533
            if (ch < 0x110000) {
5909
357
                if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5910
0
                    goto onError;
5911
357
                q += 4;
5912
357
                continue;
5913
357
            }
5914
176
            errmsg = "code point not in range(0x110000)";
5915
176
            startinpos = ((const char *)q) - starts;
5916
176
            endinpos = startinpos + 4;
5917
176
        }
5918
5919
        /* The remaining input chars are ignored if the callback
5920
           chooses to skip the input */
5921
2.31k
        if (unicode_decode_call_errorhandler_writer(
5922
2.31k
                errors, &errorHandler,
5923
2.31k
                encoding, errmsg,
5924
2.31k
                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
5925
2.31k
                &writer))
5926
199
            goto onError;
5927
2.31k
    }
5928
5929
206
    if (consumed)
5930
0
        *consumed = (const char *)q-starts;
5931
5932
206
    Py_XDECREF(errorHandler);
5933
206
    Py_XDECREF(exc);
5934
206
    return _PyUnicodeWriter_Finish(&writer);
5935
5936
199
  onError:
5937
199
    _PyUnicodeWriter_Dealloc(&writer);
5938
199
    Py_XDECREF(errorHandler);
5939
199
    Py_XDECREF(exc);
5940
199
    return NULL;
5941
405
}
5942
5943
/* Encode the str object `str` as UTF-32 and return a new bytes object.

   byteorder: -1 selects little endian, 1 big endian; 0 selects native order
   and prepends a BOM (U+FEFF).
   errors: name of the error handler invoked for lone surrogates.

   Strings of 1-byte kind cannot contain surrogates and are encoded directly
   into a bytes object.  Other kinds go through a PyBytesWriter; when the
   stringlib encoder stops at a surrogate, the error handler supplies a
   replacement that must be either bytes with a length that is a multiple of
   4 or an ASCII-only string.

   Returns NULL with an exception set on failure. */
PyObject *
_PyUnicode_EncodeUTF32(PyObject *str,
                       const char *errors,
                       int byteorder)
{
    if (!PyUnicode_Check(str)) {
        PyErr_BadArgument();
        return NULL;
    }
    const int kind = PyUnicode_KIND(str);
    const void *data = PyUnicode_DATA(str);
    const Py_ssize_t len = PyUnicode_GET_LENGTH(str);

    /* One extra unit is needed for the BOM when no byte order is forced. */
    const int with_bom = (byteorder == 0);
    if (len > PY_SSIZE_T_MAX / 4 - with_bom) {
        return PyErr_NoMemory();
    }
    const Py_ssize_t nsize = len + with_bom;

#if PY_LITTLE_ENDIAN
    int native_ordering = byteorder <= 0;
#else
    int native_ordering = byteorder >= 0;
#endif

    if (kind == PyUnicode_1BYTE_KIND) {
        // gh-139156: Don't use PyBytesWriter API here since it has an overhead
        // on short strings
        PyObject *result = PyBytes_FromStringAndSize(NULL, nsize * 4);
        if (result == NULL) {
            return NULL;
        }

        /* output buffer is 4-bytes aligned */
        assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(result), 4));
        uint32_t *dst = (uint32_t *)PyBytes_AS_STRING(result);
        if (with_bom) {
            *dst++ = 0xFEFF;
        }
        if (len > 0) {
            ucs1lib_utf32_encode((const Py_UCS1 *)data, len,
                                 &dst, native_ordering);
        }
        return result;
    }

    PyBytesWriter *writer = PyBytesWriter_Create(nsize * 4);
    if (writer == NULL) {
        return NULL;
    }

    /* output buffer is 4-bytes aligned */
    assert(_Py_IS_ALIGNED(PyBytesWriter_GetData(writer), 4));
    uint32_t *out = (uint32_t *)PyBytesWriter_GetData(writer);
    if (with_bom) {
        *out++ = 0xFEFF;
    }
    if (len == 0) {
        return PyBytesWriter_Finish(writer);
    }

    /* Codec name reported in error messages. */
    const char *encoding;
    switch (byteorder) {
    case -1:
        encoding = "utf-32-le";
        break;
    case 1:
        encoding = "utf-32-be";
        break;
    default:
        encoding = "utf-32";
        break;
    }

    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *rep = NULL;

    Py_ssize_t pos = 0;
    while (pos < len) {
        /* Encode as far as possible; the encoder stops at a surrogate. */
        if (kind == PyUnicode_2BYTE_KIND) {
            pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        else {
            assert(kind == PyUnicode_4BYTE_KIND);
            pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        if (pos == len) {
            break;
        }

        Py_ssize_t newpos;
        rep = unicode_encode_call_errorhandler(
                errors, &errorHandler,
                encoding, "surrogates not allowed",
                str, &exc, pos, pos + 1, &newpos);
        if (!rep) {
            goto error;
        }

        /* Validate the replacement and count the units it occupies. */
        Py_ssize_t repsize, moreunits;
        if (PyBytes_Check(rep)) {
            repsize = PyBytes_GET_SIZE(rep);
            if (repsize & 3) {
                raise_encode_exception(&exc, encoding,
                                       str, pos, pos + 1,
                                       "surrogates not allowed");
                goto error;
            }
            moreunits = repsize / 4;
        }
        else {
            assert(PyUnicode_Check(rep));
            repsize = PyUnicode_GET_LENGTH(rep);
            moreunits = repsize;
            if (!PyUnicode_IS_ASCII(rep)) {
                raise_encode_exception(&exc, encoding,
                                       str, pos, pos + 1,
                                       "surrogates not allowed");
                goto error;
            }
        }
        /* Units already reserved for the skipped input offset the growth. */
        moreunits += pos - newpos;
        pos = newpos;

        /* four bytes are reserved for each surrogate */
        if (moreunits > 0) {
            out = PyBytesWriter_GrowAndUpdatePointer(writer, 4 * moreunits, out);
            if (out == NULL) {
                goto error;
            }
        }

        if (PyBytes_Check(rep)) {
            memcpy(out, PyBytes_AS_STRING(rep), repsize);
            out += repsize / 4;
        }
        else {
            /* rep is unicode */
            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
            ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
                                 &out, native_ordering);
        }

        Py_CLEAR(rep);
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);

    /* Cut back to size actually needed. This is necessary for, for example,
       encoding of a string containing isolated surrogates and the 'ignore'
       handler is used. */
    return PyBytesWriter_FinishWithPointer(writer, out);

  error:
    Py_XDECREF(rep);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    PyBytesWriter_Discard(writer);
    return NULL;
}
6096
6097
/* Encode `unicode` as UTF-32 by calling _PyUnicode_EncodeUTF32() with a
   NULL `errors` argument and byteorder 0. */
PyObject *
PyUnicode_AsUTF32String(PyObject *unicode)
{
    const char *errors = NULL;
    const int byteorder = 0;
    return _PyUnicode_EncodeUTF32(unicode, errors, byteorder);
}
6102
6103
/* --- UTF-16 Codec ------------------------------------------------------- */
6104
6105
PyObject *
6106
PyUnicode_DecodeUTF16(const char *s,
6107
                      Py_ssize_t size,
6108
                      const char *errors,
6109
                      int *byteorder)
6110
77
{
6111
77
    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
6112
77
}
6113
6114
PyObject *
6115
PyUnicode_DecodeUTF16Stateful(const char *s,
6116
                              Py_ssize_t size,
6117
                              const char *errors,
6118
                              int *byteorder,
6119
                              Py_ssize_t *consumed)
6120
733
{
6121
733
    const char *starts = s;
6122
733
    Py_ssize_t startinpos;
6123
733
    Py_ssize_t endinpos;
6124
733
    _PyUnicodeWriter writer;
6125
733
    const unsigned char *q, *e;
6126
733
    int bo = 0;       /* assume native ordering by default */
6127
733
    int native_ordering;
6128
733
    const char *errmsg = "";
6129
733
    PyObject *errorHandler = NULL;
6130
733
    PyObject *exc = NULL;
6131
733
    const char *encoding;
6132
6133
733
    q = (const unsigned char *)s;
6134
733
    e = q + size;
6135
6136
733
    if (byteorder)
6137
656
        bo = *byteorder;
6138
6139
    /* Check for BOM marks (U+FEFF) in the input and adjust current
6140
       byte order setting accordingly. In native mode, the leading BOM
6141
       mark is skipped, in all other modes, it is copied to the output
6142
       stream as-is (giving a ZWNBSP character). */
6143
733
    if (bo == 0 && size >= 2) {
6144
89
        const Py_UCS4 bom = (q[1] << 8) | q[0];
6145
89
        if (bom == 0xFEFF) {
6146
18
            q += 2;
6147
18
            bo = -1;
6148
18
        }
6149
71
        else if (bom == 0xFFFE) {
6150
22
            q += 2;
6151
22
            bo = 1;
6152
22
        }
6153
89
        if (byteorder)
6154
12
            *byteorder = bo;
6155
89
    }
6156
6157
733
    if (q == e) {
6158
2
        if (consumed)
6159
0
            *consumed = size;
6160
2
        _Py_RETURN_UNICODE_EMPTY();
6161
2
    }
6162
6163
731
#if PY_LITTLE_ENDIAN
6164
731
    native_ordering = bo <= 0;
6165
731
    encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
6166
#else
6167
    native_ordering = bo >= 0;
6168
    encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
6169
#endif
6170
6171
    /* Note: size will always be longer than the resulting Unicode
6172
       character count normally.  Error handler will take care of
6173
       resizing when needed. */
6174
731
    _PyUnicodeWriter_Init(&writer);
6175
731
    writer.min_length = (e - q + 1) / 2;
6176
731
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
6177
0
        goto onError;
6178
6179
90.9k
    while (1) {
6180
90.9k
        Py_UCS4 ch = 0;
6181
90.9k
        if (e - q >= 2) {
6182
90.6k
            int kind = writer.kind;
6183
90.6k
            if (kind == PyUnicode_1BYTE_KIND) {
6184
961
                if (PyUnicode_IS_ASCII(writer.buffer))
6185
729
                    ch = asciilib_utf16_decode(&q, e,
6186
729
                            (Py_UCS1*)writer.data, &writer.pos,
6187
729
                            native_ordering);
6188
232
                else
6189
232
                    ch = ucs1lib_utf16_decode(&q, e,
6190
232
                            (Py_UCS1*)writer.data, &writer.pos,
6191
232
                            native_ordering);
6192
89.6k
            } else if (kind == PyUnicode_2BYTE_KIND) {
6193
22.8k
                ch = ucs2lib_utf16_decode(&q, e,
6194
22.8k
                        (Py_UCS2*)writer.data, &writer.pos,
6195
22.8k
                        native_ordering);
6196
66.7k
            } else {
6197
66.7k
                assert(kind == PyUnicode_4BYTE_KIND);
6198
66.7k
                ch = ucs4lib_utf16_decode(&q, e,
6199
66.7k
                        (Py_UCS4*)writer.data, &writer.pos,
6200
66.7k
                        native_ordering);
6201
66.7k
            }
6202
90.6k
        }
6203
6204
90.9k
        switch (ch)
6205
90.9k
        {
6206
716
        case 0:
6207
            /* remaining byte at the end? (size should be even) */
6208
716
            if (q == e || consumed)
6209
428
                goto End;
6210
288
            errmsg = "truncated data";
6211
288
            startinpos = ((const char *)q) - starts;
6212
288
            endinpos = ((const char *)e) - starts;
6213
288
            break;
6214
            /* The remaining input chars are ignored if the callback
6215
               chooses to skip the input */
6216
158
        case 1:
6217
158
            q -= 2;
6218
158
            if (consumed)
6219
0
                goto End;
6220
158
            errmsg = "unexpected end of data";
6221
158
            startinpos = ((const char *)q) - starts;
6222
158
            endinpos = ((const char *)e) - starts;
6223
158
            break;
6224
58.5k
        case 2:
6225
58.5k
            errmsg = "illegal encoding";
6226
58.5k
            startinpos = ((const char *)q) - 2 - starts;
6227
58.5k
            endinpos = startinpos + 2;
6228
58.5k
            break;
6229
30.6k
        case 3:
6230
30.6k
            errmsg = "illegal UTF-16 surrogate";
6231
30.6k
            startinpos = ((const char *)q) - 4 - starts;
6232
30.6k
            endinpos = startinpos + 2;
6233
30.6k
            break;
6234
815
        default:
6235
815
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
6236
0
                goto onError;
6237
815
            continue;
6238
90.9k
        }
6239
6240
89.6k
        if (unicode_decode_call_errorhandler_writer(
6241
89.6k
                errors,
6242
89.6k
                &errorHandler,
6243
89.6k
                encoding, errmsg,
6244
89.6k
                &starts,
6245
89.6k
                (const char **)&e,
6246
89.6k
                &startinpos,
6247
89.6k
                &endinpos,
6248
89.6k
                &exc,
6249
89.6k
                (const char **)&q,
6250
89.6k
                &writer))
6251
303
            goto onError;
6252
89.6k
    }
6253
6254
428
End:
6255
428
    if (consumed)
6256
0
        *consumed = (const char *)q-starts;
6257
6258
428
    Py_XDECREF(errorHandler);
6259
428
    Py_XDECREF(exc);
6260
428
    return _PyUnicodeWriter_Finish(&writer);
6261
6262
303
  onError:
6263
303
    _PyUnicodeWriter_Dealloc(&writer);
6264
303
    Py_XDECREF(errorHandler);
6265
303
    Py_XDECREF(exc);
6266
303
    return NULL;
6267
731
}
6268
6269
/* Encode the str object `str` as UTF-16 and return a new bytes object, or
   NULL with an exception set.

   errors: error-handler name used when a character cannot be encoded
       (reported as "surrogates not allowed").
   byteorder: < 0 emits little-endian, > 0 big-endian; 0 emits native
       ordering preceded by the code unit 0xFEFF. */
PyObject *
_PyUnicode_EncodeUTF16(PyObject *str,
                       const char *errors,
                       int byteorder)
{
    if (!PyUnicode_Check(str)) {
        PyErr_BadArgument();
        return NULL;
    }

    const int kind = PyUnicode_KIND(str);
    const void *data = PyUnicode_DATA(str);
    const Py_ssize_t len = PyUnicode_GET_LENGTH(str);
    const int with_bom = (byteorder == 0);

    /* Characters at or above U+10000 need two 16-bit units each; only the
       4-byte representation can contain them. */
    Py_ssize_t pairs = 0;
    if (kind == PyUnicode_4BYTE_KIND) {
        const Py_UCS4 *chars = (const Py_UCS4 *)data;
        for (Py_ssize_t i = 0; i < len; i++) {
            if (chars[i] >= 0x10000) {
                pairs++;
            }
        }
    }

    /* Reject lengths whose byte size (2 * units) would overflow. */
    if (len > PY_SSIZE_T_MAX / 2 - pairs - with_bom) {
        return PyErr_NoMemory();
    }
    const Py_ssize_t nsize = len + pairs + with_bom;

#if PY_BIG_ENDIAN
    const int native_ordering = byteorder >= 0;
#else
    const int native_ordering = byteorder <= 0;
#endif

    if (kind == PyUnicode_1BYTE_KIND) {
        // gh-139156: the PyBytesWriter API is deliberately avoided here
        // because it adds overhead on short strings.
        PyObject *result = PyBytes_FromStringAndSize(NULL, nsize * 2);
        if (result == NULL) {
            return NULL;
        }

        /* the output buffer is 2-byte aligned */
        assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(result), 2));
        unsigned short *dst = (unsigned short *)PyBytes_AS_STRING(result);
        if (with_bom) {
            *dst++ = 0xFEFF;
        }
        if (len > 0) {
            ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &dst,
                                 native_ordering);
        }
        return result;
    }

    PyBytesWriter *writer = PyBytesWriter_Create(nsize * 2);
    if (writer == NULL) {
        return NULL;
    }

    /* the output buffer is 2-byte aligned */
    assert(_Py_IS_ALIGNED(PyBytesWriter_GetData(writer), 2));
    unsigned short *out = PyBytesWriter_GetData(writer);
    if (with_bom) {
        *out++ = 0xFEFF;
    }
    if (len == 0) {
        return PyBytesWriter_Finish(writer);
    }

    /* Codec name reported to the error handler. */
    const char *encoding = (byteorder < 0) ? "utf-16-le"
                         : (byteorder > 0) ? "utf-16-be"
                         : "utf-16";

    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *rep = NULL;

    Py_ssize_t pos = 0;
    while (pos < len) {
        /* Encode as far as possible; the helper's return value is how
           many characters it consumed, so stopping short of `len` means
           an unencodable character sits at `pos`. */
        if (kind == PyUnicode_2BYTE_KIND) {
            pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        else {
            assert(kind == PyUnicode_4BYTE_KIND);
            pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        if (pos == len) {
            break;
        }

        Py_ssize_t newpos;
        rep = unicode_encode_call_errorhandler(
                errors, &errorHandler,
                encoding, "surrogates not allowed",
                str, &exc, pos, pos + 1, &newpos);
        if (rep == NULL) {
            goto error;
        }

        /* A bytes replacement must consist of whole 16-bit units; a str
           replacement must be pure ASCII. */
        const int rep_is_bytes = PyBytes_Check(rep);
        Py_ssize_t repsize, moreunits;
        int rep_ok;
        if (rep_is_bytes) {
            repsize = PyBytes_GET_SIZE(rep);
            moreunits = repsize / 2;
            rep_ok = !(repsize & 1);
        }
        else {
            assert(PyUnicode_Check(rep));
            repsize = PyUnicode_GET_LENGTH(rep);
            moreunits = repsize;
            rep_ok = PyUnicode_IS_ASCII(rep);
        }
        if (!rep_ok) {
            raise_encode_exception(&exc, encoding,
                                   str, pos, pos + 1,
                                   "surrogates not allowed");
            goto error;
        }

        /* The units already reserved for the characters the handler
           skipped offset the extra room the replacement needs. */
        moreunits += pos - newpos;
        pos = newpos;

        /* two bytes are reserved for each surrogate */
        if (moreunits > 0) {
            out = PyBytesWriter_GrowAndUpdatePointer(writer, 2 * moreunits, out);
            if (out == NULL) {
                goto error;
            }
        }

        if (rep_is_bytes) {
            memcpy(out, PyBytes_AS_STRING(rep), repsize);
            out += repsize / 2;
        }
        else {
            /* rep is an ASCII str */
            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
            ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
                                 &out, native_ordering);
        }

        Py_CLEAR(rep);
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);

    /* Trim to the size actually written.  This matters, for example, when
       a string with isolated surrogates is encoded using the 'ignore'
       handler. */
    return PyBytesWriter_FinishWithPointer(writer, out);

  error:
    Py_XDECREF(rep);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    PyBytesWriter_Discard(writer);
    return NULL;
}
6434
6435
/* Encode `unicode` as UTF-16 by delegating to _PyUnicode_EncodeUTF16() with
   no error-handler name and byteorder 0. */
PyObject *
PyUnicode_AsUTF16String(PyObject *unicode)
{
    const char *errors = NULL;
    const int byteorder = 0;

    return _PyUnicode_EncodeUTF16(unicode, errors, byteorder);
}
6440
6441
_PyUnicode_Name_CAPI *
6442
_PyUnicode_GetNameCAPI(void)
6443
1.14k
{
6444
1.14k
    PyInterpreterState *interp = _PyInterpreterState_GET();
6445
1.14k
    _PyUnicode_Name_CAPI *ucnhash_capi;
6446
6447
1.14k
    ucnhash_capi = _Py_atomic_load_ptr(&interp->unicode.ucnhash_capi);
6448
1.14k
    if (ucnhash_capi == NULL) {
6449
2
        ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6450
2
                PyUnicodeData_CAPSULE_NAME, 1);
6451
6452
        // It's fine if we overwrite the value here. It's always the same value.
6453
2
        _Py_atomic_store_ptr(&interp->unicode.ucnhash_capi, ucnhash_capi);
6454
2
    }
6455
1.14k
    return ucnhash_capi;
6456
1.14k
}
6457
6458
/* --- Unicode Escape Codec ----------------------------------------------- */
6459
6460
PyObject *
6461
_PyUnicode_DecodeUnicodeEscapeInternal2(const char *s,
6462
                               Py_ssize_t size,
6463
                               const char *errors,
6464
                               Py_ssize_t *consumed,
6465
                               int *first_invalid_escape_char,
6466
                               const char **first_invalid_escape_ptr)
6467
15.7k
{
6468
15.7k
    const char *starts = s;
6469
15.7k
    const char *initial_starts = starts;
6470
15.7k
    _PyUnicodeWriter writer;
6471
15.7k
    const char *end;
6472
15.7k
    PyObject *errorHandler = NULL;
6473
15.7k
    PyObject *exc = NULL;
6474
15.7k
    _PyUnicode_Name_CAPI *ucnhash_capi;
6475
6476
    // so we can remember if we've seen an invalid escape char or not
6477
15.7k
    *first_invalid_escape_char = -1;
6478
15.7k
    *first_invalid_escape_ptr = NULL;
6479
6480
15.7k
    if (size == 0) {
6481
1.24k
        if (consumed) {
6482
0
            *consumed = 0;
6483
0
        }
6484
1.24k
        _Py_RETURN_UNICODE_EMPTY();
6485
1.24k
    }
6486
    /* Escaped strings will always be longer than the resulting
6487
       Unicode string, so we start with size here and then reduce the
6488
       length after conversion to the true value.
6489
       (but if the error callback returns a long replacement string
6490
       we'll have to allocate more space) */
6491
14.4k
    _PyUnicodeWriter_Init(&writer);
6492
14.4k
    writer.min_length = size;
6493
14.4k
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6494
0
        goto onError;
6495
0
    }
6496
6497
14.4k
    end = s + size;
6498
10.1M
    while (s < end) {
6499
10.1M
        unsigned char c = (unsigned char) *s++;
6500
10.1M
        Py_UCS4 ch;
6501
10.1M
        int count;
6502
10.1M
        const char *message;
6503
6504
10.1M
#define WRITE_ASCII_CHAR(ch)                                                  \
6505
10.1M
            do {                                                              \
6506
2.62M
                assert(ch <= 127);                                            \
6507
2.62M
                assert(writer.pos < writer.size);                             \
6508
2.62M
                PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch);  \
6509
2.62M
            } while(0)
6510
6511
10.1M
#define WRITE_CHAR(ch)                                                        \
6512
10.1M
            do {                                                              \
6513
7.85M
                if (ch <= writer.maxchar) {                                   \
6514
7.85M
                    assert(writer.pos < writer.size);                         \
6515
7.85M
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6516
7.85M
                }                                                             \
6517
7.85M
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6518
0
                    goto onError;                                             \
6519
0
                }                                                             \
6520
7.85M
            } while(0)
6521
6522
        /* Non-escape characters are interpreted as Unicode ordinals */
6523
10.1M
        if (c != '\\') {
6524
7.21M
            WRITE_CHAR(c);
6525
7.21M
            continue;
6526
7.21M
        }
6527
6528
2.88M
        Py_ssize_t startinpos = s - starts - 1;
6529
        /* \ - Escapes */
6530
2.88M
        if (s >= end) {
6531
0
            message = "\\ at end of string";
6532
0
            goto incomplete;
6533
0
        }
6534
2.88M
        c = (unsigned char) *s++;
6535
6536
2.88M
        assert(writer.pos < writer.size);
6537
2.88M
        switch (c) {
6538
6539
            /* \x escapes */
6540
2.79k
        case '\n': continue;
6541
246k
        case '\\': WRITE_ASCII_CHAR('\\'); continue;
6542
246k
        case '\'': WRITE_ASCII_CHAR('\''); continue;
6543
282k
        case '\"': WRITE_ASCII_CHAR('\"'); continue;
6544
282k
        case 'b': WRITE_ASCII_CHAR('\b'); continue;
6545
        /* FF */
6546
404k
        case 'f': WRITE_ASCII_CHAR('\014'); continue;
6547
404k
        case 't': WRITE_ASCII_CHAR('\t'); continue;
6548
680k
        case 'n': WRITE_ASCII_CHAR('\n'); continue;
6549
680k
        case 'r': WRITE_ASCII_CHAR('\r'); continue;
6550
        /* VT */
6551
386k
        case 'v': WRITE_ASCII_CHAR('\013'); continue;
6552
        /* BEL, not classic C */
6553
30.8k
        case 'a': WRITE_ASCII_CHAR('\007'); continue;
6554
6555
            /* \OOO (octal) escapes */
6556
75.6k
        case '0': case '1': case '2': case '3':
6557
88.6k
        case '4': case '5': case '6': case '7':
6558
88.6k
            ch = c - '0';
6559
88.6k
            if (s < end && '0' <= *s && *s <= '7') {
6560
10.1k
                ch = (ch<<3) + *s++ - '0';
6561
10.1k
                if (s < end && '0' <= *s && *s <= '7') {
6562
1.15k
                    ch = (ch<<3) + *s++ - '0';
6563
1.15k
                }
6564
10.1k
            }
6565
88.6k
            if (ch > 0377) {
6566
377
                if (*first_invalid_escape_char == -1) {
6567
49
                    *first_invalid_escape_char = ch;
6568
49
                    if (starts == initial_starts) {
6569
                        /* Back up 3 chars, since we've already incremented s. */
6570
49
                        *first_invalid_escape_ptr = s - 3;
6571
49
                    }
6572
49
                }
6573
377
            }
6574
88.6k
            WRITE_CHAR(ch);
6575
88.6k
            continue;
6576
6577
            /* hex escapes */
6578
            /* \xXX */
6579
88.6k
        case 'x':
6580
95
            count = 2;
6581
95
            message = "truncated \\xXX escape";
6582
95
            goto hexescape;
6583
6584
            /* \uXXXX */
6585
668
        case 'u':
6586
668
            count = 4;
6587
668
            message = "truncated \\uXXXX escape";
6588
668
            goto hexescape;
6589
6590
            /* \UXXXXXXXX */
6591
172k
        case 'U':
6592
172k
            count = 8;
6593
172k
            message = "truncated \\UXXXXXXXX escape";
6594
173k
        hexescape:
6595
1.55M
            for (ch = 0; count; ++s, --count) {
6596
1.38M
                if (s >= end) {
6597
2
                    goto incomplete;
6598
2
                }
6599
1.38M
                c = (unsigned char)*s;
6600
1.38M
                ch <<= 4;
6601
1.38M
                if (c >= '0' && c <= '9') {
6602
1.10M
                    ch += c - '0';
6603
1.10M
                }
6604
274k
                else if (c >= 'a' && c <= 'f') {
6605
274k
                    ch += c - ('a' - 10);
6606
274k
                }
6607
137
                else if (c >= 'A' && c <= 'F') {
6608
130
                    ch += c - ('A' - 10);
6609
130
                }
6610
7
                else {
6611
7
                    goto error;
6612
7
                }
6613
1.38M
            }
6614
6615
            /* when we get here, ch is a 32-bit unicode character */
6616
173k
            if (ch > MAX_UNICODE) {
6617
0
                message = "illegal Unicode character";
6618
0
                goto error;
6619
0
            }
6620
6621
173k
            WRITE_CHAR(ch);
6622
173k
            continue;
6623
6624
            /* \N{name} */
6625
173k
        case 'N':
6626
1.14k
            ucnhash_capi = _PyUnicode_GetNameCAPI();
6627
1.14k
            if (ucnhash_capi == NULL) {
6628
0
                PyErr_SetString(
6629
0
                        PyExc_UnicodeError,
6630
0
                        "\\N escapes not supported (can't load unicodedata module)"
6631
0
                );
6632
0
                goto onError;
6633
0
            }
6634
6635
1.14k
            message = "malformed \\N character escape";
6636
1.14k
            if (s >= end) {
6637
8
                goto incomplete;
6638
8
            }
6639
1.13k
            if (*s == '{') {
6640
1.12k
                const char *start = ++s;
6641
1.12k
                size_t namelen;
6642
                /* look for the closing brace */
6643
3.14M
                while (s < end && *s != '}')
6644
3.14M
                    s++;
6645
1.12k
                if (s >= end) {
6646
2
                    goto incomplete;
6647
2
                }
6648
1.12k
                namelen = s - start;
6649
1.12k
                if (namelen) {
6650
                    /* found a name.  look it up in the unicode database */
6651
1.12k
                    s++;
6652
1.12k
                    ch = 0xffffffff; /* in case 'getcode' messes up */
6653
1.12k
                    if (namelen <= INT_MAX &&
6654
1.12k
                        ucnhash_capi->getcode(start, (int)namelen,
6655
1.12k
                                              &ch, 0)) {
6656
1.09k
                        assert(ch <= MAX_UNICODE);
6657
1.09k
                        WRITE_CHAR(ch);
6658
1.09k
                        continue;
6659
1.09k
                    }
6660
34
                    message = "unknown Unicode character name";
6661
34
                }
6662
1.12k
            }
6663
41
            goto error;
6664
6665
377k
        default:
6666
377k
            if (*first_invalid_escape_char == -1) {
6667
9.18k
                *first_invalid_escape_char = c;
6668
9.18k
                if (starts == initial_starts) {
6669
                    /* Back up one char, since we've already incremented s. */
6670
9.18k
                    *first_invalid_escape_ptr = s - 1;
6671
9.18k
                }
6672
9.18k
            }
6673
377k
            WRITE_ASCII_CHAR('\\');
6674
377k
            WRITE_CHAR(c);
6675
377k
            continue;
6676
2.88M
        }
6677
6678
12
      incomplete:
6679
12
        if (consumed) {
6680
0
            *consumed = startinpos;
6681
0
            break;
6682
0
        }
6683
60
      error:;
6684
60
        Py_ssize_t endinpos = s-starts;
6685
60
        writer.min_length = end - s + writer.pos;
6686
60
        if (unicode_decode_call_errorhandler_writer(
6687
60
                errors, &errorHandler,
6688
60
                "unicodeescape", message,
6689
60
                &starts, &end, &startinpos, &endinpos, &exc, &s,
6690
60
                &writer)) {
6691
60
            goto onError;
6692
60
        }
6693
60
        assert(end - s <= writer.size - writer.pos);
6694
6695
0
#undef WRITE_ASCII_CHAR
6696
0
#undef WRITE_CHAR
6697
0
    }
6698
6699
14.4k
    Py_XDECREF(errorHandler);
6700
14.4k
    Py_XDECREF(exc);
6701
14.4k
    return _PyUnicodeWriter_Finish(&writer);
6702
6703
60
  onError:
6704
60
    _PyUnicodeWriter_Dealloc(&writer);
6705
60
    Py_XDECREF(errorHandler);
6706
60
    Py_XDECREF(exc);
6707
60
    return NULL;
6708
14.4k
}
6709
6710
PyObject *
6711
_PyUnicode_DecodeUnicodeEscapeStateful(const char *s,
6712
                              Py_ssize_t size,
6713
                              const char *errors,
6714
                              Py_ssize_t *consumed)
6715
0
{
6716
0
    int first_invalid_escape_char;
6717
0
    const char *first_invalid_escape_ptr;
6718
0
    PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal2(s, size, errors,
6719
0
                                                      consumed,
6720
0
                                                      &first_invalid_escape_char,
6721
0
                                                      &first_invalid_escape_ptr);
6722
0
    if (result == NULL)
6723
0
        return NULL;
6724
0
    if (first_invalid_escape_char != -1) {
6725
0
        if (first_invalid_escape_char > 0xff) {
6726
0
            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6727
0
                                 "\"\\%o\" is an invalid octal escape sequence. "
6728
0
                                 "Such sequences will not work in the future. ",
6729
0
                                 first_invalid_escape_char) < 0)
6730
0
            {
6731
0
                Py_DECREF(result);
6732
0
                return NULL;
6733
0
            }
6734
0
        }
6735
0
        else {
6736
0
            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6737
0
                                 "\"\\%c\" is an invalid escape sequence. "
6738
0
                                 "Such sequences will not work in the future. ",
6739
0
                                 first_invalid_escape_char) < 0)
6740
0
            {
6741
0
                Py_DECREF(result);
6742
0
                return NULL;
6743
0
            }
6744
0
        }
6745
0
    }
6746
0
    return result;
6747
0
}
6748
6749
PyObject *
6750
PyUnicode_DecodeUnicodeEscape(const char *s,
6751
                              Py_ssize_t size,
6752
                              const char *errors)
6753
0
{
6754
0
    return _PyUnicode_DecodeUnicodeEscapeStateful(s, size, errors, NULL);
6755
0
}
6756
6757
/* Return a Unicode-Escape string version of the Unicode object. */
6758
6759
PyObject *
6760
PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
6761
0
{
6762
0
    if (!PyUnicode_Check(unicode)) {
6763
0
        PyErr_BadArgument();
6764
0
        return NULL;
6765
0
    }
6766
6767
0
    Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
6768
0
    if (len == 0) {
6769
0
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
6770
0
    }
6771
0
    int kind = PyUnicode_KIND(unicode);
6772
0
    const void *data = PyUnicode_DATA(unicode);
6773
6774
    /* Initial allocation is based on the longest-possible character
6775
     * escape.
6776
     *
6777
     * For UCS1 strings it's '\xxx', 4 bytes per source character.
6778
     * For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6779
     * For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character. */
6780
0
    Py_ssize_t expandsize = kind * 2 + 2;
6781
0
    if (len > PY_SSIZE_T_MAX / expandsize) {
6782
0
        return PyErr_NoMemory();
6783
0
    }
6784
6785
0
    PyBytesWriter *writer = PyBytesWriter_Create(expandsize * len);
6786
0
    if (writer == NULL) {
6787
0
        return NULL;
6788
0
    }
6789
0
    char *p = PyBytesWriter_GetData(writer);
6790
6791
0
    for (Py_ssize_t i = 0; i < len; i++) {
6792
0
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6793
6794
        /* U+0000-U+00ff range */
6795
0
        if (ch < 0x100) {
6796
0
            if (ch >= ' ' && ch < 127) {
6797
0
                if (ch != '\\') {
6798
                    /* Copy printable US ASCII as-is */
6799
0
                    *p++ = (char) ch;
6800
0
                }
6801
                /* Escape backslashes */
6802
0
                else {
6803
0
                    *p++ = '\\';
6804
0
                    *p++ = '\\';
6805
0
                }
6806
0
            }
6807
6808
            /* Map special whitespace to '\t', \n', '\r' */
6809
0
            else if (ch == '\t') {
6810
0
                *p++ = '\\';
6811
0
                *p++ = 't';
6812
0
            }
6813
0
            else if (ch == '\n') {
6814
0
                *p++ = '\\';
6815
0
                *p++ = 'n';
6816
0
            }
6817
0
            else if (ch == '\r') {
6818
0
                *p++ = '\\';
6819
0
                *p++ = 'r';
6820
0
            }
6821
6822
            /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6823
0
            else {
6824
0
                *p++ = '\\';
6825
0
                *p++ = 'x';
6826
0
                *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6827
0
                *p++ = Py_hexdigits[ch & 0x000F];
6828
0
            }
6829
0
        }
6830
        /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
6831
0
        else if (ch < 0x10000) {
6832
0
            *p++ = '\\';
6833
0
            *p++ = 'u';
6834
0
            *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6835
0
            *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6836
0
            *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6837
0
            *p++ = Py_hexdigits[ch & 0x000F];
6838
0
        }
6839
        /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6840
0
        else {
6841
6842
            /* Make sure that the first two digits are zero */
6843
0
            assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6844
0
            *p++ = '\\';
6845
0
            *p++ = 'U';
6846
0
            *p++ = '0';
6847
0
            *p++ = '0';
6848
0
            *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6849
0
            *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6850
0
            *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6851
0
            *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6852
0
            *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6853
0
            *p++ = Py_hexdigits[ch & 0x0000000F];
6854
0
        }
6855
0
    }
6856
6857
0
    return PyBytesWriter_FinishWithPointer(writer, p);
6858
0
}
6859
6860
/* --- Raw Unicode Escape Codec ------------------------------------------- */
6861
6862
PyObject *
6863
_PyUnicode_DecodeRawUnicodeEscapeStateful(const char *s,
6864
                                          Py_ssize_t size,
6865
                                          const char *errors,
6866
                                          Py_ssize_t *consumed)
6867
0
{
6868
0
    const char *starts = s;
6869
0
    _PyUnicodeWriter writer;
6870
0
    const char *end;
6871
0
    PyObject *errorHandler = NULL;
6872
0
    PyObject *exc = NULL;
6873
6874
0
    if (size == 0) {
6875
0
        if (consumed) {
6876
0
            *consumed = 0;
6877
0
        }
6878
0
        _Py_RETURN_UNICODE_EMPTY();
6879
0
    }
6880
6881
    /* Escaped strings will always be longer than the resulting
6882
       Unicode string, so we start with size here and then reduce the
6883
       length after conversion to the true value. (But decoding error
6884
       handler might have to resize the string) */
6885
0
    _PyUnicodeWriter_Init(&writer);
6886
0
    writer.min_length = size;
6887
0
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6888
0
        goto onError;
6889
0
    }
6890
6891
0
    end = s + size;
6892
0
    while (s < end) {
6893
0
        unsigned char c = (unsigned char) *s++;
6894
0
        Py_UCS4 ch;
6895
0
        int count;
6896
0
        const char *message;
6897
6898
0
#define WRITE_CHAR(ch)                                                        \
6899
0
            do {                                                              \
6900
0
                if (ch <= writer.maxchar) {                                   \
6901
0
                    assert(writer.pos < writer.size);                         \
6902
0
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6903
0
                }                                                             \
6904
0
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6905
0
                    goto onError;                                             \
6906
0
                }                                                             \
6907
0
            } while(0)
6908
6909
        /* Non-escape characters are interpreted as Unicode ordinals */
6910
0
        if (c != '\\' || (s >= end && !consumed)) {
6911
0
            WRITE_CHAR(c);
6912
0
            continue;
6913
0
        }
6914
6915
0
        Py_ssize_t startinpos = s - starts - 1;
6916
        /* \ - Escapes */
6917
0
        if (s >= end) {
6918
0
            assert(consumed);
6919
            // Set message to silent compiler warning.
6920
            // Actually it is never used.
6921
0
            message = "\\ at end of string";
6922
0
            goto incomplete;
6923
0
        }
6924
6925
0
        c = (unsigned char) *s++;
6926
0
        if (c == 'u') {
6927
0
            count = 4;
6928
0
            message = "truncated \\uXXXX escape";
6929
0
        }
6930
0
        else if (c == 'U') {
6931
0
            count = 8;
6932
0
            message = "truncated \\UXXXXXXXX escape";
6933
0
        }
6934
0
        else {
6935
0
            assert(writer.pos < writer.size);
6936
0
            PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6937
0
            WRITE_CHAR(c);
6938
0
            continue;
6939
0
        }
6940
6941
        /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6942
0
        for (ch = 0; count; ++s, --count) {
6943
0
            if (s >= end) {
6944
0
                goto incomplete;
6945
0
            }
6946
0
            c = (unsigned char)*s;
6947
0
            ch <<= 4;
6948
0
            if (c >= '0' && c <= '9') {
6949
0
                ch += c - '0';
6950
0
            }
6951
0
            else if (c >= 'a' && c <= 'f') {
6952
0
                ch += c - ('a' - 10);
6953
0
            }
6954
0
            else if (c >= 'A' && c <= 'F') {
6955
0
                ch += c - ('A' - 10);
6956
0
            }
6957
0
            else {
6958
0
                goto error;
6959
0
            }
6960
0
        }
6961
0
        if (ch > MAX_UNICODE) {
6962
0
            message = "\\Uxxxxxxxx out of range";
6963
0
            goto error;
6964
0
        }
6965
0
        WRITE_CHAR(ch);
6966
0
        continue;
6967
6968
0
      incomplete:
6969
0
        if (consumed) {
6970
0
            *consumed = startinpos;
6971
0
            break;
6972
0
        }
6973
0
      error:;
6974
0
        Py_ssize_t endinpos = s-starts;
6975
0
        writer.min_length = end - s + writer.pos;
6976
0
        if (unicode_decode_call_errorhandler_writer(
6977
0
                errors, &errorHandler,
6978
0
                "rawunicodeescape", message,
6979
0
                &starts, &end, &startinpos, &endinpos, &exc, &s,
6980
0
                &writer)) {
6981
0
            goto onError;
6982
0
        }
6983
0
        assert(end - s <= writer.size - writer.pos);
6984
6985
0
#undef WRITE_CHAR
6986
0
    }
6987
0
    Py_XDECREF(errorHandler);
6988
0
    Py_XDECREF(exc);
6989
0
    return _PyUnicodeWriter_Finish(&writer);
6990
6991
0
  onError:
6992
0
    _PyUnicodeWriter_Dealloc(&writer);
6993
0
    Py_XDECREF(errorHandler);
6994
0
    Py_XDECREF(exc);
6995
0
    return NULL;
6996
0
}
6997
6998
PyObject *
6999
PyUnicode_DecodeRawUnicodeEscape(const char *s,
7000
                                 Py_ssize_t size,
7001
                                 const char *errors)
7002
0
{
7003
0
    return _PyUnicode_DecodeRawUnicodeEscapeStateful(s, size, errors, NULL);
7004
0
}
7005
7006
7007
/* Encode a str object with the raw-unicode-escape codec.

   Characters below U+0100 are copied as single bytes, characters below
   U+10000 become '\uHHHH', and everything else becomes '\U00HHHHHH'.

   Returns a new bytes object, or NULL with an exception set if `unicode`
   is not a str object or memory cannot be allocated. */
PyObject *
PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    const int kind = PyUnicode_KIND(unicode);
    const void *data = PyUnicode_DATA(unicode);
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(unicode);

    if (nchars == 0) {
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
    }
    if (kind == PyUnicode_1BYTE_KIND) {
        /* Nothing needs escaping: the character data is the output. */
        return PyBytes_FromStringAndSize(data, nchars);
    }

    /* Worst case per input character: kind * 2 + 2 bytes, i.e. 6 bytes
       ('\uHHHH') for 2-byte strings and 10 bytes ('\U00HHHHHH') for
       4-byte strings. */
    const Py_ssize_t max_per_char = kind * 2 + 2;
    if (nchars > PY_SSIZE_T_MAX / max_per_char) {
        return PyErr_NoMemory();
    }

    PyBytesWriter *writer = PyBytesWriter_Create(max_per_char * nchars);
    if (writer == NULL) {
        return NULL;
    }
    char *out = PyBytesWriter_GetData(writer);

    Py_ssize_t idx = 0;
    while (idx < nchars) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, idx);
        idx++;

        /* U+0000-U+00ff range: copy 8-bit characters as-is */
        if (ch < 0x100) {
            *out++ = (char) ch;
            continue;
        }

        int ndigits;
        *out++ = '\\';
        if (ch < 0x10000) {
            /* U+0100-U+ffff range: '\uHHHH' */
            *out++ = 'u';
            ndigits = 4;
        }
        else {
            /* U+010000-U+10ffff range: '\U00HHHHHH'; the two leading
               digits are always zero. */
            assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
            *out++ = 'U';
            *out++ = '0';
            *out++ = '0';
            ndigits = 6;
        }
        /* Emit the hex digits, most significant nibble first. */
        for (int shift = 4 * (ndigits - 1); shift >= 0; shift -= 4) {
            *out++ = Py_hexdigits[(ch >> shift) & 0xf];
        }
    }

    return PyBytesWriter_FinishWithPointer(writer, out);
}
7071
7072
/* --- Latin-1 Codec ------------------------------------------------------ */
7073
7074
PyObject *
7075
PyUnicode_DecodeLatin1(const char *s,
7076
                       Py_ssize_t size,
7077
                       const char *errors)
7078
383
{
7079
    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
7080
383
    return _PyUnicode_FromUCS1((const unsigned char*)s, size);
7081
383
}
7082
7083
/* create or adjust a UnicodeEncodeError */
7084
static void
7085
make_encode_exception(PyObject **exceptionObject,
7086
                      const char *encoding,
7087
                      PyObject *unicode,
7088
                      Py_ssize_t startpos, Py_ssize_t endpos,
7089
                      const char *reason)
7090
2.34k
{
7091
2.34k
    if (*exceptionObject == NULL) {
7092
2.34k
        *exceptionObject = PyObject_CallFunction(
7093
2.34k
            PyExc_UnicodeEncodeError, "sOnns",
7094
2.34k
            encoding, unicode, startpos, endpos, reason);
7095
2.34k
    }
7096
0
    else {
7097
0
        if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
7098
0
            goto onError;
7099
0
        if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
7100
0
            goto onError;
7101
0
        if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
7102
0
            goto onError;
7103
0
        return;
7104
0
      onError:
7105
0
        Py_CLEAR(*exceptionObject);
7106
0
    }
7107
2.34k
}
7108
7109
/* raises a UnicodeEncodeError */
7110
static void
7111
raise_encode_exception(PyObject **exceptionObject,
7112
                       const char *encoding,
7113
                       PyObject *unicode,
7114
                       Py_ssize_t startpos, Py_ssize_t endpos,
7115
                       const char *reason)
7116
2.33k
{
7117
2.33k
    make_encode_exception(exceptionObject,
7118
2.33k
                          encoding, unicode, startpos, endpos, reason);
7119
2.33k
    if (*exceptionObject != NULL)
7120
2.33k
        PyCodec_StrictErrors(*exceptionObject);
7121
2.33k
}
7122
7123
/* error handling callback helper:
7124
   build arguments, call the callback and check the arguments,
7125
   put the result into newpos and return the replacement string, which
7126
   has to be freed by the caller */
7127
static PyObject *
7128
unicode_encode_call_errorhandler(const char *errors,
7129
                                 PyObject **errorHandler,
7130
                                 const char *encoding, const char *reason,
7131
                                 PyObject *unicode, PyObject **exceptionObject,
7132
                                 Py_ssize_t startpos, Py_ssize_t endpos,
7133
                                 Py_ssize_t *newpos)
7134
7
{
7135
7
    static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
7136
7
    Py_ssize_t len;
7137
7
    PyObject *restuple;
7138
7
    PyObject *resunicode;
7139
7140
7
    if (*errorHandler == NULL) {
7141
7
        *errorHandler = PyCodec_LookupError(errors);
7142
7
        if (*errorHandler == NULL)
7143
0
            return NULL;
7144
7
    }
7145
7146
7
    len = PyUnicode_GET_LENGTH(unicode);
7147
7148
7
    make_encode_exception(exceptionObject,
7149
7
                          encoding, unicode, startpos, endpos, reason);
7150
7
    if (*exceptionObject == NULL)
7151
0
        return NULL;
7152
7153
7
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
7154
7
    if (restuple == NULL)
7155
7
        return NULL;
7156
0
    if (!PyTuple_Check(restuple)) {
7157
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
7158
0
        Py_DECREF(restuple);
7159
0
        return NULL;
7160
0
    }
7161
0
    if (!PyArg_ParseTuple(restuple, argparse,
7162
0
                          &resunicode, newpos)) {
7163
0
        Py_DECREF(restuple);
7164
0
        return NULL;
7165
0
    }
7166
0
    if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
7167
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
7168
0
        Py_DECREF(restuple);
7169
0
        return NULL;
7170
0
    }
7171
0
    if (*newpos<0)
7172
0
        *newpos = len + *newpos;
7173
0
    if (*newpos<0 || *newpos>len) {
7174
0
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7175
0
        Py_DECREF(restuple);
7176
0
        return NULL;
7177
0
    }
7178
0
    Py_INCREF(resunicode);
7179
0
    Py_DECREF(restuple);
7180
0
    return resunicode;
7181
0
}
7182
7183
static PyObject *
7184
unicode_encode_ucs1(PyObject *unicode,
7185
                    const char *errors,
7186
                    const Py_UCS4 limit)
7187
2.33k
{
7188
    /* input state */
7189
2.33k
    Py_ssize_t pos=0, size;
7190
2.33k
    int kind;
7191
2.33k
    const void *data;
7192
2.33k
    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
7193
2.33k
    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
7194
2.33k
    PyObject *error_handler_obj = NULL;
7195
2.33k
    PyObject *exc = NULL;
7196
2.33k
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
7197
2.33k
    PyObject *rep = NULL;
7198
7199
2.33k
    size = PyUnicode_GET_LENGTH(unicode);
7200
2.33k
    kind = PyUnicode_KIND(unicode);
7201
2.33k
    data = PyUnicode_DATA(unicode);
7202
    /* allocate enough for a simple encoding without
7203
       replacements, if we need more, we'll resize */
7204
2.33k
    if (size == 0)
7205
0
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
7206
7207
    /* output object */
7208
2.33k
    PyBytesWriter *writer = PyBytesWriter_Create(size);
7209
2.33k
    if (writer == NULL) {
7210
0
        return NULL;
7211
0
    }
7212
    /* pointer into the output */
7213
2.33k
    char *str = PyBytesWriter_GetData(writer);
7214
7215
6.05k
    while (pos < size) {
7216
6.05k
        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
7217
7218
        /* can we encode this? */
7219
6.05k
        if (ch < limit) {
7220
            /* no overflow check, because we know that the space is enough */
7221
3.71k
            *str++ = (char)ch;
7222
3.71k
            ++pos;
7223
3.71k
        }
7224
2.33k
        else {
7225
2.33k
            Py_ssize_t newpos, i;
7226
            /* startpos for collecting unencodable chars */
7227
2.33k
            Py_ssize_t collstart = pos;
7228
2.33k
            Py_ssize_t collend = collstart + 1;
7229
            /* find all unecodable characters */
7230
7231
51.1k
            while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
7232
48.7k
                ++collend;
7233
7234
            /* Only overallocate the buffer if it's not the last write */
7235
2.33k
            writer->overallocate = (collend < size);
7236
7237
            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
7238
2.33k
            if (error_handler == _Py_ERROR_UNKNOWN)
7239
2.33k
                error_handler = _Py_GetErrorHandler(errors);
7240
7241
2.33k
            switch (error_handler) {
7242
2.33k
            case _Py_ERROR_STRICT:
7243
2.33k
                raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
7244
2.33k
                goto onError;
7245
7246
0
            case _Py_ERROR_REPLACE:
7247
0
                memset(str, '?', collend - collstart);
7248
0
                str += (collend - collstart);
7249
0
                _Py_FALLTHROUGH;
7250
0
            case _Py_ERROR_IGNORE:
7251
0
                pos = collend;
7252
0
                break;
7253
7254
0
            case _Py_ERROR_BACKSLASHREPLACE:
7255
                /* subtract preallocated bytes */
7256
0
                writer->size -= (collend - collstart);
7257
0
                str = backslashreplace(writer, str,
7258
0
                                       unicode, collstart, collend);
7259
0
                if (str == NULL)
7260
0
                    goto onError;
7261
0
                pos = collend;
7262
0
                break;
7263
7264
0
            case _Py_ERROR_XMLCHARREFREPLACE:
7265
                /* subtract preallocated bytes */
7266
0
                writer->size -= (collend - collstart);
7267
0
                str = xmlcharrefreplace(writer, str,
7268
0
                                        unicode, collstart, collend);
7269
0
                if (str == NULL)
7270
0
                    goto onError;
7271
0
                pos = collend;
7272
0
                break;
7273
7274
0
            case _Py_ERROR_SURROGATEESCAPE:
7275
0
                for (i = collstart; i < collend; ++i) {
7276
0
                    ch = PyUnicode_READ(kind, data, i);
7277
0
                    if (ch < 0xdc80 || 0xdcff < ch) {
7278
                        /* Not a UTF-8b surrogate */
7279
0
                        break;
7280
0
                    }
7281
0
                    *str++ = (char)(ch - 0xdc00);
7282
0
                    ++pos;
7283
0
                }
7284
0
                if (i >= collend)
7285
0
                    break;
7286
0
                collstart = pos;
7287
0
                assert(collstart != collend);
7288
0
                _Py_FALLTHROUGH;
7289
7290
0
            default:
7291
0
                rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
7292
0
                                                       encoding, reason, unicode, &exc,
7293
0
                                                       collstart, collend, &newpos);
7294
0
                if (rep == NULL)
7295
0
                    goto onError;
7296
7297
0
                if (newpos < collstart) {
7298
0
                    writer->overallocate = 1;
7299
0
                    str = PyBytesWriter_GrowAndUpdatePointer(writer,
7300
0
                                                             collstart - newpos,
7301
0
                                                             str);
7302
0
                    if (str == NULL) {
7303
0
                        goto onError;
7304
0
                    }
7305
0
                }
7306
0
                else {
7307
                    /* subtract preallocated bytes */
7308
0
                    writer->size -= newpos - collstart;
7309
                    /* Only overallocate the buffer if it's not the last write */
7310
0
                    writer->overallocate = (newpos < size);
7311
0
                }
7312
7313
0
                char *rep_str;
7314
0
                Py_ssize_t rep_len;
7315
0
                if (PyBytes_Check(rep)) {
7316
                    /* Directly copy bytes result to output. */
7317
0
                    rep_str = PyBytes_AS_STRING(rep);
7318
0
                    rep_len = PyBytes_GET_SIZE(rep);
7319
0
                }
7320
0
                else {
7321
0
                    assert(PyUnicode_Check(rep));
7322
7323
0
                    if (limit == 256 ?
7324
0
                        PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
7325
0
                        !PyUnicode_IS_ASCII(rep))
7326
0
                    {
7327
                        /* Not all characters are smaller than limit */
7328
0
                        raise_encode_exception(&exc, encoding, unicode,
7329
0
                                               collstart, collend, reason);
7330
0
                        goto onError;
7331
0
                    }
7332
0
                    assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
7333
0
                    rep_str = PyUnicode_DATA(rep);
7334
0
                    rep_len = PyUnicode_GET_LENGTH(rep);
7335
0
                }
7336
7337
0
                str = PyBytesWriter_GrowAndUpdatePointer(writer, rep_len, str);
7338
0
                if (str == NULL) {
7339
0
                    goto onError;
7340
0
                }
7341
0
                memcpy(str, rep_str, rep_len);
7342
0
                str += rep_len;
7343
7344
0
                pos = newpos;
7345
0
                Py_CLEAR(rep);
7346
2.33k
            }
7347
7348
            /* If overallocation was disabled, ensure that it was the last
7349
               write. Otherwise, we missed an optimization */
7350
2.33k
            assert(writer->overallocate || pos == size);
7351
0
        }
7352
6.05k
    }
7353
7354
0
    Py_XDECREF(error_handler_obj);
7355
0
    Py_XDECREF(exc);
7356
0
    return PyBytesWriter_FinishWithPointer(writer, str);
7357
7358
2.33k
  onError:
7359
2.33k
    Py_XDECREF(rep);
7360
2.33k
    PyBytesWriter_Discard(writer);
7361
2.33k
    Py_XDECREF(error_handler_obj);
7362
2.33k
    Py_XDECREF(exc);
7363
2.33k
    return NULL;
7364
2.33k
}
7365
7366
/* Encode a str object to Latin-1 using the error handler named by
   `errors`.

   Returns a new bytes object, or NULL with an exception set when
   `unicode` is not a str object or encoding fails. */
PyObject *
_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND) {
        /* Non-Latin-1 characters present: the generic encoder applies
           the error handler (or raises the exception). */
        return unicode_encode_ucs1(unicode, errors, 256);
    }

    /* Fast path: a one-byte string is already Latin-1, so the bytes
       object is built directly from its data. */
    const char *bytes = PyUnicode_DATA(unicode);
    Py_ssize_t nbytes = PyUnicode_GET_LENGTH(unicode);
    return PyBytes_FromStringAndSize(bytes, nbytes);
}
7382
7383
/* Encode a str object to Latin-1, passing NULL as the error handler
   name to _PyUnicode_AsLatin1String(). */
PyObject*
PyUnicode_AsLatin1String(PyObject *unicode)
{
    PyObject *encoded = _PyUnicode_AsLatin1String(unicode, NULL);
    return encoded;
}
7388
7389
/* --- 7-bit ASCII Codec -------------------------------------------------- */
7390
7391
PyObject *
7392
PyUnicode_DecodeASCII(const char *s,
7393
                      Py_ssize_t size,
7394
                      const char *errors)
7395
8.66k
{
7396
8.66k
    const char *starts = s;
7397
8.66k
    const char *e = s + size;
7398
8.66k
    PyObject *error_handler_obj = NULL;
7399
8.66k
    PyObject *exc = NULL;
7400
8.66k
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
7401
7402
8.66k
    if (size == 0)
7403
9
        _Py_RETURN_UNICODE_EMPTY();
7404
7405
    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
7406
8.65k
    if (size == 1 && (unsigned char)s[0] < 128) {
7407
368
        return get_latin1_char((unsigned char)s[0]);
7408
368
    }
7409
7410
    // Shortcut for simple case
7411
8.29k
    PyObject *u = PyUnicode_New(size, 127);
7412
8.29k
    if (u == NULL) {
7413
0
        return NULL;
7414
0
    }
7415
8.29k
    Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
7416
8.29k
    if (outpos == size) {
7417
8.01k
        return u;
7418
8.01k
    }
7419
7420
276
    _PyUnicodeWriter writer;
7421
276
    _PyUnicodeWriter_InitWithBuffer(&writer, u);
7422
276
    writer.pos = outpos;
7423
7424
276
    s += outpos;
7425
276
    int kind = writer.kind;
7426
276
    void *data = writer.data;
7427
276
    Py_ssize_t startinpos, endinpos;
7428
7429
7.51M
    while (s < e) {
7430
7.51M
        unsigned char c = (unsigned char)*s;
7431
7.51M
        if (c < 128) {
7432
7.48M
            PyUnicode_WRITE(kind, data, writer.pos, c);
7433
7.48M
            writer.pos++;
7434
7.48M
            ++s;
7435
7.48M
            continue;
7436
7.48M
        }
7437
7438
        /* byte outsize range 0x00..0x7f: call the error handler */
7439
7440
34.1k
        if (error_handler == _Py_ERROR_UNKNOWN)
7441
276
            error_handler = _Py_GetErrorHandler(errors);
7442
7443
34.1k
        switch (error_handler)
7444
34.1k
        {
7445
0
        case _Py_ERROR_REPLACE:
7446
34.0k
        case _Py_ERROR_SURROGATEESCAPE:
7447
            /* Fast-path: the error handler only writes one character,
7448
               but we may switch to UCS2 at the first write */
7449
34.0k
            if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7450
0
                goto onError;
7451
34.0k
            kind = writer.kind;
7452
34.0k
            data = writer.data;
7453
7454
34.0k
            if (error_handler == _Py_ERROR_REPLACE)
7455
0
                PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7456
34.0k
            else
7457
34.0k
                PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7458
34.0k
            writer.pos++;
7459
34.0k
            ++s;
7460
34.0k
            break;
7461
7462
0
        case _Py_ERROR_IGNORE:
7463
0
            ++s;
7464
0
            break;
7465
7466
107
        default:
7467
107
            startinpos = s-starts;
7468
107
            endinpos = startinpos + 1;
7469
107
            if (unicode_decode_call_errorhandler_writer(
7470
107
                    errors, &error_handler_obj,
7471
107
                    "ascii", "ordinal not in range(128)",
7472
107
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
7473
107
                    &writer))
7474
107
                goto onError;
7475
0
            kind = writer.kind;
7476
0
            data = writer.data;
7477
34.1k
        }
7478
34.1k
    }
7479
169
    Py_XDECREF(error_handler_obj);
7480
169
    Py_XDECREF(exc);
7481
169
    return _PyUnicodeWriter_Finish(&writer);
7482
7483
107
  onError:
7484
107
    _PyUnicodeWriter_Dealloc(&writer);
7485
107
    Py_XDECREF(error_handler_obj);
7486
107
    Py_XDECREF(exc);
7487
107
    return NULL;
7488
276
}
7489
7490
/* Encode a str object to ASCII using the error handler named by
   `errors`.

   Returns a new bytes object, or NULL with an exception set when
   `unicode` is not a str object or encoding fails. */
PyObject *
_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (!PyUnicode_IS_ASCII(unicode)) {
        /* Non-ASCII characters present: the generic encoder applies the
           error handler (or raises the exception). */
        return unicode_encode_ucs1(unicode, errors, 128);
    }

    /* Fast path: an ASCII-only string is copied into the bytes object
       directly. */
    const char *bytes = PyUnicode_DATA(unicode);
    Py_ssize_t nbytes = PyUnicode_GET_LENGTH(unicode);
    return PyBytes_FromStringAndSize(bytes, nbytes);
}
7504
7505
/* Encode a str object to ASCII, passing NULL as the error handler name
   to _PyUnicode_AsASCIIString(). */
PyObject *
PyUnicode_AsASCIIString(PyObject *unicode)
{
    PyObject *encoded = _PyUnicode_AsASCIIString(unicode, NULL);
    return encoded;
}
7510
7511
#ifdef MS_WINDOWS
7512
7513
/* --- MBCS codecs for Windows -------------------------------------------- */
7514
7515
/* NEED_RETRY is defined when int is narrower than size_t.  When it is
   defined, decode_code_page_stateful() decodes inputs larger than
   DECODING_CHUNK_SIZE in several chunks. */
#if SIZEOF_INT < SIZEOF_SIZE_T
7516
#define NEED_RETRY
7517
#endif
7518
7519
/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
7520
   transcoding from UTF-16), but INT_MAX / 4 performs better in
7521
   both cases also and avoids partial characters overrunning the
7522
   length limit in MultiByteToWideChar on Windows */
7523
#define DECODING_CHUNK_SIZE (INT_MAX/4)
7524
7525
/* Fallback definition used only when the included headers do not already
   provide WC_ERR_INVALID_CHARS (presumably older Windows SDKs; TODO
   confirm). */
#ifndef WC_ERR_INVALID_CHARS
7526
#  define WC_ERR_INVALID_CHARS 0x0080
7527
#endif
7528
7529
static const char*
7530
code_page_name(UINT code_page, PyObject **obj)
7531
{
7532
    *obj = NULL;
7533
    if (code_page == CP_ACP)
7534
        return "mbcs";
7535
7536
    *obj = PyBytes_FromFormat("cp%u", code_page);
7537
    if (*obj == NULL)
7538
        return NULL;
7539
    return PyBytes_AS_STRING(*obj);
7540
}
7541
7542
static DWORD
7543
decode_code_page_flags(UINT code_page)
7544
{
7545
    if (code_page == CP_UTF7) {
7546
        /* The CP_UTF7 decoder only supports flags=0 */
7547
        return 0;
7548
    }
7549
    else
7550
        return MB_ERR_INVALID_CHARS;
7551
}
7552
7553
/*
7554
 * Decode a byte string from a Windows code page into unicode object in strict
7555
 * mode.
7556
 *
7557
 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7558
 * OSError and returns -1 on other error.
7559
 */
7560
static int
7561
decode_code_page_strict(UINT code_page,
7562
                        wchar_t **buf,
7563
                        Py_ssize_t *bufsize,
7564
                        const char *in,
7565
                        int insize)
7566
{
7567
    DWORD flags = MB_ERR_INVALID_CHARS;
7568
    wchar_t *out;
7569
    DWORD outsize;
7570
7571
    /* First get the size of the result */
7572
    assert(insize > 0);
7573
    while ((outsize = MultiByteToWideChar(code_page, flags,
7574
                                          in, insize, NULL, 0)) <= 0)
7575
    {
7576
        if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7577
            goto error;
7578
        }
7579
        /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7580
        flags = 0;
7581
    }
7582
7583
    /* Extend a wchar_t* buffer */
7584
    Py_ssize_t n = *bufsize;   /* Get the current length */
7585
    if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7586
        return -1;
7587
    }
7588
    out = *buf + n;
7589
7590
    /* Do the conversion */
7591
    outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7592
    if (outsize <= 0)
7593
        goto error;
7594
    return insize;
7595
7596
error:
7597
    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7598
        return -2;
7599
    PyErr_SetFromWindowsErr(0);
7600
    return -1;
7601
}
7602
7603
/*
7604
 * Decode a byte string from a code page into unicode object with an error
7605
 * handler.
7606
 *
7607
 * Returns consumed size if succeed, or raise an OSError or
7608
 * UnicodeDecodeError exception and returns -1 on error.
7609
 */
7610
static int
7611
decode_code_page_errors(UINT code_page,
7612
                        wchar_t **buf,
7613
                        Py_ssize_t *bufsize,
7614
                        const char *in, const int size,
7615
                        const char *errors, int final)
7616
{
7617
    const char *startin = in;
7618
    const char *endin = in + size;
7619
    DWORD flags = MB_ERR_INVALID_CHARS;
7620
    /* Ideally, we should get reason from FormatMessage. This is the Windows
7621
       2000 English version of the message. */
7622
    const char *reason = "No mapping for the Unicode character exists "
7623
                         "in the target code page.";
7624
    /* each step cannot decode more than 1 character, but a character can be
7625
       represented as a surrogate pair */
7626
    wchar_t buffer[2], *out;
7627
    int insize;
7628
    Py_ssize_t outsize;
7629
    PyObject *errorHandler = NULL;
7630
    PyObject *exc = NULL;
7631
    PyObject *encoding_obj = NULL;
7632
    const char *encoding;
7633
    DWORD err;
7634
    int ret = -1;
7635
7636
    assert(size > 0);
7637
7638
    encoding = code_page_name(code_page, &encoding_obj);
7639
    if (encoding == NULL)
7640
        return -1;
7641
7642
    if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
7643
        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7644
           UnicodeDecodeError. */
7645
        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7646
        if (exc != NULL) {
7647
            PyCodec_StrictErrors(exc);
7648
            Py_CLEAR(exc);
7649
        }
7650
        goto error;
7651
    }
7652
7653
    /* Extend a wchar_t* buffer */
7654
    Py_ssize_t n = *bufsize;   /* Get the current length */
7655
    if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7656
        PyErr_NoMemory();
7657
        goto error;
7658
    }
7659
    if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7660
        goto error;
7661
    }
7662
    out = *buf + n;
7663
7664
    /* Decode the byte string character per character */
7665
    while (in < endin)
7666
    {
7667
        /* Decode a character */
7668
        insize = 1;
7669
        do
7670
        {
7671
            outsize = MultiByteToWideChar(code_page, flags,
7672
                                          in, insize,
7673
                                          buffer, Py_ARRAY_LENGTH(buffer));
7674
            if (outsize > 0)
7675
                break;
7676
            err = GetLastError();
7677
            if (err == ERROR_INVALID_FLAGS && flags) {
7678
                /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7679
                flags = 0;
7680
                continue;
7681
            }
7682
            if (err != ERROR_NO_UNICODE_TRANSLATION
7683
                && err != ERROR_INSUFFICIENT_BUFFER)
7684
            {
7685
                PyErr_SetFromWindowsErr(err);
7686
                goto error;
7687
            }
7688
            insize++;
7689
        }
7690
        /* 4=maximum length of a UTF-8 sequence */
7691
        while (insize <= 4 && (in + insize) <= endin);
7692
7693
        if (outsize <= 0) {
7694
            Py_ssize_t startinpos, endinpos, outpos;
7695
7696
            /* last character in partial decode? */
7697
            if (in + insize >= endin && !final)
7698
                break;
7699
7700
            startinpos = in - startin;
7701
            endinpos = startinpos + 1;
7702
            outpos = out - *buf;
7703
            if (unicode_decode_call_errorhandler_wchar(
7704
                    errors, &errorHandler,
7705
                    encoding, reason,
7706
                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
7707
                    buf, bufsize, &outpos))
7708
            {
7709
                goto error;
7710
            }
7711
            out = *buf + outpos;
7712
        }
7713
        else {
7714
            in += insize;
7715
            memcpy(out, buffer, outsize * sizeof(wchar_t));
7716
            out += outsize;
7717
        }
7718
    }
7719
7720
    /* Shrink the buffer */
7721
    assert(out - *buf <= *bufsize);
7722
    *bufsize = out - *buf;
7723
    /* (in - startin) <= size and size is an int */
7724
    ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
7725
7726
error:
7727
    Py_XDECREF(encoding_obj);
7728
    Py_XDECREF(errorHandler);
7729
    Py_XDECREF(exc);
7730
    return ret;
7731
}
7732
7733
static PyObject *
7734
decode_code_page_stateful(int code_page,
7735
                          const char *s, Py_ssize_t size,
7736
                          const char *errors, Py_ssize_t *consumed)
7737
{
7738
    wchar_t *buf = NULL;
7739
    Py_ssize_t bufsize = 0;
7740
    int chunk_size, final, converted, done;
7741
7742
    if (code_page < 0) {
7743
        PyErr_SetString(PyExc_ValueError, "invalid code page number");
7744
        return NULL;
7745
    }
7746
    if (size < 0) {
7747
        PyErr_BadInternalCall();
7748
        return NULL;
7749
    }
7750
7751
    if (consumed)
7752
        *consumed = 0;
7753
7754
    do
7755
    {
7756
#ifdef NEED_RETRY
7757
        if (size > DECODING_CHUNK_SIZE) {
7758
            chunk_size = DECODING_CHUNK_SIZE;
7759
            final = 0;
7760
            done = 0;
7761
        }
7762
        else
7763
#endif
7764
        {
7765
            chunk_size = (int)size;
7766
            final = (consumed == NULL);
7767
            done = 1;
7768
        }
7769
7770
        if (chunk_size == 0 && done) {
7771
            if (buf != NULL)
7772
                break;
7773
            _Py_RETURN_UNICODE_EMPTY();
7774
        }
7775
7776
        converted = decode_code_page_strict(code_page, &buf, &bufsize,
7777
                                            s, chunk_size);
7778
        if (converted == -2)
7779
            converted = decode_code_page_errors(code_page, &buf, &bufsize,
7780
                                                s, chunk_size,
7781
                                                errors, final);
7782
        assert(converted != 0 || done);
7783
7784
        if (converted < 0) {
7785
            PyMem_Free(buf);
7786
            return NULL;
7787
        }
7788
7789
        if (consumed)
7790
            *consumed += converted;
7791
7792
        s += converted;
7793
        size -= converted;
7794
    } while (!done);
7795
7796
    PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7797
    PyMem_Free(buf);
7798
    return v;
7799
}
7800
7801
PyObject *
7802
PyUnicode_DecodeCodePageStateful(int code_page,
7803
                                 const char *s,
7804
                                 Py_ssize_t size,
7805
                                 const char *errors,
7806
                                 Py_ssize_t *consumed)
7807
{
7808
    return decode_code_page_stateful(code_page, s, size, errors, consumed);
7809
}
7810
7811
PyObject *
7812
PyUnicode_DecodeMBCSStateful(const char *s,
7813
                             Py_ssize_t size,
7814
                             const char *errors,
7815
                             Py_ssize_t *consumed)
7816
{
7817
    return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7818
}
7819
7820
PyObject *
7821
PyUnicode_DecodeMBCS(const char *s,
7822
                     Py_ssize_t size,
7823
                     const char *errors)
7824
{
7825
    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7826
}
7827
7828
static DWORD
7829
encode_code_page_flags(UINT code_page, const char *errors)
7830
{
7831
    if (code_page == CP_UTF8) {
7832
        return WC_ERR_INVALID_CHARS;
7833
    }
7834
    else if (code_page == CP_UTF7) {
7835
        /* CP_UTF7 only supports flags=0 */
7836
        return 0;
7837
    }
7838
    else {
7839
        if (errors != NULL && strcmp(errors, "replace") == 0)
7840
            return 0;
7841
        else
7842
            return WC_NO_BEST_FIT_CHARS;
7843
    }
7844
}
7845
7846
/*
7847
 * Encode a Unicode string to a Windows code page into a byte string in strict
7848
 * mode.
7849
 *
7850
 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7851
 * an OSError and returns -1 on other error.
7852
 */
7853
static int
7854
encode_code_page_strict(UINT code_page, PyBytesWriter **writer,
7855
                        PyObject *unicode, Py_ssize_t offset, int len,
7856
                        const char* errors)
7857
{
7858
    BOOL usedDefaultChar = FALSE;
7859
    BOOL *pusedDefaultChar = &usedDefaultChar;
7860
    int outsize;
7861
    wchar_t *p;
7862
    Py_ssize_t size;
7863
    const DWORD flags = encode_code_page_flags(code_page, NULL);
7864
    char *out;
7865
    /* Create a substring so that we can get the UTF-16 representation
7866
       of just the slice under consideration. */
7867
    PyObject *substring;
7868
    int ret = -1;
7869
7870
    assert(len > 0);
7871
7872
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
7873
        pusedDefaultChar = &usedDefaultChar;
7874
    else
7875
        pusedDefaultChar = NULL;
7876
7877
    substring = PyUnicode_Substring(unicode, offset, offset+len);
7878
    if (substring == NULL)
7879
        return -1;
7880
    p = PyUnicode_AsWideCharString(substring, &size);
7881
    Py_CLEAR(substring);
7882
    if (p == NULL) {
7883
        return -1;
7884
    }
7885
    assert(size <= INT_MAX);
7886
7887
    /* First get the size of the result */
7888
    outsize = WideCharToMultiByte(code_page, flags,
7889
                                  p, (int)size,
7890
                                  NULL, 0,
7891
                                  NULL, pusedDefaultChar);
7892
    if (outsize <= 0)
7893
        goto error;
7894
    /* If we used a default char, then we failed! */
7895
    if (pusedDefaultChar && *pusedDefaultChar) {
7896
        ret = -2;
7897
        goto done;
7898
    }
7899
7900
    if (*writer == NULL) {
7901
        /* Create string object */
7902
        *writer = PyBytesWriter_Create(outsize);
7903
        if (*writer == NULL) {
7904
            goto done;
7905
        }
7906
        out = PyBytesWriter_GetData(*writer);
7907
    }
7908
    else {
7909
        /* Extend string object */
7910
        Py_ssize_t n = PyBytesWriter_GetSize(*writer);
7911
        if (PyBytesWriter_Grow(*writer, outsize) < 0) {
7912
            goto done;
7913
        }
7914
        out = (char*)PyBytesWriter_GetData(*writer) + n;
7915
    }
7916
7917
    /* Do the conversion */
7918
    outsize = WideCharToMultiByte(code_page, flags,
7919
                                  p, (int)size,
7920
                                  out, outsize,
7921
                                  NULL, pusedDefaultChar);
7922
    if (outsize <= 0)
7923
        goto error;
7924
    if (pusedDefaultChar && *pusedDefaultChar) {
7925
        ret = -2;
7926
        goto done;
7927
    }
7928
    ret = 0;
7929
7930
done:
7931
    PyMem_Free(p);
7932
    return ret;
7933
7934
error:
7935
    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
7936
        ret = -2;
7937
        goto done;
7938
    }
7939
    PyErr_SetFromWindowsErr(0);
7940
    goto done;
7941
}
7942
7943
/*
7944
 * Encode a Unicode string to a Windows code page into a byte string using an
7945
 * error handler.
7946
 *
7947
 * Returns consumed characters if succeed, or raise an OSError and returns
7948
 * -1 on other error.
7949
 */
7950
static int
7951
encode_code_page_errors(UINT code_page, PyBytesWriter **writer,
7952
                        PyObject *unicode, Py_ssize_t unicode_offset,
7953
                        Py_ssize_t insize, const char* errors)
7954
{
7955
    const DWORD flags = encode_code_page_flags(code_page, errors);
7956
    Py_ssize_t pos = unicode_offset;
7957
    Py_ssize_t endin = unicode_offset + insize;
7958
    /* Ideally, we should get reason from FormatMessage. This is the Windows
7959
       2000 English version of the message. */
7960
    const char *reason = "invalid character";
7961
    /* 4=maximum length of a UTF-8 sequence */
7962
    char buffer[4];
7963
    BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7964
    Py_ssize_t outsize;
7965
    char *out;
7966
    PyObject *errorHandler = NULL;
7967
    PyObject *exc = NULL;
7968
    PyObject *encoding_obj = NULL;
7969
    const char *encoding;
7970
    Py_ssize_t newpos;
7971
    PyObject *rep;
7972
    int ret = -1;
7973
7974
    assert(insize > 0);
7975
7976
    encoding = code_page_name(code_page, &encoding_obj);
7977
    if (encoding == NULL)
7978
        return -1;
7979
7980
    if (errors == NULL || strcmp(errors, "strict") == 0) {
7981
        /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7982
           then we raise a UnicodeEncodeError. */
7983
        make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
7984
        if (exc != NULL) {
7985
            PyCodec_StrictErrors(exc);
7986
            Py_DECREF(exc);
7987
        }
7988
        Py_XDECREF(encoding_obj);
7989
        return -1;
7990
    }
7991
7992
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
7993
        pusedDefaultChar = &usedDefaultChar;
7994
    else
7995
        pusedDefaultChar = NULL;
7996
7997
    if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7998
        PyErr_NoMemory();
7999
        goto error;
8000
    }
8001
    outsize = insize * Py_ARRAY_LENGTH(buffer);
8002
8003
    if (*writer == NULL) {
8004
        /* Create string object */
8005
        *writer = PyBytesWriter_Create(outsize);
8006
        if (*writer == NULL) {
8007
            goto error;
8008
        }
8009
        out = PyBytesWriter_GetData(*writer);
8010
    }
8011
    else {
8012
        /* Extend string object */
8013
        Py_ssize_t n = PyBytesWriter_GetSize(*writer);
8014
        if (PyBytesWriter_Grow(*writer, outsize) < 0) {
8015
            goto error;
8016
        }
8017
        out = (char*)PyBytesWriter_GetData(*writer) + n;
8018
    }
8019
8020
    /* Encode the string character per character */
8021
    while (pos < endin)
8022
    {
8023
        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
8024
        wchar_t chars[2];
8025
        int charsize;
8026
        if (ch < 0x10000) {
8027
            chars[0] = (wchar_t)ch;
8028
            charsize = 1;
8029
        }
8030
        else {
8031
            chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
8032
            chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
8033
            charsize = 2;
8034
        }
8035
8036
        outsize = WideCharToMultiByte(code_page, flags,
8037
                                      chars, charsize,
8038
                                      buffer, Py_ARRAY_LENGTH(buffer),
8039
                                      NULL, pusedDefaultChar);
8040
        if (outsize > 0) {
8041
            if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
8042
            {
8043
                pos++;
8044
                memcpy(out, buffer, outsize);
8045
                out += outsize;
8046
                continue;
8047
            }
8048
        }
8049
        else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
8050
            PyErr_SetFromWindowsErr(0);
8051
            goto error;
8052
        }
8053
8054
        rep = unicode_encode_call_errorhandler(
8055
                  errors, &errorHandler, encoding, reason,
8056
                  unicode, &exc,
8057
                  pos, pos + 1, &newpos);
8058
        if (rep == NULL)
8059
            goto error;
8060
8061
        Py_ssize_t morebytes = pos - newpos;
8062
        if (PyBytes_Check(rep)) {
8063
            outsize = PyBytes_GET_SIZE(rep);
8064
            morebytes += outsize;
8065
            if (morebytes > 0) {
8066
                out = PyBytesWriter_GrowAndUpdatePointer(*writer, morebytes, out);
8067
                if (out == NULL) {
8068
                    Py_DECREF(rep);
8069
                    goto error;
8070
                }
8071
            }
8072
            memcpy(out, PyBytes_AS_STRING(rep), outsize);
8073
            out += outsize;
8074
        }
8075
        else {
8076
            Py_ssize_t i;
8077
            int kind;
8078
            const void *data;
8079
8080
            outsize = PyUnicode_GET_LENGTH(rep);
8081
            morebytes += outsize;
8082
            if (morebytes > 0) {
8083
                out = PyBytesWriter_GrowAndUpdatePointer(*writer, morebytes, out);
8084
                if (out == NULL) {
8085
                    Py_DECREF(rep);
8086
                    goto error;
8087
                }
8088
            }
8089
            kind = PyUnicode_KIND(rep);
8090
            data = PyUnicode_DATA(rep);
8091
            for (i=0; i < outsize; i++) {
8092
                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8093
                if (ch > 127) {
8094
                    raise_encode_exception(&exc,
8095
                        encoding, unicode,
8096
                        pos, pos + 1,
8097
                        "unable to encode error handler result to ASCII");
8098
                    Py_DECREF(rep);
8099
                    goto error;
8100
                }
8101
                *out = (unsigned char)ch;
8102
                out++;
8103
            }
8104
        }
8105
        pos = newpos;
8106
        Py_DECREF(rep);
8107
    }
8108
    /* write a NUL byte */
8109
    *out = 0;
8110
    outsize = out - (char*)PyBytesWriter_GetData(*writer);
8111
    assert(outsize <= PyBytesWriter_GetSize(*writer));
8112
    if (PyBytesWriter_Resize(*writer, outsize) < 0) {
8113
        goto error;
8114
    }
8115
    ret = 0;
8116
8117
error:
8118
    Py_XDECREF(encoding_obj);
8119
    Py_XDECREF(errorHandler);
8120
    Py_XDECREF(exc);
8121
    return ret;
8122
}
8123
8124
8125
/* Encode a str object to bytes using a Windows code page.

   The string is processed in chunks: each chunk is first tried with
   encode_code_page_strict(); when that reports -2 the same chunk is handed
   to encode_code_page_errors().  Returns the finished bytes object, or NULL
   on error. */
PyObject *
PyUnicode_EncodeCodePage(int code_page,
                         PyObject *unicode,
                         const char *errors)
{
    PyBytesWriter *writer = NULL;
    Py_ssize_t remaining;
    Py_ssize_t start = 0;
    int last_chunk;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    remaining = PyUnicode_GET_LENGTH(unicode);

    if (code_page < 0) {
        PyErr_SetString(PyExc_ValueError, "invalid code page number");
        return NULL;
    }

    if (remaining == 0) {
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
    }

    do {
        int nchars;
        int status;

#ifdef NEED_RETRY
        /* Cap each chunk at DECODING_CHUNK_SIZE characters. */
        last_chunk = !(remaining > DECODING_CHUNK_SIZE);
        if (last_chunk) {
            nchars = (int)remaining;
        }
        else {
            nchars = DECODING_CHUNK_SIZE;
        }
#else
        last_chunk = 1;
        nchars = (int)remaining;
#endif

        status = encode_code_page_strict(code_page, &writer,
                                         unicode, start, nchars,
                                         errors);
        if (status == -2) {
            /* Strict mode could not encode the chunk: redo it through the
               error handler path. */
            status = encode_code_page_errors(code_page, &writer,
                                             unicode, start,
                                             nchars, errors);
        }
        if (status < 0) {
            PyBytesWriter_Discard(writer);
            return NULL;
        }

        start += nchars;
        remaining -= nchars;
    } while (!last_chunk);

    return PyBytesWriter_Finish(writer);
}
8183
8184
8185
/* Encode `unicode` with PyUnicode_EncodeCodePage() using code page CP_ACP
   and a NULL error handler name. */
PyObject *
PyUnicode_AsMBCSString(PyObject *unicode)
{
    PyObject *encoded = PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
    return encoded;
}
8190
8191
#undef NEED_RETRY
8192
8193
#endif /* MS_WINDOWS */
8194
8195
/* --- Character Mapping Codec -------------------------------------------- */
8196
8197
static int
8198
charmap_decode_string(const char *s,
8199
                      Py_ssize_t size,
8200
                      PyObject *mapping,
8201
                      const char *errors,
8202
                      _PyUnicodeWriter *writer)
8203
940
{
8204
940
    const char *starts = s;
8205
940
    const char *e;
8206
940
    Py_ssize_t startinpos, endinpos;
8207
940
    PyObject *errorHandler = NULL, *exc = NULL;
8208
940
    Py_ssize_t maplen;
8209
940
    int mapkind;
8210
940
    const void *mapdata;
8211
940
    Py_UCS4 x;
8212
940
    unsigned char ch;
8213
8214
940
    maplen = PyUnicode_GET_LENGTH(mapping);
8215
940
    mapdata = PyUnicode_DATA(mapping);
8216
940
    mapkind = PyUnicode_KIND(mapping);
8217
8218
0
    e = s + size;
8219
8220
940
    if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
8221
        /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
8222
         * is disabled in encoding aliases, latin1 is preferred because
8223
         * its implementation is faster. */
8224
14
        const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
8225
14
        Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8226
14
        Py_UCS4 maxchar = writer->maxchar;
8227
8228
14
        assert (writer->kind == PyUnicode_1BYTE_KIND);
8229
11.0k
        while (s < e) {
8230
11.0k
            ch = *s;
8231
11.0k
            x = mapdata_ucs1[ch];
8232
11.0k
            if (x > maxchar) {
8233
14
                if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
8234
0
                    goto onError;
8235
14
                maxchar = writer->maxchar;
8236
14
                outdata = (Py_UCS1 *)writer->data;
8237
14
            }
8238
11.0k
            outdata[writer->pos] = x;
8239
11.0k
            writer->pos++;
8240
11.0k
            ++s;
8241
11.0k
        }
8242
14
        return 0;
8243
14
    }
8244
8245
2.51k
    while (s < e) {
8246
2.50k
        if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
8247
2.50k
            int outkind = writer->kind;
8248
2.50k
            const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
8249
2.50k
            if (outkind == PyUnicode_1BYTE_KIND) {
8250
1.68k
                Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8251
1.68k
                Py_UCS4 maxchar = writer->maxchar;
8252
149k
                while (s < e) {
8253
149k
                    ch = *s;
8254
149k
                    x = mapdata_ucs2[ch];
8255
149k
                    if (x > maxchar)
8256
1.58k
                        goto Error;
8257
147k
                    outdata[writer->pos] = x;
8258
147k
                    writer->pos++;
8259
147k
                    ++s;
8260
147k
                }
8261
100
                break;
8262
1.68k
            }
8263
819
            else if (outkind == PyUnicode_2BYTE_KIND) {
8264
819
                Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
8265
1.61M
                while (s < e) {
8266
1.61M
                    ch = *s;
8267
1.61M
                    x = mapdata_ucs2[ch];
8268
1.61M
                    if (x == 0xFFFE)
8269
5
                        goto Error;
8270
1.61M
                    outdata[writer->pos] = x;
8271
1.61M
                    writer->pos++;
8272
1.61M
                    ++s;
8273
1.61M
                }
8274
814
                break;
8275
819
            }
8276
2.50k
        }
8277
0
        ch = *s;
8278
8279
0
        if (ch < maplen)
8280
0
            x = PyUnicode_READ(mapkind, mapdata, ch);
8281
0
        else
8282
0
            x = 0xfffe; /* invalid value */
8283
1.59k
Error:
8284
1.59k
        if (x == 0xfffe)
8285
9
        {
8286
            /* undefined mapping */
8287
9
            startinpos = s-starts;
8288
9
            endinpos = startinpos+1;
8289
9
            if (unicode_decode_call_errorhandler_writer(
8290
9
                    errors, &errorHandler,
8291
9
                    "charmap", "character maps to <undefined>",
8292
9
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
8293
9
                    writer)) {
8294
9
                goto onError;
8295
9
            }
8296
0
            continue;
8297
9
        }
8298
8299
1.58k
        if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8300
0
            goto onError;
8301
1.58k
        ++s;
8302
1.58k
    }
8303
917
    Py_XDECREF(errorHandler);
8304
917
    Py_XDECREF(exc);
8305
917
    return 0;
8306
8307
9
onError:
8308
9
    Py_XDECREF(errorHandler);
8309
9
    Py_XDECREF(exc);
8310
9
    return -1;
8311
926
}
8312
8313
static int
8314
charmap_decode_mapping(const char *s,
8315
                       Py_ssize_t size,
8316
                       PyObject *mapping,
8317
                       const char *errors,
8318
                       _PyUnicodeWriter *writer)
8319
0
{
8320
0
    const char *starts = s;
8321
0
    const char *e;
8322
0
    Py_ssize_t startinpos, endinpos;
8323
0
    PyObject *errorHandler = NULL, *exc = NULL;
8324
0
    unsigned char ch;
8325
0
    PyObject *key, *item = NULL;
8326
8327
0
    e = s + size;
8328
8329
0
    while (s < e) {
8330
0
        ch = *s;
8331
8332
        /* Get mapping (char ordinal -> integer, Unicode char or None) */
8333
0
        key = PyLong_FromLong((long)ch);
8334
0
        if (key == NULL)
8335
0
            goto onError;
8336
8337
0
        int rc = PyMapping_GetOptionalItem(mapping, key, &item);
8338
0
        Py_DECREF(key);
8339
0
        if (rc == 0) {
8340
            /* No mapping found means: mapping is undefined. */
8341
0
            goto Undefined;
8342
0
        }
8343
0
        if (item == NULL) {
8344
0
            if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8345
                /* No mapping found means: mapping is undefined. */
8346
0
                PyErr_Clear();
8347
0
                goto Undefined;
8348
0
            } else
8349
0
                goto onError;
8350
0
        }
8351
8352
        /* Apply mapping */
8353
0
        if (item == Py_None)
8354
0
            goto Undefined;
8355
0
        if (PyLong_Check(item)) {
8356
0
            long value = PyLong_AsLong(item);
8357
0
            if (value == 0xFFFE)
8358
0
                goto Undefined;
8359
0
            if (value < 0 || value > MAX_UNICODE) {
8360
0
                PyErr_Format(PyExc_TypeError,
8361
0
                             "character mapping must be in range(0x%lx)",
8362
0
                             (unsigned long)MAX_UNICODE + 1);
8363
0
                goto onError;
8364
0
            }
8365
8366
0
            if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8367
0
                goto onError;
8368
0
        }
8369
0
        else if (PyUnicode_Check(item)) {
8370
0
            if (PyUnicode_GET_LENGTH(item) == 1) {
8371
0
                Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8372
0
                if (value == 0xFFFE)
8373
0
                    goto Undefined;
8374
0
                if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8375
0
                    goto onError;
8376
0
            }
8377
0
            else {
8378
0
                writer->overallocate = 1;
8379
0
                if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8380
0
                    goto onError;
8381
0
            }
8382
0
        }
8383
0
        else {
8384
            /* wrong return value */
8385
0
            PyErr_SetString(PyExc_TypeError,
8386
0
                            "character mapping must return integer, None or str");
8387
0
            goto onError;
8388
0
        }
8389
0
        Py_CLEAR(item);
8390
0
        ++s;
8391
0
        continue;
8392
8393
0
Undefined:
8394
        /* undefined mapping */
8395
0
        Py_CLEAR(item);
8396
0
        startinpos = s-starts;
8397
0
        endinpos = startinpos+1;
8398
0
        if (unicode_decode_call_errorhandler_writer(
8399
0
                errors, &errorHandler,
8400
0
                "charmap", "character maps to <undefined>",
8401
0
                &starts, &e, &startinpos, &endinpos, &exc, &s,
8402
0
                writer)) {
8403
0
            goto onError;
8404
0
        }
8405
0
    }
8406
0
    Py_XDECREF(errorHandler);
8407
0
    Py_XDECREF(exc);
8408
0
    return 0;
8409
8410
0
onError:
8411
0
    Py_XDECREF(item);
8412
0
    Py_XDECREF(errorHandler);
8413
0
    Py_XDECREF(exc);
8414
0
    return -1;
8415
0
}
8416
8417
PyObject *
8418
PyUnicode_DecodeCharmap(const char *s,
8419
                        Py_ssize_t size,
8420
                        PyObject *mapping,
8421
                        const char *errors)
8422
940
{
8423
940
    _PyUnicodeWriter writer;
8424
8425
    /* Default to Latin-1 */
8426
940
    if (mapping == NULL)
8427
0
        return PyUnicode_DecodeLatin1(s, size, errors);
8428
8429
940
    if (size == 0)
8430
0
        _Py_RETURN_UNICODE_EMPTY();
8431
940
    _PyUnicodeWriter_Init(&writer);
8432
940
    writer.min_length = size;
8433
940
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
8434
0
        goto onError;
8435
8436
940
    if (PyUnicode_CheckExact(mapping)) {
8437
940
        if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8438
9
            goto onError;
8439
940
    }
8440
0
    else {
8441
0
        if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8442
0
            goto onError;
8443
0
    }
8444
931
    return _PyUnicodeWriter_Finish(&writer);
8445
8446
9
  onError:
8447
9
    _PyUnicodeWriter_Dealloc(&writer);
8448
9
    return NULL;
8449
940
}
8450
8451
/* Charmap encoding: the lookup table */
8452
8453
/*[clinic input]
8454
class EncodingMap "struct encoding_map *" "&EncodingMapType"
8455
[clinic start generated code]*/
8456
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=14e46bbb6c522d22]*/
8457
8458
struct encoding_map {
8459
    PyObject_HEAD
8460
    unsigned char level1[32];
8461
    int count2, count3;
8462
    unsigned char level23[1];
8463
};
8464
8465
/*[clinic input]
8466
EncodingMap.size
8467
8468
Return the size (in bytes) of this object.
8469
[clinic start generated code]*/
8470
8471
static PyObject *
8472
EncodingMap_size_impl(struct encoding_map *self)
8473
/*[clinic end generated code: output=c4c969e4c99342a4 input=004ff13f26bb5366]*/
8474
0
{
8475
0
    return PyLong_FromLong((sizeof(*self) - 1) + 16*self->count2 +
8476
0
                           128*self->count3);
8477
0
}
8478
8479
static PyMethodDef encoding_map_methods[] = {
8480
    ENCODINGMAP_SIZE_METHODDEF
8481
    {NULL, NULL}
8482
};
8483
8484
static PyTypeObject EncodingMapType = {
8485
    PyVarObject_HEAD_INIT(NULL, 0)
8486
    .tp_name = "EncodingMap",
8487
    .tp_basicsize = sizeof(struct encoding_map),
8488
    /* methods */
8489
    .tp_flags = Py_TPFLAGS_DEFAULT,
8490
    .tp_methods = encoding_map_methods,
8491
};
8492
8493
/* Build an encoding lookup object from a decoding table.

   `string` is a non-empty str whose character at index i is the code point
   that byte i decodes to; only the first 256 characters are used.  The
   result is an EncodingMap (three-level trie) when possible, otherwise a
   dict mapping code point -> byte value.  Returns NULL on error. */
PyObject*
PyUnicode_BuildEncodingMap(PyObject* string)
{
    unsigned char used1[32];    /* level-1 slots, 0xFF = not yet used */
    unsigned char used2[512];   /* level-2 slots, 0xFF = not yet used */
    int nblocks2 = 0;
    int nblocks3 = 0;
    int use_dict = 0;
    int kind;
    const void *data;
    int nchars;
    int idx;

    if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
        PyErr_BadArgument();
        return NULL;
    }
    kind = PyUnicode_KIND(string);
    data = PyUnicode_DATA(string);
    nchars = (int)Py_MIN(PyUnicode_GET_LENGTH(string), 256);
    memset(used1, 0xFF, sizeof used1);
    memset(used2, 0xFF, sizeof used2);

    /* If there isn't a one-to-one mapping of NULL to \0,
       or if there are non-BMP characters, we need to use
       a mapping dictionary. */
    if (PyUnicode_READ(kind, data, 0) != 0) {
        use_dict = 1;
    }
    /* First pass: count how many level-2 and level-3 blocks are needed. */
    for (idx = 1; idx < nchars; idx++) {
        Py_UCS4 cp = PyUnicode_READ(kind, data, idx);
        int slot1, slot2;

        if (cp == 0 || cp > 0xFFFF) {
            use_dict = 1;
            break;
        }
        if (cp == 0xFFFE) {
            /* unmapped character */
            continue;
        }
        slot1 = cp >> 11;
        slot2 = cp >> 7;
        if (used1[slot1] == 0xFF) {
            used1[slot1] = nblocks2++;
        }
        if (used2[slot2] == 0xFF) {
            used2[slot2] = nblocks3++;
        }
    }

    /* 0xFF is the "unused" marker, so block numbers must stay below it. */
    if (nblocks2 >= 0xFF || nblocks3 >= 0xFF) {
        use_dict = 1;
    }

    if (use_dict) {
        PyObject *table = PyDict_New();
        if (!table) {
            return NULL;
        }
        for (idx = 0; idx < nchars; idx++) {
            Py_UCS4 cp = PyUnicode_READ(kind, data, idx);
            PyObject *key;
            PyObject *value;
            int rc;

            key = PyLong_FromLong(cp);
            if (key == NULL) {
                Py_DECREF(table);
                return NULL;
            }
            value = PyLong_FromLong(idx);
            if (value == NULL) {
                Py_DECREF(key);
                Py_DECREF(table);
                return NULL;
            }
            rc = PyDict_SetItem(table, key, value);
            Py_DECREF(key);
            Py_DECREF(value);
            if (rc < 0) {
                Py_DECREF(table);
                return NULL;
            }
        }
        return table;
    }

    /* Create a three-level trie */
    PyObject *result = PyObject_Malloc(sizeof(struct encoding_map) +
                                       16*nblocks2 + 128*nblocks3 - 1);
    if (!result) {
        return PyErr_NoMemory();
    }

    _PyObject_Init(result, &EncodingMapType);
    struct encoding_map *map = (struct encoding_map*)result;
    map->count2 = nblocks2;
    map->count3 = nblocks3;

    unsigned char *lvl1 = map->level1;
    unsigned char *lvl2 = map->level23;
    unsigned char *lvl3 = map->level23 + 16*nblocks2;
    memcpy(lvl1, used1, 32);
    memset(lvl2, 0xFF, 16*nblocks2);
    memset(lvl3, 0, 128*nblocks3);

    /* Second pass: number the level-3 blocks again from zero and store the
       byte value of every mapped character. */
    nblocks3 = 0;
    for (idx = 1; idx < nchars; idx++) {
        Py_UCS4 cp = PyUnicode_READ(kind, data, idx);
        int off1, off2, off3, pos2, pos3;

        if (cp == 0xFFFE) {
            /* unmapped character */
            continue;
        }
        off1 = cp>>11;
        off2 = (cp>>7) & 0xF;
        pos2 = 16*lvl1[off1] + off2;
        if (lvl2[pos2] == 0xFF) {
            lvl2[pos2] = nblocks3++;
        }
        off3 = cp & 0x7F;
        pos3 = 128*lvl2[pos2] + off3;
        lvl3[pos3] = idx;
    }
    return result;
}
8608
8609
static int
8610
encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
8611
0
{
8612
0
    struct encoding_map *map = (struct encoding_map*)mapping;
8613
0
    int l1 = c>>11;
8614
0
    int l2 = (c>>7) & 0xF;
8615
0
    int l3 = c & 0x7F;
8616
0
    int i;
8617
8618
0
    if (c > 0xFFFF)
8619
0
        return -1;
8620
0
    if (c == 0)
8621
0
        return 0;
8622
    /* level 1*/
8623
0
    i = map->level1[l1];
8624
0
    if (i == 0xFF) {
8625
0
        return -1;
8626
0
    }
8627
    /* level 2*/
8628
0
    i = map->level23[16*i+l2];
8629
0
    if (i == 0xFF) {
8630
0
        return -1;
8631
0
    }
8632
    /* level 3 */
8633
0
    i = map->level23[16*map->count2 + 128*i + l3];
8634
0
    if (i == 0) {
8635
0
        return -1;
8636
0
    }
8637
0
    return i;
8638
0
}
8639
8640
/* Lookup the character in the mapping.
8641
   On success, return PyLong, PyBytes or None (if the character can't be found).
8642
   If the result is PyLong, put its value in replace.
8643
   On error, return NULL.
8644
   */
8645
static PyObject *
8646
charmapencode_lookup(Py_UCS4 c, PyObject *mapping, unsigned char *replace)
8647
0
{
8648
0
    PyObject *w = PyLong_FromLong((long)c);
8649
0
    PyObject *x;
8650
8651
0
    if (w == NULL)
8652
0
        return NULL;
8653
0
    int rc = PyMapping_GetOptionalItem(mapping, w, &x);
8654
0
    Py_DECREF(w);
8655
0
    if (rc == 0) {
8656
        /* No mapping found means: mapping is undefined. */
8657
0
        Py_RETURN_NONE;
8658
0
    }
8659
0
    if (x == NULL) {
8660
0
        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8661
            /* No mapping found means: mapping is undefined. */
8662
0
            PyErr_Clear();
8663
0
            Py_RETURN_NONE;
8664
0
        } else
8665
0
            return NULL;
8666
0
    }
8667
0
    else if (x == Py_None)
8668
0
        return x;
8669
0
    else if (PyLong_Check(x)) {
8670
0
        long value = PyLong_AsLong(x);
8671
0
        if (value < 0 || value > 255) {
8672
0
            PyErr_SetString(PyExc_TypeError,
8673
0
                            "character mapping must be in range(256)");
8674
0
            Py_DECREF(x);
8675
0
            return NULL;
8676
0
        }
8677
0
        *replace = (unsigned char)value;
8678
0
        return x;
8679
0
    }
8680
0
    else if (PyBytes_Check(x))
8681
0
        return x;
8682
0
    else {
8683
        /* wrong return value */
8684
0
        PyErr_Format(PyExc_TypeError,
8685
0
                     "character mapping must return integer, bytes or None, not %.400s",
8686
0
                     Py_TYPE(x)->tp_name);
8687
0
        Py_DECREF(x);
8688
0
        return NULL;
8689
0
    }
8690
0
}
8691
8692
static int
8693
charmapencode_resize(PyBytesWriter *writer, Py_ssize_t *outpos, Py_ssize_t requiredsize)
8694
0
{
8695
0
    Py_ssize_t outsize = PyBytesWriter_GetSize(writer);
8696
    /* exponentially overallocate to minimize reallocations */
8697
0
    if (requiredsize < 2 * outsize)
8698
0
        requiredsize = 2 * outsize;
8699
0
    return PyBytesWriter_Resize(writer, requiredsize);
8700
0
}
8701
8702
/* Status codes returned by charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,
    enc_FAILED,
    enc_EXCEPTION
} charmapencode_result;
8705
/* lookup the character, put the result in the output string and adjust
8706
   various state variables. Resize the output bytes object if not enough
8707
   space is available. Return a new reference to the object that
8708
   was put in the output buffer, or Py_None, if the mapping was undefined
8709
   (in which case no character was written) or NULL, if a
8710
   reallocation error occurred. The caller must decref the result */
8711
static charmapencode_result
8712
charmapencode_output(Py_UCS4 c, PyObject *mapping,
8713
                     PyBytesWriter *writer, Py_ssize_t *outpos)
8714
0
{
8715
0
    PyObject *rep;
8716
0
    unsigned char replace;
8717
0
    char *outstart;
8718
0
    Py_ssize_t outsize = _PyBytesWriter_GetSize(writer);
8719
8720
0
    if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8721
0
        int res = encoding_map_lookup(c, mapping);
8722
0
        Py_ssize_t requiredsize = *outpos+1;
8723
0
        if (res == -1) {
8724
0
            return enc_FAILED;
8725
0
        }
8726
8727
0
        if (outsize<requiredsize) {
8728
0
            if (charmapencode_resize(writer, outpos, requiredsize)) {
8729
0
                return enc_EXCEPTION;
8730
0
            }
8731
0
        }
8732
0
        outstart = _PyBytesWriter_GetData(writer);
8733
0
        outstart[(*outpos)++] = (char)res;
8734
0
        return enc_SUCCESS;
8735
0
    }
8736
8737
0
    rep = charmapencode_lookup(c, mapping, &replace);
8738
0
    if (rep==NULL)
8739
0
        return enc_EXCEPTION;
8740
0
    else if (rep==Py_None) {
8741
0
        Py_DECREF(rep);
8742
0
        return enc_FAILED;
8743
0
    } else {
8744
0
        if (PyLong_Check(rep)) {
8745
0
            Py_ssize_t requiredsize = *outpos+1;
8746
0
            if (outsize<requiredsize)
8747
0
                if (charmapencode_resize(writer, outpos, requiredsize)) {
8748
0
                    Py_DECREF(rep);
8749
0
                    return enc_EXCEPTION;
8750
0
                }
8751
0
            outstart = _PyBytesWriter_GetData(writer);
8752
0
            outstart[(*outpos)++] = (char)replace;
8753
0
        }
8754
0
        else {
8755
0
            const char *repchars = PyBytes_AS_STRING(rep);
8756
0
            Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8757
0
            Py_ssize_t requiredsize = *outpos+repsize;
8758
0
            if (outsize<requiredsize)
8759
0
                if (charmapencode_resize(writer, outpos, requiredsize)) {
8760
0
                    Py_DECREF(rep);
8761
0
                    return enc_EXCEPTION;
8762
0
                }
8763
0
            outstart = _PyBytesWriter_GetData(writer);
8764
0
            memcpy(outstart + *outpos, repchars, repsize);
8765
0
            *outpos += repsize;
8766
0
        }
8767
0
    }
8768
0
    Py_DECREF(rep);
8769
0
    return enc_SUCCESS;
8770
0
}
8771
8772
/* handle an error in _PyUnicode_EncodeCharmap()
8773
   Return 0 on success, -1 on error */
8774
static int
8775
charmap_encoding_error(
8776
    PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
8777
    PyObject **exceptionObject,
8778
    _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
8779
    PyBytesWriter *writer, Py_ssize_t *respos)
8780
0
{
8781
0
    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8782
0
    Py_ssize_t size, repsize;
8783
0
    Py_ssize_t newpos;
8784
0
    int kind;
8785
0
    const void *data;
8786
0
    Py_ssize_t index;
8787
    /* startpos for collecting unencodable chars */
8788
0
    Py_ssize_t collstartpos = *inpos;
8789
0
    Py_ssize_t collendpos = *inpos+1;
8790
0
    Py_ssize_t collpos;
8791
0
    const char *encoding = "charmap";
8792
0
    const char *reason = "character maps to <undefined>";
8793
0
    charmapencode_result x;
8794
0
    Py_UCS4 ch;
8795
0
    int val;
8796
8797
0
    size = PyUnicode_GET_LENGTH(unicode);
8798
    /* find all unencodable characters */
8799
0
    while (collendpos < size) {
8800
0
        PyObject *rep;
8801
0
        unsigned char replace;
8802
0
        if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8803
0
            ch = PyUnicode_READ_CHAR(unicode, collendpos);
8804
0
            val = encoding_map_lookup(ch, mapping);
8805
0
            if (val != -1)
8806
0
                break;
8807
0
            ++collendpos;
8808
0
            continue;
8809
0
        }
8810
8811
0
        ch = PyUnicode_READ_CHAR(unicode, collendpos);
8812
0
        rep = charmapencode_lookup(ch, mapping, &replace);
8813
0
        if (rep==NULL)
8814
0
            return -1;
8815
0
        else if (rep!=Py_None) {
8816
0
            Py_DECREF(rep);
8817
0
            break;
8818
0
        }
8819
0
        Py_DECREF(rep);
8820
0
        ++collendpos;
8821
0
    }
8822
    /* cache callback name lookup
8823
     * (if not done yet, i.e. it's the first error) */
8824
0
    if (*error_handler == _Py_ERROR_UNKNOWN)
8825
0
        *error_handler = _Py_GetErrorHandler(errors);
8826
8827
0
    switch (*error_handler) {
8828
0
    case _Py_ERROR_STRICT:
8829
0
        raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8830
0
        return -1;
8831
8832
0
    case _Py_ERROR_REPLACE:
8833
0
        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
8834
0
            x = charmapencode_output('?', mapping, writer, respos);
8835
0
            if (x==enc_EXCEPTION) {
8836
0
                return -1;
8837
0
            }
8838
0
            else if (x==enc_FAILED) {
8839
0
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8840
0
                return -1;
8841
0
            }
8842
0
        }
8843
0
        _Py_FALLTHROUGH;
8844
0
    case _Py_ERROR_IGNORE:
8845
0
        *inpos = collendpos;
8846
0
        break;
8847
8848
0
    case _Py_ERROR_XMLCHARREFREPLACE:
8849
        /* generate replacement (temporarily (mis)uses p) */
8850
0
        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
8851
0
            char buffer[2+29+1+1];
8852
0
            char *cp;
8853
0
            sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
8854
0
            for (cp = buffer; *cp; ++cp) {
8855
0
                x = charmapencode_output(*cp, mapping, writer, respos);
8856
0
                if (x==enc_EXCEPTION)
8857
0
                    return -1;
8858
0
                else if (x==enc_FAILED) {
8859
0
                    raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8860
0
                    return -1;
8861
0
                }
8862
0
            }
8863
0
        }
8864
0
        *inpos = collendpos;
8865
0
        break;
8866
8867
0
    default:
8868
0
        repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
8869
0
                                                      encoding, reason, unicode, exceptionObject,
8870
0
                                                      collstartpos, collendpos, &newpos);
8871
0
        if (repunicode == NULL)
8872
0
            return -1;
8873
0
        if (PyBytes_Check(repunicode)) {
8874
            /* Directly copy bytes result to output. */
8875
0
            Py_ssize_t outsize = PyBytesWriter_GetSize(writer);
8876
0
            Py_ssize_t requiredsize;
8877
0
            repsize = PyBytes_Size(repunicode);
8878
0
            requiredsize = *respos + repsize;
8879
0
            if (requiredsize > outsize)
8880
                /* Make room for all additional bytes. */
8881
0
                if (charmapencode_resize(writer, respos, requiredsize)) {
8882
0
                    Py_DECREF(repunicode);
8883
0
                    return -1;
8884
0
                }
8885
0
            memcpy((char*)PyBytesWriter_GetData(writer) + *respos,
8886
0
                   PyBytes_AsString(repunicode),  repsize);
8887
0
            *respos += repsize;
8888
0
            *inpos = newpos;
8889
0
            Py_DECREF(repunicode);
8890
0
            break;
8891
0
        }
8892
        /* generate replacement  */
8893
0
        repsize = PyUnicode_GET_LENGTH(repunicode);
8894
0
        data = PyUnicode_DATA(repunicode);
8895
0
        kind = PyUnicode_KIND(repunicode);
8896
0
        for (index = 0; index < repsize; index++) {
8897
0
            Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8898
0
            x = charmapencode_output(repch, mapping, writer, respos);
8899
0
            if (x==enc_EXCEPTION) {
8900
0
                Py_DECREF(repunicode);
8901
0
                return -1;
8902
0
            }
8903
0
            else if (x==enc_FAILED) {
8904
0
                Py_DECREF(repunicode);
8905
0
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8906
0
                return -1;
8907
0
            }
8908
0
        }
8909
0
        *inpos = newpos;
8910
0
        Py_DECREF(repunicode);
8911
0
    }
8912
0
    return 0;
8913
0
}
8914
8915
/* Encode `unicode` to bytes through the character `mapping`.

   A NULL `mapping` selects Latin-1 and is delegated to
   unicode_encode_ucs1().  An empty input yields the empty bytes constant.
   `errors` names the error handler applied to unencodable characters.

   Returns the result of PyBytesWriter_FinishWithSize() on success and NULL
   on error. */
PyObject *
_PyUnicode_EncodeCharmap(PyObject *unicode,
                         PyObject *mapping,
                         const char *errors)
{
    /* Default to Latin-1 */
    if (mapping == NULL) {
        return unicode_encode_ucs1(unicode, errors, 256);
    }

    Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
    if (size == 0) {
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
    }
    const void *data = PyUnicode_DATA(unicode);
    int kind = PyUnicode_KIND(unicode);

    /* state shared with charmap_encoding_error() across errors */
    PyObject *error_handler_obj = NULL;
    PyObject *exc = NULL;
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
    /* current input position */
    Py_ssize_t inpos = 0;
    /* current output position */
    Py_ssize_t respos = 0;

    /* output object: allocate enough for a simple encoding without
       replacements, if we need more, we'll resize */
    PyBytesWriter *writer = PyBytesWriter_Create(size);
    if (writer == NULL) {
        goto onError;
    }

    if (Py_IS_TYPE(mapping, &EncodingMapType)) {
        /* The data pointer and size are cached locally; they must be
           refreshed after anything that may resize the writer. */
        char *outstart = _PyBytesWriter_GetData(writer);
        Py_ssize_t outsize = _PyBytesWriter_GetSize(writer);

        while (inpos < size) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);

            /* try to encode it */
            int res = encoding_map_lookup(ch, mapping);
            if (res != -1) {
                Py_ssize_t requiredsize = respos + 1;
                if (outsize < requiredsize) {
                    if (charmapencode_resize(writer, &respos, requiredsize)) {
                        goto onError;
                    }
                    outstart = _PyBytesWriter_GetData(writer);
                    outsize = _PyBytesWriter_GetSize(writer);
                }
                outstart[respos++] = (char)res;

                /* done with this character => adjust input position */
                ++inpos;
                continue;
            }

            /* unencodable character */
            if (charmap_encoding_error(unicode, &inpos, mapping,
                                       &exc,
                                       &error_handler, &error_handler_obj, errors,
                                       writer, &respos)) {
                goto onError;
            }
            outstart = _PyBytesWriter_GetData(writer);
            outsize = _PyBytesWriter_GetSize(writer);
        }
    }
    else {
        while (inpos < size) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);

            /* try to encode it */
            switch (charmapencode_output(ch, mapping, writer, &respos)) {
            case enc_EXCEPTION: /* error */
                goto onError;

            case enc_FAILED: /* unencodable character */
                if (charmap_encoding_error(unicode, &inpos, mapping,
                                           &exc,
                                           &error_handler, &error_handler_obj, errors,
                                           writer, &respos)) {
                    goto onError;
                }
                break;

            default:
                /* done with this character => adjust input position */
                ++inpos;
                break;
            }
        }
    }

    Py_XDECREF(exc);
    Py_XDECREF(error_handler_obj);

    /* Resize if we allocated too much */
    return PyBytesWriter_FinishWithSize(writer, respos);

  onError:
    PyBytesWriter_Discard(writer);
    Py_XDECREF(exc);
    Py_XDECREF(error_handler_obj);
    return NULL;
}
9023
9024
/* Encode `unicode` through `mapping`, passing NULL as the error handler name
   to _PyUnicode_EncodeCharmap().

   Calls PyErr_BadArgument() and returns NULL when `unicode` is not a str
   object or `mapping` is NULL. */
PyObject *
PyUnicode_AsCharmapString(PyObject *unicode,
                          PyObject *mapping)
{
    if (PyUnicode_Check(unicode) && mapping != NULL) {
        return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
    }

    PyErr_BadArgument();
    return NULL;
}
9034
9035
/* create or adjust a UnicodeTranslateError */
9036
static void
9037
make_translate_exception(PyObject **exceptionObject,
9038
                         PyObject *unicode,
9039
                         Py_ssize_t startpos, Py_ssize_t endpos,
9040
                         const char *reason)
9041
0
{
9042
0
    if (*exceptionObject == NULL) {
9043
0
        *exceptionObject = _PyUnicodeTranslateError_Create(
9044
0
            unicode, startpos, endpos, reason);
9045
0
    }
9046
0
    else {
9047
0
        if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
9048
0
            goto onError;
9049
0
        if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
9050
0
            goto onError;
9051
0
        if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
9052
0
            goto onError;
9053
0
        return;
9054
0
      onError:
9055
0
        Py_CLEAR(*exceptionObject);
9056
0
    }
9057
0
}
9058
9059
/* error handling callback helper:
9060
   build arguments, call the callback and check the arguments,
9061
   put the result into newpos and return the replacement string, which
9062
   has to be freed by the caller */
9063
static PyObject *
9064
unicode_translate_call_errorhandler(const char *errors,
9065
                                    PyObject **errorHandler,
9066
                                    const char *reason,
9067
                                    PyObject *unicode, PyObject **exceptionObject,
9068
                                    Py_ssize_t startpos, Py_ssize_t endpos,
9069
                                    Py_ssize_t *newpos)
9070
0
{
9071
0
    static const char *argparse = "Un;translating error handler must return (str, int) tuple";
9072
9073
0
    Py_ssize_t i_newpos;
9074
0
    PyObject *restuple;
9075
0
    PyObject *resunicode;
9076
9077
0
    if (*errorHandler == NULL) {
9078
0
        *errorHandler = PyCodec_LookupError(errors);
9079
0
        if (*errorHandler == NULL)
9080
0
            return NULL;
9081
0
    }
9082
9083
0
    make_translate_exception(exceptionObject,
9084
0
                             unicode, startpos, endpos, reason);
9085
0
    if (*exceptionObject == NULL)
9086
0
        return NULL;
9087
9088
0
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
9089
0
    if (restuple == NULL)
9090
0
        return NULL;
9091
0
    if (!PyTuple_Check(restuple)) {
9092
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
9093
0
        Py_DECREF(restuple);
9094
0
        return NULL;
9095
0
    }
9096
0
    if (!PyArg_ParseTuple(restuple, argparse,
9097
0
                          &resunicode, &i_newpos)) {
9098
0
        Py_DECREF(restuple);
9099
0
        return NULL;
9100
0
    }
9101
0
    if (i_newpos<0)
9102
0
        *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
9103
0
    else
9104
0
        *newpos = i_newpos;
9105
0
    if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
9106
0
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
9107
0
        Py_DECREF(restuple);
9108
0
        return NULL;
9109
0
    }
9110
0
    Py_INCREF(resunicode);
9111
0
    Py_DECREF(restuple);
9112
0
    return resunicode;
9113
0
}
9114
9115
/* Lookup the character ch in the mapping and put the result in result,
9116
   which must be decrefed by the caller.
9117
   The result can be PyLong, PyUnicode, None or NULL.
9118
   If the result is PyLong, put its value in replace.
9119
   Return 0 on success, -1 on error */
9120
static int
9121
charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result, Py_UCS4 *replace)
9122
188
{
9123
188
    PyObject *w = PyLong_FromLong((long)c);
9124
188
    PyObject *x;
9125
9126
188
    if (w == NULL)
9127
0
        return -1;
9128
188
    int rc = PyMapping_GetOptionalItem(mapping, w, &x);
9129
188
    Py_DECREF(w);
9130
188
    if (rc == 0) {
9131
        /* No mapping found means: use 1:1 mapping. */
9132
84
        *result = NULL;
9133
84
        return 0;
9134
84
    }
9135
104
    if (x == NULL) {
9136
0
        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
9137
            /* No mapping found means: use 1:1 mapping. */
9138
0
            PyErr_Clear();
9139
0
            *result = NULL;
9140
0
            return 0;
9141
0
        } else
9142
0
            return -1;
9143
0
    }
9144
104
    else if (x == Py_None) {
9145
0
        *result = x;
9146
0
        return 0;
9147
0
    }
9148
104
    else if (PyLong_Check(x)) {
9149
0
        long value = PyLong_AsLong(x);
9150
0
        if (value < 0 || value > MAX_UNICODE) {
9151
0
            PyErr_Format(PyExc_ValueError,
9152
0
                         "character mapping must be in range(0x%lx)",
9153
0
                         (unsigned long)MAX_UNICODE + 1);
9154
0
            Py_DECREF(x);
9155
0
            return -1;
9156
0
        }
9157
0
        *result = x;
9158
0
        *replace = (Py_UCS4)value;
9159
0
        return 0;
9160
0
    }
9161
104
    else if (PyUnicode_Check(x)) {
9162
104
        *result = x;
9163
104
        return 0;
9164
104
    }
9165
0
    else {
9166
        /* wrong return value */
9167
0
        PyErr_SetString(PyExc_TypeError,
9168
0
                        "character mapping must return integer, None or str");
9169
0
        Py_DECREF(x);
9170
0
        return -1;
9171
0
    }
9172
104
}
9173
9174
/* lookup the character, write the result into the writer.
9175
   Return 1 if the result was written into the writer, return 0 if the mapping
9176
   was undefined, raise an exception return -1 on error. */
9177
static int
9178
charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
9179
                        _PyUnicodeWriter *writer)
9180
72
{
9181
72
    PyObject *item;
9182
72
    Py_UCS4 replace;
9183
9184
72
    if (charmaptranslate_lookup(ch, mapping, &item, &replace))
9185
0
        return -1;
9186
9187
72
    if (item == NULL) {
9188
        /* not found => default to 1:1 mapping */
9189
16
        if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
9190
0
            return -1;
9191
0
        }
9192
16
        return 1;
9193
16
    }
9194
9195
56
    if (item == Py_None) {
9196
0
        Py_DECREF(item);
9197
0
        return 0;
9198
0
    }
9199
9200
56
    if (PyLong_Check(item)) {
9201
0
        if (_PyUnicodeWriter_WriteCharInline(writer, replace) < 0) {
9202
0
            Py_DECREF(item);
9203
0
            return -1;
9204
0
        }
9205
0
        Py_DECREF(item);
9206
0
        return 1;
9207
0
    }
9208
9209
56
    if (!PyUnicode_Check(item)) {
9210
0
        Py_DECREF(item);
9211
0
        return -1;
9212
0
    }
9213
9214
56
    if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
9215
0
        Py_DECREF(item);
9216
0
        return -1;
9217
0
    }
9218
9219
56
    Py_DECREF(item);
9220
56
    return 1;
9221
56
}
9222
9223
static int
9224
unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
9225
                              Py_UCS1 *translate)
9226
116
{
9227
116
    PyObject *item = NULL;
9228
116
    Py_UCS4 replace;
9229
116
    int ret = 0;
9230
9231
116
    if (charmaptranslate_lookup(ch, mapping, &item, &replace)) {
9232
0
        return -1;
9233
0
    }
9234
9235
116
    if (item == Py_None) {
9236
        /* deletion */
9237
0
        translate[ch] = 0xfe;
9238
0
    }
9239
116
    else if (item == NULL) {
9240
        /* not found => default to 1:1 mapping */
9241
68
        translate[ch] = ch;
9242
68
        return 1;
9243
68
    }
9244
48
    else if (PyLong_Check(item)) {
9245
0
        if (replace > 127) {
9246
            /* invalid character or character outside ASCII:
9247
               skip the fast translate */
9248
0
            goto exit;
9249
0
        }
9250
0
        translate[ch] = (Py_UCS1)replace;
9251
0
    }
9252
48
    else if (PyUnicode_Check(item)) {
9253
48
        if (PyUnicode_GET_LENGTH(item) != 1)
9254
48
            goto exit;
9255
9256
0
        replace = PyUnicode_READ_CHAR(item, 0);
9257
0
        if (replace > 127)
9258
0
            goto exit;
9259
0
        translate[ch] = (Py_UCS1)replace;
9260
0
    }
9261
0
    else {
9262
        /* not None, NULL, long or unicode */
9263
0
        goto exit;
9264
0
    }
9265
0
    ret = 1;
9266
9267
48
  exit:
9268
48
    Py_DECREF(item);
9269
48
    return ret;
9270
0
}
9271
9272
/* Fast path for ascii => ascii translation. Return 1 if the whole string
9273
   was translated into writer, return 0 if the input string was partially
9274
   translated into writer, raise an exception and return -1 on error. */
9275
static int
9276
unicode_fast_translate(PyObject *input, PyObject *mapping,
9277
                       _PyUnicodeWriter *writer, int ignore,
9278
                       Py_ssize_t *input_pos)
9279
96
{
9280
96
    Py_UCS1 ascii_table[128], ch, ch2;
9281
96
    Py_ssize_t len;
9282
96
    const Py_UCS1 *in, *end;
9283
96
    Py_UCS1 *out;
9284
96
    int res = 0;
9285
9286
96
    len = PyUnicode_GET_LENGTH(input);
9287
9288
96
    memset(ascii_table, 0xff, 128);
9289
9290
96
    in = PyUnicode_1BYTE_DATA(input);
9291
96
    end = in + len;
9292
9293
96
    assert(PyUnicode_IS_ASCII(writer->buffer));
9294
96
    assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9295
96
    out = PyUnicode_1BYTE_DATA(writer->buffer);
9296
9297
178
    for (; in < end; in++) {
9298
130
        ch = *in;
9299
130
        ch2 = ascii_table[ch];
9300
130
        if (ch2 == 0xff) {
9301
116
            int translate = unicode_fast_translate_lookup(mapping, ch,
9302
116
                                                          ascii_table);
9303
116
            if (translate < 0)
9304
0
                return -1;
9305
116
            if (translate == 0)
9306
48
                goto exit;
9307
68
            ch2 = ascii_table[ch];
9308
68
        }
9309
82
        if (ch2 == 0xfe) {
9310
0
            if (ignore)
9311
0
                continue;
9312
0
            goto exit;
9313
0
        }
9314
82
        assert(ch2 < 128);
9315
82
        *out = ch2;
9316
82
        out++;
9317
82
    }
9318
48
    res = 1;
9319
9320
96
exit:
9321
96
    writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
9322
96
    *input_pos = in - PyUnicode_1BYTE_DATA(input);
9323
96
    return res;
9324
48
}
9325
9326
/* Translate `input` through `mapping` and return the resulting string.

   ASCII input is first run through unicode_fast_translate(); whatever that
   leaves untranslated is handled one character at a time.  Runs of
   characters that map to None are skipped when `errors` is "ignore" and are
   otherwise handed to the error handler named by `errors`.

   Returns NULL on error; a NULL `mapping` is reported through
   PyErr_BadArgument(). */
static PyObject *
_PyUnicode_TranslateCharmap(PyObject *input,
                            PyObject *mapping,
                            const char *errors)
{
    /* error handler state */
    const char *reason = "character maps to <undefined>";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    /* output buffer */
    _PyUnicodeWriter writer;
    /* current input position */
    Py_ssize_t pos = 0;
    int ignore;

    if (mapping == NULL) {
        PyErr_BadArgument();
        return NULL;
    }

    /* input object */
    const void *data = PyUnicode_DATA(input);
    const int kind = PyUnicode_KIND(input);
    const Py_ssize_t size = PyUnicode_GET_LENGTH(input);

    if (size == 0) {
        return PyUnicode_FromObject(input);
    }

    /* allocate enough for a simple 1:1 translation without
       replacements, if we need more, we'll resize */
    _PyUnicodeWriter_Init(&writer);
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1) {
        goto onError;
    }

    ignore = (errors != NULL && strcmp(errors, "ignore") == 0);

    if (PyUnicode_IS_ASCII(input)) {
        int res = unicode_fast_translate(input, mapping, &writer, ignore, &pos);
        if (res < 0) {
            _PyUnicodeWriter_Dealloc(&writer);
            return NULL;
        }
        if (res == 1) {
            return _PyUnicodeWriter_Finish(&writer);
        }
        /* res == 0: continue below from where the fast path stopped */
    }

    while (pos < size) {
        /* try to translate it */
        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        int translate = charmaptranslate_output(ch, mapping, &writer);
        if (translate < 0) {
            goto onError;
        }
        if (translate != 0) {
            /* it worked => adjust input pointer */
            ++pos;
            continue;
        }

        /* untranslatable character: collect the whole run
           [collstart, collend) of characters that map to None */
        Py_ssize_t collstart = pos;
        Py_ssize_t collend = pos + 1;
        while (collend < size) {
            PyObject *item;
            Py_UCS4 replace;
            ch = PyUnicode_READ(kind, data, collend);
            if (charmaptranslate_lookup(ch, mapping, &item, &replace)) {
                goto onError;
            }
            int undefined = (item == Py_None);
            Py_XDECREF(item);
            if (!undefined) {
                break;
            }
            ++collend;
        }

        if (ignore) {
            pos = collend;
            continue;
        }

        Py_ssize_t newpos;
        PyObject *repunicode = unicode_translate_call_errorhandler(
            errors, &errorHandler,
            reason, input, &exc,
            collstart, collend, &newpos);
        if (repunicode == NULL) {
            goto onError;
        }
        int rc = _PyUnicodeWriter_WriteStr(&writer, repunicode);
        Py_DECREF(repunicode);
        if (rc < 0) {
            goto onError;
        }
        pos = newpos;
    }

    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return NULL;
}
9442
9443
/* Translate `str` through `mapping` using the error handler named `errors`.

   Returns NULL when ensure_unicode() rejects `str`; otherwise returns the
   result of _PyUnicode_TranslateCharmap(). */
PyObject *
PyUnicode_Translate(PyObject *str,
                    PyObject *mapping,
                    const char *errors)
{
    return (ensure_unicode(str) < 0)
        ? NULL
        : _PyUnicode_TranslateCharmap(str, mapping, errors);
}
9452
9453
/* Return a string in which decimal digits and whitespace of `unicode` are
   replaced by their ASCII equivalents.

   An ASCII string is returned as is (new reference).  Otherwise a new ASCII
   string is built: code points below 127 are copied, other whitespace
   becomes ' ' and other decimal digits become '0'..'9'.  The first code
   point that fits none of these is replaced by '?' and the result is
   truncated right after it.

   Returns NULL on error; a non-str argument is reported through
   PyErr_BadInternalCall(). */
PyObject *
_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadInternalCall();
        return NULL;
    }
    if (PyUnicode_IS_ASCII(unicode)) {
        /* If the string is already ASCII, just return the same string */
        return Py_NewRef(unicode);
    }

    const Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
    PyObject *result = PyUnicode_New(len, 127);
    if (result == NULL) {
        return NULL;
    }

    Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
    const int kind = PyUnicode_KIND(unicode);
    const void *data = PyUnicode_DATA(unicode);

    for (Py_ssize_t i = 0; i < len; ++i) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);

        if (ch < 127) {
            out[i] = ch;
            continue;
        }
        if (Py_UNICODE_ISSPACE(ch)) {
            out[i] = ' ';
            continue;
        }

        const int decimal = Py_UNICODE_TODECIMAL(ch);
        if (decimal >= 0) {
            out[i] = '0' + decimal;
            continue;
        }

        /* neither space nor digit: mark it and cut the result short */
        out[i] = '?';
        out[i+1] = '\0';
        _PyUnicode_LENGTH(result) = i + 1;
        break;
    }

    assert(_PyUnicode_CheckConsistency(result, 1));
    return result;
}
9498
9499
/* --- Helpers ------------------------------------------------------------ */
9500
9501
/* Helper macro to fix up start/end slice values in place.

   `end` is clipped to `len`; a negative `end` or `start` is taken relative
   to `len` and clamped at 0 if it is still negative afterwards.

   `start` and `end` must be modifiable lvalues.  Every argument is
   parenthesized in the expansion so that an expression argument (for
   example `cond ? a : b` passed as `len`) keeps its meaning.  Arguments are
   still evaluated more than once, so they must not have side effects. */
#define ADJUST_INDICES(start, end, len) \
    do {                                \
        if ((end) > (len)) {            \
            (end) = (len);              \
        }                               \
        else if ((end) < 0) {           \
            (end) += (len);             \
            if ((end) < 0) {            \
                (end) = 0;              \
            }                           \
        }                               \
        if ((start) < 0) {              \
            (start) += (len);           \
            if ((start) < 0) {          \
                (start) = 0;            \
            }                           \
        }                               \
    } while (0)
9520
9521
static Py_ssize_t
9522
any_find_slice(PyObject* s1, PyObject* s2,
9523
               Py_ssize_t start,
9524
               Py_ssize_t end,
9525
               int direction)
9526
7.04k
{
9527
7.04k
    int kind1, kind2;
9528
7.04k
    const void *buf1, *buf2;
9529
7.04k
    Py_ssize_t len1, len2, result;
9530
9531
7.04k
    kind1 = PyUnicode_KIND(s1);
9532
7.04k
    kind2 = PyUnicode_KIND(s2);
9533
7.04k
    if (kind1 < kind2)
9534
0
        return -1;
9535
9536
7.04k
    len1 = PyUnicode_GET_LENGTH(s1);
9537
7.04k
    len2 = PyUnicode_GET_LENGTH(s2);
9538
7.04k
    ADJUST_INDICES(start, end, len1);
9539
7.04k
    if (end - start < len2)
9540
1.42k
        return -1;
9541
9542
5.62k
    buf1 = PyUnicode_DATA(s1);
9543
5.62k
    buf2 = PyUnicode_DATA(s2);
9544
5.62k
    if (len2 == 1) {
9545
5.62k
        Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9546
5.62k
        result = findchar((const char *)buf1 + kind1*start,
9547
5.62k
                          kind1, end - start, ch, direction);
9548
5.62k
        if (result == -1)
9549
4.58k
            return -1;
9550
1.04k
        else
9551
1.04k
            return start + result;
9552
5.62k
    }
9553
9554
0
    if (kind2 != kind1) {
9555
0
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
9556
0
        if (!buf2)
9557
0
            return -2;
9558
0
    }
9559
9560
0
    if (direction > 0) {
9561
0
        switch (kind1) {
9562
0
        case PyUnicode_1BYTE_KIND:
9563
0
            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9564
0
                result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9565
0
            else
9566
0
                result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9567
0
            break;
9568
0
        case PyUnicode_2BYTE_KIND:
9569
0
            result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9570
0
            break;
9571
0
        case PyUnicode_4BYTE_KIND:
9572
0
            result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9573
0
            break;
9574
0
        default:
9575
0
            Py_UNREACHABLE();
9576
0
        }
9577
0
    }
9578
0
    else {
9579
0
        switch (kind1) {
9580
0
        case PyUnicode_1BYTE_KIND:
9581
0
            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9582
0
                result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9583
0
            else
9584
0
                result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9585
0
            break;
9586
0
        case PyUnicode_2BYTE_KIND:
9587
0
            result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9588
0
            break;
9589
0
        case PyUnicode_4BYTE_KIND:
9590
0
            result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9591
0
            break;
9592
0
        default:
9593
0
            Py_UNREACHABLE();
9594
0
        }
9595
0
    }
9596
9597
0
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
9598
0
    if (kind2 != kind1)
9599
0
        PyMem_Free((void *)buf2);
9600
9601
0
    return result;
9602
0
}
9603
9604
9605
Py_ssize_t
9606
PyUnicode_Count(PyObject *str,
9607
                PyObject *substr,
9608
                Py_ssize_t start,
9609
                Py_ssize_t end)
9610
0
{
9611
0
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9612
0
        return -1;
9613
9614
0
    return unicode_count_impl(str, substr, start, end);
9615
0
}
9616
9617
Py_ssize_t
9618
PyUnicode_Find(PyObject *str,
9619
               PyObject *substr,
9620
               Py_ssize_t start,
9621
               Py_ssize_t end,
9622
               int direction)
9623
139
{
9624
139
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9625
0
        return -2;
9626
9627
139
    return any_find_slice(str, substr, start, end, direction);
9628
139
}
9629
9630
Py_ssize_t
9631
PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9632
                   Py_ssize_t start, Py_ssize_t end,
9633
                   int direction)
9634
256k
{
9635
256k
    int kind;
9636
256k
    Py_ssize_t len, result;
9637
256k
    len = PyUnicode_GET_LENGTH(str);
9638
256k
    ADJUST_INDICES(start, end, len);
9639
256k
    if (end - start < 1)
9640
0
        return -1;
9641
256k
    kind = PyUnicode_KIND(str);
9642
256k
    result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9643
256k
                      kind, end-start, ch, direction);
9644
256k
    if (result == -1)
9645
223k
        return -1;
9646
33.2k
    else
9647
33.2k
        return start + result;
9648
256k
}
9649
9650
static int
9651
tailmatch(PyObject *self,
9652
          PyObject *substring,
9653
          Py_ssize_t start,
9654
          Py_ssize_t end,
9655
          int direction)
9656
81.6k
{
9657
81.6k
    int kind_self;
9658
81.6k
    int kind_sub;
9659
81.6k
    const void *data_self;
9660
81.6k
    const void *data_sub;
9661
81.6k
    Py_ssize_t offset;
9662
81.6k
    Py_ssize_t i;
9663
81.6k
    Py_ssize_t end_sub;
9664
9665
81.6k
    ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9666
81.6k
    end -= PyUnicode_GET_LENGTH(substring);
9667
81.6k
    if (end < start)
9668
571
        return 0;
9669
9670
81.0k
    if (PyUnicode_GET_LENGTH(substring) == 0)
9671
0
        return 1;
9672
9673
81.0k
    kind_self = PyUnicode_KIND(self);
9674
81.0k
    data_self = PyUnicode_DATA(self);
9675
81.0k
    kind_sub = PyUnicode_KIND(substring);
9676
81.0k
    data_sub = PyUnicode_DATA(substring);
9677
81.0k
    end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9678
9679
81.0k
    if (direction > 0)
9680
26.3k
        offset = end;
9681
54.6k
    else
9682
54.6k
        offset = start;
9683
9684
81.0k
    if (PyUnicode_READ(kind_self, data_self, offset) ==
9685
81.0k
        PyUnicode_READ(kind_sub, data_sub, 0) &&
9686
78.9k
        PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9687
78.9k
        PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9688
        /* If both are of the same kind, memcmp is sufficient */
9689
52.6k
        if (kind_self == kind_sub) {
9690
52.6k
            return ! memcmp((char *)data_self +
9691
52.6k
                                (offset * PyUnicode_KIND(substring)),
9692
0
                            data_sub,
9693
52.6k
                            PyUnicode_GET_LENGTH(substring) *
9694
52.6k
                                PyUnicode_KIND(substring));
9695
52.6k
        }
9696
        /* otherwise we have to compare each character by first accessing it */
9697
1
        else {
9698
            /* We do not need to compare 0 and len(substring)-1 because
9699
               the if statement above ensured already that they are equal
9700
               when we end up here. */
9701
3
            for (i = 1; i < end_sub; ++i) {
9702
2
                if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9703
2
                    PyUnicode_READ(kind_sub, data_sub, i))
9704
0
                    return 0;
9705
2
            }
9706
1
            return 1;
9707
1
        }
9708
52.6k
    }
9709
9710
28.4k
    return 0;
9711
81.0k
}
9712
9713
Py_ssize_t
9714
PyUnicode_Tailmatch(PyObject *str,
9715
                    PyObject *substr,
9716
                    Py_ssize_t start,
9717
                    Py_ssize_t end,
9718
                    int direction)
9719
0
{
9720
0
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9721
0
        return -1;
9722
9723
0
    return tailmatch(str, substr, start, end, direction);
9724
0
}
9725
9726
/* Build a new string of the same length as `self` (created with a maximum
   character of 127) and fill it by passing the data of `self` through
   _Py_bytes_lower() when `lower` is non-zero, or _Py_bytes_upper()
   otherwise.  Returns NULL if the new string cannot be created. */
static PyObject *
ascii_upper_or_lower(PyObject *self, int lower)
{
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const char *src = PyUnicode_DATA(self);

    PyObject *converted = PyUnicode_New(nchars, 127);
    if (!converted) {
        return NULL;
    }

    char *dst = PyUnicode_DATA(converted);
    if (lower) {
        _Py_bytes_lower(dst, src, nchars);
    }
    else {
        _Py_bytes_upper(dst, src, nchars);
    }
    return converted;
}
9744
9745
static Py_UCS4
9746
handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
9747
27
{
9748
27
    Py_ssize_t j;
9749
27
    int final_sigma;
9750
27
    Py_UCS4 c = 0;   /* initialize to prevent gcc warning */
9751
    /* U+03A3 is in the Final_Sigma context when, it is found like this:
9752
9753
     \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9754
9755
    where ! is a negation and \p{xxx} is a character with property xxx.
9756
    */
9757
27
    for (j = i - 1; j >= 0; j--) {
9758
0
        c = PyUnicode_READ(kind, data, j);
9759
0
        if (!_PyUnicode_IsCaseIgnorable(c))
9760
0
            break;
9761
0
    }
9762
27
    final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9763
27
    if (final_sigma) {
9764
0
        for (j = i + 1; j < length; j++) {
9765
0
            c = PyUnicode_READ(kind, data, j);
9766
0
            if (!_PyUnicode_IsCaseIgnorable(c))
9767
0
                break;
9768
0
        }
9769
0
        final_sigma = j == length || !_PyUnicode_IsCased(c);
9770
0
    }
9771
27
    return (final_sigma) ? 0x3C2 : 0x3C3;
9772
27
}
9773
9774
static int
9775
lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
9776
           Py_UCS4 c, Py_UCS4 *mapped)
9777
185k
{
9778
    /* Obscure special case. */
9779
185k
    if (c == 0x3A3) {
9780
27
        mapped[0] = handle_capital_sigma(kind, data, length, i);
9781
27
        return 1;
9782
27
    }
9783
185k
    return _PyUnicode_ToLowerFull(c, mapped);
9784
185k
}
9785
9786
static Py_ssize_t
9787
do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9788
0
{
9789
0
    Py_ssize_t i, k = 0;
9790
0
    int n_res, j;
9791
0
    Py_UCS4 c, mapped[3];
9792
9793
0
    c = PyUnicode_READ(kind, data, 0);
9794
0
    n_res = _PyUnicode_ToTitleFull(c, mapped);
9795
0
    for (j = 0; j < n_res; j++) {
9796
0
        *maxchar = Py_MAX(*maxchar, mapped[j]);
9797
0
        res[k++] = mapped[j];
9798
0
    }
9799
0
    for (i = 1; i < length; i++) {
9800
0
        c = PyUnicode_READ(kind, data, i);
9801
0
        n_res = lower_ucs4(kind, data, length, i, c, mapped);
9802
0
        for (j = 0; j < n_res; j++) {
9803
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9804
0
            res[k++] = mapped[j];
9805
0
        }
9806
0
    }
9807
0
    return k;
9808
0
}
9809
9810
static Py_ssize_t
9811
0
do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9812
0
    Py_ssize_t i, k = 0;
9813
9814
0
    for (i = 0; i < length; i++) {
9815
0
        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9816
0
        int n_res, j;
9817
0
        if (Py_UNICODE_ISUPPER(c)) {
9818
0
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9819
0
        }
9820
0
        else if (Py_UNICODE_ISLOWER(c)) {
9821
0
            n_res = _PyUnicode_ToUpperFull(c, mapped);
9822
0
        }
9823
0
        else {
9824
0
            n_res = 1;
9825
0
            mapped[0] = c;
9826
0
        }
9827
0
        for (j = 0; j < n_res; j++) {
9828
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9829
0
            res[k++] = mapped[j];
9830
0
        }
9831
0
    }
9832
0
    return k;
9833
0
}
9834
9835
static Py_ssize_t
9836
do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
9837
                  Py_UCS4 *maxchar, int lower)
9838
99.4k
{
9839
99.4k
    Py_ssize_t i, k = 0;
9840
9841
284k
    for (i = 0; i < length; i++) {
9842
185k
        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9843
185k
        int n_res, j;
9844
185k
        if (lower)
9845
185k
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9846
0
        else
9847
0
            n_res = _PyUnicode_ToUpperFull(c, mapped);
9848
370k
        for (j = 0; j < n_res; j++) {
9849
185k
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9850
185k
            res[k++] = mapped[j];
9851
185k
        }
9852
185k
    }
9853
99.4k
    return k;
9854
99.4k
}
9855
9856
static Py_ssize_t
9857
do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9858
0
{
9859
0
    return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9860
0
}
9861
9862
static Py_ssize_t
9863
do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9864
99.4k
{
9865
99.4k
    return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9866
99.4k
}
9867
9868
static Py_ssize_t
9869
do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9870
0
{
9871
0
    Py_ssize_t i, k = 0;
9872
9873
0
    for (i = 0; i < length; i++) {
9874
0
        Py_UCS4 c = PyUnicode_READ(kind, data, i);
9875
0
        Py_UCS4 mapped[3];
9876
0
        int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9877
0
        for (j = 0; j < n_res; j++) {
9878
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9879
0
            res[k++] = mapped[j];
9880
0
        }
9881
0
    }
9882
0
    return k;
9883
0
}
9884
9885
static Py_ssize_t
9886
do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9887
0
{
9888
0
    Py_ssize_t i, k = 0;
9889
0
    int previous_is_cased;
9890
9891
0
    previous_is_cased = 0;
9892
0
    for (i = 0; i < length; i++) {
9893
0
        const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9894
0
        Py_UCS4 mapped[3];
9895
0
        int n_res, j;
9896
9897
0
        if (previous_is_cased)
9898
0
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9899
0
        else
9900
0
            n_res = _PyUnicode_ToTitleFull(c, mapped);
9901
9902
0
        for (j = 0; j < n_res; j++) {
9903
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9904
0
            res[k++] = mapped[j];
9905
0
        }
9906
9907
0
        previous_is_cased = _PyUnicode_IsCased(c);
9908
0
    }
9909
0
    return k;
9910
0
}
9911
9912
static PyObject *
9913
case_operation(PyObject *self,
9914
               Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9915
99.4k
{
9916
99.4k
    PyObject *res = NULL;
9917
99.4k
    Py_ssize_t length, newlength = 0;
9918
99.4k
    int kind, outkind;
9919
99.4k
    const void *data;
9920
99.4k
    void *outdata;
9921
99.4k
    Py_UCS4 maxchar = 0, *tmp, *tmpend;
9922
9923
99.4k
    kind = PyUnicode_KIND(self);
9924
99.4k
    data = PyUnicode_DATA(self);
9925
99.4k
    length = PyUnicode_GET_LENGTH(self);
9926
99.4k
    if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
9927
0
        PyErr_SetString(PyExc_OverflowError, "string is too long");
9928
0
        return NULL;
9929
0
    }
9930
99.4k
    tmp = PyMem_Malloc(sizeof(Py_UCS4) * 3 * length);
9931
99.4k
    if (tmp == NULL)
9932
0
        return PyErr_NoMemory();
9933
99.4k
    newlength = perform(kind, data, length, tmp, &maxchar);
9934
99.4k
    res = PyUnicode_New(newlength, maxchar);
9935
99.4k
    if (res == NULL)
9936
0
        goto leave;
9937
99.4k
    tmpend = tmp + newlength;
9938
99.4k
    outdata = PyUnicode_DATA(res);
9939
99.4k
    outkind = PyUnicode_KIND(res);
9940
0
    switch (outkind) {
9941
6.95k
    case PyUnicode_1BYTE_KIND:
9942
6.95k
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9943
6.95k
        break;
9944
82.9k
    case PyUnicode_2BYTE_KIND:
9945
82.9k
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9946
82.9k
        break;
9947
9.51k
    case PyUnicode_4BYTE_KIND:
9948
9.51k
        memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9949
9.51k
        break;
9950
0
    default:
9951
0
        Py_UNREACHABLE();
9952
99.4k
    }
9953
99.4k
  leave:
9954
99.4k
    PyMem_Free(tmp);
9955
99.4k
    return res;
9956
99.4k
}
9957
9958
/* Join the items of `seq` with `separator`.

   `seq` is first converted with PySequence_Fast(); its items and size are
   then passed to _PyUnicode_JoinArray() inside the
   Py_BEGIN/END_CRITICAL_SECTION_SEQUENCE_FAST pair.  Returns NULL if the
   conversion fails, otherwise whatever _PyUnicode_JoinArray() returns. */
PyObject *
PyUnicode_Join(PyObject *separator, PyObject *seq)
{
    PyObject *joined;
    PyObject *fast = PySequence_Fast(seq, "can only join an iterable");

    if (!fast) {
        return NULL;
    }

    Py_BEGIN_CRITICAL_SECTION_SEQUENCE_FAST(seq);

    joined = _PyUnicode_JoinArray(separator,
                                  PySequence_Fast_ITEMS(fast),
                                  PySequence_Fast_GET_SIZE(fast));

    Py_END_CRITICAL_SECTION_SEQUENCE_FAST();

    Py_DECREF(fast);
    return joined;
}
9982
9983
PyObject *
9984
_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
9985
100k
{
9986
100k
    PyObject *res = NULL; /* the result */
9987
100k
    PyObject *sep = NULL;
9988
100k
    Py_ssize_t seplen;
9989
100k
    PyObject *item;
9990
100k
    Py_ssize_t sz, i, res_offset;
9991
100k
    Py_UCS4 maxchar;
9992
100k
    Py_UCS4 item_maxchar;
9993
100k
    int use_memcpy;
9994
100k
    unsigned char *res_data = NULL, *sep_data = NULL;
9995
100k
    PyObject *last_obj;
9996
100k
    int kind = 0;
9997
9998
    /* If empty sequence, return u"". */
9999
100k
    if (seqlen == 0) {
10000
0
        _Py_RETURN_UNICODE_EMPTY();
10001
0
    }
10002
10003
    /* If singleton sequence with an exact Unicode, return that. */
10004
100k
    last_obj = NULL;
10005
100k
    if (seqlen == 1) {
10006
53.9k
        if (PyUnicode_CheckExact(items[0])) {
10007
53.9k
            res = items[0];
10008
53.9k
            return Py_NewRef(res);
10009
53.9k
        }
10010
0
        seplen = 0;
10011
0
        maxchar = 0;
10012
0
    }
10013
46.1k
    else {
10014
        /* Set up sep and seplen */
10015
46.1k
        if (separator == NULL) {
10016
            /* fall back to a blank space separator */
10017
0
            sep = PyUnicode_FromOrdinal(' ');
10018
0
            if (!sep)
10019
0
                goto onError;
10020
0
            seplen = 1;
10021
0
            maxchar = 32;
10022
0
        }
10023
46.1k
        else {
10024
46.1k
            if (!PyUnicode_Check(separator)) {
10025
0
                PyErr_Format(PyExc_TypeError,
10026
0
                             "separator: expected str instance,"
10027
0
                             " %.80s found",
10028
0
                             Py_TYPE(separator)->tp_name);
10029
0
                goto onError;
10030
0
            }
10031
46.1k
            sep = separator;
10032
46.1k
            seplen = PyUnicode_GET_LENGTH(separator);
10033
46.1k
            maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10034
            /* inc refcount to keep this code path symmetric with the
10035
               above case of a blank separator */
10036
46.1k
            Py_INCREF(sep);
10037
46.1k
        }
10038
46.1k
        last_obj = sep;
10039
46.1k
    }
10040
10041
    /* There are at least two things to join, or else we have a subclass
10042
     * of str in the sequence.
10043
     * Do a pre-pass to figure out the total amount of space we'll
10044
     * need (sz), and see whether all argument are strings.
10045
     */
10046
46.1k
    sz = 0;
10047
#ifdef Py_DEBUG
10048
    use_memcpy = 0;
10049
#else
10050
46.1k
    use_memcpy = 1;
10051
46.1k
#endif
10052
380k
    for (i = 0; i < seqlen; i++) {
10053
334k
        size_t add_sz;
10054
334k
        item = items[i];
10055
334k
        if (!PyUnicode_Check(item)) {
10056
0
            PyErr_Format(PyExc_TypeError,
10057
0
                         "sequence item %zd: expected str instance,"
10058
0
                         " %.80s found",
10059
0
                         i, Py_TYPE(item)->tp_name);
10060
0
            goto onError;
10061
0
        }
10062
334k
        add_sz = PyUnicode_GET_LENGTH(item);
10063
334k
        item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
10064
334k
        maxchar = Py_MAX(maxchar, item_maxchar);
10065
334k
        if (i != 0) {
10066
288k
            add_sz += seplen;
10067
288k
        }
10068
334k
        if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
10069
0
            PyErr_SetString(PyExc_OverflowError,
10070
0
                            "join() result is too long for a Python string");
10071
0
            goto onError;
10072
0
        }
10073
334k
        sz += add_sz;
10074
334k
        if (use_memcpy && last_obj != NULL) {
10075
522k
            if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10076
14.4k
                use_memcpy = 0;
10077
261k
        }
10078
0
        last_obj = item;
10079
334k
    }
10080
10081
46.1k
    res = PyUnicode_New(sz, maxchar);
10082
46.1k
    if (res == NULL)
10083
0
        goto onError;
10084
10085
    /* Catenate everything. */
10086
#ifdef Py_DEBUG
10087
    use_memcpy = 0;
10088
#else
10089
46.1k
    if (use_memcpy) {
10090
31.7k
        res_data = PyUnicode_1BYTE_DATA(res);
10091
31.7k
        kind = PyUnicode_KIND(res);
10092
31.7k
        if (seplen != 0)
10093
2.57k
            sep_data = PyUnicode_1BYTE_DATA(sep);
10094
31.7k
    }
10095
46.1k
#endif
10096
46.1k
    if (use_memcpy) {
10097
263k
        for (i = 0; i < seqlen; ++i) {
10098
232k
            Py_ssize_t itemlen;
10099
232k
            item = items[i];
10100
10101
            /* Copy item, and maybe the separator. */
10102
232k
            if (i && seplen != 0) {
10103
4.15k
                memcpy(res_data,
10104
4.15k
                          sep_data,
10105
4.15k
                          kind * seplen);
10106
4.15k
                res_data += kind * seplen;
10107
4.15k
            }
10108
10109
232k
            itemlen = PyUnicode_GET_LENGTH(item);
10110
232k
            if (itemlen != 0) {
10111
231k
                memcpy(res_data,
10112
231k
                          PyUnicode_DATA(item),
10113
231k
                          kind * itemlen);
10114
231k
                res_data += kind * itemlen;
10115
231k
            }
10116
232k
        }
10117
31.7k
        assert(res_data == PyUnicode_1BYTE_DATA(res)
10118
31.7k
                           + kind * PyUnicode_GET_LENGTH(res));
10119
31.7k
    }
10120
14.4k
    else {
10121
116k
        for (i = 0, res_offset = 0; i < seqlen; ++i) {
10122
102k
            Py_ssize_t itemlen;
10123
102k
            item = items[i];
10124
10125
            /* Copy item, and maybe the separator. */
10126
102k
            if (i && seplen != 0) {
10127
2.81k
                _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10128
2.81k
                res_offset += seplen;
10129
2.81k
            }
10130
10131
102k
            itemlen = PyUnicode_GET_LENGTH(item);
10132
102k
            if (itemlen != 0) {
10133
100k
                _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
10134
100k
                res_offset += itemlen;
10135
100k
            }
10136
102k
        }
10137
14.4k
        assert(res_offset == PyUnicode_GET_LENGTH(res));
10138
14.4k
    }
10139
10140
46.1k
    Py_XDECREF(sep);
10141
46.1k
    assert(_PyUnicode_CheckConsistency(res, 1));
10142
46.1k
    return res;
10143
10144
0
  onError:
10145
0
    Py_XDECREF(sep);
10146
0
    Py_XDECREF(res);
10147
0
    return NULL;
10148
46.1k
}
10149
10150
void
10151
_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10152
                    Py_UCS4 fill_char)
10153
585
{
10154
585
    const int kind = PyUnicode_KIND(unicode);
10155
585
    void *data = PyUnicode_DATA(unicode);
10156
585
    assert(_PyUnicode_IsModifiable(unicode));
10157
585
    assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10158
585
    assert(start >= 0);
10159
585
    assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10160
585
    _PyUnicode_Fill(kind, data, fill_char, start, length);
10161
585
}
10162
10163
Py_ssize_t
10164
PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10165
               Py_UCS4 fill_char)
10166
585
{
10167
585
    Py_ssize_t maxlen;
10168
10169
585
    if (!PyUnicode_Check(unicode)) {
10170
0
        PyErr_BadInternalCall();
10171
0
        return -1;
10172
0
    }
10173
585
    if (unicode_check_modifiable(unicode))
10174
0
        return -1;
10175
10176
585
    if (start < 0) {
10177
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
10178
0
        return -1;
10179
0
    }
10180
585
    if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10181
0
        PyErr_SetString(PyExc_ValueError,
10182
0
                         "fill character is bigger than "
10183
0
                         "the string maximum character");
10184
0
        return -1;
10185
0
    }
10186
10187
585
    maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10188
585
    length = Py_MIN(maxlen, length);
10189
585
    if (length <= 0)
10190
0
        return 0;
10191
10192
585
    _PyUnicode_FastFill(unicode, start, length, fill_char);
10193
585
    return length;
10194
585
}
10195
10196
static PyObject *
10197
pad(PyObject *self,
10198
    Py_ssize_t left,
10199
    Py_ssize_t right,
10200
    Py_UCS4 fill)
10201
0
{
10202
0
    PyObject *u;
10203
0
    Py_UCS4 maxchar;
10204
0
    int kind;
10205
0
    void *data;
10206
10207
0
    if (left < 0)
10208
0
        left = 0;
10209
0
    if (right < 0)
10210
0
        right = 0;
10211
10212
0
    if (left == 0 && right == 0)
10213
0
        return unicode_result_unchanged(self);
10214
10215
0
    if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10216
0
        right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
10217
0
        PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10218
0
        return NULL;
10219
0
    }
10220
0
    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10221
0
    maxchar = Py_MAX(maxchar, fill);
10222
0
    u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
10223
0
    if (!u)
10224
0
        return NULL;
10225
10226
0
    kind = PyUnicode_KIND(u);
10227
0
    data = PyUnicode_DATA(u);
10228
0
    if (left)
10229
0
        _PyUnicode_Fill(kind, data, fill, 0, left);
10230
0
    if (right)
10231
0
        _PyUnicode_Fill(kind, data, fill,
10232
0
                        left + _PyUnicode_LENGTH(self), right);
10233
0
    _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
10234
0
    assert(_PyUnicode_CheckConsistency(u, 1));
10235
0
    return u;
10236
0
}
10237
10238
/* Split `string` into lines by dispatching to the splitlines routine that
   matches its kind (the ASCII variant is used for ASCII-only 1-byte
   strings).  `keepends` is forwarded unchanged.  Returns NULL when
   ensure_unicode() rejects `string`; otherwise the value returned by the
   selected routine. */
PyObject *
PyUnicode_Splitlines(PyObject *string, int keepends)
{
    if (ensure_unicode(string) < 0) {
        return NULL;
    }

    PyObject *lines;
    const Py_ssize_t length = PyUnicode_GET_LENGTH(string);

    switch (PyUnicode_KIND(string)) {
    case PyUnicode_1BYTE_KIND:
        if (PyUnicode_IS_ASCII(string)) {
            lines = asciilib_splitlines(
                string, PyUnicode_1BYTE_DATA(string), length, keepends);
        }
        else {
            lines = ucs1lib_splitlines(
                string, PyUnicode_1BYTE_DATA(string), length, keepends);
        }
        break;
    case PyUnicode_2BYTE_KIND:
        lines = ucs2lib_splitlines(
            string, PyUnicode_2BYTE_DATA(string), length, keepends);
        break;
    case PyUnicode_4BYTE_KIND:
        lines = ucs4lib_splitlines(
            string, PyUnicode_4BYTE_DATA(string), length, keepends);
        break;
    default:
        Py_UNREACHABLE();
    }
    return lines;
}
10272
10273
static PyObject *
10274
split(PyObject *self,
10275
      PyObject *substring,
10276
      Py_ssize_t maxcount)
10277
1.97k
{
10278
1.97k
    int kind1, kind2;
10279
1.97k
    const void *buf1, *buf2;
10280
1.97k
    Py_ssize_t len1, len2;
10281
1.97k
    PyObject* out;
10282
1.97k
    len1 = PyUnicode_GET_LENGTH(self);
10283
1.97k
    kind1 = PyUnicode_KIND(self);
10284
10285
1.97k
    if (substring == NULL) {
10286
5
        if (maxcount < 0) {
10287
5
            maxcount = (len1 - 1) / 2 + 1;
10288
5
        }
10289
5
        switch (kind1) {
10290
5
        case PyUnicode_1BYTE_KIND:
10291
5
            if (PyUnicode_IS_ASCII(self))
10292
5
                return asciilib_split_whitespace(
10293
5
                    self,  PyUnicode_1BYTE_DATA(self),
10294
5
                    len1, maxcount
10295
5
                    );
10296
0
            else
10297
0
                return ucs1lib_split_whitespace(
10298
0
                    self,  PyUnicode_1BYTE_DATA(self),
10299
0
                    len1, maxcount
10300
0
                    );
10301
0
        case PyUnicode_2BYTE_KIND:
10302
0
            return ucs2lib_split_whitespace(
10303
0
                self,  PyUnicode_2BYTE_DATA(self),
10304
0
                len1, maxcount
10305
0
                );
10306
0
        case PyUnicode_4BYTE_KIND:
10307
0
            return ucs4lib_split_whitespace(
10308
0
                self,  PyUnicode_4BYTE_DATA(self),
10309
0
                len1, maxcount
10310
0
                );
10311
0
        default:
10312
0
            Py_UNREACHABLE();
10313
5
        }
10314
5
    }
10315
10316
1.96k
    kind2 = PyUnicode_KIND(substring);
10317
1.96k
    len2 = PyUnicode_GET_LENGTH(substring);
10318
1.96k
    if (maxcount < 0) {
10319
        // if len2 == 0, it will raise ValueError.
10320
1.96k
        maxcount = len2 == 0 ? 0 : (len1 / len2) + 1;
10321
        // handle expected overflow case: (Py_SSIZE_T_MAX / 1) + 1
10322
1.96k
        maxcount = maxcount < 0 ? len1 : maxcount;
10323
1.96k
    }
10324
1.96k
    if (kind1 < kind2 || len1 < len2) {
10325
2
        out = PyList_New(1);
10326
2
        if (out == NULL)
10327
0
            return NULL;
10328
2
        PyList_SET_ITEM(out, 0, Py_NewRef(self));
10329
2
        return out;
10330
2
    }
10331
1.96k
    buf1 = PyUnicode_DATA(self);
10332
1.96k
    buf2 = PyUnicode_DATA(substring);
10333
1.96k
    if (kind2 != kind1) {
10334
1.07k
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
10335
1.07k
        if (!buf2)
10336
0
            return NULL;
10337
1.07k
    }
10338
10339
1.96k
    switch (kind1) {
10340
888
    case PyUnicode_1BYTE_KIND:
10341
888
        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10342
613
            out = asciilib_split(
10343
613
                self,  buf1, len1, buf2, len2, maxcount);
10344
275
        else
10345
275
            out = ucs1lib_split(
10346
275
                self,  buf1, len1, buf2, len2, maxcount);
10347
888
        break;
10348
564
    case PyUnicode_2BYTE_KIND:
10349
564
        out = ucs2lib_split(
10350
564
            self,  buf1, len1, buf2, len2, maxcount);
10351
564
        break;
10352
514
    case PyUnicode_4BYTE_KIND:
10353
514
        out = ucs4lib_split(
10354
514
            self,  buf1, len1, buf2, len2, maxcount);
10355
514
        break;
10356
0
    default:
10357
0
        out = NULL;
10358
1.96k
    }
10359
1.96k
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10360
1.96k
    if (kind2 != kind1)
10361
1.07k
        PyMem_Free((void *)buf2);
10362
1.96k
    return out;
10363
1.96k
}
10364
10365
static PyObject *
10366
rsplit(PyObject *self,
10367
       PyObject *substring,
10368
       Py_ssize_t maxcount)
10369
0
{
10370
0
    int kind1, kind2;
10371
0
    const void *buf1, *buf2;
10372
0
    Py_ssize_t len1, len2;
10373
0
    PyObject* out;
10374
10375
0
    len1 = PyUnicode_GET_LENGTH(self);
10376
0
    kind1 = PyUnicode_KIND(self);
10377
10378
0
    if (substring == NULL) {
10379
0
        if (maxcount < 0) {
10380
0
            maxcount = (len1 - 1) / 2 + 1;
10381
0
        }
10382
0
        switch (kind1) {
10383
0
        case PyUnicode_1BYTE_KIND:
10384
0
            if (PyUnicode_IS_ASCII(self))
10385
0
                return asciilib_rsplit_whitespace(
10386
0
                    self,  PyUnicode_1BYTE_DATA(self),
10387
0
                    len1, maxcount
10388
0
                    );
10389
0
            else
10390
0
                return ucs1lib_rsplit_whitespace(
10391
0
                    self,  PyUnicode_1BYTE_DATA(self),
10392
0
                    len1, maxcount
10393
0
                    );
10394
0
        case PyUnicode_2BYTE_KIND:
10395
0
            return ucs2lib_rsplit_whitespace(
10396
0
                self,  PyUnicode_2BYTE_DATA(self),
10397
0
                len1, maxcount
10398
0
                );
10399
0
        case PyUnicode_4BYTE_KIND:
10400
0
            return ucs4lib_rsplit_whitespace(
10401
0
                self,  PyUnicode_4BYTE_DATA(self),
10402
0
                len1, maxcount
10403
0
                );
10404
0
        default:
10405
0
            Py_UNREACHABLE();
10406
0
        }
10407
0
    }
10408
0
    kind2 = PyUnicode_KIND(substring);
10409
0
    len2 = PyUnicode_GET_LENGTH(substring);
10410
0
    if (maxcount < 0) {
10411
        // if len2 == 0, it will raise ValueError.
10412
0
        maxcount = len2 == 0 ? 0 : (len1 / len2) + 1;
10413
        // handle expected overflow case: (Py_SSIZE_T_MAX / 1) + 1
10414
0
        maxcount = maxcount < 0 ? len1 : maxcount;
10415
0
    }
10416
0
    if (kind1 < kind2 || len1 < len2) {
10417
0
        out = PyList_New(1);
10418
0
        if (out == NULL)
10419
0
            return NULL;
10420
0
        PyList_SET_ITEM(out, 0, Py_NewRef(self));
10421
0
        return out;
10422
0
    }
10423
0
    buf1 = PyUnicode_DATA(self);
10424
0
    buf2 = PyUnicode_DATA(substring);
10425
0
    if (kind2 != kind1) {
10426
0
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
10427
0
        if (!buf2)
10428
0
            return NULL;
10429
0
    }
10430
10431
0
    switch (kind1) {
10432
0
    case PyUnicode_1BYTE_KIND:
10433
0
        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10434
0
            out = asciilib_rsplit(
10435
0
                self,  buf1, len1, buf2, len2, maxcount);
10436
0
        else
10437
0
            out = ucs1lib_rsplit(
10438
0
                self,  buf1, len1, buf2, len2, maxcount);
10439
0
        break;
10440
0
    case PyUnicode_2BYTE_KIND:
10441
0
        out = ucs2lib_rsplit(
10442
0
            self,  buf1, len1, buf2, len2, maxcount);
10443
0
        break;
10444
0
    case PyUnicode_4BYTE_KIND:
10445
0
        out = ucs4lib_rsplit(
10446
0
            self,  buf1, len1, buf2, len2, maxcount);
10447
0
        break;
10448
0
    default:
10449
0
        out = NULL;
10450
0
    }
10451
0
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10452
0
    if (kind2 != kind1)
10453
0
        PyMem_Free((void *)buf2);
10454
0
    return out;
10455
0
}
10456
10457
static Py_ssize_t
10458
anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10459
            PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
10460
13.9k
{
10461
13.9k
    switch (kind) {
10462
13.6k
    case PyUnicode_1BYTE_KIND:
10463
13.6k
        if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10464
13.6k
            return asciilib_find(buf1, len1, buf2, len2, offset);
10465
66
        else
10466
66
            return ucs1lib_find(buf1, len1, buf2, len2, offset);
10467
132
    case PyUnicode_2BYTE_KIND:
10468
132
        return ucs2lib_find(buf1, len1, buf2, len2, offset);
10469
175
    case PyUnicode_4BYTE_KIND:
10470
175
        return ucs4lib_find(buf1, len1, buf2, len2, offset);
10471
13.9k
    }
10472
13.9k
    Py_UNREACHABLE();
10473
13.9k
}
10474
10475
static Py_ssize_t
10476
anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10477
             PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
10478
21.4k
{
10479
21.4k
    switch (kind) {
10480
21.3k
    case PyUnicode_1BYTE_KIND:
10481
21.3k
        return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10482
71
    case PyUnicode_2BYTE_KIND:
10483
71
        return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10484
27
    case PyUnicode_4BYTE_KIND:
10485
27
        return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10486
21.4k
    }
10487
21.4k
    Py_UNREACHABLE();
10488
21.4k
}
10489
10490
static void
10491
replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10492
                      Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10493
83
{
10494
83
    int kind = PyUnicode_KIND(u);
10495
83
    void *data = PyUnicode_DATA(u);
10496
83
    Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10497
83
    if (kind == PyUnicode_1BYTE_KIND) {
10498
83
        ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10499
83
                                      (Py_UCS1 *)data + len,
10500
83
                                      u1, u2, maxcount);
10501
83
    }
10502
0
    else if (kind == PyUnicode_2BYTE_KIND) {
10503
0
        ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10504
0
                                      (Py_UCS2 *)data + len,
10505
0
                                      u1, u2, maxcount);
10506
0
    }
10507
0
    else {
10508
0
        assert(kind == PyUnicode_4BYTE_KIND);
10509
0
        ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10510
0
                                      (Py_UCS4 *)data + len,
10511
0
                                      u1, u2, maxcount);
10512
0
    }
10513
83
}
10514
10515
static PyObject *
10516
replace(PyObject *self, PyObject *str1,
10517
        PyObject *str2, Py_ssize_t maxcount)
10518
22.1k
{
10519
22.1k
    PyObject *u;
10520
22.1k
    const char *sbuf = PyUnicode_DATA(self);
10521
22.1k
    const void *buf1 = PyUnicode_DATA(str1);
10522
22.1k
    const void *buf2 = PyUnicode_DATA(str2);
10523
22.1k
    int srelease = 0, release1 = 0, release2 = 0;
10524
22.1k
    int skind = PyUnicode_KIND(self);
10525
22.1k
    int kind1 = PyUnicode_KIND(str1);
10526
22.1k
    int kind2 = PyUnicode_KIND(str2);
10527
22.1k
    Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10528
22.1k
    Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10529
22.1k
    Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
10530
22.1k
    int mayshrink;
10531
22.1k
    Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
10532
10533
22.1k
    if (slen < len1)
10534
213
        goto nothing;
10535
10536
21.9k
    if (maxcount < 0)
10537
21.9k
        maxcount = PY_SSIZE_T_MAX;
10538
0
    else if (maxcount == 0)
10539
0
        goto nothing;
10540
10541
21.9k
    if (str1 == str2)
10542
0
        goto nothing;
10543
10544
21.9k
    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10545
21.9k
    maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10546
21.9k
    if (maxchar < maxchar_str1)
10547
        /* substring too wide to be present */
10548
0
        goto nothing;
10549
21.9k
    maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10550
    /* Replacing str1 with str2 may cause a maxchar reduction in the
10551
       result string. */
10552
21.9k
    mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
10553
21.9k
    maxchar = Py_MAX(maxchar, maxchar_str2);
10554
10555
21.9k
    if (len1 == len2) {
10556
        /* same length */
10557
434
        if (len1 == 0)
10558
0
            goto nothing;
10559
434
        if (len1 == 1) {
10560
            /* replace characters */
10561
434
            Py_UCS4 u1, u2;
10562
434
            Py_ssize_t pos;
10563
10564
434
            u1 = PyUnicode_READ(kind1, buf1, 0);
10565
434
            pos = findchar(sbuf, skind, slen, u1, 1);
10566
434
            if (pos < 0)
10567
351
                goto nothing;
10568
83
            u2 = PyUnicode_READ(kind2, buf2, 0);
10569
83
            u = PyUnicode_New(slen, maxchar);
10570
83
            if (!u)
10571
0
                goto error;
10572
10573
83
            _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10574
83
            replace_1char_inplace(u, pos, u1, u2, maxcount);
10575
83
        }
10576
0
        else {
10577
0
            int rkind = skind;
10578
0
            char *res;
10579
0
            Py_ssize_t i;
10580
10581
0
            if (kind1 < rkind) {
10582
                /* widen substring */
10583
0
                buf1 = unicode_askind(kind1, buf1, len1, rkind);
10584
0
                if (!buf1) goto error;
10585
0
                release1 = 1;
10586
0
            }
10587
0
            i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
10588
0
            if (i < 0)
10589
0
                goto nothing;
10590
0
            if (rkind > kind2) {
10591
                /* widen replacement */
10592
0
                buf2 = unicode_askind(kind2, buf2, len2, rkind);
10593
0
                if (!buf2) goto error;
10594
0
                release2 = 1;
10595
0
            }
10596
0
            else if (rkind < kind2) {
10597
                /* widen self and buf1 */
10598
0
                rkind = kind2;
10599
0
                if (release1) {
10600
0
                    assert(buf1 != PyUnicode_DATA(str1));
10601
0
                    PyMem_Free((void *)buf1);
10602
0
                    buf1 = PyUnicode_DATA(str1);
10603
0
                    release1 = 0;
10604
0
                }
10605
0
                sbuf = unicode_askind(skind, sbuf, slen, rkind);
10606
0
                if (!sbuf) goto error;
10607
0
                srelease = 1;
10608
0
                buf1 = unicode_askind(kind1, buf1, len1, rkind);
10609
0
                if (!buf1) goto error;
10610
0
                release1 = 1;
10611
0
            }
10612
0
            u = PyUnicode_New(slen, maxchar);
10613
0
            if (!u)
10614
0
                goto error;
10615
0
            assert(PyUnicode_KIND(u) == rkind);
10616
0
            res = PyUnicode_DATA(u);
10617
10618
0
            memcpy(res, sbuf, rkind * slen);
10619
            /* change everything in-place, starting with this one */
10620
0
            memcpy(res + rkind * i,
10621
0
                   buf2,
10622
0
                   rkind * len2);
10623
0
            i += len1;
10624
10625
0
            while ( --maxcount > 0) {
10626
0
                i = anylib_find(rkind, self,
10627
0
                                sbuf+rkind*i, slen-i,
10628
0
                                str1, buf1, len1, i);
10629
0
                if (i == -1)
10630
0
                    break;
10631
0
                memcpy(res + rkind * i,
10632
0
                       buf2,
10633
0
                       rkind * len2);
10634
0
                i += len1;
10635
0
            }
10636
0
        }
10637
434
    }
10638
21.4k
    else {
10639
21.4k
        Py_ssize_t n, i, j, ires;
10640
21.4k
        Py_ssize_t new_size;
10641
21.4k
        int rkind = skind;
10642
21.4k
        char *res;
10643
10644
21.4k
        if (kind1 < rkind) {
10645
            /* widen substring */
10646
98
            buf1 = unicode_askind(kind1, buf1, len1, rkind);
10647
98
            if (!buf1) goto error;
10648
98
            release1 = 1;
10649
98
        }
10650
21.4k
        n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
10651
21.4k
        if (n == 0)
10652
20.8k
            goto nothing;
10653
644
        if (kind2 < rkind) {
10654
            /* widen replacement */
10655
62
            buf2 = unicode_askind(kind2, buf2, len2, rkind);
10656
62
            if (!buf2) goto error;
10657
62
            release2 = 1;
10658
62
        }
10659
582
        else if (kind2 > rkind) {
10660
            /* widen self and buf1 */
10661
0
            rkind = kind2;
10662
0
            sbuf = unicode_askind(skind, sbuf, slen, rkind);
10663
0
            if (!sbuf) goto error;
10664
0
            srelease = 1;
10665
0
            if (release1) {
10666
0
                assert(buf1 != PyUnicode_DATA(str1));
10667
0
                PyMem_Free((void *)buf1);
10668
0
                buf1 = PyUnicode_DATA(str1);
10669
0
                release1 = 0;
10670
0
            }
10671
0
            buf1 = unicode_askind(kind1, buf1, len1, rkind);
10672
0
            if (!buf1) goto error;
10673
0
            release1 = 1;
10674
0
        }
10675
        /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10676
           PyUnicode_GET_LENGTH(str1)); */
10677
644
        if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
10678
0
                PyErr_SetString(PyExc_OverflowError,
10679
0
                                "replace string is too long");
10680
0
                goto error;
10681
0
        }
10682
644
        new_size = slen + n * (len2 - len1);
10683
644
        if (new_size == 0) {
10684
0
            u = _PyUnicode_GetEmpty();
10685
0
            goto done;
10686
0
        }
10687
644
        if (new_size > (PY_SSIZE_T_MAX / rkind)) {
10688
0
            PyErr_SetString(PyExc_OverflowError,
10689
0
                            "replace string is too long");
10690
0
            goto error;
10691
0
        }
10692
644
        u = PyUnicode_New(new_size, maxchar);
10693
644
        if (!u)
10694
0
            goto error;
10695
644
        assert(PyUnicode_KIND(u) == rkind);
10696
644
        res = PyUnicode_DATA(u);
10697
644
        ires = i = 0;
10698
644
        if (len1 > 0) {
10699
14.6k
            while (n-- > 0) {
10700
                /* look for next match */
10701
13.9k
                j = anylib_find(rkind, self,
10702
13.9k
                                sbuf + rkind * i, slen-i,
10703
13.9k
                                str1, buf1, len1, i);
10704
13.9k
                if (j == -1)
10705
0
                    break;
10706
13.9k
                else if (j > i) {
10707
                    /* copy unchanged part [i:j] */
10708
12.9k
                    memcpy(res + rkind * ires,
10709
12.9k
                           sbuf + rkind * i,
10710
12.9k
                           rkind * (j-i));
10711
12.9k
                    ires += j - i;
10712
12.9k
                }
10713
                /* copy substitution string */
10714
13.9k
                if (len2 > 0) {
10715
1.31k
                    memcpy(res + rkind * ires,
10716
1.31k
                           buf2,
10717
1.31k
                           rkind * len2);
10718
1.31k
                    ires += len2;
10719
1.31k
                }
10720
13.9k
                i = j + len1;
10721
13.9k
            }
10722
644
            if (i < slen)
10723
                /* copy tail [i:] */
10724
257
                memcpy(res + rkind * ires,
10725
257
                       sbuf + rkind * i,
10726
257
                       rkind * (slen-i));
10727
644
        }
10728
0
        else {
10729
            /* interleave */
10730
0
            while (n > 0) {
10731
0
                memcpy(res + rkind * ires,
10732
0
                       buf2,
10733
0
                       rkind * len2);
10734
0
                ires += len2;
10735
0
                if (--n <= 0)
10736
0
                    break;
10737
0
                memcpy(res + rkind * ires,
10738
0
                       sbuf + rkind * i,
10739
0
                       rkind);
10740
0
                ires++;
10741
0
                i++;
10742
0
            }
10743
0
            memcpy(res + rkind * ires,
10744
0
                   sbuf + rkind * i,
10745
0
                   rkind * (slen-i));
10746
0
        }
10747
644
    }
10748
10749
727
    if (mayshrink) {
10750
0
        unicode_adjust_maxchar(&u);
10751
0
        if (u == NULL)
10752
0
            goto error;
10753
0
    }
10754
10755
727
  done:
10756
727
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
10757
727
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10758
727
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10759
727
    if (srelease)
10760
0
        PyMem_Free((void *)sbuf);
10761
727
    if (release1)
10762
62
        PyMem_Free((void *)buf1);
10763
727
    if (release2)
10764
62
        PyMem_Free((void *)buf2);
10765
727
    assert(_PyUnicode_CheckConsistency(u, 1));
10766
727
    return u;
10767
10768
21.4k
  nothing:
10769
    /* nothing to replace; return original string (when possible) */
10770
21.4k
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
10771
21.4k
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10772
21.4k
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10773
21.4k
    if (srelease)
10774
0
        PyMem_Free((void *)sbuf);
10775
21.4k
    if (release1)
10776
36
        PyMem_Free((void *)buf1);
10777
21.4k
    if (release2)
10778
0
        PyMem_Free((void *)buf2);
10779
21.4k
    return unicode_result_unchanged(self);
10780
10781
0
  error:
10782
0
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
10783
0
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10784
0
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10785
0
    if (srelease)
10786
0
        PyMem_Free((void *)sbuf);
10787
0
    if (release1)
10788
0
        PyMem_Free((void *)buf1);
10789
0
    if (release2)
10790
0
        PyMem_Free((void *)buf2);
10791
0
    return NULL;
10792
0
}
10793
10794
/* --- Unicode Object Methods --------------------------------------------- */
10795
10796
/*[clinic input]
10797
str.title as unicode_title
10798
10799
Return a version of the string where each word is titlecased.
10800
10801
More specifically, words start with uppercased characters and all
10802
remaining cased characters have lower case.
10803
[clinic start generated code]*/
10804
10805
static PyObject *
unicode_title_impl(PyObject *self)
/*[clinic end generated code: output=c75ae03809574902 input=2a07e2c7df94627a]*/
{
    /* str.title(): delegate to the shared case-mapping driver with the
       title-casing callback. */
    PyObject *titled = case_operation(self, do_title);
    return titled;
}
10811
10812
/*[clinic input]
10813
str.capitalize as unicode_capitalize
10814
10815
Return a capitalized version of the string.
10816
10817
More specifically, make the first character have upper case and the
10818
rest lower case.
10819
[clinic start generated code]*/
10820
10821
static PyObject *
unicode_capitalize_impl(PyObject *self)
/*[clinic end generated code: output=e49a4c333cdb7667 input=e50e50ed45a654cf]*/
{
    /* str.capitalize(): an empty string is handed to
       unicode_result_unchanged(); anything else goes through the
       case-mapping driver with the capitalize callback. */
    return (PyUnicode_GET_LENGTH(self) == 0)
        ? unicode_result_unchanged(self)
        : case_operation(self, do_capitalize);
}
10829
10830
/*[clinic input]
10831
str.casefold as unicode_casefold
10832
10833
Return a version of the string suitable for caseless comparisons.
10834
[clinic start generated code]*/
10835
10836
static PyObject *
unicode_casefold_impl(PyObject *self)
/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
{
    /* str.casefold(): ASCII strings take the ascii_upper_or_lower() path
       (second argument 1 -- presumably selects lowercasing; confirm against
       that helper); all other strings use the generic case-mapping driver. */
    return PyUnicode_IS_ASCII(self)
        ? ascii_upper_or_lower(self, 1)
        : case_operation(self, do_casefold);
}
10844
10845
10846
/* Argument converter. Accepts a single Unicode character. */
10847
10848
static int
10849
convert_uc(PyObject *obj, void *addr)
10850
0
{
10851
0
    Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
10852
10853
0
    if (!PyUnicode_Check(obj)) {
10854
0
        PyErr_Format(PyExc_TypeError,
10855
0
                     "The fill character must be a unicode character, "
10856
0
                     "not %.100s", Py_TYPE(obj)->tp_name);
10857
0
        return 0;
10858
0
    }
10859
0
    if (PyUnicode_GET_LENGTH(obj) != 1) {
10860
0
        PyErr_SetString(PyExc_TypeError,
10861
0
                        "The fill character must be exactly one character long");
10862
0
        return 0;
10863
0
    }
10864
0
    *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
10865
0
    return 1;
10866
0
}
10867
10868
/*[clinic input]
10869
str.center as unicode_center
10870
10871
    width: Py_ssize_t
10872
    fillchar: Py_UCS4 = ' '
10873
    /
10874
10875
Return a centered string of length width.
10876
10877
Padding is done using the specified fill character (default is
10878
a space).
10879
[clinic start generated code]*/
10880
10881
static PyObject *
10882
unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10883
/*[clinic end generated code: output=420c8859effc7c0c input=df91017dfd186a78]*/
10884
0
{
10885
0
    Py_ssize_t marg, left;
10886
10887
0
    if (PyUnicode_GET_LENGTH(self) >= width)
10888
0
        return unicode_result_unchanged(self);
10889
10890
0
    marg = width - PyUnicode_GET_LENGTH(self);
10891
0
    left = marg / 2 + (marg & width & 1);
10892
10893
0
    return pad(self, left, marg - left, fillchar);
10894
0
}
10895
10896
/* This function assumes that str1 and str2 are readied by the caller. */
10897
10898
static int
10899
unicode_compare(PyObject *str1, PyObject *str2)
10900
6.89M
{
10901
6.89M
#define COMPARE(TYPE1, TYPE2) \
10902
6.89M
    do { \
10903
6.26M
        TYPE1* p1 = (TYPE1 *)data1; \
10904
6.26M
        TYPE2* p2 = (TYPE2 *)data2; \
10905
6.26M
        TYPE1* end = p1 + len; \
10906
6.26M
        Py_UCS4 c1, c2; \
10907
6.59M
        for (; p1 != end; p1++, p2++) { \
10908
6.57M
            c1 = *p1; \
10909
6.57M
            c2 = *p2; \
10910
6.57M
            if (c1 != c2) \
10911
6.57M
                return (c1 < c2) ? -1 : 1; \
10912
6.57M
        } \
10913
6.26M
    } \
10914
6.26M
    while (0)
10915
10916
6.89M
    int kind1, kind2;
10917
6.89M
    const void *data1, *data2;
10918
6.89M
    Py_ssize_t len1, len2, len;
10919
10920
6.89M
    kind1 = PyUnicode_KIND(str1);
10921
6.89M
    kind2 = PyUnicode_KIND(str2);
10922
6.89M
    data1 = PyUnicode_DATA(str1);
10923
6.89M
    data2 = PyUnicode_DATA(str2);
10924
6.89M
    len1 = PyUnicode_GET_LENGTH(str1);
10925
6.89M
    len2 = PyUnicode_GET_LENGTH(str2);
10926
6.89M
    len = Py_MIN(len1, len2);
10927
10928
6.89M
    switch(kind1) {
10929
3.15M
    case PyUnicode_1BYTE_KIND:
10930
3.15M
    {
10931
3.15M
        switch(kind2) {
10932
426k
        case PyUnicode_1BYTE_KIND:
10933
426k
        {
10934
426k
            int cmp = memcmp(data1, data2, len);
10935
            /* normalize result of memcmp() into the range [-1; 1] */
10936
426k
            if (cmp < 0)
10937
239k
                return -1;
10938
186k
            if (cmp > 0)
10939
163k
                return 1;
10940
23.2k
            break;
10941
186k
        }
10942
2.61M
        case PyUnicode_2BYTE_KIND:
10943
2.61M
            COMPARE(Py_UCS1, Py_UCS2);
10944
4.06k
            break;
10945
119k
        case PyUnicode_4BYTE_KIND:
10946
119k
            COMPARE(Py_UCS1, Py_UCS4);
10947
26
            break;
10948
26
        default:
10949
0
            Py_UNREACHABLE();
10950
3.15M
        }
10951
27.3k
        break;
10952
3.15M
    }
10953
3.43M
    case PyUnicode_2BYTE_KIND:
10954
3.43M
    {
10955
3.43M
        switch(kind2) {
10956
35.0k
        case PyUnicode_1BYTE_KIND:
10957
35.0k
            COMPARE(Py_UCS2, Py_UCS1);
10958
3.00k
            break;
10959
3.32M
        case PyUnicode_2BYTE_KIND:
10960
3.32M
        {
10961
3.32M
            COMPARE(Py_UCS2, Py_UCS2);
10962
11.1k
            break;
10963
3.32M
        }
10964
81.7k
        case PyUnicode_4BYTE_KIND:
10965
81.7k
            COMPARE(Py_UCS2, Py_UCS4);
10966
2
            break;
10967
2
        default:
10968
0
            Py_UNREACHABLE();
10969
3.43M
        }
10970
14.1k
        break;
10971
3.43M
    }
10972
293k
    case PyUnicode_4BYTE_KIND:
10973
293k
    {
10974
293k
        switch(kind2) {
10975
1.27k
        case PyUnicode_1BYTE_KIND:
10976
1.27k
            COMPARE(Py_UCS4, Py_UCS1);
10977
10
            break;
10978
95.9k
        case PyUnicode_2BYTE_KIND:
10979
95.9k
            COMPARE(Py_UCS4, Py_UCS2);
10980
6
            break;
10981
196k
        case PyUnicode_4BYTE_KIND:
10982
196k
        {
10983
196k
#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10984
196k
            int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10985
            /* normalize result of wmemcmp() into the range [-1; 1] */
10986
196k
            if (cmp < 0)
10987
71.5k
                return -1;
10988
124k
            if (cmp > 0)
10989
124k
                return 1;
10990
#else
10991
            COMPARE(Py_UCS4, Py_UCS4);
10992
#endif
10993
4
            break;
10994
124k
        }
10995
4
        default:
10996
0
            Py_UNREACHABLE();
10997
293k
        }
10998
20
        break;
10999
293k
    }
11000
20
    default:
11001
0
        Py_UNREACHABLE();
11002
6.89M
    }
11003
11004
41.4k
    if (len1 == len2)
11005
150
        return 0;
11006
41.3k
    if (len1 < len2)
11007
20.8k
        return -1;
11008
20.4k
    else
11009
20.4k
        return 1;
11010
11011
41.3k
#undef COMPARE
11012
41.3k
}
11013
11014
11015
int
11016
_PyUnicode_Equal(PyObject *str1, PyObject *str2)
11017
6.84M
{
11018
6.84M
    assert(PyUnicode_Check(str1));
11019
6.84M
    assert(PyUnicode_Check(str2));
11020
6.84M
    if (str1 == str2) {
11021
29.5k
        return 1;
11022
29.5k
    }
11023
6.81M
    return unicode_eq(str1, str2);
11024
6.84M
}
11025
11026
11027
int
11028
PyUnicode_Equal(PyObject *str1, PyObject *str2)
11029
0
{
11030
0
    if (!PyUnicode_Check(str1)) {
11031
0
        PyErr_Format(PyExc_TypeError,
11032
0
                     "first argument must be str, not %T", str1);
11033
0
        return -1;
11034
0
    }
11035
0
    if (!PyUnicode_Check(str2)) {
11036
0
        PyErr_Format(PyExc_TypeError,
11037
0
                     "second argument must be str, not %T", str2);
11038
0
        return -1;
11039
0
    }
11040
11041
0
    return _PyUnicode_Equal(str1, str2);
11042
0
}
11043
11044
11045
int
11046
PyUnicode_Compare(PyObject *left, PyObject *right)
11047
207
{
11048
207
    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11049
        /* a string is equal to itself */
11050
207
        if (left == right)
11051
21
            return 0;
11052
11053
186
        return unicode_compare(left, right);
11054
207
    }
11055
0
    PyErr_Format(PyExc_TypeError,
11056
0
                 "Can't compare %.100s and %.100s",
11057
0
                 Py_TYPE(left)->tp_name,
11058
0
                 Py_TYPE(right)->tp_name);
11059
0
    return -1;
11060
207
}
11061
11062
int
11063
PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11064
619k
{
11065
619k
    Py_ssize_t i;
11066
619k
    int kind;
11067
619k
    Py_UCS4 chr;
11068
11069
619k
    assert(_PyUnicode_CHECK(uni));
11070
619k
    kind = PyUnicode_KIND(uni);
11071
619k
    if (kind == PyUnicode_1BYTE_KIND) {
11072
619k
        const void *data = PyUnicode_1BYTE_DATA(uni);
11073
619k
        size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
11074
619k
        size_t len, len2 = strlen(str);
11075
619k
        int cmp;
11076
11077
619k
        len = Py_MIN(len1, len2);
11078
619k
        cmp = memcmp(data, str, len);
11079
619k
        if (cmp != 0) {
11080
319k
            if (cmp < 0)
11081
21.4k
                return -1;
11082
298k
            else
11083
298k
                return 1;
11084
319k
        }
11085
299k
        if (len1 > len2)
11086
75
            return 1; /* uni is longer */
11087
299k
        if (len1 < len2)
11088
6.23k
            return -1; /* str is longer */
11089
293k
        return 0;
11090
299k
    }
11091
489
    else {
11092
489
        const void *data = PyUnicode_DATA(uni);
11093
        /* Compare Unicode string and source character set string */
11094
779
        for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
11095
765
            if (chr != (unsigned char)str[i])
11096
475
                return (chr < (unsigned char)(str[i])) ? -1 : 1;
11097
        /* This check keeps Python strings that end in '\0' from comparing equal
11098
         to C strings identical up to that point. */
11099
14
        if (PyUnicode_GET_LENGTH(uni) != i || chr)
11100
14
            return 1; /* uni is longer */
11101
0
        if (str[i])
11102
0
            return -1; /* str is longer */
11103
0
        return 0;
11104
0
    }
11105
619k
}
11106
11107
int
11108
PyUnicode_EqualToUTF8(PyObject *unicode, const char *str)
11109
0
{
11110
0
    return PyUnicode_EqualToUTF8AndSize(unicode, str, strlen(str));
11111
0
}
11112
11113
int
11114
PyUnicode_EqualToUTF8AndSize(PyObject *unicode, const char *str, Py_ssize_t size)
11115
0
{
11116
0
    assert(_PyUnicode_CHECK(unicode));
11117
0
    assert(str);
11118
11119
0
    if (PyUnicode_IS_ASCII(unicode)) {
11120
0
        Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
11121
0
        return size == len &&
11122
0
            memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11123
0
    }
11124
0
    if (PyUnicode_UTF8(unicode) != NULL) {
11125
0
        Py_ssize_t len = PyUnicode_UTF8_LENGTH(unicode);
11126
0
        return size == len &&
11127
0
            memcmp(PyUnicode_UTF8(unicode), str, len) == 0;
11128
0
    }
11129
11130
0
    Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
11131
0
    if ((size_t)len >= (size_t)size || (size_t)len < (size_t)size / 4) {
11132
0
        return 0;
11133
0
    }
11134
0
    const unsigned char *s = (const unsigned char *)str;
11135
0
    const unsigned char *ends = s + (size_t)size;
11136
0
    int kind = PyUnicode_KIND(unicode);
11137
0
    const void *data = PyUnicode_DATA(unicode);
11138
    /* Compare Unicode string and UTF-8 string */
11139
0
    for (Py_ssize_t i = 0; i < len; i++) {
11140
0
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11141
0
        if (ch < 0x80) {
11142
0
            if (ends == s || s[0] != ch) {
11143
0
                return 0;
11144
0
            }
11145
0
            s += 1;
11146
0
        }
11147
0
        else if (ch < 0x800) {
11148
0
            if ((ends - s) < 2 ||
11149
0
                s[0] != (0xc0 | (ch >> 6)) ||
11150
0
                s[1] != (0x80 | (ch & 0x3f)))
11151
0
            {
11152
0
                return 0;
11153
0
            }
11154
0
            s += 2;
11155
0
        }
11156
0
        else if (ch < 0x10000) {
11157
0
            if (Py_UNICODE_IS_SURROGATE(ch) ||
11158
0
                (ends - s) < 3 ||
11159
0
                s[0] != (0xe0 | (ch >> 12)) ||
11160
0
                s[1] != (0x80 | ((ch >> 6) & 0x3f)) ||
11161
0
                s[2] != (0x80 | (ch & 0x3f)))
11162
0
            {
11163
0
                return 0;
11164
0
            }
11165
0
            s += 3;
11166
0
        }
11167
0
        else {
11168
0
            assert(ch <= MAX_UNICODE);
11169
0
            if ((ends - s) < 4 ||
11170
0
                s[0] != (0xf0 | (ch >> 18)) ||
11171
0
                s[1] != (0x80 | ((ch >> 12) & 0x3f)) ||
11172
0
                s[2] != (0x80 | ((ch >> 6) & 0x3f)) ||
11173
0
                s[3] != (0x80 | (ch & 0x3f)))
11174
0
            {
11175
0
                return 0;
11176
0
            }
11177
0
            s += 4;
11178
0
        }
11179
0
    }
11180
0
    return s == ends;
11181
0
}
11182
11183
int
11184
_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11185
6.14M
{
11186
6.14M
    size_t len;
11187
6.14M
    assert(_PyUnicode_CHECK(unicode));
11188
6.14M
    assert(str);
11189
6.14M
#ifndef NDEBUG
11190
33.9M
    for (const char *p = str; *p; p++) {
11191
27.8M
        assert((unsigned char)*p < 128);
11192
27.8M
    }
11193
6.14M
#endif
11194
6.14M
    if (!PyUnicode_IS_ASCII(unicode))
11195
560k
        return 0;
11196
5.58M
    len = (size_t)PyUnicode_GET_LENGTH(unicode);
11197
5.58M
    return strlen(str) == len &&
11198
388k
           memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11199
6.14M
}
11200
11201
/* Rich comparison of two objects with operator 'op' (Py_EQ, Py_NE, Py_LT,
   Py_LE, Py_GT or Py_GE).

   Returns NotImplemented unless both operands pass PyUnicode_Check().
   When both operands are the same object, the answer is produced without
   looking at the contents; an unknown 'op' on that path calls
   PyErr_BadArgument() and returns NULL. */
PyObject *
PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
{
    if (!PyUnicode_Check(left) || !PyUnicode_Check(right)) {
        Py_RETURN_NOTIMPLEMENTED;
    }

    if (left == right) {
        /* a string is equal to itself */
        if (op == Py_EQ || op == Py_LE || op == Py_GE) {
            Py_RETURN_TRUE;
        }
        if (op == Py_NE || op == Py_LT || op == Py_GT) {
            Py_RETURN_FALSE;
        }
        PyErr_BadArgument();
        return NULL;
    }

    if (op == Py_EQ || op == Py_NE) {
        /* Equality only: flip the unicode_eq() answer for "!=". */
        int result = unicode_eq(left, right);
        result ^= (op == Py_NE);
        return PyBool_FromLong(result);
    }

    /* Ordering operators: compare the unicode_compare() result with 0. */
    int result = unicode_compare(left, right);
    Py_RETURN_RICHCOMPARE(result, 0, op);
}
11235
11236
int
11237
PyUnicode_Contains(PyObject *str, PyObject *substr)
11238
3.04k
{
11239
3.04k
    int kind1, kind2;
11240
3.04k
    const void *buf1, *buf2;
11241
3.04k
    Py_ssize_t len1, len2;
11242
3.04k
    int result;
11243
11244
3.04k
    if (!PyUnicode_Check(substr)) {
11245
0
        PyErr_Format(PyExc_TypeError,
11246
0
                     "'in <string>' requires string as left operand, not %.100s",
11247
0
                     Py_TYPE(substr)->tp_name);
11248
0
        return -1;
11249
0
    }
11250
3.04k
    if (ensure_unicode(str) < 0)
11251
0
        return -1;
11252
11253
3.04k
    kind1 = PyUnicode_KIND(str);
11254
3.04k
    kind2 = PyUnicode_KIND(substr);
11255
3.04k
    if (kind1 < kind2)
11256
3
        return 0;
11257
3.04k
    len1 = PyUnicode_GET_LENGTH(str);
11258
3.04k
    len2 = PyUnicode_GET_LENGTH(substr);
11259
3.04k
    if (len1 < len2)
11260
38
        return 0;
11261
3.00k
    buf1 = PyUnicode_DATA(str);
11262
3.00k
    buf2 = PyUnicode_DATA(substr);
11263
3.00k
    if (len2 == 1) {
11264
2.95k
        Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11265
2.95k
        result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
11266
2.95k
        return result;
11267
2.95k
    }
11268
52
    if (kind2 != kind1) {
11269
0
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
11270
0
        if (!buf2)
11271
0
            return -1;
11272
0
    }
11273
11274
52
    switch (kind1) {
11275
52
    case PyUnicode_1BYTE_KIND:
11276
52
        result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11277
52
        break;
11278
0
    case PyUnicode_2BYTE_KIND:
11279
0
        result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11280
0
        break;
11281
0
    case PyUnicode_4BYTE_KIND:
11282
0
        result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11283
0
        break;
11284
0
    default:
11285
0
        Py_UNREACHABLE();
11286
52
    }
11287
11288
52
    assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
11289
52
    if (kind2 != kind1)
11290
0
        PyMem_Free((void *)buf2);
11291
11292
52
    return result;
11293
52
}
11294
11295
/* Concat to string or Unicode object giving a new Unicode object. */
11296
11297
PyObject *
11298
PyUnicode_Concat(PyObject *left, PyObject *right)
11299
113k
{
11300
113k
    PyObject *result;
11301
113k
    Py_UCS4 maxchar, maxchar2;
11302
113k
    Py_ssize_t left_len, right_len, new_len;
11303
11304
113k
    if (ensure_unicode(left) < 0)
11305
0
        return NULL;
11306
11307
113k
    if (!PyUnicode_Check(right)) {
11308
6
        PyErr_Format(PyExc_TypeError,
11309
6
            "can only concatenate str (not \"%.200s\") to str",
11310
6
            Py_TYPE(right)->tp_name);
11311
6
        return NULL;
11312
6
    }
11313
11314
    /* Shortcuts */
11315
113k
    PyObject *empty = _PyUnicode_GetEmpty();  // Borrowed reference
11316
113k
    if (left == empty) {
11317
3.61k
        return PyUnicode_FromObject(right);
11318
3.61k
    }
11319
110k
    if (right == empty) {
11320
4.24k
        return PyUnicode_FromObject(left);
11321
4.24k
    }
11322
11323
105k
    left_len = PyUnicode_GET_LENGTH(left);
11324
105k
    right_len = PyUnicode_GET_LENGTH(right);
11325
105k
    if (left_len > PY_SSIZE_T_MAX - right_len) {
11326
0
        PyErr_SetString(PyExc_OverflowError,
11327
0
                        "strings are too large to concat");
11328
0
        return NULL;
11329
0
    }
11330
105k
    new_len = left_len + right_len;
11331
11332
105k
    maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11333
105k
    maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11334
105k
    maxchar = Py_MAX(maxchar, maxchar2);
11335
11336
    /* Concat the two Unicode strings */
11337
105k
    result = PyUnicode_New(new_len, maxchar);
11338
105k
    if (result == NULL)
11339
0
        return NULL;
11340
105k
    _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11341
105k
    _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11342
105k
    assert(_PyUnicode_CheckConsistency(result, 1));
11343
105k
    return result;
11344
105k
}
11345
11346
void
11347
PyUnicode_Append(PyObject **p_left, PyObject *right)
11348
5.59k
{
11349
5.59k
    PyObject *left, *res;
11350
5.59k
    Py_UCS4 maxchar, maxchar2;
11351
5.59k
    Py_ssize_t left_len, right_len, new_len;
11352
11353
5.59k
    if (p_left == NULL) {
11354
0
        if (!PyErr_Occurred())
11355
0
            PyErr_BadInternalCall();
11356
0
        return;
11357
0
    }
11358
5.59k
    left = *p_left;
11359
5.59k
    if (right == NULL || left == NULL
11360
5.59k
        || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
11361
0
        if (!PyErr_Occurred())
11362
0
            PyErr_BadInternalCall();
11363
0
        goto error;
11364
0
    }
11365
11366
    /* Shortcuts */
11367
5.59k
    PyObject *empty = _PyUnicode_GetEmpty();  // Borrowed reference
11368
5.59k
    if (left == empty) {
11369
54
        Py_DECREF(left);
11370
54
        *p_left = Py_NewRef(right);
11371
54
        return;
11372
54
    }
11373
5.54k
    if (right == empty) {
11374
0
        return;
11375
0
    }
11376
11377
5.54k
    left_len = PyUnicode_GET_LENGTH(left);
11378
5.54k
    right_len = PyUnicode_GET_LENGTH(right);
11379
5.54k
    if (left_len > PY_SSIZE_T_MAX - right_len) {
11380
0
        PyErr_SetString(PyExc_OverflowError,
11381
0
                        "strings are too large to concat");
11382
0
        goto error;
11383
0
    }
11384
5.54k
    new_len = left_len + right_len;
11385
11386
5.54k
    if (_PyUnicode_IsModifiable(left)
11387
5.54k
        && PyUnicode_CheckExact(right)
11388
10.3k
        && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
11389
        /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11390
           to change the structure size, but characters are stored just after
11391
           the structure, and so it requires to move all characters which is
11392
           not so different than duplicating the string. */
11393
5.17k
        && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11394
4.98k
    {
11395
        /* append inplace */
11396
4.98k
        if (unicode_resize(p_left, new_len) != 0)
11397
0
            goto error;
11398
11399
        /* copy 'right' into the newly allocated area of 'left' */
11400
4.98k
        _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
11401
4.98k
    }
11402
552
    else {
11403
552
        maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11404
552
        maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11405
552
        maxchar = Py_MAX(maxchar, maxchar2);
11406
11407
        /* Concat the two Unicode strings */
11408
552
        res = PyUnicode_New(new_len, maxchar);
11409
552
        if (res == NULL)
11410
0
            goto error;
11411
552
        _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11412
552
        _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
11413
552
        Py_DECREF(left);
11414
552
        *p_left = res;
11415
552
    }
11416
5.54k
    assert(_PyUnicode_CheckConsistency(*p_left, 1));
11417
5.54k
    return;
11418
11419
5.54k
error:
11420
0
    Py_CLEAR(*p_left);
11421
0
}
11422
11423
void
11424
PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11425
0
{
11426
0
    PyUnicode_Append(pleft, right);
11427
0
    Py_XDECREF(right);
11428
0
}
11429
11430
/*[clinic input]
11431
@permit_long_summary
11432
@text_signature "($self, sub[, start[, end]], /)"
11433
str.count as unicode_count -> Py_ssize_t
11434
11435
    self as str: self
11436
    sub as substr: unicode
11437
    start: slice_index(accept={int, NoneType}, c_default='0') = None
11438
    end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
11439
    /
11440
11441
Return the number of non-overlapping occurrences of substring sub in string S[start:end].
11442
11443
Optional arguments start and end are interpreted as in slice
11444
notation.
11445
[clinic start generated code]*/
11446
11447
static Py_ssize_t
11448
unicode_count_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
11449
                   Py_ssize_t end)
11450
/*[clinic end generated code: output=8fcc3aef0b18edbf input=c9209e05438cc352]*/
11451
6.68k
{
11452
6.68k
    assert(PyUnicode_Check(str));
11453
6.68k
    assert(PyUnicode_Check(substr));
11454
11455
6.68k
    Py_ssize_t result;
11456
6.68k
    int kind1, kind2;
11457
6.68k
    const void *buf1 = NULL, *buf2 = NULL;
11458
6.68k
    Py_ssize_t len1, len2;
11459
11460
6.68k
    kind1 = PyUnicode_KIND(str);
11461
6.68k
    kind2 = PyUnicode_KIND(substr);
11462
6.68k
    if (kind1 < kind2)
11463
0
        return 0;
11464
11465
6.68k
    len1 = PyUnicode_GET_LENGTH(str);
11466
6.68k
    len2 = PyUnicode_GET_LENGTH(substr);
11467
6.68k
    ADJUST_INDICES(start, end, len1);
11468
6.68k
    if (end - start < len2)
11469
1.42k
        return 0;
11470
11471
5.26k
    buf1 = PyUnicode_DATA(str);
11472
5.26k
    buf2 = PyUnicode_DATA(substr);
11473
5.26k
    if (kind2 != kind1) {
11474
2.35k
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
11475
2.35k
        if (!buf2)
11476
0
            goto onError;
11477
2.35k
    }
11478
11479
    // We don't reuse `anylib_count` here because of the explicit casts.
11480
5.26k
    switch (kind1) {
11481
2.90k
    case PyUnicode_1BYTE_KIND:
11482
2.90k
        result = ucs1lib_count(
11483
2.90k
            ((const Py_UCS1*)buf1) + start, end - start,
11484
2.90k
            buf2, len2, PY_SSIZE_T_MAX
11485
2.90k
            );
11486
2.90k
        break;
11487
998
    case PyUnicode_2BYTE_KIND:
11488
998
        result = ucs2lib_count(
11489
998
            ((const Py_UCS2*)buf1) + start, end - start,
11490
998
            buf2, len2, PY_SSIZE_T_MAX
11491
998
            );
11492
998
        break;
11493
1.35k
    case PyUnicode_4BYTE_KIND:
11494
1.35k
        result = ucs4lib_count(
11495
1.35k
            ((const Py_UCS4*)buf1) + start, end - start,
11496
1.35k
            buf2, len2, PY_SSIZE_T_MAX
11497
1.35k
            );
11498
1.35k
        break;
11499
0
    default:
11500
0
        Py_UNREACHABLE();
11501
5.26k
    }
11502
11503
5.26k
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
11504
5.26k
    if (kind2 != kind1)
11505
2.35k
        PyMem_Free((void *)buf2);
11506
11507
5.26k
    return result;
11508
0
  onError:
11509
0
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
11510
0
    if (kind2 != kind1)
11511
0
        PyMem_Free((void *)buf2);
11512
0
    return -1;
11513
0
}
11514
11515
/*[clinic input]
11516
str.encode as unicode_encode
11517
11518
    encoding: str(c_default="NULL") = 'utf-8'
11519
        The encoding in which to encode the string.
11520
    errors: str(c_default="NULL") = 'strict'
11521
        The error handling scheme to use for encoding errors.
11522
        The default is 'strict' meaning that encoding errors raise a
11523
        UnicodeEncodeError.  Other possible values are 'ignore', 'replace'
11524
        and 'xmlcharrefreplace' as well as any other name registered with
11525
        codecs.register_error that can handle UnicodeEncodeErrors.
11526
11527
Encode the string using the codec registered for encoding.
11528
[clinic start generated code]*/
11529
11530
static PyObject *
unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
/*[clinic end generated code: output=bf78b6e2a9470e3c input=b85a9645cb33b729]*/
{
    /* Implementation of str.encode(): forwards all arguments unchanged to
       PyUnicode_AsEncodedString() and returns its result. */
    PyObject *encoded = PyUnicode_AsEncodedString(self, encoding, errors);
    return encoded;
}
11536
11537
/*[clinic input]
11538
str.expandtabs as unicode_expandtabs
11539
11540
    tabsize: int = 8
11541
11542
Return a copy where all tab characters are expanded using spaces.
11543
11544
If tabsize is not given, a tab size of 8 characters is assumed.
11545
[clinic start generated code]*/
11546
11547
static PyObject *
unicode_expandtabs_impl(PyObject *self, int tabsize)
/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
{
    /* Implementation of str.expandtabs().  Works in two passes over the
       characters: the first computes the output length (and detects
       overflow), the second fills the freshly allocated result.  With
       tabsize <= 0, tab characters are dropped.  Returns NULL with
       OverflowError set when the result would be too long, or NULL when
       the result cannot be allocated. */
    Py_ssize_t i, j, line_pos, incr;
    Py_UCS4 ch;
    PyObject *u;
    void *dest_data;
    int found = 0;

    Py_ssize_t src_len = PyUnicode_GET_LENGTH(self);
    int kind = PyUnicode_KIND(self);
    const void *src_data = PyUnicode_DATA(self);

    /* First pass: determine size of output string */
    j = 0;
    line_pos = 0;
    for (i = 0; i < src_len; i++) {
        ch = PyUnicode_READ(kind, src_data, i);
        if (ch != '\t') {
            if (j > PY_SSIZE_T_MAX - 1) {
                goto overflow;
            }
            j++;
            /* The column restarts after a line break. */
            if (ch == '\n' || ch == '\r') {
                line_pos = 0;
            }
            else {
                line_pos++;
            }
            continue;
        }

        found = 1;
        if (tabsize <= 0) {
            continue;
        }
        incr = tabsize - (line_pos % tabsize); /* cannot overflow */
        if (j > PY_SSIZE_T_MAX - incr) {
            goto overflow;
        }
        line_pos += incr;
        j += incr;
    }

    /* No tab at all: the input can be returned as is. */
    if (!found) {
        return unicode_result_unchanged(self);
    }

    /* Second pass: create output string and fill it */
    u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
    if (!u) {
        return NULL;
    }
    dest_data = PyUnicode_DATA(u);

    j = 0;
    line_pos = 0;
    for (i = 0; i < src_len; i++) {
        ch = PyUnicode_READ(kind, src_data, i);
        if (ch != '\t') {
            PyUnicode_WRITE(kind, dest_data, j, ch);
            j++;
            if (ch == '\n' || ch == '\r') {
                line_pos = 0;
            }
            else {
                line_pos++;
            }
            continue;
        }

        if (tabsize <= 0) {
            continue;
        }
        /* Pad with spaces up to the next tab stop. */
        incr = tabsize - (line_pos % tabsize);
        line_pos += incr;
        _PyUnicode_Fill(kind, dest_data, ' ', j, incr);
        j += incr;
    }
    assert (j == PyUnicode_GET_LENGTH(u));
    return unicode_result(u);

  overflow:
    PyErr_SetString(PyExc_OverflowError, "new string is too long");
    return NULL;
}
11622
11623
/*[clinic input]
11624
@permit_long_summary
11625
str.find as unicode_find = str.count
11626
11627
Return the lowest index in S where substring sub is found, such that sub is contained within S[start:end].
11628
11629
Optional arguments start and end are interpreted as in slice
11630
notation.  Return -1 on failure.
11631
[clinic start generated code]*/
11632
11633
static Py_ssize_t
11634
unicode_find_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
11635
                  Py_ssize_t end)
11636
/*[clinic end generated code: output=51dbe6255712e278 input=f57e93c59d1ee927]*/
11637
19
{
11638
19
    Py_ssize_t result = any_find_slice(str, substr, start, end, 1);
11639
19
    if (result < 0) {
11640
0
        return -1;
11641
0
    }
11642
19
    return result;
11643
19
}
11644
11645
static PyObject *
11646
unicode_getitem(PyObject *self, Py_ssize_t index)
11647
3.65M
{
11648
3.65M
    const void *data;
11649
3.65M
    int kind;
11650
3.65M
    Py_UCS4 ch;
11651
11652
3.65M
    if (!PyUnicode_Check(self)) {
11653
0
        PyErr_BadArgument();
11654
0
        return NULL;
11655
0
    }
11656
3.65M
    if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11657
235
        PyErr_SetString(PyExc_IndexError, "string index out of range");
11658
235
        return NULL;
11659
235
    }
11660
3.64M
    kind = PyUnicode_KIND(self);
11661
3.64M
    data = PyUnicode_DATA(self);
11662
3.64M
    ch = PyUnicode_READ(kind, data, index);
11663
3.64M
    return unicode_char(ch);
11664
3.64M
}
11665
11666
/* Believe it or not, this produces the same value for ASCII strings
11667
   as bytes_hash(). */
11668
static Py_hash_t
11669
unicode_hash(PyObject *self)
11670
2.53M
{
11671
2.53M
    Py_uhash_t x;  /* Unsigned for defined overflow behavior. */
11672
11673
#ifdef Py_DEBUG
11674
    assert(_Py_HashSecret_Initialized);
11675
#endif
11676
2.53M
    Py_hash_t hash = PyUnicode_HASH(self);
11677
2.53M
    if (hash != -1) {
11678
458k
        return hash;
11679
458k
    }
11680
2.07M
    x = Py_HashBuffer(PyUnicode_DATA(self),
11681
2.07M
                      PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
11682
11683
0
    PyUnicode_SET_HASH(self, x);
11684
2.07M
    return x;
11685
2.07M
}
11686
11687
/*[clinic input]
11688
@permit_long_summary
11689
str.index as unicode_index = str.count
11690
11691
Return the lowest index in S where substring sub is found, such that sub is contained within S[start:end].
11692
11693
Optional arguments start and end are interpreted as in slice
11694
notation.  Raises ValueError when the substring is not found.
11695
[clinic start generated code]*/
11696
11697
static Py_ssize_t
11698
unicode_index_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
11699
                   Py_ssize_t end)
11700
/*[clinic end generated code: output=77558288837cdf40 input=5900ab84de55e628]*/
11701
0
{
11702
0
    Py_ssize_t result = any_find_slice(str, substr, start, end, 1);
11703
0
    if (result == -1) {
11704
0
        PyErr_SetString(PyExc_ValueError, "substring not found");
11705
0
    }
11706
0
    else if (result < 0) {
11707
0
        return -1;
11708
0
    }
11709
0
    return result;
11710
0
}
11711
11712
/*[clinic input]
11713
@permit_long_summary
11714
str.isascii as unicode_isascii
11715
11716
Return True if all characters in the string are ASCII, False otherwise.
11717
11718
ASCII characters have code points in the range U+0000-U+007F.
11719
Empty string is ASCII too.
11720
[clinic start generated code]*/
11721
11722
static PyObject *
unicode_isascii_impl(PyObject *self)
/*[clinic end generated code: output=c5910d64b5a8003f input=dc74e1ced821159f]*/
{
    /* Implementation of str.isascii(): no scan is needed, the answer is
       read directly through PyUnicode_IS_ASCII(). */
    long is_ascii = PyUnicode_IS_ASCII(self);
    return PyBool_FromLong(is_ascii);
}
11728
11729
/*[clinic input]
11730
str.islower as unicode_islower
11731
11732
Return True if the string is a lowercase string, False otherwise.
11733
11734
A string is lowercase if all cased characters in the string are
11735
lowercase and there is at least one cased character in the string.
11736
[clinic start generated code]*/
11737
11738
static PyObject *
unicode_islower_impl(PyObject *self)
/*[clinic end generated code: output=dbd41995bd005b81 input=1879b48dfc628366]*/
{
    /* Implementation of str.islower(): False as soon as an uppercase or
       titlecase character is seen; otherwise True only if at least one
       lowercase character was found. */
    Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (length == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
    }

    int cased = 0;
    for (Py_ssize_t pos = 0; pos < length; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
            Py_RETURN_FALSE;
        }
        /* Once a lowercase character was seen, skip the lookup. */
        if (!cased && Py_UNICODE_ISLOWER(ch)) {
            cased = 1;
        }
    }
    return PyBool_FromLong(cased);
}
11771
11772
/*[clinic input]
11773
str.isupper as unicode_isupper
11774
11775
Return True if the string is an uppercase string, False otherwise.
11776
11777
A string is uppercase if all cased characters in the string are
11778
uppercase and there is at least one cased character in the string.
11779
[clinic start generated code]*/
11780
11781
static PyObject *
unicode_isupper_impl(PyObject *self)
/*[clinic end generated code: output=049209c8e7f15f59 input=77d29904aef0e3a0]*/
{
    /* Implementation of str.isupper(): False as soon as a lowercase or
       titlecase character is seen; otherwise True only if at least one
       uppercase character was found. */
    Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (length == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
    }

    int cased = 0;
    for (Py_ssize_t pos = 0; pos < length; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) {
            Py_RETURN_FALSE;
        }
        /* Once an uppercase character was seen, skip the lookup. */
        if (!cased && Py_UNICODE_ISUPPER(ch)) {
            cased = 1;
        }
    }
    return PyBool_FromLong(cased);
}
11814
11815
/*[clinic input]
11816
str.istitle as unicode_istitle
11817
11818
Return True if the string is a title-cased string, False otherwise.
11819
11820
In a title-cased string, upper- and title-case characters may only
11821
follow uncased characters and lowercase characters only cased ones.
11822
[clinic start generated code]*/
11823
11824
static PyObject *
unicode_istitle_impl(PyObject *self)
/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
{
    /* Implementation of str.istitle().  Walks the string while remembering
       whether the previous character was cased: an upper/titlecase
       character directly after a cased one, or a lowercase character
       directly after an uncased one, makes the answer False.  Otherwise
       the answer is True only if at least one cased character was seen. */
    Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (length == 1) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
                               (Py_UNICODE_ISUPPER(ch) != 0));
    }

    int cased = 0;
    int previous_is_cased = 0;
    for (Py_ssize_t pos = 0; pos < length; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
            /* A word may only start with an upper/titlecase character. */
            if (previous_is_cased) {
                Py_RETURN_FALSE;
            }
            previous_is_cased = 1;
            cased = 1;
            continue;
        }
        if (Py_UNICODE_ISLOWER(ch)) {
            /* Lowercase is only allowed as a continuation of a word. */
            if (!previous_is_cased) {
                Py_RETURN_FALSE;
            }
            previous_is_cased = 1;
            cased = 1;
            continue;
        }
        previous_is_cased = 0;
    }
    return PyBool_FromLong(cased);
}
11870
11871
/*[clinic input]
11872
str.isspace as unicode_isspace
11873
11874
Return True if the string is a whitespace string, False otherwise.
11875
11876
A string is whitespace if all characters in the string are
11877
whitespace and there is at least one character in the string.
11878
[clinic start generated code]*/
11879
11880
static PyObject *
unicode_isspace_impl(PyObject *self)
/*[clinic end generated code: output=163a63bfa08ac2b9 input=29e09560fc23fbeb]*/
{
    /* Implementation of str.isspace(): True when the string is non-empty
       and every character satisfies Py_UNICODE_ISSPACE(). */
    Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (length == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
    }

    Py_ssize_t i = 0;
    while (i < length) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
        if (!Py_UNICODE_ISSPACE(ch)) {
            Py_RETURN_FALSE;
        }
        i++;
    }
    Py_RETURN_TRUE;
}
11908
11909
/*[clinic input]
11910
str.isalpha as unicode_isalpha
11911
11912
Return True if the string is an alphabetic string, False otherwise.
11913
11914
A string is alphabetic if all characters in the string are
11915
alphabetic and there is at least one character in the string.
11916
[clinic start generated code]*/
11917
11918
static PyObject *
unicode_isalpha_impl(PyObject *self)
/*[clinic end generated code: output=cc81b9ac3883ec4f input=9906a07f3e04892e]*/
{
    /* Implementation of str.isalpha(): True when the string is non-empty
       and every character satisfies Py_UNICODE_ISALPHA(). */
    Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (length == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
    }

    Py_ssize_t i = 0;
    while (i < length) {
        if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i))) {
            Py_RETURN_FALSE;
        }
        i++;
    }
    Py_RETURN_TRUE;
}
11945
11946
/*[clinic input]
11947
@permit_long_summary
11948
str.isalnum as unicode_isalnum
11949
11950
Return True if the string is an alpha-numeric string, False otherwise.
11951
11952
A string is alpha-numeric if all characters in the string are
11953
alpha-numeric and there is at least one character in the string.
11954
[clinic start generated code]*/
11955
11956
static PyObject *
unicode_isalnum_impl(PyObject *self)
/*[clinic end generated code: output=a5a23490ffc3660c input=892f64ebc171fd4f]*/
{
    /* Implementation of str.isalnum(): True when the string is non-empty
       and every character satisfies Py_UNICODE_ISALNUM(). */
    int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);
    Py_ssize_t len = PyUnicode_GET_LENGTH(self);

    /* Special case for empty strings */
    if (len == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (len == 1) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
    }

    Py_ssize_t i = 0;
    while (i < len) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
        if (!Py_UNICODE_ISALNUM(ch)) {
            Py_RETURN_FALSE;
        }
        i++;
    }
    Py_RETURN_TRUE;
}
11985
11986
/*[clinic input]
11987
str.isdecimal as unicode_isdecimal
11988
11989
Return True if the string is a decimal string, False otherwise.
11990
11991
A string is a decimal string if all characters in the string are
11992
decimal and there is at least one character in the string.
11993
[clinic start generated code]*/
11994
11995
static PyObject *
unicode_isdecimal_impl(PyObject *self)
/*[clinic end generated code: output=fb2dcdb62d3fc548 input=63b0453c48cad0af]*/
{
    /* Implementation of str.isdecimal(): True when the string is non-empty
       and every character satisfies Py_UNICODE_ISDECIMAL(). */
    Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (length == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
    }

    Py_ssize_t i = 0;
    while (i < length) {
        if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i))) {
            Py_RETURN_FALSE;
        }
        i++;
    }
    Py_RETURN_TRUE;
}
12022
12023
/*[clinic input]
12024
str.isdigit as unicode_isdigit
12025
12026
Return True if the string is a digit string, False otherwise.
12027
12028
A string is a digit string if all characters in the string are
12029
digits and there is at least one character in the string.
12030
[clinic start generated code]*/
12031
12032
static PyObject *
unicode_isdigit_impl(PyObject *self)
/*[clinic end generated code: output=10a6985311da6858 input=353b03747b062e4b]*/
{
    /* Implementation of str.isdigit(): True when the string is non-empty
       and every character satisfies Py_UNICODE_ISDIGIT(). */
    Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (length == 1) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
    }

    Py_ssize_t i = 0;
    while (i < length) {
        if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i))) {
            Py_RETURN_FALSE;
        }
        i++;
    }
    Py_RETURN_TRUE;
}
12060
12061
/*[clinic input]
12062
str.isnumeric as unicode_isnumeric
12063
12064
Return True if the string is a numeric string, False otherwise.
12065
12066
A string is numeric if all characters in the string are numeric and
12067
there is at least one character in the string.
12068
[clinic start generated code]*/
12069
12070
static PyObject *
unicode_isnumeric_impl(PyObject *self)
/*[clinic end generated code: output=9172a32d9013051a input=83b2a072ed7aff48]*/
{
    /* Answers True only when the string is non-empty and every code point
       satisfies Py_UNICODE_ISNUMERIC. */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    switch (nchars) {
    case 0:
        /* Nothing to test: an empty string is never numeric. */
        Py_RETURN_FALSE;
    case 1: {
        /* Single character: the answer is the character test itself. */
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISNUMERIC(only));
    }
    default:
        break;
    }

    for (Py_ssize_t pos = 0; pos < nchars; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISNUMERIC(ch)) {
            Py_RETURN_FALSE;
        }
    }
    Py_RETURN_TRUE;
}
12097
12098
Py_ssize_t
12099
_PyUnicode_ScanIdentifier(PyObject *self)
12100
50.2k
{
12101
50.2k
    Py_ssize_t i;
12102
50.2k
    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12103
50.2k
    if (len == 0) {
12104
        /* an empty string is not a valid identifier */
12105
0
        return 0;
12106
0
    }
12107
12108
50.2k
    int kind = PyUnicode_KIND(self);
12109
50.2k
    const void *data = PyUnicode_DATA(self);
12110
50.2k
    Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12111
    /* PEP 3131 says that the first character must be in
12112
       XID_Start and subsequent characters in XID_Continue,
12113
       and for the ASCII range, the 2.x rules apply (i.e
12114
       start with letters and underscore, continue with
12115
       letters, digits, underscore). However, given the current
12116
       definition of XID_Start and XID_Continue, it is sufficient
12117
       to check just for these, except that _ must be allowed
12118
       as starting an identifier.  */
12119
50.2k
    if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12120
217
        return 0;
12121
217
    }
12122
12123
10.1M
    for (i = 1; i < len; i++) {
12124
10.0M
        ch = PyUnicode_READ(kind, data, i);
12125
10.0M
        if (!_PyUnicode_IsXidContinue(ch)) {
12126
112
            return i;
12127
112
        }
12128
10.0M
    }
12129
49.8k
    return i;
12130
49.9k
}
12131
12132
int
12133
PyUnicode_IsIdentifier(PyObject *self)
12134
363
{
12135
363
    Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
12136
363
    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12137
    /* an empty string is not a valid identifier */
12138
363
    return len && i == len;
12139
363
}
12140
12141
/*[clinic input]
12142
@permit_long_summary
12143
str.isidentifier as unicode_isidentifier
12144
12145
Return True if the string is a valid Python identifier, False otherwise.
12146
12147
Call keyword.iskeyword(s) to test whether string s is a reserved
12148
identifier, such as "def" or "class".
12149
[clinic start generated code]*/
12150
12151
static PyObject *
unicode_isidentifier_impl(PyObject *self)
/*[clinic end generated code: output=fe585a9666572905 input=cabde62c20a3be6b]*/
{
    /* Turn the C-level truth value into a Python bool. */
    if (PyUnicode_IsIdentifier(self)) {
        Py_RETURN_TRUE;
    }
    Py_RETURN_FALSE;
}
12157
12158
/*[clinic input]
12159
@permit_long_summary
12160
str.isprintable as unicode_isprintable
12161
12162
Return True if all characters in the string are printable, False otherwise.
12163
12164
A character is printable if repr() may use it in its output.
12165
[clinic start generated code]*/
12166
12167
static PyObject *
unicode_isprintable_impl(PyObject *self)
/*[clinic end generated code: output=3ab9626cd32dd1a0 input=18345ba847084ec5]*/
{
    /* True when no character fails Py_UNICODE_ISPRINTABLE; unlike the
       other predicates there is no empty-string special case, so an empty
       string falls through the loop and yields True. */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Shortcut for single character strings */
    if (nchars == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISPRINTABLE(only));
    }

    Py_ssize_t pos = 0;
    while (pos < nchars) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISPRINTABLE(ch)) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
12191
12192
/*[clinic input]
12193
str.join as unicode_join
12194
12195
    iterable: object
12196
    /
12197
12198
Concatenate any number of strings.
12199
12200
The string whose method is called is inserted in between each given
12201
string.  The result is returned as a new string.
12202
12203
Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12204
[clinic start generated code]*/
12205
12206
static PyObject *
unicode_join(PyObject *self, PyObject *iterable)
/*[clinic end generated code: output=6857e7cecfe7bf98 input=fd330a11ee845fb2]*/
{
    /* Method entry point: all of the work is delegated to
       PyUnicode_Join, with self acting as the separator. */
    PyObject *joined = PyUnicode_Join(self, iterable);
    return joined;
}
12212
12213
static Py_ssize_t
12214
unicode_length(PyObject *self)
12215
278k
{
12216
278k
    return PyUnicode_GET_LENGTH(self);
12217
278k
}
12218
12219
/*[clinic input]
12220
str.ljust as unicode_ljust
12221
12222
    width: Py_ssize_t
12223
    fillchar: Py_UCS4 = ' '
12224
    /
12225
12226
Return a left-justified string of length width.
12227
12228
Padding is done using the specified fill character (default is
12229
a space).
12230
[clinic start generated code]*/
12231
12232
static PyObject *
12233
unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12234
/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=8a55f06694c20ed6]*/
12235
0
{
12236
0
    if (PyUnicode_GET_LENGTH(self) >= width)
12237
0
        return unicode_result_unchanged(self);
12238
12239
0
    return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
12240
0
}
12241
12242
/*[clinic input]
12243
str.lower as unicode_lower
12244
12245
Return a copy of the string converted to lowercase.
12246
[clinic start generated code]*/
12247
12248
static PyObject *
unicode_lower_impl(PyObject *self)
/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
{
    /* Non-ASCII strings go through the general case_operation machinery
       with do_lower; ASCII-only strings take the dedicated
       ascii_upper_or_lower helper (called with 1 as its second argument). */
    if (!PyUnicode_IS_ASCII(self)) {
        return case_operation(self, do_lower);
    }
    return ascii_upper_or_lower(self, 1);
}
12256
12257
22.4k
/* Strip modes understood by the strip helpers below.  The values double
   as indices into stripfuncnames[], so the two must stay in sync. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names indexed by the strip modes above.  Both the strings and
   the pointer table itself are constant, so the array can never be
   modified at run time. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Name of the str method implementing strip mode `i` (one of the three
   constants above); no bounds checking is performed. */
#define STRIPNAME(i) (stripfuncnames[i])
12265
12266
/* externally visible for str.strip(unicode) */
12267
/* Strip characters found in `sepobj` from one or both ends of `self`.
   `striptype` is LEFTSTRIP, RIGHTSTRIP or BOTHSTRIP.  The result is
   produced by PyUnicode_Substring() on the surviving range. */
PyObject *
_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
{
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);
    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
    const Py_ssize_t seplen = PyUnicode_GET_LENGTH(sepobj);
    const BLOOM_MASK sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
                                               PyUnicode_DATA(sepobj),
                                               seplen);

    /* `left` is the index of the first character that is kept. */
    Py_ssize_t left = 0;
    if (striptype != RIGHTSTRIP) {
        for (; left < len; left++) {
            const Py_UCS4 ch = PyUnicode_READ(kind, data, left);
            /* The mask test runs first; the exact search in sepobj is
               only performed for characters that pass it. */
            if (!BLOOM(sepmask, ch)
                || PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
            {
                break;
            }
        }
    }

    /* `right` is one past the index of the last character that is kept. */
    Py_ssize_t right = len;
    if (striptype != LEFTSTRIP) {
        while (right > left) {
            const Py_UCS4 ch = PyUnicode_READ(kind, data, right - 1);
            if (!BLOOM(sepmask, ch)
                || PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
            {
                break;
            }
            right--;
        }
    }

    return PyUnicode_Substring(self, left, right);
}
12313
12314
/* Slice an exact str with the given start/stop objects.  The objects
   are converted to indices by _PyEval_UnpackIndices(); when that fails
   NULL is returned, otherwise the substring [istart, istop) is. */
PyObject*
_PyUnicode_BinarySlice(PyObject *container, PyObject *start_o, PyObject *stop_o)
{
    assert(PyUnicode_CheckExact(container));

    Py_ssize_t istart, istop;
    const Py_ssize_t len = PyUnicode_GET_LENGTH(container);

    if (_PyEval_UnpackIndices(start_o, stop_o, len, &istart, &istop)) {
        return PyUnicode_Substring(container, istart, istop);
    }
    return NULL;
}
12325
12326
PyObject*
12327
PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12328
810k
{
12329
810k
    const unsigned char *data;
12330
810k
    int kind;
12331
810k
    Py_ssize_t length;
12332
12333
810k
    length = PyUnicode_GET_LENGTH(self);
12334
810k
    end = Py_MIN(end, length);
12335
12336
810k
    if (start == 0 && end == length)
12337
32.1k
        return unicode_result_unchanged(self);
12338
12339
778k
    if (start < 0 || end < 0) {
12340
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
12341
0
        return NULL;
12342
0
    }
12343
778k
    if (start >= length || end < start)
12344
3.15k
        _Py_RETURN_UNICODE_EMPTY();
12345
12346
775k
    length = end - start;
12347
775k
    if (PyUnicode_IS_ASCII(self)) {
12348
303k
        data = PyUnicode_1BYTE_DATA(self);
12349
303k
        return _PyUnicode_FromASCII((const char*)(data + start), length);
12350
303k
    }
12351
471k
    else {
12352
471k
        kind = PyUnicode_KIND(self);
12353
471k
        data = PyUnicode_1BYTE_DATA(self);
12354
471k
        return PyUnicode_FromKindAndData(kind,
12355
471k
                                         data + kind * start,
12356
471k
                                         length);
12357
471k
    }
12358
775k
}
12359
12360
/* Strip whitespace from one or both ends of `self`, depending on
   `striptype` (LEFTSTRIP, RIGHTSTRIP or BOTHSTRIP).  ASCII strings are
   scanned with the _Py_ascii_whitespace table, all others with
   Py_UNICODE_ISSPACE. */
static PyObject *
do_strip(PyObject *self, int striptype)
{
    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
    const int trim_left = (striptype != RIGHTSTRIP);
    const int trim_right = (striptype != LEFTSTRIP);

    Py_ssize_t first = 0;    /* index of the first kept character */
    Py_ssize_t stop = len;   /* one past the last kept character */

    if (PyUnicode_IS_ASCII(self)) {
        const Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(self);

        if (trim_left) {
            for (; first < len; first++) {
                const Py_UCS1 ch = bytes[first];
                if (!_Py_ascii_whitespace[ch]) {
                    break;
                }
            }
        }
        if (trim_right) {
            for (; stop > first; stop--) {
                const Py_UCS1 ch = bytes[stop - 1];
                if (!_Py_ascii_whitespace[ch]) {
                    break;
                }
            }
        }
    }
    else {
        const int kind = PyUnicode_KIND(self);
        const void *data = PyUnicode_DATA(self);

        if (trim_left) {
            for (; first < len; first++) {
                const Py_UCS4 ch = PyUnicode_READ(kind, data, first);
                if (!Py_UNICODE_ISSPACE(ch)) {
                    break;
                }
            }
        }
        if (trim_right) {
            for (; stop > first; stop--) {
                const Py_UCS4 ch = PyUnicode_READ(kind, data, stop - 1);
                if (!Py_UNICODE_ISSPACE(ch)) {
                    break;
                }
            }
        }
    }

    return PyUnicode_Substring(self, first, stop);
}
12421
12422
12423
/* Shared implementation of strip()/lstrip()/rstrip(): None selects
   whitespace stripping, a str selects stripping of its characters, and
   anything else raises TypeError naming the calling method. */
static PyObject *
do_argstrip(PyObject *self, int striptype, PyObject *sep)
{
    if (sep == Py_None) {
        return do_strip(self, striptype);
    }

    if (!PyUnicode_Check(sep)) {
        PyErr_Format(PyExc_TypeError,
                     "%s arg must be None or str",
                     STRIPNAME(striptype));
        return NULL;
    }

    return _PyUnicode_XStrip(self, striptype, sep);
}
12439
12440
12441
/*[clinic input]
12442
@permit_long_summary
12443
str.strip as unicode_strip
12444
12445
    chars: object = None
12446
    /
12447
12448
Return a copy of the string with leading and trailing whitespace removed.
12449
12450
If chars is given and not None, remove characters in chars instead.
12451
[clinic start generated code]*/
12452
12453
static PyObject *
unicode_strip_impl(PyObject *self, PyObject *chars)
/*[clinic end generated code: output=ca19018454345d57 input=8bc6353450345fbd]*/
{
    /* strip() trims both ends. */
    PyObject *stripped = do_argstrip(self, BOTHSTRIP, chars);
    return stripped;
}
12459
12460
12461
/*[clinic input]
12462
str.lstrip as unicode_lstrip
12463
12464
    chars: object = None
12465
    /
12466
12467
Return a copy of the string with leading whitespace removed.
12468
12469
If chars is given and not None, remove characters in chars instead.
12470
[clinic start generated code]*/
12471
12472
static PyObject *
unicode_lstrip_impl(PyObject *self, PyObject *chars)
/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
{
    /* lstrip() trims the leading end only. */
    PyObject *stripped = do_argstrip(self, LEFTSTRIP, chars);
    return stripped;
}
12478
12479
12480
/*[clinic input]
12481
str.rstrip as unicode_rstrip
12482
12483
    chars: object = None
12484
    /
12485
12486
Return a copy of the string with trailing whitespace removed.
12487
12488
If chars is given and not None, remove characters in chars instead.
12489
[clinic start generated code]*/
12490
12491
static PyObject *
unicode_rstrip_impl(PyObject *self, PyObject *chars)
/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
{
    /* rstrip() trims the trailing end only. */
    PyObject *stripped = do_argstrip(self, RIGHTSTRIP, chars);
    return stripped;
}
12497
12498
12499
PyObject *
12500
_PyUnicode_Repeat(PyObject *str, Py_ssize_t len)
12501
965
{
12502
965
    PyObject *u;
12503
965
    Py_ssize_t nchars, n;
12504
12505
965
    if (len < 1)
12506
27
        _Py_RETURN_UNICODE_EMPTY();
12507
12508
    /* no repeat, return original string */
12509
938
    if (len == 1)
12510
194
        return unicode_result_unchanged(str);
12511
12512
744
    if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
12513
0
        PyErr_SetString(PyExc_OverflowError,
12514
0
                        "repeated string is too long");
12515
0
        return NULL;
12516
0
    }
12517
744
    nchars = len * PyUnicode_GET_LENGTH(str);
12518
12519
744
    u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
12520
744
    if (!u)
12521
0
        return NULL;
12522
744
    assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
12523
12524
744
    if (PyUnicode_GET_LENGTH(str) == 1) {
12525
306
        int kind = PyUnicode_KIND(str);
12526
306
        Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
12527
306
        if (kind == PyUnicode_1BYTE_KIND) {
12528
300
            void *to = PyUnicode_DATA(u);
12529
300
            memset(to, (unsigned char)fill_char, len);
12530
300
        }
12531
6
        else if (kind == PyUnicode_2BYTE_KIND) {
12532
2
            Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
12533
16
            for (n = 0; n < len; ++n)
12534
14
                ucs2[n] = fill_char;
12535
4
        } else {
12536
4
            Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12537
4
            assert(kind == PyUnicode_4BYTE_KIND);
12538
571
            for (n = 0; n < len; ++n)
12539
567
                ucs4[n] = fill_char;
12540
4
        }
12541
306
    }
12542
438
    else {
12543
438
        Py_ssize_t char_size = PyUnicode_KIND(str);
12544
438
        char *to = (char *) PyUnicode_DATA(u);
12545
438
        _PyBytes_RepeatBuffer(to, nchars * char_size, PyUnicode_DATA(str),
12546
438
            PyUnicode_GET_LENGTH(str) * char_size);
12547
438
    }
12548
12549
744
    assert(_PyUnicode_CheckConsistency(u, 1));
12550
744
    return u;
12551
744
}
12552
12553
PyObject *
12554
PyUnicode_Replace(PyObject *str,
12555
                  PyObject *substr,
12556
                  PyObject *replstr,
12557
                  Py_ssize_t maxcount)
12558
4.00k
{
12559
4.00k
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12560
4.00k
            ensure_unicode(replstr) < 0)
12561
0
        return NULL;
12562
4.00k
    return replace(str, substr, replstr, maxcount);
12563
4.00k
}
12564
12565
/*[clinic input]
12566
str.replace as unicode_replace
12567
12568
    old: unicode
12569
    new: unicode
12570
    /
12571
    count: Py_ssize_t = -1
12572
        Maximum number of occurrences to replace.
12573
        -1 (the default value) means replace all occurrences.
12574
12575
Return a copy with all occurrences of substring old replaced by new.
12576
12577
If count is given, only the first count occurrences are replaced.
12578
If count is not specified or -1, then all occurrences are replaced.
12579
[clinic start generated code]*/
12580
12581
static PyObject *
12582
unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12583
                     Py_ssize_t count)
12584
/*[clinic end generated code: output=b63f1a8b5eebf448 input=d15a6886b05e2edc]*/
12585
18.1k
{
12586
18.1k
    return replace(self, old, new, count);
12587
18.1k
}
12588
12589
/*[clinic input]
12590
str.removeprefix as unicode_removeprefix
12591
12592
    prefix: unicode
12593
    /
12594
12595
Return a str with the given prefix string removed if present.
12596
12597
If the string starts with the prefix string, return
12598
string[len(prefix):].  Otherwise, return a copy of the original
12599
string.
12600
[clinic start generated code]*/
12601
12602
static PyObject *
unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
/*[clinic end generated code: output=f1e5945e9763bcb9 input=90d162724944bfa7]*/
{
    /* tailmatch() is asked about the start of the string (direction -1);
       a result of -1 is treated as failure. */
    const int matched = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);

    if (matched == -1) {
        return NULL;
    }
    if (matched == 0) {
        return unicode_result_unchanged(self);
    }

    /* Keep everything after the prefix. */
    const Py_ssize_t from = PyUnicode_GET_LENGTH(prefix);
    const Py_ssize_t to = PyUnicode_GET_LENGTH(self);
    return PyUnicode_Substring(self, from, to);
}
12616
12617
/*[clinic input]
12618
str.removesuffix as unicode_removesuffix
12619
12620
    suffix: unicode
12621
    /
12622
12623
Return a str with the given suffix string removed if present.
12624
12625
If the string ends with the suffix string and that suffix is not
12626
empty, return string[:-len(suffix)].  Otherwise, return a copy of
12627
the original string.
12628
[clinic start generated code]*/
12629
12630
static PyObject *
unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
/*[clinic end generated code: output=d36629e227636822 input=6efc96152d4bfcd5]*/
{
    /* tailmatch() is asked about the end of the string (direction +1);
       a result of -1 is treated as failure. */
    const int matched = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);

    if (matched == -1) {
        return NULL;
    }
    if (matched == 0) {
        return unicode_result_unchanged(self);
    }

    /* Keep everything before the suffix. */
    const Py_ssize_t keep = PyUnicode_GET_LENGTH(self)
                            - PyUnicode_GET_LENGTH(suffix);
    return PyUnicode_Substring(self, 0, keep);
}
12644
12645
/* Build the repr() of a string.

   A first pass over the input computes the output length, counts single
   and double quotes, and tracks the largest character that will be
   copied through unescaped.  The output is then allocated at its exact
   size and filled either by a plain copy between two quote characters
   (when nothing needs escaping) or by the kind-specific *_repr helper.

   Returns a new string, or NULL with an exception set: OverflowError
   when the repr length does not fit in Py_ssize_t, or whatever
   PyUnicode_New() reports when allocation fails. */
static PyObject *
unicode_repr(PyObject *unicode)
{
    Py_ssize_t isize = PyUnicode_GET_LENGTH(unicode);
    const void *idata = PyUnicode_DATA(unicode);

    /* Compute length of output, quote characters, and
       maximum character */
    Py_ssize_t osize = 0;
    Py_UCS4 maxch = 127;
    Py_ssize_t squote = 0;
    Py_ssize_t dquote = 0;
    int ikind = PyUnicode_KIND(unicode);
    for (Py_ssize_t i = 0; i < isize; i++) {
        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
        Py_ssize_t incr = 1;
        switch (ch) {
        case '\'': squote++; break;
        case '"':  dquote++; break;
        case '\\': case '\t': case '\r': case '\n':
            incr = 2;
            break;
        default:
            /* Fast-path ASCII */
            if (ch < ' ' || ch == 0x7f)
                incr = 4; /* \xHH */
            else if (ch < 0x7f)
                ;
            else if (Py_UNICODE_ISPRINTABLE(ch))
                maxch = (ch > maxch) ? ch : maxch;
            else if (ch < 0x100)
                incr = 4; /* \xHH */
            else if (ch < 0x10000)
                incr = 6; /* \uHHHH */
            else
                incr = 10; /* \UHHHHHHHH */
        }
        if (osize > PY_SSIZE_T_MAX - incr) {
            PyErr_SetString(PyExc_OverflowError,
                            "string is too long to generate repr");
            return NULL;
        }
        osize += incr;
    }

    Py_UCS4 quote = '\'';
    int changed = (osize != isize);
    /* Number of quote characters that need a backslash of their own. */
    Py_ssize_t quote_escapes = 0;
    if (squote) {
        changed = 1;
        if (dquote)
            /* Both squote and dquote present. Use squote,
               and escape them */
            quote_escapes = squote;
        else
            quote = '"';
    }

    /* The loop above only guards the per-character increments; the quote
       escapes and the two enclosing quotes are checked here so that the
       additions below can never overflow a signed Py_ssize_t. */
    if (osize > PY_SSIZE_T_MAX - quote_escapes
        || osize + quote_escapes > PY_SSIZE_T_MAX - 2)
    {
        PyErr_SetString(PyExc_OverflowError,
                        "string is too long to generate repr");
        return NULL;
    }
    osize += quote_escapes;
    osize += 2;   /* quotes */

    PyObject *repr = PyUnicode_New(osize, maxch);
    if (repr == NULL)
        return NULL;
    int okind = PyUnicode_KIND(repr);
    void *odata = PyUnicode_DATA(repr);

    if (!changed) {
        /* Nothing to escape: quote, verbatim copy, quote. */
        PyUnicode_WRITE(okind, odata, 0, quote);

        _PyUnicode_FastCopyCharacters(repr, 1,
                                      unicode, 0,
                                      isize);

        PyUnicode_WRITE(okind, odata, osize-1, quote);
    }
    else {
        /* Dispatch on the output kind; the helpers are defined elsewhere
           and are handed the input string, the chosen quote and the
           output buffer. */
        switch (okind) {
        case PyUnicode_1BYTE_KIND:
            ucs1lib_repr(unicode, quote, odata);
            break;
        case PyUnicode_2BYTE_KIND:
            ucs2lib_repr(unicode, quote, odata);
            break;
        default:
            assert(okind == PyUnicode_4BYTE_KIND);
            ucs4lib_repr(unicode, quote, odata);
        }
    }

    assert(_PyUnicode_CheckConsistency(repr, 1));
    return repr;
}
12735
12736
/*[clinic input]
12737
@permit_long_summary
12738
str.rfind as unicode_rfind = str.count
12739
12740
Return the highest index in S where substring sub is found, such that sub is contained within S[start:end].
12741
12742
Optional arguments start and end are interpreted as in slice
12743
notation.  Return -1 on failure.
12744
[clinic start generated code]*/
12745
12746
static Py_ssize_t
12747
unicode_rfind_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
12748
                   Py_ssize_t end)
12749
/*[clinic end generated code: output=880b29f01dd014c8 input=2e67789533baf2f5]*/
12750
6.89k
{
12751
6.89k
    Py_ssize_t result = any_find_slice(str, substr, start, end, -1);
12752
6.89k
    if (result < 0) {
12753
5.86k
        return -1;
12754
5.86k
    }
12755
1.02k
    return result;
12756
6.89k
}
12757
12758
/*[clinic input]
12759
@permit_long_summary
12760
str.rindex as unicode_rindex = str.count
12761
12762
Return the highest index in S where substring sub is found, such that sub is contained within S[start:end].
12763
12764
Optional arguments start and end are interpreted as in slice
12765
notation.  Raises ValueError when the substring is not found.
12766
[clinic start generated code]*/
12767
12768
static Py_ssize_t
12769
unicode_rindex_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
12770
                    Py_ssize_t end)
12771
/*[clinic end generated code: output=5f3aef124c867fe1 input=e29d446c8234c9d9]*/
12772
0
{
12773
0
    Py_ssize_t result = any_find_slice(str, substr, start, end, -1);
12774
0
    if (result == -1) {
12775
0
        PyErr_SetString(PyExc_ValueError, "substring not found");
12776
0
    }
12777
0
    else if (result < 0) {
12778
0
        return -1;
12779
0
    }
12780
0
    return result;
12781
0
}
12782
12783
/*[clinic input]
12784
str.rjust as unicode_rjust
12785
12786
    width: Py_ssize_t
12787
    fillchar: Py_UCS4 = ' '
12788
    /
12789
12790
Return a right-justified string of length width.
12791
12792
Padding is done using the specified fill character (default is
12793
a space).
12794
[clinic start generated code]*/
12795
12796
static PyObject *
12797
unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12798
/*[clinic end generated code: output=804a1a57fbe8d5cf input=1256a8d659589907]*/
12799
0
{
12800
0
    if (PyUnicode_GET_LENGTH(self) >= width)
12801
0
        return unicode_result_unchanged(self);
12802
12803
0
    return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
12804
0
}
12805
12806
PyObject *
12807
PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12808
0
{
12809
0
    if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
12810
0
        return NULL;
12811
12812
0
    return split(s, sep, maxsplit);
12813
0
}
12814
12815
/*[clinic input]
12816
@permit_long_summary
12817
str.split as unicode_split
12818
12819
    sep: object = None
12820
        The separator used to split the string.
12821
12822
        When set to None (the default value), will split on any
12823
        whitespace character (including \n \r \t \f and spaces) and
12824
        will discard empty strings from the result.
12825
    maxsplit: Py_ssize_t = -1
12826
        Maximum number of splits.
12827
        -1 (the default value) means no limit.
12828
12829
Return a list of the substrings in the string, using sep as the separator string.
12830
12831
Splitting starts at the front of the string and works to the end.
12832
12833
Note, str.split() is mainly useful for data that has been
12834
intentionally delimited.  With natural text that includes
12835
punctuation, consider using the regular expression module.
12836
12837
[clinic start generated code]*/
12838
12839
static PyObject *
12840
unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12841
/*[clinic end generated code: output=3a65b1db356948dc input=288cfd6bc8828f5a]*/
12842
1.97k
{
12843
1.97k
    if (sep == Py_None)
12844
5
        return split(self, NULL, maxsplit);
12845
1.96k
    if (PyUnicode_Check(sep))
12846
1.96k
        return split(self, sep, maxsplit);
12847
12848
0
    PyErr_Format(PyExc_TypeError,
12849
0
                 "must be str or None, not %.100s",
12850
0
                 Py_TYPE(sep)->tp_name);
12851
0
    return NULL;
12852
1.96k
}
12853
12854
/* Partition `str_obj` around `sep_obj`, dispatching to the *_partition
   helper that matches the string's kind.

   When the separator has a wider kind or is longer than the string it
   cannot occur in it, so (str_obj, "", "") is returned without
   searching.  A separator of narrower kind is first converted to the
   string's kind in a temporary buffer, which is freed before returning. */
PyObject *
PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
{
    if (ensure_unicode(str_obj) < 0) {
        return NULL;
    }
    if (ensure_unicode(sep_obj) < 0) {
        return NULL;
    }

    const int str_kind = PyUnicode_KIND(str_obj);
    const int sep_kind = PyUnicode_KIND(sep_obj);
    const Py_ssize_t str_len = PyUnicode_GET_LENGTH(str_obj);
    const Py_ssize_t sep_len = PyUnicode_GET_LENGTH(sep_obj);

    if (str_kind < sep_kind || str_len < sep_len) {
        PyObject *empty = _PyUnicode_GetEmpty();  // Borrowed reference
        return PyTuple_Pack(3, str_obj, empty, empty);
    }

    const void *str_buf = PyUnicode_DATA(str_obj);
    const void *sep_buf = PyUnicode_DATA(sep_obj);
    /* True when sep_buf points at a temporary that must be freed. */
    const int converted = (sep_kind != str_kind);
    if (converted) {
        sep_buf = unicode_askind(sep_kind, sep_buf, sep_len, str_kind);
        if (sep_buf == NULL) {
            return NULL;
        }
    }

    PyObject *out;
    switch (str_kind) {
    case PyUnicode_1BYTE_KIND:
        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) {
            out = asciilib_partition(str_obj, str_buf, str_len,
                                     sep_obj, sep_buf, sep_len);
        }
        else {
            out = ucs1lib_partition(str_obj, str_buf, str_len,
                                    sep_obj, sep_buf, sep_len);
        }
        break;
    case PyUnicode_2BYTE_KIND:
        out = ucs2lib_partition(str_obj, str_buf, str_len,
                                sep_obj, sep_buf, sep_len);
        break;
    case PyUnicode_4BYTE_KIND:
        out = ucs4lib_partition(str_obj, str_buf, str_len,
                                sep_obj, sep_buf, sep_len);
        break;
    default:
        Py_UNREACHABLE();
    }

    assert(!converted == (sep_buf == PyUnicode_DATA(sep_obj)));
    if (converted) {
        PyMem_Free((void *)sep_buf);
    }

    return out;
}
12904
12905
12906
/* Partition `str_obj` around `sep_obj` using the *_rpartition helper
   that matches the string's kind.

   When the separator has a wider kind or is longer than the string it
   cannot occur in it, so ("", "", str_obj) is returned without
   searching.  A separator of narrower kind is first converted to the
   string's kind in a temporary buffer, which is freed before returning. */
PyObject *
PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
{
    if (ensure_unicode(str_obj) < 0) {
        return NULL;
    }
    if (ensure_unicode(sep_obj) < 0) {
        return NULL;
    }

    const int str_kind = PyUnicode_KIND(str_obj);
    const int sep_kind = PyUnicode_KIND(sep_obj);
    const Py_ssize_t str_len = PyUnicode_GET_LENGTH(str_obj);
    const Py_ssize_t sep_len = PyUnicode_GET_LENGTH(sep_obj);

    if (str_kind < sep_kind || str_len < sep_len) {
        PyObject *empty = _PyUnicode_GetEmpty();  // Borrowed reference
        return PyTuple_Pack(3, empty, empty, str_obj);
    }

    const void *str_buf = PyUnicode_DATA(str_obj);
    const void *sep_buf = PyUnicode_DATA(sep_obj);
    /* True when sep_buf points at a temporary that must be freed. */
    const int converted = (sep_kind != str_kind);
    if (converted) {
        sep_buf = unicode_askind(sep_kind, sep_buf, sep_len, str_kind);
        if (sep_buf == NULL) {
            return NULL;
        }
    }

    PyObject *out;
    switch (str_kind) {
    case PyUnicode_1BYTE_KIND:
        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) {
            out = asciilib_rpartition(str_obj, str_buf, str_len,
                                      sep_obj, sep_buf, sep_len);
        }
        else {
            out = ucs1lib_rpartition(str_obj, str_buf, str_len,
                                     sep_obj, sep_buf, sep_len);
        }
        break;
    case PyUnicode_2BYTE_KIND:
        out = ucs2lib_rpartition(str_obj, str_buf, str_len,
                                 sep_obj, sep_buf, sep_len);
        break;
    case PyUnicode_4BYTE_KIND:
        out = ucs4lib_rpartition(str_obj, str_buf, str_len,
                                 sep_obj, sep_buf, sep_len);
        break;
    default:
        Py_UNREACHABLE();
    }

    assert(!converted == (sep_buf == PyUnicode_DATA(sep_obj)));
    if (converted) {
        PyMem_Free((void *)sep_buf);
    }

    return out;
}
12956
12957
/*[clinic input]
12958
str.partition as unicode_partition
12959
12960
    sep: object
12961
    /
12962
12963
Partition the string into three parts using the given separator.
12964
12965
This will search for the separator in the string.  If the separator
12966
is found, returns a 3-tuple containing the part before the
12967
separator, the separator itself, and the part after it.
12968
12969
If the separator is not found, returns a 3-tuple containing
12970
the original string and two empty strings.
12971
[clinic start generated code]*/
12972
12973
static PyObject *
unicode_partition(PyObject *self, PyObject *sep)
/*[clinic end generated code: output=e4ced7bd253ca3c4 input=e45faa8c26270cb1]*/
{
    /* str.partition(sep): the method is a direct forward to the public
       C API entry point; its result (or NULL on error) is returned as is. */
    PyObject *parts = PyUnicode_Partition(self, sep);
    return parts;
}
12979
12980
/*[clinic input]
12981
str.rpartition as unicode_rpartition = str.partition
12982
12983
Partition the string into three parts using the given separator.
12984
12985
This will search for the separator in the string, starting at the
12986
end.  If the separator is found, returns a 3-tuple containing the
12987
part before the separator, the separator itself, and the part after
12988
it.
12989
12990
If the separator is not found, returns a 3-tuple containing two
12991
empty strings and the original string.
12992
[clinic start generated code]*/
12993
12994
static PyObject *
unicode_rpartition(PyObject *self, PyObject *sep)
/*[clinic end generated code: output=1aa13cf1156572aa input=53a7f8cb19975b7c]*/
{
    /* str.rpartition(sep): the method is a direct forward to the public
       C API entry point; its result (or NULL on error) is returned as is. */
    PyObject *parts = PyUnicode_RPartition(self, sep);
    return parts;
}
13000
13001
PyObject *
13002
PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
13003
0
{
13004
0
    if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
13005
0
        return NULL;
13006
13007
0
    return rsplit(s, sep, maxsplit);
13008
0
}
13009
13010
/*[clinic input]
13011
@permit_long_summary
13012
str.rsplit as unicode_rsplit = str.split
13013
13014
Return a list of the substrings in the string, using sep as the separator string.
13015
13016
Splitting starts at the end of the string and works to the front.
13017
[clinic start generated code]*/
13018
13019
static PyObject *
13020
unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13021
/*[clinic end generated code: output=c2b815c63bcabffc input=0f762e30d267fa83]*/
13022
0
{
13023
0
    if (sep == Py_None)
13024
0
        return rsplit(self, NULL, maxsplit);
13025
0
    if (PyUnicode_Check(sep))
13026
0
        return rsplit(self, sep, maxsplit);
13027
13028
0
    PyErr_Format(PyExc_TypeError,
13029
0
                 "must be str or None, not %.100s",
13030
0
                 Py_TYPE(sep)->tp_name);
13031
0
    return NULL;
13032
0
}
13033
13034
/*[clinic input]
13035
@permit_long_summary
13036
str.splitlines as unicode_splitlines
13037
13038
    keepends: bool = False
13039
13040
Return a list of the lines in the string, breaking at line boundaries.
13041
13042
Line breaks are not included in the resulting list unless keepends
13043
is given and true.
13044
[clinic start generated code]*/
13045
13046
static PyObject *
unicode_splitlines_impl(PyObject *self, int keepends)
/*[clinic end generated code: output=f664dcdad153ec40 input=b45ea0f87645a06d]*/
{
    /* str.splitlines(keepends): forwarded unchanged to the public C API. */
    PyObject *lines = PyUnicode_Splitlines(self, keepends);
    return lines;
}
13052
13053
static
13054
PyObject *unicode_str(PyObject *self)
13055
0
{
13056
0
    return unicode_result_unchanged(self);
13057
0
}
13058
13059
/*[clinic input]
13060
@permit_long_summary
13061
str.swapcase as unicode_swapcase
13062
13063
Convert uppercase characters to lowercase and lowercase characters to uppercase.
13064
[clinic start generated code]*/
13065
13066
static PyObject *
unicode_swapcase_impl(PyObject *self)
/*[clinic end generated code: output=5d28966bf6d7b2af input=85bc39a9b4e8ee91]*/
{
    /* str.swapcase(): run the generic case_operation() driver with the
       do_swapcase callback. */
    PyObject *swapped = case_operation(self, do_swapcase);
    return swapped;
}
13072
13073
static int
13074
unicode_maketrans_from_dict(PyObject *x, PyObject *newdict)
13075
0
{
13076
0
    PyObject *key, *value;
13077
0
    Py_ssize_t i = 0;
13078
0
    int res;
13079
0
    while (PyDict_Next(x, &i, &key, &value)) {
13080
0
        if (PyUnicode_Check(key)) {
13081
0
            PyObject *newkey;
13082
0
            int kind;
13083
0
            const void *data;
13084
0
            if (PyUnicode_GET_LENGTH(key) != 1) {
13085
0
                PyErr_SetString(PyExc_ValueError, "string keys in translate"
13086
0
                                "table must be of length 1");
13087
0
                return -1;
13088
0
            }
13089
0
            kind = PyUnicode_KIND(key);
13090
0
            data = PyUnicode_DATA(key);
13091
0
            newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
13092
0
            if (!newkey)
13093
0
                return -1;
13094
0
            res = PyDict_SetItem(newdict, newkey, value);
13095
0
            Py_DECREF(newkey);
13096
0
            if (res < 0)
13097
0
                return -1;
13098
0
        }
13099
0
        else if (PyLong_Check(key)) {
13100
0
            if (PyDict_SetItem(newdict, key, value) < 0)
13101
0
                return -1;
13102
0
        }
13103
0
        else {
13104
0
            PyErr_SetString(PyExc_TypeError, "keys in translate table must"
13105
0
                            "be strings or integers");
13106
0
            return -1;
13107
0
        }
13108
0
    }
13109
0
    return 0;
13110
0
}
13111
13112
/*[clinic input]
13113
13114
@staticmethod
13115
str.maketrans as unicode_maketrans
13116
13117
  x: object
13118
13119
  y: unicode=NULL
13120
13121
  z: unicode=NULL
13122
13123
  /
13124
13125
Return a translation table usable for str.translate().
13126
13127
If there is only one argument, it must be a dictionary mapping
13128
Unicode ordinals (integers) or characters to Unicode ordinals,
13129
strings or None.  Character keys will be then converted to ordinals.
13130
If there are two arguments, they must be strings of equal length,
13131
and in the resulting dictionary, each character in x will be mapped
13132
to the character at the same position in y.  If there is a third
13133
argument, it must be a string, whose characters will be mapped to
13134
None in the result.
13135
[clinic start generated code]*/
13136
13137
static PyObject *
unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
/*[clinic end generated code: output=a925c89452bd5881 input=66bc00a1b4258a6e]*/
{
    /* Build the translation table in a fresh dict.  With only `x`, its
       entries are copied by unicode_maketrans_from_dict().  With `y`,
       the characters of `x` are mapped pairwise to those of `y`, and the
       characters of the optional `z` are mapped to None.  Returns the
       new dict, or NULL with an exception set. */
    PyObject *table = PyDict_New();
    if (table == NULL) {
        return NULL;
    }

    if (y == NULL) {
        /* Single-argument form: x must be a dict. */
        if (!PyAnyDict_CheckExact(x)) {
            PyErr_SetString(PyExc_TypeError, "if you give only one argument "
                            "to maketrans it must be a dict");
            goto err;
        }
        /* Copy the entries while holding the critical section on x. */
        int status;
        Py_BEGIN_CRITICAL_SECTION(x);
        status = unicode_maketrans_from_dict(x, table);
        Py_END_CRITICAL_SECTION();
        if (status < 0) {
            goto err;
        }
    }
    else {
        /* Two- or three-argument form: x must be a str as long as y. */
        if (!PyUnicode_Check(x)) {
            PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
                            "be a string if there is a second argument");
            goto err;
        }
        const Py_ssize_t npairs = PyUnicode_GET_LENGTH(x);
        if (npairs != PyUnicode_GET_LENGTH(y)) {
            PyErr_SetString(PyExc_ValueError, "the first two maketrans "
                            "arguments must have equal length");
            goto err;
        }

        /* x[pos] -> y[pos], both stored as int ordinals. */
        const int x_kind = PyUnicode_KIND(x);
        const int y_kind = PyUnicode_KIND(y);
        const void *x_data = PyUnicode_DATA(x);
        const void *y_data = PyUnicode_DATA(y);
        for (Py_ssize_t pos = 0; pos < npairs; pos++) {
            PyObject *key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, pos));
            if (key == NULL) {
                goto err;
            }
            PyObject *value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, pos));
            if (value == NULL) {
                Py_DECREF(key);
                goto err;
            }
            int res = PyDict_SetItem(table, key, value);
            Py_DECREF(key);
            Py_DECREF(value);
            if (res < 0) {
                goto err;
            }
        }

        /* Every character of z is mapped to None. */
        if (z != NULL) {
            const int z_kind = PyUnicode_KIND(z);
            const void *z_data = PyUnicode_DATA(z);
            const Py_ssize_t ndeleted = PyUnicode_GET_LENGTH(z);
            for (Py_ssize_t pos = 0; pos < ndeleted; pos++) {
                PyObject *key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, pos));
                if (key == NULL) {
                    goto err;
                }
                int res = PyDict_SetItem(table, key, Py_None);
                Py_DECREF(key);
                if (res < 0) {
                    goto err;
                }
            }
        }
    }
    return table;

  err:
    Py_DECREF(table);
    return NULL;
}
13217
13218
/*[clinic input]
13219
@permit_long_summary
13220
str.translate as unicode_translate
13221
13222
    table: object
13223
        Translation table, which must be a mapping of Unicode ordinals
13224
        to Unicode ordinals, strings, or None.
13225
    /
13226
13227
Replace each character in the string using the given translation table.
13228
13229
The table must implement lookup/indexing via __getitem__, for
13230
instance a dictionary or list.  If this operation raises
13231
LookupError, the character is left untouched.  Characters mapped to
13232
None are deleted.
13233
[clinic start generated code]*/
13234
13235
static PyObject *
unicode_translate(PyObject *self, PyObject *table)
/*[clinic end generated code: output=3cb448ff2fd96bf3 input=48cf0efe06bc1b75]*/
{
    /* str.translate(table): delegate to the charmap translator, always
       with the "ignore" error handler. */
    PyObject *translated = _PyUnicode_TranslateCharmap(self, table, "ignore");
    return translated;
}
13241
13242
/*[clinic input]
13243
str.upper as unicode_upper
13244
13245
Return a copy of the string converted to uppercase.
13246
[clinic start generated code]*/
13247
13248
static PyObject *
unicode_upper_impl(PyObject *self)
/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
{
    /* Non-ASCII strings go through the generic case_operation() driver
       with the do_upper callback. */
    if (!PyUnicode_IS_ASCII(self)) {
        return case_operation(self, do_upper);
    }
    /* ASCII-only strings use the dedicated helper; 0 is the argument
       value this (upper-casing) caller passes as its second parameter. */
    return ascii_upper_or_lower(self, 0);
}
13256
13257
/*[clinic input]
13258
@permit_long_summary
13259
str.zfill as unicode_zfill
13260
13261
    width: Py_ssize_t
13262
    /
13263
13264
Pad a numeric string with zeros on the left, to fill a field of the given width.
13265
13266
The string is never truncated.
13267
[clinic start generated code]*/
13268
13269
static PyObject *
13270
unicode_zfill_impl(PyObject *self, Py_ssize_t width)
13271
/*[clinic end generated code: output=e13fb6bdf8e3b9df input=25a4ee0ea3e58ce0]*/
13272
0
{
13273
0
    Py_ssize_t fill;
13274
0
    PyObject *u;
13275
0
    int kind;
13276
0
    const void *data;
13277
0
    Py_UCS4 chr;
13278
13279
0
    if (PyUnicode_GET_LENGTH(self) >= width)
13280
0
        return unicode_result_unchanged(self);
13281
13282
0
    fill = width - PyUnicode_GET_LENGTH(self);
13283
13284
0
    u = pad(self, fill, 0, '0');
13285
13286
0
    if (u == NULL)
13287
0
        return NULL;
13288
13289
0
    kind = PyUnicode_KIND(u);
13290
0
    data = PyUnicode_DATA(u);
13291
0
    chr = PyUnicode_READ(kind, data, fill);
13292
13293
0
    if (chr == '+' || chr == '-') {
13294
        /* move sign to beginning of string */
13295
0
        PyUnicode_WRITE(kind, data, 0, chr);
13296
0
        PyUnicode_WRITE(kind, data, fill, '0');
13297
0
    }
13298
13299
0
    assert(_PyUnicode_CheckConsistency(u, 1));
13300
0
    return u;
13301
0
}
13302
13303
/*[clinic input]
13304
@permit_long_summary
13305
@text_signature "($self, prefix[, start[, end]], /)"
13306
str.startswith as unicode_startswith
13307
13308
    prefix as subobj: object
13309
        A string or a tuple of strings to try.
13310
    start: slice_index(accept={int, NoneType}, c_default='0') = None
13311
        Optional start position. Default: start of the string.
13312
    end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
13313
        Optional stop position. Default: end of the string.
13314
    /
13315
13316
Return True if the string starts with the specified prefix, False otherwise.
13317
[clinic start generated code]*/
13318
13319
static PyObject *
13320
unicode_startswith_impl(PyObject *self, PyObject *subobj, Py_ssize_t start,
13321
                        Py_ssize_t end)
13322
/*[clinic end generated code: output=4bd7cfd0803051d4 input=766bdbd33df251dc]*/
13323
54.7k
{
13324
54.7k
    if (PyTuple_Check(subobj)) {
13325
84
        Py_ssize_t i;
13326
588
        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13327
504
            PyObject *substring = PyTuple_GET_ITEM(subobj, i);
13328
504
            if (!PyUnicode_Check(substring)) {
13329
0
                PyErr_Format(PyExc_TypeError,
13330
0
                             "tuple for startswith must only contain str, "
13331
0
                             "not %.100s",
13332
0
                             Py_TYPE(substring)->tp_name);
13333
0
                return NULL;
13334
0
            }
13335
504
            int result = tailmatch(self, substring, start, end, -1);
13336
504
            if (result < 0) {
13337
0
                return NULL;
13338
0
            }
13339
504
            if (result) {
13340
0
                Py_RETURN_TRUE;
13341
0
            }
13342
504
        }
13343
        /* nothing matched */
13344
84
        Py_RETURN_FALSE;
13345
84
    }
13346
54.6k
    if (!PyUnicode_Check(subobj)) {
13347
0
        PyErr_Format(PyExc_TypeError,
13348
0
                     "startswith first arg must be str or "
13349
0
                     "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13350
0
        return NULL;
13351
0
    }
13352
54.6k
    int result = tailmatch(self, subobj, start, end, -1);
13353
54.6k
    if (result < 0) {
13354
0
        return NULL;
13355
0
    }
13356
54.6k
    return PyBool_FromLong(result);
13357
54.6k
}
13358
13359
13360
/*[clinic input]
13361
@permit_long_summary
13362
@text_signature "($self, suffix[, start[, end]], /)"
13363
str.endswith as unicode_endswith
13364
13365
    suffix as subobj: object
13366
        A string or a tuple of strings to try.
13367
    start: slice_index(accept={int, NoneType}, c_default='0') = None
13368
        Optional start position. Default: start of the string.
13369
    end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
13370
        Optional stop position. Default: end of the string.
13371
    /
13372
13373
Return True if the string ends with the specified suffix, False otherwise.
13374
[clinic start generated code]*/
13375
13376
static PyObject *
13377
unicode_endswith_impl(PyObject *self, PyObject *subobj, Py_ssize_t start,
13378
                      Py_ssize_t end)
13379
/*[clinic end generated code: output=cce6f8ceb0102ca9 input=b66bf6d5547ba1aa]*/
13380
26.4k
{
13381
26.4k
    if (PyTuple_Check(subobj)) {
13382
0
        Py_ssize_t i;
13383
0
        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13384
0
            PyObject *substring = PyTuple_GET_ITEM(subobj, i);
13385
0
            if (!PyUnicode_Check(substring)) {
13386
0
                PyErr_Format(PyExc_TypeError,
13387
0
                             "tuple for endswith must only contain str, "
13388
0
                             "not %.100s",
13389
0
                             Py_TYPE(substring)->tp_name);
13390
0
                return NULL;
13391
0
            }
13392
0
            int result = tailmatch(self, substring, start, end, +1);
13393
0
            if (result < 0) {
13394
0
                return NULL;
13395
0
            }
13396
0
            if (result) {
13397
0
                Py_RETURN_TRUE;
13398
0
            }
13399
0
        }
13400
0
        Py_RETURN_FALSE;
13401
0
    }
13402
26.4k
    if (!PyUnicode_Check(subobj)) {
13403
0
        PyErr_Format(PyExc_TypeError,
13404
0
                     "endswith first arg must be str or "
13405
0
                     "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13406
0
        return NULL;
13407
0
    }
13408
26.4k
    int result = tailmatch(self, subobj, start, end, +1);
13409
26.4k
    if (result < 0) {
13410
0
        return NULL;
13411
0
    }
13412
26.4k
    return PyBool_FromLong(result);
13413
26.4k
}
13414
13415
13416
#include "stringlib/unicode_format.h"
13417
13418
PyDoc_STRVAR(format__doc__,
13419
             "format($self, /, *args, **kwargs)\n\
13420
--\n\
13421
\n\
13422
Return a formatted version of the string, using substitutions from args and kwargs.\n\
13423
The substitutions are identified by braces ('{' and '}').");
13424
13425
PyDoc_STRVAR(format_map__doc__,
13426
             "format_map($self, mapping, /)\n\
13427
--\n\
13428
\n\
13429
Return a formatted version of the string, using substitutions from mapping.\n\
13430
The substitutions are identified by braces ('{' and '}').");
13431
13432
/*[clinic input]
13433
@permit_long_summary
13434
str.__format__ as unicode___format__
13435
13436
    format_spec: unicode
13437
    /
13438
13439
Return a formatted version of the string as described by format_spec.
13440
[clinic start generated code]*/
13441
13442
static PyObject *
unicode___format___impl(PyObject *self, PyObject *format_spec)
/*[clinic end generated code: output=45fceaca6d2ba4c8 input=77a2a19f3f7969f2]*/
{
    _PyUnicodeWriter writer;
    _PyUnicodeWriter_Init(&writer);

    /* Format `self` with the whole spec, i.e. the range [0, len(spec)). */
    const Py_ssize_t spec_end = PyUnicode_GET_LENGTH(format_spec);
    int status = _PyUnicode_FormatAdvancedWriter(&writer,
                                                 self, format_spec, 0,
                                                 spec_end);
    if (status == -1) {
        /* The writer is released here on failure. */
        _PyUnicodeWriter_Dealloc(&writer);
        return NULL;
    }
    return _PyUnicodeWriter_Finish(&writer);
}
13459
13460
/*[clinic input]
13461
str.__sizeof__ as unicode_sizeof
13462
13463
Return the size of the string in memory, in bytes.
13464
[clinic start generated code]*/
13465
13466
static PyObject *
unicode_sizeof_impl(PyObject *self)
/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
{
    /* Character storage is counted as (length + 1) characters of
       PyUnicode_KIND(self) bytes each. */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self) + 1;
    Py_ssize_t size;

    if (PyUnicode_IS_COMPACT_ASCII(self)) {
        /* Compact ASCII: base structure plus one byte per character. */
        size = sizeof(PyASCIIObject) + nchars;
    }
    else if (PyUnicode_IS_COMPACT(self)) {
        /* Compact non-ASCII: base structure plus the character data. */
        size = sizeof(PyCompactUnicodeObject) + nchars * PyUnicode_KIND(self);
    }
    else {
        /* Two-block object: the base object, plus the separate character
           block when one is present. */
        size = sizeof(PyUnicodeObject);
        if (_PyUnicode_DATA_ANY(self)) {
            size += nchars * PyUnicode_KIND(self);
        }
    }

    /* UTF-8 memory reported by _PyUnicode_HAS_UTF8_MEMORY() is added on
       top, as (UTF-8 length + 1) bytes. */
    if (_PyUnicode_HAS_UTF8_MEMORY(self)) {
        size += PyUnicode_UTF8_LENGTH(self) + 1;
    }

    return PyLong_FromSsize_t(size);
}
13494
13495
static PyObject *
unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
{
    /* __getnewargs__(): return a 1-tuple holding a copy of the string.
       The "N" format unit hands the reference to `copy` over to the
       tuple, so no DECREF is needed here. */
    PyObject *copy = _PyUnicode_Copy(v);
    if (copy == NULL) {
        return NULL;
    }
    PyObject *newargs = Py_BuildValue("(N)", copy);
    return newargs;
}
13503
13504
/*
13505
This function searches for the longest common leading whitespace
13506
of all lines in the [src, end).
13507
It returns the length of the common leading whitespace and sets `output` to
13508
point to the beginning of the common leading whitespace if length > 0.
13509
*/
13510
static Py_ssize_t
13511
search_longest_common_leading_whitespace(
13512
    const char *const src,
13513
    const char *const end,
13514
    const char **output)
13515
0
{
13516
    // [_start, _start + _len)
13517
    // describes the current longest common leading whitespace
13518
0
    const char *_start = NULL;
13519
0
    Py_ssize_t _len = 0;
13520
13521
0
    for (const char *iter = src; iter < end; ++iter) {
13522
0
        const char *line_start = iter;
13523
0
        const char *leading_whitespace_end = NULL;
13524
13525
        // scan the whole line
13526
0
        while (iter < end && *iter != '\n') {
13527
0
            if (!leading_whitespace_end && *iter != ' ' && *iter != '\t') {
13528
                /* `iter` points to the first non-whitespace character
13529
                   in this line */
13530
0
                if (iter == line_start) {
13531
                    // some line has no indent, fast exit!
13532
0
                    return 0;
13533
0
                }
13534
0
                leading_whitespace_end = iter;
13535
0
            }
13536
0
            ++iter;
13537
0
        }
13538
13539
        // if this line has all white space, skip it
13540
0
        if (!leading_whitespace_end) {
13541
0
            continue;
13542
0
        }
13543
13544
0
        if (!_start) {
13545
            // update the first leading whitespace
13546
0
            _start = line_start;
13547
0
            _len = leading_whitespace_end - line_start;
13548
0
            assert(_len > 0);
13549
0
        }
13550
0
        else {
13551
            /* We then compare with the current longest leading whitespace.
13552
13553
               [line_start, leading_whitespace_end) is the leading
13554
               whitespace of this line,
13555
13556
               [_start, _start + _len) is the leading whitespace of the
13557
               current longest leading whitespace. */
13558
0
            Py_ssize_t new_len = 0;
13559
0
            const char *_iter = _start, *line_iter = line_start;
13560
13561
0
            while (_iter < _start + _len && line_iter < leading_whitespace_end
13562
0
                   && *_iter == *line_iter)
13563
0
            {
13564
0
                ++_iter;
13565
0
                ++line_iter;
13566
0
                ++new_len;
13567
0
            }
13568
13569
0
            _len = new_len;
13570
0
            if (_len == 0) {
13571
                // No common things now, fast exit!
13572
0
                return 0;
13573
0
            }
13574
0
        }
13575
0
    }
13576
13577
0
    assert(_len >= 0);
13578
0
    if (_len > 0) {
13579
0
        *output = _start;
13580
0
    }
13581
0
    return _len;
13582
0
}
13583
13584
/* Dedent a string.
13585
   Intended to dedent Python source. Unlike `textwrap.dedent`, this
13586
   only supports spaces and tabs and doesn't normalize empty lines.
13587
   Return a new reference on success, NULL with exception set on error.
13588
   */
13589
PyObject *
13590
_PyUnicode_Dedent(PyObject *unicode)
13591
0
{
13592
0
    Py_ssize_t src_len = 0;
13593
0
    const char *src = PyUnicode_AsUTF8AndSize(unicode, &src_len);
13594
0
    if (!src) {
13595
0
        return NULL;
13596
0
    }
13597
0
    assert(src_len >= 0);
13598
0
    if (src_len == 0) {
13599
0
        return Py_NewRef(unicode);
13600
0
    }
13601
13602
0
    const char *const end = src + src_len;
13603
13604
    // [whitespace_start, whitespace_start + whitespace_len)
13605
    // describes the current longest common leading whitespace
13606
0
    const char *whitespace_start = NULL;
13607
0
    Py_ssize_t whitespace_len = search_longest_common_leading_whitespace(
13608
0
        src, end, &whitespace_start);
13609
13610
0
    if (whitespace_len == 0) {
13611
0
        return Py_NewRef(unicode);
13612
0
    }
13613
13614
    // now we should trigger a dedent
13615
0
    char *dest = PyMem_Malloc(src_len);
13616
0
    if (!dest) {
13617
0
        PyErr_NoMemory();
13618
0
        return NULL;
13619
0
    }
13620
0
    char *dest_iter = dest;
13621
13622
0
    for (const char *iter = src; iter < end; ++iter) {
13623
0
        const char *line_start = iter;
13624
0
        bool in_leading_space = true;
13625
13626
        // iterate over a line to find the end of a line
13627
0
        while (iter < end && *iter != '\n') {
13628
0
            if (in_leading_space && *iter != ' ' && *iter != '\t') {
13629
0
                in_leading_space = false;
13630
0
            }
13631
0
            ++iter;
13632
0
        }
13633
13634
        // invariant: *iter == '\n' or iter == end
13635
0
        bool append_newline = iter < end;
13636
13637
        // if this line has all white space, write '\n' and continue
13638
0
        if (in_leading_space && append_newline) {
13639
0
            *dest_iter++ = '\n';
13640
0
            continue;
13641
0
        }
13642
13643
        /* copy [new_line_start + whitespace_len, iter) to buffer, then
13644
            conditionally append '\n' */
13645
13646
0
        Py_ssize_t new_line_len = iter - line_start - whitespace_len;
13647
0
        assert(new_line_len >= 0);
13648
0
        memcpy(dest_iter, line_start + whitespace_len, new_line_len);
13649
13650
0
        dest_iter += new_line_len;
13651
13652
0
        if (append_newline) {
13653
0
            *dest_iter++ = '\n';
13654
0
        }
13655
0
    }
13656
13657
0
    PyObject *res = PyUnicode_FromStringAndSize(dest, dest_iter - dest);
13658
0
    PyMem_Free(dest);
13659
0
    return res;
13660
0
}
13661
13662
static PyMethodDef unicode_methods[] = {
13663
    UNICODE_ENCODE_METHODDEF
13664
    UNICODE_REPLACE_METHODDEF
13665
    UNICODE_SPLIT_METHODDEF
13666
    UNICODE_RSPLIT_METHODDEF
13667
    UNICODE_JOIN_METHODDEF
13668
    UNICODE_CAPITALIZE_METHODDEF
13669
    UNICODE_CASEFOLD_METHODDEF
13670
    UNICODE_TITLE_METHODDEF
13671
    UNICODE_CENTER_METHODDEF
13672
    UNICODE_COUNT_METHODDEF
13673
    UNICODE_EXPANDTABS_METHODDEF
13674
    UNICODE_FIND_METHODDEF
13675
    UNICODE_PARTITION_METHODDEF
13676
    UNICODE_INDEX_METHODDEF
13677
    UNICODE_LJUST_METHODDEF
13678
    UNICODE_LOWER_METHODDEF
13679
    UNICODE_LSTRIP_METHODDEF
13680
    UNICODE_RFIND_METHODDEF
13681
    UNICODE_RINDEX_METHODDEF
13682
    UNICODE_RJUST_METHODDEF
13683
    UNICODE_RSTRIP_METHODDEF
13684
    UNICODE_RPARTITION_METHODDEF
13685
    UNICODE_SPLITLINES_METHODDEF
13686
    UNICODE_STRIP_METHODDEF
13687
    UNICODE_SWAPCASE_METHODDEF
13688
    UNICODE_TRANSLATE_METHODDEF
13689
    UNICODE_UPPER_METHODDEF
13690
    UNICODE_STARTSWITH_METHODDEF
13691
    UNICODE_ENDSWITH_METHODDEF
13692
    UNICODE_REMOVEPREFIX_METHODDEF
13693
    UNICODE_REMOVESUFFIX_METHODDEF
13694
    UNICODE_ISASCII_METHODDEF
13695
    UNICODE_ISLOWER_METHODDEF
13696
    UNICODE_ISUPPER_METHODDEF
13697
    UNICODE_ISTITLE_METHODDEF
13698
    UNICODE_ISSPACE_METHODDEF
13699
    UNICODE_ISDECIMAL_METHODDEF
13700
    UNICODE_ISDIGIT_METHODDEF
13701
    UNICODE_ISNUMERIC_METHODDEF
13702
    UNICODE_ISALPHA_METHODDEF
13703
    UNICODE_ISALNUM_METHODDEF
13704
    UNICODE_ISIDENTIFIER_METHODDEF
13705
    UNICODE_ISPRINTABLE_METHODDEF
13706
    UNICODE_ZFILL_METHODDEF
13707
    {"format", _PyCFunction_CAST(do_string_format), METH_VARARGS | METH_KEYWORDS, format__doc__},
13708
    {"format_map", do_string_format_map, METH_O, format_map__doc__},
13709
    UNICODE___FORMAT___METHODDEF
13710
    UNICODE_MAKETRANS_METHODDEF
13711
    UNICODE_SIZEOF_METHODDEF
13712
    {"__getnewargs__",  unicode_getnewargs, METH_NOARGS},
13713
    {NULL, NULL}
13714
};
13715
13716
static PyObject *
unicode_mod(PyObject *v, PyObject *w)
{
    /* nb_remainder slot: `v % w` is formatting only when the left
       operand is a str; otherwise NotImplemented is returned. */
    if (PyUnicode_Check(v)) {
        return PyUnicode_Format(v, w);
    }
    Py_RETURN_NOTIMPLEMENTED;
}
13723
13724
static PyNumberMethods unicode_as_number = {
13725
    0,              /*nb_add*/
13726
    0,              /*nb_subtract*/
13727
    0,              /*nb_multiply*/
13728
    unicode_mod,            /*nb_remainder*/
13729
};
13730
13731
static PySequenceMethods unicode_as_sequence = {
13732
    unicode_length,     /* sq_length */
13733
    PyUnicode_Concat,   /* sq_concat */
13734
    _PyUnicode_Repeat,  /* sq_repeat */
13735
    unicode_getitem,    /* sq_item */
13736
    0,                  /* sq_slice */
13737
    0,                  /* sq_ass_item */
13738
    0,                  /* sq_ass_slice */
13739
    PyUnicode_Contains, /* sq_contains */
13740
};
13741
13742
static PyObject*
unicode_subscript(PyObject* self, PyObject* item)
{
    /* mp_subscript slot: `item` is either an index-like object or a
       slice; any other type raises TypeError. */
    if (_PyIndex_Check(item)) {
        Py_ssize_t index = PyNumber_AsSsize_t(item, PyExc_IndexError);
        if (index == -1 && PyErr_Occurred()) {
            return NULL;
        }
        /* A negative index is offset by the length before the lookup. */
        if (index < 0) {
            index += PyUnicode_GET_LENGTH(self);
        }
        return unicode_getitem(self, index);
    }

    if (!PySlice_Check(item)) {
        PyErr_Format(PyExc_TypeError, "string indices must be integers, not '%.200s'",
                     Py_TYPE(item)->tp_name);
        return NULL;
    }

    Py_ssize_t start, stop, step;
    if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
        return NULL;
    }
    const Py_ssize_t slicelength = PySlice_AdjustIndices(
        PyUnicode_GET_LENGTH(self), &start, &stop, step);

    /* Shortcuts: empty result, whole string, contiguous substring. */
    if (slicelength <= 0) {
        _Py_RETURN_UNICODE_EMPTY();
    }
    if (step == 1) {
        if (start == 0 && slicelength == PyUnicode_GET_LENGTH(self)) {
            return unicode_result_unchanged(self);
        }
        return PyUnicode_Substring(self,
                                   start, start + slicelength);
    }

    /* General case: step != 1. */
    const int src_kind = PyUnicode_KIND(self);
    const void *src_data = PyUnicode_DATA(self);
    Py_ssize_t i;
    /* `cur` is unsigned, as in the index arithmetic this replaces;
       adding a negative step wraps around modulo 2**N. */
    size_t cur;

    /* First pass: find the largest selected character so the result can
       be allocated for it.  The scan stops early once the limit given by
       kind_maxchar_limit() for the source kind is reached. */
    Py_UCS4 max_char;
    if (PyUnicode_IS_ASCII(self)) {
        max_char = 127;
    }
    else {
        const Py_UCS4 kind_limit = kind_maxchar_limit(src_kind);
        max_char = 0;
        for (cur = start, i = 0; i < slicelength; cur += step, i++) {
            const Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
            if (ch <= max_char) {
                continue;
            }
            max_char = ch;
            if (max_char >= kind_limit) {
                break;
            }
        }
    }

    PyObject *result = PyUnicode_New(slicelength, max_char);
    if (result == NULL) {
        return NULL;
    }
    const int dest_kind = PyUnicode_KIND(result);
    void *dest_data = PyUnicode_DATA(result);

    /* Second pass: copy the selected characters. */
    for (cur = start, i = 0; i < slicelength; cur += step, i++) {
        const Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
        PyUnicode_WRITE(dest_kind, dest_data, i, ch);
    }
    assert(_PyUnicode_CheckConsistency(result, 1));
    return result;
}
13811
13812
static PyMappingMethods unicode_as_mapping = {
13813
    unicode_length,     /* mp_length */
13814
    unicode_subscript,  /* mp_subscript */
13815
    0,                  /* mp_ass_subscript */
13816
};
13817
13818
13819
static PyObject *
13820
unicode_subtype_new(PyTypeObject *type, PyObject *unicode);
13821
13822
/*[clinic input]
13823
@classmethod
13824
str.__new__ as unicode_new
13825
13826
    object as x: object = NULL
13827
    encoding: str = NULL
13828
    errors: str = NULL
13829
13830
[clinic start generated code]*/
13831
13832
static PyObject *
13833
unicode_new_impl(PyTypeObject *type, PyObject *x, const char *encoding,
13834
                 const char *errors)
13835
/*[clinic end generated code: output=fc72d4878b0b57e9 input=e81255e5676d174e]*/
13836
35
{
13837
35
    PyObject *unicode;
13838
35
    if (x == NULL) {
13839
0
        unicode = _PyUnicode_GetEmpty();
13840
0
    }
13841
35
    else if (encoding == NULL && errors == NULL) {
13842
35
        unicode = PyObject_Str(x);
13843
35
    }
13844
0
    else {
13845
0
        unicode = PyUnicode_FromEncodedObject(x, encoding, errors);
13846
0
    }
13847
13848
35
    if (unicode != NULL && type != &PyUnicode_Type) {
13849
35
        Py_SETREF(unicode, unicode_subtype_new(type, unicode));
13850
35
    }
13851
35
    return unicode;
13852
35
}
13853
13854
static const char *
13855
arg_as_utf8(PyObject *obj, const char *name)
13856
10.1k
{
13857
10.1k
    if (!PyUnicode_Check(obj)) {
13858
0
        PyErr_Format(PyExc_TypeError,
13859
0
                     "str() argument '%s' must be str, not %T",
13860
0
                     name, obj);
13861
0
        return NULL;
13862
0
    }
13863
10.1k
    return _PyUnicode_AsUTF8NoNUL(obj);
13864
10.1k
}
13865
13866
/* Vectorcall implementation for calling the exact `str` type.
 *
 * With keyword arguments the call is repacked into a tuple and a dict and
 * forwarded to unicode_new().  Otherwise 0 to 3 positional arguments are
 * accepted:
 *   str()                          -> empty string
 *   str(object)                    -> PyObject_Str(object)
 *   str(object, encoding[, errors]) -> PyUnicode_FromEncodedObject()
 *
 * Returns a new reference, or NULL with an exception set.
 */
static PyObject *
unicode_vectorcall(PyObject *type, PyObject *const *args,
                   size_t nargsf, PyObject *kwnames)
{
    assert(Py_Is(_PyType_CAST(type), &PyUnicode_Type));

    Py_ssize_t nargs = PyVectorcall_NARGS(nargsf);
    int has_keywords = (kwnames != NULL && PyTuple_GET_SIZE(kwnames) != 0);

    if (has_keywords) {
        // Fallback to unicode_new()
        PyObject *posargs = PyTuple_FromArray(args, nargs);
        if (posargs == NULL) {
            return NULL;
        }
        PyObject *ret = NULL;
        PyObject *kwargs = _PyStack_AsDict(args + nargs, kwnames);
        if (kwargs != NULL) {
            ret = unicode_new(_PyType_CAST(type), posargs, kwargs);
        }
        Py_DECREF(posargs);
        Py_XDECREF(kwargs);
        return ret;
    }

    if (!_PyArg_CheckPositional("str", nargs, 0, 3)) {
        return NULL;
    }
    if (nargs == 0) {
        return _PyUnicode_GetEmpty();
    }

    PyObject *object = args[0];
    if (nargs == 1) {
        return PyObject_Str(object);
    }

    /* Two or three arguments: decode `object` with the given encoding. */
    const char *encoding = arg_as_utf8(args[1], "encoding");
    if (encoding == NULL) {
        return NULL;
    }

    const char *errors = NULL;
    if (nargs == 3) {
        errors = arg_as_utf8(args[2], "errors");
        if (errors == NULL) {
            return NULL;
        }
    }
    return PyUnicode_FromEncodedObject(object, encoding, errors);
}
13912
13913
static PyObject *
13914
unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
13915
35
{
13916
35
    PyObject *self;
13917
35
    Py_ssize_t length, char_size;
13918
35
    int share_utf8;
13919
35
    int kind;
13920
35
    void *data;
13921
13922
35
    assert(PyType_IsSubtype(type, &PyUnicode_Type));
13923
35
    assert(_PyUnicode_CHECK(unicode));
13924
13925
35
    self = type->tp_alloc(type, 0);
13926
35
    if (self == NULL) {
13927
0
        return NULL;
13928
0
    }
13929
35
    kind = PyUnicode_KIND(unicode);
13930
35
    length = PyUnicode_GET_LENGTH(unicode);
13931
13932
35
    _PyUnicode_LENGTH(self) = length;
13933
#ifdef Py_DEBUG
13934
    _PyUnicode_HASH(self) = -1;
13935
#else
13936
35
    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13937
0
#endif
13938
35
    _PyUnicode_STATE(self).interned = 0;
13939
35
    _PyUnicode_STATE(self).kind = kind;
13940
35
    _PyUnicode_STATE(self).compact = 0;
13941
35
    _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
13942
35
    _PyUnicode_STATE(self).statically_allocated = 0;
13943
0
    PyUnicode_SET_UTF8_LENGTH(self, 0);
13944
35
    PyUnicode_SET_UTF8(self, NULL);
13945
35
    _PyUnicode_DATA_ANY(self) = NULL;
13946
13947
0
    share_utf8 = 0;
13948
35
    if (kind == PyUnicode_1BYTE_KIND) {
13949
35
        char_size = 1;
13950
35
        if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13951
35
            share_utf8 = 1;
13952
35
    }
13953
0
    else if (kind == PyUnicode_2BYTE_KIND) {
13954
0
        char_size = 2;
13955
0
    }
13956
0
    else {
13957
0
        assert(kind == PyUnicode_4BYTE_KIND);
13958
0
        char_size = 4;
13959
0
    }
13960
13961
    /* Ensure we won't overflow the length. */
13962
35
    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13963
0
        PyErr_NoMemory();
13964
0
        goto onError;
13965
0
    }
13966
35
    data = PyMem_Malloc((length + 1) * char_size);
13967
35
    if (data == NULL) {
13968
0
        PyErr_NoMemory();
13969
0
        goto onError;
13970
0
    }
13971
13972
70
    _PyUnicode_DATA_ANY(self) = data;
13973
35
    if (share_utf8) {
13974
35
        PyUnicode_SET_UTF8_LENGTH(self, length);
13975
35
        PyUnicode_SET_UTF8(self, data);
13976
35
    }
13977
13978
70
    memcpy(data, PyUnicode_DATA(unicode), kind * (length + 1));
13979
70
    assert(_PyUnicode_CheckConsistency(self, 1));
13980
#ifdef Py_DEBUG
13981
    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13982
#endif
13983
35
    return self;
13984
13985
0
onError:
13986
0
    Py_DECREF(self);
13987
0
    return NULL;
13988
70
}
13989
13990
static _PyObjectIndexPair
13991
unicode_iteritem(PyObject *obj, Py_ssize_t index)
13992
6.85M
{
13993
6.85M
    if (index >= PyUnicode_GET_LENGTH(obj)) {
13994
92.1k
        return (_PyObjectIndexPair) { .object = NULL, .index = index };
13995
92.1k
    }
13996
6.76M
    const void *data = PyUnicode_DATA(obj);
13997
6.76M
    int kind = PyUnicode_KIND(obj);
13998
6.76M
    Py_UCS4 ch = PyUnicode_READ(kind, data, index);
13999
6.76M
    PyObject *result = unicode_char(ch);
14000
6.76M
    index = (result == NULL) ? -1 : index + 1;
14001
6.76M
    return (_PyObjectIndexPair) { .object = result, .index = index };
14002
6.76M
}
14003
14004
void
14005
_PyUnicode_ExactDealloc(PyObject *op)
14006
504k
{
14007
504k
    assert(PyUnicode_CheckExact(op));
14008
504k
    unicode_dealloc(op);
14009
504k
}
14010
14011
/* Docstring of the str type; installed as tp_doc of PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
14012
"str(object='') -> str\n\
14013
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
14014
\n\
14015
Create a new string object from the given object. If encoding or\n\
14016
errors is specified, then the object must expose a data buffer\n\
14017
that will be decoded using the given encoding and error handler.\n\
14018
Otherwise, returns the result of object.__str__() (if defined)\n\
14019
or repr(object).\n\
14020
encoding defaults to 'utf-8'.\n\
14021
errors defaults to 'strict'.");
14022
14023
static PyObject *unicode_iter(PyObject *seq);
14024
14025
PyTypeObject PyUnicode_Type = {
14026
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
14027
    "str",                        /* tp_name */
14028
    sizeof(PyUnicodeObject),      /* tp_basicsize */
14029
    0,                            /* tp_itemsize */
14030
    /* Slots */
14031
    unicode_dealloc,              /* tp_dealloc */
14032
    0,                            /* tp_vectorcall_offset */
14033
    0,                            /* tp_getattr */
14034
    0,                            /* tp_setattr */
14035
    0,                            /* tp_as_async */
14036
    unicode_repr,                 /* tp_repr */
14037
    &unicode_as_number,           /* tp_as_number */
14038
    &unicode_as_sequence,         /* tp_as_sequence */
14039
    &unicode_as_mapping,          /* tp_as_mapping */
14040
    unicode_hash,                 /* tp_hash*/
14041
    0,                            /* tp_call*/
14042
    unicode_str,                  /* tp_str */
14043
    PyObject_GenericGetAttr,      /* tp_getattro */
14044
    0,                            /* tp_setattro */
14045
    0,                            /* tp_as_buffer */
14046
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
14047
        Py_TPFLAGS_UNICODE_SUBCLASS |
14048
        _Py_TPFLAGS_MATCH_SELF, /* tp_flags */
14049
    unicode_doc,                  /* tp_doc */
14050
    0,                            /* tp_traverse */
14051
    0,                            /* tp_clear */
14052
    PyUnicode_RichCompare,        /* tp_richcompare */
14053
    0,                            /* tp_weaklistoffset */
14054
    unicode_iter,                 /* tp_iter */
14055
    0,                            /* tp_iternext */
14056
    unicode_methods,              /* tp_methods */
14057
    0,                            /* tp_members */
14058
    0,                            /* tp_getset */
14059
    0,                            /* tp_base */
14060
    0,                            /* tp_dict */
14061
    0,                            /* tp_descr_get */
14062
    0,                            /* tp_descr_set */
14063
    0,                            /* tp_dictoffset */
14064
    0,                            /* tp_init */
14065
    0,                            /* tp_alloc */
14066
    unicode_new,                  /* tp_new */
14067
    PyObject_Free,                /* tp_free */
14068
    .tp_vectorcall = unicode_vectorcall,
14069
    ._tp_iteritem = unicode_iteritem,
14070
};
14071
14072
/* Initialize the Unicode implementation */
14073
14074
static void
14075
_init_global_state(void)
14076
19
{
14077
19
    static int initialized = 0;
14078
19
    if (initialized) {
14079
0
        return;
14080
0
    }
14081
19
    initialized = 1;
14082
14083
    /* initialize the linebreak bloom filter */
14084
19
    const Py_UCS2 linebreak[] = {
14085
19
        0x000A, /* LINE FEED */
14086
19
        0x000D, /* CARRIAGE RETURN */
14087
19
        0x001C, /* FILE SEPARATOR */
14088
19
        0x001D, /* GROUP SEPARATOR */
14089
19
        0x001E, /* RECORD SEPARATOR */
14090
19
        0x0085, /* NEXT LINE */
14091
19
        0x2028, /* LINE SEPARATOR */
14092
19
        0x2029, /* PARAGRAPH SEPARATOR */
14093
19
    };
14094
19
    bloom_linebreak = make_bloom_mask(
14095
19
        PyUnicode_2BYTE_KIND, linebreak,
14096
19
        Py_ARRAY_LENGTH(linebreak));
14097
19
}
14098
14099
void
14100
_PyUnicode_InitState(PyInterpreterState *interp)
14101
19
{
14102
19
    if (!_Py_IsMainInterpreter(interp)) {
14103
0
        return;
14104
0
    }
14105
19
    _init_global_state();
14106
19
}
14107
14108
14109
PyStatus
14110
_PyUnicode_InitGlobalObjects(PyInterpreterState *interp)
14111
19
{
14112
19
    if (_Py_IsMainInterpreter(interp)) {
14113
19
        PyStatus status = init_global_interned_strings(interp);
14114
19
        if (_PyStatus_EXCEPTION(status)) {
14115
0
            return status;
14116
0
        }
14117
19
    }
14118
19
    assert(INTERNED_STRINGS);
14119
14120
19
    if (init_interned_dict(interp)) {
14121
0
        PyErr_Clear();
14122
0
        return _PyStatus_ERR("failed to create interned dict");
14123
0
    }
14124
14125
19
    return _PyStatus_OK();
14126
19
}
14127
14128
14129
PyStatus
14130
_PyUnicode_InitTypes(PyInterpreterState *interp)
14131
19
{
14132
19
    if (_PyStaticType_InitBuiltin(interp, &EncodingMapType) < 0) {
14133
0
        goto error;
14134
0
    }
14135
19
    if (_PyStaticType_InitBuiltin(interp, &PyFieldNameIter_Type) < 0) {
14136
0
        goto error;
14137
0
    }
14138
19
    if (_PyStaticType_InitBuiltin(interp, &PyFormatterIter_Type) < 0) {
14139
0
        goto error;
14140
0
    }
14141
19
    return _PyStatus_OK();
14142
14143
0
error:
14144
0
    return _PyStatus_ERR("Can't initialize unicode types");
14145
19
}
14146
14147
static /* non-null */ PyObject*
14148
intern_static(PyInterpreterState *interp, PyObject *s /* stolen */)
14149
21.3k
{
14150
    // Note that this steals a reference to `s`, but in many cases that
14151
    // stolen ref is returned, requiring no decref/incref.
14152
14153
21.3k
    assert(s != NULL);
14154
21.3k
    assert(_PyUnicode_CHECK(s));
14155
21.3k
    assert(_PyUnicode_STATE(s).statically_allocated);
14156
21.3k
    assert(!PyUnicode_CHECK_INTERNED(s));
14157
14158
#ifdef Py_DEBUG
14159
    /* We must not add process-global interned string if there's already a
14160
     * per-interpreter interned_dict, which might contain duplicates.
14161
     */
14162
    PyObject *interned = get_interned_dict(interp);
14163
    assert(interned == NULL);
14164
#endif
14165
14166
    /* Look in the global cache first. */
14167
21.3k
    PyObject *r = (PyObject *)_Py_hashtable_get(INTERNED_STRINGS, s);
14168
    /* We should only init each string once */
14169
21.3k
    assert(r == NULL);
14170
    /* but just in case (for the non-debug build), handle this */
14171
21.3k
    if (r != NULL && r != s) {
14172
0
        assert(_PyUnicode_STATE(r).interned == SSTATE_INTERNED_IMMORTAL_STATIC);
14173
0
        assert(_PyUnicode_CHECK(r));
14174
0
        Py_DECREF(s);
14175
0
        return Py_NewRef(r);
14176
0
    }
14177
14178
21.3k
    if (_Py_hashtable_set(INTERNED_STRINGS, s, s) < -1) {
14179
0
        Py_FatalError("failed to intern static string");
14180
0
    }
14181
14182
21.3k
    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_IMMORTAL_STATIC;
14183
0
    return s;
14184
21.3k
}
14185
14186
void
14187
_PyUnicode_InternStatic(PyInterpreterState *interp, PyObject **p)
14188
21.3k
{
14189
    // This should only be called as part of runtime initialization
14190
21.3k
    assert(!Py_IsInitialized());
14191
14192
21.3k
    *p = intern_static(interp, *p);
14193
21.3k
    assert(*p);
14194
21.3k
}
14195
14196
static void
14197
immortalize_interned(PyObject *s)
14198
79.5k
{
14199
79.5k
    assert(PyUnicode_CHECK_INTERNED(s) == SSTATE_INTERNED_MORTAL);
14200
79.5k
    assert(!_Py_IsImmortal(s));
14201
#ifdef Py_REF_DEBUG
14202
    /* The reference count value should be excluded from the RefTotal.
14203
       The decrements to these objects will not be registered so they
14204
       need to be accounted for in here. */
14205
    for (Py_ssize_t i = 0; i < Py_REFCNT(s); i++) {
14206
        _Py_DecRefTotal(_PyThreadState_GET());
14207
    }
14208
#endif
14209
79.5k
    _Py_SetImmortal(s);
14210
    // The switch to SSTATE_INTERNED_IMMORTAL must be the last thing done here
14211
    // to synchronize with the check in intern_common() that avoids locking if
14212
    // the string is already immortal.
14213
79.5k
    FT_ATOMIC_STORE_UINT8(_PyUnicode_STATE(s).interned, SSTATE_INTERNED_IMMORTAL);
14214
79.5k
}
14215
14216
#ifdef Py_GIL_DISABLED
14217
static bool
14218
can_immortalize_safely(PyObject *s)
14219
{
14220
    if (_Py_IsOwnedByCurrentThread(s) || _Py_IsImmortal(s)) {
14221
        return true;
14222
    }
14223
    Py_ssize_t shared = _Py_atomic_load_ssize(&s->ob_ref_shared);
14224
    return _Py_REF_IS_MERGED(shared);
14225
}
14226
#endif
14227
14228
static /* non-null */ PyObject*
14229
intern_common(PyInterpreterState *interp, PyObject *s /* stolen */,
14230
              bool immortalize)
14231
4.54M
{
14232
    // Note that this steals a reference to `s`, but in many cases that
14233
    // stolen ref is returned, requiring no decref/incref.
14234
14235
#ifdef Py_DEBUG
14236
    assert(s != NULL);
14237
    assert(_PyUnicode_CHECK(s));
14238
#else
14239
4.54M
    if (s == NULL || !PyUnicode_Check(s)) {
14240
0
        return s;
14241
0
    }
14242
4.54M
#endif
14243
14244
    /* If it's a subclass, we don't really know what putting
14245
       it in the interned dict might do. */
14246
4.54M
    if (!PyUnicode_CheckExact(s)) {
14247
0
        return s;
14248
0
    }
14249
14250
    /* Is it already interned? */
14251
4.54M
    switch (PyUnicode_CHECK_INTERNED(s)) {
14252
1.51M
        case SSTATE_NOT_INTERNED:
14253
            // no, go on
14254
1.51M
            break;
14255
24.2k
        case SSTATE_INTERNED_MORTAL:
14256
24.2k
#ifndef Py_GIL_DISABLED
14257
            // yes but we might need to make it immortal
14258
24.2k
            if (immortalize) {
14259
32
                immortalize_interned(s);
14260
32
            }
14261
24.2k
            return s;
14262
#else
14263
            // not fully interned yet; fall through to the locking path
14264
            break;
14265
#endif
14266
3.00M
        default:
14267
            // all done
14268
3.00M
            return s;
14269
4.54M
    }
14270
14271
    /* Statically allocated strings must be already interned. */
14272
4.54M
    assert(!_PyUnicode_STATE(s).statically_allocated);
14273
14274
#if Py_GIL_DISABLED
14275
    /* In the free-threaded build, all interned strings are immortal */
14276
    immortalize = 1;
14277
#endif
14278
14279
    /* If it's already immortal, intern it as such */
14280
1.51M
    if (_Py_IsImmortal(s)) {
14281
0
        immortalize = 1;
14282
0
    }
14283
14284
    /* if it's a short string, get the singleton */
14285
1.51M
    if (PyUnicode_GET_LENGTH(s) == 1 &&
14286
48.7k
                PyUnicode_KIND(s) == PyUnicode_1BYTE_KIND) {
14287
1
        PyObject *r = LATIN1(*(unsigned char*)PyUnicode_DATA(s));
14288
1
        assert(PyUnicode_CHECK_INTERNED(r));
14289
1
        Py_DECREF(s);
14290
1
        return r;
14291
1
    }
14292
#ifdef Py_DEBUG
14293
    assert(!unicode_is_singleton(s));
14294
#endif
14295
14296
    /* Look in the global cache now. */
14297
1.51M
    {
14298
1.51M
        PyObject *r = (PyObject *)_Py_hashtable_get(INTERNED_STRINGS, s);
14299
1.51M
        if (r != NULL) {
14300
94.8k
            assert(_PyUnicode_STATE(r).statically_allocated);
14301
94.8k
            assert(r != s);  // r must be statically_allocated; s is not
14302
94.8k
            Py_DECREF(s);
14303
94.8k
            return Py_NewRef(r);
14304
94.8k
        }
14305
1.51M
    }
14306
14307
    /* Do a setdefault on the per-interpreter cache. */
14308
1.42M
    PyObject *interned = get_interned_dict(interp);
14309
1.42M
    assert(interned != NULL);
14310
#ifdef Py_GIL_DISABLED
14311
#  define INTERN_MUTEX &_Py_INTERP_CACHED_OBJECT(interp, interned_mutex)
14312
    // Lock-free fast path: check if there's already an interned copy that
14313
    // is in its final immortal state.
14314
    PyObject *r;
14315
    int res = PyDict_GetItemRef(interned, s, &r);
14316
    if (res < 0) {
14317
        PyErr_Clear();
14318
        return s;
14319
    }
14320
    if (res > 0) {
14321
        unsigned int state = _Py_atomic_load_uint8(&_PyUnicode_STATE(r).interned);
14322
        if (state == SSTATE_INTERNED_IMMORTAL) {
14323
            Py_DECREF(s);
14324
            return r;
14325
        }
14326
        // Not yet fully interned; fall through to the locking path.
14327
        Py_DECREF(r);
14328
    }
14329
#endif
14330
14331
#ifdef Py_GIL_DISABLED
14332
    // Immortalization writes to the refcount fields non-atomically. That
14333
    // races with Py_INCREF / Py_DECREF on the thread that owns `s`. If we
14334
    // don't own it (and its refcount hasn't been merged), intern a copy
14335
    // we own instead.
14336
    if (!can_immortalize_safely(s)) {
14337
        PyObject *copy = _PyUnicode_Copy(s);
14338
        if (copy == NULL) {
14339
            PyErr_Clear();
14340
            return s;
14341
        }
14342
        Py_DECREF(s);
14343
        s = copy;
14344
    }
14345
#endif
14346
14347
1.42M
    FT_MUTEX_LOCK(INTERN_MUTEX);
14348
1.42M
    PyObject *t;
14349
1.42M
    {
14350
1.42M
        int res = PyDict_SetDefaultRef(interned, s, s, &t);
14351
1.42M
        if (res < 0) {
14352
0
            PyErr_Clear();
14353
0
            FT_MUTEX_UNLOCK(INTERN_MUTEX);
14354
0
            return s;
14355
0
        }
14356
1.42M
        else if (res == 1) {
14357
            // value was already present (not inserted)
14358
1.19M
            Py_DECREF(s);
14359
1.19M
            if (immortalize &&
14360
912k
                    PyUnicode_CHECK_INTERNED(t) == SSTATE_INTERNED_MORTAL) {
14361
3.18k
                immortalize_interned(t);
14362
3.18k
            }
14363
1.19M
            FT_MUTEX_UNLOCK(INTERN_MUTEX);
14364
1.19M
            return t;
14365
1.19M
        }
14366
227k
        else {
14367
            // value was newly inserted
14368
227k
            assert (s == t);
14369
227k
            Py_DECREF(t);
14370
227k
        }
14371
1.42M
    }
14372
14373
    /* NOT_INTERNED -> INTERNED_MORTAL */
14374
14375
1.42M
    assert(_PyUnicode_STATE(s).interned == SSTATE_NOT_INTERNED);
14376
14377
227k
    if (!_Py_IsImmortal(s)) {
14378
        /* The two references in interned dict (key and value) are not counted.
14379
        unicode_dealloc() and _PyUnicode_ClearInterned() take care of this. */
14380
227k
        Py_DECREF(s);
14381
227k
        Py_DECREF(s);
14382
227k
    }
14383
227k
    FT_ATOMIC_STORE_UINT8(_PyUnicode_STATE(s).interned, SSTATE_INTERNED_MORTAL);
14384
14385
    /* INTERNED_MORTAL -> INTERNED_IMMORTAL (if needed) */
14386
14387
#ifdef Py_DEBUG
14388
    if (_Py_IsImmortal(s)) {
14389
        assert(immortalize);
14390
    }
14391
#endif
14392
227k
    if (immortalize) {
14393
76.3k
        immortalize_interned(s);
14394
76.3k
    }
14395
14396
227k
    FT_MUTEX_UNLOCK(INTERN_MUTEX);
14397
227k
    return s;
14398
227k
}
14399
14400
void
14401
_PyUnicode_InternImmortal(PyInterpreterState *interp, PyObject **p)
14402
2.16M
{
14403
2.16M
    *p = intern_common(interp, *p, 1);
14404
2.16M
    assert(*p);
14405
2.16M
}
14406
14407
void
14408
_PyUnicode_InternMortal(PyInterpreterState *interp, PyObject **p)
14409
2.38M
{
14410
2.38M
    *p = intern_common(interp, *p, 0);
14411
2.38M
    assert(*p);
14412
2.38M
}
14413
14414
14415
void
14416
_PyUnicode_InternInPlace(PyInterpreterState *interp, PyObject **p)
14417
0
{
14418
0
    _PyUnicode_InternImmortal(interp, p);
14419
0
    return;
14420
0
}
14421
14422
void
14423
PyUnicode_InternInPlace(PyObject **p)
14424
0
{
14425
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
14426
0
    _PyUnicode_InternMortal(interp, p);
14427
0
}
14428
14429
// Public-looking name kept for the stable ABI; user should not call this:
14430
PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
14431
void
14432
PyUnicode_InternImmortal(PyObject **p)
14433
0
{
14434
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
14435
0
    _PyUnicode_InternImmortal(interp, p);
14436
0
}
14437
14438
PyObject *
14439
PyUnicode_InternFromString(const char *cp)
14440
448k
{
14441
448k
    PyObject *s = PyUnicode_FromString(cp);
14442
448k
    if (s == NULL) {
14443
0
        return NULL;
14444
0
    }
14445
448k
    PyInterpreterState *interp = _PyInterpreterState_GET();
14446
448k
    _PyUnicode_InternMortal(interp, &s);
14447
448k
    return s;
14448
448k
}
14449
14450
14451
void
14452
_PyUnicode_ClearInterned(PyInterpreterState *interp)
14453
0
{
14454
0
    PyObject *interned = get_interned_dict(interp);
14455
0
    if (interned == NULL) {
14456
0
        return;
14457
0
    }
14458
0
    assert(PyDict_CheckExact(interned));
14459
14460
0
    if (has_shared_intern_dict(interp)) {
14461
        // the dict doesn't belong to this interpreter, skip the debug
14462
        // checks on it and just clear the pointer to it
14463
0
        clear_interned_dict(interp);
14464
0
        return;
14465
0
    }
14466
14467
#ifdef INTERNED_STATS
14468
    fprintf(stderr, "releasing %zd interned strings\n",
14469
            PyDict_GET_SIZE(interned));
14470
14471
    Py_ssize_t total_length = 0;
14472
#endif
14473
0
    Py_ssize_t pos = 0;
14474
0
    PyObject *s, *ignored_value;
14475
0
    while (PyDict_Next(interned, &pos, &s, &ignored_value)) {
14476
0
        int shared = 0;
14477
0
        switch (PyUnicode_CHECK_INTERNED(s)) {
14478
0
        case SSTATE_INTERNED_IMMORTAL:
14479
            /* Make immortal interned strings mortal again. */
14480
            // Skip the Immortal Instance check and restore
14481
            // the two references (key and value) ignored
14482
            // by PyUnicode_InternInPlace().
14483
0
            _Py_SetMortal(s, 2);
14484
#ifdef Py_REF_DEBUG
14485
            /* let's be pedantic with the ref total */
14486
            _Py_IncRefTotal(_PyThreadState_GET());
14487
            _Py_IncRefTotal(_PyThreadState_GET());
14488
#endif
14489
#ifdef INTERNED_STATS
14490
            total_length += PyUnicode_GET_LENGTH(s);
14491
#endif
14492
0
            break;
14493
0
        case SSTATE_INTERNED_IMMORTAL_STATIC:
14494
            /* It is shared between interpreters, so we should unmark it
14495
               only when this is the last interpreter in which it's
14496
               interned.  We immortalize all the statically initialized
14497
               strings during startup, so we can rely on the
14498
               main interpreter to be the last one. */
14499
0
            if (!_Py_IsMainInterpreter(interp)) {
14500
0
                shared = 1;
14501
0
            }
14502
0
            break;
14503
0
        case SSTATE_INTERNED_MORTAL:
14504
            // Restore 2 references held by the interned dict; these will
14505
            // be decref'd by clear_interned_dict's PyDict_Clear.
14506
0
            _Py_RefcntAdd(s, 2);
14507
#ifdef Py_REF_DEBUG
14508
            /* let's be pedantic with the ref total */
14509
            _Py_IncRefTotal(_PyThreadState_GET());
14510
            _Py_IncRefTotal(_PyThreadState_GET());
14511
#endif
14512
0
            break;
14513
0
        case SSTATE_NOT_INTERNED:
14514
0
            _Py_FALLTHROUGH;
14515
0
        default:
14516
0
            Py_UNREACHABLE();
14517
0
        }
14518
0
        if (!shared) {
14519
0
            FT_ATOMIC_STORE_UINT8_RELAXED(_PyUnicode_STATE(s).interned, SSTATE_NOT_INTERNED);
14520
0
        }
14521
0
    }
14522
#ifdef INTERNED_STATS
14523
    fprintf(stderr,
14524
            "total length of all interned strings: %zd characters\n",
14525
            total_length);
14526
#endif
14527
14528
0
    struct _Py_unicode_state *state = &interp->unicode;
14529
0
    struct _Py_unicode_ids *ids = &state->ids;
14530
0
    for (Py_ssize_t i=0; i < ids->size; i++) {
14531
0
        Py_XINCREF(ids->array[i]);
14532
0
    }
14533
0
    clear_interned_dict(interp);
14534
0
    if (_Py_IsMainInterpreter(interp)) {
14535
0
        clear_global_interned_strings();
14536
0
    }
14537
0
}
14538
14539
14540
/********************* Unicode Iterator **************************/
14541
14542
/* Iterator state shared by the str iterator types defined below. */
typedef struct {
14543
    PyObject_HEAD
14544
    Py_ssize_t it_index; /* Index of the next character to return */
14545
    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
14546
} unicodeiterobject;
14547
14548
static void
14549
unicodeiter_dealloc(PyObject *op)
14550
1.32k
{
14551
1.32k
    unicodeiterobject *it = (unicodeiterobject *)op;
14552
1.32k
    _PyObject_GC_UNTRACK(it);
14553
1.32k
    Py_XDECREF(it->it_seq);
14554
1.32k
    PyObject_GC_Del(it);
14555
1.32k
}
14556
14557
static int
14558
unicodeiter_traverse(PyObject *op, visitproc visit, void *arg)
14559
0
{
14560
0
    unicodeiterobject *it = (unicodeiterobject *)op;
14561
0
    Py_VISIT(it->it_seq);
14562
0
    return 0;
14563
0
}
14564
14565
/* tp_iternext for str_iterator.
 *
 * Returns the next character as a string (via unicode_char()) and
 * advances the index.  Once the end is reached the reference to the
 * string is dropped, it_seq is set to NULL and NULL is returned; later
 * calls keep returning NULL.
 */
static PyObject *
unicodeiter_next(PyObject *op)
{
    unicodeiterobject *it = (unicodeiterobject *)op;
    assert(it != NULL);

    PyObject *seq = it->it_seq;
    if (seq == NULL) {
        return NULL;
    }
    assert(_PyUnicode_CHECK(seq));

    if (it->it_index >= PyUnicode_GET_LENGTH(seq)) {
        /* Exhausted: release the string. */
        it->it_seq = NULL;
        Py_DECREF(seq);
        return NULL;
    }

    int kind = PyUnicode_KIND(seq);
    const void *data = PyUnicode_DATA(seq);
    Py_UCS4 code_point = PyUnicode_READ(kind, data, it->it_index);
    it->it_index++;
    return unicode_char(code_point);
}
14589
14590
/* Iteration step for compact ASCII strings.
 *
 * Reads the next byte straight after the PyASCIIObject header and returns
 * the matching entry of the ASCII singleton table.  Once the end is
 * reached the reference to the string is dropped, it_seq is set to NULL
 * and NULL is returned.
 */
static PyObject *
unicode_ascii_iter_next(PyObject *op)
{
    unicodeiterobject *it = (unicodeiterobject *)op;
    assert(it != NULL);

    PyObject *seq = it->it_seq;
    if (seq == NULL) {
        return NULL;
    }
    assert(_PyUnicode_CHECK(seq));
    assert(PyUnicode_IS_COMPACT_ASCII(seq));

    if (it->it_index >= PyUnicode_GET_LENGTH(seq)) {
        /* Exhausted: release the string. */
        it->it_seq = NULL;
        Py_DECREF(seq);
        return NULL;
    }

    const void *data = ((void*)(_PyASCIIObject_CAST(seq) + 1));
    Py_UCS1 ascii_char = (Py_UCS1)PyUnicode_READ(PyUnicode_1BYTE_KIND,
                                                 data, it->it_index);
    it->it_index++;
    return (PyObject*)&_Py_SINGLETON(strings).ascii[ascii_char];
}
14612
14613
/* __length_hint__: number of characters left to iterate, or 0 once the
   iterator no longer references a string. */
static PyObject *
unicodeiter_len(PyObject *op, PyObject *Py_UNUSED(ignored))
{
    unicodeiterobject *it = (unicodeiterobject *)op;
    Py_ssize_t remaining = (it->it_seq != NULL)
        ? PyUnicode_GET_LENGTH(it->it_seq) - it->it_index
        : 0;
    return PyLong_FromSsize_t(remaining);
}
14622
14623
/* Docstring for the "__length_hint__" entry of unicodeiter_methods. */
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14624
14625
/* __reduce__: return state for pickling.
 *
 * A live iterator reduces to (iter, (seq,), index); one that no longer
 * references a string reduces to (iter, ('',)).
 */
static PyObject *
unicodeiter_reduce(PyObject *op, PyObject *Py_UNUSED(ignored))
{
    unicodeiterobject *it = (unicodeiterobject *)op;

    /* _PyEval_GetBuiltin can invoke arbitrary code, so it has to run
     * before any of the iterator's fields are read.
     * see issue #101765 */
    PyObject *iter = _PyEval_GetBuiltin(&_Py_ID(iter));

    if (it->it_seq == NULL) {
        PyObject *empty = _PyUnicode_GetEmpty();
        if (empty == NULL) {
            Py_XDECREF(iter);
            return NULL;
        }
        return Py_BuildValue("N(N)", iter, empty);
    }
    return Py_BuildValue("N(O)n", iter, it->it_seq, it->it_index);
}
14646
14647
/* Docstring for the "__reduce__" entry of unicodeiter_methods. */
PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14648
14649
/* __setstate__: restore the iteration index from `state` (an integer).
 *
 * The index is clamped to [0, len(seq)].  If the iterator no longer
 * references a string the value is ignored.  Returns None, or NULL if
 * `state` cannot be converted to Py_ssize_t.
 */
static PyObject *
unicodeiter_setstate(PyObject *op, PyObject *state)
{
    unicodeiterobject *it = (unicodeiterobject *)op;

    Py_ssize_t index = PyLong_AsSsize_t(state);
    if (index == -1 && PyErr_Occurred()) {
        return NULL;
    }

    if (it->it_seq != NULL) {
        Py_ssize_t length = PyUnicode_GET_LENGTH(it->it_seq);
        if (index < 0) {
            index = 0;
        }
        else if (index > length) {
            index = length; /* iterator truncated */
        }
        it->it_index = index;
    }
    Py_RETURN_NONE;
}
14665
14666
/* Docstring for the "__setstate__" entry of unicodeiter_methods. */
PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14667
14668
static PyMethodDef unicodeiter_methods[] = {
14669
    {"__length_hint__", unicodeiter_len, METH_NOARGS, length_hint_doc},
14670
    {"__reduce__",      unicodeiter_reduce, METH_NOARGS, reduce_doc},
14671
    {"__setstate__",    unicodeiter_setstate, METH_O, setstate_doc},
14672
    {NULL,      NULL}       /* sentinel */
14673
};
14674
14675
PyTypeObject PyUnicodeIter_Type = {
14676
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
14677
    "str_iterator",         /* tp_name */
14678
    sizeof(unicodeiterobject),      /* tp_basicsize */
14679
    0,                  /* tp_itemsize */
14680
    /* methods */
14681
    unicodeiter_dealloc,/* tp_dealloc */
14682
    0,                  /* tp_vectorcall_offset */
14683
    0,                  /* tp_getattr */
14684
    0,                  /* tp_setattr */
14685
    0,                  /* tp_as_async */
14686
    0,                  /* tp_repr */
14687
    0,                  /* tp_as_number */
14688
    0,                  /* tp_as_sequence */
14689
    0,                  /* tp_as_mapping */
14690
    0,                  /* tp_hash */
14691
    0,                  /* tp_call */
14692
    0,                  /* tp_str */
14693
    PyObject_GenericGetAttr,        /* tp_getattro */
14694
    0,                  /* tp_setattro */
14695
    0,                  /* tp_as_buffer */
14696
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14697
    0,                  /* tp_doc */
14698
    unicodeiter_traverse, /* tp_traverse */
14699
    0,                  /* tp_clear */
14700
    0,                  /* tp_richcompare */
14701
    0,                  /* tp_weaklistoffset */
14702
    PyObject_SelfIter,          /* tp_iter */
14703
    unicodeiter_next,   /* tp_iternext */
14704
    unicodeiter_methods,            /* tp_methods */
14705
    0,
14706
};
14707
14708
PyTypeObject _PyUnicodeASCIIIter_Type = {
14709
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
14710
    .tp_name = "str_ascii_iterator",
14711
    .tp_basicsize = sizeof(unicodeiterobject),
14712
    .tp_dealloc = unicodeiter_dealloc,
14713
    .tp_getattro = PyObject_GenericGetAttr,
14714
    .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,
14715
    .tp_traverse = unicodeiter_traverse,
14716
    .tp_iter = PyObject_SelfIter,
14717
    .tp_iternext = unicode_ascii_iter_next,
14718
    .tp_methods = unicodeiter_methods,
14719
};
14720
14721
/* Create a new iterator over the str object `seq`.

   Compact ASCII strings get a _PyUnicodeASCIIIter_Type instance, every
   other string gets a PyUnicodeIter_Type instance.  The iterator starts
   at index 0 and holds a new reference to `seq`.

   Returns NULL after PyErr_BadInternalCall() if `seq` is not a str, and
   NULL if the allocation fails. */
static PyObject *
unicode_iter(PyObject *seq)
{
    if (!PyUnicode_Check(seq)) {
        PyErr_BadInternalCall();
        return NULL;
    }

    PyTypeObject *iter_type;
    if (PyUnicode_IS_COMPACT_ASCII(seq)) {
        iter_type = &_PyUnicodeASCIIIter_Type;
    }
    else {
        iter_type = &PyUnicodeIter_Type;
    }

    unicodeiterobject *iter = PyObject_GC_New(unicodeiterobject, iter_type);
    if (iter == NULL) {
        return NULL;
    }

    /* Fully initialize the object before handing it to the GC. */
    iter->it_index = 0;
    iter->it_seq = Py_NewRef(seq);
    _PyObject_GC_TRACK(iter);

    return (PyObject *)iter;
}
14743
14744
static int
14745
encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
14746
76
{
14747
76
    int res;
14748
76
    res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
14749
76
    if (res == -2) {
14750
0
        PyErr_Format(PyExc_RuntimeError, "cannot encode %s", name);
14751
0
        return -1;
14752
0
    }
14753
76
    if (res < 0) {
14754
0
        PyErr_NoMemory();
14755
0
        return -1;
14756
0
    }
14757
76
    return 0;
14758
76
}
14759
14760
14761
static int
14762
config_get_codec_name(wchar_t **config_encoding)
14763
38
{
14764
38
    char *encoding;
14765
38
    if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
14766
0
        return -1;
14767
0
    }
14768
14769
38
    PyObject *name_obj = NULL;
14770
38
    PyObject *codec = _PyCodec_Lookup(encoding);
14771
38
    PyMem_RawFree(encoding);
14772
14773
38
    if (!codec)
14774
0
        goto error;
14775
14776
38
    name_obj = PyObject_GetAttrString(codec, "name");
14777
38
    Py_CLEAR(codec);
14778
38
    if (!name_obj) {
14779
0
        goto error;
14780
0
    }
14781
14782
38
    wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
14783
38
    Py_DECREF(name_obj);
14784
38
    if (wname == NULL) {
14785
0
        goto error;
14786
0
    }
14787
14788
38
    wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
14789
38
    if (raw_wname == NULL) {
14790
0
        PyMem_Free(wname);
14791
0
        PyErr_NoMemory();
14792
0
        goto error;
14793
0
    }
14794
14795
38
    PyMem_RawFree(*config_encoding);
14796
38
    *config_encoding = raw_wname;
14797
14798
38
    PyMem_Free(wname);
14799
38
    return 0;
14800
14801
0
error:
14802
0
    Py_XDECREF(codec);
14803
0
    Py_XDECREF(name_obj);
14804
0
    return -1;
14805
38
}
14806
14807
14808
static PyStatus
14809
init_stdio_encoding(PyInterpreterState *interp)
14810
19
{
14811
    /* Update the stdio encoding to the normalized Python codec name. */
14812
19
    PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
14813
19
    if (config_get_codec_name(&config->stdio_encoding) < 0) {
14814
0
        return _PyStatus_ERR("failed to get the Python codec name "
14815
0
                             "of the stdio encoding");
14816
0
    }
14817
19
    return _PyStatus_OK();
14818
19
}
14819
14820
14821
static int
14822
init_fs_codec(PyInterpreterState *interp)
14823
19
{
14824
19
    const PyConfig *config = _PyInterpreterState_GetConfig(interp);
14825
14826
19
    _Py_error_handler error_handler;
14827
19
    error_handler = get_error_handler_wide(config->filesystem_errors);
14828
19
    if (error_handler == _Py_ERROR_UNKNOWN) {
14829
0
        PyErr_SetString(PyExc_RuntimeError, "unknown filesystem error handler");
14830
0
        return -1;
14831
0
    }
14832
14833
19
    char *encoding, *errors;
14834
19
    if (encode_wstr_utf8(config->filesystem_encoding,
14835
19
                         &encoding,
14836
19
                         "filesystem_encoding") < 0) {
14837
0
        return -1;
14838
0
    }
14839
14840
19
    if (encode_wstr_utf8(config->filesystem_errors,
14841
19
                         &errors,
14842
19
                         "filesystem_errors") < 0) {
14843
0
        PyMem_RawFree(encoding);
14844
0
        return -1;
14845
0
    }
14846
14847
19
    struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
14848
19
    PyMem_RawFree(fs_codec->encoding);
14849
19
    fs_codec->encoding = encoding;
14850
    /* encoding has been normalized by init_fs_encoding() */
14851
19
    fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
14852
19
    PyMem_RawFree(fs_codec->errors);
14853
19
    fs_codec->errors = errors;
14854
19
    fs_codec->error_handler = error_handler;
14855
14856
#ifdef _Py_FORCE_UTF8_FS_ENCODING
14857
    assert(fs_codec->utf8 == 1);
14858
#endif
14859
14860
    /* At this point, PyUnicode_EncodeFSDefault() and
14861
       PyUnicode_DecodeFSDefault() can now use the Python codec rather than
14862
       the C implementation of the filesystem encoding. */
14863
14864
    /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
14865
       global configuration variables. */
14866
19
    if (_Py_IsMainInterpreter(interp)) {
14867
14868
19
        if (_Py_SetFileSystemEncoding(fs_codec->encoding,
14869
19
                                      fs_codec->errors) < 0) {
14870
0
            PyErr_NoMemory();
14871
0
            return -1;
14872
0
        }
14873
19
    }
14874
19
    return 0;
14875
19
}
14876
14877
14878
static PyStatus
14879
init_fs_encoding(PyThreadState *tstate)
14880
19
{
14881
19
    PyInterpreterState *interp = tstate->interp;
14882
14883
    /* Update the filesystem encoding to the normalized Python codec name.
14884
       For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
14885
       (Python codec name). */
14886
19
    PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
14887
19
    if (config_get_codec_name(&config->filesystem_encoding) < 0) {
14888
0
        _Py_DumpPathConfig(tstate);
14889
0
        return _PyStatus_ERR("failed to get the Python codec "
14890
0
                             "of the filesystem encoding");
14891
0
    }
14892
14893
19
    if (init_fs_codec(interp) < 0) {
14894
0
        return _PyStatus_ERR("cannot initialize filesystem codec");
14895
0
    }
14896
19
    return _PyStatus_OK();
14897
19
}
14898
14899
14900
PyStatus
14901
_PyUnicode_InitEncodings(PyThreadState *tstate)
14902
19
{
14903
19
    PyStatus status = _PyCodec_InitRegistry(tstate->interp);
14904
19
    if (_PyStatus_EXCEPTION(status)) {
14905
0
        return status;
14906
0
    }
14907
19
    status = init_fs_encoding(tstate);
14908
19
    if (_PyStatus_EXCEPTION(status)) {
14909
0
        return status;
14910
0
    }
14911
14912
19
    return init_stdio_encoding(tstate->interp);
14913
19
}
14914
14915
14916
static void
14917
_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
14918
0
{
14919
0
    PyMem_RawFree(fs_codec->encoding);
14920
0
    fs_codec->encoding = NULL;
14921
0
    fs_codec->utf8 = 0;
14922
0
    PyMem_RawFree(fs_codec->errors);
14923
0
    fs_codec->errors = NULL;
14924
0
    fs_codec->error_handler = _Py_ERROR_UNKNOWN;
14925
0
}
14926
14927
14928
#ifdef Py_DEBUG
/* Debug-build helper: returns 1 when the main interpreter no longer has an
   interned-strings dict (get_interned_dict() returns NULL), 0 otherwise. */
static inline int
unicode_is_finalizing(void)
{
    int no_interned_dict = (get_interned_dict(_PyInterpreterState_Main()) == NULL);
    return no_interned_dict;
}
#endif
14935
14936
14937
void
14938
_PyUnicode_FiniTypes(PyInterpreterState *interp)
14939
0
{
14940
0
    _PyStaticType_FiniBuiltin(interp, &EncodingMapType);
14941
0
    _PyStaticType_FiniBuiltin(interp, &PyFieldNameIter_Type);
14942
0
    _PyStaticType_FiniBuiltin(interp, &PyFormatterIter_Type);
14943
0
}
14944
14945
14946
void
14947
_PyUnicode_Fini(PyInterpreterState *interp)
14948
0
{
14949
0
    struct _Py_unicode_state *state = &interp->unicode;
14950
14951
0
    if (!has_shared_intern_dict(interp)) {
14952
        // _PyUnicode_ClearInterned() must be called before _PyUnicode_Fini()
14953
0
        assert(get_interned_dict(interp) == NULL);
14954
0
    }
14955
14956
0
    _PyUnicode_FiniEncodings(&state->fs_codec);
14957
14958
    // bpo-47182: force a unicodedata CAPI capsule re-import on
14959
    // subsequent initialization of interpreter.
14960
0
    interp->unicode.ucnhash_capi = NULL;
14961
14962
0
    unicode_clear_identifiers(state);
14963
0
}
14964
14965
/* A _string module, to export formatter_parser and formatter_field_name_split
14966
   to the string.Formatter class implemented in Python. */
14967
14968
static PyMethodDef _string_methods[] = {
14969
    {"formatter_field_name_split", formatter_field_name_split,
14970
     METH_O, PyDoc_STR("split the argument as a field name")},
14971
    {"formatter_parser", formatter_parser,
14972
     METH_O, PyDoc_STR("parse the argument as a format string")},
14973
    {NULL, NULL}
14974
};
14975
14976
static PyModuleDef_Slot module_slots[] = {
14977
    _Py_ABI_SLOT,
14978
    {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
14979
    {Py_mod_gil, Py_MOD_GIL_NOT_USED},
14980
    {0, NULL}
14981
};
14982
14983
static struct PyModuleDef _string_module = {
14984
    PyModuleDef_HEAD_INIT,
14985
    .m_name = "_string",
14986
    .m_doc = PyDoc_STR("string helper module"),
14987
    .m_size = 0,
14988
    .m_methods = _string_methods,
14989
    .m_slots = module_slots,
14990
};
14991
14992
PyMODINIT_FUNC
14993
PyInit__string(void)
14994
0
{
14995
0
    return PyModuleDef_Init(&_string_module);
14996
0
}
14997
14998
14999
#undef PyUnicode_KIND
15000
int PyUnicode_KIND(PyObject *op)
15001
0
{
15002
0
    if (!PyUnicode_Check(op)) {
15003
0
        PyErr_Format(PyExc_TypeError, "expect str, got %T", op);
15004
0
        return -1;
15005
0
    }
15006
0
    return _PyASCIIObject_CAST(op)->state.kind;
15007
0
}
15008
15009
#undef PyUnicode_DATA
15010
void* PyUnicode_DATA(PyObject *op)
15011
0
{
15012
0
    if (!PyUnicode_Check(op)) {
15013
0
        PyErr_Format(PyExc_TypeError, "expect str, got %T", op);
15014
0
        return NULL;
15015
0
    }
15016
0
    return _PyUnicode_DATA(op);
15017
0
}