Coverage Report

Created: 2025-11-09 06:26

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/cpython/Objects/unicodeobject.c
Line
Count
Source
1
/*
2
3
Unicode implementation based on original code by Fredrik Lundh,
4
modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6
Major speed upgrades to the method implementations at the Reykjavik
7
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9
Copyright (c) Corporation for National Research Initiatives.
10
11
--------------------------------------------------------------------
12
The original string type implementation is:
13
14
  Copyright (c) 1999 by Secret Labs AB
15
  Copyright (c) 1999 by Fredrik Lundh
16
17
By obtaining, using, and/or copying this software and/or its
18
associated documentation, you agree that you have read, understood,
19
and will comply with the following terms and conditions:
20
21
Permission to use, copy, modify, and distribute this software and its
22
associated documentation for any purpose and without fee is hereby
23
granted, provided that the above copyright notice appears in all
24
copies, and that both that copyright notice and this permission notice
25
appear in supporting documentation, and that the name of Secret Labs
26
AB or the author not be used in advertising or publicity pertaining to
27
distribution of the software without specific, written prior
28
permission.
29
30
SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32
FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37
--------------------------------------------------------------------
38
39
*/
40
41
#include "Python.h"
42
#include "pycore_abstract.h"      // _PyIndex_Check()
43
#include "pycore_bytes_methods.h" // _Py_bytes_lower()
44
#include "pycore_bytesobject.h"   // _PyBytes_Repeat()
45
#include "pycore_ceval.h"         // _PyEval_GetBuiltin()
46
#include "pycore_codecs.h"        // _PyCodec_Lookup()
47
#include "pycore_critical_section.h" // Py_*_CRITICAL_SECTION_SEQUENCE_FAST
48
#include "pycore_format.h"        // F_LJUST
49
#include "pycore_initconfig.h"    // _PyStatus_OK()
50
#include "pycore_interp.h"        // PyInterpreterState.fs_codec
51
#include "pycore_long.h"          // _PyLong_FormatWriter()
52
#include "pycore_object.h"        // _PyObject_GC_TRACK(), _Py_FatalRefcountError()
53
#include "pycore_pathconfig.h"    // _Py_DumpPathConfig()
54
#include "pycore_pyerrors.h"      // _PyUnicodeTranslateError_Create()
55
#include "pycore_pyhash.h"        // _Py_HashSecret_t
56
#include "pycore_pylifecycle.h"   // _Py_SetFileSystemEncoding()
57
#include "pycore_pystate.h"       // _PyInterpreterState_GET()
58
#include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
59
#include "pycore_unicodectype.h"  // _PyUnicode_IsXidStart
60
#include "pycore_unicodeobject.h" // struct _Py_unicode_state
61
#include "pycore_unicodeobject_generated.h"  // _PyUnicode_InitStaticStrings()
62
63
#include "stringlib/eq.h"         // unicode_eq()
64
#include <stddef.h>               // ptrdiff_t
65
66
#ifdef MS_WINDOWS
67
#include <windows.h>
68
#endif
69
70
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
71
#  include "pycore_fileutils.h"   // _Py_LocaleUsesNonUnicodeWchar()
72
#endif
73
74
/* Uncomment to display statistics on interned strings at exit
75
   in _PyUnicode_ClearInterned(). */
76
/* #define INTERNED_STATS 1 */
77
78
79
/*[clinic input]
80
class str "PyObject *" "&PyUnicode_Type"
81
[clinic start generated code]*/
82
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
83
84
/*[python input]
85
class Py_UCS4_converter(CConverter):
86
    type = 'Py_UCS4'
87
    converter = 'convert_uc'
88
89
    def converter_init(self):
90
        if self.default is not unspecified:
91
            self.c_default = ascii(self.default)
92
            if len(self.c_default) > 4 or self.c_default[0] != "'":
93
                self.c_default = hex(ord(self.default))
94
95
[python start generated code]*/
96
/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
97
98
/* --- Globals ------------------------------------------------------------
99
100
NOTE: In the interpreter's initialization phase, some globals are currently
101
      initialized dynamically as needed. In the process Unicode objects may
102
      be created before the Unicode type is ready.
103
104
*/
105
106
11.7M
#define MAX_UNICODE _Py_MAX_UNICODE
107
149M
#define ensure_unicode _PyUnicode_EnsureUnicode
108
109
/* _PyUnicode_CHECK(op): sanity check used inside assertions.
   Builds without Py_DEBUG only test the type; Py_DEBUG builds call
   _PyUnicode_CheckConsistency() with check_content == 0. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
114
115
/* Read the `utf8` pointer stored in the PyCompactUnicodeObject header of
   `op` through FT_ATOMIC_LOAD_PTR_ACQUIRE.  No validation of `op` is done
   here. */
static inline char *
_PyUnicode_UTF8(PyObject *op)
{
    PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
    return FT_ATOMIC_LOAD_PTR_ACQUIRE(compact->utf8);
}
119
120
/* Return the UTF-8 representation pointer of `op`.
   For a compact ASCII string this is the memory located directly after the
   PyASCIIObject header; otherwise it is whatever _PyUnicode_UTF8() reports. */
static inline char *
PyUnicode_UTF8(PyObject *op)
{
    assert(_PyUnicode_CHECK(op));
    if (!PyUnicode_IS_COMPACT_ASCII(op)) {
        return _PyUnicode_UTF8(op);
    }
    return (char *)(_PyASCIIObject_CAST(op) + 1);
}
130
131
/* Store `utf8` into the `utf8` field of the PyCompactUnicodeObject header
   of `op` through FT_ATOMIC_STORE_PTR_RELEASE. */
static inline void
PyUnicode_SET_UTF8(PyObject *op, char *utf8)
{
    PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
    FT_ATOMIC_STORE_PTR_RELEASE(compact->utf8, utf8);
}
135
136
/* Return the length recorded for the UTF-8 representation of `op`:
   the `length` field for a compact ASCII string, the `utf8_length` field
   of the compact header otherwise. */
static inline Py_ssize_t
PyUnicode_UTF8_LENGTH(PyObject *op)
{
    assert(_PyUnicode_CHECK(op));
    if (!PyUnicode_IS_COMPACT_ASCII(op)) {
        return _PyCompactUnicodeObject_CAST(op)->utf8_length;
    }
    return _PyASCIIObject_CAST(op)->length;
}
146
147
/* Record `length` in the `utf8_length` field of the compact header of `op`. */
static inline void
PyUnicode_SET_UTF8_LENGTH(PyObject *op, Py_ssize_t length)
{
    PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
    compact->utf8_length = length;
}
151
152
/* Plain lvalue access to fields of the PyASCIIObject header of `op`. */
#define _PyUnicode_LENGTH(op)   (_PyASCIIObject_CAST(op)->length)
#define _PyUnicode_STATE(op)    (_PyASCIIObject_CAST(op)->state)
#define _PyUnicode_HASH(op)     (_PyASCIIObject_CAST(op)->hash)
158
159
104M
#define PyUnicode_HASH PyUnstable_Unicode_GET_CACHED_HASH
160
161
/* Store `hash` into the `hash` field of the PyASCIIObject header of `op`
   through FT_ATOMIC_STORE_SSIZE_RELAXED. */
static inline void
PyUnicode_SET_HASH(PyObject *op, Py_hash_t hash)
{
    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
    FT_ATOMIC_STORE_SSIZE_RELAXED(ascii->hash, hash);
}
165
166
/* Plain lvalue access to the `data.any` pointer of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op) (_PyUnicodeObject_CAST(op)->data.any)
168
169
/* Return non-zero if the UTF-8 pointer of `op` is the very same address as
   its character data.  Must not be called on a compact ASCII string
   (asserted). */
static inline int
_PyUnicode_SHARE_UTF8(PyObject *op)
{
    assert(_PyUnicode_CHECK(op));
    assert(!PyUnicode_IS_COMPACT_ASCII(op));
    const char *utf8 = _PyUnicode_UTF8(op);
    const void *chars = PyUnicode_DATA(op);
    return utf8 == chars;
}
175
176
/* true if the Unicode object has an allocated UTF-8 memory block
177
   (not shared with other data) */
178
static inline int _PyUnicode_HAS_UTF8_MEMORY(PyObject *op)
179
542M
{
180
542M
    return (!PyUnicode_IS_COMPACT_ASCII(op)
181
176M
            && _PyUnicode_UTF8(op) != NULL
182
10.1M
            && _PyUnicode_UTF8(op) != PyUnicode_DATA(op));
183
542M
}
184
185
186
240M
#define LATIN1 _Py_LATIN1_CHR
187
188
/* Forward declaration */
189
static PyObject *
190
unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
191
                    const char *errors);
192
static PyObject *
193
unicode_decode_utf8(const char *s, Py_ssize_t size,
194
                    _Py_error_handler error_handler, const char *errors,
195
                    Py_ssize_t *consumed);
196
#ifdef Py_DEBUG
197
static inline int unicode_is_finalizing(void);
198
static int unicode_is_singleton(PyObject *unicode);
199
#endif
200
201
202
// Return a reference to the immortal empty string singleton.
203
PyObject*
204
_PyUnicode_GetEmpty(void)
205
119M
{
206
119M
    _Py_DECLARE_STR(empty, "");
207
119M
    return &_Py_STR(empty);
208
119M
}
209
210
/* This dictionary holds per-interpreter interned strings.
211
 * See InternalDocs/string_interning.md for details.
212
 */
213
static inline PyObject *get_interned_dict(PyInterpreterState *interp)
214
3.01M
{
215
3.01M
    return _Py_INTERP_CACHED_OBJECT(interp, interned_strings);
216
3.01M
}
217
218
/* This hashtable holds statically allocated interned strings.
219
 * See InternalDocs/string_interning.md for details.
220
 */
221
2.87M
#define INTERNED_STRINGS _PyRuntime.cached_objects.interned_strings
222
223
/* Get number of all interned strings for the current interpreter. */
224
Py_ssize_t
225
_PyUnicode_InternedSize(void)
226
0
{
227
0
    PyObject *dict = get_interned_dict(_PyInterpreterState_GET());
228
0
    return _Py_hashtable_len(INTERNED_STRINGS) + PyDict_GET_SIZE(dict);
229
0
}
230
231
/* Get number of immortal interned strings for the current interpreter. */
232
Py_ssize_t
233
_PyUnicode_InternedSize_Immortal(void)
234
0
{
235
0
    PyObject *dict = get_interned_dict(_PyInterpreterState_GET());
236
0
    PyObject *key, *value;
237
0
    Py_ssize_t pos = 0;
238
0
    Py_ssize_t count = 0;
239
240
    // It's tempting to keep a count and avoid a loop here. But, this function
241
    // is intended for refleak tests. It spends extra work to report the true
242
    // value, to help detect bugs in optimizations.
243
244
0
    while (PyDict_Next(dict, &pos, &key, &value)) {
245
0
        assert(PyUnicode_CHECK_INTERNED(key) != SSTATE_INTERNED_IMMORTAL_STATIC);
246
0
        if (PyUnicode_CHECK_INTERNED(key) == SSTATE_INTERNED_IMMORTAL) {
247
0
           count++;
248
0
       }
249
0
    }
250
0
    return _Py_hashtable_len(INTERNED_STRINGS) + count;
251
0
}
252
253
static Py_hash_t unicode_hash(PyObject *);
254
255
static Py_uhash_t
256
hashtable_unicode_hash(const void *key)
257
2.87M
{
258
2.87M
    return unicode_hash((PyObject *)key);
259
2.87M
}
260
261
static int
262
hashtable_unicode_compare(const void *key1, const void *key2)
263
289k
{
264
289k
    PyObject *obj1 = (PyObject *)key1;
265
289k
    PyObject *obj2 = (PyObject *)key2;
266
289k
    if (obj1 != NULL && obj2 != NULL) {
267
289k
        return unicode_eq(obj1, obj2);
268
289k
    }
269
0
    else {
270
0
        return obj1 == obj2;
271
0
    }
272
289k
}
273
274
/* Return true if this interpreter should share the main interpreter's
275
   intern_dict.  That's important for interpreters which load basic
276
   single-phase init extension modules (m_size == -1).  There could be interned
277
   immortal strings that are shared between interpreters, due to the
278
   PyDict_Update(mdict, m_copy) call in import_find_extension().
279
280
   It's not safe to deallocate those strings until all interpreters that
281
   potentially use them are freed.  By storing them in the main interpreter, we
282
   ensure they get freed after all other interpreters are freed.
283
*/
284
static bool
285
has_shared_intern_dict(PyInterpreterState *interp)
286
16
{
287
16
    PyInterpreterState *main_interp = _PyInterpreterState_Main();
288
16
    return interp != main_interp  && interp->feature_flags & Py_RTFLAGS_USE_MAIN_OBMALLOC;
289
16
}
290
291
static int
292
init_interned_dict(PyInterpreterState *interp)
293
16
{
294
16
    assert(get_interned_dict(interp) == NULL);
295
16
    PyObject *interned;
296
16
    if (has_shared_intern_dict(interp)) {
297
0
        interned = get_interned_dict(_PyInterpreterState_Main());
298
0
        Py_INCREF(interned);
299
0
    }
300
16
    else {
301
16
        interned = PyDict_New();
302
16
        if (interned == NULL) {
303
0
            return -1;
304
0
        }
305
16
    }
306
16
    _Py_INTERP_CACHED_OBJECT(interp, interned_strings) = interned;
307
16
    return 0;
308
16
}
309
310
static void
311
clear_interned_dict(PyInterpreterState *interp)
312
0
{
313
0
    PyObject *interned = get_interned_dict(interp);
314
0
    if (interned != NULL) {
315
0
        if (!has_shared_intern_dict(interp)) {
316
            // only clear if the dict belongs to this interpreter
317
0
            PyDict_Clear(interned);
318
0
        }
319
0
        Py_DECREF(interned);
320
0
        _Py_INTERP_CACHED_OBJECT(interp, interned_strings) = NULL;
321
0
    }
322
0
}
323
324
static PyStatus
325
init_global_interned_strings(PyInterpreterState *interp)
326
16
{
327
16
    assert(INTERNED_STRINGS == NULL);
328
16
    _Py_hashtable_allocator_t hashtable_alloc = {PyMem_RawMalloc, PyMem_RawFree};
329
330
16
    INTERNED_STRINGS = _Py_hashtable_new_full(
331
16
        hashtable_unicode_hash,
332
16
        hashtable_unicode_compare,
333
        // Objects stored here are immortal and statically allocated,
334
        // so we don't need key_destroy_func & value_destroy_func:
335
16
        NULL,
336
16
        NULL,
337
16
        &hashtable_alloc
338
16
    );
339
16
    if (INTERNED_STRINGS == NULL) {
340
0
        PyErr_Clear();
341
0
        return _PyStatus_ERR("failed to create global interned dict");
342
0
    }
343
344
    /* Intern statically allocated string identifiers, deepfreeze strings,
345
        * and one-byte latin-1 strings.
346
        * This must be done before any module initialization so that statically
347
        * allocated string identifiers are used instead of heap allocated strings.
348
        * Deepfreeze uses the interned identifiers if present to save space
349
        * else generates them and they are interned to speed up dict lookups.
350
    */
351
16
    _PyUnicode_InitStaticStrings(interp);
352
353
4.11k
    for (int i = 0; i < 256; i++) {
354
4.09k
        PyObject *s = LATIN1(i);
355
4.09k
        _PyUnicode_InternStatic(interp, &s);
356
4.09k
        assert(s == LATIN1(i));
357
4.09k
    }
358
#ifdef Py_DEBUG
359
    assert(_PyUnicode_CheckConsistency(&_Py_STR(empty), 1));
360
361
    for (int i = 0; i < 256; i++) {
362
        assert(_PyUnicode_CheckConsistency(LATIN1(i), 1));
363
    }
364
#endif
365
16
    return _PyStatus_OK();
366
16
}
367
368
static void clear_global_interned_strings(void)
369
0
{
370
0
    if (INTERNED_STRINGS != NULL) {
371
0
        _Py_hashtable_destroy(INTERNED_STRINGS);
372
0
        INTERNED_STRINGS = NULL;
373
0
    }
374
0
}
375
376
/* Statement macro: return the empty string singleton from the enclosing
   function. */
#define _Py_RETURN_UNICODE_EMPTY() \
    do { return _PyUnicode_GetEmpty(); } while (0)
380
381
382
/* Fast detection of the most frequent whitespace characters: a 128-entry
   table indexed by code point, holding 1 for whitespace and 0 otherwise.
   Entries that are not listed are implicitly 0. */
const unsigned char _Py_ascii_whitespace[] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1,     /* SPACE */
    [0x7F] = 0      /* highest index: keeps the table at 128 entries */
};
412
413
/* forward */
414
static PyObject* get_latin1_char(unsigned char ch);
415
416
417
static PyObject *
418
_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
419
static PyObject *
420
_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
421
static PyObject *
422
_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
423
424
static PyObject *
425
unicode_encode_call_errorhandler(const char *errors,
426
       PyObject **errorHandler,const char *encoding, const char *reason,
427
       PyObject *unicode, PyObject **exceptionObject,
428
       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
429
430
static void
431
raise_encode_exception(PyObject **exceptionObject,
432
                       const char *encoding,
433
                       PyObject *unicode,
434
                       Py_ssize_t startpos, Py_ssize_t endpos,
435
                       const char *reason);
436
437
/* Fast detection of line break characters: a 128-entry table indexed by
   code point, holding 1 for a line break and 0 otherwise.
   Entries that are not listed are implicitly 0. */
static const unsigned char ascii_linebreak[] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x7F] = 0      /* highest index: keeps the table at 128 entries */
};
464
465
static int convert_uc(PyObject *obj, void *addr);
466
467
struct encoding_map;
468
#include "clinic/unicodeobject.c.h"
469
470
_Py_error_handler
471
_Py_GetErrorHandler(const char *errors)
472
618k
{
473
618k
    if (errors == NULL || strcmp(errors, "strict") == 0) {
474
234k
        return _Py_ERROR_STRICT;
475
234k
    }
476
384k
    if (strcmp(errors, "surrogateescape") == 0) {
477
188k
        return _Py_ERROR_SURROGATEESCAPE;
478
188k
    }
479
195k
    if (strcmp(errors, "replace") == 0) {
480
195k
        return _Py_ERROR_REPLACE;
481
195k
    }
482
0
    if (strcmp(errors, "ignore") == 0) {
483
0
        return _Py_ERROR_IGNORE;
484
0
    }
485
0
    if (strcmp(errors, "backslashreplace") == 0) {
486
0
        return _Py_ERROR_BACKSLASHREPLACE;
487
0
    }
488
0
    if (strcmp(errors, "surrogatepass") == 0) {
489
0
        return _Py_ERROR_SURROGATEPASS;
490
0
    }
491
0
    if (strcmp(errors, "xmlcharrefreplace") == 0) {
492
0
        return _Py_ERROR_XMLCHARREFREPLACE;
493
0
    }
494
0
    return _Py_ERROR_OTHER;
495
0
}
496
497
498
static _Py_error_handler
499
get_error_handler_wide(const wchar_t *errors)
500
5.57k
{
501
5.57k
    if (errors == NULL || wcscmp(errors, L"strict") == 0) {
502
0
        return _Py_ERROR_STRICT;
503
0
    }
504
5.57k
    if (wcscmp(errors, L"surrogateescape") == 0) {
505
5.57k
        return _Py_ERROR_SURROGATEESCAPE;
506
5.57k
    }
507
0
    if (wcscmp(errors, L"replace") == 0) {
508
0
        return _Py_ERROR_REPLACE;
509
0
    }
510
0
    if (wcscmp(errors, L"ignore") == 0) {
511
0
        return _Py_ERROR_IGNORE;
512
0
    }
513
0
    if (wcscmp(errors, L"backslashreplace") == 0) {
514
0
        return _Py_ERROR_BACKSLASHREPLACE;
515
0
    }
516
0
    if (wcscmp(errors, L"surrogatepass") == 0) {
517
0
        return _Py_ERROR_SURROGATEPASS;
518
0
    }
519
0
    if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
520
0
        return _Py_ERROR_XMLCHARREFREPLACE;
521
0
    }
522
0
    return _Py_ERROR_OTHER;
523
0
}
524
525
526
static inline int
527
unicode_check_encoding_errors(const char *encoding, const char *errors)
528
24.7M
{
529
24.7M
    if (encoding == NULL && errors == NULL) {
530
13.6M
        return 0;
531
13.6M
    }
532
533
11.0M
    PyInterpreterState *interp = _PyInterpreterState_GET();
534
11.0M
#ifndef Py_DEBUG
535
    /* In release mode, only check in development mode (-X dev) */
536
11.0M
    if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
537
11.0M
        return 0;
538
11.0M
    }
539
#else
540
    /* Always check in debug mode */
541
#endif
542
543
    /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
544
       codec registry is ready: before_PyUnicode_InitEncodings() is called. */
545
0
    if (!interp->unicode.fs_codec.encoding) {
546
0
        return 0;
547
0
    }
548
549
    /* Disable checks during Python finalization. For example, it allows to
550
       call _PyObject_Dump() during finalization for debugging purpose. */
551
0
    if (_PyInterpreterState_GetFinalizing(interp) != NULL) {
552
0
        return 0;
553
0
    }
554
555
0
    if (encoding != NULL
556
        // Fast path for the most common built-in encodings. Even if the codec
557
        // is cached, _PyCodec_Lookup() decodes the bytes string from UTF-8 to
558
        // create a temporary Unicode string (the key in the cache).
559
0
        && strcmp(encoding, "utf-8") != 0
560
0
        && strcmp(encoding, "utf8") != 0
561
0
        && strcmp(encoding, "ascii") != 0)
562
0
    {
563
0
        PyObject *handler = _PyCodec_Lookup(encoding);
564
0
        if (handler == NULL) {
565
0
            return -1;
566
0
        }
567
0
        Py_DECREF(handler);
568
0
    }
569
570
0
    if (errors != NULL
571
        // Fast path for the most common built-in error handlers.
572
0
        && strcmp(errors, "strict") != 0
573
0
        && strcmp(errors, "ignore") != 0
574
0
        && strcmp(errors, "replace") != 0
575
0
        && strcmp(errors, "surrogateescape") != 0
576
0
        && strcmp(errors, "surrogatepass") != 0)
577
0
    {
578
0
        PyObject *handler = PyCodec_LookupError(errors);
579
0
        if (handler == NULL) {
580
0
            return -1;
581
0
        }
582
0
        Py_DECREF(handler);
583
0
    }
584
0
    return 0;
585
0
}
586
587
588
int
589
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
590
0
{
591
0
#define CHECK(expr) \
592
0
    do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
593
594
0
    assert(op != NULL);
595
0
    CHECK(PyUnicode_Check(op));
596
597
0
    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
598
0
    int kind = ascii->state.kind;
599
600
0
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
601
0
        CHECK(kind == PyUnicode_1BYTE_KIND);
602
0
    }
603
0
    else {
604
0
        PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
605
0
        void *data;
606
607
0
        if (ascii->state.compact == 1) {
608
0
            data = compact + 1;
609
0
            CHECK(kind == PyUnicode_1BYTE_KIND
610
0
                                 || kind == PyUnicode_2BYTE_KIND
611
0
                                 || kind == PyUnicode_4BYTE_KIND);
612
0
            CHECK(ascii->state.ascii == 0);
613
0
            CHECK(_PyUnicode_UTF8(op) != data);
614
0
        }
615
0
        else {
616
0
            PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
617
618
0
            data = unicode->data.any;
619
0
            CHECK(kind == PyUnicode_1BYTE_KIND
620
0
                     || kind == PyUnicode_2BYTE_KIND
621
0
                     || kind == PyUnicode_4BYTE_KIND);
622
0
            CHECK(ascii->state.compact == 0);
623
0
            CHECK(data != NULL);
624
0
            if (ascii->state.ascii) {
625
0
                CHECK(_PyUnicode_UTF8(op) == data);
626
0
                CHECK(compact->utf8_length == ascii->length);
627
0
            }
628
0
            else {
629
0
                CHECK(_PyUnicode_UTF8(op) != data);
630
0
            }
631
0
        }
632
0
#ifndef Py_GIL_DISABLED
633
0
        if (_PyUnicode_UTF8(op) == NULL)
634
0
            CHECK(compact->utf8_length == 0);
635
0
#endif
636
0
    }
637
638
    /* check that the best kind is used: O(n) operation */
639
0
    if (check_content) {
640
0
        Py_ssize_t i;
641
0
        Py_UCS4 maxchar = 0;
642
0
        const void *data;
643
0
        Py_UCS4 ch;
644
645
0
        data = PyUnicode_DATA(ascii);
646
0
        for (i=0; i < ascii->length; i++)
647
0
        {
648
0
            ch = PyUnicode_READ(kind, data, i);
649
0
            if (ch > maxchar)
650
0
                maxchar = ch;
651
0
        }
652
0
        if (kind == PyUnicode_1BYTE_KIND) {
653
0
            if (ascii->state.ascii == 0) {
654
0
                CHECK(maxchar >= 128);
655
0
                CHECK(maxchar <= 255);
656
0
            }
657
0
            else
658
0
                CHECK(maxchar < 128);
659
0
        }
660
0
        else if (kind == PyUnicode_2BYTE_KIND) {
661
0
            CHECK(maxchar >= 0x100);
662
0
            CHECK(maxchar <= 0xFFFF);
663
0
        }
664
0
        else {
665
0
            CHECK(maxchar >= 0x10000);
666
0
            CHECK(maxchar <= MAX_UNICODE);
667
0
        }
668
0
        CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
669
0
    }
670
671
    /* Check interning state */
672
#ifdef Py_DEBUG
673
    // Note that we do not check `_Py_IsImmortal(op)`, since stable ABI
674
    // extensions can make immortal strings mortal (but with a high enough
675
    // refcount).
676
    // The other way is extremely unlikely (worth a potential failed assertion
677
    // in a debug build), so we do check `!_Py_IsImmortal(op)`.
678
    switch (PyUnicode_CHECK_INTERNED(op)) {
679
        case SSTATE_NOT_INTERNED:
680
            if (ascii->state.statically_allocated) {
681
                // This state is for two exceptions:
682
                // - strings are currently checked before they're interned
683
                // - the 256 one-latin1-character strings
684
                //   are static but use SSTATE_NOT_INTERNED
685
            }
686
            else {
687
                CHECK(!_Py_IsImmortal(op));
688
            }
689
            break;
690
        case SSTATE_INTERNED_MORTAL:
691
            CHECK(!ascii->state.statically_allocated);
692
            CHECK(!_Py_IsImmortal(op));
693
            break;
694
        case SSTATE_INTERNED_IMMORTAL:
695
            CHECK(!ascii->state.statically_allocated);
696
            break;
697
        case SSTATE_INTERNED_IMMORTAL_STATIC:
698
            CHECK(ascii->state.statically_allocated);
699
            break;
700
        default:
701
            Py_UNREACHABLE();
702
    }
703
#endif
704
705
0
    return 1;
706
707
0
#undef CHECK
708
0
}
709
710
/* Normalize a freshly built string before handing it to the caller.
   An empty string is replaced by the empty singleton and a one-character
   1-byte-kind string by the matching LATIN1() singleton; in both cases the
   reference to `unicode` is released unless it already is that singleton.
   Any other string is returned unchanged. */
PyObject*
_PyUnicode_Result(PyObject *unicode)
{
    assert(_PyUnicode_CHECK(unicode));

    PyObject *singleton = NULL;
    switch (PyUnicode_GET_LENGTH(unicode)) {
    case 0:
        singleton = _PyUnicode_GetEmpty();
        break;
    case 1:
        if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
            const Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
            Py_UCS1 ch = data[0];
            singleton = LATIN1(ch);
        }
        break;
    default:
        break;
    }

    if (singleton == NULL) {
        /* no singleton applies */
        assert(_PyUnicode_CheckConsistency(unicode, 1));
        return unicode;
    }
    if (unicode != singleton) {
        Py_DECREF(unicode);
    }
    return singleton;
}
740
536k
#define unicode_result _PyUnicode_Result
741
742
/* Return `unicode` as a result value: a new reference to the object itself
   when it is an exact str, otherwise (subtype) the result of
   _PyUnicode_Copy(), i.e. a genuine str with the same value. */
static PyObject*
unicode_result_unchanged(PyObject *unicode)
{
    if (!PyUnicode_CheckExact(unicode)) {
        return _PyUnicode_Copy(unicode);
    }
    return Py_NewRef(unicode);
}
752
753
/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
754
   ASCII, Latin1, UTF-8, etc. */
755
static char*
756
backslashreplace(PyBytesWriter *writer, char *str,
757
                 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
758
0
{
759
0
    Py_ssize_t size, i;
760
0
    Py_UCS4 ch;
761
0
    int kind;
762
0
    const void *data;
763
764
0
    kind = PyUnicode_KIND(unicode);
765
0
    data = PyUnicode_DATA(unicode);
766
767
0
    size = 0;
768
    /* determine replacement size */
769
0
    for (i = collstart; i < collend; ++i) {
770
0
        Py_ssize_t incr;
771
772
0
        ch = PyUnicode_READ(kind, data, i);
773
0
        if (ch < 0x100)
774
0
            incr = 2+2;
775
0
        else if (ch < 0x10000)
776
0
            incr = 2+4;
777
0
        else {
778
0
            assert(ch <= MAX_UNICODE);
779
0
            incr = 2+8;
780
0
        }
781
0
        if (size > PY_SSIZE_T_MAX - incr) {
782
0
            PyErr_SetString(PyExc_OverflowError,
783
0
                            "encoded result is too long for a Python string");
784
0
            return NULL;
785
0
        }
786
0
        size += incr;
787
0
    }
788
789
0
    str = PyBytesWriter_GrowAndUpdatePointer(writer, size, str);
790
0
    if (str == NULL) {
791
0
        return NULL;
792
0
    }
793
794
    /* generate replacement */
795
0
    for (i = collstart; i < collend; ++i) {
796
0
        ch = PyUnicode_READ(kind, data, i);
797
0
        *str++ = '\\';
798
0
        if (ch >= 0x00010000) {
799
0
            *str++ = 'U';
800
0
            *str++ = Py_hexdigits[(ch>>28)&0xf];
801
0
            *str++ = Py_hexdigits[(ch>>24)&0xf];
802
0
            *str++ = Py_hexdigits[(ch>>20)&0xf];
803
0
            *str++ = Py_hexdigits[(ch>>16)&0xf];
804
0
            *str++ = Py_hexdigits[(ch>>12)&0xf];
805
0
            *str++ = Py_hexdigits[(ch>>8)&0xf];
806
0
        }
807
0
        else if (ch >= 0x100) {
808
0
            *str++ = 'u';
809
0
            *str++ = Py_hexdigits[(ch>>12)&0xf];
810
0
            *str++ = Py_hexdigits[(ch>>8)&0xf];
811
0
        }
812
0
        else
813
0
            *str++ = 'x';
814
0
        *str++ = Py_hexdigits[(ch>>4)&0xf];
815
0
        *str++ = Py_hexdigits[ch&0xf];
816
0
    }
817
0
    return str;
818
0
}
819
820
/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
821
   ASCII, Latin1, UTF-8, etc. */
822
static char*
823
xmlcharrefreplace(PyBytesWriter *writer, char *str,
824
                  PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
825
0
{
826
0
    Py_ssize_t size, i;
827
0
    Py_UCS4 ch;
828
0
    int kind;
829
0
    const void *data;
830
831
0
    kind = PyUnicode_KIND(unicode);
832
0
    data = PyUnicode_DATA(unicode);
833
834
0
    size = 0;
835
    /* determine replacement size */
836
0
    for (i = collstart; i < collend; ++i) {
837
0
        Py_ssize_t incr;
838
839
0
        ch = PyUnicode_READ(kind, data, i);
840
0
        if (ch < 10)
841
0
            incr = 2+1+1;
842
0
        else if (ch < 100)
843
0
            incr = 2+2+1;
844
0
        else if (ch < 1000)
845
0
            incr = 2+3+1;
846
0
        else if (ch < 10000)
847
0
            incr = 2+4+1;
848
0
        else if (ch < 100000)
849
0
            incr = 2+5+1;
850
0
        else if (ch < 1000000)
851
0
            incr = 2+6+1;
852
0
        else {
853
0
            assert(ch <= MAX_UNICODE);
854
0
            incr = 2+7+1;
855
0
        }
856
0
        if (size > PY_SSIZE_T_MAX - incr) {
857
0
            PyErr_SetString(PyExc_OverflowError,
858
0
                            "encoded result is too long for a Python string");
859
0
            return NULL;
860
0
        }
861
0
        size += incr;
862
0
    }
863
864
0
    str = PyBytesWriter_GrowAndUpdatePointer(writer, size, str);
865
0
    if (str == NULL) {
866
0
        return NULL;
867
0
    }
868
869
    /* generate replacement */
870
0
    for (i = collstart; i < collend; ++i) {
871
0
        size = sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
872
0
        if (size < 0) {
873
0
            return NULL;
874
0
        }
875
0
        str += size;
876
0
    }
877
0
    return str;
878
0
}
879
880
/* --- Bloom Filters ----------------------------------------------------- */
881
882
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the least
   significant bits of each Unicode character (as many as are needed to
   index BLOOM_WIDTH bits) as the bit index. */
885
886
/* the linebreak mask is set up by _PyUnicode_Init() below */
887
888
#if LONG_BIT >= 128
889
#define BLOOM_WIDTH 128
890
#elif LONG_BIT >= 64
891
51.2M
#define BLOOM_WIDTH 64
892
#elif LONG_BIT >= 32
893
#define BLOOM_WIDTH 32
894
#else
895
#error "LONG_BIT is smaller than 32"
896
#endif
897
898
20.4M
#define BLOOM_MASK unsigned long
899
900
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
901
902
72.8M
/* Non-zero if the bit selected by the low bits of `ch` is set in `mask`.
   Both arguments are parenthesized so that compound expressions passed as
   `mask` (e.g. `a | b`) are combined before the bit test. */
#define BLOOM(mask, ch)     (((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
903
904
#define BLOOM_LINEBREAK(ch)                                             \
905
276M
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
906
276M
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
907
908
static inline BLOOM_MASK
909
make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
910
10.2M
{
911
10.2M
#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN)             \
912
10.2M
    do {                                               \
913
10.2M
        TYPE *data = (TYPE *)PTR;                      \
914
10.2M
        TYPE *end = data + LEN;                        \
915
10.2M
        Py_UCS4 ch;                                    \
916
22.3M
        for (; data != end; data++) {                  \
917
12.1M
            ch = *data;                                \
918
12.1M
            MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
919
12.1M
        }                                              \
920
10.2M
        break;                                         \
921
10.2M
    } while (0)
922
923
    /* calculate simple bloom-style bitmask for a given unicode string */
924
925
10.2M
    BLOOM_MASK mask;
926
927
10.2M
    mask = 0;
928
10.2M
    switch (kind) {
929
10.2M
    case PyUnicode_1BYTE_KIND:
930
10.2M
        BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
931
10.2M
        break;
932
16
    case PyUnicode_2BYTE_KIND:
933
16
        BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
934
16
        break;
935
0
    case PyUnicode_4BYTE_KIND:
936
0
        BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
937
0
        break;
938
0
    default:
939
0
        Py_UNREACHABLE();
940
10.2M
    }
941
10.2M
    return mask;
942
943
10.2M
#undef BLOOM_UPDATE
944
10.2M
}
945
946
/* Compilation of templated routines */
947
948
1.68M
#define STRINGLIB_GET_EMPTY() _PyUnicode_GetEmpty()
949
950
#include "stringlib/asciilib.h"
951
#include "stringlib/fastsearch.h"
952
#include "stringlib/partition.h"
953
#include "stringlib/split.h"
954
#include "stringlib/count.h"
955
#include "stringlib/find.h"
956
#include "stringlib/find_max_char.h"
957
#include "stringlib/undef.h"
958
959
#include "stringlib/ucs1lib.h"
960
#include "stringlib/fastsearch.h"
961
#include "stringlib/partition.h"
962
#include "stringlib/split.h"
963
#include "stringlib/count.h"
964
#include "stringlib/find.h"
965
#include "stringlib/replace.h"
966
#include "stringlib/repr.h"
967
#include "stringlib/find_max_char.h"
968
#include "stringlib/undef.h"
969
970
#include "stringlib/ucs2lib.h"
971
#include "stringlib/fastsearch.h"
972
#include "stringlib/partition.h"
973
#include "stringlib/split.h"
974
#include "stringlib/count.h"
975
#include "stringlib/find.h"
976
#include "stringlib/replace.h"
977
#include "stringlib/repr.h"
978
#include "stringlib/find_max_char.h"
979
#include "stringlib/undef.h"
980
981
#include "stringlib/ucs4lib.h"
982
#include "stringlib/fastsearch.h"
983
#include "stringlib/partition.h"
984
#include "stringlib/split.h"
985
#include "stringlib/count.h"
986
#include "stringlib/find.h"
987
#include "stringlib/replace.h"
988
#include "stringlib/repr.h"
989
#include "stringlib/find_max_char.h"
990
#include "stringlib/undef.h"
991
992
#undef STRINGLIB_GET_EMPTY
993
994
/* --- Unicode Object ----------------------------------------------------- */
995
996
static inline Py_ssize_t
997
findchar(const void *s, int kind,
998
         Py_ssize_t size, Py_UCS4 ch,
999
         int direction)
1000
117M
{
1001
117M
    switch (kind) {
1002
104M
    case PyUnicode_1BYTE_KIND:
1003
104M
        if ((Py_UCS1) ch != ch)
1004
3.69k
            return -1;
1005
104M
        if (direction > 0)
1006
104M
            return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
1007
7.83k
        else
1008
7.83k
            return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
1009
11.3M
    case PyUnicode_2BYTE_KIND:
1010
11.3M
        if ((Py_UCS2) ch != ch)
1011
0
            return -1;
1012
11.3M
        if (direction > 0)
1013
11.3M
            return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
1014
25.5k
        else
1015
25.5k
            return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
1016
1.64M
    case PyUnicode_4BYTE_KIND:
1017
1.64M
        if (direction > 0)
1018
1.53M
            return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
1019
110k
        else
1020
110k
            return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
1021
0
    default:
1022
0
        Py_UNREACHABLE();
1023
117M
    }
1024
117M
}
1025
1026
#ifdef Py_DEBUG
/* Debug-build helper: overwrite the characters in [old_length, length) of
   `unicode` with 0xff bytes so that reads of not-yet-initialized data are
   noticed early.

   _PyUnicode_CheckConsistency(str, 1) detects such invalid characters, at
   least for ASCII and UCS-4 strings: U+00FF is invalid in ASCII and
   U+FFFFFFFF is an invalid character in Unicode 6.0.

   Nothing is written when the current length does not exceed old_length. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    const int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    const Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        /* offsets are in characters; scale by the character width */
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
1044
1045
static PyObject*
1046
resize_copy(PyObject *unicode, Py_ssize_t length)
1047
0
{
1048
0
    Py_ssize_t copy_length;
1049
0
    PyObject *copy;
1050
1051
0
    copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1052
0
    if (copy == NULL)
1053
0
        return NULL;
1054
1055
0
    copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
1056
0
    _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
1057
0
    return copy;
1058
0
}
1059
1060
PyObject*
1061
_PyUnicode_ResizeCompact(PyObject *unicode, Py_ssize_t length)
1062
57.6M
{
1063
57.6M
    Py_ssize_t char_size;
1064
57.6M
    Py_ssize_t struct_size;
1065
57.6M
    Py_ssize_t new_size;
1066
57.6M
    PyObject *new_unicode;
1067
#ifdef Py_DEBUG
1068
    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1069
#endif
1070
1071
57.6M
    if (!_PyUnicode_IsModifiable(unicode)) {
1072
0
        PyObject *copy = resize_copy(unicode, length);
1073
0
        if (copy == NULL) {
1074
0
            return NULL;
1075
0
        }
1076
0
        Py_DECREF(unicode);
1077
0
        return copy;
1078
0
    }
1079
57.6M
    assert(PyUnicode_IS_COMPACT(unicode));
1080
1081
57.6M
    char_size = PyUnicode_KIND(unicode);
1082
57.6M
    if (PyUnicode_IS_ASCII(unicode))
1083
48.3M
        struct_size = sizeof(PyASCIIObject);
1084
9.35M
    else
1085
9.35M
        struct_size = sizeof(PyCompactUnicodeObject);
1086
1087
57.6M
    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1088
0
        PyErr_NoMemory();
1089
0
        return NULL;
1090
0
    }
1091
57.6M
    new_size = (struct_size + (length + 1) * char_size);
1092
1093
57.6M
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1094
0
        PyMem_Free(_PyUnicode_UTF8(unicode));
1095
0
        PyUnicode_SET_UTF8_LENGTH(unicode, 0);
1096
0
        PyUnicode_SET_UTF8(unicode, NULL);
1097
0
    }
1098
#ifdef Py_TRACE_REFS
1099
    _Py_ForgetReference(unicode);
1100
#endif
1101
57.6M
    _PyReftracerTrack(unicode, PyRefTracer_DESTROY);
1102
1103
57.6M
    new_unicode = (PyObject *)PyObject_Realloc(unicode, new_size);
1104
57.6M
    if (new_unicode == NULL) {
1105
0
        _Py_NewReferenceNoTotal(unicode);
1106
0
        PyErr_NoMemory();
1107
0
        return NULL;
1108
0
    }
1109
57.6M
    unicode = new_unicode;
1110
57.6M
    _Py_NewReferenceNoTotal(unicode);
1111
1112
57.6M
    _PyUnicode_LENGTH(unicode) = length;
1113
#ifdef Py_DEBUG
1114
    unicode_fill_invalid(unicode, old_length);
1115
#endif
1116
57.6M
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1117
57.6M
                    length, 0);
1118
57.6M
    assert(_PyUnicode_CheckConsistency(unicode, 0));
1119
57.6M
    return unicode;
1120
57.6M
}
1121
1122
static int
1123
resize_inplace(PyObject *unicode, Py_ssize_t length)
1124
0
{
1125
0
    assert(!PyUnicode_IS_COMPACT(unicode));
1126
0
    assert(Py_REFCNT(unicode) == 1);
1127
1128
0
    Py_ssize_t new_size;
1129
0
    Py_ssize_t char_size;
1130
0
    int share_utf8;
1131
0
    void *data;
1132
#ifdef Py_DEBUG
1133
    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1134
#endif
1135
1136
0
    data = _PyUnicode_DATA_ANY(unicode);
1137
0
    char_size = PyUnicode_KIND(unicode);
1138
0
    share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
1139
1140
0
    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1141
0
        PyErr_NoMemory();
1142
0
        return -1;
1143
0
    }
1144
0
    new_size = (length + 1) * char_size;
1145
1146
0
    if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1147
0
    {
1148
0
        PyMem_Free(_PyUnicode_UTF8(unicode));
1149
0
        PyUnicode_SET_UTF8_LENGTH(unicode, 0);
1150
0
        PyUnicode_SET_UTF8(unicode, NULL);
1151
0
    }
1152
1153
0
    data = (PyObject *)PyObject_Realloc(data, new_size);
1154
0
    if (data == NULL) {
1155
0
        PyErr_NoMemory();
1156
0
        return -1;
1157
0
    }
1158
0
    _PyUnicode_DATA_ANY(unicode) = data;
1159
0
    if (share_utf8) {
1160
0
        PyUnicode_SET_UTF8_LENGTH(unicode, length);
1161
0
        PyUnicode_SET_UTF8(unicode, data);
1162
0
    }
1163
0
    _PyUnicode_LENGTH(unicode) = length;
1164
0
    PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
1165
#ifdef Py_DEBUG
1166
    unicode_fill_invalid(unicode, old_length);
1167
#endif
1168
1169
    /* check for integer overflow */
1170
0
    if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
1171
0
        PyErr_NoMemory();
1172
0
        return -1;
1173
0
    }
1174
0
    assert(_PyUnicode_CheckConsistency(unicode, 0));
1175
0
    return 0;
1176
0
}
1177
1178
static const char*
1179
unicode_kind_name(PyObject *unicode)
1180
0
{
1181
    /* don't check consistency: unicode_kind_name() is called from
1182
       _PyUnicode_Dump() */
1183
0
    if (!PyUnicode_IS_COMPACT(unicode))
1184
0
    {
1185
0
        switch (PyUnicode_KIND(unicode))
1186
0
        {
1187
0
        case PyUnicode_1BYTE_KIND:
1188
0
            if (PyUnicode_IS_ASCII(unicode))
1189
0
                return "legacy ascii";
1190
0
            else
1191
0
                return "legacy latin1";
1192
0
        case PyUnicode_2BYTE_KIND:
1193
0
            return "legacy UCS2";
1194
0
        case PyUnicode_4BYTE_KIND:
1195
0
            return "legacy UCS4";
1196
0
        default:
1197
0
            return "<legacy invalid kind>";
1198
0
        }
1199
0
    }
1200
0
    switch (PyUnicode_KIND(unicode)) {
1201
0
    case PyUnicode_1BYTE_KIND:
1202
0
        if (PyUnicode_IS_ASCII(unicode))
1203
0
            return "ascii";
1204
0
        else
1205
0
            return "latin1";
1206
0
    case PyUnicode_2BYTE_KIND:
1207
0
        return "UCS2";
1208
0
    case PyUnicode_4BYTE_KIND:
1209
0
        return "UCS4";
1210
0
    default:
1211
0
        return "<invalid compact kind>";
1212
0
    }
1213
0
}
1214
1215
#ifdef Py_DEBUG
1216
/* Functions wrapping macros for use in debugger */
1217
const char *_PyUnicode_utf8(void *unicode_raw){
1218
    PyObject *unicode = _PyObject_CAST(unicode_raw);
1219
    return PyUnicode_UTF8(unicode);
1220
}
1221
1222
const void *_PyUnicode_compact_data(void *unicode_raw) {
1223
    PyObject *unicode = _PyObject_CAST(unicode_raw);
1224
    return _PyUnicode_COMPACT_DATA(unicode);
1225
}
1226
const void *_PyUnicode_data(void *unicode_raw) {
1227
    PyObject *unicode = _PyObject_CAST(unicode_raw);
1228
    printf("obj %p\n", (void*)unicode);
1229
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1230
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1231
    printf("ascii op %p\n", (void*)(_PyASCIIObject_CAST(unicode) + 1));
1232
    printf("compact op %p\n", (void*)(_PyCompactUnicodeObject_CAST(unicode) + 1));
1233
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1234
    return PyUnicode_DATA(unicode);
1235
}
1236
1237
void
1238
_PyUnicode_Dump(PyObject *op)
1239
{
1240
    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
1241
    PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
1242
    PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
1243
    const void *data;
1244
1245
    if (ascii->state.compact)
1246
    {
1247
        if (ascii->state.ascii)
1248
            data = (ascii + 1);
1249
        else
1250
            data = (compact + 1);
1251
    }
1252
    else
1253
        data = unicode->data.any;
1254
    printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);
1255
1256
    if (!ascii->state.ascii) {
1257
        printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
1258
    }
1259
    printf(", data=%p\n", data);
1260
}
1261
#endif
1262
1263
1264
PyObject *
1265
PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1266
497M
{
1267
    /* Optimization for empty strings */
1268
497M
    if (size == 0) {
1269
23.6M
        return _PyUnicode_GetEmpty();
1270
23.6M
    }
1271
1272
473M
    PyObject *obj;
1273
473M
    PyCompactUnicodeObject *unicode;
1274
473M
    void *data;
1275
473M
    int kind;
1276
473M
    int is_ascii;
1277
473M
    Py_ssize_t char_size;
1278
473M
    Py_ssize_t struct_size;
1279
1280
473M
    is_ascii = 0;
1281
473M
    struct_size = sizeof(PyCompactUnicodeObject);
1282
473M
    if (maxchar < 128) {
1283
317M
        kind = PyUnicode_1BYTE_KIND;
1284
317M
        char_size = 1;
1285
317M
        is_ascii = 1;
1286
317M
        struct_size = sizeof(PyASCIIObject);
1287
317M
    }
1288
156M
    else if (maxchar < 256) {
1289
14.0M
        kind = PyUnicode_1BYTE_KIND;
1290
14.0M
        char_size = 1;
1291
14.0M
    }
1292
141M
    else if (maxchar < 65536) {
1293
135M
        kind = PyUnicode_2BYTE_KIND;
1294
135M
        char_size = 2;
1295
135M
    }
1296
6.31M
    else {
1297
6.31M
        if (maxchar > MAX_UNICODE) {
1298
0
            PyErr_SetString(PyExc_SystemError,
1299
0
                            "invalid maximum character passed to PyUnicode_New");
1300
0
            return NULL;
1301
0
        }
1302
6.31M
        kind = PyUnicode_4BYTE_KIND;
1303
6.31M
        char_size = 4;
1304
6.31M
    }
1305
1306
    /* Ensure we won't overflow the size. */
1307
473M
    if (size < 0) {
1308
0
        PyErr_SetString(PyExc_SystemError,
1309
0
                        "Negative size passed to PyUnicode_New");
1310
0
        return NULL;
1311
0
    }
1312
473M
    if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1313
0
        return PyErr_NoMemory();
1314
1315
    /* Duplicated allocation code from _PyObject_New() instead of a call to
1316
     * PyObject_New() so we are able to allocate space for the object and
1317
     * it's data buffer.
1318
     */
1319
473M
    obj = (PyObject *) PyObject_Malloc(struct_size + (size + 1) * char_size);
1320
473M
    if (obj == NULL) {
1321
0
        return PyErr_NoMemory();
1322
0
    }
1323
473M
    _PyObject_Init(obj, &PyUnicode_Type);
1324
1325
473M
    unicode = (PyCompactUnicodeObject *)obj;
1326
473M
    if (is_ascii)
1327
317M
        data = ((PyASCIIObject*)obj) + 1;
1328
156M
    else
1329
156M
        data = unicode + 1;
1330
473M
    _PyUnicode_LENGTH(unicode) = size;
1331
473M
    _PyUnicode_HASH(unicode) = -1;
1332
473M
    _PyUnicode_STATE(unicode).interned = 0;
1333
473M
    _PyUnicode_STATE(unicode).kind = kind;
1334
473M
    _PyUnicode_STATE(unicode).compact = 1;
1335
473M
    _PyUnicode_STATE(unicode).ascii = is_ascii;
1336
473M
    _PyUnicode_STATE(unicode).statically_allocated = 0;
1337
473M
    if (is_ascii) {
1338
317M
        ((char*)data)[size] = 0;
1339
317M
    }
1340
156M
    else if (kind == PyUnicode_1BYTE_KIND) {
1341
14.0M
        ((char*)data)[size] = 0;
1342
14.0M
        unicode->utf8 = NULL;
1343
14.0M
        unicode->utf8_length = 0;
1344
14.0M
    }
1345
141M
    else {
1346
141M
        unicode->utf8 = NULL;
1347
141M
        unicode->utf8_length = 0;
1348
141M
        if (kind == PyUnicode_2BYTE_KIND)
1349
135M
            ((Py_UCS2*)data)[size] = 0;
1350
6.31M
        else /* kind == PyUnicode_4BYTE_KIND */
1351
6.31M
            ((Py_UCS4*)data)[size] = 0;
1352
141M
    }
1353
#ifdef Py_DEBUG
1354
    unicode_fill_invalid((PyObject*)unicode, 0);
1355
#endif
1356
473M
    assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1357
473M
    return obj;
1358
473M
}
1359
1360
static int
1361
unicode_check_modifiable(PyObject *unicode)
1362
628
{
1363
628
    if (!_PyUnicode_IsModifiable(unicode)) {
1364
0
        PyErr_SetString(PyExc_SystemError,
1365
0
                        "Cannot modify a string currently used");
1366
0
        return -1;
1367
0
    }
1368
628
    return 0;
1369
628
}
1370
1371
static int
1372
_copy_characters(PyObject *to, Py_ssize_t to_start,
1373
                 PyObject *from, Py_ssize_t from_start,
1374
                 Py_ssize_t how_many, int check_maxchar)
1375
308M
{
1376
308M
    int from_kind, to_kind;
1377
308M
    const void *from_data;
1378
308M
    void *to_data;
1379
1380
308M
    assert(0 <= how_many);
1381
308M
    assert(0 <= from_start);
1382
308M
    assert(0 <= to_start);
1383
308M
    assert(PyUnicode_Check(from));
1384
308M
    assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1385
1386
308M
    assert(to == NULL || PyUnicode_Check(to));
1387
1388
308M
    if (how_many == 0) {
1389
280k
        return 0;
1390
280k
    }
1391
1392
308M
    assert(to != NULL);
1393
307M
    assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1394
1395
307M
    from_kind = PyUnicode_KIND(from);
1396
307M
    from_data = PyUnicode_DATA(from);
1397
307M
    to_kind = PyUnicode_KIND(to);
1398
307M
    to_data = PyUnicode_DATA(to);
1399
1400
#ifdef Py_DEBUG
1401
    if (!check_maxchar
1402
        && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1403
    {
1404
        Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1405
        Py_UCS4 ch;
1406
        Py_ssize_t i;
1407
        for (i=0; i < how_many; i++) {
1408
            ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1409
            assert(ch <= to_maxchar);
1410
        }
1411
    }
1412
#endif
1413
1414
307M
    if (from_kind == to_kind) {
1415
200M
        if (check_maxchar
1416
0
            && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1417
0
        {
1418
            /* Writing Latin-1 characters into an ASCII string requires to
1419
               check that all written characters are pure ASCII */
1420
0
            Py_UCS4 max_char;
1421
0
            max_char = ucs1lib_find_max_char(from_data,
1422
0
                                             (const Py_UCS1*)from_data + how_many);
1423
0
            if (max_char >= 128)
1424
0
                return -1;
1425
0
        }
1426
200M
        memcpy((char*)to_data + to_kind * to_start,
1427
200M
                  (const char*)from_data + from_kind * from_start,
1428
200M
                  to_kind * how_many);
1429
200M
    }
1430
107M
    else if (from_kind == PyUnicode_1BYTE_KIND
1431
105M
             && to_kind == PyUnicode_2BYTE_KIND)
1432
89.4M
    {
1433
89.4M
        _PyUnicode_CONVERT_BYTES(
1434
89.4M
            Py_UCS1, Py_UCS2,
1435
89.4M
            PyUnicode_1BYTE_DATA(from) + from_start,
1436
89.4M
            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1437
89.4M
            PyUnicode_2BYTE_DATA(to) + to_start
1438
89.4M
            );
1439
89.4M
    }
1440
18.2M
    else if (from_kind == PyUnicode_1BYTE_KIND
1441
16.0M
             && to_kind == PyUnicode_4BYTE_KIND)
1442
16.0M
    {
1443
16.0M
        _PyUnicode_CONVERT_BYTES(
1444
16.0M
            Py_UCS1, Py_UCS4,
1445
16.0M
            PyUnicode_1BYTE_DATA(from) + from_start,
1446
16.0M
            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1447
16.0M
            PyUnicode_4BYTE_DATA(to) + to_start
1448
16.0M
            );
1449
16.0M
    }
1450
2.22M
    else if (from_kind == PyUnicode_2BYTE_KIND
1451
2.19M
             && to_kind == PyUnicode_4BYTE_KIND)
1452
2.18M
    {
1453
2.18M
        _PyUnicode_CONVERT_BYTES(
1454
2.18M
            Py_UCS2, Py_UCS4,
1455
2.18M
            PyUnicode_2BYTE_DATA(from) + from_start,
1456
2.18M
            PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1457
2.18M
            PyUnicode_4BYTE_DATA(to) + to_start
1458
2.18M
            );
1459
2.18M
    }
1460
31.6k
    else {
1461
31.6k
        assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1462
1463
31.6k
        if (!check_maxchar) {
1464
31.6k
            if (from_kind == PyUnicode_2BYTE_KIND
1465
2.47k
                && to_kind == PyUnicode_1BYTE_KIND)
1466
2.47k
            {
1467
2.47k
                _PyUnicode_CONVERT_BYTES(
1468
2.47k
                    Py_UCS2, Py_UCS1,
1469
2.47k
                    PyUnicode_2BYTE_DATA(from) + from_start,
1470
2.47k
                    PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1471
2.47k
                    PyUnicode_1BYTE_DATA(to) + to_start
1472
2.47k
                    );
1473
2.47k
            }
1474
29.2k
            else if (from_kind == PyUnicode_4BYTE_KIND
1475
29.2k
                     && to_kind == PyUnicode_1BYTE_KIND)
1476
9.60k
            {
1477
9.60k
                _PyUnicode_CONVERT_BYTES(
1478
9.60k
                    Py_UCS4, Py_UCS1,
1479
9.60k
                    PyUnicode_4BYTE_DATA(from) + from_start,
1480
9.60k
                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1481
9.60k
                    PyUnicode_1BYTE_DATA(to) + to_start
1482
9.60k
                    );
1483
9.60k
            }
1484
19.6k
            else if (from_kind == PyUnicode_4BYTE_KIND
1485
19.6k
                     && to_kind == PyUnicode_2BYTE_KIND)
1486
19.6k
            {
1487
19.6k
                _PyUnicode_CONVERT_BYTES(
1488
19.6k
                    Py_UCS4, Py_UCS2,
1489
19.6k
                    PyUnicode_4BYTE_DATA(from) + from_start,
1490
19.6k
                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1491
19.6k
                    PyUnicode_2BYTE_DATA(to) + to_start
1492
19.6k
                    );
1493
19.6k
            }
1494
0
            else {
1495
0
                Py_UNREACHABLE();
1496
0
            }
1497
31.6k
        }
1498
0
        else {
1499
0
            const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1500
0
            Py_UCS4 ch;
1501
0
            Py_ssize_t i;
1502
1503
0
            for (i=0; i < how_many; i++) {
1504
0
                ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1505
0
                if (ch > to_maxchar)
1506
0
                    return -1;
1507
0
                PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1508
0
            }
1509
0
        }
1510
31.6k
    }
1511
307M
    return 0;
1512
307M
}
1513
1514
void
1515
_PyUnicode_FastCopyCharacters(
1516
    PyObject *to, Py_ssize_t to_start,
1517
    PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1518
308M
{
1519
308M
    (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1520
308M
}
1521
1522
Py_ssize_t
1523
PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1524
                         PyObject *from, Py_ssize_t from_start,
1525
                         Py_ssize_t how_many)
1526
0
{
1527
0
    int err;
1528
1529
0
    if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1530
0
        PyErr_BadInternalCall();
1531
0
        return -1;
1532
0
    }
1533
1534
0
    if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
1535
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
1536
0
        return -1;
1537
0
    }
1538
0
    if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
1539
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
1540
0
        return -1;
1541
0
    }
1542
0
    if (how_many < 0) {
1543
0
        PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1544
0
        return -1;
1545
0
    }
1546
0
    how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
1547
0
    if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1548
0
        PyErr_Format(PyExc_SystemError,
1549
0
                     "Cannot write %zi characters at %zi "
1550
0
                     "in a string of %zi characters",
1551
0
                     how_many, to_start, PyUnicode_GET_LENGTH(to));
1552
0
        return -1;
1553
0
    }
1554
1555
0
    if (how_many == 0)
1556
0
        return 0;
1557
1558
0
    if (unicode_check_modifiable(to))
1559
0
        return -1;
1560
1561
0
    err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1562
0
    if (err) {
1563
0
        PyErr_Format(PyExc_SystemError,
1564
0
                     "Cannot copy %s characters "
1565
0
                     "into a string of %s characters",
1566
0
                     unicode_kind_name(from),
1567
0
                     unicode_kind_name(to));
1568
0
        return -1;
1569
0
    }
1570
0
    return how_many;
1571
0
}
1572
1573
/* Find the maximum code point and count the number of surrogate pairs so a
1574
   correct string length can be computed before converting a string to UCS4.
1575
   This function counts single surrogates as a character and not as a pair.
1576
1577
   Return 0 on success, or -1 on error. */
1578
static int
1579
find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1580
                        Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1581
16.5k
{
1582
16.5k
    const wchar_t *iter;
1583
16.5k
    Py_UCS4 ch;
1584
1585
16.5k
    assert(num_surrogates != NULL && maxchar != NULL);
1586
16.5k
    *num_surrogates = 0;
1587
16.5k
    *maxchar = 0;
1588
1589
364k
    for (iter = begin; iter < end; ) {
1590
#if SIZEOF_WCHAR_T == 2
1591
        if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1592
            && (iter+1) < end
1593
            && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1594
        {
1595
            ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1596
            ++(*num_surrogates);
1597
            iter += 2;
1598
        }
1599
        else
1600
#endif
1601
348k
        {
1602
348k
            ch = *iter;
1603
348k
            iter++;
1604
348k
        }
1605
348k
        if (ch > *maxchar) {
1606
71.3k
            *maxchar = ch;
1607
71.3k
            if (*maxchar > MAX_UNICODE) {
1608
0
                PyErr_Format(PyExc_ValueError,
1609
0
                             "character U+%x is not in range [U+0000; U+%x]",
1610
0
                             ch, MAX_UNICODE);
1611
0
                return -1;
1612
0
            }
1613
71.3k
        }
1614
348k
    }
1615
16.5k
    return 0;
1616
16.5k
}
1617
1618
static void
1619
unicode_dealloc(PyObject *unicode)
1620
484M
{
1621
#ifdef Py_DEBUG
1622
    if (!unicode_is_finalizing() && unicode_is_singleton(unicode)) {
1623
        _Py_FatalRefcountError("deallocating an Unicode singleton");
1624
    }
1625
#endif
1626
484M
    if (_PyUnicode_STATE(unicode).statically_allocated) {
1627
        /* This should never get called, but we also don't want to SEGV if
1628
        * we accidentally decref an immortal string out of existence. Since
1629
        * the string is an immortal object, just re-set the reference count.
1630
        */
1631
#ifdef Py_DEBUG
1632
        Py_UNREACHABLE();
1633
#endif
1634
0
        _Py_SetImmortal(unicode);
1635
0
        return;
1636
0
    }
1637
484M
    switch (_PyUnicode_STATE(unicode).interned) {
1638
484M
        case SSTATE_NOT_INTERNED:
1639
484M
            break;
1640
460k
        case SSTATE_INTERNED_MORTAL:
1641
            /* Remove the object from the intern dict.
1642
             * Before doing so, we set the refcount to 2: the key and value
1643
             * in the interned_dict.
1644
             */
1645
460k
            assert(Py_REFCNT(unicode) == 0);
1646
460k
            Py_SET_REFCNT(unicode, 2);
1647
#ifdef Py_REF_DEBUG
1648
            /* let's be pedantic with the ref total */
1649
            _Py_IncRefTotal(_PyThreadState_GET());
1650
            _Py_IncRefTotal(_PyThreadState_GET());
1651
#endif
1652
460k
            PyInterpreterState *interp = _PyInterpreterState_GET();
1653
460k
            PyObject *interned = get_interned_dict(interp);
1654
460k
            assert(interned != NULL);
1655
460k
            PyObject *popped;
1656
460k
            int r = PyDict_Pop(interned, unicode, &popped);
1657
460k
            if (r == -1) {
1658
0
                PyErr_FormatUnraisable("Exception ignored while "
1659
0
                                       "removing an interned string %R",
1660
0
                                       unicode);
1661
                // We don't know what happened to the string. It's probably
1662
                // best to leak it:
1663
                // - if it was popped, there are no more references to it
1664
                //   so it can't cause trouble (except wasted memory)
1665
                // - if it wasn't popped, it'll remain interned
1666
0
                _Py_SetImmortal(unicode);
1667
0
                _PyUnicode_STATE(unicode).interned = SSTATE_INTERNED_IMMORTAL;
1668
0
                return;
1669
0
            }
1670
460k
            if (r == 0) {
1671
                // The interned string was not found in the interned_dict.
1672
#ifdef Py_DEBUG
1673
                Py_UNREACHABLE();
1674
#endif
1675
0
                _Py_SetImmortal(unicode);
1676
0
                return;
1677
0
            }
1678
            // Successfully popped.
1679
460k
            assert(popped == unicode);
1680
            // Only our `popped` reference should be left; remove it too.
1681
460k
            assert(Py_REFCNT(unicode) == 1);
1682
460k
            Py_SET_REFCNT(unicode, 0);
1683
#ifdef Py_REF_DEBUG
1684
            /* let's be pedantic with the ref total */
1685
            _Py_DecRefTotal(_PyThreadState_GET());
1686
#endif
1687
460k
            break;
1688
0
        default:
1689
            // As with `statically_allocated` above.
1690
#ifdef Py_REF_DEBUG
1691
            Py_UNREACHABLE();
1692
#endif
1693
0
            _Py_SetImmortal(unicode);
1694
0
            return;
1695
484M
    }
1696
484M
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1697
177k
        PyMem_Free(_PyUnicode_UTF8(unicode));
1698
177k
    }
1699
484M
    if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
1700
11.3M
        PyMem_Free(_PyUnicode_DATA_ANY(unicode));
1701
11.3M
    }
1702
1703
484M
    Py_TYPE(unicode)->tp_free(unicode);
1704
484M
}
1705
1706
#ifdef Py_DEBUG
/* Debug-build helper: return 1 if `unicode` is the empty-string singleton
   or the LATIN1() singleton of its only character (code point below 256);
   return 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    if (unicode == &_Py_STR(empty)) {
        return 1;
    }

    /* only one-character strings can be a Latin-1 singleton */
    if (_PyASCIIObject_CAST(unicode)->length != 1) {
        return 0;
    }

    Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && LATIN1(ch) == unicode) ? 1 : 0;
}
#endif
1724
1725
int
1726
_PyUnicode_IsModifiable(PyObject *unicode)
1727
59.4M
{
1728
59.4M
    assert(_PyUnicode_CHECK(unicode));
1729
59.4M
    if (!_PyObject_IsUniquelyReferenced(unicode))
1730
49.5k
        return 0;
1731
59.4M
    if (PyUnicode_HASH(unicode) != -1)
1732
0
        return 0;
1733
59.4M
    if (PyUnicode_CHECK_INTERNED(unicode))
1734
0
        return 0;
1735
59.4M
    if (!PyUnicode_CheckExact(unicode))
1736
0
        return 0;
1737
#ifdef Py_DEBUG
1738
    /* singleton refcount is greater than 1 */
1739
    assert(!unicode_is_singleton(unicode));
1740
#endif
1741
59.4M
    return 1;
1742
59.4M
}
1743
1744
static int
1745
unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1746
878k
{
1747
878k
    PyObject *unicode;
1748
878k
    Py_ssize_t old_length;
1749
1750
878k
    assert(p_unicode != NULL);
1751
878k
    unicode = *p_unicode;
1752
1753
878k
    assert(unicode != NULL);
1754
878k
    assert(PyUnicode_Check(unicode));
1755
878k
    assert(0 <= length);
1756
1757
878k
    old_length = PyUnicode_GET_LENGTH(unicode);
1758
878k
    if (old_length == length)
1759
0
        return 0;
1760
1761
878k
    if (length == 0) {
1762
0
        PyObject *empty = _PyUnicode_GetEmpty();
1763
0
        Py_SETREF(*p_unicode, empty);
1764
0
        return 0;
1765
0
    }
1766
1767
878k
    if (!_PyUnicode_IsModifiable(unicode)) {
1768
0
        PyObject *copy = resize_copy(unicode, length);
1769
0
        if (copy == NULL)
1770
0
            return -1;
1771
0
        Py_SETREF(*p_unicode, copy);
1772
0
        return 0;
1773
0
    }
1774
1775
878k
    if (PyUnicode_IS_COMPACT(unicode)) {
1776
878k
        PyObject *new_unicode = _PyUnicode_ResizeCompact(unicode, length);
1777
878k
        if (new_unicode == NULL)
1778
0
            return -1;
1779
878k
        *p_unicode = new_unicode;
1780
878k
        return 0;
1781
878k
    }
1782
0
    return resize_inplace(unicode, length);
1783
878k
}
1784
1785
int
1786
PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1787
0
{
1788
0
    PyObject *unicode;
1789
0
    if (p_unicode == NULL) {
1790
0
        PyErr_BadInternalCall();
1791
0
        return -1;
1792
0
    }
1793
0
    unicode = *p_unicode;
1794
0
    if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
1795
0
    {
1796
0
        PyErr_BadInternalCall();
1797
0
        return -1;
1798
0
    }
1799
0
    return unicode_resize(p_unicode, length);
1800
0
}
1801
1802
static PyObject*
1803
get_latin1_char(Py_UCS1 ch)
1804
240M
{
1805
240M
    PyObject *o = LATIN1(ch);
1806
240M
    return o;
1807
240M
}
1808
1809
static PyObject*
1810
unicode_char(Py_UCS4 ch)
1811
281M
{
1812
281M
    PyObject *unicode;
1813
1814
281M
    assert(ch <= MAX_UNICODE);
1815
1816
281M
    if (ch < 256) {
1817
193M
        return get_latin1_char(ch);
1818
193M
    }
1819
1820
88.4M
    unicode = PyUnicode_New(1, ch);
1821
88.4M
    if (unicode == NULL)
1822
0
        return NULL;
1823
1824
88.4M
    assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
1825
88.4M
    if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
1826
84.8M
        PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1827
84.8M
    } else {
1828
3.60M
        assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1829
3.60M
        PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1830
3.60M
    }
1831
88.4M
    assert(_PyUnicode_CheckConsistency(unicode, 1));
1832
88.4M
    return unicode;
1833
88.4M
}
1834
1835
1836
static inline void
1837
unicode_write_widechar(int kind, void *data,
1838
                       const wchar_t *u, Py_ssize_t size,
1839
                       Py_ssize_t num_surrogates)
1840
16.5k
{
1841
16.5k
    switch (kind) {
1842
16.5k
    case PyUnicode_1BYTE_KIND:
1843
16.5k
        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char, u, u + size, data);
1844
16.5k
        break;
1845
1846
0
    case PyUnicode_2BYTE_KIND:
1847
#if SIZEOF_WCHAR_T == 2
1848
        memcpy(data, u, size * 2);
1849
#else
1850
0
        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2, u, u + size, data);
1851
0
#endif
1852
0
        break;
1853
1854
0
    case PyUnicode_4BYTE_KIND:
1855
0
    {
1856
#if SIZEOF_WCHAR_T == 2
1857
        // Convert a 16-bits wchar_t representation to UCS4, this will decode
1858
        // surrogate pairs.
1859
        const wchar_t *end = u + size;
1860
        Py_UCS4 *ucs4_out = (Py_UCS4*)data;
1861
#  ifndef NDEBUG
1862
        Py_UCS4 *ucs4_end = (Py_UCS4*)data + (size - num_surrogates);
1863
#  endif
1864
        for (const wchar_t *iter = u; iter < end; ) {
1865
            assert(ucs4_out < ucs4_end);
1866
            if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1867
                && (iter+1) < end
1868
                && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1869
            {
1870
                *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1871
                iter += 2;
1872
            }
1873
            else {
1874
                *ucs4_out++ = *iter;
1875
                iter++;
1876
            }
1877
        }
1878
        assert(ucs4_out == ucs4_end);
1879
#else
1880
0
        assert(num_surrogates == 0);
1881
0
        memcpy(data, u, size * 4);
1882
0
#endif
1883
0
        break;
1884
0
    }
1885
0
    default:
1886
0
        Py_UNREACHABLE();
1887
16.5k
    }
1888
16.5k
}
1889
1890
1891
PyObject *
1892
PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
1893
16.5k
{
1894
16.5k
    PyObject *unicode;
1895
16.5k
    Py_UCS4 maxchar = 0;
1896
16.5k
    Py_ssize_t num_surrogates;
1897
1898
16.5k
    if (u == NULL && size != 0) {
1899
0
        PyErr_BadInternalCall();
1900
0
        return NULL;
1901
0
    }
1902
1903
16.5k
    if (size == -1) {
1904
576
        size = wcslen(u);
1905
576
    }
1906
1907
    /* If the Unicode data is known at construction time, we can apply
1908
       some optimizations which share commonly used objects. */
1909
1910
    /* Optimization for empty strings */
1911
16.5k
    if (size == 0)
1912
32
        _Py_RETURN_UNICODE_EMPTY();
1913
1914
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
1915
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
1916
       non-Unicode locales and hence needs conversion to UCS-4 first. */
1917
    if (_Py_LocaleUsesNonUnicodeWchar()) {
1918
        wchar_t* converted = _Py_DecodeNonUnicodeWchar(u, size);
1919
        if (!converted) {
1920
            return NULL;
1921
        }
1922
        PyObject *unicode = _PyUnicode_FromUCS4(converted, size);
1923
        PyMem_Free(converted);
1924
        return unicode;
1925
    }
1926
#endif
1927
1928
    /* Single character Unicode objects in the Latin-1 range are
1929
       shared when using this constructor */
1930
16.5k
    if (size == 1 && (Py_UCS4)*u < 256)
1931
0
        return get_latin1_char((unsigned char)*u);
1932
1933
    /* If not empty and not single character, copy the Unicode data
1934
       into the new object */
1935
16.5k
    if (find_maxchar_surrogates(u, u + size,
1936
16.5k
                                &maxchar, &num_surrogates) == -1)
1937
0
        return NULL;
1938
1939
16.5k
    unicode = PyUnicode_New(size - num_surrogates, maxchar);
1940
16.5k
    if (!unicode)
1941
0
        return NULL;
1942
1943
16.5k
    unicode_write_widechar(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1944
16.5k
                           u, size, num_surrogates);
1945
1946
16.5k
    return unicode_result(unicode);
1947
16.5k
}
1948
1949
1950
int
1951
PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *pub_writer,
1952
                              const wchar_t *str,
1953
                              Py_ssize_t size)
1954
0
{
1955
0
    _PyUnicodeWriter *writer = (_PyUnicodeWriter *)pub_writer;
1956
1957
0
    if (size < 0) {
1958
0
        size = wcslen(str);
1959
0
    }
1960
1961
0
    if (size == 0) {
1962
0
        return 0;
1963
0
    }
1964
1965
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
1966
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
1967
       non-Unicode locales and hence needs conversion to UCS-4 first. */
1968
    if (_Py_LocaleUsesNonUnicodeWchar()) {
1969
        wchar_t* converted = _Py_DecodeNonUnicodeWchar(str, size);
1970
        if (!converted) {
1971
            return -1;
1972
        }
1973
1974
        int res = PyUnicodeWriter_WriteUCS4(pub_writer, converted, size);
1975
        PyMem_Free(converted);
1976
        return res;
1977
    }
1978
#endif
1979
1980
0
    Py_UCS4 maxchar = 0;
1981
0
    Py_ssize_t num_surrogates;
1982
0
    if (find_maxchar_surrogates(str, str + size,
1983
0
                                &maxchar, &num_surrogates) == -1) {
1984
0
        return -1;
1985
0
    }
1986
1987
0
    if (_PyUnicodeWriter_Prepare(writer, size - num_surrogates, maxchar) < 0) {
1988
0
        return -1;
1989
0
    }
1990
1991
0
    int kind = writer->kind;
1992
0
    void *data = (Py_UCS1*)writer->data + writer->pos * kind;
1993
0
    unicode_write_widechar(kind, data, str, size, num_surrogates);
1994
1995
0
    writer->pos += size - num_surrogates;
1996
0
    return 0;
1997
0
}
1998
1999
2000
PyObject *
2001
PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
2002
673k
{
2003
673k
    if (size < 0) {
2004
0
        PyErr_SetString(PyExc_SystemError,
2005
0
                        "Negative size passed to PyUnicode_FromStringAndSize");
2006
0
        return NULL;
2007
0
    }
2008
673k
    if (u != NULL) {
2009
673k
        return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2010
673k
    }
2011
0
    if (size > 0) {
2012
0
        PyErr_SetString(PyExc_SystemError,
2013
0
            "NULL string with positive size with NULL passed to PyUnicode_FromStringAndSize");
2014
0
        return NULL;
2015
0
    }
2016
0
    return _PyUnicode_GetEmpty();
2017
0
}
2018
2019
PyObject *
2020
PyUnicode_FromString(const char *u)
2021
6.62M
{
2022
6.62M
    size_t size = strlen(u);
2023
6.62M
    if (size > PY_SSIZE_T_MAX) {
2024
0
        PyErr_SetString(PyExc_OverflowError, "input too long");
2025
0
        return NULL;
2026
0
    }
2027
6.62M
    return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
2028
6.62M
}
2029
2030
2031
PyObject *
2032
_PyUnicode_FromId(_Py_Identifier *id)
2033
0
{
2034
0
    PyMutex_Lock((PyMutex *)&id->mutex);
2035
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
2036
0
    struct _Py_unicode_ids *ids = &interp->unicode.ids;
2037
2038
0
    Py_ssize_t index = _Py_atomic_load_ssize(&id->index);
2039
0
    if (index < 0) {
2040
0
        struct _Py_unicode_runtime_ids *rt_ids = &interp->runtime->unicode_state.ids;
2041
2042
0
        PyMutex_Lock(&rt_ids->mutex);
2043
        // Check again to detect concurrent access. Another thread can have
2044
        // initialized the index while this thread waited for the lock.
2045
0
        index = _Py_atomic_load_ssize(&id->index);
2046
0
        if (index < 0) {
2047
0
            assert(rt_ids->next_index < PY_SSIZE_T_MAX);
2048
0
            index = rt_ids->next_index;
2049
0
            rt_ids->next_index++;
2050
0
            _Py_atomic_store_ssize(&id->index, index);
2051
0
        }
2052
0
        PyMutex_Unlock(&rt_ids->mutex);
2053
0
    }
2054
0
    assert(index >= 0);
2055
2056
0
    PyObject *obj;
2057
0
    if (index < ids->size) {
2058
0
        obj = ids->array[index];
2059
0
        if (obj) {
2060
            // Return a borrowed reference
2061
0
            goto end;
2062
0
        }
2063
0
    }
2064
2065
0
    obj = PyUnicode_DecodeUTF8Stateful(id->string, strlen(id->string),
2066
0
                                       NULL, NULL);
2067
0
    if (!obj) {
2068
0
        goto end;
2069
0
    }
2070
0
    _PyUnicode_InternImmortal(interp, &obj);
2071
2072
0
    if (index >= ids->size) {
2073
        // Overallocate to reduce the number of realloc
2074
0
        Py_ssize_t new_size = Py_MAX(index * 2, 16);
2075
0
        Py_ssize_t item_size = sizeof(ids->array[0]);
2076
0
        PyObject **new_array = PyMem_Realloc(ids->array, new_size * item_size);
2077
0
        if (new_array == NULL) {
2078
0
            PyErr_NoMemory();
2079
0
            obj = NULL;
2080
0
            goto end;
2081
0
        }
2082
0
        memset(&new_array[ids->size], 0, (new_size - ids->size) * item_size);
2083
0
        ids->array = new_array;
2084
0
        ids->size = new_size;
2085
0
    }
2086
2087
    // The array stores a strong reference
2088
0
    ids->array[index] = obj;
2089
2090
0
end:
2091
0
    PyMutex_Unlock((PyMutex *)&id->mutex);
2092
    // Return a borrowed reference
2093
0
    return obj;
2094
0
}
2095
2096
2097
static void
2098
unicode_clear_identifiers(struct _Py_unicode_state *state)
2099
0
{
2100
0
    struct _Py_unicode_ids *ids = &state->ids;
2101
0
    for (Py_ssize_t i=0; i < ids->size; i++) {
2102
0
        Py_XDECREF(ids->array[i]);
2103
0
    }
2104
0
    ids->size = 0;
2105
0
    PyMem_Free(ids->array);
2106
0
    ids->array = NULL;
2107
    // Don't reset _PyRuntime next_index: _Py_Identifier.id remains valid
2108
    // after Py_Finalize().
2109
0
}
2110
2111
2112
/* Internal function, doesn't check maximum character */
2113
2114
PyObject*
2115
_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
2116
100M
{
2117
100M
    const unsigned char *s = (const unsigned char *)buffer;
2118
100M
    PyObject *unicode;
2119
100M
    if (size == 1) {
2120
#ifdef Py_DEBUG
2121
        assert((unsigned char)s[0] < 128);
2122
#endif
2123
33.0M
        return get_latin1_char(s[0]);
2124
33.0M
    }
2125
67.5M
    unicode = PyUnicode_New(size, 127);
2126
67.5M
    if (!unicode)
2127
0
        return NULL;
2128
67.5M
    memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2129
67.5M
    assert(_PyUnicode_CheckConsistency(unicode, 1));
2130
67.5M
    return unicode;
2131
67.5M
}
2132
2133
static Py_UCS4
2134
kind_maxchar_limit(int kind)
2135
0
{
2136
0
    switch (kind) {
2137
0
    case PyUnicode_1BYTE_KIND:
2138
0
        return 0x80;
2139
0
    case PyUnicode_2BYTE_KIND:
2140
0
        return 0x100;
2141
0
    case PyUnicode_4BYTE_KIND:
2142
0
        return 0x10000;
2143
0
    default:
2144
0
        Py_UNREACHABLE();
2145
0
    }
2146
0
}
2147
2148
static PyObject*
2149
_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
2150
48.6M
{
2151
48.6M
    PyObject *res;
2152
48.6M
    unsigned char max_char;
2153
2154
48.6M
    if (size == 0) {
2155
5.91M
        _Py_RETURN_UNICODE_EMPTY();
2156
5.91M
    }
2157
48.6M
    assert(size > 0);
2158
42.7M
    if (size == 1) {
2159
12.9M
        return get_latin1_char(u[0]);
2160
12.9M
    }
2161
2162
29.8M
    max_char = ucs1lib_find_max_char(u, u + size);
2163
29.8M
    res = PyUnicode_New(size, max_char);
2164
29.8M
    if (!res)
2165
0
        return NULL;
2166
29.8M
    memcpy(PyUnicode_1BYTE_DATA(res), u, size);
2167
29.8M
    assert(_PyUnicode_CheckConsistency(res, 1));
2168
29.8M
    return res;
2169
29.8M
}
2170
2171
static PyObject*
2172
_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
2173
99.0M
{
2174
99.0M
    PyObject *res;
2175
99.0M
    Py_UCS2 max_char;
2176
2177
99.0M
    if (size == 0)
2178
12.0M
        _Py_RETURN_UNICODE_EMPTY();
2179
99.0M
    assert(size > 0);
2180
86.9M
    if (size == 1)
2181
55.8M
        return unicode_char(u[0]);
2182
2183
31.0M
    max_char = ucs2lib_find_max_char(u, u + size);
2184
31.0M
    res = PyUnicode_New(size, max_char);
2185
31.0M
    if (!res)
2186
0
        return NULL;
2187
31.0M
    if (max_char >= 256)
2188
17.9M
        memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
2189
13.1M
    else {
2190
13.1M
        _PyUnicode_CONVERT_BYTES(
2191
13.1M
            Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2192
13.1M
    }
2193
31.0M
    assert(_PyUnicode_CheckConsistency(res, 1));
2194
31.0M
    return res;
2195
31.0M
}
2196
2197
static PyObject*
2198
_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
2199
67.9M
{
2200
67.9M
    PyObject *res;
2201
67.9M
    Py_UCS4 max_char;
2202
2203
67.9M
    if (size == 0)
2204
8.71M
        _Py_RETURN_UNICODE_EMPTY();
2205
67.9M
    assert(size > 0);
2206
59.2M
    if (size == 1)
2207
40.1M
        return unicode_char(u[0]);
2208
2209
19.0M
    max_char = ucs4lib_find_max_char(u, u + size);
2210
19.0M
    res = PyUnicode_New(size, max_char);
2211
19.0M
    if (!res)
2212
0
        return NULL;
2213
19.0M
    if (max_char < 256)
2214
13.2M
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2215
19.0M
                                 PyUnicode_1BYTE_DATA(res));
2216
5.78M
    else if (max_char < 0x10000)
2217
3.90M
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2218
5.78M
                                 PyUnicode_2BYTE_DATA(res));
2219
1.87M
    else
2220
1.87M
        memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
2221
19.0M
    assert(_PyUnicode_CheckConsistency(res, 1));
2222
19.0M
    return res;
2223
19.0M
}
2224
2225
2226
int
2227
PyUnicodeWriter_WriteUCS4(PyUnicodeWriter *pub_writer,
2228
                          Py_UCS4 *str,
2229
                          Py_ssize_t size)
2230
0
{
2231
0
    _PyUnicodeWriter *writer = (_PyUnicodeWriter*)pub_writer;
2232
2233
0
    if (size < 0) {
2234
0
        PyErr_SetString(PyExc_ValueError,
2235
0
                        "size must be positive");
2236
0
        return -1;
2237
0
    }
2238
2239
0
    if (size == 0) {
2240
0
        return 0;
2241
0
    }
2242
2243
0
    Py_UCS4 max_char = ucs4lib_find_max_char(str, str + size);
2244
2245
0
    if (_PyUnicodeWriter_Prepare(writer, size, max_char) < 0) {
2246
0
        return -1;
2247
0
    }
2248
2249
0
    int kind = writer->kind;
2250
0
    void *data = (Py_UCS1*)writer->data + writer->pos * kind;
2251
0
    if (kind == PyUnicode_1BYTE_KIND) {
2252
0
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1,
2253
0
                                 str, str + size,
2254
0
                                 data);
2255
0
    }
2256
0
    else if (kind == PyUnicode_2BYTE_KIND) {
2257
0
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2,
2258
0
                                 str, str + size,
2259
0
                                 data);
2260
0
    }
2261
0
    else {
2262
0
        memcpy(data, str, size * sizeof(Py_UCS4));
2263
0
    }
2264
0
    writer->pos += size;
2265
2266
0
    return 0;
2267
0
}
2268
2269
2270
PyObject*
2271
PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2272
160M
{
2273
160M
    if (size < 0) {
2274
0
        PyErr_SetString(PyExc_ValueError, "size must be positive");
2275
0
        return NULL;
2276
0
    }
2277
160M
    switch (kind) {
2278
22.6M
    case PyUnicode_1BYTE_KIND:
2279
22.6M
        return _PyUnicode_FromUCS1(buffer, size);
2280
83.7M
    case PyUnicode_2BYTE_KIND:
2281
83.7M
        return _PyUnicode_FromUCS2(buffer, size);
2282
54.6M
    case PyUnicode_4BYTE_KIND:
2283
54.6M
        return _PyUnicode_FromUCS4(buffer, size);
2284
0
    default:
2285
0
        PyErr_SetString(PyExc_SystemError, "invalid kind");
2286
0
        return NULL;
2287
160M
    }
2288
160M
}
2289
2290
Py_UCS4
2291
_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2292
12.5M
{
2293
12.5M
    int kind;
2294
12.5M
    const void *startptr, *endptr;
2295
2296
12.5M
    assert(0 <= start);
2297
12.5M
    assert(end <= PyUnicode_GET_LENGTH(unicode));
2298
12.5M
    assert(start <= end);
2299
2300
12.5M
    if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2301
0
        return PyUnicode_MAX_CHAR_VALUE(unicode);
2302
2303
12.5M
    if (start == end)
2304
0
        return 127;
2305
2306
12.5M
    if (PyUnicode_IS_ASCII(unicode))
2307
12.5M
        return 127;
2308
2309
40.3k
    kind = PyUnicode_KIND(unicode);
2310
40.3k
    startptr = PyUnicode_DATA(unicode);
2311
40.3k
    endptr = (char *)startptr + end * kind;
2312
40.3k
    startptr = (char *)startptr + start * kind;
2313
40.3k
    switch(kind) {
2314
1.58k
    case PyUnicode_1BYTE_KIND:
2315
1.58k
        return ucs1lib_find_max_char(startptr, endptr);
2316
4.49k
    case PyUnicode_2BYTE_KIND:
2317
4.49k
        return ucs2lib_find_max_char(startptr, endptr);
2318
34.2k
    case PyUnicode_4BYTE_KIND:
2319
34.2k
        return ucs4lib_find_max_char(startptr, endptr);
2320
0
    default:
2321
0
        Py_UNREACHABLE();
2322
40.3k
    }
2323
40.3k
}
2324
2325
/* Ensure that a string uses the most efficient storage, if it is not the
2326
   case: create a new string of the right kind. Write NULL into *p_unicode
2327
   on error. */
2328
static void
2329
unicode_adjust_maxchar(PyObject **p_unicode)
2330
0
{
2331
0
    PyObject *unicode, *copy;
2332
0
    Py_UCS4 max_char;
2333
0
    Py_ssize_t len;
2334
0
    int kind;
2335
2336
0
    assert(p_unicode != NULL);
2337
0
    unicode = *p_unicode;
2338
0
    if (PyUnicode_IS_ASCII(unicode))
2339
0
        return;
2340
2341
0
    len = PyUnicode_GET_LENGTH(unicode);
2342
0
    kind = PyUnicode_KIND(unicode);
2343
0
    if (kind == PyUnicode_1BYTE_KIND) {
2344
0
        const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2345
0
        max_char = ucs1lib_find_max_char(u, u + len);
2346
0
        if (max_char >= 128)
2347
0
            return;
2348
0
    }
2349
0
    else if (kind == PyUnicode_2BYTE_KIND) {
2350
0
        const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2351
0
        max_char = ucs2lib_find_max_char(u, u + len);
2352
0
        if (max_char >= 256)
2353
0
            return;
2354
0
    }
2355
0
    else if (kind == PyUnicode_4BYTE_KIND) {
2356
0
        const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2357
0
        max_char = ucs4lib_find_max_char(u, u + len);
2358
0
        if (max_char >= 0x10000)
2359
0
            return;
2360
0
    }
2361
0
    else
2362
0
        Py_UNREACHABLE();
2363
2364
0
    copy = PyUnicode_New(len, max_char);
2365
0
    if (copy != NULL)
2366
0
        _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2367
0
    Py_DECREF(unicode);
2368
0
    *p_unicode = copy;
2369
0
}
2370
2371
/* Return a new string object with the same length, max char value and
   character data as `unicode`.
   A non-str argument is rejected with PyErr_BadInternalCall(); NULL is
   returned on that or on allocation failure. */
PyObject*
_PyUnicode_Copy(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadInternalCall();
        return NULL;
    }

    Py_ssize_t nchars = PyUnicode_GET_LENGTH(unicode);
    PyObject *dup = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(unicode));
    if (dup == NULL) {
        return NULL;
    }
    assert(PyUnicode_KIND(dup) == PyUnicode_KIND(unicode));

    /* Same kind on both sides, so the data can be copied as raw bytes. */
    memcpy(PyUnicode_DATA(dup), PyUnicode_DATA(unicode),
           nchars * PyUnicode_KIND(unicode));
    assert(_PyUnicode_CheckConsistency(dup, 1));
    return dup;
}
2393
2394
2395
/* Widen Unicode objects to larger buffers. Don't write terminating null
2396
   character. Return NULL on error. */
2397
2398
static void*
2399
unicode_askind(int skind, void const *data, Py_ssize_t len, int kind)
2400
15.7M
{
2401
15.7M
    void *result;
2402
2403
15.7M
    assert(skind < kind);
2404
15.7M
    switch (kind) {
2405
14.2M
    case PyUnicode_2BYTE_KIND:
2406
14.2M
        result = PyMem_New(Py_UCS2, len);
2407
14.2M
        if (!result)
2408
0
            return PyErr_NoMemory();
2409
14.2M
        assert(skind == PyUnicode_1BYTE_KIND);
2410
14.2M
        _PyUnicode_CONVERT_BYTES(
2411
14.2M
            Py_UCS1, Py_UCS2,
2412
14.2M
            (const Py_UCS1 *)data,
2413
14.2M
            ((const Py_UCS1 *)data) + len,
2414
14.2M
            result);
2415
14.2M
        return result;
2416
1.52M
    case PyUnicode_4BYTE_KIND:
2417
1.52M
        result = PyMem_New(Py_UCS4, len);
2418
1.52M
        if (!result)
2419
0
            return PyErr_NoMemory();
2420
1.52M
        if (skind == PyUnicode_2BYTE_KIND) {
2421
0
            _PyUnicode_CONVERT_BYTES(
2422
0
                Py_UCS2, Py_UCS4,
2423
0
                (const Py_UCS2 *)data,
2424
0
                ((const Py_UCS2 *)data) + len,
2425
0
                result);
2426
0
        }
2427
1.52M
        else {
2428
1.52M
            assert(skind == PyUnicode_1BYTE_KIND);
2429
1.52M
            _PyUnicode_CONVERT_BYTES(
2430
1.52M
                Py_UCS1, Py_UCS4,
2431
1.52M
                (const Py_UCS1 *)data,
2432
1.52M
                ((const Py_UCS1 *)data) + len,
2433
1.52M
                result);
2434
1.52M
        }
2435
1.52M
        return result;
2436
0
    default:
2437
0
        Py_UNREACHABLE();
2438
0
        return NULL;
2439
15.7M
    }
2440
15.7M
}
2441
2442
static Py_UCS4*
2443
as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2444
        int copy_null)
2445
77.7k
{
2446
77.7k
    int kind;
2447
77.7k
    const void *data;
2448
77.7k
    Py_ssize_t len, targetlen;
2449
77.7k
    kind = PyUnicode_KIND(string);
2450
77.7k
    data = PyUnicode_DATA(string);
2451
77.7k
    len = PyUnicode_GET_LENGTH(string);
2452
77.7k
    targetlen = len;
2453
77.7k
    if (copy_null)
2454
0
        targetlen++;
2455
77.7k
    if (!target) {
2456
0
        target = PyMem_New(Py_UCS4, targetlen);
2457
0
        if (!target) {
2458
0
            PyErr_NoMemory();
2459
0
            return NULL;
2460
0
        }
2461
0
    }
2462
77.7k
    else {
2463
77.7k
        if (targetsize < targetlen) {
2464
0
            PyErr_Format(PyExc_SystemError,
2465
0
                         "string is longer than the buffer");
2466
0
            if (copy_null && 0 < targetsize)
2467
0
                target[0] = 0;
2468
0
            return NULL;
2469
0
        }
2470
77.7k
    }
2471
77.7k
    if (kind == PyUnicode_1BYTE_KIND) {
2472
58.8k
        const Py_UCS1 *start = (const Py_UCS1 *) data;
2473
58.8k
        _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2474
58.8k
    }
2475
18.8k
    else if (kind == PyUnicode_2BYTE_KIND) {
2476
14.2k
        const Py_UCS2 *start = (const Py_UCS2 *) data;
2477
14.2k
        _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2478
14.2k
    }
2479
4.58k
    else if (kind == PyUnicode_4BYTE_KIND) {
2480
4.58k
        memcpy(target, data, len * sizeof(Py_UCS4));
2481
4.58k
    }
2482
0
    else {
2483
0
        Py_UNREACHABLE();
2484
0
    }
2485
77.7k
    if (copy_null)
2486
0
        target[len] = 0;
2487
77.7k
    return target;
2488
77.7k
}
2489
2490
Py_UCS4*
2491
PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2492
                 int copy_null)
2493
77.7k
{
2494
77.7k
    if (target == NULL || targetsize < 0) {
2495
0
        PyErr_BadInternalCall();
2496
0
        return NULL;
2497
0
    }
2498
77.7k
    return as_ucs4(string, target, targetsize, copy_null);
2499
77.7k
}
2500
2501
Py_UCS4*
2502
PyUnicode_AsUCS4Copy(PyObject *string)
2503
0
{
2504
0
    return as_ucs4(string, NULL, 0, 1);
2505
0
}
2506
2507
/* maximum number of characters required for output of %jo or %jd or %p.
2508
   We need at most ceil(log8(256)*sizeof(intmax_t)) digits,
2509
   plus 1 for the sign, plus 2 for the 0x prefix (for %p),
2510
   plus 1 for the terminal NUL. */
2511
#define MAX_INTMAX_CHARS (5 + (sizeof(intmax_t)*8-1) / 3)
2512
2513
static int
2514
unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2515
                             Py_ssize_t width, Py_ssize_t precision, int flags)
2516
28.3k
{
2517
28.3k
    Py_ssize_t length, fill, arglen;
2518
28.3k
    Py_UCS4 maxchar;
2519
2520
28.3k
    length = PyUnicode_GET_LENGTH(str);
2521
28.3k
    if ((precision == -1 || precision >= length)
2522
28.2k
        && width <= length)
2523
28.2k
        return _PyUnicodeWriter_WriteStr(writer, str);
2524
2525
51
    if (precision != -1)
2526
51
        length = Py_MIN(precision, length);
2527
2528
51
    arglen = Py_MAX(length, width);
2529
51
    if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2530
22
        maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2531
29
    else
2532
29
        maxchar = writer->maxchar;
2533
2534
51
    if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2535
0
        return -1;
2536
2537
51
    fill = Py_MAX(width - length, 0);
2538
51
    if (fill && !(flags & F_LJUST)) {
2539
0
        if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2540
0
            return -1;
2541
0
        writer->pos += fill;
2542
0
    }
2543
2544
51
    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2545
51
                                  str, 0, length);
2546
51
    writer->pos += length;
2547
2548
51
    if (fill && (flags & F_LJUST)) {
2549
0
        if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2550
0
            return -1;
2551
0
        writer->pos += fill;
2552
0
    }
2553
2554
51
    return 0;
2555
51
}
2556
2557
static int
2558
unicode_fromformat_write_utf8(_PyUnicodeWriter *writer, const char *str,
2559
                              Py_ssize_t width, Py_ssize_t precision, int flags)
2560
5.33M
{
2561
    /* UTF-8 */
2562
5.33M
    Py_ssize_t *pconsumed = NULL;
2563
5.33M
    Py_ssize_t length;
2564
5.33M
    if (precision == -1) {
2565
231k
        length = strlen(str);
2566
231k
    }
2567
5.09M
    else {
2568
5.09M
        length = 0;
2569
20.9M
        while (length < precision && str[length]) {
2570
15.8M
            length++;
2571
15.8M
        }
2572
5.09M
        if (length == precision) {
2573
            /* The input string is not NUL-terminated.  If it ends with an
2574
             * incomplete UTF-8 sequence, truncate the string just before it.
2575
             * Incomplete sequences in the middle and sequences which cannot
2576
             * be valid prefixes are still treated as errors and replaced
2577
             * with \xfffd. */
2578
1.78k
            pconsumed = &length;
2579
1.78k
        }
2580
5.09M
    }
2581
2582
5.33M
    if (width < 0) {
2583
5.33M
        return _PyUnicode_DecodeUTF8Writer(writer, str, length,
2584
5.33M
                                           _Py_ERROR_REPLACE, "replace", pconsumed);
2585
5.33M
    }
2586
2587
0
    PyObject *unicode = PyUnicode_DecodeUTF8Stateful(str, length,
2588
0
                                                     "replace", pconsumed);
2589
0
    if (unicode == NULL)
2590
0
        return -1;
2591
2592
0
    int res = unicode_fromformat_write_str(writer, unicode,
2593
0
                                           width, -1, flags);
2594
0
    Py_DECREF(unicode);
2595
0
    return res;
2596
0
}
2597
2598
static int
2599
unicode_fromformat_write_wcstr(_PyUnicodeWriter *writer, const wchar_t *str,
2600
                              Py_ssize_t width, Py_ssize_t precision, int flags)
2601
0
{
2602
0
    Py_ssize_t length;
2603
0
    if (precision == -1) {
2604
0
        length = wcslen(str);
2605
0
    }
2606
0
    else {
2607
0
        length = 0;
2608
0
        while (length < precision && str[length]) {
2609
0
            length++;
2610
0
        }
2611
0
    }
2612
2613
0
    if (width < 0) {
2614
0
        return PyUnicodeWriter_WriteWideChar((PyUnicodeWriter*)writer,
2615
0
                                             str, length);
2616
0
    }
2617
2618
0
    PyObject *unicode = PyUnicode_FromWideChar(str, length);
2619
0
    if (unicode == NULL)
2620
0
        return -1;
2621
2622
0
    int res = unicode_fromformat_write_str(writer, unicode, width, -1, flags);
2623
0
    Py_DECREF(unicode);
2624
0
    return res;
2625
0
}
2626
2627
0
#define F_LONG 1
2628
0
#define F_LONGLONG 2
2629
90.8k
#define F_SIZE 3
2630
0
#define F_PTRDIFF 4
2631
0
#define F_INTMAX 5
2632
2633
/* Process a single conversion specification of a PyUnicode_FromFormat()
   style format string.

   `f` must point at the '%' that starts the specification.  The function
   parses the optional flags, width, precision and size modifier, fetches
   the matching argument(s) from `*vargs`, and writes the formatted result
   into `writer`.

   Return a pointer to the first character after the specification, or NULL
   on failure.  For an unsupported specification SystemError is set; for an
   over-long width/precision ValueError is set; for an out-of-range "%c"
   argument OverflowError is set. */
static const char*
unicode_fromformat_arg(_PyUnicodeWriter *writer,
                       const char *f, va_list *vargs)
{
    const char *p;
    Py_ssize_t len;
    int flags = 0;
    Py_ssize_t width;
    Py_ssize_t precision;

    /* Remember where the specification starts: it is echoed in the
       "invalid format string" error message. */
    p = f;
    f++;
    /* "%%" writes a literal percent sign and consumes no argument. */
    if (*f == '%') {
        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
            return NULL;
        f++;
        return f;
    }

    /* Parse flags. Example: "%-i" => flags=F_LJUST. */
    /* Flags '+', ' ' and '#' are not particularly useful.
     * They are not worth the implementation and maintenance costs.
     * In addition, '#' should add "0" for "o" conversions for compatibility
     * with printf, but it would confuse Python users. */
    while (1) {
        switch (*f++) {
        case '-': flags |= F_LJUST; continue;
        case '0': flags |= F_ZERO; continue;
        case '#': flags |= F_ALT; continue;
        }
        /* Not a flag character: un-read it and stop scanning flags. */
        f--;
        break;
    }

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    /* width == -1 means "no width given". */
    width = -1;
    if (*f == '*') {
        /* Width supplied as an int argument; a negative value selects
           left-justification with the absolute value as the width. */
        width = va_arg(*vargs, int);
        if (width < 0) {
            flags |= F_LJUST;
            width = -width;
        }
        f++;
    }
    else if (Py_ISDIGIT((unsigned)*f)) {
        width = *f - '0';
        f++;
        while (Py_ISDIGIT((unsigned)*f)) {
            /* Check before multiplying so that width never overflows. */
            if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                PyErr_SetString(PyExc_ValueError,
                                "width too big");
                return NULL;
            }
            width = (width * 10) + (*f - '0');
            f++;
        }
    }
    /* precision == -1 means "no precision given". */
    precision = -1;
    if (*f == '.') {
        f++;
        if (*f == '*') {
            precision = va_arg(*vargs, int);
            /* A negative precision argument is recorded as -2, a value
               distinct from the -1 "not given" marker. */
            if (precision < 0) {
                precision = -2;
            }
            f++;
        }
        else if (Py_ISDIGIT((unsigned)*f)) {
            precision = (*f - '0');
            f++;
            while (Py_ISDIGIT((unsigned)*f)) {
                /* Check before multiplying so that precision never
                   overflows. */
                if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                    PyErr_SetString(PyExc_ValueError,
                                    "precision too big");
                    return NULL;
                }
                precision = (precision * 10) + (*f - '0');
                f++;
            }
        }
    }

    /* Parse the optional size modifier: "l", "ll", "z", "t" or "j". */
    int sizemod = 0;
    if (*f == 'l') {
        if (f[1] == 'l') {
            sizemod = F_LONGLONG;
            f += 2;
        }
        else {
            sizemod = F_LONG;
            ++f;
        }
    }
    else if (*f == 'z') {
        sizemod = F_SIZE;
        ++f;
    }
    else if (*f == 't') {
        sizemod = F_PTRDIFF;
        ++f;
    }
    else if (*f == 'j') {
        sizemod = F_INTMAX;
        ++f;
    }
    /* The conversion character is the last character of the format string:
       turn off overallocation in the writer for this final write. */
    if (f[0] != '\0' && f[1] == '\0')
        writer->overallocate = 0;

    /* First pass over the conversion character: reject combinations of
       size modifier / width / precision that the conversion does not
       support, before any argument is fetched. */
    switch (*f) {
    case 'd': case 'i': case 'o': case 'u': case 'x': case 'X':
        break;
    case 'c': case 'p':
        if (sizemod || width >= 0 || precision >= 0) goto invalid_format;
        break;
    case 's':
    case 'V':
        /* Only "l" (wide character string) is accepted here. */
        if (sizemod && sizemod != F_LONG) goto invalid_format;
        break;
    default:
        if (sizemod) goto invalid_format;
        break;
    }

    /* Second pass: fetch the argument(s) and write the result. */
    switch (*f) {
    case 'c':
    {
        int ordinal = va_arg(*vargs, int);
        if (ordinal < 0 || ordinal > MAX_UNICODE) {
            PyErr_SetString(PyExc_OverflowError,
                            "character argument not in range(0x110000)");
            return NULL;
        }
        if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
            return NULL;
        break;
    }

    case 'd': case 'i':
    case 'o': case 'u': case 'x': case 'X':
    {
        char buffer[MAX_INTMAX_CHARS];

        // Fill buffer using sprintf, with one of many possible format
        // strings, like "%llX" for `long long` in hexadecimal.
        // The type/size is in `sizemod`; the format is in `*f`.

        // Use macros with nested switches to keep the sprintf format strings
        // as compile-time literals, avoiding warnings and maybe allowing
        // optimizations.

        // `SPRINT` macro does one sprintf
        // Example usage: SPRINT("l", "X", unsigned long) expands to
        // sprintf(buffer, "%" "l" "X", va_arg(*vargs, unsigned long))
        #define SPRINT(SIZE_SPEC, FMT_CHAR, TYPE) \
            sprintf(buffer, "%" SIZE_SPEC FMT_CHAR, va_arg(*vargs, TYPE))

        // One inner switch to handle all format variants
        #define DO_SPRINTS(SIZE_SPEC, SIGNED_TYPE, UNSIGNED_TYPE)             \
            switch (*f) {                                                     \
                case 'o': len = SPRINT(SIZE_SPEC, "o", UNSIGNED_TYPE); break; \
                case 'u': len = SPRINT(SIZE_SPEC, "u", UNSIGNED_TYPE); break; \
                case 'x': len = SPRINT(SIZE_SPEC, "x", UNSIGNED_TYPE); break; \
                case 'X': len = SPRINT(SIZE_SPEC, "X", UNSIGNED_TYPE); break; \
                default:  len = SPRINT(SIZE_SPEC, "d", SIGNED_TYPE); break;   \
            }

        // Outer switch to handle all the sizes/types
        switch (sizemod) {
            case F_LONG:     DO_SPRINTS("l", long, unsigned long); break;
            case F_LONGLONG: DO_SPRINTS("ll", long long, unsigned long long); break;
            case F_SIZE:     DO_SPRINTS("z", Py_ssize_t, size_t); break;
            case F_PTRDIFF:  DO_SPRINTS("t", ptrdiff_t, ptrdiff_t); break;
            case F_INTMAX:   DO_SPRINTS("j", intmax_t, uintmax_t); break;
            default:         DO_SPRINTS("", int, unsigned int); break;
        }
        #undef SPRINT
        #undef DO_SPRINTS

        assert(len >= 0);

        /* Split off the sign so that `len` counts the digits only. */
        int sign = (buffer[0] == '-');
        len -= sign;

        /* From here on `precision` is the number of digit positions
           (digits plus zero padding) and `width` the total field size. */
        precision = Py_MAX(precision, len);
        width = Py_MAX(width, precision + sign);
        /* The '0' flag (ignored when left-justifying) fills the whole field
           with zeros, i.e. the digit positions take all of the width. */
        if ((flags & F_ZERO) && !(flags & F_LJUST)) {
            precision = width - sign;
        }

        Py_ssize_t spacepad = Py_MAX(width - precision - sign, 0);
        Py_ssize_t zeropad = Py_MAX(precision - len, 0);

        /* Reserve room for the whole field up front; the PyUnicode_Fill()
           calls below write into writer->buffer directly. */
        if (_PyUnicodeWriter_Prepare(writer, width, 127) == -1)
            return NULL;

        /* Output order: [spaces] [-] [zeros] digits [spaces]. */
        if (spacepad && !(flags & F_LJUST)) {
            if (PyUnicode_Fill(writer->buffer, writer->pos, spacepad, ' ') == -1)
                return NULL;
            writer->pos += spacepad;
        }

        if (sign) {
            if (_PyUnicodeWriter_WriteChar(writer, '-') == -1)
                return NULL;
        }

        if (zeropad) {
            if (PyUnicode_Fill(writer->buffer, writer->pos, zeropad, '0') == -1)
                return NULL;
            writer->pos += zeropad;
        }

        if (_PyUnicodeWriter_WriteASCIIString(writer, &buffer[sign], len) < 0)
            return NULL;

        if (spacepad && (flags & F_LJUST)) {
            if (PyUnicode_Fill(writer->buffer, writer->pos, spacepad, ' ') == -1)
                return NULL;
            writer->pos += spacepad;
        }
        break;
    }

    case 'p':
    {
        char number[MAX_INTMAX_CHARS];

        len = sprintf(number, "%p", va_arg(*vargs, void*));
        assert(len >= 0);

        /* %p is ill-defined:  ensure leading 0x. */
        if (number[1] == 'X')
            number[1] = 'x';
        else if (number[1] != 'x') {
            /* No prefix at all: shift the text (including its null) right
               by two characters and insert "0x". */
            memmove(number + 2, number,
                    strlen(number) + 1);
            number[0] = '0';
            number[1] = 'x';
            len += 2;
        }

        if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
            return NULL;
        break;
    }

    case 's':
    {
        /* The only size modifier that passed validation is "l":
           "%ls" takes a wchar_t string, plain "%s" a UTF-8 string. */
        if (sizemod) {
            const wchar_t *s = va_arg(*vargs, const wchar_t*);
            if (unicode_fromformat_write_wcstr(writer, s, width, precision, flags) < 0)
                return NULL;
        }
        else {
            /* UTF-8 */
            const char *s = va_arg(*vargs, const char*);
            if (unicode_fromformat_write_utf8(writer, s, width, precision, flags) < 0)
                return NULL;
        }
        break;
    }

    case 'U':
    {
        PyObject *obj = va_arg(*vargs, PyObject *);
        assert(obj && _PyUnicode_CHECK(obj));

        if (unicode_fromformat_write_str(writer, obj, width, precision, flags) == -1)
            return NULL;
        break;
    }

    case 'V':
    {
        /* "%V" takes two arguments: an object and a C string fallback.
           Both are always fetched, in this order, so that the va_list
           stays in sync; the C string is only used when obj is NULL. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        const char *str;
        const wchar_t *wstr;
        if (sizemod) {
            wstr = va_arg(*vargs, const wchar_t*);
        }
        else {
            str = va_arg(*vargs, const char *);
        }
        if (obj) {
            assert(_PyUnicode_CHECK(obj));
            if (unicode_fromformat_write_str(writer, obj, width, precision, flags) == -1)
                return NULL;
        }
        else if (sizemod) {
            assert(wstr != NULL);
            if (unicode_fromformat_write_wcstr(writer, wstr, width, precision, flags) < 0)
                return NULL;
        }
        else {
            assert(str != NULL);
            if (unicode_fromformat_write_utf8(writer, str, width, precision, flags) < 0)
                return NULL;
        }
        break;
    }

    case 'S':
    {
        /* Write PyObject_Str(obj). */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *str;
        assert(obj);
        str = PyObject_Str(obj);
        if (!str)
            return NULL;
        if (unicode_fromformat_write_str(writer, str, width, precision, flags) == -1) {
            Py_DECREF(str);
            return NULL;
        }
        Py_DECREF(str);
        break;
    }

    case 'R':
    {
        /* Write PyObject_Repr(obj). */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *repr;
        assert(obj);
        repr = PyObject_Repr(obj);
        if (!repr)
            return NULL;
        if (unicode_fromformat_write_str(writer, repr, width, precision, flags) == -1) {
            Py_DECREF(repr);
            return NULL;
        }
        Py_DECREF(repr);
        break;
    }

    case 'A':
    {
        /* Write PyObject_ASCII(obj). */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *ascii;
        assert(obj);
        ascii = PyObject_ASCII(obj);
        if (!ascii)
            return NULL;
        if (unicode_fromformat_write_str(writer, ascii, width, precision, flags) == -1) {
            Py_DECREF(ascii);
            return NULL;
        }
        Py_DECREF(ascii);
        break;
    }

    case 'T':
    {
        /* Write the fully qualified name of type(obj); with the '#' flag
           the variant taking ':' as its second argument is used. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        /* Hold a strong reference to the type while its name is computed. */
        PyTypeObject *type = (PyTypeObject *)Py_NewRef(Py_TYPE(obj));

        PyObject *type_name;
        if (flags & F_ALT) {
            type_name = _PyType_GetFullyQualifiedName(type, ':');
        }
        else {
            type_name = PyType_GetFullyQualifiedName(type);
        }
        Py_DECREF(type);
        if (!type_name) {
            return NULL;
        }

        if (unicode_fromformat_write_str(writer, type_name,
                                         width, precision, flags) == -1) {
            Py_DECREF(type_name);
            return NULL;
        }
        Py_DECREF(type_name);
        break;
    }

    case 'N':
    {
        /* Like 'T', but the argument is the type itself. */
        PyObject *type_raw = va_arg(*vargs, PyObject *);
        assert(type_raw != NULL);

        if (!PyType_Check(type_raw)) {
            PyErr_SetString(PyExc_TypeError, "%N argument must be a type");
            return NULL;
        }
        PyTypeObject *type = (PyTypeObject*)type_raw;

        PyObject *type_name;
        if (flags & F_ALT) {
            type_name = _PyType_GetFullyQualifiedName(type, ':');
        }
        else {
            type_name = PyType_GetFullyQualifiedName(type);
        }
        if (!type_name) {
            return NULL;
        }
        if (unicode_fromformat_write_str(writer, type_name,
                                         width, precision, flags) == -1) {
            Py_DECREF(type_name);
            return NULL;
        }
        Py_DECREF(type_name);
        break;
    }

    default:
    /* Also reached by goto from the validation switch above. */
    invalid_format:
        PyErr_Format(PyExc_SystemError, "invalid format string: %s", p);
        return NULL;
    }

    /* Step over the conversion character. */
    f++;
    return f;
}
3047
3048
static int
3049
unicode_from_format(_PyUnicodeWriter *writer, const char *format, va_list vargs)
3050
11.6M
{
3051
11.6M
    Py_ssize_t len = strlen(format);
3052
11.6M
    writer->min_length += len + 100;
3053
11.6M
    writer->overallocate = 1;
3054
3055
    // Copy varags to be able to pass a reference to a subfunction.
3056
11.6M
    va_list vargs2;
3057
11.6M
    va_copy(vargs2, vargs);
3058
3059
    // _PyUnicodeWriter_WriteASCIIString() below requires the format string
3060
    // to be encoded to ASCII.
3061
11.6M
    int is_ascii = (ucs1lib_find_max_char((Py_UCS1*)format, (Py_UCS1*)format + len) < 128);
3062
11.6M
    if (!is_ascii) {
3063
0
        Py_ssize_t i;
3064
0
        for (i=0; i < len && (unsigned char)format[i] <= 127; i++);
3065
0
        PyErr_Format(PyExc_ValueError,
3066
0
            "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3067
0
            "string, got a non-ASCII byte: 0x%02x",
3068
0
            (unsigned char)format[i]);
3069
0
        goto fail;
3070
0
    }
3071
3072
64.6M
    for (const char *f = format; *f; ) {
3073
52.9M
        if (*f == '%') {
3074
28.3M
            f = unicode_fromformat_arg(writer, f, &vargs2);
3075
28.3M
            if (f == NULL)
3076
0
                goto fail;
3077
28.3M
        }
3078
24.6M
        else {
3079
24.6M
            const char *p = strchr(f, '%');
3080
24.6M
            if (p != NULL) {
3081
18.1M
                len = p - f;
3082
18.1M
            }
3083
6.51M
            else {
3084
6.51M
                len = strlen(f);
3085
6.51M
                writer->overallocate = 0;
3086
6.51M
            }
3087
3088
24.6M
            if (_PyUnicodeWriter_WriteASCIIString(writer, f, len) < 0) {
3089
0
                goto fail;
3090
0
            }
3091
24.6M
            f += len;
3092
24.6M
        }
3093
52.9M
    }
3094
11.6M
    va_end(vargs2);
3095
11.6M
    return 0;
3096
3097
0
  fail:
3098
0
    va_end(vargs2);
3099
0
    return -1;
3100
11.6M
}
3101
3102
PyObject *
3103
PyUnicode_FromFormatV(const char *format, va_list vargs)
3104
11.6M
{
3105
11.6M
    _PyUnicodeWriter writer;
3106
11.6M
    _PyUnicodeWriter_Init(&writer);
3107
3108
11.6M
    if (unicode_from_format(&writer, format, vargs) < 0) {
3109
0
        _PyUnicodeWriter_Dealloc(&writer);
3110
0
        return NULL;
3111
0
    }
3112
11.6M
    return _PyUnicodeWriter_Finish(&writer);
3113
11.6M
}
3114
3115
/* Variadic front end of PyUnicode_FromFormatV(): collect the arguments
   into a va_list and forward them.  Return what PyUnicode_FromFormatV()
   returns. */
PyObject *
PyUnicode_FromFormat(const char *format, ...)
{
    va_list ap;

    va_start(ap, format);
    PyObject *result = PyUnicode_FromFormatV(format, ap);
    va_end(ap);

    return result;
}
3126
3127
int
3128
PyUnicodeWriter_Format(PyUnicodeWriter *writer, const char *format, ...)
3129
0
{
3130
0
    va_list vargs;
3131
0
    va_start(vargs, format);
3132
0
    int res = _PyUnicodeWriter_FormatV(writer, format, vargs);
3133
0
    va_end(vargs);
3134
0
    return res;
3135
0
}
3136
3137
int
3138
_PyUnicodeWriter_FormatV(PyUnicodeWriter *writer, const char *format,
3139
                         va_list vargs)
3140
0
{
3141
0
    _PyUnicodeWriter *_writer = (_PyUnicodeWriter*)writer;
3142
0
    Py_ssize_t old_pos = _writer->pos;
3143
3144
0
    int res = unicode_from_format(_writer, format, vargs);
3145
3146
0
    if (res < 0) {
3147
0
        _writer->pos = old_pos;
3148
0
    }
3149
0
    return res;
3150
0
}
3151
3152
static Py_ssize_t
3153
unicode_get_widechar_size(PyObject *unicode)
3154
7.17k
{
3155
7.17k
    Py_ssize_t res;
3156
3157
7.17k
    assert(unicode != NULL);
3158
7.17k
    assert(_PyUnicode_CHECK(unicode));
3159
3160
7.17k
    res = _PyUnicode_LENGTH(unicode);
3161
#if SIZEOF_WCHAR_T == 2
3162
    if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3163
        const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3164
        const Py_UCS4 *end = s + res;
3165
        for (; s < end; ++s) {
3166
            if (*s > 0xFFFF) {
3167
                ++res;
3168
            }
3169
        }
3170
    }
3171
#endif
3172
7.17k
    return res;
3173
7.17k
}
3174
3175
static void
3176
unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3177
7.17k
{
3178
7.17k
    assert(unicode != NULL);
3179
7.17k
    assert(_PyUnicode_CHECK(unicode));
3180
3181
7.17k
    if (PyUnicode_KIND(unicode) == sizeof(wchar_t)) {
3182
0
        memcpy(w, PyUnicode_DATA(unicode), size * sizeof(wchar_t));
3183
0
        return;
3184
0
    }
3185
3186
7.17k
    if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3187
7.17k
        const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3188
616k
        for (; size--; ++s, ++w) {
3189
609k
            *w = *s;
3190
609k
        }
3191
7.17k
    }
3192
0
    else {
3193
0
#if SIZEOF_WCHAR_T == 4
3194
0
        assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3195
0
        const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3196
0
        for (; size--; ++s, ++w) {
3197
0
            *w = *s;
3198
0
        }
3199
#else
3200
        assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3201
        const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3202
        for (; size--; ++s, ++w) {
3203
            Py_UCS4 ch = *s;
3204
            if (ch > 0xFFFF) {
3205
                assert(ch <= MAX_UNICODE);
3206
                /* encode surrogate pair in this case */
3207
                *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3208
                if (!size--)
3209
                    break;
3210
                *w = Py_UNICODE_LOW_SURROGATE(ch);
3211
            }
3212
            else {
3213
                *w = ch;
3214
            }
3215
        }
3216
#endif
3217
0
    }
3218
7.17k
}
3219
3220
#ifdef HAVE_WCHAR_H
3221
3222
/* Convert a Unicode object to a wide character string.
3223
3224
   - If w is NULL: return the number of wide characters (including the null
3225
     character) required to convert the unicode object. Ignore size argument.
3226
3227
   - Otherwise: return the number of wide characters (excluding the null
3228
     character) written into w. Write at most size wide characters (including
3229
     the null character). */
3230
Py_ssize_t
3231
PyUnicode_AsWideChar(PyObject *unicode,
3232
                     wchar_t *w,
3233
                     Py_ssize_t size)
3234
5.90k
{
3235
5.90k
    Py_ssize_t res;
3236
3237
5.90k
    if (unicode == NULL) {
3238
0
        PyErr_BadInternalCall();
3239
0
        return -1;
3240
0
    }
3241
5.90k
    if (!PyUnicode_Check(unicode)) {
3242
0
        PyErr_BadArgument();
3243
0
        return -1;
3244
0
    }
3245
3246
5.90k
    res = unicode_get_widechar_size(unicode);
3247
5.90k
    if (w == NULL) {
3248
0
        return res + 1;
3249
0
    }
3250
3251
5.90k
    if (size > res) {
3252
5.90k
        size = res + 1;
3253
5.90k
    }
3254
0
    else {
3255
0
        res = size;
3256
0
    }
3257
5.90k
    unicode_copy_as_widechar(unicode, w, size);
3258
3259
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3260
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
3261
       non-Unicode locales and hence needs conversion first. */
3262
    if (_Py_LocaleUsesNonUnicodeWchar()) {
3263
        if (_Py_EncodeNonUnicodeWchar_InPlace(w, size) < 0) {
3264
            return -1;
3265
        }
3266
    }
3267
#endif
3268
3269
5.90k
    return res;
3270
5.90k
}
3271
3272
wchar_t*
3273
PyUnicode_AsWideCharString(PyObject *unicode,
3274
                           Py_ssize_t *size)
3275
1.26k
{
3276
1.26k
    wchar_t *buffer;
3277
1.26k
    Py_ssize_t buflen;
3278
3279
1.26k
    if (unicode == NULL) {
3280
0
        PyErr_BadInternalCall();
3281
0
        return NULL;
3282
0
    }
3283
1.26k
    if (!PyUnicode_Check(unicode)) {
3284
0
        PyErr_BadArgument();
3285
0
        return NULL;
3286
0
    }
3287
3288
1.26k
    buflen = unicode_get_widechar_size(unicode);
3289
1.26k
    buffer = (wchar_t *) PyMem_New(wchar_t, (buflen + 1));
3290
1.26k
    if (buffer == NULL) {
3291
0
        PyErr_NoMemory();
3292
0
        return NULL;
3293
0
    }
3294
1.26k
    unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3295
3296
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3297
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
3298
       non-Unicode locales and hence needs conversion first. */
3299
    if (_Py_LocaleUsesNonUnicodeWchar()) {
3300
        if (_Py_EncodeNonUnicodeWchar_InPlace(buffer, (buflen + 1)) < 0) {
3301
            return NULL;
3302
        }
3303
    }
3304
#endif
3305
3306
1.26k
    if (size != NULL) {
3307
820
        *size = buflen;
3308
820
    }
3309
448
    else if (wcslen(buffer) != (size_t)buflen) {
3310
0
        PyMem_Free(buffer);
3311
0
        PyErr_SetString(PyExc_ValueError,
3312
0
                        "embedded null character");
3313
0
        return NULL;
3314
0
    }
3315
1.26k
    return buffer;
3316
1.26k
}
3317
3318
#endif /* HAVE_WCHAR_H */
3319
3320
int
3321
_PyUnicode_WideCharString_Converter(PyObject *obj, void *ptr)
3322
0
{
3323
0
    wchar_t **p = (wchar_t **)ptr;
3324
0
    if (obj == NULL) {
3325
0
        PyMem_Free(*p);
3326
0
        *p = NULL;
3327
0
        return 1;
3328
0
    }
3329
0
    if (PyUnicode_Check(obj)) {
3330
0
        *p = PyUnicode_AsWideCharString(obj, NULL);
3331
0
        if (*p == NULL) {
3332
0
            return 0;
3333
0
        }
3334
0
        return Py_CLEANUP_SUPPORTED;
3335
0
    }
3336
0
    PyErr_Format(PyExc_TypeError,
3337
0
                 "argument must be str, not %.50s",
3338
0
                 Py_TYPE(obj)->tp_name);
3339
0
    return 0;
3340
0
}
3341
3342
int
3343
_PyUnicode_WideCharString_Opt_Converter(PyObject *obj, void *ptr)
3344
0
{
3345
0
    wchar_t **p = (wchar_t **)ptr;
3346
0
    if (obj == NULL) {
3347
0
        PyMem_Free(*p);
3348
0
        *p = NULL;
3349
0
        return 1;
3350
0
    }
3351
0
    if (obj == Py_None) {
3352
0
        *p = NULL;
3353
0
        return 1;
3354
0
    }
3355
0
    if (PyUnicode_Check(obj)) {
3356
0
        *p = PyUnicode_AsWideCharString(obj, NULL);
3357
0
        if (*p == NULL) {
3358
0
            return 0;
3359
0
        }
3360
0
        return Py_CLEANUP_SUPPORTED;
3361
0
    }
3362
0
    PyErr_Format(PyExc_TypeError,
3363
0
                 "argument must be str or None, not %.50s",
3364
0
                 Py_TYPE(obj)->tp_name);
3365
0
    return 0;
3366
0
}
3367
3368
PyObject *
3369
PyUnicode_FromOrdinal(int ordinal)
3370
209k
{
3371
209k
    if (ordinal < 0 || ordinal > MAX_UNICODE) {
3372
0
        PyErr_SetString(PyExc_ValueError,
3373
0
                        "chr() arg not in range(0x110000)");
3374
0
        return NULL;
3375
0
    }
3376
3377
209k
    return unicode_char((Py_UCS4)ordinal);
3378
209k
}
3379
3380
/* Return a str object for `obj`.

   - An exact str instance is returned as a new reference to itself.
   - An instance of a str subtype is passed to _PyUnicode_Copy() so that
     a true Unicode object with the same data is returned.
   - Any other object sets TypeError and yields NULL.

   XXX Perhaps we should make this API an alias of PyObject_Str()
   instead ?! */
PyObject *
PyUnicode_FromObject(PyObject *obj)
{
    if (PyUnicode_CheckExact(obj)) {
        return Py_NewRef(obj);
    }

    if (!PyUnicode_Check(obj)) {
        PyErr_Format(PyExc_TypeError,
                     "Can't convert '%.100s' object to str implicitly",
                     Py_TYPE(obj)->tp_name);
        return NULL;
    }

    return _PyUnicode_Copy(obj);
}
3398
3399
/* Decode a bytes-like object `obj` using `encoding` and `errors`.

   - NULL obj: PyErr_BadInternalCall() and NULL.
   - bytes (fast path) and other objects exporting a simple buffer are
     decoded with PyUnicode_Decode().
   - str objects are rejected with TypeError.
   - Empty input yields the empty string, but `encoding` and `errors` are
     still validated through unicode_check_encoding_errors(). */
PyObject *
PyUnicode_FromEncodedObject(PyObject *obj,
                            const char *encoding,
                            const char *errors)
{
    if (obj == NULL) {
        PyErr_BadInternalCall();
        return NULL;
    }

    /* Decoding bytes objects is the most common case and should be fast */
    if (PyBytes_Check(obj)) {
        Py_ssize_t nbytes = PyBytes_GET_SIZE(obj);
        if (nbytes != 0) {
            return PyUnicode_Decode(PyBytes_AS_STRING(obj), nbytes,
                                    encoding, errors);
        }
        if (unicode_check_encoding_errors(encoding, errors) < 0) {
            return NULL;
        }
        _Py_RETURN_UNICODE_EMPTY();
    }

    if (PyUnicode_Check(obj)) {
        PyErr_SetString(PyExc_TypeError,
                        "decoding str is not supported");
        return NULL;
    }

    /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
    Py_buffer view;
    if (PyObject_GetBuffer(obj, &view, PyBUF_SIMPLE) < 0) {
        PyErr_Format(PyExc_TypeError,
                     "decoding to str: need a bytes-like object, %.80s found",
                     Py_TYPE(obj)->tp_name);
        return NULL;
    }

    if (view.len != 0) {
        PyObject *decoded = PyUnicode_Decode((char*) view.buf, view.len,
                                             encoding, errors);
        PyBuffer_Release(&view);
        return decoded;
    }

    /* Empty buffer: release it first, then validate the arguments. */
    PyBuffer_Release(&view);
    if (unicode_check_encoding_errors(encoding, errors) < 0) {
        return NULL;
    }
    _Py_RETURN_UNICODE_EMPTY();
}
3451
3452
/* Normalize an encoding name like encodings.normalize_encoding()
   but allow to convert to lowercase if *to_lower* is true.

   Alphanumeric characters and '.' are copied to `lower` (lowercased when
   `to_lower` is true).  Every run of other characters is replaced by a
   single '_', except at the start of the name where it is dropped; a
   trailing run is dropped as well.  The result is null-terminated.

   `lower_len` is the size of the `lower` buffer in bytes, including room
   for the terminating null.

   Return 1 on success, or 0 on error (encoding is longer than lower_len-1,
   or lower_len is 0 so that not even the terminating null fits). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len,
                       int to_lower)
{
    const char *e;
    char *l;
    char *l_end;
    int punct;

    assert(encoding != NULL);

    /* A zero-sized buffer cannot hold the terminating null.  Without this
       check, lower_len - 1 would wrap around and the final store below
       would write past the buffer. */
    if (lower_len == 0) {
        return 0;
    }

    e = encoding;
    l = lower;
    /* Last usable slot: reserved for the terminating null. */
    l_end = &lower[lower_len - 1];
    /* Set while the characters just skipped were punctuation. */
    punct = 0;
    while (1) {
        char c = *e;
        if (c == 0) {
            break;
        }

        if (Py_ISALNUM(c) || c == '.') {
            /* Emit one '_' for the preceding punctuation run, unless it
               was at the very beginning of the name. */
            if (punct && l != lower) {
                if (l == l_end) {
                    return 0;
                }
                *l++ = '_';
            }
            punct = 0;

            if (l == l_end) {
                return 0;
            }
            *l++ = to_lower ? Py_TOLOWER(c) : c;
        }
        else {
            punct = 1;
        }

        e++;
    }
    *l = '\0';
    return 1;
}
3501
3502
PyObject *
3503
PyUnicode_Decode(const char *s,
3504
                 Py_ssize_t size,
3505
                 const char *encoding,
3506
                 const char *errors)
3507
5.41M
{
3508
5.41M
    PyObject *buffer = NULL, *unicode;
3509
5.41M
    Py_buffer info;
3510
5.41M
    char buflower[11];   /* strlen("iso-8859-1\0") == 11, longest shortcut */
3511
3512
5.41M
    if (unicode_check_encoding_errors(encoding, errors) < 0) {
3513
0
        return NULL;
3514
0
    }
3515
3516
5.41M
    if (size == 0) {
3517
0
        _Py_RETURN_UNICODE_EMPTY();
3518
0
    }
3519
3520
5.41M
    if (encoding == NULL) {
3521
40.0k
        return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3522
40.0k
    }
3523
3524
    /* Shortcuts for common default encodings */
3525
5.37M
    if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower), 1)) {
3526
5.36M
        char *lower = buflower;
3527
3528
        /* Fast paths */
3529
5.36M
        if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3530
930k
            lower += 3;
3531
930k
            if (*lower == '_') {
3532
                /* Match "utf8" and "utf_8" */
3533
929k
                lower++;
3534
929k
            }
3535
3536
930k
            if (lower[0] == '8' && lower[1] == 0) {
3537
929k
                return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3538
929k
            }
3539
863
            else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3540
96
                return PyUnicode_DecodeUTF16(s, size, errors, 0);
3541
96
            }
3542
767
            else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3543
108
                return PyUnicode_DecodeUTF32(s, size, errors, 0);
3544
108
            }
3545
930k
        }
3546
4.43M
        else {
3547
4.43M
            if (strcmp(lower, "ascii") == 0
3548
4.05M
                || strcmp(lower, "us_ascii") == 0) {
3549
544k
                return PyUnicode_DecodeASCII(s, size, errors);
3550
544k
            }
3551
    #ifdef MS_WINDOWS
3552
            else if (strcmp(lower, "mbcs") == 0) {
3553
                return PyUnicode_DecodeMBCS(s, size, errors);
3554
            }
3555
    #endif
3556
3.89M
            else if (strcmp(lower, "latin1") == 0
3557
3.89M
                     || strcmp(lower, "latin_1") == 0
3558
391k
                     || strcmp(lower, "iso_8859_1") == 0
3559
3.52M
                     || strcmp(lower, "iso8859_1") == 0) {
3560
3.52M
                return PyUnicode_DecodeLatin1(s, size, errors);
3561
3.52M
            }
3562
4.43M
        }
3563
5.36M
    }
3564
3565
    /* Decode via the codec registry */
3566
377k
    buffer = NULL;
3567
377k
    if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
3568
0
        goto onError;
3569
377k
    buffer = PyMemoryView_FromBuffer(&info);
3570
377k
    if (buffer == NULL)
3571
0
        goto onError;
3572
377k
    unicode = _PyCodec_DecodeText(buffer, encoding, errors);
3573
377k
    if (unicode == NULL)
3574
146k
        goto onError;
3575
230k
    if (!PyUnicode_Check(unicode)) {
3576
0
        PyErr_Format(PyExc_TypeError,
3577
0
                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
3578
0
                     "use codecs.decode() to decode to arbitrary types",
3579
0
                     encoding,
3580
0
                     Py_TYPE(unicode)->tp_name);
3581
0
        Py_DECREF(unicode);
3582
0
        goto onError;
3583
0
    }
3584
230k
    Py_DECREF(buffer);
3585
230k
    return unicode_result(unicode);
3586
3587
146k
  onError:
3588
146k
    Py_XDECREF(buffer);
3589
146k
    return NULL;
3590
230k
}
3591
3592
PyAPI_FUNC(PyObject *)
PyUnicode_AsDecodedObject(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    /* Run `unicode` through the registry decoder for `encoding` and return
       whatever object the codec produces.  A NULL encoding selects the
       default encoding.  Non-str input is rejected with BadArgument. */
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    const char *codec_name = encoding;
    if (codec_name == NULL) {
        codec_name = PyUnicode_GetDefaultEncoding();
    }

    return PyCodec_Decode(unicode, codec_name, errors);
}
3608
3609
PyAPI_FUNC(PyObject *)
PyUnicode_AsDecodedUnicode(PyObject *unicode,
                           const char *encoding,
                           const char *errors)
{
    /* Decode the str object `unicode` through the codec registry and
       require the result to be a str as well.

       encoding: codec name; NULL selects the default encoding.
       errors:   error handler name, passed through to the codec.

       Returns a new reference to the decoded str, or NULL with an
       exception set (BadArgument for non-str input, TypeError when the
       decoder returns something other than str). */
    PyObject *v;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (encoding == NULL)
        encoding = PyUnicode_GetDefaultEncoding();

    /* Decode via the codec registry */
    v = PyCodec_Decode(unicode, encoding, errors);
    if (v == NULL)
        return NULL;

    if (!PyUnicode_Check(v)) {
        /* Report the type of the decoder's *result*.  The input is known
           to be str at this point, so naming its type here would always
           produce "returned 'str' instead of 'str'". */
        PyErr_Format(PyExc_TypeError,
                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
                     "use codecs.decode() to decode to arbitrary types",
                     encoding,
                     Py_TYPE(v)->tp_name);
        Py_DECREF(v);
        return NULL;
    }
    return unicode_result(v);
}
3642
3643
PyAPI_FUNC(PyObject *)
PyUnicode_AsEncodedObject(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    /* Run `unicode` through the registry encoder for `encoding` and return
       whatever object the codec produces (no type check on the result).
       A NULL encoding selects the default encoding. */
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    const char *codec_name = encoding;
    if (codec_name == NULL) {
        codec_name = PyUnicode_GetDefaultEncoding();
    }

    /* NULL from the codec propagates unchanged, exception already set. */
    return PyCodec_Encode(unicode, codec_name, errors);
}
3667
3668
3669
static PyObject *
3670
unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
3671
                      int current_locale)
3672
420
{
3673
420
    Py_ssize_t wlen;
3674
420
    wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3675
420
    if (wstr == NULL) {
3676
0
        return NULL;
3677
0
    }
3678
3679
420
    if ((size_t)wlen != wcslen(wstr)) {
3680
0
        PyErr_SetString(PyExc_ValueError, "embedded null character");
3681
0
        PyMem_Free(wstr);
3682
0
        return NULL;
3683
0
    }
3684
3685
420
    char *str;
3686
420
    size_t error_pos;
3687
420
    const char *reason;
3688
420
    int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
3689
420
                                 current_locale, error_handler);
3690
420
    PyMem_Free(wstr);
3691
3692
420
    if (res != 0) {
3693
0
        if (res == -2) {
3694
0
            PyObject *exc;
3695
0
            exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3696
0
                    "locale", unicode,
3697
0
                    (Py_ssize_t)error_pos,
3698
0
                    (Py_ssize_t)(error_pos+1),
3699
0
                    reason);
3700
0
            if (exc != NULL) {
3701
0
                PyCodec_StrictErrors(exc);
3702
0
                Py_DECREF(exc);
3703
0
            }
3704
0
        }
3705
0
        else if (res == -3) {
3706
0
            PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3707
0
        }
3708
0
        else {
3709
0
            PyErr_NoMemory();
3710
0
        }
3711
0
        return NULL;
3712
0
    }
3713
3714
420
    PyObject *bytes = PyBytes_FromString(str);
3715
420
    PyMem_RawFree(str);
3716
420
    return bytes;
3717
420
}
3718
3719
PyObject *
PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
{
    /* Public wrapper: resolve the handler name, then encode with
       current_locale = 1. */
    _Py_error_handler handler = _Py_GetErrorHandler(errors);

    return unicode_encode_locale(unicode, handler, 1);
}
3725
3726
PyObject *
PyUnicode_EncodeFSDefault(PyObject *unicode)
{
    /* Encode `unicode` with the interpreter's filesystem codec settings.
       Returns new bytes or NULL with an exception set. */
    PyInterpreterState *interp = _PyInterpreterState_GET();
    struct _Py_unicode_fs_codec *codec = &interp->unicode.fs_codec;

    if (codec->utf8) {
        return unicode_encode_utf8(unicode,
                                   codec->error_handler,
                                   codec->errors);
    }
#ifndef _Py_FORCE_UTF8_FS_ENCODING
    if (codec->encoding) {
        return PyUnicode_AsEncodedString(unicode,
                                         codec->encoding,
                                         codec->errors);
    }
#endif

    /* Before _PyUnicode_InitEncodings() is called, the Python codec
       machinery is not ready and so cannot be used:
       use wcstombs() in this case. */
    const PyConfig *config = _PyInterpreterState_GetConfig(interp);
    const wchar_t *fs_errors = config->filesystem_errors;
    assert(fs_errors != NULL);
    _Py_error_handler handler = get_error_handler_wide(fs_errors);
    assert(handler != _Py_ERROR_UNKNOWN);
#ifdef _Py_FORCE_UTF8_FS_ENCODING
    return unicode_encode_utf8(unicode, handler, NULL);
#else
    return unicode_encode_locale(unicode, handler, 0);
#endif
}
3759
3760
PyObject *
PyUnicode_AsEncodedString(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    /* Encode the str `unicode` to bytes.  A NULL encoding means UTF-8.
       Common encodings are dispatched directly to their C encoders;
       anything else goes through the codec registry, whose result must be
       bytes (a bytearray is converted after a RuntimeWarning). */

    /* Big enough for the longest shortcut name ("iso_8859_1") plus NUL. */
    char normalized[11];

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (unicode_check_encoding_errors(encoding, errors) < 0) {
        return NULL;
    }
    if (encoding == NULL) {
        return _PyUnicode_AsUTF8String(unicode, errors);
    }

    /* Fast paths for well-known encoding names */
    if (_Py_normalize_encoding(encoding, normalized, sizeof(normalized), 1)) {
        const char *name = normalized;

        if (strncmp(name, "utf", 3) == 0) {
            name += 3;
            if (*name == '_') {
                /* Accept both "utf8" and "utf_8" spellings */
                name++;
            }

            if (strcmp(name, "8") == 0) {
                return _PyUnicode_AsUTF8String(unicode, errors);
            }
            if (strcmp(name, "16") == 0) {
                return _PyUnicode_EncodeUTF16(unicode, errors, 0);
            }
            if (strcmp(name, "32") == 0) {
                return _PyUnicode_EncodeUTF32(unicode, errors, 0);
            }
        }
        else if (strcmp(name, "ascii") == 0
                 || strcmp(name, "us_ascii") == 0) {
            return _PyUnicode_AsASCIIString(unicode, errors);
        }
#ifdef MS_WINDOWS
        else if (strcmp(name, "mbcs") == 0) {
            return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
        }
#endif
        else if (strcmp(name, "latin1") == 0
                 || strcmp(name, "latin_1") == 0
                 || strcmp(name, "iso_8859_1") == 0
                 || strcmp(name, "iso8859_1") == 0) {
            return _PyUnicode_AsLatin1String(unicode, errors);
        }
    }

    /* Slow path: codec registry */
    PyObject *encoded = _PyCodec_EncodeText(unicode, encoding, errors);
    if (encoded == NULL) {
        return NULL;
    }

    /* The expected outcome */
    if (PyBytes_Check(encoded)) {
        return encoded;
    }

    if (!PyByteArray_Check(encoded)) {
        PyErr_Format(PyExc_TypeError,
                     "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
                     "use codecs.encode() to encode to arbitrary types",
                     encoding,
                     Py_TYPE(encoded)->tp_name);
        Py_DECREF(encoded);
        return NULL;
    }

    /* The codec produced a bytearray: warn, then copy it into bytes. */
    if (PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
            "encoder %s returned bytearray instead of bytes; "
            "use codecs.encode() to encode to arbitrary types",
            encoding)) {
        Py_DECREF(encoded);
        return NULL;
    }

    PyObject *as_bytes = PyBytes_FromStringAndSize(
        PyByteArray_AS_STRING(encoded),
        PyByteArray_GET_SIZE(encoded));
    Py_DECREF(encoded);
    return as_bytes;
}
3859
3860
PyAPI_FUNC(PyObject *)
PyUnicode_AsEncodedUnicode(PyObject *unicode,
                           const char *encoding,
                           const char *errors)
{
    /* Encode `unicode` through the codec registry and require the result
       to be a str.  A NULL encoding selects the default encoding.
       Returns the encoder result, or NULL with an exception set. */
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    const char *codec_name = encoding;
    if (codec_name == NULL) {
        codec_name = PyUnicode_GetDefaultEncoding();
    }

    PyObject *encoded = PyCodec_Encode(unicode, codec_name, errors);
    if (encoded == NULL) {
        return NULL;
    }
    if (PyUnicode_Check(encoded)) {
        return encoded;
    }

    PyErr_Format(PyExc_TypeError,
                 "'%.400s' encoder returned '%.400s' instead of 'str'; "
                 "use codecs.encode() to encode to arbitrary types",
                 codec_name,
                 Py_TYPE(encoded)->tp_name);
    Py_DECREF(encoded);
    return NULL;
}
3893
3894
static PyObject*
3895
unicode_decode_locale(const char *str, Py_ssize_t len,
3896
                      _Py_error_handler errors, int current_locale)
3897
15.8k
{
3898
15.8k
    if (str[len] != '\0' || (size_t)len != strlen(str))  {
3899
0
        PyErr_SetString(PyExc_ValueError, "embedded null byte");
3900
0
        return NULL;
3901
0
    }
3902
3903
15.8k
    wchar_t *wstr;
3904
15.8k
    size_t wlen;
3905
15.8k
    const char *reason;
3906
15.8k
    int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
3907
15.8k
                                 current_locale, errors);
3908
15.8k
    if (res != 0) {
3909
0
        if (res == -2) {
3910
0
            PyObject *exc;
3911
0
            exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3912
0
                                        "locale", str, len,
3913
0
                                        (Py_ssize_t)wlen,
3914
0
                                        (Py_ssize_t)(wlen + 1),
3915
0
                                        reason);
3916
0
            if (exc != NULL) {
3917
0
                PyCodec_StrictErrors(exc);
3918
0
                Py_DECREF(exc);
3919
0
            }
3920
0
        }
3921
0
        else if (res == -3) {
3922
0
            PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3923
0
        }
3924
0
        else {
3925
0
            PyErr_NoMemory();
3926
0
        }
3927
0
        return NULL;
3928
0
    }
3929
3930
15.8k
    PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3931
15.8k
    PyMem_RawFree(wstr);
3932
15.8k
    return unicode;
3933
15.8k
}
3934
3935
PyObject*
3936
PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3937
                              const char *errors)
3938
0
{
3939
0
    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3940
0
    return unicode_decode_locale(str, len, error_handler, 1);
3941
0
}
3942
3943
PyObject*
3944
PyUnicode_DecodeLocale(const char *str, const char *errors)
3945
10.6k
{
3946
10.6k
    Py_ssize_t size = (Py_ssize_t)strlen(str);
3947
10.6k
    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3948
10.6k
    return unicode_decode_locale(str, size, error_handler, 1);
3949
10.6k
}
3950
3951
3952
PyObject*
3953
0
PyUnicode_DecodeFSDefault(const char *s) {
3954
0
    Py_ssize_t size = (Py_ssize_t)strlen(s);
3955
0
    return PyUnicode_DecodeFSDefaultAndSize(s, size);
3956
0
}
3957
3958
PyObject*
3959
PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3960
6.74k
{
3961
6.74k
    PyInterpreterState *interp = _PyInterpreterState_GET();
3962
6.74k
    struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3963
6.74k
    if (fs_codec->utf8) {
3964
1.60k
        return unicode_decode_utf8(s, size,
3965
1.60k
                                   fs_codec->error_handler,
3966
1.60k
                                   fs_codec->errors,
3967
1.60k
                                   NULL);
3968
1.60k
    }
3969
5.13k
#ifndef _Py_FORCE_UTF8_FS_ENCODING
3970
5.13k
    else if (fs_codec->encoding) {
3971
0
        return PyUnicode_Decode(s, size,
3972
0
                                fs_codec->encoding,
3973
0
                                fs_codec->errors);
3974
0
    }
3975
5.13k
#endif
3976
5.13k
    else {
3977
        /* Before _PyUnicode_InitEncodings() is called, the Python codec
3978
           machinery is not ready and so cannot be used:
3979
           use mbstowcs() in this case. */
3980
5.13k
        const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3981
5.13k
        const wchar_t *filesystem_errors = config->filesystem_errors;
3982
5.13k
        assert(filesystem_errors != NULL);
3983
5.13k
        _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3984
5.13k
        assert(errors != _Py_ERROR_UNKNOWN);
3985
#ifdef _Py_FORCE_UTF8_FS_ENCODING
3986
        return unicode_decode_utf8(s, size, errors, NULL, NULL);
3987
#else
3988
5.13k
        return unicode_decode_locale(s, size, errors, 0);
3989
5.13k
#endif
3990
5.13k
    }
3991
6.74k
}
3992
3993
3994
int
3995
PyUnicode_FSConverter(PyObject* arg, void* addr)
3996
11.3k
{
3997
11.3k
    PyObject *path = NULL;
3998
11.3k
    PyObject *output = NULL;
3999
11.3k
    Py_ssize_t size;
4000
11.3k
    const char *data;
4001
11.3k
    if (arg == NULL) {
4002
0
        Py_DECREF(*(PyObject**)addr);
4003
0
        *(PyObject**)addr = NULL;
4004
0
        return 1;
4005
0
    }
4006
11.3k
    path = PyOS_FSPath(arg);
4007
11.3k
    if (path == NULL) {
4008
0
        return 0;
4009
0
    }
4010
11.3k
    if (PyBytes_Check(path)) {
4011
0
        output = path;
4012
0
    }
4013
11.3k
    else {  // PyOS_FSPath() guarantees its returned value is bytes or str.
4014
11.3k
        output = PyUnicode_EncodeFSDefault(path);
4015
11.3k
        Py_DECREF(path);
4016
11.3k
        if (!output) {
4017
0
            return 0;
4018
0
        }
4019
11.3k
        assert(PyBytes_Check(output));
4020
11.3k
    }
4021
4022
11.3k
    size = PyBytes_GET_SIZE(output);
4023
11.3k
    data = PyBytes_AS_STRING(output);
4024
11.3k
    if ((size_t)size != strlen(data)) {
4025
0
        PyErr_SetString(PyExc_ValueError, "embedded null byte");
4026
0
        Py_DECREF(output);
4027
0
        return 0;
4028
0
    }
4029
11.3k
    *(PyObject**)addr = output;
4030
11.3k
    return Py_CLEANUP_SUPPORTED;
4031
11.3k
}
4032
4033
4034
int
4035
PyUnicode_FSDecoder(PyObject* arg, void* addr)
4036
21.1k
{
4037
21.1k
    if (arg == NULL) {
4038
0
        Py_DECREF(*(PyObject**)addr);
4039
0
        *(PyObject**)addr = NULL;
4040
0
        return 1;
4041
0
    }
4042
4043
21.1k
    PyObject *path = PyOS_FSPath(arg);
4044
21.1k
    if (path == NULL) {
4045
0
        return 0;
4046
0
    }
4047
4048
21.1k
    PyObject *output = NULL;
4049
21.1k
    if (PyUnicode_Check(path)) {
4050
21.1k
        output = path;
4051
21.1k
    }
4052
0
    else if (PyBytes_Check(path)) {
4053
0
        output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path),
4054
0
                                                  PyBytes_GET_SIZE(path));
4055
0
        Py_DECREF(path);
4056
0
        if (!output) {
4057
0
            return 0;
4058
0
        }
4059
0
    }
4060
0
    else {
4061
0
        PyErr_Format(PyExc_TypeError,
4062
0
                     "path should be string, bytes, or os.PathLike, not %.200s",
4063
0
                     Py_TYPE(arg)->tp_name);
4064
0
        Py_DECREF(path);
4065
0
        return 0;
4066
0
    }
4067
4068
21.1k
    if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
4069
21.1k
                 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
4070
0
        PyErr_SetString(PyExc_ValueError, "embedded null character");
4071
0
        Py_DECREF(output);
4072
0
        return 0;
4073
0
    }
4074
21.1k
    *(PyObject**)addr = output;
4075
21.1k
    return Py_CLEANUP_SUPPORTED;
4076
21.1k
}
4077
4078
4079
static int unicode_fill_utf8(PyObject *unicode);
4080
4081
4082
static int
4083
unicode_ensure_utf8(PyObject *unicode)
4084
22.1M
{
4085
22.1M
    int err = 0;
4086
22.1M
    if (PyUnicode_UTF8(unicode) == NULL) {
4087
184k
        Py_BEGIN_CRITICAL_SECTION(unicode);
4088
184k
        if (PyUnicode_UTF8(unicode) == NULL) {
4089
184k
            err = unicode_fill_utf8(unicode);
4090
184k
        }
4091
184k
        Py_END_CRITICAL_SECTION();
4092
184k
    }
4093
22.1M
    return err;
4094
22.1M
}
4095
4096
const char *
4097
PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
4098
22.1M
{
4099
22.1M
    if (!PyUnicode_Check(unicode)) {
4100
0
        PyErr_BadArgument();
4101
0
        if (psize) {
4102
0
            *psize = -1;
4103
0
        }
4104
0
        return NULL;
4105
0
    }
4106
4107
22.1M
    if (unicode_ensure_utf8(unicode) == -1) {
4108
207
        if (psize) {
4109
207
            *psize = -1;
4110
207
        }
4111
207
        return NULL;
4112
207
    }
4113
4114
22.1M
    if (psize) {
4115
22.1M
        *psize = PyUnicode_UTF8_LENGTH(unicode);
4116
22.1M
    }
4117
22.1M
    return PyUnicode_UTF8(unicode);
4118
22.1M
}
4119
4120
const char *
4121
PyUnicode_AsUTF8(PyObject *unicode)
4122
65.4k
{
4123
65.4k
    return PyUnicode_AsUTF8AndSize(unicode, NULL);
4124
65.4k
}
4125
4126
const char *
4127
_PyUnicode_AsUTF8NoNUL(PyObject *unicode)
4128
904k
{
4129
904k
    Py_ssize_t size;
4130
904k
    const char *s = PyUnicode_AsUTF8AndSize(unicode, &size);
4131
904k
    if (s && strlen(s) != (size_t)size) {
4132
155
        PyErr_SetString(PyExc_ValueError, "embedded null character");
4133
155
        return NULL;
4134
155
    }
4135
904k
    return s;
4136
904k
}
4137
4138
/*
4139
PyUnicode_GetSize() has been deprecated since Python 3.3
4140
because it returned length of Py_UNICODE.
4141
4142
But this function is part of stable abi, because it doesn't
4143
include Py_UNICODE in signature and it was not excluded from
4144
stable ABI in PEP 384.
4145
*/
4146
PyAPI_FUNC(Py_ssize_t)
4147
PyUnicode_GetSize(PyObject *unicode)
4148
0
{
4149
0
    PyErr_SetString(PyExc_RuntimeError,
4150
0
                    "PyUnicode_GetSize has been removed.");
4151
0
    return -1;
4152
0
}
4153
4154
Py_ssize_t
4155
PyUnicode_GetLength(PyObject *unicode)
4156
19.2k
{
4157
19.2k
    if (!PyUnicode_Check(unicode)) {
4158
0
        PyErr_BadArgument();
4159
0
        return -1;
4160
0
    }
4161
19.2k
    return PyUnicode_GET_LENGTH(unicode);
4162
19.2k
}
4163
4164
Py_UCS4
4165
PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4166
20
{
4167
20
    const void *data;
4168
20
    int kind;
4169
4170
20
    if (!PyUnicode_Check(unicode)) {
4171
0
        PyErr_BadArgument();
4172
0
        return (Py_UCS4)-1;
4173
0
    }
4174
20
    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4175
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
4176
0
        return (Py_UCS4)-1;
4177
0
    }
4178
20
    data = PyUnicode_DATA(unicode);
4179
20
    kind = PyUnicode_KIND(unicode);
4180
20
    return PyUnicode_READ(kind, data, index);
4181
20
}
4182
4183
int
4184
PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4185
0
{
4186
0
    if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
4187
0
        PyErr_BadArgument();
4188
0
        return -1;
4189
0
    }
4190
0
    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4191
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
4192
0
        return -1;
4193
0
    }
4194
0
    if (unicode_check_modifiable(unicode))
4195
0
        return -1;
4196
0
    if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4197
0
        PyErr_SetString(PyExc_ValueError, "character out of range");
4198
0
        return -1;
4199
0
    }
4200
0
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4201
0
                    index, ch);
4202
0
    return 0;
4203
0
}
4204
4205
const char *
PyUnicode_GetDefaultEncoding(void)
{
    /* The default encoding name is a fixed string. */
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4210
4211
/* create or adjust a UnicodeDecodeError */
4212
static void
4213
make_decode_exception(PyObject **exceptionObject,
4214
                      const char *encoding,
4215
                      const char *input, Py_ssize_t length,
4216
                      Py_ssize_t startpos, Py_ssize_t endpos,
4217
                      const char *reason)
4218
317k
{
4219
317k
    if (*exceptionObject == NULL) {
4220
96.6k
        *exceptionObject = PyUnicodeDecodeError_Create(
4221
96.6k
            encoding, input, length, startpos, endpos, reason);
4222
96.6k
    }
4223
220k
    else {
4224
220k
        if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4225
0
            goto onError;
4226
220k
        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4227
0
            goto onError;
4228
220k
        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4229
0
            goto onError;
4230
220k
    }
4231
317k
    return;
4232
4233
317k
onError:
4234
0
    Py_CLEAR(*exceptionObject);
4235
0
}
4236
4237
#ifdef MS_WINDOWS
4238
static int
4239
widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4240
{
4241
    if (newsize > *size) {
4242
        wchar_t *newbuf = *buf;
4243
        if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4244
            PyErr_NoMemory();
4245
            return -1;
4246
        }
4247
        *buf = newbuf;
4248
    }
4249
    *size = newsize;
4250
    return 0;
4251
}
4252
4253
/* error handling callback helper:
4254
   build arguments, call the callback and check the arguments,
4255
   if no exception occurred, copy the replacement to the output
4256
   and adjust various state variables.
4257
   return 0 on success, -1 on error
4258
*/
4259
4260
static int
4261
unicode_decode_call_errorhandler_wchar(
4262
    const char *errors, PyObject **errorHandler,
4263
    const char *encoding, const char *reason,
4264
    const char **input, const char **inend, Py_ssize_t *startinpos,
4265
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4266
    wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
4267
{
4268
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4269
4270
    PyObject *restuple = NULL;
4271
    PyObject *repunicode = NULL;
4272
    Py_ssize_t outsize;
4273
    Py_ssize_t insize;
4274
    Py_ssize_t requiredsize;
4275
    Py_ssize_t newpos;
4276
    PyObject *inputobj = NULL;
4277
    Py_ssize_t repwlen;
4278
4279
    if (*errorHandler == NULL) {
4280
        *errorHandler = PyCodec_LookupError(errors);
4281
        if (*errorHandler == NULL)
4282
            goto onError;
4283
    }
4284
4285
    make_decode_exception(exceptionObject,
4286
        encoding,
4287
        *input, *inend - *input,
4288
        *startinpos, *endinpos,
4289
        reason);
4290
    if (*exceptionObject == NULL)
4291
        goto onError;
4292
4293
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
4294
    if (restuple == NULL)
4295
        goto onError;
4296
    if (!PyTuple_Check(restuple)) {
4297
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
4298
        goto onError;
4299
    }
4300
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4301
        goto onError;
4302
4303
    /* Copy back the bytes variables, which might have been modified by the
4304
       callback */
4305
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4306
    if (!inputobj)
4307
        goto onError;
4308
    *input = PyBytes_AS_STRING(inputobj);
4309
    insize = PyBytes_GET_SIZE(inputobj);
4310
    *inend = *input + insize;
4311
    /* we can DECREF safely, as the exception has another reference,
4312
       so the object won't go away. */
4313
    Py_DECREF(inputobj);
4314
4315
    if (newpos<0)
4316
        newpos = insize+newpos;
4317
    if (newpos<0 || newpos>insize) {
4318
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4319
        goto onError;
4320
    }
4321
4322
    repwlen = PyUnicode_AsWideChar(repunicode, NULL, 0);
4323
    if (repwlen < 0)
4324
        goto onError;
4325
    repwlen--;
4326
    /* need more space? (at least enough for what we
4327
       have+the replacement+the rest of the string (starting
4328
       at the new input position), so we won't have to check space
4329
       when there are no errors in the rest of the string) */
4330
    requiredsize = *outpos;
4331
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4332
        goto overflow;
4333
    requiredsize += repwlen;
4334
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4335
        goto overflow;
4336
    requiredsize += insize - newpos;
4337
    outsize = *bufsize;
4338
    if (requiredsize > outsize) {
4339
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
4340
            requiredsize = 2*outsize;
4341
        if (widechar_resize(buf, bufsize, requiredsize) < 0) {
4342
            goto onError;
4343
        }
4344
    }
4345
    PyUnicode_AsWideChar(repunicode, *buf + *outpos, repwlen);
4346
    *outpos += repwlen;
4347
    *endinpos = newpos;
4348
    *inptr = *input + newpos;
4349
4350
    /* we made it! */
4351
    Py_DECREF(restuple);
4352
    return 0;
4353
4354
  overflow:
4355
    PyErr_SetString(PyExc_OverflowError,
4356
                    "decoded result is too long for a Python string");
4357
4358
  onError:
4359
    Py_XDECREF(restuple);
4360
    return -1;
4361
}
4362
#endif   /* MS_WINDOWS */
4363
4364
static int
4365
unicode_decode_call_errorhandler_writer(
4366
    const char *errors, PyObject **errorHandler,
4367
    const char *encoding, const char *reason,
4368
    const char **input, const char **inend, Py_ssize_t *startinpos,
4369
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4370
    _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4371
317k
{
4372
317k
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4373
4374
317k
    PyObject *restuple = NULL;
4375
317k
    PyObject *repunicode = NULL;
4376
317k
    Py_ssize_t insize;
4377
317k
    Py_ssize_t newpos;
4378
317k
    Py_ssize_t replen;
4379
317k
    Py_ssize_t remain;
4380
317k
    PyObject *inputobj = NULL;
4381
317k
    int need_to_grow = 0;
4382
317k
    const char *new_inptr;
4383
4384
317k
    if (*errorHandler == NULL) {
4385
96.6k
        *errorHandler = PyCodec_LookupError(errors);
4386
96.6k
        if (*errorHandler == NULL)
4387
0
            goto onError;
4388
96.6k
    }
4389
4390
317k
    make_decode_exception(exceptionObject,
4391
317k
        encoding,
4392
317k
        *input, *inend - *input,
4393
317k
        *startinpos, *endinpos,
4394
317k
        reason);
4395
317k
    if (*exceptionObject == NULL)
4396
0
        goto onError;
4397
4398
317k
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
4399
317k
    if (restuple == NULL)
4400
59.3k
        goto onError;
4401
258k
    if (!PyTuple_Check(restuple)) {
4402
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
4403
0
        goto onError;
4404
0
    }
4405
258k
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4406
0
        goto onError;
4407
4408
    /* Copy back the bytes variables, which might have been modified by the
4409
       callback */
4410
258k
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4411
258k
    if (!inputobj)
4412
0
        goto onError;
4413
258k
    remain = *inend - *input - *endinpos;
4414
258k
    *input = PyBytes_AS_STRING(inputobj);
4415
258k
    insize = PyBytes_GET_SIZE(inputobj);
4416
258k
    *inend = *input + insize;
4417
    /* we can DECREF safely, as the exception has another reference,
4418
       so the object won't go away. */
4419
258k
    Py_DECREF(inputobj);
4420
4421
258k
    if (newpos<0)
4422
0
        newpos = insize+newpos;
4423
258k
    if (newpos<0 || newpos>insize) {
4424
0
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4425
0
        goto onError;
4426
0
    }
4427
4428
258k
    replen = PyUnicode_GET_LENGTH(repunicode);
4429
258k
    if (replen > 1) {
4430
24.0k
        writer->min_length += replen - 1;
4431
24.0k
        need_to_grow = 1;
4432
24.0k
    }
4433
258k
    new_inptr = *input + newpos;
4434
258k
    if (*inend - new_inptr > remain) {
4435
        /* We don't know the decoding algorithm here so we make the worst
4436
           assumption that one byte decodes to one unicode character.
4437
           If unfortunately one byte could decode to more unicode characters,
4438
           the decoder may write out-of-bound then.  Is it possible for the
4439
           algorithms using this function? */
4440
11.6k
        writer->min_length += *inend - new_inptr - remain;
4441
11.6k
        need_to_grow = 1;
4442
11.6k
    }
4443
258k
    if (need_to_grow) {
4444
24.1k
        writer->overallocate = 1;
4445
24.1k
        if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
4446
24.1k
                            PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4447
0
            goto onError;
4448
24.1k
    }
4449
258k
    if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
4450
0
        goto onError;
4451
4452
258k
    *endinpos = newpos;
4453
258k
    *inptr = new_inptr;
4454
4455
    /* we made it! */
4456
258k
    Py_DECREF(restuple);
4457
258k
    return 0;
4458
4459
59.3k
  onError:
4460
59.3k
    Py_XDECREF(restuple);
4461
59.3k
    return -1;
4462
258k
}
4463
4464
/* --- UTF-7 Codec -------------------------------------------------------- */
4465
4466
/* See RFC2152 for details.  We encode conservatively and decode liberally. */
4467
4468
/* Three simple macros defining base-64. */
4469
4470
/* Is c a base-64 character? */
4471
4472
/* Non-zero when c is an upper-case letter, lower-case letter, digit,
   '+' or '/'.  The argument is evaluated more than once. */
#define IS_BASE64(c)                        \
    (((c) >= 'A' && (c) <= 'Z')             \
     || ((c) >= 'a' && (c) <= 'z')          \
     || ((c) >= '0' && (c) <= '9')          \
     || (c) == '+'                          \
     || (c) == '/')
4477
4478
/* given that c is a base-64 character, what is its base-64 value? */
4479
4480
/* Maps 'A'-'Z' to 0-25, 'a'-'z' to 26-51, '0'-'9' to 52-61, '+' to 62
   and anything else to 63, so it is only meaningful when IS_BASE64(c)
   holds.  The argument is evaluated more than once. */
#define FROM_BASE64(c)                                      \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A'                 \
     : ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26          \
     : ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52          \
     : (c) == '+' ? 62                                      \
     : 63)
4485
4486
/* What is the base-64 character of the bottom 6 bits of n? */
4487
4488
/* Only the low six bits of n are used; higher bits are masked off. */
#define TO_BASE64(n)                    \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"       \
     "abcdefghijklmnopqrstuvwxyz"       \
     "0123456789+/"[(n) & 0x3f])
4490
4491
/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
4492
 * decoded as itself.  We are permissive on decoding; the only ASCII
4493
 * byte not decoding to itself is the + which begins a base64
4494
 * string. */
4495
4496
/* Non-zero for every value up to 127 except '+'.  The argument is
   evaluated more than once. */
#define DECODE_DIRECT(c) \
    ((c) < 128 && (c) != '+')
4498
4499
/* The UTF-7 encoder treats ASCII characters differently according to
4500
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
4501
 * the above).  See RFC2152.  This array identifies these different
4502
 * sets:
4503
 * 0 : "Set D"
4504
 *     alphanumeric and '(),-./:?
4505
 * 1 : "Set O"
4506
 *     !"#$%&*;<=>@[]^_`{|}
4507
 * 2 : "whitespace"
4508
 *     ht nl cr sp
4509
 * 3 : special (must be base64 encoded)
4510
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
4511
 */
4512
4513
/* Category of each ASCII code point (index 0..127) for the UTF-7 encoder:
   0 = Set D, 1 = Set O, 2 = whitespace, 3 = must be base-64 encoded.
   The table is read-only, hence const. */
static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4532
4533
/* ENCODE_DIRECT: this character should be encoded as itself.  The
4534
 * answer depends on whether we are encoding set O as itself, and also
4535
 * on whether we are encoding whitespace as itself.  RFC 2152 makes it
4536
 * clear that the answers to these questions vary between
4537
 * applications, so this code needs to be flexible.  */
4538
4539
/* Non-zero for code points 1..127 whose utf7_category entry is not 3.
   The range is checked before the table is indexed.  The argument is
   evaluated more than once. */
#define ENCODE_DIRECT(c) \
    ((c) > 0 && (c) < 128 && utf7_category[(c)] != 3)
4541
4542
PyObject *
4543
PyUnicode_DecodeUTF7(const char *s,
4544
                     Py_ssize_t size,
4545
                     const char *errors)
4546
0
{
4547
0
    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4548
0
}
4549
4550
/* The decoder.  The only state we preserve is our read position,
4551
 * i.e. how many characters we have consumed.  So if we end in the
4552
 * middle of a shift sequence we have to back off the read position
4553
 * and the output to the beginning of the sequence, otherwise we lose
4554
 * all the shift state (seen bits, number of bits seen, high
4555
 * surrogate). */
4556
4557
PyObject *
4558
PyUnicode_DecodeUTF7Stateful(const char *s,
4559
                             Py_ssize_t size,
4560
                             const char *errors,
4561
                             Py_ssize_t *consumed)
4562
28.1k
{
4563
28.1k
    const char *starts = s;
4564
28.1k
    Py_ssize_t startinpos;
4565
28.1k
    Py_ssize_t endinpos;
4566
28.1k
    const char *e;
4567
28.1k
    _PyUnicodeWriter writer;
4568
28.1k
    const char *errmsg = "";
4569
28.1k
    int inShift = 0;
4570
28.1k
    Py_ssize_t shiftOutStart;
4571
28.1k
    unsigned int base64bits = 0;
4572
28.1k
    unsigned long base64buffer = 0;
4573
28.1k
    Py_UCS4 surrogate = 0;
4574
28.1k
    PyObject *errorHandler = NULL;
4575
28.1k
    PyObject *exc = NULL;
4576
4577
28.1k
    if (size == 0) {
4578
0
        if (consumed)
4579
0
            *consumed = 0;
4580
0
        _Py_RETURN_UNICODE_EMPTY();
4581
0
    }
4582
4583
    /* Start off assuming it's all ASCII. Widen later as necessary. */
4584
28.1k
    _PyUnicodeWriter_Init(&writer);
4585
28.1k
    writer.min_length = size;
4586
4587
28.1k
    shiftOutStart = 0;
4588
28.1k
    e = s + size;
4589
4590
7.95M
    while (s < e) {
4591
7.93M
        Py_UCS4 ch;
4592
7.93M
      restart:
4593
7.93M
        ch = (unsigned char) *s;
4594
4595
7.93M
        if (inShift) { /* in a base-64 section */
4596
307k
            if (IS_BASE64(ch)) { /* consume a base-64 character */
4597
288k
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4598
288k
                base64bits += 6;
4599
288k
                s++;
4600
288k
                if (base64bits >= 16) {
4601
                    /* we have enough bits for a UTF-16 value */
4602
100k
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
4603
100k
                    base64bits -= 16;
4604
100k
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4605
100k
                    assert(outCh <= 0xffff);
4606
100k
                    if (surrogate) {
4607
                        /* expecting a second surrogate */
4608
8.68k
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4609
3.78k
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
4610
3.78k
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
4611
0
                                goto onError;
4612
3.78k
                            surrogate = 0;
4613
3.78k
                            continue;
4614
3.78k
                        }
4615
4.90k
                        else {
4616
4.90k
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4617
0
                                goto onError;
4618
4.90k
                            surrogate = 0;
4619
4.90k
                        }
4620
8.68k
                    }
4621
96.6k
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
4622
                        /* first surrogate */
4623
12.8k
                        surrogate = outCh;
4624
12.8k
                    }
4625
83.7k
                    else {
4626
83.7k
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
4627
0
                            goto onError;
4628
83.7k
                    }
4629
96.6k
                }
4630
288k
            }
4631
18.5k
            else { /* now leaving a base-64 section */
4632
18.5k
                inShift = 0;
4633
18.5k
                if (base64bits > 0) { /* left-over bits */
4634
15.0k
                    if (base64bits >= 6) {
4635
                        /* We've seen at least one base-64 character */
4636
8.91k
                        s++;
4637
8.91k
                        errmsg = "partial character in shift sequence";
4638
8.91k
                        goto utf7Error;
4639
8.91k
                    }
4640
6.17k
                    else {
4641
                        /* Some bits remain; they should be zero */
4642
6.17k
                        if (base64buffer != 0) {
4643
1.28k
                            s++;
4644
1.28k
                            errmsg = "non-zero padding bits in shift sequence";
4645
1.28k
                            goto utf7Error;
4646
1.28k
                        }
4647
6.17k
                    }
4648
15.0k
                }
4649
8.32k
                if (surrogate && DECODE_DIRECT(ch)) {
4650
2.85k
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4651
0
                        goto onError;
4652
2.85k
                }
4653
8.32k
                surrogate = 0;
4654
8.32k
                if (ch == '-') {
4655
                    /* '-' is absorbed; other terminating
4656
                       characters are preserved */
4657
2.13k
                    s++;
4658
2.13k
                }
4659
8.32k
            }
4660
307k
        }
4661
7.63M
        else if ( ch == '+' ) {
4662
29.0k
            startinpos = s-starts;
4663
29.0k
            s++; /* consume '+' */
4664
29.0k
            if (s < e && *s == '-') { /* '+-' encodes '+' */
4665
2.30k
                s++;
4666
2.30k
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
4667
0
                    goto onError;
4668
2.30k
            }
4669
26.7k
            else if (s < e && !IS_BASE64(*s)) {
4670
3.87k
                s++;
4671
3.87k
                errmsg = "ill-formed sequence";
4672
3.87k
                goto utf7Error;
4673
3.87k
            }
4674
22.9k
            else { /* begin base64-encoded section */
4675
22.9k
                inShift = 1;
4676
22.9k
                surrogate = 0;
4677
22.9k
                shiftOutStart = writer.pos;
4678
22.9k
                base64bits = 0;
4679
22.9k
                base64buffer = 0;
4680
22.9k
            }
4681
29.0k
        }
4682
7.60M
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
4683
7.50M
            s++;
4684
7.50M
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
4685
0
                goto onError;
4686
7.50M
        }
4687
96.5k
        else {
4688
96.5k
            startinpos = s-starts;
4689
96.5k
            s++;
4690
96.5k
            errmsg = "unexpected special character";
4691
96.5k
            goto utf7Error;
4692
96.5k
        }
4693
7.82M
        continue;
4694
7.82M
utf7Error:
4695
110k
        endinpos = s-starts;
4696
110k
        if (unicode_decode_call_errorhandler_writer(
4697
110k
                errors, &errorHandler,
4698
110k
                "utf7", errmsg,
4699
110k
                &starts, &e, &startinpos, &endinpos, &exc, &s,
4700
110k
                &writer))
4701
11.9k
            goto onError;
4702
110k
    }
4703
4704
    /* end of string */
4705
4706
16.2k
    if (inShift && !consumed) { /* in shift sequence, no more to follow */
4707
        /* if we're in an inconsistent state, that's an error */
4708
4.39k
        inShift = 0;
4709
4.39k
        if (surrogate ||
4710
3.49k
                (base64bits >= 6) ||
4711
3.08k
                (base64bits > 0 && base64buffer != 0)) {
4712
3.08k
            endinpos = size;
4713
3.08k
            if (unicode_decode_call_errorhandler_writer(
4714
3.08k
                    errors, &errorHandler,
4715
3.08k
                    "utf7", "unterminated shift sequence",
4716
3.08k
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
4717
3.08k
                    &writer))
4718
2.72k
                goto onError;
4719
359
            if (s < e)
4720
0
                goto restart;
4721
359
        }
4722
4.39k
    }
4723
4724
    /* return state */
4725
13.4k
    if (consumed) {
4726
0
        if (inShift) {
4727
0
            *consumed = startinpos;
4728
0
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
4729
0
                PyObject *result = PyUnicode_FromKindAndData(
4730
0
                        writer.kind, writer.data, shiftOutStart);
4731
0
                Py_XDECREF(errorHandler);
4732
0
                Py_XDECREF(exc);
4733
0
                _PyUnicodeWriter_Dealloc(&writer);
4734
0
                return result;
4735
0
            }
4736
0
            writer.pos = shiftOutStart; /* back off output */
4737
0
        }
4738
0
        else {
4739
0
            *consumed = s-starts;
4740
0
        }
4741
0
    }
4742
4743
13.4k
    Py_XDECREF(errorHandler);
4744
13.4k
    Py_XDECREF(exc);
4745
13.4k
    return _PyUnicodeWriter_Finish(&writer);
4746
4747
14.6k
  onError:
4748
14.6k
    Py_XDECREF(errorHandler);
4749
14.6k
    Py_XDECREF(exc);
4750
14.6k
    _PyUnicodeWriter_Dealloc(&writer);
4751
14.6k
    return NULL;
4752
13.4k
}
4753
4754
4755
/* Encode the string object `str` as UTF-7 and return a new bytes object,
 * or NULL with an exception set on failure.
 *
 * `errors` is accepted but not used by this function.
 */
PyObject *
_PyUnicode_EncodeUTF7(PyObject *str,
                      const char *errors)
{
    Py_ssize_t len = PyUnicode_GET_LENGTH(str);
    if (len == 0) {
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
    }
    int kind = PyUnicode_KIND(str);
    const void *data = PyUnicode_DATA(str);

    /* The output buffer is sized at 8 bytes per input character; reject
       lengths for which that product would overflow.
       It might be possible to tighten this worst case */
    if (len > PY_SSIZE_T_MAX / 8) {
        return PyErr_NoMemory();
    }
    PyBytesWriter *writer = PyBytesWriter_Create(len * 8);
    if (writer == NULL) {
        return NULL;
    }

    char *out = PyBytesWriter_GetData(writer);
    int in_shift = 0;
    unsigned int nbits = 0;       /* number of pending bits in bitbuf */
    unsigned long bitbuf = 0;

    for (Py_ssize_t i = 0; i < len; ++i) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);

        if (in_shift && ENCODE_DIRECT(ch)) {
            /* shifting out */
            if (nbits) { /* output remaining bits */
                *out++ = TO_BASE64(bitbuf << (6 - nbits));
                bitbuf = 0;
                nbits = 0;
            }
            in_shift = 0;
            /* Characters not in the BASE64 set implicitly unshift the sequence
               so no '-' is required, except if the character is itself a '-' */
            if (IS_BASE64(ch) || ch == '-') {
                *out++ = '-';
            }
            *out++ = (char) ch;
            continue;
        }

        if (!in_shift) {
            if (ch == '+') {
                *out++ = '+';
                *out++ = '-';
                continue;
            }
            if (ENCODE_DIRECT(ch)) {
                *out++ = (char) ch;
                continue;
            }
            /* Open a shift sequence, then encode ch below. */
            *out++ = '+';
            in_shift = 1;
        }

        /* Base-64 encode ch as one or two 16-bit units. */
        if (ch >= 0x10000) {
            assert(ch <= MAX_UNICODE);

            /* code first surrogate */
            nbits += 16;
            bitbuf = (bitbuf << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
            while (nbits >= 6) {
                *out++ = TO_BASE64(bitbuf >> (nbits - 6));
                nbits -= 6;
            }
            /* prepare second surrogate */
            ch = Py_UNICODE_LOW_SURROGATE(ch);
        }
        nbits += 16;
        bitbuf = (bitbuf << 16) | ch;
        while (nbits >= 6) {
            *out++ = TO_BASE64(bitbuf >> (nbits - 6));
            nbits -= 6;
        }
    }

    /* Flush the leftover bits and close a shift sequence still open. */
    if (nbits) {
        *out++ = TO_BASE64(bitbuf << (6 - nbits));
    }
    if (in_shift) {
        *out++ = '-';
    }
    return PyBytesWriter_FinishWithPointer(writer, out);
}
4844
4845
#undef IS_BASE64
4846
#undef FROM_BASE64
4847
#undef TO_BASE64
4848
#undef DECODE_DIRECT
4849
#undef ENCODE_DIRECT
4850
4851
/* --- UTF-8 Codec -------------------------------------------------------- */
4852
4853
PyObject *
4854
PyUnicode_DecodeUTF8(const char *s,
4855
                     Py_ssize_t size,
4856
                     const char *errors)
4857
2.24M
{
4858
2.24M
    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4859
2.24M
}
4860
4861
#include "stringlib/asciilib.h"
4862
#include "stringlib/codecs.h"
4863
#include "stringlib/undef.h"
4864
4865
#include "stringlib/ucs1lib.h"
4866
#include "stringlib/codecs.h"
4867
#include "stringlib/undef.h"
4868
4869
#include "stringlib/ucs2lib.h"
4870
#include "stringlib/codecs.h"
4871
#include "stringlib/undef.h"
4872
4873
#include "stringlib/ucs4lib.h"
4874
#include "stringlib/codecs.h"
4875
#include "stringlib/undef.h"
4876
4877
/* Constants as wide as 'size_t', each repeating one pattern per byte or
   per 16-bit lane:
     ASCII_CHAR_MASK - high bit of every byte; quickly checks whether a
                       'size_t' contains a non-ASCII, UTF8-encoded char.
     VECTOR_0101     - low bit of every byte; used to count codepoints in
                       a UTF-8 string.
     VECTOR_00FF     - low byte of every 16-bit lane; used when summing
                       those per-byte counts. */
#if SIZEOF_SIZE_T == 8
# define ASCII_CHAR_MASK 0x8080808080808080ULL
# define VECTOR_0101     0x0101010101010101ULL
# define VECTOR_00FF     0x00ff00ff00ff00ffULL
#elif SIZEOF_SIZE_T == 4
# define ASCII_CHAR_MASK 0x80808080U
# define VECTOR_0101     0x01010101U
# define VECTOR_00FF     0x00ff00ffU
#else
# error C 'size_t' size should be either 4 or 8!
#endif
4891
4892
/* ctz(v): index of the lowest set bit of v, via a compiler intrinsic.
   HAVE_CTZ is 1 when ctz() is available and 0 otherwise.  Every visible
   caller passes a non-zero v. */
#if defined(__GNUC__) || defined(__clang__)
#define HAVE_CTZ 1
static inline unsigned int
ctz(size_t v)
{
    unsigned long long wide = (unsigned long long)v;
    return __builtin_ctzll(wide);
}
#elif defined(_MSC_VER)
#define HAVE_CTZ 1
static inline unsigned int
ctz(size_t v)
{
    unsigned long index;
#if SIZEOF_SIZE_T == 4
    _BitScanForward(&index, v);
#else
    _BitScanForward64(&index, v);
#endif /* SIZEOF_SIZE_T */
    return index;
}
#else
#define HAVE_CTZ 0
#endif
4915
4916
#if HAVE_CTZ && PY_LITTLE_ENDIAN
/* Build a size_t from the bytes p[0]..p[size-1], with p[i] stored as byte
   i of the result and all remaining bytes zero.  The bytes are copied one
   at a time, so p needs no alignment and nothing past p[size-1] is read.
   A size larger than SIZEOF_SIZE_T copies SIZEOF_SIZE_T bytes. */
static size_t
load_unaligned(const unsigned char *p, size_t size)
{
    union {
        size_t s;
        unsigned char b[SIZEOF_SIZE_T];
    } word;

    word.s = 0;
    // This switch statement assumes little endian because:
    // * union is faster than bitwise or and shift.
    // * big endian machine is rare and hard to maintain.
    // Each case copies one byte and falls through to the next lower one.
    switch (size) {
    default:
#if SIZEOF_SIZE_T == 8
    case 8:
        word.b[7] = p[7];
        _Py_FALLTHROUGH;
    case 7:
        word.b[6] = p[6];
        _Py_FALLTHROUGH;
    case 6:
        word.b[5] = p[5];
        _Py_FALLTHROUGH;
    case 5:
        word.b[4] = p[4];
        _Py_FALLTHROUGH;
#endif
    case 4:
        word.b[3] = p[3];
        _Py_FALLTHROUGH;
    case 3:
        word.b[2] = p[2];
        _Py_FALLTHROUGH;
    case 2:
        word.b[1] = p[1];
        _Py_FALLTHROUGH;
    case 1:
        word.b[0] = p[0];
        break;
    case 0:
        break;
    }
    return word.s;
}
#endif
4963
4964
/*
4965
 * Find the first non-ASCII character in a byte sequence.
4966
 *
4967
 * This function scans a range of bytes from `start` to `end` and returns the
4968
 * index of the first byte that is not an ASCII character (i.e., has the most
4969
 * significant bit set). If all characters in the range are ASCII, it returns
4970
 * `end - start`.
4971
 */
4972
static Py_ssize_t
4973
find_first_nonascii(const unsigned char *start, const unsigned char *end)
4974
14.1M
{
4975
    // The search is done in `size_t` chunks.
4976
    // The start and end might not be aligned at `size_t` boundaries,
4977
    // so they're handled specially.
4978
4979
14.1M
    const unsigned char *p = start;
4980
4981
14.1M
    if (end - start >= SIZEOF_SIZE_T) {
4982
        // Avoid unaligned read.
4983
3.36M
#if PY_LITTLE_ENDIAN && HAVE_CTZ
4984
3.36M
        size_t u;
4985
3.36M
        memcpy(&u, p, sizeof(size_t));
4986
3.36M
        u &= ASCII_CHAR_MASK;
4987
3.36M
        if (u) {
4988
222k
            return (ctz(u) - 7) / 8;
4989
222k
        }
4990
3.14M
        p = _Py_ALIGN_DOWN(p + SIZEOF_SIZE_T, SIZEOF_SIZE_T);
4991
#else /* PY_LITTLE_ENDIAN && HAVE_CTZ */
4992
        const unsigned char *p2 = _Py_ALIGN_UP(p, SIZEOF_SIZE_T);
4993
        while (p < p2) {
4994
            if (*p & 0x80) {
4995
                return p - start;
4996
            }
4997
            p++;
4998
        }
4999
#endif
5000
5001
3.14M
        const unsigned char *e = end - SIZEOF_SIZE_T;
5002
116M
        while (p <= e) {
5003
113M
            size_t u = (*(const size_t *)p) & ASCII_CHAR_MASK;
5004
113M
            if (u) {
5005
199k
#if PY_LITTLE_ENDIAN && HAVE_CTZ
5006
199k
                return p - start + (ctz(u) - 7) / 8;
5007
#else
5008
                // big endian and minor compilers are difficult to test.
5009
                // fallback to per byte check.
5010
                break;
5011
#endif
5012
199k
            }
5013
113M
            p += SIZEOF_SIZE_T;
5014
113M
        }
5015
3.14M
    }
5016
13.7M
#if PY_LITTLE_ENDIAN && HAVE_CTZ
5017
14.1M
    assert((end - p) < SIZEOF_SIZE_T);
5018
    // we can not use *(const size_t*)p to avoid buffer overrun.
5019
13.7M
    size_t u = load_unaligned(p, end - p) & ASCII_CHAR_MASK;
5020
13.7M
    if (u) {
5021
187k
        return p - start + (ctz(u) - 7) / 8;
5022
187k
    }
5023
13.5M
    return end - start;
5024
#else
5025
    while (p < end) {
5026
        if (*p & 0x80) {
5027
            break;
5028
        }
5029
        p++;
5030
    }
5031
    return p - start;
5032
#endif
5033
13.7M
}
5034
5035
/* Return 1 when bits 7 and 6 of ch are anything other than "10", that is
   when ch has the form 0xxxxxxx or 11xxxxxx (a first byte), and 0 for a
   10xxxxxx byte. */
static inline int
scalar_utf8_start_char(unsigned int ch)
{
    return (ch & 0xC0) != 0x80;
}
5041
5042
static inline size_t
5043
vector_utf8_start_chars(size_t v)
5044
317M
{
5045
317M
    return ((~v >> 7) | (v >> 6)) & VECTOR_0101;
5046
317M
}
5047
5048
5049
// Count the number of UTF-8 code points in a given byte sequence.
5050
static Py_ssize_t
5051
utf8_count_codepoints(const unsigned char *s, const unsigned char *end)
5052
280k
{
5053
280k
    Py_ssize_t len = 0;
5054
5055
280k
    if (end - s >= SIZEOF_SIZE_T) {
5056
212k
        while (!_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) {
5057
17.3k
            len += scalar_utf8_start_char(*s++);
5058
17.3k
        }
5059
5060
1.62M
        while (s + SIZEOF_SIZE_T <= end) {
5061
1.43M
            const unsigned char *e = end;
5062
1.43M
            if (e - s > SIZEOF_SIZE_T * 255) {
5063
1.24M
                e = s + SIZEOF_SIZE_T * 255;
5064
1.24M
            }
5065
1.43M
            Py_ssize_t vstart = 0;
5066
319M
            while (s + SIZEOF_SIZE_T <= e) {
5067
317M
                size_t v = *(size_t*)s;
5068
317M
                size_t vs = vector_utf8_start_chars(v);
5069
317M
                vstart += vs;
5070
317M
                s += SIZEOF_SIZE_T;
5071
317M
            }
5072
1.43M
            vstart = (vstart & VECTOR_00FF) + ((vstart >> 8) & VECTOR_00FF);
5073
1.43M
            vstart += vstart >> 16;
5074
1.43M
#if SIZEOF_SIZE_T == 8
5075
1.43M
            vstart += vstart >> 32;
5076
1.43M
#endif
5077
1.43M
            len += vstart & 0x7ff;
5078
1.43M
        }
5079
195k
    }
5080
1.02M
    while (s < end) {
5081
748k
        len += scalar_utf8_start_char(*s++);
5082
748k
    }
5083
280k
    return len;
5084
280k
}
5085
5086
static Py_ssize_t
5087
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
5088
5.88M
{
5089
5.88M
#if SIZEOF_SIZE_T <= SIZEOF_VOID_P
5090
5.88M
    if (_Py_IS_ALIGNED(start, ALIGNOF_SIZE_T)
5091
5.86M
        && _Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T))
5092
639k
    {
5093
        /* Fast path, see in STRINGLIB(utf8_decode) for
5094
           an explanation. */
5095
639k
        const char *p = start;
5096
639k
        Py_UCS1 *q = dest;
5097
1.65M
        while (p + SIZEOF_SIZE_T <= end) {
5098
1.16M
            size_t value = *(const size_t *) p;
5099
1.16M
            if (value & ASCII_CHAR_MASK)
5100
149k
                break;
5101
1.01M
            *((size_t *)q) = value;
5102
1.01M
            p += SIZEOF_SIZE_T;
5103
1.01M
            q += SIZEOF_SIZE_T;
5104
1.01M
        }
5105
2.57M
        while (p < end) {
5106
2.10M
            if ((unsigned char)*p & 0x80)
5107
172k
                break;
5108
1.93M
            *q++ = *p++;
5109
1.93M
        }
5110
639k
        return p - start;
5111
639k
    }
5112
5.24M
#endif
5113
5.24M
    Py_ssize_t pos = find_first_nonascii((const unsigned char*)start,
5114
5.24M
                                         (const unsigned char*)end);
5115
5.24M
    memcpy(dest, start, pos);
5116
5.24M
    return pos;
5117
5.88M
}
5118
5119
static int
5120
unicode_decode_utf8_impl(_PyUnicodeWriter *writer,
5121
                         const char *starts, const char *s, const char *end,
5122
                         _Py_error_handler error_handler,
5123
                         const char *errors,
5124
                         Py_ssize_t *consumed)
5125
612k
{
5126
612k
    Py_ssize_t startinpos, endinpos;
5127
612k
    const char *errmsg = "";
5128
612k
    PyObject *error_handler_obj = NULL;
5129
612k
    PyObject *exc = NULL;
5130
5131
151M
    while (s < end) {
5132
151M
        Py_UCS4 ch;
5133
151M
        int kind = writer->kind;
5134
5135
151M
        if (kind == PyUnicode_1BYTE_KIND) {
5136
582k
            if (PyUnicode_IS_ASCII(writer->buffer))
5137
329k
                ch = asciilib_utf8_decode(&s, end, writer->data, &writer->pos);
5138
252k
            else
5139
252k
                ch = ucs1lib_utf8_decode(&s, end, writer->data, &writer->pos);
5140
151M
        } else if (kind == PyUnicode_2BYTE_KIND) {
5141
68.7M
            ch = ucs2lib_utf8_decode(&s, end, writer->data, &writer->pos);
5142
82.3M
        } else {
5143
82.3M
            assert(kind == PyUnicode_4BYTE_KIND);
5144
82.3M
            ch = ucs4lib_utf8_decode(&s, end, writer->data, &writer->pos);
5145
82.3M
        }
5146
5147
151M
        switch (ch) {
5148
534k
        case 0:
5149
534k
            if (s == end || consumed)
5150
509k
                goto End;
5151
25.4k
            errmsg = "unexpected end of data";
5152
25.4k
            startinpos = s - starts;
5153
25.4k
            endinpos = end - starts;
5154
25.4k
            break;
5155
120M
        case 1:
5156
120M
            errmsg = "invalid start byte";
5157
120M
            startinpos = s - starts;
5158
120M
            endinpos = startinpos + 1;
5159
120M
            break;
5160
29.0M
        case 2:
5161
29.0M
            if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5162
0
                && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5163
0
            {
5164
                /* Truncated surrogate code in range D800-DFFF */
5165
0
                goto End;
5166
0
            }
5167
29.0M
            _Py_FALLTHROUGH;
5168
30.3M
        case 3:
5169
30.4M
        case 4:
5170
30.4M
            errmsg = "invalid continuation byte";
5171
30.4M
            startinpos = s - starts;
5172
30.4M
            endinpos = startinpos + ch - 1;
5173
30.4M
            break;
5174
326k
        default:
5175
            // ch doesn't fit into kind, so change the buffer kind to write
5176
            // the character
5177
326k
            if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
5178
0
                goto onError;
5179
326k
            continue;
5180
151M
        }
5181
5182
150M
        if (error_handler == _Py_ERROR_UNKNOWN)
5183
131k
            error_handler = _Py_GetErrorHandler(errors);
5184
5185
150M
        switch (error_handler) {
5186
0
        case _Py_ERROR_IGNORE:
5187
0
            s += (endinpos - startinpos);
5188
0
            break;
5189
5190
150M
        case _Py_ERROR_REPLACE:
5191
150M
            if (_PyUnicodeWriter_WriteCharInline(writer, 0xfffd) < 0)
5192
0
                goto onError;
5193
150M
            s += (endinpos - startinpos);
5194
150M
            break;
5195
5196
2.19k
        case _Py_ERROR_SURROGATEESCAPE:
5197
2.19k
        {
5198
2.19k
            Py_ssize_t i;
5199
5200
2.19k
            if (_PyUnicodeWriter_PrepareKind(writer, PyUnicode_2BYTE_KIND) < 0)
5201
0
                goto onError;
5202
4.67k
            for (i=startinpos; i<endinpos; i++) {
5203
2.47k
                ch = (Py_UCS4)(unsigned char)(starts[i]);
5204
2.47k
                PyUnicode_WRITE(writer->kind, writer->data, writer->pos,
5205
2.47k
                                ch + 0xdc00);
5206
2.47k
                writer->pos++;
5207
2.47k
            }
5208
2.19k
            s += (endinpos - startinpos);
5209
2.19k
            break;
5210
2.19k
        }
5211
5212
787
        default:
5213
787
            if (unicode_decode_call_errorhandler_writer(
5214
787
                    errors, &error_handler_obj,
5215
787
                    "utf-8", errmsg,
5216
787
                    &starts, &end, &startinpos, &endinpos, &exc, &s,
5217
787
                    writer)) {
5218
787
                goto onError;
5219
787
            }
5220
5221
0
            if (_PyUnicodeWriter_Prepare(writer, end - s, 127) < 0) {
5222
0
                return -1;
5223
0
            }
5224
150M
        }
5225
150M
    }
5226
5227
611k
End:
5228
611k
    if (consumed)
5229
927
        *consumed = s - starts;
5230
5231
611k
    Py_XDECREF(error_handler_obj);
5232
611k
    Py_XDECREF(exc);
5233
611k
    return 0;
5234
5235
787
onError:
5236
787
    Py_XDECREF(error_handler_obj);
5237
787
    Py_XDECREF(exc);
5238
787
    return -1;
5239
612k
}
5240
5241
5242
static PyObject *
5243
unicode_decode_utf8(const char *s, Py_ssize_t size,
5244
                    _Py_error_handler error_handler, const char *errors,
5245
                    Py_ssize_t *consumed)
5246
10.6M
{
5247
10.6M
    if (size == 0) {
5248
70.8k
        if (consumed) {
5249
0
            *consumed = 0;
5250
0
        }
5251
70.8k
        _Py_RETURN_UNICODE_EMPTY();
5252
70.8k
    }
5253
5254
    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
5255
10.5M
    if (size == 1 && (unsigned char)s[0] < 128) {
5256
1.66M
        if (consumed) {
5257
0
            *consumed = 1;
5258
0
        }
5259
1.66M
        return get_latin1_char((unsigned char)s[0]);
5260
1.66M
    }
5261
5262
    // I don't know this check is necessary or not. But there is a test
5263
    // case that requires size=PY_SSIZE_T_MAX cause MemoryError.
5264
8.92M
    if (PY_SSIZE_T_MAX - sizeof(PyCompactUnicodeObject) < (size_t)size) {
5265
0
        PyErr_NoMemory();
5266
0
        return NULL;
5267
0
    }
5268
5269
8.92M
    const char *starts = s;
5270
8.92M
    const char *end = s + size;
5271
5272
8.92M
    Py_ssize_t pos = find_first_nonascii((const unsigned char*)starts, (const unsigned char*)end);
5273
8.92M
    if (pos == size) {  // fast path: ASCII string.
5274
8.36M
        PyObject *u = PyUnicode_New(size, 127);
5275
8.36M
        if (u == NULL) {
5276
0
            return NULL;
5277
0
        }
5278
8.36M
        memcpy(PyUnicode_1BYTE_DATA(u), s, size);
5279
8.36M
        if (consumed) {
5280
0
            *consumed = size;
5281
0
        }
5282
8.36M
        return u;
5283
8.36M
    }
5284
5285
562k
    int maxchr = 127;
5286
562k
    Py_ssize_t maxsize = size;
5287
5288
562k
    unsigned char ch = (unsigned char)(s[pos]);
5289
    // error handler other than strict may remove/replace the invalid byte.
5290
    // consumed != NULL allows 1~3 bytes remainings.
5291
    // 0x80 <= ch < 0xc2 is invalid start byte that cause UnicodeDecodeError.
5292
    // otherwise: check the input and decide the maxchr and maxsize to reduce
5293
    // reallocation and copy.
5294
562k
    if (error_handler == _Py_ERROR_STRICT && !consumed && ch >= 0xc2) {
5295
        // we only calculate the number of codepoints and don't determine the exact maxchr.
5296
        // This is because writing fast and portable SIMD code to find maxchr is difficult.
5297
        // If reallocation occurs for a larger maxchar, knowing the exact number of codepoints
5298
        // means that it is no longer necessary to allocate several times the required amount
5299
        // of memory.
5300
280k
        maxsize = utf8_count_codepoints((const unsigned char *)s, (const unsigned char *)end);
5301
280k
        if (ch < 0xc4) { // latin1
5302
164k
            maxchr = 0xff;
5303
164k
        }
5304
115k
        else if (ch < 0xf0) { // ucs2
5305
105k
            maxchr = 0xffff;
5306
105k
        }
5307
10.1k
        else { // ucs4
5308
10.1k
            maxchr = 0x10ffff;
5309
10.1k
        }
5310
280k
    }
5311
562k
    PyObject *u = PyUnicode_New(maxsize, maxchr);
5312
562k
    if (!u) {
5313
0
        return NULL;
5314
0
    }
5315
5316
    // Use _PyUnicodeWriter after fast path is failed.
5317
562k
    _PyUnicodeWriter writer;
5318
562k
    _PyUnicodeWriter_InitWithBuffer(&writer, u);
5319
562k
    if (maxchr <= 255) {
5320
446k
        memcpy(PyUnicode_1BYTE_DATA(u), s, pos);
5321
446k
        s += pos;
5322
446k
        writer.pos = pos;
5323
446k
    }
5324
5325
562k
    if (unicode_decode_utf8_impl(&writer, starts, s, end,
5326
562k
                                 error_handler, errors,
5327
562k
                                 consumed) < 0) {
5328
787
        _PyUnicodeWriter_Dealloc(&writer);
5329
787
        return NULL;
5330
787
    }
5331
561k
    return _PyUnicodeWriter_Finish(&writer);
5332
562k
}
5333
5334
5335
// Used by PyUnicodeWriter_WriteUTF8() implementation
5336
int
5337
_PyUnicode_DecodeUTF8Writer(_PyUnicodeWriter *writer,
5338
                            const char *s, Py_ssize_t size,
5339
                            _Py_error_handler error_handler, const char *errors,
5340
                            Py_ssize_t *consumed)
5341
5.33M
{
5342
5.33M
    if (size == 0) {
5343
8.30k
        if (consumed) {
5344
0
            *consumed = 0;
5345
0
        }
5346
8.30k
        return 0;
5347
8.30k
    }
5348
5349
    // fast path: try ASCII string.
5350
5.32M
    if (_PyUnicodeWriter_Prepare(writer, size, 127) < 0) {
5351
0
        return -1;
5352
0
    }
5353
5354
5.32M
    const char *starts = s;
5355
5.32M
    const char *end = s + size;
5356
5.32M
    Py_ssize_t decoded = 0;
5357
5.32M
    Py_UCS1 *dest = (Py_UCS1*)writer->data + writer->pos * writer->kind;
5358
5.32M
    if (writer->kind == PyUnicode_1BYTE_KIND) {
5359
5.32M
        decoded = ascii_decode(s, end, dest);
5360
5.32M
        writer->pos += decoded;
5361
5362
5.32M
        if (decoded == size) {
5363
5.27M
            if (consumed) {
5364
856
                *consumed = size;
5365
856
            }
5366
5.27M
            return 0;
5367
5.27M
        }
5368
46.9k
        s += decoded;
5369
46.9k
    }
5370
5371
49.3k
    return unicode_decode_utf8_impl(writer, starts, s, end,
5372
49.3k
                                    error_handler, errors, consumed);
5373
5.32M
}
5374
5375
5376
PyObject *
5377
PyUnicode_DecodeUTF8Stateful(const char *s,
5378
                             Py_ssize_t size,
5379
                             const char *errors,
5380
                             Py_ssize_t *consumed)
5381
10.6M
{
5382
10.6M
    return unicode_decode_utf8(s, size,
5383
10.6M
                               errors ? _Py_ERROR_UNKNOWN : _Py_ERROR_STRICT,
5384
10.6M
                               errors, consumed);
5385
10.6M
}
5386
5387
5388
/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5389
   non-zero, use strict error handler otherwise.
5390
5391
   On success, write a pointer to a newly allocated wide character string into
5392
   *wstr (use PyMem_RawFree() to free the memory) and write the output length
5393
   (in number of wchar_t units) into *wlen (if wlen is set).
5394
5395
   On memory allocation failure, return -1.
5396
5397
   On decoding error (if surrogateescape is zero), return -2. If wlen is
5398
   non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5399
   is not NULL, write the decoding error message into *reason. */
5400
int
5401
_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
5402
                 const char **reason, _Py_error_handler errors)
5403
5.24k
{
5404
5.24k
    const char *orig_s = s;
5405
5.24k
    const char *e;
5406
5.24k
    wchar_t *unicode;
5407
5.24k
    Py_ssize_t outpos;
5408
5409
5.24k
    int surrogateescape = 0;
5410
5.24k
    int surrogatepass = 0;
5411
5.24k
    switch (errors)
5412
5.24k
    {
5413
0
    case _Py_ERROR_STRICT:
5414
0
        break;
5415
5.24k
    case _Py_ERROR_SURROGATEESCAPE:
5416
5.24k
        surrogateescape = 1;
5417
5.24k
        break;
5418
0
    case _Py_ERROR_SURROGATEPASS:
5419
0
        surrogatepass = 1;
5420
0
        break;
5421
0
    default:
5422
0
        return -3;
5423
5.24k
    }
5424
5425
    /* Note: size will always be longer than the resulting Unicode
5426
       character count */
5427
5.24k
    if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1 < size) {
5428
0
        return -1;
5429
0
    }
5430
5431
5.24k
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
5432
5.24k
    if (!unicode) {
5433
0
        return -1;
5434
0
    }
5435
5436
    /* Unpack UTF-8 encoded data */
5437
5.24k
    e = s + size;
5438
5.24k
    outpos = 0;
5439
5.24k
    while (s < e) {
5440
5.24k
        Py_UCS4 ch;
5441
5.24k
#if SIZEOF_WCHAR_T == 4
5442
5.24k
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
5443
#else
5444
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
5445
#endif
5446
5.24k
        if (ch > 0xFF) {
5447
0
#if SIZEOF_WCHAR_T == 4
5448
0
            Py_UNREACHABLE();
5449
#else
5450
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
5451
            /* write a surrogate pair */
5452
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5453
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5454
#endif
5455
0
        }
5456
5.24k
        else {
5457
5.24k
            if (!ch && s == e) {
5458
5.24k
                break;
5459
5.24k
            }
5460
5461
0
            if (surrogateescape) {
5462
0
                unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5463
0
            }
5464
0
            else {
5465
                /* Is it a valid three-byte code? */
5466
0
                if (surrogatepass
5467
0
                    && (e - s) >= 3
5468
0
                    && (s[0] & 0xf0) == 0xe0
5469
0
                    && (s[1] & 0xc0) == 0x80
5470
0
                    && (s[2] & 0xc0) == 0x80)
5471
0
                {
5472
0
                    ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5473
0
                    s += 3;
5474
0
                    unicode[outpos++] = ch;
5475
0
                }
5476
0
                else {
5477
0
                    PyMem_RawFree(unicode );
5478
0
                    if (reason != NULL) {
5479
0
                        switch (ch) {
5480
0
                        case 0:
5481
0
                            *reason = "unexpected end of data";
5482
0
                            break;
5483
0
                        case 1:
5484
0
                            *reason = "invalid start byte";
5485
0
                            break;
5486
                        /* 2, 3, 4 */
5487
0
                        default:
5488
0
                            *reason = "invalid continuation byte";
5489
0
                            break;
5490
0
                        }
5491
0
                    }
5492
0
                    if (wlen != NULL) {
5493
0
                        *wlen = s - orig_s;
5494
0
                    }
5495
0
                    return -2;
5496
0
                }
5497
0
            }
5498
0
        }
5499
5.24k
    }
5500
5.24k
    unicode[outpos] = L'\0';
5501
5.24k
    if (wlen) {
5502
5.24k
        *wlen = outpos;
5503
5.24k
    }
5504
5.24k
    *wstr = unicode;
5505
5.24k
    return 0;
5506
5.24k
}
5507
5508
5509
wchar_t*
5510
_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5511
                               size_t *wlen)
5512
0
{
5513
0
    wchar_t *wstr;
5514
0
    int res = _Py_DecodeUTF8Ex(arg, arglen,
5515
0
                               &wstr, wlen,
5516
0
                               NULL, _Py_ERROR_SURROGATEESCAPE);
5517
0
    if (res != 0) {
5518
        /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5519
0
        assert(res != -3);
5520
0
        if (wlen) {
5521
0
            *wlen = (size_t)res;
5522
0
        }
5523
0
        return NULL;
5524
0
    }
5525
0
    return wstr;
5526
0
}
5527
5528
5529
/* UTF-8 encoder.
5530
5531
   On success, return 0 and write the newly allocated character string (use
5532
   PyMem_Free() to free the memory) into *str.
5533
5534
   On encoding failure, return -2 and write the position of the invalid
5535
   surrogate character into *error_pos (if error_pos is set) and the decoding
5536
   error message into *reason (if reason is set).
5537
5538
   On memory allocation failure, return -1. */
5539
int
5540
_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
5541
                 const char **reason, int raw_malloc, _Py_error_handler errors)
5542
644
{
5543
644
    const Py_ssize_t max_char_size = 4;
5544
644
    Py_ssize_t len = wcslen(text);
5545
5546
644
    assert(len >= 0);
5547
5548
644
    int surrogateescape = 0;
5549
644
    int surrogatepass = 0;
5550
644
    switch (errors)
5551
644
    {
5552
64
    case _Py_ERROR_STRICT:
5553
64
        break;
5554
580
    case _Py_ERROR_SURROGATEESCAPE:
5555
580
        surrogateescape = 1;
5556
580
        break;
5557
0
    case _Py_ERROR_SURROGATEPASS:
5558
0
        surrogatepass = 1;
5559
0
        break;
5560
0
    default:
5561
0
        return -3;
5562
644
    }
5563
5564
644
    if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5565
0
        return -1;
5566
0
    }
5567
644
    char *bytes;
5568
644
    if (raw_malloc) {
5569
644
        bytes = PyMem_RawMalloc((len + 1) * max_char_size);
5570
644
    }
5571
0
    else {
5572
0
        bytes = PyMem_Malloc((len + 1) * max_char_size);
5573
0
    }
5574
644
    if (bytes == NULL) {
5575
0
        return -1;
5576
0
    }
5577
5578
644
    char *p = bytes;
5579
644
    Py_ssize_t i;
5580
42.9k
    for (i = 0; i < len; ) {
5581
42.3k
        Py_ssize_t ch_pos = i;
5582
42.3k
        Py_UCS4 ch = text[i];
5583
42.3k
        i++;
5584
#if Py_UNICODE_SIZE == 2
5585
        if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5586
            && i < len
5587
            && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5588
        {
5589
            ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5590
            i++;
5591
        }
5592
#endif
5593
5594
42.3k
        if (ch < 0x80) {
5595
            /* Encode ASCII */
5596
42.3k
            *p++ = (char) ch;
5597
5598
42.3k
        }
5599
0
        else if (ch < 0x0800) {
5600
            /* Encode Latin-1 */
5601
0
            *p++ = (char)(0xc0 | (ch >> 6));
5602
0
            *p++ = (char)(0x80 | (ch & 0x3f));
5603
0
        }
5604
0
        else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
5605
            /* surrogateescape error handler */
5606
0
            if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
5607
0
                if (error_pos != NULL) {
5608
0
                    *error_pos = (size_t)ch_pos;
5609
0
                }
5610
0
                if (reason != NULL) {
5611
0
                    *reason = "encoding error";
5612
0
                }
5613
0
                if (raw_malloc) {
5614
0
                    PyMem_RawFree(bytes);
5615
0
                }
5616
0
                else {
5617
0
                    PyMem_Free(bytes);
5618
0
                }
5619
0
                return -2;
5620
0
            }
5621
0
            *p++ = (char)(ch & 0xff);
5622
0
        }
5623
0
        else if (ch < 0x10000) {
5624
0
            *p++ = (char)(0xe0 | (ch >> 12));
5625
0
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5626
0
            *p++ = (char)(0x80 | (ch & 0x3f));
5627
0
        }
5628
0
        else {  /* ch >= 0x10000 */
5629
0
            assert(ch <= MAX_UNICODE);
5630
            /* Encode UCS4 Unicode ordinals */
5631
0
            *p++ = (char)(0xf0 | (ch >> 18));
5632
0
            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5633
0
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5634
0
            *p++ = (char)(0x80 | (ch & 0x3f));
5635
0
        }
5636
42.3k
    }
5637
644
    *p++ = '\0';
5638
5639
644
    size_t final_size = (p - bytes);
5640
644
    char *bytes2;
5641
644
    if (raw_malloc) {
5642
644
        bytes2 = PyMem_RawRealloc(bytes, final_size);
5643
644
    }
5644
0
    else {
5645
0
        bytes2 = PyMem_Realloc(bytes, final_size);
5646
0
    }
5647
644
    if (bytes2 == NULL) {
5648
0
        if (error_pos != NULL) {
5649
0
            *error_pos = (size_t)-1;
5650
0
        }
5651
0
        if (raw_malloc) {
5652
0
            PyMem_RawFree(bytes);
5653
0
        }
5654
0
        else {
5655
0
            PyMem_Free(bytes);
5656
0
        }
5657
0
        return -1;
5658
0
    }
5659
644
    *str = bytes2;
5660
644
    return 0;
5661
644
}
5662
5663
5664
/* Primary internal function which creates utf8 encoded bytes objects.
5665
5666
   Allocation strategy:  if the string is short, convert into a stack buffer
5667
   and allocate exactly as much space needed at the end.  Else allocate the
5668
   maximum possible needed (4 result bytes per Unicode character), and return
5669
   the excess memory at the end.
5670
*/
5671
static PyObject *
5672
unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5673
                    const char *errors)
5674
18.2M
{
5675
18.2M
    if (!PyUnicode_Check(unicode)) {
5676
0
        PyErr_BadArgument();
5677
0
        return NULL;
5678
0
    }
5679
5680
18.2M
    if (PyUnicode_UTF8(unicode))
5681
10.6M
        return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5682
10.6M
                                         PyUnicode_UTF8_LENGTH(unicode));
5683
5684
7.52M
    int kind = PyUnicode_KIND(unicode);
5685
7.52M
    const void *data = PyUnicode_DATA(unicode);
5686
7.52M
    Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5687
5688
7.52M
    PyBytesWriter *writer;
5689
7.52M
    char *end;
5690
5691
7.52M
    switch (kind) {
5692
0
    default:
5693
0
        Py_UNREACHABLE();
5694
5.92M
    case PyUnicode_1BYTE_KIND:
5695
        /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5696
5.92M
        assert(!PyUnicode_IS_ASCII(unicode));
5697
5.92M
        writer = ucs1lib_utf8_encoder(unicode, data, size,
5698
5.92M
                                      error_handler, errors, &end);
5699
5.92M
        break;
5700
1.53M
    case PyUnicode_2BYTE_KIND:
5701
1.53M
        writer = ucs2lib_utf8_encoder(unicode, data, size,
5702
1.53M
                                      error_handler, errors, &end);
5703
1.53M
        break;
5704
65.2k
    case PyUnicode_4BYTE_KIND:
5705
65.2k
        writer = ucs4lib_utf8_encoder(unicode, data, size,
5706
65.2k
                                      error_handler, errors, &end);
5707
65.2k
        break;
5708
7.52M
    }
5709
5710
7.52M
    if (writer == NULL) {
5711
180k
        PyBytesWriter_Discard(writer);
5712
180k
        return NULL;
5713
180k
    }
5714
7.34M
    return PyBytesWriter_FinishWithPointer(writer, end);
5715
7.52M
}
5716
5717
static int
5718
unicode_fill_utf8(PyObject *unicode)
5719
184k
{
5720
184k
    _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(unicode);
5721
    /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5722
184k
    assert(!PyUnicode_IS_ASCII(unicode));
5723
5724
184k
    int kind = PyUnicode_KIND(unicode);
5725
184k
    const void *data = PyUnicode_DATA(unicode);
5726
184k
    Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5727
5728
184k
    PyBytesWriter *writer;
5729
184k
    char *end;
5730
5731
184k
    switch (kind) {
5732
0
    default:
5733
0
        Py_UNREACHABLE();
5734
149k
    case PyUnicode_1BYTE_KIND:
5735
149k
        writer = ucs1lib_utf8_encoder(unicode, data, size,
5736
149k
                                      _Py_ERROR_STRICT, NULL, &end);
5737
149k
        break;
5738
28.8k
    case PyUnicode_2BYTE_KIND:
5739
28.8k
        writer = ucs2lib_utf8_encoder(unicode, data, size,
5740
28.8k
                                      _Py_ERROR_STRICT, NULL, &end);
5741
28.8k
        break;
5742
5.87k
    case PyUnicode_4BYTE_KIND:
5743
5.87k
        writer = ucs4lib_utf8_encoder(unicode, data, size,
5744
5.87k
                                      _Py_ERROR_STRICT, NULL, &end);
5745
5.87k
        break;
5746
184k
    }
5747
184k
    if (writer == NULL) {
5748
207
        return -1;
5749
207
    }
5750
5751
184k
    const char *start = PyBytesWriter_GetData(writer);
5752
184k
    Py_ssize_t len = end - start;
5753
5754
184k
    char *cache = PyMem_Malloc(len + 1);
5755
184k
    if (cache == NULL) {
5756
0
        PyBytesWriter_Discard(writer);
5757
0
        PyErr_NoMemory();
5758
0
        return -1;
5759
0
    }
5760
184k
    memcpy(cache, start, len);
5761
184k
    cache[len] = '\0';
5762
184k
    PyUnicode_SET_UTF8_LENGTH(unicode, len);
5763
184k
    PyUnicode_SET_UTF8(unicode, cache);
5764
184k
    PyBytesWriter_Discard(writer);
5765
184k
    return 0;
5766
184k
}
5767
5768
/* Encode 'unicode' to UTF-8 using the error handler named by 'errors'.
   The name is resolved later: the handler is passed on as
   _Py_ERROR_UNKNOWN.  Return the result of unicode_encode_utf8(). */
PyObject *
_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
{
    PyObject *encoded = unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN,
                                            errors);
    return encoded;
}
5773
5774
5775
/* Encode 'unicode' to UTF-8 with no explicit error handler name:
   equivalent to _PyUnicode_AsUTF8String(unicode, NULL). */
PyObject *
PyUnicode_AsUTF8String(PyObject *unicode)
{
    const char *errors = NULL;
    return _PyUnicode_AsUTF8String(unicode, errors);
}
5780
5781
/* --- UTF-32 Codec ------------------------------------------------------- */
5782
5783
PyObject *
5784
PyUnicode_DecodeUTF32(const char *s,
5785
                      Py_ssize_t size,
5786
                      const char *errors,
5787
                      int *byteorder)
5788
108
{
5789
108
    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5790
108
}
5791
5792
PyObject *
5793
PyUnicode_DecodeUTF32Stateful(const char *s,
5794
                              Py_ssize_t size,
5795
                              const char *errors,
5796
                              int *byteorder,
5797
                              Py_ssize_t *consumed)
5798
29.7k
{
5799
29.7k
    const char *starts = s;
5800
29.7k
    Py_ssize_t startinpos;
5801
29.7k
    Py_ssize_t endinpos;
5802
29.7k
    _PyUnicodeWriter writer;
5803
29.7k
    const unsigned char *q, *e;
5804
29.7k
    int le, bo = 0;       /* assume native ordering by default */
5805
29.7k
    const char *encoding;
5806
29.7k
    const char *errmsg = "";
5807
29.7k
    PyObject *errorHandler = NULL;
5808
29.7k
    PyObject *exc = NULL;
5809
5810
29.7k
    q = (const unsigned char *)s;
5811
29.7k
    e = q + size;
5812
5813
29.7k
    if (byteorder)
5814
29.6k
        bo = *byteorder;
5815
5816
    /* Check for BOM marks (U+FEFF) in the input and adjust current
5817
       byte order setting accordingly. In native mode, the leading BOM
5818
       mark is skipped, in all other modes, it is copied to the output
5819
       stream as-is (giving a ZWNBSP character). */
5820
29.7k
    if (bo == 0 && size >= 4) {
5821
27.0k
        Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5822
27.0k
        if (bom == 0x0000FEFF) {
5823
80
            bo = -1;
5824
80
            q += 4;
5825
80
        }
5826
26.9k
        else if (bom == 0xFFFE0000) {
5827
229
            bo = 1;
5828
229
            q += 4;
5829
229
        }
5830
27.0k
        if (byteorder)
5831
26.9k
            *byteorder = bo;
5832
27.0k
    }
5833
5834
29.7k
    if (q == e) {
5835
101
        if (consumed)
5836
0
            *consumed = size;
5837
101
        _Py_RETURN_UNICODE_EMPTY();
5838
101
    }
5839
5840
#ifdef WORDS_BIGENDIAN
5841
    le = bo < 0;
5842
#else
5843
29.6k
    le = bo <= 0;
5844
29.6k
#endif
5845
29.6k
    encoding = le ? "utf-32-le" : "utf-32-be";
5846
5847
29.6k
    _PyUnicodeWriter_Init(&writer);
5848
29.6k
    writer.min_length = (e - q + 3) / 4;
5849
29.6k
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
5850
0
        goto onError;
5851
5852
118k
    while (1) {
5853
118k
        Py_UCS4 ch = 0;
5854
118k
        Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
5855
5856
118k
        if (e - q >= 4) {
5857
99.7k
            int kind = writer.kind;
5858
99.7k
            void *data = writer.data;
5859
99.7k
            const unsigned char *last = e - 4;
5860
99.7k
            Py_ssize_t pos = writer.pos;
5861
99.7k
            if (le) {
5862
118k
                do {
5863
118k
                    ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5864
118k
                    if (ch > maxch)
5865
95.1k
                        break;
5866
23.4k
                    if (kind != PyUnicode_1BYTE_KIND &&
5867
7.79k
                        Py_UNICODE_IS_SURROGATE(ch))
5868
202
                        break;
5869
23.2k
                    PyUnicode_WRITE(kind, data, pos++, ch);
5870
23.2k
                    q += 4;
5871
23.2k
                } while (q <= last);
5872
96.5k
            }
5873
3.19k
            else {
5874
6.13k
                do {
5875
6.13k
                    ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5876
6.13k
                    if (ch > maxch)
5877
2.96k
                        break;
5878
3.17k
                    if (kind != PyUnicode_1BYTE_KIND &&
5879
2.50k
                        Py_UNICODE_IS_SURROGATE(ch))
5880
102
                        break;
5881
3.06k
                    PyUnicode_WRITE(kind, data, pos++, ch);
5882
3.06k
                    q += 4;
5883
3.06k
                } while (q <= last);
5884
3.19k
            }
5885
99.7k
            writer.pos = pos;
5886
99.7k
        }
5887
5888
118k
        if (Py_UNICODE_IS_SURROGATE(ch)) {
5889
305
            errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
5890
305
            startinpos = ((const char *)q) - starts;
5891
305
            endinpos = startinpos + 4;
5892
305
        }
5893
118k
        else if (ch <= maxch) {
5894
20.3k
            if (q == e || consumed)
5895
3.86k
                break;
5896
            /* remaining bytes at the end? (size should be divisible by 4) */
5897
16.5k
            errmsg = "truncated data";
5898
16.5k
            startinpos = ((const char *)q) - starts;
5899
16.5k
            endinpos = ((const char *)e) - starts;
5900
16.5k
        }
5901
98.0k
        else {
5902
98.0k
            if (ch < 0x110000) {
5903
4.26k
                if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5904
0
                    goto onError;
5905
4.26k
                q += 4;
5906
4.26k
                continue;
5907
4.26k
            }
5908
93.8k
            errmsg = "code point not in range(0x110000)";
5909
93.8k
            startinpos = ((const char *)q) - starts;
5910
93.8k
            endinpos = startinpos + 4;
5911
93.8k
        }
5912
5913
        /* The remaining input chars are ignored if the callback
5914
           chooses to skip the input */
5915
110k
        if (unicode_decode_call_errorhandler_writer(
5916
110k
                errors, &errorHandler,
5917
110k
                encoding, errmsg,
5918
110k
                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
5919
110k
                &writer))
5920
25.7k
            goto onError;
5921
110k
    }
5922
5923
3.86k
    if (consumed)
5924
0
        *consumed = (const char *)q-starts;
5925
5926
3.86k
    Py_XDECREF(errorHandler);
5927
3.86k
    Py_XDECREF(exc);
5928
3.86k
    return _PyUnicodeWriter_Finish(&writer);
5929
5930
25.7k
  onError:
5931
25.7k
    _PyUnicodeWriter_Dealloc(&writer);
5932
25.7k
    Py_XDECREF(errorHandler);
5933
25.7k
    Py_XDECREF(exc);
5934
25.7k
    return NULL;
5935
29.6k
}
5936
5937
/* Encode the str object 'str' to UTF-32 and return a new bytes object.

   byteorder  -1: little-endian, 1: big-endian, 0: native byte order with
              a leading BOM (U+FEFF).
   errors     Error handler name, passed to the encode error callback when
              a code point cannot be encoded (reported as "surrogates not
              allowed").

   Strings stored with 1 byte per character are encoded directly into a
   bytes object.  Other strings are encoded through a PyBytesWriter; a
   replacement returned by the error handler must be either bytes whose
   length is a multiple of 4 or an ASCII str.

   Return NULL with an exception set on failure. */
PyObject *
_PyUnicode_EncodeUTF32(PyObject *str,
                       const char *errors,
                       int byteorder)
{
    if (!PyUnicode_Check(str)) {
        PyErr_BadArgument();
        return NULL;
    }
    const int kind = PyUnicode_KIND(str);
    const void *data = PyUnicode_DATA(str);
    const Py_ssize_t len = PyUnicode_GET_LENGTH(str);

    /* One extra output unit is needed for the BOM when byteorder is 0. */
    const int with_bom = (byteorder == 0);
    if (len > PY_SSIZE_T_MAX / 4 - with_bom) {
        return PyErr_NoMemory();
    }
    const Py_ssize_t nsize = len + with_bom;

#if PY_LITTLE_ENDIAN
    const int native_ordering = byteorder <= 0;
#else
    const int native_ordering = byteorder >= 0;
#endif

    if (kind == PyUnicode_1BYTE_KIND) {
        // gh-139156: Don't use PyBytesWriter API here since it has an overhead
        // on short strings
        PyObject *result = PyBytes_FromStringAndSize(NULL, nsize * 4);
        if (result == NULL) {
            return NULL;
        }

        /* output buffer is 4-bytes aligned */
        assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(result), 4));
        uint32_t *out = (uint32_t *)PyBytes_AS_STRING(result);
        if (with_bom) {
            *out++ = 0xFEFF;
        }
        if (len > 0) {
            ucs1lib_utf32_encode((const Py_UCS1 *)data, len,
                                 &out, native_ordering);
        }
        return result;
    }

    PyBytesWriter *writer = PyBytesWriter_Create(nsize * 4);
    if (writer == NULL) {
        return NULL;
    }

    /* output buffer is 4-bytes aligned */
    assert(_Py_IS_ALIGNED(PyBytesWriter_GetData(writer), 4));
    uint32_t *out = (uint32_t *)PyBytesWriter_GetData(writer);
    if (with_bom) {
        *out++ = 0xFEFF;
    }
    if (len == 0) {
        return PyBytesWriter_Finish(writer);
    }

    /* Codec name reported to the error handler. */
    const char *encoding;
    switch (byteorder) {
    case -1:
        encoding = "utf-32-le";
        break;
    case 1:
        encoding = "utf-32-be";
        break;
    default:
        encoding = "utf-32";
        break;
    }

    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *rep = NULL;

    Py_ssize_t pos = 0;
    while (pos < len) {
        /* Encode as far as possible; the helper returns how many
           characters it consumed. */
        if (kind == PyUnicode_2BYTE_KIND) {
            pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        else {
            assert(kind == PyUnicode_4BYTE_KIND);
            pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        if (pos == len) {
            break;
        }

        /* The helper stopped early: ask the error handler for a
           replacement for the character at 'pos'. */
        Py_ssize_t newpos;
        rep = unicode_encode_call_errorhandler(
                errors, &errorHandler,
                encoding, "surrogates not allowed",
                str, &exc, pos, pos + 1, &newpos);
        if (rep == NULL) {
            goto error;
        }

        const int rep_is_bytes = PyBytes_Check(rep);
        Py_ssize_t repsize, moreunits;
        int rep_invalid;
        if (rep_is_bytes) {
            /* Bytes replacements must be a whole number of units. */
            repsize = PyBytes_GET_SIZE(rep);
            rep_invalid = (repsize & 3) != 0;
            moreunits = repsize / 4;
        }
        else {
            /* Str replacements must be pure ASCII. */
            assert(PyUnicode_Check(rep));
            repsize = PyUnicode_GET_LENGTH(rep);
            moreunits = repsize;
            rep_invalid = !PyUnicode_IS_ASCII(rep);
        }
        if (rep_invalid) {
            raise_encode_exception(&exc, encoding,
                                   str, pos, pos + 1,
                                   "surrogates not allowed");
            goto error;
        }
        moreunits += pos - newpos;
        pos = newpos;

        /* four bytes are reserved for each surrogate */
        if (moreunits > 0) {
            out = PyBytesWriter_GrowAndUpdatePointer(writer, 4 * moreunits, out);
            if (out == NULL) {
                goto error;
            }
        }

        if (rep_is_bytes) {
            memcpy(out, PyBytes_AS_STRING(rep), repsize);
            out += repsize / 4;
        }
        else {
            /* rep is unicode */
            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
            ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
                                 &out, native_ordering);
        }

        Py_CLEAR(rep);
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);

    /* Cut back to size actually needed. This is necessary for, for example,
       encoding of a string containing isolated surrogates and the 'ignore'
       handler is used. */
    return PyBytesWriter_FinishWithPointer(writer, out);

  error:
    Py_XDECREF(rep);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    PyBytesWriter_Discard(writer);
    return NULL;
}
6090
6091
/* Encode 'unicode' to UTF-32 with no explicit error handler name and
   byteorder 0: equivalent to _PyUnicode_EncodeUTF32(unicode, NULL, 0). */
PyObject *
PyUnicode_AsUTF32String(PyObject *unicode)
{
    const char *errors = NULL;
    const int byteorder = 0;
    return _PyUnicode_EncodeUTF32(unicode, errors, byteorder);
}
6096
6097
/* --- UTF-16 Codec ------------------------------------------------------- */
6098
6099
PyObject *
6100
PyUnicode_DecodeUTF16(const char *s,
6101
                      Py_ssize_t size,
6102
                      const char *errors,
6103
                      int *byteorder)
6104
96
{
6105
96
    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
6106
96
}
6107
6108
PyObject *
6109
PyUnicode_DecodeUTF16Stateful(const char *s,
6110
                              Py_ssize_t size,
6111
                              const char *errors,
6112
                              int *byteorder,
6113
                              Py_ssize_t *consumed)
6114
13.6k
{
6115
13.6k
    const char *starts = s;
6116
13.6k
    Py_ssize_t startinpos;
6117
13.6k
    Py_ssize_t endinpos;
6118
13.6k
    _PyUnicodeWriter writer;
6119
13.6k
    const unsigned char *q, *e;
6120
13.6k
    int bo = 0;       /* assume native ordering by default */
6121
13.6k
    int native_ordering;
6122
13.6k
    const char *errmsg = "";
6123
13.6k
    PyObject *errorHandler = NULL;
6124
13.6k
    PyObject *exc = NULL;
6125
13.6k
    const char *encoding;
6126
6127
13.6k
    q = (const unsigned char *)s;
6128
13.6k
    e = q + size;
6129
6130
13.6k
    if (byteorder)
6131
13.5k
        bo = *byteorder;
6132
6133
    /* Check for BOM marks (U+FEFF) in the input and adjust current
6134
       byte order setting accordingly. In native mode, the leading BOM
6135
       mark is skipped, in all other modes, it is copied to the output
6136
       stream as-is (giving a ZWNBSP character). */
6137
13.6k
    if (bo == 0 && size >= 2) {
6138
12.9k
        const Py_UCS4 bom = (q[1] << 8) | q[0];
6139
12.9k
        if (bom == 0xFEFF) {
6140
415
            q += 2;
6141
415
            bo = -1;
6142
415
        }
6143
12.5k
        else if (bom == 0xFFFE) {
6144
2.01k
            q += 2;
6145
2.01k
            bo = 1;
6146
2.01k
        }
6147
12.9k
        if (byteorder)
6148
12.8k
            *byteorder = bo;
6149
12.9k
    }
6150
6151
13.6k
    if (q == e) {
6152
75
        if (consumed)
6153
0
            *consumed = size;
6154
75
        _Py_RETURN_UNICODE_EMPTY();
6155
75
    }
6156
6157
13.6k
#if PY_LITTLE_ENDIAN
6158
13.6k
    native_ordering = bo <= 0;
6159
13.6k
    encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
6160
#else
6161
    native_ordering = bo >= 0;
6162
    encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
6163
#endif
6164
6165
    /* Note: size will always be longer than the resulting Unicode
6166
       character count normally.  Error handler will take care of
6167
       resizing when needed. */
6168
13.6k
    _PyUnicodeWriter_Init(&writer);
6169
13.6k
    writer.min_length = (e - q + 1) / 2;
6170
13.6k
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
6171
0
        goto onError;
6172
6173
52.3k
    while (1) {
6174
52.3k
        Py_UCS4 ch = 0;
6175
52.3k
        if (e - q >= 2) {
6176
44.9k
            int kind = writer.kind;
6177
44.9k
            if (kind == PyUnicode_1BYTE_KIND) {
6178
16.5k
                if (PyUnicode_IS_ASCII(writer.buffer))
6179
13.0k
                    ch = asciilib_utf16_decode(&q, e,
6180
13.0k
                            (Py_UCS1*)writer.data, &writer.pos,
6181
13.0k
                            native_ordering);
6182
3.45k
                else
6183
3.45k
                    ch = ucs1lib_utf16_decode(&q, e,
6184
3.45k
                            (Py_UCS1*)writer.data, &writer.pos,
6185
3.45k
                            native_ordering);
6186
28.4k
            } else if (kind == PyUnicode_2BYTE_KIND) {
6187
11.7k
                ch = ucs2lib_utf16_decode(&q, e,
6188
11.7k
                        (Py_UCS2*)writer.data, &writer.pos,
6189
11.7k
                        native_ordering);
6190
16.7k
            } else {
6191
16.7k
                assert(kind == PyUnicode_4BYTE_KIND);
6192
16.7k
                ch = ucs4lib_utf16_decode(&q, e,
6193
16.7k
                        (Py_UCS4*)writer.data, &writer.pos,
6194
16.7k
                        native_ordering);
6195
16.7k
            }
6196
44.9k
        }
6197
6198
52.3k
        switch (ch)
6199
52.3k
        {
6200
13.7k
        case 0:
6201
            /* remaining byte at the end? (size should be even) */
6202
13.7k
            if (q == e || consumed)
6203
8.50k
                goto End;
6204
5.20k
            errmsg = "truncated data";
6205
5.20k
            startinpos = ((const char *)q) - starts;
6206
5.20k
            endinpos = ((const char *)e) - starts;
6207
5.20k
            break;
6208
            /* The remaining input chars are ignored if the callback
6209
               chooses to skip the input */
6210
1.50k
        case 1:
6211
1.50k
            q -= 2;
6212
1.50k
            if (consumed)
6213
0
                goto End;
6214
1.50k
            errmsg = "unexpected end of data";
6215
1.50k
            startinpos = ((const char *)q) - starts;
6216
1.50k
            endinpos = ((const char *)e) - starts;
6217
1.50k
            break;
6218
14.9k
        case 2:
6219
14.9k
            errmsg = "illegal encoding";
6220
14.9k
            startinpos = ((const char *)q) - 2 - starts;
6221
14.9k
            endinpos = startinpos + 2;
6222
14.9k
            break;
6223
6.12k
        case 3:
6224
6.12k
            errmsg = "illegal UTF-16 surrogate";
6225
6.12k
            startinpos = ((const char *)q) - 4 - starts;
6226
6.12k
            endinpos = startinpos + 2;
6227
6.12k
            break;
6228
16.0k
        default:
6229
16.0k
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
6230
0
                goto onError;
6231
16.0k
            continue;
6232
52.3k
        }
6233
6234
27.7k
        if (unicode_decode_call_errorhandler_writer(
6235
27.7k
                errors,
6236
27.7k
                &errorHandler,
6237
27.7k
                encoding, errmsg,
6238
27.7k
                &starts,
6239
27.7k
                (const char **)&e,
6240
27.7k
                &startinpos,
6241
27.7k
                &endinpos,
6242
27.7k
                &exc,
6243
27.7k
                (const char **)&q,
6244
27.7k
                &writer))
6245
5.09k
            goto onError;
6246
27.7k
    }
6247
6248
8.50k
End:
6249
8.50k
    if (consumed)
6250
0
        *consumed = (const char *)q-starts;
6251
6252
8.50k
    Py_XDECREF(errorHandler);
6253
8.50k
    Py_XDECREF(exc);
6254
8.50k
    return _PyUnicodeWriter_Finish(&writer);
6255
6256
5.09k
  onError:
6257
5.09k
    _PyUnicodeWriter_Dealloc(&writer);
6258
5.09k
    Py_XDECREF(errorHandler);
6259
5.09k
    Py_XDECREF(exc);
6260
5.09k
    return NULL;
6261
13.6k
}
6262
6263
PyObject *
_PyUnicode_EncodeUTF16(PyObject *str,
                       const char *errors,
                       int byteorder)
{
    /* Encode `str` as UTF-16 and return a new bytes object (NULL with an
     * exception set on failure).
     *
     *   byteorder < 0   little-endian output, reported as "utf-16-le"
     *   byteorder > 0   big-endian output, reported as "utf-16-be"
     *   byteorder == 0  native byte order preceded by a U+FEFF code unit,
     *                   reported as "utf-16"
     *
     * `errors` is forwarded to unicode_encode_call_errorhandler() whenever
     * the 2-byte/4-byte encoder stops before the end of the string. */
    if (!PyUnicode_Check(str)) {
        PyErr_BadArgument();
        return NULL;
    }

    const int kind = PyUnicode_KIND(str);
    const void *data = PyUnicode_DATA(str);
    const Py_ssize_t len = PyUnicode_GET_LENGTH(str);
    /* One extra code unit is needed for the BOM when no order is forced. */
    const Py_ssize_t bom_units = (byteorder == 0);

    /* Every character >= 0x10000 costs one additional 16-bit unit. */
    Py_ssize_t pairs = 0;
    if (kind == PyUnicode_4BYTE_KIND) {
        const Py_UCS4 *chars = (const Py_UCS4 *)data;
        for (Py_ssize_t i = 0; i < len; i++) {
            if (chars[i] >= 0x10000) {
                pairs++;
            }
        }
    }

    /* Refuse sizes whose byte count (2 * units) would overflow. */
    if (len > PY_SSIZE_T_MAX / 2 - pairs - bom_units) {
        return PyErr_NoMemory();
    }
    const Py_ssize_t nsize = len + pairs + bom_units;

#if PY_BIG_ENDIAN
    const int native_ordering = (byteorder >= 0);
#else
    const int native_ordering = (byteorder <= 0);
#endif

    if (kind == PyUnicode_1BYTE_KIND) {
        // gh-139156: Don't use PyBytesWriter API here since it has an overhead
        // on short strings
        PyObject *result = PyBytes_FromStringAndSize(NULL, nsize * 2);
        if (result == NULL) {
            return NULL;
        }

        /* The code below stores 16-bit units directly into the buffer. */
        assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(result), 2));
        unsigned short *units = (unsigned short *)PyBytes_AS_STRING(result);
        if (byteorder == 0) {
            *units++ = 0xFEFF;
        }
        if (len > 0) {
            ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &units,
                                 native_ordering);
        }
        return result;
    }

    PyBytesWriter *writer = PyBytesWriter_Create(nsize * 2);
    if (writer == NULL) {
        return NULL;
    }

    /* The code below stores 16-bit units directly into the buffer. */
    assert(_Py_IS_ALIGNED(PyBytesWriter_GetData(writer), 2));
    unsigned short *out = PyBytesWriter_GetData(writer);
    if (byteorder == 0) {
        *out++ = 0xFEFF;
    }
    if (len == 0) {
        return PyBytesWriter_Finish(writer);
    }

    /* Codec name handed to the error handler and to raised exceptions. */
    const char *encoding = (byteorder < 0) ? "utf-16-le"
                         : (byteorder > 0) ? "utf-16-be"
                         : "utf-16";

    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *rep = NULL;

    Py_ssize_t pos = 0;
    while (pos < len) {
        /* Encode as far as possible; the helper's return value is added to
         * pos, and anything short of len is handed to the error handler. */
        if (kind == PyUnicode_2BYTE_KIND) {
            pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        else {
            assert(kind == PyUnicode_4BYTE_KIND);
            pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        if (pos == len) {
            break;
        }

        Py_ssize_t newpos;
        rep = unicode_encode_call_errorhandler(
                errors, &errorHandler,
                encoding, "surrogates not allowed",
                str, &exc, pos, pos + 1, &newpos);
        if (rep == NULL) {
            goto error;
        }

        /* A replacement is usable only if it is bytes of even length or an
         * ASCII-only str. */
        const int rep_is_bytes = PyBytes_Check(rep);
        Py_ssize_t repsize, moreunits;
        int rep_ok;
        if (rep_is_bytes) {
            repsize = PyBytes_GET_SIZE(rep);
            moreunits = repsize / 2;
            rep_ok = !(repsize & 1);
        }
        else {
            assert(PyUnicode_Check(rep));
            repsize = PyUnicode_GET_LENGTH(rep);
            moreunits = repsize;
            rep_ok = PyUnicode_IS_ASCII(rep);
        }
        if (!rep_ok) {
            raise_encode_exception(&exc, encoding,
                                   str, pos, pos + 1,
                                   "surrogates not allowed");
            goto error;
        }

        /* Offset the replacement's units by the distance the handler moved
         * the position (pos - newpos), then resume at newpos. */
        moreunits += pos - newpos;
        pos = newpos;

        /* two bytes are reserved for each surrogate */
        if (moreunits > 0) {
            out = PyBytesWriter_GrowAndUpdatePointer(writer, 2 * moreunits, out);
            if (out == NULL) {
                goto error;
            }
        }

        if (rep_is_bytes) {
            /* Bytes replacements are copied verbatim. */
            memcpy(out, PyBytes_AS_STRING(rep), repsize);
            out += repsize / 2;
        }
        else {
            /* ASCII str replacements are widened to 16-bit units. */
            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
            ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
                                 &out, native_ordering);
        }

        Py_CLEAR(rep);
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);

    /* Trim to what was actually written: the buffer can be larger than
     * needed, e.g. when isolated surrogates are dropped by the 'ignore'
     * handler. */
    return PyBytesWriter_FinishWithPointer(writer, out);

  error:
    Py_XDECREF(rep);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    PyBytesWriter_Discard(writer);
    return NULL;
}
6428
6429
PyObject *
PyUnicode_AsUTF16String(PyObject *unicode)
{
    /* Convenience wrapper around _PyUnicode_EncodeUTF16(): no error handler
     * name is supplied (NULL) and byteorder is 0, the mode in which the
     * encoder emits a leading U+FEFF code unit. */
    const char *errors = NULL;
    const int byteorder = 0;

    return _PyUnicode_EncodeUTF16(unicode, errors, byteorder);
}
6434
6435
_PyUnicode_Name_CAPI *
6436
_PyUnicode_GetNameCAPI(void)
6437
1.73k
{
6438
1.73k
    PyInterpreterState *interp = _PyInterpreterState_GET();
6439
1.73k
    _PyUnicode_Name_CAPI *ucnhash_capi;
6440
6441
1.73k
    ucnhash_capi = _Py_atomic_load_ptr(&interp->unicode.ucnhash_capi);
6442
1.73k
    if (ucnhash_capi == NULL) {
6443
1
        ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6444
1
                PyUnicodeData_CAPSULE_NAME, 1);
6445
6446
        // It's fine if we overwrite the value here. It's always the same value.
6447
1
        _Py_atomic_store_ptr(&interp->unicode.ucnhash_capi, ucnhash_capi);
6448
1
    }
6449
1.73k
    return ucnhash_capi;
6450
1.73k
}
6451
6452
/* --- Unicode Escape Codec ----------------------------------------------- */
6453
6454
PyObject *
6455
_PyUnicode_DecodeUnicodeEscapeInternal2(const char *s,
6456
                               Py_ssize_t size,
6457
                               const char *errors,
6458
                               Py_ssize_t *consumed,
6459
                               int *first_invalid_escape_char,
6460
                               const char **first_invalid_escape_ptr)
6461
30.6k
{
6462
30.6k
    const char *starts = s;
6463
30.6k
    const char *initial_starts = starts;
6464
30.6k
    _PyUnicodeWriter writer;
6465
30.6k
    const char *end;
6466
30.6k
    PyObject *errorHandler = NULL;
6467
30.6k
    PyObject *exc = NULL;
6468
30.6k
    _PyUnicode_Name_CAPI *ucnhash_capi;
6469
6470
    // so we can remember if we've seen an invalid escape char or not
6471
30.6k
    *first_invalid_escape_char = -1;
6472
30.6k
    *first_invalid_escape_ptr = NULL;
6473
6474
30.6k
    if (size == 0) {
6475
1.79k
        if (consumed) {
6476
0
            *consumed = 0;
6477
0
        }
6478
1.79k
        _Py_RETURN_UNICODE_EMPTY();
6479
1.79k
    }
6480
    /* Escaped strings will always be longer than the resulting
6481
       Unicode string, so we start with size here and then reduce the
6482
       length after conversion to the true value.
6483
       (but if the error callback returns a long replacement string
6484
       we'll have to allocate more space) */
6485
28.8k
    _PyUnicodeWriter_Init(&writer);
6486
28.8k
    writer.min_length = size;
6487
28.8k
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6488
0
        goto onError;
6489
0
    }
6490
6491
28.8k
    end = s + size;
6492
177k
    while (s < end) {
6493
148k
        unsigned char c = (unsigned char) *s++;
6494
148k
        Py_UCS4 ch;
6495
148k
        int count;
6496
148k
        const char *message;
6497
6498
148k
#define WRITE_ASCII_CHAR(ch)                                                  \
6499
148k
            do {                                                              \
6500
15.0k
                assert(ch <= 127);                                            \
6501
15.0k
                assert(writer.pos < writer.size);                             \
6502
15.0k
                PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch);  \
6503
15.0k
            } while(0)
6504
6505
148k
#define WRITE_CHAR(ch)                                                        \
6506
148k
            do {                                                              \
6507
137k
                if (ch <= writer.maxchar) {                                   \
6508
122k
                    assert(writer.pos < writer.size);                         \
6509
122k
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6510
122k
                }                                                             \
6511
137k
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6512
0
                    goto onError;                                             \
6513
0
                }                                                             \
6514
137k
            } while(0)
6515
6516
        /* Non-escape characters are interpreted as Unicode ordinals */
6517
148k
        if (c != '\\') {
6518
97.1k
            WRITE_CHAR(c);
6519
97.1k
            continue;
6520
97.1k
        }
6521
6522
51.1k
        Py_ssize_t startinpos = s - starts - 1;
6523
        /* \ - Escapes */
6524
51.1k
        if (s >= end) {
6525
0
            message = "\\ at end of string";
6526
0
            goto incomplete;
6527
0
        }
6528
51.1k
        c = (unsigned char) *s++;
6529
6530
51.1k
        assert(writer.pos < writer.size);
6531
51.1k
        switch (c) {
6532
6533
            /* \x escapes */
6534
670
        case '\n': continue;
6535
1.43k
        case '\\': WRITE_ASCII_CHAR('\\'); continue;
6536
912
        case '\'': WRITE_ASCII_CHAR('\''); continue;
6537
1.19k
        case '\"': WRITE_ASCII_CHAR('\"'); continue;
6538
1.01k
        case 'b': WRITE_ASCII_CHAR('\b'); continue;
6539
        /* FF */
6540
733
        case 'f': WRITE_ASCII_CHAR('\014'); continue;
6541
784
        case 't': WRITE_ASCII_CHAR('\t'); continue;
6542
929
        case 'n': WRITE_ASCII_CHAR('\n'); continue;
6543
1.41k
        case 'r': WRITE_ASCII_CHAR('\r'); continue;
6544
        /* VT */
6545
826
        case 'v': WRITE_ASCII_CHAR('\013'); continue;
6546
        /* BEL, not classic C */
6547
708
        case 'a': WRITE_ASCII_CHAR('\007'); continue;
6548
6549
            /* \OOO (octal) escapes */
6550
3.76k
        case '0': case '1': case '2': case '3':
6551
6.77k
        case '4': case '5': case '6': case '7':
6552
6.77k
            ch = c - '0';
6553
6.77k
            if (s < end && '0' <= *s && *s <= '7') {
6554
2.50k
                ch = (ch<<3) + *s++ - '0';
6555
2.50k
                if (s < end && '0' <= *s && *s <= '7') {
6556
1.25k
                    ch = (ch<<3) + *s++ - '0';
6557
1.25k
                }
6558
2.50k
            }
6559
6.77k
            if (ch > 0377) {
6560
1.08k
                if (*first_invalid_escape_char == -1) {
6561
753
                    *first_invalid_escape_char = ch;
6562
753
                    if (starts == initial_starts) {
6563
                        /* Back up 3 chars, since we've already incremented s. */
6564
753
                        *first_invalid_escape_ptr = s - 3;
6565
753
                    }
6566
753
                }
6567
1.08k
            }
6568
6.77k
            WRITE_CHAR(ch);
6569
6.77k
            continue;
6570
6571
            /* hex escapes */
6572
            /* \xXX */
6573
6.77k
        case 'x':
6574
5.97k
            count = 2;
6575
5.97k
            message = "truncated \\xXX escape";
6576
5.97k
            goto hexescape;
6577
6578
            /* \uXXXX */
6579
9.27k
        case 'u':
6580
9.27k
            count = 4;
6581
9.27k
            message = "truncated \\uXXXX escape";
6582
9.27k
            goto hexescape;
6583
6584
            /* \UXXXXXXXX */
6585
11.7k
        case 'U':
6586
11.7k
            count = 8;
6587
11.7k
            message = "truncated \\UXXXXXXXX escape";
6588
26.9k
        hexescape:
6589
169k
            for (ch = 0; count; ++s, --count) {
6590
142k
                if (s >= end) {
6591
6
                    goto incomplete;
6592
6
                }
6593
142k
                c = (unsigned char)*s;
6594
142k
                ch <<= 4;
6595
142k
                if (c >= '0' && c <= '9') {
6596
109k
                    ch += c - '0';
6597
109k
                }
6598
32.9k
                else if (c >= 'a' && c <= 'f') {
6599
32.6k
                    ch += c - ('a' - 10);
6600
32.6k
                }
6601
251
                else if (c >= 'A' && c <= 'F') {
6602
243
                    ch += c - ('A' - 10);
6603
243
                }
6604
8
                else {
6605
8
                    goto error;
6606
8
                }
6607
142k
            }
6608
6609
            /* when we get here, ch is a 32-bit unicode character */
6610
26.9k
            if (ch > MAX_UNICODE) {
6611
1
                message = "illegal Unicode character";
6612
1
                goto error;
6613
1
            }
6614
6615
26.9k
            WRITE_CHAR(ch);
6616
26.9k
            continue;
6617
6618
            /* \N{name} */
6619
26.9k
        case 'N':
6620
1.73k
            ucnhash_capi = _PyUnicode_GetNameCAPI();
6621
1.73k
            if (ucnhash_capi == NULL) {
6622
0
                PyErr_SetString(
6623
0
                        PyExc_UnicodeError,
6624
0
                        "\\N escapes not supported (can't load unicodedata module)"
6625
0
                );
6626
0
                goto onError;
6627
0
            }
6628
6629
1.73k
            message = "malformed \\N character escape";
6630
1.73k
            if (s >= end) {
6631
4
                goto incomplete;
6632
4
            }
6633
1.73k
            if (*s == '{') {
6634
1.72k
                const char *start = ++s;
6635
1.72k
                size_t namelen;
6636
                /* look for the closing brace */
6637
23.1k
                while (s < end && *s != '}')
6638
21.4k
                    s++;
6639
1.72k
                if (s >= end) {
6640
18
                    goto incomplete;
6641
18
                }
6642
1.70k
                namelen = s - start;
6643
1.70k
                if (namelen) {
6644
                    /* found a name.  look it up in the unicode database */
6645
1.70k
                    s++;
6646
1.70k
                    ch = 0xffffffff; /* in case 'getcode' messes up */
6647
1.70k
                    if (namelen <= INT_MAX &&
6648
1.70k
                        ucnhash_capi->getcode(start, (int)namelen,
6649
1.70k
                                              &ch, 0)) {
6650
1.63k
                        assert(ch <= MAX_UNICODE);
6651
1.63k
                        WRITE_CHAR(ch);
6652
1.63k
                        continue;
6653
1.63k
                    }
6654
72
                    message = "unknown Unicode character name";
6655
72
                }
6656
1.70k
            }
6657
79
            goto error;
6658
6659
5.05k
        default:
6660
5.05k
            if (*first_invalid_escape_char == -1) {
6661
3.72k
                *first_invalid_escape_char = c;
6662
3.72k
                if (starts == initial_starts) {
6663
                    /* Back up one char, since we've already incremented s. */
6664
3.72k
                    *first_invalid_escape_ptr = s - 1;
6665
3.72k
                }
6666
3.72k
            }
6667
5.05k
            WRITE_ASCII_CHAR('\\');
6668
5.05k
            WRITE_CHAR(c);
6669
5.05k
            continue;
6670
51.1k
        }
6671
6672
28
      incomplete:
6673
28
        if (consumed) {
6674
0
            *consumed = startinpos;
6675
0
            break;
6676
0
        }
6677
116
      error:;
6678
116
        Py_ssize_t endinpos = s-starts;
6679
116
        writer.min_length = end - s + writer.pos;
6680
116
        if (unicode_decode_call_errorhandler_writer(
6681
116
                errors, &errorHandler,
6682
116
                "unicodeescape", message,
6683
116
                &starts, &end, &startinpos, &endinpos, &exc, &s,
6684
116
                &writer)) {
6685
116
            goto onError;
6686
116
        }
6687
116
        assert(end - s <= writer.size - writer.pos);
6688
6689
0
#undef WRITE_ASCII_CHAR
6690
0
#undef WRITE_CHAR
6691
0
    }
6692
6693
28.7k
    Py_XDECREF(errorHandler);
6694
28.7k
    Py_XDECREF(exc);
6695
28.7k
    return _PyUnicodeWriter_Finish(&writer);
6696
6697
116
  onError:
6698
116
    _PyUnicodeWriter_Dealloc(&writer);
6699
116
    Py_XDECREF(errorHandler);
6700
116
    Py_XDECREF(exc);
6701
116
    return NULL;
6702
28.8k
}
6703
6704
PyObject *
6705
_PyUnicode_DecodeUnicodeEscapeStateful(const char *s,
6706
                              Py_ssize_t size,
6707
                              const char *errors,
6708
                              Py_ssize_t *consumed)
6709
0
{
6710
0
    int first_invalid_escape_char;
6711
0
    const char *first_invalid_escape_ptr;
6712
0
    PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal2(s, size, errors,
6713
0
                                                      consumed,
6714
0
                                                      &first_invalid_escape_char,
6715
0
                                                      &first_invalid_escape_ptr);
6716
0
    if (result == NULL)
6717
0
        return NULL;
6718
0
    if (first_invalid_escape_char != -1) {
6719
0
        if (first_invalid_escape_char > 0xff) {
6720
0
            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6721
0
                                 "\"\\%o\" is an invalid octal escape sequence. "
6722
0
                                 "Such sequences will not work in the future. ",
6723
0
                                 first_invalid_escape_char) < 0)
6724
0
            {
6725
0
                Py_DECREF(result);
6726
0
                return NULL;
6727
0
            }
6728
0
        }
6729
0
        else {
6730
0
            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6731
0
                                 "\"\\%c\" is an invalid escape sequence. "
6732
0
                                 "Such sequences will not work in the future. ",
6733
0
                                 first_invalid_escape_char) < 0)
6734
0
            {
6735
0
                Py_DECREF(result);
6736
0
                return NULL;
6737
0
            }
6738
0
        }
6739
0
    }
6740
0
    return result;
6741
0
}
6742
6743
PyObject *
6744
PyUnicode_DecodeUnicodeEscape(const char *s,
6745
                              Py_ssize_t size,
6746
                              const char *errors)
6747
0
{
6748
0
    return _PyUnicode_DecodeUnicodeEscapeStateful(s, size, errors, NULL);
6749
0
}
6750
6751
/* Return a Unicode-Escape string version of the Unicode object. */
6752
6753
PyObject *
PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
{
    /* Build a bytes object in which printable ASCII other than the backslash
     * is copied unchanged and everything else is written as an escape:
     * "\\\\", "\\t", "\\n", "\\r", "\\xHH", "\\uHHHH" or "\\U00HHHHHH".
     * Returns NULL with an exception set on failure. */
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    const Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
    if (len == 0) {
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
    }
    const int kind = PyUnicode_KIND(unicode);
    const void *data = PyUnicode_DATA(unicode);

    /* Size the buffer for the worst case, i.e. every character taking the
     * longest escape its kind allows:
     *
     *   UCS1: '\xHH'        ->  4 bytes per character
     *   UCS2: '\uHHHH'      ->  6 bytes per character
     *   UCS4: '\U00HHHHHH'  -> 10 bytes per character */
    const Py_ssize_t expandsize = kind * 2 + 2;
    if (len > PY_SSIZE_T_MAX / expandsize) {
        return PyErr_NoMemory();
    }

    PyBytesWriter *writer = PyBytesWriter_Create(expandsize * len);
    if (writer == NULL) {
        return NULL;
    }
    char *p = PyBytesWriter_GetData(writer);

    for (Py_ssize_t i = 0; i < len; i++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);

        /* U+010000-U+10ffff: '\U00HHHHHH' */
        if (ch >= 0x10000) {
            /* Make sure that the first two digits are zero */
            assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
            *p++ = '\\';
            *p++ = 'U';
            *p++ = '0';
            *p++ = '0';
            for (int shift = 20; shift >= 0; shift -= 4) {
                *p++ = Py_hexdigits[(ch >> shift) & 0x0000000F];
            }
            continue;
        }

        /* U+0100-U+ffff: '\uHHHH' */
        if (ch >= 0x100) {
            *p++ = '\\';
            *p++ = 'u';
            for (int shift = 12; shift >= 0; shift -= 4) {
                *p++ = Py_hexdigits[(ch >> shift) & 0x000F];
            }
            continue;
        }

        /* U+0000-U+00ff */
        switch (ch) {
        case '\\':
            /* a backslash is doubled */
            *p++ = '\\';
            *p++ = '\\';
            break;
        case '\t':
            *p++ = '\\';
            *p++ = 't';
            break;
        case '\n':
            *p++ = '\\';
            *p++ = 'n';
            break;
        case '\r':
            *p++ = '\\';
            *p++ = 'r';
            break;
        default:
            if (ch >= ' ' && ch < 127) {
                /* printable US ASCII is copied through */
                *p++ = (char) ch;
            }
            else {
                /* remaining control and 8-bit characters: '\xHH' */
                *p++ = '\\';
                *p++ = 'x';
                *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
                *p++ = Py_hexdigits[ch & 0x000F];
            }
            break;
        }
    }

    /* Hand back only the bytes written up to p. */
    return PyBytesWriter_FinishWithPointer(writer, p);
}
6853
6854
/* --- Raw Unicode Escape Codec ------------------------------------------- */
6855
6856
PyObject *
6857
_PyUnicode_DecodeRawUnicodeEscapeStateful(const char *s,
6858
                                          Py_ssize_t size,
6859
                                          const char *errors,
6860
                                          Py_ssize_t *consumed)
6861
0
{
6862
0
    const char *starts = s;
6863
0
    _PyUnicodeWriter writer;
6864
0
    const char *end;
6865
0
    PyObject *errorHandler = NULL;
6866
0
    PyObject *exc = NULL;
6867
6868
0
    if (size == 0) {
6869
0
        if (consumed) {
6870
0
            *consumed = 0;
6871
0
        }
6872
0
        _Py_RETURN_UNICODE_EMPTY();
6873
0
    }
6874
6875
    /* Escaped strings will always be longer than the resulting
6876
       Unicode string, so we start with size here and then reduce the
6877
       length after conversion to the true value. (But decoding error
6878
       handler might have to resize the string) */
6879
0
    _PyUnicodeWriter_Init(&writer);
6880
0
    writer.min_length = size;
6881
0
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6882
0
        goto onError;
6883
0
    }
6884
6885
0
    end = s + size;
6886
0
    while (s < end) {
6887
0
        unsigned char c = (unsigned char) *s++;
6888
0
        Py_UCS4 ch;
6889
0
        int count;
6890
0
        const char *message;
6891
6892
0
#define WRITE_CHAR(ch)                                                        \
6893
0
            do {                                                              \
6894
0
                if (ch <= writer.maxchar) {                                   \
6895
0
                    assert(writer.pos < writer.size);                         \
6896
0
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6897
0
                }                                                             \
6898
0
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6899
0
                    goto onError;                                             \
6900
0
                }                                                             \
6901
0
            } while(0)
6902
6903
        /* Non-escape characters are interpreted as Unicode ordinals */
6904
0
        if (c != '\\' || (s >= end && !consumed)) {
6905
0
            WRITE_CHAR(c);
6906
0
            continue;
6907
0
        }
6908
6909
0
        Py_ssize_t startinpos = s - starts - 1;
6910
        /* \ - Escapes */
6911
0
        if (s >= end) {
6912
0
            assert(consumed);
6913
            // Set message to silent compiler warning.
6914
            // Actually it is never used.
6915
0
            message = "\\ at end of string";
6916
0
            goto incomplete;
6917
0
        }
6918
6919
0
        c = (unsigned char) *s++;
6920
0
        if (c == 'u') {
6921
0
            count = 4;
6922
0
            message = "truncated \\uXXXX escape";
6923
0
        }
6924
0
        else if (c == 'U') {
6925
0
            count = 8;
6926
0
            message = "truncated \\UXXXXXXXX escape";
6927
0
        }
6928
0
        else {
6929
0
            assert(writer.pos < writer.size);
6930
0
            PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6931
0
            WRITE_CHAR(c);
6932
0
            continue;
6933
0
        }
6934
6935
        /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6936
0
        for (ch = 0; count; ++s, --count) {
6937
0
            if (s >= end) {
6938
0
                goto incomplete;
6939
0
            }
6940
0
            c = (unsigned char)*s;
6941
0
            ch <<= 4;
6942
0
            if (c >= '0' && c <= '9') {
6943
0
                ch += c - '0';
6944
0
            }
6945
0
            else if (c >= 'a' && c <= 'f') {
6946
0
                ch += c - ('a' - 10);
6947
0
            }
6948
0
            else if (c >= 'A' && c <= 'F') {
6949
0
                ch += c - ('A' - 10);
6950
0
            }
6951
0
            else {
6952
0
                goto error;
6953
0
            }
6954
0
        }
6955
0
        if (ch > MAX_UNICODE) {
6956
0
            message = "\\Uxxxxxxxx out of range";
6957
0
            goto error;
6958
0
        }
6959
0
        WRITE_CHAR(ch);
6960
0
        continue;
6961
6962
0
      incomplete:
6963
0
        if (consumed) {
6964
0
            *consumed = startinpos;
6965
0
            break;
6966
0
        }
6967
0
      error:;
6968
0
        Py_ssize_t endinpos = s-starts;
6969
0
        writer.min_length = end - s + writer.pos;
6970
0
        if (unicode_decode_call_errorhandler_writer(
6971
0
                errors, &errorHandler,
6972
0
                "rawunicodeescape", message,
6973
0
                &starts, &end, &startinpos, &endinpos, &exc, &s,
6974
0
                &writer)) {
6975
0
            goto onError;
6976
0
        }
6977
0
        assert(end - s <= writer.size - writer.pos);
6978
6979
0
#undef WRITE_CHAR
6980
0
    }
6981
0
    Py_XDECREF(errorHandler);
6982
0
    Py_XDECREF(exc);
6983
0
    return _PyUnicodeWriter_Finish(&writer);
6984
6985
0
  onError:
6986
0
    _PyUnicodeWriter_Dealloc(&writer);
6987
0
    Py_XDECREF(errorHandler);
6988
0
    Py_XDECREF(exc);
6989
0
    return NULL;
6990
0
}
6991
6992
PyObject *
6993
PyUnicode_DecodeRawUnicodeEscape(const char *s,
6994
                                 Py_ssize_t size,
6995
                                 const char *errors)
6996
0
{
6997
0
    return _PyUnicode_DecodeRawUnicodeEscapeStateful(s, size, errors, NULL);
6998
0
}
6999
7000
7001
/* Encode `unicode` with the raw-unicode-escape scheme and return the result
   as a bytes object: code points below U+0100 are copied as a single byte,
   code points below U+10000 become \uHHHH, everything else \U00HHHHHH.
   Returns NULL if `unicode` is not a str object or on allocation failure. */
PyObject *
PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    const void *data = PyUnicode_DATA(unicode);
    int kind = PyUnicode_KIND(unicode);
    Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);

    if (len == 0) {
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
    }
    if (kind == PyUnicode_1BYTE_KIND) {
        /* Every character already fits in one byte: nothing to escape. */
        return PyBytes_FromStringAndSize(data, len);
    }

    /* Worst-case output per character: 10 bytes for the 4-byte kind
       (\U00HHHHHH), 6 bytes for the 2-byte kind (\uHHHH). */
    Py_ssize_t expandsize = kind * 2 + 2;
    if (len > PY_SSIZE_T_MAX / expandsize) {
        return PyErr_NoMemory();
    }

    PyBytesWriter *writer = PyBytesWriter_Create(expandsize * len);
    if (writer == NULL) {
        return NULL;
    }
    char *out = PyBytesWriter_GetData(writer);

    for (Py_ssize_t i = 0; i < len; i++) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
        int shift;

        /* U+0000-U+00ff: copied verbatim */
        if (ch < 0x100) {
            *out++ = (char) ch;
            continue;
        }

        if (ch < 0x10000) {
            /* U+0100-U+ffff: '\uHHHH' */
            *out++ = '\\';
            *out++ = 'u';
            shift = 12;
        }
        else {
            /* U+010000-U+10ffff: '\U00HHHHHH' */
            assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
            *out++ = '\\';
            *out++ = 'U';
            *out++ = '0';
            *out++ = '0';
            shift = 20;
        }

        /* Emit the remaining hex digits, most significant nibble first. */
        for (; shift >= 0; shift -= 4) {
            *out++ = Py_hexdigits[(ch >> shift) & 0xf];
        }
    }

    return PyBytesWriter_FinishWithPointer(writer, out);
}
7065
7066
/* --- Latin-1 Codec ------------------------------------------------------ */
7067
7068
PyObject *
7069
PyUnicode_DecodeLatin1(const char *s,
7070
                       Py_ssize_t size,
7071
                       const char *errors)
7072
3.53M
{
7073
    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
7074
3.53M
    return _PyUnicode_FromUCS1((const unsigned char*)s, size);
7075
3.53M
}
7076
7077
/* create or adjust a UnicodeEncodeError */
7078
static void
7079
make_encode_exception(PyObject **exceptionObject,
7080
                      const char *encoding,
7081
                      PyObject *unicode,
7082
                      Py_ssize_t startpos, Py_ssize_t endpos,
7083
                      const char *reason)
7084
232k
{
7085
232k
    if (*exceptionObject == NULL) {
7086
232k
        *exceptionObject = PyObject_CallFunction(
7087
232k
            PyExc_UnicodeEncodeError, "sOnns",
7088
232k
            encoding, unicode, startpos, endpos, reason);
7089
232k
    }
7090
0
    else {
7091
0
        if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
7092
0
            goto onError;
7093
0
        if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
7094
0
            goto onError;
7095
0
        if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
7096
0
            goto onError;
7097
0
        return;
7098
0
      onError:
7099
0
        Py_CLEAR(*exceptionObject);
7100
0
    }
7101
232k
}
7102
7103
/* raises a UnicodeEncodeError */
7104
static void
7105
raise_encode_exception(PyObject **exceptionObject,
7106
                       const char *encoding,
7107
                       PyObject *unicode,
7108
                       Py_ssize_t startpos, Py_ssize_t endpos,
7109
                       const char *reason)
7110
40.2k
{
7111
40.2k
    make_encode_exception(exceptionObject,
7112
40.2k
                          encoding, unicode, startpos, endpos, reason);
7113
40.2k
    if (*exceptionObject != NULL)
7114
40.2k
        PyCodec_StrictErrors(*exceptionObject);
7115
40.2k
}
7116
7117
/* error handling callback helper:
7118
   build arguments, call the callback and check the arguments,
7119
   put the result into newpos and return the replacement string, which
7120
   has to be freed by the caller */
7121
static PyObject *
7122
unicode_encode_call_errorhandler(const char *errors,
7123
                                 PyObject **errorHandler,
7124
                                 const char *encoding, const char *reason,
7125
                                 PyObject *unicode, PyObject **exceptionObject,
7126
                                 Py_ssize_t startpos, Py_ssize_t endpos,
7127
                                 Py_ssize_t *newpos)
7128
192k
{
7129
192k
    static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
7130
192k
    Py_ssize_t len;
7131
192k
    PyObject *restuple;
7132
192k
    PyObject *resunicode;
7133
7134
192k
    if (*errorHandler == NULL) {
7135
192k
        *errorHandler = PyCodec_LookupError(errors);
7136
192k
        if (*errorHandler == NULL)
7137
0
            return NULL;
7138
192k
    }
7139
7140
192k
    len = PyUnicode_GET_LENGTH(unicode);
7141
7142
192k
    make_encode_exception(exceptionObject,
7143
192k
                          encoding, unicode, startpos, endpos, reason);
7144
192k
    if (*exceptionObject == NULL)
7145
0
        return NULL;
7146
7147
192k
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
7148
192k
    if (restuple == NULL)
7149
192k
        return NULL;
7150
0
    if (!PyTuple_Check(restuple)) {
7151
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
7152
0
        Py_DECREF(restuple);
7153
0
        return NULL;
7154
0
    }
7155
0
    if (!PyArg_ParseTuple(restuple, argparse,
7156
0
                          &resunicode, newpos)) {
7157
0
        Py_DECREF(restuple);
7158
0
        return NULL;
7159
0
    }
7160
0
    if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
7161
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
7162
0
        Py_DECREF(restuple);
7163
0
        return NULL;
7164
0
    }
7165
0
    if (*newpos<0)
7166
0
        *newpos = len + *newpos;
7167
0
    if (*newpos<0 || *newpos>len) {
7168
0
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7169
0
        Py_DECREF(restuple);
7170
0
        return NULL;
7171
0
    }
7172
0
    Py_INCREF(resunicode);
7173
0
    Py_DECREF(restuple);
7174
0
    return resunicode;
7175
0
}
7176
7177
static PyObject *
7178
unicode_encode_ucs1(PyObject *unicode,
7179
                    const char *errors,
7180
                    const Py_UCS4 limit)
7181
51.5k
{
7182
    /* input state */
7183
51.5k
    Py_ssize_t pos=0, size;
7184
51.5k
    int kind;
7185
51.5k
    const void *data;
7186
51.5k
    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
7187
51.5k
    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
7188
51.5k
    PyObject *error_handler_obj = NULL;
7189
51.5k
    PyObject *exc = NULL;
7190
51.5k
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
7191
51.5k
    PyObject *rep = NULL;
7192
7193
51.5k
    size = PyUnicode_GET_LENGTH(unicode);
7194
51.5k
    kind = PyUnicode_KIND(unicode);
7195
51.5k
    data = PyUnicode_DATA(unicode);
7196
    /* allocate enough for a simple encoding without
7197
       replacements, if we need more, we'll resize */
7198
51.5k
    if (size == 0)
7199
0
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
7200
7201
    /* output object */
7202
51.5k
    PyBytesWriter *writer = PyBytesWriter_Create(size);
7203
51.5k
    if (writer == NULL) {
7204
0
        return NULL;
7205
0
    }
7206
    /* pointer into the output */
7207
51.5k
    char *str = PyBytesWriter_GetData(writer);
7208
7209
3.41M
    while (pos < size) {
7210
3.41M
        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
7211
7212
        /* can we encode this? */
7213
3.41M
        if (ch < limit) {
7214
            /* no overflow check, because we know that the space is enough */
7215
3.36M
            *str++ = (char)ch;
7216
3.36M
            ++pos;
7217
3.36M
        }
7218
51.5k
        else {
7219
51.5k
            Py_ssize_t newpos, i;
7220
            /* startpos for collecting unencodable chars */
7221
51.5k
            Py_ssize_t collstart = pos;
7222
51.5k
            Py_ssize_t collend = collstart + 1;
7223
            /* find all unecodable characters */
7224
7225
363k
            while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
7226
311k
                ++collend;
7227
7228
            /* Only overallocate the buffer if it's not the last write */
7229
51.5k
            writer->overallocate = (collend < size);
7230
7231
            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
7232
51.5k
            if (error_handler == _Py_ERROR_UNKNOWN)
7233
51.5k
                error_handler = _Py_GetErrorHandler(errors);
7234
7235
51.5k
            switch (error_handler) {
7236
40.2k
            case _Py_ERROR_STRICT:
7237
40.2k
                raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
7238
40.2k
                goto onError;
7239
7240
0
            case _Py_ERROR_REPLACE:
7241
0
                memset(str, '?', collend - collstart);
7242
0
                str += (collend - collstart);
7243
0
                _Py_FALLTHROUGH;
7244
0
            case _Py_ERROR_IGNORE:
7245
0
                pos = collend;
7246
0
                break;
7247
7248
0
            case _Py_ERROR_BACKSLASHREPLACE:
7249
                /* subtract preallocated bytes */
7250
0
                writer->size -= (collend - collstart);
7251
0
                str = backslashreplace(writer, str,
7252
0
                                       unicode, collstart, collend);
7253
0
                if (str == NULL)
7254
0
                    goto onError;
7255
0
                pos = collend;
7256
0
                break;
7257
7258
0
            case _Py_ERROR_XMLCHARREFREPLACE:
7259
                /* subtract preallocated bytes */
7260
0
                writer->size -= (collend - collstart);
7261
0
                str = xmlcharrefreplace(writer, str,
7262
0
                                        unicode, collstart, collend);
7263
0
                if (str == NULL)
7264
0
                    goto onError;
7265
0
                pos = collend;
7266
0
                break;
7267
7268
11.3k
            case _Py_ERROR_SURROGATEESCAPE:
7269
11.3k
                for (i = collstart; i < collend; ++i) {
7270
11.3k
                    ch = PyUnicode_READ(kind, data, i);
7271
11.3k
                    if (ch < 0xdc80 || 0xdcff < ch) {
7272
                        /* Not a UTF-8b surrogate */
7273
11.3k
                        break;
7274
11.3k
                    }
7275
0
                    *str++ = (char)(ch - 0xdc00);
7276
0
                    ++pos;
7277
0
                }
7278
11.3k
                if (i >= collend)
7279
0
                    break;
7280
11.3k
                collstart = pos;
7281
11.3k
                assert(collstart != collend);
7282
11.3k
                _Py_FALLTHROUGH;
7283
7284
11.3k
            default:
7285
11.3k
                rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
7286
11.3k
                                                       encoding, reason, unicode, &exc,
7287
11.3k
                                                       collstart, collend, &newpos);
7288
11.3k
                if (rep == NULL)
7289
11.3k
                    goto onError;
7290
7291
0
                if (newpos < collstart) {
7292
0
                    writer->overallocate = 1;
7293
0
                    str = PyBytesWriter_GrowAndUpdatePointer(writer,
7294
0
                                                             collstart - newpos,
7295
0
                                                             str);
7296
0
                    if (str == NULL) {
7297
0
                        goto onError;
7298
0
                    }
7299
0
                }
7300
0
                else {
7301
                    /* subtract preallocated bytes */
7302
0
                    writer->size -= newpos - collstart;
7303
                    /* Only overallocate the buffer if it's not the last write */
7304
0
                    writer->overallocate = (newpos < size);
7305
0
                }
7306
7307
0
                char *rep_str;
7308
0
                Py_ssize_t rep_len;
7309
0
                if (PyBytes_Check(rep)) {
7310
                    /* Directly copy bytes result to output. */
7311
0
                    rep_str = PyBytes_AS_STRING(rep);
7312
0
                    rep_len = PyBytes_GET_SIZE(rep);
7313
0
                }
7314
0
                else {
7315
0
                    assert(PyUnicode_Check(rep));
7316
7317
0
                    if (limit == 256 ?
7318
0
                        PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
7319
0
                        !PyUnicode_IS_ASCII(rep))
7320
0
                    {
7321
                        /* Not all characters are smaller than limit */
7322
0
                        raise_encode_exception(&exc, encoding, unicode,
7323
0
                                               collstart, collend, reason);
7324
0
                        goto onError;
7325
0
                    }
7326
0
                    assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
7327
0
                    rep_str = PyUnicode_DATA(rep);
7328
0
                    rep_len = PyUnicode_GET_LENGTH(rep);
7329
0
                }
7330
7331
0
                str = PyBytesWriter_GrowAndUpdatePointer(writer, rep_len, str);
7332
0
                if (str == NULL) {
7333
0
                    goto onError;
7334
0
                }
7335
0
                memcpy(str, rep_str, rep_len);
7336
0
                str += rep_len;
7337
7338
0
                pos = newpos;
7339
0
                Py_CLEAR(rep);
7340
51.5k
            }
7341
7342
            /* If overallocation was disabled, ensure that it was the last
7343
               write. Otherwise, we missed an optimization */
7344
51.5k
            assert(writer->overallocate || pos == size);
7345
0
        }
7346
3.41M
    }
7347
7348
0
    Py_XDECREF(error_handler_obj);
7349
0
    Py_XDECREF(exc);
7350
0
    return PyBytesWriter_FinishWithPointer(writer, str);
7351
7352
51.5k
  onError:
7353
51.5k
    Py_XDECREF(rep);
7354
51.5k
    PyBytesWriter_Discard(writer);
7355
51.5k
    Py_XDECREF(error_handler_obj);
7356
51.5k
    Py_XDECREF(exc);
7357
51.5k
    return NULL;
7358
51.5k
}
7359
7360
/* Encode `unicode` to Latin-1 bytes using the error handler named `errors`.
   Returns NULL if `unicode` is not a str object or if encoding fails. */
PyObject *
_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND) {
        /* Characters outside Latin-1 are present: the generic encoder
           applies the error handler or raises. */
        return unicode_encode_ucs1(unicode, errors, 256);
    }

    /* Fast path: a one-byte string is copied into a bytes object as-is. */
    return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
                                     PyUnicode_GET_LENGTH(unicode));
}
7376
7377
/* Encode `unicode` to Latin-1 bytes, passing NULL as the error handler name
   to _PyUnicode_AsLatin1String(). */
PyObject*
PyUnicode_AsLatin1String(PyObject *unicode)
{
    const char *errors = NULL;
    return _PyUnicode_AsLatin1String(unicode, errors);
}
7382
7383
/* --- 7-bit ASCII Codec -------------------------------------------------- */
7384
7385
PyObject *
7386
PyUnicode_DecodeASCII(const char *s,
7387
                      Py_ssize_t size,
7388
                      const char *errors)
7389
574k
{
7390
574k
    const char *starts = s;
7391
574k
    const char *e = s + size;
7392
574k
    PyObject *error_handler_obj = NULL;
7393
574k
    PyObject *exc = NULL;
7394
574k
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
7395
7396
574k
    if (size == 0)
7397
0
        _Py_RETURN_UNICODE_EMPTY();
7398
7399
    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
7400
574k
    if (size == 1 && (unsigned char)s[0] < 128) {
7401
7.79k
        return get_latin1_char((unsigned char)s[0]);
7402
7.79k
    }
7403
7404
    // Shortcut for simple case
7405
566k
    PyObject *u = PyUnicode_New(size, 127);
7406
566k
    if (u == NULL) {
7407
0
        return NULL;
7408
0
    }
7409
566k
    Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
7410
566k
    if (outpos == size) {
7411
394k
        return u;
7412
394k
    }
7413
7414
172k
    _PyUnicodeWriter writer;
7415
172k
    _PyUnicodeWriter_InitWithBuffer(&writer, u);
7416
172k
    writer.pos = outpos;
7417
7418
172k
    s += outpos;
7419
172k
    int kind = writer.kind;
7420
172k
    void *data = writer.data;
7421
172k
    Py_ssize_t startinpos, endinpos;
7422
7423
19.9M
    while (s < e) {
7424
19.8M
        unsigned char c = (unsigned char)*s;
7425
19.8M
        if (c < 128) {
7426
7.56M
            PyUnicode_WRITE(kind, data, writer.pos, c);
7427
7.56M
            writer.pos++;
7428
7.56M
            ++s;
7429
7.56M
            continue;
7430
7.56M
        }
7431
7432
        /* byte outsize range 0x00..0x7f: call the error handler */
7433
7434
12.2M
        if (error_handler == _Py_ERROR_UNKNOWN)
7435
172k
            error_handler = _Py_GetErrorHandler(errors);
7436
7437
12.2M
        switch (error_handler)
7438
12.2M
        {
7439
835k
        case _Py_ERROR_REPLACE:
7440
12.2M
        case _Py_ERROR_SURROGATEESCAPE:
7441
            /* Fast-path: the error handler only writes one character,
7442
               but we may switch to UCS2 at the first write */
7443
12.2M
            if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7444
0
                goto onError;
7445
12.2M
            kind = writer.kind;
7446
12.2M
            data = writer.data;
7447
7448
12.2M
            if (error_handler == _Py_ERROR_REPLACE)
7449
835k
                PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7450
11.3M
            else
7451
11.3M
                PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7452
12.2M
            writer.pos++;
7453
12.2M
            ++s;
7454
12.2M
            break;
7455
7456
0
        case _Py_ERROR_IGNORE:
7457
0
            ++s;
7458
0
            break;
7459
7460
12.8k
        default:
7461
12.8k
            startinpos = s-starts;
7462
12.8k
            endinpos = startinpos + 1;
7463
12.8k
            if (unicode_decode_call_errorhandler_writer(
7464
12.8k
                    errors, &error_handler_obj,
7465
12.8k
                    "ascii", "ordinal not in range(128)",
7466
12.8k
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
7467
12.8k
                    &writer))
7468
12.8k
                goto onError;
7469
0
            kind = writer.kind;
7470
0
            data = writer.data;
7471
12.2M
        }
7472
12.2M
    }
7473
159k
    Py_XDECREF(error_handler_obj);
7474
159k
    Py_XDECREF(exc);
7475
159k
    return _PyUnicodeWriter_Finish(&writer);
7476
7477
12.8k
  onError:
7478
12.8k
    _PyUnicodeWriter_Dealloc(&writer);
7479
12.8k
    Py_XDECREF(error_handler_obj);
7480
12.8k
    Py_XDECREF(exc);
7481
12.8k
    return NULL;
7482
172k
}
7483
7484
/* Encode `unicode` to ASCII bytes using the error handler named `errors`.
   Returns NULL if `unicode` is not a str object or if encoding fails. */
PyObject *
_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (!PyUnicode_IS_ASCII(unicode)) {
        /* Non-ASCII characters present: the generic encoder applies the
           error handler or raises. */
        return unicode_encode_ucs1(unicode, errors, 128);
    }

    /* Fast path: an ASCII-only string is copied into a bytes object. */
    return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
                                     PyUnicode_GET_LENGTH(unicode));
}
7498
7499
/* Encode `unicode` to ASCII bytes, passing NULL as the error handler name
   to _PyUnicode_AsASCIIString(). */
PyObject *
PyUnicode_AsASCIIString(PyObject *unicode)
{
    const char *errors = NULL;
    return _PyUnicode_AsASCIIString(unicode, errors);
}
7504
7505
#ifdef MS_WINDOWS
7506
7507
/* --- MBCS codecs for Windows -------------------------------------------- */
7508
7509
#if SIZEOF_INT < SIZEOF_SIZE_T
7510
#define NEED_RETRY
7511
#endif
7512
7513
/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
7514
   transcoding from UTF-16), but INT_MAX / 4 performs better in
7515
   both cases also and avoids partial characters overrunning the
7516
   length limit in MultiByteToWideChar on Windows */
7517
#define DECODING_CHUNK_SIZE (INT_MAX/4)
7518
7519
#ifndef WC_ERR_INVALID_CHARS
7520
#  define WC_ERR_INVALID_CHARS 0x0080
7521
#endif
7522
7523
static const char*
7524
code_page_name(UINT code_page, PyObject **obj)
7525
{
7526
    *obj = NULL;
7527
    if (code_page == CP_ACP)
7528
        return "mbcs";
7529
7530
    *obj = PyBytes_FromFormat("cp%u", code_page);
7531
    if (*obj == NULL)
7532
        return NULL;
7533
    return PyBytes_AS_STRING(*obj);
7534
}
7535
7536
static DWORD
7537
decode_code_page_flags(UINT code_page)
7538
{
7539
    if (code_page == CP_UTF7) {
7540
        /* The CP_UTF7 decoder only supports flags=0 */
7541
        return 0;
7542
    }
7543
    else
7544
        return MB_ERR_INVALID_CHARS;
7545
}
7546
7547
/*
7548
 * Decode a byte string from a Windows code page into unicode object in strict
7549
 * mode.
7550
 *
7551
 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7552
 * OSError and returns -1 on other error.
7553
 */
7554
static int
7555
decode_code_page_strict(UINT code_page,
7556
                        wchar_t **buf,
7557
                        Py_ssize_t *bufsize,
7558
                        const char *in,
7559
                        int insize)
7560
{
7561
    DWORD flags = MB_ERR_INVALID_CHARS;
7562
    wchar_t *out;
7563
    DWORD outsize;
7564
7565
    /* First get the size of the result */
7566
    assert(insize > 0);
7567
    while ((outsize = MultiByteToWideChar(code_page, flags,
7568
                                          in, insize, NULL, 0)) <= 0)
7569
    {
7570
        if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7571
            goto error;
7572
        }
7573
        /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7574
        flags = 0;
7575
    }
7576
7577
    /* Extend a wchar_t* buffer */
7578
    Py_ssize_t n = *bufsize;   /* Get the current length */
7579
    if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7580
        return -1;
7581
    }
7582
    out = *buf + n;
7583
7584
    /* Do the conversion */
7585
    outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7586
    if (outsize <= 0)
7587
        goto error;
7588
    return insize;
7589
7590
error:
7591
    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7592
        return -2;
7593
    PyErr_SetFromWindowsErr(0);
7594
    return -1;
7595
}
7596
7597
/*
7598
 * Decode a byte string from a code page into unicode object with an error
7599
 * handler.
7600
 *
7601
 * Returns consumed size if succeed, or raise an OSError or
7602
 * UnicodeDecodeError exception and returns -1 on error.
7603
 */
7604
static int
7605
decode_code_page_errors(UINT code_page,
7606
                        wchar_t **buf,
7607
                        Py_ssize_t *bufsize,
7608
                        const char *in, const int size,
7609
                        const char *errors, int final)
7610
{
7611
    const char *startin = in;
7612
    const char *endin = in + size;
7613
    DWORD flags = MB_ERR_INVALID_CHARS;
7614
    /* Ideally, we should get reason from FormatMessage. This is the Windows
7615
       2000 English version of the message. */
7616
    const char *reason = "No mapping for the Unicode character exists "
7617
                         "in the target code page.";
7618
    /* each step cannot decode more than 1 character, but a character can be
7619
       represented as a surrogate pair */
7620
    wchar_t buffer[2], *out;
7621
    int insize;
7622
    Py_ssize_t outsize;
7623
    PyObject *errorHandler = NULL;
7624
    PyObject *exc = NULL;
7625
    PyObject *encoding_obj = NULL;
7626
    const char *encoding;
7627
    DWORD err;
7628
    int ret = -1;
7629
7630
    assert(size > 0);
7631
7632
    encoding = code_page_name(code_page, &encoding_obj);
7633
    if (encoding == NULL)
7634
        return -1;
7635
7636
    if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
7637
        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7638
           UnicodeDecodeError. */
7639
        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7640
        if (exc != NULL) {
7641
            PyCodec_StrictErrors(exc);
7642
            Py_CLEAR(exc);
7643
        }
7644
        goto error;
7645
    }
7646
7647
    /* Extend a wchar_t* buffer */
7648
    Py_ssize_t n = *bufsize;   /* Get the current length */
7649
    if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7650
        PyErr_NoMemory();
7651
        goto error;
7652
    }
7653
    if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7654
        goto error;
7655
    }
7656
    out = *buf + n;
7657
7658
    /* Decode the byte string character per character */
7659
    while (in < endin)
7660
    {
7661
        /* Decode a character */
7662
        insize = 1;
7663
        do
7664
        {
7665
            outsize = MultiByteToWideChar(code_page, flags,
7666
                                          in, insize,
7667
                                          buffer, Py_ARRAY_LENGTH(buffer));
7668
            if (outsize > 0)
7669
                break;
7670
            err = GetLastError();
7671
            if (err == ERROR_INVALID_FLAGS && flags) {
7672
                /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7673
                flags = 0;
7674
                continue;
7675
            }
7676
            if (err != ERROR_NO_UNICODE_TRANSLATION
7677
                && err != ERROR_INSUFFICIENT_BUFFER)
7678
            {
7679
                PyErr_SetFromWindowsErr(err);
7680
                goto error;
7681
            }
7682
            insize++;
7683
        }
7684
        /* 4=maximum length of a UTF-8 sequence */
7685
        while (insize <= 4 && (in + insize) <= endin);
7686
7687
        if (outsize <= 0) {
7688
            Py_ssize_t startinpos, endinpos, outpos;
7689
7690
            /* last character in partial decode? */
7691
            if (in + insize >= endin && !final)
7692
                break;
7693
7694
            startinpos = in - startin;
7695
            endinpos = startinpos + 1;
7696
            outpos = out - *buf;
7697
            if (unicode_decode_call_errorhandler_wchar(
7698
                    errors, &errorHandler,
7699
                    encoding, reason,
7700
                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
7701
                    buf, bufsize, &outpos))
7702
            {
7703
                goto error;
7704
            }
7705
            out = *buf + outpos;
7706
        }
7707
        else {
7708
            in += insize;
7709
            memcpy(out, buffer, outsize * sizeof(wchar_t));
7710
            out += outsize;
7711
        }
7712
    }
7713
7714
    /* Shrink the buffer */
7715
    assert(out - *buf <= *bufsize);
7716
    *bufsize = out - *buf;
7717
    /* (in - startin) <= size and size is an int */
7718
    ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
7719
7720
error:
7721
    Py_XDECREF(encoding_obj);
7722
    Py_XDECREF(errorHandler);
7723
    Py_XDECREF(exc);
7724
    return ret;
7725
}
7726
7727
static PyObject *
7728
decode_code_page_stateful(int code_page,
7729
                          const char *s, Py_ssize_t size,
7730
                          const char *errors, Py_ssize_t *consumed)
7731
{
7732
    wchar_t *buf = NULL;
7733
    Py_ssize_t bufsize = 0;
7734
    int chunk_size, final, converted, done;
7735
7736
    if (code_page < 0) {
7737
        PyErr_SetString(PyExc_ValueError, "invalid code page number");
7738
        return NULL;
7739
    }
7740
    if (size < 0) {
7741
        PyErr_BadInternalCall();
7742
        return NULL;
7743
    }
7744
7745
    if (consumed)
7746
        *consumed = 0;
7747
7748
    do
7749
    {
7750
#ifdef NEED_RETRY
7751
        if (size > DECODING_CHUNK_SIZE) {
7752
            chunk_size = DECODING_CHUNK_SIZE;
7753
            final = 0;
7754
            done = 0;
7755
        }
7756
        else
7757
#endif
7758
        {
7759
            chunk_size = (int)size;
7760
            final = (consumed == NULL);
7761
            done = 1;
7762
        }
7763
7764
        if (chunk_size == 0 && done) {
7765
            if (buf != NULL)
7766
                break;
7767
            _Py_RETURN_UNICODE_EMPTY();
7768
        }
7769
7770
        converted = decode_code_page_strict(code_page, &buf, &bufsize,
7771
                                            s, chunk_size);
7772
        if (converted == -2)
7773
            converted = decode_code_page_errors(code_page, &buf, &bufsize,
7774
                                                s, chunk_size,
7775
                                                errors, final);
7776
        assert(converted != 0 || done);
7777
7778
        if (converted < 0) {
7779
            PyMem_Free(buf);
7780
            return NULL;
7781
        }
7782
7783
        if (consumed)
7784
            *consumed += converted;
7785
7786
        s += converted;
7787
        size -= converted;
7788
    } while (!done);
7789
7790
    PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7791
    PyMem_Free(buf);
7792
    return v;
7793
}
7794
7795
PyObject *
7796
PyUnicode_DecodeCodePageStateful(int code_page,
7797
                                 const char *s,
7798
                                 Py_ssize_t size,
7799
                                 const char *errors,
7800
                                 Py_ssize_t *consumed)
7801
{
7802
    return decode_code_page_stateful(code_page, s, size, errors, consumed);
7803
}
7804
7805
PyObject *
7806
PyUnicode_DecodeMBCSStateful(const char *s,
7807
                             Py_ssize_t size,
7808
                             const char *errors,
7809
                             Py_ssize_t *consumed)
7810
{
7811
    return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7812
}
7813
7814
PyObject *
7815
PyUnicode_DecodeMBCS(const char *s,
7816
                     Py_ssize_t size,
7817
                     const char *errors)
7818
{
7819
    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7820
}
7821
7822
static DWORD
7823
encode_code_page_flags(UINT code_page, const char *errors)
7824
{
7825
    if (code_page == CP_UTF8) {
7826
        return WC_ERR_INVALID_CHARS;
7827
    }
7828
    else if (code_page == CP_UTF7) {
7829
        /* CP_UTF7 only supports flags=0 */
7830
        return 0;
7831
    }
7832
    else {
7833
        if (errors != NULL && strcmp(errors, "replace") == 0)
7834
            return 0;
7835
        else
7836
            return WC_NO_BEST_FIT_CHARS;
7837
    }
7838
}
7839
7840
/*
7841
 * Encode a Unicode string to a Windows code page into a byte string in strict
7842
 * mode.
7843
 *
7844
 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7845
 * an OSError and returns -1 on other error.
7846
 */
7847
static int
7848
encode_code_page_strict(UINT code_page, PyBytesWriter **writer,
7849
                        PyObject *unicode, Py_ssize_t offset, int len,
7850
                        const char* errors)
7851
{
7852
    BOOL usedDefaultChar = FALSE;
7853
    BOOL *pusedDefaultChar = &usedDefaultChar;
7854
    int outsize;
7855
    wchar_t *p;
7856
    Py_ssize_t size;
7857
    const DWORD flags = encode_code_page_flags(code_page, NULL);
7858
    char *out;
7859
    /* Create a substring so that we can get the UTF-16 representation
7860
       of just the slice under consideration. */
7861
    PyObject *substring;
7862
    int ret = -1;
7863
7864
    assert(len > 0);
7865
7866
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
7867
        pusedDefaultChar = &usedDefaultChar;
7868
    else
7869
        pusedDefaultChar = NULL;
7870
7871
    substring = PyUnicode_Substring(unicode, offset, offset+len);
7872
    if (substring == NULL)
7873
        return -1;
7874
    p = PyUnicode_AsWideCharString(substring, &size);
7875
    Py_CLEAR(substring);
7876
    if (p == NULL) {
7877
        return -1;
7878
    }
7879
    assert(size <= INT_MAX);
7880
7881
    /* First get the size of the result */
7882
    outsize = WideCharToMultiByte(code_page, flags,
7883
                                  p, (int)size,
7884
                                  NULL, 0,
7885
                                  NULL, pusedDefaultChar);
7886
    if (outsize <= 0)
7887
        goto error;
7888
    /* If we used a default char, then we failed! */
7889
    if (pusedDefaultChar && *pusedDefaultChar) {
7890
        ret = -2;
7891
        goto done;
7892
    }
7893
7894
    if (*writer == NULL) {
7895
        /* Create string object */
7896
        *writer = PyBytesWriter_Create(outsize);
7897
        if (*writer == NULL) {
7898
            goto done;
7899
        }
7900
        out = PyBytesWriter_GetData(*writer);
7901
    }
7902
    else {
7903
        /* Extend string object */
7904
        Py_ssize_t n = PyBytesWriter_GetSize(*writer);
7905
        if (PyBytesWriter_Grow(*writer, outsize) < 0) {
7906
            goto done;
7907
        }
7908
        out = (char*)PyBytesWriter_GetData(*writer) + n;
7909
    }
7910
7911
    /* Do the conversion */
7912
    outsize = WideCharToMultiByte(code_page, flags,
7913
                                  p, (int)size,
7914
                                  out, outsize,
7915
                                  NULL, pusedDefaultChar);
7916
    if (outsize <= 0)
7917
        goto error;
7918
    if (pusedDefaultChar && *pusedDefaultChar) {
7919
        ret = -2;
7920
        goto done;
7921
    }
7922
    ret = 0;
7923
7924
done:
7925
    PyMem_Free(p);
7926
    return ret;
7927
7928
error:
7929
    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
7930
        ret = -2;
7931
        goto done;
7932
    }
7933
    PyErr_SetFromWindowsErr(0);
7934
    goto done;
7935
}
7936
7937
/*
7938
 * Encode a Unicode string to a Windows code page into a byte string using an
7939
 * error handler.
7940
 *
7941
 * Returns consumed characters if succeed, or raise an OSError and returns
7942
 * -1 on other error.
7943
 */
7944
static int
7945
encode_code_page_errors(UINT code_page, PyBytesWriter **writer,
7946
                        PyObject *unicode, Py_ssize_t unicode_offset,
7947
                        Py_ssize_t insize, const char* errors)
7948
{
7949
    const DWORD flags = encode_code_page_flags(code_page, errors);
7950
    Py_ssize_t pos = unicode_offset;
7951
    Py_ssize_t endin = unicode_offset + insize;
7952
    /* Ideally, we should get reason from FormatMessage. This is the Windows
7953
       2000 English version of the message. */
7954
    const char *reason = "invalid character";
7955
    /* 4=maximum length of a UTF-8 sequence */
7956
    char buffer[4];
7957
    BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7958
    Py_ssize_t outsize;
7959
    char *out;
7960
    PyObject *errorHandler = NULL;
7961
    PyObject *exc = NULL;
7962
    PyObject *encoding_obj = NULL;
7963
    const char *encoding;
7964
    Py_ssize_t newpos;
7965
    PyObject *rep;
7966
    int ret = -1;
7967
7968
    assert(insize > 0);
7969
7970
    encoding = code_page_name(code_page, &encoding_obj);
7971
    if (encoding == NULL)
7972
        return -1;
7973
7974
    if (errors == NULL || strcmp(errors, "strict") == 0) {
7975
        /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7976
           then we raise a UnicodeEncodeError. */
7977
        make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
7978
        if (exc != NULL) {
7979
            PyCodec_StrictErrors(exc);
7980
            Py_DECREF(exc);
7981
        }
7982
        Py_XDECREF(encoding_obj);
7983
        return -1;
7984
    }
7985
7986
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
7987
        pusedDefaultChar = &usedDefaultChar;
7988
    else
7989
        pusedDefaultChar = NULL;
7990
7991
    if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7992
        PyErr_NoMemory();
7993
        goto error;
7994
    }
7995
    outsize = insize * Py_ARRAY_LENGTH(buffer);
7996
7997
    if (*writer == NULL) {
7998
        /* Create string object */
7999
        *writer = PyBytesWriter_Create(outsize);
8000
        if (*writer == NULL) {
8001
            goto error;
8002
        }
8003
        out = PyBytesWriter_GetData(*writer);
8004
    }
8005
    else {
8006
        /* Extend string object */
8007
        Py_ssize_t n = PyBytesWriter_GetSize(*writer);
8008
        if (PyBytesWriter_Grow(*writer, outsize) < 0) {
8009
            goto error;
8010
        }
8011
        out = (char*)PyBytesWriter_GetData(*writer) + n;
8012
    }
8013
8014
    /* Encode the string character per character */
8015
    while (pos < endin)
8016
    {
8017
        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
8018
        wchar_t chars[2];
8019
        int charsize;
8020
        if (ch < 0x10000) {
8021
            chars[0] = (wchar_t)ch;
8022
            charsize = 1;
8023
        }
8024
        else {
8025
            chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
8026
            chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
8027
            charsize = 2;
8028
        }
8029
8030
        outsize = WideCharToMultiByte(code_page, flags,
8031
                                      chars, charsize,
8032
                                      buffer, Py_ARRAY_LENGTH(buffer),
8033
                                      NULL, pusedDefaultChar);
8034
        if (outsize > 0) {
8035
            if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
8036
            {
8037
                pos++;
8038
                memcpy(out, buffer, outsize);
8039
                out += outsize;
8040
                continue;
8041
            }
8042
        }
8043
        else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
8044
            PyErr_SetFromWindowsErr(0);
8045
            goto error;
8046
        }
8047
8048
        rep = unicode_encode_call_errorhandler(
8049
                  errors, &errorHandler, encoding, reason,
8050
                  unicode, &exc,
8051
                  pos, pos + 1, &newpos);
8052
        if (rep == NULL)
8053
            goto error;
8054
8055
        Py_ssize_t morebytes = pos - newpos;
8056
        if (PyBytes_Check(rep)) {
8057
            outsize = PyBytes_GET_SIZE(rep);
8058
            morebytes += outsize;
8059
            if (morebytes > 0) {
8060
                out = PyBytesWriter_GrowAndUpdatePointer(*writer, morebytes, out);
8061
                if (out == NULL) {
8062
                    Py_DECREF(rep);
8063
                    goto error;
8064
                }
8065
            }
8066
            memcpy(out, PyBytes_AS_STRING(rep), outsize);
8067
            out += outsize;
8068
        }
8069
        else {
8070
            Py_ssize_t i;
8071
            int kind;
8072
            const void *data;
8073
8074
            outsize = PyUnicode_GET_LENGTH(rep);
8075
            morebytes += outsize;
8076
            if (morebytes > 0) {
8077
                out = PyBytesWriter_GrowAndUpdatePointer(*writer, morebytes, out);
8078
                if (out == NULL) {
8079
                    Py_DECREF(rep);
8080
                    goto error;
8081
                }
8082
            }
8083
            kind = PyUnicode_KIND(rep);
8084
            data = PyUnicode_DATA(rep);
8085
            for (i=0; i < outsize; i++) {
8086
                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8087
                if (ch > 127) {
8088
                    raise_encode_exception(&exc,
8089
                        encoding, unicode,
8090
                        pos, pos + 1,
8091
                        "unable to encode error handler result to ASCII");
8092
                    Py_DECREF(rep);
8093
                    goto error;
8094
                }
8095
                *out = (unsigned char)ch;
8096
                out++;
8097
            }
8098
        }
8099
        pos = newpos;
8100
        Py_DECREF(rep);
8101
    }
8102
    /* write a NUL byte */
8103
    *out = 0;
8104
    outsize = out - (char*)PyBytesWriter_GetData(*writer);
8105
    assert(outsize <= PyBytesWriter_GetSize(*writer));
8106
    if (PyBytesWriter_Resize(*writer, outsize) < 0) {
8107
        goto error;
8108
    }
8109
    ret = 0;
8110
8111
error:
8112
    Py_XDECREF(encoding_obj);
8113
    Py_XDECREF(errorHandler);
8114
    Py_XDECREF(exc);
8115
    return ret;
8116
}
8117
8118
8119
/* Encode `unicode` to the Windows code page `code_page`.

   The string is processed in chunks (more than one only when NEED_RETRY is
   defined and the string is longer than DECODING_CHUNK_SIZE).  Each chunk
   is first tried with encode_code_page_strict(); when that reports an
   encode error (-2) the chunk is redone with encode_code_page_errors()
   and the handler named by `errors`.

   Returns the result of PyBytesWriter_Finish(), the empty bytes constant
   for an empty string, or NULL on error. */
PyObject *
PyUnicode_EncodeCodePage(int code_page,
                         PyObject *unicode,
                         const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    Py_ssize_t remaining = PyUnicode_GET_LENGTH(unicode);

    if (code_page < 0) {
        PyErr_SetString(PyExc_ValueError, "invalid code page number");
        return NULL;
    }

    if (remaining == 0) {
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
    }

    PyBytesWriter *writer = NULL;
    Py_ssize_t offset = 0;
    int last_chunk;

    do {
        int chunk_len;
#ifdef NEED_RETRY
        if (remaining > DECODING_CHUNK_SIZE) {
            chunk_len = DECODING_CHUNK_SIZE;
            last_chunk = 0;
        }
        else
#endif
        {
            chunk_len = (int)remaining;
            last_chunk = 1;
        }

        int status = encode_code_page_strict(code_page, &writer,
                                             unicode, offset, chunk_len,
                                             errors);
        if (status == -2) {
            status = encode_code_page_errors(code_page, &writer,
                                             unicode, offset,
                                             chunk_len, errors);
        }
        if (status < 0) {
            PyBytesWriter_Discard(writer);
            return NULL;
        }

        offset += chunk_len;
        remaining -= chunk_len;
    } while (!last_chunk);

    return PyBytesWriter_Finish(writer);
}
8177
8178
8179
/* Encode `unicode` with PyUnicode_EncodeCodePage(), using code page CP_ACP
   and a NULL error handler name. */
PyObject *
PyUnicode_AsMBCSString(PyObject *unicode)
{
    const char *no_errors = NULL;

    return PyUnicode_EncodeCodePage(CP_ACP, unicode, no_errors);
}
8184
8185
#undef NEED_RETRY
8186
8187
#endif /* MS_WINDOWS */
8188
8189
/* --- Character Mapping Codec -------------------------------------------- */
8190
8191
static int
8192
charmap_decode_string(const char *s,
8193
                      Py_ssize_t size,
8194
                      PyObject *mapping,
8195
                      const char *errors,
8196
                      _PyUnicodeWriter *writer)
8197
23.7k
{
8198
23.7k
    const char *starts = s;
8199
23.7k
    const char *e;
8200
23.7k
    Py_ssize_t startinpos, endinpos;
8201
23.7k
    PyObject *errorHandler = NULL, *exc = NULL;
8202
23.7k
    Py_ssize_t maplen;
8203
23.7k
    int mapkind;
8204
23.7k
    const void *mapdata;
8205
23.7k
    Py_UCS4 x;
8206
23.7k
    unsigned char ch;
8207
8208
23.7k
    maplen = PyUnicode_GET_LENGTH(mapping);
8209
23.7k
    mapdata = PyUnicode_DATA(mapping);
8210
23.7k
    mapkind = PyUnicode_KIND(mapping);
8211
8212
23.7k
    e = s + size;
8213
8214
23.7k
    if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
8215
        /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
8216
         * is disabled in encoding aliases, latin1 is preferred because
8217
         * its implementation is faster. */
8218
158
        const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
8219
158
        Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8220
158
        Py_UCS4 maxchar = writer->maxchar;
8221
8222
158
        assert (writer->kind == PyUnicode_1BYTE_KIND);
8223
2.39k
        while (s < e) {
8224
2.23k
            ch = *s;
8225
2.23k
            x = mapdata_ucs1[ch];
8226
2.23k
            if (x > maxchar) {
8227
147
                if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
8228
0
                    goto onError;
8229
147
                maxchar = writer->maxchar;
8230
147
                outdata = (Py_UCS1 *)writer->data;
8231
147
            }
8232
2.23k
            outdata[writer->pos] = x;
8233
2.23k
            writer->pos++;
8234
2.23k
            ++s;
8235
2.23k
        }
8236
158
        return 0;
8237
158
    }
8238
8239
99.0k
    while (s < e) {
8240
85.7k
        if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
8241
85.7k
            int outkind = writer->kind;
8242
85.7k
            const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
8243
85.7k
            if (outkind == PyUnicode_1BYTE_KIND) {
8244
45.8k
                Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8245
45.8k
                Py_UCS4 maxchar = writer->maxchar;
8246
293k
                while (s < e) {
8247
291k
                    ch = *s;
8248
291k
                    x = mapdata_ucs2[ch];
8249
291k
                    if (x > maxchar)
8250
43.7k
                        goto Error;
8251
247k
                    outdata[writer->pos] = x;
8252
247k
                    writer->pos++;
8253
247k
                    ++s;
8254
247k
                }
8255
2.09k
                break;
8256
45.8k
            }
8257
39.8k
            else if (outkind == PyUnicode_2BYTE_KIND) {
8258
39.8k
                Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
8259
1.23M
                while (s < e) {
8260
1.22M
                    ch = *s;
8261
1.22M
                    x = mapdata_ucs2[ch];
8262
1.22M
                    if (x == 0xFFFE)
8263
31.6k
                        goto Error;
8264
1.19M
                    outdata[writer->pos] = x;
8265
1.19M
                    writer->pos++;
8266
1.19M
                    ++s;
8267
1.19M
                }
8268
8.25k
                break;
8269
39.8k
            }
8270
85.7k
        }
8271
0
        ch = *s;
8272
8273
0
        if (ch < maplen)
8274
0
            x = PyUnicode_READ(mapkind, mapdata, ch);
8275
0
        else
8276
0
            x = 0xfffe; /* invalid value */
8277
75.4k
Error:
8278
75.4k
        if (x == 0xfffe)
8279
51.4k
        {
8280
            /* undefined mapping */
8281
51.4k
            startinpos = s-starts;
8282
51.4k
            endinpos = startinpos+1;
8283
51.4k
            if (unicode_decode_call_errorhandler_writer(
8284
51.4k
                    errors, &errorHandler,
8285
51.4k
                    "charmap", "character maps to <undefined>",
8286
51.4k
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
8287
51.4k
                    writer)) {
8288
17
                goto onError;
8289
17
            }
8290
51.4k
            continue;
8291
51.4k
        }
8292
8293
23.9k
        if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8294
0
            goto onError;
8295
23.9k
        ++s;
8296
23.9k
    }
8297
23.6k
    Py_XDECREF(errorHandler);
8298
23.6k
    Py_XDECREF(exc);
8299
23.6k
    return 0;
8300
8301
17
onError:
8302
17
    Py_XDECREF(errorHandler);
8303
17
    Py_XDECREF(exc);
8304
17
    return -1;
8305
23.6k
}
8306
8307
static int
8308
charmap_decode_mapping(const char *s,
8309
                       Py_ssize_t size,
8310
                       PyObject *mapping,
8311
                       const char *errors,
8312
                       _PyUnicodeWriter *writer)
8313
0
{
8314
0
    const char *starts = s;
8315
0
    const char *e;
8316
0
    Py_ssize_t startinpos, endinpos;
8317
0
    PyObject *errorHandler = NULL, *exc = NULL;
8318
0
    unsigned char ch;
8319
0
    PyObject *key, *item = NULL;
8320
8321
0
    e = s + size;
8322
8323
0
    while (s < e) {
8324
0
        ch = *s;
8325
8326
        /* Get mapping (char ordinal -> integer, Unicode char or None) */
8327
0
        key = PyLong_FromLong((long)ch);
8328
0
        if (key == NULL)
8329
0
            goto onError;
8330
8331
0
        int rc = PyMapping_GetOptionalItem(mapping, key, &item);
8332
0
        Py_DECREF(key);
8333
0
        if (rc == 0) {
8334
            /* No mapping found means: mapping is undefined. */
8335
0
            goto Undefined;
8336
0
        }
8337
0
        if (item == NULL) {
8338
0
            if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8339
                /* No mapping found means: mapping is undefined. */
8340
0
                PyErr_Clear();
8341
0
                goto Undefined;
8342
0
            } else
8343
0
                goto onError;
8344
0
        }
8345
8346
        /* Apply mapping */
8347
0
        if (item == Py_None)
8348
0
            goto Undefined;
8349
0
        if (PyLong_Check(item)) {
8350
0
            long value = PyLong_AsLong(item);
8351
0
            if (value == 0xFFFE)
8352
0
                goto Undefined;
8353
0
            if (value < 0 || value > MAX_UNICODE) {
8354
0
                PyErr_Format(PyExc_TypeError,
8355
0
                             "character mapping must be in range(0x%x)",
8356
0
                             (unsigned long)MAX_UNICODE + 1);
8357
0
                goto onError;
8358
0
            }
8359
8360
0
            if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8361
0
                goto onError;
8362
0
        }
8363
0
        else if (PyUnicode_Check(item)) {
8364
0
            if (PyUnicode_GET_LENGTH(item) == 1) {
8365
0
                Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8366
0
                if (value == 0xFFFE)
8367
0
                    goto Undefined;
8368
0
                if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8369
0
                    goto onError;
8370
0
            }
8371
0
            else {
8372
0
                writer->overallocate = 1;
8373
0
                if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8374
0
                    goto onError;
8375
0
            }
8376
0
        }
8377
0
        else {
8378
            /* wrong return value */
8379
0
            PyErr_SetString(PyExc_TypeError,
8380
0
                            "character mapping must return integer, None or str");
8381
0
            goto onError;
8382
0
        }
8383
0
        Py_CLEAR(item);
8384
0
        ++s;
8385
0
        continue;
8386
8387
0
Undefined:
8388
        /* undefined mapping */
8389
0
        Py_CLEAR(item);
8390
0
        startinpos = s-starts;
8391
0
        endinpos = startinpos+1;
8392
0
        if (unicode_decode_call_errorhandler_writer(
8393
0
                errors, &errorHandler,
8394
0
                "charmap", "character maps to <undefined>",
8395
0
                &starts, &e, &startinpos, &endinpos, &exc, &s,
8396
0
                writer)) {
8397
0
            goto onError;
8398
0
        }
8399
0
    }
8400
0
    Py_XDECREF(errorHandler);
8401
0
    Py_XDECREF(exc);
8402
0
    return 0;
8403
8404
0
onError:
8405
0
    Py_XDECREF(item);
8406
0
    Py_XDECREF(errorHandler);
8407
0
    Py_XDECREF(exc);
8408
0
    return -1;
8409
0
}
8410
8411
PyObject *
8412
PyUnicode_DecodeCharmap(const char *s,
8413
                        Py_ssize_t size,
8414
                        PyObject *mapping,
8415
                        const char *errors)
8416
23.7k
{
8417
23.7k
    _PyUnicodeWriter writer;
8418
8419
    /* Default to Latin-1 */
8420
23.7k
    if (mapping == NULL)
8421
0
        return PyUnicode_DecodeLatin1(s, size, errors);
8422
8423
23.7k
    if (size == 0)
8424
0
        _Py_RETURN_UNICODE_EMPTY();
8425
23.7k
    _PyUnicodeWriter_Init(&writer);
8426
23.7k
    writer.min_length = size;
8427
23.7k
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
8428
0
        goto onError;
8429
8430
23.7k
    if (PyUnicode_CheckExact(mapping)) {
8431
23.7k
        if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8432
17
            goto onError;
8433
23.7k
    }
8434
0
    else {
8435
0
        if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8436
0
            goto onError;
8437
0
    }
8438
23.7k
    return _PyUnicodeWriter_Finish(&writer);
8439
8440
17
  onError:
8441
17
    _PyUnicodeWriter_Dealloc(&writer);
8442
17
    return NULL;
8443
23.7k
}
8444
8445
/* Charmap encoding: the lookup table */
8446
8447
/*[clinic input]
8448
class EncodingMap "struct encoding_map *" "&EncodingMapType"
8449
[clinic start generated code]*/
8450
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=14e46bbb6c522d22]*/
8451
8452
struct encoding_map {
8453
    PyObject_HEAD
8454
    unsigned char level1[32];
8455
    int count2, count3;
8456
    unsigned char level23[1];
8457
};
8458
8459
/*[clinic input]
8460
EncodingMap.size
8461
8462
Return the size (in bytes) of this object.
8463
[clinic start generated code]*/
8464
8465
static PyObject *
8466
EncodingMap_size_impl(struct encoding_map *self)
8467
/*[clinic end generated code: output=c4c969e4c99342a4 input=004ff13f26bb5366]*/
8468
0
{
8469
0
    return PyLong_FromLong((sizeof(*self) - 1) + 16*self->count2 +
8470
0
                           128*self->count3);
8471
0
}
8472
8473
static PyMethodDef encoding_map_methods[] = {
8474
    ENCODINGMAP_SIZE_METHODDEF
8475
    {NULL, NULL}
8476
};
8477
8478
static PyTypeObject EncodingMapType = {
8479
    PyVarObject_HEAD_INIT(NULL, 0)
8480
    .tp_name = "EncodingMap",
8481
    .tp_basicsize = sizeof(struct encoding_map),
8482
    /* methods */
8483
    .tp_flags = Py_TPFLAGS_DEFAULT,
8484
    .tp_methods = encoding_map_methods,
8485
};
8486
8487
PyObject*
8488
PyUnicode_BuildEncodingMap(PyObject* string)
8489
112
{
8490
112
    PyObject *result;
8491
112
    struct encoding_map *mresult;
8492
112
    int i;
8493
112
    int need_dict = 0;
8494
112
    unsigned char level1[32];
8495
112
    unsigned char level2[512];
8496
112
    unsigned char *mlevel1, *mlevel2, *mlevel3;
8497
112
    int count2 = 0, count3 = 0;
8498
112
    int kind;
8499
112
    const void *data;
8500
112
    int length;
8501
112
    Py_UCS4 ch;
8502
8503
112
    if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
8504
0
        PyErr_BadArgument();
8505
0
        return NULL;
8506
0
    }
8507
112
    kind = PyUnicode_KIND(string);
8508
112
    data = PyUnicode_DATA(string);
8509
112
    length = (int)Py_MIN(PyUnicode_GET_LENGTH(string), 256);
8510
112
    memset(level1, 0xFF, sizeof level1);
8511
112
    memset(level2, 0xFF, sizeof level2);
8512
8513
    /* If there isn't a one-to-one mapping of NULL to \0,
8514
       or if there are non-BMP characters, we need to use
8515
       a mapping dictionary. */
8516
112
    if (PyUnicode_READ(kind, data, 0) != 0)
8517
0
        need_dict = 1;
8518
28.6k
    for (i = 1; i < length; i++) {
8519
28.5k
        int l1, l2;
8520
28.5k
        ch = PyUnicode_READ(kind, data, i);
8521
28.5k
        if (ch == 0 || ch > 0xFFFF) {
8522
0
            need_dict = 1;
8523
0
            break;
8524
0
        }
8525
28.5k
        if (ch == 0xFFFE)
8526
            /* unmapped character */
8527
720
            continue;
8528
27.8k
        l1 = ch >> 11;
8529
27.8k
        l2 = ch >> 7;
8530
27.8k
        if (level1[l1] == 0xFF)
8531
203
            level1[l1] = count2++;
8532
27.8k
        if (level2[l2] == 0xFF)
8533
609
            level2[l2] = count3++;
8534
27.8k
    }
8535
8536
112
    if (count2 >= 0xFF || count3 >= 0xFF)
8537
0
        need_dict = 1;
8538
8539
112
    if (need_dict) {
8540
0
        PyObject *result = PyDict_New();
8541
0
        if (!result)
8542
0
            return NULL;
8543
0
        for (i = 0; i < length; i++) {
8544
0
            Py_UCS4 c = PyUnicode_READ(kind, data, i);
8545
0
            PyObject *key = PyLong_FromLong(c);
8546
0
            if (key == NULL) {
8547
0
                Py_DECREF(result);
8548
0
                return NULL;
8549
0
            }
8550
0
            PyObject *value = PyLong_FromLong(i);
8551
0
            if (value == NULL) {
8552
0
                Py_DECREF(key);
8553
0
                Py_DECREF(result);
8554
0
                return NULL;
8555
0
            }
8556
0
            int rc = PyDict_SetItem(result, key, value);
8557
0
            Py_DECREF(key);
8558
0
            Py_DECREF(value);
8559
0
            if (rc < 0) {
8560
0
                Py_DECREF(result);
8561
0
                return NULL;
8562
0
            }
8563
0
        }
8564
0
        return result;
8565
0
    }
8566
8567
    /* Create a three-level trie */
8568
112
    result = PyObject_Malloc(sizeof(struct encoding_map) +
8569
112
                             16*count2 + 128*count3 - 1);
8570
112
    if (!result) {
8571
0
        return PyErr_NoMemory();
8572
0
    }
8573
8574
112
    _PyObject_Init(result, &EncodingMapType);
8575
112
    mresult = (struct encoding_map*)result;
8576
112
    mresult->count2 = count2;
8577
112
    mresult->count3 = count3;
8578
112
    mlevel1 = mresult->level1;
8579
112
    mlevel2 = mresult->level23;
8580
112
    mlevel3 = mresult->level23 + 16*count2;
8581
112
    memcpy(mlevel1, level1, 32);
8582
112
    memset(mlevel2, 0xFF, 16*count2);
8583
112
    memset(mlevel3, 0, 128*count3);
8584
112
    count3 = 0;
8585
28.6k
    for (i = 1; i < length; i++) {
8586
28.5k
        int o1, o2, o3, i2, i3;
8587
28.5k
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8588
28.5k
        if (ch == 0xFFFE)
8589
            /* unmapped character */
8590
720
            continue;
8591
27.8k
        o1 = ch>>11;
8592
27.8k
        o2 = (ch>>7) & 0xF;
8593
27.8k
        i2 = 16*mlevel1[o1] + o2;
8594
27.8k
        if (mlevel2[i2] == 0xFF)
8595
609
            mlevel2[i2] = count3++;
8596
27.8k
        o3 = ch & 0x7F;
8597
27.8k
        i3 = 128*mlevel2[i2] + o3;
8598
27.8k
        mlevel3[i3] = i;
8599
27.8k
    }
8600
112
    return result;
8601
112
}
8602
8603
static int
8604
encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
8605
0
{
8606
0
    struct encoding_map *map = (struct encoding_map*)mapping;
8607
0
    int l1 = c>>11;
8608
0
    int l2 = (c>>7) & 0xF;
8609
0
    int l3 = c & 0x7F;
8610
0
    int i;
8611
8612
0
    if (c > 0xFFFF)
8613
0
        return -1;
8614
0
    if (c == 0)
8615
0
        return 0;
8616
    /* level 1*/
8617
0
    i = map->level1[l1];
8618
0
    if (i == 0xFF) {
8619
0
        return -1;
8620
0
    }
8621
    /* level 2*/
8622
0
    i = map->level23[16*i+l2];
8623
0
    if (i == 0xFF) {
8624
0
        return -1;
8625
0
    }
8626
    /* level 3 */
8627
0
    i = map->level23[16*map->count2 + 128*i + l3];
8628
0
    if (i == 0) {
8629
0
        return -1;
8630
0
    }
8631
0
    return i;
8632
0
}
8633
8634
/* Lookup the character in the mapping.
8635
   On success, return PyLong, PyBytes or None (if the character can't be found).
8636
   If the result is PyLong, put its value in replace.
8637
   On error, return NULL.
8638
   */
8639
static PyObject *
8640
charmapencode_lookup(Py_UCS4 c, PyObject *mapping, unsigned char *replace)
8641
0
{
8642
0
    PyObject *w = PyLong_FromLong((long)c);
8643
0
    PyObject *x;
8644
8645
0
    if (w == NULL)
8646
0
        return NULL;
8647
0
    int rc = PyMapping_GetOptionalItem(mapping, w, &x);
8648
0
    Py_DECREF(w);
8649
0
    if (rc == 0) {
8650
        /* No mapping found means: mapping is undefined. */
8651
0
        Py_RETURN_NONE;
8652
0
    }
8653
0
    if (x == NULL) {
8654
0
        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8655
            /* No mapping found means: mapping is undefined. */
8656
0
            PyErr_Clear();
8657
0
            Py_RETURN_NONE;
8658
0
        } else
8659
0
            return NULL;
8660
0
    }
8661
0
    else if (x == Py_None)
8662
0
        return x;
8663
0
    else if (PyLong_Check(x)) {
8664
0
        long value = PyLong_AsLong(x);
8665
0
        if (value < 0 || value > 255) {
8666
0
            PyErr_SetString(PyExc_TypeError,
8667
0
                            "character mapping must be in range(256)");
8668
0
            Py_DECREF(x);
8669
0
            return NULL;
8670
0
        }
8671
0
        *replace = (unsigned char)value;
8672
0
        return x;
8673
0
    }
8674
0
    else if (PyBytes_Check(x))
8675
0
        return x;
8676
0
    else {
8677
        /* wrong return value */
8678
0
        PyErr_Format(PyExc_TypeError,
8679
0
                     "character mapping must return integer, bytes or None, not %.400s",
8680
0
                     Py_TYPE(x)->tp_name);
8681
0
        Py_DECREF(x);
8682
0
        return NULL;
8683
0
    }
8684
0
}
8685
8686
static int
8687
charmapencode_resize(PyBytesWriter *writer, Py_ssize_t *outpos, Py_ssize_t requiredsize)
8688
0
{
8689
0
    Py_ssize_t outsize = PyBytesWriter_GetSize(writer);
8690
    /* exponentially overallocate to minimize reallocations */
8691
0
    if (requiredsize < 2 * outsize)
8692
0
        requiredsize = 2 * outsize;
8693
0
    return PyBytesWriter_Resize(writer, requiredsize);
8694
0
}
8695
8696
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* the lookup or a resize failed */
} charmapencode_result;
8699
/* Look up the character, put the result in the output buffer and advance
   *outpos. The output buffer is resized if not enough space is available.
   Return enc_SUCCESS if the result was written, enc_FAILED if the mapping
   is undefined for the character (in which case nothing was written), or
   enc_EXCEPTION if the lookup or a reallocation failed. */
8705
static charmapencode_result
8706
charmapencode_output(Py_UCS4 c, PyObject *mapping,
8707
                     PyBytesWriter *writer, Py_ssize_t *outpos)
8708
0
{
8709
0
    PyObject *rep;
8710
0
    unsigned char replace;
8711
0
    char *outstart;
8712
0
    Py_ssize_t outsize = _PyBytesWriter_GetSize(writer);
8713
8714
0
    if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8715
0
        int res = encoding_map_lookup(c, mapping);
8716
0
        Py_ssize_t requiredsize = *outpos+1;
8717
0
        if (res == -1) {
8718
0
            return enc_FAILED;
8719
0
        }
8720
8721
0
        if (outsize<requiredsize) {
8722
0
            if (charmapencode_resize(writer, outpos, requiredsize)) {
8723
0
                return enc_EXCEPTION;
8724
0
            }
8725
0
        }
8726
0
        outstart = _PyBytesWriter_GetData(writer);
8727
0
        outstart[(*outpos)++] = (char)res;
8728
0
        return enc_SUCCESS;
8729
0
    }
8730
8731
0
    rep = charmapencode_lookup(c, mapping, &replace);
8732
0
    if (rep==NULL)
8733
0
        return enc_EXCEPTION;
8734
0
    else if (rep==Py_None) {
8735
0
        Py_DECREF(rep);
8736
0
        return enc_FAILED;
8737
0
    } else {
8738
0
        if (PyLong_Check(rep)) {
8739
0
            Py_ssize_t requiredsize = *outpos+1;
8740
0
            if (outsize<requiredsize)
8741
0
                if (charmapencode_resize(writer, outpos, requiredsize)) {
8742
0
                    Py_DECREF(rep);
8743
0
                    return enc_EXCEPTION;
8744
0
                }
8745
0
            outstart = _PyBytesWriter_GetData(writer);
8746
0
            outstart[(*outpos)++] = (char)replace;
8747
0
        }
8748
0
        else {
8749
0
            const char *repchars = PyBytes_AS_STRING(rep);
8750
0
            Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8751
0
            Py_ssize_t requiredsize = *outpos+repsize;
8752
0
            if (outsize<requiredsize)
8753
0
                if (charmapencode_resize(writer, outpos, requiredsize)) {
8754
0
                    Py_DECREF(rep);
8755
0
                    return enc_EXCEPTION;
8756
0
                }
8757
0
            outstart = _PyBytesWriter_GetData(writer);
8758
0
            memcpy(outstart + *outpos, repchars, repsize);
8759
0
            *outpos += repsize;
8760
0
        }
8761
0
    }
8762
0
    Py_DECREF(rep);
8763
0
    return enc_SUCCESS;
8764
0
}
8765
8766
/* handle an error in _PyUnicode_EncodeCharmap()
8767
   Return 0 on success, -1 on error */
8768
static int
8769
charmap_encoding_error(
8770
    PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
8771
    PyObject **exceptionObject,
8772
    _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
8773
    PyBytesWriter *writer, Py_ssize_t *respos)
8774
0
{
8775
0
    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8776
0
    Py_ssize_t size, repsize;
8777
0
    Py_ssize_t newpos;
8778
0
    int kind;
8779
0
    const void *data;
8780
0
    Py_ssize_t index;
8781
    /* startpos for collecting unencodable chars */
8782
0
    Py_ssize_t collstartpos = *inpos;
8783
0
    Py_ssize_t collendpos = *inpos+1;
8784
0
    Py_ssize_t collpos;
8785
0
    const char *encoding = "charmap";
8786
0
    const char *reason = "character maps to <undefined>";
8787
0
    charmapencode_result x;
8788
0
    Py_UCS4 ch;
8789
0
    int val;
8790
8791
0
    size = PyUnicode_GET_LENGTH(unicode);
8792
    /* find all unencodable characters */
8793
0
    while (collendpos < size) {
8794
0
        PyObject *rep;
8795
0
        unsigned char replace;
8796
0
        if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8797
0
            ch = PyUnicode_READ_CHAR(unicode, collendpos);
8798
0
            val = encoding_map_lookup(ch, mapping);
8799
0
            if (val != -1)
8800
0
                break;
8801
0
            ++collendpos;
8802
0
            continue;
8803
0
        }
8804
8805
0
        ch = PyUnicode_READ_CHAR(unicode, collendpos);
8806
0
        rep = charmapencode_lookup(ch, mapping, &replace);
8807
0
        if (rep==NULL)
8808
0
            return -1;
8809
0
        else if (rep!=Py_None) {
8810
0
            Py_DECREF(rep);
8811
0
            break;
8812
0
        }
8813
0
        Py_DECREF(rep);
8814
0
        ++collendpos;
8815
0
    }
8816
    /* cache callback name lookup
8817
     * (if not done yet, i.e. it's the first error) */
8818
0
    if (*error_handler == _Py_ERROR_UNKNOWN)
8819
0
        *error_handler = _Py_GetErrorHandler(errors);
8820
8821
0
    switch (*error_handler) {
8822
0
    case _Py_ERROR_STRICT:
8823
0
        raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8824
0
        return -1;
8825
8826
0
    case _Py_ERROR_REPLACE:
8827
0
        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
8828
0
            x = charmapencode_output('?', mapping, writer, respos);
8829
0
            if (x==enc_EXCEPTION) {
8830
0
                return -1;
8831
0
            }
8832
0
            else if (x==enc_FAILED) {
8833
0
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8834
0
                return -1;
8835
0
            }
8836
0
        }
8837
0
        _Py_FALLTHROUGH;
8838
0
    case _Py_ERROR_IGNORE:
8839
0
        *inpos = collendpos;
8840
0
        break;
8841
8842
0
    case _Py_ERROR_XMLCHARREFREPLACE:
8843
        /* generate replacement (temporarily (mis)uses p) */
8844
0
        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
8845
0
            char buffer[2+29+1+1];
8846
0
            char *cp;
8847
0
            sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
8848
0
            for (cp = buffer; *cp; ++cp) {
8849
0
                x = charmapencode_output(*cp, mapping, writer, respos);
8850
0
                if (x==enc_EXCEPTION)
8851
0
                    return -1;
8852
0
                else if (x==enc_FAILED) {
8853
0
                    raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8854
0
                    return -1;
8855
0
                }
8856
0
            }
8857
0
        }
8858
0
        *inpos = collendpos;
8859
0
        break;
8860
8861
0
    default:
8862
0
        repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
8863
0
                                                      encoding, reason, unicode, exceptionObject,
8864
0
                                                      collstartpos, collendpos, &newpos);
8865
0
        if (repunicode == NULL)
8866
0
            return -1;
8867
0
        if (PyBytes_Check(repunicode)) {
8868
            /* Directly copy bytes result to output. */
8869
0
            Py_ssize_t outsize = PyBytesWriter_GetSize(writer);
8870
0
            Py_ssize_t requiredsize;
8871
0
            repsize = PyBytes_Size(repunicode);
8872
0
            requiredsize = *respos + repsize;
8873
0
            if (requiredsize > outsize)
8874
                /* Make room for all additional bytes. */
8875
0
                if (charmapencode_resize(writer, respos, requiredsize)) {
8876
0
                    Py_DECREF(repunicode);
8877
0
                    return -1;
8878
0
                }
8879
0
            memcpy((char*)PyBytesWriter_GetData(writer) + *respos,
8880
0
                   PyBytes_AsString(repunicode),  repsize);
8881
0
            *respos += repsize;
8882
0
            *inpos = newpos;
8883
0
            Py_DECREF(repunicode);
8884
0
            break;
8885
0
        }
8886
        /* generate replacement  */
8887
0
        repsize = PyUnicode_GET_LENGTH(repunicode);
8888
0
        data = PyUnicode_DATA(repunicode);
8889
0
        kind = PyUnicode_KIND(repunicode);
8890
0
        for (index = 0; index < repsize; index++) {
8891
0
            Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8892
0
            x = charmapencode_output(repch, mapping, writer, respos);
8893
0
            if (x==enc_EXCEPTION) {
8894
0
                Py_DECREF(repunicode);
8895
0
                return -1;
8896
0
            }
8897
0
            else if (x==enc_FAILED) {
8898
0
                Py_DECREF(repunicode);
8899
0
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8900
0
                return -1;
8901
0
            }
8902
0
        }
8903
0
        *inpos = newpos;
8904
0
        Py_DECREF(repunicode);
8905
0
    }
8906
0
    return 0;
8907
0
}
8908
8909
/* Encode `unicode` to a bytes object using `mapping`.

   A NULL mapping selects the Latin-1 encoder.  An EncodingMap instance is
   handled by a dedicated loop that looks bytes up directly; any other
   mapping goes through charmapencode_output().  Unencodable characters are
   passed to charmap_encoding_error() together with `errors`.

   Returns a new bytes object, or NULL on error. */
PyObject *
_PyUnicode_EncodeCharmap(PyObject *unicode,
                         PyObject *mapping,
                         const char *errors)
{
    if (mapping == NULL) {
        /* Default to Latin-1 */
        return unicode_encode_ucs1(unicode, errors, 256);
    }

    const Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
    if (size == 0) {
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
    }
    const void *data = PyUnicode_DATA(unicode);
    const int kind = PyUnicode_KIND(unicode);

    /* error handling state, shared by every call to the error helper */
    PyObject *error_handler_obj = NULL;
    PyObject *exc = NULL;
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
    /* current input and output positions */
    Py_ssize_t inpos = 0;
    Py_ssize_t respos = 0;

    /* Allocate enough for a simple encoding without replacements; the
       buffer is resized when more room is needed. */
    PyBytesWriter *writer = PyBytesWriter_Create(size);
    if (writer == NULL) {
        goto onError;
    }

    if (Py_IS_TYPE(mapping, &EncodingMapType)) {
        /* Cache the buffer pointer and size; both are refreshed after
           anything that may have resized the writer. */
        char *outstart = _PyBytesWriter_GetData(writer);
        Py_ssize_t outsize = _PyBytesWriter_GetSize(writer);

        while (inpos < size) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
            int res = encoding_map_lookup(ch, mapping);

            if (res != -1) {
                /* encodable: store the byte and move on */
                Py_ssize_t requiredsize = respos + 1;
                if (outsize < requiredsize) {
                    if (charmapencode_resize(writer, &respos, requiredsize)) {
                        goto onError;
                    }
                    outstart = _PyBytesWriter_GetData(writer);
                    outsize = _PyBytesWriter_GetSize(writer);
                }
                outstart[respos++] = (char)res;
                ++inpos;
                continue;
            }

            /* unencodable: the helper advances inpos itself */
            if (charmap_encoding_error(unicode, &inpos, mapping,
                                       &exc,
                                       &error_handler, &error_handler_obj, errors,
                                       writer, &respos)) {
                goto onError;
            }
            outstart = _PyBytesWriter_GetData(writer);
            outsize = _PyBytesWriter_GetSize(writer);
        }
    }
    else {
        while (inpos < size) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);

            switch (charmapencode_output(ch, mapping, writer, &respos)) {
            case enc_EXCEPTION:
                goto onError;
            case enc_FAILED:
                /* unencodable: the helper advances inpos itself */
                if (charmap_encoding_error(unicode, &inpos, mapping,
                                           &exc,
                                           &error_handler, &error_handler_obj, errors,
                                           writer, &respos)) {
                    goto onError;
                }
                break;
            default:
                /* done with this character => adjust input position */
                ++inpos;
                break;
            }
        }
    }

    Py_XDECREF(exc);
    Py_XDECREF(error_handler_obj);

    /* Trim the result to the number of bytes actually written. */
    return PyBytesWriter_FinishWithSize(writer, respos);

  onError:
    PyBytesWriter_Discard(writer);
    Py_XDECREF(exc);
    Py_XDECREF(error_handler_obj);
    return NULL;
}
9017
9018
/* Encode `unicode` with `mapping`, passing NULL as the `errors` argument.
   Reports a bad-argument error and returns NULL when `unicode` is not a str
   or when `mapping` is NULL. */
PyObject *
PyUnicode_AsCharmapString(PyObject *unicode,
                          PyObject *mapping)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (mapping == NULL) {
        PyErr_BadArgument();
        return NULL;
    }
    return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
}
9028
9029
/* create or adjust a UnicodeTranslateError */
9030
static void
9031
make_translate_exception(PyObject **exceptionObject,
9032
                         PyObject *unicode,
9033
                         Py_ssize_t startpos, Py_ssize_t endpos,
9034
                         const char *reason)
9035
0
{
9036
0
    if (*exceptionObject == NULL) {
9037
0
        *exceptionObject = _PyUnicodeTranslateError_Create(
9038
0
            unicode, startpos, endpos, reason);
9039
0
    }
9040
0
    else {
9041
0
        if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
9042
0
            goto onError;
9043
0
        if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
9044
0
            goto onError;
9045
0
        if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
9046
0
            goto onError;
9047
0
        return;
9048
0
      onError:
9049
0
        Py_CLEAR(*exceptionObject);
9050
0
    }
9051
0
}
9052
9053
/* error handling callback helper:
9054
   build arguments, call the callback and check the arguments,
9055
   put the result into newpos and return the replacement string, which
9056
   has to be freed by the caller */
9057
static PyObject *
9058
unicode_translate_call_errorhandler(const char *errors,
9059
                                    PyObject **errorHandler,
9060
                                    const char *reason,
9061
                                    PyObject *unicode, PyObject **exceptionObject,
9062
                                    Py_ssize_t startpos, Py_ssize_t endpos,
9063
                                    Py_ssize_t *newpos)
9064
0
{
9065
0
    static const char *argparse = "Un;translating error handler must return (str, int) tuple";
9066
9067
0
    Py_ssize_t i_newpos;
9068
0
    PyObject *restuple;
9069
0
    PyObject *resunicode;
9070
9071
0
    if (*errorHandler == NULL) {
9072
0
        *errorHandler = PyCodec_LookupError(errors);
9073
0
        if (*errorHandler == NULL)
9074
0
            return NULL;
9075
0
    }
9076
9077
0
    make_translate_exception(exceptionObject,
9078
0
                             unicode, startpos, endpos, reason);
9079
0
    if (*exceptionObject == NULL)
9080
0
        return NULL;
9081
9082
0
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
9083
0
    if (restuple == NULL)
9084
0
        return NULL;
9085
0
    if (!PyTuple_Check(restuple)) {
9086
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
9087
0
        Py_DECREF(restuple);
9088
0
        return NULL;
9089
0
    }
9090
0
    if (!PyArg_ParseTuple(restuple, argparse,
9091
0
                          &resunicode, &i_newpos)) {
9092
0
        Py_DECREF(restuple);
9093
0
        return NULL;
9094
0
    }
9095
0
    if (i_newpos<0)
9096
0
        *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
9097
0
    else
9098
0
        *newpos = i_newpos;
9099
0
    if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
9100
0
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
9101
0
        Py_DECREF(restuple);
9102
0
        return NULL;
9103
0
    }
9104
0
    Py_INCREF(resunicode);
9105
0
    Py_DECREF(restuple);
9106
0
    return resunicode;
9107
0
}
9108
9109
/* Lookup the character ch in the mapping and put the result in result,
9110
   which must be decrefed by the caller.
9111
   The result can be PyLong, PyUnicode, None or NULL.
9112
   If the result is PyLong, put its value in replace.
9113
   Return 0 on success, -1 on error */
9114
static int
9115
charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result, Py_UCS4 *replace)
9116
338
{
9117
338
    PyObject *w = PyLong_FromLong((long)c);
9118
338
    PyObject *x;
9119
9120
338
    if (w == NULL)
9121
0
        return -1;
9122
338
    int rc = PyMapping_GetOptionalItem(mapping, w, &x);
9123
338
    Py_DECREF(w);
9124
338
    if (rc == 0) {
9125
        /* No mapping found means: use 1:1 mapping. */
9126
158
        *result = NULL;
9127
158
        return 0;
9128
158
    }
9129
180
    if (x == NULL) {
9130
0
        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
9131
            /* No mapping found means: use 1:1 mapping. */
9132
0
            PyErr_Clear();
9133
0
            *result = NULL;
9134
0
            return 0;
9135
0
        } else
9136
0
            return -1;
9137
0
    }
9138
180
    else if (x == Py_None) {
9139
0
        *result = x;
9140
0
        return 0;
9141
0
    }
9142
180
    else if (PyLong_Check(x)) {
9143
0
        long value = PyLong_AsLong(x);
9144
0
        if (value < 0 || value > MAX_UNICODE) {
9145
0
            PyErr_Format(PyExc_ValueError,
9146
0
                         "character mapping must be in range(0x%x)",
9147
0
                         MAX_UNICODE+1);
9148
0
            Py_DECREF(x);
9149
0
            return -1;
9150
0
        }
9151
0
        *result = x;
9152
0
        *replace = (Py_UCS4)value;
9153
0
        return 0;
9154
0
    }
9155
180
    else if (PyUnicode_Check(x)) {
9156
180
        *result = x;
9157
180
        return 0;
9158
180
    }
9159
0
    else {
9160
        /* wrong return value */
9161
0
        PyErr_SetString(PyExc_TypeError,
9162
0
                        "character mapping must return integer, None or str");
9163
0
        Py_DECREF(x);
9164
0
        return -1;
9165
0
    }
9166
180
}
9167
9168
/* lookup the character, write the result into the writer.
9169
   Return 1 if the result was written into the writer, return 0 if the mapping
9170
   was undefined, raise an exception return -1 on error. */
9171
static int
9172
charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
9173
                        _PyUnicodeWriter *writer)
9174
205
{
9175
205
    PyObject *item;
9176
205
    Py_UCS4 replace;
9177
9178
205
    if (charmaptranslate_lookup(ch, mapping, &item, &replace))
9179
0
        return -1;
9180
9181
205
    if (item == NULL) {
9182
        /* not found => default to 1:1 mapping */
9183
81
        if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
9184
0
            return -1;
9185
0
        }
9186
81
        return 1;
9187
81
    }
9188
9189
124
    if (item == Py_None) {
9190
0
        Py_DECREF(item);
9191
0
        return 0;
9192
0
    }
9193
9194
124
    if (PyLong_Check(item)) {
9195
0
        if (_PyUnicodeWriter_WriteCharInline(writer, replace) < 0) {
9196
0
            Py_DECREF(item);
9197
0
            return -1;
9198
0
        }
9199
0
        Py_DECREF(item);
9200
0
        return 1;
9201
0
    }
9202
9203
124
    if (!PyUnicode_Check(item)) {
9204
0
        Py_DECREF(item);
9205
0
        return -1;
9206
0
    }
9207
9208
124
    if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
9209
0
        Py_DECREF(item);
9210
0
        return -1;
9211
0
    }
9212
9213
124
    Py_DECREF(item);
9214
124
    return 1;
9215
124
}
9216
9217
static int
9218
unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
9219
                              Py_UCS1 *translate)
9220
133
{
9221
133
    PyObject *item = NULL;
9222
133
    Py_UCS4 replace;
9223
133
    int ret = 0;
9224
9225
133
    if (charmaptranslate_lookup(ch, mapping, &item, &replace)) {
9226
0
        return -1;
9227
0
    }
9228
9229
133
    if (item == Py_None) {
9230
        /* deletion */
9231
0
        translate[ch] = 0xfe;
9232
0
    }
9233
133
    else if (item == NULL) {
9234
        /* not found => default to 1:1 mapping */
9235
77
        translate[ch] = ch;
9236
77
        return 1;
9237
77
    }
9238
56
    else if (PyLong_Check(item)) {
9239
0
        if (replace > 127) {
9240
            /* invalid character or character outside ASCII:
9241
               skip the fast translate */
9242
0
            goto exit;
9243
0
        }
9244
0
        translate[ch] = (Py_UCS1)replace;
9245
0
    }
9246
56
    else if (PyUnicode_Check(item)) {
9247
56
        if (PyUnicode_GET_LENGTH(item) != 1)
9248
56
            goto exit;
9249
9250
0
        replace = PyUnicode_READ_CHAR(item, 0);
9251
0
        if (replace > 127)
9252
0
            goto exit;
9253
0
        translate[ch] = (Py_UCS1)replace;
9254
0
    }
9255
0
    else {
9256
        /* not None, NULL, long or unicode */
9257
0
        goto exit;
9258
0
    }
9259
0
    ret = 1;
9260
9261
56
  exit:
9262
56
    Py_DECREF(item);
9263
56
    return ret;
9264
0
}
9265
9266
/* Fast path for ascii => ascii translation. Return 1 if the whole string
9267
   was translated into writer, return 0 if the input string was partially
9268
   translated into writer, raise an exception and return -1 on error. */
9269
static int
9270
unicode_fast_translate(PyObject *input, PyObject *mapping,
9271
                       _PyUnicodeWriter *writer, int ignore,
9272
                       Py_ssize_t *input_pos)
9273
104
{
9274
104
    Py_UCS1 ascii_table[128], ch, ch2;
9275
104
    Py_ssize_t len;
9276
104
    const Py_UCS1 *in, *end;
9277
104
    Py_UCS1 *out;
9278
104
    int res = 0;
9279
9280
104
    len = PyUnicode_GET_LENGTH(input);
9281
9282
104
    memset(ascii_table, 0xff, 128);
9283
9284
104
    in = PyUnicode_1BYTE_DATA(input);
9285
104
    end = in + len;
9286
9287
104
    assert(PyUnicode_IS_ASCII(writer->buffer));
9288
104
    assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9289
104
    out = PyUnicode_1BYTE_DATA(writer->buffer);
9290
9291
195
    for (; in < end; in++) {
9292
147
        ch = *in;
9293
147
        ch2 = ascii_table[ch];
9294
147
        if (ch2 == 0xff) {
9295
133
            int translate = unicode_fast_translate_lookup(mapping, ch,
9296
133
                                                          ascii_table);
9297
133
            if (translate < 0)
9298
0
                return -1;
9299
133
            if (translate == 0)
9300
56
                goto exit;
9301
77
            ch2 = ascii_table[ch];
9302
77
        }
9303
91
        if (ch2 == 0xfe) {
9304
0
            if (ignore)
9305
0
                continue;
9306
0
            goto exit;
9307
0
        }
9308
91
        assert(ch2 < 128);
9309
91
        *out = ch2;
9310
91
        out++;
9311
91
    }
9312
48
    res = 1;
9313
9314
104
exit:
9315
104
    writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
9316
104
    *input_pos = in - PyUnicode_1BYTE_DATA(input);
9317
104
    return res;
9318
48
}
9319
9320
/* Translate `input` character by character using `mapping`.

   ASCII input first tries unicode_fast_translate(); whatever it could not
   handle is processed by the general loop below.  A character whose mapping
   is None starts a run of untranslatable characters: the run is skipped
   when `errors` is "ignore", otherwise it is handed to the error handler
   named by `errors` and the handler's replacement string is written.

   Returns a new str, or NULL with an exception set. */
static PyObject *
_PyUnicode_TranslateCharmap(PyObject *input,
                            PyObject *mapping,
                            const char *errors)
{
    const char *reason = "character maps to <undefined>";
    /* error handler state */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    /* output buffer */
    _PyUnicodeWriter writer;
    /* current input position */
    Py_ssize_t i = 0;

    if (mapping == NULL) {
        PyErr_BadArgument();
        return NULL;
    }

    const void *data = PyUnicode_DATA(input);
    const int kind = PyUnicode_KIND(input);
    const Py_ssize_t size = PyUnicode_GET_LENGTH(input);
    const int ignore = (errors != NULL && strcmp(errors, "ignore") == 0);

    if (size == 0) {
        return PyUnicode_FromObject(input);
    }

    /* Allocate enough for a simple 1:1 translation without replacements;
       the writer grows when more is needed. */
    _PyUnicodeWriter_Init(&writer);
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1) {
        goto onError;
    }

    if (PyUnicode_IS_ASCII(input)) {
        int res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
        if (res < 0) {
            goto onError;
        }
        if (res == 1) {
            return _PyUnicodeWriter_Finish(&writer);
        }
        /* res == 0: continue below from position i */
    }

    while (i < size) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
        int translate = charmaptranslate_output(ch, mapping, &writer);
        if (translate < 0) {
            goto onError;
        }
        if (translate != 0) {
            /* it worked => adjust input pointer */
            ++i;
            continue;
        }

        /* Untranslatable character: collect the whole run
           [collstart, collend) of characters that map to None. */
        Py_ssize_t collstart = i;
        Py_ssize_t collend = i + 1;
        for (; collend < size; ++collend) {
            PyObject *x;
            Py_UCS4 replace;

            ch = PyUnicode_READ(kind, data, collend);
            if (charmaptranslate_lookup(ch, mapping, &x, &replace)) {
                goto onError;
            }
            int untranslatable = (x == Py_None);
            Py_XDECREF(x);
            if (!untranslatable) {
                break;
            }
        }

        if (ignore) {
            i = collend;
            continue;
        }

        Py_ssize_t newpos;
        PyObject *repunicode = unicode_translate_call_errorhandler(
            errors, &errorHandler,
            reason, input, &exc,
            collstart, collend, &newpos);
        if (repunicode == NULL) {
            goto onError;
        }
        int rc = _PyUnicodeWriter_WriteStr(&writer, repunicode);
        Py_DECREF(repunicode);
        if (rc < 0) {
            goto onError;
        }
        i = newpos;
    }

    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return NULL;
}
9436
9437
/* Translate `str` with `mapping` after checking the argument with
   ensure_unicode().  Returns NULL when that check fails; otherwise returns
   the result of _PyUnicode_TranslateCharmap(). */
PyObject *
PyUnicode_Translate(PyObject *str,
                    PyObject *mapping,
                    const char *errors)
{
    if (ensure_unicode(str) < 0) {
        return NULL;
    }

    return _PyUnicode_TranslateCharmap(str, mapping, errors);
}
9446
9447
/* Return an ASCII string derived from `unicode`.

   An ASCII input is returned as is (new reference).  Otherwise each
   character below 127 is copied, each whitespace character becomes ' ' and
   each decimal digit becomes the matching ASCII digit.  The first character
   that is none of these is replaced by '?' and the result is cut right
   after it.

   Returns NULL with an exception set if `unicode` is not a str or the
   result cannot be allocated. */
PyObject *
_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadInternalCall();
        return NULL;
    }
    if (PyUnicode_IS_ASCII(unicode)) {
        /* If the string is already ASCII, just return the same string */
        return Py_NewRef(unicode);
    }

    const Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
    PyObject *result = PyUnicode_New(len, 127);
    if (result == NULL) {
        return NULL;
    }

    Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
    const int kind = PyUnicode_KIND(unicode);
    const void *data = PyUnicode_DATA(unicode);

    for (Py_ssize_t i = 0; i < len; ++i) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);

        /* The bound is 127, so U+007F is handled by the checks below. */
        if (ch < 127) {
            out[i] = ch;
            continue;
        }
        if (Py_UNICODE_ISSPACE(ch)) {
            out[i] = ' ';
            continue;
        }

        const int decimal = Py_UNICODE_TODECIMAL(ch);
        if (decimal >= 0) {
            out[i] = '0' + decimal;
            continue;
        }

        /* Neither space nor digit: mark it and truncate the result here. */
        out[i] = '?';
        out[i+1] = '\0';
        _PyUnicode_LENGTH(result) = i + 1;
        break;
    }

    assert(_PyUnicode_CheckConsistency(result, 1));
    return result;
}
9492
9493
/* --- Helpers ------------------------------------------------------------ */
9494
9495
/* Helper macro to fix up start/end slice values against a length.

   `end` is clamped to `len`; a negative `end` or `start` is taken relative
   to `len` and clamped to 0 if it is still negative afterwards.  `start` is
   not clamped to `len`.

   `start` and `end` must be modifiable lvalues: they are updated in place.
   Every argument is parenthesized in the expansion so that an expression
   argument (e.g. a conditional expression passed as `len`) is evaluated as
   a unit.  Arguments may be evaluated more than once, so they must not
   have side effects. */
#define ADJUST_INDICES(start, end, len) \
    do {                                \
        if ((end) > (len)) {            \
            (end) = (len);              \
        }                               \
        else if ((end) < 0) {           \
            (end) += (len);             \
            if ((end) < 0) {            \
                (end) = 0;              \
            }                           \
        }                               \
        if ((start) < 0) {              \
            (start) += (len);           \
            if ((start) < 0) {          \
                (start) = 0;            \
            }                           \
        }                               \
    } while (0)
9514
9515
static Py_ssize_t
9516
any_find_slice(PyObject* s1, PyObject* s2,
9517
               Py_ssize_t start,
9518
               Py_ssize_t end,
9519
               int direction)
9520
30.8M
{
9521
30.8M
    int kind1, kind2;
9522
30.8M
    const void *buf1, *buf2;
9523
30.8M
    Py_ssize_t len1, len2, result;
9524
9525
30.8M
    kind1 = PyUnicode_KIND(s1);
9526
30.8M
    kind2 = PyUnicode_KIND(s2);
9527
30.8M
    if (kind1 < kind2)
9528
0
        return -1;
9529
9530
30.8M
    len1 = PyUnicode_GET_LENGTH(s1);
9531
30.8M
    len2 = PyUnicode_GET_LENGTH(s2);
9532
30.8M
    ADJUST_INDICES(start, end, len1);
9533
30.8M
    if (end - start < len2)
9534
3.14M
        return -1;
9535
9536
27.7M
    buf1 = PyUnicode_DATA(s1);
9537
27.7M
    buf2 = PyUnicode_DATA(s2);
9538
27.7M
    if (len2 == 1) {
9539
27.6M
        Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9540
27.6M
        result = findchar((const char *)buf1 + kind1*start,
9541
27.6M
                          kind1, end - start, ch, direction);
9542
27.6M
        if (result == -1)
9543
3.75M
            return -1;
9544
23.8M
        else
9545
23.8M
            return start + result;
9546
27.6M
    }
9547
9548
83.3k
    if (kind2 != kind1) {
9549
59.8k
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
9550
59.8k
        if (!buf2)
9551
0
            return -2;
9552
59.8k
    }
9553
9554
83.3k
    if (direction > 0) {
9555
83.3k
        switch (kind1) {
9556
23.4k
        case PyUnicode_1BYTE_KIND:
9557
23.4k
            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9558
7.97k
                result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9559
15.5k
            else
9560
15.5k
                result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9561
23.4k
            break;
9562
30.1k
        case PyUnicode_2BYTE_KIND:
9563
30.1k
            result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9564
30.1k
            break;
9565
29.6k
        case PyUnicode_4BYTE_KIND:
9566
29.6k
            result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9567
29.6k
            break;
9568
0
        default:
9569
0
            Py_UNREACHABLE();
9570
83.3k
        }
9571
83.3k
    }
9572
0
    else {
9573
0
        switch (kind1) {
9574
0
        case PyUnicode_1BYTE_KIND:
9575
0
            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9576
0
                result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9577
0
            else
9578
0
                result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9579
0
            break;
9580
0
        case PyUnicode_2BYTE_KIND:
9581
0
            result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9582
0
            break;
9583
0
        case PyUnicode_4BYTE_KIND:
9584
0
            result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9585
0
            break;
9586
0
        default:
9587
0
            Py_UNREACHABLE();
9588
0
        }
9589
0
    }
9590
9591
83.3k
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
9592
83.3k
    if (kind2 != kind1)
9593
59.8k
        PyMem_Free((void *)buf2);
9594
9595
83.3k
    return result;
9596
83.3k
}
9597
9598
9599
Py_ssize_t
9600
PyUnicode_Count(PyObject *str,
9601
                PyObject *substr,
9602
                Py_ssize_t start,
9603
                Py_ssize_t end)
9604
0
{
9605
0
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9606
0
        return -1;
9607
9608
0
    return unicode_count_impl(str, substr, start, end);
9609
0
}
9610
9611
Py_ssize_t
9612
PyUnicode_Find(PyObject *str,
9613
               PyObject *substr,
9614
               Py_ssize_t start,
9615
               Py_ssize_t end,
9616
               int direction)
9617
0
{
9618
0
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9619
0
        return -2;
9620
9621
0
    return any_find_slice(str, substr, start, end, direction);
9622
0
}
9623
9624
Py_ssize_t
9625
PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9626
                   Py_ssize_t start, Py_ssize_t end,
9627
                   int direction)
9628
570k
{
9629
570k
    int kind;
9630
570k
    Py_ssize_t len, result;
9631
570k
    len = PyUnicode_GET_LENGTH(str);
9632
570k
    ADJUST_INDICES(start, end, len);
9633
570k
    if (end - start < 1)
9634
0
        return -1;
9635
570k
    kind = PyUnicode_KIND(str);
9636
570k
    result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9637
570k
                      kind, end-start, ch, direction);
9638
570k
    if (result == -1)
9639
58.5k
        return -1;
9640
512k
    else
9641
512k
        return start + result;
9642
570k
}
9643
9644
static int
9645
tailmatch(PyObject *self,
9646
          PyObject *substring,
9647
          Py_ssize_t start,
9648
          Py_ssize_t end,
9649
          int direction)
9650
117M
{
9651
117M
    int kind_self;
9652
117M
    int kind_sub;
9653
117M
    const void *data_self;
9654
117M
    const void *data_sub;
9655
117M
    Py_ssize_t offset;
9656
117M
    Py_ssize_t i;
9657
117M
    Py_ssize_t end_sub;
9658
9659
117M
    ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9660
117M
    end -= PyUnicode_GET_LENGTH(substring);
9661
117M
    if (end < start)
9662
10.8M
        return 0;
9663
9664
106M
    if (PyUnicode_GET_LENGTH(substring) == 0)
9665
0
        return 1;
9666
9667
106M
    kind_self = PyUnicode_KIND(self);
9668
106M
    data_self = PyUnicode_DATA(self);
9669
106M
    kind_sub = PyUnicode_KIND(substring);
9670
106M
    data_sub = PyUnicode_DATA(substring);
9671
106M
    end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9672
9673
106M
    if (direction > 0)
9674
7.86M
        offset = end;
9675
98.4M
    else
9676
98.4M
        offset = start;
9677
9678
106M
    if (PyUnicode_READ(kind_self, data_self, offset) ==
9679
106M
        PyUnicode_READ(kind_sub, data_sub, 0) &&
9680
55.0M
        PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9681
55.0M
        PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9682
        /* If both are of the same kind, memcmp is sufficient */
9683
18.4M
        if (kind_self == kind_sub) {
9684
10.9M
            return ! memcmp((char *)data_self +
9685
10.9M
                                (offset * PyUnicode_KIND(substring)),
9686
10.9M
                            data_sub,
9687
10.9M
                            PyUnicode_GET_LENGTH(substring) *
9688
10.9M
                                PyUnicode_KIND(substring));
9689
10.9M
        }
9690
        /* otherwise we have to compare each character by first accessing it */
9691
7.42M
        else {
9692
            /* We do not need to compare 0 and len(substring)-1 because
9693
               the if statement above ensured already that they are equal
9694
               when we end up here. */
9695
7.48M
            for (i = 1; i < end_sub; ++i) {
9696
58.2k
                if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9697
58.2k
                    PyUnicode_READ(kind_sub, data_sub, i))
9698
4.15k
                    return 0;
9699
58.2k
            }
9700
7.42M
            return 1;
9701
7.42M
        }
9702
18.4M
    }
9703
9704
87.9M
    return 0;
9705
106M
}
9706
9707
Py_ssize_t
9708
PyUnicode_Tailmatch(PyObject *str,
9709
                    PyObject *substr,
9710
                    Py_ssize_t start,
9711
                    Py_ssize_t end,
9712
                    int direction)
9713
0
{
9714
0
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9715
0
        return -1;
9716
9717
0
    return tailmatch(str, substr, start, end, direction);
9718
0
}
9719
9720
/* Build a new string of the same length as `self` (max char 127) whose
   bytes are produced by _Py_bytes_lower() when `lower` is true and by
   _Py_bytes_upper() otherwise.  Returns NULL if the allocation fails. */
static PyObject *
ascii_upper_or_lower(PyObject *self, int lower)
{
    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
    PyObject *res = PyUnicode_New(len, 127);

    if (res == NULL) {
        return NULL;
    }

    const char *src = PyUnicode_DATA(self);
    char *dst = PyUnicode_DATA(res);

    if (lower) {
        _Py_bytes_lower(dst, src, len);
    }
    else {
        _Py_bytes_upper(dst, src, len);
    }
    return res;
}
9738
9739
static Py_UCS4
9740
handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
9741
227k
{
9742
227k
    Py_ssize_t j;
9743
227k
    int final_sigma;
9744
227k
    Py_UCS4 c = 0;   /* initialize to prevent gcc warning */
9745
    /* U+03A3 is in the Final_Sigma context when, it is found like this:
9746
9747
     \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9748
9749
    where ! is a negation and \p{xxx} is a character with property xxx.
9750
    */
9751
459k
    for (j = i - 1; j >= 0; j--) {
9752
457k
        c = PyUnicode_READ(kind, data, j);
9753
457k
        if (!_PyUnicode_IsCaseIgnorable(c))
9754
225k
            break;
9755
457k
    }
9756
227k
    final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9757
227k
    if (final_sigma) {
9758
366k
        for (j = i + 1; j < length; j++) {
9759
359k
            c = PyUnicode_READ(kind, data, j);
9760
359k
            if (!_PyUnicode_IsCaseIgnorable(c))
9761
162k
                break;
9762
359k
        }
9763
169k
        final_sigma = j == length || !_PyUnicode_IsCased(c);
9764
169k
    }
9765
227k
    return (final_sigma) ? 0x3C2 : 0x3C3;
9766
227k
}
9767
9768
static int
9769
lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
9770
           Py_UCS4 c, Py_UCS4 *mapped)
9771
74.2M
{
9772
    /* Obscure special case. */
9773
74.2M
    if (c == 0x3A3) {
9774
227k
        mapped[0] = handle_capital_sigma(kind, data, length, i);
9775
227k
        return 1;
9776
227k
    }
9777
74.0M
    return _PyUnicode_ToLowerFull(c, mapped);
9778
74.2M
}
9779
9780
static Py_ssize_t
9781
do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9782
0
{
9783
0
    Py_ssize_t i, k = 0;
9784
0
    int n_res, j;
9785
0
    Py_UCS4 c, mapped[3];
9786
9787
0
    c = PyUnicode_READ(kind, data, 0);
9788
0
    n_res = _PyUnicode_ToTitleFull(c, mapped);
9789
0
    for (j = 0; j < n_res; j++) {
9790
0
        *maxchar = Py_MAX(*maxchar, mapped[j]);
9791
0
        res[k++] = mapped[j];
9792
0
    }
9793
0
    for (i = 1; i < length; i++) {
9794
0
        c = PyUnicode_READ(kind, data, i);
9795
0
        n_res = lower_ucs4(kind, data, length, i, c, mapped);
9796
0
        for (j = 0; j < n_res; j++) {
9797
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9798
0
            res[k++] = mapped[j];
9799
0
        }
9800
0
    }
9801
0
    return k;
9802
0
}
9803
9804
static Py_ssize_t
9805
0
do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9806
0
    Py_ssize_t i, k = 0;
9807
9808
0
    for (i = 0; i < length; i++) {
9809
0
        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9810
0
        int n_res, j;
9811
0
        if (Py_UNICODE_ISUPPER(c)) {
9812
0
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9813
0
        }
9814
0
        else if (Py_UNICODE_ISLOWER(c)) {
9815
0
            n_res = _PyUnicode_ToUpperFull(c, mapped);
9816
0
        }
9817
0
        else {
9818
0
            n_res = 1;
9819
0
            mapped[0] = c;
9820
0
        }
9821
0
        for (j = 0; j < n_res; j++) {
9822
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9823
0
            res[k++] = mapped[j];
9824
0
        }
9825
0
    }
9826
0
    return k;
9827
0
}
9828
9829
static Py_ssize_t
9830
do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
9831
                  Py_UCS4 *maxchar, int lower)
9832
5.40M
{
9833
5.40M
    Py_ssize_t i, k = 0;
9834
9835
79.6M
    for (i = 0; i < length; i++) {
9836
74.2M
        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9837
74.2M
        int n_res, j;
9838
74.2M
        if (lower)
9839
74.2M
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9840
0
        else
9841
0
            n_res = _PyUnicode_ToUpperFull(c, mapped);
9842
148M
        for (j = 0; j < n_res; j++) {
9843
74.2M
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9844
74.2M
            res[k++] = mapped[j];
9845
74.2M
        }
9846
74.2M
    }
9847
5.40M
    return k;
9848
5.40M
}
9849
9850
static Py_ssize_t
9851
do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9852
0
{
9853
0
    return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9854
0
}
9855
9856
static Py_ssize_t
9857
do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9858
5.40M
{
9859
5.40M
    return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9860
5.40M
}
9861
9862
static Py_ssize_t
9863
do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9864
0
{
9865
0
    Py_ssize_t i, k = 0;
9866
9867
0
    for (i = 0; i < length; i++) {
9868
0
        Py_UCS4 c = PyUnicode_READ(kind, data, i);
9869
0
        Py_UCS4 mapped[3];
9870
0
        int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9871
0
        for (j = 0; j < n_res; j++) {
9872
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9873
0
            res[k++] = mapped[j];
9874
0
        }
9875
0
    }
9876
0
    return k;
9877
0
}
9878
9879
static Py_ssize_t
9880
do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9881
0
{
9882
0
    Py_ssize_t i, k = 0;
9883
0
    int previous_is_cased;
9884
9885
0
    previous_is_cased = 0;
9886
0
    for (i = 0; i < length; i++) {
9887
0
        const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9888
0
        Py_UCS4 mapped[3];
9889
0
        int n_res, j;
9890
9891
0
        if (previous_is_cased)
9892
0
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9893
0
        else
9894
0
            n_res = _PyUnicode_ToTitleFull(c, mapped);
9895
9896
0
        for (j = 0; j < n_res; j++) {
9897
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9898
0
            res[k++] = mapped[j];
9899
0
        }
9900
9901
0
        previous_is_cased = _PyUnicode_IsCased(c);
9902
0
    }
9903
0
    return k;
9904
0
}
9905
9906
static PyObject *
9907
case_operation(PyObject *self,
9908
               Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9909
5.40M
{
9910
5.40M
    PyObject *res = NULL;
9911
5.40M
    Py_ssize_t length, newlength = 0;
9912
5.40M
    int kind, outkind;
9913
5.40M
    const void *data;
9914
5.40M
    void *outdata;
9915
5.40M
    Py_UCS4 maxchar = 0, *tmp, *tmpend;
9916
9917
5.40M
    kind = PyUnicode_KIND(self);
9918
5.40M
    data = PyUnicode_DATA(self);
9919
5.40M
    length = PyUnicode_GET_LENGTH(self);
9920
5.40M
    if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
9921
0
        PyErr_SetString(PyExc_OverflowError, "string is too long");
9922
0
        return NULL;
9923
0
    }
9924
5.40M
    tmp = PyMem_Malloc(sizeof(Py_UCS4) * 3 * length);
9925
5.40M
    if (tmp == NULL)
9926
0
        return PyErr_NoMemory();
9927
5.40M
    newlength = perform(kind, data, length, tmp, &maxchar);
9928
5.40M
    res = PyUnicode_New(newlength, maxchar);
9929
5.40M
    if (res == NULL)
9930
0
        goto leave;
9931
5.40M
    tmpend = tmp + newlength;
9932
5.40M
    outdata = PyUnicode_DATA(res);
9933
5.40M
    outkind = PyUnicode_KIND(res);
9934
5.40M
    switch (outkind) {
9935
221k
    case PyUnicode_1BYTE_KIND:
9936
221k
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9937
221k
        break;
9938
5.14M
    case PyUnicode_2BYTE_KIND:
9939
5.14M
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9940
5.14M
        break;
9941
43.1k
    case PyUnicode_4BYTE_KIND:
9942
43.1k
        memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9943
43.1k
        break;
9944
0
    default:
9945
0
        Py_UNREACHABLE();
9946
5.40M
    }
9947
5.40M
  leave:
9948
5.40M
    PyMem_Free(tmp);
9949
5.40M
    return res;
9950
5.40M
}
9951
9952
/* Join the items of `seq` with `separator`.  The iterable is first
   converted with PySequence_Fast(); the join itself is performed by
   _PyUnicode_JoinArray() inside a critical section on `seq`.
   Returns NULL if the conversion fails; otherwise returns whatever
   _PyUnicode_JoinArray() returns. */
PyObject *
PyUnicode_Join(PyObject *separator, PyObject *seq)
{
    PyObject *joined;
    PyObject *fast = PySequence_Fast(seq, "can only join an iterable");

    if (fast == NULL) {
        return NULL;
    }

    Py_BEGIN_CRITICAL_SECTION_SEQUENCE_FAST(seq);

    PyObject **elems = PySequence_Fast_ITEMS(fast);
    Py_ssize_t count = PySequence_Fast_GET_SIZE(fast);
    joined = _PyUnicode_JoinArray(separator, elems, count);

    Py_END_CRITICAL_SECTION_SEQUENCE_FAST();

    Py_DECREF(fast);
    return joined;
}
9976
9977
PyObject *
9978
_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
9979
73.9M
{
9980
73.9M
    PyObject *res = NULL; /* the result */
9981
73.9M
    PyObject *sep = NULL;
9982
73.9M
    Py_ssize_t seplen;
9983
73.9M
    PyObject *item;
9984
73.9M
    Py_ssize_t sz, i, res_offset;
9985
73.9M
    Py_UCS4 maxchar;
9986
73.9M
    Py_UCS4 item_maxchar;
9987
73.9M
    int use_memcpy;
9988
73.9M
    unsigned char *res_data = NULL, *sep_data = NULL;
9989
73.9M
    PyObject *last_obj;
9990
73.9M
    int kind = 0;
9991
9992
    /* If empty sequence, return u"". */
9993
73.9M
    if (seqlen == 0) {
9994
7.15M
        _Py_RETURN_UNICODE_EMPTY();
9995
7.15M
    }
9996
9997
    /* If singleton sequence with an exact Unicode, return that. */
9998
66.8M
    last_obj = NULL;
9999
66.8M
    if (seqlen == 1) {
10000
8.00M
        if (PyUnicode_CheckExact(items[0])) {
10001
6.26M
            res = items[0];
10002
6.26M
            return Py_NewRef(res);
10003
6.26M
        }
10004
1.73M
        seplen = 0;
10005
1.73M
        maxchar = 0;
10006
1.73M
    }
10007
58.8M
    else {
10008
        /* Set up sep and seplen */
10009
58.8M
        if (separator == NULL) {
10010
            /* fall back to a blank space separator */
10011
0
            sep = PyUnicode_FromOrdinal(' ');
10012
0
            if (!sep)
10013
0
                goto onError;
10014
0
            seplen = 1;
10015
0
            maxchar = 32;
10016
0
        }
10017
58.8M
        else {
10018
58.8M
            if (!PyUnicode_Check(separator)) {
10019
0
                PyErr_Format(PyExc_TypeError,
10020
0
                             "separator: expected str instance,"
10021
0
                             " %.80s found",
10022
0
                             Py_TYPE(separator)->tp_name);
10023
0
                goto onError;
10024
0
            }
10025
58.8M
            sep = separator;
10026
58.8M
            seplen = PyUnicode_GET_LENGTH(separator);
10027
58.8M
            maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10028
            /* inc refcount to keep this code path symmetric with the
10029
               above case of a blank separator */
10030
58.8M
            Py_INCREF(sep);
10031
58.8M
        }
10032
58.8M
        last_obj = sep;
10033
58.8M
    }
10034
10035
    /* There are at least two things to join, or else we have a subclass
10036
     * of str in the sequence.
10037
     * Do a pre-pass to figure out the total amount of space we'll
10038
     * need (sz), and see whether all argument are strings.
10039
     */
10040
60.5M
    sz = 0;
10041
#ifdef Py_DEBUG
10042
    use_memcpy = 0;
10043
#else
10044
60.5M
    use_memcpy = 1;
10045
60.5M
#endif
10046
457M
    for (i = 0; i < seqlen; i++) {
10047
397M
        size_t add_sz;
10048
397M
        item = items[i];
10049
397M
        if (!PyUnicode_Check(item)) {
10050
0
            PyErr_Format(PyExc_TypeError,
10051
0
                         "sequence item %zd: expected str instance,"
10052
0
                         " %.80s found",
10053
0
                         i, Py_TYPE(item)->tp_name);
10054
0
            goto onError;
10055
0
        }
10056
397M
        add_sz = PyUnicode_GET_LENGTH(item);
10057
397M
        item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
10058
397M
        maxchar = Py_MAX(maxchar, item_maxchar);
10059
397M
        if (i != 0) {
10060
336M
            add_sz += seplen;
10061
336M
        }
10062
397M
        if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
10063
0
            PyErr_SetString(PyExc_OverflowError,
10064
0
                            "join() result is too long for a Python string");
10065
0
            goto onError;
10066
0
        }
10067
397M
        sz += add_sz;
10068
397M
        if (use_memcpy && last_obj != NULL) {
10069
326M
            if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10070
6.60M
                use_memcpy = 0;
10071
326M
        }
10072
397M
        last_obj = item;
10073
397M
    }
10074
10075
60.5M
    res = PyUnicode_New(sz, maxchar);
10076
60.5M
    if (res == NULL)
10077
0
        goto onError;
10078
10079
    /* Catenate everything. */
10080
#ifdef Py_DEBUG
10081
    use_memcpy = 0;
10082
#else
10083
60.5M
    if (use_memcpy) {
10084
53.9M
        res_data = PyUnicode_1BYTE_DATA(res);
10085
53.9M
        kind = PyUnicode_KIND(res);
10086
53.9M
        if (seplen != 0)
10087
17.8k
            sep_data = PyUnicode_1BYTE_DATA(sep);
10088
53.9M
    }
10089
60.5M
#endif
10090
60.5M
    if (use_memcpy) {
10091
348M
        for (i = 0; i < seqlen; ++i) {
10092
294M
            Py_ssize_t itemlen;
10093
294M
            item = items[i];
10094
10095
            /* Copy item, and maybe the separator. */
10096
294M
            if (i && seplen != 0) {
10097
24.5k
                memcpy(res_data,
10098
24.5k
                          sep_data,
10099
24.5k
                          kind * seplen);
10100
24.5k
                res_data += kind * seplen;
10101
24.5k
            }
10102
10103
294M
            itemlen = PyUnicode_GET_LENGTH(item);
10104
294M
            if (itemlen != 0) {
10105
254M
                memcpy(res_data,
10106
254M
                          PyUnicode_DATA(item),
10107
254M
                          kind * itemlen);
10108
254M
                res_data += kind * itemlen;
10109
254M
            }
10110
294M
        }
10111
53.9M
        assert(res_data == PyUnicode_1BYTE_DATA(res)
10112
53.9M
                           + kind * PyUnicode_GET_LENGTH(res));
10113
53.9M
    }
10114
6.60M
    else {
10115
109M
        for (i = 0, res_offset = 0; i < seqlen; ++i) {
10116
102M
            Py_ssize_t itemlen;
10117
102M
            item = items[i];
10118
10119
            /* Copy item, and maybe the separator. */
10120
102M
            if (i && seplen != 0) {
10121
65.1k
                _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10122
65.1k
                res_offset += seplen;
10123
65.1k
            }
10124
10125
102M
            itemlen = PyUnicode_GET_LENGTH(item);
10126
102M
            if (itemlen != 0) {
10127
100M
                _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
10128
100M
                res_offset += itemlen;
10129
100M
            }
10130
102M
        }
10131
6.60M
        assert(res_offset == PyUnicode_GET_LENGTH(res));
10132
6.60M
    }
10133
10134
60.5M
    Py_XDECREF(sep);
10135
60.5M
    assert(_PyUnicode_CheckConsistency(res, 1));
10136
60.5M
    return res;
10137
10138
0
  onError:
10139
0
    Py_XDECREF(sep);
10140
0
    Py_XDECREF(res);
10141
0
    return NULL;
10142
60.5M
}
10143
10144
void
10145
_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10146
                    Py_UCS4 fill_char)
10147
628
{
10148
628
    const int kind = PyUnicode_KIND(unicode);
10149
628
    void *data = PyUnicode_DATA(unicode);
10150
628
    assert(_PyUnicode_IsModifiable(unicode));
10151
628
    assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10152
628
    assert(start >= 0);
10153
628
    assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10154
628
    _PyUnicode_Fill(kind, data, fill_char, start, length);
10155
628
}
10156
10157
Py_ssize_t
10158
PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10159
               Py_UCS4 fill_char)
10160
628
{
10161
628
    Py_ssize_t maxlen;
10162
10163
628
    if (!PyUnicode_Check(unicode)) {
10164
0
        PyErr_BadInternalCall();
10165
0
        return -1;
10166
0
    }
10167
628
    if (unicode_check_modifiable(unicode))
10168
0
        return -1;
10169
10170
628
    if (start < 0) {
10171
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
10172
0
        return -1;
10173
0
    }
10174
628
    if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10175
0
        PyErr_SetString(PyExc_ValueError,
10176
0
                         "fill character is bigger than "
10177
0
                         "the string maximum character");
10178
0
        return -1;
10179
0
    }
10180
10181
628
    maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10182
628
    length = Py_MIN(maxlen, length);
10183
628
    if (length <= 0)
10184
0
        return 0;
10185
10186
628
    _PyUnicode_FastFill(unicode, start, length, fill_char);
10187
628
    return length;
10188
628
}
10189
10190
static PyObject *
10191
pad(PyObject *self,
10192
    Py_ssize_t left,
10193
    Py_ssize_t right,
10194
    Py_UCS4 fill)
10195
0
{
10196
0
    PyObject *u;
10197
0
    Py_UCS4 maxchar;
10198
0
    int kind;
10199
0
    void *data;
10200
10201
0
    if (left < 0)
10202
0
        left = 0;
10203
0
    if (right < 0)
10204
0
        right = 0;
10205
10206
0
    if (left == 0 && right == 0)
10207
0
        return unicode_result_unchanged(self);
10208
10209
0
    if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10210
0
        right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
10211
0
        PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10212
0
        return NULL;
10213
0
    }
10214
0
    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10215
0
    maxchar = Py_MAX(maxchar, fill);
10216
0
    u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
10217
0
    if (!u)
10218
0
        return NULL;
10219
10220
0
    kind = PyUnicode_KIND(u);
10221
0
    data = PyUnicode_DATA(u);
10222
0
    if (left)
10223
0
        _PyUnicode_Fill(kind, data, fill, 0, left);
10224
0
    if (right)
10225
0
        _PyUnicode_Fill(kind, data, fill,
10226
0
                        left + _PyUnicode_LENGTH(self), right);
10227
0
    _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
10228
0
    assert(_PyUnicode_CheckConsistency(u, 1));
10229
0
    return u;
10230
0
}
10231
10232
/* Split `string` into lines by dispatching to the splitlines routine that
   matches its kind (ASCII, UCS1, UCS2 or UCS4).  `keepends` is passed
   through unchanged.  Returns NULL if `string` fails the ensure_unicode()
   check. */
PyObject *
PyUnicode_Splitlines(PyObject *string, int keepends)
{
    PyObject *lines;

    if (ensure_unicode(string) < 0) {
        return NULL;
    }

    switch (PyUnicode_KIND(string)) {
    case PyUnicode_1BYTE_KIND:
        if (PyUnicode_IS_ASCII(string)) {
            lines = asciilib_splitlines(string,
                                        PyUnicode_1BYTE_DATA(string),
                                        PyUnicode_GET_LENGTH(string),
                                        keepends);
        }
        else {
            lines = ucs1lib_splitlines(string,
                                       PyUnicode_1BYTE_DATA(string),
                                       PyUnicode_GET_LENGTH(string),
                                       keepends);
        }
        break;
    case PyUnicode_2BYTE_KIND:
        lines = ucs2lib_splitlines(string,
                                   PyUnicode_2BYTE_DATA(string),
                                   PyUnicode_GET_LENGTH(string),
                                   keepends);
        break;
    case PyUnicode_4BYTE_KIND:
        lines = ucs4lib_splitlines(string,
                                   PyUnicode_4BYTE_DATA(string),
                                   PyUnicode_GET_LENGTH(string),
                                   keepends);
        break;
    default:
        Py_UNREACHABLE();
    }
    return lines;
}
10266
10267
static PyObject *
10268
split(PyObject *self,
10269
      PyObject *substring,
10270
      Py_ssize_t maxcount)
10271
23.7M
{
10272
23.7M
    int kind1, kind2;
10273
23.7M
    const void *buf1, *buf2;
10274
23.7M
    Py_ssize_t len1, len2;
10275
23.7M
    PyObject* out;
10276
23.7M
    len1 = PyUnicode_GET_LENGTH(self);
10277
23.7M
    kind1 = PyUnicode_KIND(self);
10278
10279
23.7M
    if (substring == NULL) {
10280
173k
        if (maxcount < 0) {
10281
149k
            maxcount = (len1 - 1) / 2 + 1;
10282
149k
        }
10283
173k
        switch (kind1) {
10284
111k
        case PyUnicode_1BYTE_KIND:
10285
111k
            if (PyUnicode_IS_ASCII(self))
10286
86.2k
                return asciilib_split_whitespace(
10287
86.2k
                    self,  PyUnicode_1BYTE_DATA(self),
10288
86.2k
                    len1, maxcount
10289
86.2k
                    );
10290
25.4k
            else
10291
25.4k
                return ucs1lib_split_whitespace(
10292
25.4k
                    self,  PyUnicode_1BYTE_DATA(self),
10293
25.4k
                    len1, maxcount
10294
25.4k
                    );
10295
49.5k
        case PyUnicode_2BYTE_KIND:
10296
49.5k
            return ucs2lib_split_whitespace(
10297
49.5k
                self,  PyUnicode_2BYTE_DATA(self),
10298
49.5k
                len1, maxcount
10299
49.5k
                );
10300
12.4k
        case PyUnicode_4BYTE_KIND:
10301
12.4k
            return ucs4lib_split_whitespace(
10302
12.4k
                self,  PyUnicode_4BYTE_DATA(self),
10303
12.4k
                len1, maxcount
10304
12.4k
                );
10305
0
        default:
10306
0
            Py_UNREACHABLE();
10307
173k
        }
10308
173k
    }
10309
10310
23.5M
    kind2 = PyUnicode_KIND(substring);
10311
23.5M
    len2 = PyUnicode_GET_LENGTH(substring);
10312
23.5M
    if (maxcount < 0) {
10313
        // if len2 == 0, it will raise ValueError.
10314
15.5M
        maxcount = len2 == 0 ? 0 : (len1 / len2) + 1;
10315
        // handle expected overflow case: (Py_SSIZE_T_MAX / 1) + 1
10316
15.5M
        maxcount = maxcount < 0 ? len1 : maxcount;
10317
15.5M
    }
10318
23.5M
    if (kind1 < kind2 || len1 < len2) {
10319
2.82M
        out = PyList_New(1);
10320
2.82M
        if (out == NULL)
10321
0
            return NULL;
10322
2.82M
        PyList_SET_ITEM(out, 0, Py_NewRef(self));
10323
2.82M
        return out;
10324
2.82M
    }
10325
20.7M
    buf1 = PyUnicode_DATA(self);
10326
20.7M
    buf2 = PyUnicode_DATA(substring);
10327
20.7M
    if (kind2 != kind1) {
10328
219k
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
10329
219k
        if (!buf2)
10330
0
            return NULL;
10331
219k
    }
10332
10333
20.7M
    switch (kind1) {
10334
20.4M
    case PyUnicode_1BYTE_KIND:
10335
20.4M
        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10336
19.1M
            out = asciilib_split(
10337
19.1M
                self,  buf1, len1, buf2, len2, maxcount);
10338
1.32M
        else
10339
1.32M
            out = ucs1lib_split(
10340
1.32M
                self,  buf1, len1, buf2, len2, maxcount);
10341
20.4M
        break;
10342
180k
    case PyUnicode_2BYTE_KIND:
10343
180k
        out = ucs2lib_split(
10344
180k
            self,  buf1, len1, buf2, len2, maxcount);
10345
180k
        break;
10346
38.2k
    case PyUnicode_4BYTE_KIND:
10347
38.2k
        out = ucs4lib_split(
10348
38.2k
            self,  buf1, len1, buf2, len2, maxcount);
10349
38.2k
        break;
10350
0
    default:
10351
0
        out = NULL;
10352
20.7M
    }
10353
20.7M
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10354
20.7M
    if (kind2 != kind1)
10355
219k
        PyMem_Free((void *)buf2);
10356
20.7M
    return out;
10357
20.7M
}
10358
10359
static PyObject *
10360
rsplit(PyObject *self,
10361
       PyObject *substring,
10362
       Py_ssize_t maxcount)
10363
50
{
10364
50
    int kind1, kind2;
10365
50
    const void *buf1, *buf2;
10366
50
    Py_ssize_t len1, len2;
10367
50
    PyObject* out;
10368
10369
50
    len1 = PyUnicode_GET_LENGTH(self);
10370
50
    kind1 = PyUnicode_KIND(self);
10371
10372
50
    if (substring == NULL) {
10373
0
        if (maxcount < 0) {
10374
0
            maxcount = (len1 - 1) / 2 + 1;
10375
0
        }
10376
0
        switch (kind1) {
10377
0
        case PyUnicode_1BYTE_KIND:
10378
0
            if (PyUnicode_IS_ASCII(self))
10379
0
                return asciilib_rsplit_whitespace(
10380
0
                    self,  PyUnicode_1BYTE_DATA(self),
10381
0
                    len1, maxcount
10382
0
                    );
10383
0
            else
10384
0
                return ucs1lib_rsplit_whitespace(
10385
0
                    self,  PyUnicode_1BYTE_DATA(self),
10386
0
                    len1, maxcount
10387
0
                    );
10388
0
        case PyUnicode_2BYTE_KIND:
10389
0
            return ucs2lib_rsplit_whitespace(
10390
0
                self,  PyUnicode_2BYTE_DATA(self),
10391
0
                len1, maxcount
10392
0
                );
10393
0
        case PyUnicode_4BYTE_KIND:
10394
0
            return ucs4lib_rsplit_whitespace(
10395
0
                self,  PyUnicode_4BYTE_DATA(self),
10396
0
                len1, maxcount
10397
0
                );
10398
0
        default:
10399
0
            Py_UNREACHABLE();
10400
0
        }
10401
0
    }
10402
50
    kind2 = PyUnicode_KIND(substring);
10403
50
    len2 = PyUnicode_GET_LENGTH(substring);
10404
50
    if (maxcount < 0) {
10405
        // if len2 == 0, it will raise ValueError.
10406
0
        maxcount = len2 == 0 ? 0 : (len1 / len2) + 1;
10407
        // handle expected overflow case: (Py_SSIZE_T_MAX / 1) + 1
10408
0
        maxcount = maxcount < 0 ? len1 : maxcount;
10409
0
    }
10410
50
    if (kind1 < kind2 || len1 < len2) {
10411
0
        out = PyList_New(1);
10412
0
        if (out == NULL)
10413
0
            return NULL;
10414
0
        PyList_SET_ITEM(out, 0, Py_NewRef(self));
10415
0
        return out;
10416
0
    }
10417
50
    buf1 = PyUnicode_DATA(self);
10418
50
    buf2 = PyUnicode_DATA(substring);
10419
50
    if (kind2 != kind1) {
10420
0
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
10421
0
        if (!buf2)
10422
0
            return NULL;
10423
0
    }
10424
10425
50
    switch (kind1) {
10426
50
    case PyUnicode_1BYTE_KIND:
10427
50
        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10428
50
            out = asciilib_rsplit(
10429
50
                self,  buf1, len1, buf2, len2, maxcount);
10430
0
        else
10431
0
            out = ucs1lib_rsplit(
10432
0
                self,  buf1, len1, buf2, len2, maxcount);
10433
50
        break;
10434
0
    case PyUnicode_2BYTE_KIND:
10435
0
        out = ucs2lib_rsplit(
10436
0
            self,  buf1, len1, buf2, len2, maxcount);
10437
0
        break;
10438
0
    case PyUnicode_4BYTE_KIND:
10439
0
        out = ucs4lib_rsplit(
10440
0
            self,  buf1, len1, buf2, len2, maxcount);
10441
0
        break;
10442
0
    default:
10443
0
        out = NULL;
10444
50
    }
10445
50
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10446
50
    if (kind2 != kind1)
10447
0
        PyMem_Free((void *)buf2);
10448
50
    return out;
10449
50
}
10450
10451
static Py_ssize_t
10452
anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10453
            PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
10454
173M
{
10455
173M
    switch (kind) {
10456
33.2M
    case PyUnicode_1BYTE_KIND:
10457
33.2M
        if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10458
28.6M
            return asciilib_find(buf1, len1, buf2, len2, offset);
10459
4.64M
        else
10460
4.64M
            return ucs1lib_find(buf1, len1, buf2, len2, offset);
10461
72.2M
    case PyUnicode_2BYTE_KIND:
10462
72.2M
        return ucs2lib_find(buf1, len1, buf2, len2, offset);
10463
67.7M
    case PyUnicode_4BYTE_KIND:
10464
67.7M
        return ucs4lib_find(buf1, len1, buf2, len2, offset);
10465
173M
    }
10466
173M
    Py_UNREACHABLE();
10467
173M
}
10468
10469
static Py_ssize_t
10470
anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10471
             PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
10472
48.8M
{
10473
48.8M
    switch (kind) {
10474
41.7M
    case PyUnicode_1BYTE_KIND:
10475
41.7M
        return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10476
6.95M
    case PyUnicode_2BYTE_KIND:
10477
6.95M
        return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10478
138k
    case PyUnicode_4BYTE_KIND:
10479
138k
        return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10480
48.8M
    }
10481
48.8M
    Py_UNREACHABLE();
10482
48.8M
}
10483
10484
static void
10485
replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10486
                      Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10487
1.82M
{
10488
1.82M
    int kind = PyUnicode_KIND(u);
10489
1.82M
    void *data = PyUnicode_DATA(u);
10490
1.82M
    Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10491
1.82M
    if (kind == PyUnicode_1BYTE_KIND) {
10492
612k
        ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10493
612k
                                      (Py_UCS1 *)data + len,
10494
612k
                                      u1, u2, maxcount);
10495
612k
    }
10496
1.20M
    else if (kind == PyUnicode_2BYTE_KIND) {
10497
1.18M
        ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10498
1.18M
                                      (Py_UCS2 *)data + len,
10499
1.18M
                                      u1, u2, maxcount);
10500
1.18M
    }
10501
19.9k
    else {
10502
19.9k
        assert(kind == PyUnicode_4BYTE_KIND);
10503
19.9k
        ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10504
19.9k
                                      (Py_UCS4 *)data + len,
10505
19.9k
                                      u1, u2, maxcount);
10506
19.9k
    }
10507
1.82M
}
10508
10509
static PyObject *
10510
replace(PyObject *self, PyObject *str1,
10511
        PyObject *str2, Py_ssize_t maxcount)
10512
86.6M
{
10513
86.6M
    PyObject *u;
10514
86.6M
    const char *sbuf = PyUnicode_DATA(self);
10515
86.6M
    const void *buf1 = PyUnicode_DATA(str1);
10516
86.6M
    const void *buf2 = PyUnicode_DATA(str2);
10517
86.6M
    int srelease = 0, release1 = 0, release2 = 0;
10518
86.6M
    int skind = PyUnicode_KIND(self);
10519
86.6M
    int kind1 = PyUnicode_KIND(str1);
10520
86.6M
    int kind2 = PyUnicode_KIND(str2);
10521
86.6M
    Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10522
86.6M
    Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10523
86.6M
    Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
10524
86.6M
    int mayshrink;
10525
86.6M
    Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
10526
10527
86.6M
    if (slen < len1)
10528
29.6M
        goto nothing;
10529
10530
56.9M
    if (maxcount < 0)
10531
56.9M
        maxcount = PY_SSIZE_T_MAX;
10532
0
    else if (maxcount == 0)
10533
0
        goto nothing;
10534
10535
56.9M
    if (str1 == str2)
10536
0
        goto nothing;
10537
10538
56.9M
    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10539
56.9M
    maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10540
56.9M
    if (maxchar < maxchar_str1)
10541
        /* substring too wide to be present */
10542
0
        goto nothing;
10543
56.9M
    maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10544
    /* Replacing str1 with str2 may cause a maxchar reduction in the
10545
       result string. */
10546
56.9M
    mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
10547
56.9M
    maxchar = Py_MAX(maxchar, maxchar_str2);
10548
10549
56.9M
    if (len1 == len2) {
10550
        /* same length */
10551
8.15M
        if (len1 == 0)
10552
0
            goto nothing;
10553
8.15M
        if (len1 == 1) {
10554
            /* replace characters */
10555
8.15M
            Py_UCS4 u1, u2;
10556
8.15M
            Py_ssize_t pos;
10557
10558
8.15M
            u1 = PyUnicode_READ(kind1, buf1, 0);
10559
8.15M
            pos = findchar(sbuf, skind, slen, u1, 1);
10560
8.15M
            if (pos < 0)
10561
6.33M
                goto nothing;
10562
1.82M
            u2 = PyUnicode_READ(kind2, buf2, 0);
10563
1.82M
            u = PyUnicode_New(slen, maxchar);
10564
1.82M
            if (!u)
10565
0
                goto error;
10566
10567
1.82M
            _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10568
1.82M
            replace_1char_inplace(u, pos, u1, u2, maxcount);
10569
1.82M
        }
10570
0
        else {
10571
0
            int rkind = skind;
10572
0
            char *res;
10573
0
            Py_ssize_t i;
10574
10575
0
            if (kind1 < rkind) {
10576
                /* widen substring */
10577
0
                buf1 = unicode_askind(kind1, buf1, len1, rkind);
10578
0
                if (!buf1) goto error;
10579
0
                release1 = 1;
10580
0
            }
10581
0
            i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
10582
0
            if (i < 0)
10583
0
                goto nothing;
10584
0
            if (rkind > kind2) {
10585
                /* widen replacement */
10586
0
                buf2 = unicode_askind(kind2, buf2, len2, rkind);
10587
0
                if (!buf2) goto error;
10588
0
                release2 = 1;
10589
0
            }
10590
0
            else if (rkind < kind2) {
10591
                /* widen self and buf1 */
10592
0
                rkind = kind2;
10593
0
                if (release1) {
10594
0
                    assert(buf1 != PyUnicode_DATA(str1));
10595
0
                    PyMem_Free((void *)buf1);
10596
0
                    buf1 = PyUnicode_DATA(str1);
10597
0
                    release1 = 0;
10598
0
                }
10599
0
                sbuf = unicode_askind(skind, sbuf, slen, rkind);
10600
0
                if (!sbuf) goto error;
10601
0
                srelease = 1;
10602
0
                buf1 = unicode_askind(kind1, buf1, len1, rkind);
10603
0
                if (!buf1) goto error;
10604
0
                release1 = 1;
10605
0
            }
10606
0
            u = PyUnicode_New(slen, maxchar);
10607
0
            if (!u)
10608
0
                goto error;
10609
0
            assert(PyUnicode_KIND(u) == rkind);
10610
0
            res = PyUnicode_DATA(u);
10611
10612
0
            memcpy(res, sbuf, rkind * slen);
10613
            /* change everything in-place, starting with this one */
10614
0
            memcpy(res + rkind * i,
10615
0
                   buf2,
10616
0
                   rkind * len2);
10617
0
            i += len1;
10618
10619
0
            while ( --maxcount > 0) {
10620
0
                i = anylib_find(rkind, self,
10621
0
                                sbuf+rkind*i, slen-i,
10622
0
                                str1, buf1, len1, i);
10623
0
                if (i == -1)
10624
0
                    break;
10625
0
                memcpy(res + rkind * i,
10626
0
                       buf2,
10627
0
                       rkind * len2);
10628
0
                i += len1;
10629
0
            }
10630
0
        }
10631
8.15M
    }
10632
48.8M
    else {
10633
48.8M
        Py_ssize_t n, i, j, ires;
10634
48.8M
        Py_ssize_t new_size;
10635
48.8M
        int rkind = skind;
10636
48.8M
        char *res;
10637
10638
48.8M
        if (kind1 < rkind) {
10639
            /* widen substring */
10640
7.09M
            buf1 = unicode_askind(kind1, buf1, len1, rkind);
10641
7.09M
            if (!buf1) goto error;
10642
7.09M
            release1 = 1;
10643
7.09M
        }
10644
48.8M
        n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
10645
48.8M
        if (n == 0)
10646
42.9M
            goto nothing;
10647
5.87M
        if (kind2 < rkind) {
10648
            /* widen replacement */
10649
1.45M
            buf2 = unicode_askind(kind2, buf2, len2, rkind);
10650
1.45M
            if (!buf2) goto error;
10651
1.45M
            release2 = 1;
10652
1.45M
        }
10653
4.41M
        else if (kind2 > rkind) {
10654
            /* widen self and buf1 */
10655
0
            rkind = kind2;
10656
0
            sbuf = unicode_askind(skind, sbuf, slen, rkind);
10657
0
            if (!sbuf) goto error;
10658
0
            srelease = 1;
10659
0
            if (release1) {
10660
0
                assert(buf1 != PyUnicode_DATA(str1));
10661
0
                PyMem_Free((void *)buf1);
10662
0
                buf1 = PyUnicode_DATA(str1);
10663
0
                release1 = 0;
10664
0
            }
10665
0
            buf1 = unicode_askind(kind1, buf1, len1, rkind);
10666
0
            if (!buf1) goto error;
10667
0
            release1 = 1;
10668
0
        }
10669
        /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10670
           PyUnicode_GET_LENGTH(str1)); */
10671
5.87M
        if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
10672
0
                PyErr_SetString(PyExc_OverflowError,
10673
0
                                "replace string is too long");
10674
0
                goto error;
10675
0
        }
10676
5.87M
        new_size = slen + n * (len2 - len1);
10677
5.87M
        if (new_size == 0) {
10678
0
            u = _PyUnicode_GetEmpty();
10679
0
            goto done;
10680
0
        }
10681
5.87M
        if (new_size > (PY_SSIZE_T_MAX / rkind)) {
10682
0
            PyErr_SetString(PyExc_OverflowError,
10683
0
                            "replace string is too long");
10684
0
            goto error;
10685
0
        }
10686
5.87M
        u = PyUnicode_New(new_size, maxchar);
10687
5.87M
        if (!u)
10688
0
            goto error;
10689
5.87M
        assert(PyUnicode_KIND(u) == rkind);
10690
5.87M
        res = PyUnicode_DATA(u);
10691
5.87M
        ires = i = 0;
10692
5.87M
        if (len1 > 0) {
10693
179M
            while (n-- > 0) {
10694
                /* look for next match */
10695
173M
                j = anylib_find(rkind, self,
10696
173M
                                sbuf + rkind * i, slen-i,
10697
173M
                                str1, buf1, len1, i);
10698
173M
                if (j == -1)
10699
0
                    break;
10700
173M
                else if (j > i) {
10701
                    /* copy unchanged part [i:j] */
10702
24.9M
                    memcpy(res + rkind * ires,
10703
24.9M
                           sbuf + rkind * i,
10704
24.9M
                           rkind * (j-i));
10705
24.9M
                    ires += j - i;
10706
24.9M
                }
10707
                /* copy substitution string */
10708
173M
                if (len2 > 0) {
10709
173M
                    memcpy(res + rkind * ires,
10710
173M
                           buf2,
10711
173M
                           rkind * len2);
10712
173M
                    ires += len2;
10713
173M
                }
10714
173M
                i = j + len1;
10715
173M
            }
10716
5.87M
            if (i < slen)
10717
                /* copy tail [i:] */
10718
5.77M
                memcpy(res + rkind * ires,
10719
5.77M
                       sbuf + rkind * i,
10720
5.77M
                       rkind * (slen-i));
10721
5.87M
        }
10722
0
        else {
10723
            /* interleave */
10724
0
            while (n > 0) {
10725
0
                memcpy(res + rkind * ires,
10726
0
                       buf2,
10727
0
                       rkind * len2);
10728
0
                ires += len2;
10729
0
                if (--n <= 0)
10730
0
                    break;
10731
0
                memcpy(res + rkind * ires,
10732
0
                       sbuf + rkind * i,
10733
0
                       rkind);
10734
0
                ires++;
10735
0
                i++;
10736
0
            }
10737
0
            memcpy(res + rkind * ires,
10738
0
                   sbuf + rkind * i,
10739
0
                   rkind * (slen-i));
10740
0
        }
10741
5.87M
    }
10742
10743
7.69M
    if (mayshrink) {
10744
0
        unicode_adjust_maxchar(&u);
10745
0
        if (u == NULL)
10746
0
            goto error;
10747
0
    }
10748
10749
7.69M
  done:
10750
7.69M
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
10751
7.69M
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10752
7.69M
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10753
7.69M
    if (srelease)
10754
0
        PyMem_Free((void *)sbuf);
10755
7.69M
    if (release1)
10756
1.45M
        PyMem_Free((void *)buf1);
10757
7.69M
    if (release2)
10758
1.45M
        PyMem_Free((void *)buf2);
10759
7.69M
    assert(_PyUnicode_CheckConsistency(u, 1));
10760
7.69M
    return u;
10761
10762
78.9M
  nothing:
10763
    /* nothing to replace; return original string (when possible) */
10764
78.9M
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
10765
78.9M
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10766
78.9M
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10767
78.9M
    if (srelease)
10768
0
        PyMem_Free((void *)sbuf);
10769
78.9M
    if (release1)
10770
5.63M
        PyMem_Free((void *)buf1);
10771
78.9M
    if (release2)
10772
0
        PyMem_Free((void *)buf2);
10773
78.9M
    return unicode_result_unchanged(self);
10774
10775
0
  error:
10776
0
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
10777
0
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10778
0
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10779
0
    if (srelease)
10780
0
        PyMem_Free((void *)sbuf);
10781
0
    if (release1)
10782
0
        PyMem_Free((void *)buf1);
10783
0
    if (release2)
10784
0
        PyMem_Free((void *)buf2);
10785
0
    return NULL;
10786
7.69M
}
10787
10788
/* --- Unicode Object Methods --------------------------------------------- */
10789
10790
/*[clinic input]
10791
@permit_long_docstring_body
10792
str.title as unicode_title
10793
10794
Return a version of the string where each word is titlecased.
10795
10796
More specifically, words start with uppercased characters and all remaining
10797
cased characters have lower case.
10798
[clinic start generated code]*/
10799
10800
static PyObject *
unicode_title_impl(PyObject *self)
/*[clinic end generated code: output=c75ae03809574902 input=533ce0eb6a7f5d1b]*/
{
    /* str.title(): run the shared case_operation() driver with the
       do_title mapper and hand its result straight back. */
    PyObject *titled = case_operation(self, do_title);
    return titled;
}
10806
10807
/*[clinic input]
10808
@permit_long_docstring_body
10809
str.capitalize as unicode_capitalize
10810
10811
Return a capitalized version of the string.
10812
10813
More specifically, make the first character have upper case and the rest lower
10814
case.
10815
[clinic start generated code]*/
10816
10817
static PyObject *
unicode_capitalize_impl(PyObject *self)
/*[clinic end generated code: output=e49a4c333cdb7667 input=a4a15ade41f6f9e9]*/
{
    /* str.capitalize(): non-empty strings go through case_operation()
       with the do_capitalize mapper; the empty string is returned via
       unicode_result_unchanged(). */
    if (PyUnicode_GET_LENGTH(self) != 0) {
        return case_operation(self, do_capitalize);
    }
    return unicode_result_unchanged(self);
}
10825
10826
/*[clinic input]
10827
str.casefold as unicode_casefold
10828
10829
Return a version of the string suitable for caseless comparisons.
10830
[clinic start generated code]*/
10831
10832
static PyObject *
unicode_casefold_impl(PyObject *self)
/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
{
    /* str.casefold(): ASCII strings are handled by ascii_upper_or_lower()
       (called with 1 as its second argument); everything else goes through
       case_operation() with the do_casefold mapper. */
    if (!PyUnicode_IS_ASCII(self)) {
        return case_operation(self, do_casefold);
    }
    return ascii_upper_or_lower(self, 1);
}
10840
10841
10842
/* Argument converter. Accepts a single Unicode character. */
10843
10844
static int
10845
convert_uc(PyObject *obj, void *addr)
10846
0
{
10847
0
    Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
10848
10849
0
    if (!PyUnicode_Check(obj)) {
10850
0
        PyErr_Format(PyExc_TypeError,
10851
0
                     "The fill character must be a unicode character, "
10852
0
                     "not %.100s", Py_TYPE(obj)->tp_name);
10853
0
        return 0;
10854
0
    }
10855
0
    if (PyUnicode_GET_LENGTH(obj) != 1) {
10856
0
        PyErr_SetString(PyExc_TypeError,
10857
0
                        "The fill character must be exactly one character long");
10858
0
        return 0;
10859
0
    }
10860
0
    *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
10861
0
    return 1;
10862
0
}
10863
10864
/*[clinic input]
10865
str.center as unicode_center
10866
10867
    width: Py_ssize_t
10868
    fillchar: Py_UCS4 = ' '
10869
    /
10870
10871
Return a centered string of length width.
10872
10873
Padding is done using the specified fill character (default is a space).
10874
[clinic start generated code]*/
10875
10876
static PyObject *
10877
unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10878
/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
10879
0
{
10880
0
    Py_ssize_t marg, left;
10881
10882
0
    if (PyUnicode_GET_LENGTH(self) >= width)
10883
0
        return unicode_result_unchanged(self);
10884
10885
0
    marg = width - PyUnicode_GET_LENGTH(self);
10886
0
    left = marg / 2 + (marg & width & 1);
10887
10888
0
    return pad(self, left, marg - left, fillchar);
10889
0
}
10890
10891
/* This function assumes that str1 and str2 are readied by the caller. */
10892
10893
static int
10894
unicode_compare(PyObject *str1, PyObject *str2)
10895
18.2M
{
10896
18.2M
#define COMPARE(TYPE1, TYPE2) \
10897
18.2M
    do { \
10898
17.1M
        TYPE1* p1 = (TYPE1 *)data1; \
10899
17.1M
        TYPE2* p2 = (TYPE2 *)data2; \
10900
17.1M
        TYPE1* end = p1 + len; \
10901
17.1M
        Py_UCS4 c1, c2; \
10902
17.1M
        for (; p1 != end; p1++, p2++) { \
10903
17.1M
            c1 = *p1; \
10904
17.1M
            c2 = *p2; \
10905
17.1M
            if (c1 != c2) \
10906
17.1M
                return (c1 < c2) ? -1 : 1; \
10907
17.1M
        } \
10908
17.1M
    } \
10909
17.1M
    while (0)
10910
10911
18.2M
    int kind1, kind2;
10912
18.2M
    const void *data1, *data2;
10913
18.2M
    Py_ssize_t len1, len2, len;
10914
10915
18.2M
    kind1 = PyUnicode_KIND(str1);
10916
18.2M
    kind2 = PyUnicode_KIND(str2);
10917
18.2M
    data1 = PyUnicode_DATA(str1);
10918
18.2M
    data2 = PyUnicode_DATA(str2);
10919
18.2M
    len1 = PyUnicode_GET_LENGTH(str1);
10920
18.2M
    len2 = PyUnicode_GET_LENGTH(str2);
10921
18.2M
    len = Py_MIN(len1, len2);
10922
10923
18.2M
    switch(kind1) {
10924
1.50M
    case PyUnicode_1BYTE_KIND:
10925
1.50M
    {
10926
1.50M
        switch(kind2) {
10927
78.1k
        case PyUnicode_1BYTE_KIND:
10928
78.1k
        {
10929
78.1k
            int cmp = memcmp(data1, data2, len);
10930
            /* normalize result of memcmp() into the range [-1; 1] */
10931
78.1k
            if (cmp < 0)
10932
56.1k
                return -1;
10933
22.0k
            if (cmp > 0)
10934
21.4k
                return 1;
10935
556
            break;
10936
22.0k
        }
10937
1.21M
        case PyUnicode_2BYTE_KIND:
10938
1.21M
            COMPARE(Py_UCS1, Py_UCS2);
10939
0
            break;
10940
217k
        case PyUnicode_4BYTE_KIND:
10941
217k
            COMPARE(Py_UCS1, Py_UCS4);
10942
0
            break;
10943
0
        default:
10944
0
            Py_UNREACHABLE();
10945
1.50M
        }
10946
556
        break;
10947
1.50M
    }
10948
15.0M
    case PyUnicode_2BYTE_KIND:
10949
15.0M
    {
10950
15.0M
        switch(kind2) {
10951
7.75k
        case PyUnicode_1BYTE_KIND:
10952
7.75k
            COMPARE(Py_UCS2, Py_UCS1);
10953
0
            break;
10954
13.4M
        case PyUnicode_2BYTE_KIND:
10955
13.4M
        {
10956
13.4M
            COMPARE(Py_UCS2, Py_UCS2);
10957
0
            break;
10958
13.4M
        }
10959
1.60M
        case PyUnicode_4BYTE_KIND:
10960
1.60M
            COMPARE(Py_UCS2, Py_UCS4);
10961
0
            break;
10962
0
        default:
10963
0
            Py_UNREACHABLE();
10964
15.0M
        }
10965
0
        break;
10966
15.0M
    }
10967
1.64M
    case PyUnicode_4BYTE_KIND:
10968
1.64M
    {
10969
1.64M
        switch(kind2) {
10970
1.48k
        case PyUnicode_1BYTE_KIND:
10971
1.48k
            COMPARE(Py_UCS4, Py_UCS1);
10972
0
            break;
10973
618k
        case PyUnicode_2BYTE_KIND:
10974
618k
            COMPARE(Py_UCS4, Py_UCS2);
10975
0
            break;
10976
1.02M
        case PyUnicode_4BYTE_KIND:
10977
1.02M
        {
10978
1.02M
#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10979
1.02M
            int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10980
            /* normalize result of wmemcmp() into the range [-1; 1] */
10981
1.02M
            if (cmp < 0)
10982
503k
                return -1;
10983
517k
            if (cmp > 0)
10984
517k
                return 1;
10985
#else
10986
            COMPARE(Py_UCS4, Py_UCS4);
10987
#endif
10988
0
            break;
10989
517k
        }
10990
0
        default:
10991
0
            Py_UNREACHABLE();
10992
1.64M
        }
10993
0
        break;
10994
1.64M
    }
10995
0
    default:
10996
0
        Py_UNREACHABLE();
10997
18.2M
    }
10998
10999
556
    if (len1 == len2)
11000
553
        return 0;
11001
3
    if (len1 < len2)
11002
3
        return -1;
11003
0
    else
11004
0
        return 1;
11005
11006
3
#undef COMPARE
11007
3
}
11008
11009
11010
int
11011
_PyUnicode_Equal(PyObject *str1, PyObject *str2)
11012
296M
{
11013
296M
    assert(PyUnicode_Check(str1));
11014
296M
    assert(PyUnicode_Check(str2));
11015
296M
    if (str1 == str2) {
11016
83.6M
        return 1;
11017
83.6M
    }
11018
212M
    return unicode_eq(str1, str2);
11019
296M
}
11020
11021
11022
int
11023
PyUnicode_Equal(PyObject *str1, PyObject *str2)
11024
0
{
11025
0
    if (!PyUnicode_Check(str1)) {
11026
0
        PyErr_Format(PyExc_TypeError,
11027
0
                     "first argument must be str, not %T", str1);
11028
0
        return -1;
11029
0
    }
11030
0
    if (!PyUnicode_Check(str2)) {
11031
0
        PyErr_Format(PyExc_TypeError,
11032
0
                     "second argument must be str, not %T", str2);
11033
0
        return -1;
11034
0
    }
11035
11036
0
    return _PyUnicode_Equal(str1, str2);
11037
0
}
11038
11039
11040
int
11041
PyUnicode_Compare(PyObject *left, PyObject *right)
11042
7.19k
{
11043
7.19k
    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11044
        /* a string is equal to itself */
11045
7.19k
        if (left == right)
11046
0
            return 0;
11047
11048
7.19k
        return unicode_compare(left, right);
11049
7.19k
    }
11050
0
    PyErr_Format(PyExc_TypeError,
11051
0
                 "Can't compare %.100s and %.100s",
11052
0
                 Py_TYPE(left)->tp_name,
11053
0
                 Py_TYPE(right)->tp_name);
11054
0
    return -1;
11055
7.19k
}
11056
11057
int
11058
PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11059
2.02M
{
11060
2.02M
    Py_ssize_t i;
11061
2.02M
    int kind;
11062
2.02M
    Py_UCS4 chr;
11063
11064
2.02M
    assert(_PyUnicode_CHECK(uni));
11065
2.02M
    kind = PyUnicode_KIND(uni);
11066
2.02M
    if (kind == PyUnicode_1BYTE_KIND) {
11067
2.02M
        const void *data = PyUnicode_1BYTE_DATA(uni);
11068
2.02M
        size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
11069
2.02M
        size_t len, len2 = strlen(str);
11070
2.02M
        int cmp;
11071
11072
2.02M
        len = Py_MIN(len1, len2);
11073
2.02M
        cmp = memcmp(data, str, len);
11074
2.02M
        if (cmp != 0) {
11075
1.50M
            if (cmp < 0)
11076
9.15k
                return -1;
11077
1.49M
            else
11078
1.49M
                return 1;
11079
1.50M
        }
11080
520k
        if (len1 > len2)
11081
101
            return 1; /* uni is longer */
11082
520k
        if (len1 < len2)
11083
781
            return -1; /* str is longer */
11084
519k
        return 0;
11085
520k
    }
11086
1.55k
    else {
11087
1.55k
        const void *data = PyUnicode_DATA(uni);
11088
        /* Compare Unicode string and source character set string */
11089
2.79k
        for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
11090
2.55k
            if (chr != (unsigned char)str[i])
11091
1.30k
                return (chr < (unsigned char)(str[i])) ? -1 : 1;
11092
        /* This check keeps Python strings that end in '\0' from comparing equal
11093
         to C strings identical up to that point. */
11094
242
        if (PyUnicode_GET_LENGTH(uni) != i || chr)
11095
242
            return 1; /* uni is longer */
11096
0
        if (str[i])
11097
0
            return -1; /* str is longer */
11098
0
        return 0;
11099
0
    }
11100
2.02M
}
11101
11102
int
11103
PyUnicode_EqualToUTF8(PyObject *unicode, const char *str)
11104
18
{
11105
18
    return PyUnicode_EqualToUTF8AndSize(unicode, str, strlen(str));
11106
18
}
11107
11108
int
11109
PyUnicode_EqualToUTF8AndSize(PyObject *unicode, const char *str, Py_ssize_t size)
11110
18
{
11111
18
    assert(_PyUnicode_CHECK(unicode));
11112
18
    assert(str);
11113
11114
18
    if (PyUnicode_IS_ASCII(unicode)) {
11115
18
        Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
11116
18
        return size == len &&
11117
0
            memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11118
18
    }
11119
0
    if (PyUnicode_UTF8(unicode) != NULL) {
11120
0
        Py_ssize_t len = PyUnicode_UTF8_LENGTH(unicode);
11121
0
        return size == len &&
11122
0
            memcmp(PyUnicode_UTF8(unicode), str, len) == 0;
11123
0
    }
11124
11125
0
    Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
11126
0
    if ((size_t)len >= (size_t)size || (size_t)len < (size_t)size / 4) {
11127
0
        return 0;
11128
0
    }
11129
0
    const unsigned char *s = (const unsigned char *)str;
11130
0
    const unsigned char *ends = s + (size_t)size;
11131
0
    int kind = PyUnicode_KIND(unicode);
11132
0
    const void *data = PyUnicode_DATA(unicode);
11133
    /* Compare Unicode string and UTF-8 string */
11134
0
    for (Py_ssize_t i = 0; i < len; i++) {
11135
0
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11136
0
        if (ch < 0x80) {
11137
0
            if (ends == s || s[0] != ch) {
11138
0
                return 0;
11139
0
            }
11140
0
            s += 1;
11141
0
        }
11142
0
        else if (ch < 0x800) {
11143
0
            if ((ends - s) < 2 ||
11144
0
                s[0] != (0xc0 | (ch >> 6)) ||
11145
0
                s[1] != (0x80 | (ch & 0x3f)))
11146
0
            {
11147
0
                return 0;
11148
0
            }
11149
0
            s += 2;
11150
0
        }
11151
0
        else if (ch < 0x10000) {
11152
0
            if (Py_UNICODE_IS_SURROGATE(ch) ||
11153
0
                (ends - s) < 3 ||
11154
0
                s[0] != (0xe0 | (ch >> 12)) ||
11155
0
                s[1] != (0x80 | ((ch >> 6) & 0x3f)) ||
11156
0
                s[2] != (0x80 | (ch & 0x3f)))
11157
0
            {
11158
0
                return 0;
11159
0
            }
11160
0
            s += 3;
11161
0
        }
11162
0
        else {
11163
0
            assert(ch <= MAX_UNICODE);
11164
0
            if ((ends - s) < 4 ||
11165
0
                s[0] != (0xf0 | (ch >> 18)) ||
11166
0
                s[1] != (0x80 | ((ch >> 12) & 0x3f)) ||
11167
0
                s[2] != (0x80 | ((ch >> 6) & 0x3f)) ||
11168
0
                s[3] != (0x80 | (ch & 0x3f)))
11169
0
            {
11170
0
                return 0;
11171
0
            }
11172
0
            s += 4;
11173
0
        }
11174
0
    }
11175
0
    return s == ends;
11176
0
}
11177
11178
int
11179
_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11180
6.90M
{
11181
6.90M
    size_t len;
11182
6.90M
    assert(_PyUnicode_CHECK(unicode));
11183
6.90M
    assert(str);
11184
#ifndef NDEBUG
11185
    for (const char *p = str; *p; p++) {
11186
        assert((unsigned char)*p < 128);
11187
    }
11188
#endif
11189
6.90M
    if (!PyUnicode_IS_ASCII(unicode))
11190
150k
        return 0;
11191
6.75M
    len = (size_t)PyUnicode_GET_LENGTH(unicode);
11192
6.75M
    return strlen(str) == len &&
11193
453k
           memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11194
6.90M
}
11195
11196
int
11197
_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11198
0
{
11199
0
    PyObject *right_uni;
11200
11201
0
    assert(_PyUnicode_CHECK(left));
11202
0
    assert(right->string);
11203
#ifndef NDEBUG
11204
    for (const char *p = right->string; *p; p++) {
11205
        assert((unsigned char)*p < 128);
11206
    }
11207
#endif
11208
11209
0
    if (!PyUnicode_IS_ASCII(left))
11210
0
        return 0;
11211
11212
0
    right_uni = _PyUnicode_FromId(right);       /* borrowed */
11213
0
    if (right_uni == NULL) {
11214
        /* memory error or bad data */
11215
0
        PyErr_Clear();
11216
0
        return _PyUnicode_EqualToASCIIString(left, right->string);
11217
0
    }
11218
11219
0
    if (left == right_uni)
11220
0
        return 1;
11221
11222
0
    assert(PyUnicode_CHECK_INTERNED(right_uni));
11223
0
    if (PyUnicode_CHECK_INTERNED(left)) {
11224
0
        return 0;
11225
0
    }
11226
11227
0
    Py_hash_t right_hash = PyUnicode_HASH(right_uni);
11228
0
    assert(right_hash != -1);
11229
0
    Py_hash_t hash = PyUnicode_HASH(left);
11230
0
    if (hash != -1 && hash != right_hash) {
11231
0
        return 0;
11232
0
    }
11233
11234
0
    return unicode_eq(left, right_uni);
11235
0
}
11236
11237
/* Rich comparison of two objects as strings.

   Returns NotImplemented when either operand is not a str, a bool object
   for a valid comparison, or NULL with an exception set when both operands
   are the same object and 'op' is not one of the six Py_* comparison
   codes. */
PyObject *
PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
{
    if (!PyUnicode_Check(left) || !PyUnicode_Check(right)) {
        Py_RETURN_NOTIMPLEMENTED;
    }

    if (left == right) {
        /* a string is equal to itself */
        if (op == Py_EQ || op == Py_LE || op == Py_GE) {
            Py_RETURN_TRUE;
        }
        if (op == Py_NE || op == Py_LT || op == Py_GT) {
            Py_RETURN_FALSE;
        }
        PyErr_BadArgument();
        return NULL;
    }

    if (op == Py_EQ || op == Py_NE) {
        /* Equality only: unicode_eq() yields 0/1; flip it for "!=". */
        int equal = unicode_eq(left, right);
        equal ^= (op == Py_NE);
        return PyBool_FromLong(equal);
    }

    /* Ordering comparison: map the three-way result through 'op'. */
    int order = unicode_compare(left, right);
    Py_RETURN_RICHCOMPARE(order, 0, op);
}
11271
11272
int
11273
PyUnicode_Contains(PyObject *str, PyObject *substr)
11274
85.5M
{
11275
85.5M
    int kind1, kind2;
11276
85.5M
    const void *buf1, *buf2;
11277
85.5M
    Py_ssize_t len1, len2;
11278
85.5M
    int result;
11279
11280
85.5M
    if (!PyUnicode_Check(substr)) {
11281
0
        PyErr_Format(PyExc_TypeError,
11282
0
                     "'in <string>' requires string as left operand, not %.100s",
11283
0
                     Py_TYPE(substr)->tp_name);
11284
0
        return -1;
11285
0
    }
11286
85.5M
    if (ensure_unicode(str) < 0)
11287
0
        return -1;
11288
11289
85.5M
    kind1 = PyUnicode_KIND(str);
11290
85.5M
    kind2 = PyUnicode_KIND(substr);
11291
85.5M
    if (kind1 < kind2)
11292
4.21M
        return 0;
11293
81.3M
    len1 = PyUnicode_GET_LENGTH(str);
11294
81.3M
    len2 = PyUnicode_GET_LENGTH(substr);
11295
81.3M
    if (len1 < len2)
11296
21.6k
        return 0;
11297
81.3M
    buf1 = PyUnicode_DATA(str);
11298
81.3M
    buf2 = PyUnicode_DATA(substr);
11299
81.3M
    if (len2 == 1) {
11300
81.3M
        Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11301
81.3M
        result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
11302
81.3M
        return result;
11303
81.3M
    }
11304
33.6k
    if (kind2 != kind1) {
11305
16.6k
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
11306
16.6k
        if (!buf2)
11307
0
            return -1;
11308
16.6k
    }
11309
11310
33.6k
    switch (kind1) {
11311
16.9k
    case PyUnicode_1BYTE_KIND:
11312
16.9k
        result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11313
16.9k
        break;
11314
12.7k
    case PyUnicode_2BYTE_KIND:
11315
12.7k
        result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11316
12.7k
        break;
11317
3.90k
    case PyUnicode_4BYTE_KIND:
11318
3.90k
        result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11319
3.90k
        break;
11320
0
    default:
11321
0
        Py_UNREACHABLE();
11322
33.6k
    }
11323
11324
33.6k
    assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
11325
33.6k
    if (kind2 != kind1)
11326
16.6k
        PyMem_Free((void *)buf2);
11327
11328
33.6k
    return result;
11329
33.6k
}
11330
11331
/* Concat to string or Unicode object giving a new Unicode object. */
11332
11333
/* Returns the concatenation left + right as a str object, or NULL with an
   exception set: TypeError if 'right' is not a str, OverflowError if the
   combined length would exceed PY_SSIZE_T_MAX, or whatever
   ensure_unicode() / PyUnicode_New() raised. */
PyObject *
PyUnicode_Concat(PyObject *left, PyObject *right)
{
    if (ensure_unicode(left) < 0) {
        return NULL;
    }
    if (!PyUnicode_Check(right)) {
        PyErr_Format(PyExc_TypeError,
            "can only concatenate str (not \"%.200s\") to str",
            Py_TYPE(right)->tp_name);
        return NULL;
    }

    /* Shortcuts: when one side is the empty-string singleton, the result
       is just the other operand. */
    PyObject *empty = _PyUnicode_GetEmpty();  // Borrowed reference
    if (empty == left) {
        return PyUnicode_FromObject(right);
    }
    if (empty == right) {
        return PyUnicode_FromObject(left);
    }

    /* Guard the length addition against signed overflow. */
    const Py_ssize_t left_len = PyUnicode_GET_LENGTH(left);
    const Py_ssize_t right_len = PyUnicode_GET_LENGTH(right);
    if (left_len > PY_SSIZE_T_MAX - right_len) {
        PyErr_SetString(PyExc_OverflowError,
                        "strings are too large to concat");
        return NULL;
    }
    const Py_ssize_t total_len = left_len + right_len;

    /* The result must be able to hold the widest character of either
       operand. */
    const Py_UCS4 left_max = PyUnicode_MAX_CHAR_VALUE(left);
    const Py_UCS4 right_max = PyUnicode_MAX_CHAR_VALUE(right);
    const Py_UCS4 widest = Py_MAX(left_max, right_max);

    /* Concat the two Unicode strings */
    PyObject *joined = PyUnicode_New(total_len, widest);
    if (joined == NULL) {
        return NULL;
    }
    _PyUnicode_FastCopyCharacters(joined, 0, left, 0, left_len);
    _PyUnicode_FastCopyCharacters(joined, left_len, right, 0, right_len);
    assert(_PyUnicode_CheckConsistency(joined, 1));
    return joined;
}
11381
11382
void
11383
PyUnicode_Append(PyObject **p_left, PyObject *right)
11384
1.38M
{
11385
1.38M
    PyObject *left, *res;
11386
1.38M
    Py_UCS4 maxchar, maxchar2;
11387
1.38M
    Py_ssize_t left_len, right_len, new_len;
11388
11389
1.38M
    if (p_left == NULL) {
11390
0
        if (!PyErr_Occurred())
11391
0
            PyErr_BadInternalCall();
11392
0
        return;
11393
0
    }
11394
1.38M
    left = *p_left;
11395
1.38M
    if (right == NULL || left == NULL
11396
1.38M
        || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
11397
0
        if (!PyErr_Occurred())
11398
0
            PyErr_BadInternalCall();
11399
0
        goto error;
11400
0
    }
11401
11402
    /* Shortcuts */
11403
1.38M
    PyObject *empty = _PyUnicode_GetEmpty();  // Borrowed reference
11404
1.38M
    if (left == empty) {
11405
456k
        Py_DECREF(left);
11406
456k
        *p_left = Py_NewRef(right);
11407
456k
        return;
11408
456k
    }
11409
928k
    if (right == empty) {
11410
0
        return;
11411
0
    }
11412
11413
928k
    left_len = PyUnicode_GET_LENGTH(left);
11414
928k
    right_len = PyUnicode_GET_LENGTH(right);
11415
928k
    if (left_len > PY_SSIZE_T_MAX - right_len) {
11416
0
        PyErr_SetString(PyExc_OverflowError,
11417
0
                        "strings are too large to concat");
11418
0
        goto error;
11419
0
    }
11420
928k
    new_len = left_len + right_len;
11421
11422
928k
    if (_PyUnicode_IsModifiable(left)
11423
928k
        && PyUnicode_CheckExact(right)
11424
928k
        && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
11425
        /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11426
           to change the structure size, but characters are stored just after
11427
           the structure, and so it requires to move all characters which is
11428
           not so different than duplicating the string. */
11429
878k
        && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11430
878k
    {
11431
        /* append inplace */
11432
878k
        if (unicode_resize(p_left, new_len) != 0)
11433
0
            goto error;
11434
11435
        /* copy 'right' into the newly allocated area of 'left' */
11436
878k
        _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
11437
878k
    }
11438
50.6k
    else {
11439
50.6k
        maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11440
50.6k
        maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11441
50.6k
        maxchar = Py_MAX(maxchar, maxchar2);
11442
11443
        /* Concat the two Unicode strings */
11444
50.6k
        res = PyUnicode_New(new_len, maxchar);
11445
50.6k
        if (res == NULL)
11446
0
            goto error;
11447
50.6k
        _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11448
50.6k
        _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
11449
50.6k
        Py_DECREF(left);
11450
50.6k
        *p_left = res;
11451
50.6k
    }
11452
928k
    assert(_PyUnicode_CheckConsistency(*p_left, 1));
11453
928k
    return;
11454
11455
0
error:
11456
0
    Py_CLEAR(*p_left);
11457
0
}
11458
11459
void
11460
PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11461
0
{
11462
0
    PyUnicode_Append(pleft, right);
11463
0
    Py_XDECREF(right);
11464
0
}
11465
11466
/*[clinic input]
11467
@permit_long_summary
11468
@text_signature "($self, sub[, start[, end]], /)"
11469
str.count as unicode_count -> Py_ssize_t
11470
11471
    self as str: self
11472
    sub as substr: unicode
11473
    start: slice_index(accept={int, NoneType}, c_default='0') = None
11474
    end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
11475
    /
11476
11477
Return the number of non-overlapping occurrences of substring sub in string S[start:end].
11478
11479
Optional arguments start and end are interpreted as in slice notation.
11480
[clinic start generated code]*/
11481
11482
static Py_ssize_t
11483
unicode_count_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
11484
                   Py_ssize_t end)
11485
/*[clinic end generated code: output=8fcc3aef0b18edbf input=8590716ee228b935]*/
11486
31.8M
{
11487
31.8M
    assert(PyUnicode_Check(str));
11488
31.8M
    assert(PyUnicode_Check(substr));
11489
11490
31.8M
    Py_ssize_t result;
11491
31.8M
    int kind1, kind2;
11492
31.8M
    const void *buf1 = NULL, *buf2 = NULL;
11493
31.8M
    Py_ssize_t len1, len2;
11494
11495
31.8M
    kind1 = PyUnicode_KIND(str);
11496
31.8M
    kind2 = PyUnicode_KIND(substr);
11497
31.8M
    if (kind1 < kind2)
11498
0
        return 0;
11499
11500
31.8M
    len1 = PyUnicode_GET_LENGTH(str);
11501
31.8M
    len2 = PyUnicode_GET_LENGTH(substr);
11502
31.8M
    ADJUST_INDICES(start, end, len1);
11503
31.8M
    if (end - start < len2)
11504
7.14M
        return 0;
11505
11506
24.6M
    buf1 = PyUnicode_DATA(str);
11507
24.6M
    buf2 = PyUnicode_DATA(substr);
11508
24.6M
    if (kind2 != kind1) {
11509
6.85M
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
11510
6.85M
        if (!buf2)
11511
0
            goto onError;
11512
6.85M
    }
11513
11514
    // We don't reuse `anylib_count` here because of the explicit casts.
11515
24.6M
    switch (kind1) {
11516
17.8M
    case PyUnicode_1BYTE_KIND:
11517
17.8M
        result = ucs1lib_count(
11518
17.8M
            ((const Py_UCS1*)buf1) + start, end - start,
11519
17.8M
            buf2, len2, PY_SSIZE_T_MAX
11520
17.8M
            );
11521
17.8M
        break;
11522
5.58M
    case PyUnicode_2BYTE_KIND:
11523
5.58M
        result = ucs2lib_count(
11524
5.58M
            ((const Py_UCS2*)buf1) + start, end - start,
11525
5.58M
            buf2, len2, PY_SSIZE_T_MAX
11526
5.58M
            );
11527
5.58M
        break;
11528
1.27M
    case PyUnicode_4BYTE_KIND:
11529
1.27M
        result = ucs4lib_count(
11530
1.27M
            ((const Py_UCS4*)buf1) + start, end - start,
11531
1.27M
            buf2, len2, PY_SSIZE_T_MAX
11532
1.27M
            );
11533
1.27M
        break;
11534
0
    default:
11535
0
        Py_UNREACHABLE();
11536
24.6M
    }
11537
11538
24.6M
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
11539
24.6M
    if (kind2 != kind1)
11540
6.85M
        PyMem_Free((void *)buf2);
11541
11542
24.6M
    return result;
11543
0
  onError:
11544
0
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
11545
0
    if (kind2 != kind1)
11546
0
        PyMem_Free((void *)buf2);
11547
0
    return -1;
11548
24.6M
}
11549
11550
/*[clinic input]
11551
str.encode as unicode_encode
11552
11553
    encoding: str(c_default="NULL") = 'utf-8'
11554
        The encoding in which to encode the string.
11555
    errors: str(c_default="NULL") = 'strict'
11556
        The error handling scheme to use for encoding errors.
11557
        The default is 'strict' meaning that encoding errors raise a
11558
        UnicodeEncodeError.  Other possible values are 'ignore', 'replace' and
11559
        'xmlcharrefreplace' as well as any other name registered with
11560
        codecs.register_error that can handle UnicodeEncodeErrors.
11561
11562
Encode the string using the codec registered for encoding.
11563
[clinic start generated code]*/
11564
11565
static PyObject *
unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
{
    /* str.encode(): all of the work is delegated to
       PyUnicode_AsEncodedString(), whose result is returned unchanged. */
    PyObject *encoded = PyUnicode_AsEncodedString(self, encoding, errors);
    return encoded;
}
11571
11572
/*[clinic input]
11573
str.expandtabs as unicode_expandtabs
11574
11575
    tabsize: int = 8
11576
11577
Return a copy where all tab characters are expanded using spaces.
11578
11579
If tabsize is not given, a tab size of 8 characters is assumed.
11580
[clinic start generated code]*/
11581
11582
static PyObject *
unicode_expandtabs_impl(PyObject *self, int tabsize)
/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
{
    /* str.expandtabs(): works in two passes.  The first pass measures the
       expanded length (and detects whether there is any tab at all); the
       second pass fills a freshly allocated string.  When tabsize <= 0,
       tabs contribute no output characters.  Returns NULL with
       OverflowError if the expanded length would exceed PY_SSIZE_T_MAX. */
    const Py_ssize_t src_len = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *src_data = PyUnicode_DATA(self);
    Py_ssize_t src_pos;
    Py_ssize_t out_len = 0;     /* length of the expanded string */
    Py_ssize_t column = 0;      /* position within the current line */
    Py_ssize_t dest_pos;
    int saw_tab = 0;
    PyObject *expanded;
    void *dest_data;

    /* First pass: determine size of output string */
    for (src_pos = 0; src_pos < src_len; src_pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, src_data, src_pos);

        if (ch != '\t') {
            if (out_len > PY_SSIZE_T_MAX - 1) {
                goto overflow;
            }
            out_len++;
            column++;
            /* A line break restarts the column count. */
            if (ch == '\n' || ch == '\r') {
                column = 0;
            }
            continue;
        }

        saw_tab = 1;
        if (tabsize > 0) {
            /* Spaces needed to reach the next tab stop. */
            const Py_ssize_t pad = tabsize - (column % tabsize); /* cannot overflow */
            if (out_len > PY_SSIZE_T_MAX - pad) {
                goto overflow;
            }
            column += pad;
            out_len += pad;
        }
    }

    /* No tab found: the original string is the result. */
    if (!saw_tab) {
        return unicode_result_unchanged(self);
    }

    /* Second pass: create output string and fill it */
    expanded = PyUnicode_New(out_len, PyUnicode_MAX_CHAR_VALUE(self));
    if (expanded == NULL) {
        return NULL;
    }
    dest_data = PyUnicode_DATA(expanded);

    dest_pos = 0;
    column = 0;
    for (src_pos = 0; src_pos < src_len; src_pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, src_data, src_pos);

        if (ch != '\t') {
            PyUnicode_WRITE(kind, dest_data, dest_pos, ch);
            dest_pos++;
            column++;
            if (ch == '\n' || ch == '\r') {
                column = 0;
            }
            continue;
        }

        if (tabsize > 0) {
            const Py_ssize_t pad = tabsize - (column % tabsize);
            _PyUnicode_Fill(kind, dest_data, ' ', dest_pos, pad);
            dest_pos += pad;
            column += pad;
        }
    }
    assert (dest_pos == PyUnicode_GET_LENGTH(expanded));
    return unicode_result(expanded);

  overflow:
    PyErr_SetString(PyExc_OverflowError, "new string is too long");
    return NULL;
}
11657
11658
/*[clinic input]
11659
@permit_long_summary
11660
str.find as unicode_find = str.count
11661
11662
Return the lowest index in S where substring sub is found, such that sub is contained within S[start:end].
11663
11664
Optional arguments start and end are interpreted as in slice notation.
11665
Return -1 on failure.
11666
[clinic start generated code]*/
11667
11668
static Py_ssize_t
11669
unicode_find_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
11670
                  Py_ssize_t end)
11671
/*[clinic end generated code: output=51dbe6255712e278 input=3a9d650fe4c24695]*/
11672
30.7M
{
11673
30.7M
    Py_ssize_t result = any_find_slice(str, substr, start, end, 1);
11674
30.7M
    if (result < 0) {
11675
6.89M
        return -1;
11676
6.89M
    }
11677
23.8M
    return result;
11678
30.7M
}
11679
11680
static PyObject *
11681
unicode_getitem(PyObject *self, Py_ssize_t index)
11682
52.1M
{
11683
52.1M
    const void *data;
11684
52.1M
    int kind;
11685
52.1M
    Py_UCS4 ch;
11686
11687
52.1M
    if (!PyUnicode_Check(self)) {
11688
0
        PyErr_BadArgument();
11689
0
        return NULL;
11690
0
    }
11691
52.1M
    if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11692
356
        PyErr_SetString(PyExc_IndexError, "string index out of range");
11693
356
        return NULL;
11694
356
    }
11695
52.1M
    kind = PyUnicode_KIND(self);
11696
52.1M
    data = PyUnicode_DATA(self);
11697
52.1M
    ch = PyUnicode_READ(kind, data, index);
11698
52.1M
    return unicode_char(ch);
11699
52.1M
}
11700
11701
/* Believe it or not, this produces the same value for ASCII strings
11702
   as bytes_hash(). */
11703
static Py_hash_t
11704
unicode_hash(PyObject *self)
11705
44.6M
{
11706
44.6M
    Py_uhash_t x;  /* Unsigned for defined overflow behavior. */
11707
11708
#ifdef Py_DEBUG
11709
    assert(_Py_HashSecret_Initialized);
11710
#endif
11711
44.6M
    Py_hash_t hash = PyUnicode_HASH(self);
11712
44.6M
    if (hash != -1) {
11713
255k
        return hash;
11714
255k
    }
11715
44.4M
    x = Py_HashBuffer(PyUnicode_DATA(self),
11716
44.4M
                      PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
11717
11718
44.4M
    PyUnicode_SET_HASH(self, x);
11719
44.4M
    return x;
11720
44.6M
}
11721
11722
/*[clinic input]
11723
@permit_long_summary
11724
str.index as unicode_index = str.count
11725
11726
Return the lowest index in S where substring sub is found, such that sub is contained within S[start:end].
11727
11728
Optional arguments start and end are interpreted as in slice notation.
11729
Raises ValueError when the substring is not found.
11730
[clinic start generated code]*/
11731
11732
static Py_ssize_t
11733
unicode_index_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
11734
                   Py_ssize_t end)
11735
/*[clinic end generated code: output=77558288837cdf40 input=ae5e48f69ed75b06]*/
11736
0
{
11737
0
    Py_ssize_t result = any_find_slice(str, substr, start, end, 1);
11738
0
    if (result == -1) {
11739
0
        PyErr_SetString(PyExc_ValueError, "substring not found");
11740
0
    }
11741
0
    else if (result < 0) {
11742
0
        return -1;
11743
0
    }
11744
0
    return result;
11745
0
}
11746
11747
/*[clinic input]
11748
str.isascii as unicode_isascii
11749
11750
Return True if all characters in the string are ASCII, False otherwise.
11751
11752
ASCII characters have code points in the range U+0000-U+007F.
11753
Empty string is ASCII too.
11754
[clinic start generated code]*/
11755
11756
static PyObject *
unicode_isascii_impl(PyObject *self)
/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
{
    /* str.isascii(): the answer is read straight from PyUnicode_IS_ASCII(),
       so no character is inspected here. */
    const long ascii_only = PyUnicode_IS_ASCII(self);
    return PyBool_FromLong(ascii_only);
}
11762
11763
/*[clinic input]
11764
@permit_long_docstring_body
11765
str.islower as unicode_islower
11766
11767
Return True if the string is a lowercase string, False otherwise.
11768
11769
A string is lowercase if all cased characters in the string are lowercase and
11770
there is at least one cased character in the string.
11771
[clinic start generated code]*/
11772
11773
static PyObject *
unicode_islower_impl(PyObject *self)
/*[clinic end generated code: output=dbd41995bd005b81 input=c6fc0295241a1aaa]*/
{
    /* str.islower(): True when no character is uppercase or titlecase and
       at least one character is lowercase. */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *chars = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (nchars == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISLOWER(PyUnicode_READ(kind, chars, 0)));
    }

    int seen_lower = 0;
    for (Py_ssize_t pos = 0; pos < nchars; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, chars, pos);

        /* Any uppercase or titlecase character settles the answer. */
        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
            Py_RETURN_FALSE;
        }
        /* Stop testing for lowercase once one has been found. */
        if (!seen_lower && Py_UNICODE_ISLOWER(ch)) {
            seen_lower = 1;
        }
    }
    return PyBool_FromLong(seen_lower);
}
11806
11807
/*[clinic input]
11808
@permit_long_docstring_body
11809
str.isupper as unicode_isupper
11810
11811
Return True if the string is an uppercase string, False otherwise.
11812
11813
A string is uppercase if all cased characters in the string are uppercase and
11814
there is at least one cased character in the string.
11815
[clinic start generated code]*/
11816
11817
static PyObject *
unicode_isupper_impl(PyObject *self)
/*[clinic end generated code: output=049209c8e7f15f59 input=8d5cb33e67efde72]*/
{
    /* str.isupper(): True when no character is lowercase or titlecase and
       at least one character is uppercase. */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *chars = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (nchars == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISUPPER(PyUnicode_READ(kind, chars, 0)) != 0);
    }

    int seen_upper = 0;
    for (Py_ssize_t pos = 0; pos < nchars; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, chars, pos);

        /* Any lowercase or titlecase character settles the answer. */
        if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) {
            Py_RETURN_FALSE;
        }
        /* Stop testing for uppercase once one has been found. */
        if (!seen_upper && Py_UNICODE_ISUPPER(ch)) {
            seen_upper = 1;
        }
    }
    return PyBool_FromLong(seen_upper);
}
11850
11851
/*[clinic input]
11852
str.istitle as unicode_istitle
11853
11854
Return True if the string is a title-cased string, False otherwise.
11855
11856
In a title-cased string, upper- and title-case characters may only
11857
follow uncased characters and lowercase characters only cased ones.
11858
[clinic start generated code]*/
11859
11860
static PyObject *
11861
unicode_istitle_impl(PyObject *self)
11862
/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
11863
0
{
11864
0
    Py_ssize_t i, length;
11865
0
    int kind;
11866
0
    const void *data;
11867
0
    int cased, previous_is_cased;
11868
11869
0
    length = PyUnicode_GET_LENGTH(self);
11870
0
    kind = PyUnicode_KIND(self);
11871
0
    data = PyUnicode_DATA(self);
11872
11873
    /* Shortcut for single character strings */
11874
0
    if (length == 1) {
11875
0
        Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11876
0
        return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11877
0
                               (Py_UNICODE_ISUPPER(ch) != 0));
11878
0
    }
11879
11880
    /* Special case for empty strings */
11881
0
    if (length == 0)
11882
0
        Py_RETURN_FALSE;
11883
11884
0
    cased = 0;
11885
0
    previous_is_cased = 0;
11886
0
    for (i = 0; i < length; i++) {
11887
0
        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11888
11889
0
        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11890
0
            if (previous_is_cased)
11891
0
                Py_RETURN_FALSE;
11892
0
            previous_is_cased = 1;
11893
0
            cased = 1;
11894
0
        }
11895
0
        else if (Py_UNICODE_ISLOWER(ch)) {
11896
0
            if (!previous_is_cased)
11897
0
                Py_RETURN_FALSE;
11898
0
            previous_is_cased = 1;
11899
0
            cased = 1;
11900
0
        }
11901
0
        else
11902
0
            previous_is_cased = 0;
11903
0
    }
11904
0
    return PyBool_FromLong(cased);
11905
0
}
11906
11907
/*[clinic input]
11908
@permit_long_docstring_body
11909
str.isspace as unicode_isspace
11910
11911
Return True if the string is a whitespace string, False otherwise.
11912
11913
A string is whitespace if all characters in the string are whitespace and there
11914
is at least one character in the string.
11915
[clinic start generated code]*/
11916
11917
static PyObject *
unicode_isspace_impl(PyObject *self)
/*[clinic end generated code: output=163a63bfa08ac2b9 input=44fe05e248c6e159]*/
{
    /* str.isspace(): True for a non-empty string in which every character
       satisfies Py_UNICODE_ISSPACE(). */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *chars = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (nchars == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISSPACE(PyUnicode_READ(kind, chars, 0)));
    }

    /* The first non-whitespace character settles the answer. */
    Py_ssize_t pos = 0;
    while (pos < nchars) {
        const Py_UCS4 ch = PyUnicode_READ(kind, chars, pos);
        if (!Py_UNICODE_ISSPACE(ch)) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
11945
11946
/*[clinic input]
11947
@permit_long_docstring_body
11948
str.isalpha as unicode_isalpha
11949
11950
Return True if the string is an alphabetic string, False otherwise.
11951
11952
A string is alphabetic if all characters in the string are alphabetic and there
11953
is at least one character in the string.
11954
[clinic start generated code]*/
11955
11956
static PyObject *
unicode_isalpha_impl(PyObject *self)
/*[clinic end generated code: output=cc81b9ac3883ec4f input=c233000624a56e0d]*/
{
    /* str.isalpha(): True for a non-empty string in which every character
       satisfies Py_UNICODE_ISALPHA(). */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *chars = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (nchars == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISALPHA(PyUnicode_READ(kind, chars, 0)));
    }

    /* The first non-alphabetic character settles the answer. */
    Py_ssize_t pos = 0;
    while (pos < nchars) {
        if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, chars, pos))) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
11983
11984
/*[clinic input]
11985
@permit_long_docstring_body
11986
str.isalnum as unicode_isalnum
11987
11988
Return True if the string is an alpha-numeric string, False otherwise.
11989
11990
A string is alpha-numeric if all characters in the string are alpha-numeric and
11991
there is at least one character in the string.
11992
[clinic start generated code]*/
11993
11994
static PyObject *
unicode_isalnum_impl(PyObject *self)
/*[clinic end generated code: output=a5a23490ffc3660c input=5d63ba9c9bafdb6b]*/
{
    /* str.isalnum(): True for a non-empty string in which every character
       satisfies Py_UNICODE_ISALNUM(). */
    const int kind = PyUnicode_KIND(self);
    const void *chars = PyUnicode_DATA(self);
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);

    /* Special case for empty strings */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (nchars == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, chars, 0);
        return PyBool_FromLong(Py_UNICODE_ISALNUM(only));
    }

    /* The first non-alphanumeric character settles the answer. */
    Py_ssize_t pos = 0;
    while (pos < nchars) {
        const Py_UCS4 ch = PyUnicode_READ(kind, chars, pos);
        if (!Py_UNICODE_ISALNUM(ch)) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
12023
12024
/*[clinic input]
12025
@permit_long_docstring_body
12026
str.isdecimal as unicode_isdecimal
12027
12028
Return True if the string is a decimal string, False otherwise.
12029
12030
A string is a decimal string if all characters in the string are decimal and
12031
there is at least one character in the string.
12032
[clinic start generated code]*/
12033
12034
static PyObject *
unicode_isdecimal_impl(PyObject *self)
/*[clinic end generated code: output=fb2dcdb62d3fc548 input=8e84a58b414935a3]*/
{
    /* str.isdecimal(): True for a non-empty string in which every
       character satisfies Py_UNICODE_ISDECIMAL(). */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *chars = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (nchars == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, chars, 0)));
    }

    /* The first non-decimal character settles the answer. */
    Py_ssize_t pos = 0;
    while (pos < nchars) {
        if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, chars, pos))) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
12061
12062
/*[clinic input]
12063
@permit_long_docstring_body
12064
str.isdigit as unicode_isdigit
12065
12066
Return True if the string is a digit string, False otherwise.
12067
12068
A string is a digit string if all characters in the string are digits and there
12069
is at least one character in the string.
12070
[clinic start generated code]*/
12071
12072
static PyObject *
unicode_isdigit_impl(PyObject *self)
/*[clinic end generated code: output=10a6985311da6858 input=99e284affb54d4a0]*/
{
    /* Implements str.isdigit(): true only for a non-empty string whose
       characters all satisfy Py_UNICODE_ISDIGIT(). */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Fast path for one-character strings. */
    if (nchars == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISDIGIT(only));
    }

    /* No characters at all: not a digit string. */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }

    /* Scan until a non-digit is found or the string is exhausted. */
    Py_ssize_t pos = 0;
    while (pos < nchars) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISDIGIT(ch)) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
12100
12101
/*[clinic input]
12102
@permit_long_docstring_body
12103
str.isnumeric as unicode_isnumeric
12104
12105
Return True if the string is a numeric string, False otherwise.
12106
12107
A string is numeric if all characters in the string are numeric and there is at
12108
least one character in the string.
12109
[clinic start generated code]*/
12110
12111
static PyObject *
unicode_isnumeric_impl(PyObject *self)
/*[clinic end generated code: output=9172a32d9013051a input=e9f5b6b8b29b0ee6]*/
{
    /* Implements str.isnumeric(): true only for a non-empty string whose
       characters all satisfy Py_UNICODE_ISNUMERIC(). */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Fast path: classify the single character directly. */
    if (nchars == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISNUMERIC(only));
    }

    /* An empty string is never numeric. */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }

    /* Reject as soon as one character fails the numeric test. */
    for (Py_ssize_t pos = 0; pos < nchars; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISNUMERIC(ch)) {
            Py_RETURN_FALSE;
        }
    }
    Py_RETURN_TRUE;
}
12138
12139
Py_ssize_t
12140
_PyUnicode_ScanIdentifier(PyObject *self)
12141
12.6k
{
12142
12.6k
    Py_ssize_t i;
12143
12.6k
    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12144
12.6k
    if (len == 0) {
12145
        /* an empty string is not a valid identifier */
12146
0
        return 0;
12147
0
    }
12148
12149
12.6k
    int kind = PyUnicode_KIND(self);
12150
12.6k
    const void *data = PyUnicode_DATA(self);
12151
12.6k
    Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12152
    /* PEP 3131 says that the first character must be in
12153
       XID_Start and subsequent characters in XID_Continue,
12154
       and for the ASCII range, the 2.x rules apply (i.e
12155
       start with letters and underscore, continue with
12156
       letters, digits, underscore). However, given the current
12157
       definition of XID_Start and XID_Continue, it is sufficient
12158
       to check just for these, except that _ must be allowed
12159
       as starting an identifier.  */
12160
12.6k
    if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12161
438
        return 0;
12162
438
    }
12163
12164
44.4k
    for (i = 1; i < len; i++) {
12165
32.4k
        ch = PyUnicode_READ(kind, data, i);
12166
32.4k
        if (!_PyUnicode_IsXidContinue(ch)) {
12167
231
            return i;
12168
231
        }
12169
32.4k
    }
12170
12.0k
    return i;
12171
12.2k
}
12172
12173
int
12174
PyUnicode_IsIdentifier(PyObject *self)
12175
964
{
12176
964
    Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
12177
964
    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12178
    /* an empty string is not a valid identifier */
12179
964
    return len && i == len;
12180
964
}
12181
12182
/*[clinic input]
12183
@permit_long_docstring_body
12184
str.isidentifier as unicode_isidentifier
12185
12186
Return True if the string is a valid Python identifier, False otherwise.
12187
12188
Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
12189
such as "def" or "class".
12190
[clinic start generated code]*/
12191
12192
static PyObject *
unicode_isidentifier_impl(PyObject *self)
/*[clinic end generated code: output=fe585a9666572905 input=86315dd889d7bd04]*/
{
    /* Implements str.isidentifier(): wrap the C-level check in a bool. */
    const int is_ident = PyUnicode_IsIdentifier(self);
    return PyBool_FromLong(is_ident);
}
12198
12199
/*[clinic input]
12200
@permit_long_summary
12201
str.isprintable as unicode_isprintable
12202
12203
Return True if all characters in the string are printable, False otherwise.
12204
12205
A character is printable if repr() may use it in its output.
12206
[clinic start generated code]*/
12207
12208
static PyObject *
unicode_isprintable_impl(PyObject *self)
/*[clinic end generated code: output=3ab9626cd32dd1a0 input=18345ba847084ec5]*/
{
    /* Implements str.isprintable(): true when every character satisfies
       Py_UNICODE_ISPRINTABLE().  Unlike the other is*() predicates there
       is no empty-string special case, so "" yields True. */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Fast path for one-character strings. */
    if (nchars == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISPRINTABLE(only));
    }

    /* Stop at the first non-printable character. */
    Py_ssize_t pos = 0;
    while (pos < nchars) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISPRINTABLE(ch)) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
12232
12233
/*[clinic input]
12234
@permit_long_docstring_body
12235
str.join as unicode_join
12236
12237
    iterable: object
12238
    /
12239
12240
Concatenate any number of strings.
12241
12242
The string whose method is called is inserted in between each given string.
12243
The result is returned as a new string.
12244
12245
Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12246
[clinic start generated code]*/
12247
12248
static PyObject *
unicode_join(PyObject *self, PyObject *iterable)
/*[clinic end generated code: output=6857e7cecfe7bf98 input=bac724ed412ef3f8]*/
{
    /* Implements str.join(): `self` is the separator, and the actual work
       is delegated to the public C API function. */
    PyObject *joined = PyUnicode_Join(self, iterable);
    return joined;
}
12254
12255
static Py_ssize_t
12256
unicode_length(PyObject *self)
12257
41.4M
{
12258
41.4M
    return PyUnicode_GET_LENGTH(self);
12259
41.4M
}
12260
12261
/*[clinic input]
12262
str.ljust as unicode_ljust
12263
12264
    width: Py_ssize_t
12265
    fillchar: Py_UCS4 = ' '
12266
    /
12267
12268
Return a left-justified string of length width.
12269
12270
Padding is done using the specified fill character (default is a space).
12271
[clinic start generated code]*/
12272
12273
static PyObject *
12274
unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12275
/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
12276
0
{
12277
0
    if (PyUnicode_GET_LENGTH(self) >= width)
12278
0
        return unicode_result_unchanged(self);
12279
12280
0
    return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
12281
0
}
12282
12283
/*[clinic input]
12284
str.lower as unicode_lower
12285
12286
Return a copy of the string converted to lowercase.
12287
[clinic start generated code]*/
12288
12289
static PyObject *
unicode_lower_impl(PyObject *self)
/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
{
    /* Implements str.lower().  Non-ASCII strings take the generic case
       conversion path; pure-ASCII strings use the dedicated helper. */
    if (!PyUnicode_IS_ASCII(self)) {
        return case_operation(self, do_lower);
    }
    /* NOTE(review): the flag value 1 presumably selects lowercase in
       ascii_upper_or_lower() -- confirm against its definition. */
    return ascii_upper_or_lower(self, 1);
}
12297
12298
64.4M
/* Strip modes accepted by the strip helpers below.  The numeric values
   double as indexes into stripfuncnames[], so keep both in sync. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names indexed by the strip modes above.  Both the strings and
   the pointer array are constant: this is a read-only lookup table. */
static const char *const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Name of the str method for strip mode `i` (used in error messages). */
#define STRIPNAME(i) (stripfuncnames[i])
12306
12307
/* externally visible for str.strip(unicode) */
12308
/* Strip from `self` the leading and/or trailing characters that occur in
   `sepobj`, according to `striptype` (LEFTSTRIP, RIGHTSTRIP or BOTHSTRIP).
   The remaining range is returned through PyUnicode_Substring(). */
PyObject *
_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
{
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);
    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
    const Py_ssize_t seplen = PyUnicode_GET_LENGTH(sepobj);

    /* The bloom mask rejects most non-members without a full search. */
    const BLOOM_MASK sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
                                               PyUnicode_DATA(sepobj),
                                               seplen);

    /* `first` ends up at the first character to keep. */
    Py_ssize_t first = 0;
    if (striptype != RIGHTSTRIP) {
        for (; first < len; first++) {
            const Py_UCS4 ch = PyUnicode_READ(kind, data, first);
            if (!BLOOM(sepmask, ch)) {
                break;
            }
            if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0) {
                break;
            }
        }
    }

    /* `stop` ends up one past the last character to keep. */
    Py_ssize_t stop = len;
    if (striptype != LEFTSTRIP) {
        while (stop > first) {
            const Py_UCS4 ch = PyUnicode_READ(kind, data, stop - 1);
            if (!BLOOM(sepmask, ch)) {
                break;
            }
            if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0) {
                break;
            }
            stop--;
        }
    }

    return PyUnicode_Substring(self, first, stop);
}
12354
12355
PyObject*
12356
PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12357
240M
{
12358
240M
    const unsigned char *data;
12359
240M
    int kind;
12360
240M
    Py_ssize_t length;
12361
12362
240M
    length = PyUnicode_GET_LENGTH(self);
12363
240M
    end = Py_MIN(end, length);
12364
12365
240M
    if (start == 0 && end == length)
12366
55.5M
        return unicode_result_unchanged(self);
12367
12368
184M
    if (start < 0 || end < 0) {
12369
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
12370
0
        return NULL;
12371
0
    }
12372
184M
    if (start >= length || end < start)
12373
206k
        _Py_RETURN_UNICODE_EMPTY();
12374
12375
184M
    length = end - start;
12376
184M
    if (PyUnicode_IS_ASCII(self)) {
12377
47.8M
        data = PyUnicode_1BYTE_DATA(self);
12378
47.8M
        return _PyUnicode_FromASCII((const char*)(data + start), length);
12379
47.8M
    }
12380
136M
    else {
12381
136M
        kind = PyUnicode_KIND(self);
12382
136M
        data = PyUnicode_1BYTE_DATA(self);
12383
136M
        return PyUnicode_FromKindAndData(kind,
12384
136M
                                         data + kind * start,
12385
136M
                                         length);
12386
136M
    }
12387
184M
}
12388
12389
/* Strip whitespace from the left and/or right end of `self`, as selected
   by `striptype`, and return the remaining range via PyUnicode_Substring().
   ASCII strings use the _Py_ascii_whitespace table; all others use
   Py_UNICODE_ISSPACE(). */
static PyObject *
do_strip(PyObject *self, int striptype)
{
    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
    const int trim_left = (striptype != RIGHTSTRIP);
    const int trim_right = (striptype != LEFTSTRIP);

    Py_ssize_t first = 0;   /* index of the first character kept */
    Py_ssize_t stop = len;  /* one past the last character kept */

    if (PyUnicode_IS_ASCII(self)) {
        const Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(self);

        if (trim_left) {
            for (; first < len; first++) {
                const Py_UCS1 ch = bytes[first];
                if (!_Py_ascii_whitespace[ch]) {
                    break;
                }
            }
        }
        if (trim_right) {
            for (; stop > first; stop--) {
                const Py_UCS1 ch = bytes[stop - 1];
                if (!_Py_ascii_whitespace[ch]) {
                    break;
                }
            }
        }
    }
    else {
        const int kind = PyUnicode_KIND(self);
        const void *data = PyUnicode_DATA(self);

        if (trim_left) {
            for (; first < len; first++) {
                const Py_UCS4 ch = PyUnicode_READ(kind, data, first);
                if (!Py_UNICODE_ISSPACE(ch)) {
                    break;
                }
            }
        }
        if (trim_right) {
            for (; stop > first; stop--) {
                const Py_UCS4 ch = PyUnicode_READ(kind, data, stop - 1);
                if (!Py_UNICODE_ISSPACE(ch)) {
                    break;
                }
            }
        }
    }

    return PyUnicode_Substring(self, first, stop);
}
12450
12451
12452
/* Shared driver for strip()/lstrip()/rstrip().  With `sep` None, strip
   whitespace; with a str, strip the characters it contains; anything else
   raises TypeError naming the calling method. */
static PyObject *
do_argstrip(PyObject *self, int striptype, PyObject *sep)
{
    if (sep == Py_None) {
        return do_strip(self, striptype);
    }

    if (!PyUnicode_Check(sep)) {
        PyErr_Format(PyExc_TypeError,
                     "%s arg must be None or str",
                     STRIPNAME(striptype));
        return NULL;
    }

    return _PyUnicode_XStrip(self, striptype, sep);
}
12468
12469
12470
/*[clinic input]
12471
@permit_long_summary
12472
str.strip as unicode_strip
12473
12474
    chars: object = None
12475
    /
12476
12477
Return a copy of the string with leading and trailing whitespace removed.
12478
12479
If chars is given and not None, remove characters in chars instead.
12480
[clinic start generated code]*/
12481
12482
static PyObject *
unicode_strip_impl(PyObject *self, PyObject *chars)
/*[clinic end generated code: output=ca19018454345d57 input=8bc6353450345fbd]*/
{
    /* Implements str.strip(): strip both ends. */
    PyObject *stripped = do_argstrip(self, BOTHSTRIP, chars);
    return stripped;
}
12488
12489
12490
/*[clinic input]
12491
str.lstrip as unicode_lstrip
12492
12493
    chars: object = None
12494
    /
12495
12496
Return a copy of the string with leading whitespace removed.
12497
12498
If chars is given and not None, remove characters in chars instead.
12499
[clinic start generated code]*/
12500
12501
static PyObject *
unicode_lstrip_impl(PyObject *self, PyObject *chars)
/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
{
    /* Implements str.lstrip(): strip the leading end only. */
    PyObject *stripped = do_argstrip(self, LEFTSTRIP, chars);
    return stripped;
}
12507
12508
12509
/*[clinic input]
12510
str.rstrip as unicode_rstrip
12511
12512
    chars: object = None
12513
    /
12514
12515
Return a copy of the string with trailing whitespace removed.
12516
12517
If chars is given and not None, remove characters in chars instead.
12518
[clinic start generated code]*/
12519
12520
static PyObject *
unicode_rstrip_impl(PyObject *self, PyObject *chars)
/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
{
    /* Implements str.rstrip(): strip the trailing end only. */
    PyObject *stripped = do_argstrip(self, RIGHTSTRIP, chars);
    return stripped;
}
12526
12527
12528
static PyObject*
12529
unicode_repeat(PyObject *str, Py_ssize_t len)
12530
444k
{
12531
444k
    PyObject *u;
12532
444k
    Py_ssize_t nchars, n;
12533
12534
444k
    if (len < 1)
12535
32.0k
        _Py_RETURN_UNICODE_EMPTY();
12536
12537
    /* no repeat, return original string */
12538
411k
    if (len == 1)
12539
128k
        return unicode_result_unchanged(str);
12540
12541
283k
    if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
12542
0
        PyErr_SetString(PyExc_OverflowError,
12543
0
                        "repeated string is too long");
12544
0
        return NULL;
12545
0
    }
12546
283k
    nchars = len * PyUnicode_GET_LENGTH(str);
12547
12548
283k
    u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
12549
283k
    if (!u)
12550
0
        return NULL;
12551
283k
    assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
12552
12553
283k
    if (PyUnicode_GET_LENGTH(str) == 1) {
12554
281k
        int kind = PyUnicode_KIND(str);
12555
281k
        Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
12556
281k
        if (kind == PyUnicode_1BYTE_KIND) {
12557
281k
            void *to = PyUnicode_DATA(u);
12558
281k
            memset(to, (unsigned char)fill_char, len);
12559
281k
        }
12560
0
        else if (kind == PyUnicode_2BYTE_KIND) {
12561
0
            Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
12562
0
            for (n = 0; n < len; ++n)
12563
0
                ucs2[n] = fill_char;
12564
0
        } else {
12565
0
            Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12566
0
            assert(kind == PyUnicode_4BYTE_KIND);
12567
0
            for (n = 0; n < len; ++n)
12568
0
                ucs4[n] = fill_char;
12569
0
        }
12570
281k
    }
12571
2.00k
    else {
12572
2.00k
        Py_ssize_t char_size = PyUnicode_KIND(str);
12573
2.00k
        char *to = (char *) PyUnicode_DATA(u);
12574
2.00k
        _PyBytes_Repeat(to, nchars * char_size, PyUnicode_DATA(str),
12575
2.00k
            PyUnicode_GET_LENGTH(str) * char_size);
12576
2.00k
    }
12577
12578
283k
    assert(_PyUnicode_CheckConsistency(u, 1));
12579
283k
    return u;
12580
283k
}
12581
12582
PyObject *
12583
PyUnicode_Replace(PyObject *str,
12584
                  PyObject *substr,
12585
                  PyObject *replstr,
12586
                  Py_ssize_t maxcount)
12587
2
{
12588
2
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12589
2
            ensure_unicode(replstr) < 0)
12590
0
        return NULL;
12591
2
    return replace(str, substr, replstr, maxcount);
12592
2
}
12593
12594
/*[clinic input]
12595
@permit_long_docstring_body
12596
str.replace as unicode_replace
12597
12598
    old: unicode
12599
    new: unicode
12600
    /
12601
    count: Py_ssize_t = -1
12602
        Maximum number of occurrences to replace.
12603
        -1 (the default value) means replace all occurrences.
12604
12605
Return a copy with all occurrences of substring old replaced by new.
12606
12607
If the optional argument count is given, only the first count occurrences are
12608
replaced.
12609
[clinic start generated code]*/
12610
12611
static PyObject *
12612
unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12613
                     Py_ssize_t count)
12614
/*[clinic end generated code: output=b63f1a8b5eebf448 input=f27ca92ac46b65a1]*/
12615
86.6M
{
12616
86.6M
    return replace(self, old, new, count);
12617
86.6M
}
12618
12619
/*[clinic input]
12620
@permit_long_docstring_body
12621
str.removeprefix as unicode_removeprefix
12622
12623
    prefix: unicode
12624
    /
12625
12626
Return a str with the given prefix string removed if present.
12627
12628
If the string starts with the prefix string, return string[len(prefix):].
12629
Otherwise, return a copy of the original string.
12630
[clinic start generated code]*/
12631
12632
static PyObject *
unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
/*[clinic end generated code: output=f1e5945e9763bcb9 input=1989a856dbb813f1]*/
{
    /* Implements str.removeprefix().  tailmatch() returns -1 on error,
       zero for no match and non-zero for a match. */
    const int match = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);
    if (match == -1) {
        return NULL;
    }
    if (!match) {
        return unicode_result_unchanged(self);
    }

    /* Keep everything after the prefix. */
    const Py_ssize_t prefix_len = PyUnicode_GET_LENGTH(prefix);
    const Py_ssize_t self_len = PyUnicode_GET_LENGTH(self);
    return PyUnicode_Substring(self, prefix_len, self_len);
}
12646
12647
/*[clinic input]
12648
str.removesuffix as unicode_removesuffix
12649
12650
    suffix: unicode
12651
    /
12652
12653
Return a str with the given suffix string removed if present.
12654
12655
If the string ends with the suffix string and that suffix is not empty,
12656
return string[:-len(suffix)]. Otherwise, return a copy of the original
12657
string.
12658
[clinic start generated code]*/
12659
12660
static PyObject *
unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
/*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
{
    /* Implements str.removesuffix().  tailmatch() returns -1 on error,
       zero for no match and non-zero for a match. */
    const int match = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);
    if (match == -1) {
        return NULL;
    }
    if (!match) {
        return unicode_result_unchanged(self);
    }

    /* Keep everything before the suffix. */
    const Py_ssize_t keep = PyUnicode_GET_LENGTH(self)
                            - PyUnicode_GET_LENGTH(suffix);
    return PyUnicode_Substring(self, 0, keep);
}
12674
12675
/* Build the repr() of a str object.

   A first pass over the characters computes the exact output size, counts
   the quote characters and tracks the largest character that will be
   copied verbatim.  The result is then allocated once and either filled by
   a straight copy between two quotes (nothing needs escaping) or handed to
   the kind-specific stringlib repr helper.

   Returns a new reference, or NULL with an exception set: OverflowError
   when the output size would exceed PY_SSIZE_T_MAX, or whatever
   PyUnicode_New() raised on allocation failure. */
static PyObject *
unicode_repr(PyObject *unicode)
{
    Py_ssize_t isize = PyUnicode_GET_LENGTH(unicode);
    const void *idata = PyUnicode_DATA(unicode);

    /* Compute length of output, quote characters, and
       maximum character */
    Py_ssize_t osize = 0;
    Py_UCS4 maxch = 127;
    Py_ssize_t squote = 0;
    Py_ssize_t dquote = 0;
    int ikind = PyUnicode_KIND(unicode);
    for (Py_ssize_t i = 0; i < isize; i++) {
        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
        Py_ssize_t incr = 1;
        switch (ch) {
        case '\'': squote++; break;
        case '"':  dquote++; break;
        case '\\': case '\t': case '\r': case '\n':
            incr = 2;
            break;
        default:
            /* Fast-path ASCII */
            if (ch < ' ' || ch == 0x7f)
                incr = 4; /* \xHH */
            else if (ch < 0x7f)
                ;
            else if (Py_UNICODE_ISPRINTABLE(ch))
                maxch = (ch > maxch) ? ch : maxch;
            else if (ch < 0x100)
                incr = 4; /* \xHH */
            else if (ch < 0x10000)
                incr = 6; /* \uHHHH */
            else
                incr = 10; /* \UHHHHHHHH */
        }
        if (osize > PY_SSIZE_T_MAX - incr) {
            PyErr_SetString(PyExc_OverflowError,
                            "string is too long to generate repr");
            return NULL;
        }
        osize += incr;
    }

    Py_UCS4 quote = '\'';
    int changed = (osize != isize);
    /* Number of single quotes that need a backslash in front of them. */
    Py_ssize_t escaped_quotes = 0;
    if (squote) {
        changed = 1;
        if (dquote)
            /* Both squote and dquote present. Use squote,
               and escape them */
            escaped_quotes = squote;
        else
            quote = '"';
    }

    /* Account for the escaped quotes and the two enclosing quotes.  These
       additions come after the per-character guard above, so check them
       too: signed overflow is undefined behaviour in C. */
    if (osize > PY_SSIZE_T_MAX - 2
        || escaped_quotes > PY_SSIZE_T_MAX - 2 - osize)
    {
        PyErr_SetString(PyExc_OverflowError,
                        "string is too long to generate repr");
        return NULL;
    }
    osize += escaped_quotes + 2;

    PyObject *repr = PyUnicode_New(osize, maxch);
    if (repr == NULL)
        return NULL;
    int okind = PyUnicode_KIND(repr);
    void *odata = PyUnicode_DATA(repr);

    if (!changed) {
        /* Nothing to escape: quote + verbatim copy + quote. */
        PyUnicode_WRITE(okind, odata, 0, quote);

        _PyUnicode_FastCopyCharacters(repr, 1,
                                      unicode, 0,
                                      isize);

        PyUnicode_WRITE(okind, odata, osize-1, quote);
    }
    else {
        /* Dispatch on the width of the output buffer. */
        switch (okind) {
        case PyUnicode_1BYTE_KIND:
            ucs1lib_repr(unicode, quote, odata);
            break;
        case PyUnicode_2BYTE_KIND:
            ucs2lib_repr(unicode, quote, odata);
            break;
        default:
            assert(okind == PyUnicode_4BYTE_KIND);
            ucs4lib_repr(unicode, quote, odata);
        }
    }

    assert(_PyUnicode_CheckConsistency(repr, 1));
    return repr;
}
12765
12766
/*[clinic input]
12767
@permit_long_summary
12768
str.rfind as unicode_rfind = str.count
12769
12770
Return the highest index in S where substring sub is found, such that sub is contained within S[start:end].
12771
12772
Optional arguments start and end are interpreted as in slice notation.
12773
Return -1 on failure.
12774
[clinic start generated code]*/
12775
12776
static Py_ssize_t
12777
unicode_rfind_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
12778
                   Py_ssize_t end)
12779
/*[clinic end generated code: output=880b29f01dd014c8 input=7f7e97d5cd3299a2]*/
12780
10.6k
{
12781
10.6k
    Py_ssize_t result = any_find_slice(str, substr, start, end, -1);
12782
10.6k
    if (result < 0) {
12783
7.13k
        return -1;
12784
7.13k
    }
12785
3.50k
    return result;
12786
10.6k
}
12787
12788
/*[clinic input]
12789
@permit_long_summary
12790
str.rindex as unicode_rindex = str.count
12791
12792
Return the highest index in S where substring sub is found, such that sub is contained within S[start:end].
12793
12794
Optional arguments start and end are interpreted as in slice notation.
12795
Raises ValueError when the substring is not found.
12796
[clinic start generated code]*/
12797
12798
static Py_ssize_t
12799
unicode_rindex_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
12800
                    Py_ssize_t end)
12801
/*[clinic end generated code: output=5f3aef124c867fe1 input=0363a324740b3e62]*/
12802
135k
{
12803
135k
    Py_ssize_t result = any_find_slice(str, substr, start, end, -1);
12804
135k
    if (result == -1) {
12805
0
        PyErr_SetString(PyExc_ValueError, "substring not found");
12806
0
    }
12807
135k
    else if (result < 0) {
12808
0
        return -1;
12809
0
    }
12810
135k
    return result;
12811
135k
}
12812
12813
/*[clinic input]
12814
str.rjust as unicode_rjust
12815
12816
    width: Py_ssize_t
12817
    fillchar: Py_UCS4 = ' '
12818
    /
12819
12820
Return a right-justified string of length width.
12821
12822
Padding is done using the specified fill character (default is a space).
12823
[clinic start generated code]*/
12824
12825
static PyObject *
12826
unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12827
/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
12828
0
{
12829
0
    if (PyUnicode_GET_LENGTH(self) >= width)
12830
0
        return unicode_result_unchanged(self);
12831
12832
0
    return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
12833
0
}
12834
12835
PyObject *
12836
PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12837
0
{
12838
0
    if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
12839
0
        return NULL;
12840
12841
0
    return split(s, sep, maxsplit);
12842
0
}
12843
12844
/*[clinic input]
12845
@permit_long_summary
12846
str.split as unicode_split
12847
12848
    sep: object = None
12849
        The separator used to split the string.
12850
12851
        When set to None (the default value), will split on any whitespace
12852
        character (including \n \r \t \f and spaces) and will discard
12853
        empty strings from the result.
12854
    maxsplit: Py_ssize_t = -1
12855
        Maximum number of splits.
12856
        -1 (the default value) means no limit.
12857
12858
Return a list of the substrings in the string, using sep as the separator string.
12859
12860
Splitting starts at the front of the string and works to the end.
12861
12862
Note, str.split() is mainly useful for data that has been intentionally
12863
delimited.  With natural text that includes punctuation, consider using
12864
the regular expression module.
12865
12866
[clinic start generated code]*/
12867
12868
static PyObject *
12869
unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12870
/*[clinic end generated code: output=3a65b1db356948dc input=2c1fd08a78e038b8]*/
12871
23.7M
{
12872
23.7M
    if (sep == Py_None)
12873
173k
        return split(self, NULL, maxsplit);
12874
23.5M
    if (PyUnicode_Check(sep))
12875
23.5M
        return split(self, sep, maxsplit);
12876
12877
0
    PyErr_Format(PyExc_TypeError,
12878
0
                 "must be str or None, not %.100s",
12879
0
                 Py_TYPE(sep)->tp_name);
12880
0
    return NULL;
12881
23.5M
}
12882
12883
/* Public C API counterpart of str.partition().

   Returns NULL if either argument fails ensure_unicode() or if the
   separator buffer cannot be converted.  When the separator has a larger
   kind or a greater length than the string, the tuple
   (str_obj, "", "") is returned without searching. */
PyObject *
PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
{
    if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0) {
        return NULL;
    }

    const int str_kind = PyUnicode_KIND(str_obj);
    const int sep_kind = PyUnicode_KIND(sep_obj);
    const Py_ssize_t str_len = PyUnicode_GET_LENGTH(str_obj);
    const Py_ssize_t sep_len = PyUnicode_GET_LENGTH(sep_obj);

    if (str_kind < sep_kind || str_len < sep_len) {
        PyObject *empty = _PyUnicode_GetEmpty();  // Borrowed reference
        return PyTuple_Pack(3, str_obj, empty, empty);
    }

    const void *str_buf = PyUnicode_DATA(str_obj);
    const void *sep_buf = PyUnicode_DATA(sep_obj);

    /* Bring the separator to the string's kind; the converted buffer is
       a temporary that is released with PyMem_Free() below. */
    const int converted = (sep_kind != str_kind);
    if (converted) {
        sep_buf = unicode_askind(sep_kind, sep_buf, sep_len, str_kind);
        if (sep_buf == NULL) {
            return NULL;
        }
    }

    PyObject *result;
    switch (str_kind) {
    case PyUnicode_1BYTE_KIND:
        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) {
            result = asciilib_partition(str_obj, str_buf, str_len,
                                        sep_obj, sep_buf, sep_len);
        }
        else {
            result = ucs1lib_partition(str_obj, str_buf, str_len,
                                       sep_obj, sep_buf, sep_len);
        }
        break;
    case PyUnicode_2BYTE_KIND:
        result = ucs2lib_partition(str_obj, str_buf, str_len,
                                   sep_obj, sep_buf, sep_len);
        break;
    case PyUnicode_4BYTE_KIND:
        result = ucs4lib_partition(str_obj, str_buf, str_len,
                                   sep_obj, sep_buf, sep_len);
        break;
    default:
        Py_UNREACHABLE();
    }

    assert((sep_kind == str_kind) == (sep_buf == PyUnicode_DATA(sep_obj)));
    if (converted) {
        PyMem_Free((void *)sep_buf);
    }

    return result;
}
12933
12934
12935
/* Public C API counterpart of str.rpartition().

   Returns NULL if either argument fails ensure_unicode() or if the
   separator buffer cannot be converted.  When the separator has a larger
   kind or a greater length than the string, the tuple
   ("", "", str_obj) is returned without searching. */
PyObject *
PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
{
    if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0) {
        return NULL;
    }

    const int str_kind = PyUnicode_KIND(str_obj);
    const int sep_kind = PyUnicode_KIND(sep_obj);
    const Py_ssize_t str_len = PyUnicode_GET_LENGTH(str_obj);
    const Py_ssize_t sep_len = PyUnicode_GET_LENGTH(sep_obj);

    if (str_kind < sep_kind || str_len < sep_len) {
        PyObject *empty = _PyUnicode_GetEmpty();  // Borrowed reference
        return PyTuple_Pack(3, empty, empty, str_obj);
    }

    const void *str_buf = PyUnicode_DATA(str_obj);
    const void *sep_buf = PyUnicode_DATA(sep_obj);

    /* Bring the separator to the string's kind; the converted buffer is
       a temporary that is released with PyMem_Free() below. */
    const int converted = (sep_kind != str_kind);
    if (converted) {
        sep_buf = unicode_askind(sep_kind, sep_buf, sep_len, str_kind);
        if (sep_buf == NULL) {
            return NULL;
        }
    }

    PyObject *result;
    switch (str_kind) {
    case PyUnicode_1BYTE_KIND:
        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) {
            result = asciilib_rpartition(str_obj, str_buf, str_len,
                                         sep_obj, sep_buf, sep_len);
        }
        else {
            result = ucs1lib_rpartition(str_obj, str_buf, str_len,
                                        sep_obj, sep_buf, sep_len);
        }
        break;
    case PyUnicode_2BYTE_KIND:
        result = ucs2lib_rpartition(str_obj, str_buf, str_len,
                                    sep_obj, sep_buf, sep_len);
        break;
    case PyUnicode_4BYTE_KIND:
        result = ucs4lib_rpartition(str_obj, str_buf, str_len,
                                    sep_obj, sep_buf, sep_len);
        break;
    default:
        Py_UNREACHABLE();
    }

    assert((sep_kind == str_kind) == (sep_buf == PyUnicode_DATA(sep_obj)));
    if (converted) {
        PyMem_Free((void *)sep_buf);
    }

    return result;
}
12985
12986
/*[clinic input]
12987
@permit_long_docstring_body
12988
str.partition as unicode_partition
12989
12990
    sep: object
12991
    /
12992
12993
Partition the string into three parts using the given separator.
12994
12995
This will search for the separator in the string.  If the separator is found,
12996
returns a 3-tuple containing the part before the separator, the separator
12997
itself, and the part after it.
12998
12999
If the separator is not found, returns a 3-tuple containing the original string
13000
and two empty strings.
13001
[clinic start generated code]*/
13002
13003
static PyObject *
unicode_partition(PyObject *self, PyObject *sep)
/*[clinic end generated code: output=e4ced7bd253ca3c4 input=4d854b520d7b0e97]*/
{
    /* str.partition(sep): both objects are handed to PyUnicode_Partition()
       untouched and whatever it produces (a result or NULL) is returned. */
    PyObject *parts = PyUnicode_Partition(self, sep);
    return parts;
}
13009
13010
/*[clinic input]
13011
@permit_long_docstring_body
13012
str.rpartition as unicode_rpartition = str.partition
13013
13014
Partition the string into three parts using the given separator.
13015
13016
This will search for the separator in the string, starting at the end. If
13017
the separator is found, returns a 3-tuple containing the part before the
13018
separator, the separator itself, and the part after it.
13019
13020
If the separator is not found, returns a 3-tuple containing two empty strings
13021
and the original string.
13022
[clinic start generated code]*/
13023
13024
static PyObject *
unicode_rpartition(PyObject *self, PyObject *sep)
/*[clinic end generated code: output=1aa13cf1156572aa input=a6adabe91e75b486]*/
{
    /* str.rpartition(sep): both objects are handed to PyUnicode_RPartition()
       untouched and whatever it produces (a result or NULL) is returned. */
    PyObject *parts = PyUnicode_RPartition(self, sep);
    return parts;
}
13030
13031
PyObject *
13032
PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
13033
0
{
13034
0
    if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
13035
0
        return NULL;
13036
13037
0
    return rsplit(s, sep, maxsplit);
13038
0
}
13039
13040
/*[clinic input]
13041
@permit_long_summary
13042
str.rsplit as unicode_rsplit = str.split
13043
13044
Return a list of the substrings in the string, using sep as the separator string.
13045
13046
Splitting starts at the end of the string and works to the front.
13047
[clinic start generated code]*/
13048
13049
static PyObject *
13050
unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13051
/*[clinic end generated code: output=c2b815c63bcabffc input=0f762e30d267fa83]*/
13052
50
{
13053
50
    if (sep == Py_None)
13054
0
        return rsplit(self, NULL, maxsplit);
13055
50
    if (PyUnicode_Check(sep))
13056
50
        return rsplit(self, sep, maxsplit);
13057
13058
0
    PyErr_Format(PyExc_TypeError,
13059
0
                 "must be str or None, not %.100s",
13060
0
                 Py_TYPE(sep)->tp_name);
13061
0
    return NULL;
13062
50
}
13063
13064
/*[clinic input]
13065
@permit_long_docstring_body
13066
str.splitlines as unicode_splitlines
13067
13068
    keepends: bool = False
13069
13070
Return a list of the lines in the string, breaking at line boundaries.
13071
13072
Line breaks are not included in the resulting list unless keepends is given and
13073
true.
13074
[clinic start generated code]*/
13075
13076
static PyObject *
unicode_splitlines_impl(PyObject *self, int keepends)
/*[clinic end generated code: output=f664dcdad153ec40 input=39eeafbfef61c827]*/
{
    /* str.splitlines(keepends): forwarded unchanged to the public C API. */
    PyObject *lines = PyUnicode_Splitlines(self, keepends);
    return lines;
}
13082
13083
static
13084
PyObject *unicode_str(PyObject *self)
13085
3.54M
{
13086
3.54M
    return unicode_result_unchanged(self);
13087
3.54M
}
13088
13089
/*[clinic input]
13090
@permit_long_summary
13091
str.swapcase as unicode_swapcase
13092
13093
Convert uppercase characters to lowercase and lowercase characters to uppercase.
13094
[clinic start generated code]*/
13095
13096
static PyObject *
unicode_swapcase_impl(PyObject *self)
/*[clinic end generated code: output=5d28966bf6d7b2af input=85bc39a9b4e8ee91]*/
{
    /* str.swapcase(): run the shared case_operation() driver with the
       do_swapcase callback. */
    PyObject *swapped = case_operation(self, do_swapcase);
    return swapped;
}
13102
13103
/*[clinic input]
13104
13105
@staticmethod
13106
str.maketrans as unicode_maketrans
13107
13108
  x: object
13109
13110
  y: unicode=NULL
13111
13112
  z: unicode=NULL
13113
13114
  /
13115
13116
Return a translation table usable for str.translate().
13117
13118
If there is only one argument, it must be a dictionary mapping Unicode
13119
ordinals (integers) or characters to Unicode ordinals, strings or None.
13120
Character keys will be then converted to ordinals.
13121
If there are two arguments, they must be strings of equal length, and
13122
in the resulting dictionary, each character in x will be mapped to the
13123
character at the same position in y. If there is a third argument, it
13124
must be a string, whose characters will be mapped to None in the result.
13125
[clinic start generated code]*/
13126
13127
static PyObject *
unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
{
    /* Build and return a new dict usable as a translation table.

       Two forms are handled:
         - y != NULL: x and y are strings of equal length; each character of
           x is mapped to the character at the same index in y, and every
           character of the optional string z is mapped to None.
         - y == NULL: x must be an exact dict; its entries are copied, with
           one-character string keys replaced by their integer ordinal.

       Returns NULL with an exception set on failure. */
    PyObject *new = NULL, *key, *value;
    Py_ssize_t i = 0;
    int res;

    new = PyDict_New();
    if (!new)
        return NULL;
    if (y != NULL) {
        int x_kind, y_kind, z_kind;
        const void *x_data, *y_data, *z_data;

        /* x must be a string too, of equal length */
        if (!PyUnicode_Check(x)) {
            PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
                            "be a string if there is a second argument");
            goto err;
        }
        if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
            PyErr_SetString(PyExc_ValueError, "the first two maketrans "
                            "arguments must have equal length");
            goto err;
        }
        /* create entries for translating chars in x to those in y */
        x_kind = PyUnicode_KIND(x);
        y_kind = PyUnicode_KIND(y);
        x_data = PyUnicode_DATA(x);
        y_data = PyUnicode_DATA(y);
        for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
            key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
            if (!key)
                goto err;
            value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
            if (!value) {
                Py_DECREF(key);
                goto err;
            }
            res = PyDict_SetItem(new, key, value);
            Py_DECREF(key);
            Py_DECREF(value);
            if (res < 0)
                goto err;
        }
        /* create entries for deleting chars in z */
        if (z != NULL) {
            z_kind = PyUnicode_KIND(z);
            z_data = PyUnicode_DATA(z);
            for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
                key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
                if (!key)
                    goto err;
                res = PyDict_SetItem(new, key, Py_None);
                Py_DECREF(key);
                if (res < 0)
                    goto err;
            }
        }
    } else {
        int kind;
        const void *data;

        /* x must be a dict */
        if (!PyDict_CheckExact(x)) {
            PyErr_SetString(PyExc_TypeError, "if you give only one argument "
                            "to maketrans it must be a dict");
            goto err;
        }
        /* Copy entries into the new dict, converting string keys to int keys.

           PyDict_Next() hands out borrowed references and is not safe against
           concurrent modification of `x` on free-threaded builds, so the
           whole iteration runs inside a critical section on `x`.  The
           critical-section macros open and close a C block, hence errors
           leave the loop with `break` (never `goto`) and are dispatched once
           the section has been closed. */
        res = 0;
        Py_BEGIN_CRITICAL_SECTION(x);
        while (PyDict_Next(x, &i, &key, &value)) {
            if (PyUnicode_Check(key)) {
                /* convert string keys to integer keys */
                PyObject *newkey;
                if (PyUnicode_GET_LENGTH(key) != 1) {
                    PyErr_SetString(PyExc_ValueError, "string keys in translate "
                                    "table must be of length 1");
                    res = -1;
                    break;
                }
                kind = PyUnicode_KIND(key);
                data = PyUnicode_DATA(key);
                newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
                if (!newkey) {
                    res = -1;
                    break;
                }
                res = PyDict_SetItem(new, newkey, value);
                Py_DECREF(newkey);
                if (res < 0)
                    break;
            } else if (PyLong_Check(key)) {
                /* just keep integer keys */
                res = PyDict_SetItem(new, key, value);
                if (res < 0)
                    break;
            } else {
                PyErr_SetString(PyExc_TypeError, "keys in translate table must "
                                "be strings or integers");
                res = -1;
                break;
            }
        }
        Py_END_CRITICAL_SECTION();
        if (res < 0)
            goto err;
    }
    return new;
  err:
    Py_DECREF(new);
    return NULL;
}
13232
13233
/*[clinic input]
13234
@permit_long_docstring_body
13235
str.translate as unicode_translate
13236
13237
    table: object
13238
        Translation table, which must be a mapping of Unicode ordinals to
13239
        Unicode ordinals, strings, or None.
13240
    /
13241
13242
Replace each character in the string using the given translation table.
13243
13244
The table must implement lookup/indexing via __getitem__, for instance a
13245
dictionary or list.  If this operation raises LookupError, the character is
13246
left untouched.  Characters mapped to None are deleted.
13247
[clinic start generated code]*/
13248
13249
static PyObject *
unicode_translate(PyObject *self, PyObject *table)
/*[clinic end generated code: output=3cb448ff2fd96bf3 input=699e5fa0ebf9f5e9]*/
{
    /* str.translate(table): delegate to the charmap translator, always
       passing "ignore" as its errors argument. */
    const char *errors = "ignore";
    return _PyUnicode_TranslateCharmap(self, table, errors);
}
13255
13256
/*[clinic input]
13257
str.upper as unicode_upper
13258
13259
Return a copy of the string converted to uppercase.
13260
[clinic start generated code]*/
13261
13262
static PyObject *
unicode_upper_impl(PyObject *self)
/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
{
    /* Non-ASCII strings go through the generic case_operation() driver. */
    if (!PyUnicode_IS_ASCII(self)) {
        return case_operation(self, do_upper);
    }
    /* ASCII strings use the dedicated helper; the 0 flag presumably selects
       upper-casing rather than lower-casing (confirm against the helper). */
    return ascii_upper_or_lower(self, 0);
}
13270
13271
/*[clinic input]
13272
@permit_long_summary
13273
str.zfill as unicode_zfill
13274
13275
    width: Py_ssize_t
13276
    /
13277
13278
Pad a numeric string with zeros on the left, to fill a field of the given width.
13279
13280
The string is never truncated.
13281
[clinic start generated code]*/
13282
13283
static PyObject *
13284
unicode_zfill_impl(PyObject *self, Py_ssize_t width)
13285
/*[clinic end generated code: output=e13fb6bdf8e3b9df input=25a4ee0ea3e58ce0]*/
13286
0
{
13287
0
    Py_ssize_t fill;
13288
0
    PyObject *u;
13289
0
    int kind;
13290
0
    const void *data;
13291
0
    Py_UCS4 chr;
13292
13293
0
    if (PyUnicode_GET_LENGTH(self) >= width)
13294
0
        return unicode_result_unchanged(self);
13295
13296
0
    fill = width - PyUnicode_GET_LENGTH(self);
13297
13298
0
    u = pad(self, fill, 0, '0');
13299
13300
0
    if (u == NULL)
13301
0
        return NULL;
13302
13303
0
    kind = PyUnicode_KIND(u);
13304
0
    data = PyUnicode_DATA(u);
13305
0
    chr = PyUnicode_READ(kind, data, fill);
13306
13307
0
    if (chr == '+' || chr == '-') {
13308
        /* move sign to beginning of string */
13309
0
        PyUnicode_WRITE(kind, data, 0, chr);
13310
0
        PyUnicode_WRITE(kind, data, fill, '0');
13311
0
    }
13312
13313
0
    assert(_PyUnicode_CheckConsistency(u, 1));
13314
0
    return u;
13315
0
}
13316
13317
/*[clinic input]
13318
@permit_long_summary
13319
@text_signature "($self, prefix[, start[, end]], /)"
13320
str.startswith as unicode_startswith
13321
13322
    prefix as subobj: object
13323
        A string or a tuple of strings to try.
13324
    start: slice_index(accept={int, NoneType}, c_default='0') = None
13325
        Optional start position. Default: start of the string.
13326
    end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
13327
        Optional stop position. Default: end of the string.
13328
    /
13329
13330
Return True if the string starts with the specified prefix, False otherwise.
13331
[clinic start generated code]*/
13332
13333
static PyObject *
13334
unicode_startswith_impl(PyObject *self, PyObject *subobj, Py_ssize_t start,
13335
                        Py_ssize_t end)
13336
/*[clinic end generated code: output=4bd7cfd0803051d4 input=766bdbd33df251dc]*/
13337
86.1M
{
13338
86.1M
    if (PyTuple_Check(subobj)) {
13339
10.4M
        Py_ssize_t i;
13340
37.9M
        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13341
27.5M
            PyObject *substring = PyTuple_GET_ITEM(subobj, i);
13342
27.5M
            if (!PyUnicode_Check(substring)) {
13343
0
                PyErr_Format(PyExc_TypeError,
13344
0
                             "tuple for startswith must only contain str, "
13345
0
                             "not %.100s",
13346
0
                             Py_TYPE(substring)->tp_name);
13347
0
                return NULL;
13348
0
            }
13349
27.5M
            int result = tailmatch(self, substring, start, end, -1);
13350
27.5M
            if (result < 0) {
13351
0
                return NULL;
13352
0
            }
13353
27.5M
            if (result) {
13354
36.5k
                Py_RETURN_TRUE;
13355
36.5k
            }
13356
27.5M
        }
13357
        /* nothing matched */
13358
10.4M
        Py_RETURN_FALSE;
13359
10.4M
    }
13360
75.6M
    if (!PyUnicode_Check(subobj)) {
13361
0
        PyErr_Format(PyExc_TypeError,
13362
0
                     "startswith first arg must be str or "
13363
0
                     "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13364
0
        return NULL;
13365
0
    }
13366
75.6M
    int result = tailmatch(self, subobj, start, end, -1);
13367
75.6M
    if (result < 0) {
13368
0
        return NULL;
13369
0
    }
13370
75.6M
    return PyBool_FromLong(result);
13371
75.6M
}
13372
13373
13374
/*[clinic input]
13375
@permit_long_summary
13376
@text_signature "($self, suffix[, start[, end]], /)"
13377
str.endswith as unicode_endswith
13378
13379
    suffix as subobj: object
13380
        A string or a tuple of strings to try.
13381
    start: slice_index(accept={int, NoneType}, c_default='0') = None
13382
        Optional start position. Default: start of the string.
13383
    end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
13384
        Optional stop position. Default: end of the string.
13385
    /
13386
13387
Return True if the string ends with the specified suffix, False otherwise.
13388
[clinic start generated code]*/
13389
13390
static PyObject *
13391
unicode_endswith_impl(PyObject *self, PyObject *subobj, Py_ssize_t start,
13392
                      Py_ssize_t end)
13393
/*[clinic end generated code: output=cce6f8ceb0102ca9 input=b66bf6d5547ba1aa]*/
13394
13.8M
{
13395
13.8M
    if (PyTuple_Check(subobj)) {
13396
211k
        Py_ssize_t i;
13397
402k
        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13398
369k
            PyObject *substring = PyTuple_GET_ITEM(subobj, i);
13399
369k
            if (!PyUnicode_Check(substring)) {
13400
0
                PyErr_Format(PyExc_TypeError,
13401
0
                             "tuple for endswith must only contain str, "
13402
0
                             "not %.100s",
13403
0
                             Py_TYPE(substring)->tp_name);
13404
0
                return NULL;
13405
0
            }
13406
369k
            int result = tailmatch(self, substring, start, end, +1);
13407
369k
            if (result < 0) {
13408
0
                return NULL;
13409
0
            }
13410
369k
            if (result) {
13411
178k
                Py_RETURN_TRUE;
13412
178k
            }
13413
369k
        }
13414
211k
        Py_RETURN_FALSE;
13415
211k
    }
13416
13.6M
    if (!PyUnicode_Check(subobj)) {
13417
0
        PyErr_Format(PyExc_TypeError,
13418
0
                     "endswith first arg must be str or "
13419
0
                     "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13420
0
        return NULL;
13421
0
    }
13422
13.6M
    int result = tailmatch(self, subobj, start, end, +1);
13423
13.6M
    if (result < 0) {
13424
0
        return NULL;
13425
0
    }
13426
13.6M
    return PyBool_FromLong(result);
13427
13.6M
}
13428
13429
13430
#include "stringlib/unicode_format.h"
13431
13432
PyDoc_STRVAR(format__doc__,
13433
             "format($self, /, *args, **kwargs)\n\
13434
--\n\
13435
\n\
13436
Return a formatted version of the string, using substitutions from args and kwargs.\n\
13437
The substitutions are identified by braces ('{' and '}').");
13438
13439
PyDoc_STRVAR(format_map__doc__,
13440
             "format_map($self, mapping, /)\n\
13441
--\n\
13442
\n\
13443
Return a formatted version of the string, using substitutions from mapping.\n\
13444
The substitutions are identified by braces ('{' and '}').");
13445
13446
/*[clinic input]
13447
str.__format__ as unicode___format__
13448
13449
    format_spec: unicode
13450
    /
13451
13452
Return a formatted version of the string as described by format_spec.
13453
[clinic start generated code]*/
13454
13455
static PyObject *
unicode___format___impl(PyObject *self, PyObject *format_spec)
/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
{
    /* Format `self` into a fresh writer, using the whole of `format_spec`
       (index 0 up to its length) as the specification. */
    _PyUnicodeWriter writer;
    _PyUnicodeWriter_Init(&writer);

    const Py_ssize_t spec_end = PyUnicode_GET_LENGTH(format_spec);
    int status = _PyUnicode_FormatAdvancedWriter(&writer,
                                                 self, format_spec,
                                                 0, spec_end);
    if (status == -1) {
        /* Formatting failed: release the writer's state before bailing. */
        _PyUnicodeWriter_Dealloc(&writer);
        return NULL;
    }
    return _PyUnicodeWriter_Finish(&writer);
}
13472
13473
/*[clinic input]
13474
str.__sizeof__ as unicode_sizeof
13475
13476
Return the size of the string in memory, in bytes.
13477
[clinic start generated code]*/
13478
13479
static PyObject *
unicode_sizeof_impl(PyObject *self)
/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
{
    /* The total is the size of the object header for the string's layout,
       plus the character buffer (length + 1 code units), plus the separate
       UTF-8 buffer when the object owns one. */
    Py_ssize_t total;

    if (PyUnicode_IS_COMPACT_ASCII(self)) {
        /* Compact ASCII: header followed by length + 1 bytes. */
        total = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
    }
    else if (!PyUnicode_IS_COMPACT(self)) {
        /* Two-block object: the base object, and the character block only
           if one is present. */
        total = sizeof(PyUnicodeObject);
        if (_PyUnicode_DATA_ANY(self)) {
            total += (PyUnicode_GET_LENGTH(self) + 1) *
                PyUnicode_KIND(self);
        }
    }
    else {
        /* Compact non-ASCII: header followed by length + 1 units, each
           PyUnicode_KIND(self) bytes wide. */
        total = sizeof(PyCompactUnicodeObject) +
            (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
    }

    if (_PyUnicode_HAS_UTF8_MEMORY(self)) {
        total += PyUnicode_UTF8_LENGTH(self) + 1;
    }

    return PyLong_FromSsize_t(total);
}
13507
13508
/* __getnewargs__(): return a 1-tuple holding the result of
   _PyUnicode_Copy(v), or NULL if the copy could not be made. */
static PyObject *
unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
{
    PyObject *duplicate = _PyUnicode_Copy(v);
    if (duplicate == NULL) {
        return NULL;
    }
    /* "N" hands our reference to `duplicate` over to the new tuple. */
    return Py_BuildValue("(N)", duplicate);
}
13516
13517
/*
13518
This function searches for the longest common leading whitespace
13519
of all lines in the range [src, end).
13520
It returns the length of the common leading whitespace and sets `output` to
13521
point to the beginning of the common leading whitespace if length > 0.
13522
*/
13523
static Py_ssize_t
13524
search_longest_common_leading_whitespace(
13525
    const char *const src,
13526
    const char *const end,
13527
    const char **output)
13528
0
{
13529
    // [_start, _start + _len)
13530
    // describes the current longest common leading whitespace
13531
0
    const char *_start = NULL;
13532
0
    Py_ssize_t _len = 0;
13533
13534
0
    for (const char *iter = src; iter < end; ++iter) {
13535
0
        const char *line_start = iter;
13536
0
        const char *leading_whitespace_end = NULL;
13537
13538
        // scan the whole line
13539
0
        while (iter < end && *iter != '\n') {
13540
0
            if (!leading_whitespace_end && *iter != ' ' && *iter != '\t') {
13541
                /* `iter` points to the first non-whitespace character
13542
                   in this line */
13543
0
                if (iter == line_start) {
13544
                    // some line has no indent, fast exit!
13545
0
                    return 0;
13546
0
                }
13547
0
                leading_whitespace_end = iter;
13548
0
            }
13549
0
            ++iter;
13550
0
        }
13551
13552
        // if this line has all white space, skip it
13553
0
        if (!leading_whitespace_end) {
13554
0
            continue;
13555
0
        }
13556
13557
0
        if (!_start) {
13558
            // update the first leading whitespace
13559
0
            _start = line_start;
13560
0
            _len = leading_whitespace_end - line_start;
13561
0
            assert(_len > 0);
13562
0
        }
13563
0
        else {
13564
            /* We then compare with the current longest leading whitespace.
13565
13566
               [line_start, leading_whitespace_end) is the leading
13567
               whitespace of this line,
13568
13569
               [_start, _start + _len) is the leading whitespace of the
13570
               current longest leading whitespace. */
13571
0
            Py_ssize_t new_len = 0;
13572
0
            const char *_iter = _start, *line_iter = line_start;
13573
13574
0
            while (_iter < _start + _len && line_iter < leading_whitespace_end
13575
0
                   && *_iter == *line_iter)
13576
0
            {
13577
0
                ++_iter;
13578
0
                ++line_iter;
13579
0
                ++new_len;
13580
0
            }
13581
13582
0
            _len = new_len;
13583
0
            if (_len == 0) {
13584
                // No common things now, fast exit!
13585
0
                return 0;
13586
0
            }
13587
0
        }
13588
0
    }
13589
13590
0
    assert(_len >= 0);
13591
0
    if (_len > 0) {
13592
0
        *output = _start;
13593
0
    }
13594
0
    return _len;
13595
0
}
13596
13597
/* Dedent a string.
13598
   Behaviour is expected to be an exact match of `textwrap.dedent`.
13599
   Return a new reference on success, NULL with exception set on error.
13600
   */
13601
PyObject *
13602
_PyUnicode_Dedent(PyObject *unicode)
13603
0
{
13604
0
    Py_ssize_t src_len = 0;
13605
0
    const char *src = PyUnicode_AsUTF8AndSize(unicode, &src_len);
13606
0
    if (!src) {
13607
0
        return NULL;
13608
0
    }
13609
0
    assert(src_len >= 0);
13610
0
    if (src_len == 0) {
13611
0
        return Py_NewRef(unicode);
13612
0
    }
13613
13614
0
    const char *const end = src + src_len;
13615
13616
    // [whitespace_start, whitespace_start + whitespace_len)
13617
    // describes the current longest common leading whitespace
13618
0
    const char *whitespace_start = NULL;
13619
0
    Py_ssize_t whitespace_len = search_longest_common_leading_whitespace(
13620
0
        src, end, &whitespace_start);
13621
13622
0
    if (whitespace_len == 0) {
13623
0
        return Py_NewRef(unicode);
13624
0
    }
13625
13626
    // now we should trigger a dedent
13627
0
    char *dest = PyMem_Malloc(src_len);
13628
0
    if (!dest) {
13629
0
        PyErr_NoMemory();
13630
0
        return NULL;
13631
0
    }
13632
0
    char *dest_iter = dest;
13633
13634
0
    for (const char *iter = src; iter < end; ++iter) {
13635
0
        const char *line_start = iter;
13636
0
        bool in_leading_space = true;
13637
13638
        // iterate over a line to find the end of a line
13639
0
        while (iter < end && *iter != '\n') {
13640
0
            if (in_leading_space && *iter != ' ' && *iter != '\t') {
13641
0
                in_leading_space = false;
13642
0
            }
13643
0
            ++iter;
13644
0
        }
13645
13646
        // invariant: *iter == '\n' or iter == end
13647
0
        bool append_newline = iter < end;
13648
13649
        // if this line has all white space, write '\n' and continue
13650
0
        if (in_leading_space && append_newline) {
13651
0
            *dest_iter++ = '\n';
13652
0
            continue;
13653
0
        }
13654
13655
        /* copy [new_line_start + whitespace_len, iter) to buffer, then
13656
            conditionally append '\n' */
13657
13658
0
        Py_ssize_t new_line_len = iter - line_start - whitespace_len;
13659
0
        assert(new_line_len >= 0);
13660
0
        memcpy(dest_iter, line_start + whitespace_len, new_line_len);
13661
13662
0
        dest_iter += new_line_len;
13663
13664
0
        if (append_newline) {
13665
0
            *dest_iter++ = '\n';
13666
0
        }
13667
0
    }
13668
13669
0
    PyObject *res = PyUnicode_FromStringAndSize(dest, dest_iter - dest);
13670
0
    PyMem_Free(dest);
13671
0
    return res;
13672
0
}
13673
13674
static PyMethodDef unicode_methods[] = {
13675
    UNICODE_ENCODE_METHODDEF
13676
    UNICODE_REPLACE_METHODDEF
13677
    UNICODE_SPLIT_METHODDEF
13678
    UNICODE_RSPLIT_METHODDEF
13679
    UNICODE_JOIN_METHODDEF
13680
    UNICODE_CAPITALIZE_METHODDEF
13681
    UNICODE_CASEFOLD_METHODDEF
13682
    UNICODE_TITLE_METHODDEF
13683
    UNICODE_CENTER_METHODDEF
13684
    UNICODE_COUNT_METHODDEF
13685
    UNICODE_EXPANDTABS_METHODDEF
13686
    UNICODE_FIND_METHODDEF
13687
    UNICODE_PARTITION_METHODDEF
13688
    UNICODE_INDEX_METHODDEF
13689
    UNICODE_LJUST_METHODDEF
13690
    UNICODE_LOWER_METHODDEF
13691
    UNICODE_LSTRIP_METHODDEF
13692
    UNICODE_RFIND_METHODDEF
13693
    UNICODE_RINDEX_METHODDEF
13694
    UNICODE_RJUST_METHODDEF
13695
    UNICODE_RSTRIP_METHODDEF
13696
    UNICODE_RPARTITION_METHODDEF
13697
    UNICODE_SPLITLINES_METHODDEF
13698
    UNICODE_STRIP_METHODDEF
13699
    UNICODE_SWAPCASE_METHODDEF
13700
    UNICODE_TRANSLATE_METHODDEF
13701
    UNICODE_UPPER_METHODDEF
13702
    UNICODE_STARTSWITH_METHODDEF
13703
    UNICODE_ENDSWITH_METHODDEF
13704
    UNICODE_REMOVEPREFIX_METHODDEF
13705
    UNICODE_REMOVESUFFIX_METHODDEF
13706
    UNICODE_ISASCII_METHODDEF
13707
    UNICODE_ISLOWER_METHODDEF
13708
    UNICODE_ISUPPER_METHODDEF
13709
    UNICODE_ISTITLE_METHODDEF
13710
    UNICODE_ISSPACE_METHODDEF
13711
    UNICODE_ISDECIMAL_METHODDEF
13712
    UNICODE_ISDIGIT_METHODDEF
13713
    UNICODE_ISNUMERIC_METHODDEF
13714
    UNICODE_ISALPHA_METHODDEF
13715
    UNICODE_ISALNUM_METHODDEF
13716
    UNICODE_ISIDENTIFIER_METHODDEF
13717
    UNICODE_ISPRINTABLE_METHODDEF
13718
    UNICODE_ZFILL_METHODDEF
13719
    {"format", _PyCFunction_CAST(do_string_format), METH_VARARGS | METH_KEYWORDS, format__doc__},
13720
    {"format_map", do_string_format_map, METH_O, format_map__doc__},
13721
    UNICODE___FORMAT___METHODDEF
13722
    UNICODE_MAKETRANS_METHODDEF
13723
    UNICODE_SIZEOF_METHODDEF
13724
    {"__getnewargs__",  unicode_getnewargs, METH_NOARGS},
13725
    {NULL, NULL}
13726
};
13727
13728
/* nb_remainder slot: `v % w`.  Formatting is only attempted when the left
   operand is a str; otherwise NotImplemented is returned. */
static PyObject *
unicode_mod(PyObject *v, PyObject *w)
{
    if (PyUnicode_Check(v)) {
        return PyUnicode_Format(v, w);
    }
    Py_RETURN_NOTIMPLEMENTED;
}
13735
13736
static PyNumberMethods unicode_as_number = {
13737
    0,              /*nb_add*/
13738
    0,              /*nb_subtract*/
13739
    0,              /*nb_multiply*/
13740
    unicode_mod,            /*nb_remainder*/
13741
};
13742
13743
static PySequenceMethods unicode_as_sequence = {
13744
    unicode_length,     /* sq_length */
13745
    PyUnicode_Concat,   /* sq_concat */
13746
    unicode_repeat,     /* sq_repeat */
13747
    unicode_getitem,    /* sq_item */
13748
    0,                  /* sq_slice */
13749
    0,                  /* sq_ass_item */
13750
    0,                  /* sq_ass_slice */
13751
    PyUnicode_Contains, /* sq_contains */
13752
};
13753
13754
/* mp_subscript slot: `self[item]` where `item` is an index or a slice.
   Any other kind of `item` raises TypeError. */
static PyObject*
unicode_subscript(PyObject* self, PyObject* item)
{
    if (_PyIndex_Check(item)) {
        Py_ssize_t index = PyNumber_AsSsize_t(item, PyExc_IndexError);
        if (index == -1 && PyErr_Occurred()) {
            return NULL;
        }
        if (index < 0) {
            /* negative indices are taken relative to the end */
            index += PyUnicode_GET_LENGTH(self);
        }
        return unicode_getitem(self, index);
    }

    if (!PySlice_Check(item)) {
        PyErr_Format(PyExc_TypeError, "string indices must be integers, not '%.200s'",
                     Py_TYPE(item)->tp_name);
        return NULL;
    }

    Py_ssize_t start, stop, step;
    if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
        return NULL;
    }
    Py_ssize_t count = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
                                             &start, &stop, step);

    if (count <= 0) {
        _Py_RETURN_UNICODE_EMPTY();
    }
    if (step == 1) {
        /* Contiguous slice: either the whole string or a plain substring. */
        if (start == 0 && count == PyUnicode_GET_LENGTH(self)) {
            return unicode_result_unchanged(self);
        }
        return PyUnicode_Substring(self, start, start + count);
    }

    /* General case: extended slice with step != 1. */
    int src_kind = PyUnicode_KIND(self);
    const void *src_data = PyUnicode_DATA(self);

    /* The source position is kept in a size_t, as in the rest of this
       function's history: the final `pos += step` after the last character
       then wraps instead of overflowing a signed integer. */
    size_t pos;
    Py_ssize_t n;

    /* First pass: find the widest character that will be copied, so the
       result can be allocated with the right maximum character. */
    Py_UCS4 max_char;
    if (PyUnicode_IS_ASCII(self)) {
        max_char = 127;
    }
    else {
        Py_UCS4 kind_limit = kind_maxchar_limit(src_kind);
        max_char = 0;
        for (pos = start, n = 0; n < count; pos += step, n++) {
            Py_UCS4 seen = PyUnicode_READ(src_kind, src_data, pos);
            if (seen <= max_char) {
                continue;
            }
            max_char = seen;
            if (max_char >= kind_limit) {
                /* cannot get any wider for this kind; stop scanning */
                break;
            }
        }
    }

    PyObject *result = PyUnicode_New(count, max_char);
    if (result == NULL) {
        return NULL;
    }
    int dest_kind = PyUnicode_KIND(result);
    void *dest_data = PyUnicode_DATA(result);

    /* Second pass: copy the selected characters. */
    for (pos = start, n = 0; n < count; pos += step, n++) {
        Py_UCS4 picked = PyUnicode_READ(src_kind, src_data, pos);
        PyUnicode_WRITE(dest_kind, dest_data, n, picked);
    }
    assert(_PyUnicode_CheckConsistency(result, 1));
    return result;
}
13823
13824
static PyMappingMethods unicode_as_mapping = {
13825
    unicode_length,     /* mp_length */
13826
    unicode_subscript,  /* mp_subscript */
13827
    0,                  /* mp_ass_subscript */
13828
};
13829
13830
13831
static PyObject *
13832
unicode_subtype_new(PyTypeObject *type, PyObject *unicode);
13833
13834
/*[clinic input]
13835
@classmethod
13836
str.__new__ as unicode_new
13837
13838
    object as x: object = NULL
13839
    encoding: str = NULL
13840
    errors: str = NULL
13841
13842
[clinic start generated code]*/
13843
13844
static PyObject *
13845
unicode_new_impl(PyTypeObject *type, PyObject *x, const char *encoding,
13846
                 const char *errors)
13847
/*[clinic end generated code: output=fc72d4878b0b57e9 input=e81255e5676d174e]*/
13848
11.3M
{
13849
11.3M
    PyObject *unicode;
13850
11.3M
    if (x == NULL) {
13851
0
        unicode = _PyUnicode_GetEmpty();
13852
0
    }
13853
11.3M
    else if (encoding == NULL && errors == NULL) {
13854
11.3M
        unicode = PyObject_Str(x);
13855
11.3M
    }
13856
0
    else {
13857
0
        unicode = PyUnicode_FromEncodedObject(x, encoding, errors);
13858
0
    }
13859
13860
11.3M
    if (unicode != NULL && type != &PyUnicode_Type) {
13861
11.3M
        Py_SETREF(unicode, unicode_subtype_new(type, unicode));
13862
11.3M
    }
13863
11.3M
    return unicode;
13864
11.3M
}
13865
13866
static const char *
13867
arg_as_utf8(PyObject *obj, const char *name)
13868
903k
{
13869
903k
    if (!PyUnicode_Check(obj)) {
13870
0
        PyErr_Format(PyExc_TypeError,
13871
0
                     "str() argument '%s' must be str, not %T",
13872
0
                     name, obj);
13873
0
        return NULL;
13874
0
    }
13875
903k
    return _PyUnicode_AsUTF8NoNUL(obj);
13876
903k
}
13877
13878
static PyObject *
13879
unicode_vectorcall(PyObject *type, PyObject *const *args,
13880
                   size_t nargsf, PyObject *kwnames)
13881
633k
{
13882
633k
    assert(Py_Is(_PyType_CAST(type), &PyUnicode_Type));
13883
13884
633k
    Py_ssize_t nargs = PyVectorcall_NARGS(nargsf);
13885
633k
    if (kwnames != NULL && PyTuple_GET_SIZE(kwnames) != 0) {
13886
        // Fallback to unicode_new()
13887
0
        PyObject *tuple = PyTuple_FromArray(args, nargs);
13888
0
        if (tuple == NULL) {
13889
0
            return NULL;
13890
0
        }
13891
0
        PyObject *dict = _PyStack_AsDict(args + nargs, kwnames);
13892
0
        if (dict == NULL) {
13893
0
            Py_DECREF(tuple);
13894
0
            return NULL;
13895
0
        }
13896
0
        PyObject *ret = unicode_new(_PyType_CAST(type), tuple, dict);
13897
0
        Py_DECREF(tuple);
13898
0
        Py_DECREF(dict);
13899
0
        return ret;
13900
0
    }
13901
633k
    if (!_PyArg_CheckPositional("str", nargs, 0, 3)) {
13902
0
        return NULL;
13903
0
    }
13904
633k
    if (nargs == 0) {
13905
0
        return _PyUnicode_GetEmpty();
13906
0
    }
13907
633k
    PyObject *object = args[0];
13908
633k
    if (nargs == 1) {
13909
426
        return PyObject_Str(object);
13910
426
    }
13911
632k
    const char *encoding = arg_as_utf8(args[1], "encoding");
13912
632k
    if (encoding == NULL) {
13913
155
        return NULL;
13914
155
    }
13915
632k
    const char *errors = NULL;
13916
632k
    if (nargs == 3) {
13917
270k
        errors = arg_as_utf8(args[2], "errors");
13918
270k
        if (errors == NULL) {
13919
0
            return NULL;
13920
0
        }
13921
270k
    }
13922
632k
    return PyUnicode_FromEncodedObject(object, encoding, errors);
13923
632k
}
13924
13925
static PyObject *
13926
unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
13927
11.3M
{
13928
11.3M
    PyObject *self;
13929
11.3M
    Py_ssize_t length, char_size;
13930
11.3M
    int share_utf8;
13931
11.3M
    int kind;
13932
11.3M
    void *data;
13933
13934
11.3M
    assert(PyType_IsSubtype(type, &PyUnicode_Type));
13935
11.3M
    assert(_PyUnicode_CHECK(unicode));
13936
13937
11.3M
    self = type->tp_alloc(type, 0);
13938
11.3M
    if (self == NULL) {
13939
0
        return NULL;
13940
0
    }
13941
11.3M
    kind = PyUnicode_KIND(unicode);
13942
11.3M
    length = PyUnicode_GET_LENGTH(unicode);
13943
13944
11.3M
    _PyUnicode_LENGTH(self) = length;
13945
#ifdef Py_DEBUG
13946
    _PyUnicode_HASH(self) = -1;
13947
#else
13948
11.3M
    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13949
11.3M
#endif
13950
11.3M
    _PyUnicode_STATE(self).interned = 0;
13951
11.3M
    _PyUnicode_STATE(self).kind = kind;
13952
11.3M
    _PyUnicode_STATE(self).compact = 0;
13953
11.3M
    _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
13954
11.3M
    _PyUnicode_STATE(self).statically_allocated = 0;
13955
11.3M
    PyUnicode_SET_UTF8_LENGTH(self, 0);
13956
11.3M
    PyUnicode_SET_UTF8(self, NULL);
13957
11.3M
    _PyUnicode_DATA_ANY(self) = NULL;
13958
13959
11.3M
    share_utf8 = 0;
13960
11.3M
    if (kind == PyUnicode_1BYTE_KIND) {
13961
10.0M
        char_size = 1;
13962
10.0M
        if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13963
10.0M
            share_utf8 = 1;
13964
10.0M
    }
13965
1.30M
    else if (kind == PyUnicode_2BYTE_KIND) {
13966
1.25M
        char_size = 2;
13967
1.25M
    }
13968
52.3k
    else {
13969
52.3k
        assert(kind == PyUnicode_4BYTE_KIND);
13970
52.3k
        char_size = 4;
13971
52.3k
    }
13972
13973
    /* Ensure we won't overflow the length. */
13974
11.3M
    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13975
0
        PyErr_NoMemory();
13976
0
        goto onError;
13977
0
    }
13978
11.3M
    data = PyMem_Malloc((length + 1) * char_size);
13979
11.3M
    if (data == NULL) {
13980
0
        PyErr_NoMemory();
13981
0
        goto onError;
13982
0
    }
13983
13984
11.3M
    _PyUnicode_DATA_ANY(self) = data;
13985
11.3M
    if (share_utf8) {
13986
10.0M
        PyUnicode_SET_UTF8_LENGTH(self, length);
13987
10.0M
        PyUnicode_SET_UTF8(self, data);
13988
10.0M
    }
13989
13990
11.3M
    memcpy(data, PyUnicode_DATA(unicode), kind * (length + 1));
13991
11.3M
    assert(_PyUnicode_CheckConsistency(self, 1));
13992
#ifdef Py_DEBUG
13993
    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13994
#endif
13995
11.3M
    return self;
13996
13997
0
onError:
13998
0
    Py_DECREF(self);
13999
0
    return NULL;
14000
11.3M
}
14001
14002
void
14003
_PyUnicode_ExactDealloc(PyObject *op)
14004
90.2M
{
14005
90.2M
    assert(PyUnicode_CheckExact(op));
14006
90.2M
    unicode_dealloc(op);
14007
90.2M
}
14008
14009
PyDoc_STRVAR(unicode_doc,
14010
"str(object='') -> str\n\
14011
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
14012
\n\
14013
Create a new string object from the given object. If encoding or\n\
14014
errors is specified, then the object must expose a data buffer\n\
14015
that will be decoded using the given encoding and error handler.\n\
14016
Otherwise, returns the result of object.__str__() (if defined)\n\
14017
or repr(object).\n\
14018
encoding defaults to 'utf-8'.\n\
14019
errors defaults to 'strict'.");
14020
14021
static PyObject *unicode_iter(PyObject *seq);
14022
14023
PyTypeObject PyUnicode_Type = {
14024
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
14025
    "str",                        /* tp_name */
14026
    sizeof(PyUnicodeObject),      /* tp_basicsize */
14027
    0,                            /* tp_itemsize */
14028
    /* Slots */
14029
    unicode_dealloc,              /* tp_dealloc */
14030
    0,                            /* tp_vectorcall_offset */
14031
    0,                            /* tp_getattr */
14032
    0,                            /* tp_setattr */
14033
    0,                            /* tp_as_async */
14034
    unicode_repr,                 /* tp_repr */
14035
    &unicode_as_number,           /* tp_as_number */
14036
    &unicode_as_sequence,         /* tp_as_sequence */
14037
    &unicode_as_mapping,          /* tp_as_mapping */
14038
    unicode_hash,                 /* tp_hash*/
14039
    0,                            /* tp_call*/
14040
    unicode_str,                  /* tp_str */
14041
    PyObject_GenericGetAttr,      /* tp_getattro */
14042
    0,                            /* tp_setattro */
14043
    0,                            /* tp_as_buffer */
14044
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
14045
        Py_TPFLAGS_UNICODE_SUBCLASS |
14046
        _Py_TPFLAGS_MATCH_SELF, /* tp_flags */
14047
    unicode_doc,                  /* tp_doc */
14048
    0,                            /* tp_traverse */
14049
    0,                            /* tp_clear */
14050
    PyUnicode_RichCompare,        /* tp_richcompare */
14051
    0,                            /* tp_weaklistoffset */
14052
    unicode_iter,                 /* tp_iter */
14053
    0,                            /* tp_iternext */
14054
    unicode_methods,              /* tp_methods */
14055
    0,                            /* tp_members */
14056
    0,                            /* tp_getset */
14057
    0,                            /* tp_base */
14058
    0,                            /* tp_dict */
14059
    0,                            /* tp_descr_get */
14060
    0,                            /* tp_descr_set */
14061
    0,                            /* tp_dictoffset */
14062
    0,                            /* tp_init */
14063
    0,                            /* tp_alloc */
14064
    unicode_new,                  /* tp_new */
14065
    PyObject_Free,                /* tp_free */
14066
    .tp_vectorcall = unicode_vectorcall,
14067
};
14068
14069
/* Initialize the Unicode implementation */
14070
14071
static void
14072
_init_global_state(void)
14073
16
{
14074
16
    static int initialized = 0;
14075
16
    if (initialized) {
14076
0
        return;
14077
0
    }
14078
16
    initialized = 1;
14079
14080
    /* initialize the linebreak bloom filter */
14081
16
    const Py_UCS2 linebreak[] = {
14082
16
        0x000A, /* LINE FEED */
14083
16
        0x000D, /* CARRIAGE RETURN */
14084
16
        0x001C, /* FILE SEPARATOR */
14085
16
        0x001D, /* GROUP SEPARATOR */
14086
16
        0x001E, /* RECORD SEPARATOR */
14087
16
        0x0085, /* NEXT LINE */
14088
16
        0x2028, /* LINE SEPARATOR */
14089
16
        0x2029, /* PARAGRAPH SEPARATOR */
14090
16
    };
14091
16
    bloom_linebreak = make_bloom_mask(
14092
16
        PyUnicode_2BYTE_KIND, linebreak,
14093
16
        Py_ARRAY_LENGTH(linebreak));
14094
16
}
14095
14096
void
14097
_PyUnicode_InitState(PyInterpreterState *interp)
14098
16
{
14099
16
    if (!_Py_IsMainInterpreter(interp)) {
14100
0
        return;
14101
0
    }
14102
16
    _init_global_state();
14103
16
}
14104
14105
14106
PyStatus
14107
_PyUnicode_InitGlobalObjects(PyInterpreterState *interp)
14108
16
{
14109
16
    if (_Py_IsMainInterpreter(interp)) {
14110
16
        PyStatus status = init_global_interned_strings(interp);
14111
16
        if (_PyStatus_EXCEPTION(status)) {
14112
0
            return status;
14113
0
        }
14114
16
    }
14115
16
    assert(INTERNED_STRINGS);
14116
14117
16
    if (init_interned_dict(interp)) {
14118
0
        PyErr_Clear();
14119
0
        return _PyStatus_ERR("failed to create interned dict");
14120
0
    }
14121
14122
16
    return _PyStatus_OK();
14123
16
}
14124
14125
14126
PyStatus
14127
_PyUnicode_InitTypes(PyInterpreterState *interp)
14128
16
{
14129
16
    if (_PyStaticType_InitBuiltin(interp, &EncodingMapType) < 0) {
14130
0
        goto error;
14131
0
    }
14132
16
    if (_PyStaticType_InitBuiltin(interp, &PyFieldNameIter_Type) < 0) {
14133
0
        goto error;
14134
0
    }
14135
16
    if (_PyStaticType_InitBuiltin(interp, &PyFormatterIter_Type) < 0) {
14136
0
        goto error;
14137
0
    }
14138
16
    return _PyStatus_OK();
14139
14140
0
error:
14141
0
    return _PyStatus_ERR("Can't initialize unicode types");
14142
16
}
14143
14144
static /* non-null */ PyObject*
14145
intern_static(PyInterpreterState *interp, PyObject *s /* stolen */)
14146
17.2k
{
14147
    // Note that this steals a reference to `s`, but in many cases that
14148
    // stolen ref is returned, requiring no decref/incref.
14149
14150
17.2k
    assert(s != NULL);
14151
17.2k
    assert(_PyUnicode_CHECK(s));
14152
17.2k
    assert(_PyUnicode_STATE(s).statically_allocated);
14153
17.2k
    assert(!PyUnicode_CHECK_INTERNED(s));
14154
14155
#ifdef Py_DEBUG
14156
    /* We must not add process-global interned string if there's already a
14157
     * per-interpreter interned_dict, which might contain duplicates.
14158
     */
14159
    PyObject *interned = get_interned_dict(interp);
14160
    assert(interned == NULL);
14161
#endif
14162
14163
    /* Look in the global cache first. */
14164
17.2k
    PyObject *r = (PyObject *)_Py_hashtable_get(INTERNED_STRINGS, s);
14165
    /* We should only init each string once */
14166
17.2k
    assert(r == NULL);
14167
    /* but just in case (for the non-debug build), handle this */
14168
17.2k
    if (r != NULL && r != s) {
14169
0
        assert(_PyUnicode_STATE(r).interned == SSTATE_INTERNED_IMMORTAL_STATIC);
14170
0
        assert(_PyUnicode_CHECK(r));
14171
0
        Py_DECREF(s);
14172
0
        return Py_NewRef(r);
14173
0
    }
14174
14175
17.2k
    if (_Py_hashtable_set(INTERNED_STRINGS, s, s) < -1) {
14176
0
        Py_FatalError("failed to intern static string");
14177
0
    }
14178
14179
17.2k
    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_IMMORTAL_STATIC;
14180
17.2k
    return s;
14181
17.2k
}
14182
14183
void
14184
_PyUnicode_InternStatic(PyInterpreterState *interp, PyObject **p)
14185
17.2k
{
14186
    // This should only be called as part of runtime initialization
14187
17.2k
    assert(!Py_IsInitialized());
14188
14189
17.2k
    *p = intern_static(interp, *p);
14190
17.2k
    assert(*p);
14191
17.2k
}
14192
14193
static void
14194
immortalize_interned(PyObject *s)
14195
99.8k
{
14196
99.8k
    assert(PyUnicode_CHECK_INTERNED(s) == SSTATE_INTERNED_MORTAL);
14197
99.8k
    assert(!_Py_IsImmortal(s));
14198
#ifdef Py_REF_DEBUG
14199
    /* The reference count value should be excluded from the RefTotal.
14200
       The decrements to these objects will not be registered so they
14201
       need to be accounted for in here. */
14202
    for (Py_ssize_t i = 0; i < Py_REFCNT(s); i++) {
14203
        _Py_DecRefTotal(_PyThreadState_GET());
14204
    }
14205
#endif
14206
99.8k
    FT_ATOMIC_STORE_UINT8_RELAXED(_PyUnicode_STATE(s).interned, SSTATE_INTERNED_IMMORTAL);
14207
99.8k
    _Py_SetImmortal(s);
14208
99.8k
}
14209
14210
static /* non-null */ PyObject*
14211
intern_common(PyInterpreterState *interp, PyObject *s /* stolen */,
14212
              bool immortalize)
14213
39.2M
{
14214
    // Note that this steals a reference to `s`, but in many cases that
14215
    // stolen ref is returned, requiring no decref/incref.
14216
14217
#ifdef Py_DEBUG
14218
    assert(s != NULL);
14219
    assert(_PyUnicode_CHECK(s));
14220
#else
14221
39.2M
    if (s == NULL || !PyUnicode_Check(s)) {
14222
0
        return s;
14223
0
    }
14224
39.2M
#endif
14225
14226
    /* If it's a subclass, we don't really know what putting
14227
       it in the interned dict might do. */
14228
39.2M
    if (!PyUnicode_CheckExact(s)) {
14229
0
        return s;
14230
0
    }
14231
14232
    /* Is it already interned? */
14233
39.2M
    switch (PyUnicode_CHECK_INTERNED(s)) {
14234
2.83M
        case SSTATE_NOT_INTERNED:
14235
            // no, go on
14236
2.83M
            break;
14237
19.6k
        case SSTATE_INTERNED_MORTAL:
14238
            // yes but we might need to make it immortal
14239
19.6k
            if (immortalize) {
14240
5.63k
                immortalize_interned(s);
14241
5.63k
            }
14242
19.6k
            return s;
14243
36.3M
        default:
14244
            // all done
14245
36.3M
            return s;
14246
39.2M
    }
14247
14248
    /* Statically allocated strings must be already interned. */
14249
39.2M
    assert(!_PyUnicode_STATE(s).statically_allocated);
14250
14251
#if Py_GIL_DISABLED
14252
    /* In the free-threaded build, all interned strings are immortal */
14253
    immortalize = 1;
14254
#endif
14255
14256
    /* If it's already immortal, intern it as such */
14257
2.83M
    if (_Py_IsImmortal(s)) {
14258
0
        immortalize = 1;
14259
0
    }
14260
14261
    /* if it's a short string, get the singleton */
14262
2.83M
    if (PyUnicode_GET_LENGTH(s) == 1 &&
14263
22.1k
                PyUnicode_KIND(s) == PyUnicode_1BYTE_KIND) {
14264
0
        PyObject *r = LATIN1(*(unsigned char*)PyUnicode_DATA(s));
14265
0
        assert(PyUnicode_CHECK_INTERNED(r));
14266
0
        Py_DECREF(s);
14267
0
        return r;
14268
0
    }
14269
#ifdef Py_DEBUG
14270
    assert(!unicode_is_singleton(s));
14271
#endif
14272
14273
    /* Look in the global cache now. */
14274
2.83M
    {
14275
2.83M
        PyObject *r = (PyObject *)_Py_hashtable_get(INTERNED_STRINGS, s);
14276
2.83M
        if (r != NULL) {
14277
288k
            assert(_PyUnicode_STATE(r).statically_allocated);
14278
288k
            assert(r != s);  // r must be statically_allocated; s is not
14279
288k
            Py_DECREF(s);
14280
288k
            return Py_NewRef(r);
14281
288k
        }
14282
2.83M
    }
14283
14284
    /* Do a setdefault on the per-interpreter cache. */
14285
2.54M
    PyObject *interned = get_interned_dict(interp);
14286
2.54M
    assert(interned != NULL);
14287
#ifdef Py_GIL_DISABLED
14288
#  define INTERN_MUTEX &_Py_INTERP_CACHED_OBJECT(interp, interned_mutex)
14289
#endif
14290
2.54M
    FT_MUTEX_LOCK(INTERN_MUTEX);
14291
2.54M
    PyObject *t;
14292
2.54M
    {
14293
2.54M
        int res = PyDict_SetDefaultRef(interned, s, s, &t);
14294
2.54M
        if (res < 0) {
14295
0
            PyErr_Clear();
14296
0
            FT_MUTEX_UNLOCK(INTERN_MUTEX);
14297
0
            return s;
14298
0
        }
14299
2.54M
        else if (res == 1) {
14300
            // value was already present (not inserted)
14301
1.97M
            Py_DECREF(s);
14302
1.97M
            if (immortalize &&
14303
595k
                    PyUnicode_CHECK_INTERNED(t) == SSTATE_INTERNED_MORTAL) {
14304
4.24k
                immortalize_interned(t);
14305
4.24k
            }
14306
1.97M
            FT_MUTEX_UNLOCK(INTERN_MUTEX);
14307
1.97M
            return t;
14308
1.97M
        }
14309
577k
        else {
14310
            // value was newly inserted
14311
577k
            assert (s == t);
14312
577k
            Py_DECREF(t);
14313
577k
        }
14314
2.54M
    }
14315
14316
    /* NOT_INTERNED -> INTERNED_MORTAL */
14317
14318
2.54M
    assert(_PyUnicode_STATE(s).interned == SSTATE_NOT_INTERNED);
14319
14320
577k
    if (!_Py_IsImmortal(s)) {
14321
        /* The two references in interned dict (key and value) are not counted.
14322
        unicode_dealloc() and _PyUnicode_ClearInterned() take care of this. */
14323
577k
        Py_DECREF(s);
14324
577k
        Py_DECREF(s);
14325
577k
    }
14326
577k
    FT_ATOMIC_STORE_UINT8_RELAXED(_PyUnicode_STATE(s).interned, SSTATE_INTERNED_MORTAL);
14327
14328
    /* INTERNED_MORTAL -> INTERNED_IMMORTAL (if needed) */
14329
14330
#ifdef Py_DEBUG
14331
    if (_Py_IsImmortal(s)) {
14332
        assert(immortalize);
14333
    }
14334
#endif
14335
577k
    if (immortalize) {
14336
90.0k
        immortalize_interned(s);
14337
90.0k
    }
14338
14339
577k
    FT_MUTEX_UNLOCK(INTERN_MUTEX);
14340
577k
    return s;
14341
2.54M
}
14342
14343
void
14344
_PyUnicode_InternImmortal(PyInterpreterState *interp, PyObject **p)
14345
2.65M
{
14346
2.65M
    *p = intern_common(interp, *p, 1);
14347
2.65M
    assert(*p);
14348
2.65M
}
14349
14350
void
14351
_PyUnicode_InternMortal(PyInterpreterState *interp, PyObject **p)
14352
36.5M
{
14353
36.5M
    *p = intern_common(interp, *p, 0);
14354
36.5M
    assert(*p);
14355
36.5M
}
14356
14357
14358
void
14359
_PyUnicode_InternInPlace(PyInterpreterState *interp, PyObject **p)
14360
0
{
14361
0
    _PyUnicode_InternImmortal(interp, p);
14362
0
    return;
14363
0
}
14364
14365
void
14366
PyUnicode_InternInPlace(PyObject **p)
14367
0
{
14368
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
14369
0
    _PyUnicode_InternMortal(interp, p);
14370
0
}
14371
14372
// Public-looking name kept for the stable ABI; user should not call this:
14373
PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
14374
void
14375
PyUnicode_InternImmortal(PyObject **p)
14376
0
{
14377
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
14378
0
    _PyUnicode_InternImmortal(interp, p);
14379
0
}
14380
14381
PyObject *
14382
PyUnicode_InternFromString(const char *cp)
14383
956k
{
14384
956k
    PyObject *s = PyUnicode_FromString(cp);
14385
956k
    if (s == NULL) {
14386
0
        return NULL;
14387
0
    }
14388
956k
    PyInterpreterState *interp = _PyInterpreterState_GET();
14389
956k
    _PyUnicode_InternMortal(interp, &s);
14390
956k
    return s;
14391
956k
}
14392
14393
14394
void
14395
_PyUnicode_ClearInterned(PyInterpreterState *interp)
14396
0
{
14397
0
    PyObject *interned = get_interned_dict(interp);
14398
0
    if (interned == NULL) {
14399
0
        return;
14400
0
    }
14401
0
    assert(PyDict_CheckExact(interned));
14402
14403
0
    if (has_shared_intern_dict(interp)) {
14404
        // the dict doesn't belong to this interpreter, skip the debug
14405
        // checks on it and just clear the pointer to it
14406
0
        clear_interned_dict(interp);
14407
0
        return;
14408
0
    }
14409
14410
#ifdef INTERNED_STATS
14411
    fprintf(stderr, "releasing %zd interned strings\n",
14412
            PyDict_GET_SIZE(interned));
14413
14414
    Py_ssize_t total_length = 0;
14415
#endif
14416
0
    Py_ssize_t pos = 0;
14417
0
    PyObject *s, *ignored_value;
14418
0
    while (PyDict_Next(interned, &pos, &s, &ignored_value)) {
14419
0
        int shared = 0;
14420
0
        switch (PyUnicode_CHECK_INTERNED(s)) {
14421
0
        case SSTATE_INTERNED_IMMORTAL:
14422
            /* Make immortal interned strings mortal again. */
14423
            // Skip the Immortal Instance check and restore
14424
            // the two references (key and value) ignored
14425
            // by PyUnicode_InternInPlace().
14426
0
            _Py_SetMortal(s, 2);
14427
#ifdef Py_REF_DEBUG
14428
            /* let's be pedantic with the ref total */
14429
            _Py_IncRefTotal(_PyThreadState_GET());
14430
            _Py_IncRefTotal(_PyThreadState_GET());
14431
#endif
14432
#ifdef INTERNED_STATS
14433
            total_length += PyUnicode_GET_LENGTH(s);
14434
#endif
14435
0
            break;
14436
0
        case SSTATE_INTERNED_IMMORTAL_STATIC:
14437
            /* It is shared between interpreters, so we should unmark it
14438
               only when this is the last interpreter in which it's
14439
               interned.  We immortalize all the statically initialized
14440
               strings during startup, so we can rely on the
14441
               main interpreter to be the last one. */
14442
0
            if (!_Py_IsMainInterpreter(interp)) {
14443
0
                shared = 1;
14444
0
            }
14445
0
            break;
14446
0
        case SSTATE_INTERNED_MORTAL:
14447
            // Restore 2 references held by the interned dict; these will
14448
            // be decref'd by clear_interned_dict's PyDict_Clear.
14449
0
            _Py_RefcntAdd(s, 2);
14450
#ifdef Py_REF_DEBUG
14451
            /* let's be pedantic with the ref total */
14452
            _Py_IncRefTotal(_PyThreadState_GET());
14453
            _Py_IncRefTotal(_PyThreadState_GET());
14454
#endif
14455
0
            break;
14456
0
        case SSTATE_NOT_INTERNED:
14457
0
            _Py_FALLTHROUGH;
14458
0
        default:
14459
0
            Py_UNREACHABLE();
14460
0
        }
14461
0
        if (!shared) {
14462
0
            FT_ATOMIC_STORE_UINT8_RELAXED(_PyUnicode_STATE(s).interned, SSTATE_NOT_INTERNED);
14463
0
        }
14464
0
    }
14465
#ifdef INTERNED_STATS
14466
    fprintf(stderr,
14467
            "total length of all interned strings: %zd characters\n",
14468
            total_length);
14469
#endif
14470
14471
0
    struct _Py_unicode_state *state = &interp->unicode;
14472
0
    struct _Py_unicode_ids *ids = &state->ids;
14473
0
    for (Py_ssize_t i=0; i < ids->size; i++) {
14474
0
        Py_XINCREF(ids->array[i]);
14475
0
    }
14476
0
    clear_interned_dict(interp);
14477
0
    if (_Py_IsMainInterpreter(interp)) {
14478
0
        clear_global_interned_strings();
14479
0
    }
14480
0
}
14481
14482
14483
/********************* Unicode Iterator **************************/
14484
14485
typedef struct {
14486
    PyObject_HEAD
14487
    Py_ssize_t it_index;
14488
    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
14489
} unicodeiterobject;
14490
14491
static void
14492
unicodeiter_dealloc(PyObject *op)
14493
1.88M
{
14494
1.88M
    unicodeiterobject *it = (unicodeiterobject *)op;
14495
1.88M
    _PyObject_GC_UNTRACK(it);
14496
1.88M
    Py_XDECREF(it->it_seq);
14497
1.88M
    PyObject_GC_Del(it);
14498
1.88M
}
14499
14500
static int
14501
unicodeiter_traverse(PyObject *op, visitproc visit, void *arg)
14502
4
{
14503
4
    unicodeiterobject *it = (unicodeiterobject *)op;
14504
4
    Py_VISIT(it->it_seq);
14505
4
    return 0;
14506
4
}
14507
14508
/* tp_iternext for the generic str iterator.  Returns the next character
   as a str object, or NULL (without setting an exception here) when the
   iterator is exhausted. */
static PyObject *
unicodeiter_next(PyObject *op)
{
    unicodeiterobject *it = (unicodeiterobject *)op;
    assert(it != NULL);

    PyObject *seq = it->it_seq;
    if (seq == NULL) {
        /* Exhausted on an earlier call. */
        return NULL;
    }
    assert(_PyUnicode_CHECK(seq));

    if (it->it_index >= PyUnicode_GET_LENGTH(seq)) {
        /* Reached the end: release the string and mark as exhausted. */
        it->it_seq = NULL;
        Py_DECREF(seq);
        return NULL;
    }

    Py_UCS4 chr = PyUnicode_READ(PyUnicode_KIND(seq), PyUnicode_DATA(seq),
                                 it->it_index);
    it->it_index++;
    return unicode_char(chr);
}
14532
14533
/* tp_iternext for the compact-ASCII str iterator.  Characters are read
   directly from the bytes that follow the PyASCIIObject header and are
   returned as the pre-built single-character ASCII string singletons. */
static PyObject *
unicode_ascii_iter_next(PyObject *op)
{
    unicodeiterobject *it = (unicodeiterobject *)op;
    assert(it != NULL);

    PyObject *seq = it->it_seq;
    if (seq == NULL) {
        /* Exhausted on an earlier call. */
        return NULL;
    }
    assert(_PyUnicode_CHECK(seq));
    assert(PyUnicode_IS_COMPACT_ASCII(seq));

    if (it->it_index >= PyUnicode_GET_LENGTH(seq)) {
        /* Reached the end: release the string and mark as exhausted. */
        it->it_seq = NULL;
        Py_DECREF(seq);
        return NULL;
    }

    const void *data = ((void*)(_PyASCIIObject_CAST(seq) + 1));
    Py_UCS1 chr = (Py_UCS1)PyUnicode_READ(PyUnicode_1BYTE_KIND,
                                          data, it->it_index);
    it->it_index++;
    return (PyObject*)&_Py_SINGLETON(strings).ascii[chr];
}
14555
14556
/* __length_hint__: number of characters left to yield, or 0 once the
   iterator has released its string. */
static PyObject *
unicodeiter_len(PyObject *op, PyObject *Py_UNUSED(ignored))
{
    unicodeiterobject *it = (unicodeiterobject *)op;
    Py_ssize_t remaining = (it->it_seq != NULL)
        ? PyUnicode_GET_LENGTH(it->it_seq) - it->it_index
        : 0;

    return PyLong_FromSsize_t(remaining);
}
14565
14566
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14567
14568
/* __reduce__: return (iter, (seq,), index) for a live iterator, or
   (iter, ('',)) once it is exhausted. */
static PyObject *
unicodeiter_reduce(PyObject *op, PyObject *Py_UNUSED(ignored))
{
    unicodeiterobject *it = (unicodeiterobject *)op;

    /* _PyEval_GetBuiltin can invoke arbitrary code, so it must be called
     * before the iterator fields are read; see issue #101765. */
    PyObject *iter = _PyEval_GetBuiltin(&_Py_ID(iter));

    if (it->it_seq == NULL) {
        PyObject *empty = _PyUnicode_GetEmpty();
        if (empty == NULL) {
            Py_XDECREF(iter);
            return NULL;
        }
        return Py_BuildValue("N(N)", iter, empty);
    }

    return Py_BuildValue("N(O)n", iter, it->it_seq, it->it_index);
}
14589
14590
PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14591
14592
/* __setstate__: set the iterator position from the integer `state`,
   clamped to [0, len(seq)].  Has no effect on an exhausted iterator.
   Returns None, or NULL if `state` cannot be converted to Py_ssize_t. */
static PyObject *
unicodeiter_setstate(PyObject *op, PyObject *state)
{
    unicodeiterobject *it = (unicodeiterobject *)op;

    Py_ssize_t index = PyLong_AsSsize_t(state);
    if (index == -1 && PyErr_Occurred()) {
        return NULL;
    }

    if (it->it_seq == NULL) {
        Py_RETURN_NONE;
    }

    if (index < 0) {
        index = 0;
    }
    else if (index > PyUnicode_GET_LENGTH(it->it_seq)) {
        /* iterator truncated */
        index = PyUnicode_GET_LENGTH(it->it_seq);
    }
    it->it_index = index;

    Py_RETURN_NONE;
}
14608
14609
PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14610
14611
static PyMethodDef unicodeiter_methods[] = {
14612
    {"__length_hint__", unicodeiter_len, METH_NOARGS, length_hint_doc},
14613
    {"__reduce__",      unicodeiter_reduce, METH_NOARGS, reduce_doc},
14614
    {"__setstate__",    unicodeiter_setstate, METH_O, setstate_doc},
14615
    {NULL,      NULL}       /* sentinel */
14616
};
14617
14618
PyTypeObject PyUnicodeIter_Type = {
14619
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
14620
    "str_iterator",         /* tp_name */
14621
    sizeof(unicodeiterobject),      /* tp_basicsize */
14622
    0,                  /* tp_itemsize */
14623
    /* methods */
14624
    unicodeiter_dealloc,/* tp_dealloc */
14625
    0,                  /* tp_vectorcall_offset */
14626
    0,                  /* tp_getattr */
14627
    0,                  /* tp_setattr */
14628
    0,                  /* tp_as_async */
14629
    0,                  /* tp_repr */
14630
    0,                  /* tp_as_number */
14631
    0,                  /* tp_as_sequence */
14632
    0,                  /* tp_as_mapping */
14633
    0,                  /* tp_hash */
14634
    0,                  /* tp_call */
14635
    0,                  /* tp_str */
14636
    PyObject_GenericGetAttr,        /* tp_getattro */
14637
    0,                  /* tp_setattro */
14638
    0,                  /* tp_as_buffer */
14639
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14640
    0,                  /* tp_doc */
14641
    unicodeiter_traverse, /* tp_traverse */
14642
    0,                  /* tp_clear */
14643
    0,                  /* tp_richcompare */
14644
    0,                  /* tp_weaklistoffset */
14645
    PyObject_SelfIter,          /* tp_iter */
14646
    unicodeiter_next,   /* tp_iternext */
14647
    unicodeiter_methods,            /* tp_methods */
14648
    0,
14649
};
14650
14651
PyTypeObject _PyUnicodeASCIIIter_Type = {
14652
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
14653
    .tp_name = "str_ascii_iterator",
14654
    .tp_basicsize = sizeof(unicodeiterobject),
14655
    .tp_dealloc = unicodeiter_dealloc,
14656
    .tp_getattro = PyObject_GenericGetAttr,
14657
    .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,
14658
    .tp_traverse = unicodeiter_traverse,
14659
    .tp_iter = PyObject_SelfIter,
14660
    .tp_iternext = unicode_ascii_iter_next,
14661
    .tp_methods = unicodeiter_methods,
14662
};
14663
14664
/* tp_iter for str: create an iterator over `seq`.  Compact ASCII strings
   get the specialised ASCII iterator type, all others the generic one.
   Returns NULL with an exception set if `seq` is not a str or the
   allocation fails. */
static PyObject *
unicode_iter(PyObject *seq)
{
    if (!PyUnicode_Check(seq)) {
        PyErr_BadInternalCall();
        return NULL;
    }

    PyTypeObject *iter_type = PyUnicode_IS_COMPACT_ASCII(seq)
        ? &_PyUnicodeASCIIIter_Type
        : &PyUnicodeIter_Type;

    unicodeiterobject *it = PyObject_GC_New(unicodeiterobject, iter_type);
    if (it == NULL) {
        return NULL;
    }

    it->it_index = 0;
    it->it_seq = Py_NewRef(seq);
    _PyObject_GC_TRACK(it);
    return (PyObject *)it;
}
14686
14687
static int
14688
encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
14689
64
{
14690
64
    int res;
14691
64
    res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
14692
64
    if (res == -2) {
14693
0
        PyErr_Format(PyExc_RuntimeError, "cannot encode %s", name);
14694
0
        return -1;
14695
0
    }
14696
64
    if (res < 0) {
14697
0
        PyErr_NoMemory();
14698
0
        return -1;
14699
0
    }
14700
64
    return 0;
14701
64
}
14702
14703
14704
static int
14705
config_get_codec_name(wchar_t **config_encoding)
14706
32
{
14707
32
    char *encoding;
14708
32
    if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
14709
0
        return -1;
14710
0
    }
14711
14712
32
    PyObject *name_obj = NULL;
14713
32
    PyObject *codec = _PyCodec_Lookup(encoding);
14714
32
    PyMem_RawFree(encoding);
14715
14716
32
    if (!codec)
14717
0
        goto error;
14718
14719
32
    name_obj = PyObject_GetAttrString(codec, "name");
14720
32
    Py_CLEAR(codec);
14721
32
    if (!name_obj) {
14722
0
        goto error;
14723
0
    }
14724
14725
32
    wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
14726
32
    Py_DECREF(name_obj);
14727
32
    if (wname == NULL) {
14728
0
        goto error;
14729
0
    }
14730
14731
32
    wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
14732
32
    if (raw_wname == NULL) {
14733
0
        PyMem_Free(wname);
14734
0
        PyErr_NoMemory();
14735
0
        goto error;
14736
0
    }
14737
14738
32
    PyMem_RawFree(*config_encoding);
14739
32
    *config_encoding = raw_wname;
14740
14741
32
    PyMem_Free(wname);
14742
32
    return 0;
14743
14744
0
error:
14745
0
    Py_XDECREF(codec);
14746
0
    Py_XDECREF(name_obj);
14747
0
    return -1;
14748
32
}
14749
14750
14751
static PyStatus
14752
init_stdio_encoding(PyInterpreterState *interp)
14753
16
{
14754
    /* Update the stdio encoding to the normalized Python codec name. */
14755
16
    PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
14756
16
    if (config_get_codec_name(&config->stdio_encoding) < 0) {
14757
0
        return _PyStatus_ERR("failed to get the Python codec name "
14758
0
                             "of the stdio encoding");
14759
0
    }
14760
16
    return _PyStatus_OK();
14761
16
}
14762
14763
14764
static int
14765
init_fs_codec(PyInterpreterState *interp)
14766
16
{
14767
16
    const PyConfig *config = _PyInterpreterState_GetConfig(interp);
14768
14769
16
    _Py_error_handler error_handler;
14770
16
    error_handler = get_error_handler_wide(config->filesystem_errors);
14771
16
    if (error_handler == _Py_ERROR_UNKNOWN) {
14772
0
        PyErr_SetString(PyExc_RuntimeError, "unknown filesystem error handler");
14773
0
        return -1;
14774
0
    }
14775
14776
16
    char *encoding, *errors;
14777
16
    if (encode_wstr_utf8(config->filesystem_encoding,
14778
16
                         &encoding,
14779
16
                         "filesystem_encoding") < 0) {
14780
0
        return -1;
14781
0
    }
14782
14783
16
    if (encode_wstr_utf8(config->filesystem_errors,
14784
16
                         &errors,
14785
16
                         "filesystem_errors") < 0) {
14786
0
        PyMem_RawFree(encoding);
14787
0
        return -1;
14788
0
    }
14789
14790
16
    struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
14791
16
    PyMem_RawFree(fs_codec->encoding);
14792
16
    fs_codec->encoding = encoding;
14793
    /* encoding has been normalized by init_fs_encoding() */
14794
16
    fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
14795
16
    PyMem_RawFree(fs_codec->errors);
14796
16
    fs_codec->errors = errors;
14797
16
    fs_codec->error_handler = error_handler;
14798
14799
#ifdef _Py_FORCE_UTF8_FS_ENCODING
14800
    assert(fs_codec->utf8 == 1);
14801
#endif
14802
14803
    /* At this point, PyUnicode_EncodeFSDefault() and
14804
       PyUnicode_DecodeFSDefault() can now use the Python codec rather than
14805
       the C implementation of the filesystem encoding. */
14806
14807
    /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
14808
       global configuration variables. */
14809
16
    if (_Py_IsMainInterpreter(interp)) {
14810
14811
16
        if (_Py_SetFileSystemEncoding(fs_codec->encoding,
14812
16
                                      fs_codec->errors) < 0) {
14813
0
            PyErr_NoMemory();
14814
0
            return -1;
14815
0
        }
14816
16
    }
14817
16
    return 0;
14818
16
}
14819
14820
14821
static PyStatus
14822
init_fs_encoding(PyThreadState *tstate)
14823
16
{
14824
16
    PyInterpreterState *interp = tstate->interp;
14825
14826
    /* Update the filesystem encoding to the normalized Python codec name.
14827
       For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
14828
       (Python codec name). */
14829
16
    PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
14830
16
    if (config_get_codec_name(&config->filesystem_encoding) < 0) {
14831
0
        _Py_DumpPathConfig(tstate);
14832
0
        return _PyStatus_ERR("failed to get the Python codec "
14833
0
                             "of the filesystem encoding");
14834
0
    }
14835
14836
16
    if (init_fs_codec(interp) < 0) {
14837
0
        return _PyStatus_ERR("cannot initialize filesystem codec");
14838
0
    }
14839
16
    return _PyStatus_OK();
14840
16
}
14841
14842
14843
PyStatus
14844
_PyUnicode_InitEncodings(PyThreadState *tstate)
14845
16
{
14846
16
    PyStatus status = _PyCodec_InitRegistry(tstate->interp);
14847
16
    if (_PyStatus_EXCEPTION(status)) {
14848
0
        return status;
14849
0
    }
14850
16
    status = init_fs_encoding(tstate);
14851
16
    if (_PyStatus_EXCEPTION(status)) {
14852
0
        return status;
14853
0
    }
14854
14855
16
    return init_stdio_encoding(tstate->interp);
14856
16
}
14857
14858
14859
static void
14860
_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
14861
0
{
14862
0
    PyMem_RawFree(fs_codec->encoding);
14863
0
    fs_codec->encoding = NULL;
14864
0
    fs_codec->utf8 = 0;
14865
0
    PyMem_RawFree(fs_codec->errors);
14866
0
    fs_codec->errors = NULL;
14867
0
    fs_codec->error_handler = _Py_ERROR_UNKNOWN;
14868
0
}
14869
14870
14871
#ifdef MS_WINDOWS
/* Switch the current interpreter's filesystem encoding to "mbcs" with the
   "replace" error handler (PEP 529) and re-initialize the filesystem
   codec.

   Returns -1 with the exception from PyErr_NoMemory() set if the new
   strings cannot be allocated; otherwise returns the result of
   init_fs_codec(). */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    /* Allocate both replacements up front so the config is only modified
       when neither allocation failed. */
    wchar_t *new_encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *new_errors = _PyMem_RawWcsdup(L"replace");
    if (!new_encoding || !new_errors) {
        PyMem_RawFree(new_encoding);
        PyMem_RawFree(new_errors);
        PyErr_NoMemory();
        return -1;
    }

    PyInterpreterState *interp = _PyInterpreterState_GET();
    PyConfig *cfg = (PyConfig *)_PyInterpreterState_GetConfig(interp);

    PyMem_RawFree(cfg->filesystem_encoding);
    cfg->filesystem_encoding = new_encoding;

    PyMem_RawFree(cfg->filesystem_errors);
    cfg->filesystem_errors = new_errors;

    return init_fs_codec(interp);
}
#endif
14896
14897
14898
#ifdef Py_DEBUG
/* Debug-only helper: returns non-zero when the main interpreter has no
   interned dict (get_interned_dict() yields NULL), zero otherwise.
   NOTE(review): per the name, a missing interned dict is taken as the
   sign that finalization is in progress; confirm against callers. */
static inline int
unicode_is_finalizing(void)
{
    PyInterpreterState *main_interp = _PyInterpreterState_Main();
    return get_interned_dict(main_interp) == NULL;
}
#endif
14905
14906
14907
void
14908
_PyUnicode_FiniTypes(PyInterpreterState *interp)
14909
0
{
14910
0
    _PyStaticType_FiniBuiltin(interp, &EncodingMapType);
14911
0
    _PyStaticType_FiniBuiltin(interp, &PyFieldNameIter_Type);
14912
0
    _PyStaticType_FiniBuiltin(interp, &PyFormatterIter_Type);
14913
0
}
14914
14915
14916
void
14917
_PyUnicode_Fini(PyInterpreterState *interp)
14918
0
{
14919
0
    struct _Py_unicode_state *state = &interp->unicode;
14920
14921
0
    if (!has_shared_intern_dict(interp)) {
14922
        // _PyUnicode_ClearInterned() must be called before _PyUnicode_Fini()
14923
0
        assert(get_interned_dict(interp) == NULL);
14924
0
    }
14925
14926
0
    _PyUnicode_FiniEncodings(&state->fs_codec);
14927
14928
    // bpo-47182: force a unicodedata CAPI capsule re-import on
14929
    // subsequent initialization of interpreter.
14930
0
    interp->unicode.ucnhash_capi = NULL;
14931
14932
0
    unicode_clear_identifiers(state);
14933
0
}
14934
14935
/* A _string module, to export formatter_parser and formatter_field_name_split
14936
   to the string.Formatter class implemented in Python. */
14937
14938
static PyMethodDef _string_methods[] = {
14939
    {"formatter_field_name_split", formatter_field_name_split,
14940
     METH_O, PyDoc_STR("split the argument as a field name")},
14941
    {"formatter_parser", formatter_parser,
14942
     METH_O, PyDoc_STR("parse the argument as a format string")},
14943
    {NULL, NULL}
14944
};
14945
14946
static PyModuleDef_Slot module_slots[] = {
14947
    {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
14948
    {Py_mod_gil, Py_MOD_GIL_NOT_USED},
14949
    {0, NULL}
14950
};
14951
14952
static struct PyModuleDef _string_module = {
14953
    PyModuleDef_HEAD_INIT,
14954
    .m_name = "_string",
14955
    .m_doc = PyDoc_STR("string helper module"),
14956
    .m_size = 0,
14957
    .m_methods = _string_methods,
14958
    .m_slots = module_slots,
14959
};
14960
14961
PyMODINIT_FUNC
14962
PyInit__string(void)
14963
6
{
14964
6
    return PyModuleDef_Init(&_string_module);
14965
6
}
14966
14967
14968
#undef PyUnicode_KIND
14969
int PyUnicode_KIND(PyObject *op)
14970
0
{
14971
0
    if (!PyUnicode_Check(op)) {
14972
0
        PyErr_Format(PyExc_TypeError, "expect str, got %T", op);
14973
0
        return -1;
14974
0
    }
14975
0
    return _PyASCIIObject_CAST(op)->state.kind;
14976
0
}
14977
14978
#undef PyUnicode_DATA
14979
void* PyUnicode_DATA(PyObject *op)
14980
0
{
14981
0
    if (!PyUnicode_Check(op)) {
14982
0
        PyErr_Format(PyExc_TypeError, "expect str, got %T", op);
14983
0
        return NULL;
14984
0
    }
14985
0
    return _PyUnicode_DATA(op);
14986
0
}