Coverage Report

Created: 2026-04-12 06:14

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/cpython3/Objects/unicodeobject.c
Line
Count
Source
1
/*
2
3
Unicode implementation based on original code by Fredrik Lundh,
4
modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6
Major speed upgrades to the method implementations at the Reykjavik
7
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9
Copyright (c) Corporation for National Research Initiatives.
10
11
--------------------------------------------------------------------
12
The original string type implementation is:
13
14
  Copyright (c) 1999 by Secret Labs AB
15
  Copyright (c) 1999 by Fredrik Lundh
16
17
By obtaining, using, and/or copying this software and/or its
18
associated documentation, you agree that you have read, understood,
19
and will comply with the following terms and conditions:
20
21
Permission to use, copy, modify, and distribute this software and its
22
associated documentation for any purpose and without fee is hereby
23
granted, provided that the above copyright notice appears in all
24
copies, and that both that copyright notice and this permission notice
25
appear in supporting documentation, and that the name of Secret Labs
26
AB or the author not be used in advertising or publicity pertaining to
27
distribution of the software without specific, written prior
28
permission.
29
30
SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32
FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37
--------------------------------------------------------------------
38
39
*/
40
41
#include "Python.h"
42
#include "pycore_abstract.h"      // _PyIndex_Check()
43
#include "pycore_bytes_methods.h" // _Py_bytes_lower()
44
#include "pycore_bytesobject.h"   // _PyBytes_Repeat()
45
#include "pycore_ceval.h"         // _PyEval_GetBuiltin()
46
#include "pycore_codecs.h"        // _PyCodec_Lookup()
47
#include "pycore_critical_section.h" // Py_*_CRITICAL_SECTION_SEQUENCE_FAST
48
#include "pycore_format.h"        // F_LJUST
49
#include "pycore_initconfig.h"    // _PyStatus_OK()
50
#include "pycore_interp.h"        // PyInterpreterState.fs_codec
51
#include "pycore_long.h"          // _PyLong_FormatWriter()
52
#include "pycore_object.h"        // _PyObject_GC_TRACK(), _Py_FatalRefcountError()
53
#include "pycore_pathconfig.h"    // _Py_DumpPathConfig()
54
#include "pycore_pyerrors.h"      // _PyUnicodeTranslateError_Create()
55
#include "pycore_pyhash.h"        // _Py_HashSecret_t
56
#include "pycore_pylifecycle.h"   // _Py_SetFileSystemEncoding()
57
#include "pycore_pystate.h"       // _PyInterpreterState_GET()
58
#include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
59
#include "pycore_unicodectype.h"  // _PyUnicode_IsXidStart
60
#include "pycore_unicodeobject.h" // struct _Py_unicode_state
61
#include "pycore_unicodeobject_generated.h"  // _PyUnicode_InitStaticStrings()
62
63
#include "stringlib/eq.h"         // unicode_eq()
64
#include <stddef.h>               // ptrdiff_t
65
66
#ifdef MS_WINDOWS
67
#include <windows.h>
68
#endif
69
70
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
71
#  include "pycore_fileutils.h"   // _Py_LocaleUsesNonUnicodeWchar()
72
#endif
73
74
/* Uncomment to display statistics on interned strings at exit
75
   in _PyUnicode_ClearInterned(). */
76
/* #define INTERNED_STATS 1 */
77
78
79
/*[clinic input]
80
class str "PyObject *" "&PyUnicode_Type"
81
[clinic start generated code]*/
82
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
83
84
/*[python input]
85
class Py_UCS4_converter(CConverter):
86
    type = 'Py_UCS4'
87
    converter = 'convert_uc'
88
89
    def c_default_init(self):
90
        import libclinic
91
        self.c_default = libclinic.c_unichar_repr(self.default)
92
93
[python start generated code]*/
94
/*[python end generated code: output=da39a3ee5e6b4b0d input=22f057b68fd9a65a]*/
95
96
/* --- Globals ------------------------------------------------------------
97
98
NOTE: In the interpreter's initialization phase, some globals are currently
99
      initialized dynamically as needed. In the process Unicode objects may
100
      be created before the Unicode type is ready.
101
102
*/
103
104
1.75M
#define MAX_UNICODE _Py_MAX_UNICODE
105
425M
#define ensure_unicode _PyUnicode_EnsureUnicode
106
107
#ifdef Py_DEBUG
108
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
109
#else
110
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
111
#endif
112
113
/* Return the `utf8` field of op's compact unicode header, read through
   FT_ATOMIC_LOAD_PTR_ACQUIRE. */
static inline char *
_PyUnicode_UTF8(PyObject *op)
{
    PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
    return FT_ATOMIC_LOAD_PTR_ACQUIRE(compact->utf8);
}
117
118
/* Return the UTF-8 buffer of op.
   For a compact ASCII string this is the memory located immediately after
   the PyASCIIObject header; otherwise it is the stored `utf8` pointer
   (as returned by _PyUnicode_UTF8()). */
static inline char *
PyUnicode_UTF8(PyObject *op)
{
    assert(_PyUnicode_CHECK(op));
    if (!PyUnicode_IS_COMPACT_ASCII(op)) {
        return _PyUnicode_UTF8(op);
    }
    return (char *)(_PyASCIIObject_CAST(op) + 1);
}
128
129
/* Store `utf8` into the `utf8` field of op's compact unicode header,
   through FT_ATOMIC_STORE_PTR_RELEASE. */
static inline void
PyUnicode_SET_UTF8(PyObject *op, char *utf8)
{
    PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
    FT_ATOMIC_STORE_PTR_RELEASE(compact->utf8, utf8);
}
133
134
/* Return the UTF-8 length recorded for op: the `length` field for a compact
   ASCII string, the `utf8_length` field otherwise. */
static inline Py_ssize_t
PyUnicode_UTF8_LENGTH(PyObject *op)
{
    assert(_PyUnicode_CHECK(op));
    return PyUnicode_IS_COMPACT_ASCII(op)
        ? _PyASCIIObject_CAST(op)->length
        : _PyCompactUnicodeObject_CAST(op)->utf8_length;
}
144
145
/* Overwrite the `utf8_length` field of op's compact unicode header. */
static inline void
PyUnicode_SET_UTF8_LENGTH(PyObject *op, Py_ssize_t length)
{
    PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
    compact->utf8_length = length;
}
149
150
/* Accessors expanding to the `length`, `state` and `hash` fields of the
   PyASCIIObject header of op. */
#define _PyUnicode_LENGTH(op) (_PyASCIIObject_CAST(op)->length)
#define _PyUnicode_STATE(op) (_PyASCIIObject_CAST(op)->state)
#define _PyUnicode_HASH(op) (_PyASCIIObject_CAST(op)->hash)
156
157
25.7M
#define PyUnicode_HASH PyUnstable_Unicode_GET_CACHED_HASH
158
159
/* Store `hash` into the `hash` field of op's PyASCIIObject header, through
   FT_ATOMIC_STORE_SSIZE_RELAXED. */
static inline void
PyUnicode_SET_HASH(PyObject *op, Py_hash_t hash)
{
    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
    FT_ATOMIC_STORE_SSIZE_RELAXED(ascii->hash, hash);
}
163
164
/* Expands to the `data.any` pointer of op viewed as a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op) (_PyUnicodeObject_CAST(op)->data.any)
166
167
/* Return 1 if the stored UTF-8 pointer of op is the very same address as its
   character data (PyUnicode_DATA()), 0 otherwise.
   op must not be a compact ASCII string (asserted). */
static inline int
_PyUnicode_SHARE_UTF8(PyObject *op)
{
    assert(_PyUnicode_CHECK(op));
    assert(!PyUnicode_IS_COMPACT_ASCII(op));
    const void *utf8 = _PyUnicode_UTF8(op);
    const void *data = PyUnicode_DATA(op);
    return utf8 == data;
}
173
174
/* true if the Unicode object has an allocated UTF-8 memory block
175
   (not shared with other data) */
176
static inline int _PyUnicode_HAS_UTF8_MEMORY(PyObject *op)
177
40.9M
{
178
40.9M
    return (!PyUnicode_IS_COMPACT_ASCII(op)
179
21.0M
            && _PyUnicode_UTF8(op) != NULL
180
2.75k
            && _PyUnicode_UTF8(op) != PyUnicode_DATA(op));
181
40.9M
}
182
183
184
122M
#define LATIN1 _Py_LATIN1_CHR
185
186
/* Forward declaration */
187
static PyObject *
188
unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
189
                    const char *errors);
190
static PyObject *
191
unicode_decode_utf8(const char *s, Py_ssize_t size,
192
                    _Py_error_handler error_handler, const char *errors,
193
                    Py_ssize_t *consumed);
194
#ifdef Py_DEBUG
195
static inline int unicode_is_finalizing(void);
196
static int unicode_is_singleton(PyObject *unicode);
197
#endif
198
199
200
// Return a reference to the immortal empty string singleton.
201
PyObject*
202
_PyUnicode_GetEmpty(void)
203
36.7M
{
204
36.7M
    _Py_DECLARE_STR(empty, "");
205
36.7M
    return &_Py_STR(empty);
206
36.7M
}
207
208
/* This dictionary holds per-interpreter interned strings.
209
 * See InternalDocs/string_interning.md for details.
210
 */
211
static inline PyObject *get_interned_dict(PyInterpreterState *interp)
212
1.79M
{
213
1.79M
    return _Py_INTERP_CACHED_OBJECT(interp, interned_strings);
214
1.79M
}
215
216
/* This hashtable holds statically allocated interned strings.
217
 * See InternalDocs/string_interning.md for details.
218
 */
219
1.74M
#define INTERNED_STRINGS _PyRuntime.cached_objects.interned_strings
220
221
/* Get number of all interned strings for the current interpreter. */
222
Py_ssize_t
223
_PyUnicode_InternedSize(void)
224
0
{
225
0
    PyObject *dict = get_interned_dict(_PyInterpreterState_GET());
226
0
    return _Py_hashtable_len(INTERNED_STRINGS) + PyDict_GET_SIZE(dict);
227
0
}
228
229
/* Get number of immortal interned strings for the current interpreter. */
230
Py_ssize_t
231
_PyUnicode_InternedSize_Immortal(void)
232
0
{
233
0
    PyObject *dict = get_interned_dict(_PyInterpreterState_GET());
234
0
    PyObject *key, *value;
235
0
    Py_ssize_t pos = 0;
236
0
    Py_ssize_t count = 0;
237
238
    // It's tempting to keep a count and avoid a loop here. But, this function
239
    // is intended for refleak tests. It spends extra work to report the true
240
    // value, to help detect bugs in optimizations.
241
242
0
    while (PyDict_Next(dict, &pos, &key, &value)) {
243
0
        assert(PyUnicode_CHECK_INTERNED(key) != SSTATE_INTERNED_IMMORTAL_STATIC);
244
0
        if (PyUnicode_CHECK_INTERNED(key) == SSTATE_INTERNED_IMMORTAL) {
245
0
           count++;
246
0
       }
247
0
    }
248
0
    return _Py_hashtable_len(INTERNED_STRINGS) + count;
249
0
}
250
251
static Py_hash_t unicode_hash(PyObject *);
252
253
static Py_uhash_t
254
hashtable_unicode_hash(const void *key)
255
1.81M
{
256
1.81M
    return unicode_hash((PyObject *)key);
257
1.81M
}
258
259
static int
260
hashtable_unicode_compare(const void *key1, const void *key2)
261
122k
{
262
122k
    PyObject *obj1 = (PyObject *)key1;
263
122k
    PyObject *obj2 = (PyObject *)key2;
264
122k
    if (obj1 != NULL && obj2 != NULL) {
265
122k
        return unicode_eq(obj1, obj2);
266
122k
    }
267
0
    else {
268
0
        return obj1 == obj2;
269
0
    }
270
122k
}
271
272
/* Return true if this interpreter should share the main interpreter's
273
   intern_dict.  That's important for interpreters which load basic
274
   single-phase init extension modules (m_size == -1).  There could be interned
275
   immortal strings that are shared between interpreters, due to the
276
   PyDict_Update(mdict, m_copy) call in import_find_extension().
277
278
   It's not safe to deallocate those strings until all interpreters that
279
   potentially use them are freed.  By storing them in the main interpreter, we
280
   ensure they get freed after all other interpreters are freed.
281
*/
282
static bool
283
has_shared_intern_dict(PyInterpreterState *interp)
284
22
{
285
22
    PyInterpreterState *main_interp = _PyInterpreterState_Main();
286
22
    return interp != main_interp  && interp->feature_flags & Py_RTFLAGS_USE_MAIN_OBMALLOC;
287
22
}
288
289
static int
290
init_interned_dict(PyInterpreterState *interp)
291
22
{
292
22
    assert(get_interned_dict(interp) == NULL);
293
22
    PyObject *interned;
294
22
    if (has_shared_intern_dict(interp)) {
295
0
        interned = get_interned_dict(_PyInterpreterState_Main());
296
0
        Py_INCREF(interned);
297
0
    }
298
22
    else {
299
22
        interned = PyDict_New();
300
22
        if (interned == NULL) {
301
0
            return -1;
302
0
        }
303
22
    }
304
22
    _Py_INTERP_CACHED_OBJECT(interp, interned_strings) = interned;
305
22
    return 0;
306
22
}
307
308
static void
309
clear_interned_dict(PyInterpreterState *interp)
310
0
{
311
0
    PyObject *interned = get_interned_dict(interp);
312
0
    if (interned != NULL) {
313
0
        if (!has_shared_intern_dict(interp)) {
314
            // only clear if the dict belongs to this interpreter
315
0
            PyDict_Clear(interned);
316
0
        }
317
0
        Py_DECREF(interned);
318
0
        _Py_INTERP_CACHED_OBJECT(interp, interned_strings) = NULL;
319
0
    }
320
0
}
321
322
static PyStatus
323
init_global_interned_strings(PyInterpreterState *interp)
324
22
{
325
22
    assert(INTERNED_STRINGS == NULL);
326
22
    _Py_hashtable_allocator_t hashtable_alloc = {PyMem_RawMalloc, PyMem_RawFree};
327
328
22
    INTERNED_STRINGS = _Py_hashtable_new_full(
329
22
        hashtable_unicode_hash,
330
22
        hashtable_unicode_compare,
331
        // Objects stored here are immortal and statically allocated,
332
        // so we don't need key_destroy_func & value_destroy_func:
333
22
        NULL,
334
22
        NULL,
335
22
        &hashtable_alloc
336
22
    );
337
22
    if (INTERNED_STRINGS == NULL) {
338
0
        PyErr_Clear();
339
0
        return _PyStatus_ERR("failed to create global interned dict");
340
0
    }
341
342
    /* Intern statically allocated string identifiers, deepfreeze strings,
343
        * and one-byte latin-1 strings.
344
        * This must be done before any module initialization so that statically
345
        * allocated string identifiers are used instead of heap allocated strings.
346
        * Deepfreeze uses the interned identifiers if present to save space
347
        * else generates them and they are interned to speed up dict lookups.
348
    */
349
22
    _PyUnicode_InitStaticStrings(interp);
350
351
5.65k
    for (int i = 0; i < 256; i++) {
352
5.63k
        PyObject *s = LATIN1(i);
353
5.63k
        _PyUnicode_InternStatic(interp, &s);
354
5.63k
        assert(s == LATIN1(i));
355
5.63k
    }
356
#ifdef Py_DEBUG
357
    assert(_PyUnicode_CheckConsistency(&_Py_STR(empty), 1));
358
359
    for (int i = 0; i < 256; i++) {
360
        assert(_PyUnicode_CheckConsistency(LATIN1(i), 1));
361
    }
362
#endif
363
22
    return _PyStatus_OK();
364
22
}
365
366
static void clear_global_interned_strings(void)
367
0
{
368
0
    if (INTERNED_STRINGS != NULL) {
369
0
        _Py_hashtable_destroy(INTERNED_STRINGS);
370
0
        INTERNED_STRINGS = NULL;
371
0
    }
372
0
}
373
374
/* Return the empty string singleton from the enclosing function. */
#define _Py_RETURN_UNICODE_EMPTY() \
    do { return _PyUnicode_GetEmpty(); } while (0)
378
379
380
/* Fast detection of the most frequent whitespace characters.
   128-entry lookup table indexed by code point; entries not listed are 0. */
const unsigned char _Py_ascii_whitespace[128] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1,     /* SPACE */
};
410
411
/* forward */
412
static PyObject* get_latin1_char(unsigned char ch);
413
414
415
static PyObject *
416
_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
417
static PyObject *
418
_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
419
static PyObject *
420
_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
421
422
static PyObject *
423
unicode_encode_call_errorhandler(const char *errors,
424
       PyObject **errorHandler,const char *encoding, const char *reason,
425
       PyObject *unicode, PyObject **exceptionObject,
426
       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
427
428
static void
429
raise_encode_exception(PyObject **exceptionObject,
430
                       const char *encoding,
431
                       PyObject *unicode,
432
                       Py_ssize_t startpos, Py_ssize_t endpos,
433
                       const char *reason);
434
435
/* Fast detection of ASCII line break characters.
   128-entry lookup table indexed by code point; entries not listed are 0. */
static const unsigned char ascii_linebreak[128] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
};
462
463
static int convert_uc(PyObject *obj, void *addr);
464
465
struct encoding_map;
466
#include "clinic/unicodeobject.c.h"
467
468
_Py_error_handler
469
_Py_GetErrorHandler(const char *errors)
470
9.86k
{
471
9.86k
    if (errors == NULL || strcmp(errors, "strict") == 0) {
472
2.48k
        return _Py_ERROR_STRICT;
473
2.48k
    }
474
7.37k
    if (strcmp(errors, "surrogateescape") == 0) {
475
4.57k
        return _Py_ERROR_SURROGATEESCAPE;
476
4.57k
    }
477
2.79k
    if (strcmp(errors, "replace") == 0) {
478
1.89k
        return _Py_ERROR_REPLACE;
479
1.89k
    }
480
904
    if (strcmp(errors, "ignore") == 0) {
481
0
        return _Py_ERROR_IGNORE;
482
0
    }
483
904
    if (strcmp(errors, "backslashreplace") == 0) {
484
251
        return _Py_ERROR_BACKSLASHREPLACE;
485
251
    }
486
653
    if (strcmp(errors, "surrogatepass") == 0) {
487
653
        return _Py_ERROR_SURROGATEPASS;
488
653
    }
489
0
    if (strcmp(errors, "xmlcharrefreplace") == 0) {
490
0
        return _Py_ERROR_XMLCHARREFREPLACE;
491
0
    }
492
0
    return _Py_ERROR_OTHER;
493
0
}
494
495
496
static _Py_error_handler
497
get_error_handler_wide(const wchar_t *errors)
498
7.63k
{
499
7.63k
    if (errors == NULL || wcscmp(errors, L"strict") == 0) {
500
0
        return _Py_ERROR_STRICT;
501
0
    }
502
7.63k
    if (wcscmp(errors, L"surrogateescape") == 0) {
503
7.63k
        return _Py_ERROR_SURROGATEESCAPE;
504
7.63k
    }
505
0
    if (wcscmp(errors, L"replace") == 0) {
506
0
        return _Py_ERROR_REPLACE;
507
0
    }
508
0
    if (wcscmp(errors, L"ignore") == 0) {
509
0
        return _Py_ERROR_IGNORE;
510
0
    }
511
0
    if (wcscmp(errors, L"backslashreplace") == 0) {
512
0
        return _Py_ERROR_BACKSLASHREPLACE;
513
0
    }
514
0
    if (wcscmp(errors, L"surrogatepass") == 0) {
515
0
        return _Py_ERROR_SURROGATEPASS;
516
0
    }
517
0
    if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
518
0
        return _Py_ERROR_XMLCHARREFREPLACE;
519
0
    }
520
0
    return _Py_ERROR_OTHER;
521
0
}
522
523
524
static inline int
525
unicode_check_encoding_errors(const char *encoding, const char *errors)
526
40.7k
{
527
40.7k
    if (encoding == NULL && errors == NULL) {
528
0
        return 0;
529
0
    }
530
531
40.7k
    PyInterpreterState *interp = _PyInterpreterState_GET();
532
40.7k
#ifndef Py_DEBUG
533
    /* In release mode, only check in development mode (-X dev) */
534
40.7k
    if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
535
40.7k
        return 0;
536
40.7k
    }
537
#else
538
    /* Always check in debug mode */
539
#endif
540
541
    /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
542
       codec registry is ready: before_PyUnicode_InitEncodings() is called. */
543
0
    if (!interp->unicode.fs_codec.encoding) {
544
0
        return 0;
545
0
    }
546
547
    /* Disable checks during Python finalization. For example, it allows to
548
     * call PyObject_Dump() during finalization for debugging purpose.
549
     */
550
0
    if (_PyInterpreterState_GetFinalizing(interp) != NULL) {
551
0
        return 0;
552
0
    }
553
554
0
    if (encoding != NULL
555
        // Fast path for the most common built-in encodings. Even if the codec
556
        // is cached, _PyCodec_Lookup() decodes the bytes string from UTF-8 to
557
        // create a temporary Unicode string (the key in the cache).
558
0
        && strcmp(encoding, "utf-8") != 0
559
0
        && strcmp(encoding, "utf8") != 0
560
0
        && strcmp(encoding, "ascii") != 0)
561
0
    {
562
0
        PyObject *handler = _PyCodec_Lookup(encoding);
563
0
        if (handler == NULL) {
564
0
            return -1;
565
0
        }
566
0
        Py_DECREF(handler);
567
0
    }
568
569
0
    if (errors != NULL
570
        // Fast path for the most common built-in error handlers.
571
0
        && strcmp(errors, "strict") != 0
572
0
        && strcmp(errors, "ignore") != 0
573
0
        && strcmp(errors, "replace") != 0
574
0
        && strcmp(errors, "surrogateescape") != 0
575
0
        && strcmp(errors, "surrogatepass") != 0)
576
0
    {
577
0
        PyObject *handler = PyCodec_LookupError(errors);
578
0
        if (handler == NULL) {
579
0
            return -1;
580
0
        }
581
0
        Py_DECREF(handler);
582
0
    }
583
0
    return 0;
584
0
}
585
586
587
int
588
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
589
76.4M
{
590
76.4M
#define CHECK(expr) \
591
365M
    do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
592
593
76.4M
    assert(op != NULL);
594
76.4M
    CHECK(PyUnicode_Check(op));
595
596
76.4M
    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
597
0
    int kind = ascii->state.kind;
598
599
76.4M
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
600
35.6M
        CHECK(kind == PyUnicode_1BYTE_KIND);
601
35.6M
    }
602
40.8M
    else {
603
40.8M
        PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
604
0
        void *data;
605
606
40.8M
        if (ascii->state.compact == 1) {
607
40.8M
            data = compact + 1;
608
40.8M
            CHECK(kind == PyUnicode_1BYTE_KIND
609
40.8M
                                 || kind == PyUnicode_2BYTE_KIND
610
40.8M
                                 || kind == PyUnicode_4BYTE_KIND);
611
40.8M
            CHECK(ascii->state.ascii == 0);
612
40.8M
            CHECK(_PyUnicode_UTF8(op) != data);
613
40.8M
        }
614
49
        else {
615
49
            PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
616
617
0
            data = unicode->data.any;
618
49
            CHECK(kind == PyUnicode_1BYTE_KIND
619
49
                     || kind == PyUnicode_2BYTE_KIND
620
49
                     || kind == PyUnicode_4BYTE_KIND);
621
49
            CHECK(ascii->state.compact == 0);
622
49
            CHECK(data != NULL);
623
49
            if (ascii->state.ascii) {
624
49
                CHECK(_PyUnicode_UTF8(op) == data);
625
49
                CHECK(compact->utf8_length == ascii->length);
626
49
            }
627
0
            else {
628
0
                CHECK(_PyUnicode_UTF8(op) != data);
629
0
            }
630
49
        }
631
40.8M
#ifndef Py_GIL_DISABLED
632
40.8M
        if (_PyUnicode_UTF8(op) == NULL)
633
40.8M
            CHECK(compact->utf8_length == 0);
634
40.8M
#endif
635
40.8M
    }
636
637
    /* check that the best kind is used: O(n) operation */
638
76.4M
    if (check_content) {
639
35.2M
        Py_ssize_t i;
640
35.2M
        Py_UCS4 maxchar = 0;
641
35.2M
        const void *data;
642
35.2M
        Py_UCS4 ch;
643
644
35.2M
        data = PyUnicode_DATA(ascii);
645
25.9G
        for (i=0; i < ascii->length; i++)
646
25.9G
        {
647
25.9G
            ch = PyUnicode_READ(kind, data, i);
648
25.9G
            if (ch > maxchar)
649
67.4M
                maxchar = ch;
650
25.9G
        }
651
35.2M
        if (kind == PyUnicode_1BYTE_KIND) {
652
18.2M
            if (ascii->state.ascii == 0) {
653
2.72M
                CHECK(maxchar >= 128);
654
2.72M
                CHECK(maxchar <= 255);
655
2.72M
            }
656
15.4M
            else
657
15.4M
                CHECK(maxchar < 128);
658
18.2M
        }
659
17.0M
        else if (kind == PyUnicode_2BYTE_KIND) {
660
15.2M
            CHECK(maxchar >= 0x100);
661
15.2M
            CHECK(maxchar <= 0xFFFF);
662
15.2M
        }
663
1.77M
        else {
664
1.77M
            CHECK(maxchar >= 0x10000);
665
1.77M
            CHECK(maxchar <= MAX_UNICODE);
666
1.77M
        }
667
35.2M
        CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
668
35.2M
    }
669
670
    /* Check interning state */
671
#ifdef Py_DEBUG
672
    // Note that we do not check `_Py_IsImmortal(op)`, since stable ABI
673
    // extensions can make immortal strings mortal (but with a high enough
674
    // refcount).
675
    // The other way is extremely unlikely (worth a potential failed assertion
676
    // in a debug build), so we do check `!_Py_IsImmortal(op)`.
677
    switch (PyUnicode_CHECK_INTERNED(op)) {
678
        case SSTATE_NOT_INTERNED:
679
            if (ascii->state.statically_allocated) {
680
                // This state is for two exceptions:
681
                // - strings are currently checked before they're interned
682
                // - the 256 one-latin1-character strings
683
                //   are static but use SSTATE_NOT_INTERNED
684
            }
685
            else {
686
                CHECK(!_Py_IsImmortal(op));
687
            }
688
            break;
689
        case SSTATE_INTERNED_MORTAL:
690
            CHECK(!ascii->state.statically_allocated);
691
            CHECK(!_Py_IsImmortal(op));
692
            break;
693
        case SSTATE_INTERNED_IMMORTAL:
694
            CHECK(!ascii->state.statically_allocated);
695
            break;
696
        case SSTATE_INTERNED_IMMORTAL_STATIC:
697
            CHECK(ascii->state.statically_allocated);
698
            break;
699
        default:
700
            Py_UNREACHABLE();
701
    }
702
#endif
703
704
76.4M
    return 1;
705
706
76.4M
#undef CHECK
707
76.4M
}
708
709
/* Normalize a freshly built unicode object.

   An empty string is replaced by the empty singleton, and a one-character
   string of 1-byte kind by the matching latin-1 singleton; in both cases the
   reference to `unicode` is released unless it already is the singleton.
   Any other string is returned unchanged. */
PyObject *
_PyUnicode_Result(PyObject *unicode)
{
    assert(_PyUnicode_CHECK(unicode));

    PyObject *singleton = NULL;
    Py_ssize_t length = PyUnicode_GET_LENGTH(unicode);

    if (length == 0) {
        singleton = _PyUnicode_GetEmpty();
    }
    else if (length == 1 && PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
        Py_UCS1 ch = PyUnicode_1BYTE_DATA(unicode)[0];
        singleton = LATIN1(ch);
    }

    if (singleton != NULL) {
        if (unicode != singleton) {
            Py_DECREF(unicode);
        }
        return singleton;
    }

    assert(_PyUnicode_CheckConsistency(unicode, 1));
    return unicode;
}
739
17.9k
#define unicode_result _PyUnicode_Result
740
741
/* Return a new reference to `unicode` if it is an exact str instance.
   For a subtype instance, return the result of _PyUnicode_Copy(), i.e. a
   genuine unicode string with the same value. */
static PyObject *
unicode_result_unchanged(PyObject *unicode)
{
    if (!PyUnicode_CheckExact(unicode)) {
        return _PyUnicode_Copy(unicode);
    }
    return Py_NewRef(unicode);
}
751
752
/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
753
   ASCII, Latin1, UTF-8, etc. */
754
static char*
755
backslashreplace(PyBytesWriter *writer, char *str,
756
                 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
757
7.07k
{
758
7.07k
    Py_ssize_t size, i;
759
7.07k
    Py_UCS4 ch;
760
7.07k
    int kind;
761
7.07k
    const void *data;
762
763
7.07k
    kind = PyUnicode_KIND(unicode);
764
7.07k
    data = PyUnicode_DATA(unicode);
765
766
7.07k
    size = 0;
767
    /* determine replacement size */
768
137k
    for (i = collstart; i < collend; ++i) {
769
130k
        Py_ssize_t incr;
770
771
130k
        ch = PyUnicode_READ(kind, data, i);
772
130k
        if (ch < 0x100)
773
130k
            incr = 2+2;
774
0
        else if (ch < 0x10000)
775
0
            incr = 2+4;
776
0
        else {
777
0
            assert(ch <= MAX_UNICODE);
778
0
            incr = 2+8;
779
0
        }
780
130k
        if (size > PY_SSIZE_T_MAX - incr) {
781
0
            PyErr_SetString(PyExc_OverflowError,
782
0
                            "encoded result is too long for a Python string");
783
0
            return NULL;
784
0
        }
785
130k
        size += incr;
786
130k
    }
787
788
7.07k
    str = PyBytesWriter_GrowAndUpdatePointer(writer, size, str);
789
7.07k
    if (str == NULL) {
790
0
        return NULL;
791
0
    }
792
793
    /* generate replacement */
794
137k
    for (i = collstart; i < collend; ++i) {
795
130k
        ch = PyUnicode_READ(kind, data, i);
796
130k
        *str++ = '\\';
797
130k
        if (ch >= 0x00010000) {
798
0
            *str++ = 'U';
799
0
            *str++ = Py_hexdigits[(ch>>28)&0xf];
800
0
            *str++ = Py_hexdigits[(ch>>24)&0xf];
801
0
            *str++ = Py_hexdigits[(ch>>20)&0xf];
802
0
            *str++ = Py_hexdigits[(ch>>16)&0xf];
803
0
            *str++ = Py_hexdigits[(ch>>12)&0xf];
804
0
            *str++ = Py_hexdigits[(ch>>8)&0xf];
805
0
        }
806
130k
        else if (ch >= 0x100) {
807
0
            *str++ = 'u';
808
0
            *str++ = Py_hexdigits[(ch>>12)&0xf];
809
0
            *str++ = Py_hexdigits[(ch>>8)&0xf];
810
0
        }
811
130k
        else
812
130k
            *str++ = 'x';
813
130k
        *str++ = Py_hexdigits[(ch>>4)&0xf];
814
130k
        *str++ = Py_hexdigits[ch&0xf];
815
130k
    }
816
7.07k
    return str;
817
7.07k
}
818
819
/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
820
   ASCII, Latin1, UTF-8, etc. */
821
static char*
822
xmlcharrefreplace(PyBytesWriter *writer, char *str,
823
                  PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
824
0
{
825
0
    Py_ssize_t size, i;
826
0
    Py_UCS4 ch;
827
0
    int kind;
828
0
    const void *data;
829
830
0
    kind = PyUnicode_KIND(unicode);
831
0
    data = PyUnicode_DATA(unicode);
832
833
0
    size = 0;
834
    /* determine replacement size */
835
0
    for (i = collstart; i < collend; ++i) {
836
0
        Py_ssize_t incr;
837
838
0
        ch = PyUnicode_READ(kind, data, i);
839
0
        if (ch < 10)
840
0
            incr = 2+1+1;
841
0
        else if (ch < 100)
842
0
            incr = 2+2+1;
843
0
        else if (ch < 1000)
844
0
            incr = 2+3+1;
845
0
        else if (ch < 10000)
846
0
            incr = 2+4+1;
847
0
        else if (ch < 100000)
848
0
            incr = 2+5+1;
849
0
        else if (ch < 1000000)
850
0
            incr = 2+6+1;
851
0
        else {
852
0
            assert(ch <= MAX_UNICODE);
853
0
            incr = 2+7+1;
854
0
        }
855
0
        if (size > PY_SSIZE_T_MAX - incr) {
856
0
            PyErr_SetString(PyExc_OverflowError,
857
0
                            "encoded result is too long for a Python string");
858
0
            return NULL;
859
0
        }
860
0
        size += incr;
861
0
    }
862
863
0
    str = PyBytesWriter_GrowAndUpdatePointer(writer, size, str);
864
0
    if (str == NULL) {
865
0
        return NULL;
866
0
    }
867
868
    /* generate replacement */
869
0
    for (i = collstart; i < collend; ++i) {
870
0
        size = sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
871
0
        if (size < 0) {
872
0
            return NULL;
873
0
        }
874
0
        str += size;
875
0
    }
876
0
    return str;
877
0
}
878
879
/* --- Bloom Filters ----------------------------------------------------- */
880
881
/* stuff to implement simple "bloom filters" for Unicode characters.
882
   to keep things simple, we use a single bitmask, using the low
883
   log2(BLOOM_WIDTH) bits of each Unicode character as the bit index. */
884
885
/* the linebreak mask is set up by _PyUnicode_Init() below */
886
887
/* Width in bits of a bloom mask: the largest of 128/64/32 that does not
   exceed LONG_BIT, since the mask is stored in an unsigned long. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

/* Integer type that holds one bloom bitmask. */
#define BLOOM_MASK unsigned long

/* Initialised with every bit set, so BLOOM() accepts any character until a
   real mask is stored here. */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;

/* Non-zero when the bit selected by the low bits of `ch` is set in `mask`
   (the character *may* be in the set); zero when it definitely is not.
   Both arguments are parenthesized so that any expression can be passed. */
#define BLOOM(mask, ch)     (((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Line-break test: table lookup for ch < 128, otherwise the bloom mask is
   used as a cheap pre-filter before the full Py_UNICODE_ISLINEBREAK() test. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
906
907
static inline BLOOM_MASK
908
make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
909
11.3k
{
910
11.3k
#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN)             \
911
11.3k
    do {                                               \
912
11.3k
        TYPE *data = (TYPE *)PTR;                      \
913
11.3k
        TYPE *end = data + LEN;                        \
914
11.3k
        Py_UCS4 ch;                                    \
915
22.9k
        for (; data != end; data++) {                  \
916
11.5k
            ch = *data;                                \
917
11.5k
            MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
918
11.5k
        }                                              \
919
11.3k
        break;                                         \
920
11.3k
    } while (0)
921
922
    /* calculate simple bloom-style bitmask for a given unicode string */
923
924
11.3k
    BLOOM_MASK mask;
925
926
11.3k
    mask = 0;
927
11.3k
    switch (kind) {
928
11.3k
    case PyUnicode_1BYTE_KIND:
929
11.3k
        BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
930
11.3k
        break;
931
22
    case PyUnicode_2BYTE_KIND:
932
22
        BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
933
22
        break;
934
0
    case PyUnicode_4BYTE_KIND:
935
0
        BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
936
0
        break;
937
0
    default:
938
0
        Py_UNREACHABLE();
939
11.3k
    }
940
11.3k
    return mask;
941
942
11.3k
#undef BLOOM_UPDATE
943
11.3k
}
944
945
/* Compilation of templated routines */
946
947
1.79k
#define STRINGLIB_GET_EMPTY() _PyUnicode_GetEmpty()
948
949
#include "stringlib/asciilib.h"
950
#include "stringlib/fastsearch.h"
951
#include "stringlib/partition.h"
952
#include "stringlib/split.h"
953
#include "stringlib/count.h"
954
#include "stringlib/find.h"
955
#include "stringlib/find_max_char.h"
956
#include "stringlib/undef.h"
957
958
#include "stringlib/ucs1lib.h"
959
#include "stringlib/fastsearch.h"
960
#include "stringlib/partition.h"
961
#include "stringlib/split.h"
962
#include "stringlib/count.h"
963
#include "stringlib/find.h"
964
#include "stringlib/replace.h"
965
#include "stringlib/repr.h"
966
#include "stringlib/find_max_char.h"
967
#include "stringlib/undef.h"
968
969
#include "stringlib/ucs2lib.h"
970
#include "stringlib/fastsearch.h"
971
#include "stringlib/partition.h"
972
#include "stringlib/split.h"
973
#include "stringlib/count.h"
974
#include "stringlib/find.h"
975
#include "stringlib/replace.h"
976
#include "stringlib/repr.h"
977
#include "stringlib/find_max_char.h"
978
#include "stringlib/undef.h"
979
980
#include "stringlib/ucs4lib.h"
981
#include "stringlib/fastsearch.h"
982
#include "stringlib/partition.h"
983
#include "stringlib/split.h"
984
#include "stringlib/count.h"
985
#include "stringlib/find.h"
986
#include "stringlib/replace.h"
987
#include "stringlib/repr.h"
988
#include "stringlib/find_max_char.h"
989
#include "stringlib/undef.h"
990
991
#undef STRINGLIB_GET_EMPTY
992
993
/* --- Unicode Object ----------------------------------------------------- */
994
995
static inline Py_ssize_t
996
findchar(const void *s, int kind,
997
         Py_ssize_t size, Py_UCS4 ch,
998
         int direction)
999
26.3M
{
1000
26.3M
    switch (kind) {
1001
26.3M
    case PyUnicode_1BYTE_KIND:
1002
26.3M
        if ((Py_UCS1) ch != ch)
1003
1.79k
            return -1;
1004
26.3M
        if (direction > 0)
1005
26.3M
            return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
1006
3.78k
        else
1007
3.78k
            return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
1008
14.3k
    case PyUnicode_2BYTE_KIND:
1009
14.3k
        if ((Py_UCS2) ch != ch)
1010
0
            return -1;
1011
14.3k
        if (direction > 0)
1012
13.3k
            return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
1013
1.05k
        else
1014
1.05k
            return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
1015
1.45k
    case PyUnicode_4BYTE_KIND:
1016
1.45k
        if (direction > 0)
1017
9
            return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
1018
1.44k
        else
1019
1.44k
            return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
1020
0
    default:
1021
0
        Py_UNREACHABLE();
1022
26.3M
    }
1023
26.3M
}
1024
1025
#ifdef Py_DEBUG
1026
/* Fill the data of a Unicode string with invalid characters to detect bugs
1027
   earlier.
1028
1029
   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
1030
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
1031
   invalid character in Unicode 6.0. */
1032
static void
1033
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
1034
{
1035
    int kind = PyUnicode_KIND(unicode);
1036
    Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
1037
    Py_ssize_t length = _PyUnicode_LENGTH(unicode);
1038
    if (length <= old_length)
1039
        return;
1040
    memset(data + old_length * kind, 0xff, (length - old_length) * kind);
1041
}
1042
#endif
1043
1044
static PyObject*
1045
resize_copy(PyObject *unicode, Py_ssize_t length)
1046
0
{
1047
0
    Py_ssize_t copy_length;
1048
0
    PyObject *copy;
1049
1050
0
    copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1051
0
    if (copy == NULL)
1052
0
        return NULL;
1053
1054
0
    copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
1055
0
    _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
1056
0
    return copy;
1057
0
}
1058
1059
PyObject*
1060
_PyUnicode_ResizeCompact(PyObject *unicode, Py_ssize_t length)
1061
10.3M
{
1062
10.3M
    Py_ssize_t char_size;
1063
10.3M
    Py_ssize_t struct_size;
1064
10.3M
    Py_ssize_t new_size;
1065
10.3M
    PyObject *new_unicode;
1066
#ifdef Py_DEBUG
1067
    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1068
#endif
1069
1070
10.3M
    if (!_PyUnicode_IsModifiable(unicode)) {
1071
0
        PyObject *copy = resize_copy(unicode, length);
1072
0
        if (copy == NULL) {
1073
0
            return NULL;
1074
0
        }
1075
0
        Py_DECREF(unicode);
1076
0
        return copy;
1077
0
    }
1078
10.3M
    assert(PyUnicode_IS_COMPACT(unicode));
1079
1080
10.3M
    char_size = PyUnicode_KIND(unicode);
1081
10.3M
    if (PyUnicode_IS_ASCII(unicode))
1082
5.84M
        struct_size = sizeof(PyASCIIObject);
1083
4.48M
    else
1084
4.48M
        struct_size = sizeof(PyCompactUnicodeObject);
1085
1086
10.3M
    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1087
0
        PyErr_NoMemory();
1088
0
        return NULL;
1089
0
    }
1090
10.3M
    new_size = (struct_size + (length + 1) * char_size);
1091
1092
10.3M
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1093
0
        PyMem_Free(_PyUnicode_UTF8(unicode));
1094
0
        PyUnicode_SET_UTF8_LENGTH(unicode, 0);
1095
0
        PyUnicode_SET_UTF8(unicode, NULL);
1096
0
    }
1097
#ifdef Py_TRACE_REFS
1098
    _Py_ForgetReference(unicode);
1099
#endif
1100
10.3M
    _PyReftracerTrack(unicode, PyRefTracer_DESTROY);
1101
1102
10.3M
    new_unicode = (PyObject *)PyObject_Realloc(unicode, new_size);
1103
10.3M
    if (new_unicode == NULL) {
1104
0
        _Py_NewReferenceNoTotal(unicode);
1105
0
        PyErr_NoMemory();
1106
0
        return NULL;
1107
0
    }
1108
10.3M
    unicode = new_unicode;
1109
10.3M
    _Py_NewReferenceNoTotal(unicode);
1110
1111
10.3M
    _PyUnicode_LENGTH(unicode) = length;
1112
#ifdef Py_DEBUG
1113
    unicode_fill_invalid(unicode, old_length);
1114
#endif
1115
10.3M
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1116
10.3M
                    length, 0);
1117
10.3M
    assert(_PyUnicode_CheckConsistency(unicode, 0));
1118
10.3M
    return unicode;
1119
10.3M
}
1120
1121
static int
1122
resize_inplace(PyObject *unicode, Py_ssize_t length)
1123
0
{
1124
0
    assert(!PyUnicode_IS_COMPACT(unicode));
1125
0
    assert(Py_REFCNT(unicode) == 1);
1126
1127
0
    Py_ssize_t new_size;
1128
0
    Py_ssize_t char_size;
1129
0
    int share_utf8;
1130
0
    void *data;
1131
#ifdef Py_DEBUG
1132
    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1133
#endif
1134
1135
0
    data = _PyUnicode_DATA_ANY(unicode);
1136
0
    char_size = PyUnicode_KIND(unicode);
1137
0
    share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
1138
1139
0
    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1140
0
        PyErr_NoMemory();
1141
0
        return -1;
1142
0
    }
1143
0
    new_size = (length + 1) * char_size;
1144
1145
0
    if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1146
0
    {
1147
0
        PyMem_Free(_PyUnicode_UTF8(unicode));
1148
0
        PyUnicode_SET_UTF8_LENGTH(unicode, 0);
1149
0
        PyUnicode_SET_UTF8(unicode, NULL);
1150
0
    }
1151
1152
0
    data = (PyObject *)PyObject_Realloc(data, new_size);
1153
0
    if (data == NULL) {
1154
0
        PyErr_NoMemory();
1155
0
        return -1;
1156
0
    }
1157
0
    _PyUnicode_DATA_ANY(unicode) = data;
1158
0
    if (share_utf8) {
1159
0
        PyUnicode_SET_UTF8_LENGTH(unicode, length);
1160
0
        PyUnicode_SET_UTF8(unicode, data);
1161
0
    }
1162
0
    _PyUnicode_LENGTH(unicode) = length;
1163
0
    PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
1164
#ifdef Py_DEBUG
1165
    unicode_fill_invalid(unicode, old_length);
1166
#endif
1167
1168
    /* check for integer overflow */
1169
0
    if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
1170
0
        PyErr_NoMemory();
1171
0
        return -1;
1172
0
    }
1173
0
    assert(_PyUnicode_CheckConsistency(unicode, 0));
1174
0
    return 0;
1175
0
}
1176
1177
static const char*
1178
unicode_kind_name(PyObject *unicode)
1179
0
{
1180
    /* don't check consistency: unicode_kind_name() is called from
1181
       _PyUnicode_Dump() */
1182
0
    if (!PyUnicode_IS_COMPACT(unicode))
1183
0
    {
1184
0
        switch (PyUnicode_KIND(unicode))
1185
0
        {
1186
0
        case PyUnicode_1BYTE_KIND:
1187
0
            if (PyUnicode_IS_ASCII(unicode))
1188
0
                return "legacy ascii";
1189
0
            else
1190
0
                return "legacy latin1";
1191
0
        case PyUnicode_2BYTE_KIND:
1192
0
            return "legacy UCS2";
1193
0
        case PyUnicode_4BYTE_KIND:
1194
0
            return "legacy UCS4";
1195
0
        default:
1196
0
            return "<legacy invalid kind>";
1197
0
        }
1198
0
    }
1199
0
    switch (PyUnicode_KIND(unicode)) {
1200
0
    case PyUnicode_1BYTE_KIND:
1201
0
        if (PyUnicode_IS_ASCII(unicode))
1202
0
            return "ascii";
1203
0
        else
1204
0
            return "latin1";
1205
0
    case PyUnicode_2BYTE_KIND:
1206
0
        return "UCS2";
1207
0
    case PyUnicode_4BYTE_KIND:
1208
0
        return "UCS4";
1209
0
    default:
1210
0
        return "<invalid compact kind>";
1211
0
    }
1212
0
}
1213
1214
#ifdef Py_DEBUG
1215
/* Functions wrapping macros for use in debugger */
1216
/* Debugger helper: function form of the PyUnicode_UTF8() macro. */
const char *
_PyUnicode_utf8(void *unicode_raw)
{
    return PyUnicode_UTF8(_PyObject_CAST(unicode_raw));
}
1220
1221
/* Debugger helper: function form of the _PyUnicode_COMPACT_DATA() macro. */
const void *
_PyUnicode_compact_data(void *unicode_raw)
{
    return _PyUnicode_COMPACT_DATA(_PyObject_CAST(unicode_raw));
}
1225
const void *_PyUnicode_data(void *unicode_raw) {
1226
    PyObject *unicode = _PyObject_CAST(unicode_raw);
1227
    printf("obj %p\n", (void*)unicode);
1228
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1229
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1230
    printf("ascii op %p\n", (void*)(_PyASCIIObject_CAST(unicode) + 1));
1231
    printf("compact op %p\n", (void*)(_PyCompactUnicodeObject_CAST(unicode) + 1));
1232
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1233
    return PyUnicode_DATA(unicode);
1234
}
1235
1236
void
1237
_PyUnicode_Dump(PyObject *op)
1238
{
1239
    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
1240
    PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
1241
    PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
1242
    const void *data;
1243
1244
    if (ascii->state.compact)
1245
    {
1246
        if (ascii->state.ascii)
1247
            data = (ascii + 1);
1248
        else
1249
            data = (compact + 1);
1250
    }
1251
    else
1252
        data = unicode->data.any;
1253
    printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);
1254
1255
    if (!ascii->state.ascii) {
1256
        printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
1257
    }
1258
    printf(", data=%p\n", data);
1259
}
1260
#endif
1261
1262
1263
PyObject *
1264
PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1265
37.7M
{
1266
    /* Optimization for empty strings */
1267
37.7M
    if (size == 0) {
1268
6.82M
        return _PyUnicode_GetEmpty();
1269
6.82M
    }
1270
1271
30.9M
    PyObject *obj;
1272
30.9M
    PyCompactUnicodeObject *unicode;
1273
30.9M
    void *data;
1274
30.9M
    int kind;
1275
30.9M
    int is_ascii;
1276
30.9M
    Py_ssize_t char_size;
1277
30.9M
    Py_ssize_t struct_size;
1278
1279
30.9M
    is_ascii = 0;
1280
30.9M
    struct_size = sizeof(PyCompactUnicodeObject);
1281
30.9M
    if (maxchar < 128) {
1282
14.3M
        kind = PyUnicode_1BYTE_KIND;
1283
14.3M
        char_size = 1;
1284
14.3M
        is_ascii = 1;
1285
14.3M
        struct_size = sizeof(PyASCIIObject);
1286
14.3M
    }
1287
16.5M
    else if (maxchar < 256) {
1288
2.21M
        kind = PyUnicode_1BYTE_KIND;
1289
2.21M
        char_size = 1;
1290
2.21M
    }
1291
14.3M
    else if (maxchar < 65536) {
1292
13.0M
        kind = PyUnicode_2BYTE_KIND;
1293
13.0M
        char_size = 2;
1294
13.0M
    }
1295
1.34M
    else {
1296
1.34M
        if (maxchar > MAX_UNICODE) {
1297
0
            PyErr_SetString(PyExc_SystemError,
1298
0
                            "invalid maximum character passed to PyUnicode_New");
1299
0
            return NULL;
1300
0
        }
1301
1.34M
        kind = PyUnicode_4BYTE_KIND;
1302
1.34M
        char_size = 4;
1303
1.34M
    }
1304
1305
    /* Ensure we won't overflow the size. */
1306
30.9M
    if (size < 0) {
1307
0
        PyErr_SetString(PyExc_SystemError,
1308
0
                        "Negative size passed to PyUnicode_New");
1309
0
        return NULL;
1310
0
    }
1311
30.9M
    if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1312
0
        return PyErr_NoMemory();
1313
1314
    /* Duplicated allocation code from _PyObject_New() instead of a call to
1315
     * PyObject_New() so we are able to allocate space for the object and
1316
     * it's data buffer.
1317
     */
1318
30.9M
    obj = (PyObject *) PyObject_Malloc(struct_size + (size + 1) * char_size);
1319
30.9M
    if (obj == NULL) {
1320
0
        return PyErr_NoMemory();
1321
0
    }
1322
30.9M
    _PyObject_Init(obj, &PyUnicode_Type);
1323
1324
30.9M
    unicode = (PyCompactUnicodeObject *)obj;
1325
30.9M
    if (is_ascii)
1326
14.3M
        data = ((PyASCIIObject*)obj) + 1;
1327
16.5M
    else
1328
16.5M
        data = unicode + 1;
1329
30.9M
    _PyUnicode_LENGTH(unicode) = size;
1330
30.9M
    _PyUnicode_HASH(unicode) = -1;
1331
30.9M
    _PyUnicode_STATE(unicode).interned = 0;
1332
30.9M
    _PyUnicode_STATE(unicode).kind = kind;
1333
30.9M
    _PyUnicode_STATE(unicode).compact = 1;
1334
30.9M
    _PyUnicode_STATE(unicode).ascii = is_ascii;
1335
30.9M
    _PyUnicode_STATE(unicode).statically_allocated = 0;
1336
30.9M
    if (is_ascii) {
1337
14.3M
        ((char*)data)[size] = 0;
1338
14.3M
    }
1339
16.5M
    else if (kind == PyUnicode_1BYTE_KIND) {
1340
2.21M
        ((char*)data)[size] = 0;
1341
2.21M
        unicode->utf8 = NULL;
1342
2.21M
        unicode->utf8_length = 0;
1343
2.21M
    }
1344
14.3M
    else {
1345
14.3M
        unicode->utf8 = NULL;
1346
14.3M
        unicode->utf8_length = 0;
1347
14.3M
        if (kind == PyUnicode_2BYTE_KIND)
1348
13.0M
            ((Py_UCS2*)data)[size] = 0;
1349
1.34M
        else /* kind == PyUnicode_4BYTE_KIND */
1350
1.34M
            ((Py_UCS4*)data)[size] = 0;
1351
14.3M
    }
1352
#ifdef Py_DEBUG
1353
    unicode_fill_invalid((PyObject*)unicode, 0);
1354
#endif
1355
30.9M
    assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1356
30.9M
    return obj;
1357
30.9M
}
1358
1359
static int
1360
unicode_check_modifiable(PyObject *unicode)
1361
585
{
1362
585
    if (!_PyUnicode_IsModifiable(unicode)) {
1363
0
        PyErr_SetString(PyExc_SystemError,
1364
0
                        "Cannot modify a string currently used");
1365
0
        return -1;
1366
0
    }
1367
585
    return 0;
1368
585
}
1369
1370
static int
1371
_copy_characters(PyObject *to, Py_ssize_t to_start,
1372
                 PyObject *from, Py_ssize_t from_start,
1373
                 Py_ssize_t how_many, int check_maxchar)
1374
53.0M
{
1375
53.0M
    int from_kind, to_kind;
1376
53.0M
    const void *from_data;
1377
53.0M
    void *to_data;
1378
1379
53.0M
    assert(0 <= how_many);
1380
53.0M
    assert(0 <= from_start);
1381
53.0M
    assert(0 <= to_start);
1382
53.0M
    assert(PyUnicode_Check(from));
1383
53.0M
    assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1384
1385
53.0M
    assert(to == NULL || PyUnicode_Check(to));
1386
1387
53.0M
    if (how_many == 0) {
1388
1.25M
        return 0;
1389
1.25M
    }
1390
1391
53.0M
    assert(to != NULL);
1392
51.8M
    assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1393
1394
51.8M
    from_kind = PyUnicode_KIND(from);
1395
51.8M
    from_data = PyUnicode_DATA(from);
1396
51.8M
    to_kind = PyUnicode_KIND(to);
1397
51.8M
    to_data = PyUnicode_DATA(to);
1398
1399
#ifdef Py_DEBUG
1400
    if (!check_maxchar
1401
        && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1402
    {
1403
        Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1404
        Py_UCS4 ch;
1405
        Py_ssize_t i;
1406
        for (i=0; i < how_many; i++) {
1407
            ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1408
            assert(ch <= to_maxchar);
1409
        }
1410
    }
1411
#endif
1412
1413
51.8M
    if (from_kind == to_kind) {
1414
12.3M
        if (check_maxchar
1415
0
            && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1416
0
        {
1417
            /* Writing Latin-1 characters into an ASCII string requires to
1418
               check that all written characters are pure ASCII */
1419
0
            Py_UCS4 max_char;
1420
0
            max_char = ucs1lib_find_max_char(from_data,
1421
0
                                             (const Py_UCS1*)from_data + how_many);
1422
0
            if (max_char >= 128)
1423
0
                return -1;
1424
0
        }
1425
12.3M
        memcpy((char*)to_data + to_kind * to_start,
1426
12.3M
                  (const char*)from_data + from_kind * from_start,
1427
12.3M
                  to_kind * how_many);
1428
12.3M
    }
1429
39.4M
    else if (from_kind == PyUnicode_1BYTE_KIND
1430
39.2M
             && to_kind == PyUnicode_2BYTE_KIND)
1431
39.0M
    {
1432
39.0M
        _PyUnicode_CONVERT_BYTES(
1433
39.0M
            Py_UCS1, Py_UCS2,
1434
39.0M
            PyUnicode_1BYTE_DATA(from) + from_start,
1435
39.0M
            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1436
39.0M
            PyUnicode_2BYTE_DATA(to) + to_start
1437
39.0M
            );
1438
39.0M
    }
1439
427k
    else if (from_kind == PyUnicode_1BYTE_KIND
1440
169k
             && to_kind == PyUnicode_4BYTE_KIND)
1441
169k
    {
1442
169k
        _PyUnicode_CONVERT_BYTES(
1443
169k
            Py_UCS1, Py_UCS4,
1444
169k
            PyUnicode_1BYTE_DATA(from) + from_start,
1445
169k
            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1446
169k
            PyUnicode_4BYTE_DATA(to) + to_start
1447
169k
            );
1448
169k
    }
1449
257k
    else if (from_kind == PyUnicode_2BYTE_KIND
1450
215k
             && to_kind == PyUnicode_4BYTE_KIND)
1451
210k
    {
1452
210k
        _PyUnicode_CONVERT_BYTES(
1453
210k
            Py_UCS2, Py_UCS4,
1454
210k
            PyUnicode_2BYTE_DATA(from) + from_start,
1455
210k
            PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1456
210k
            PyUnicode_4BYTE_DATA(to) + to_start
1457
210k
            );
1458
210k
    }
1459
47.3k
    else {
1460
47.3k
        assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1461
1462
47.3k
        if (!check_maxchar) {
1463
47.3k
            if (from_kind == PyUnicode_2BYTE_KIND
1464
4.98k
                && to_kind == PyUnicode_1BYTE_KIND)
1465
4.98k
            {
1466
4.98k
                _PyUnicode_CONVERT_BYTES(
1467
4.98k
                    Py_UCS2, Py_UCS1,
1468
4.98k
                    PyUnicode_2BYTE_DATA(from) + from_start,
1469
4.98k
                    PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1470
4.98k
                    PyUnicode_1BYTE_DATA(to) + to_start
1471
4.98k
                    );
1472
4.98k
            }
1473
42.3k
            else if (from_kind == PyUnicode_4BYTE_KIND
1474
42.3k
                     && to_kind == PyUnicode_1BYTE_KIND)
1475
30.4k
            {
1476
30.4k
                _PyUnicode_CONVERT_BYTES(
1477
30.4k
                    Py_UCS4, Py_UCS1,
1478
30.4k
                    PyUnicode_4BYTE_DATA(from) + from_start,
1479
30.4k
                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1480
30.4k
                    PyUnicode_1BYTE_DATA(to) + to_start
1481
30.4k
                    );
1482
30.4k
            }
1483
11.8k
            else if (from_kind == PyUnicode_4BYTE_KIND
1484
11.8k
                     && to_kind == PyUnicode_2BYTE_KIND)
1485
11.8k
            {
1486
11.8k
                _PyUnicode_CONVERT_BYTES(
1487
11.8k
                    Py_UCS4, Py_UCS2,
1488
11.8k
                    PyUnicode_4BYTE_DATA(from) + from_start,
1489
11.8k
                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1490
11.8k
                    PyUnicode_2BYTE_DATA(to) + to_start
1491
11.8k
                    );
1492
11.8k
            }
1493
0
            else {
1494
0
                Py_UNREACHABLE();
1495
0
            }
1496
47.3k
        }
1497
0
        else {
1498
0
            const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1499
0
            Py_UCS4 ch;
1500
0
            Py_ssize_t i;
1501
1502
0
            for (i=0; i < how_many; i++) {
1503
0
                ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1504
0
                if (ch > to_maxchar)
1505
0
                    return -1;
1506
0
                PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1507
0
            }
1508
0
        }
1509
47.3k
    }
1510
51.8M
    return 0;
1511
51.8M
}
1512
1513
void
1514
_PyUnicode_FastCopyCharacters(
1515
    PyObject *to, Py_ssize_t to_start,
1516
    PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1517
53.0M
{
1518
53.0M
    (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1519
53.0M
}
1520
1521
Py_ssize_t
1522
PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1523
                         PyObject *from, Py_ssize_t from_start,
1524
                         Py_ssize_t how_many)
1525
0
{
1526
0
    int err;
1527
1528
0
    if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1529
0
        PyErr_BadInternalCall();
1530
0
        return -1;
1531
0
    }
1532
1533
0
    if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
1534
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
1535
0
        return -1;
1536
0
    }
1537
0
    if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
1538
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
1539
0
        return -1;
1540
0
    }
1541
0
    if (how_many < 0) {
1542
0
        PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1543
0
        return -1;
1544
0
    }
1545
0
    how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
1546
0
    if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1547
0
        PyErr_Format(PyExc_SystemError,
1548
0
                     "Cannot write %zi characters at %zi "
1549
0
                     "in a string of %zi characters",
1550
0
                     how_many, to_start, PyUnicode_GET_LENGTH(to));
1551
0
        return -1;
1552
0
    }
1553
1554
0
    if (how_many == 0)
1555
0
        return 0;
1556
1557
0
    if (unicode_check_modifiable(to))
1558
0
        return -1;
1559
1560
0
    err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1561
0
    if (err) {
1562
0
        PyErr_Format(PyExc_SystemError,
1563
0
                     "Cannot copy %s characters "
1564
0
                     "into a string of %s characters",
1565
0
                     unicode_kind_name(from),
1566
0
                     unicode_kind_name(to));
1567
0
        return -1;
1568
0
    }
1569
0
    return how_many;
1570
0
}
1571
1572
/* Find the maximum code point and count the number of surrogate pairs so a
1573
   correct string length can be computed before converting a string to UCS4.
1574
   This function counts single surrogates as a character and not as a pair.
1575
1576
   Return 0 on success, or -1 on error. */
1577
static int
1578
find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1579
                        Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1580
12.5k
{
1581
12.5k
    const wchar_t *iter;
1582
12.5k
    Py_UCS4 ch;
1583
1584
12.5k
    assert(num_surrogates != NULL && maxchar != NULL);
1585
12.5k
    *num_surrogates = 0;
1586
12.5k
    *maxchar = 0;
1587
1588
243k
    for (iter = begin; iter < end; ) {
1589
#if SIZEOF_WCHAR_T == 2
1590
        if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1591
            && (iter+1) < end
1592
            && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1593
        {
1594
            ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1595
            ++(*num_surrogates);
1596
            iter += 2;
1597
        }
1598
        else
1599
#endif
1600
230k
        {
1601
230k
            ch = *iter;
1602
230k
            iter++;
1603
230k
        }
1604
230k
        if (ch > *maxchar) {
1605
45.6k
            *maxchar = ch;
1606
45.6k
            if (*maxchar > MAX_UNICODE) {
1607
0
                PyErr_Format(PyExc_ValueError,
1608
0
                             "character U+%x is not in range [U+0000; U+%x]",
1609
0
                             ch, MAX_UNICODE);
1610
0
                return -1;
1611
0
            }
1612
45.6k
        }
1613
230k
    }
1614
12.5k
    return 0;
1615
12.5k
}
1616
1617
static void
1618
unicode_dealloc(PyObject *unicode)
1619
30.6M
{
1620
#ifdef Py_DEBUG
1621
    if (!unicode_is_finalizing() && unicode_is_singleton(unicode)) {
1622
        _Py_FatalRefcountError("deallocating an Unicode singleton");
1623
    }
1624
#endif
1625
61.2M
    if (_PyUnicode_STATE(unicode).statically_allocated) {
1626
        /* This should never get called, but we also don't want to SEGV if
1627
        * we accidentally decref an immortal string out of existence. Since
1628
        * the string is an immortal object, just re-set the reference count.
1629
        */
1630
#ifdef Py_DEBUG
1631
        Py_UNREACHABLE();
1632
#endif
1633
0
        _Py_SetImmortal(unicode);
1634
0
        return;
1635
0
    }
1636
30.6M
    switch (_PyUnicode_STATE(unicode).interned) {
1637
30.4M
        case SSTATE_NOT_INTERNED:
1638
30.4M
            break;
1639
218k
        case SSTATE_INTERNED_MORTAL:
1640
            /* Remove the object from the intern dict.
1641
             * Before doing so, we set the refcount to 2: the key and value
1642
             * in the interned_dict.
1643
             */
1644
218k
            assert(Py_REFCNT(unicode) == 0);
1645
218k
            Py_SET_REFCNT(unicode, 2);
1646
#ifdef Py_REF_DEBUG
1647
            /* let's be pedantic with the ref total */
1648
            _Py_IncRefTotal(_PyThreadState_GET());
1649
            _Py_IncRefTotal(_PyThreadState_GET());
1650
#endif
1651
218k
            PyInterpreterState *interp = _PyInterpreterState_GET();
1652
218k
            PyObject *interned = get_interned_dict(interp);
1653
218k
            assert(interned != NULL);
1654
218k
            PyObject *popped;
1655
218k
            int r = PyDict_Pop(interned, unicode, &popped);
1656
218k
            if (r == -1) {
1657
0
                PyErr_FormatUnraisable("Exception ignored while "
1658
0
                                       "removing an interned string %R",
1659
0
                                       unicode);
1660
                // We don't know what happened to the string. It's probably
1661
                // best to leak it:
1662
                // - if it was popped, there are no more references to it
1663
                //   so it can't cause trouble (except wasted memory)
1664
                // - if it wasn't popped, it'll remain interned
1665
0
                _Py_SetImmortal(unicode);
1666
0
                _PyUnicode_STATE(unicode).interned = SSTATE_INTERNED_IMMORTAL;
1667
0
                return;
1668
0
            }
1669
218k
            if (r == 0) {
1670
                // The interned string was not found in the interned_dict.
1671
#ifdef Py_DEBUG
1672
                Py_UNREACHABLE();
1673
#endif
1674
0
                _Py_SetImmortal(unicode);
1675
0
                return;
1676
0
            }
1677
            // Successfully popped.
1678
218k
            assert(popped == unicode);
1679
            // Only our `popped` reference should be left; remove it too.
1680
218k
            assert(Py_REFCNT(unicode) == 1);
1681
218k
            Py_SET_REFCNT(unicode, 0);
1682
#ifdef Py_REF_DEBUG
1683
            /* let's be pedantic with the ref total */
1684
            _Py_DecRefTotal(_PyThreadState_GET());
1685
#endif
1686
218k
            break;
1687
0
        default:
1688
            // As with `statically_allocated` above.
1689
#ifdef Py_REF_DEBUG
1690
            Py_UNREACHABLE();
1691
#endif
1692
0
            _Py_SetImmortal(unicode);
1693
0
            return;
1694
30.6M
    }
1695
30.6M
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1696
2.75k
        PyMem_Free(_PyUnicode_UTF8(unicode));
1697
2.75k
    }
1698
30.6M
    if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
1699
0
        PyMem_Free(_PyUnicode_DATA_ANY(unicode));
1700
0
    }
1701
1702
30.6M
    Py_TYPE(unicode)->tp_free(unicode);
1703
30.6M
}
1704
1705
#ifdef Py_DEBUG
1706
static int
1707
unicode_is_singleton(PyObject *unicode)
1708
{
1709
    if (unicode == &_Py_STR(empty)) {
1710
        return 1;
1711
    }
1712
1713
    PyASCIIObject *ascii = _PyASCIIObject_CAST(unicode);
1714
    if (ascii->length == 1) {
1715
        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1716
        if (ch < 256 && LATIN1(ch) == unicode) {
1717
            return 1;
1718
        }
1719
    }
1720
    return 0;
1721
}
1722
#endif
1723
1724
int
1725
_PyUnicode_IsModifiable(PyObject *unicode)
1726
21.8M
{
1727
21.8M
    assert(_PyUnicode_CHECK(unicode));
1728
21.8M
    if (!_PyObject_IsUniquelyReferenced(unicode))
1729
371k
        return 0;
1730
21.4M
    if (PyUnicode_HASH(unicode) != -1)
1731
0
        return 0;
1732
21.4M
    if (PyUnicode_CHECK_INTERNED(unicode))
1733
0
        return 0;
1734
21.4M
    if (!PyUnicode_CheckExact(unicode))
1735
0
        return 0;
1736
#ifdef Py_DEBUG
1737
    /* singleton refcount is greater than 1 */
1738
    assert(!unicode_is_singleton(unicode));
1739
#endif
1740
21.4M
    return 1;
1741
21.4M
}
1742
1743
static int
1744
unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1745
5.56M
{
1746
5.56M
    PyObject *unicode;
1747
5.56M
    Py_ssize_t old_length;
1748
1749
5.56M
    assert(p_unicode != NULL);
1750
5.56M
    unicode = *p_unicode;
1751
1752
5.56M
    assert(unicode != NULL);
1753
5.56M
    assert(PyUnicode_Check(unicode));
1754
5.56M
    assert(0 <= length);
1755
1756
5.56M
    old_length = PyUnicode_GET_LENGTH(unicode);
1757
5.56M
    if (old_length == length)
1758
0
        return 0;
1759
1760
5.56M
    if (length == 0) {
1761
0
        PyObject *empty = _PyUnicode_GetEmpty();
1762
0
        Py_SETREF(*p_unicode, empty);
1763
0
        return 0;
1764
0
    }
1765
1766
5.56M
    if (!_PyUnicode_IsModifiable(unicode)) {
1767
0
        PyObject *copy = resize_copy(unicode, length);
1768
0
        if (copy == NULL)
1769
0
            return -1;
1770
0
        Py_SETREF(*p_unicode, copy);
1771
0
        return 0;
1772
0
    }
1773
1774
5.56M
    if (PyUnicode_IS_COMPACT(unicode)) {
1775
5.56M
        PyObject *new_unicode = _PyUnicode_ResizeCompact(unicode, length);
1776
5.56M
        if (new_unicode == NULL)
1777
0
            return -1;
1778
5.56M
        *p_unicode = new_unicode;
1779
5.56M
        return 0;
1780
5.56M
    }
1781
0
    return resize_inplace(unicode, length);
1782
5.56M
}
1783
1784
int
1785
PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1786
0
{
1787
0
    PyObject *unicode;
1788
0
    if (p_unicode == NULL) {
1789
0
        PyErr_BadInternalCall();
1790
0
        return -1;
1791
0
    }
1792
0
    unicode = *p_unicode;
1793
0
    if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
1794
0
    {
1795
0
        PyErr_BadInternalCall();
1796
0
        return -1;
1797
0
    }
1798
0
    return unicode_resize(p_unicode, length);
1799
0
}
1800
1801
static PyObject*
1802
get_latin1_char(Py_UCS1 ch)
1803
121M
{
1804
121M
    PyObject *o = LATIN1(ch);
1805
121M
    return o;
1806
121M
}
1807
1808
static PyObject*
1809
unicode_char(Py_UCS4 ch)
1810
87.3M
{
1811
87.3M
    PyObject *unicode;
1812
1813
87.3M
    assert(ch <= MAX_UNICODE);
1814
1815
87.3M
    if (ch < 256) {
1816
77.7M
        return get_latin1_char(ch);
1817
77.7M
    }
1818
1819
9.55M
    unicode = PyUnicode_New(1, ch);
1820
9.55M
    if (unicode == NULL)
1821
0
        return NULL;
1822
1823
9.55M
    assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
1824
19.1M
    if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
1825
8.70M
        PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1826
8.70M
    } else {
1827
854k
        assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1828
854k
        PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1829
854k
    }
1830
9.55M
    assert(_PyUnicode_CheckConsistency(unicode, 1));
1831
9.55M
    return unicode;
1832
9.55M
}
1833
1834
1835
static inline void
1836
unicode_write_widechar(int kind, void *data,
1837
                       const wchar_t *u, Py_ssize_t size,
1838
                       Py_ssize_t num_surrogates)
1839
12.5k
{
1840
12.5k
    switch (kind) {
1841
12.5k
    case PyUnicode_1BYTE_KIND:
1842
12.5k
        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char, u, u + size, data);
1843
12.5k
        break;
1844
1845
0
    case PyUnicode_2BYTE_KIND:
1846
#if SIZEOF_WCHAR_T == 2
1847
        memcpy(data, u, size * 2);
1848
#else
1849
0
        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2, u, u + size, data);
1850
0
#endif
1851
0
        break;
1852
1853
0
    case PyUnicode_4BYTE_KIND:
1854
0
    {
1855
#if SIZEOF_WCHAR_T == 2
1856
        // Convert a 16-bits wchar_t representation to UCS4, this will decode
1857
        // surrogate pairs.
1858
        const wchar_t *end = u + size;
1859
        Py_UCS4 *ucs4_out = (Py_UCS4*)data;
1860
#  ifndef NDEBUG
1861
        Py_UCS4 *ucs4_end = (Py_UCS4*)data + (size - num_surrogates);
1862
#  endif
1863
        for (const wchar_t *iter = u; iter < end; ) {
1864
            assert(ucs4_out < ucs4_end);
1865
            if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1866
                && (iter+1) < end
1867
                && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1868
            {
1869
                *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1870
                iter += 2;
1871
            }
1872
            else {
1873
                *ucs4_out++ = *iter;
1874
                iter++;
1875
            }
1876
        }
1877
        assert(ucs4_out == ucs4_end);
1878
#else
1879
0
        assert(num_surrogates == 0);
1880
0
        memcpy(data, u, size * 4);
1881
0
#endif
1882
0
        break;
1883
0
    }
1884
0
    default:
1885
0
        Py_UNREACHABLE();
1886
12.5k
    }
1887
12.5k
}
1888
1889
1890
PyObject *
1891
PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
1892
12.6k
{
1893
12.6k
    PyObject *unicode;
1894
12.6k
    Py_UCS4 maxchar = 0;
1895
12.6k
    Py_ssize_t num_surrogates;
1896
1897
12.6k
    if (u == NULL && size != 0) {
1898
0
        PyErr_BadInternalCall();
1899
0
        return NULL;
1900
0
    }
1901
1902
12.6k
    if (size == -1) {
1903
924
        size = wcslen(u);
1904
924
    }
1905
1906
    /* If the Unicode data is known at construction time, we can apply
1907
       some optimizations which share commonly used objects. */
1908
1909
    /* Optimization for empty strings */
1910
12.6k
    if (size == 0)
1911
44
        _Py_RETURN_UNICODE_EMPTY();
1912
1913
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
1914
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
1915
       non-Unicode locales and hence needs conversion to UCS-4 first. */
1916
    if (_Py_LocaleUsesNonUnicodeWchar()) {
1917
        wchar_t* converted = _Py_DecodeNonUnicodeWchar(u, size);
1918
        if (!converted) {
1919
            return NULL;
1920
        }
1921
        PyObject *unicode = _PyUnicode_FromUCS4(converted, size);
1922
        PyMem_Free(converted);
1923
        return unicode;
1924
    }
1925
#endif
1926
1927
    /* Single character Unicode objects in the Latin-1 range are
1928
       shared when using this constructor */
1929
12.5k
    if (size == 1 && (Py_UCS4)*u < 256)
1930
0
        return get_latin1_char((unsigned char)*u);
1931
1932
    /* If not empty and not single character, copy the Unicode data
1933
       into the new object */
1934
12.5k
    if (find_maxchar_surrogates(u, u + size,
1935
12.5k
                                &maxchar, &num_surrogates) == -1)
1936
0
        return NULL;
1937
1938
12.5k
    unicode = PyUnicode_New(size - num_surrogates, maxchar);
1939
12.5k
    if (!unicode)
1940
0
        return NULL;
1941
1942
12.5k
    unicode_write_widechar(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1943
12.5k
                           u, size, num_surrogates);
1944
1945
12.5k
    return unicode_result(unicode);
1946
12.5k
}
1947
1948
1949
int
1950
PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *pub_writer,
1951
                              const wchar_t *str,
1952
                              Py_ssize_t size)
1953
0
{
1954
0
    _PyUnicodeWriter *writer = (_PyUnicodeWriter *)pub_writer;
1955
1956
0
    if (size < 0) {
1957
0
        size = wcslen(str);
1958
0
    }
1959
1960
0
    if (size == 0) {
1961
0
        return 0;
1962
0
    }
1963
1964
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
1965
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
1966
       non-Unicode locales and hence needs conversion to UCS-4 first. */
1967
    if (_Py_LocaleUsesNonUnicodeWchar()) {
1968
        wchar_t* converted = _Py_DecodeNonUnicodeWchar(str, size);
1969
        if (!converted) {
1970
            return -1;
1971
        }
1972
1973
        int res = PyUnicodeWriter_WriteUCS4(pub_writer, converted, size);
1974
        PyMem_Free(converted);
1975
        return res;
1976
    }
1977
#endif
1978
1979
0
    Py_UCS4 maxchar = 0;
1980
0
    Py_ssize_t num_surrogates;
1981
0
    if (find_maxchar_surrogates(str, str + size,
1982
0
                                &maxchar, &num_surrogates) == -1) {
1983
0
        return -1;
1984
0
    }
1985
1986
0
    if (_PyUnicodeWriter_Prepare(writer, size - num_surrogates, maxchar) < 0) {
1987
0
        return -1;
1988
0
    }
1989
1990
0
    int kind = writer->kind;
1991
0
    void *data = (Py_UCS1*)writer->data + writer->pos * kind;
1992
0
    unicode_write_widechar(kind, data, str, size, num_surrogates);
1993
1994
0
    writer->pos += size - num_surrogates;
1995
0
    return 0;
1996
0
}
1997
1998
1999
PyObject *
2000
PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
2001
54.9k
{
2002
54.9k
    if (size < 0) {
2003
0
        PyErr_SetString(PyExc_SystemError,
2004
0
                        "Negative size passed to PyUnicode_FromStringAndSize");
2005
0
        return NULL;
2006
0
    }
2007
54.9k
    if (u != NULL) {
2008
54.9k
        return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2009
54.9k
    }
2010
0
    if (size > 0) {
2011
0
        PyErr_SetString(PyExc_SystemError,
2012
0
            "NULL string with positive size with NULL passed to PyUnicode_FromStringAndSize");
2013
0
        return NULL;
2014
0
    }
2015
0
    return _PyUnicode_GetEmpty();
2016
0
}
2017
2018
PyObject *
2019
PyUnicode_FromString(const char *u)
2020
2.63M
{
2021
2.63M
    size_t size = strlen(u);
2022
2.63M
    if (size > PY_SSIZE_T_MAX) {
2023
0
        PyErr_SetString(PyExc_OverflowError, "input too long");
2024
0
        return NULL;
2025
0
    }
2026
2.63M
    return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
2027
2.63M
}
2028
2029
2030
PyObject *
2031
_PyUnicode_FromId(_Py_Identifier *id)
2032
0
{
2033
0
    PyMutex_Lock((PyMutex *)&id->mutex);
2034
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
2035
0
    struct _Py_unicode_ids *ids = &interp->unicode.ids;
2036
2037
0
    Py_ssize_t index = _Py_atomic_load_ssize(&id->index);
2038
0
    if (index < 0) {
2039
0
        struct _Py_unicode_runtime_ids *rt_ids = &interp->runtime->unicode_state.ids;
2040
2041
0
        PyMutex_Lock(&rt_ids->mutex);
2042
        // Check again to detect concurrent access. Another thread can have
2043
        // initialized the index while this thread waited for the lock.
2044
0
        index = _Py_atomic_load_ssize(&id->index);
2045
0
        if (index < 0) {
2046
0
            assert(rt_ids->next_index < PY_SSIZE_T_MAX);
2047
0
            index = rt_ids->next_index;
2048
0
            rt_ids->next_index++;
2049
0
            _Py_atomic_store_ssize(&id->index, index);
2050
0
        }
2051
0
        PyMutex_Unlock(&rt_ids->mutex);
2052
0
    }
2053
0
    assert(index >= 0);
2054
2055
0
    PyObject *obj;
2056
0
    if (index < ids->size) {
2057
0
        obj = ids->array[index];
2058
0
        if (obj) {
2059
            // Return a borrowed reference
2060
0
            goto end;
2061
0
        }
2062
0
    }
2063
2064
0
    obj = PyUnicode_DecodeUTF8Stateful(id->string, strlen(id->string),
2065
0
                                       NULL, NULL);
2066
0
    if (!obj) {
2067
0
        goto end;
2068
0
    }
2069
0
    _PyUnicode_InternImmortal(interp, &obj);
2070
2071
0
    if (index >= ids->size) {
2072
        // Overallocate to reduce the number of realloc
2073
0
        Py_ssize_t new_size = Py_MAX(index * 2, 16);
2074
0
        Py_ssize_t item_size = sizeof(ids->array[0]);
2075
0
        PyObject **new_array = PyMem_Realloc(ids->array, new_size * item_size);
2076
0
        if (new_array == NULL) {
2077
0
            PyErr_NoMemory();
2078
0
            obj = NULL;
2079
0
            goto end;
2080
0
        }
2081
0
        memset(&new_array[ids->size], 0, (new_size - ids->size) * item_size);
2082
0
        ids->array = new_array;
2083
0
        ids->size = new_size;
2084
0
    }
2085
2086
    // The array stores a strong reference
2087
0
    ids->array[index] = obj;
2088
2089
0
end:
2090
0
    PyMutex_Unlock((PyMutex *)&id->mutex);
2091
    // Return a borrowed reference
2092
0
    return obj;
2093
0
}
2094
2095
2096
static void
2097
unicode_clear_identifiers(struct _Py_unicode_state *state)
2098
0
{
2099
0
    struct _Py_unicode_ids *ids = &state->ids;
2100
0
    for (Py_ssize_t i=0; i < ids->size; i++) {
2101
0
        Py_XDECREF(ids->array[i]);
2102
0
    }
2103
0
    ids->size = 0;
2104
0
    PyMem_Free(ids->array);
2105
0
    ids->array = NULL;
2106
    // Don't reset _PyRuntime next_index: _Py_Identifier.id remains valid
2107
    // after Py_Finalize().
2108
0
}
2109
2110
2111
/* Internal function, doesn't check maximum character */
2112
2113
PyObject*
2114
_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
2115
7.25M
{
2116
7.25M
    const unsigned char *s = (const unsigned char *)buffer;
2117
7.25M
    PyObject *unicode;
2118
7.25M
    if (size == 1) {
2119
#ifdef Py_DEBUG
2120
        assert((unsigned char)s[0] < 128);
2121
#endif
2122
131k
        return get_latin1_char(s[0]);
2123
131k
    }
2124
7.11M
    unicode = PyUnicode_New(size, 127);
2125
7.11M
    if (!unicode)
2126
0
        return NULL;
2127
7.11M
    memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2128
7.11M
    assert(_PyUnicode_CheckConsistency(unicode, 1));
2129
7.11M
    return unicode;
2130
7.11M
}
2131
2132
static Py_UCS4
2133
kind_maxchar_limit(int kind)
2134
18
{
2135
18
    switch (kind) {
2136
0
    case PyUnicode_1BYTE_KIND:
2137
0
        return 0x80;
2138
11
    case PyUnicode_2BYTE_KIND:
2139
11
        return 0x100;
2140
7
    case PyUnicode_4BYTE_KIND:
2141
7
        return 0x10000;
2142
0
    default:
2143
0
        Py_UNREACHABLE();
2144
18
    }
2145
18
}
2146
2147
static PyObject*
2148
_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
2149
4.40M
{
2150
4.40M
    PyObject *res;
2151
4.40M
    unsigned char max_char;
2152
2153
4.40M
    if (size == 0) {
2154
4.23M
        _Py_RETURN_UNICODE_EMPTY();
2155
4.23M
    }
2156
4.40M
    assert(size > 0);
2157
170k
    if (size == 1) {
2158
14.3k
        return get_latin1_char(u[0]);
2159
14.3k
    }
2160
2161
156k
    max_char = ucs1lib_find_max_char(u, u + size);
2162
156k
    res = PyUnicode_New(size, max_char);
2163
156k
    if (!res)
2164
0
        return NULL;
2165
156k
    memcpy(PyUnicode_1BYTE_DATA(res), u, size);
2166
156k
    assert(_PyUnicode_CheckConsistency(res, 1));
2167
156k
    return res;
2168
156k
}
2169
2170
static PyObject*
2171
_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
2172
8.25M
{
2173
8.25M
    PyObject *res;
2174
8.25M
    Py_UCS2 max_char;
2175
2176
8.25M
    if (size == 0)
2177
8.13M
        _Py_RETURN_UNICODE_EMPTY();
2178
8.25M
    assert(size > 0);
2179
118k
    if (size == 1)
2180
12.3k
        return unicode_char(u[0]);
2181
2182
106k
    max_char = ucs2lib_find_max_char(u, u + size);
2183
106k
    res = PyUnicode_New(size, max_char);
2184
106k
    if (!res)
2185
0
        return NULL;
2186
106k
    if (max_char >= 256)
2187
73.7k
        memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
2188
32.4k
    else {
2189
32.4k
        _PyUnicode_CONVERT_BYTES(
2190
32.4k
            Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2191
32.4k
    }
2192
106k
    assert(_PyUnicode_CheckConsistency(res, 1));
2193
106k
    return res;
2194
106k
}
2195
2196
static PyObject*
2197
_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
2198
9.65M
{
2199
9.65M
    PyObject *res;
2200
9.65M
    Py_UCS4 max_char;
2201
2202
9.65M
    if (size == 0)
2203
9.03M
        _Py_RETURN_UNICODE_EMPTY();
2204
9.65M
    assert(size > 0);
2205
616k
    if (size == 1)
2206
183k
        return unicode_char(u[0]);
2207
2208
433k
    max_char = ucs4lib_find_max_char(u, u + size);
2209
433k
    res = PyUnicode_New(size, max_char);
2210
433k
    if (!res)
2211
0
        return NULL;
2212
433k
    if (max_char < 256)
2213
361k
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2214
433k
                                 PyUnicode_1BYTE_DATA(res));
2215
71.9k
    else if (max_char < 0x10000)
2216
57.3k
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2217
71.9k
                                 PyUnicode_2BYTE_DATA(res));
2218
14.5k
    else
2219
14.5k
        memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
2220
433k
    assert(_PyUnicode_CheckConsistency(res, 1));
2221
433k
    return res;
2222
433k
}
2223
2224
2225
int
2226
PyUnicodeWriter_WriteUCS4(PyUnicodeWriter *pub_writer,
2227
                          const Py_UCS4 *str,
2228
                          Py_ssize_t size)
2229
0
{
2230
0
    _PyUnicodeWriter *writer = (_PyUnicodeWriter*)pub_writer;
2231
2232
0
    if (size < 0) {
2233
0
        PyErr_SetString(PyExc_ValueError,
2234
0
                        "size must be positive");
2235
0
        return -1;
2236
0
    }
2237
2238
0
    if (size == 0) {
2239
0
        return 0;
2240
0
    }
2241
2242
0
    Py_UCS4 max_char = ucs4lib_find_max_char(str, str + size);
2243
2244
0
    if (_PyUnicodeWriter_Prepare(writer, size, max_char) < 0) {
2245
0
        return -1;
2246
0
    }
2247
2248
0
    int kind = writer->kind;
2249
0
    void *data = (Py_UCS1*)writer->data + writer->pos * kind;
2250
0
    if (kind == PyUnicode_1BYTE_KIND) {
2251
0
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1,
2252
0
                                 str, str + size,
2253
0
                                 data);
2254
0
    }
2255
0
    else if (kind == PyUnicode_2BYTE_KIND) {
2256
0
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2,
2257
0
                                 str, str + size,
2258
0
                                 data);
2259
0
    }
2260
0
    else {
2261
0
        memcpy(data, str, size * sizeof(Py_UCS4));
2262
0
    }
2263
0
    writer->pos += size;
2264
2265
0
    return 0;
2266
0
}
2267
2268
2269
PyObject*
2270
PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2271
2.00M
{
2272
2.00M
    if (size < 0) {
2273
0
        PyErr_SetString(PyExc_ValueError, "size must be positive");
2274
0
        return NULL;
2275
0
    }
2276
2.00M
    switch (kind) {
2277
164k
    case PyUnicode_1BYTE_KIND:
2278
164k
        return _PyUnicode_FromUCS1(buffer, size);
2279
125k
    case PyUnicode_2BYTE_KIND:
2280
125k
        return _PyUnicode_FromUCS2(buffer, size);
2281
1.71M
    case PyUnicode_4BYTE_KIND:
2282
1.71M
        return _PyUnicode_FromUCS4(buffer, size);
2283
0
    default:
2284
0
        PyErr_SetString(PyExc_SystemError, "invalid kind");
2285
0
        return NULL;
2286
2.00M
    }
2287
2.00M
}
2288
2289
Py_UCS4
2290
_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2291
835k
{
2292
835k
    int kind;
2293
835k
    const void *startptr, *endptr;
2294
2295
835k
    assert(0 <= start);
2296
835k
    assert(end <= PyUnicode_GET_LENGTH(unicode));
2297
835k
    assert(start <= end);
2298
2299
835k
    if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2300
0
        return PyUnicode_MAX_CHAR_VALUE(unicode);
2301
2302
835k
    if (start == end)
2303
0
        return 127;
2304
2305
835k
    if (PyUnicode_IS_ASCII(unicode))
2306
766k
        return 127;
2307
2308
69.0k
    kind = PyUnicode_KIND(unicode);
2309
69.0k
    startptr = PyUnicode_DATA(unicode);
2310
69.0k
    endptr = (char *)startptr + end * kind;
2311
69.0k
    startptr = (char *)startptr + start * kind;
2312
69.0k
    switch(kind) {
2313
7.97k
    case PyUnicode_1BYTE_KIND:
2314
7.97k
        return ucs1lib_find_max_char(startptr, endptr);
2315
17.1k
    case PyUnicode_2BYTE_KIND:
2316
17.1k
        return ucs2lib_find_max_char(startptr, endptr);
2317
43.9k
    case PyUnicode_4BYTE_KIND:
2318
43.9k
        return ucs4lib_find_max_char(startptr, endptr);
2319
0
    default:
2320
0
        Py_UNREACHABLE();
2321
69.0k
    }
2322
69.0k
}
2323
2324
/* Ensure that a string uses the most efficient storage, if it is not the
2325
   case: create a new string of the right kind. Write NULL into *p_unicode
2326
   on error. */
2327
static void
2328
unicode_adjust_maxchar(PyObject **p_unicode)
2329
0
{
2330
0
    PyObject *unicode, *copy;
2331
0
    Py_UCS4 max_char;
2332
0
    Py_ssize_t len;
2333
0
    int kind;
2334
2335
0
    assert(p_unicode != NULL);
2336
0
    unicode = *p_unicode;
2337
0
    if (PyUnicode_IS_ASCII(unicode))
2338
0
        return;
2339
2340
0
    len = PyUnicode_GET_LENGTH(unicode);
2341
0
    kind = PyUnicode_KIND(unicode);
2342
0
    if (kind == PyUnicode_1BYTE_KIND) {
2343
0
        const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2344
0
        max_char = ucs1lib_find_max_char(u, u + len);
2345
0
        if (max_char >= 128)
2346
0
            return;
2347
0
    }
2348
0
    else if (kind == PyUnicode_2BYTE_KIND) {
2349
0
        const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2350
0
        max_char = ucs2lib_find_max_char(u, u + len);
2351
0
        if (max_char >= 256)
2352
0
            return;
2353
0
    }
2354
0
    else if (kind == PyUnicode_4BYTE_KIND) {
2355
0
        const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2356
0
        max_char = ucs4lib_find_max_char(u, u + len);
2357
0
        if (max_char >= 0x10000)
2358
0
            return;
2359
0
    }
2360
0
    else
2361
0
        Py_UNREACHABLE();
2362
2363
0
    copy = PyUnicode_New(len, max_char);
2364
0
    if (copy != NULL)
2365
0
        _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2366
0
    Py_DECREF(unicode);
2367
0
    *p_unicode = copy;
2368
0
}
2369
2370
/* Return a new string with the same length, maximum character value and
   character data as `unicode`.

   Returns NULL with a "bad internal call" error if `unicode` is not a str,
   or NULL if the allocation fails. */
PyObject*
_PyUnicode_Copy(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadInternalCall();
        return NULL;
    }

    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(unicode);
    PyObject *duplicate = PyUnicode_New(nchars,
                                        PyUnicode_MAX_CHAR_VALUE(unicode));
    if (duplicate == NULL) {
        return NULL;
    }
    assert(PyUnicode_KIND(duplicate) == PyUnicode_KIND(unicode));

    /* Same kind on both sides, so a raw byte copy is enough. */
    memcpy(PyUnicode_DATA(duplicate), PyUnicode_DATA(unicode),
           nchars * PyUnicode_KIND(unicode));
    assert(_PyUnicode_CheckConsistency(duplicate, 1));
    return duplicate;
}
2392
2393
2394
/* Widen Unicode objects to larger buffers. Don't write terminating null
2395
   character. Return NULL on error. */
2396
2397
static void*
2398
unicode_askind(int skind, void const *data, Py_ssize_t len, int kind)
2399
3.76k
{
2400
3.76k
    void *result;
2401
2402
3.76k
    assert(skind < kind);
2403
3.76k
    switch (kind) {
2404
1.73k
    case PyUnicode_2BYTE_KIND:
2405
1.73k
        result = PyMem_New(Py_UCS2, len);
2406
1.73k
        if (!result)
2407
0
            return PyErr_NoMemory();
2408
1.73k
        assert(skind == PyUnicode_1BYTE_KIND);
2409
1.73k
        _PyUnicode_CONVERT_BYTES(
2410
1.73k
            Py_UCS1, Py_UCS2,
2411
1.73k
            (const Py_UCS1 *)data,
2412
1.73k
            ((const Py_UCS1 *)data) + len,
2413
1.73k
            result);
2414
1.73k
        return result;
2415
2.03k
    case PyUnicode_4BYTE_KIND:
2416
2.03k
        result = PyMem_New(Py_UCS4, len);
2417
2.03k
        if (!result)
2418
0
            return PyErr_NoMemory();
2419
2.03k
        if (skind == PyUnicode_2BYTE_KIND) {
2420
0
            _PyUnicode_CONVERT_BYTES(
2421
0
                Py_UCS2, Py_UCS4,
2422
0
                (const Py_UCS2 *)data,
2423
0
                ((const Py_UCS2 *)data) + len,
2424
0
                result);
2425
0
        }
2426
2.03k
        else {
2427
2.03k
            assert(skind == PyUnicode_1BYTE_KIND);
2428
2.03k
            _PyUnicode_CONVERT_BYTES(
2429
2.03k
                Py_UCS1, Py_UCS4,
2430
2.03k
                (const Py_UCS1 *)data,
2431
2.03k
                ((const Py_UCS1 *)data) + len,
2432
2.03k
                result);
2433
2.03k
        }
2434
2.03k
        return result;
2435
0
    default:
2436
0
        Py_UNREACHABLE();
2437
0
        return NULL;
2438
3.76k
    }
2439
3.76k
}
2440
2441
static Py_UCS4*
2442
as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2443
        int copy_null)
2444
0
{
2445
0
    int kind;
2446
0
    const void *data;
2447
0
    Py_ssize_t len, targetlen;
2448
0
    kind = PyUnicode_KIND(string);
2449
0
    data = PyUnicode_DATA(string);
2450
0
    len = PyUnicode_GET_LENGTH(string);
2451
0
    targetlen = len;
2452
0
    if (copy_null)
2453
0
        targetlen++;
2454
0
    if (!target) {
2455
0
        target = PyMem_New(Py_UCS4, targetlen);
2456
0
        if (!target) {
2457
0
            PyErr_NoMemory();
2458
0
            return NULL;
2459
0
        }
2460
0
    }
2461
0
    else {
2462
0
        if (targetsize < targetlen) {
2463
0
            PyErr_Format(PyExc_SystemError,
2464
0
                         "string is longer than the buffer");
2465
0
            if (copy_null && 0 < targetsize)
2466
0
                target[0] = 0;
2467
0
            return NULL;
2468
0
        }
2469
0
    }
2470
0
    if (kind == PyUnicode_1BYTE_KIND) {
2471
0
        const Py_UCS1 *start = (const Py_UCS1 *) data;
2472
0
        _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2473
0
    }
2474
0
    else if (kind == PyUnicode_2BYTE_KIND) {
2475
0
        const Py_UCS2 *start = (const Py_UCS2 *) data;
2476
0
        _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2477
0
    }
2478
0
    else if (kind == PyUnicode_4BYTE_KIND) {
2479
0
        memcpy(target, data, len * sizeof(Py_UCS4));
2480
0
    }
2481
0
    else {
2482
0
        Py_UNREACHABLE();
2483
0
    }
2484
0
    if (copy_null)
2485
0
        target[len] = 0;
2486
0
    return target;
2487
0
}
2488
2489
Py_UCS4*
2490
PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2491
                 int copy_null)
2492
0
{
2493
0
    if (target == NULL || targetsize < 0) {
2494
0
        PyErr_BadInternalCall();
2495
0
        return NULL;
2496
0
    }
2497
0
    return as_ucs4(string, target, targetsize, copy_null);
2498
0
}
2499
2500
Py_UCS4*
2501
PyUnicode_AsUCS4Copy(PyObject *string)
2502
0
{
2503
0
    return as_ucs4(string, NULL, 0, 1);
2504
0
}
2505
2506
/* maximum number of characters required for output of %jo or %jd or %p.
2507
   We need at most ceil(log8(256)*sizeof(intmax_t)) digits,
2508
   plus 1 for the sign, plus 2 for the 0x prefix (for %p),
2509
   plus 1 for the terminal NUL. */
2510
#define MAX_INTMAX_CHARS (5 + (sizeof(intmax_t)*8-1) / 3)
2511
2512
static int
2513
unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2514
                             Py_ssize_t width, Py_ssize_t precision, int flags)
2515
71.2k
{
2516
71.2k
    Py_ssize_t length, fill, arglen;
2517
71.2k
    Py_UCS4 maxchar;
2518
2519
71.2k
    length = PyUnicode_GET_LENGTH(str);
2520
71.2k
    if ((precision == -1 || precision >= length)
2521
69.8k
        && width <= length)
2522
69.8k
        return _PyUnicodeWriter_WriteStr(writer, str);
2523
2524
1.49k
    if (precision != -1)
2525
1.49k
        length = Py_MIN(precision, length);
2526
2527
1.49k
    arglen = Py_MAX(length, width);
2528
1.49k
    if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2529
367
        maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2530
1.12k
    else
2531
1.12k
        maxchar = writer->maxchar;
2532
2533
1.49k
    if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2534
0
        return -1;
2535
2536
1.49k
    fill = Py_MAX(width - length, 0);
2537
1.49k
    if (fill && !(flags & F_LJUST)) {
2538
0
        if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2539
0
            return -1;
2540
0
        writer->pos += fill;
2541
0
    }
2542
2543
1.49k
    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2544
1.49k
                                  str, 0, length);
2545
1.49k
    writer->pos += length;
2546
2547
1.49k
    if (fill && (flags & F_LJUST)) {
2548
0
        if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2549
0
            return -1;
2550
0
        writer->pos += fill;
2551
0
    }
2552
2553
1.49k
    return 0;
2554
1.49k
}
2555
2556
static int
2557
unicode_fromformat_write_utf8(_PyUnicodeWriter *writer, const char *str,
2558
                              Py_ssize_t width, Py_ssize_t precision, int flags)
2559
127k
{
2560
    /* UTF-8 */
2561
127k
    Py_ssize_t *pconsumed = NULL;
2562
127k
    Py_ssize_t length;
2563
127k
    if (precision == -1) {
2564
55.5k
        length = strlen(str);
2565
55.5k
    }
2566
71.9k
    else {
2567
71.9k
        length = 0;
2568
1.89M
        while (length < precision && str[length]) {
2569
1.82M
            length++;
2570
1.82M
        }
2571
71.9k
        if (length == precision) {
2572
            /* The input string is not NUL-terminated.  If it ends with an
2573
             * incomplete UTF-8 sequence, truncate the string just before it.
2574
             * Incomplete sequences in the middle and sequences which cannot
2575
             * be valid prefixes are still treated as errors and replaced
2576
             * with \xfffd. */
2577
1.11k
            pconsumed = &length;
2578
1.11k
        }
2579
71.9k
    }
2580
2581
127k
    if (width < 0) {
2582
127k
        return _PyUnicode_DecodeUTF8Writer(writer, str, length,
2583
127k
                                           _Py_ERROR_REPLACE, "replace", pconsumed);
2584
127k
    }
2585
2586
0
    PyObject *unicode = PyUnicode_DecodeUTF8Stateful(str, length,
2587
0
                                                     "replace", pconsumed);
2588
0
    if (unicode == NULL)
2589
0
        return -1;
2590
2591
0
    int res = unicode_fromformat_write_str(writer, unicode,
2592
0
                                           width, -1, flags);
2593
0
    Py_DECREF(unicode);
2594
0
    return res;
2595
0
}
2596
2597
static int
2598
unicode_fromformat_write_wcstr(_PyUnicodeWriter *writer, const wchar_t *str,
2599
                              Py_ssize_t width, Py_ssize_t precision, int flags)
2600
0
{
2601
0
    Py_ssize_t length;
2602
0
    if (precision == -1) {
2603
0
        length = wcslen(str);
2604
0
    }
2605
0
    else {
2606
0
        length = 0;
2607
0
        while (length < precision && str[length]) {
2608
0
            length++;
2609
0
        }
2610
0
    }
2611
2612
0
    if (width < 0) {
2613
0
        return PyUnicodeWriter_WriteWideChar((PyUnicodeWriter*)writer,
2614
0
                                             str, length);
2615
0
    }
2616
2617
0
    PyObject *unicode = PyUnicode_FromWideChar(str, length);
2618
0
    if (unicode == NULL)
2619
0
        return -1;
2620
2621
0
    int res = unicode_fromformat_write_str(writer, unicode, width, -1, flags);
2622
0
    Py_DECREF(unicode);
2623
0
    return res;
2624
0
}
2625
2626
0
/* NOTE(review): presumably codes for the printf-style size modifiers
   (l, ll, z, t, j) recognised by unicode_fromformat_arg() -- confirm
   against its parser, which is not fully visible here. */
#define F_LONG 1
2627
0
#define F_LONGLONG 2
2628
71.4k
#define F_SIZE 3
2629
0
#define F_PTRDIFF 4
2630
0
#define F_INTMAX 5
2631
2632
static const char*
2633
unicode_fromformat_arg(_PyUnicodeWriter *writer,
2634
                       const char *f, va_list *vargs)
2635
253k
{
2636
253k
    const char *p;
2637
253k
    Py_ssize_t len;
2638
253k
    int flags = 0;
2639
253k
    Py_ssize_t width;
2640
253k
    Py_ssize_t precision;
2641
2642
253k
    p = f;
2643
253k
    f++;
2644
253k
    if (*f == '%') {
2645
0
        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
2646
0
            return NULL;
2647
0
        f++;
2648
0
        return f;
2649
0
    }
2650
2651
    /* Parse flags. Example: "%-i" => flags=F_LJUST. */
2652
    /* Flags '+', ' ' and '#' are not particularly useful.
2653
     * They are not worth the implementation and maintenance costs.
2654
     * In addition, '#' should add "0" for "o" conversions for compatibility
2655
     * with printf, but it would confuse Python users. */
2656
255k
    while (1) {
2657
255k
        switch (*f++) {
2658
0
        case '-': flags |= F_LJUST; continue;
2659
1.63k
        case '0': flags |= F_ZERO; continue;
2660
0
        case '#': flags |= F_ALT; continue;
2661
255k
        }
2662
253k
        f--;
2663
253k
        break;
2664
255k
    }
2665
2666
    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2667
253k
    width = -1;
2668
253k
    if (*f == '*') {
2669
0
        width = va_arg(*vargs, int);
2670
0
        if (width < 0) {
2671
0
            flags |= F_LJUST;
2672
0
            width = -width;
2673
0
        }
2674
0
        f++;
2675
0
    }
2676
253k
    else if (Py_ISDIGIT((unsigned)*f)) {
2677
1.63k
        width = *f - '0';
2678
1.63k
        f++;
2679
1.63k
        while (Py_ISDIGIT((unsigned)*f)) {
2680
0
            if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2681
0
                PyErr_SetString(PyExc_ValueError,
2682
0
                                "width too big");
2683
0
                return NULL;
2684
0
            }
2685
0
            width = (width * 10) + (*f - '0');
2686
0
            f++;
2687
0
        }
2688
1.63k
    }
2689
253k
    precision = -1;
2690
253k
    if (*f == '.') {
2691
76.8k
        f++;
2692
76.8k
        if (*f == '*') {
2693
0
            precision = va_arg(*vargs, int);
2694
0
            if (precision < 0) {
2695
0
                precision = -2;
2696
0
            }
2697
0
            f++;
2698
0
        }
2699
76.8k
        else if (Py_ISDIGIT((unsigned)*f)) {
2700
76.8k
            precision = (*f - '0');
2701
76.8k
            f++;
2702
227k
            while (Py_ISDIGIT((unsigned)*f)) {
2703
150k
                if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2704
0
                    PyErr_SetString(PyExc_ValueError,
2705
0
                                    "precision too big");
2706
0
                    return NULL;
2707
0
                }
2708
150k
                precision = (precision * 10) + (*f - '0');
2709
150k
                f++;
2710
150k
            }
2711
76.8k
        }
2712
76.8k
    }
2713
2714
253k
    int sizemod = 0;
2715
253k
    if (*f == 'l') {
2716
0
        if (f[1] == 'l') {
2717
0
            sizemod = F_LONGLONG;
2718
0
            f += 2;
2719
0
        }
2720
0
        else {
2721
0
            sizemod = F_LONG;
2722
0
            ++f;
2723
0
        }
2724
0
    }
2725
253k
    else if (*f == 'z') {
2726
35.7k
        sizemod = F_SIZE;
2727
35.7k
        ++f;
2728
35.7k
    }
2729
217k
    else if (*f == 't') {
2730
0
        sizemod = F_PTRDIFF;
2731
0
        ++f;
2732
0
    }
2733
217k
    else if (*f == 'j') {
2734
0
        sizemod = F_INTMAX;
2735
0
        ++f;
2736
0
    }
2737
253k
    if (f[0] != '\0' && f[1] == '\0')
2738
33.9k
        writer->overallocate = 0;
2739
2740
253k
    switch (*f) {
2741
44.8k
    case 'd': case 'i': case 'o': case 'u': case 'x': case 'X':
2742
44.8k
        break;
2743
10.0k
    case 'c': case 'p':
2744
10.0k
        if (sizemod || width >= 0 || precision >= 0) goto invalid_format;
2745
10.0k
        break;
2746
127k
    case 's':
2747
127k
    case 'V':
2748
127k
        if (sizemod && sizemod != F_LONG) goto invalid_format;
2749
127k
        break;
2750
127k
    default:
2751
71.2k
        if (sizemod) goto invalid_format;
2752
71.2k
        break;
2753
253k
    }
2754
2755
253k
    switch (*f) {
2756
10.0k
    case 'c':
2757
10.0k
    {
2758
10.0k
        int ordinal = va_arg(*vargs, int);
2759
10.0k
        if (ordinal < 0 || ordinal > MAX_UNICODE) {
2760
0
            PyErr_SetString(PyExc_OverflowError,
2761
0
                            "character argument not in range(0x110000)");
2762
0
            return NULL;
2763
0
        }
2764
10.0k
        if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
2765
0
            return NULL;
2766
10.0k
        break;
2767
10.0k
    }
2768
2769
42.8k
    case 'd': case 'i':
2770
44.8k
    case 'o': case 'u': case 'x': case 'X':
2771
44.8k
    {
2772
44.8k
        char buffer[MAX_INTMAX_CHARS];
2773
2774
        // Fill buffer using sprinf, with one of many possible format
2775
        // strings, like "%llX" for `long long` in hexadecimal.
2776
        // The type/size is in `sizemod`; the format is in `*f`.
2777
2778
        // Use macros with nested switches to keep the sprintf format strings
2779
        // as compile-time literals, avoiding warnings and maybe allowing
2780
        // optimizations.
2781
2782
        // `SPRINT` macro does one sprintf
2783
        // Example usage: SPRINT("l", "X", unsigned long) expands to
2784
        // sprintf(buffer, "%" "l" "X", va_arg(*vargs, unsigned long))
2785
44.8k
        #define SPRINT(SIZE_SPEC, FMT_CHAR, TYPE) \
2786
44.8k
            sprintf(buffer, "%" SIZE_SPEC FMT_CHAR, va_arg(*vargs, TYPE))
2787
2788
        // One inner switch to handle all format variants
2789
44.8k
        #define DO_SPRINTS(SIZE_SPEC, SIGNED_TYPE, UNSIGNED_TYPE)             \
2790
44.8k
            switch (*f) {                                                     \
2791
0
                case 'o': len = SPRINT(SIZE_SPEC, "o", UNSIGNED_TYPE); break; \
2792
0
                case 'u': len = SPRINT(SIZE_SPEC, "u", UNSIGNED_TYPE); break; \
2793
1.18k
                case 'x': len = SPRINT(SIZE_SPEC, "x", UNSIGNED_TYPE); break; \
2794
789
                case 'X': len = SPRINT(SIZE_SPEC, "X", UNSIGNED_TYPE); break; \
2795
42.8k
                default:  len = SPRINT(SIZE_SPEC, "d", SIGNED_TYPE); break;   \
2796
44.8k
            }
2797
2798
        // Outer switch to handle all the sizes/types
2799
44.8k
        switch (sizemod) {
2800
0
            case F_LONG:     DO_SPRINTS("l", long, unsigned long); break;
2801
0
            case F_LONGLONG: DO_SPRINTS("ll", long long, unsigned long long); break;
2802
35.7k
            case F_SIZE:     DO_SPRINTS("z", Py_ssize_t, size_t); break;
2803
0
            case F_PTRDIFF:  DO_SPRINTS("t", ptrdiff_t, ptrdiff_t); break;
2804
0
            case F_INTMAX:   DO_SPRINTS("j", intmax_t, uintmax_t); break;
2805
9.10k
            default:         DO_SPRINTS("", int, unsigned int); break;
2806
44.8k
        }
2807
44.8k
        #undef SPRINT
2808
44.8k
        #undef DO_SPRINTS
2809
2810
44.8k
        assert(len >= 0);
2811
2812
44.8k
        int sign = (buffer[0] == '-');
2813
44.8k
        len -= sign;
2814
2815
44.8k
        precision = Py_MAX(precision, len);
2816
44.8k
        width = Py_MAX(width, precision + sign);
2817
44.8k
        if ((flags & F_ZERO) && !(flags & F_LJUST)) {
2818
1.63k
            precision = width - sign;
2819
1.63k
        }
2820
2821
44.8k
        Py_ssize_t spacepad = Py_MAX(width - precision - sign, 0);
2822
44.8k
        Py_ssize_t zeropad = Py_MAX(precision - len, 0);
2823
2824
44.8k
        if (_PyUnicodeWriter_Prepare(writer, width, 127) == -1)
2825
0
            return NULL;
2826
2827
44.8k
        if (spacepad && !(flags & F_LJUST)) {
2828
0
            if (PyUnicode_Fill(writer->buffer, writer->pos, spacepad, ' ') == -1)
2829
0
                return NULL;
2830
0
            writer->pos += spacepad;
2831
0
        }
2832
2833
44.8k
        if (sign) {
2834
0
            if (_PyUnicodeWriter_WriteChar(writer, '-') == -1)
2835
0
                return NULL;
2836
0
        }
2837
2838
44.8k
        if (zeropad) {
2839
585
            if (PyUnicode_Fill(writer->buffer, writer->pos, zeropad, '0') == -1)
2840
0
                return NULL;
2841
585
            writer->pos += zeropad;
2842
585
        }
2843
2844
44.8k
        if (_PyUnicodeWriter_WriteASCIIString(writer, &buffer[sign], len) < 0)
2845
0
            return NULL;
2846
2847
44.8k
        if (spacepad && (flags & F_LJUST)) {
2848
0
            if (PyUnicode_Fill(writer->buffer, writer->pos, spacepad, ' ') == -1)
2849
0
                return NULL;
2850
0
            writer->pos += spacepad;
2851
0
        }
2852
44.8k
        break;
2853
44.8k
    }
2854
2855
44.8k
    case 'p':
2856
0
    {
2857
0
        char number[MAX_INTMAX_CHARS];
2858
2859
0
        len = sprintf(number, "%p", va_arg(*vargs, void*));
2860
0
        assert(len >= 0);
2861
2862
        /* %p is ill-defined:  ensure leading 0x. */
2863
0
        if (number[1] == 'X')
2864
0
            number[1] = 'x';
2865
0
        else if (number[1] != 'x') {
2866
0
            memmove(number + 2, number,
2867
0
                    strlen(number) + 1);
2868
0
            number[0] = '0';
2869
0
            number[1] = 'x';
2870
0
            len += 2;
2871
0
        }
2872
2873
0
        if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
2874
0
            return NULL;
2875
0
        break;
2876
0
    }
2877
2878
127k
    case 's':
2879
127k
    {
2880
127k
        if (sizemod) {
2881
0
            const wchar_t *s = va_arg(*vargs, const wchar_t*);
2882
0
            if (unicode_fromformat_write_wcstr(writer, s, width, precision, flags) < 0)
2883
0
                return NULL;
2884
0
        }
2885
127k
        else {
2886
            /* UTF-8 */
2887
127k
            const char *s = va_arg(*vargs, const char*);
2888
127k
            if (unicode_fromformat_write_utf8(writer, s, width, precision, flags) < 0)
2889
0
                return NULL;
2890
127k
        }
2891
127k
        break;
2892
127k
    }
2893
2894
127k
    case 'U':
2895
64.6k
    {
2896
64.6k
        PyObject *obj = va_arg(*vargs, PyObject *);
2897
64.6k
        assert(obj && _PyUnicode_CHECK(obj));
2898
2899
64.6k
        if (unicode_fromformat_write_str(writer, obj, width, precision, flags) == -1)
2900
0
            return NULL;
2901
64.6k
        break;
2902
64.6k
    }
2903
2904
64.6k
    case 'V':
2905
338
    {
2906
338
        PyObject *obj = va_arg(*vargs, PyObject *);
2907
338
        const char *str;
2908
338
        const wchar_t *wstr;
2909
338
        if (sizemod) {
2910
0
            wstr = va_arg(*vargs, const wchar_t*);
2911
0
        }
2912
338
        else {
2913
338
            str = va_arg(*vargs, const char *);
2914
338
        }
2915
338
        if (obj) {
2916
0
            assert(_PyUnicode_CHECK(obj));
2917
0
            if (unicode_fromformat_write_str(writer, obj, width, precision, flags) == -1)
2918
0
                return NULL;
2919
0
        }
2920
338
        else if (sizemod) {
2921
0
            assert(wstr != NULL);
2922
0
            if (unicode_fromformat_write_wcstr(writer, wstr, width, precision, flags) < 0)
2923
0
                return NULL;
2924
0
        }
2925
338
        else {
2926
338
            assert(str != NULL);
2927
338
            if (unicode_fromformat_write_utf8(writer, str, width, precision, flags) < 0)
2928
0
                return NULL;
2929
338
        }
2930
338
        break;
2931
338
    }
2932
2933
338
    case 'S':
2934
67
    {
2935
67
        PyObject *obj = va_arg(*vargs, PyObject *);
2936
67
        PyObject *str;
2937
67
        assert(obj);
2938
67
        str = PyObject_Str(obj);
2939
67
        if (!str)
2940
0
            return NULL;
2941
67
        if (unicode_fromformat_write_str(writer, str, width, precision, flags) == -1) {
2942
0
            Py_DECREF(str);
2943
0
            return NULL;
2944
0
        }
2945
67
        Py_DECREF(str);
2946
67
        break;
2947
67
    }
2948
2949
6.61k
    case 'R':
2950
6.61k
    {
2951
6.61k
        PyObject *obj = va_arg(*vargs, PyObject *);
2952
6.61k
        PyObject *repr;
2953
6.61k
        assert(obj);
2954
6.61k
        repr = PyObject_Repr(obj);
2955
6.61k
        if (!repr)
2956
0
            return NULL;
2957
6.61k
        if (unicode_fromformat_write_str(writer, repr, width, precision, flags) == -1) {
2958
0
            Py_DECREF(repr);
2959
0
            return NULL;
2960
0
        }
2961
6.61k
        Py_DECREF(repr);
2962
6.61k
        break;
2963
6.61k
    }
2964
2965
0
    case 'A':
2966
0
    {
2967
0
        PyObject *obj = va_arg(*vargs, PyObject *);
2968
0
        PyObject *ascii;
2969
0
        assert(obj);
2970
0
        ascii = PyObject_ASCII(obj);
2971
0
        if (!ascii)
2972
0
            return NULL;
2973
0
        if (unicode_fromformat_write_str(writer, ascii, width, precision, flags) == -1) {
2974
0
            Py_DECREF(ascii);
2975
0
            return NULL;
2976
0
        }
2977
0
        Py_DECREF(ascii);
2978
0
        break;
2979
0
    }
2980
2981
0
    case 'T':
2982
0
    {
2983
0
        PyObject *obj = va_arg(*vargs, PyObject *);
2984
0
        PyTypeObject *type = (PyTypeObject *)Py_NewRef(Py_TYPE(obj));
2985
2986
0
        PyObject *type_name;
2987
0
        if (flags & F_ALT) {
2988
0
            type_name = _PyType_GetFullyQualifiedName(type, ':');
2989
0
        }
2990
0
        else {
2991
0
            type_name = PyType_GetFullyQualifiedName(type);
2992
0
        }
2993
0
        Py_DECREF(type);
2994
0
        if (!type_name) {
2995
0
            return NULL;
2996
0
        }
2997
2998
0
        if (unicode_fromformat_write_str(writer, type_name,
2999
0
                                         width, precision, flags) == -1) {
3000
0
            Py_DECREF(type_name);
3001
0
            return NULL;
3002
0
        }
3003
0
        Py_DECREF(type_name);
3004
0
        break;
3005
0
    }
3006
3007
0
    case 'N':
3008
0
    {
3009
0
        PyObject *type_raw = va_arg(*vargs, PyObject *);
3010
0
        assert(type_raw != NULL);
3011
3012
0
        if (!PyType_Check(type_raw)) {
3013
0
            PyErr_SetString(PyExc_TypeError, "%N argument must be a type");
3014
0
            return NULL;
3015
0
        }
3016
0
        PyTypeObject *type = (PyTypeObject*)type_raw;
3017
3018
0
        PyObject *type_name;
3019
0
        if (flags & F_ALT) {
3020
0
            type_name = _PyType_GetFullyQualifiedName(type, ':');
3021
0
        }
3022
0
        else {
3023
0
            type_name = PyType_GetFullyQualifiedName(type);
3024
0
        }
3025
0
        if (!type_name) {
3026
0
            return NULL;
3027
0
        }
3028
0
        if (unicode_fromformat_write_str(writer, type_name,
3029
0
                                         width, precision, flags) == -1) {
3030
0
            Py_DECREF(type_name);
3031
0
            return NULL;
3032
0
        }
3033
0
        Py_DECREF(type_name);
3034
0
        break;
3035
0
    }
3036
3037
0
    default:
3038
0
    invalid_format:
3039
0
        PyErr_Format(PyExc_SystemError, "invalid format string: %s", p);
3040
0
        return NULL;
3041
253k
    }
3042
3043
253k
    f++;
3044
253k
    return f;
3045
253k
}
3046
3047
static int
3048
unicode_from_format(_PyUnicodeWriter *writer, const char *format, va_list vargs)
3049
171k
{
3050
171k
    Py_ssize_t len = strlen(format);
3051
171k
    writer->min_length += len + 100;
3052
171k
    writer->overallocate = 1;
3053
3054
    // Copy varags to be able to pass a reference to a subfunction.
3055
171k
    va_list vargs2;
3056
171k
    va_copy(vargs2, vargs);
3057
3058
    // _PyUnicodeWriter_WriteASCIIString() below requires the format string
3059
    // to be encoded to ASCII.
3060
171k
    int is_ascii = (ucs1lib_find_max_char((Py_UCS1*)format, (Py_UCS1*)format + len) < 128);
3061
171k
    if (!is_ascii) {
3062
0
        Py_ssize_t i;
3063
0
        for (i=0; i < len && (unsigned char)format[i] <= 127; i++);
3064
0
        PyErr_Format(PyExc_ValueError,
3065
0
            "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3066
0
            "string, got a non-ASCII byte: 0x%02x",
3067
0
            (unsigned char)format[i]);
3068
0
        goto fail;
3069
0
    }
3070
3071
784k
    for (const char *f = format; *f; ) {
3072
613k
        if (*f == '%') {
3073
253k
            f = unicode_fromformat_arg(writer, f, &vargs2);
3074
253k
            if (f == NULL)
3075
0
                goto fail;
3076
253k
        }
3077
359k
        else {
3078
359k
            const char *p = strchr(f, '%');
3079
359k
            if (p != NULL) {
3080
222k
                len = p - f;
3081
222k
            }
3082
137k
            else {
3083
137k
                len = strlen(f);
3084
137k
                writer->overallocate = 0;
3085
137k
            }
3086
3087
359k
            if (_PyUnicodeWriter_WriteASCIIString(writer, f, len) < 0) {
3088
0
                goto fail;
3089
0
            }
3090
359k
            f += len;
3091
359k
        }
3092
613k
    }
3093
171k
    va_end(vargs2);
3094
171k
    return 0;
3095
3096
0
  fail:
3097
0
    va_end(vargs2);
3098
0
    return -1;
3099
171k
}
3100
3101
PyObject *
3102
PyUnicode_FromFormatV(const char *format, va_list vargs)
3103
171k
{
3104
171k
    _PyUnicodeWriter writer;
3105
171k
    _PyUnicodeWriter_Init(&writer);
3106
3107
171k
    if (unicode_from_format(&writer, format, vargs) < 0) {
3108
0
        _PyUnicodeWriter_Dealloc(&writer);
3109
0
        return NULL;
3110
0
    }
3111
171k
    return _PyUnicodeWriter_Finish(&writer);
3112
171k
}
3113
3114
/* Variadic front end of PyUnicode_FromFormatV().

   Return a new reference, or NULL with an exception set. */
PyObject *
PyUnicode_FromFormat(const char *format, ...)
{
    va_list vargs;

    va_start(vargs, format);
    PyObject *result = PyUnicode_FromFormatV(format, vargs);
    va_end(vargs);

    return result;
}
3125
3126
int
3127
PyUnicodeWriter_Format(PyUnicodeWriter *writer, const char *format, ...)
3128
0
{
3129
0
    va_list vargs;
3130
0
    va_start(vargs, format);
3131
0
    int res = _PyUnicodeWriter_FormatV(writer, format, vargs);
3132
0
    va_end(vargs);
3133
0
    return res;
3134
0
}
3135
3136
int
3137
_PyUnicodeWriter_FormatV(PyUnicodeWriter *writer, const char *format,
3138
                         va_list vargs)
3139
0
{
3140
0
    _PyUnicodeWriter *_writer = (_PyUnicodeWriter*)writer;
3141
0
    Py_ssize_t old_pos = _writer->pos;
3142
3143
0
    int res = unicode_from_format(_writer, format, vargs);
3144
3145
0
    if (res < 0) {
3146
0
        _writer->pos = old_pos;
3147
0
    }
3148
0
    return res;
3149
0
}
3150
3151
static Py_ssize_t
3152
unicode_get_widechar_size(PyObject *unicode)
3153
2.29k
{
3154
2.29k
    Py_ssize_t res;
3155
3156
2.29k
    assert(unicode != NULL);
3157
2.29k
    assert(_PyUnicode_CHECK(unicode));
3158
3159
2.29k
    res = _PyUnicode_LENGTH(unicode);
3160
#if SIZEOF_WCHAR_T == 2
3161
    if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3162
        const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3163
        const Py_UCS4 *end = s + res;
3164
        for (; s < end; ++s) {
3165
            if (*s > 0xFFFF) {
3166
                ++res;
3167
            }
3168
        }
3169
    }
3170
#endif
3171
0
    return res;
3172
2.29k
}
3173
3174
static void
3175
unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3176
2.29k
{
3177
2.29k
    assert(unicode != NULL);
3178
2.29k
    assert(_PyUnicode_CHECK(unicode));
3179
3180
4.59k
    if (PyUnicode_KIND(unicode) == sizeof(wchar_t)) {
3181
0
        memcpy(w, PyUnicode_DATA(unicode), size * sizeof(wchar_t));
3182
0
        return;
3183
0
    }
3184
3185
4.59k
    if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3186
2.29k
        const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3187
119k
        for (; size--; ++s, ++w) {
3188
116k
            *w = *s;
3189
116k
        }
3190
2.29k
    }
3191
0
    else {
3192
0
#if SIZEOF_WCHAR_T == 4
3193
0
        assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3194
0
        const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3195
0
        for (; size--; ++s, ++w) {
3196
0
            *w = *s;
3197
0
        }
3198
#else
3199
        assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3200
        const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3201
        for (; size--; ++s, ++w) {
3202
            Py_UCS4 ch = *s;
3203
            if (ch > 0xFFFF) {
3204
                assert(ch <= MAX_UNICODE);
3205
                /* encode surrogate pair in this case */
3206
                *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3207
                if (!size--)
3208
                    break;
3209
                *w = Py_UNICODE_LOW_SURROGATE(ch);
3210
            }
3211
            else {
3212
                *w = ch;
3213
            }
3214
        }
3215
#endif
3216
0
    }
3217
2.29k
}
3218
3219
#ifdef HAVE_WCHAR_H
3220
3221
/* Convert a Unicode object to a wide character string.
3222
3223
   - If w is NULL: return the number of wide characters (including the null
3224
     character) required to convert the unicode object. Ignore size argument.
3225
3226
   - Otherwise: return the number of wide characters (excluding the null
3227
     character) written into w. Write at most size wide characters (including
3228
     the null character). */
3229
Py_ssize_t
3230
PyUnicode_AsWideChar(PyObject *unicode,
3231
                     wchar_t *w,
3232
                     Py_ssize_t size)
3233
251
{
3234
251
    Py_ssize_t res;
3235
3236
251
    if (unicode == NULL) {
3237
0
        PyErr_BadInternalCall();
3238
0
        return -1;
3239
0
    }
3240
251
    if (!PyUnicode_Check(unicode)) {
3241
0
        PyErr_BadArgument();
3242
0
        return -1;
3243
0
    }
3244
3245
251
    res = unicode_get_widechar_size(unicode);
3246
251
    if (w == NULL) {
3247
0
        return res + 1;
3248
0
    }
3249
3250
251
    if (size > res) {
3251
251
        size = res + 1;
3252
251
    }
3253
0
    else {
3254
0
        res = size;
3255
0
    }
3256
251
    unicode_copy_as_widechar(unicode, w, size);
3257
3258
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3259
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
3260
       non-Unicode locales and hence needs conversion first. */
3261
    if (_Py_LocaleUsesNonUnicodeWchar()) {
3262
        if (_Py_EncodeNonUnicodeWchar_InPlace(w, size) < 0) {
3263
            return -1;
3264
        }
3265
    }
3266
#endif
3267
3268
251
    return res;
3269
251
}
3270
3271
wchar_t*
3272
PyUnicode_AsWideCharString(PyObject *unicode,
3273
                           Py_ssize_t *size)
3274
2.04k
{
3275
2.04k
    wchar_t *buffer;
3276
2.04k
    Py_ssize_t buflen;
3277
3278
2.04k
    if (unicode == NULL) {
3279
0
        PyErr_BadInternalCall();
3280
0
        return NULL;
3281
0
    }
3282
2.04k
    if (!PyUnicode_Check(unicode)) {
3283
0
        PyErr_BadArgument();
3284
0
        return NULL;
3285
0
    }
3286
3287
2.04k
    buflen = unicode_get_widechar_size(unicode);
3288
2.04k
    buffer = (wchar_t *) PyMem_New(wchar_t, (buflen + 1));
3289
2.04k
    if (buffer == NULL) {
3290
0
        PyErr_NoMemory();
3291
0
        return NULL;
3292
0
    }
3293
2.04k
    unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3294
3295
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3296
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
3297
       non-Unicode locales and hence needs conversion first. */
3298
    if (_Py_LocaleUsesNonUnicodeWchar()) {
3299
        if (_Py_EncodeNonUnicodeWchar_InPlace(buffer, (buflen + 1)) < 0) {
3300
            return NULL;
3301
        }
3302
    }
3303
#endif
3304
3305
2.04k
    if (size != NULL) {
3306
1.25k
        *size = buflen;
3307
1.25k
    }
3308
792
    else if (wcslen(buffer) != (size_t)buflen) {
3309
0
        PyMem_Free(buffer);
3310
0
        PyErr_SetString(PyExc_ValueError,
3311
0
                        "embedded null character");
3312
0
        return NULL;
3313
0
    }
3314
2.04k
    return buffer;
3315
2.04k
}
3316
3317
#endif /* HAVE_WCHAR_H */
3318
3319
int
3320
_PyUnicode_WideCharString_Converter(PyObject *obj, void *ptr)
3321
0
{
3322
0
    wchar_t **p = (wchar_t **)ptr;
3323
0
    if (obj == NULL) {
3324
0
        PyMem_Free(*p);
3325
0
        *p = NULL;
3326
0
        return 1;
3327
0
    }
3328
0
    if (PyUnicode_Check(obj)) {
3329
0
        *p = PyUnicode_AsWideCharString(obj, NULL);
3330
0
        if (*p == NULL) {
3331
0
            return 0;
3332
0
        }
3333
0
        return Py_CLEANUP_SUPPORTED;
3334
0
    }
3335
0
    PyErr_Format(PyExc_TypeError,
3336
0
                 "argument must be str, not %.50s",
3337
0
                 Py_TYPE(obj)->tp_name);
3338
0
    return 0;
3339
0
}
3340
3341
int
3342
_PyUnicode_WideCharString_Opt_Converter(PyObject *obj, void *ptr)
3343
0
{
3344
0
    wchar_t **p = (wchar_t **)ptr;
3345
0
    if (obj == NULL) {
3346
0
        PyMem_Free(*p);
3347
0
        *p = NULL;
3348
0
        return 1;
3349
0
    }
3350
0
    if (obj == Py_None) {
3351
0
        *p = NULL;
3352
0
        return 1;
3353
0
    }
3354
0
    if (PyUnicode_Check(obj)) {
3355
0
        *p = PyUnicode_AsWideCharString(obj, NULL);
3356
0
        if (*p == NULL) {
3357
0
            return 0;
3358
0
        }
3359
0
        return Py_CLEANUP_SUPPORTED;
3360
0
    }
3361
0
    PyErr_Format(PyExc_TypeError,
3362
0
                 "argument must be str or None, not %.50s",
3363
0
                 Py_TYPE(obj)->tp_name);
3364
0
    return 0;
3365
0
}
3366
3367
PyObject *
3368
PyUnicode_FromOrdinal(int ordinal)
3369
186k
{
3370
186k
    if (ordinal < 0 || ordinal > MAX_UNICODE) {
3371
0
        PyErr_SetString(PyExc_ValueError,
3372
0
                        "chr() arg not in range(0x110000)");
3373
0
        return NULL;
3374
0
    }
3375
3376
186k
    return unicode_char((Py_UCS4)ordinal);
3377
186k
}
3378
3379
/* Return a true str object for `obj`.

   - An exact str instance is returned as a new reference to itself.
   - An instance of a str subtype is copied into an exact str object
     holding the same data.
   - Anything else raises TypeError and yields NULL.

   XXX Perhaps this API should be an alias of PyObject_Str() instead ?! */
PyObject *
PyUnicode_FromObject(PyObject *obj)
{
    if (!PyUnicode_Check(obj)) {
        PyErr_Format(PyExc_TypeError,
                     "Can't convert '%.100s' object to str implicitly",
                     Py_TYPE(obj)->tp_name);
        return NULL;
    }

    if (PyUnicode_CheckExact(obj)) {
        return Py_NewRef(obj);
    }

    /* Subtype instance: hand back an exact str with the same contents. */
    return _PyUnicode_Copy(obj);
}
3397
3398
/* Decode a bytes-like object `obj` to a str object using `encoding` and
   `errors`.

   bytes objects take a direct path; other objects are read through the
   buffer protocol.  Empty input yields the empty string after the
   encoding and errors names have been checked.  str objects are rejected
   with TypeError.

   Return a new reference, or NULL with an exception set. */
PyObject *
PyUnicode_FromEncodedObject(PyObject *obj,
                            const char *encoding,
                            const char *errors)
{
    if (obj == NULL) {
        PyErr_BadInternalCall();
        return NULL;
    }

    /* Decoding bytes objects is the most common case and should be fast */
    if (PyBytes_Check(obj)) {
        if (PyBytes_GET_SIZE(obj) != 0) {
            return PyUnicode_Decode(
                    PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
                    encoding, errors);
        }
        if (unicode_check_encoding_errors(encoding, errors) < 0) {
            return NULL;
        }
        _Py_RETURN_UNICODE_EMPTY();
    }

    if (PyUnicode_Check(obj)) {
        PyErr_SetString(PyExc_TypeError,
                        "decoding str is not supported");
        return NULL;
    }

    /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
    Py_buffer view;
    if (PyObject_GetBuffer(obj, &view, PyBUF_SIMPLE) < 0) {
        PyErr_Format(PyExc_TypeError,
                     "decoding to str: need a bytes-like object, %.80s found",
                     Py_TYPE(obj)->tp_name);
        return NULL;
    }

    if (view.len == 0) {
        /* The view is not needed any more; release it before returning. */
        PyBuffer_Release(&view);
        if (unicode_check_encoding_errors(encoding, errors) < 0) {
            return NULL;
        }
        _Py_RETURN_UNICODE_EMPTY();
    }

    PyObject *decoded = PyUnicode_Decode((char*) view.buf, view.len,
                                         encoding, errors);
    PyBuffer_Release(&view);
    return decoded;
}
3450
3451
/* Normalize an encoding name like encodings.normalize_encoding(), with
   optional conversion to lowercase when *to_lower* is true.

   Alphanumeric characters and '.' are copied to *lower*.  Every run of
   other characters between two copied characters becomes a single '_';
   such runs at the start or end of the name are dropped.

   Return 1 on success, or 0 on error (the normalized name does not fit in
   lower_len-1 characters; *lower* is then left without a terminator). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len,
                       int to_lower)
{
    assert(encoding != NULL);

    char *out = lower;
    /* Last usable slot: reserved for the terminating null character. */
    char *const out_end = &lower[lower_len - 1];
    /* Set while a separator run is pending between two kept characters. */
    int pending_sep = 0;

    for (const char *src = encoding; *src != 0; src++) {
        const char ch = *src;

        if (!(Py_ISALNUM(ch) || ch == '.')) {
            pending_sep = 1;
            continue;
        }

        if (pending_sep && out != lower) {
            if (out == out_end) {
                return 0;
            }
            *out++ = '_';
        }
        pending_sep = 0;

        if (out == out_end) {
            return 0;
        }
        *out++ = to_lower ? Py_TOLOWER(ch) : ch;
    }

    *out = '\0';
    return 1;
}
3500
3501
PyObject *
3502
PyUnicode_Decode(const char *s,
3503
                 Py_ssize_t size,
3504
                 const char *encoding,
3505
                 const char *errors)
3506
29.6k
{
3507
29.6k
    PyObject *buffer = NULL, *unicode;
3508
29.6k
    Py_buffer info;
3509
29.6k
    char buflower[11];   /* strlen("iso-8859-1\0") == 11, longest shortcut */
3510
3511
29.6k
    if (unicode_check_encoding_errors(encoding, errors) < 0) {
3512
0
        return NULL;
3513
0
    }
3514
3515
29.6k
    if (size == 0) {
3516
1
        _Py_RETURN_UNICODE_EMPTY();
3517
1
    }
3518
3519
29.6k
    if (encoding == NULL) {
3520
0
        return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3521
0
    }
3522
3523
    /* Shortcuts for common default encodings */
3524
29.6k
    if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower), 1)) {
3525
28.8k
        char *lower = buflower;
3526
3527
        /* Fast paths */
3528
28.8k
        if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3529
12.5k
            lower += 3;
3530
12.5k
            if (*lower == '_') {
3531
                /* Match "utf8" and "utf_8" */
3532
12.5k
                lower++;
3533
12.5k
            }
3534
3535
12.5k
            if (lower[0] == '8' && lower[1] == 0) {
3536
10.8k
                return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3537
10.8k
            }
3538
1.78k
            else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3539
96
                return PyUnicode_DecodeUTF16(s, size, errors, 0);
3540
96
            }
3541
1.69k
            else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3542
23
                return PyUnicode_DecodeUTF32(s, size, errors, 0);
3543
23
            }
3544
12.5k
        }
3545
16.2k
        else {
3546
16.2k
            if (strcmp(lower, "ascii") == 0
3547
9.33k
                || strcmp(lower, "us_ascii") == 0) {
3548
6.94k
                return PyUnicode_DecodeASCII(s, size, errors);
3549
6.94k
            }
3550
    #ifdef MS_WINDOWS
3551
            else if (strcmp(lower, "mbcs") == 0) {
3552
                return PyUnicode_DecodeMBCS(s, size, errors);
3553
            }
3554
    #endif
3555
9.32k
            else if (strcmp(lower, "latin1") == 0
3556
4.64k
                     || strcmp(lower, "latin_1") == 0
3557
4.63k
                     || strcmp(lower, "iso_8859_1") == 0
3558
4.70k
                     || strcmp(lower, "iso8859_1") == 0) {
3559
4.70k
                return PyUnicode_DecodeLatin1(s, size, errors);
3560
4.70k
            }
3561
16.2k
        }
3562
28.8k
    }
3563
3564
    /* Decode via the codec registry */
3565
7.09k
    buffer = NULL;
3566
7.09k
    if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
3567
0
        goto onError;
3568
7.09k
    buffer = PyMemoryView_FromBuffer(&info);
3569
7.09k
    if (buffer == NULL)
3570
0
        goto onError;
3571
7.09k
    unicode = _PyCodec_DecodeText(buffer, encoding, errors);
3572
7.09k
    if (unicode == NULL)
3573
1.95k
        goto onError;
3574
5.14k
    if (!PyUnicode_Check(unicode)) {
3575
0
        PyErr_Format(PyExc_TypeError,
3576
0
                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
3577
0
                     "use codecs.decode() to decode to arbitrary types",
3578
0
                     encoding,
3579
0
                     Py_TYPE(unicode)->tp_name);
3580
0
        Py_DECREF(unicode);
3581
0
        goto onError;
3582
0
    }
3583
5.14k
    Py_DECREF(buffer);
3584
5.14k
    return unicode_result(unicode);
3585
3586
1.95k
  onError:
3587
1.95k
    Py_XDECREF(buffer);
3588
1.95k
    return NULL;
3589
5.14k
}
3590
3591
/* Pass the str `unicode` through the decoder of `encoding` via the codec
   registry and return whatever object the codec produces.  A NULL
   `encoding` selects PyUnicode_GetDefaultEncoding().  A non-str argument
   is rejected with PyErr_BadArgument(). */
PyAPI_FUNC(PyObject *)
PyUnicode_AsDecodedObject(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    const char *codec_name;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    codec_name = (encoding != NULL) ? encoding
                                    : PyUnicode_GetDefaultEncoding();

    /* Decode via the codec registry */
    return PyCodec_Decode(unicode, codec_name, errors);
}
3607
3608
/* Pass the str `unicode` through the decoder of `encoding` via the codec
   registry and require the result to be a str.

   A NULL `encoding` selects PyUnicode_GetDefaultEncoding().  A non-str
   argument is rejected with PyErr_BadArgument(); a decoder result that is
   not a str raises TypeError naming the type the decoder actually
   returned.  Returns NULL on any failure. */
PyAPI_FUNC(PyObject *)
PyUnicode_AsDecodedUnicode(PyObject *unicode,
                           const char *encoding,
                           const char *errors)
{
    PyObject *v;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (encoding == NULL) {
        encoding = PyUnicode_GetDefaultEncoding();
    }

    /* Decode via the codec registry */
    v = PyCodec_Decode(unicode, encoding, errors);
    if (v == NULL) {
        return NULL;
    }
    if (!PyUnicode_Check(v)) {
        /* Report the type of the decoder's result `v`.  The input
           `unicode` is known to be a str here, so naming its type would
           always produce the misleading "returned 'str' instead of
           'str'". */
        PyErr_Format(PyExc_TypeError,
                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
                     "use codecs.decode() to decode to arbitrary types",
                     encoding,
                     Py_TYPE(v)->tp_name);
        Py_DECREF(v);
        return NULL;
    }
    return unicode_result(v);
}
3641
3642
/* Pass the str `unicode` through the encoder of `encoding` via the codec
   registry and return whatever object the codec produces.  A NULL
   `encoding` selects PyUnicode_GetDefaultEncoding().  A non-str argument
   is rejected with PyErr_BadArgument(). */
PyAPI_FUNC(PyObject *)
PyUnicode_AsEncodedObject(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (encoding == NULL) {
        encoding = PyUnicode_GetDefaultEncoding();
    }

    /* Encode via the codec registry; a NULL result is passed straight
       back to the caller. */
    return PyCodec_Encode(unicode, encoding, errors);
}
3666
3667
3668
static PyObject *
3669
unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
3670
                      int current_locale)
3671
550
{
3672
550
    Py_ssize_t wlen;
3673
550
    wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3674
550
    if (wstr == NULL) {
3675
0
        return NULL;
3676
0
    }
3677
3678
550
    if ((size_t)wlen != wcslen(wstr)) {
3679
0
        PyErr_SetString(PyExc_ValueError, "embedded null character");
3680
0
        PyMem_Free(wstr);
3681
0
        return NULL;
3682
0
    }
3683
3684
550
    char *str;
3685
550
    size_t error_pos;
3686
550
    const char *reason;
3687
550
    int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
3688
550
                                 current_locale, error_handler);
3689
550
    PyMem_Free(wstr);
3690
3691
550
    if (res != 0) {
3692
0
        if (res == -2) {
3693
0
            PyObject *exc;
3694
0
            exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3695
0
                    "locale", unicode,
3696
0
                    (Py_ssize_t)error_pos,
3697
0
                    (Py_ssize_t)(error_pos+1),
3698
0
                    reason);
3699
0
            if (exc != NULL) {
3700
0
                PyCodec_StrictErrors(exc);
3701
0
                Py_DECREF(exc);
3702
0
            }
3703
0
        }
3704
0
        else if (res == -3) {
3705
0
            PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3706
0
        }
3707
0
        else {
3708
0
            PyErr_NoMemory();
3709
0
        }
3710
0
        return NULL;
3711
0
    }
3712
3713
550
    PyObject *bytes = PyBytes_FromString(str);
3714
550
    PyMem_RawFree(str);
3715
550
    return bytes;
3716
550
}
3717
3718
/* Encode `unicode` with unicode_encode_locale(), passing 1 for
   `current_locale`.  `errors` is translated to an error-handler enum by
   _Py_GetErrorHandler(). */
PyObject *
PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
{
    return unicode_encode_locale(unicode, _Py_GetErrorHandler(errors), 1);
}
3724
3725
/* Encode `unicode` with the interpreter's filesystem codec.

   Uses the UTF-8 encoder when the interpreter's fs_codec is flagged as
   UTF-8, otherwise the configured codec name (unless
   _Py_FORCE_UTF8_FS_ENCODING is defined), and otherwise falls back to
   the handler named by the configuration's filesystem_errors. */
PyObject *
PyUnicode_EncodeFSDefault(PyObject *unicode)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    struct _Py_unicode_fs_codec *codec = &interp->unicode.fs_codec;

    if (codec->utf8) {
        return unicode_encode_utf8(unicode,
                                   codec->error_handler,
                                   codec->errors);
    }
#ifndef _Py_FORCE_UTF8_FS_ENCODING
    if (codec->encoding) {
        return PyUnicode_AsEncodedString(unicode,
                                         codec->encoding,
                                         codec->errors);
    }
#endif

    /* Before _PyUnicode_InitEncodings() is called, the Python codec
       machinery is not ready and so cannot be used:
       use wcstombs() in this case. */
    const PyConfig *config = _PyInterpreterState_GetConfig(interp);
    const wchar_t *filesystem_errors = config->filesystem_errors;
    assert(filesystem_errors != NULL);
    _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
    assert(errors != _Py_ERROR_UNKNOWN);
#ifdef _Py_FORCE_UTF8_FS_ENCODING
    return unicode_encode_utf8(unicode, errors, NULL);
#else
    return unicode_encode_locale(unicode, errors, 0);
#endif
}
3758
3759
/* Encode the str `unicode` to bytes with the codec named `encoding`,
   using the error handler named `errors`.

   A NULL `encoding` encodes as UTF-8.  The UTF-8/16/32, ASCII and
   Latin-1 codecs (plus "mbcs" on Windows) are dispatched straight to
   their C encoders; any other name goes through the codec registry.  A
   registry codec that returns a bytearray triggers a RuntimeWarning and
   the result is copied into a bytes object; any other non-bytes result
   raises TypeError.  Returns NULL on failure. */
PyObject *
PyUnicode_AsEncodedString(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    PyObject *encoded;
    PyObject *converted;
    char norm[11];   /* sizeof("iso_8859_1") == 11, longest shortcut */

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (unicode_check_encoding_errors(encoding, errors) < 0) {
        return NULL;
    }
    if (encoding == NULL) {
        return _PyUnicode_AsUTF8String(unicode, errors);
    }

    /* Shortcuts for common encodings. */
    if (_Py_normalize_encoding(encoding, norm, sizeof(norm), 1)) {
        if (norm[0] == 'u' && norm[1] == 't' && norm[2] == 'f') {
            /* Accept both "utfN" and "utf_N": skip one optional '_'. */
            const char *width = (norm[3] == '_') ? norm + 4 : norm + 3;

            if (strcmp(width, "8") == 0) {
                return _PyUnicode_AsUTF8String(unicode, errors);
            }
            if (strcmp(width, "16") == 0) {
                return _PyUnicode_EncodeUTF16(unicode, errors, 0);
            }
            if (strcmp(width, "32") == 0) {
                return _PyUnicode_EncodeUTF32(unicode, errors, 0);
            }
        }
        else if (strcmp(norm, "ascii") == 0
                 || strcmp(norm, "us_ascii") == 0) {
            return _PyUnicode_AsASCIIString(unicode, errors);
        }
#ifdef MS_WINDOWS
        else if (strcmp(norm, "mbcs") == 0) {
            return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
        }
#endif
        else if (strcmp(norm, "latin1") == 0
                 || strcmp(norm, "latin_1") == 0
                 || strcmp(norm, "iso_8859_1") == 0
                 || strcmp(norm, "iso8859_1") == 0) {
            return _PyUnicode_AsLatin1String(unicode, errors);
        }
    }

    /* Encode via the codec registry */
    encoded = _PyCodec_EncodeText(unicode, encoding, errors);
    if (encoded == NULL) {
        return NULL;
    }

    /* The normal path */
    if (PyBytes_Check(encoded)) {
        return encoded;
    }

    if (!PyByteArray_Check(encoded)) {
        PyErr_Format(PyExc_TypeError,
                     "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
                     "use codecs.encode() to encode to arbitrary types",
                     encoding,
                     Py_TYPE(encoded)->tp_name);
        Py_DECREF(encoded);
        return NULL;
    }

    /* The codec returned a bytearray: raise a warning and convert to
       bytes.  A non-zero result from the warning call aborts. */
    if (PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
            "encoder %s returned bytearray instead of bytes; "
            "use codecs.encode() to encode to arbitrary types",
            encoding)) {
        Py_DECREF(encoded);
        return NULL;
    }

    converted = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(encoded),
                                          PyByteArray_GET_SIZE(encoded));
    Py_DECREF(encoded);
    return converted;
}
3858
3859
/* Pass the str `unicode` through the encoder of `encoding` via the codec
   registry and require the result to be a str.

   A NULL `encoding` selects PyUnicode_GetDefaultEncoding().  A non-str
   argument is rejected with PyErr_BadArgument(); an encoder result that
   is not a str raises TypeError.  Returns NULL on any failure. */
PyAPI_FUNC(PyObject *)
PyUnicode_AsEncodedUnicode(PyObject *unicode,
                           const char *encoding,
                           const char *errors)
{
    PyObject *encoded;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (encoding == NULL) {
        encoding = PyUnicode_GetDefaultEncoding();
    }

    /* Encode via the codec registry */
    encoded = PyCodec_Encode(unicode, encoding, errors);
    if (encoded == NULL) {
        return NULL;
    }
    if (PyUnicode_Check(encoded)) {
        return encoded;
    }

    PyErr_Format(PyExc_TypeError,
                 "'%.400s' encoder returned '%.400s' instead of 'str'; "
                 "use codecs.encode() to encode to arbitrary types",
                 encoding,
                 Py_TYPE(encoded)->tp_name);
    Py_DECREF(encoded);
    return NULL;
}
3892
3893
static PyObject*
3894
unicode_decode_locale(const char *str, Py_ssize_t len,
3895
                      _Py_error_handler errors, int current_locale)
3896
11.4k
{
3897
11.4k
    if (str[len] != '\0' || (size_t)len != strlen(str))  {
3898
0
        PyErr_SetString(PyExc_ValueError, "embedded null byte");
3899
0
        return NULL;
3900
0
    }
3901
3902
11.4k
    wchar_t *wstr;
3903
11.4k
    size_t wlen;
3904
11.4k
    const char *reason;
3905
11.4k
    int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
3906
11.4k
                                 current_locale, errors);
3907
11.4k
    if (res != 0) {
3908
0
        if (res == -2) {
3909
0
            PyObject *exc;
3910
0
            exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3911
0
                                        "locale", str, len,
3912
0
                                        (Py_ssize_t)wlen,
3913
0
                                        (Py_ssize_t)(wlen + 1),
3914
0
                                        reason);
3915
0
            if (exc != NULL) {
3916
0
                PyCodec_StrictErrors(exc);
3917
0
                Py_DECREF(exc);
3918
0
            }
3919
0
        }
3920
0
        else if (res == -3) {
3921
0
            PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3922
0
        }
3923
0
        else {
3924
0
            PyErr_NoMemory();
3925
0
        }
3926
0
        return NULL;
3927
0
    }
3928
3929
11.4k
    PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3930
11.4k
    PyMem_RawFree(wstr);
3931
11.4k
    return unicode;
3932
11.4k
}
3933
3934
PyObject*
3935
PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3936
                              const char *errors)
3937
0
{
3938
0
    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3939
0
    return unicode_decode_locale(str, len, error_handler, 1);
3940
0
}
3941
3942
PyObject*
3943
PyUnicode_DecodeLocale(const char *str, const char *errors)
3944
4.41k
{
3945
4.41k
    Py_ssize_t size = (Py_ssize_t)strlen(str);
3946
4.41k
    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3947
4.41k
    return unicode_decode_locale(str, size, error_handler, 1);
3948
4.41k
}
3949
3950
3951
PyObject*
3952
15.1k
PyUnicode_DecodeFSDefault(const char *s) {
3953
15.1k
    Py_ssize_t size = (Py_ssize_t)strlen(s);
3954
15.1k
    return PyUnicode_DecodeFSDefaultAndSize(s, size);
3955
15.1k
}
3956
3957
PyObject*
3958
PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3959
24.0k
{
3960
24.0k
    PyInterpreterState *interp = _PyInterpreterState_GET();
3961
24.0k
    struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3962
24.0k
    if (fs_codec->utf8) {
3963
16.9k
        return unicode_decode_utf8(s, size,
3964
16.9k
                                   fs_codec->error_handler,
3965
16.9k
                                   fs_codec->errors,
3966
16.9k
                                   NULL);
3967
16.9k
    }
3968
7.06k
#ifndef _Py_FORCE_UTF8_FS_ENCODING
3969
7.06k
    else if (fs_codec->encoding) {
3970
0
        return PyUnicode_Decode(s, size,
3971
0
                                fs_codec->encoding,
3972
0
                                fs_codec->errors);
3973
0
    }
3974
7.06k
#endif
3975
7.06k
    else {
3976
        /* Before _PyUnicode_InitEncodings() is called, the Python codec
3977
           machinery is not ready and so cannot be used:
3978
           use mbstowcs() in this case. */
3979
7.06k
        const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3980
7.06k
        const wchar_t *filesystem_errors = config->filesystem_errors;
3981
7.06k
        assert(filesystem_errors != NULL);
3982
7.06k
        _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3983
7.06k
        assert(errors != _Py_ERROR_UNKNOWN);
3984
#ifdef _Py_FORCE_UTF8_FS_ENCODING
3985
        return unicode_decode_utf8(s, size, errors, NULL, NULL);
3986
#else
3987
7.06k
        return unicode_decode_locale(s, size, errors, 0);
3988
7.06k
#endif
3989
7.06k
    }
3990
24.0k
}
3991
3992
3993
int
3994
PyUnicode_FSConverter(PyObject* arg, void* addr)
3995
3.87k
{
3996
3.87k
    PyObject *path = NULL;
3997
3.87k
    PyObject *output = NULL;
3998
3.87k
    Py_ssize_t size;
3999
3.87k
    const char *data;
4000
3.87k
    if (arg == NULL) {
4001
0
        Py_DECREF(*(PyObject**)addr);
4002
0
        *(PyObject**)addr = NULL;
4003
0
        return 1;
4004
0
    }
4005
3.87k
    path = PyOS_FSPath(arg);
4006
3.87k
    if (path == NULL) {
4007
0
        return 0;
4008
0
    }
4009
3.87k
    if (PyBytes_Check(path)) {
4010
0
        output = path;
4011
0
    }
4012
3.87k
    else {  // PyOS_FSPath() guarantees its returned value is bytes or str.
4013
3.87k
        output = PyUnicode_EncodeFSDefault(path);
4014
3.87k
        Py_DECREF(path);
4015
3.87k
        if (!output) {
4016
0
            return 0;
4017
0
        }
4018
3.87k
        assert(PyBytes_Check(output));
4019
3.87k
    }
4020
4021
3.87k
    size = PyBytes_GET_SIZE(output);
4022
3.87k
    data = PyBytes_AS_STRING(output);
4023
3.87k
    if ((size_t)size != strlen(data)) {
4024
0
        PyErr_SetString(PyExc_ValueError, "embedded null byte");
4025
0
        Py_DECREF(output);
4026
0
        return 0;
4027
0
    }
4028
3.87k
    *(PyObject**)addr = output;
4029
3.87k
    return Py_CLEANUP_SUPPORTED;
4030
3.87k
}
4031
4032
4033
int
4034
PyUnicode_FSDecoder(PyObject* arg, void* addr)
4035
20
{
4036
20
    if (arg == NULL) {
4037
0
        Py_DECREF(*(PyObject**)addr);
4038
0
        *(PyObject**)addr = NULL;
4039
0
        return 1;
4040
0
    }
4041
4042
20
    PyObject *path = PyOS_FSPath(arg);
4043
20
    if (path == NULL) {
4044
0
        return 0;
4045
0
    }
4046
4047
20
    PyObject *output = NULL;
4048
20
    if (PyUnicode_Check(path)) {
4049
20
        output = path;
4050
20
    }
4051
0
    else if (PyBytes_Check(path)) {
4052
0
        output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path),
4053
0
                                                  PyBytes_GET_SIZE(path));
4054
0
        Py_DECREF(path);
4055
0
        if (!output) {
4056
0
            return 0;
4057
0
        }
4058
0
    }
4059
0
    else {
4060
0
        PyErr_Format(PyExc_TypeError,
4061
0
                     "path should be string, bytes, or os.PathLike, not %.200s",
4062
0
                     Py_TYPE(arg)->tp_name);
4063
0
        Py_DECREF(path);
4064
0
        return 0;
4065
0
    }
4066
4067
20
    if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
4068
20
                 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
4069
0
        PyErr_SetString(PyExc_ValueError, "embedded null character");
4070
0
        Py_DECREF(output);
4071
0
        return 0;
4072
0
    }
4073
20
    *(PyObject**)addr = output;
4074
20
    return Py_CLEANUP_SUPPORTED;
4075
20
}
4076
4077
4078
static int unicode_fill_utf8(PyObject *unicode);
4079
4080
4081
static int
4082
unicode_ensure_utf8(PyObject *unicode)
4083
412k
{
4084
412k
    int err = 0;
4085
412k
    if (PyUnicode_UTF8(unicode) == NULL) {
4086
2.75k
        Py_BEGIN_CRITICAL_SECTION(unicode);
4087
2.75k
        if (PyUnicode_UTF8(unicode) == NULL) {
4088
2.75k
            err = unicode_fill_utf8(unicode);
4089
2.75k
        }
4090
2.75k
        Py_END_CRITICAL_SECTION();
4091
2.75k
    }
4092
412k
    return err;
4093
412k
}
4094
4095
const char *
4096
PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
4097
412k
{
4098
412k
    if (!PyUnicode_Check(unicode)) {
4099
0
        PyErr_BadArgument();
4100
0
        if (psize) {
4101
0
            *psize = -1;
4102
0
        }
4103
0
        return NULL;
4104
0
    }
4105
4106
412k
    if (unicode_ensure_utf8(unicode) == -1) {
4107
0
        if (psize) {
4108
0
            *psize = -1;
4109
0
        }
4110
0
        return NULL;
4111
0
    }
4112
4113
412k
    if (psize) {
4114
236k
        *psize = PyUnicode_UTF8_LENGTH(unicode);
4115
236k
    }
4116
412k
    return PyUnicode_UTF8(unicode);
4117
412k
}
4118
4119
const char *
4120
PyUnicode_AsUTF8(PyObject *unicode)
4121
176k
{
4122
176k
    return PyUnicode_AsUTF8AndSize(unicode, NULL);
4123
176k
}
4124
4125
const char *
4126
_PyUnicode_AsUTF8NoNUL(PyObject *unicode)
4127
15.7k
{
4128
15.7k
    Py_ssize_t size;
4129
15.7k
    const char *s = PyUnicode_AsUTF8AndSize(unicode, &size);
4130
15.7k
    if (s && strlen(s) != (size_t)size) {
4131
0
        PyErr_SetString(PyExc_ValueError, "embedded null character");
4132
0
        return NULL;
4133
0
    }
4134
15.7k
    return s;
4135
15.7k
}
4136
4137
/*
4138
PyUnicode_GetSize() has been deprecated since Python 3.3
4139
because it returned length of Py_UNICODE.
4140
4141
But this function is part of stable abi, because it doesn't
4142
include Py_UNICODE in signature and it was not excluded from
4143
stable ABI in PEP 384.
4144
*/
4145
PyAPI_FUNC(Py_ssize_t)
4146
PyUnicode_GetSize(PyObject *unicode)
4147
0
{
4148
0
    PyErr_SetString(PyExc_RuntimeError,
4149
0
                    "PyUnicode_GetSize has been removed.");
4150
0
    return -1;
4151
0
}
4152
4153
Py_ssize_t
4154
PyUnicode_GetLength(PyObject *unicode)
4155
678
{
4156
678
    if (!PyUnicode_Check(unicode)) {
4157
0
        PyErr_BadArgument();
4158
0
        return -1;
4159
0
    }
4160
678
    return PyUnicode_GET_LENGTH(unicode);
4161
678
}
4162
4163
Py_UCS4
4164
PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4165
7
{
4166
7
    const void *data;
4167
7
    int kind;
4168
4169
7
    if (!PyUnicode_Check(unicode)) {
4170
0
        PyErr_BadArgument();
4171
0
        return (Py_UCS4)-1;
4172
0
    }
4173
7
    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4174
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
4175
0
        return (Py_UCS4)-1;
4176
0
    }
4177
7
    data = PyUnicode_DATA(unicode);
4178
7
    kind = PyUnicode_KIND(unicode);
4179
7
    return PyUnicode_READ(kind, data, index);
4180
7
}
4181
4182
int
4183
PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4184
0
{
4185
0
    if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
4186
0
        PyErr_BadArgument();
4187
0
        return -1;
4188
0
    }
4189
0
    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4190
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
4191
0
        return -1;
4192
0
    }
4193
0
    if (unicode_check_modifiable(unicode))
4194
0
        return -1;
4195
0
    if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4196
0
        PyErr_SetString(PyExc_ValueError, "character out of range");
4197
0
        return -1;
4198
0
    }
4199
0
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4200
0
                    index, ch);
4201
0
    return 0;
4202
0
}
4203
4204
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4209
4210
/* create or adjust a UnicodeDecodeError */
4211
static void
4212
make_decode_exception(PyObject **exceptionObject,
4213
                      const char *encoding,
4214
                      const char *input, Py_ssize_t length,
4215
                      Py_ssize_t startpos, Py_ssize_t endpos,
4216
                      const char *reason)
4217
144k
{
4218
144k
    if (*exceptionObject == NULL) {
4219
5.67k
        *exceptionObject = PyUnicodeDecodeError_Create(
4220
5.67k
            encoding, input, length, startpos, endpos, reason);
4221
5.67k
    }
4222
138k
    else {
4223
138k
        if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4224
0
            goto onError;
4225
138k
        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4226
0
            goto onError;
4227
138k
        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4228
0
            goto onError;
4229
138k
    }
4230
144k
    return;
4231
4232
144k
onError:
4233
0
    Py_CLEAR(*exceptionObject);
4234
0
}
4235
4236
#ifdef MS_WINDOWS
4237
static int
4238
widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4239
{
4240
    if (newsize > *size) {
4241
        wchar_t *newbuf = *buf;
4242
        if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4243
            PyErr_NoMemory();
4244
            return -1;
4245
        }
4246
        *buf = newbuf;
4247
    }
4248
    *size = newsize;
4249
    return 0;
4250
}
4251
4252
/* error handling callback helper:
4253
   build arguments, call the callback and check the arguments,
4254
   if no exception occurred, copy the replacement to the output
4255
   and adjust various state variables.
4256
   return 0 on success, -1 on error
4257
*/
4258
4259
static int
4260
unicode_decode_call_errorhandler_wchar(
4261
    const char *errors, PyObject **errorHandler,
4262
    const char *encoding, const char *reason,
4263
    const char **input, const char **inend, Py_ssize_t *startinpos,
4264
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4265
    wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
4266
{
4267
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4268
4269
    PyObject *restuple = NULL;
4270
    PyObject *repunicode = NULL;
4271
    Py_ssize_t outsize;
4272
    Py_ssize_t insize;
4273
    Py_ssize_t requiredsize;
4274
    Py_ssize_t newpos;
4275
    PyObject *inputobj = NULL;
4276
    Py_ssize_t repwlen;
4277
4278
    if (*errorHandler == NULL) {
4279
        *errorHandler = PyCodec_LookupError(errors);
4280
        if (*errorHandler == NULL)
4281
            goto onError;
4282
    }
4283
4284
    make_decode_exception(exceptionObject,
4285
        encoding,
4286
        *input, *inend - *input,
4287
        *startinpos, *endinpos,
4288
        reason);
4289
    if (*exceptionObject == NULL)
4290
        goto onError;
4291
4292
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
4293
    if (restuple == NULL)
4294
        goto onError;
4295
    if (!PyTuple_Check(restuple)) {
4296
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
4297
        goto onError;
4298
    }
4299
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4300
        goto onError;
4301
4302
    /* Copy back the bytes variables, which might have been modified by the
4303
       callback */
4304
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4305
    if (!inputobj)
4306
        goto onError;
4307
    *input = PyBytes_AS_STRING(inputobj);
4308
    insize = PyBytes_GET_SIZE(inputobj);
4309
    *inend = *input + insize;
4310
    /* we can DECREF safely, as the exception has another reference,
4311
       so the object won't go away. */
4312
    Py_DECREF(inputobj);
4313
4314
    if (newpos<0)
4315
        newpos = insize+newpos;
4316
    if (newpos<0 || newpos>insize) {
4317
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4318
        goto onError;
4319
    }
4320
4321
    repwlen = PyUnicode_AsWideChar(repunicode, NULL, 0);
4322
    if (repwlen < 0)
4323
        goto onError;
4324
    repwlen--;
4325
    /* need more space? (at least enough for what we
4326
       have+the replacement+the rest of the string (starting
4327
       at the new input position), so we won't have to check space
4328
       when there are no errors in the rest of the string) */
4329
    requiredsize = *outpos;
4330
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4331
        goto overflow;
4332
    requiredsize += repwlen;
4333
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4334
        goto overflow;
4335
    requiredsize += insize - newpos;
4336
    outsize = *bufsize;
4337
    if (requiredsize > outsize) {
4338
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
4339
            requiredsize = 2*outsize;
4340
        if (widechar_resize(buf, bufsize, requiredsize) < 0) {
4341
            goto onError;
4342
        }
4343
    }
4344
    PyUnicode_AsWideChar(repunicode, *buf + *outpos, repwlen);
4345
    *outpos += repwlen;
4346
    *endinpos = newpos;
4347
    *inptr = *input + newpos;
4348
4349
    /* we made it! */
4350
    Py_DECREF(restuple);
4351
    return 0;
4352
4353
  overflow:
4354
    PyErr_SetString(PyExc_OverflowError,
4355
                    "decoded result is too long for a Python string");
4356
4357
  onError:
4358
    Py_XDECREF(restuple);
4359
    return -1;
4360
}
4361
#endif   /* MS_WINDOWS */
4362
4363
static int
4364
unicode_decode_call_errorhandler_writer(
4365
    const char *errors, PyObject **errorHandler,
4366
    const char *encoding, const char *reason,
4367
    const char **input, const char **inend, Py_ssize_t *startinpos,
4368
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4369
    _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4370
144k
{
4371
144k
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4372
4373
144k
    PyObject *restuple = NULL;
4374
144k
    PyObject *repunicode = NULL;
4375
144k
    Py_ssize_t insize;
4376
144k
    Py_ssize_t newpos;
4377
144k
    Py_ssize_t replen;
4378
144k
    Py_ssize_t remain;
4379
144k
    PyObject *inputobj = NULL;
4380
144k
    int need_to_grow = 0;
4381
144k
    const char *new_inptr;
4382
4383
144k
    if (*errorHandler == NULL) {
4384
5.67k
        *errorHandler = PyCodec_LookupError(errors);
4385
5.67k
        if (*errorHandler == NULL)
4386
0
            goto onError;
4387
5.67k
    }
4388
4389
144k
    make_decode_exception(exceptionObject,
4390
144k
        encoding,
4391
144k
        *input, *inend - *input,
4392
144k
        *startinpos, *endinpos,
4393
144k
        reason);
4394
144k
    if (*exceptionObject == NULL)
4395
0
        goto onError;
4396
4397
144k
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
4398
144k
    if (restuple == NULL)
4399
4.81k
        goto onError;
4400
139k
    if (!PyTuple_Check(restuple)) {
4401
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
4402
0
        goto onError;
4403
0
    }
4404
139k
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4405
0
        goto onError;
4406
4407
    /* Copy back the bytes variables, which might have been modified by the
4408
       callback */
4409
139k
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4410
139k
    if (!inputobj)
4411
0
        goto onError;
4412
139k
    remain = *inend - *input - *endinpos;
4413
139k
    *input = PyBytes_AS_STRING(inputobj);
4414
139k
    insize = PyBytes_GET_SIZE(inputobj);
4415
139k
    *inend = *input + insize;
4416
    /* we can DECREF safely, as the exception has another reference,
4417
       so the object won't go away. */
4418
139k
    Py_DECREF(inputobj);
4419
4420
139k
    if (newpos<0)
4421
0
        newpos = insize+newpos;
4422
139k
    if (newpos<0 || newpos>insize) {
4423
0
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4424
0
        goto onError;
4425
0
    }
4426
4427
139k
    replen = PyUnicode_GET_LENGTH(repunicode);
4428
139k
    if (replen > 1) {
4429
0
        writer->min_length += replen - 1;
4430
0
        need_to_grow = 1;
4431
0
    }
4432
139k
    new_inptr = *input + newpos;
4433
139k
    if (*inend - new_inptr > remain) {
4434
        /* We don't know the decoding algorithm here so we make the worst
4435
           assumption that one byte decodes to one unicode character.
4436
           If unfortunately one byte could decode to more unicode characters,
4437
           the decoder may write out-of-bound then.  Is it possible for the
4438
           algorithms using this function? */
4439
67
        writer->min_length += *inend - new_inptr - remain;
4440
67
        need_to_grow = 1;
4441
67
    }
4442
139k
    if (need_to_grow) {
4443
67
        writer->overallocate = 1;
4444
67
        if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
4445
67
                            PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4446
0
            goto onError;
4447
67
    }
4448
139k
    if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
4449
0
        goto onError;
4450
4451
139k
    *endinpos = newpos;
4452
139k
    *inptr = new_inptr;
4453
4454
    /* we made it! */
4455
139k
    Py_DECREF(restuple);
4456
139k
    return 0;
4457
4458
4.81k
  onError:
4459
4.81k
    Py_XDECREF(restuple);
4460
4.81k
    return -1;
4461
139k
}
4462
4463
/* --- UTF-7 Codec -------------------------------------------------------- */
4464
4465
/* See RFC2152 for details.  We encode conservatively and decode liberally. */
4466
4467
/* Three simple macros defining base-64. */
4468
4469
/* Non-zero when c belongs to the base-64 alphabet: A-Z, a-z, 0-9, '+'
   or '/'.  The argument is expanded several times, so it must be free of
   side effects. */

#define IS_BASE64(c)                    \
    (   ((c) >= 'A' && (c) <= 'Z')      \
     || ((c) >= 'a' && (c) <= 'z')      \
     || ((c) >= '0' && (c) <= '9')      \
     || (c) == '+'                      \
     || (c) == '/')
4476
4477
/* Map a base-64 character to its 6-bit value (0..63).  The caller must
   already know that c is a base-64 character: anything that is not a
   letter, a digit or '+' is reported as 63, the value of '/'. */

#define FROM_BASE64(c)                                  \
    (  ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A'           \
     : ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26      \
     : ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52      \
     : (c) == '+'                 ? 62                  \
     :                              63)
4484
4485
/* Base-64 character for the low 6 bits of n; higher bits are ignored. */

#define TO_BASE64(n)                                                        \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"     \
        [(n) & 0x3f])
4489
4490
/* DECODE_DIRECT: true when a byte found in UTF-7 input stands for itself.
 * Decoding is permissive: every ASCII byte (value <= 127) is accepted
 * as-is except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) <= 127)
4497
4498
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array, indexed by ASCII code (0..127),
 * identifies these different sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is a pure lookup table, so it is declared const: accidental
 * writes become compile errors and the data can live in read-only storage.
 */

static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4531
4532
/* ENCODE_DIRECT: true when the encoder may emit the character unchanged.
 * RFC 2152 leaves it to the application whether Set O and whitespace are
 * written directly; as defined here, every non-NUL ASCII character whose
 * utf7_category is not 3 ("special") is written directly.  The range
 * tests come first so the table is never indexed out of bounds. */

#define ENCODE_DIRECT(c) \
    ((c) > 0 && (c) < 128 && utf7_category[(c)] != 3)
4540
4541
PyObject *
4542
PyUnicode_DecodeUTF7(const char *s,
4543
                     Py_ssize_t size,
4544
                     const char *errors)
4545
0
{
4546
0
    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4547
0
}
4548
4549
/* The decoder.  The only state we preserve is our read position,
4550
 * i.e. how many characters we have consumed.  So if we end in the
4551
 * middle of a shift sequence we have to back off the read position
4552
 * and the output to the beginning of the sequence, otherwise we lose
4553
 * all the shift state (seen bits, number of bits seen, high
4554
 * surrogate). */
4555
4556
PyObject *
4557
PyUnicode_DecodeUTF7Stateful(const char *s,
4558
                             Py_ssize_t size,
4559
                             const char *errors,
4560
                             Py_ssize_t *consumed)
4561
71
{
4562
71
    const char *starts = s;
4563
71
    Py_ssize_t startinpos;
4564
71
    Py_ssize_t endinpos;
4565
71
    const char *e;
4566
71
    _PyUnicodeWriter writer;
4567
71
    const char *errmsg = "";
4568
71
    int inShift = 0;
4569
71
    Py_ssize_t shiftOutStart;
4570
71
    unsigned int base64bits = 0;
4571
71
    unsigned long base64buffer = 0;
4572
71
    Py_UCS4 surrogate = 0;
4573
71
    PyObject *errorHandler = NULL;
4574
71
    PyObject *exc = NULL;
4575
4576
71
    if (size == 0) {
4577
0
        if (consumed)
4578
0
            *consumed = 0;
4579
0
        _Py_RETURN_UNICODE_EMPTY();
4580
0
    }
4581
4582
    /* Start off assuming it's all ASCII. Widen later as necessary. */
4583
71
    _PyUnicodeWriter_Init(&writer);
4584
71
    writer.min_length = size;
4585
4586
71
    shiftOutStart = 0;
4587
71
    e = s + size;
4588
4589
15.4k
    while (s < e) {
4590
15.4k
        Py_UCS4 ch;
4591
15.4k
      restart:
4592
15.4k
        ch = (unsigned char) *s;
4593
4594
15.4k
        if (inShift) { /* in a base-64 section */
4595
1.65k
            if (IS_BASE64(ch)) { /* consume a base-64 character */
4596
1.53k
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4597
1.53k
                base64bits += 6;
4598
1.53k
                s++;
4599
1.53k
                if (base64bits >= 16) {
4600
                    /* we have enough bits for a UTF-16 value */
4601
555
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
4602
555
                    base64bits -= 16;
4603
555
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4604
555
                    assert(outCh <= 0xffff);
4605
555
                    if (surrogate) {
4606
                        /* expecting a second surrogate */
4607
50
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4608
23
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
4609
23
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
4610
0
                                goto onError;
4611
23
                            surrogate = 0;
4612
23
                            continue;
4613
23
                        }
4614
27
                        else {
4615
27
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4616
0
                                goto onError;
4617
27
                            surrogate = 0;
4618
27
                        }
4619
50
                    }
4620
532
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
4621
                        /* first surrogate */
4622
85
                        surrogate = outCh;
4623
85
                    }
4624
447
                    else {
4625
447
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
4626
0
                            goto onError;
4627
447
                    }
4628
532
                }
4629
1.53k
            }
4630
126
            else { /* now leaving a base-64 section */
4631
126
                inShift = 0;
4632
126
                if (base64bits > 0) { /* left-over bits */
4633
82
                    if (base64bits >= 6) {
4634
                        /* We've seen at least one base-64 character */
4635
17
                        s++;
4636
17
                        errmsg = "partial character in shift sequence";
4637
17
                        goto utf7Error;
4638
17
                    }
4639
65
                    else {
4640
                        /* Some bits remain; they should be zero */
4641
65
                        if (base64buffer != 0) {
4642
8
                            s++;
4643
8
                            errmsg = "non-zero padding bits in shift sequence";
4644
8
                            goto utf7Error;
4645
8
                        }
4646
65
                    }
4647
82
                }
4648
101
                if (surrogate && DECODE_DIRECT(ch)) {
4649
32
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4650
0
                        goto onError;
4651
32
                }
4652
101
                surrogate = 0;
4653
101
                if (ch == '-') {
4654
                    /* '-' is absorbed; other terminating
4655
                       characters are preserved */
4656
9
                    s++;
4657
9
                }
4658
101
            }
4659
1.65k
        }
4660
13.7k
        else if ( ch == '+' ) {
4661
492
            startinpos = s-starts;
4662
492
            s++; /* consume '+' */
4663
492
            if (s < e && *s == '-') { /* '+-' encodes '+' */
4664
354
                s++;
4665
354
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
4666
0
                    goto onError;
4667
354
            }
4668
138
            else if (s < e && !IS_BASE64(*s)) {
4669
12
                s++;
4670
12
                errmsg = "ill-formed sequence";
4671
12
                goto utf7Error;
4672
12
            }
4673
126
            else { /* begin base64-encoded section */
4674
126
                inShift = 1;
4675
126
                surrogate = 0;
4676
126
                shiftOutStart = writer.pos;
4677
126
                base64bits = 0;
4678
126
                base64buffer = 0;
4679
126
            }
4680
492
        }
4681
13.2k
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
4682
12.3k
            s++;
4683
12.3k
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
4684
0
                goto onError;
4685
12.3k
        }
4686
910
        else {
4687
910
            startinpos = s-starts;
4688
910
            s++;
4689
910
            errmsg = "unexpected special character";
4690
910
            goto utf7Error;
4691
910
        }
4692
14.4k
        continue;
4693
14.4k
utf7Error:
4694
947
        endinpos = s-starts;
4695
947
        if (unicode_decode_call_errorhandler_writer(
4696
947
                errors, &errorHandler,
4697
947
                "utf7", errmsg,
4698
947
                &starts, &e, &startinpos, &endinpos, &exc, &s,
4699
947
                &writer))
4700
44
            goto onError;
4701
947
    }
4702
4703
    /* end of string */
4704
4705
27
    if (inShift && !consumed) { /* in shift sequence, no more to follow */
4706
        /* if we're in an inconsistent state, that's an error */
4707
0
        inShift = 0;
4708
0
        if (surrogate ||
4709
0
                (base64bits >= 6) ||
4710
0
                (base64bits > 0 && base64buffer != 0)) {
4711
0
            endinpos = size;
4712
0
            if (unicode_decode_call_errorhandler_writer(
4713
0
                    errors, &errorHandler,
4714
0
                    "utf7", "unterminated shift sequence",
4715
0
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
4716
0
                    &writer))
4717
0
                goto onError;
4718
0
            if (s < e)
4719
0
                goto restart;
4720
0
        }
4721
0
    }
4722
4723
    /* return state */
4724
27
    if (consumed) {
4725
0
        if (inShift) {
4726
0
            *consumed = startinpos;
4727
0
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
4728
0
                PyObject *result = PyUnicode_FromKindAndData(
4729
0
                        writer.kind, writer.data, shiftOutStart);
4730
0
                Py_XDECREF(errorHandler);
4731
0
                Py_XDECREF(exc);
4732
0
                _PyUnicodeWriter_Dealloc(&writer);
4733
0
                return result;
4734
0
            }
4735
0
            writer.pos = shiftOutStart; /* back off output */
4736
0
        }
4737
0
        else {
4738
0
            *consumed = s-starts;
4739
0
        }
4740
0
    }
4741
4742
27
    Py_XDECREF(errorHandler);
4743
27
    Py_XDECREF(exc);
4744
27
    return _PyUnicodeWriter_Finish(&writer);
4745
4746
44
  onError:
4747
44
    Py_XDECREF(errorHandler);
4748
44
    Py_XDECREF(exc);
4749
44
    _PyUnicodeWriter_Dealloc(&writer);
4750
44
    return NULL;
4751
27
}
4752
4753
4754
/* Encode the str object 'str' as UTF-7 and return a new bytes object
 * (the shared empty bytes constant for an empty string), or NULL with an
 * exception set.  The 'errors' argument is not used by this function.
 *
 * Characters accepted by ENCODE_DIRECT are copied through, '+' is written
 * as "+-", and everything else is emitted as base-64 encoded UTF-16 code
 * units inside a "+...-" shift sequence.
 */
PyObject *
_PyUnicode_EncodeUTF7(PyObject *str,
                      const char *errors)
{
    const Py_ssize_t len = PyUnicode_GET_LENGTH(str);
    if (len == 0) {
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
    }
    const int kind = PyUnicode_KIND(str);
    const void *data = PyUnicode_DATA(str);

    /* The output buffer is sized at 8 bytes per input character (it might
       be possible to tighten this worst case); refuse lengths for which
       that product would overflow. */
    if (len > PY_SSIZE_T_MAX / 8) {
        return PyErr_NoMemory();
    }
    PyBytesWriter *writer = PyBytesWriter_Create(len * 8);
    if (writer == NULL) {
        return NULL;
    }

    int inShift = 0;                    /* non-zero inside a "+..." section */
    unsigned int base64bits = 0;        /* bits waiting in base64buffer */
    unsigned long base64buffer = 0;
    char *out = PyBytesWriter_GetData(writer);

    for (Py_ssize_t i = 0; i < len; ++i) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);

        if (inShift) {
            if (ENCODE_DIRECT(ch)) {
                /* Shift out: flush the pending bits, zero padded. */
                if (base64bits) {
                    *out++ = TO_BASE64(base64buffer << (6-base64bits));
                    base64buffer = 0;
                    base64bits = 0;
                }
                inShift = 0;
                /* Characters not in the BASE64 set implicitly unshift the
                   sequence so no '-' is required, except if the character
                   is itself a '-' */
                if (IS_BASE64(ch) || ch == '-') {
                    *out++ = '-';
                }
                *out++ = (char) ch;
                continue;
            }
            /* otherwise stay in the shift sequence and encode ch below */
        }
        else {
            if (ch == '+') {
                *out++ = '+';
                *out++ = '-';
                continue;
            }
            if (ENCODE_DIRECT(ch)) {
                *out++ = (char) ch;
                continue;
            }
            /* Open a shift sequence and encode ch below. */
            *out++ = '+';
            inShift = 1;
        }

        /* Base-64 encode ch as one or two UTF-16 code units. */
        if (ch >= 0x10000) {
            assert(ch <= MAX_UNICODE);

            /* emit the high surrogate first */
            base64bits += 16;
            base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
            while (base64bits >= 6) {
                *out++ = TO_BASE64(base64buffer >> (base64bits-6));
                base64bits -= 6;
            }
            /* the low surrogate goes through the common path */
            ch = Py_UNICODE_LOW_SURROGATE(ch);
        }
        base64bits += 16;
        base64buffer = (base64buffer << 16) | ch;
        while (base64bits >= 6) {
            *out++ = TO_BASE64(base64buffer >> (base64bits-6));
            base64bits -= 6;
        }
    }

    /* Flush leftover bits and close a shift sequence that is still open. */
    if (base64bits) {
        *out++ = TO_BASE64(base64buffer << (6-base64bits));
    }
    if (inShift) {
        *out++ = '-';
    }
    return PyBytesWriter_FinishWithPointer(writer, out);
}
4843
4844
#undef IS_BASE64
4845
#undef FROM_BASE64
4846
#undef TO_BASE64
4847
#undef DECODE_DIRECT
4848
#undef ENCODE_DIRECT
4849
4850
/* --- UTF-8 Codec -------------------------------------------------------- */
4851
4852
PyObject *
4853
PyUnicode_DecodeUTF8(const char *s,
4854
                     Py_ssize_t size,
4855
                     const char *errors)
4856
54.3M
{
4857
54.3M
    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4858
54.3M
}
4859
4860
#include "stringlib/asciilib.h"
4861
#include "stringlib/codecs.h"
4862
#include "stringlib/undef.h"
4863
4864
#include "stringlib/ucs1lib.h"
4865
#include "stringlib/codecs.h"
4866
#include "stringlib/undef.h"
4867
4868
#include "stringlib/ucs2lib.h"
4869
#include "stringlib/codecs.h"
4870
#include "stringlib/undef.h"
4871
4872
#include "stringlib/ucs4lib.h"
4873
#include "stringlib/codecs.h"
4874
#include "stringlib/undef.h"
4875
4876
/* Word-wide constants sized to the C 'size_t':
 *   ASCII_CHAR_MASK - top bit of every byte; a non-zero AND with a word
 *                     means it contains a non-ASCII, UTF8-encoded char.
 *   VECTOR_0101     - bit 0 of every byte (used to count codepoints in a
 *                     UTF-8 string).
 *   VECTOR_00FF     - every other byte (used when summing byte lanes).
 */
#if (SIZEOF_SIZE_T == 4)
# define ASCII_CHAR_MASK 0x80808080U
# define VECTOR_0101     0x01010101U
# define VECTOR_00FF     0x00ff00ffU
#elif (SIZEOF_SIZE_T == 8)
# define ASCII_CHAR_MASK 0x8080808080808080ULL
# define VECTOR_0101     0x0101010101010101ULL
# define VECTOR_00FF     0x00ff00ff00ff00ffULL
#else
# error C 'size_t' size should be either 4 or 8!
#endif
4890
4891
/* ctz(v): number of trailing zero bits in v.  Callers must pass a
   non-zero value.  HAVE_CTZ is 1 when an implementation is available for
   the compiler in use and 0 otherwise. */
#if (defined(__clang__) || defined(__GNUC__))
#define HAVE_CTZ 1
static inline unsigned int
ctz(size_t v)
{
    const unsigned long long wide = (unsigned long long)v;

    return (unsigned int)__builtin_ctzll(wide);
}
#elif defined(_MSC_VER)
#define HAVE_CTZ 1
static inline unsigned int
ctz(size_t v)
{
    unsigned long index;
#if SIZEOF_SIZE_T == 4
    _BitScanForward(&index, v);
#else
    _BitScanForward64(&index, v);
#endif /* SIZEOF_SIZE_T */
    return (unsigned int)index;
}
#else
#define HAVE_CTZ 0
#endif
4914
4915
#if HAVE_CTZ && PY_LITTLE_ENDIAN
// Build a size_t from the bytes p[0]..p[size-1] without an unaligned word
// access and without reading past p[size-1].  Bytes beyond 'size' are
// zero; a 'size' larger than the word copies exactly one full word.
static size_t
load_unaligned(const unsigned char *p, size_t size)
{
    union {
        size_t value;
        unsigned char bytes[SIZEOF_SIZE_T];
    } word;
    word.value = 0;

    // Byte i of the input becomes byte i of the word, which is only the
    // right layout on little endian machines:
    // * union is faster than bitwise or and shift.
    // * big endian machine is rare and hard to maintain.
    // Each case falls through so that 'size' selects how many bytes,
    // counted from the highest index down to 0, are copied.
    switch (size) {
    default:
#if SIZEOF_SIZE_T == 8
    case 8:
        word.bytes[7] = p[7];
        _Py_FALLTHROUGH;
    case 7:
        word.bytes[6] = p[6];
        _Py_FALLTHROUGH;
    case 6:
        word.bytes[5] = p[5];
        _Py_FALLTHROUGH;
    case 5:
        word.bytes[4] = p[4];
        _Py_FALLTHROUGH;
#endif
    case 4:
        word.bytes[3] = p[3];
        _Py_FALLTHROUGH;
    case 3:
        word.bytes[2] = p[2];
        _Py_FALLTHROUGH;
    case 2:
        word.bytes[1] = p[1];
        _Py_FALLTHROUGH;
    case 1:
        word.bytes[0] = p[0];
        break;
    case 0:
        break;
    }
    return word.value;
}
#endif
4962
4963
/*
4964
 * Find the first non-ASCII character in a byte sequence.
4965
 *
4966
 * This function scans a range of bytes from `start` to `end` and returns the
4967
 * index of the first byte that is not an ASCII character (i.e., has the most
4968
 * significant bit set). If all characters in the range are ASCII, it returns
4969
 * `end - start`.
4970
 */
4971
static Py_ssize_t
4972
find_first_nonascii(const unsigned char *start, const unsigned char *end)
4973
12.2M
{
4974
    // The search is done in `size_t` chunks.
4975
    // The start and end might not be aligned at `size_t` boundaries,
4976
    // so they're handled specially.
4977
4978
12.2M
    const unsigned char *p = start;
4979
4980
12.2M
    if (end - start >= SIZEOF_SIZE_T) {
4981
        // Avoid unaligned read.
4982
5.70M
#if PY_LITTLE_ENDIAN && HAVE_CTZ
4983
5.70M
        size_t u;
4984
5.70M
        memcpy(&u, p, sizeof(size_t));
4985
5.70M
        u &= ASCII_CHAR_MASK;
4986
5.70M
        if (u) {
4987
684k
            return (ctz(u) - 7) / 8;
4988
684k
        }
4989
5.02M
        p = _Py_ALIGN_DOWN(p + SIZEOF_SIZE_T, SIZEOF_SIZE_T);
4990
#else /* PY_LITTLE_ENDIAN && HAVE_CTZ */
4991
        const unsigned char *p2 = _Py_ALIGN_UP(p, SIZEOF_SIZE_T);
4992
        while (p < p2) {
4993
            if (*p & 0x80) {
4994
                return p - start;
4995
            }
4996
            p++;
4997
        }
4998
#endif
4999
5000
5.02M
        const unsigned char *e = end - SIZEOF_SIZE_T;
5001
91.7M
        while (p <= e) {
5002
88.2M
            size_t u = (*(const size_t *)p) & ASCII_CHAR_MASK;
5003
88.2M
            if (u) {
5004
1.50M
#if PY_LITTLE_ENDIAN && HAVE_CTZ
5005
1.50M
                return p - start + (ctz(u) - 7) / 8;
5006
#else
5007
                // big endian and minor compilers are difficult to test.
5008
                // fallback to per byte check.
5009
                break;
5010
#endif
5011
1.50M
            }
5012
86.7M
            p += SIZEOF_SIZE_T;
5013
86.7M
        }
5014
5.02M
    }
5015
10.0M
#if PY_LITTLE_ENDIAN && HAVE_CTZ
5016
12.2M
    assert((end - p) < SIZEOF_SIZE_T);
5017
    // we can not use *(const size_t*)p to avoid buffer overrun.
5018
10.0M
    size_t u = load_unaligned(p, end - p) & ASCII_CHAR_MASK;
5019
10.0M
    if (u) {
5020
1.96M
        return p - start + (ctz(u) - 7) / 8;
5021
1.96M
    }
5022
8.06M
    return end - start;
5023
#else
5024
    while (p < end) {
5025
        if (*p & 0x80) {
5026
            break;
5027
        }
5028
        p++;
5029
    }
5030
    return p - start;
5031
#endif
5032
10.0M
}
5033
5034
// Return 1 when the byte value 'ch' starts a UTF-8 sequence, i.e. has the
// form 0xxxxxxx or 11xxxxxx, and 0 for a continuation byte (10xxxxxx).
// Only bits 6 and 7 of 'ch' are examined.
static inline int
scalar_utf8_start_char(unsigned int ch)
{
    const unsigned int high_bit_clear = (~ch >> 7) & 1;
    const unsigned int bit6_set = (ch >> 6) & 1;

    return (int)(high_bit_clear | bit6_set);
}
5040
5041
static inline size_t
5042
vector_utf8_start_chars(size_t v)
5043
23.9M
{
5044
23.9M
    return ((~v >> 7) | (v >> 6)) & VECTOR_0101;
5045
23.9M
}
5046
5047
5048
// Count the number of UTF-8 code points in a given byte sequence.
5049
static Py_ssize_t
5050
utf8_count_codepoints(const unsigned char *s, const unsigned char *end)
5051
355k
{
5052
355k
    Py_ssize_t len = 0;
5053
5054
355k
    if (end - s >= SIZEOF_SIZE_T) {
5055
214k
        while (!_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) {
5056
109k
            len += scalar_utf8_start_char(*s++);
5057
109k
        }
5058
5059
291k
        while (s + SIZEOF_SIZE_T <= end) {
5060
185k
            const unsigned char *e = end;
5061
185k
            if (e - s > SIZEOF_SIZE_T * 255) {
5062
92.0k
                e = s + SIZEOF_SIZE_T * 255;
5063
92.0k
            }
5064
185k
            Py_ssize_t vstart = 0;
5065
24.1M
            while (s + SIZEOF_SIZE_T <= e) {
5066
23.9M
                size_t v = *(size_t*)s;
5067
23.9M
                size_t vs = vector_utf8_start_chars(v);
5068
23.9M
                vstart += vs;
5069
23.9M
                s += SIZEOF_SIZE_T;
5070
23.9M
            }
5071
185k
            vstart = (vstart & VECTOR_00FF) + ((vstart >> 8) & VECTOR_00FF);
5072
185k
            vstart += vstart >> 16;
5073
185k
#if SIZEOF_SIZE_T == 8
5074
185k
            vstart += vstart >> 32;
5075
185k
#endif
5076
185k
            len += vstart & 0x7ff;
5077
185k
        }
5078
105k
    }
5079
1.40M
    while (s < end) {
5080
1.05M
        len += scalar_utf8_start_char(*s++);
5081
1.05M
    }
5082
355k
    return len;
5083
355k
}
5084
5085
static Py_ssize_t
5086
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
5087
165k
{
5088
165k
#if SIZEOF_SIZE_T <= SIZEOF_VOID_P
5089
165k
    if (_Py_IS_ALIGNED(start, ALIGNOF_SIZE_T)
5090
72.9k
        && _Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T))
5091
16.2k
    {
5092
        /* Fast path, see in STRINGLIB(utf8_decode) for
5093
           an explanation. */
5094
16.2k
        const char *p = start;
5095
16.2k
        Py_UCS1 *q = dest;
5096
3.17M
        while (p + SIZEOF_SIZE_T <= end) {
5097
3.15M
            size_t value = *(const size_t *) p;
5098
3.15M
            if (value & ASCII_CHAR_MASK)
5099
479
                break;
5100
3.15M
            *((size_t *)q) = value;
5101
3.15M
            p += SIZEOF_SIZE_T;
5102
3.15M
            q += SIZEOF_SIZE_T;
5103
3.15M
        }
5104
75.6k
        while (p < end) {
5105
59.9k
            if ((unsigned char)*p & 0x80)
5106
622
                break;
5107
59.3k
            *q++ = *p++;
5108
59.3k
        }
5109
16.2k
        return p - start;
5110
16.2k
    }
5111
149k
#endif
5112
149k
    Py_ssize_t pos = find_first_nonascii((const unsigned char*)start,
5113
149k
                                         (const unsigned char*)end);
5114
149k
    memcpy(dest, start, pos);
5115
149k
    return pos;
5116
165k
}
5117
5118
static int
5119
unicode_decode_utf8_impl(_PyUnicodeWriter *writer,
5120
                         const char *starts, const char *s, const char *end,
5121
                         _Py_error_handler error_handler,
5122
                         const char *errors,
5123
                         Py_ssize_t *consumed)
5124
4.15M
{
5125
4.15M
    Py_ssize_t startinpos, endinpos;
5126
4.15M
    const char *errmsg = "";
5127
4.15M
    PyObject *error_handler_obj = NULL;
5128
4.15M
    PyObject *exc = NULL;
5129
5130
8.73M
    while (s < end) {
5131
7.26M
        Py_UCS4 ch;
5132
7.26M
        int kind = writer->kind;
5133
5134
7.26M
        if (kind == PyUnicode_1BYTE_KIND) {
5135
4.99M
            if (PyUnicode_IS_ASCII(writer->buffer))
5136
3.80M
                ch = asciilib_utf8_decode(&s, end, writer->data, &writer->pos);
5137
1.19M
            else
5138
1.19M
                ch = ucs1lib_utf8_decode(&s, end, writer->data, &writer->pos);
5139
4.99M
        } else if (kind == PyUnicode_2BYTE_KIND) {
5140
1.86M
            ch = ucs2lib_utf8_decode(&s, end, writer->data, &writer->pos);
5141
1.86M
        } else {
5142
400k
            assert(kind == PyUnicode_4BYTE_KIND);
5143
400k
            ch = ucs4lib_utf8_decode(&s, end, writer->data, &writer->pos);
5144
400k
        }
5145
5146
7.26M
        switch (ch) {
5147
2.68M
        case 0:
5148
2.68M
            if (s == end || consumed)
5149
2.68M
                goto End;
5150
2.16k
            errmsg = "unexpected end of data";
5151
2.16k
            startinpos = s - starts;
5152
2.16k
            endinpos = end - starts;
5153
2.16k
            break;
5154
41.3k
        case 1:
5155
41.3k
            errmsg = "invalid start byte";
5156
41.3k
            startinpos = s - starts;
5157
41.3k
            endinpos = startinpos + 1;
5158
41.3k
            break;
5159
38.2k
        case 2:
5160
38.2k
            if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5161
8
                && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5162
3
            {
5163
                /* Truncated surrogate code in range D800-DFFF */
5164
3
                goto End;
5165
3
            }
5166
38.2k
            _Py_FALLTHROUGH;
5167
44.2k
        case 3:
5168
45.9k
        case 4:
5169
45.9k
            errmsg = "invalid continuation byte";
5170
45.9k
            startinpos = s - starts;
5171
45.9k
            endinpos = startinpos + ch - 1;
5172
45.9k
            break;
5173
4.49M
        default:
5174
            // ch doesn't fit into kind, so change the buffer kind to write
5175
            // the character
5176
4.49M
            if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
5177
0
                goto onError;
5178
4.49M
            continue;
5179
7.26M
        }
5180
5181
89.4k
        if (error_handler == _Py_ERROR_UNKNOWN)
5182
2.57k
            error_handler = _Py_GetErrorHandler(errors);
5183
5184
89.4k
        switch (error_handler) {
5185
0
        case _Py_ERROR_IGNORE:
5186
0
            s += (endinpos - startinpos);
5187
0
            break;
5188
5189
79.2k
        case _Py_ERROR_REPLACE:
5190
79.2k
            if (_PyUnicodeWriter_WriteCharInline(writer, 0xfffd) < 0)
5191
0
                goto onError;
5192
79.2k
            s += (endinpos - startinpos);
5193
79.2k
            break;
5194
5195
0
        case _Py_ERROR_SURROGATEESCAPE:
5196
0
        {
5197
0
            Py_ssize_t i;
5198
5199
0
            if (_PyUnicodeWriter_PrepareKind(writer, PyUnicode_2BYTE_KIND) < 0)
5200
0
                goto onError;
5201
0
            for (i=startinpos; i<endinpos; i++) {
5202
0
                ch = (Py_UCS4)(unsigned char)(starts[i]);
5203
0
                PyUnicode_WRITE(writer->kind, writer->data, writer->pos,
5204
0
                                ch + 0xdc00);
5205
0
                writer->pos++;
5206
0
            }
5207
0
            s += (endinpos - startinpos);
5208
0
            break;
5209
0
        }
5210
5211
10.1k
        default:
5212
10.1k
            if (unicode_decode_call_errorhandler_writer(
5213
10.1k
                    errors, &error_handler_obj,
5214
10.1k
                    "utf-8", errmsg,
5215
10.1k
                    &starts, &end, &startinpos, &endinpos, &exc, &s,
5216
10.1k
                    writer)) {
5217
4.11k
                goto onError;
5218
4.11k
            }
5219
5220
6.07k
            if (_PyUnicodeWriter_Prepare(writer, end - s, 127) < 0) {
5221
0
                goto onError;
5222
0
            }
5223
89.4k
        }
5224
89.4k
    }
5225
5226
4.15M
End:
5227
4.15M
    if (consumed)
5228
53
        *consumed = s - starts;
5229
5230
4.15M
    Py_XDECREF(error_handler_obj);
5231
4.15M
    Py_XDECREF(exc);
5232
4.15M
    return 0;
5233
5234
4.11k
onError:
5235
4.11k
    Py_XDECREF(error_handler_obj);
5236
4.11k
    Py_XDECREF(exc);
5237
4.11k
    return -1;
5238
4.15M
}
5239
5240
5241
static PyObject *
5242
unicode_decode_utf8(const char *s, Py_ssize_t size,
5243
                    _Py_error_handler error_handler, const char *errors,
5244
                    Py_ssize_t *consumed)
5245
57.0M
{
5246
57.0M
    if (size == 0) {
5247
1.53M
        if (consumed) {
5248
0
            *consumed = 0;
5249
0
        }
5250
1.53M
        _Py_RETURN_UNICODE_EMPTY();
5251
1.53M
    }
5252
5253
    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
5254
55.5M
    if (size == 1 && (unsigned char)s[0] < 128) {
5255
43.4M
        if (consumed) {
5256
0
            *consumed = 1;
5257
0
        }
5258
43.4M
        return get_latin1_char((unsigned char)s[0]);
5259
43.4M
    }
5260
5261
    // I don't know this check is necessary or not. But there is a test
5262
    // case that requires size=PY_SSIZE_T_MAX cause MemoryError.
5263
12.0M
    if (PY_SSIZE_T_MAX - sizeof(PyCompactUnicodeObject) < (size_t)size) {
5264
0
        PyErr_NoMemory();
5265
0
        return NULL;
5266
0
    }
5267
5268
12.0M
    const char *starts = s;
5269
12.0M
    const char *end = s + size;
5270
5271
12.0M
    Py_ssize_t pos = find_first_nonascii((const unsigned char*)starts, (const unsigned char*)end);
5272
12.0M
    if (pos == size) {  // fast path: ASCII string.
5273
7.91M
        PyObject *u = PyUnicode_New(size, 127);
5274
7.91M
        if (u == NULL) {
5275
0
            return NULL;
5276
0
        }
5277
7.91M
        memcpy(PyUnicode_1BYTE_DATA(u), s, size);
5278
7.91M
        if (consumed) {
5279
0
            *consumed = size;
5280
0
        }
5281
7.91M
        return u;
5282
7.91M
    }
5283
5284
4.15M
    int maxchr = 127;
5285
4.15M
    Py_ssize_t maxsize = size;
5286
5287
4.15M
    unsigned char ch = (unsigned char)(s[pos]);
5288
    // error handler other than strict may remove/replace the invalid byte.
5289
    // consumed != NULL allows 1~3 bytes remainings.
5290
    // 0x80 <= ch < 0xc2 is invalid start byte that cause UnicodeDecodeError.
5291
    // otherwise: check the input and decide the maxchr and maxsize to reduce
5292
    // reallocation and copy.
5293
4.15M
    if (error_handler == _Py_ERROR_STRICT && !consumed && ch >= 0xc2) {
5294
        // we only calculate the number of codepoints and don't determine the exact maxchr.
5295
        // This is because writing fast and portable SIMD code to find maxchr is difficult.
5296
        // If reallocation occurs for a larger maxchar, knowing the exact number of codepoints
5297
        // means that it is no longer necessary to allocate several times the required amount
5298
        // of memory.
5299
355k
        maxsize = utf8_count_codepoints((const unsigned char *)s, (const unsigned char *)end);
5300
355k
        if (ch < 0xc4) { // latin1
5301
159k
            maxchr = 0xff;
5302
159k
        }
5303
195k
        else if (ch < 0xf0) { // ucs2
5304
161k
            maxchr = 0xffff;
5305
161k
        }
5306
33.6k
        else { // ucs4
5307
33.6k
            maxchr = 0x10ffff;
5308
33.6k
        }
5309
355k
    }
5310
4.15M
    PyObject *u = PyUnicode_New(maxsize, maxchr);
5311
4.15M
    if (!u) {
5312
0
        return NULL;
5313
0
    }
5314
5315
    // Use _PyUnicodeWriter after fast path is failed.
5316
4.15M
    _PyUnicodeWriter writer;
5317
4.15M
    _PyUnicodeWriter_InitWithBuffer(&writer, u);
5318
4.15M
    if (maxchr <= 255) {
5319
3.96M
        memcpy(PyUnicode_1BYTE_DATA(u), s, pos);
5320
3.96M
        s += pos;
5321
3.96M
        writer.pos = pos;
5322
3.96M
    }
5323
5324
4.15M
    if (unicode_decode_utf8_impl(&writer, starts, s, end,
5325
4.15M
                                 error_handler, errors,
5326
4.15M
                                 consumed) < 0) {
5327
4.11k
        _PyUnicodeWriter_Dealloc(&writer);
5328
4.11k
        return NULL;
5329
4.11k
    }
5330
4.15M
    return _PyUnicodeWriter_Finish(&writer);
5331
4.15M
}
5332
5333
5334
// Used by PyUnicodeWriter_WriteUTF8() implementation
5335
int
5336
_PyUnicode_DecodeUTF8Writer(_PyUnicodeWriter *writer,
5337
                            const char *s, Py_ssize_t size,
5338
                            _Py_error_handler error_handler, const char *errors,
5339
                            Py_ssize_t *consumed)
5340
166k
{
5341
166k
    if (size == 0) {
5342
10.4k
        if (consumed) {
5343
0
            *consumed = 0;
5344
0
        }
5345
10.4k
        return 0;
5346
10.4k
    }
5347
5348
    // fast path: try ASCII string.
5349
155k
    if (_PyUnicodeWriter_Prepare(writer, size, 127) < 0) {
5350
0
        return -1;
5351
0
    }
5352
5353
155k
    const char *starts = s;
5354
155k
    const char *end = s + size;
5355
155k
    Py_ssize_t decoded = 0;
5356
155k
    Py_UCS1 *dest = (Py_UCS1*)writer->data + writer->pos * writer->kind;
5357
155k
    if (writer->kind == PyUnicode_1BYTE_KIND) {
5358
155k
        decoded = ascii_decode(s, end, dest);
5359
155k
        writer->pos += decoded;
5360
5361
155k
        if (decoded == size) {
5362
153k
            if (consumed) {
5363
1.06k
                *consumed = size;
5364
1.06k
            }
5365
153k
            return 0;
5366
153k
        }
5367
1.94k
        s += decoded;
5368
1.94k
    }
5369
5370
2.80k
    return unicode_decode_utf8_impl(writer, starts, s, end,
5371
2.80k
                                    error_handler, errors, consumed);
5372
155k
}
5373
5374
5375
PyObject *
5376
PyUnicode_DecodeUTF8Stateful(const char *s,
5377
                             Py_ssize_t size,
5378
                             const char *errors,
5379
                             Py_ssize_t *consumed)
5380
57.0M
{
5381
57.0M
    return unicode_decode_utf8(s, size,
5382
57.0M
                               errors ? _Py_ERROR_UNKNOWN : _Py_ERROR_STRICT,
5383
57.0M
                               errors, consumed);
5384
57.0M
}
5385
5386
5387
/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5388
   non-zero, use strict error handler otherwise.
5389
5390
   On success, write a pointer to a newly allocated wide character string into
5391
   *wstr (use PyMem_RawFree() to free the memory) and write the output length
5392
   (in number of wchar_t units) into *wlen (if wlen is set).
5393
5394
   On memory allocation failure, return -1.
5395
5396
   On decoding error (if surrogateescape is zero), return -2. If wlen is
5397
   non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5398
   is not NULL, write the decoding error message into *reason. */
5399
int
5400
_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
5401
                 const char **reason, _Py_error_handler errors)
5402
7.19k
{
5403
7.19k
    const char *orig_s = s;
5404
7.19k
    const char *e;
5405
7.19k
    wchar_t *unicode;
5406
7.19k
    Py_ssize_t outpos;
5407
5408
7.19k
    int surrogateescape = 0;
5409
7.19k
    int surrogatepass = 0;
5410
7.19k
    switch (errors)
5411
7.19k
    {
5412
0
    case _Py_ERROR_STRICT:
5413
0
        break;
5414
7.19k
    case _Py_ERROR_SURROGATEESCAPE:
5415
7.19k
        surrogateescape = 1;
5416
7.19k
        break;
5417
0
    case _Py_ERROR_SURROGATEPASS:
5418
0
        surrogatepass = 1;
5419
0
        break;
5420
0
    default:
5421
0
        return -3;
5422
7.19k
    }
5423
5424
    /* Note: size will always be longer than the resulting Unicode
5425
       character count */
5426
7.19k
    if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1 < size) {
5427
0
        return -1;
5428
0
    }
5429
5430
7.19k
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
5431
7.19k
    if (!unicode) {
5432
0
        return -1;
5433
0
    }
5434
5435
    /* Unpack UTF-8 encoded data */
5436
7.19k
    e = s + size;
5437
7.19k
    outpos = 0;
5438
7.19k
    while (s < e) {
5439
7.19k
        Py_UCS4 ch;
5440
7.19k
#if SIZEOF_WCHAR_T == 4
5441
7.19k
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
5442
#else
5443
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
5444
#endif
5445
7.19k
        if (ch > 0xFF) {
5446
0
#if SIZEOF_WCHAR_T == 4
5447
0
            Py_UNREACHABLE();
5448
#else
5449
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
5450
            /* write a surrogate pair */
5451
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5452
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5453
#endif
5454
0
        }
5455
7.19k
        else {
5456
7.19k
            if (!ch && s == e) {
5457
7.19k
                break;
5458
7.19k
            }
5459
5460
0
            if (surrogateescape) {
5461
0
                unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5462
0
            }
5463
0
            else {
5464
                /* Is it a valid three-byte code? */
5465
0
                if (surrogatepass
5466
0
                    && (e - s) >= 3
5467
0
                    && (s[0] & 0xf0) == 0xe0
5468
0
                    && (s[1] & 0xc0) == 0x80
5469
0
                    && (s[2] & 0xc0) == 0x80)
5470
0
                {
5471
0
                    ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5472
0
                    s += 3;
5473
0
                    unicode[outpos++] = ch;
5474
0
                }
5475
0
                else {
5476
0
                    PyMem_RawFree(unicode );
5477
0
                    if (reason != NULL) {
5478
0
                        switch (ch) {
5479
0
                        case 0:
5480
0
                            *reason = "unexpected end of data";
5481
0
                            break;
5482
0
                        case 1:
5483
0
                            *reason = "invalid start byte";
5484
0
                            break;
5485
                        /* 2, 3, 4 */
5486
0
                        default:
5487
0
                            *reason = "invalid continuation byte";
5488
0
                            break;
5489
0
                        }
5490
0
                    }
5491
0
                    if (wlen != NULL) {
5492
0
                        *wlen = s - orig_s;
5493
0
                    }
5494
0
                    return -2;
5495
0
                }
5496
0
            }
5497
0
        }
5498
7.19k
    }
5499
7.19k
    unicode[outpos] = L'\0';
5500
7.19k
    if (wlen) {
5501
7.19k
        *wlen = outpos;
5502
7.19k
    }
5503
7.19k
    *wstr = unicode;
5504
7.19k
    return 0;
5505
7.19k
}
5506
5507
5508
wchar_t*
5509
_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5510
                               size_t *wlen)
5511
0
{
5512
0
    wchar_t *wstr;
5513
0
    int res = _Py_DecodeUTF8Ex(arg, arglen,
5514
0
                               &wstr, wlen,
5515
0
                               NULL, _Py_ERROR_SURROGATEESCAPE);
5516
0
    if (res != 0) {
5517
        /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5518
0
        assert(res != -3);
5519
0
        if (wlen) {
5520
0
            *wlen = (size_t)res;
5521
0
        }
5522
0
        return NULL;
5523
0
    }
5524
0
    return wstr;
5525
0
}
5526
5527
5528
/* UTF-8 encoder.
5529
5530
   On success, return 0 and write the newly allocated character string (use
5531
   PyMem_Free() to free the memory) into *str.
5532
5533
   On encoding failure, return -2 and write the position of the invalid
5534
   surrogate character into *error_pos (if error_pos is set) and the decoding
5535
   error message into *reason (if reason is set).
5536
5537
   On memory allocation failure, return -1. */
5538
int
5539
_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
5540
                 const char **reason, int raw_malloc, _Py_error_handler errors)
5541
990
{
5542
990
    const Py_ssize_t max_char_size = 4;
5543
990
    Py_ssize_t len = wcslen(text);
5544
5545
990
    assert(len >= 0);
5546
5547
990
    int surrogateescape = 0;
5548
990
    int surrogatepass = 0;
5549
990
    switch (errors)
5550
990
    {
5551
88
    case _Py_ERROR_STRICT:
5552
88
        break;
5553
902
    case _Py_ERROR_SURROGATEESCAPE:
5554
902
        surrogateescape = 1;
5555
902
        break;
5556
0
    case _Py_ERROR_SURROGATEPASS:
5557
0
        surrogatepass = 1;
5558
0
        break;
5559
0
    default:
5560
0
        return -3;
5561
990
    }
5562
5563
990
    if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5564
0
        return -1;
5565
0
    }
5566
990
    char *bytes;
5567
990
    if (raw_malloc) {
5568
990
        bytes = PyMem_RawMalloc((len + 1) * max_char_size);
5569
990
    }
5570
0
    else {
5571
0
        bytes = PyMem_Malloc((len + 1) * max_char_size);
5572
0
    }
5573
990
    if (bytes == NULL) {
5574
0
        return -1;
5575
0
    }
5576
5577
990
    char *p = bytes;
5578
990
    Py_ssize_t i;
5579
60.5k
    for (i = 0; i < len; ) {
5580
59.5k
        Py_ssize_t ch_pos = i;
5581
59.5k
        Py_UCS4 ch = text[i];
5582
59.5k
        i++;
5583
59.5k
        if (sizeof(wchar_t) == 2
5584
0
            && Py_UNICODE_IS_HIGH_SURROGATE(ch)
5585
0
            && i < len
5586
0
            && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5587
0
        {
5588
0
            ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5589
0
            i++;
5590
0
        }
5591
5592
59.5k
        if (ch < 0x80) {
5593
            /* Encode ASCII */
5594
59.5k
            *p++ = (char) ch;
5595
5596
59.5k
        }
5597
0
        else if (ch < 0x0800) {
5598
            /* Encode Latin-1 */
5599
0
            *p++ = (char)(0xc0 | (ch >> 6));
5600
0
            *p++ = (char)(0x80 | (ch & 0x3f));
5601
0
        }
5602
0
        else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
5603
            /* surrogateescape error handler */
5604
0
            if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
5605
0
                if (error_pos != NULL) {
5606
0
                    *error_pos = (size_t)ch_pos;
5607
0
                }
5608
0
                if (reason != NULL) {
5609
0
                    *reason = "encoding error";
5610
0
                }
5611
0
                if (raw_malloc) {
5612
0
                    PyMem_RawFree(bytes);
5613
0
                }
5614
0
                else {
5615
0
                    PyMem_Free(bytes);
5616
0
                }
5617
0
                return -2;
5618
0
            }
5619
0
            *p++ = (char)(ch & 0xff);
5620
0
        }
5621
0
        else if (ch < 0x10000) {
5622
0
            *p++ = (char)(0xe0 | (ch >> 12));
5623
0
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5624
0
            *p++ = (char)(0x80 | (ch & 0x3f));
5625
0
        }
5626
0
        else {  /* ch >= 0x10000 */
5627
0
            assert(ch <= MAX_UNICODE);
5628
            /* Encode UCS4 Unicode ordinals */
5629
0
            *p++ = (char)(0xf0 | (ch >> 18));
5630
0
            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5631
0
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5632
0
            *p++ = (char)(0x80 | (ch & 0x3f));
5633
0
        }
5634
59.5k
    }
5635
990
    *p++ = '\0';
5636
5637
990
    size_t final_size = (p - bytes);
5638
990
    char *bytes2;
5639
990
    if (raw_malloc) {
5640
990
        bytes2 = PyMem_RawRealloc(bytes, final_size);
5641
990
    }
5642
0
    else {
5643
0
        bytes2 = PyMem_Realloc(bytes, final_size);
5644
0
    }
5645
990
    if (bytes2 == NULL) {
5646
0
        if (error_pos != NULL) {
5647
0
            *error_pos = (size_t)-1;
5648
0
        }
5649
0
        if (raw_malloc) {
5650
0
            PyMem_RawFree(bytes);
5651
0
        }
5652
0
        else {
5653
0
            PyMem_Free(bytes);
5654
0
        }
5655
0
        return -1;
5656
0
    }
5657
990
    *str = bytes2;
5658
990
    return 0;
5659
990
}
5660
5661
5662
/* Primary internal function which creates utf8 encoded bytes objects.
5663
5664
   Allocation strategy:  if the string is short, convert into a stack buffer
5665
   and allocate exactly as much space needed at the end.  Else allocate the
5666
   maximum possible needed (4 result bytes per Unicode character), and return
5667
   the excess memory at the end.
5668
*/
5669
static PyObject *
5670
unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5671
                    const char *errors)
5672
7.31k
{
5673
7.31k
    if (!PyUnicode_Check(unicode)) {
5674
0
        PyErr_BadArgument();
5675
0
        return NULL;
5676
0
    }
5677
5678
7.31k
    if (PyUnicode_UTF8(unicode))
5679
5.75k
        return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5680
5.75k
                                         PyUnicode_UTF8_LENGTH(unicode));
5681
5682
1.55k
    int kind = PyUnicode_KIND(unicode);
5683
1.55k
    const void *data = PyUnicode_DATA(unicode);
5684
1.55k
    Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5685
5686
1.55k
    PyBytesWriter *writer;
5687
1.55k
    char *end;
5688
5689
1.55k
    switch (kind) {
5690
0
    default:
5691
0
        Py_UNREACHABLE();
5692
405
    case PyUnicode_1BYTE_KIND:
5693
        /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5694
405
        assert(!PyUnicode_IS_ASCII(unicode));
5695
405
        writer = ucs1lib_utf8_encoder(unicode, data, size,
5696
405
                                      error_handler, errors, &end);
5697
405
        break;
5698
1.04k
    case PyUnicode_2BYTE_KIND:
5699
1.04k
        writer = ucs2lib_utf8_encoder(unicode, data, size,
5700
1.04k
                                      error_handler, errors, &end);
5701
1.04k
        break;
5702
111
    case PyUnicode_4BYTE_KIND:
5703
111
        writer = ucs4lib_utf8_encoder(unicode, data, size,
5704
111
                                      error_handler, errors, &end);
5705
111
        break;
5706
1.55k
    }
5707
5708
1.55k
    if (writer == NULL) {
5709
7
        PyBytesWriter_Discard(writer);
5710
7
        return NULL;
5711
7
    }
5712
1.55k
    return PyBytesWriter_FinishWithPointer(writer, end);
5713
1.55k
}
5714
5715
static int
5716
unicode_fill_utf8(PyObject *unicode)
5717
2.75k
{
5718
2.75k
    _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(unicode);
5719
    /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5720
2.75k
    assert(!PyUnicode_IS_ASCII(unicode));
5721
5722
2.75k
    int kind = PyUnicode_KIND(unicode);
5723
2.75k
    const void *data = PyUnicode_DATA(unicode);
5724
2.75k
    Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5725
5726
2.75k
    PyBytesWriter *writer;
5727
2.75k
    char *end;
5728
5729
2.75k
    switch (kind) {
5730
0
    default:
5731
0
        Py_UNREACHABLE();
5732
276
    case PyUnicode_1BYTE_KIND:
5733
276
        writer = ucs1lib_utf8_encoder(unicode, data, size,
5734
276
                                      _Py_ERROR_STRICT, NULL, &end);
5735
276
        break;
5736
1.70k
    case PyUnicode_2BYTE_KIND:
5737
1.70k
        writer = ucs2lib_utf8_encoder(unicode, data, size,
5738
1.70k
                                      _Py_ERROR_STRICT, NULL, &end);
5739
1.70k
        break;
5740
773
    case PyUnicode_4BYTE_KIND:
5741
773
        writer = ucs4lib_utf8_encoder(unicode, data, size,
5742
773
                                      _Py_ERROR_STRICT, NULL, &end);
5743
773
        break;
5744
2.75k
    }
5745
2.75k
    if (writer == NULL) {
5746
0
        return -1;
5747
0
    }
5748
5749
2.75k
    const char *start = PyBytesWriter_GetData(writer);
5750
2.75k
    Py_ssize_t len = end - start;
5751
5752
2.75k
    char *cache = PyMem_Malloc(len + 1);
5753
2.75k
    if (cache == NULL) {
5754
0
        PyBytesWriter_Discard(writer);
5755
0
        PyErr_NoMemory();
5756
0
        return -1;
5757
0
    }
5758
2.75k
    memcpy(cache, start, len);
5759
2.75k
    cache[len] = '\0';
5760
2.75k
    PyUnicode_SET_UTF8_LENGTH(unicode, len);
5761
2.75k
    PyUnicode_SET_UTF8(unicode, cache);
5762
2.75k
    PyBytesWriter_Discard(writer);
5763
2.75k
    return 0;
5764
2.75k
}
5765
5766
/* Encode 'unicode' to a UTF-8 bytes object with the error handler named by
   'errors'; the handler enum is passed as _Py_ERROR_UNKNOWN so that
   unicode_encode_utf8() receives the 'errors' string itself. */
PyObject *
_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
{
    PyObject *encoded = unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
    return encoded;
}
5771
5772
5773
/* Encode 'unicode' to a UTF-8 bytes object; equivalent to calling
   _PyUnicode_AsUTF8String() with a NULL 'errors' argument. */
PyObject *
PyUnicode_AsUTF8String(PyObject *unicode)
{
    const char *errors = NULL;
    return _PyUnicode_AsUTF8String(unicode, errors);
}
5778
5779
/* --- UTF-32 Codec ------------------------------------------------------- */
5780
5781
PyObject *
5782
PyUnicode_DecodeUTF32(const char *s,
5783
                      Py_ssize_t size,
5784
                      const char *errors,
5785
                      int *byteorder)
5786
23
{
5787
23
    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5788
23
}
5789
5790
PyObject *
5791
PyUnicode_DecodeUTF32Stateful(const char *s,
5792
                              Py_ssize_t size,
5793
                              const char *errors,
5794
                              int *byteorder,
5795
                              Py_ssize_t *consumed)
5796
394
{
5797
394
    const char *starts = s;
5798
394
    Py_ssize_t startinpos;
5799
394
    Py_ssize_t endinpos;
5800
394
    _PyUnicodeWriter writer;
5801
394
    const unsigned char *q, *e;
5802
394
    int le, bo = 0;       /* assume native ordering by default */
5803
394
    const char *encoding;
5804
394
    const char *errmsg = "";
5805
394
    PyObject *errorHandler = NULL;
5806
394
    PyObject *exc = NULL;
5807
5808
394
    q = (const unsigned char *)s;
5809
394
    e = q + size;
5810
5811
394
    if (byteorder)
5812
371
        bo = *byteorder;
5813
5814
    /* Check for BOM marks (U+FEFF) in the input and adjust current
5815
       byte order setting accordingly. In native mode, the leading BOM
5816
       mark is skipped, in all other modes, it is copied to the output
5817
       stream as-is (giving a ZWNBSP character). */
5818
394
    if (bo == 0 && size >= 4) {
5819
28
        Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5820
28
        if (bom == 0x0000FEFF) {
5821
8
            bo = -1;
5822
8
            q += 4;
5823
8
        }
5824
20
        else if (bom == 0xFFFE0000) {
5825
12
            bo = 1;
5826
12
            q += 4;
5827
12
        }
5828
28
        if (byteorder)
5829
5
            *byteorder = bo;
5830
28
    }
5831
5832
394
    if (q == e) {
5833
2
        if (consumed)
5834
0
            *consumed = size;
5835
2
        _Py_RETURN_UNICODE_EMPTY();
5836
2
    }
5837
5838
#ifdef WORDS_BIGENDIAN
5839
    le = bo < 0;
5840
#else
5841
392
    le = bo <= 0;
5842
392
#endif
5843
392
    encoding = le ? "utf-32-le" : "utf-32-be";
5844
5845
392
    _PyUnicodeWriter_Init(&writer);
5846
392
    writer.min_length = (e - q + 3) / 4;
5847
392
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
5848
0
        goto onError;
5849
5850
2.54k
    while (1) {
5851
2.54k
        Py_UCS4 ch = 0;
5852
2.54k
        Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
5853
5854
2.54k
        if (e - q >= 4) {
5855
2.41k
            int kind = writer.kind;
5856
2.41k
            void *data = writer.data;
5857
2.41k
            const unsigned char *last = e - 4;
5858
2.41k
            Py_ssize_t pos = writer.pos;
5859
2.41k
            if (le) {
5860
300k
                do {
5861
300k
                    ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5862
300k
                    if (ch > maxch)
5863
676
                        break;
5864
300k
                    if (kind != PyUnicode_1BYTE_KIND &&
5865
267k
                        Py_UNICODE_IS_SURROGATE(ch))
5866
551
                        break;
5867
299k
                    PyUnicode_WRITE(kind, data, pos++, ch);
5868
299k
                    q += 4;
5869
299k
                } while (q <= last);
5870
1.28k
            }
5871
1.13k
            else {
5872
670k
                do {
5873
670k
                    ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5874
670k
                    if (ch > maxch)
5875
401
                        break;
5876
670k
                    if (kind != PyUnicode_1BYTE_KIND &&
5877
664k
                        Py_UNICODE_IS_SURROGATE(ch))
5878
679
                        break;
5879
669k
                    PyUnicode_WRITE(kind, data, pos++, ch);
5880
669k
                    q += 4;
5881
669k
                } while (q <= last);
5882
1.13k
            }
5883
2.41k
            writer.pos = pos;
5884
2.41k
        }
5885
5886
2.54k
        if (Py_UNICODE_IS_SURROGATE(ch)) {
5887
1.27k
            errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
5888
1.27k
            startinpos = ((const char *)q) - starts;
5889
1.27k
            endinpos = startinpos + 4;
5890
1.27k
        }
5891
1.26k
        else if (ch <= maxch) {
5892
236
            if (q == e || consumed)
5893
214
                break;
5894
            /* remaining bytes at the end? (size should be divisible by 4) */
5895
22
            errmsg = "truncated data";
5896
22
            startinpos = ((const char *)q) - starts;
5897
22
            endinpos = ((const char *)e) - starts;
5898
22
        }
5899
1.03k
        else {
5900
1.03k
            if (ch < 0x110000) {
5901
364
                if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5902
0
                    goto onError;
5903
364
                q += 4;
5904
364
                continue;
5905
364
            }
5906
666
            errmsg = "code point not in range(0x110000)";
5907
666
            startinpos = ((const char *)q) - starts;
5908
666
            endinpos = startinpos + 4;
5909
666
        }
5910
5911
        /* The remaining input chars are ignored if the callback
5912
           chooses to skip the input */
5913
1.96k
        if (unicode_decode_call_errorhandler_writer(
5914
1.96k
                errors, &errorHandler,
5915
1.96k
                encoding, errmsg,
5916
1.96k
                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
5917
1.96k
                &writer))
5918
178
            goto onError;
5919
1.96k
    }
5920
5921
214
    if (consumed)
5922
0
        *consumed = (const char *)q-starts;
5923
5924
214
    Py_XDECREF(errorHandler);
5925
214
    Py_XDECREF(exc);
5926
214
    return _PyUnicodeWriter_Finish(&writer);
5927
5928
178
  onError:
5929
178
    _PyUnicodeWriter_Dealloc(&writer);
5930
178
    Py_XDECREF(errorHandler);
5931
178
    Py_XDECREF(exc);
5932
178
    return NULL;
5933
392
}
5934
5935
/* Encode the string 'str' to a UTF-32 bytes object.

   byteorder: -1 and 1 label the output "utf-32-le" and "utf-32-be"
   respectively; 0 labels it "utf-32" and prepends a 0xFEFF code unit (BOM)
   in the ordering used for the rest of the output.

   1-byte-kind strings are encoded directly into a bytes object. For the
   other kinds, code points rejected by the ucs2lib/ucs4lib encoders are
   passed to the error handler named by 'errors' ("surrogates not allowed");
   its replacement must be bytes whose length is a multiple of 4 or an ASCII
   string, otherwise an encode exception is raised.

   Return a new bytes object, or NULL on failure. */
PyObject *
_PyUnicode_EncodeUTF32(PyObject *str,
                       const char *errors,
                       int byteorder)
{
    if (!PyUnicode_Check(str)) {
        PyErr_BadArgument();
        return NULL;
    }

    const int kind = PyUnicode_KIND(str);
    const void *data = PyUnicode_DATA(str);
    const Py_ssize_t len = PyUnicode_GET_LENGTH(str);
    /* One extra code unit is needed for the BOM when byteorder is 0. */
    const int with_bom = (byteorder == 0);

    if (len > PY_SSIZE_T_MAX / 4 - with_bom) {
        return PyErr_NoMemory();
    }
    const Py_ssize_t nsize = len + with_bom;

#if PY_LITTLE_ENDIAN
    const int native_ordering = byteorder <= 0;
#else
    const int native_ordering = byteorder >= 0;
#endif

    if (kind == PyUnicode_1BYTE_KIND) {
        // gh-139156: Don't use PyBytesWriter API here since it has an overhead
        // on short strings
        PyObject *result = PyBytes_FromStringAndSize(NULL, nsize * 4);
        if (result == NULL) {
            return NULL;
        }

        /* output buffer is 4-bytes aligned */
        assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(result), 4));
        uint32_t *dst = (uint32_t *)PyBytes_AS_STRING(result);
        if (with_bom) {
            *dst++ = 0xFEFF;
        }
        if (len > 0) {
            ucs1lib_utf32_encode((const Py_UCS1 *)data, len,
                                 &dst, native_ordering);
        }
        return result;
    }

    PyBytesWriter *writer = PyBytesWriter_Create(nsize * 4);
    if (writer == NULL) {
        return NULL;
    }

    /* output buffer is 4-bytes aligned */
    assert(_Py_IS_ALIGNED(PyBytesWriter_GetData(writer), 4));
    uint32_t *out = (uint32_t *)PyBytesWriter_GetData(writer);
    if (with_bom) {
        *out++ = 0xFEFF;
    }
    if (len == 0) {
        return PyBytesWriter_Finish(writer);
    }

    /* Codec name reported to the error handler. */
    const char *encoding;
    if (byteorder == -1) {
        encoding = "utf-32-le";
    }
    else if (byteorder == 1) {
        encoding = "utf-32-be";
    }
    else {
        encoding = "utf-32";
    }

    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *rep = NULL;

    Py_ssize_t pos = 0;
    while (pos < len) {
        /* Encode as much as possible; the helper's return value is the
           number of characters by which to advance. */
        if (kind == PyUnicode_2BYTE_KIND) {
            pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        else {
            assert(kind == PyUnicode_4BYTE_KIND);
            pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        if (pos == len) {
            break;
        }

        /* The encoder stopped early: ask the error handler for a
           replacement for the character at 'pos'. */
        Py_ssize_t newpos;
        rep = unicode_encode_call_errorhandler(
                errors, &errorHandler,
                encoding, "surrogates not allowed",
                str, &exc, pos, pos + 1, &newpos);
        if (rep == NULL) {
            goto error;
        }

        const int rep_is_bytes = PyBytes_Check(rep);
        Py_ssize_t repsize, moreunits;
        if (rep_is_bytes) {
            repsize = PyBytes_GET_SIZE(rep);
            /* a bytes replacement must be a whole number of code units */
            if (repsize & 3) {
                raise_encode_exception(&exc, encoding,
                                       str, pos, pos + 1,
                                       "surrogates not allowed");
                goto error;
            }
            moreunits = repsize / 4;
        }
        else {
            assert(PyUnicode_Check(rep));
            moreunits = repsize = PyUnicode_GET_LENGTH(rep);
            /* a str replacement must be pure ASCII */
            if (!PyUnicode_IS_ASCII(rep)) {
                raise_encode_exception(&exc, encoding,
                                       str, pos, pos + 1,
                                       "surrogates not allowed");
                goto error;
            }
        }
        moreunits += pos - newpos;
        pos = newpos;

        /* four bytes are reserved for each surrogate */
        if (moreunits > 0) {
            out = PyBytesWriter_GrowAndUpdatePointer(writer, 4 * moreunits, out);
            if (out == NULL) {
                goto error;
            }
        }

        if (rep_is_bytes) {
            memcpy(out, PyBytes_AS_STRING(rep), repsize);
            out += repsize / 4;
        }
        else {
            /* rep is unicode */
            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
            ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
                                 &out, native_ordering);
        }

        Py_CLEAR(rep);
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);

    /* Cut back to size actually needed. This is necessary for, for example,
       encoding of a string containing isolated surrogates and the 'ignore'
       handler is used. */
    return PyBytesWriter_FinishWithPointer(writer, out);

  error:
    Py_XDECREF(rep);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    PyBytesWriter_Discard(writer);
    return NULL;
}
6088
6089
/* Encode 'unicode' to UTF-32 by calling _PyUnicode_EncodeUTF32() with a
   NULL 'errors' argument and byteorder 0. */
PyObject *
PyUnicode_AsUTF32String(PyObject *unicode)
{
    const char *errors = NULL;
    const int byteorder = 0;
    return _PyUnicode_EncodeUTF32(unicode, errors, byteorder);
}
6094
6095
/* --- UTF-16 Codec ------------------------------------------------------- */
6096
6097
PyObject *
6098
PyUnicode_DecodeUTF16(const char *s,
6099
                      Py_ssize_t size,
6100
                      const char *errors,
6101
                      int *byteorder)
6102
96
{
6103
96
    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
6104
96
}
6105
6106
PyObject *
6107
PyUnicode_DecodeUTF16Stateful(const char *s,
6108
                              Py_ssize_t size,
6109
                              const char *errors,
6110
                              int *byteorder,
6111
                              Py_ssize_t *consumed)
6112
785
{
6113
785
    const char *starts = s;
6114
785
    Py_ssize_t startinpos;
6115
785
    Py_ssize_t endinpos;
6116
785
    _PyUnicodeWriter writer;
6117
785
    const unsigned char *q, *e;
6118
785
    int bo = 0;       /* assume native ordering by default */
6119
785
    int native_ordering;
6120
785
    const char *errmsg = "";
6121
785
    PyObject *errorHandler = NULL;
6122
785
    PyObject *exc = NULL;
6123
785
    const char *encoding;
6124
6125
785
    q = (const unsigned char *)s;
6126
785
    e = q + size;
6127
6128
785
    if (byteorder)
6129
689
        bo = *byteorder;
6130
6131
    /* Check for BOM marks (U+FEFF) in the input and adjust current
6132
       byte order setting accordingly. In native mode, the leading BOM
6133
       mark is skipped, in all other modes, it is copied to the output
6134
       stream as-is (giving a ZWNBSP character). */
6135
785
    if (bo == 0 && size >= 2) {
6136
112
        const Py_UCS4 bom = (q[1] << 8) | q[0];
6137
112
        if (bom == 0xFEFF) {
6138
23
            q += 2;
6139
23
            bo = -1;
6140
23
        }
6141
89
        else if (bom == 0xFFFE) {
6142
34
            q += 2;
6143
34
            bo = 1;
6144
34
        }
6145
112
        if (byteorder)
6146
16
            *byteorder = bo;
6147
112
    }
6148
6149
785
    if (q == e) {
6150
2
        if (consumed)
6151
0
            *consumed = size;
6152
2
        _Py_RETURN_UNICODE_EMPTY();
6153
2
    }
6154
6155
783
#if PY_LITTLE_ENDIAN
6156
783
    native_ordering = bo <= 0;
6157
783
    encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
6158
#else
6159
    native_ordering = bo >= 0;
6160
    encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
6161
#endif
6162
6163
    /* Note: size will always be longer than the resulting Unicode
6164
       character count normally.  Error handler will take care of
6165
       resizing when needed. */
6166
783
    _PyUnicodeWriter_Init(&writer);
6167
783
    writer.min_length = (e - q + 1) / 2;
6168
783
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
6169
0
        goto onError;
6170
6171
128k
    while (1) {
6172
128k
        Py_UCS4 ch = 0;
6173
128k
        if (e - q >= 2) {
6174
128k
            int kind = writer.kind;
6175
128k
            if (kind == PyUnicode_1BYTE_KIND) {
6176
998
                if (PyUnicode_IS_ASCII(writer.buffer))
6177
781
                    ch = asciilib_utf16_decode(&q, e,
6178
781
                            (Py_UCS1*)writer.data, &writer.pos,
6179
781
                            native_ordering);
6180
217
                else
6181
217
                    ch = ucs1lib_utf16_decode(&q, e,
6182
217
                            (Py_UCS1*)writer.data, &writer.pos,
6183
217
                            native_ordering);
6184
127k
            } else if (kind == PyUnicode_2BYTE_KIND) {
6185
21.0k
                ch = ucs2lib_utf16_decode(&q, e,
6186
21.0k
                        (Py_UCS2*)writer.data, &writer.pos,
6187
21.0k
                        native_ordering);
6188
106k
            } else {
6189
106k
                assert(kind == PyUnicode_4BYTE_KIND);
6190
106k
                ch = ucs4lib_utf16_decode(&q, e,
6191
106k
                        (Py_UCS4*)writer.data, &writer.pos,
6192
106k
                        native_ordering);
6193
106k
            }
6194
128k
        }
6195
6196
128k
        switch (ch)
6197
128k
        {
6198
768
        case 0:
6199
            /* remaining byte at the end? (size should be even) */
6200
768
            if (q == e || consumed)
6201
480
                goto End;
6202
288
            errmsg = "truncated data";
6203
288
            startinpos = ((const char *)q) - starts;
6204
288
            endinpos = ((const char *)e) - starts;
6205
288
            break;
6206
            /* The remaining input chars are ignored if the callback
6207
               chooses to skip the input */
6208
163
        case 1:
6209
163
            q -= 2;
6210
163
            if (consumed)
6211
0
                goto End;
6212
163
            errmsg = "unexpected end of data";
6213
163
            startinpos = ((const char *)q) - starts;
6214
163
            endinpos = ((const char *)e) - starts;
6215
163
            break;
6216
100k
        case 2:
6217
100k
            errmsg = "illegal encoding";
6218
100k
            startinpos = ((const char *)q) - 2 - starts;
6219
100k
            endinpos = startinpos + 2;
6220
100k
            break;
6221
26.3k
        case 3:
6222
26.3k
            errmsg = "illegal UTF-16 surrogate";
6223
26.3k
            startinpos = ((const char *)q) - 4 - starts;
6224
26.3k
            endinpos = startinpos + 2;
6225
26.3k
            break;
6226
857
        default:
6227
857
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
6228
0
                goto onError;
6229
857
            continue;
6230
128k
        }
6231
6232
127k
        if (unicode_decode_call_errorhandler_writer(
6233
127k
                errors,
6234
127k
                &errorHandler,
6235
127k
                encoding, errmsg,
6236
127k
                &starts,
6237
127k
                (const char **)&e,
6238
127k
                &startinpos,
6239
127k
                &endinpos,
6240
127k
                &exc,
6241
127k
                (const char **)&q,
6242
127k
                &writer))
6243
303
            goto onError;
6244
127k
    }
6245
6246
480
End:
6247
480
    if (consumed)
6248
0
        *consumed = (const char *)q-starts;
6249
6250
480
    Py_XDECREF(errorHandler);
6251
480
    Py_XDECREF(exc);
6252
480
    return _PyUnicodeWriter_Finish(&writer);
6253
6254
303
  onError:
6255
303
    _PyUnicodeWriter_Dealloc(&writer);
6256
303
    Py_XDECREF(errorHandler);
6257
303
    Py_XDECREF(exc);
6258
303
    return NULL;
6259
783
}
6260
6261
/* Encode the string object `str` as UTF-16 and return a bytes object.
 *
 * errors:    error handler name used for code points the encoder helpers
 *            refuse (reported as "surrogates not allowed").
 * byteorder: <0 little-endian, >0 big-endian, 0 native order with a leading
 *            BOM (U+FEFF) written first.
 *
 * Returns NULL on failure (including when `str` is not a str object).
 */
PyObject *
_PyUnicode_EncodeUTF16(PyObject *str,
                       const char *errors,
                       int byteorder)
{
    if (!PyUnicode_Check(str)) {
        PyErr_BadArgument();
        return NULL;
    }

    const int kind = PyUnicode_KIND(str);
    const void *data = PyUnicode_DATA(str);
    const Py_ssize_t len = PyUnicode_GET_LENGTH(str);
    const int with_bom = (byteorder == 0);

    /* Only 4-byte strings can hold code points above the BMP; each of those
       needs a surrogate pair, i.e. one extra 16-bit unit. */
    Py_ssize_t pairs = 0;
    if (kind == PyUnicode_4BYTE_KIND) {
        const Py_UCS4 *chars = (const Py_UCS4 *)data;
        for (Py_ssize_t i = 0; i < len; i++) {
            if (chars[i] >= 0x10000) {
                pairs++;
            }
        }
    }

    /* Guard the "units * 2" byte count below against overflow. */
    if (len > PY_SSIZE_T_MAX / 2 - pairs - with_bom) {
        return PyErr_NoMemory();
    }
    const Py_ssize_t nsize = len + pairs + with_bom;

#if PY_BIG_ENDIAN
    int native_ordering = byteorder >= 0;
#else
    int native_ordering = byteorder <= 0;
#endif

    if (kind == PyUnicode_1BYTE_KIND) {
        // gh-139156: Don't use PyBytesWriter API here since it has an overhead
        // on short strings
        PyObject *bytes = PyBytes_FromStringAndSize(NULL, nsize * 2);
        if (bytes == NULL) {
            return NULL;
        }

        /* output buffer is 2-bytes aligned */
        assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(bytes), 2));
        unsigned short *dst = (unsigned short *)PyBytes_AS_STRING(bytes);
        if (with_bom) {
            *dst++ = 0xFEFF;
        }
        if (len > 0) {
            ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &dst, native_ordering);
        }
        return bytes;
    }

    PyBytesWriter *writer = PyBytesWriter_Create(nsize * 2);
    if (writer == NULL) {
        return NULL;
    }

    /* output buffer is 2-bytes aligned */
    assert(_Py_IS_ALIGNED(PyBytesWriter_GetData(writer), 2));
    unsigned short *out = PyBytesWriter_GetData(writer);
    if (with_bom) {
        *out++ = 0xFEFF;
    }
    if (len == 0) {
        return PyBytesWriter_Finish(writer);
    }

    /* Codec name reported to the error handler. */
    const char *encoding = (byteorder < 0) ? "utf-16-le"
                         : (byteorder > 0) ? "utf-16-be"
                         : "utf-16";

    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *rep = NULL;

    Py_ssize_t pos = 0;
    while (pos < len) {
        /* Encode as far as the helper can go; it returns how many input
           characters it handled and stops early at one it rejects. */
        if (kind == PyUnicode_2BYTE_KIND) {
            pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        else {
            assert(kind == PyUnicode_4BYTE_KIND);
            pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        if (pos == len) {
            break;
        }

        /* Character at `pos` could not be encoded: ask the error handler. */
        Py_ssize_t newpos;
        rep = unicode_encode_call_errorhandler(
                errors, &errorHandler,
                encoding, "surrogates not allowed",
                str, &exc, pos, pos + 1, &newpos);
        if (rep == NULL) {
            goto error;
        }

        const int rep_is_bytes = PyBytes_Check(rep);
        Py_ssize_t repsize, moreunits;
        if (rep_is_bytes) {
            /* raw bytes are copied verbatim, so they must be whole units */
            repsize = PyBytes_GET_SIZE(rep);
            if (repsize & 1) {
                raise_encode_exception(&exc, encoding,
                                       str, pos, pos + 1,
                                       "surrogates not allowed");
                goto error;
            }
            moreunits = repsize / 2;
        }
        else {
            /* str replacements must be pure ASCII: one unit per character */
            assert(PyUnicode_Check(rep));
            repsize = PyUnicode_GET_LENGTH(rep);
            moreunits = repsize;
            if (!PyUnicode_IS_ASCII(rep)) {
                raise_encode_exception(&exc, encoding,
                                       str, pos, pos + 1,
                                       "surrogates not allowed");
                goto error;
            }
        }
        /* Units were already reserved for the characters the handler
           skipped (pos..newpos); only the excess needs new space. */
        moreunits += pos - newpos;
        pos = newpos;

        /* two bytes are reserved for each surrogate */
        if (moreunits > 0) {
            out = PyBytesWriter_GrowAndUpdatePointer(writer, 2 * moreunits, out);
            if (out == NULL) {
                goto error;
            }
        }

        if (rep_is_bytes) {
            memcpy(out, PyBytes_AS_STRING(rep), repsize);
            out += repsize / 2;
        }
        else {
            /* rep is unicode */
            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
            ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
                                 &out, native_ordering);
        }

        Py_CLEAR(rep);
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);

    /* Cut back to size actually needed. This is necessary for, for example,
       encoding of a string containing isolated surrogates and the 'ignore'
       handler is used. */
    return PyBytesWriter_FinishWithPointer(writer, out);

error:
    Py_XDECREF(rep);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    PyBytesWriter_Discard(writer);
    return NULL;
}
6426
6427
/* Encode `unicode` as UTF-16 by delegating to _PyUnicode_EncodeUTF16() with
   no explicit error handler (errors == NULL) and byteorder == 0. */
PyObject *
PyUnicode_AsUTF16String(PyObject *unicode)
{
    PyObject *encoded = _PyUnicode_EncodeUTF16(unicode, NULL, 0);
    return encoded;
}
6432
6433
_PyUnicode_Name_CAPI *
6434
_PyUnicode_GetNameCAPI(void)
6435
1.14k
{
6436
1.14k
    PyInterpreterState *interp = _PyInterpreterState_GET();
6437
1.14k
    _PyUnicode_Name_CAPI *ucnhash_capi;
6438
6439
1.14k
    ucnhash_capi = _Py_atomic_load_ptr(&interp->unicode.ucnhash_capi);
6440
1.14k
    if (ucnhash_capi == NULL) {
6441
2
        ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6442
2
                PyUnicodeData_CAPSULE_NAME, 1);
6443
6444
        // It's fine if we overwrite the value here. It's always the same value.
6445
2
        _Py_atomic_store_ptr(&interp->unicode.ucnhash_capi, ucnhash_capi);
6446
2
    }
6447
1.14k
    return ucnhash_capi;
6448
1.14k
}
6449
6450
/* --- Unicode Escape Codec ----------------------------------------------- */
6451
6452
PyObject *
6453
_PyUnicode_DecodeUnicodeEscapeInternal2(const char *s,
6454
                               Py_ssize_t size,
6455
                               const char *errors,
6456
                               Py_ssize_t *consumed,
6457
                               int *first_invalid_escape_char,
6458
                               const char **first_invalid_escape_ptr)
6459
15.7k
{
6460
15.7k
    const char *starts = s;
6461
15.7k
    const char *initial_starts = starts;
6462
15.7k
    _PyUnicodeWriter writer;
6463
15.7k
    const char *end;
6464
15.7k
    PyObject *errorHandler = NULL;
6465
15.7k
    PyObject *exc = NULL;
6466
15.7k
    _PyUnicode_Name_CAPI *ucnhash_capi;
6467
6468
    // so we can remember if we've seen an invalid escape char or not
6469
15.7k
    *first_invalid_escape_char = -1;
6470
15.7k
    *first_invalid_escape_ptr = NULL;
6471
6472
15.7k
    if (size == 0) {
6473
1.24k
        if (consumed) {
6474
0
            *consumed = 0;
6475
0
        }
6476
1.24k
        _Py_RETURN_UNICODE_EMPTY();
6477
1.24k
    }
6478
    /* Escaped strings will always be longer than the resulting
6479
       Unicode string, so we start with size here and then reduce the
6480
       length after conversion to the true value.
6481
       (but if the error callback returns a long replacement string
6482
       we'll have to allocate more space) */
6483
14.4k
    _PyUnicodeWriter_Init(&writer);
6484
14.4k
    writer.min_length = size;
6485
14.4k
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6486
0
        goto onError;
6487
0
    }
6488
6489
14.4k
    end = s + size;
6490
10.1M
    while (s < end) {
6491
10.1M
        unsigned char c = (unsigned char) *s++;
6492
10.1M
        Py_UCS4 ch;
6493
10.1M
        int count;
6494
10.1M
        const char *message;
6495
6496
10.1M
#define WRITE_ASCII_CHAR(ch)                                                  \
6497
10.1M
            do {                                                              \
6498
2.62M
                assert(ch <= 127);                                            \
6499
2.62M
                assert(writer.pos < writer.size);                             \
6500
2.62M
                PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch);  \
6501
2.62M
            } while(0)
6502
6503
10.1M
#define WRITE_CHAR(ch)                                                        \
6504
10.1M
            do {                                                              \
6505
7.85M
                if (ch <= writer.maxchar) {                                   \
6506
7.85M
                    assert(writer.pos < writer.size);                         \
6507
7.85M
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6508
7.85M
                }                                                             \
6509
7.85M
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6510
0
                    goto onError;                                             \
6511
0
                }                                                             \
6512
7.85M
            } while(0)
6513
6514
        /* Non-escape characters are interpreted as Unicode ordinals */
6515
10.1M
        if (c != '\\') {
6516
7.21M
            WRITE_CHAR(c);
6517
7.21M
            continue;
6518
7.21M
        }
6519
6520
2.88M
        Py_ssize_t startinpos = s - starts - 1;
6521
        /* \ - Escapes */
6522
2.88M
        if (s >= end) {
6523
0
            message = "\\ at end of string";
6524
0
            goto incomplete;
6525
0
        }
6526
2.88M
        c = (unsigned char) *s++;
6527
6528
2.88M
        assert(writer.pos < writer.size);
6529
2.88M
        switch (c) {
6530
6531
            /* \x escapes */
6532
2.79k
        case '\n': continue;
6533
246k
        case '\\': WRITE_ASCII_CHAR('\\'); continue;
6534
246k
        case '\'': WRITE_ASCII_CHAR('\''); continue;
6535
282k
        case '\"': WRITE_ASCII_CHAR('\"'); continue;
6536
282k
        case 'b': WRITE_ASCII_CHAR('\b'); continue;
6537
        /* FF */
6538
404k
        case 'f': WRITE_ASCII_CHAR('\014'); continue;
6539
404k
        case 't': WRITE_ASCII_CHAR('\t'); continue;
6540
680k
        case 'n': WRITE_ASCII_CHAR('\n'); continue;
6541
680k
        case 'r': WRITE_ASCII_CHAR('\r'); continue;
6542
        /* VT */
6543
386k
        case 'v': WRITE_ASCII_CHAR('\013'); continue;
6544
        /* BEL, not classic C */
6545
30.8k
        case 'a': WRITE_ASCII_CHAR('\007'); continue;
6546
6547
            /* \OOO (octal) escapes */
6548
75.6k
        case '0': case '1': case '2': case '3':
6549
88.6k
        case '4': case '5': case '6': case '7':
6550
88.6k
            ch = c - '0';
6551
88.6k
            if (s < end && '0' <= *s && *s <= '7') {
6552
10.1k
                ch = (ch<<3) + *s++ - '0';
6553
10.1k
                if (s < end && '0' <= *s && *s <= '7') {
6554
1.15k
                    ch = (ch<<3) + *s++ - '0';
6555
1.15k
                }
6556
10.1k
            }
6557
88.6k
            if (ch > 0377) {
6558
377
                if (*first_invalid_escape_char == -1) {
6559
49
                    *first_invalid_escape_char = ch;
6560
49
                    if (starts == initial_starts) {
6561
                        /* Back up 3 chars, since we've already incremented s. */
6562
49
                        *first_invalid_escape_ptr = s - 3;
6563
49
                    }
6564
49
                }
6565
377
            }
6566
88.6k
            WRITE_CHAR(ch);
6567
88.6k
            continue;
6568
6569
            /* hex escapes */
6570
            /* \xXX */
6571
88.6k
        case 'x':
6572
95
            count = 2;
6573
95
            message = "truncated \\xXX escape";
6574
95
            goto hexescape;
6575
6576
            /* \uXXXX */
6577
668
        case 'u':
6578
668
            count = 4;
6579
668
            message = "truncated \\uXXXX escape";
6580
668
            goto hexescape;
6581
6582
            /* \UXXXXXXXX */
6583
172k
        case 'U':
6584
172k
            count = 8;
6585
172k
            message = "truncated \\UXXXXXXXX escape";
6586
173k
        hexescape:
6587
1.55M
            for (ch = 0; count; ++s, --count) {
6588
1.38M
                if (s >= end) {
6589
2
                    goto incomplete;
6590
2
                }
6591
1.38M
                c = (unsigned char)*s;
6592
1.38M
                ch <<= 4;
6593
1.38M
                if (c >= '0' && c <= '9') {
6594
1.10M
                    ch += c - '0';
6595
1.10M
                }
6596
274k
                else if (c >= 'a' && c <= 'f') {
6597
274k
                    ch += c - ('a' - 10);
6598
274k
                }
6599
137
                else if (c >= 'A' && c <= 'F') {
6600
130
                    ch += c - ('A' - 10);
6601
130
                }
6602
7
                else {
6603
7
                    goto error;
6604
7
                }
6605
1.38M
            }
6606
6607
            /* when we get here, ch is a 32-bit unicode character */
6608
173k
            if (ch > MAX_UNICODE) {
6609
0
                message = "illegal Unicode character";
6610
0
                goto error;
6611
0
            }
6612
6613
173k
            WRITE_CHAR(ch);
6614
173k
            continue;
6615
6616
            /* \N{name} */
6617
173k
        case 'N':
6618
1.14k
            ucnhash_capi = _PyUnicode_GetNameCAPI();
6619
1.14k
            if (ucnhash_capi == NULL) {
6620
0
                PyErr_SetString(
6621
0
                        PyExc_UnicodeError,
6622
0
                        "\\N escapes not supported (can't load unicodedata module)"
6623
0
                );
6624
0
                goto onError;
6625
0
            }
6626
6627
1.14k
            message = "malformed \\N character escape";
6628
1.14k
            if (s >= end) {
6629
8
                goto incomplete;
6630
8
            }
6631
1.13k
            if (*s == '{') {
6632
1.12k
                const char *start = ++s;
6633
1.12k
                size_t namelen;
6634
                /* look for the closing brace */
6635
3.14M
                while (s < end && *s != '}')
6636
3.14M
                    s++;
6637
1.12k
                if (s >= end) {
6638
2
                    goto incomplete;
6639
2
                }
6640
1.12k
                namelen = s - start;
6641
1.12k
                if (namelen) {
6642
                    /* found a name.  look it up in the unicode database */
6643
1.12k
                    s++;
6644
1.12k
                    ch = 0xffffffff; /* in case 'getcode' messes up */
6645
1.12k
                    if (namelen <= INT_MAX &&
6646
1.12k
                        ucnhash_capi->getcode(start, (int)namelen,
6647
1.12k
                                              &ch, 0)) {
6648
1.09k
                        assert(ch <= MAX_UNICODE);
6649
1.09k
                        WRITE_CHAR(ch);
6650
1.09k
                        continue;
6651
1.09k
                    }
6652
34
                    message = "unknown Unicode character name";
6653
34
                }
6654
1.12k
            }
6655
41
            goto error;
6656
6657
377k
        default:
6658
377k
            if (*first_invalid_escape_char == -1) {
6659
9.18k
                *first_invalid_escape_char = c;
6660
9.18k
                if (starts == initial_starts) {
6661
                    /* Back up one char, since we've already incremented s. */
6662
9.18k
                    *first_invalid_escape_ptr = s - 1;
6663
9.18k
                }
6664
9.18k
            }
6665
377k
            WRITE_ASCII_CHAR('\\');
6666
377k
            WRITE_CHAR(c);
6667
377k
            continue;
6668
2.88M
        }
6669
6670
12
      incomplete:
6671
12
        if (consumed) {
6672
0
            *consumed = startinpos;
6673
0
            break;
6674
0
        }
6675
60
      error:;
6676
60
        Py_ssize_t endinpos = s-starts;
6677
60
        writer.min_length = end - s + writer.pos;
6678
60
        if (unicode_decode_call_errorhandler_writer(
6679
60
                errors, &errorHandler,
6680
60
                "unicodeescape", message,
6681
60
                &starts, &end, &startinpos, &endinpos, &exc, &s,
6682
60
                &writer)) {
6683
60
            goto onError;
6684
60
        }
6685
60
        assert(end - s <= writer.size - writer.pos);
6686
6687
0
#undef WRITE_ASCII_CHAR
6688
0
#undef WRITE_CHAR
6689
0
    }
6690
6691
14.4k
    Py_XDECREF(errorHandler);
6692
14.4k
    Py_XDECREF(exc);
6693
14.4k
    return _PyUnicodeWriter_Finish(&writer);
6694
6695
60
  onError:
6696
60
    _PyUnicodeWriter_Dealloc(&writer);
6697
60
    Py_XDECREF(errorHandler);
6698
60
    Py_XDECREF(exc);
6699
60
    return NULL;
6700
14.4k
}
6701
6702
PyObject *
6703
_PyUnicode_DecodeUnicodeEscapeStateful(const char *s,
6704
                              Py_ssize_t size,
6705
                              const char *errors,
6706
                              Py_ssize_t *consumed)
6707
0
{
6708
0
    int first_invalid_escape_char;
6709
0
    const char *first_invalid_escape_ptr;
6710
0
    PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal2(s, size, errors,
6711
0
                                                      consumed,
6712
0
                                                      &first_invalid_escape_char,
6713
0
                                                      &first_invalid_escape_ptr);
6714
0
    if (result == NULL)
6715
0
        return NULL;
6716
0
    if (first_invalid_escape_char != -1) {
6717
0
        if (first_invalid_escape_char > 0xff) {
6718
0
            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6719
0
                                 "\"\\%o\" is an invalid octal escape sequence. "
6720
0
                                 "Such sequences will not work in the future. ",
6721
0
                                 first_invalid_escape_char) < 0)
6722
0
            {
6723
0
                Py_DECREF(result);
6724
0
                return NULL;
6725
0
            }
6726
0
        }
6727
0
        else {
6728
0
            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6729
0
                                 "\"\\%c\" is an invalid escape sequence. "
6730
0
                                 "Such sequences will not work in the future. ",
6731
0
                                 first_invalid_escape_char) < 0)
6732
0
            {
6733
0
                Py_DECREF(result);
6734
0
                return NULL;
6735
0
            }
6736
0
        }
6737
0
    }
6738
0
    return result;
6739
0
}
6740
6741
PyObject *
6742
PyUnicode_DecodeUnicodeEscape(const char *s,
6743
                              Py_ssize_t size,
6744
                              const char *errors)
6745
0
{
6746
0
    return _PyUnicode_DecodeUnicodeEscapeStateful(s, size, errors, NULL);
6747
0
}
6748
6749
/* Return a Unicode-Escape string version of the Unicode object. */
6750
6751
PyObject *
6752
PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
6753
0
{
6754
0
    if (!PyUnicode_Check(unicode)) {
6755
0
        PyErr_BadArgument();
6756
0
        return NULL;
6757
0
    }
6758
6759
0
    Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
6760
0
    if (len == 0) {
6761
0
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
6762
0
    }
6763
0
    int kind = PyUnicode_KIND(unicode);
6764
0
    const void *data = PyUnicode_DATA(unicode);
6765
6766
    /* Initial allocation is based on the longest-possible character
6767
     * escape.
6768
     *
6769
     * For UCS1 strings it's '\xxx', 4 bytes per source character.
6770
     * For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6771
     * For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character. */
6772
0
    Py_ssize_t expandsize = kind * 2 + 2;
6773
0
    if (len > PY_SSIZE_T_MAX / expandsize) {
6774
0
        return PyErr_NoMemory();
6775
0
    }
6776
6777
0
    PyBytesWriter *writer = PyBytesWriter_Create(expandsize * len);
6778
0
    if (writer == NULL) {
6779
0
        return NULL;
6780
0
    }
6781
0
    char *p = PyBytesWriter_GetData(writer);
6782
6783
0
    for (Py_ssize_t i = 0; i < len; i++) {
6784
0
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6785
6786
        /* U+0000-U+00ff range */
6787
0
        if (ch < 0x100) {
6788
0
            if (ch >= ' ' && ch < 127) {
6789
0
                if (ch != '\\') {
6790
                    /* Copy printable US ASCII as-is */
6791
0
                    *p++ = (char) ch;
6792
0
                }
6793
                /* Escape backslashes */
6794
0
                else {
6795
0
                    *p++ = '\\';
6796
0
                    *p++ = '\\';
6797
0
                }
6798
0
            }
6799
6800
            /* Map special whitespace to '\t', \n', '\r' */
6801
0
            else if (ch == '\t') {
6802
0
                *p++ = '\\';
6803
0
                *p++ = 't';
6804
0
            }
6805
0
            else if (ch == '\n') {
6806
0
                *p++ = '\\';
6807
0
                *p++ = 'n';
6808
0
            }
6809
0
            else if (ch == '\r') {
6810
0
                *p++ = '\\';
6811
0
                *p++ = 'r';
6812
0
            }
6813
6814
            /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6815
0
            else {
6816
0
                *p++ = '\\';
6817
0
                *p++ = 'x';
6818
0
                *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6819
0
                *p++ = Py_hexdigits[ch & 0x000F];
6820
0
            }
6821
0
        }
6822
        /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
6823
0
        else if (ch < 0x10000) {
6824
0
            *p++ = '\\';
6825
0
            *p++ = 'u';
6826
0
            *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6827
0
            *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6828
0
            *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6829
0
            *p++ = Py_hexdigits[ch & 0x000F];
6830
0
        }
6831
        /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6832
0
        else {
6833
6834
            /* Make sure that the first two digits are zero */
6835
0
            assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6836
0
            *p++ = '\\';
6837
0
            *p++ = 'U';
6838
0
            *p++ = '0';
6839
0
            *p++ = '0';
6840
0
            *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6841
0
            *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6842
0
            *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6843
0
            *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6844
0
            *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6845
0
            *p++ = Py_hexdigits[ch & 0x0000000F];
6846
0
        }
6847
0
    }
6848
6849
0
    return PyBytesWriter_FinishWithPointer(writer, p);
6850
0
}
6851
6852
/* --- Raw Unicode Escape Codec ------------------------------------------- */
6853
6854
PyObject *
6855
_PyUnicode_DecodeRawUnicodeEscapeStateful(const char *s,
6856
                                          Py_ssize_t size,
6857
                                          const char *errors,
6858
                                          Py_ssize_t *consumed)
6859
0
{
6860
0
    const char *starts = s;
6861
0
    _PyUnicodeWriter writer;
6862
0
    const char *end;
6863
0
    PyObject *errorHandler = NULL;
6864
0
    PyObject *exc = NULL;
6865
6866
0
    if (size == 0) {
6867
0
        if (consumed) {
6868
0
            *consumed = 0;
6869
0
        }
6870
0
        _Py_RETURN_UNICODE_EMPTY();
6871
0
    }
6872
6873
    /* Escaped strings will always be longer than the resulting
6874
       Unicode string, so we start with size here and then reduce the
6875
       length after conversion to the true value. (But decoding error
6876
       handler might have to resize the string) */
6877
0
    _PyUnicodeWriter_Init(&writer);
6878
0
    writer.min_length = size;
6879
0
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6880
0
        goto onError;
6881
0
    }
6882
6883
0
    end = s + size;
6884
0
    while (s < end) {
6885
0
        unsigned char c = (unsigned char) *s++;
6886
0
        Py_UCS4 ch;
6887
0
        int count;
6888
0
        const char *message;
6889
6890
0
#define WRITE_CHAR(ch)                                                        \
6891
0
            do {                                                              \
6892
0
                if (ch <= writer.maxchar) {                                   \
6893
0
                    assert(writer.pos < writer.size);                         \
6894
0
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6895
0
                }                                                             \
6896
0
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6897
0
                    goto onError;                                             \
6898
0
                }                                                             \
6899
0
            } while(0)
6900
6901
        /* Non-escape characters are interpreted as Unicode ordinals */
6902
0
        if (c != '\\' || (s >= end && !consumed)) {
6903
0
            WRITE_CHAR(c);
6904
0
            continue;
6905
0
        }
6906
6907
0
        Py_ssize_t startinpos = s - starts - 1;
6908
        /* \ - Escapes */
6909
0
        if (s >= end) {
6910
0
            assert(consumed);
6911
            // Set message to silent compiler warning.
6912
            // Actually it is never used.
6913
0
            message = "\\ at end of string";
6914
0
            goto incomplete;
6915
0
        }
6916
6917
0
        c = (unsigned char) *s++;
6918
0
        if (c == 'u') {
6919
0
            count = 4;
6920
0
            message = "truncated \\uXXXX escape";
6921
0
        }
6922
0
        else if (c == 'U') {
6923
0
            count = 8;
6924
0
            message = "truncated \\UXXXXXXXX escape";
6925
0
        }
6926
0
        else {
6927
0
            assert(writer.pos < writer.size);
6928
0
            PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6929
0
            WRITE_CHAR(c);
6930
0
            continue;
6931
0
        }
6932
6933
        /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6934
0
        for (ch = 0; count; ++s, --count) {
6935
0
            if (s >= end) {
6936
0
                goto incomplete;
6937
0
            }
6938
0
            c = (unsigned char)*s;
6939
0
            ch <<= 4;
6940
0
            if (c >= '0' && c <= '9') {
6941
0
                ch += c - '0';
6942
0
            }
6943
0
            else if (c >= 'a' && c <= 'f') {
6944
0
                ch += c - ('a' - 10);
6945
0
            }
6946
0
            else if (c >= 'A' && c <= 'F') {
6947
0
                ch += c - ('A' - 10);
6948
0
            }
6949
0
            else {
6950
0
                goto error;
6951
0
            }
6952
0
        }
6953
0
        if (ch > MAX_UNICODE) {
6954
0
            message = "\\Uxxxxxxxx out of range";
6955
0
            goto error;
6956
0
        }
6957
0
        WRITE_CHAR(ch);
6958
0
        continue;
6959
6960
0
      incomplete:
6961
0
        if (consumed) {
6962
0
            *consumed = startinpos;
6963
0
            break;
6964
0
        }
6965
0
      error:;
6966
0
        Py_ssize_t endinpos = s-starts;
6967
0
        writer.min_length = end - s + writer.pos;
6968
0
        if (unicode_decode_call_errorhandler_writer(
6969
0
                errors, &errorHandler,
6970
0
                "rawunicodeescape", message,
6971
0
                &starts, &end, &startinpos, &endinpos, &exc, &s,
6972
0
                &writer)) {
6973
0
            goto onError;
6974
0
        }
6975
0
        assert(end - s <= writer.size - writer.pos);
6976
6977
0
#undef WRITE_CHAR
6978
0
    }
6979
0
    Py_XDECREF(errorHandler);
6980
0
    Py_XDECREF(exc);
6981
0
    return _PyUnicodeWriter_Finish(&writer);
6982
6983
0
  onError:
6984
0
    _PyUnicodeWriter_Dealloc(&writer);
6985
0
    Py_XDECREF(errorHandler);
6986
0
    Py_XDECREF(exc);
6987
0
    return NULL;
6988
0
}
6989
6990
PyObject *
6991
PyUnicode_DecodeRawUnicodeEscape(const char *s,
6992
                                 Py_ssize_t size,
6993
                                 const char *errors)
6994
0
{
6995
0
    return _PyUnicode_DecodeRawUnicodeEscapeStateful(s, size, errors, NULL);
6996
0
}
6997
6998
6999
/* Encode `unicode` with the raw-unicode-escape codec.

   Returns a new bytes object, or NULL with an exception set when `unicode`
   is not a str, the output size would overflow, or allocation fails.

   Code points below U+0100 are emitted as one byte, U+0100..U+FFFF as the
   six bytes "\uHHHH", and everything above as the ten bytes "\U00HHHHHH". */
PyObject *
PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(unicode);
    const int kind = PyUnicode_KIND(unicode);
    const void *src = PyUnicode_DATA(unicode);

    if (nchars == 0) {
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
    }
    if (kind == PyUnicode_1BYTE_KIND) {
        /* Nothing needs escaping: the storage is already the encoded form. */
        return PyBytes_FromStringAndSize(src, nchars);
    }

    /* Worst-case output bytes per input character: 6 for the 2-byte kind
       ("\uHHHH"), 10 for the 4-byte kind ("\U00HHHHHH"). */
    const Py_ssize_t per_char = 2 * kind + 2;
    if (nchars > PY_SSIZE_T_MAX / per_char) {
        return PyErr_NoMemory();
    }

    PyBytesWriter *writer = PyBytesWriter_Create(per_char * nchars);
    if (writer == NULL) {
        return NULL;
    }
    char *out = PyBytesWriter_GetData(writer);

    Py_ssize_t i = 0;
    while (i < nchars) {
        const Py_UCS4 cp = PyUnicode_READ(kind, src, i);
        i++;

        if (cp >= 0x10000) {
            /* U+010000-U+10ffff: '\U00HHHHHH' (six significant hex digits) */
            assert(cp <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
            *out++ = '\\';
            *out++ = 'U';
            *out++ = '0';
            *out++ = '0';
            for (int shift = 20; shift >= 0; shift -= 4) {
                *out++ = Py_hexdigits[(cp >> shift) & 0xf];
            }
        }
        else if (cp >= 0x100) {
            /* U+0100-U+ffff: '\uHHHH' */
            *out++ = '\\';
            *out++ = 'u';
            for (int shift = 12; shift >= 0; shift -= 4) {
                *out++ = Py_hexdigits[(cp >> shift) & 0xf];
            }
        }
        else {
            /* U+0000-U+00ff: copied through unchanged */
            *out++ = (char)cp;
        }
    }

    return PyBytesWriter_FinishWithPointer(writer, out);
}
7063
7064
/* --- Latin-1 Codec ------------------------------------------------------ */
7065
7066
PyObject *
7067
PyUnicode_DecodeLatin1(const char *s,
7068
                       Py_ssize_t size,
7069
                       const char *errors)
7070
5.13k
{
7071
    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
7072
5.13k
    return _PyUnicode_FromUCS1((const unsigned char*)s, size);
7073
5.13k
}
7074
7075
/* create or adjust a UnicodeEncodeError */
7076
static void
7077
make_encode_exception(PyObject **exceptionObject,
7078
                      const char *encoding,
7079
                      PyObject *unicode,
7080
                      Py_ssize_t startpos, Py_ssize_t endpos,
7081
                      const char *reason)
7082
2.34k
{
7083
2.34k
    if (*exceptionObject == NULL) {
7084
2.34k
        *exceptionObject = PyObject_CallFunction(
7085
2.34k
            PyExc_UnicodeEncodeError, "sOnns",
7086
2.34k
            encoding, unicode, startpos, endpos, reason);
7087
2.34k
    }
7088
0
    else {
7089
0
        if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
7090
0
            goto onError;
7091
0
        if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
7092
0
            goto onError;
7093
0
        if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
7094
0
            goto onError;
7095
0
        return;
7096
0
      onError:
7097
0
        Py_CLEAR(*exceptionObject);
7098
0
    }
7099
2.34k
}
7100
7101
/* raises a UnicodeEncodeError */
7102
static void
7103
raise_encode_exception(PyObject **exceptionObject,
7104
                       const char *encoding,
7105
                       PyObject *unicode,
7106
                       Py_ssize_t startpos, Py_ssize_t endpos,
7107
                       const char *reason)
7108
2.33k
{
7109
2.33k
    make_encode_exception(exceptionObject,
7110
2.33k
                          encoding, unicode, startpos, endpos, reason);
7111
2.33k
    if (*exceptionObject != NULL)
7112
2.33k
        PyCodec_StrictErrors(*exceptionObject);
7113
2.33k
}
7114
7115
/* error handling callback helper:
7116
   build arguments, call the callback and check the arguments,
7117
   put the result into newpos and return the replacement string, which
7118
   has to be freed by the caller */
7119
static PyObject *
7120
unicode_encode_call_errorhandler(const char *errors,
7121
                                 PyObject **errorHandler,
7122
                                 const char *encoding, const char *reason,
7123
                                 PyObject *unicode, PyObject **exceptionObject,
7124
                                 Py_ssize_t startpos, Py_ssize_t endpos,
7125
                                 Py_ssize_t *newpos)
7126
7
{
7127
7
    static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
7128
7
    Py_ssize_t len;
7129
7
    PyObject *restuple;
7130
7
    PyObject *resunicode;
7131
7132
7
    if (*errorHandler == NULL) {
7133
7
        *errorHandler = PyCodec_LookupError(errors);
7134
7
        if (*errorHandler == NULL)
7135
0
            return NULL;
7136
7
    }
7137
7138
7
    len = PyUnicode_GET_LENGTH(unicode);
7139
7140
7
    make_encode_exception(exceptionObject,
7141
7
                          encoding, unicode, startpos, endpos, reason);
7142
7
    if (*exceptionObject == NULL)
7143
0
        return NULL;
7144
7145
7
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
7146
7
    if (restuple == NULL)
7147
7
        return NULL;
7148
0
    if (!PyTuple_Check(restuple)) {
7149
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
7150
0
        Py_DECREF(restuple);
7151
0
        return NULL;
7152
0
    }
7153
0
    if (!PyArg_ParseTuple(restuple, argparse,
7154
0
                          &resunicode, newpos)) {
7155
0
        Py_DECREF(restuple);
7156
0
        return NULL;
7157
0
    }
7158
0
    if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
7159
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
7160
0
        Py_DECREF(restuple);
7161
0
        return NULL;
7162
0
    }
7163
0
    if (*newpos<0)
7164
0
        *newpos = len + *newpos;
7165
0
    if (*newpos<0 || *newpos>len) {
7166
0
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7167
0
        Py_DECREF(restuple);
7168
0
        return NULL;
7169
0
    }
7170
0
    Py_INCREF(resunicode);
7171
0
    Py_DECREF(restuple);
7172
0
    return resunicode;
7173
0
}
7174
7175
/* Encode `unicode` into a bytes object in which every character must be
   below `limit`.

   `limit` == 256 selects the "latin-1" encoding name and error reason; any
   other value selects "ascii" (the callers visible in this file pass 256
   and 128).

   Characters >= limit are collected into maximal runs and handled according
   to `errors`:
     - strict:            raise UnicodeEncodeError
     - replace:           one '?' per unencodable character
     - ignore:            drop the run
     - backslashreplace / xmlcharrefreplace: delegate to the helper of the
                          same name
     - surrogateescape:   U+DC80..U+DCFF become the bytes 0x80..0xFF; anything
                          else in the run falls through to the generic path
     - anything else:     call the registered error handler and splice in
                          its str/bytes replacement

   Returns a new bytes object, or NULL with an exception set. */
static PyObject *
7176
unicode_encode_ucs1(PyObject *unicode,
7177
                    const char *errors,
7178
                    const Py_UCS4 limit)
7179
2.58k
{
7180
    /* input state */
7181
2.58k
    Py_ssize_t pos=0, size;
7182
2.58k
    int kind;
7183
2.58k
    const void *data;
7184
2.58k
    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
7185
2.58k
    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
7186
2.58k
    PyObject *error_handler_obj = NULL;
7187
2.58k
    PyObject *exc = NULL;
7188
2.58k
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
7189
2.58k
    PyObject *rep = NULL;
7190
7191
2.58k
    size = PyUnicode_GET_LENGTH(unicode);
7192
2.58k
    kind = PyUnicode_KIND(unicode);
7193
2.58k
    data = PyUnicode_DATA(unicode);
7194
    /* allocate enough for a simple encoding without
7195
       replacements, if we need more, we'll resize */
7196
2.58k
    if (size == 0)
7197
0
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
7198
7199
    /* output object */
7200
2.58k
    PyBytesWriter *writer = PyBytesWriter_Create(size);
7201
2.58k
    if (writer == NULL) {
7202
0
        return NULL;
7203
0
    }
7204
    /* pointer into the output */
7205
2.58k
    char *str = PyBytesWriter_GetData(writer);
7206
7207
1.90M
    while (pos < size) {
7208
1.90M
        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
7209
7210
        /* can we encode this? */
7211
1.90M
        if (ch < limit) {
7212
            /* no overflow check, because we know that the space is enough */
7213
1.89M
            *str++ = (char)ch;
7214
1.89M
            ++pos;
7215
1.89M
        }
7216
9.41k
        else {
7217
9.41k
            Py_ssize_t newpos, i;
7218
            /* startpos for collecting unencodable chars */
7219
9.41k
            Py_ssize_t collstart = pos;
7220
9.41k
            Py_ssize_t collend = collstart + 1;
7221
            /* find all unencodable characters: extend [collstart, collend)
               over the whole run of characters that are >= limit */
7222
7223
181k
            while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
7224
171k
                ++collend;
7225
7226
            /* Only overallocate the buffer if it's not the last write */
7227
9.41k
            writer->overallocate = (collend < size);
7228
7229
            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
7230
9.41k
            if (error_handler == _Py_ERROR_UNKNOWN)
7231
2.58k
                error_handler = _Py_GetErrorHandler(errors);
7232
7233
9.41k
            switch (error_handler) {
7234
2.33k
            case _Py_ERROR_STRICT:
7235
2.33k
                raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
7236
2.33k
                goto onError;
7237
7238
0
            case _Py_ERROR_REPLACE:
                /* one '?' per unencodable character; the writer was created
                   with one byte per input character, so no resize is
                   requested here */
7239
0
                memset(str, '?', collend - collstart);
7240
0
                str += (collend - collstart);
7241
0
                _Py_FALLTHROUGH;
7242
0
            case _Py_ERROR_IGNORE:
7243
0
                pos = collend;
7244
0
                break;
7245
7246
7.07k
            case _Py_ERROR_BACKSLASHREPLACE:
7247
                /* subtract preallocated bytes */
7248
7.07k
                writer->size -= (collend - collstart);
7249
7.07k
                str = backslashreplace(writer, str,
7250
7.07k
                                       unicode, collstart, collend);
7251
7.07k
                if (str == NULL)
7252
0
                    goto onError;
7253
7.07k
                pos = collend;
7254
7.07k
                break;
7255
7256
0
            case _Py_ERROR_XMLCHARREFREPLACE:
7257
                /* subtract preallocated bytes */
7258
0
                writer->size -= (collend - collstart);
7259
0
                str = xmlcharrefreplace(writer, str,
7260
0
                                        unicode, collstart, collend);
7261
0
                if (str == NULL)
7262
0
                    goto onError;
7263
0
                pos = collend;
7264
0
                break;
7265
7266
0
            case _Py_ERROR_SURROGATEESCAPE:
                /* each U+DC80..U+DCFF in the run is written as the single
                   byte ch - 0xDC00; stop at the first character that is not
                   in that range */
7267
0
                for (i = collstart; i < collend; ++i) {
7268
0
                    ch = PyUnicode_READ(kind, data, i);
7269
0
                    if (ch < 0xdc80 || 0xdcff < ch) {
7270
                        /* Not a UTF-8b surrogate */
7271
0
                        break;
7272
0
                    }
7273
0
                    *str++ = (char)(ch - 0xdc00);
7274
0
                    ++pos;
7275
0
                }
7276
0
                if (i >= collend)
7277
0
                    break;
                /* part of the run could not be escaped: report only the
                   remainder, starting at the current position, through the
                   generic handler path below */
7278
0
                collstart = pos;
7279
0
                assert(collstart != collend);
7280
0
                _Py_FALLTHROUGH;
7281
7282
0
            default:
7283
0
                rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
7284
0
                                                       encoding, reason, unicode, &exc,
7285
0
                                                       collstart, collend, &newpos);
7286
0
                if (rep == NULL)
7287
0
                    goto onError;
7288
7289
0
                if (newpos < collstart) {
                    /* the handler asked to resume before the failing run, so
                       the loop will visit [newpos, collstart) again: request
                       that many extra bytes */
7290
0
                    writer->overallocate = 1;
7291
0
                    str = PyBytesWriter_GrowAndUpdatePointer(writer,
7292
0
                                                             collstart - newpos,
7293
0
                                                             str);
7294
0
                    if (str == NULL) {
7295
0
                        goto onError;
7296
0
                    }
7297
0
                }
7298
0
                else {
7299
                    /* subtract preallocated bytes */
7300
0
                    writer->size -= newpos - collstart;
7301
                    /* Only overallocate the buffer if it's not the last write */
7302
0
                    writer->overallocate = (newpos < size);
7303
0
                }
7304
7305
0
                char *rep_str;
7306
0
                Py_ssize_t rep_len;
7307
0
                if (PyBytes_Check(rep)) {
7308
                    /* Directly copy bytes result to output. */
7309
0
                    rep_str = PyBytes_AS_STRING(rep);
7310
0
                    rep_len = PyBytes_GET_SIZE(rep);
7311
0
                }
7312
0
                else {
7313
0
                    assert(PyUnicode_Check(rep));
7314
7315
                    /* a str replacement must itself be encodable: 1-byte kind
                       for limit 256, pure ASCII otherwise */
0
                    if (limit == 256 ?
7316
0
                        PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
7317
0
                        !PyUnicode_IS_ASCII(rep))
7318
0
                    {
7319
                        /* Not all characters are smaller than limit */
7320
0
                        raise_encode_exception(&exc, encoding, unicode,
7321
0
                                               collstart, collend, reason);
7322
0
                        goto onError;
7323
0
                    }
7324
0
                    assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
7325
0
                    rep_str = PyUnicode_DATA(rep);
7326
0
                    rep_len = PyUnicode_GET_LENGTH(rep);
7327
0
                }
7328
7329
0
                str = PyBytesWriter_GrowAndUpdatePointer(writer, rep_len, str);
7330
0
                if (str == NULL) {
7331
0
                    goto onError;
7332
0
                }
7333
0
                memcpy(str, rep_str, rep_len);
7334
0
                str += rep_len;
7335
7336
0
                pos = newpos;
7337
0
                Py_CLEAR(rep);
7338
9.41k
            }
7339
7340
            /* If overallocation was disabled, ensure that it was the last
7341
               write. Otherwise, we missed an optimization */
7342
9.41k
            assert(writer->overallocate || pos == size);
7343
7.07k
        }
7344
1.90M
    }
7345
7346
251
    Py_XDECREF(error_handler_obj);
7347
251
    Py_XDECREF(exc);
7348
251
    return PyBytesWriter_FinishWithPointer(writer, str);
7349
7350
2.33k
  onError:
    /* `rep` is non-NULL only when the failure happened after the error
       handler had returned a replacement */
7351
2.33k
    Py_XDECREF(rep);
7352
2.33k
    PyBytesWriter_Discard(writer);
7353
2.33k
    Py_XDECREF(error_handler_obj);
7354
2.33k
    Py_XDECREF(exc);
7355
2.33k
    return NULL;
7356
2.58k
}
7357
7358
/* Encode `unicode` as Latin-1 using the error handler named by `errors`.

   Returns a new bytes object, or NULL with an exception set (including a
   bad-argument error when `unicode` is not a str). */
PyObject *
_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND) {
        /* The string is not stored one byte per character: let the generic
           encoder deal with it and run the error handler where needed. */
        return unicode_encode_ucs1(unicode, errors, 256);
    }

    /* Fast path: one-byte storage is already the Latin-1 encoding. */
    const char *raw = PyUnicode_DATA(unicode);
    const Py_ssize_t nbytes = PyUnicode_GET_LENGTH(unicode);
    return PyBytes_FromStringAndSize(raw, nbytes);
}
7374
7375
/* Encode `unicode` as Latin-1, passing NULL as the error-handler name. */
PyObject*
PyUnicode_AsLatin1String(PyObject *unicode)
{
    const char *errors = NULL;

    return _PyUnicode_AsLatin1String(unicode, errors);
}
7380
7381
/* --- 7-bit ASCII Codec -------------------------------------------------- */
7382
7383
PyObject *
7384
PyUnicode_DecodeASCII(const char *s,
7385
                      Py_ssize_t size,
7386
                      const char *errors)
7387
11.1k
{
7388
11.1k
    const char *starts = s;
7389
11.1k
    const char *e = s + size;
7390
11.1k
    PyObject *error_handler_obj = NULL;
7391
11.1k
    PyObject *exc = NULL;
7392
11.1k
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
7393
7394
11.1k
    if (size == 0)
7395
9
        _Py_RETURN_UNICODE_EMPTY();
7396
7397
    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
7398
11.0k
    if (size == 1 && (unsigned char)s[0] < 128) {
7399
368
        return get_latin1_char((unsigned char)s[0]);
7400
368
    }
7401
7402
    // Shortcut for simple case
7403
10.7k
    PyObject *u = PyUnicode_New(size, 127);
7404
10.7k
    if (u == NULL) {
7405
0
        return NULL;
7406
0
    }
7407
10.7k
    Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
7408
10.7k
    if (outpos == size) {
7409
10.4k
        return u;
7410
10.4k
    }
7411
7412
284
    _PyUnicodeWriter writer;
7413
284
    _PyUnicodeWriter_InitWithBuffer(&writer, u);
7414
284
    writer.pos = outpos;
7415
7416
284
    s += outpos;
7417
284
    int kind = writer.kind;
7418
284
    void *data = writer.data;
7419
284
    Py_ssize_t startinpos, endinpos;
7420
7421
7.51M
    while (s < e) {
7422
7.51M
        unsigned char c = (unsigned char)*s;
7423
7.51M
        if (c < 128) {
7424
7.48M
            PyUnicode_WRITE(kind, data, writer.pos, c);
7425
7.48M
            writer.pos++;
7426
7.48M
            ++s;
7427
7.48M
            continue;
7428
7.48M
        }
7429
7430
        /* byte outsize range 0x00..0x7f: call the error handler */
7431
7432
35.2k
        if (error_handler == _Py_ERROR_UNKNOWN)
7433
284
            error_handler = _Py_GetErrorHandler(errors);
7434
7435
35.2k
        switch (error_handler)
7436
35.2k
        {
7437
1.02k
        case _Py_ERROR_REPLACE:
7438
35.0k
        case _Py_ERROR_SURROGATEESCAPE:
7439
            /* Fast-path: the error handler only writes one character,
7440
               but we may switch to UCS2 at the first write */
7441
35.0k
            if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7442
0
                goto onError;
7443
35.0k
            kind = writer.kind;
7444
35.0k
            data = writer.data;
7445
7446
35.0k
            if (error_handler == _Py_ERROR_REPLACE)
7447
1.02k
                PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7448
34.0k
            else
7449
34.0k
                PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7450
35.0k
            writer.pos++;
7451
35.0k
            ++s;
7452
35.0k
            break;
7453
7454
0
        case _Py_ERROR_IGNORE:
7455
0
            ++s;
7456
0
            break;
7457
7458
107
        default:
7459
107
            startinpos = s-starts;
7460
107
            endinpos = startinpos + 1;
7461
107
            if (unicode_decode_call_errorhandler_writer(
7462
107
                    errors, &error_handler_obj,
7463
107
                    "ascii", "ordinal not in range(128)",
7464
107
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
7465
107
                    &writer))
7466
107
                goto onError;
7467
0
            kind = writer.kind;
7468
0
            data = writer.data;
7469
35.2k
        }
7470
35.2k
    }
7471
177
    Py_XDECREF(error_handler_obj);
7472
177
    Py_XDECREF(exc);
7473
177
    return _PyUnicodeWriter_Finish(&writer);
7474
7475
107
  onError:
7476
107
    _PyUnicodeWriter_Dealloc(&writer);
7477
107
    Py_XDECREF(error_handler_obj);
7478
107
    Py_XDECREF(exc);
7479
107
    return NULL;
7480
284
}
7481
7482
/* Encode `unicode` as ASCII using the error handler named by `errors`.

   Returns a new bytes object, or NULL with an exception set (including a
   bad-argument error when `unicode` is not a str). */
PyObject *
_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (!PyUnicode_IS_ASCII(unicode)) {
        /* At least one character is not ASCII: the generic encoder applies
           the error handler. */
        return unicode_encode_ucs1(unicode, errors, 128);
    }

    /* Fast path: an ASCII-only string's storage is already the encoding. */
    const char *raw = PyUnicode_DATA(unicode);
    const Py_ssize_t nbytes = PyUnicode_GET_LENGTH(unicode);
    return PyBytes_FromStringAndSize(raw, nbytes);
}
7496
7497
/* Encode `unicode` as ASCII, passing NULL as the error-handler name. */
PyObject *
PyUnicode_AsASCIIString(PyObject *unicode)
{
    const char *errors = NULL;

    return _PyUnicode_AsASCIIString(unicode, errors);
}
7502
7503
#ifdef MS_WINDOWS
7504
7505
/* --- MBCS codecs for Windows -------------------------------------------- */
7506
7507
#if SIZEOF_INT < SIZEOF_SIZE_T
7508
#define NEED_RETRY
7509
#endif
7510
7511
/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
7512
   transcoding from UTF-16), but INT_MAX / 4 performs better in
7513
   both cases and also avoids partial characters overrunning the
7514
   length limit in MultiByteToWideChar on Windows */
7515
#define DECODING_CHUNK_SIZE (INT_MAX/4)
7516
7517
#ifndef WC_ERR_INVALID_CHARS
7518
#  define WC_ERR_INVALID_CHARS 0x0080
7519
#endif
7520
7521
static const char*
7522
code_page_name(UINT code_page, PyObject **obj)
7523
{
7524
    *obj = NULL;
7525
    if (code_page == CP_ACP)
7526
        return "mbcs";
7527
7528
    *obj = PyBytes_FromFormat("cp%u", code_page);
7529
    if (*obj == NULL)
7530
        return NULL;
7531
    return PyBytes_AS_STRING(*obj);
7532
}
7533
7534
static DWORD
7535
decode_code_page_flags(UINT code_page)
7536
{
7537
    if (code_page == CP_UTF7) {
7538
        /* The CP_UTF7 decoder only supports flags=0 */
7539
        return 0;
7540
    }
7541
    else
7542
        return MB_ERR_INVALID_CHARS;
7543
}
7544
7545
/*
7546
 * Decode a byte string from a Windows code page into unicode object in strict
7547
 * mode.
7548
 *
7549
 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7550
 * OSError and returns -1 on other error.
7551
 */
7552
static int
7553
decode_code_page_strict(UINT code_page,
7554
                        wchar_t **buf,
7555
                        Py_ssize_t *bufsize,
7556
                        const char *in,
7557
                        int insize)
7558
{
7559
    DWORD flags = MB_ERR_INVALID_CHARS;
7560
    wchar_t *out;
7561
    DWORD outsize;
7562
7563
    /* First get the size of the result */
7564
    assert(insize > 0);
7565
    while ((outsize = MultiByteToWideChar(code_page, flags,
7566
                                          in, insize, NULL, 0)) <= 0)
7567
    {
7568
        if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7569
            goto error;
7570
        }
7571
        /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7572
        flags = 0;
7573
    }
7574
7575
    /* Extend a wchar_t* buffer */
7576
    Py_ssize_t n = *bufsize;   /* Get the current length */
7577
    if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7578
        return -1;
7579
    }
7580
    out = *buf + n;
7581
7582
    /* Do the conversion */
7583
    outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7584
    if (outsize <= 0)
7585
        goto error;
7586
    return insize;
7587
7588
error:
7589
    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7590
        return -2;
7591
    PyErr_SetFromWindowsErr(0);
7592
    return -1;
7593
}
7594
7595
/*
7596
 * Decode a byte string from a code page into a wchar_t buffer, calling the
7597
 * error handler named by `errors` for bytes that cannot be decoded.
7598
 *
7599
 * Returns the number of bytes consumed on success, or raises an OSError or
7600
 * UnicodeDecodeError exception and returns -1 on error.
7601
 */
7602
static int
7603
decode_code_page_errors(UINT code_page,
7604
                        wchar_t **buf,
7605
                        Py_ssize_t *bufsize,
7606
                        const char *in, const int size,
7607
                        const char *errors, int final)
7608
{
7609
    const char *startin = in;
7610
    const char *endin = in + size;
7611
    DWORD flags = MB_ERR_INVALID_CHARS;
7612
    /* Ideally, we should get reason from FormatMessage. This is the Windows
7613
       2000 English version of the message. */
7614
    const char *reason = "No mapping for the Unicode character exists "
7615
                         "in the target code page.";
7616
    /* each step cannot decode more than 1 character, but a character can be
7617
       represented as a surrogate pair */
7618
    wchar_t buffer[2], *out;
7619
    int insize;
7620
    Py_ssize_t outsize;
7621
    PyObject *errorHandler = NULL;
7622
    PyObject *exc = NULL;
7623
    PyObject *encoding_obj = NULL;
7624
    const char *encoding;
7625
    DWORD err;
    /* stays -1 on every `goto error` path; set to the consumed byte count
       only when the decode loop finishes */
7626
    int ret = -1;
7627
7628
    assert(size > 0);
7629
7630
    encoding = code_page_name(code_page, &encoding_obj);
7631
    if (encoding == NULL)
7632
        return -1;
7633
7634
    if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
7635
        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7636
           UnicodeDecodeError. */
7637
        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7638
        if (exc != NULL) {
7639
            PyCodec_StrictErrors(exc);
7640
            Py_CLEAR(exc);
7641
        }
7642
        goto error;
7643
    }
7644
7645
    /* Extend a wchar_t* buffer: reserve Py_ARRAY_LENGTH(buffer) (i.e. 2)
       wide characters per input byte, after checking that the product
       cannot overflow Py_ssize_t */
7646
    Py_ssize_t n = *bufsize;   /* Get the current length */
7647
    if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7648
        PyErr_NoMemory();
7649
        goto error;
7650
    }
7651
    if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7652
        goto error;
7653
    }
7654
    out = *buf + n;
7655
7656
    /* Decode the byte string character per character */
7657
    while (in < endin)
7658
    {
7659
        /* Decode a character: try a 1-byte prefix first and lengthen it by
           one byte after each failed attempt */
7660
        insize = 1;
7661
        do
7662
        {
7663
            outsize = MultiByteToWideChar(code_page, flags,
7664
                                          in, insize,
7665
                                          buffer, Py_ARRAY_LENGTH(buffer));
7666
            if (outsize > 0)
7667
                break;
7668
            err = GetLastError();
7669
            if (err == ERROR_INVALID_FLAGS && flags) {
7670
                /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7671
                flags = 0;
                /* retry with the same insize (continue jumps to the loop
                   condition, which still holds) */
7672
                continue;
7673
            }
7674
            if (err != ERROR_NO_UNICODE_TRANSLATION
7675
                && err != ERROR_INSUFFICIENT_BUFFER)
7676
            {
7677
                PyErr_SetFromWindowsErr(err);
7678
                goto error;
7679
            }
7680
            insize++;
7681
        }
7682
        /* 4=maximum length of a UTF-8 sequence */
7683
        while (insize <= 4 && (in + insize) <= endin);
7684
7685
        if (outsize <= 0) {
7686
            Py_ssize_t startinpos, endinpos, outpos;
7687
7688
            /* last character in partial decode? Stop without an error and
               leave the remaining bytes unconsumed */
7689
            if (in + insize >= endin && !final)
7690
                break;
7691
7692
            /* report a single undecodable byte at the current position */
            startinpos = in - startin;
7693
            endinpos = startinpos + 1;
7694
            outpos = out - *buf;
7695
            if (unicode_decode_call_errorhandler_wchar(
7696
                    errors, &errorHandler,
7697
                    encoding, reason,
7698
                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
7699
                    buf, bufsize, &outpos))
7700
            {
7701
                goto error;
7702
            }
            /* recompute the output pointer from *buf and the updated
               outpos */
7703
            out = *buf + outpos;
7704
        }
7705
        else {
7706
            in += insize;
7707
            memcpy(out, buffer, outsize * sizeof(wchar_t));
7708
            out += outsize;
7709
        }
7710
    }
7711
7712
    /* Shrink the buffer */
7713
    assert(out - *buf <= *bufsize);
7714
    *bufsize = out - *buf;
7715
    /* (in - startin) <= size and size is an int */
7716
    ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
7717
7718
error:
    /* shared exit for success and failure: release the temporaries */
7719
    Py_XDECREF(encoding_obj);
7720
    Py_XDECREF(errorHandler);
7721
    Py_XDECREF(exc);
7722
    return ret;
7723
}
7724
7725
static PyObject *
7726
decode_code_page_stateful(int code_page,
7727
                          const char *s, Py_ssize_t size,
7728
                          const char *errors, Py_ssize_t *consumed)
7729
{
7730
    wchar_t *buf = NULL;
7731
    Py_ssize_t bufsize = 0;
7732
    int chunk_size, final, converted, done;
7733
7734
    if (code_page < 0) {
7735
        PyErr_SetString(PyExc_ValueError, "invalid code page number");
7736
        return NULL;
7737
    }
7738
    if (size < 0) {
7739
        PyErr_BadInternalCall();
7740
        return NULL;
7741
    }
7742
7743
    if (consumed)
7744
        *consumed = 0;
7745
7746
    do
7747
    {
7748
#ifdef NEED_RETRY
7749
        if (size > DECODING_CHUNK_SIZE) {
7750
            chunk_size = DECODING_CHUNK_SIZE;
7751
            final = 0;
7752
            done = 0;
7753
        }
7754
        else
7755
#endif
7756
        {
7757
            chunk_size = (int)size;
7758
            final = (consumed == NULL);
7759
            done = 1;
7760
        }
7761
7762
        if (chunk_size == 0 && done) {
7763
            if (buf != NULL)
7764
                break;
7765
            _Py_RETURN_UNICODE_EMPTY();
7766
        }
7767
7768
        converted = decode_code_page_strict(code_page, &buf, &bufsize,
7769
                                            s, chunk_size);
7770
        if (converted == -2)
7771
            converted = decode_code_page_errors(code_page, &buf, &bufsize,
7772
                                                s, chunk_size,
7773
                                                errors, final);
7774
        assert(converted != 0 || done);
7775
7776
        if (converted < 0) {
7777
            PyMem_Free(buf);
7778
            return NULL;
7779
        }
7780
7781
        if (consumed)
7782
            *consumed += converted;
7783
7784
        s += converted;
7785
        size -= converted;
7786
    } while (!done);
7787
7788
    PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7789
    PyMem_Free(buf);
7790
    return v;
7791
}
7792
7793
PyObject *
7794
PyUnicode_DecodeCodePageStateful(int code_page,
7795
                                 const char *s,
7796
                                 Py_ssize_t size,
7797
                                 const char *errors,
7798
                                 Py_ssize_t *consumed)
7799
{
7800
    return decode_code_page_stateful(code_page, s, size, errors, consumed);
7801
}
7802
7803
PyObject *
7804
PyUnicode_DecodeMBCSStateful(const char *s,
7805
                             Py_ssize_t size,
7806
                             const char *errors,
7807
                             Py_ssize_t *consumed)
7808
{
7809
    return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7810
}
7811
7812
PyObject *
7813
PyUnicode_DecodeMBCS(const char *s,
7814
                     Py_ssize_t size,
7815
                     const char *errors)
7816
{
7817
    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7818
}
7819
7820
static DWORD
7821
encode_code_page_flags(UINT code_page, const char *errors)
7822
{
7823
    if (code_page == CP_UTF8) {
7824
        return WC_ERR_INVALID_CHARS;
7825
    }
7826
    else if (code_page == CP_UTF7) {
7827
        /* CP_UTF7 only supports flags=0 */
7828
        return 0;
7829
    }
7830
    else {
7831
        if (errors != NULL && strcmp(errors, "replace") == 0)
7832
            return 0;
7833
        else
7834
            return WC_NO_BEST_FIT_CHARS;
7835
    }
7836
}
7837
7838
/*
7839
 * Encode a Unicode string to a Windows code page into a byte string in strict
7840
 * mode.
7841
 *
7842
 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7843
 * an OSError and returns -1 on other error.
7844
 */
7845
static int
7846
encode_code_page_strict(UINT code_page, PyBytesWriter **writer,
7847
                        PyObject *unicode, Py_ssize_t offset, int len,
7848
                        const char* errors)
7849
{
7850
    BOOL usedDefaultChar = FALSE;
7851
    BOOL *pusedDefaultChar = &usedDefaultChar;
7852
    int outsize;
7853
    wchar_t *p;
7854
    Py_ssize_t size;
7855
    const DWORD flags = encode_code_page_flags(code_page, NULL);
7856
    char *out;
7857
    /* Create a substring so that we can get the UTF-16 representation
7858
       of just the slice under consideration. */
7859
    PyObject *substring;
7860
    int ret = -1;
7861
7862
    assert(len > 0);
7863
7864
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
7865
        pusedDefaultChar = &usedDefaultChar;
7866
    else
7867
        pusedDefaultChar = NULL;
7868
7869
    substring = PyUnicode_Substring(unicode, offset, offset+len);
7870
    if (substring == NULL)
7871
        return -1;
7872
    p = PyUnicode_AsWideCharString(substring, &size);
7873
    Py_CLEAR(substring);
7874
    if (p == NULL) {
7875
        return -1;
7876
    }
7877
    assert(size <= INT_MAX);
7878
7879
    /* First get the size of the result */
7880
    outsize = WideCharToMultiByte(code_page, flags,
7881
                                  p, (int)size,
7882
                                  NULL, 0,
7883
                                  NULL, pusedDefaultChar);
7884
    if (outsize <= 0)
7885
        goto error;
7886
    /* If we used a default char, then we failed! */
7887
    if (pusedDefaultChar && *pusedDefaultChar) {
7888
        ret = -2;
7889
        goto done;
7890
    }
7891
7892
    if (*writer == NULL) {
7893
        /* Create string object */
7894
        *writer = PyBytesWriter_Create(outsize);
7895
        if (*writer == NULL) {
7896
            goto done;
7897
        }
7898
        out = PyBytesWriter_GetData(*writer);
7899
    }
7900
    else {
7901
        /* Extend string object */
7902
        Py_ssize_t n = PyBytesWriter_GetSize(*writer);
7903
        if (PyBytesWriter_Grow(*writer, outsize) < 0) {
7904
            goto done;
7905
        }
7906
        out = (char*)PyBytesWriter_GetData(*writer) + n;
7907
    }
7908
7909
    /* Do the conversion */
7910
    outsize = WideCharToMultiByte(code_page, flags,
7911
                                  p, (int)size,
7912
                                  out, outsize,
7913
                                  NULL, pusedDefaultChar);
7914
    if (outsize <= 0)
7915
        goto error;
7916
    if (pusedDefaultChar && *pusedDefaultChar) {
7917
        ret = -2;
7918
        goto done;
7919
    }
7920
    ret = 0;
7921
7922
done:
7923
    PyMem_Free(p);
7924
    return ret;
7925
7926
error:
7927
    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
7928
        ret = -2;
7929
        goto done;
7930
    }
7931
    PyErr_SetFromWindowsErr(0);
7932
    goto done;
7933
}
7934
7935
/*
7936
 * Encode a Unicode string to a Windows code page into a byte string using an
7937
 * error handler.
7938
 *
7939
 * Returns consumed characters if succeed, or raise an OSError and returns
7940
 * -1 on other error.
7941
 */
7942
static int
7943
encode_code_page_errors(UINT code_page, PyBytesWriter **writer,
7944
                        PyObject *unicode, Py_ssize_t unicode_offset,
7945
                        Py_ssize_t insize, const char* errors)
7946
{
7947
    const DWORD flags = encode_code_page_flags(code_page, errors);
7948
    Py_ssize_t pos = unicode_offset;
7949
    Py_ssize_t endin = unicode_offset + insize;
7950
    /* Ideally, we should get reason from FormatMessage. This is the Windows
7951
       2000 English version of the message. */
7952
    const char *reason = "invalid character";
7953
    /* 4=maximum length of a UTF-8 sequence */
7954
    char buffer[4];
7955
    BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7956
    Py_ssize_t outsize;
7957
    char *out;
7958
    PyObject *errorHandler = NULL;
7959
    PyObject *exc = NULL;
7960
    PyObject *encoding_obj = NULL;
7961
    const char *encoding;
7962
    Py_ssize_t newpos;
7963
    PyObject *rep;
7964
    int ret = -1;
7965
7966
    assert(insize > 0);
7967
7968
    encoding = code_page_name(code_page, &encoding_obj);
7969
    if (encoding == NULL)
7970
        return -1;
7971
7972
    if (errors == NULL || strcmp(errors, "strict") == 0) {
7973
        /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7974
           then we raise a UnicodeEncodeError. */
7975
        make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
7976
        if (exc != NULL) {
7977
            PyCodec_StrictErrors(exc);
7978
            Py_DECREF(exc);
7979
        }
7980
        Py_XDECREF(encoding_obj);
7981
        return -1;
7982
    }
7983
7984
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
7985
        pusedDefaultChar = &usedDefaultChar;
7986
    else
7987
        pusedDefaultChar = NULL;
7988
7989
    if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7990
        PyErr_NoMemory();
7991
        goto error;
7992
    }
7993
    outsize = insize * Py_ARRAY_LENGTH(buffer);
7994
7995
    if (*writer == NULL) {
7996
        /* Create string object */
7997
        *writer = PyBytesWriter_Create(outsize);
7998
        if (*writer == NULL) {
7999
            goto error;
8000
        }
8001
        out = PyBytesWriter_GetData(*writer);
8002
    }
8003
    else {
8004
        /* Extend string object */
8005
        Py_ssize_t n = PyBytesWriter_GetSize(*writer);
8006
        if (PyBytesWriter_Grow(*writer, outsize) < 0) {
8007
            goto error;
8008
        }
8009
        out = (char*)PyBytesWriter_GetData(*writer) + n;
8010
    }
8011
8012
    /* Encode the string character per character */
8013
    while (pos < endin)
8014
    {
8015
        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
8016
        wchar_t chars[2];
8017
        int charsize;
8018
        if (ch < 0x10000) {
8019
            chars[0] = (wchar_t)ch;
8020
            charsize = 1;
8021
        }
8022
        else {
8023
            chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
8024
            chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
8025
            charsize = 2;
8026
        }
8027
8028
        outsize = WideCharToMultiByte(code_page, flags,
8029
                                      chars, charsize,
8030
                                      buffer, Py_ARRAY_LENGTH(buffer),
8031
                                      NULL, pusedDefaultChar);
8032
        if (outsize > 0) {
8033
            if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
8034
            {
8035
                pos++;
8036
                memcpy(out, buffer, outsize);
8037
                out += outsize;
8038
                continue;
8039
            }
8040
        }
8041
        else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
8042
            PyErr_SetFromWindowsErr(0);
8043
            goto error;
8044
        }
8045
8046
        rep = unicode_encode_call_errorhandler(
8047
                  errors, &errorHandler, encoding, reason,
8048
                  unicode, &exc,
8049
                  pos, pos + 1, &newpos);
8050
        if (rep == NULL)
8051
            goto error;
8052
8053
        Py_ssize_t morebytes = pos - newpos;
8054
        if (PyBytes_Check(rep)) {
8055
            outsize = PyBytes_GET_SIZE(rep);
8056
            morebytes += outsize;
8057
            if (morebytes > 0) {
8058
                out = PyBytesWriter_GrowAndUpdatePointer(*writer, morebytes, out);
8059
                if (out == NULL) {
8060
                    Py_DECREF(rep);
8061
                    goto error;
8062
                }
8063
            }
8064
            memcpy(out, PyBytes_AS_STRING(rep), outsize);
8065
            out += outsize;
8066
        }
8067
        else {
8068
            Py_ssize_t i;
8069
            int kind;
8070
            const void *data;
8071
8072
            outsize = PyUnicode_GET_LENGTH(rep);
8073
            morebytes += outsize;
8074
            if (morebytes > 0) {
8075
                out = PyBytesWriter_GrowAndUpdatePointer(*writer, morebytes, out);
8076
                if (out == NULL) {
8077
                    Py_DECREF(rep);
8078
                    goto error;
8079
                }
8080
            }
8081
            kind = PyUnicode_KIND(rep);
8082
            data = PyUnicode_DATA(rep);
8083
            for (i=0; i < outsize; i++) {
8084
                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8085
                if (ch > 127) {
8086
                    raise_encode_exception(&exc,
8087
                        encoding, unicode,
8088
                        pos, pos + 1,
8089
                        "unable to encode error handler result to ASCII");
8090
                    Py_DECREF(rep);
8091
                    goto error;
8092
                }
8093
                *out = (unsigned char)ch;
8094
                out++;
8095
            }
8096
        }
8097
        pos = newpos;
8098
        Py_DECREF(rep);
8099
    }
8100
    /* write a NUL byte */
8101
    *out = 0;
8102
    outsize = out - (char*)PyBytesWriter_GetData(*writer);
8103
    assert(outsize <= PyBytesWriter_GetSize(*writer));
8104
    if (PyBytesWriter_Resize(*writer, outsize) < 0) {
8105
        goto error;
8106
    }
8107
    ret = 0;
8108
8109
error:
8110
    Py_XDECREF(encoding_obj);
8111
    Py_XDECREF(errorHandler);
8112
    Py_XDECREF(exc);
8113
    return ret;
8114
}
8115
8116
8117
/* Encode the str object `unicode` to the Windows code page `code_page`.
   Each chunk is first encoded in strict mode; when that reports an encode
   error (-2) the same chunk is re-encoded through the `errors` handler.
   Returns the result of PyBytesWriter_Finish(), or NULL with an exception
   set. */
PyObject *
PyUnicode_EncodeCodePage(int code_page,
                         PyObject *unicode,
                         const char *errors)
{
    PyBytesWriter *writer = NULL;
    Py_ssize_t len;
    Py_ssize_t offset = 0;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (code_page < 0) {
        PyErr_SetString(PyExc_ValueError, "invalid code page number");
        return NULL;
    }

    len = PyUnicode_GET_LENGTH(unicode);
    if (len == 0) {
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
    }

    for (;;) {
        int chunk_len, done, ret;

#ifdef NEED_RETRY
        /* Input longer than DECODING_CHUNK_SIZE is encoded in pieces */
        done = (len <= DECODING_CHUNK_SIZE);
        chunk_len = done ? (int)len : DECODING_CHUNK_SIZE;
#else
        done = 1;
        chunk_len = (int)len;
#endif

        ret = encode_code_page_strict(code_page, &writer,
                                      unicode, offset, chunk_len,
                                      errors);
        if (ret == -2) {
            ret = encode_code_page_errors(code_page, &writer,
                                          unicode, offset,
                                          chunk_len, errors);
        }
        if (ret < 0) {
            PyBytesWriter_Discard(writer);
            return NULL;
        }

        offset += chunk_len;
        len -= chunk_len;
        if (done) {
            break;
        }
    }

    return PyBytesWriter_Finish(writer);
}
8175
8176
8177
/* Encode `unicode` with PyUnicode_EncodeCodePage() using code page CP_ACP
   and errors=NULL. */
PyObject *
PyUnicode_AsMBCSString(PyObject *unicode)
{
    const char *errors = NULL;
    return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
}
8182
8183
#undef NEED_RETRY
8184
8185
#endif /* MS_WINDOWS */
8186
8187
/* --- Character Mapping Codec -------------------------------------------- */
8188
8189
static int
8190
charmap_decode_string(const char *s,
8191
                      Py_ssize_t size,
8192
                      PyObject *mapping,
8193
                      const char *errors,
8194
                      _PyUnicodeWriter *writer)
8195
1.83k
{
8196
1.83k
    const char *starts = s;
8197
1.83k
    const char *e;
8198
1.83k
    Py_ssize_t startinpos, endinpos;
8199
1.83k
    PyObject *errorHandler = NULL, *exc = NULL;
8200
1.83k
    Py_ssize_t maplen;
8201
1.83k
    int mapkind;
8202
1.83k
    const void *mapdata;
8203
1.83k
    Py_UCS4 x;
8204
1.83k
    unsigned char ch;
8205
8206
1.83k
    maplen = PyUnicode_GET_LENGTH(mapping);
8207
1.83k
    mapdata = PyUnicode_DATA(mapping);
8208
1.83k
    mapkind = PyUnicode_KIND(mapping);
8209
8210
0
    e = s + size;
8211
8212
1.83k
    if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
8213
        /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
8214
         * is disabled in encoding aliases, latin1 is preferred because
8215
         * its implementation is faster. */
8216
15
        const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
8217
15
        Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8218
15
        Py_UCS4 maxchar = writer->maxchar;
8219
8220
15
        assert (writer->kind == PyUnicode_1BYTE_KIND);
8221
11.2k
        while (s < e) {
8222
11.2k
            ch = *s;
8223
11.2k
            x = mapdata_ucs1[ch];
8224
11.2k
            if (x > maxchar) {
8225
15
                if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
8226
0
                    goto onError;
8227
15
                maxchar = writer->maxchar;
8228
15
                outdata = (Py_UCS1 *)writer->data;
8229
15
            }
8230
11.2k
            outdata[writer->pos] = x;
8231
11.2k
            writer->pos++;
8232
11.2k
            ++s;
8233
11.2k
        }
8234
15
        return 0;
8235
15
    }
8236
8237
8.63k
    while (s < e) {
8238
8.24k
        if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
8239
8.24k
            int outkind = writer->kind;
8240
8.24k
            const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
8241
8.24k
            if (outkind == PyUnicode_1BYTE_KIND) {
8242
3.44k
                Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8243
3.44k
                Py_UCS4 maxchar = writer->maxchar;
8244
292k
                while (s < e) {
8245
292k
                    ch = *s;
8246
292k
                    x = mapdata_ucs2[ch];
8247
292k
                    if (x > maxchar)
8248
3.34k
                        goto Error;
8249
289k
                    outdata[writer->pos] = x;
8250
289k
                    writer->pos++;
8251
289k
                    ++s;
8252
289k
                }
8253
100
                break;
8254
3.44k
            }
8255
4.80k
            else if (outkind == PyUnicode_2BYTE_KIND) {
8256
4.80k
                Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
8257
1.70M
                while (s < e) {
8258
1.70M
                    ch = *s;
8259
1.70M
                    x = mapdata_ucs2[ch];
8260
1.70M
                    if (x == 0xFFFE)
8261
3.47k
                        goto Error;
8262
1.69M
                    outdata[writer->pos] = x;
8263
1.69M
                    writer->pos++;
8264
1.69M
                    ++s;
8265
1.69M
                }
8266
1.32k
                break;
8267
4.80k
            }
8268
8.24k
        }
8269
0
        ch = *s;
8270
8271
0
        if (ch < maplen)
8272
0
            x = PyUnicode_READ(mapkind, mapdata, ch);
8273
0
        else
8274
0
            x = 0xfffe; /* invalid value */
8275
6.82k
Error:
8276
6.82k
        if (x == 0xfffe)
8277
3.48k
        {
8278
            /* undefined mapping */
8279
3.48k
            startinpos = s-starts;
8280
3.48k
            endinpos = startinpos+1;
8281
3.48k
            if (unicode_decode_call_errorhandler_writer(
8282
3.48k
                    errors, &errorHandler,
8283
3.48k
                    "charmap", "character maps to <undefined>",
8284
3.48k
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
8285
3.48k
                    writer)) {
8286
9
                goto onError;
8287
9
            }
8288
3.47k
            continue;
8289
3.48k
        }
8290
8291
3.33k
        if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8292
0
            goto onError;
8293
3.33k
        ++s;
8294
3.33k
    }
8295
1.80k
    Py_XDECREF(errorHandler);
8296
1.80k
    Py_XDECREF(exc);
8297
1.80k
    return 0;
8298
8299
9
onError:
8300
9
    Py_XDECREF(errorHandler);
8301
9
    Py_XDECREF(exc);
8302
9
    return -1;
8303
1.81k
}
8304
8305
static int
8306
charmap_decode_mapping(const char *s,
8307
                       Py_ssize_t size,
8308
                       PyObject *mapping,
8309
                       const char *errors,
8310
                       _PyUnicodeWriter *writer)
8311
0
{
8312
0
    const char *starts = s;
8313
0
    const char *e;
8314
0
    Py_ssize_t startinpos, endinpos;
8315
0
    PyObject *errorHandler = NULL, *exc = NULL;
8316
0
    unsigned char ch;
8317
0
    PyObject *key, *item = NULL;
8318
8319
0
    e = s + size;
8320
8321
0
    while (s < e) {
8322
0
        ch = *s;
8323
8324
        /* Get mapping (char ordinal -> integer, Unicode char or None) */
8325
0
        key = PyLong_FromLong((long)ch);
8326
0
        if (key == NULL)
8327
0
            goto onError;
8328
8329
0
        int rc = PyMapping_GetOptionalItem(mapping, key, &item);
8330
0
        Py_DECREF(key);
8331
0
        if (rc == 0) {
8332
            /* No mapping found means: mapping is undefined. */
8333
0
            goto Undefined;
8334
0
        }
8335
0
        if (item == NULL) {
8336
0
            if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8337
                /* No mapping found means: mapping is undefined. */
8338
0
                PyErr_Clear();
8339
0
                goto Undefined;
8340
0
            } else
8341
0
                goto onError;
8342
0
        }
8343
8344
        /* Apply mapping */
8345
0
        if (item == Py_None)
8346
0
            goto Undefined;
8347
0
        if (PyLong_Check(item)) {
8348
0
            long value = PyLong_AsLong(item);
8349
0
            if (value == 0xFFFE)
8350
0
                goto Undefined;
8351
0
            if (value < 0 || value > MAX_UNICODE) {
8352
0
                PyErr_Format(PyExc_TypeError,
8353
0
                             "character mapping must be in range(0x%lx)",
8354
0
                             (unsigned long)MAX_UNICODE + 1);
8355
0
                goto onError;
8356
0
            }
8357
8358
0
            if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8359
0
                goto onError;
8360
0
        }
8361
0
        else if (PyUnicode_Check(item)) {
8362
0
            if (PyUnicode_GET_LENGTH(item) == 1) {
8363
0
                Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8364
0
                if (value == 0xFFFE)
8365
0
                    goto Undefined;
8366
0
                if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8367
0
                    goto onError;
8368
0
            }
8369
0
            else {
8370
0
                writer->overallocate = 1;
8371
0
                if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8372
0
                    goto onError;
8373
0
            }
8374
0
        }
8375
0
        else {
8376
            /* wrong return value */
8377
0
            PyErr_SetString(PyExc_TypeError,
8378
0
                            "character mapping must return integer, None or str");
8379
0
            goto onError;
8380
0
        }
8381
0
        Py_CLEAR(item);
8382
0
        ++s;
8383
0
        continue;
8384
8385
0
Undefined:
8386
        /* undefined mapping */
8387
0
        Py_CLEAR(item);
8388
0
        startinpos = s-starts;
8389
0
        endinpos = startinpos+1;
8390
0
        if (unicode_decode_call_errorhandler_writer(
8391
0
                errors, &errorHandler,
8392
0
                "charmap", "character maps to <undefined>",
8393
0
                &starts, &e, &startinpos, &endinpos, &exc, &s,
8394
0
                writer)) {
8395
0
            goto onError;
8396
0
        }
8397
0
    }
8398
0
    Py_XDECREF(errorHandler);
8399
0
    Py_XDECREF(exc);
8400
0
    return 0;
8401
8402
0
onError:
8403
0
    Py_XDECREF(item);
8404
0
    Py_XDECREF(errorHandler);
8405
0
    Py_XDECREF(exc);
8406
0
    return -1;
8407
0
}
8408
8409
PyObject *
8410
PyUnicode_DecodeCharmap(const char *s,
8411
                        Py_ssize_t size,
8412
                        PyObject *mapping,
8413
                        const char *errors)
8414
1.83k
{
8415
1.83k
    _PyUnicodeWriter writer;
8416
8417
    /* Default to Latin-1 */
8418
1.83k
    if (mapping == NULL)
8419
0
        return PyUnicode_DecodeLatin1(s, size, errors);
8420
8421
1.83k
    if (size == 0)
8422
0
        _Py_RETURN_UNICODE_EMPTY();
8423
1.83k
    _PyUnicodeWriter_Init(&writer);
8424
1.83k
    writer.min_length = size;
8425
1.83k
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
8426
0
        goto onError;
8427
8428
1.83k
    if (PyUnicode_CheckExact(mapping)) {
8429
1.83k
        if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8430
9
            goto onError;
8431
1.83k
    }
8432
0
    else {
8433
0
        if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8434
0
            goto onError;
8435
0
    }
8436
1.82k
    return _PyUnicodeWriter_Finish(&writer);
8437
8438
9
  onError:
8439
9
    _PyUnicodeWriter_Dealloc(&writer);
8440
9
    return NULL;
8441
1.83k
}
8442
8443
/* Charmap encoding: the lookup table */
8444
8445
/*[clinic input]
8446
class EncodingMap "struct encoding_map *" "&EncodingMapType"
8447
[clinic start generated code]*/
8448
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=14e46bbb6c522d22]*/
8449
8450
struct encoding_map {
8451
    PyObject_HEAD
8452
    unsigned char level1[32];
8453
    int count2, count3;
8454
    unsigned char level23[1];
8455
};
8456
8457
/*[clinic input]
8458
EncodingMap.size
8459
8460
Return the size (in bytes) of this object.
8461
[clinic start generated code]*/
8462
8463
static PyObject *
8464
EncodingMap_size_impl(struct encoding_map *self)
8465
/*[clinic end generated code: output=c4c969e4c99342a4 input=004ff13f26bb5366]*/
8466
0
{
8467
0
    return PyLong_FromLong((sizeof(*self) - 1) + 16*self->count2 +
8468
0
                           128*self->count3);
8469
0
}
8470
8471
/* Method table installed as EncodingMapType.tp_methods.
   ENCODINGMAP_SIZE_METHODDEF is presumably the Argument Clinic entry for
   EncodingMap.size (its definition is not visible here -- confirm);
   {NULL, NULL} terminates the table. */
static PyMethodDef encoding_map_methods[] = {
8472
    ENCODINGMAP_SIZE_METHODDEF
8473
    {NULL, NULL}
8474
};
8475
8476
static PyTypeObject EncodingMapType = {
8477
    PyVarObject_HEAD_INIT(NULL, 0)
8478
    .tp_name = "EncodingMap",
8479
    .tp_basicsize = sizeof(struct encoding_map),
8480
    /* methods */
8481
    .tp_flags = Py_TPFLAGS_DEFAULT,
8482
    .tp_methods = encoding_map_methods,
8483
};
8484
8485
/* Build an encoding map (character -> byte value) from a decoding table
   `string`, where the character at index i is the decoding of byte i.
   Only the first 256 characters are used.

   Returns an EncodingMap trie when possible. Returns a plain dict of
   {code point: index} when character 0 is not U+0000, when U+0000 or a
   non-BMP character appears at a later index, or when the trie would need
   0xFF or more tables at a level. Returns NULL with an exception set on
   error. */
PyObject*
PyUnicode_BuildEncodingMap(PyObject* string)
{
    PyObject *result;
    struct encoding_map *mresult;
    unsigned char level1[32];
    unsigned char level2[512];
    unsigned char *mlevel1, *mlevel2, *mlevel3;
    int count2 = 0;
    int count3 = 0;
    int need_dict = 0;
    int kind;
    const void *data;
    int length;
    int i;

    if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
        PyErr_BadArgument();
        return NULL;
    }
    kind = PyUnicode_KIND(string);
    data = PyUnicode_DATA(string);
    length = (int)Py_MIN(PyUnicode_GET_LENGTH(string), 256);
    memset(level1, 0xFF, sizeof level1);
    memset(level2, 0xFF, sizeof level2);

    /* If there isn't a one-to-one mapping of NULL to \0,
       or if there are non-BMP characters, we need to use
       a mapping dictionary. */
    if (PyUnicode_READ(kind, data, 0) != 0)
        need_dict = 1;

    /* First pass: count how many level-2 and level-3 tables are needed */
    for (i = 1; i < length; i++) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
        int l1, l2;
        if (ch == 0 || ch > 0xFFFF) {
            need_dict = 1;
            break;
        }
        if (ch == 0xFFFE) {
            /* unmapped character */
            continue;
        }
        l1 = ch >> 11;
        l2 = ch >> 7;
        if (level1[l1] == 0xFF)
            level1[l1] = count2++;
        if (level2[l2] == 0xFF)
            level2[l2] = count3++;
    }

    /* 0xFF is the "unused" marker, so it cannot be a table number */
    if (count2 >= 0xFF || count3 >= 0xFF)
        need_dict = 1;

    if (need_dict) {
        PyObject *dict = PyDict_New();
        if (!dict)
            return NULL;
        for (i = 0; i < length; i++) {
            Py_UCS4 c = PyUnicode_READ(kind, data, i);
            PyObject *key = PyLong_FromLong(c);
            if (key == NULL) {
                Py_DECREF(dict);
                return NULL;
            }
            PyObject *value = PyLong_FromLong(i);
            if (value == NULL) {
                Py_DECREF(key);
                Py_DECREF(dict);
                return NULL;
            }
            int rc = PyDict_SetItem(dict, key, value);
            Py_DECREF(key);
            Py_DECREF(value);
            if (rc < 0) {
                Py_DECREF(dict);
                return NULL;
            }
        }
        return dict;
    }

    /* Create a three-level trie */
    result = PyObject_Malloc(sizeof(struct encoding_map) +
                             16*count2 + 128*count3 - 1);
    if (!result) {
        return PyErr_NoMemory();
    }

    _PyObject_Init(result, &EncodingMapType);
    mresult = (struct encoding_map*)result;
    mresult->count2 = count2;
    mresult->count3 = count3;
    mlevel1 = mresult->level1;
    mlevel2 = mresult->level23;
    mlevel3 = mresult->level23 + 16*count2;
    memcpy(mlevel1, level1, 32);
    memset(mlevel2, 0xFF, 16*count2);
    memset(mlevel3, 0, 128*count3);

    /* Second pass: number the level-3 tables again while filling the trie.
       A level-3 entry of 0 means "no mapping". */
    count3 = 0;
    for (i = 1; i < length; i++) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
        int o1, o2, o3, i2, i3;
        if (ch == 0xFFFE) {
            /* unmapped character */
            continue;
        }
        o1 = ch>>11;
        o2 = (ch>>7) & 0xF;
        i2 = 16*mlevel1[o1] + o2;
        if (mlevel2[i2] == 0xFF)
            mlevel2[i2] = count3++;
        o3 = ch & 0x7F;
        i3 = 128*mlevel2[i2] + o3;
        mlevel3[i3] = i;
    }
    return result;
}
8600
8601
static int
8602
encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
8603
0
{
8604
0
    struct encoding_map *map = (struct encoding_map*)mapping;
8605
0
    int l1 = c>>11;
8606
0
    int l2 = (c>>7) & 0xF;
8607
0
    int l3 = c & 0x7F;
8608
0
    int i;
8609
8610
0
    if (c > 0xFFFF)
8611
0
        return -1;
8612
0
    if (c == 0)
8613
0
        return 0;
8614
    /* level 1*/
8615
0
    i = map->level1[l1];
8616
0
    if (i == 0xFF) {
8617
0
        return -1;
8618
0
    }
8619
    /* level 2*/
8620
0
    i = map->level23[16*i+l2];
8621
0
    if (i == 0xFF) {
8622
0
        return -1;
8623
0
    }
8624
    /* level 3 */
8625
0
    i = map->level23[16*map->count2 + 128*i + l3];
8626
0
    if (i == 0) {
8627
0
        return -1;
8628
0
    }
8629
0
    return i;
8630
0
}
8631
8632
/* Lookup the character in the mapping.
8633
   On success, return PyLong, PyBytes or None (if the character can't be found).
8634
   If the result is PyLong, put its value in replace.
8635
   On error, return NULL.
8636
   */
8637
static PyObject *
8638
charmapencode_lookup(Py_UCS4 c, PyObject *mapping, unsigned char *replace)
8639
0
{
8640
0
    PyObject *w = PyLong_FromLong((long)c);
8641
0
    PyObject *x;
8642
8643
0
    if (w == NULL)
8644
0
        return NULL;
8645
0
    int rc = PyMapping_GetOptionalItem(mapping, w, &x);
8646
0
    Py_DECREF(w);
8647
0
    if (rc == 0) {
8648
        /* No mapping found means: mapping is undefined. */
8649
0
        Py_RETURN_NONE;
8650
0
    }
8651
0
    if (x == NULL) {
8652
0
        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8653
            /* No mapping found means: mapping is undefined. */
8654
0
            PyErr_Clear();
8655
0
            Py_RETURN_NONE;
8656
0
        } else
8657
0
            return NULL;
8658
0
    }
8659
0
    else if (x == Py_None)
8660
0
        return x;
8661
0
    else if (PyLong_Check(x)) {
8662
0
        long value = PyLong_AsLong(x);
8663
0
        if (value < 0 || value > 255) {
8664
0
            PyErr_SetString(PyExc_TypeError,
8665
0
                            "character mapping must be in range(256)");
8666
0
            Py_DECREF(x);
8667
0
            return NULL;
8668
0
        }
8669
0
        *replace = (unsigned char)value;
8670
0
        return x;
8671
0
    }
8672
0
    else if (PyBytes_Check(x))
8673
0
        return x;
8674
0
    else {
8675
        /* wrong return value */
8676
0
        PyErr_Format(PyExc_TypeError,
8677
0
                     "character mapping must return integer, bytes or None, not %.400s",
8678
0
                     Py_TYPE(x)->tp_name);
8679
0
        Py_DECREF(x);
8680
0
        return NULL;
8681
0
    }
8682
0
}
8683
8684
static int
8685
charmapencode_resize(PyBytesWriter *writer, Py_ssize_t *outpos, Py_ssize_t requiredsize)
8686
0
{
8687
0
    Py_ssize_t outsize = PyBytesWriter_GetSize(writer);
8688
    /* exponentially overallocate to minimize reallocations */
8689
0
    if (requiredsize < 2 * outsize)
8690
0
        requiredsize = 2 * outsize;
8691
0
    return PyBytesWriter_Resize(writer, requiredsize);
8692
0
}
8693
8694
/* Result codes returned by charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
8697
/* Look up the character, put the result in the output buffer and advance
8698
   *outpos. Resize the output bytes writer if not enough space is
8699
   available. Return enc_SUCCESS if the character was written,
8700
   enc_FAILED if the mapping was undefined (in which case no character
8701
   was written), or enc_EXCEPTION if the lookup or a reallocation
8702
   failed. */
8703
static charmapencode_result
8704
charmapencode_output(Py_UCS4 c, PyObject *mapping,
8705
                     PyBytesWriter *writer, Py_ssize_t *outpos)
8706
0
{
8707
0
    PyObject *rep;
8708
0
    unsigned char replace;
8709
0
    char *outstart;
8710
0
    Py_ssize_t outsize = _PyBytesWriter_GetSize(writer);
8711
8712
0
    if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8713
0
        int res = encoding_map_lookup(c, mapping);
8714
0
        Py_ssize_t requiredsize = *outpos+1;
8715
0
        if (res == -1) {
8716
0
            return enc_FAILED;
8717
0
        }
8718
8719
0
        if (outsize<requiredsize) {
8720
0
            if (charmapencode_resize(writer, outpos, requiredsize)) {
8721
0
                return enc_EXCEPTION;
8722
0
            }
8723
0
        }
8724
0
        outstart = _PyBytesWriter_GetData(writer);
8725
0
        outstart[(*outpos)++] = (char)res;
8726
0
        return enc_SUCCESS;
8727
0
    }
8728
8729
0
    rep = charmapencode_lookup(c, mapping, &replace);
8730
0
    if (rep==NULL)
8731
0
        return enc_EXCEPTION;
8732
0
    else if (rep==Py_None) {
8733
0
        Py_DECREF(rep);
8734
0
        return enc_FAILED;
8735
0
    } else {
8736
0
        if (PyLong_Check(rep)) {
8737
0
            Py_ssize_t requiredsize = *outpos+1;
8738
0
            if (outsize<requiredsize)
8739
0
                if (charmapencode_resize(writer, outpos, requiredsize)) {
8740
0
                    Py_DECREF(rep);
8741
0
                    return enc_EXCEPTION;
8742
0
                }
8743
0
            outstart = _PyBytesWriter_GetData(writer);
8744
0
            outstart[(*outpos)++] = (char)replace;
8745
0
        }
8746
0
        else {
8747
0
            const char *repchars = PyBytes_AS_STRING(rep);
8748
0
            Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8749
0
            Py_ssize_t requiredsize = *outpos+repsize;
8750
0
            if (outsize<requiredsize)
8751
0
                if (charmapencode_resize(writer, outpos, requiredsize)) {
8752
0
                    Py_DECREF(rep);
8753
0
                    return enc_EXCEPTION;
8754
0
                }
8755
0
            outstart = _PyBytesWriter_GetData(writer);
8756
0
            memcpy(outstart + *outpos, repchars, repsize);
8757
0
            *outpos += repsize;
8758
0
        }
8759
0
    }
8760
0
    Py_DECREF(rep);
8761
0
    return enc_SUCCESS;
8762
0
}
8763
8764
/* handle an error in _PyUnicode_EncodeCharmap()
8765
   Return 0 on success, -1 on error */
8766
static int
8767
charmap_encoding_error(
8768
    PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
8769
    PyObject **exceptionObject,
8770
    _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
8771
    PyBytesWriter *writer, Py_ssize_t *respos)
8772
0
{
8773
0
    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8774
0
    Py_ssize_t size, repsize;
8775
0
    Py_ssize_t newpos;
8776
0
    int kind;
8777
0
    const void *data;
8778
0
    Py_ssize_t index;
8779
    /* startpos for collecting unencodable chars */
8780
0
    Py_ssize_t collstartpos = *inpos;
8781
0
    Py_ssize_t collendpos = *inpos+1;
8782
0
    Py_ssize_t collpos;
8783
0
    const char *encoding = "charmap";
8784
0
    const char *reason = "character maps to <undefined>";
8785
0
    charmapencode_result x;
8786
0
    Py_UCS4 ch;
8787
0
    int val;
8788
8789
0
    size = PyUnicode_GET_LENGTH(unicode);
8790
    /* find all unencodable characters */
8791
0
    while (collendpos < size) {
8792
0
        PyObject *rep;
8793
0
        unsigned char replace;
8794
0
        if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8795
0
            ch = PyUnicode_READ_CHAR(unicode, collendpos);
8796
0
            val = encoding_map_lookup(ch, mapping);
8797
0
            if (val != -1)
8798
0
                break;
8799
0
            ++collendpos;
8800
0
            continue;
8801
0
        }
8802
8803
0
        ch = PyUnicode_READ_CHAR(unicode, collendpos);
8804
0
        rep = charmapencode_lookup(ch, mapping, &replace);
8805
0
        if (rep==NULL)
8806
0
            return -1;
8807
0
        else if (rep!=Py_None) {
8808
0
            Py_DECREF(rep);
8809
0
            break;
8810
0
        }
8811
0
        Py_DECREF(rep);
8812
0
        ++collendpos;
8813
0
    }
8814
    /* cache callback name lookup
8815
     * (if not done yet, i.e. it's the first error) */
8816
0
    if (*error_handler == _Py_ERROR_UNKNOWN)
8817
0
        *error_handler = _Py_GetErrorHandler(errors);
8818
8819
0
    switch (*error_handler) {
8820
0
    case _Py_ERROR_STRICT:
8821
0
        raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8822
0
        return -1;
8823
8824
0
    case _Py_ERROR_REPLACE:
8825
0
        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
8826
0
            x = charmapencode_output('?', mapping, writer, respos);
8827
0
            if (x==enc_EXCEPTION) {
8828
0
                return -1;
8829
0
            }
8830
0
            else if (x==enc_FAILED) {
8831
0
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8832
0
                return -1;
8833
0
            }
8834
0
        }
8835
0
        _Py_FALLTHROUGH;
8836
0
    case _Py_ERROR_IGNORE:
8837
0
        *inpos = collendpos;
8838
0
        break;
8839
8840
0
    case _Py_ERROR_XMLCHARREFREPLACE:
8841
        /* generate replacement (temporarily (mis)uses p) */
8842
0
        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
8843
0
            char buffer[2+29+1+1];
8844
0
            char *cp;
8845
0
            sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
8846
0
            for (cp = buffer; *cp; ++cp) {
8847
0
                x = charmapencode_output(*cp, mapping, writer, respos);
8848
0
                if (x==enc_EXCEPTION)
8849
0
                    return -1;
8850
0
                else if (x==enc_FAILED) {
8851
0
                    raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8852
0
                    return -1;
8853
0
                }
8854
0
            }
8855
0
        }
8856
0
        *inpos = collendpos;
8857
0
        break;
8858
8859
0
    default:
8860
0
        repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
8861
0
                                                      encoding, reason, unicode, exceptionObject,
8862
0
                                                      collstartpos, collendpos, &newpos);
8863
0
        if (repunicode == NULL)
8864
0
            return -1;
8865
0
        if (PyBytes_Check(repunicode)) {
8866
            /* Directly copy bytes result to output. */
8867
0
            Py_ssize_t outsize = PyBytesWriter_GetSize(writer);
8868
0
            Py_ssize_t requiredsize;
8869
0
            repsize = PyBytes_Size(repunicode);
8870
0
            requiredsize = *respos + repsize;
8871
0
            if (requiredsize > outsize)
8872
                /* Make room for all additional bytes. */
8873
0
                if (charmapencode_resize(writer, respos, requiredsize)) {
8874
0
                    Py_DECREF(repunicode);
8875
0
                    return -1;
8876
0
                }
8877
0
            memcpy((char*)PyBytesWriter_GetData(writer) + *respos,
8878
0
                   PyBytes_AsString(repunicode),  repsize);
8879
0
            *respos += repsize;
8880
0
            *inpos = newpos;
8881
0
            Py_DECREF(repunicode);
8882
0
            break;
8883
0
        }
8884
        /* generate replacement  */
8885
0
        repsize = PyUnicode_GET_LENGTH(repunicode);
8886
0
        data = PyUnicode_DATA(repunicode);
8887
0
        kind = PyUnicode_KIND(repunicode);
8888
0
        for (index = 0; index < repsize; index++) {
8889
0
            Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8890
0
            x = charmapencode_output(repch, mapping, writer, respos);
8891
0
            if (x==enc_EXCEPTION) {
8892
0
                Py_DECREF(repunicode);
8893
0
                return -1;
8894
0
            }
8895
0
            else if (x==enc_FAILED) {
8896
0
                Py_DECREF(repunicode);
8897
0
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8898
0
                return -1;
8899
0
            }
8900
0
        }
8901
0
        *inpos = newpos;
8902
0
        Py_DECREF(repunicode);
8903
0
    }
8904
0
    return 0;
8905
0
}
8906
8907
/* Encode `unicode` to bytes through the character `mapping`, using the
   error handler named by `errors` for unencodable characters.

   A NULL `mapping` selects Latin-1 encoding.  An empty input yields the
   empty-bytes constant.  Returns the object produced by
   PyBytesWriter_FinishWithSize(), or NULL on error. */
PyObject *
_PyUnicode_EncodeCharmap(PyObject *unicode,
                         PyObject *mapping,
                         const char *errors)
{
    /* Without a mapping, default to Latin-1 */
    if (mapping == NULL) {
        return unicode_encode_ucs1(unicode, errors, 256);
    }

    const Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
    if (size == 0) {
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
    }
    const void *data = PyUnicode_DATA(unicode);
    const int kind = PyUnicode_KIND(unicode);

    /* error-handling state shared across all errors of this call */
    PyObject *exc = NULL;
    PyObject *error_handler_obj = NULL;
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;

    /* current input and output positions */
    Py_ssize_t inpos = 0;
    Py_ssize_t respos = 0;

    /* Start with room for one byte per character; the buffer is resized
       when replacements need more. */
    PyBytesWriter *writer = PyBytesWriter_Create(size);
    if (writer == NULL) {
        goto onError;
    }

    if (Py_IS_TYPE(mapping, &EncodingMapType)) {
        /* Cache the buffer pointer and size; both are refreshed after any
           operation that may have resized the writer. */
        char *outstart = _PyBytesWriter_GetData(writer);
        Py_ssize_t outsize = _PyBytesWriter_GetSize(writer);

        while (inpos < size) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
            int res = encoding_map_lookup(ch, mapping);

            if (res == -1) {
                /* unencodable character: the error handler advances inpos */
                if (charmap_encoding_error(unicode, &inpos, mapping,
                                           &exc,
                                           &error_handler, &error_handler_obj, errors,
                                           writer, &respos)) {
                    goto onError;
                }
                outstart = _PyBytesWriter_GetData(writer);
                outsize = _PyBytesWriter_GetSize(writer);
                continue;
            }

            Py_ssize_t requiredsize = respos + 1;
            if (outsize < requiredsize) {
                if (charmapencode_resize(writer, &respos, requiredsize)) {
                    goto onError;
                }
                outstart = _PyBytesWriter_GetData(writer);
                outsize = _PyBytesWriter_GetSize(writer);
            }
            outstart[respos] = (char)res;
            respos++;

            /* done with this character => adjust input position */
            ++inpos;
        }
    }
    else {
        while (inpos < size) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);

            switch (charmapencode_output(ch, mapping, writer, &respos)) {
            case enc_EXCEPTION:
                goto onError;
            case enc_FAILED:
                /* unencodable character: the error handler advances inpos */
                if (charmap_encoding_error(unicode, &inpos, mapping,
                                           &exc,
                                           &error_handler, &error_handler_obj, errors,
                                           writer, &respos)) {
                    goto onError;
                }
                break;
            default:
                /* done with this character => adjust input position */
                ++inpos;
                break;
            }
        }
    }

    Py_XDECREF(exc);
    Py_XDECREF(error_handler_obj);

    /* Finish with the number of bytes actually written */
    return PyBytesWriter_FinishWithSize(writer, respos);

  onError:
    PyBytesWriter_Discard(writer);
    Py_XDECREF(exc);
    Py_XDECREF(error_handler_obj);
    return NULL;
}
9015
9016
/* Encode `unicode` through `mapping` with no explicit error-handler name
   (NULL is passed for `errors`).  Sets a bad-argument error and returns
   NULL when `unicode` is not a str or `mapping` is NULL. */
PyObject *
PyUnicode_AsCharmapString(PyObject *unicode,
                          PyObject *mapping)
{
    if (PyUnicode_Check(unicode) && mapping != NULL) {
        return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
    }
    PyErr_BadArgument();
    return NULL;
}
9026
9027
/* create or adjust a UnicodeTranslateError */
9028
static void
9029
make_translate_exception(PyObject **exceptionObject,
9030
                         PyObject *unicode,
9031
                         Py_ssize_t startpos, Py_ssize_t endpos,
9032
                         const char *reason)
9033
0
{
9034
0
    if (*exceptionObject == NULL) {
9035
0
        *exceptionObject = _PyUnicodeTranslateError_Create(
9036
0
            unicode, startpos, endpos, reason);
9037
0
    }
9038
0
    else {
9039
0
        if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
9040
0
            goto onError;
9041
0
        if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
9042
0
            goto onError;
9043
0
        if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
9044
0
            goto onError;
9045
0
        return;
9046
0
      onError:
9047
0
        Py_CLEAR(*exceptionObject);
9048
0
    }
9049
0
}
9050
9051
/* error handling callback helper:
9052
   build arguments, call the callback and check the arguments,
9053
   put the result into newpos and return the replacement string, which
9054
   has to be freed by the caller */
9055
static PyObject *
9056
unicode_translate_call_errorhandler(const char *errors,
9057
                                    PyObject **errorHandler,
9058
                                    const char *reason,
9059
                                    PyObject *unicode, PyObject **exceptionObject,
9060
                                    Py_ssize_t startpos, Py_ssize_t endpos,
9061
                                    Py_ssize_t *newpos)
9062
0
{
9063
0
    static const char *argparse = "Un;translating error handler must return (str, int) tuple";
9064
9065
0
    Py_ssize_t i_newpos;
9066
0
    PyObject *restuple;
9067
0
    PyObject *resunicode;
9068
9069
0
    if (*errorHandler == NULL) {
9070
0
        *errorHandler = PyCodec_LookupError(errors);
9071
0
        if (*errorHandler == NULL)
9072
0
            return NULL;
9073
0
    }
9074
9075
0
    make_translate_exception(exceptionObject,
9076
0
                             unicode, startpos, endpos, reason);
9077
0
    if (*exceptionObject == NULL)
9078
0
        return NULL;
9079
9080
0
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
9081
0
    if (restuple == NULL)
9082
0
        return NULL;
9083
0
    if (!PyTuple_Check(restuple)) {
9084
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
9085
0
        Py_DECREF(restuple);
9086
0
        return NULL;
9087
0
    }
9088
0
    if (!PyArg_ParseTuple(restuple, argparse,
9089
0
                          &resunicode, &i_newpos)) {
9090
0
        Py_DECREF(restuple);
9091
0
        return NULL;
9092
0
    }
9093
0
    if (i_newpos<0)
9094
0
        *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
9095
0
    else
9096
0
        *newpos = i_newpos;
9097
0
    if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
9098
0
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
9099
0
        Py_DECREF(restuple);
9100
0
        return NULL;
9101
0
    }
9102
0
    Py_INCREF(resunicode);
9103
0
    Py_DECREF(restuple);
9104
0
    return resunicode;
9105
0
}
9106
9107
/* Lookup the character ch in the mapping and put the result in result,
9108
   which must be decrefed by the caller.
9109
   The result can be PyLong, PyUnicode, None or NULL.
9110
   If the result is PyLong, put its value in replace.
9111
   Return 0 on success, -1 on error */
9112
static int
9113
charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result, Py_UCS4 *replace)
9114
282
{
9115
282
    PyObject *w = PyLong_FromLong((long)c);
9116
282
    PyObject *x;
9117
9118
282
    if (w == NULL)
9119
0
        return -1;
9120
282
    int rc = PyMapping_GetOptionalItem(mapping, w, &x);
9121
282
    Py_DECREF(w);
9122
282
    if (rc == 0) {
9123
        /* No mapping found means: use 1:1 mapping. */
9124
126
        *result = NULL;
9125
126
        return 0;
9126
126
    }
9127
156
    if (x == NULL) {
9128
0
        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
9129
            /* No mapping found means: use 1:1 mapping. */
9130
0
            PyErr_Clear();
9131
0
            *result = NULL;
9132
0
            return 0;
9133
0
        } else
9134
0
            return -1;
9135
0
    }
9136
156
    else if (x == Py_None) {
9137
0
        *result = x;
9138
0
        return 0;
9139
0
    }
9140
156
    else if (PyLong_Check(x)) {
9141
0
        long value = PyLong_AsLong(x);
9142
0
        if (value < 0 || value > MAX_UNICODE) {
9143
0
            PyErr_Format(PyExc_ValueError,
9144
0
                         "character mapping must be in range(0x%lx)",
9145
0
                         (unsigned long)MAX_UNICODE + 1);
9146
0
            Py_DECREF(x);
9147
0
            return -1;
9148
0
        }
9149
0
        *result = x;
9150
0
        *replace = (Py_UCS4)value;
9151
0
        return 0;
9152
0
    }
9153
156
    else if (PyUnicode_Check(x)) {
9154
156
        *result = x;
9155
156
        return 0;
9156
156
    }
9157
0
    else {
9158
        /* wrong return value */
9159
0
        PyErr_SetString(PyExc_TypeError,
9160
0
                        "character mapping must return integer, None or str");
9161
0
        Py_DECREF(x);
9162
0
        return -1;
9163
0
    }
9164
156
}
9165
9166
/* lookup the character, write the result into the writer.
9167
   Return 1 if the result was written into the writer, return 0 if the mapping
9168
   was undefined, raise an exception return -1 on error. */
9169
static int
9170
charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
9171
                        _PyUnicodeWriter *writer)
9172
108
{
9173
108
    PyObject *item;
9174
108
    Py_UCS4 replace;
9175
9176
108
    if (charmaptranslate_lookup(ch, mapping, &item, &replace))
9177
0
        return -1;
9178
9179
108
    if (item == NULL) {
9180
        /* not found => default to 1:1 mapping */
9181
24
        if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
9182
0
            return -1;
9183
0
        }
9184
24
        return 1;
9185
24
    }
9186
9187
84
    if (item == Py_None) {
9188
0
        Py_DECREF(item);
9189
0
        return 0;
9190
0
    }
9191
9192
84
    if (PyLong_Check(item)) {
9193
0
        if (_PyUnicodeWriter_WriteCharInline(writer, replace) < 0) {
9194
0
            Py_DECREF(item);
9195
0
            return -1;
9196
0
        }
9197
0
        Py_DECREF(item);
9198
0
        return 1;
9199
0
    }
9200
9201
84
    if (!PyUnicode_Check(item)) {
9202
0
        Py_DECREF(item);
9203
0
        return -1;
9204
0
    }
9205
9206
84
    if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
9207
0
        Py_DECREF(item);
9208
0
        return -1;
9209
0
    }
9210
9211
84
    Py_DECREF(item);
9212
84
    return 1;
9213
84
}
9214
9215
static int
9216
unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
9217
                              Py_UCS1 *translate)
9218
174
{
9219
174
    PyObject *item = NULL;
9220
174
    Py_UCS4 replace;
9221
174
    int ret = 0;
9222
9223
174
    if (charmaptranslate_lookup(ch, mapping, &item, &replace)) {
9224
0
        return -1;
9225
0
    }
9226
9227
174
    if (item == Py_None) {
9228
        /* deletion */
9229
0
        translate[ch] = 0xfe;
9230
0
    }
9231
174
    else if (item == NULL) {
9232
        /* not found => default to 1:1 mapping */
9233
102
        translate[ch] = ch;
9234
102
        return 1;
9235
102
    }
9236
72
    else if (PyLong_Check(item)) {
9237
0
        if (replace > 127) {
9238
            /* invalid character or character outside ASCII:
9239
               skip the fast translate */
9240
0
            goto exit;
9241
0
        }
9242
0
        translate[ch] = (Py_UCS1)replace;
9243
0
    }
9244
72
    else if (PyUnicode_Check(item)) {
9245
72
        if (PyUnicode_GET_LENGTH(item) != 1)
9246
72
            goto exit;
9247
9248
0
        replace = PyUnicode_READ_CHAR(item, 0);
9249
0
        if (replace > 127)
9250
0
            goto exit;
9251
0
        translate[ch] = (Py_UCS1)replace;
9252
0
    }
9253
0
    else {
9254
        /* not None, NULL, long or unicode */
9255
0
        goto exit;
9256
0
    }
9257
0
    ret = 1;
9258
9259
72
  exit:
9260
72
    Py_DECREF(item);
9261
72
    return ret;
9262
0
}
9263
9264
/* Fast path for ascii => ascii translation. Return 1 if the whole string
9265
   was translated into writer, return 0 if the input string was partially
9266
   translated into writer, raise an exception and return -1 on error. */
9267
static int
9268
unicode_fast_translate(PyObject *input, PyObject *mapping,
9269
                       _PyUnicodeWriter *writer, int ignore,
9270
                       Py_ssize_t *input_pos)
9271
144
{
9272
144
    Py_UCS1 ascii_table[128], ch, ch2;
9273
144
    Py_ssize_t len;
9274
144
    const Py_UCS1 *in, *end;
9275
144
    Py_UCS1 *out;
9276
144
    int res = 0;
9277
9278
144
    len = PyUnicode_GET_LENGTH(input);
9279
9280
144
    memset(ascii_table, 0xff, 128);
9281
9282
144
    in = PyUnicode_1BYTE_DATA(input);
9283
144
    end = in + len;
9284
9285
144
    assert(PyUnicode_IS_ASCII(writer->buffer));
9286
144
    assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9287
144
    out = PyUnicode_1BYTE_DATA(writer->buffer);
9288
9289
267
    for (; in < end; in++) {
9290
195
        ch = *in;
9291
195
        ch2 = ascii_table[ch];
9292
195
        if (ch2 == 0xff) {
9293
174
            int translate = unicode_fast_translate_lookup(mapping, ch,
9294
174
                                                          ascii_table);
9295
174
            if (translate < 0)
9296
0
                return -1;
9297
174
            if (translate == 0)
9298
72
                goto exit;
9299
102
            ch2 = ascii_table[ch];
9300
102
        }
9301
123
        if (ch2 == 0xfe) {
9302
0
            if (ignore)
9303
0
                continue;
9304
0
            goto exit;
9305
0
        }
9306
123
        assert(ch2 < 128);
9307
123
        *out = ch2;
9308
123
        out++;
9309
123
    }
9310
72
    res = 1;
9311
9312
144
exit:
9313
144
    writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
9314
144
    *input_pos = in - PyUnicode_1BYTE_DATA(input);
9315
144
    return res;
9316
72
}
9317
9318
/* Translate `input` through `mapping` and return the resulting string, or
   NULL on error.

   ASCII input first goes through unicode_fast_translate(); whatever it
   could not handle is processed one character at a time.  A run of
   characters that the mapping maps to None is skipped when `errors` is
   "ignore", and otherwise handed to the error handler named by `errors`. */
static PyObject *
_PyUnicode_TranslateCharmap(PyObject *input,
                            PyObject *mapping,
                            const char *errors)
{
    if (mapping == NULL) {
        PyErr_BadArgument();
        return NULL;
    }

    const void *data = PyUnicode_DATA(input);
    const int kind = PyUnicode_KIND(input);
    const Py_ssize_t size = PyUnicode_GET_LENGTH(input);
    if (size == 0) {
        return PyUnicode_FromObject(input);
    }

    const char *reason = "character maps to <undefined>";
    const int ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    Py_ssize_t i = 0;
    _PyUnicodeWriter writer;

    /* allocate enough for a simple 1:1 translation without
       replacements, if we need more, we'll resize */
    _PyUnicodeWriter_Init(&writer);
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1) {
        goto onError;
    }

    if (PyUnicode_IS_ASCII(input)) {
        /* on a partial result, `i` is where the slow loop must resume */
        int res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
        if (res < 0) {
            _PyUnicodeWriter_Dealloc(&writer);
            return NULL;
        }
        if (res == 1) {
            return _PyUnicodeWriter_Finish(&writer);
        }
    }

    while (i < size) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
        int translate = charmaptranslate_output(ch, mapping, &writer);
        if (translate < 0) {
            goto onError;
        }
        if (translate != 0) {
            /* it worked => adjust input pointer */
            ++i;
            continue;
        }

        /* untranslatable character: extend the run over every following
           character that the mapping also maps to None */
        Py_ssize_t collstart = i;
        Py_ssize_t collend = i + 1;
        for (; collend < size; ++collend) {
            PyObject *x;
            Py_UCS4 replace;
            ch = PyUnicode_READ(kind, data, collend);
            if (charmaptranslate_lookup(ch, mapping, &x, &replace)) {
                goto onError;
            }
            Py_XDECREF(x);
            /* only the pointer value is compared after the release */
            if (x != Py_None) {
                break;
            }
        }

        if (ignore) {
            i = collend;
            continue;
        }

        Py_ssize_t newpos;
        PyObject *repunicode = unicode_translate_call_errorhandler(
            errors, &errorHandler, reason, input, &exc,
            collstart, collend, &newpos);
        if (repunicode == NULL) {
            goto onError;
        }
        int rc = _PyUnicodeWriter_WriteStr(&writer, repunicode);
        Py_DECREF(repunicode);
        if (rc < 0) {
            goto onError;
        }
        i = newpos;
    }
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return NULL;
}
9434
9435
/* Public entry point: check `str` with ensure_unicode(), then translate it
   through `mapping` using the error handler named by `errors`.  Returns
   NULL when the check fails or the translation reports an error. */
PyObject *
PyUnicode_Translate(PyObject *str,
                    PyObject *mapping,
                    const char *errors)
{
    if (ensure_unicode(str) >= 0) {
        return _PyUnicode_TranslateCharmap(str, mapping, errors);
    }
    return NULL;
}
9444
9445
/* Return an ASCII version of `unicode` in which every non-ASCII whitespace
   character is replaced by ' ' and every non-ASCII decimal digit by the
   matching ASCII digit.

   An ASCII input is returned unchanged (as a new reference).  At the first
   character that is neither whitespace nor a decimal digit, a '?' is
   written and the result is truncated right after it.

   Returns NULL with an exception set if `unicode` is not a str or the
   result cannot be allocated. */
PyObject *
_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadInternalCall();
        return NULL;
    }
    if (PyUnicode_IS_ASCII(unicode)) {
        /* If the string is already ASCII, just return the same string */
        return Py_NewRef(unicode);
    }

    const Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
    PyObject *result = PyUnicode_New(len, 127);
    if (result == NULL) {
        return NULL;
    }

    Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
    const int kind = PyUnicode_KIND(unicode);
    const void *data = PyUnicode_DATA(unicode);

    for (Py_ssize_t i = 0; i < len; ++i) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);

        /* NOTE(review): the bound is 127, not 128, so U+007F takes the
           non-ASCII path below -- confirm this is intended. */
        if (ch < 127) {
            out[i] = ch;
            continue;
        }
        if (Py_UNICODE_ISSPACE(ch)) {
            out[i] = ' ';
            continue;
        }

        int decimal = Py_UNICODE_TODECIMAL(ch);
        if (decimal >= 0) {
            out[i] = '0' + decimal;
            continue;
        }

        /* not a digit: mark the spot and cut the result short */
        out[i] = '?';
        out[i+1] = '\0';
        _PyUnicode_LENGTH(result) = i + 1;
        break;
    }

    assert(_PyUnicode_CheckConsistency(result, 1));
    return result;
}
9490
9491
/* --- Helpers ------------------------------------------------------------ */
9492
9493
/* Helper macro to fix up start/end slice values against a sequence length.

   `end` is clamped to `len` when it is larger.  A negative `start` or `end`
   is taken relative to `len` and clamped to 0 if still negative.  `start`
   and `end` must be modifiable lvalues; they are updated in place.  Every
   parameter is parenthesized in the expansion so that an expression
   argument (for example `cond ? a : b` passed as `len`) is evaluated as a
   unit.  Arguments may be evaluated more than once. */
#define ADJUST_INDICES(start, end, len) \
    do {                                \
        if ((end) > (len)) {            \
            (end) = (len);              \
        }                               \
        else if ((end) < 0) {           \
            (end) += (len);             \
            if ((end) < 0) {            \
                (end) = 0;              \
            }                           \
        }                               \
        if ((start) < 0) {              \
            (start) += (len);           \
            if ((start) < 0) {          \
                (start) = 0;            \
            }                           \
        }                               \
    } while (0)
9512
9513
static Py_ssize_t
9514
any_find_slice(PyObject* s1, PyObject* s2,
9515
               Py_ssize_t start,
9516
               Py_ssize_t end,
9517
               int direction)
9518
7.55k
{
9519
7.55k
    int kind1, kind2;
9520
7.55k
    const void *buf1, *buf2;
9521
7.55k
    Py_ssize_t len1, len2, result;
9522
9523
7.55k
    kind1 = PyUnicode_KIND(s1);
9524
7.55k
    kind2 = PyUnicode_KIND(s2);
9525
7.55k
    if (kind1 < kind2)
9526
0
        return -1;
9527
9528
7.55k
    len1 = PyUnicode_GET_LENGTH(s1);
9529
7.55k
    len2 = PyUnicode_GET_LENGTH(s2);
9530
7.55k
    ADJUST_INDICES(start, end, len1);
9531
7.55k
    if (end - start < len2)
9532
1.29k
        return -1;
9533
9534
6.26k
    buf1 = PyUnicode_DATA(s1);
9535
6.26k
    buf2 = PyUnicode_DATA(s2);
9536
6.26k
    if (len2 == 1) {
9537
6.26k
        Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9538
6.26k
        result = findchar((const char *)buf1 + kind1*start,
9539
6.26k
                          kind1, end - start, ch, direction);
9540
6.26k
        if (result == -1)
9541
4.73k
            return -1;
9542
1.52k
        else
9543
1.52k
            return start + result;
9544
6.26k
    }
9545
9546
0
    if (kind2 != kind1) {
9547
0
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
9548
0
        if (!buf2)
9549
0
            return -2;
9550
0
    }
9551
9552
0
    if (direction > 0) {
9553
0
        switch (kind1) {
9554
0
        case PyUnicode_1BYTE_KIND:
9555
0
            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9556
0
                result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9557
0
            else
9558
0
                result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9559
0
            break;
9560
0
        case PyUnicode_2BYTE_KIND:
9561
0
            result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9562
0
            break;
9563
0
        case PyUnicode_4BYTE_KIND:
9564
0
            result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9565
0
            break;
9566
0
        default:
9567
0
            Py_UNREACHABLE();
9568
0
        }
9569
0
    }
9570
0
    else {
9571
0
        switch (kind1) {
9572
0
        case PyUnicode_1BYTE_KIND:
9573
0
            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9574
0
                result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9575
0
            else
9576
0
                result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9577
0
            break;
9578
0
        case PyUnicode_2BYTE_KIND:
9579
0
            result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9580
0
            break;
9581
0
        case PyUnicode_4BYTE_KIND:
9582
0
            result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9583
0
            break;
9584
0
        default:
9585
0
            Py_UNREACHABLE();
9586
0
        }
9587
0
    }
9588
9589
0
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
9590
0
    if (kind2 != kind1)
9591
0
        PyMem_Free((void *)buf2);
9592
9593
0
    return result;
9594
0
}
9595
9596
9597
Py_ssize_t
9598
PyUnicode_Count(PyObject *str,
9599
                PyObject *substr,
9600
                Py_ssize_t start,
9601
                Py_ssize_t end)
9602
0
{
9603
0
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9604
0
        return -1;
9605
9606
0
    return unicode_count_impl(str, substr, start, end);
9607
0
}
9608
9609
Py_ssize_t
9610
PyUnicode_Find(PyObject *str,
9611
               PyObject *substr,
9612
               Py_ssize_t start,
9613
               Py_ssize_t end,
9614
               int direction)
9615
139
{
9616
139
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9617
0
        return -2;
9618
9619
139
    return any_find_slice(str, substr, start, end, direction);
9620
139
}
9621
9622
Py_ssize_t
9623
PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9624
                   Py_ssize_t start, Py_ssize_t end,
9625
                   int direction)
9626
1.67M
{
9627
1.67M
    int kind;
9628
1.67M
    Py_ssize_t len, result;
9629
1.67M
    len = PyUnicode_GET_LENGTH(str);
9630
1.67M
    ADJUST_INDICES(start, end, len);
9631
1.67M
    if (end - start < 1)
9632
0
        return -1;
9633
1.67M
    kind = PyUnicode_KIND(str);
9634
1.67M
    result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9635
1.67M
                      kind, end-start, ch, direction);
9636
1.67M
    if (result == -1)
9637
1.51M
        return -1;
9638
159k
    else
9639
159k
        return start + result;
9640
1.67M
}
9641
9642
static int
9643
tailmatch(PyObject *self,
9644
          PyObject *substring,
9645
          Py_ssize_t start,
9646
          Py_ssize_t end,
9647
          int direction)
9648
352k
{
9649
352k
    int kind_self;
9650
352k
    int kind_sub;
9651
352k
    const void *data_self;
9652
352k
    const void *data_sub;
9653
352k
    Py_ssize_t offset;
9654
352k
    Py_ssize_t i;
9655
352k
    Py_ssize_t end_sub;
9656
9657
352k
    ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9658
352k
    end -= PyUnicode_GET_LENGTH(substring);
9659
352k
    if (end < start)
9660
90.8k
        return 0;
9661
9662
262k
    if (PyUnicode_GET_LENGTH(substring) == 0)
9663
0
        return 1;
9664
9665
262k
    kind_self = PyUnicode_KIND(self);
9666
262k
    data_self = PyUnicode_DATA(self);
9667
262k
    kind_sub = PyUnicode_KIND(substring);
9668
262k
    data_sub = PyUnicode_DATA(substring);
9669
262k
    end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9670
9671
262k
    if (direction > 0)
9672
116k
        offset = end;
9673
145k
    else
9674
145k
        offset = start;
9675
9676
262k
    if (PyUnicode_READ(kind_self, data_self, offset) ==
9677
262k
        PyUnicode_READ(kind_sub, data_sub, 0) &&
9678
259k
        PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9679
259k
        PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9680
        /* If both are of the same kind, memcmp is sufficient */
9681
233k
        if (kind_self == kind_sub) {
9682
233k
            return ! memcmp((char *)data_self +
9683
233k
                                (offset * PyUnicode_KIND(substring)),
9684
0
                            data_sub,
9685
233k
                            PyUnicode_GET_LENGTH(substring) *
9686
233k
                                PyUnicode_KIND(substring));
9687
233k
        }
9688
        /* otherwise we have to compare each character by first accessing it */
9689
1
        else {
9690
            /* We do not need to compare 0 and len(substring)-1 because
9691
               the if statement above ensured already that they are equal
9692
               when we end up here. */
9693
3
            for (i = 1; i < end_sub; ++i) {
9694
2
                if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9695
2
                    PyUnicode_READ(kind_sub, data_sub, i))
9696
0
                    return 0;
9697
2
            }
9698
1
            return 1;
9699
1
        }
9700
233k
    }
9701
9702
28.8k
    return 0;
9703
262k
}
9704
9705
Py_ssize_t
9706
PyUnicode_Tailmatch(PyObject *str,
9707
                    PyObject *substr,
9708
                    Py_ssize_t start,
9709
                    Py_ssize_t end,
9710
                    int direction)
9711
0
{
9712
0
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9713
0
        return -1;
9714
9715
0
    return tailmatch(str, substr, start, end, direction);
9716
0
}
9717
9718
/* Build a new string of the same length as self with maxchar 127 and
   fill it via _Py_bytes_lower() (lower != 0) or _Py_bytes_upper()
   (lower == 0) applied to self's data.

   Returns the new string, or NULL if PyUnicode_New() fails.
   NOTE(review): the name and the maxchar of 127 suggest callers only pass
   ASCII strings -- this function does not check that itself. */
static PyObject *
ascii_upper_or_lower(PyObject *self, int lower)
{
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const char *src = PyUnicode_DATA(self);

    PyObject *converted = PyUnicode_New(nchars, 127);
    if (converted == NULL) {
        return NULL;
    }

    char *dst = PyUnicode_DATA(converted);
    if (lower) {
        _Py_bytes_lower(dst, src, nchars);
    }
    else {
        _Py_bytes_upper(dst, src, nchars);
    }
    return converted;
}
9736
9737
static Py_UCS4
9738
handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
9739
27
{
9740
27
    Py_ssize_t j;
9741
27
    int final_sigma;
9742
27
    Py_UCS4 c = 0;   /* initialize to prevent gcc warning */
9743
    /* U+03A3 is in the Final_Sigma context when, it is found like this:
9744
9745
     \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9746
9747
    where ! is a negation and \p{xxx} is a character with property xxx.
9748
    */
9749
27
    for (j = i - 1; j >= 0; j--) {
9750
0
        c = PyUnicode_READ(kind, data, j);
9751
0
        if (!_PyUnicode_IsCaseIgnorable(c))
9752
0
            break;
9753
0
    }
9754
27
    final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9755
27
    if (final_sigma) {
9756
0
        for (j = i + 1; j < length; j++) {
9757
0
            c = PyUnicode_READ(kind, data, j);
9758
0
            if (!_PyUnicode_IsCaseIgnorable(c))
9759
0
                break;
9760
0
        }
9761
0
        final_sigma = j == length || !_PyUnicode_IsCased(c);
9762
0
    }
9763
27
    return (final_sigma) ? 0x3C2 : 0x3C3;
9764
27
}
9765
9766
static int
9767
lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
9768
           Py_UCS4 c, Py_UCS4 *mapped)
9769
185k
{
9770
    /* Obscure special case. */
9771
185k
    if (c == 0x3A3) {
9772
27
        mapped[0] = handle_capital_sigma(kind, data, length, i);
9773
27
        return 1;
9774
27
    }
9775
185k
    return _PyUnicode_ToLowerFull(c, mapped);
9776
185k
}
9777
9778
static Py_ssize_t
9779
do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9780
0
{
9781
0
    Py_ssize_t i, k = 0;
9782
0
    int n_res, j;
9783
0
    Py_UCS4 c, mapped[3];
9784
9785
0
    c = PyUnicode_READ(kind, data, 0);
9786
0
    n_res = _PyUnicode_ToTitleFull(c, mapped);
9787
0
    for (j = 0; j < n_res; j++) {
9788
0
        *maxchar = Py_MAX(*maxchar, mapped[j]);
9789
0
        res[k++] = mapped[j];
9790
0
    }
9791
0
    for (i = 1; i < length; i++) {
9792
0
        c = PyUnicode_READ(kind, data, i);
9793
0
        n_res = lower_ucs4(kind, data, length, i, c, mapped);
9794
0
        for (j = 0; j < n_res; j++) {
9795
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9796
0
            res[k++] = mapped[j];
9797
0
        }
9798
0
    }
9799
0
    return k;
9800
0
}
9801
9802
static Py_ssize_t
9803
0
do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9804
0
    Py_ssize_t i, k = 0;
9805
9806
0
    for (i = 0; i < length; i++) {
9807
0
        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9808
0
        int n_res, j;
9809
0
        if (Py_UNICODE_ISUPPER(c)) {
9810
0
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9811
0
        }
9812
0
        else if (Py_UNICODE_ISLOWER(c)) {
9813
0
            n_res = _PyUnicode_ToUpperFull(c, mapped);
9814
0
        }
9815
0
        else {
9816
0
            n_res = 1;
9817
0
            mapped[0] = c;
9818
0
        }
9819
0
        for (j = 0; j < n_res; j++) {
9820
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9821
0
            res[k++] = mapped[j];
9822
0
        }
9823
0
    }
9824
0
    return k;
9825
0
}
9826
9827
static Py_ssize_t
9828
do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
9829
                  Py_UCS4 *maxchar, int lower)
9830
99.4k
{
9831
99.4k
    Py_ssize_t i, k = 0;
9832
9833
284k
    for (i = 0; i < length; i++) {
9834
185k
        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9835
185k
        int n_res, j;
9836
185k
        if (lower)
9837
185k
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9838
0
        else
9839
0
            n_res = _PyUnicode_ToUpperFull(c, mapped);
9840
370k
        for (j = 0; j < n_res; j++) {
9841
185k
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9842
185k
            res[k++] = mapped[j];
9843
185k
        }
9844
185k
    }
9845
99.4k
    return k;
9846
99.4k
}
9847
9848
static Py_ssize_t
9849
do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9850
0
{
9851
0
    return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9852
0
}
9853
9854
static Py_ssize_t
9855
do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9856
99.4k
{
9857
99.4k
    return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9858
99.4k
}
9859
9860
static Py_ssize_t
9861
do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9862
0
{
9863
0
    Py_ssize_t i, k = 0;
9864
9865
0
    for (i = 0; i < length; i++) {
9866
0
        Py_UCS4 c = PyUnicode_READ(kind, data, i);
9867
0
        Py_UCS4 mapped[3];
9868
0
        int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9869
0
        for (j = 0; j < n_res; j++) {
9870
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9871
0
            res[k++] = mapped[j];
9872
0
        }
9873
0
    }
9874
0
    return k;
9875
0
}
9876
9877
static Py_ssize_t
9878
do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9879
0
{
9880
0
    Py_ssize_t i, k = 0;
9881
0
    int previous_is_cased;
9882
9883
0
    previous_is_cased = 0;
9884
0
    for (i = 0; i < length; i++) {
9885
0
        const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9886
0
        Py_UCS4 mapped[3];
9887
0
        int n_res, j;
9888
9889
0
        if (previous_is_cased)
9890
0
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9891
0
        else
9892
0
            n_res = _PyUnicode_ToTitleFull(c, mapped);
9893
9894
0
        for (j = 0; j < n_res; j++) {
9895
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9896
0
            res[k++] = mapped[j];
9897
0
        }
9898
9899
0
        previous_is_cased = _PyUnicode_IsCased(c);
9900
0
    }
9901
0
    return k;
9902
0
}
9903
9904
static PyObject *
9905
case_operation(PyObject *self,
9906
               Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9907
99.4k
{
9908
99.4k
    PyObject *res = NULL;
9909
99.4k
    Py_ssize_t length, newlength = 0;
9910
99.4k
    int kind, outkind;
9911
99.4k
    const void *data;
9912
99.4k
    void *outdata;
9913
99.4k
    Py_UCS4 maxchar = 0, *tmp, *tmpend;
9914
9915
99.4k
    kind = PyUnicode_KIND(self);
9916
99.4k
    data = PyUnicode_DATA(self);
9917
99.4k
    length = PyUnicode_GET_LENGTH(self);
9918
99.4k
    if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
9919
0
        PyErr_SetString(PyExc_OverflowError, "string is too long");
9920
0
        return NULL;
9921
0
    }
9922
99.4k
    tmp = PyMem_Malloc(sizeof(Py_UCS4) * 3 * length);
9923
99.4k
    if (tmp == NULL)
9924
0
        return PyErr_NoMemory();
9925
99.4k
    newlength = perform(kind, data, length, tmp, &maxchar);
9926
99.4k
    res = PyUnicode_New(newlength, maxchar);
9927
99.4k
    if (res == NULL)
9928
0
        goto leave;
9929
99.4k
    tmpend = tmp + newlength;
9930
99.4k
    outdata = PyUnicode_DATA(res);
9931
99.4k
    outkind = PyUnicode_KIND(res);
9932
0
    switch (outkind) {
9933
6.95k
    case PyUnicode_1BYTE_KIND:
9934
6.95k
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9935
6.95k
        break;
9936
82.9k
    case PyUnicode_2BYTE_KIND:
9937
82.9k
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9938
82.9k
        break;
9939
9.51k
    case PyUnicode_4BYTE_KIND:
9940
9.51k
        memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9941
9.51k
        break;
9942
0
    default:
9943
0
        Py_UNREACHABLE();
9944
99.4k
    }
9945
99.4k
  leave:
9946
99.4k
    PyMem_Free(tmp);
9947
99.4k
    return res;
9948
99.4k
}
9949
9950
/* Join the items of seq with separator.

   seq is first converted with PySequence_Fast(); if that fails NULL is
   returned.  The actual work is done by _PyUnicode_JoinArray(), which is
   called on the fast sequence's item array while the critical section
   opened on seq is held.

   Returns whatever _PyUnicode_JoinArray() returns. */
PyObject *
PyUnicode_Join(PyObject *separator, PyObject *seq)
{
    PyObject *joined;
    PyObject *fast = PySequence_Fast(seq, "can only join an iterable");

    if (fast == NULL) {
        return NULL;
    }

    Py_BEGIN_CRITICAL_SECTION_SEQUENCE_FAST(seq);

    joined = _PyUnicode_JoinArray(separator,
                                  PySequence_Fast_ITEMS(fast),
                                  PySequence_Fast_GET_SIZE(fast));

    Py_END_CRITICAL_SECTION_SEQUENCE_FAST();

    /* Drop the reference obtained from PySequence_Fast(). */
    Py_DECREF(fast);
    return joined;
}
9974
9975
PyObject *
9976
_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
9977
1.10M
{
9978
1.10M
    PyObject *res = NULL; /* the result */
9979
1.10M
    PyObject *sep = NULL;
9980
1.10M
    Py_ssize_t seplen;
9981
1.10M
    PyObject *item;
9982
1.10M
    Py_ssize_t sz, i, res_offset;
9983
1.10M
    Py_UCS4 maxchar;
9984
1.10M
    Py_UCS4 item_maxchar;
9985
1.10M
    int use_memcpy;
9986
1.10M
    unsigned char *res_data = NULL, *sep_data = NULL;
9987
1.10M
    PyObject *last_obj;
9988
1.10M
    int kind = 0;
9989
9990
    /* If empty sequence, return u"". */
9991
1.10M
    if (seqlen == 0) {
9992
0
        _Py_RETURN_UNICODE_EMPTY();
9993
0
    }
9994
9995
    /* If singleton sequence with an exact Unicode, return that. */
9996
1.10M
    last_obj = NULL;
9997
1.10M
    if (seqlen == 1) {
9998
59.0k
        if (PyUnicode_CheckExact(items[0])) {
9999
59.0k
            res = items[0];
10000
59.0k
            return Py_NewRef(res);
10001
59.0k
        }
10002
0
        seplen = 0;
10003
0
        maxchar = 0;
10004
0
    }
10005
1.04M
    else {
10006
        /* Set up sep and seplen */
10007
1.04M
        if (separator == NULL) {
10008
            /* fall back to a blank space separator */
10009
0
            sep = PyUnicode_FromOrdinal(' ');
10010
0
            if (!sep)
10011
0
                goto onError;
10012
0
            seplen = 1;
10013
0
            maxchar = 32;
10014
0
        }
10015
1.04M
        else {
10016
1.04M
            if (!PyUnicode_Check(separator)) {
10017
0
                PyErr_Format(PyExc_TypeError,
10018
0
                             "separator: expected str instance,"
10019
0
                             " %.80s found",
10020
0
                             Py_TYPE(separator)->tp_name);
10021
0
                goto onError;
10022
0
            }
10023
1.04M
            sep = separator;
10024
1.04M
            seplen = PyUnicode_GET_LENGTH(separator);
10025
1.04M
            maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10026
            /* inc refcount to keep this code path symmetric with the
10027
               above case of a blank separator */
10028
1.04M
            Py_INCREF(sep);
10029
1.04M
        }
10030
1.04M
        last_obj = sep;
10031
1.04M
    }
10032
10033
    /* There are at least two things to join, or else we have a subclass
10034
     * of str in the sequence.
10035
     * Do a pre-pass to figure out the total amount of space we'll
10036
     * need (sz), and see whether all argument are strings.
10037
     */
10038
1.04M
    sz = 0;
10039
#ifdef Py_DEBUG
10040
    use_memcpy = 0;
10041
#else
10042
1.04M
    use_memcpy = 1;
10043
1.04M
#endif
10044
41.5M
    for (i = 0; i < seqlen; i++) {
10045
40.4M
        size_t add_sz;
10046
40.4M
        item = items[i];
10047
40.4M
        if (!PyUnicode_Check(item)) {
10048
0
            PyErr_Format(PyExc_TypeError,
10049
0
                         "sequence item %zd: expected str instance,"
10050
0
                         " %.80s found",
10051
0
                         i, Py_TYPE(item)->tp_name);
10052
0
            goto onError;
10053
0
        }
10054
40.4M
        add_sz = PyUnicode_GET_LENGTH(item);
10055
40.4M
        item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
10056
40.4M
        maxchar = Py_MAX(maxchar, item_maxchar);
10057
40.4M
        if (i != 0) {
10058
39.4M
            add_sz += seplen;
10059
39.4M
        }
10060
40.4M
        if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
10061
0
            PyErr_SetString(PyExc_OverflowError,
10062
0
                            "join() result is too long for a Python string");
10063
0
            goto onError;
10064
0
        }
10065
40.4M
        sz += add_sz;
10066
40.4M
        if (use_memcpy && last_obj != NULL) {
10067
41.3M
            if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10068
715k
                use_memcpy = 0;
10069
20.6M
        }
10070
0
        last_obj = item;
10071
40.4M
    }
10072
10073
1.04M
    res = PyUnicode_New(sz, maxchar);
10074
1.04M
    if (res == NULL)
10075
0
        goto onError;
10076
10077
    /* Catenate everything. */
10078
#ifdef Py_DEBUG
10079
    use_memcpy = 0;
10080
#else
10081
1.04M
    if (use_memcpy) {
10082
334k
        res_data = PyUnicode_1BYTE_DATA(res);
10083
334k
        kind = PyUnicode_KIND(res);
10084
334k
        if (seplen != 0)
10085
5.58k
            sep_data = PyUnicode_1BYTE_DATA(sep);
10086
334k
    }
10087
1.04M
#endif
10088
1.04M
    if (use_memcpy) {
10089
2.95M
        for (i = 0; i < seqlen; ++i) {
10090
2.61M
            Py_ssize_t itemlen;
10091
2.61M
            item = items[i];
10092
10093
            /* Copy item, and maybe the separator. */
10094
2.61M
            if (i && seplen != 0) {
10095
7.44k
                memcpy(res_data,
10096
7.44k
                          sep_data,
10097
7.44k
                          kind * seplen);
10098
7.44k
                res_data += kind * seplen;
10099
7.44k
            }
10100
10101
2.61M
            itemlen = PyUnicode_GET_LENGTH(item);
10102
2.61M
            if (itemlen != 0) {
10103
2.61M
                memcpy(res_data,
10104
2.61M
                          PyUnicode_DATA(item),
10105
2.61M
                          kind * itemlen);
10106
2.61M
                res_data += kind * itemlen;
10107
2.61M
            }
10108
2.61M
        }
10109
334k
        assert(res_data == PyUnicode_1BYTE_DATA(res)
10110
334k
                           + kind * PyUnicode_GET_LENGTH(res));
10111
334k
    }
10112
715k
    else {
10113
38.5M
        for (i = 0, res_offset = 0; i < seqlen; ++i) {
10114
37.8M
            Py_ssize_t itemlen;
10115
37.8M
            item = items[i];
10116
10117
            /* Copy item, and maybe the separator. */
10118
37.8M
            if (i && seplen != 0) {
10119
2.81k
                _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10120
2.81k
                res_offset += seplen;
10121
2.81k
            }
10122
10123
37.8M
            itemlen = PyUnicode_GET_LENGTH(item);
10124
37.8M
            if (itemlen != 0) {
10125
37.8M
                _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
10126
37.8M
                res_offset += itemlen;
10127
37.8M
            }
10128
37.8M
        }
10129
715k
        assert(res_offset == PyUnicode_GET_LENGTH(res));
10130
715k
    }
10131
10132
1.04M
    Py_XDECREF(sep);
10133
1.04M
    assert(_PyUnicode_CheckConsistency(res, 1));
10134
1.04M
    return res;
10135
10136
0
  onError:
10137
0
    Py_XDECREF(sep);
10138
0
    Py_XDECREF(res);
10139
0
    return NULL;
10140
1.04M
}
10141
10142
void
10143
_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10144
                    Py_UCS4 fill_char)
10145
585
{
10146
585
    const int kind = PyUnicode_KIND(unicode);
10147
585
    void *data = PyUnicode_DATA(unicode);
10148
585
    assert(_PyUnicode_IsModifiable(unicode));
10149
585
    assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10150
585
    assert(start >= 0);
10151
585
    assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10152
585
    _PyUnicode_Fill(kind, data, fill_char, start, length);
10153
585
}
10154
10155
Py_ssize_t
10156
PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10157
               Py_UCS4 fill_char)
10158
585
{
10159
585
    Py_ssize_t maxlen;
10160
10161
585
    if (!PyUnicode_Check(unicode)) {
10162
0
        PyErr_BadInternalCall();
10163
0
        return -1;
10164
0
    }
10165
585
    if (unicode_check_modifiable(unicode))
10166
0
        return -1;
10167
10168
585
    if (start < 0) {
10169
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
10170
0
        return -1;
10171
0
    }
10172
585
    if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10173
0
        PyErr_SetString(PyExc_ValueError,
10174
0
                         "fill character is bigger than "
10175
0
                         "the string maximum character");
10176
0
        return -1;
10177
0
    }
10178
10179
585
    maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10180
585
    length = Py_MIN(maxlen, length);
10181
585
    if (length <= 0)
10182
0
        return 0;
10183
10184
585
    _PyUnicode_FastFill(unicode, start, length, fill_char);
10185
585
    return length;
10186
585
}
10187
10188
static PyObject *
10189
pad(PyObject *self,
10190
    Py_ssize_t left,
10191
    Py_ssize_t right,
10192
    Py_UCS4 fill)
10193
0
{
10194
0
    PyObject *u;
10195
0
    Py_UCS4 maxchar;
10196
0
    int kind;
10197
0
    void *data;
10198
10199
0
    if (left < 0)
10200
0
        left = 0;
10201
0
    if (right < 0)
10202
0
        right = 0;
10203
10204
0
    if (left == 0 && right == 0)
10205
0
        return unicode_result_unchanged(self);
10206
10207
0
    if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10208
0
        right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
10209
0
        PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10210
0
        return NULL;
10211
0
    }
10212
0
    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10213
0
    maxchar = Py_MAX(maxchar, fill);
10214
0
    u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
10215
0
    if (!u)
10216
0
        return NULL;
10217
10218
0
    kind = PyUnicode_KIND(u);
10219
0
    data = PyUnicode_DATA(u);
10220
0
    if (left)
10221
0
        _PyUnicode_Fill(kind, data, fill, 0, left);
10222
0
    if (right)
10223
0
        _PyUnicode_Fill(kind, data, fill,
10224
0
                        left + _PyUnicode_LENGTH(self), right);
10225
0
    _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
10226
0
    assert(_PyUnicode_CheckConsistency(u, 1));
10227
0
    return u;
10228
0
}
10229
10230
/* Split string into lines by dispatching to the *_splitlines helper that
   matches the string's kind (asciilib for ASCII strings, otherwise
   ucs1lib/ucs2lib/ucs4lib).  keepends is passed through unchanged.

   Returns the helper's result, or NULL when ensure_unicode(string)
   fails. */
PyObject *
PyUnicode_Splitlines(PyObject *string, int keepends)
{
    if (ensure_unicode(string) < 0) {
        return NULL;
    }

    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(string);

    switch (PyUnicode_KIND(string)) {
    case PyUnicode_1BYTE_KIND:
        if (PyUnicode_IS_ASCII(string)) {
            return asciilib_splitlines(
                string, PyUnicode_1BYTE_DATA(string), nchars, keepends);
        }
        return ucs1lib_splitlines(
            string, PyUnicode_1BYTE_DATA(string), nchars, keepends);
    case PyUnicode_2BYTE_KIND:
        return ucs2lib_splitlines(
            string, PyUnicode_2BYTE_DATA(string), nchars, keepends);
    case PyUnicode_4BYTE_KIND:
        return ucs4lib_splitlines(
            string, PyUnicode_4BYTE_DATA(string), nchars, keepends);
    default:
        Py_UNREACHABLE();
    }
}
10264
10265
static PyObject *
10266
split(PyObject *self,
10267
      PyObject *substring,
10268
      Py_ssize_t maxcount)
10269
1.79k
{
10270
1.79k
    int kind1, kind2;
10271
1.79k
    const void *buf1, *buf2;
10272
1.79k
    Py_ssize_t len1, len2;
10273
1.79k
    PyObject* out;
10274
1.79k
    len1 = PyUnicode_GET_LENGTH(self);
10275
1.79k
    kind1 = PyUnicode_KIND(self);
10276
10277
1.79k
    if (substring == NULL) {
10278
7
        if (maxcount < 0) {
10279
7
            maxcount = (len1 - 1) / 2 + 1;
10280
7
        }
10281
7
        switch (kind1) {
10282
7
        case PyUnicode_1BYTE_KIND:
10283
7
            if (PyUnicode_IS_ASCII(self))
10284
7
                return asciilib_split_whitespace(
10285
7
                    self,  PyUnicode_1BYTE_DATA(self),
10286
7
                    len1, maxcount
10287
7
                    );
10288
0
            else
10289
0
                return ucs1lib_split_whitespace(
10290
0
                    self,  PyUnicode_1BYTE_DATA(self),
10291
0
                    len1, maxcount
10292
0
                    );
10293
0
        case PyUnicode_2BYTE_KIND:
10294
0
            return ucs2lib_split_whitespace(
10295
0
                self,  PyUnicode_2BYTE_DATA(self),
10296
0
                len1, maxcount
10297
0
                );
10298
0
        case PyUnicode_4BYTE_KIND:
10299
0
            return ucs4lib_split_whitespace(
10300
0
                self,  PyUnicode_4BYTE_DATA(self),
10301
0
                len1, maxcount
10302
0
                );
10303
0
        default:
10304
0
            Py_UNREACHABLE();
10305
7
        }
10306
7
    }
10307
10308
1.79k
    kind2 = PyUnicode_KIND(substring);
10309
1.79k
    len2 = PyUnicode_GET_LENGTH(substring);
10310
1.79k
    if (maxcount < 0) {
10311
        // if len2 == 0, it will raise ValueError.
10312
1.79k
        maxcount = len2 == 0 ? 0 : (len1 / len2) + 1;
10313
        // handle expected overflow case: (Py_SSIZE_T_MAX / 1) + 1
10314
1.79k
        maxcount = maxcount < 0 ? len1 : maxcount;
10315
1.79k
    }
10316
1.79k
    if (kind1 < kind2 || len1 < len2) {
10317
3
        out = PyList_New(1);
10318
3
        if (out == NULL)
10319
0
            return NULL;
10320
3
        PyList_SET_ITEM(out, 0, Py_NewRef(self));
10321
3
        return out;
10322
3
    }
10323
1.78k
    buf1 = PyUnicode_DATA(self);
10324
1.78k
    buf2 = PyUnicode_DATA(substring);
10325
1.78k
    if (kind2 != kind1) {
10326
1.10k
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
10327
1.10k
        if (!buf2)
10328
0
            return NULL;
10329
1.10k
    }
10330
10331
1.78k
    switch (kind1) {
10332
681
    case PyUnicode_1BYTE_KIND:
10333
681
        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10334
431
            out = asciilib_split(
10335
431
                self,  buf1, len1, buf2, len2, maxcount);
10336
250
        else
10337
250
            out = ucs1lib_split(
10338
250
                self,  buf1, len1, buf2, len2, maxcount);
10339
681
        break;
10340
578
    case PyUnicode_2BYTE_KIND:
10341
578
        out = ucs2lib_split(
10342
578
            self,  buf1, len1, buf2, len2, maxcount);
10343
578
        break;
10344
530
    case PyUnicode_4BYTE_KIND:
10345
530
        out = ucs4lib_split(
10346
530
            self,  buf1, len1, buf2, len2, maxcount);
10347
530
        break;
10348
0
    default:
10349
0
        out = NULL;
10350
1.78k
    }
10351
1.78k
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10352
1.78k
    if (kind2 != kind1)
10353
1.10k
        PyMem_Free((void *)buf2);
10354
1.78k
    return out;
10355
1.78k
}
10356
10357
static PyObject *
10358
rsplit(PyObject *self,
10359
       PyObject *substring,
10360
       Py_ssize_t maxcount)
10361
0
{
10362
0
    int kind1, kind2;
10363
0
    const void *buf1, *buf2;
10364
0
    Py_ssize_t len1, len2;
10365
0
    PyObject* out;
10366
10367
0
    len1 = PyUnicode_GET_LENGTH(self);
10368
0
    kind1 = PyUnicode_KIND(self);
10369
10370
0
    if (substring == NULL) {
10371
0
        if (maxcount < 0) {
10372
0
            maxcount = (len1 - 1) / 2 + 1;
10373
0
        }
10374
0
        switch (kind1) {
10375
0
        case PyUnicode_1BYTE_KIND:
10376
0
            if (PyUnicode_IS_ASCII(self))
10377
0
                return asciilib_rsplit_whitespace(
10378
0
                    self,  PyUnicode_1BYTE_DATA(self),
10379
0
                    len1, maxcount
10380
0
                    );
10381
0
            else
10382
0
                return ucs1lib_rsplit_whitespace(
10383
0
                    self,  PyUnicode_1BYTE_DATA(self),
10384
0
                    len1, maxcount
10385
0
                    );
10386
0
        case PyUnicode_2BYTE_KIND:
10387
0
            return ucs2lib_rsplit_whitespace(
10388
0
                self,  PyUnicode_2BYTE_DATA(self),
10389
0
                len1, maxcount
10390
0
                );
10391
0
        case PyUnicode_4BYTE_KIND:
10392
0
            return ucs4lib_rsplit_whitespace(
10393
0
                self,  PyUnicode_4BYTE_DATA(self),
10394
0
                len1, maxcount
10395
0
                );
10396
0
        default:
10397
0
            Py_UNREACHABLE();
10398
0
        }
10399
0
    }
10400
0
    kind2 = PyUnicode_KIND(substring);
10401
0
    len2 = PyUnicode_GET_LENGTH(substring);
10402
0
    if (maxcount < 0) {
10403
        // if len2 == 0, it will raise ValueError.
10404
0
        maxcount = len2 == 0 ? 0 : (len1 / len2) + 1;
10405
        // handle expected overflow case: (Py_SSIZE_T_MAX / 1) + 1
10406
0
        maxcount = maxcount < 0 ? len1 : maxcount;
10407
0
    }
10408
0
    if (kind1 < kind2 || len1 < len2) {
10409
0
        out = PyList_New(1);
10410
0
        if (out == NULL)
10411
0
            return NULL;
10412
0
        PyList_SET_ITEM(out, 0, Py_NewRef(self));
10413
0
        return out;
10414
0
    }
10415
0
    buf1 = PyUnicode_DATA(self);
10416
0
    buf2 = PyUnicode_DATA(substring);
10417
0
    if (kind2 != kind1) {
10418
0
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
10419
0
        if (!buf2)
10420
0
            return NULL;
10421
0
    }
10422
10423
0
    switch (kind1) {
10424
0
    case PyUnicode_1BYTE_KIND:
10425
0
        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10426
0
            out = asciilib_rsplit(
10427
0
                self,  buf1, len1, buf2, len2, maxcount);
10428
0
        else
10429
0
            out = ucs1lib_rsplit(
10430
0
                self,  buf1, len1, buf2, len2, maxcount);
10431
0
        break;
10432
0
    case PyUnicode_2BYTE_KIND:
10433
0
        out = ucs2lib_rsplit(
10434
0
            self,  buf1, len1, buf2, len2, maxcount);
10435
0
        break;
10436
0
    case PyUnicode_4BYTE_KIND:
10437
0
        out = ucs4lib_rsplit(
10438
0
            self,  buf1, len1, buf2, len2, maxcount);
10439
0
        break;
10440
0
    default:
10441
0
        out = NULL;
10442
0
    }
10443
0
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10444
0
    if (kind2 != kind1)
10445
0
        PyMem_Free((void *)buf2);
10446
0
    return out;
10447
0
}
10448
10449
static Py_ssize_t
10450
anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10451
            PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
10452
4.65k
{
10453
4.65k
    switch (kind) {
10454
4.34k
    case PyUnicode_1BYTE_KIND:
10455
4.34k
        if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10456
4.27k
            return asciilib_find(buf1, len1, buf2, len2, offset);
10457
66
        else
10458
66
            return ucs1lib_find(buf1, len1, buf2, len2, offset);
10459
132
    case PyUnicode_2BYTE_KIND:
10460
132
        return ucs2lib_find(buf1, len1, buf2, len2, offset);
10461
175
    case PyUnicode_4BYTE_KIND:
10462
175
        return ucs4lib_find(buf1, len1, buf2, len2, offset);
10463
4.65k
    }
10464
4.65k
    Py_UNREACHABLE();
10465
4.65k
}
10466
10467
static Py_ssize_t
10468
anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10469
             PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
10470
24.7k
{
10471
24.7k
    switch (kind) {
10472
24.6k
    case PyUnicode_1BYTE_KIND:
10473
24.6k
        return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10474
71
    case PyUnicode_2BYTE_KIND:
10475
71
        return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10476
27
    case PyUnicode_4BYTE_KIND:
10477
27
        return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10478
24.7k
    }
10479
24.7k
    Py_UNREACHABLE();
10480
24.7k
}
10481
10482
static void
10483
replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10484
                      Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10485
117
{
10486
117
    int kind = PyUnicode_KIND(u);
10487
117
    void *data = PyUnicode_DATA(u);
10488
117
    Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10489
117
    if (kind == PyUnicode_1BYTE_KIND) {
10490
117
        ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10491
117
                                      (Py_UCS1 *)data + len,
10492
117
                                      u1, u2, maxcount);
10493
117
    }
10494
0
    else if (kind == PyUnicode_2BYTE_KIND) {
10495
0
        ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10496
0
                                      (Py_UCS2 *)data + len,
10497
0
                                      u1, u2, maxcount);
10498
0
    }
10499
0
    else {
10500
0
        assert(kind == PyUnicode_4BYTE_KIND);
10501
0
        ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10502
0
                                      (Py_UCS4 *)data + len,
10503
0
                                      u1, u2, maxcount);
10504
0
    }
10505
117
}
10506
10507
static PyObject *
10508
replace(PyObject *self, PyObject *str1,
10509
        PyObject *str2, Py_ssize_t maxcount)
10510
25.8k
{
10511
25.8k
    PyObject *u;
10512
25.8k
    const char *sbuf = PyUnicode_DATA(self);
10513
25.8k
    const void *buf1 = PyUnicode_DATA(str1);
10514
25.8k
    const void *buf2 = PyUnicode_DATA(str2);
10515
25.8k
    int srelease = 0, release1 = 0, release2 = 0;
10516
25.8k
    int skind = PyUnicode_KIND(self);
10517
25.8k
    int kind1 = PyUnicode_KIND(str1);
10518
25.8k
    int kind2 = PyUnicode_KIND(str2);
10519
25.8k
    Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10520
25.8k
    Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10521
25.8k
    Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
10522
25.8k
    int mayshrink;
10523
25.8k
    Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
10524
10525
25.8k
    if (slen < len1)
10526
213
        goto nothing;
10527
10528
25.6k
    if (maxcount < 0)
10529
25.6k
        maxcount = PY_SSIZE_T_MAX;
10530
0
    else if (maxcount == 0)
10531
0
        goto nothing;
10532
10533
25.6k
    if (str1 == str2)
10534
0
        goto nothing;
10535
10536
25.6k
    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10537
25.6k
    maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10538
25.6k
    if (maxchar < maxchar_str1)
10539
        /* substring too wide to be present */
10540
0
        goto nothing;
10541
25.6k
    maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10542
    /* Replacing str1 with str2 may cause a maxchar reduction in the
10543
       result string. */
10544
25.6k
    mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
10545
25.6k
    maxchar = Py_MAX(maxchar, maxchar_str2);
10546
10547
25.6k
    if (len1 == len2) {
10548
        /* same length */
10549
865
        if (len1 == 0)
10550
0
            goto nothing;
10551
865
        if (len1 == 1) {
10552
            /* replace characters */
10553
865
            Py_UCS4 u1, u2;
10554
865
            Py_ssize_t pos;
10555
10556
865
            u1 = PyUnicode_READ(kind1, buf1, 0);
10557
865
            pos = findchar(sbuf, skind, slen, u1, 1);
10558
865
            if (pos < 0)
10559
748
                goto nothing;
10560
117
            u2 = PyUnicode_READ(kind2, buf2, 0);
10561
117
            u = PyUnicode_New(slen, maxchar);
10562
117
            if (!u)
10563
0
                goto error;
10564
10565
117
            _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10566
117
            replace_1char_inplace(u, pos, u1, u2, maxcount);
10567
117
        }
10568
0
        else {
10569
0
            int rkind = skind;
10570
0
            char *res;
10571
0
            Py_ssize_t i;
10572
10573
0
            if (kind1 < rkind) {
10574
                /* widen substring */
10575
0
                buf1 = unicode_askind(kind1, buf1, len1, rkind);
10576
0
                if (!buf1) goto error;
10577
0
                release1 = 1;
10578
0
            }
10579
0
            i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
10580
0
            if (i < 0)
10581
0
                goto nothing;
10582
0
            if (rkind > kind2) {
10583
                /* widen replacement */
10584
0
                buf2 = unicode_askind(kind2, buf2, len2, rkind);
10585
0
                if (!buf2) goto error;
10586
0
                release2 = 1;
10587
0
            }
10588
0
            else if (rkind < kind2) {
10589
                /* widen self and buf1 */
10590
0
                rkind = kind2;
10591
0
                if (release1) {
10592
0
                    assert(buf1 != PyUnicode_DATA(str1));
10593
0
                    PyMem_Free((void *)buf1);
10594
0
                    buf1 = PyUnicode_DATA(str1);
10595
0
                    release1 = 0;
10596
0
                }
10597
0
                sbuf = unicode_askind(skind, sbuf, slen, rkind);
10598
0
                if (!sbuf) goto error;
10599
0
                srelease = 1;
10600
0
                buf1 = unicode_askind(kind1, buf1, len1, rkind);
10601
0
                if (!buf1) goto error;
10602
0
                release1 = 1;
10603
0
            }
10604
0
            u = PyUnicode_New(slen, maxchar);
10605
0
            if (!u)
10606
0
                goto error;
10607
0
            assert(PyUnicode_KIND(u) == rkind);
10608
0
            res = PyUnicode_DATA(u);
10609
10610
0
            memcpy(res, sbuf, rkind * slen);
10611
            /* change everything in-place, starting with this one */
10612
0
            memcpy(res + rkind * i,
10613
0
                   buf2,
10614
0
                   rkind * len2);
10615
0
            i += len1;
10616
10617
0
            while ( --maxcount > 0) {
10618
0
                i = anylib_find(rkind, self,
10619
0
                                sbuf+rkind*i, slen-i,
10620
0
                                str1, buf1, len1, i);
10621
0
                if (i == -1)
10622
0
                    break;
10623
0
                memcpy(res + rkind * i,
10624
0
                       buf2,
10625
0
                       rkind * len2);
10626
0
                i += len1;
10627
0
            }
10628
0
        }
10629
865
    }
10630
24.7k
    else {
10631
24.7k
        Py_ssize_t n, i, j, ires;
10632
24.7k
        Py_ssize_t new_size;
10633
24.7k
        int rkind = skind;
10634
24.7k
        char *res;
10635
10636
24.7k
        if (kind1 < rkind) {
10637
            /* widen substring */
10638
98
            buf1 = unicode_askind(kind1, buf1, len1, rkind);
10639
98
            if (!buf1) goto error;
10640
98
            release1 = 1;
10641
98
        }
10642
24.7k
        n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
10643
24.7k
        if (n == 0)
10644
24.0k
            goto nothing;
10645
666
        if (kind2 < rkind) {
10646
            /* widen replacement */
10647
62
            buf2 = unicode_askind(kind2, buf2, len2, rkind);
10648
62
            if (!buf2) goto error;
10649
62
            release2 = 1;
10650
62
        }
10651
604
        else if (kind2 > rkind) {
10652
            /* widen self and buf1 */
10653
0
            rkind = kind2;
10654
0
            sbuf = unicode_askind(skind, sbuf, slen, rkind);
10655
0
            if (!sbuf) goto error;
10656
0
            srelease = 1;
10657
0
            if (release1) {
10658
0
                assert(buf1 != PyUnicode_DATA(str1));
10659
0
                PyMem_Free((void *)buf1);
10660
0
                buf1 = PyUnicode_DATA(str1);
10661
0
                release1 = 0;
10662
0
            }
10663
0
            buf1 = unicode_askind(kind1, buf1, len1, rkind);
10664
0
            if (!buf1) goto error;
10665
0
            release1 = 1;
10666
0
        }
10667
        /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10668
           PyUnicode_GET_LENGTH(str1)); */
10669
666
        if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
10670
0
                PyErr_SetString(PyExc_OverflowError,
10671
0
                                "replace string is too long");
10672
0
                goto error;
10673
0
        }
10674
666
        new_size = slen + n * (len2 - len1);
10675
666
        if (new_size == 0) {
10676
0
            u = _PyUnicode_GetEmpty();
10677
0
            goto done;
10678
0
        }
10679
666
        if (new_size > (PY_SSIZE_T_MAX / rkind)) {
10680
0
            PyErr_SetString(PyExc_OverflowError,
10681
0
                            "replace string is too long");
10682
0
            goto error;
10683
0
        }
10684
666
        u = PyUnicode_New(new_size, maxchar);
10685
666
        if (!u)
10686
0
            goto error;
10687
666
        assert(PyUnicode_KIND(u) == rkind);
10688
666
        res = PyUnicode_DATA(u);
10689
666
        ires = i = 0;
10690
666
        if (len1 > 0) {
10691
5.31k
            while (n-- > 0) {
10692
                /* look for next match */
10693
4.65k
                j = anylib_find(rkind, self,
10694
4.65k
                                sbuf + rkind * i, slen-i,
10695
4.65k
                                str1, buf1, len1, i);
10696
4.65k
                if (j == -1)
10697
0
                    break;
10698
4.65k
                else if (j > i) {
10699
                    /* copy unchanged part [i:j] */
10700
3.62k
                    memcpy(res + rkind * ires,
10701
3.62k
                           sbuf + rkind * i,
10702
3.62k
                           rkind * (j-i));
10703
3.62k
                    ires += j - i;
10704
3.62k
                }
10705
                /* copy substitution string */
10706
4.65k
                if (len2 > 0) {
10707
1.31k
                    memcpy(res + rkind * ires,
10708
1.31k
                           buf2,
10709
1.31k
                           rkind * len2);
10710
1.31k
                    ires += len2;
10711
1.31k
                }
10712
4.65k
                i = j + len1;
10713
4.65k
            }
10714
666
            if (i < slen)
10715
                /* copy tail [i:] */
10716
279
                memcpy(res + rkind * ires,
10717
279
                       sbuf + rkind * i,
10718
279
                       rkind * (slen-i));
10719
666
        }
10720
0
        else {
10721
            /* interleave */
10722
0
            while (n > 0) {
10723
0
                memcpy(res + rkind * ires,
10724
0
                       buf2,
10725
0
                       rkind * len2);
10726
0
                ires += len2;
10727
0
                if (--n <= 0)
10728
0
                    break;
10729
0
                memcpy(res + rkind * ires,
10730
0
                       sbuf + rkind * i,
10731
0
                       rkind);
10732
0
                ires++;
10733
0
                i++;
10734
0
            }
10735
0
            memcpy(res + rkind * ires,
10736
0
                   sbuf + rkind * i,
10737
0
                   rkind * (slen-i));
10738
0
        }
10739
666
    }
10740
10741
783
    if (mayshrink) {
10742
0
        unicode_adjust_maxchar(&u);
10743
0
        if (u == NULL)
10744
0
            goto error;
10745
0
    }
10746
10747
783
  done:
10748
783
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
10749
783
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10750
783
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10751
783
    if (srelease)
10752
0
        PyMem_Free((void *)sbuf);
10753
783
    if (release1)
10754
62
        PyMem_Free((void *)buf1);
10755
783
    if (release2)
10756
62
        PyMem_Free((void *)buf2);
10757
783
    assert(_PyUnicode_CheckConsistency(u, 1));
10758
783
    return u;
10759
10760
25.0k
  nothing:
10761
    /* nothing to replace; return original string (when possible) */
10762
25.0k
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
10763
25.0k
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10764
25.0k
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10765
25.0k
    if (srelease)
10766
0
        PyMem_Free((void *)sbuf);
10767
25.0k
    if (release1)
10768
36
        PyMem_Free((void *)buf1);
10769
25.0k
    if (release2)
10770
0
        PyMem_Free((void *)buf2);
10771
25.0k
    return unicode_result_unchanged(self);
10772
10773
0
  error:
10774
0
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
10775
0
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10776
0
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10777
0
    if (srelease)
10778
0
        PyMem_Free((void *)sbuf);
10779
0
    if (release1)
10780
0
        PyMem_Free((void *)buf1);
10781
0
    if (release2)
10782
0
        PyMem_Free((void *)buf2);
10783
0
    return NULL;
10784
0
}
10785
10786
/* --- Unicode Object Methods --------------------------------------------- */
10787
10788
/*[clinic input]
10789
@permit_long_docstring_body
10790
str.title as unicode_title
10791
10792
Return a version of the string where each word is titlecased.
10793
10794
More specifically, words start with uppercased characters and all remaining
10795
cased characters have lower case.
10796
[clinic start generated code]*/
10797
10798
static PyObject *
unicode_title_impl(PyObject *self)
/*[clinic end generated code: output=c75ae03809574902 input=533ce0eb6a7f5d1b]*/
{
    /* All of the work is done by case_operation() with the do_title
       callback; its result is returned unchanged. */
    PyObject *titled = case_operation(self, do_title);
    return titled;
}
10804
10805
/*[clinic input]
10806
@permit_long_docstring_body
10807
str.capitalize as unicode_capitalize
10808
10809
Return a capitalized version of the string.
10810
10811
More specifically, make the first character have upper case and the rest lower
10812
case.
10813
[clinic start generated code]*/
10814
10815
static PyObject *
unicode_capitalize_impl(PyObject *self)
/*[clinic end generated code: output=e49a4c333cdb7667 input=a4a15ade41f6f9e9]*/
{
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);

    /* Only a non-empty string goes through case_operation(); an empty one
       is handed to unicode_result_unchanged(). */
    if (length != 0) {
        return case_operation(self, do_capitalize);
    }
    return unicode_result_unchanged(self);
}
10823
10824
/*[clinic input]
10825
str.casefold as unicode_casefold
10826
10827
Return a version of the string suitable for caseless comparisons.
10828
[clinic start generated code]*/
10829
10830
static PyObject *
unicode_casefold_impl(PyObject *self)
/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
{
    /* Non-ASCII strings take the general case_operation() path. */
    if (!PyUnicode_IS_ASCII(self)) {
        return case_operation(self, do_casefold);
    }
    /* ASCII strings use the dedicated helper; the flag value 1 presumably
       selects lowercasing -- confirm against ascii_upper_or_lower(). */
    return ascii_upper_or_lower(self, 1);
}
10838
10839
10840
/* Argument converter. Accepts a single Unicode character. */
10841
10842
static int
10843
convert_uc(PyObject *obj, void *addr)
10844
0
{
10845
0
    Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
10846
10847
0
    if (!PyUnicode_Check(obj)) {
10848
0
        PyErr_Format(PyExc_TypeError,
10849
0
                     "The fill character must be a unicode character, "
10850
0
                     "not %.100s", Py_TYPE(obj)->tp_name);
10851
0
        return 0;
10852
0
    }
10853
0
    if (PyUnicode_GET_LENGTH(obj) != 1) {
10854
0
        PyErr_SetString(PyExc_TypeError,
10855
0
                        "The fill character must be exactly one character long");
10856
0
        return 0;
10857
0
    }
10858
0
    *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
10859
0
    return 1;
10860
0
}
10861
10862
/*[clinic input]
10863
str.center as unicode_center
10864
10865
    width: Py_ssize_t
10866
    fillchar: Py_UCS4 = ' '
10867
    /
10868
10869
Return a centered string of length width.
10870
10871
Padding is done using the specified fill character (default is a space).
10872
[clinic start generated code]*/
10873
10874
static PyObject *
10875
unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10876
/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
10877
0
{
10878
0
    Py_ssize_t marg, left;
10879
10880
0
    if (PyUnicode_GET_LENGTH(self) >= width)
10881
0
        return unicode_result_unchanged(self);
10882
10883
0
    marg = width - PyUnicode_GET_LENGTH(self);
10884
0
    left = marg / 2 + (marg & width & 1);
10885
10886
0
    return pad(self, left, marg - left, fillchar);
10887
0
}
10888
10889
/* This function assumes that str1 and str2 are readied by the caller. */
10890
10891
static int
10892
unicode_compare(PyObject *str1, PyObject *str2)
10893
7.02M
{
10894
7.02M
#define COMPARE(TYPE1, TYPE2) \
10895
7.02M
    do { \
10896
6.26M
        TYPE1* p1 = (TYPE1 *)data1; \
10897
6.26M
        TYPE2* p2 = (TYPE2 *)data2; \
10898
6.26M
        TYPE1* end = p1 + len; \
10899
6.26M
        Py_UCS4 c1, c2; \
10900
6.59M
        for (; p1 != end; p1++, p2++) { \
10901
6.57M
            c1 = *p1; \
10902
6.57M
            c2 = *p2; \
10903
6.57M
            if (c1 != c2) \
10904
6.57M
                return (c1 < c2) ? -1 : 1; \
10905
6.57M
        } \
10906
6.26M
    } \
10907
6.26M
    while (0)
10908
10909
7.02M
    int kind1, kind2;
10910
7.02M
    const void *data1, *data2;
10911
7.02M
    Py_ssize_t len1, len2, len;
10912
10913
7.02M
    kind1 = PyUnicode_KIND(str1);
10914
7.02M
    kind2 = PyUnicode_KIND(str2);
10915
7.02M
    data1 = PyUnicode_DATA(str1);
10916
7.02M
    data2 = PyUnicode_DATA(str2);
10917
7.02M
    len1 = PyUnicode_GET_LENGTH(str1);
10918
7.02M
    len2 = PyUnicode_GET_LENGTH(str2);
10919
7.02M
    len = Py_MIN(len1, len2);
10920
10921
7.02M
    switch(kind1) {
10922
3.29M
    case PyUnicode_1BYTE_KIND:
10923
3.29M
    {
10924
3.29M
        switch(kind2) {
10925
560k
        case PyUnicode_1BYTE_KIND:
10926
560k
        {
10927
560k
            int cmp = memcmp(data1, data2, len);
10928
            /* normalize result of memcmp() into the range [-1; 1] */
10929
560k
            if (cmp < 0)
10930
369k
                return -1;
10931
190k
            if (cmp > 0)
10932
167k
                return 1;
10933
23.5k
            break;
10934
190k
        }
10935
2.61M
        case PyUnicode_2BYTE_KIND:
10936
2.61M
            COMPARE(Py_UCS1, Py_UCS2);
10937
4.06k
            break;
10938
119k
        case PyUnicode_4BYTE_KIND:
10939
119k
            COMPARE(Py_UCS1, Py_UCS4);
10940
26
            break;
10941
26
        default:
10942
0
            Py_UNREACHABLE();
10943
3.29M
        }
10944
27.6k
        break;
10945
3.29M
    }
10946
3.43M
    case PyUnicode_2BYTE_KIND:
10947
3.43M
    {
10948
3.43M
        switch(kind2) {
10949
35.0k
        case PyUnicode_1BYTE_KIND:
10950
35.0k
            COMPARE(Py_UCS2, Py_UCS1);
10951
3.00k
            break;
10952
3.32M
        case PyUnicode_2BYTE_KIND:
10953
3.32M
        {
10954
3.32M
            COMPARE(Py_UCS2, Py_UCS2);
10955
11.1k
            break;
10956
3.32M
        }
10957
81.7k
        case PyUnicode_4BYTE_KIND:
10958
81.7k
            COMPARE(Py_UCS2, Py_UCS4);
10959
2
            break;
10960
2
        default:
10961
0
            Py_UNREACHABLE();
10962
3.43M
        }
10963
14.1k
        break;
10964
3.43M
    }
10965
293k
    case PyUnicode_4BYTE_KIND:
10966
293k
    {
10967
293k
        switch(kind2) {
10968
1.27k
        case PyUnicode_1BYTE_KIND:
10969
1.27k
            COMPARE(Py_UCS4, Py_UCS1);
10970
10
            break;
10971
96.0k
        case PyUnicode_2BYTE_KIND:
10972
96.0k
            COMPARE(Py_UCS4, Py_UCS2);
10973
6
            break;
10974
196k
        case PyUnicode_4BYTE_KIND:
10975
196k
        {
10976
196k
#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10977
196k
            int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10978
            /* normalize result of wmemcmp() into the range [-1; 1] */
10979
196k
            if (cmp < 0)
10980
71.4k
                return -1;
10981
125k
            if (cmp > 0)
10982
125k
                return 1;
10983
#else
10984
            COMPARE(Py_UCS4, Py_UCS4);
10985
#endif
10986
4
            break;
10987
125k
        }
10988
4
        default:
10989
0
            Py_UNREACHABLE();
10990
293k
        }
10991
20
        break;
10992
293k
    }
10993
20
    default:
10994
0
        Py_UNREACHABLE();
10995
7.02M
    }
10996
10997
41.7k
    if (len1 == len2)
10998
333
        return 0;
10999
41.4k
    if (len1 < len2)
11000
20.9k
        return -1;
11001
20.5k
    else
11002
20.5k
        return 1;
11003
11004
41.4k
#undef COMPARE
11005
41.4k
}
11006
11007
11008
int
11009
_PyUnicode_Equal(PyObject *str1, PyObject *str2)
11010
68.4M
{
11011
68.4M
    assert(PyUnicode_Check(str1));
11012
68.4M
    assert(PyUnicode_Check(str2));
11013
68.4M
    if (str1 == str2) {
11014
8.30M
        return 1;
11015
8.30M
    }
11016
60.1M
    return unicode_eq(str1, str2);
11017
68.4M
}
11018
11019
11020
int
11021
PyUnicode_Equal(PyObject *str1, PyObject *str2)
11022
0
{
11023
0
    if (!PyUnicode_Check(str1)) {
11024
0
        PyErr_Format(PyExc_TypeError,
11025
0
                     "first argument must be str, not %T", str1);
11026
0
        return -1;
11027
0
    }
11028
0
    if (!PyUnicode_Check(str2)) {
11029
0
        PyErr_Format(PyExc_TypeError,
11030
0
                     "second argument must be str, not %T", str2);
11031
0
        return -1;
11032
0
    }
11033
11034
0
    return _PyUnicode_Equal(str1, str2);
11035
0
}
11036
11037
11038
int
11039
PyUnicode_Compare(PyObject *left, PyObject *right)
11040
126k
{
11041
126k
    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11042
        /* a string is equal to itself */
11043
126k
        if (left == right)
11044
21
            return 0;
11045
11046
126k
        return unicode_compare(left, right);
11047
126k
    }
11048
0
    PyErr_Format(PyExc_TypeError,
11049
0
                 "Can't compare %.100s and %.100s",
11050
0
                 Py_TYPE(left)->tp_name,
11051
0
                 Py_TYPE(right)->tp_name);
11052
0
    return -1;
11053
126k
}
11054
11055
int
11056
PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11057
623k
{
11058
623k
    Py_ssize_t i;
11059
623k
    int kind;
11060
623k
    Py_UCS4 chr;
11061
11062
623k
    assert(_PyUnicode_CHECK(uni));
11063
623k
    kind = PyUnicode_KIND(uni);
11064
623k
    if (kind == PyUnicode_1BYTE_KIND) {
11065
623k
        const void *data = PyUnicode_1BYTE_DATA(uni);
11066
623k
        size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
11067
623k
        size_t len, len2 = strlen(str);
11068
623k
        int cmp;
11069
11070
623k
        len = Py_MIN(len1, len2);
11071
623k
        cmp = memcmp(data, str, len);
11072
623k
        if (cmp != 0) {
11073
323k
            if (cmp < 0)
11074
21.5k
                return -1;
11075
301k
            else
11076
301k
                return 1;
11077
323k
        }
11078
299k
        if (len1 > len2)
11079
75
            return 1; /* uni is longer */
11080
299k
        if (len1 < len2)
11081
6.23k
            return -1; /* str is longer */
11082
293k
        return 0;
11083
299k
    }
11084
489
    else {
11085
489
        const void *data = PyUnicode_DATA(uni);
11086
        /* Compare Unicode string and source character set string */
11087
779
        for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
11088
765
            if (chr != (unsigned char)str[i])
11089
475
                return (chr < (unsigned char)(str[i])) ? -1 : 1;
11090
        /* This check keeps Python strings that end in '\0' from comparing equal
11091
         to C strings identical up to that point. */
11092
14
        if (PyUnicode_GET_LENGTH(uni) != i || chr)
11093
14
            return 1; /* uni is longer */
11094
0
        if (str[i])
11095
0
            return -1; /* str is longer */
11096
0
        return 0;
11097
0
    }
11098
623k
}
11099
11100
int
11101
PyUnicode_EqualToUTF8(PyObject *unicode, const char *str)
11102
0
{
11103
0
    return PyUnicode_EqualToUTF8AndSize(unicode, str, strlen(str));
11104
0
}
11105
11106
int
11107
PyUnicode_EqualToUTF8AndSize(PyObject *unicode, const char *str, Py_ssize_t size)
11108
0
{
11109
0
    assert(_PyUnicode_CHECK(unicode));
11110
0
    assert(str);
11111
11112
0
    if (PyUnicode_IS_ASCII(unicode)) {
11113
0
        Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
11114
0
        return size == len &&
11115
0
            memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11116
0
    }
11117
0
    if (PyUnicode_UTF8(unicode) != NULL) {
11118
0
        Py_ssize_t len = PyUnicode_UTF8_LENGTH(unicode);
11119
0
        return size == len &&
11120
0
            memcmp(PyUnicode_UTF8(unicode), str, len) == 0;
11121
0
    }
11122
11123
0
    Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
11124
0
    if ((size_t)len >= (size_t)size || (size_t)len < (size_t)size / 4) {
11125
0
        return 0;
11126
0
    }
11127
0
    const unsigned char *s = (const unsigned char *)str;
11128
0
    const unsigned char *ends = s + (size_t)size;
11129
0
    int kind = PyUnicode_KIND(unicode);
11130
0
    const void *data = PyUnicode_DATA(unicode);
11131
    /* Compare Unicode string and UTF-8 string */
11132
0
    for (Py_ssize_t i = 0; i < len; i++) {
11133
0
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11134
0
        if (ch < 0x80) {
11135
0
            if (ends == s || s[0] != ch) {
11136
0
                return 0;
11137
0
            }
11138
0
            s += 1;
11139
0
        }
11140
0
        else if (ch < 0x800) {
11141
0
            if ((ends - s) < 2 ||
11142
0
                s[0] != (0xc0 | (ch >> 6)) ||
11143
0
                s[1] != (0x80 | (ch & 0x3f)))
11144
0
            {
11145
0
                return 0;
11146
0
            }
11147
0
            s += 2;
11148
0
        }
11149
0
        else if (ch < 0x10000) {
11150
0
            if (Py_UNICODE_IS_SURROGATE(ch) ||
11151
0
                (ends - s) < 3 ||
11152
0
                s[0] != (0xe0 | (ch >> 12)) ||
11153
0
                s[1] != (0x80 | ((ch >> 6) & 0x3f)) ||
11154
0
                s[2] != (0x80 | (ch & 0x3f)))
11155
0
            {
11156
0
                return 0;
11157
0
            }
11158
0
            s += 3;
11159
0
        }
11160
0
        else {
11161
0
            assert(ch <= MAX_UNICODE);
11162
0
            if ((ends - s) < 4 ||
11163
0
                s[0] != (0xf0 | (ch >> 18)) ||
11164
0
                s[1] != (0x80 | ((ch >> 12) & 0x3f)) ||
11165
0
                s[2] != (0x80 | ((ch >> 6) & 0x3f)) ||
11166
0
                s[3] != (0x80 | (ch & 0x3f)))
11167
0
            {
11168
0
                return 0;
11169
0
            }
11170
0
            s += 4;
11171
0
        }
11172
0
    }
11173
0
    return s == ends;
11174
0
}
11175
11176
int
11177
_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11178
7.04M
{
11179
7.04M
    size_t len;
11180
7.04M
    assert(_PyUnicode_CHECK(unicode));
11181
7.04M
    assert(str);
11182
7.04M
#ifndef NDEBUG
11183
39.6M
    for (const char *p = str; *p; p++) {
11184
32.6M
        assert((unsigned char)*p < 128);
11185
32.6M
    }
11186
7.04M
#endif
11187
7.04M
    if (!PyUnicode_IS_ASCII(unicode))
11188
560k
        return 0;
11189
6.48M
    len = (size_t)PyUnicode_GET_LENGTH(unicode);
11190
6.48M
    return strlen(str) == len &&
11191
515k
           memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11192
7.04M
}
11193
11194
/* Rich comparison of `left` and `right` with operator `op` (one of the
   Py_EQ/Py_NE/Py_LT/Py_LE/Py_GT/Py_GE constants).

   Returns NotImplemented when either operand fails PyUnicode_Check().
   When both operands are the same object the answer is decided by `op`
   alone; an unrecognised `op` in that case calls PyErr_BadArgument() and
   returns NULL.  Equality tests go through unicode_eq(), ordering tests
   through unicode_compare(). */
PyObject *
PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
{
    if (!(PyUnicode_Check(left) && PyUnicode_Check(right))) {
        Py_RETURN_NOTIMPLEMENTED;
    }

    if (left == right) {
        /* a string is equal to itself */
        if (op == Py_EQ || op == Py_LE || op == Py_GE) {
            Py_RETURN_TRUE;
        }
        if (op == Py_NE || op == Py_LT || op == Py_GT) {
            Py_RETURN_FALSE;
        }
        PyErr_BadArgument();
        return NULL;
    }

    if (op == Py_EQ || op == Py_NE) {
        /* Flip the equality result when the caller asked for "!=". */
        int negate = (op == Py_NE);
        return PyBool_FromLong(unicode_eq(left, right) ^ negate);
    }

    int order = unicode_compare(left, right);
    Py_RETURN_RICHCOMPARE(order, 0, op);
}
11228
11229
int
11230
PyUnicode_Contains(PyObject *str, PyObject *substr)
11231
424M
{
11232
424M
    int kind1, kind2;
11233
424M
    const void *buf1, *buf2;
11234
424M
    Py_ssize_t len1, len2;
11235
424M
    int result;
11236
11237
424M
    if (!PyUnicode_Check(substr)) {
11238
0
        PyErr_Format(PyExc_TypeError,
11239
0
                     "'in <string>' requires string as left operand, not %.100s",
11240
0
                     Py_TYPE(substr)->tp_name);
11241
0
        return -1;
11242
0
    }
11243
424M
    if (ensure_unicode(str) < 0)
11244
0
        return -1;
11245
11246
424M
    kind1 = PyUnicode_KIND(str);
11247
424M
    kind2 = PyUnicode_KIND(substr);
11248
424M
    if (kind1 < kind2)
11249
3
        return 0;
11250
424M
    len1 = PyUnicode_GET_LENGTH(str);
11251
424M
    len2 = PyUnicode_GET_LENGTH(substr);
11252
424M
    if (len1 < len2)
11253
44
        return 0;
11254
424M
    buf1 = PyUnicode_DATA(str);
11255
424M
    buf2 = PyUnicode_DATA(substr);
11256
424M
    if (len2 == 1) {
11257
24.6M
        Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11258
24.6M
        result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
11259
24.6M
        return result;
11260
24.6M
    }
11261
399M
    if (kind2 != kind1) {
11262
0
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
11263
0
        if (!buf2)
11264
0
            return -1;
11265
0
    }
11266
11267
399M
    switch (kind1) {
11268
399M
    case PyUnicode_1BYTE_KIND:
11269
399M
        result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11270
399M
        break;
11271
0
    case PyUnicode_2BYTE_KIND:
11272
0
        result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11273
0
        break;
11274
0
    case PyUnicode_4BYTE_KIND:
11275
0
        result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11276
0
        break;
11277
0
    default:
11278
0
        Py_UNREACHABLE();
11279
399M
    }
11280
11281
399M
    assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
11282
399M
    if (kind2 != kind1)
11283
0
        PyMem_Free((void *)buf2);
11284
11285
399M
    return result;
11286
399M
}
11287
11288
/* Concat to string or Unicode object giving a new Unicode object. */
11289
11290
PyObject *
11291
PyUnicode_Concat(PyObject *left, PyObject *right)
11292
998k
{
11293
998k
    PyObject *result;
11294
998k
    Py_UCS4 maxchar, maxchar2;
11295
998k
    Py_ssize_t left_len, right_len, new_len;
11296
11297
998k
    if (ensure_unicode(left) < 0)
11298
0
        return NULL;
11299
11300
998k
    if (!PyUnicode_Check(right)) {
11301
6
        PyErr_Format(PyExc_TypeError,
11302
6
            "can only concatenate str (not \"%.200s\") to str",
11303
6
            Py_TYPE(right)->tp_name);
11304
6
        return NULL;
11305
6
    }
11306
11307
    /* Shortcuts */
11308
998k
    PyObject *empty = _PyUnicode_GetEmpty();  // Borrowed reference
11309
998k
    if (left == empty) {
11310
3.62k
        return PyUnicode_FromObject(right);
11311
3.62k
    }
11312
995k
    if (right == empty) {
11313
4.25k
        return PyUnicode_FromObject(left);
11314
4.25k
    }
11315
11316
990k
    left_len = PyUnicode_GET_LENGTH(left);
11317
990k
    right_len = PyUnicode_GET_LENGTH(right);
11318
990k
    if (left_len > PY_SSIZE_T_MAX - right_len) {
11319
0
        PyErr_SetString(PyExc_OverflowError,
11320
0
                        "strings are too large to concat");
11321
0
        return NULL;
11322
0
    }
11323
990k
    new_len = left_len + right_len;
11324
11325
990k
    maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11326
990k
    maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11327
990k
    maxchar = Py_MAX(maxchar, maxchar2);
11328
11329
    /* Concat the two Unicode strings */
11330
990k
    result = PyUnicode_New(new_len, maxchar);
11331
990k
    if (result == NULL)
11332
0
        return NULL;
11333
990k
    _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11334
990k
    _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11335
990k
    assert(_PyUnicode_CheckConsistency(result, 1));
11336
990k
    return result;
11337
990k
}
11338
11339
void
11340
PyUnicode_Append(PyObject **p_left, PyObject *right)
11341
5.98M
{
11342
5.98M
    PyObject *left, *res;
11343
5.98M
    Py_UCS4 maxchar, maxchar2;
11344
5.98M
    Py_ssize_t left_len, right_len, new_len;
11345
11346
5.98M
    if (p_left == NULL) {
11347
0
        if (!PyErr_Occurred())
11348
0
            PyErr_BadInternalCall();
11349
0
        return;
11350
0
    }
11351
5.98M
    left = *p_left;
11352
5.98M
    if (right == NULL || left == NULL
11353
5.98M
        || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
11354
0
        if (!PyErr_Occurred())
11355
0
            PyErr_BadInternalCall();
11356
0
        goto error;
11357
0
    }
11358
11359
    /* Shortcuts */
11360
5.98M
    PyObject *empty = _PyUnicode_GetEmpty();  // Borrowed reference
11361
5.98M
    if (left == empty) {
11362
48.7k
        Py_DECREF(left);
11363
48.7k
        *p_left = Py_NewRef(right);
11364
48.7k
        return;
11365
48.7k
    }
11366
5.93M
    if (right == empty) {
11367
2.11k
        return;
11368
2.11k
    }
11369
11370
5.93M
    left_len = PyUnicode_GET_LENGTH(left);
11371
5.93M
    right_len = PyUnicode_GET_LENGTH(right);
11372
5.93M
    if (left_len > PY_SSIZE_T_MAX - right_len) {
11373
0
        PyErr_SetString(PyExc_OverflowError,
11374
0
                        "strings are too large to concat");
11375
0
        goto error;
11376
0
    }
11377
5.93M
    new_len = left_len + right_len;
11378
11379
5.93M
    if (_PyUnicode_IsModifiable(left)
11380
5.93M
        && PyUnicode_CheckExact(right)
11381
11.1M
        && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
11382
        /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11383
           to change the structure size, but characters are stored just after
11384
           the structure, and so it requires to move all characters which is
11385
           not so different than duplicating the string. */
11386
5.56M
        && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11387
5.56M
    {
11388
        /* append inplace */
11389
5.56M
        if (unicode_resize(p_left, new_len) != 0)
11390
0
            goto error;
11391
11392
        /* copy 'right' into the newly allocated area of 'left' */
11393
5.56M
        _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
11394
5.56M
    }
11395
371k
    else {
11396
371k
        maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11397
371k
        maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11398
371k
        maxchar = Py_MAX(maxchar, maxchar2);
11399
11400
        /* Concat the two Unicode strings */
11401
371k
        res = PyUnicode_New(new_len, maxchar);
11402
371k
        if (res == NULL)
11403
0
            goto error;
11404
371k
        _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11405
371k
        _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
11406
371k
        Py_DECREF(left);
11407
371k
        *p_left = res;
11408
371k
    }
11409
5.93M
    assert(_PyUnicode_CheckConsistency(*p_left, 1));
11410
5.93M
    return;
11411
11412
5.93M
error:
11413
0
    Py_CLEAR(*p_left);
11414
0
}
11415
11416
void
11417
PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11418
0
{
11419
0
    PyUnicode_Append(pleft, right);
11420
0
    Py_XDECREF(right);
11421
0
}
11422
11423
/*[clinic input]
11424
@permit_long_summary
11425
@text_signature "($self, sub[, start[, end]], /)"
11426
str.count as unicode_count -> Py_ssize_t
11427
11428
    self as str: self
11429
    sub as substr: unicode
11430
    start: slice_index(accept={int, NoneType}, c_default='0') = None
11431
    end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
11432
    /
11433
11434
Return the number of non-overlapping occurrences of substring sub in string S[start:end].
11435
11436
Optional arguments start and end are interpreted as in slice notation.
11437
[clinic start generated code]*/
11438
11439
static Py_ssize_t
11440
unicode_count_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
11441
                   Py_ssize_t end)
11442
/*[clinic end generated code: output=8fcc3aef0b18edbf input=8590716ee228b935]*/
11443
6.99k
{
11444
6.99k
    assert(PyUnicode_Check(str));
11445
6.99k
    assert(PyUnicode_Check(substr));
11446
11447
6.99k
    Py_ssize_t result;
11448
6.99k
    int kind1, kind2;
11449
6.99k
    const void *buf1 = NULL, *buf2 = NULL;
11450
6.99k
    Py_ssize_t len1, len2;
11451
11452
6.99k
    kind1 = PyUnicode_KIND(str);
11453
6.99k
    kind2 = PyUnicode_KIND(substr);
11454
6.99k
    if (kind1 < kind2)
11455
0
        return 0;
11456
11457
6.99k
    len1 = PyUnicode_GET_LENGTH(str);
11458
6.99k
    len2 = PyUnicode_GET_LENGTH(substr);
11459
6.99k
    ADJUST_INDICES(start, end, len1);
11460
6.99k
    if (end - start < len2)
11461
1.29k
        return 0;
11462
11463
5.70k
    buf1 = PyUnicode_DATA(str);
11464
5.70k
    buf2 = PyUnicode_DATA(substr);
11465
5.70k
    if (kind2 != kind1) {
11466
2.50k
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
11467
2.50k
        if (!buf2)
11468
0
            goto onError;
11469
2.50k
    }
11470
11471
    // We don't reuse `anylib_count` here because of the explicit casts.
11472
5.70k
    switch (kind1) {
11473
3.20k
    case PyUnicode_1BYTE_KIND:
11474
3.20k
        result = ucs1lib_count(
11475
3.20k
            ((const Py_UCS1*)buf1) + start, end - start,
11476
3.20k
            buf2, len2, PY_SSIZE_T_MAX
11477
3.20k
            );
11478
3.20k
        break;
11479
1.05k
    case PyUnicode_2BYTE_KIND:
11480
1.05k
        result = ucs2lib_count(
11481
1.05k
            ((const Py_UCS2*)buf1) + start, end - start,
11482
1.05k
            buf2, len2, PY_SSIZE_T_MAX
11483
1.05k
            );
11484
1.05k
        break;
11485
1.44k
    case PyUnicode_4BYTE_KIND:
11486
1.44k
        result = ucs4lib_count(
11487
1.44k
            ((const Py_UCS4*)buf1) + start, end - start,
11488
1.44k
            buf2, len2, PY_SSIZE_T_MAX
11489
1.44k
            );
11490
1.44k
        break;
11491
0
    default:
11492
0
        Py_UNREACHABLE();
11493
5.70k
    }
11494
11495
5.70k
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
11496
5.70k
    if (kind2 != kind1)
11497
2.50k
        PyMem_Free((void *)buf2);
11498
11499
5.70k
    return result;
11500
0
  onError:
11501
0
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
11502
0
    if (kind2 != kind1)
11503
0
        PyMem_Free((void *)buf2);
11504
0
    return -1;
11505
0
}
11506
11507
/*[clinic input]
11508
str.encode as unicode_encode
11509
11510
    encoding: str(c_default="NULL") = 'utf-8'
11511
        The encoding in which to encode the string.
11512
    errors: str(c_default="NULL") = 'strict'
11513
        The error handling scheme to use for encoding errors.
11514
        The default is 'strict' meaning that encoding errors raise a
11515
        UnicodeEncodeError.  Other possible values are 'ignore', 'replace' and
11516
        'xmlcharrefreplace' as well as any other name registered with
11517
        codecs.register_error that can handle UnicodeEncodeErrors.
11518
11519
Encode the string using the codec registered for encoding.
11520
[clinic start generated code]*/
11521
11522
static PyObject *
unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
{
    /* Thin wrapper: all the work is done by PyUnicode_AsEncodedString().
       Per the clinic declaration, `encoding` and `errors` default to NULL
       at the C level. */
    PyObject *encoded = PyUnicode_AsEncodedString(self, encoding, errors);
    return encoded;
}
11528
11529
/*[clinic input]
11530
str.expandtabs as unicode_expandtabs
11531
11532
    tabsize: int = 8
11533
11534
Return a copy where all tab characters are expanded using spaces.
11535
11536
If tabsize is not given, a tab size of 8 characters is assumed.
11537
[clinic start generated code]*/
11538
11539
static PyObject *
unicode_expandtabs_impl(PyObject *self, int tabsize)
/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
{
    /* Two passes over the source: the first measures the expanded length,
       the second writes the characters into a string of that length.
       A tab is replaced by spaces up to the next multiple of `tabsize`
       columns; with tabsize <= 0 a tab contributes nothing.  '\n' and '\r'
       reset the column counter. */
    PyObject *expanded;
    void *dest;
    Py_ssize_t pad;
    Py_ssize_t out_len = 0;     /* length of the result */
    Py_ssize_t column = 0;      /* characters since the last line break */
    int saw_tab = 0;

    Py_ssize_t src_len = PyUnicode_GET_LENGTH(self);
    int kind = PyUnicode_KIND(self);
    const void *src = PyUnicode_DATA(self);

    /* First pass: determine size of output string */
    for (Py_ssize_t pos = 0; pos < src_len; pos++) {
        Py_UCS4 ch = PyUnicode_READ(kind, src, pos);
        if (ch != '\t') {
            if (out_len > PY_SSIZE_T_MAX - 1) {
                goto overflow;
            }
            out_len++;
            column++;
            if (ch == '\n' || ch == '\r') {
                column = 0;
            }
            continue;
        }

        saw_tab = 1;
        if (tabsize > 0) {
            pad = tabsize - (column % tabsize); /* cannot overflow */
            if (out_len > PY_SSIZE_T_MAX - pad) {
                goto overflow;
            }
            out_len += pad;
            column += pad;
        }
    }
    if (!saw_tab) {
        /* No tab at all: nothing to expand. */
        return unicode_result_unchanged(self);
    }

    /* Second pass: create output string and fill it */
    expanded = PyUnicode_New(out_len, PyUnicode_MAX_CHAR_VALUE(self));
    if (expanded == NULL) {
        return NULL;
    }
    dest = PyUnicode_DATA(expanded);

    Py_ssize_t out = 0;         /* next write position in `dest` */
    column = 0;
    for (Py_ssize_t pos = 0; pos < src_len; pos++) {
        Py_UCS4 ch = PyUnicode_READ(kind, src, pos);
        if (ch != '\t') {
            PyUnicode_WRITE(kind, dest, out, ch);
            out++;
            column++;
            if (ch == '\n' || ch == '\r') {
                column = 0;
            }
            continue;
        }

        if (tabsize > 0) {
            pad = tabsize - (column % tabsize);
            _PyUnicode_Fill(kind, dest, ' ', out, pad);
            out += pad;
            column += pad;
        }
    }
    assert (out == PyUnicode_GET_LENGTH(expanded));
    return unicode_result(expanded);

  overflow:
    PyErr_SetString(PyExc_OverflowError, "new string is too long");
    return NULL;
}
11614
11615
/*[clinic input]
11616
@permit_long_summary
11617
str.find as unicode_find = str.count
11618
11619
Return the lowest index in S where substring sub is found, such that sub is contained within S[start:end].
11620
11621
Optional arguments start and end are interpreted as in slice notation.
11622
Return -1 on failure.
11623
[clinic start generated code]*/
11624
11625
static Py_ssize_t
11626
unicode_find_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
11627
                  Py_ssize_t end)
11628
/*[clinic end generated code: output=51dbe6255712e278 input=3a9d650fe4c24695]*/
11629
22
{
11630
22
    Py_ssize_t result = any_find_slice(str, substr, start, end, 1);
11631
22
    if (result < 0) {
11632
0
        return -1;
11633
0
    }
11634
22
    return result;
11635
22
}
11636
11637
static PyObject *
11638
unicode_getitem(PyObject *self, Py_ssize_t index)
11639
6.87M
{
11640
6.87M
    const void *data;
11641
6.87M
    int kind;
11642
6.87M
    Py_UCS4 ch;
11643
11644
6.87M
    if (!PyUnicode_Check(self)) {
11645
0
        PyErr_BadArgument();
11646
0
        return NULL;
11647
0
    }
11648
6.87M
    if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11649
5.16k
        PyErr_SetString(PyExc_IndexError, "string index out of range");
11650
5.16k
        return NULL;
11651
5.16k
    }
11652
6.86M
    kind = PyUnicode_KIND(self);
11653
6.86M
    data = PyUnicode_DATA(self);
11654
6.86M
    ch = PyUnicode_READ(kind, data, index);
11655
6.86M
    return unicode_char(ch);
11656
6.86M
}
11657
11658
/* Believe it or not, this produces the same value for ASCII strings
11659
   as bytes_hash(). */
11660
static Py_hash_t
11661
unicode_hash(PyObject *self)
11662
4.24M
{
11663
4.24M
    Py_uhash_t x;  /* Unsigned for defined overflow behavior. */
11664
11665
#ifdef Py_DEBUG
11666
    assert(_Py_HashSecret_Initialized);
11667
#endif
11668
4.24M
    Py_hash_t hash = PyUnicode_HASH(self);
11669
4.24M
    if (hash != -1) {
11670
471k
        return hash;
11671
471k
    }
11672
3.77M
    x = Py_HashBuffer(PyUnicode_DATA(self),
11673
3.77M
                      PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
11674
11675
0
    PyUnicode_SET_HASH(self, x);
11676
3.77M
    return x;
11677
3.77M
}
11678
11679
/*[clinic input]
11680
@permit_long_summary
11681
str.index as unicode_index = str.count
11682
11683
Return the lowest index in S where substring sub is found, such that sub is contained within S[start:end].
11684
11685
Optional arguments start and end are interpreted as in slice notation.
11686
Raises ValueError when the substring is not found.
11687
[clinic start generated code]*/
11688
11689
static Py_ssize_t
11690
unicode_index_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
11691
                   Py_ssize_t end)
11692
/*[clinic end generated code: output=77558288837cdf40 input=ae5e48f69ed75b06]*/
11693
0
{
11694
0
    Py_ssize_t result = any_find_slice(str, substr, start, end, 1);
11695
0
    if (result == -1) {
11696
0
        PyErr_SetString(PyExc_ValueError, "substring not found");
11697
0
    }
11698
0
    else if (result < 0) {
11699
0
        return -1;
11700
0
    }
11701
0
    return result;
11702
0
}
11703
11704
/*[clinic input]
11705
str.isascii as unicode_isascii
11706
11707
Return True if all characters in the string are ASCII, False otherwise.
11708
11709
ASCII characters have code points in the range U+0000-U+007F.
11710
Empty string is ASCII too.
11711
[clinic start generated code]*/
11712
11713
static PyObject *
unicode_isascii_impl(PyObject *self)
/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
{
    /* No scan: the answer is whatever PyUnicode_IS_ASCII() reports. */
    long ascii_only = PyUnicode_IS_ASCII(self);
    return PyBool_FromLong(ascii_only);
}
11719
11720
/*[clinic input]
11721
@permit_long_docstring_body
11722
str.islower as unicode_islower
11723
11724
Return True if the string is a lowercase string, False otherwise.
11725
11726
A string is lowercase if all cased characters in the string are lowercase and
11727
there is at least one cased character in the string.
11728
[clinic start generated code]*/
11729
11730
static PyObject *
unicode_islower_impl(PyObject *self)
/*[clinic end generated code: output=dbd41995bd005b81 input=c6fc0295241a1aaa]*/
{
    /* True when no character is uppercase or titlecase and at least one
       character is lowercase; the empty string yields False. */
    Py_ssize_t n = PyUnicode_GET_LENGTH(self);
    int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (n == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (n == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
    }

    int seen_lower = 0;
    for (Py_ssize_t pos = 0; pos < n; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
            Py_RETURN_FALSE;
        }
        /* Skip the lookup once a lowercase character has been found. */
        if (!seen_lower && Py_UNICODE_ISLOWER(ch)) {
            seen_lower = 1;
        }
    }
    return PyBool_FromLong(seen_lower);
}
11763
11764
/*[clinic input]
11765
@permit_long_docstring_body
11766
str.isupper as unicode_isupper
11767
11768
Return True if the string is an uppercase string, False otherwise.
11769
11770
A string is uppercase if all cased characters in the string are uppercase and
11771
there is at least one cased character in the string.
11772
[clinic start generated code]*/
11773
11774
static PyObject *
unicode_isupper_impl(PyObject *self)
/*[clinic end generated code: output=049209c8e7f15f59 input=8d5cb33e67efde72]*/
{
    /* True when no character is lowercase or titlecase and at least one
       character is uppercase; the empty string yields False. */
    Py_ssize_t n = PyUnicode_GET_LENGTH(self);
    int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (n == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (n == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
    }

    int seen_upper = 0;
    for (Py_ssize_t pos = 0; pos < n; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) {
            Py_RETURN_FALSE;
        }
        /* Skip the lookup once an uppercase character has been found. */
        if (!seen_upper && Py_UNICODE_ISUPPER(ch)) {
            seen_upper = 1;
        }
    }
    return PyBool_FromLong(seen_upper);
}
11807
11808
/*[clinic input]
11809
str.istitle as unicode_istitle
11810
11811
Return True if the string is a title-cased string, False otherwise.
11812
11813
In a title-cased string, upper- and title-case characters may only
11814
follow uncased characters and lowercase characters only cased ones.
11815
[clinic start generated code]*/
11816
11817
static PyObject *
unicode_istitle_impl(PyObject *self)
/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
{
    /* Upper- and title-case characters may only follow an uncased
       character, lowercase characters only a cased one; at least one cased
       character is required.  The empty string yields False. */
    Py_ssize_t n = PyUnicode_GET_LENGTH(self);
    int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (n == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (n == 1) {
        Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong((Py_UNICODE_ISTITLE(only) != 0) ||
                               (Py_UNICODE_ISUPPER(only) != 0));
    }

    int seen_cased = 0;     /* at least one cased character so far */
    int after_cased = 0;    /* the previous character was cased */
    for (Py_ssize_t pos = 0; pos < n; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
            if (after_cased) {
                Py_RETURN_FALSE;
            }
        }
        else if (Py_UNICODE_ISLOWER(ch)) {
            if (!after_cased) {
                Py_RETURN_FALSE;
            }
        }
        else {
            /* Uncased character. */
            after_cased = 0;
            continue;
        }
        /* Reached for every accepted cased character. */
        after_cased = 1;
        seen_cased = 1;
    }
    return PyBool_FromLong(seen_cased);
}
11863
11864
/*[clinic input]
11865
@permit_long_docstring_body
11866
str.isspace as unicode_isspace
11867
11868
Return True if the string is a whitespace string, False otherwise.
11869
11870
A string is whitespace if all characters in the string are whitespace and there
11871
is at least one character in the string.
11872
[clinic start generated code]*/
11873
11874
static PyObject *
unicode_isspace_impl(PyObject *self)
/*[clinic end generated code: output=163a63bfa08ac2b9 input=44fe05e248c6e159]*/
{
    /* True when the string is non-empty and every character satisfies
       Py_UNICODE_ISSPACE(). */
    Py_ssize_t n = PyUnicode_GET_LENGTH(self);
    int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (n == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (n == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
    }

    Py_ssize_t pos = 0;
    while (pos < n) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISSPACE(ch)) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
11902
11903
/*[clinic input]
11904
@permit_long_docstring_body
11905
str.isalpha as unicode_isalpha
11906
11907
Return True if the string is an alphabetic string, False otherwise.
11908
11909
A string is alphabetic if all characters in the string are alphabetic and there
11910
is at least one character in the string.
11911
[clinic start generated code]*/
11912
11913
static PyObject *
unicode_isalpha_impl(PyObject *self)
/*[clinic end generated code: output=cc81b9ac3883ec4f input=c233000624a56e0d]*/
{
    /* True when the string is non-empty and every character satisfies
       Py_UNICODE_ISALPHA(). */
    Py_ssize_t n = PyUnicode_GET_LENGTH(self);
    int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (n == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (n == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
    }

    Py_ssize_t pos = 0;
    while (pos < n) {
        if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, pos))) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
11940
11941
/*[clinic input]
11942
@permit_long_docstring_body
11943
str.isalnum as unicode_isalnum
11944
11945
Return True if the string is an alpha-numeric string, False otherwise.
11946
11947
A string is alpha-numeric if all characters in the string are alpha-numeric and
11948
there is at least one character in the string.
11949
[clinic start generated code]*/
11950
11951
static PyObject *
unicode_isalnum_impl(PyObject *self)
/*[clinic end generated code: output=a5a23490ffc3660c input=5d63ba9c9bafdb6b]*/
{
    /* True when the string is non-empty and every character satisfies
       Py_UNICODE_ISALNUM(). */
    int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);
    Py_ssize_t n = PyUnicode_GET_LENGTH(self);

    /* Special case for empty strings */
    if (n == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (n == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISALNUM(only));
    }

    Py_ssize_t pos = 0;
    while (pos < n) {
        /* Read into a local first so the character is fetched once. */
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISALNUM(ch)) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
11980
11981
/*[clinic input]
11982
@permit_long_docstring_body
11983
str.isdecimal as unicode_isdecimal
11984
11985
Return True if the string is a decimal string, False otherwise.
11986
11987
A string is a decimal string if all characters in the string are decimal and
11988
there is at least one character in the string.
11989
[clinic start generated code]*/
11990
11991
static PyObject *
unicode_isdecimal_impl(PyObject *self)
/*[clinic end generated code: output=fb2dcdb62d3fc548 input=8e84a58b414935a3]*/
{
    /* True when the string is non-empty and every character satisfies
       Py_UNICODE_ISDECIMAL(). */
    Py_ssize_t n = PyUnicode_GET_LENGTH(self);
    int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (n == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (n == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
    }

    Py_ssize_t pos = 0;
    while (pos < n) {
        if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, pos))) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
12018
12019
/*[clinic input]
12020
@permit_long_docstring_body
12021
str.isdigit as unicode_isdigit
12022
12023
Return True if the string is a digit string, False otherwise.
12024
12025
A string is a digit string if all characters in the string are digits and there
12026
is at least one character in the string.
12027
[clinic start generated code]*/
12028
12029
static PyObject *
unicode_isdigit_impl(PyObject *self)
/*[clinic end generated code: output=10a6985311da6858 input=99e284affb54d4a0]*/
{
    /* True when the string is non-empty and every character satisfies
       Py_UNICODE_ISDIGIT(). */
    Py_ssize_t n = PyUnicode_GET_LENGTH(self);
    int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (n == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (n == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISDIGIT(only));
    }

    Py_ssize_t pos = 0;
    while (pos < n) {
        if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, pos))) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
12057
12058
/*[clinic input]
12059
@permit_long_docstring_body
12060
str.isnumeric as unicode_isnumeric
12061
12062
Return True if the string is a numeric string, False otherwise.
12063
12064
A string is numeric if all characters in the string are numeric and there is at
12065
least one character in the string.
12066
[clinic start generated code]*/
12067
12068
static PyObject *
unicode_isnumeric_impl(PyObject *self)
/*[clinic end generated code: output=9172a32d9013051a input=e9f5b6b8b29b0ee6]*/
{
    /* True only for a non-empty string in which every character
       satisfies Py_UNICODE_ISNUMERIC. */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    switch (nchars) {
    case 0:
        /* The empty string is not numeric. */
        Py_RETURN_FALSE;
    case 1:
        /* One character: answer directly from its classification. */
        return PyBool_FromLong(
            Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
    default:
        break;
    }

    /* Stop at the first character that is not numeric. */
    for (Py_ssize_t pos = 0; pos < nchars; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISNUMERIC(ch)) {
            Py_RETURN_FALSE;
        }
    }
    Py_RETURN_TRUE;
}
12095
12096
Py_ssize_t
12097
_PyUnicode_ScanIdentifier(PyObject *self)
12098
51.3k
{
12099
51.3k
    Py_ssize_t i;
12100
51.3k
    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12101
51.3k
    if (len == 0) {
12102
        /* an empty string is not a valid identifier */
12103
0
        return 0;
12104
0
    }
12105
12106
51.3k
    int kind = PyUnicode_KIND(self);
12107
51.3k
    const void *data = PyUnicode_DATA(self);
12108
51.3k
    Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12109
    /* PEP 3131 says that the first character must be in
12110
       XID_Start and subsequent characters in XID_Continue,
12111
       and for the ASCII range, the 2.x rules apply (i.e
12112
       start with letters and underscore, continue with
12113
       letters, digits, underscore). However, given the current
12114
       definition of XID_Start and XID_Continue, it is sufficient
12115
       to check just for these, except that _ must be allowed
12116
       as starting an identifier.  */
12117
51.3k
    if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12118
297
        return 0;
12119
297
    }
12120
12121
10.1M
    for (i = 1; i < len; i++) {
12122
10.0M
        ch = PyUnicode_READ(kind, data, i);
12123
10.0M
        if (!_PyUnicode_IsXidContinue(ch)) {
12124
123
            return i;
12125
123
        }
12126
10.0M
    }
12127
50.9k
    return i;
12128
51.0k
}
12129
12130
int
12131
PyUnicode_IsIdentifier(PyObject *self)
12132
1.48k
{
12133
1.48k
    Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
12134
1.48k
    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12135
    /* an empty string is not a valid identifier */
12136
1.48k
    return len && i == len;
12137
1.48k
}
12138
12139
/*[clinic input]
12140
@permit_long_docstring_body
12141
str.isidentifier as unicode_isidentifier
12142
12143
Return True if the string is a valid Python identifier, False otherwise.
12144
12145
Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
12146
such as "def" or "class".
12147
[clinic start generated code]*/
12148
12149
static PyObject *
unicode_isidentifier_impl(PyObject *self)
/*[clinic end generated code: output=fe585a9666572905 input=86315dd889d7bd04]*/
{
    /* Convert the C truth value from PyUnicode_IsIdentifier() to a bool. */
    const int is_ident = PyUnicode_IsIdentifier(self);
    return PyBool_FromLong(is_ident);
}
12155
12156
/*[clinic input]
12157
@permit_long_summary
12158
str.isprintable as unicode_isprintable
12159
12160
Return True if all characters in the string are printable, False otherwise.
12161
12162
A character is printable if repr() may use it in its output.
12163
[clinic start generated code]*/
12164
12165
static PyObject *
unicode_isprintable_impl(PyObject *self)
/*[clinic end generated code: output=3ab9626cd32dd1a0 input=18345ba847084ec5]*/
{
    /* True when every character satisfies Py_UNICODE_ISPRINTABLE.  There
       is no empty-string special case: with no characters to reject, the
       scan below falls through to True. */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Shortcut for single character strings */
    if (nchars == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISPRINTABLE(only));
    }

    Py_ssize_t pos = 0;
    while (pos < nchars) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISPRINTABLE(ch)) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
12189
12190
/*[clinic input]
12191
@permit_long_docstring_body
12192
str.join as unicode_join
12193
12194
    iterable: object
12195
    /
12196
12197
Concatenate any number of strings.
12198
12199
The string whose method is called is inserted in between each given string.
12200
The result is returned as a new string.
12201
12202
Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12203
[clinic start generated code]*/
12204
12205
static PyObject *
unicode_join(PyObject *self, PyObject *iterable)
/*[clinic end generated code: output=6857e7cecfe7bf98 input=bac724ed412ef3f8]*/
{
    /* Method entry point: both arguments are forwarded unchanged to
       PyUnicode_Join(), whose result (or NULL) is returned as is. */
    PyObject *joined = PyUnicode_Join(self, iterable);
    return joined;
}
12211
12212
static Py_ssize_t
12213
unicode_length(PyObject *self)
12214
4.40M
{
12215
4.40M
    return PyUnicode_GET_LENGTH(self);
12216
4.40M
}
12217
12218
/*[clinic input]
12219
str.ljust as unicode_ljust
12220
12221
    width: Py_ssize_t
12222
    fillchar: Py_UCS4 = ' '
12223
    /
12224
12225
Return a left-justified string of length width.
12226
12227
Padding is done using the specified fill character (default is a space).
12228
[clinic start generated code]*/
12229
12230
static PyObject *
12231
unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12232
/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
12233
0
{
12234
0
    if (PyUnicode_GET_LENGTH(self) >= width)
12235
0
        return unicode_result_unchanged(self);
12236
12237
0
    return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
12238
0
}
12239
12240
/*[clinic input]
12241
str.lower as unicode_lower
12242
12243
Return a copy of the string converted to lowercase.
12244
[clinic start generated code]*/
12245
12246
static PyObject *
unicode_lower_impl(PyObject *self)
/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
{
    /* Non-ASCII strings go through case_operation() with do_lower;
       ASCII strings use ascii_upper_or_lower() with a flag of 1. */
    if (!PyUnicode_IS_ASCII(self)) {
        return case_operation(self, do_lower);
    }
    return ascii_upper_or_lower(self, 1);
}
12254
12255
32.1k
/* Strip modes.  The values double as indexes into stripfuncnames[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names indexed by the strip modes above.  Both the strings and the
   table slots are const so the table cannot be modified at run time. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Name of the strip method for strip mode `i`; `i` must be one of the
   three modes above. */
#define STRIPNAME(i) (stripfuncnames[i])
12263
12264
/* externally visible for str.strip(unicode) */
12265
/* Strip characters that occur in `sepobj` from the left and/or right end
   of `self`, depending on `striptype` (LEFTSTRIP, RIGHTSTRIP or
   BOTHSTRIP), and return the remaining range via PyUnicode_Substring(). */
PyObject *
_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
{
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);
    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
    const Py_ssize_t seplen = PyUnicode_GET_LENGTH(sepobj);

    /* The mask is tested with BLOOM() first; PyUnicode_FindChar() is only
       called for characters that pass that test. */
    const BLOOM_MASK sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
                                               PyUnicode_DATA(sepobj),
                                               seplen);

    /* Kept range is [lo, hi). */
    Py_ssize_t lo = 0;
    Py_ssize_t hi = len;

    if (striptype != RIGHTSTRIP) {
        /* Move lo forward past leading characters found in sepobj. */
        for (; lo < len; lo++) {
            const Py_UCS4 ch = PyUnicode_READ(kind, data, lo);
            if (!BLOOM(sepmask, ch)
                || PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
            {
                break;
            }
        }
    }

    if (striptype != LEFTSTRIP) {
        /* Move hi backward past trailing characters found in sepobj,
           never crossing lo. */
        while (hi > lo) {
            const Py_UCS4 ch = PyUnicode_READ(kind, data, hi - 1);
            if (!BLOOM(sepmask, ch)
                || PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
            {
                break;
            }
            hi--;
        }
    }

    return PyUnicode_Substring(self, lo, hi);
}
12311
12312
/* Slice an exact str `container` with the start/stop objects given.
   The objects are converted to indexes by _PyEval_UnpackIndices(); when
   that reports failure NULL is returned, otherwise the result of
   PyUnicode_Substring() for the resulting range. */
PyObject*
_PyUnicode_BinarySlice(PyObject *container, PyObject *start_o, PyObject *stop_o)
{
    assert(PyUnicode_CheckExact(container));

    Py_ssize_t istart, istop;
    const Py_ssize_t len = PyUnicode_GET_LENGTH(container);

    if (_PyEval_UnpackIndices(start_o, stop_o, len, &istart, &istop)) {
        return PyUnicode_Substring(container, istart, istop);
    }
    return NULL;
}
12323
12324
PyObject*
12325
PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12326
1.15M
{
12327
1.15M
    const unsigned char *data;
12328
1.15M
    int kind;
12329
1.15M
    Py_ssize_t length;
12330
12331
1.15M
    length = PyUnicode_GET_LENGTH(self);
12332
1.15M
    end = Py_MIN(end, length);
12333
12334
1.15M
    if (start == 0 && end == length)
12335
41.9k
        return unicode_result_unchanged(self);
12336
12337
1.11M
    if (start < 0 || end < 0) {
12338
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
12339
0
        return NULL;
12340
0
    }
12341
1.11M
    if (start >= length || end < start)
12342
3.15k
        _Py_RETURN_UNICODE_EMPTY();
12343
12344
1.11M
    length = end - start;
12345
1.11M
    if (PyUnicode_IS_ASCII(self)) {
12346
552k
        data = PyUnicode_1BYTE_DATA(self);
12347
552k
        return _PyUnicode_FromASCII((const char*)(data + start), length);
12348
552k
    }
12349
558k
    else {
12350
558k
        kind = PyUnicode_KIND(self);
12351
558k
        data = PyUnicode_1BYTE_DATA(self);
12352
558k
        return PyUnicode_FromKindAndData(kind,
12353
558k
                                         data + kind * start,
12354
558k
                                         length);
12355
558k
    }
12356
1.11M
}
12357
12358
/* Strip whitespace from the left and/or right end of `self`, depending on
   `striptype` (LEFTSTRIP, RIGHTSTRIP or BOTHSTRIP).  ASCII strings are
   tested against the _Py_ascii_whitespace table; other strings use
   Py_UNICODE_ISSPACE.  The kept range is returned through
   PyUnicode_Substring(). */
static PyObject *
do_strip(PyObject *self, int striptype)
{
    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);

    /* Kept range is [lo, hi). */
    Py_ssize_t lo = 0;
    Py_ssize_t hi = len;

    if (PyUnicode_IS_ASCII(self)) {
        const Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(self);

        if (striptype != RIGHTSTRIP) {
            while (lo < len && _Py_ascii_whitespace[bytes[lo]]) {
                lo++;
            }
        }
        if (striptype != LEFTSTRIP) {
            /* Never move hi below lo. */
            while (hi > lo && _Py_ascii_whitespace[bytes[hi - 1]]) {
                hi--;
            }
        }
    }
    else {
        const int kind = PyUnicode_KIND(self);
        const void *data = PyUnicode_DATA(self);

        if (striptype != RIGHTSTRIP) {
            while (lo < len
                   && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, lo)))
            {
                lo++;
            }
        }
        if (striptype != LEFTSTRIP) {
            /* Never move hi below lo. */
            while (hi > lo
                   && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, hi - 1)))
            {
                hi--;
            }
        }
    }

    return PyUnicode_Substring(self, lo, hi);
}
12419
12420
12421
/* Dispatch a strip call on its optional argument: None strips whitespace
   via do_strip(), a str strips its characters via _PyUnicode_XStrip(),
   and anything else raises TypeError and returns NULL. */
static PyObject *
do_argstrip(PyObject *self, int striptype, PyObject *sep)
{
    if (sep == Py_None) {
        return do_strip(self, striptype);
    }

    if (!PyUnicode_Check(sep)) {
        PyErr_Format(PyExc_TypeError,
                     "%s arg must be None or str",
                     STRIPNAME(striptype));
        return NULL;
    }

    return _PyUnicode_XStrip(self, striptype, sep);
}
12437
12438
12439
/*[clinic input]
12440
@permit_long_summary
12441
str.strip as unicode_strip
12442
12443
    chars: object = None
12444
    /
12445
12446
Return a copy of the string with leading and trailing whitespace removed.
12447
12448
If chars is given and not None, remove characters in chars instead.
12449
[clinic start generated code]*/
12450
12451
static PyObject *
unicode_strip_impl(PyObject *self, PyObject *chars)
/*[clinic end generated code: output=ca19018454345d57 input=8bc6353450345fbd]*/
{
    /* Both ends are stripped; do_argstrip() handles the `chars` argument. */
    PyObject *stripped = do_argstrip(self, BOTHSTRIP, chars);
    return stripped;
}
12457
12458
12459
/*[clinic input]
12460
str.lstrip as unicode_lstrip
12461
12462
    chars: object = None
12463
    /
12464
12465
Return a copy of the string with leading whitespace removed.
12466
12467
If chars is given and not None, remove characters in chars instead.
12468
[clinic start generated code]*/
12469
12470
static PyObject *
unicode_lstrip_impl(PyObject *self, PyObject *chars)
/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
{
    /* Only the left end is stripped; do_argstrip() handles `chars`. */
    PyObject *stripped = do_argstrip(self, LEFTSTRIP, chars);
    return stripped;
}
12476
12477
12478
/*[clinic input]
12479
str.rstrip as unicode_rstrip
12480
12481
    chars: object = None
12482
    /
12483
12484
Return a copy of the string with trailing whitespace removed.
12485
12486
If chars is given and not None, remove characters in chars instead.
12487
[clinic start generated code]*/
12488
12489
static PyObject *
unicode_rstrip_impl(PyObject *self, PyObject *chars)
/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
{
    /* Only the right end is stripped; do_argstrip() handles `chars`. */
    PyObject *stripped = do_argstrip(self, RIGHTSTRIP, chars);
    return stripped;
}
12495
12496
12497
static PyObject*
12498
unicode_repeat(PyObject *str, Py_ssize_t len)
12499
971
{
12500
971
    PyObject *u;
12501
971
    Py_ssize_t nchars, n;
12502
12503
971
    if (len < 1)
12504
27
        _Py_RETURN_UNICODE_EMPTY();
12505
12506
    /* no repeat, return original string */
12507
944
    if (len == 1)
12508
194
        return unicode_result_unchanged(str);
12509
12510
750
    if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
12511
0
        PyErr_SetString(PyExc_OverflowError,
12512
0
                        "repeated string is too long");
12513
0
        return NULL;
12514
0
    }
12515
750
    nchars = len * PyUnicode_GET_LENGTH(str);
12516
12517
750
    u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
12518
750
    if (!u)
12519
0
        return NULL;
12520
750
    assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
12521
12522
750
    if (PyUnicode_GET_LENGTH(str) == 1) {
12523
312
        int kind = PyUnicode_KIND(str);
12524
312
        Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
12525
312
        if (kind == PyUnicode_1BYTE_KIND) {
12526
306
            void *to = PyUnicode_DATA(u);
12527
306
            memset(to, (unsigned char)fill_char, len);
12528
306
        }
12529
6
        else if (kind == PyUnicode_2BYTE_KIND) {
12530
2
            Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
12531
16
            for (n = 0; n < len; ++n)
12532
14
                ucs2[n] = fill_char;
12533
4
        } else {
12534
4
            Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12535
4
            assert(kind == PyUnicode_4BYTE_KIND);
12536
571
            for (n = 0; n < len; ++n)
12537
567
                ucs4[n] = fill_char;
12538
4
        }
12539
312
    }
12540
438
    else {
12541
438
        Py_ssize_t char_size = PyUnicode_KIND(str);
12542
438
        char *to = (char *) PyUnicode_DATA(u);
12543
438
        _PyBytes_Repeat(to, nchars * char_size, PyUnicode_DATA(str),
12544
438
            PyUnicode_GET_LENGTH(str) * char_size);
12545
438
    }
12546
12547
750
    assert(_PyUnicode_CheckConsistency(u, 1));
12548
750
    return u;
12549
750
}
12550
12551
PyObject *
12552
PyUnicode_Replace(PyObject *str,
12553
                  PyObject *substr,
12554
                  PyObject *replstr,
12555
                  Py_ssize_t maxcount)
12556
4.00k
{
12557
4.00k
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12558
4.00k
            ensure_unicode(replstr) < 0)
12559
0
        return NULL;
12560
4.00k
    return replace(str, substr, replstr, maxcount);
12561
4.00k
}
12562
12563
/*[clinic input]
12564
str.replace as unicode_replace
12565
12566
    old: unicode
12567
    new: unicode
12568
    /
12569
    count: Py_ssize_t = -1
12570
        Maximum number of occurrences to replace.
12571
        -1 (the default value) means replace all occurrences.
12572
12573
Return a copy with all occurrences of substring old replaced by new.
12574
12575
If count is given, only the first count occurrences are replaced.
12576
If count is not specified or -1, then all occurrences are replaced.
12577
[clinic start generated code]*/
12578
12579
static PyObject *
12580
unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12581
                     Py_ssize_t count)
12582
/*[clinic end generated code: output=b63f1a8b5eebf448 input=d15a6886b05e2edc]*/
12583
21.8k
{
12584
21.8k
    return replace(self, old, new, count);
12585
21.8k
}
12586
12587
/*[clinic input]
12588
@permit_long_docstring_body
12589
str.removeprefix as unicode_removeprefix
12590
12591
    prefix: unicode
12592
    /
12593
12594
Return a str with the given prefix string removed if present.
12595
12596
If the string starts with the prefix string, return string[len(prefix):].
12597
Otherwise, return a copy of the original string.
12598
[clinic start generated code]*/
12599
12600
static PyObject *
unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
/*[clinic end generated code: output=f1e5945e9763bcb9 input=1989a856dbb813f1]*/
{
    /* tailmatch() is called with direction -1; a result of -1 is treated
       as an error and propagated as NULL. */
    const int match = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);
    if (match == -1) {
        return NULL;
    }

    /* No match: hand back the string via unicode_result_unchanged(). */
    if (!match) {
        return unicode_result_unchanged(self);
    }

    /* Match: keep everything after the first len(prefix) characters. */
    return PyUnicode_Substring(self, PyUnicode_GET_LENGTH(prefix),
                               PyUnicode_GET_LENGTH(self));
}
12614
12615
/*[clinic input]
12616
str.removesuffix as unicode_removesuffix
12617
12618
    suffix: unicode
12619
    /
12620
12621
Return a str with the given suffix string removed if present.
12622
12623
If the string ends with the suffix string and that suffix is not empty,
12624
return string[:-len(suffix)]. Otherwise, return a copy of the original
12625
string.
12626
[clinic start generated code]*/
12627
12628
static PyObject *
unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
/*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
{
    /* tailmatch() is called with direction +1; a result of -1 is treated
       as an error and propagated as NULL. */
    const int match = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);
    if (match == -1) {
        return NULL;
    }

    /* No match: hand back the string via unicode_result_unchanged(). */
    if (!match) {
        return unicode_result_unchanged(self);
    }

    /* Match: keep everything before the last len(suffix) characters. */
    const Py_ssize_t keep = PyUnicode_GET_LENGTH(self)
                            - PyUnicode_GET_LENGTH(suffix);
    return PyUnicode_Substring(self, 0, keep);
}
12642
12643
/* Build the quoted representation of the string `unicode`.

   A first pass over the characters computes the size of the output, counts
   single and double quotes, and tracks the largest printable non-ASCII
   character.  The result is allocated with that size and either filled by
   a straight copy between two quote characters (when nothing needs
   escaping) or by the kind-specific ucs1lib_repr / ucs2lib_repr /
   ucs4lib_repr helper.

   Returns a new string, or NULL with an exception set: OverflowError when
   the output size would exceed PY_SSIZE_T_MAX, or whatever PyUnicode_New()
   raised on failure. */
static PyObject *
unicode_repr(PyObject *unicode)
{
    Py_ssize_t isize = PyUnicode_GET_LENGTH(unicode);
    const void *idata = PyUnicode_DATA(unicode);

    /* Compute length of output, quote characters, and
       maximum character */
    Py_ssize_t osize = 0;
    Py_UCS4 maxch = 127;
    Py_ssize_t squote = 0;
    Py_ssize_t dquote = 0;
    int ikind = PyUnicode_KIND(unicode);
    for (Py_ssize_t i = 0; i < isize; i++) {
        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
        Py_ssize_t incr = 1;
        switch (ch) {
        case '\'': squote++; break;
        case '"':  dquote++; break;
        case '\\': case '\t': case '\r': case '\n':
            incr = 2;
            break;
        default:
            /* Fast-path ASCII */
            if (ch < ' ' || ch == 0x7f)
                incr = 4; /* \xHH */
            else if (ch < 0x7f)
                ;
            else if (Py_UNICODE_ISPRINTABLE(ch))
                maxch = (ch > maxch) ? ch : maxch;
            else if (ch < 0x100)
                incr = 4; /* \xHH */
            else if (ch < 0x10000)
                incr = 6; /* \uHHHH */
            else
                incr = 10; /* \UHHHHHHHH */
        }
        if (osize > PY_SSIZE_T_MAX - incr) {
            PyErr_SetString(PyExc_OverflowError,
                            "string is too long to generate repr");
            return NULL;
        }
        osize += incr;
    }

    Py_UCS4 quote = '\'';
    int changed = (osize != isize);
    /* Number of single quotes that will need an escaping backslash. */
    Py_ssize_t nescaped = 0;
    if (squote) {
        changed = 1;
        if (dquote)
            /* Both squote and dquote present. Use squote,
               and escape them */
            nescaped = squote;
        else
            quote = '"';
    }

    /* Account for the escaped quotes and the two enclosing quotes.  The
       right-hand side cannot overflow because 0 <= osize <= PY_SSIZE_T_MAX,
       so this check keeps the signed addition below well-defined. */
    if (nescaped > PY_SSIZE_T_MAX - 2 - osize) {
        PyErr_SetString(PyExc_OverflowError,
                        "string is too long to generate repr");
        return NULL;
    }
    osize += nescaped + 2;

    PyObject *repr = PyUnicode_New(osize, maxch);
    if (repr == NULL)
        return NULL;
    int okind = PyUnicode_KIND(repr);
    void *odata = PyUnicode_DATA(repr);

    if (!changed) {
        /* Nothing to escape: quote + verbatim copy + quote. */
        PyUnicode_WRITE(okind, odata, 0, quote);

        _PyUnicode_FastCopyCharacters(repr, 1,
                                      unicode, 0,
                                      isize);

        PyUnicode_WRITE(okind, odata, osize-1, quote);
    }
    else {
        /* Dispatch on the output kind to the matching helper. */
        switch (okind) {
        case PyUnicode_1BYTE_KIND:
            ucs1lib_repr(unicode, quote, odata);
            break;
        case PyUnicode_2BYTE_KIND:
            ucs2lib_repr(unicode, quote, odata);
            break;
        default:
            assert(okind == PyUnicode_4BYTE_KIND);
            ucs4lib_repr(unicode, quote, odata);
        }
    }

    assert(_PyUnicode_CheckConsistency(repr, 1));
    return repr;
}
12733
12734
/*[clinic input]
12735
@permit_long_summary
12736
str.rfind as unicode_rfind = str.count
12737
12738
Return the highest index in S where substring sub is found, such that sub is contained within S[start:end].
12739
12740
Optional arguments start and end are interpreted as in slice notation.
12741
Return -1 on failure.
12742
[clinic start generated code]*/
12743
12744
static Py_ssize_t
12745
unicode_rfind_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
12746
                   Py_ssize_t end)
12747
/*[clinic end generated code: output=880b29f01dd014c8 input=7f7e97d5cd3299a2]*/
12748
7.39k
{
12749
7.39k
    Py_ssize_t result = any_find_slice(str, substr, start, end, -1);
12750
7.39k
    if (result < 0) {
12751
5.89k
        return -1;
12752
5.89k
    }
12753
1.50k
    return result;
12754
7.39k
}
12755
12756
/*[clinic input]
12757
@permit_long_summary
12758
str.rindex as unicode_rindex = str.count
12759
12760
Return the highest index in S where substring sub is found, such that sub is contained within S[start:end].
12761
12762
Optional arguments start and end are interpreted as in slice notation.
12763
Raises ValueError when the substring is not found.
12764
[clinic start generated code]*/
12765
12766
static Py_ssize_t
12767
unicode_rindex_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
12768
                    Py_ssize_t end)
12769
/*[clinic end generated code: output=5f3aef124c867fe1 input=0363a324740b3e62]*/
12770
0
{
12771
0
    Py_ssize_t result = any_find_slice(str, substr, start, end, -1);
12772
0
    if (result == -1) {
12773
0
        PyErr_SetString(PyExc_ValueError, "substring not found");
12774
0
    }
12775
0
    else if (result < 0) {
12776
0
        return -1;
12777
0
    }
12778
0
    return result;
12779
0
}
12780
12781
/*[clinic input]
12782
str.rjust as unicode_rjust
12783
12784
    width: Py_ssize_t
12785
    fillchar: Py_UCS4 = ' '
12786
    /
12787
12788
Return a right-justified string of length width.
12789
12790
Padding is done using the specified fill character (default is a space).
12791
[clinic start generated code]*/
12792
12793
static PyObject *
12794
unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12795
/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
12796
0
{
12797
0
    if (PyUnicode_GET_LENGTH(self) >= width)
12798
0
        return unicode_result_unchanged(self);
12799
12800
0
    return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
12801
0
}
12802
12803
PyObject *
12804
PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12805
0
{
12806
0
    if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
12807
0
        return NULL;
12808
12809
0
    return split(s, sep, maxsplit);
12810
0
}
12811
12812
/*[clinic input]
12813
@permit_long_summary
12814
str.split as unicode_split
12815
12816
    sep: object = None
12817
        The separator used to split the string.
12818
12819
        When set to None (the default value), will split on any whitespace
12820
        character (including \n \r \t \f and spaces) and will discard
12821
        empty strings from the result.
12822
    maxsplit: Py_ssize_t = -1
12823
        Maximum number of splits.
12824
        -1 (the default value) means no limit.
12825
12826
Return a list of the substrings in the string, using sep as the separator string.
12827
12828
Splitting starts at the front of the string and works to the end.
12829
12830
Note, str.split() is mainly useful for data that has been intentionally
12831
delimited.  With natural text that includes punctuation, consider using
12832
the regular expression module.
12833
12834
[clinic start generated code]*/
12835
12836
static PyObject *
12837
unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12838
/*[clinic end generated code: output=3a65b1db356948dc input=2c1fd08a78e038b8]*/
12839
1.79k
{
12840
1.79k
    if (sep == Py_None)
12841
7
        return split(self, NULL, maxsplit);
12842
1.79k
    if (PyUnicode_Check(sep))
12843
1.79k
        return split(self, sep, maxsplit);
12844
12845
0
    PyErr_Format(PyExc_TypeError,
12846
0
                 "must be str or None, not %.100s",
12847
0
                 Py_TYPE(sep)->tp_name);
12848
0
    return NULL;
12849
1.79k
}
12850
12851
/* Partition `str_obj` around `sep_obj` using the kind-specific
   *_partition helper and return its result.

   Both arguments are checked with ensure_unicode() first.  When the
   separator has a larger kind than the string, or is longer than it, the
   tuple (str_obj, empty, empty) is returned without searching.  Returns
   NULL when a check or the separator conversion fails. */
PyObject *
PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
{
    if (ensure_unicode(str_obj) < 0) {
        return NULL;
    }
    if (ensure_unicode(sep_obj) < 0) {
        return NULL;
    }

    const int kind1 = PyUnicode_KIND(str_obj);
    const int kind2 = PyUnicode_KIND(sep_obj);
    const Py_ssize_t len1 = PyUnicode_GET_LENGTH(str_obj);
    const Py_ssize_t len2 = PyUnicode_GET_LENGTH(sep_obj);

    if (kind1 < kind2 || len1 < len2) {
        PyObject *empty = _PyUnicode_GetEmpty();  // Borrowed reference
        return PyTuple_Pack(3, str_obj, empty, empty);
    }

    const void *buf1 = PyUnicode_DATA(str_obj);
    const void *buf2 = PyUnicode_DATA(sep_obj);

    /* When the kinds differ, the separator data is replaced by a buffer
       obtained from unicode_askind() for kind1; that buffer is released
       with PyMem_Free() before returning. */
    const int converted = (kind2 != kind1);
    if (converted) {
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
        if (buf2 == NULL) {
            return NULL;
        }
    }

    PyObject *out;
    switch (kind1) {
    case PyUnicode_1BYTE_KIND:
        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) {
            out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
        }
        else {
            out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
        }
        break;
    case PyUnicode_2BYTE_KIND:
        out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
        break;
    case PyUnicode_4BYTE_KIND:
        out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
        break;
    default:
        Py_UNREACHABLE();
    }

    assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
    if (converted) {
        PyMem_Free((void *)buf2);
    }

    return out;
}
12901
12902
12903
/* Partition `str_obj` around `sep_obj` using the kind-specific
   *_rpartition helper and return its result.

   Both arguments are checked with ensure_unicode() first.  When the
   separator has a larger kind than the string, or is longer than it, the
   tuple (empty, empty, str_obj) is returned without searching.  Returns
   NULL when a check or the separator conversion fails. */
PyObject *
PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
{
    if (ensure_unicode(str_obj) < 0) {
        return NULL;
    }
    if (ensure_unicode(sep_obj) < 0) {
        return NULL;
    }

    const int kind1 = PyUnicode_KIND(str_obj);
    const int kind2 = PyUnicode_KIND(sep_obj);
    const Py_ssize_t len1 = PyUnicode_GET_LENGTH(str_obj);
    const Py_ssize_t len2 = PyUnicode_GET_LENGTH(sep_obj);

    if (kind1 < kind2 || len1 < len2) {
        PyObject *empty = _PyUnicode_GetEmpty();  // Borrowed reference
        return PyTuple_Pack(3, empty, empty, str_obj);
    }

    const void *buf1 = PyUnicode_DATA(str_obj);
    const void *buf2 = PyUnicode_DATA(sep_obj);

    /* When the kinds differ, the separator data is replaced by a buffer
       obtained from unicode_askind() for kind1; that buffer is released
       with PyMem_Free() before returning. */
    const int converted = (kind2 != kind1);
    if (converted) {
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
        if (buf2 == NULL) {
            return NULL;
        }
    }

    PyObject *out;
    switch (kind1) {
    case PyUnicode_1BYTE_KIND:
        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) {
            out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
        }
        else {
            out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
        }
        break;
    case PyUnicode_2BYTE_KIND:
        out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
        break;
    case PyUnicode_4BYTE_KIND:
        out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
        break;
    default:
        Py_UNREACHABLE();
    }

    assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
    if (converted) {
        PyMem_Free((void *)buf2);
    }

    return out;
}
12953
12954
/*[clinic input]
12955
@permit_long_docstring_body
12956
str.partition as unicode_partition
12957
12958
    sep: object
12959
    /
12960
12961
Partition the string into three parts using the given separator.
12962
12963
This will search for the separator in the string.  If the separator is found,
12964
returns a 3-tuple containing the part before the separator, the separator
12965
itself, and the part after it.
12966
12967
If the separator is not found, returns a 3-tuple containing the original string
12968
and two empty strings.
12969
[clinic start generated code]*/
12970
12971
static PyObject *
unicode_partition(PyObject *self, PyObject *sep)
/*[clinic end generated code: output=e4ced7bd253ca3c4 input=4d854b520d7b0e97]*/
{
    /* str.partition(sep): all of the work is delegated to
       PyUnicode_Partition(); its return value is passed back unchanged. */
    PyObject *parts = PyUnicode_Partition(self, sep);
    return parts;
}
12977
12978
/*[clinic input]
12979
@permit_long_docstring_body
12980
str.rpartition as unicode_rpartition = str.partition
12981
12982
Partition the string into three parts using the given separator.
12983
12984
This will search for the separator in the string, starting at the end. If
12985
the separator is found, returns a 3-tuple containing the part before the
12986
separator, the separator itself, and the part after it.
12987
12988
If the separator is not found, returns a 3-tuple containing two empty strings
12989
and the original string.
12990
[clinic start generated code]*/
12991
12992
static PyObject *
unicode_rpartition(PyObject *self, PyObject *sep)
/*[clinic end generated code: output=1aa13cf1156572aa input=a6adabe91e75b486]*/
{
    /* str.rpartition(sep): all of the work is delegated to
       PyUnicode_RPartition(); its return value is passed back unchanged. */
    PyObject *parts = PyUnicode_RPartition(self, sep);
    return parts;
}
12998
12999
PyObject *
13000
PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
13001
0
{
13002
0
    if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
13003
0
        return NULL;
13004
13005
0
    return rsplit(s, sep, maxsplit);
13006
0
}
13007
13008
/*[clinic input]
13009
@permit_long_summary
13010
str.rsplit as unicode_rsplit = str.split
13011
13012
Return a list of the substrings in the string, using sep as the separator string.
13013
13014
Splitting starts at the end of the string and works to the front.
13015
[clinic start generated code]*/
13016
13017
static PyObject *
13018
unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13019
/*[clinic end generated code: output=c2b815c63bcabffc input=0f762e30d267fa83]*/
13020
0
{
13021
0
    if (sep == Py_None)
13022
0
        return rsplit(self, NULL, maxsplit);
13023
0
    if (PyUnicode_Check(sep))
13024
0
        return rsplit(self, sep, maxsplit);
13025
13026
0
    PyErr_Format(PyExc_TypeError,
13027
0
                 "must be str or None, not %.100s",
13028
0
                 Py_TYPE(sep)->tp_name);
13029
0
    return NULL;
13030
0
}
13031
13032
/*[clinic input]
13033
@permit_long_docstring_body
13034
str.splitlines as unicode_splitlines
13035
13036
    keepends: bool = False
13037
13038
Return a list of the lines in the string, breaking at line boundaries.
13039
13040
Line breaks are not included in the resulting list unless keepends is given and
13041
true.
13042
[clinic start generated code]*/
13043
13044
static PyObject *
unicode_splitlines_impl(PyObject *self, int keepends)
/*[clinic end generated code: output=f664dcdad153ec40 input=39eeafbfef61c827]*/
{
    /* str.splitlines(): delegated entirely to PyUnicode_Splitlines(),
       whose return value is passed back unchanged. */
    PyObject *lines = PyUnicode_Splitlines(self, keepends);
    return lines;
}
13050
13051
static
13052
PyObject *unicode_str(PyObject *self)
13053
0
{
13054
0
    return unicode_result_unchanged(self);
13055
0
}
13056
13057
/*[clinic input]
13058
@permit_long_summary
13059
str.swapcase as unicode_swapcase
13060
13061
Convert uppercase characters to lowercase and lowercase characters to uppercase.
13062
[clinic start generated code]*/
13063
13064
static PyObject *
unicode_swapcase_impl(PyObject *self)
/*[clinic end generated code: output=5d28966bf6d7b2af input=85bc39a9b4e8ee91]*/
{
    /* str.swapcase(): run the generic case_operation() driver with the
       do_swapcase callback and pass its result back unchanged. */
    PyObject *swapped = case_operation(self, do_swapcase);
    return swapped;
}
13070
13071
static int
13072
unicode_maketrans_from_dict(PyObject *x, PyObject *newdict)
13073
0
{
13074
0
    PyObject *key, *value;
13075
0
    Py_ssize_t i = 0;
13076
0
    int res;
13077
0
    while (PyDict_Next(x, &i, &key, &value)) {
13078
0
        if (PyUnicode_Check(key)) {
13079
0
            PyObject *newkey;
13080
0
            int kind;
13081
0
            const void *data;
13082
0
            if (PyUnicode_GET_LENGTH(key) != 1) {
13083
0
                PyErr_SetString(PyExc_ValueError, "string keys in translate"
13084
0
                                "table must be of length 1");
13085
0
                return -1;
13086
0
            }
13087
0
            kind = PyUnicode_KIND(key);
13088
0
            data = PyUnicode_DATA(key);
13089
0
            newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
13090
0
            if (!newkey)
13091
0
                return -1;
13092
0
            res = PyDict_SetItem(newdict, newkey, value);
13093
0
            Py_DECREF(newkey);
13094
0
            if (res < 0)
13095
0
                return -1;
13096
0
        }
13097
0
        else if (PyLong_Check(key)) {
13098
0
            if (PyDict_SetItem(newdict, key, value) < 0)
13099
0
                return -1;
13100
0
        }
13101
0
        else {
13102
0
            PyErr_SetString(PyExc_TypeError, "keys in translate table must"
13103
0
                            "be strings or integers");
13104
0
            return -1;
13105
0
        }
13106
0
    }
13107
0
    return 0;
13108
0
}
13109
13110
/*[clinic input]
13111
13112
@staticmethod
13113
str.maketrans as unicode_maketrans
13114
13115
  x: object
13116
13117
  y: unicode=NULL
13118
13119
  z: unicode=NULL
13120
13121
  /
13122
13123
Return a translation table usable for str.translate().
13124
13125
If there is only one argument, it must be a dictionary mapping Unicode
13126
ordinals (integers) or characters to Unicode ordinals, strings or None.
13127
Character keys will be then converted to ordinals.
13128
If there are two arguments, they must be strings of equal length, and
13129
in the resulting dictionary, each character in x will be mapped to the
13130
character at the same position in y. If there is a third argument, it
13131
must be a string, whose characters will be mapped to None in the result.
13132
[clinic start generated code]*/
13133
13134
static PyObject *
unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
{
    /* str.maketrans(): build and return a new dict.

       - y == NULL: `x` must be a dict (PyAnyDict_CheckExact); its entries
         are copied by unicode_maketrans_from_dict().
       - y != NULL: `x` must be a str of the same length as `y`; code point
         x[i] is mapped to code point y[i].  If `z` is given, each of its
         code points is mapped to None.

       On any failure the partially built dict is released and NULL is
       returned with an exception set. */
    PyObject *table = PyDict_New();
    if (table == NULL) {
        return NULL;
    }

    if (y == NULL) {
        /* x must be a dict */
        if (!PyAnyDict_CheckExact(x)) {
            PyErr_SetString(PyExc_TypeError, "if you give only one argument "
                            "to maketrans it must be a dict");
            goto err;
        }
        /* copy entries into the new dict, converting string keys to int
           keys; the status is carried out of the critical section so the
           error jump happens after it has been closed */
        int status;
        Py_BEGIN_CRITICAL_SECTION(x);
        status = unicode_maketrans_from_dict(x, table);
        Py_END_CRITICAL_SECTION();
        if (status < 0) {
            goto err;
        }
    }
    else {
        /* x must be a string too, of equal length */
        if (!PyUnicode_Check(x)) {
            PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
                            "be a string if there is a second argument");
            goto err;
        }
        const Py_ssize_t x_len = PyUnicode_GET_LENGTH(x);
        if (x_len != PyUnicode_GET_LENGTH(y)) {
            PyErr_SetString(PyExc_ValueError, "the first two maketrans "
                            "arguments must have equal length");
            goto err;
        }

        /* create entries for translating chars in x to those in y */
        const int x_kind = PyUnicode_KIND(x);
        const int y_kind = PyUnicode_KIND(y);
        const void *x_data = PyUnicode_DATA(x);
        const void *y_data = PyUnicode_DATA(y);
        for (Py_ssize_t pos = 0; pos < x_len; pos++) {
            PyObject *ordinal = PyLong_FromLong(
                PyUnicode_READ(x_kind, x_data, pos));
            if (ordinal == NULL) {
                goto err;
            }
            PyObject *target = PyLong_FromLong(
                PyUnicode_READ(y_kind, y_data, pos));
            if (target == NULL) {
                Py_DECREF(ordinal);
                goto err;
            }
            int rc = PyDict_SetItem(table, ordinal, target);
            Py_DECREF(ordinal);
            Py_DECREF(target);
            if (rc < 0) {
                goto err;
            }
        }

        /* create entries for deleting chars in z */
        if (z != NULL) {
            const int z_kind = PyUnicode_KIND(z);
            const void *z_data = PyUnicode_DATA(z);
            const Py_ssize_t z_len = PyUnicode_GET_LENGTH(z);
            for (Py_ssize_t pos = 0; pos < z_len; pos++) {
                PyObject *ordinal = PyLong_FromLong(
                    PyUnicode_READ(z_kind, z_data, pos));
                if (ordinal == NULL) {
                    goto err;
                }
                int rc = PyDict_SetItem(table, ordinal, Py_None);
                Py_DECREF(ordinal);
                if (rc < 0) {
                    goto err;
                }
            }
        }
    }
    return table;

  err:
    Py_DECREF(table);
    return NULL;
}
13214
13215
/*[clinic input]
13216
@permit_long_docstring_body
13217
str.translate as unicode_translate
13218
13219
    table: object
13220
        Translation table, which must be a mapping of Unicode ordinals to
13221
        Unicode ordinals, strings, or None.
13222
    /
13223
13224
Replace each character in the string using the given translation table.
13225
13226
The table must implement lookup/indexing via __getitem__, for instance a
13227
dictionary or list.  If this operation raises LookupError, the character is
13228
left untouched.  Characters mapped to None are deleted.
13229
[clinic start generated code]*/
13230
13231
static PyObject *
unicode_translate(PyObject *self, PyObject *table)
/*[clinic end generated code: output=3cb448ff2fd96bf3 input=699e5fa0ebf9f5e9]*/
{
    /* str.translate(): delegate to _PyUnicode_TranslateCharmap(), always
       passing "ignore" as its third argument. */
    PyObject *translated = _PyUnicode_TranslateCharmap(self, table, "ignore");
    return translated;
}
13237
13238
/*[clinic input]
13239
str.upper as unicode_upper
13240
13241
Return a copy of the string converted to uppercase.
13242
[clinic start generated code]*/
13243
13244
static PyObject *
unicode_upper_impl(PyObject *self)
/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
{
    /* str.upper(): ASCII strings go through ascii_upper_or_lower() with a
       second argument of 0; every other string goes through the generic
       case_operation() driver with the do_upper callback. */
    if (!PyUnicode_IS_ASCII(self)) {
        return case_operation(self, do_upper);
    }
    return ascii_upper_or_lower(self, 0);
}
13252
13253
/*[clinic input]
13254
@permit_long_summary
13255
str.zfill as unicode_zfill
13256
13257
    width: Py_ssize_t
13258
    /
13259
13260
Pad a numeric string with zeros on the left, to fill a field of the given width.
13261
13262
The string is never truncated.
13263
[clinic start generated code]*/
13264
13265
static PyObject *
13266
unicode_zfill_impl(PyObject *self, Py_ssize_t width)
13267
/*[clinic end generated code: output=e13fb6bdf8e3b9df input=25a4ee0ea3e58ce0]*/
13268
0
{
13269
0
    Py_ssize_t fill;
13270
0
    PyObject *u;
13271
0
    int kind;
13272
0
    const void *data;
13273
0
    Py_UCS4 chr;
13274
13275
0
    if (PyUnicode_GET_LENGTH(self) >= width)
13276
0
        return unicode_result_unchanged(self);
13277
13278
0
    fill = width - PyUnicode_GET_LENGTH(self);
13279
13280
0
    u = pad(self, fill, 0, '0');
13281
13282
0
    if (u == NULL)
13283
0
        return NULL;
13284
13285
0
    kind = PyUnicode_KIND(u);
13286
0
    data = PyUnicode_DATA(u);
13287
0
    chr = PyUnicode_READ(kind, data, fill);
13288
13289
0
    if (chr == '+' || chr == '-') {
13290
        /* move sign to beginning of string */
13291
0
        PyUnicode_WRITE(kind, data, 0, chr);
13292
0
        PyUnicode_WRITE(kind, data, fill, '0');
13293
0
    }
13294
13295
0
    assert(_PyUnicode_CheckConsistency(u, 1));
13296
0
    return u;
13297
0
}
13298
13299
/*[clinic input]
13300
@permit_long_summary
13301
@text_signature "($self, prefix[, start[, end]], /)"
13302
str.startswith as unicode_startswith
13303
13304
    prefix as subobj: object
13305
        A string or a tuple of strings to try.
13306
    start: slice_index(accept={int, NoneType}, c_default='0') = None
13307
        Optional start position. Default: start of the string.
13308
    end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
13309
        Optional stop position. Default: end of the string.
13310
    /
13311
13312
Return True if the string starts with the specified prefix, False otherwise.
13313
[clinic start generated code]*/
13314
13315
static PyObject *
13316
unicode_startswith_impl(PyObject *self, PyObject *subobj, Py_ssize_t start,
13317
                        Py_ssize_t end)
13318
/*[clinic end generated code: output=4bd7cfd0803051d4 input=766bdbd33df251dc]*/
13319
235k
{
13320
235k
    if (PyTuple_Check(subobj)) {
13321
120
        Py_ssize_t i;
13322
840
        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13323
720
            PyObject *substring = PyTuple_GET_ITEM(subobj, i);
13324
720
            if (!PyUnicode_Check(substring)) {
13325
0
                PyErr_Format(PyExc_TypeError,
13326
0
                             "tuple for startswith must only contain str, "
13327
0
                             "not %.100s",
13328
0
                             Py_TYPE(substring)->tp_name);
13329
0
                return NULL;
13330
0
            }
13331
720
            int result = tailmatch(self, substring, start, end, -1);
13332
720
            if (result < 0) {
13333
0
                return NULL;
13334
0
            }
13335
720
            if (result) {
13336
0
                Py_RETURN_TRUE;
13337
0
            }
13338
720
        }
13339
        /* nothing matched */
13340
120
        Py_RETURN_FALSE;
13341
120
    }
13342
235k
    if (!PyUnicode_Check(subobj)) {
13343
0
        PyErr_Format(PyExc_TypeError,
13344
0
                     "startswith first arg must be str or "
13345
0
                     "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13346
0
        return NULL;
13347
0
    }
13348
235k
    int result = tailmatch(self, subobj, start, end, -1);
13349
235k
    if (result < 0) {
13350
0
        return NULL;
13351
0
    }
13352
235k
    return PyBool_FromLong(result);
13353
235k
}
13354
13355
13356
/*[clinic input]
13357
@permit_long_summary
13358
@text_signature "($self, suffix[, start[, end]], /)"
13359
str.endswith as unicode_endswith
13360
13361
    suffix as subobj: object
13362
        A string or a tuple of strings to try.
13363
    start: slice_index(accept={int, NoneType}, c_default='0') = None
13364
        Optional start position. Default: start of the string.
13365
    end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
13366
        Optional stop position. Default: end of the string.
13367
    /
13368
13369
Return True if the string ends with the specified suffix, False otherwise.
13370
[clinic start generated code]*/
13371
13372
static PyObject *
13373
unicode_endswith_impl(PyObject *self, PyObject *subobj, Py_ssize_t start,
13374
                      Py_ssize_t end)
13375
/*[clinic end generated code: output=cce6f8ceb0102ca9 input=b66bf6d5547ba1aa]*/
13376
116k
{
13377
116k
    if (PyTuple_Check(subobj)) {
13378
0
        Py_ssize_t i;
13379
0
        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13380
0
            PyObject *substring = PyTuple_GET_ITEM(subobj, i);
13381
0
            if (!PyUnicode_Check(substring)) {
13382
0
                PyErr_Format(PyExc_TypeError,
13383
0
                             "tuple for endswith must only contain str, "
13384
0
                             "not %.100s",
13385
0
                             Py_TYPE(substring)->tp_name);
13386
0
                return NULL;
13387
0
            }
13388
0
            int result = tailmatch(self, substring, start, end, +1);
13389
0
            if (result < 0) {
13390
0
                return NULL;
13391
0
            }
13392
0
            if (result) {
13393
0
                Py_RETURN_TRUE;
13394
0
            }
13395
0
        }
13396
0
        Py_RETURN_FALSE;
13397
0
    }
13398
116k
    if (!PyUnicode_Check(subobj)) {
13399
0
        PyErr_Format(PyExc_TypeError,
13400
0
                     "endswith first arg must be str or "
13401
0
                     "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13402
0
        return NULL;
13403
0
    }
13404
116k
    int result = tailmatch(self, subobj, start, end, +1);
13405
116k
    if (result < 0) {
13406
0
        return NULL;
13407
0
    }
13408
116k
    return PyBool_FromLong(result);
13409
116k
}
13410
13411
13412
#include "stringlib/unicode_format.h"
13413
13414
/* Docstring attached to the "format" entry of unicode_methods[].  The text
   is a single string literal continued across lines with backslashes. */
PyDoc_STRVAR(format__doc__,
13415
             "format($self, /, *args, **kwargs)\n\
13416
--\n\
13417
\n\
13418
Return a formatted version of the string, using substitutions from args and kwargs.\n\
13419
The substitutions are identified by braces ('{' and '}').");
13420
13421
/* Docstring attached to the "format_map" entry of unicode_methods[].  The
   text is a single string literal continued across lines with
   backslashes. */
PyDoc_STRVAR(format_map__doc__,
13422
             "format_map($self, mapping, /)\n\
13423
--\n\
13424
\n\
13425
Return a formatted version of the string, using substitutions from mapping.\n\
13426
The substitutions are identified by braces ('{' and '}').");
13427
13428
/*[clinic input]
13429
str.__format__ as unicode___format__
13430
13431
    format_spec: unicode
13432
    /
13433
13434
Return a formatted version of the string as described by format_spec.
13435
[clinic start generated code]*/
13436
13437
static PyObject *
unicode___format___impl(PyObject *self, PyObject *format_spec)
/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
{
    /* str.__format__(): format `self` into a _PyUnicodeWriter using the
       whole of `format_spec` (offsets 0 .. its length).  On success the
       writer is finished and its result returned; when the formatter
       returns -1 the writer is deallocated and NULL is returned. */
    _PyUnicodeWriter writer;
    _PyUnicodeWriter_Init(&writer);

    const Py_ssize_t spec_len = PyUnicode_GET_LENGTH(format_spec);
    if (_PyUnicode_FormatAdvancedWriter(&writer, self, format_spec,
                                        0, spec_len) != -1) {
        return _PyUnicodeWriter_Finish(&writer);
    }

    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
13454
13455
/*[clinic input]
13456
str.__sizeof__ as unicode_sizeof
13457
13458
Return the size of the string in memory, in bytes.
13459
[clinic start generated code]*/
13460
13461
static PyObject *
unicode_sizeof_impl(PyObject *self)
/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
{
    /* str.__sizeof__(): add up the object header, the character data and,
       when present, the separately held UTF-8 buffer. */

    /* Every data term below uses length + 1 units. */
    const Py_ssize_t units = PyUnicode_GET_LENGTH(self) + 1;
    Py_ssize_t total;

    if (PyUnicode_IS_COMPACT_ASCII(self)) {
        /* compact ASCII: base structure + one byte per unit */
        total = sizeof(PyASCIIObject) + units;
    }
    else if (PyUnicode_IS_COMPACT(self)) {
        /* compact non-ASCII: base structure + `kind` bytes per unit */
        total = sizeof(PyCompactUnicodeObject) +
            units * PyUnicode_KIND(self);
    }
    else {
        /* two-block object: base object, plus the character block if
           one is present */
        total = sizeof(PyUnicodeObject);
        if (_PyUnicode_DATA_ANY(self)) {
            total += units * PyUnicode_KIND(self);
        }
    }

    if (_PyUnicode_HAS_UTF8_MEMORY(self)) {
        total += PyUnicode_UTF8_LENGTH(self) + 1;
    }

    return PyLong_FromSsize_t(total);
}
13489
13490
/* __getnewargs__ implementation: copy `v` with _PyUnicode_Copy() and wrap
   the copy in a 1-tuple built with the "(N)" format, which hands the
   reference to the copy over to the tuple.  Returns NULL if the copy
   fails. */
static PyObject *
unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
{
    PyObject *duplicate = _PyUnicode_Copy(v);
    return (duplicate != NULL) ? Py_BuildValue("(N)", duplicate) : NULL;
}
13498
13499
/*
13500
This function searchs the longest common leading whitespace
13501
of all lines in the [src, end).
13502
It returns the length of the common leading whitespace and sets `output` to
13503
point to the beginning of the common leading whitespace if length > 0.
13504
*/
13505
static Py_ssize_t
13506
search_longest_common_leading_whitespace(
13507
    const char *const src,
13508
    const char *const end,
13509
    const char **output)
13510
0
{
13511
    // [_start, _start + _len)
13512
    // describes the current longest common leading whitespace
13513
0
    const char *_start = NULL;
13514
0
    Py_ssize_t _len = 0;
13515
13516
0
    for (const char *iter = src; iter < end; ++iter) {
13517
0
        const char *line_start = iter;
13518
0
        const char *leading_whitespace_end = NULL;
13519
13520
        // scan the whole line
13521
0
        while (iter < end && *iter != '\n') {
13522
0
            if (!leading_whitespace_end && *iter != ' ' && *iter != '\t') {
13523
                /* `iter` points to the first non-whitespace character
13524
                   in this line */
13525
0
                if (iter == line_start) {
13526
                    // some line has no indent, fast exit!
13527
0
                    return 0;
13528
0
                }
13529
0
                leading_whitespace_end = iter;
13530
0
            }
13531
0
            ++iter;
13532
0
        }
13533
13534
        // if this line has all white space, skip it
13535
0
        if (!leading_whitespace_end) {
13536
0
            continue;
13537
0
        }
13538
13539
0
        if (!_start) {
13540
            // update the first leading whitespace
13541
0
            _start = line_start;
13542
0
            _len = leading_whitespace_end - line_start;
13543
0
            assert(_len > 0);
13544
0
        }
13545
0
        else {
13546
            /* We then compare with the current longest leading whitespace.
13547
13548
               [line_start, leading_whitespace_end) is the leading
13549
               whitespace of this line,
13550
13551
               [_start, _start + _len) is the leading whitespace of the
13552
               current longest leading whitespace. */
13553
0
            Py_ssize_t new_len = 0;
13554
0
            const char *_iter = _start, *line_iter = line_start;
13555
13556
0
            while (_iter < _start + _len && line_iter < leading_whitespace_end
13557
0
                   && *_iter == *line_iter)
13558
0
            {
13559
0
                ++_iter;
13560
0
                ++line_iter;
13561
0
                ++new_len;
13562
0
            }
13563
13564
0
            _len = new_len;
13565
0
            if (_len == 0) {
13566
                // No common things now, fast exit!
13567
0
                return 0;
13568
0
            }
13569
0
        }
13570
0
    }
13571
13572
0
    assert(_len >= 0);
13573
0
    if (_len > 0) {
13574
0
        *output = _start;
13575
0
    }
13576
0
    return _len;
13577
0
}
13578
13579
/* Dedent a string.
13580
   Intended to dedent Python source. Unlike `textwrap.dedent`, this
13581
   only supports spaces and tabs and doesn't normalize empty lines.
13582
   Return a new reference on success, NULL with exception set on error.
13583
   */
13584
PyObject *
13585
_PyUnicode_Dedent(PyObject *unicode)
13586
0
{
13587
0
    Py_ssize_t src_len = 0;
13588
0
    const char *src = PyUnicode_AsUTF8AndSize(unicode, &src_len);
13589
0
    if (!src) {
13590
0
        return NULL;
13591
0
    }
13592
0
    assert(src_len >= 0);
13593
0
    if (src_len == 0) {
13594
0
        return Py_NewRef(unicode);
13595
0
    }
13596
13597
0
    const char *const end = src + src_len;
13598
13599
    // [whitespace_start, whitespace_start + whitespace_len)
13600
    // describes the current longest common leading whitespace
13601
0
    const char *whitespace_start = NULL;
13602
0
    Py_ssize_t whitespace_len = search_longest_common_leading_whitespace(
13603
0
        src, end, &whitespace_start);
13604
13605
0
    if (whitespace_len == 0) {
13606
0
        return Py_NewRef(unicode);
13607
0
    }
13608
13609
    // now we should trigger a dedent
13610
0
    char *dest = PyMem_Malloc(src_len);
13611
0
    if (!dest) {
13612
0
        PyErr_NoMemory();
13613
0
        return NULL;
13614
0
    }
13615
0
    char *dest_iter = dest;
13616
13617
0
    for (const char *iter = src; iter < end; ++iter) {
13618
0
        const char *line_start = iter;
13619
0
        bool in_leading_space = true;
13620
13621
        // iterate over a line to find the end of a line
13622
0
        while (iter < end && *iter != '\n') {
13623
0
            if (in_leading_space && *iter != ' ' && *iter != '\t') {
13624
0
                in_leading_space = false;
13625
0
            }
13626
0
            ++iter;
13627
0
        }
13628
13629
        // invariant: *iter == '\n' or iter == end
13630
0
        bool append_newline = iter < end;
13631
13632
        // if this line has all white space, write '\n' and continue
13633
0
        if (in_leading_space && append_newline) {
13634
0
            *dest_iter++ = '\n';
13635
0
            continue;
13636
0
        }
13637
13638
        /* copy [new_line_start + whitespace_len, iter) to buffer, then
13639
            conditionally append '\n' */
13640
13641
0
        Py_ssize_t new_line_len = iter - line_start - whitespace_len;
13642
0
        assert(new_line_len >= 0);
13643
0
        memcpy(dest_iter, line_start + whitespace_len, new_line_len);
13644
13645
0
        dest_iter += new_line_len;
13646
13647
0
        if (append_newline) {
13648
0
            *dest_iter++ = '\n';
13649
0
        }
13650
0
    }
13651
13652
0
    PyObject *res = PyUnicode_FromStringAndSize(dest, dest_iter - dest);
13653
0
    PyMem_Free(dest);
13654
0
    return res;
13655
0
}
13656
13657
/* Method table for the str type.

   Each UNICODE_*_METHODDEF macro is defined outside this part of the file
   (presumably in the Argument Clinic generated header - confirm) and is
   used here without a trailing comma, so it has to supply its own.  The
   entries written out by hand are "format", "format_map" and
   "__getnewargs__".  The {NULL, NULL} entry terminates the table. */
static PyMethodDef unicode_methods[] = {
13658
    UNICODE_ENCODE_METHODDEF
13659
    UNICODE_REPLACE_METHODDEF
13660
    UNICODE_SPLIT_METHODDEF
13661
    UNICODE_RSPLIT_METHODDEF
13662
    UNICODE_JOIN_METHODDEF
13663
    UNICODE_CAPITALIZE_METHODDEF
13664
    UNICODE_CASEFOLD_METHODDEF
13665
    UNICODE_TITLE_METHODDEF
13666
    UNICODE_CENTER_METHODDEF
13667
    UNICODE_COUNT_METHODDEF
13668
    UNICODE_EXPANDTABS_METHODDEF
13669
    UNICODE_FIND_METHODDEF
13670
    UNICODE_PARTITION_METHODDEF
13671
    UNICODE_INDEX_METHODDEF
13672
    UNICODE_LJUST_METHODDEF
13673
    UNICODE_LOWER_METHODDEF
13674
    UNICODE_LSTRIP_METHODDEF
13675
    UNICODE_RFIND_METHODDEF
13676
    UNICODE_RINDEX_METHODDEF
13677
    UNICODE_RJUST_METHODDEF
13678
    UNICODE_RSTRIP_METHODDEF
13679
    UNICODE_RPARTITION_METHODDEF
13680
    UNICODE_SPLITLINES_METHODDEF
13681
    UNICODE_STRIP_METHODDEF
13682
    UNICODE_SWAPCASE_METHODDEF
13683
    UNICODE_TRANSLATE_METHODDEF
13684
    UNICODE_UPPER_METHODDEF
13685
    UNICODE_STARTSWITH_METHODDEF
13686
    UNICODE_ENDSWITH_METHODDEF
13687
    UNICODE_REMOVEPREFIX_METHODDEF
13688
    UNICODE_REMOVESUFFIX_METHODDEF
13689
    UNICODE_ISASCII_METHODDEF
13690
    UNICODE_ISLOWER_METHODDEF
13691
    UNICODE_ISUPPER_METHODDEF
13692
    UNICODE_ISTITLE_METHODDEF
13693
    UNICODE_ISSPACE_METHODDEF
13694
    UNICODE_ISDECIMAL_METHODDEF
13695
    UNICODE_ISDIGIT_METHODDEF
13696
    UNICODE_ISNUMERIC_METHODDEF
13697
    UNICODE_ISALPHA_METHODDEF
13698
    UNICODE_ISALNUM_METHODDEF
13699
    UNICODE_ISIDENTIFIER_METHODDEF
13700
    UNICODE_ISPRINTABLE_METHODDEF
13701
    UNICODE_ZFILL_METHODDEF
13702
    {"format", _PyCFunction_CAST(do_string_format), METH_VARARGS | METH_KEYWORDS, format__doc__},
13703
    {"format_map", do_string_format_map, METH_O, format_map__doc__},
13704
    UNICODE___FORMAT___METHODDEF
13705
    UNICODE_MAKETRANS_METHODDEF
13706
    UNICODE_SIZEOF_METHODDEF
13707
    {"__getnewargs__",  unicode_getnewargs, METH_NOARGS},
13708
    {NULL, NULL}
13709
};
13710
13711
/* nb_remainder slot (the `%` operator).  When the left operand `v` is a
   str the work is done by PyUnicode_Format(v, w); otherwise
   NotImplemented is returned. */
static PyObject *
unicode_mod(PyObject *v, PyObject *w)
{
    if (PyUnicode_Check(v)) {
        return PyUnicode_Format(v, w);
    }
    Py_RETURN_NOTIMPLEMENTED;
}
13718
13719
static PyNumberMethods unicode_as_number = {
13720
    0,              /*nb_add*/
13721
    0,              /*nb_subtract*/
13722
    0,              /*nb_multiply*/
13723
    unicode_mod,            /*nb_remainder*/
13724
};
13725
13726
static PySequenceMethods unicode_as_sequence = {
13727
    unicode_length,     /* sq_length */
13728
    PyUnicode_Concat,   /* sq_concat */
13729
    unicode_repeat,     /* sq_repeat */
13730
    unicode_getitem,    /* sq_item */
13731
    0,                  /* sq_slice */
13732
    0,                  /* sq_ass_item */
13733
    0,                  /* sq_ass_slice */
13734
    PyUnicode_Contains, /* sq_contains */
13735
};
13736
13737
/* mp_subscript slot: self[item].

   - `item` supports the index protocol: a negative index has the string
     length added to it once, then unicode_getitem() does the lookup.
   - `item` is a slice: empty results, whole-string slices and step-1
     slices take shortcuts; any other step copies the selected characters
     one by one into a new string sized for the largest of them.
   - anything else: TypeError.

   Returns a new reference, or NULL with an exception set. */
static PyObject*
unicode_subscript(PyObject* self, PyObject* item)
{
    if (_PyIndex_Check(item)) {
        Py_ssize_t index = PyNumber_AsSsize_t(item, PyExc_IndexError);
        if (index == -1 && PyErr_Occurred()) {
            return NULL;
        }
        if (index < 0) {
            index += PyUnicode_GET_LENGTH(self);
        }
        return unicode_getitem(self, index);
    }

    if (!PySlice_Check(item)) {
        PyErr_Format(PyExc_TypeError, "string indices must be integers, not '%.200s'",
                     Py_TYPE(item)->tp_name);
        return NULL;
    }

    Py_ssize_t start, stop, step;
    if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
        return NULL;
    }
    const Py_ssize_t count = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
                                                   &start, &stop, step);

    if (count <= 0) {
        _Py_RETURN_UNICODE_EMPTY();
    }
    if (step == 1) {
        if (start == 0 && count == PyUnicode_GET_LENGTH(self)) {
            /* the slice covers the whole string */
            return unicode_result_unchanged(self);
        }
        return PyUnicode_Substring(self, start, start + count);
    }

    /* General case */
    const int src_kind = PyUnicode_KIND(self);
    const void *src_data = PyUnicode_DATA(self);

    /* Find the largest selected character so the result can be created
       with a matching maximum.  The scan stops early once the limit
       reported by kind_maxchar_limit() for the source kind is reached. */
    Py_UCS4 max_char;
    if (PyUnicode_IS_ASCII(self)) {
        max_char = 127;
    }
    else {
        const Py_UCS4 kind_limit = kind_maxchar_limit(src_kind);
        max_char = 0;
        /* the position is kept unsigned, as `step` may be negative and is
           added with wrap-around */
        size_t pos = start;
        for (Py_ssize_t n = 0; n < count; n++, pos += step) {
            const Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, pos);
            if (ch > max_char) {
                max_char = ch;
                if (max_char >= kind_limit) {
                    break;
                }
            }
        }
    }

    PyObject *result = PyUnicode_New(count, max_char);
    if (result == NULL) {
        return NULL;
    }
    const int dest_kind = PyUnicode_KIND(result);
    void *dest_data = PyUnicode_DATA(result);

    size_t pos = start;
    for (Py_ssize_t n = 0; n < count; n++, pos += step) {
        const Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, pos);
        PyUnicode_WRITE(dest_kind, dest_data, n, ch);
    }
    assert(_PyUnicode_CheckConsistency(result, 1));
    return result;
}
13806
13807
static PyMappingMethods unicode_as_mapping = {
13808
    unicode_length,     /* mp_length */
13809
    unicode_subscript,  /* mp_subscript */
13810
    0,                  /* mp_ass_subscript */
13811
};
13812
13813
13814
static PyObject *
13815
unicode_subtype_new(PyTypeObject *type, PyObject *unicode);
13816
13817
/*[clinic input]
13818
@classmethod
13819
str.__new__ as unicode_new
13820
13821
    object as x: object = NULL
13822
    encoding: str = NULL
13823
    errors: str = NULL
13824
13825
[clinic start generated code]*/
13826
13827
static PyObject *
13828
unicode_new_impl(PyTypeObject *type, PyObject *x, const char *encoding,
13829
                 const char *errors)
13830
/*[clinic end generated code: output=fc72d4878b0b57e9 input=e81255e5676d174e]*/
13831
49
{
13832
49
    PyObject *unicode;
13833
49
    if (x == NULL) {
13834
0
        unicode = _PyUnicode_GetEmpty();
13835
0
    }
13836
49
    else if (encoding == NULL && errors == NULL) {
13837
49
        unicode = PyObject_Str(x);
13838
49
    }
13839
0
    else {
13840
0
        unicode = PyUnicode_FromEncodedObject(x, encoding, errors);
13841
0
    }
13842
13843
49
    if (unicode != NULL && type != &PyUnicode_Type) {
13844
49
        Py_SETREF(unicode, unicode_subtype_new(type, unicode));
13845
49
    }
13846
49
    return unicode;
13847
49
}
13848
13849
static const char *
13850
arg_as_utf8(PyObject *obj, const char *name)
13851
14.8k
{
13852
14.8k
    if (!PyUnicode_Check(obj)) {
13853
0
        PyErr_Format(PyExc_TypeError,
13854
0
                     "str() argument '%s' must be str, not %T",
13855
0
                     name, obj);
13856
0
        return NULL;
13857
0
    }
13858
14.8k
    return _PyUnicode_AsUTF8NoNUL(obj);
13859
14.8k
}
13860
13861
/* Vectorcall entry point for str(...).

   Calls carrying keyword arguments are repacked into a tuple and dict and
   forwarded to unicode_new().  Positional-only calls accept 0 to 3
   arguments: (), (object), (object, encoding), (object, encoding, errors). */
static PyObject *
unicode_vectorcall(PyObject *type, PyObject *const *args,
                   size_t nargsf, PyObject *kwnames)
{
    assert(Py_Is(_PyType_CAST(type), &PyUnicode_Type));

    Py_ssize_t nargs = PyVectorcall_NARGS(nargsf);

    if (kwnames != NULL && PyTuple_GET_SIZE(kwnames) != 0) {
        // Fallback to unicode_new()
        PyObject *posargs = PyTuple_FromArray(args, nargs);
        if (posargs == NULL) {
            return NULL;
        }
        PyObject *result = NULL;
        PyObject *kwargs = _PyStack_AsDict(args + nargs, kwnames);
        if (kwargs != NULL) {
            result = unicode_new(_PyType_CAST(type), posargs, kwargs);
        }
        Py_DECREF(posargs);
        Py_XDECREF(kwargs);
        return result;
    }

    if (!_PyArg_CheckPositional("str", nargs, 0, 3)) {
        return NULL;
    }

    switch (nargs) {
    case 0:
        return _PyUnicode_GetEmpty();
    case 1:
        return PyObject_Str(args[0]);
    default:
        break;
    }

    /* Two or three arguments: decode args[0] with the given settings. */
    const char *encoding = arg_as_utf8(args[1], "encoding");
    if (encoding == NULL) {
        return NULL;
    }

    const char *errors = NULL;
    if (nargs == 3) {
        errors = arg_as_utf8(args[2], "errors");
        if (errors == NULL) {
            return NULL;
        }
    }

    return PyUnicode_FromEncodedObject(args[0], encoding, errors);
}
13907
13908
static PyObject *
13909
unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
13910
49
{
13911
49
    PyObject *self;
13912
49
    Py_ssize_t length, char_size;
13913
49
    int share_utf8;
13914
49
    int kind;
13915
49
    void *data;
13916
13917
49
    assert(PyType_IsSubtype(type, &PyUnicode_Type));
13918
49
    assert(_PyUnicode_CHECK(unicode));
13919
13920
49
    self = type->tp_alloc(type, 0);
13921
49
    if (self == NULL) {
13922
0
        return NULL;
13923
0
    }
13924
49
    kind = PyUnicode_KIND(unicode);
13925
49
    length = PyUnicode_GET_LENGTH(unicode);
13926
13927
49
    _PyUnicode_LENGTH(self) = length;
13928
#ifdef Py_DEBUG
13929
    _PyUnicode_HASH(self) = -1;
13930
#else
13931
49
    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13932
0
#endif
13933
49
    _PyUnicode_STATE(self).interned = 0;
13934
49
    _PyUnicode_STATE(self).kind = kind;
13935
49
    _PyUnicode_STATE(self).compact = 0;
13936
49
    _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
13937
49
    _PyUnicode_STATE(self).statically_allocated = 0;
13938
0
    PyUnicode_SET_UTF8_LENGTH(self, 0);
13939
49
    PyUnicode_SET_UTF8(self, NULL);
13940
49
    _PyUnicode_DATA_ANY(self) = NULL;
13941
13942
0
    share_utf8 = 0;
13943
49
    if (kind == PyUnicode_1BYTE_KIND) {
13944
49
        char_size = 1;
13945
49
        if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13946
49
            share_utf8 = 1;
13947
49
    }
13948
0
    else if (kind == PyUnicode_2BYTE_KIND) {
13949
0
        char_size = 2;
13950
0
    }
13951
0
    else {
13952
0
        assert(kind == PyUnicode_4BYTE_KIND);
13953
0
        char_size = 4;
13954
0
    }
13955
13956
    /* Ensure we won't overflow the length. */
13957
49
    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13958
0
        PyErr_NoMemory();
13959
0
        goto onError;
13960
0
    }
13961
49
    data = PyMem_Malloc((length + 1) * char_size);
13962
49
    if (data == NULL) {
13963
0
        PyErr_NoMemory();
13964
0
        goto onError;
13965
0
    }
13966
13967
98
    _PyUnicode_DATA_ANY(self) = data;
13968
49
    if (share_utf8) {
13969
49
        PyUnicode_SET_UTF8_LENGTH(self, length);
13970
49
        PyUnicode_SET_UTF8(self, data);
13971
49
    }
13972
13973
98
    memcpy(data, PyUnicode_DATA(unicode), kind * (length + 1));
13974
98
    assert(_PyUnicode_CheckConsistency(self, 1));
13975
#ifdef Py_DEBUG
13976
    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13977
#endif
13978
49
    return self;
13979
13980
0
onError:
13981
0
    Py_DECREF(self);
13982
0
    return NULL;
13983
98
}
13984
13985
void
13986
_PyUnicode_ExactDealloc(PyObject *op)
13987
505k
{
13988
505k
    assert(PyUnicode_CheckExact(op));
13989
505k
    unicode_dealloc(op);
13990
505k
}
13991
13992
PyDoc_STRVAR(unicode_doc,
13993
"str(object='') -> str\n\
13994
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
13995
\n\
13996
Create a new string object from the given object. If encoding or\n\
13997
errors is specified, then the object must expose a data buffer\n\
13998
that will be decoded using the given encoding and error handler.\n\
13999
Otherwise, returns the result of object.__str__() (if defined)\n\
14000
or repr(object).\n\
14001
encoding defaults to 'utf-8'.\n\
14002
errors defaults to 'strict'.");
14003
14004
static PyObject *unicode_iter(PyObject *seq);
14005
14006
PyTypeObject PyUnicode_Type = {
14007
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
14008
    "str",                        /* tp_name */
14009
    sizeof(PyUnicodeObject),      /* tp_basicsize */
14010
    0,                            /* tp_itemsize */
14011
    /* Slots */
14012
    unicode_dealloc,              /* tp_dealloc */
14013
    0,                            /* tp_vectorcall_offset */
14014
    0,                            /* tp_getattr */
14015
    0,                            /* tp_setattr */
14016
    0,                            /* tp_as_async */
14017
    unicode_repr,                 /* tp_repr */
14018
    &unicode_as_number,           /* tp_as_number */
14019
    &unicode_as_sequence,         /* tp_as_sequence */
14020
    &unicode_as_mapping,          /* tp_as_mapping */
14021
    unicode_hash,                 /* tp_hash*/
14022
    0,                            /* tp_call*/
14023
    unicode_str,                  /* tp_str */
14024
    PyObject_GenericGetAttr,      /* tp_getattro */
14025
    0,                            /* tp_setattro */
14026
    0,                            /* tp_as_buffer */
14027
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
14028
        Py_TPFLAGS_UNICODE_SUBCLASS |
14029
        _Py_TPFLAGS_MATCH_SELF, /* tp_flags */
14030
    unicode_doc,                  /* tp_doc */
14031
    0,                            /* tp_traverse */
14032
    0,                            /* tp_clear */
14033
    PyUnicode_RichCompare,        /* tp_richcompare */
14034
    0,                            /* tp_weaklistoffset */
14035
    unicode_iter,                 /* tp_iter */
14036
    0,                            /* tp_iternext */
14037
    unicode_methods,              /* tp_methods */
14038
    0,                            /* tp_members */
14039
    0,                            /* tp_getset */
14040
    0,                            /* tp_base */
14041
    0,                            /* tp_dict */
14042
    0,                            /* tp_descr_get */
14043
    0,                            /* tp_descr_set */
14044
    0,                            /* tp_dictoffset */
14045
    0,                            /* tp_init */
14046
    0,                            /* tp_alloc */
14047
    unicode_new,                  /* tp_new */
14048
    PyObject_Free,                /* tp_free */
14049
    .tp_vectorcall = unicode_vectorcall,
14050
};
14051
14052
/* Initialize the Unicode implementation */
14053
14054
static void
14055
_init_global_state(void)
14056
22
{
14057
22
    static int initialized = 0;
14058
22
    if (initialized) {
14059
0
        return;
14060
0
    }
14061
22
    initialized = 1;
14062
14063
    /* initialize the linebreak bloom filter */
14064
22
    const Py_UCS2 linebreak[] = {
14065
22
        0x000A, /* LINE FEED */
14066
22
        0x000D, /* CARRIAGE RETURN */
14067
22
        0x001C, /* FILE SEPARATOR */
14068
22
        0x001D, /* GROUP SEPARATOR */
14069
22
        0x001E, /* RECORD SEPARATOR */
14070
22
        0x0085, /* NEXT LINE */
14071
22
        0x2028, /* LINE SEPARATOR */
14072
22
        0x2029, /* PARAGRAPH SEPARATOR */
14073
22
    };
14074
22
    bloom_linebreak = make_bloom_mask(
14075
22
        PyUnicode_2BYTE_KIND, linebreak,
14076
22
        Py_ARRAY_LENGTH(linebreak));
14077
22
}
14078
14079
void
14080
_PyUnicode_InitState(PyInterpreterState *interp)
14081
22
{
14082
22
    if (!_Py_IsMainInterpreter(interp)) {
14083
0
        return;
14084
0
    }
14085
22
    _init_global_state();
14086
22
}
14087
14088
14089
PyStatus
14090
_PyUnicode_InitGlobalObjects(PyInterpreterState *interp)
14091
22
{
14092
22
    if (_Py_IsMainInterpreter(interp)) {
14093
22
        PyStatus status = init_global_interned_strings(interp);
14094
22
        if (_PyStatus_EXCEPTION(status)) {
14095
0
            return status;
14096
0
        }
14097
22
    }
14098
22
    assert(INTERNED_STRINGS);
14099
14100
22
    if (init_interned_dict(interp)) {
14101
0
        PyErr_Clear();
14102
0
        return _PyStatus_ERR("failed to create interned dict");
14103
0
    }
14104
14105
22
    return _PyStatus_OK();
14106
22
}
14107
14108
14109
PyStatus
14110
_PyUnicode_InitTypes(PyInterpreterState *interp)
14111
22
{
14112
22
    if (_PyStaticType_InitBuiltin(interp, &EncodingMapType) < 0) {
14113
0
        goto error;
14114
0
    }
14115
22
    if (_PyStaticType_InitBuiltin(interp, &PyFieldNameIter_Type) < 0) {
14116
0
        goto error;
14117
0
    }
14118
22
    if (_PyStaticType_InitBuiltin(interp, &PyFormatterIter_Type) < 0) {
14119
0
        goto error;
14120
0
    }
14121
22
    return _PyStatus_OK();
14122
14123
0
error:
14124
0
    return _PyStatus_ERR("Can't initialize unicode types");
14125
22
}
14126
14127
static /* non-null */ PyObject*
14128
intern_static(PyInterpreterState *interp, PyObject *s /* stolen */)
14129
24.5k
{
14130
    // Note that this steals a reference to `s`, but in many cases that
14131
    // stolen ref is returned, requiring no decref/incref.
14132
14133
24.5k
    assert(s != NULL);
14134
24.5k
    assert(_PyUnicode_CHECK(s));
14135
24.5k
    assert(_PyUnicode_STATE(s).statically_allocated);
14136
24.5k
    assert(!PyUnicode_CHECK_INTERNED(s));
14137
14138
#ifdef Py_DEBUG
14139
    /* We must not add process-global interned string if there's already a
14140
     * per-interpreter interned_dict, which might contain duplicates.
14141
     */
14142
    PyObject *interned = get_interned_dict(interp);
14143
    assert(interned == NULL);
14144
#endif
14145
14146
    /* Look in the global cache first. */
14147
24.5k
    PyObject *r = (PyObject *)_Py_hashtable_get(INTERNED_STRINGS, s);
14148
    /* We should only init each string once */
14149
24.5k
    assert(r == NULL);
14150
    /* but just in case (for the non-debug build), handle this */
14151
24.5k
    if (r != NULL && r != s) {
14152
0
        assert(_PyUnicode_STATE(r).interned == SSTATE_INTERNED_IMMORTAL_STATIC);
14153
0
        assert(_PyUnicode_CHECK(r));
14154
0
        Py_DECREF(s);
14155
0
        return Py_NewRef(r);
14156
0
    }
14157
14158
24.5k
    if (_Py_hashtable_set(INTERNED_STRINGS, s, s) < -1) {
14159
0
        Py_FatalError("failed to intern static string");
14160
0
    }
14161
14162
24.5k
    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_IMMORTAL_STATIC;
14163
0
    return s;
14164
24.5k
}
14165
14166
void
14167
_PyUnicode_InternStatic(PyInterpreterState *interp, PyObject **p)
14168
24.5k
{
14169
    // This should only be called as part of runtime initialization
14170
24.5k
    assert(!Py_IsInitialized());
14171
14172
24.5k
    *p = intern_static(interp, *p);
14173
24.5k
    assert(*p);
14174
24.5k
}
14175
14176
static void
14177
immortalize_interned(PyObject *s)
14178
91.2k
{
14179
91.2k
    assert(PyUnicode_CHECK_INTERNED(s) == SSTATE_INTERNED_MORTAL);
14180
91.2k
    assert(!_Py_IsImmortal(s));
14181
#ifdef Py_REF_DEBUG
14182
    /* The reference count value should be excluded from the RefTotal.
14183
       The decrements to these objects will not be registered so they
14184
       need to be accounted for in here. */
14185
    for (Py_ssize_t i = 0; i < Py_REFCNT(s); i++) {
14186
        _Py_DecRefTotal(_PyThreadState_GET());
14187
    }
14188
#endif
14189
91.2k
    _Py_SetImmortal(s);
14190
    // The switch to SSTATE_INTERNED_IMMORTAL must be the last thing done here
14191
    // to synchronize with the check in intern_common() that avoids locking if
14192
    // the string is already immortal.
14193
91.2k
    FT_ATOMIC_STORE_UINT8(_PyUnicode_STATE(s).interned, SSTATE_INTERNED_IMMORTAL);
14194
91.2k
}
14195
14196
static /* non-null */ PyObject*
14197
intern_common(PyInterpreterState *interp, PyObject *s /* stolen */,
14198
              bool immortalize)
14199
5.82M
{
14200
    // Note that this steals a reference to `s`, but in many cases that
14201
    // stolen ref is returned, requiring no decref/incref.
14202
14203
#ifdef Py_DEBUG
14204
    assert(s != NULL);
14205
    assert(_PyUnicode_CHECK(s));
14206
#else
14207
5.82M
    if (s == NULL || !PyUnicode_Check(s)) {
14208
0
        return s;
14209
0
    }
14210
5.82M
#endif
14211
14212
    /* If it's a subclass, we don't really know what putting
14213
       it in the interned dict might do. */
14214
5.82M
    if (!PyUnicode_CheckExact(s)) {
14215
0
        return s;
14216
0
    }
14217
14218
    /* Is it already interned? */
14219
5.82M
    switch (PyUnicode_CHECK_INTERNED(s)) {
14220
1.69M
        case SSTATE_NOT_INTERNED:
14221
            // no, go on
14222
1.69M
            break;
14223
42.2k
        case SSTATE_INTERNED_MORTAL:
14224
            // yes but we might need to make it immortal
14225
42.2k
            if (immortalize) {
14226
108
                immortalize_interned(s);
14227
108
            }
14228
42.2k
            return s;
14229
4.09M
        default:
14230
            // all done
14231
4.09M
            return s;
14232
5.82M
    }
14233
14234
    /* Statically allocated strings must be already interned. */
14235
5.82M
    assert(!_PyUnicode_STATE(s).statically_allocated);
14236
14237
#if Py_GIL_DISABLED
14238
    /* In the free-threaded build, all interned strings are immortal */
14239
    immortalize = 1;
14240
#endif
14241
14242
    /* If it's already immortal, intern it as such */
14243
1.69M
    if (_Py_IsImmortal(s)) {
14244
0
        immortalize = 1;
14245
0
    }
14246
14247
    /* if it's a short string, get the singleton */
14248
1.69M
    if (PyUnicode_GET_LENGTH(s) == 1 &&
14249
48.7k
                PyUnicode_KIND(s) == PyUnicode_1BYTE_KIND) {
14250
1
        PyObject *r = LATIN1(*(unsigned char*)PyUnicode_DATA(s));
14251
1
        assert(PyUnicode_CHECK_INTERNED(r));
14252
1
        Py_DECREF(s);
14253
1
        return r;
14254
1
    }
14255
#ifdef Py_DEBUG
14256
    assert(!unicode_is_singleton(s));
14257
#endif
14258
14259
    /* Look in the global cache now. */
14260
1.69M
    {
14261
1.69M
        PyObject *r = (PyObject *)_Py_hashtable_get(INTERNED_STRINGS, s);
14262
1.69M
        if (r != NULL) {
14263
122k
            assert(_PyUnicode_STATE(r).statically_allocated);
14264
122k
            assert(r != s);  // r must be statically_allocated; s is not
14265
122k
            Py_DECREF(s);
14266
122k
            return Py_NewRef(r);
14267
122k
        }
14268
1.69M
    }
14269
14270
    /* Do a setdefault on the per-interpreter cache. */
14271
1.57M
    PyObject *interned = get_interned_dict(interp);
14272
1.57M
    assert(interned != NULL);
14273
#ifdef Py_GIL_DISABLED
14274
#  define INTERN_MUTEX &_Py_INTERP_CACHED_OBJECT(interp, interned_mutex)
14275
    // Lock-free fast path: check if there's already an interned copy that
14276
    // is in its final immortal state.
14277
    PyObject *r;
14278
    int res = PyDict_GetItemRef(interned, s, &r);
14279
    if (res < 0) {
14280
        PyErr_Clear();
14281
        return s;
14282
    }
14283
    if (res > 0) {
14284
        unsigned int state = _Py_atomic_load_uint8(&_PyUnicode_STATE(r).interned);
14285
        if (state == SSTATE_INTERNED_IMMORTAL) {
14286
            Py_DECREF(s);
14287
            return r;
14288
        }
14289
        // Not yet fully interned; fall through to the locking path.
14290
        Py_DECREF(r);
14291
    }
14292
#endif
14293
1.57M
    FT_MUTEX_LOCK(INTERN_MUTEX);
14294
1.57M
    PyObject *t;
14295
1.57M
    {
14296
1.57M
        int res = PyDict_SetDefaultRef(interned, s, s, &t);
14297
1.57M
        if (res < 0) {
14298
0
            PyErr_Clear();
14299
0
            FT_MUTEX_UNLOCK(INTERN_MUTEX);
14300
0
            return s;
14301
0
        }
14302
1.57M
        else if (res == 1) {
14303
            // value was already present (not inserted)
14304
1.24M
            Py_DECREF(s);
14305
1.24M
            if (immortalize &&
14306
918k
                    PyUnicode_CHECK_INTERNED(t) == SSTATE_INTERNED_MORTAL) {
14307
3.81k
                immortalize_interned(t);
14308
3.81k
            }
14309
1.24M
            FT_MUTEX_UNLOCK(INTERN_MUTEX);
14310
1.24M
            return t;
14311
1.24M
        }
14312
323k
        else {
14313
            // value was newly inserted
14314
323k
            assert (s == t);
14315
323k
            Py_DECREF(t);
14316
323k
        }
14317
1.57M
    }
14318
14319
    /* NOT_INTERNED -> INTERNED_MORTAL */
14320
14321
1.57M
    assert(_PyUnicode_STATE(s).interned == SSTATE_NOT_INTERNED);
14322
14323
323k
    if (!_Py_IsImmortal(s)) {
14324
        /* The two references in interned dict (key and value) are not counted.
14325
        unicode_dealloc() and _PyUnicode_ClearInterned() take care of this. */
14326
323k
        Py_DECREF(s);
14327
323k
        Py_DECREF(s);
14328
323k
    }
14329
323k
    FT_ATOMIC_STORE_UINT8(_PyUnicode_STATE(s).interned, SSTATE_INTERNED_MORTAL);
14330
14331
    /* INTERNED_MORTAL -> INTERNED_IMMORTAL (if needed) */
14332
14333
#ifdef Py_DEBUG
14334
    if (_Py_IsImmortal(s)) {
14335
        assert(immortalize);
14336
    }
14337
#endif
14338
323k
    if (immortalize) {
14339
87.3k
        immortalize_interned(s);
14340
87.3k
    }
14341
14342
323k
    FT_MUTEX_UNLOCK(INTERN_MUTEX);
14343
323k
    return s;
14344
323k
}
14345
14346
void
14347
_PyUnicode_InternImmortal(PyInterpreterState *interp, PyObject **p)
14348
2.22M
{
14349
2.22M
    *p = intern_common(interp, *p, 1);
14350
2.22M
    assert(*p);
14351
2.22M
}
14352
14353
void
14354
_PyUnicode_InternMortal(PyInterpreterState *interp, PyObject **p)
14355
3.59M
{
14356
3.59M
    *p = intern_common(interp, *p, 0);
14357
3.59M
    assert(*p);
14358
3.59M
}
14359
14360
14361
void
14362
_PyUnicode_InternInPlace(PyInterpreterState *interp, PyObject **p)
14363
0
{
14364
0
    _PyUnicode_InternImmortal(interp, p);
14365
0
    return;
14366
0
}
14367
14368
void
14369
PyUnicode_InternInPlace(PyObject **p)
14370
0
{
14371
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
14372
0
    _PyUnicode_InternMortal(interp, p);
14373
0
}
14374
14375
// Public-looking name kept for the stable ABI; user should not call this:
14376
PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
14377
void
14378
PyUnicode_InternImmortal(PyObject **p)
14379
0
{
14380
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
14381
0
    _PyUnicode_InternImmortal(interp, p);
14382
0
}
14383
14384
PyObject *
14385
PyUnicode_InternFromString(const char *cp)
14386
604k
{
14387
604k
    PyObject *s = PyUnicode_FromString(cp);
14388
604k
    if (s == NULL) {
14389
0
        return NULL;
14390
0
    }
14391
604k
    PyInterpreterState *interp = _PyInterpreterState_GET();
14392
604k
    _PyUnicode_InternMortal(interp, &s);
14393
604k
    return s;
14394
604k
}
14395
14396
14397
void
14398
_PyUnicode_ClearInterned(PyInterpreterState *interp)
14399
0
{
14400
0
    PyObject *interned = get_interned_dict(interp);
14401
0
    if (interned == NULL) {
14402
0
        return;
14403
0
    }
14404
0
    assert(PyDict_CheckExact(interned));
14405
14406
0
    if (has_shared_intern_dict(interp)) {
14407
        // the dict doesn't belong to this interpreter, skip the debug
14408
        // checks on it and just clear the pointer to it
14409
0
        clear_interned_dict(interp);
14410
0
        return;
14411
0
    }
14412
14413
#ifdef INTERNED_STATS
14414
    fprintf(stderr, "releasing %zd interned strings\n",
14415
            PyDict_GET_SIZE(interned));
14416
14417
    Py_ssize_t total_length = 0;
14418
#endif
14419
0
    Py_ssize_t pos = 0;
14420
0
    PyObject *s, *ignored_value;
14421
0
    while (PyDict_Next(interned, &pos, &s, &ignored_value)) {
14422
0
        int shared = 0;
14423
0
        switch (PyUnicode_CHECK_INTERNED(s)) {
14424
0
        case SSTATE_INTERNED_IMMORTAL:
14425
            /* Make immortal interned strings mortal again. */
14426
            // Skip the Immortal Instance check and restore
14427
            // the two references (key and value) ignored
14428
            // by PyUnicode_InternInPlace().
14429
0
            _Py_SetMortal(s, 2);
14430
#ifdef Py_REF_DEBUG
14431
            /* let's be pedantic with the ref total */
14432
            _Py_IncRefTotal(_PyThreadState_GET());
14433
            _Py_IncRefTotal(_PyThreadState_GET());
14434
#endif
14435
#ifdef INTERNED_STATS
14436
            total_length += PyUnicode_GET_LENGTH(s);
14437
#endif
14438
0
            break;
14439
0
        case SSTATE_INTERNED_IMMORTAL_STATIC:
14440
            /* It is shared between interpreters, so we should unmark it
14441
               only when this is the last interpreter in which it's
14442
               interned.  We immortalize all the statically initialized
14443
               strings during startup, so we can rely on the
14444
               main interpreter to be the last one. */
14445
0
            if (!_Py_IsMainInterpreter(interp)) {
14446
0
                shared = 1;
14447
0
            }
14448
0
            break;
14449
0
        case SSTATE_INTERNED_MORTAL:
14450
            // Restore 2 references held by the interned dict; these will
14451
            // be decref'd by clear_interned_dict's PyDict_Clear.
14452
0
            _Py_RefcntAdd(s, 2);
14453
#ifdef Py_REF_DEBUG
14454
            /* let's be pedantic with the ref total */
14455
            _Py_IncRefTotal(_PyThreadState_GET());
14456
            _Py_IncRefTotal(_PyThreadState_GET());
14457
#endif
14458
0
            break;
14459
0
        case SSTATE_NOT_INTERNED:
14460
0
            _Py_FALLTHROUGH;
14461
0
        default:
14462
0
            Py_UNREACHABLE();
14463
0
        }
14464
0
        if (!shared) {
14465
0
            FT_ATOMIC_STORE_UINT8_RELAXED(_PyUnicode_STATE(s).interned, SSTATE_NOT_INTERNED);
14466
0
        }
14467
0
    }
14468
#ifdef INTERNED_STATS
14469
    fprintf(stderr,
14470
            "total length of all interned strings: %zd characters\n",
14471
            total_length);
14472
#endif
14473
14474
0
    struct _Py_unicode_state *state = &interp->unicode;
14475
0
    struct _Py_unicode_ids *ids = &state->ids;
14476
0
    for (Py_ssize_t i=0; i < ids->size; i++) {
14477
0
        Py_XINCREF(ids->array[i]);
14478
0
    }
14479
0
    clear_interned_dict(interp);
14480
0
    if (_Py_IsMainInterpreter(interp)) {
14481
0
        clear_global_interned_strings();
14482
0
    }
14483
0
}
14484
14485
14486
/********************* Unicode Iterator **************************/
14487
14488
typedef struct {
14489
    PyObject_HEAD
14490
    Py_ssize_t it_index;
14491
    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
14492
} unicodeiterobject;
14493
14494
static void
14495
unicodeiter_dealloc(PyObject *op)
14496
635k
{
14497
635k
    unicodeiterobject *it = (unicodeiterobject *)op;
14498
635k
    _PyObject_GC_UNTRACK(it);
14499
635k
    Py_XDECREF(it->it_seq);
14500
635k
    PyObject_GC_Del(it);
14501
635k
}
14502
14503
static int
14504
unicodeiter_traverse(PyObject *op, visitproc visit, void *arg)
14505
0
{
14506
0
    unicodeiterobject *it = (unicodeiterobject *)op;
14507
0
    Py_VISIT(it->it_seq);
14508
0
    return 0;
14509
0
}
14510
14511
/* tp_iternext for the general str iterator.

   Returns the character at the current index (via unicode_char()) and
   advances.  Once the index reaches the string length, the string
   reference is released, it_seq is set to NULL, and NULL is returned
   without setting an exception. */
static PyObject *
unicodeiter_next(PyObject *op)
{
    unicodeiterobject *it = (unicodeiterobject *)op;
    assert(it != NULL);

    PyObject *seq = it->it_seq;
    if (seq == NULL) {
        return NULL;
    }
    assert(_PyUnicode_CHECK(seq));

    if (it->it_index >= PyUnicode_GET_LENGTH(seq)) {
        /* Exhausted: drop the string so later calls return immediately. */
        it->it_seq = NULL;
        Py_DECREF(seq);
        return NULL;
    }

    int kind = PyUnicode_KIND(seq);
    const void *data = PyUnicode_DATA(seq);
    Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
    it->it_index++;
    return unicode_char(chr);
}
14535
14536
/* tp_iternext for the compact-ASCII str iterator.

   Reads the next byte from the data that directly follows the
   PyASCIIObject header and returns the matching entry of the ascii
   singleton table.  On exhaustion the string reference is released,
   it_seq is set to NULL, and NULL is returned. */
static PyObject *
unicode_ascii_iter_next(PyObject *op)
{
    unicodeiterobject *it = (unicodeiterobject *)op;
    assert(it != NULL);

    PyObject *seq = it->it_seq;
    if (seq == NULL) {
        return NULL;
    }
    assert(_PyUnicode_CHECK(seq));
    assert(PyUnicode_IS_COMPACT_ASCII(seq));

    if (it->it_index >= PyUnicode_GET_LENGTH(seq)) {
        it->it_seq = NULL;
        Py_DECREF(seq);
        return NULL;
    }

    const void *data = ((void*)(_PyASCIIObject_CAST(seq) + 1));
    Py_UCS1 chr = (Py_UCS1)PyUnicode_READ(PyUnicode_1BYTE_KIND,
                                          data, it->it_index);
    it->it_index++;
    return (PyObject*)&_Py_SINGLETON(strings).ascii[chr];
}
14558
14559
/* __length_hint__: string length minus the current index, or 0 once the
   iterator no longer references a string. */
static PyObject *
unicodeiter_len(PyObject *op, PyObject *Py_UNUSED(ignored))
{
    unicodeiterobject *it = (unicodeiterobject *)op;
    Py_ssize_t remaining = it->it_seq
        ? PyUnicode_GET_LENGTH(it->it_seq) - it->it_index
        : 0;
    return PyLong_FromSsize_t(remaining);
}
14568
14569
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14570
14571
/* __reduce__: build (iter, (seq,), index) for a live iterator, or
   (iter, (empty_str,)) for an exhausted one, where `iter` is the builtin
   looked up by name. */
static PyObject *
unicodeiter_reduce(PyObject *op, PyObject *Py_UNUSED(ignored))
{
    unicodeiterobject *it = (unicodeiterobject *)op;

    /* _PyEval_GetBuiltin can invoke arbitrary code, so the call must
     * happen before the iterator's fields are read.
     * see issue #101765 */
    PyObject *iter = _PyEval_GetBuiltin(&_Py_ID(iter));

    if (it->it_seq == NULL) {
        PyObject *empty = _PyUnicode_GetEmpty();
        if (empty == NULL) {
            Py_XDECREF(iter);
            return NULL;
        }
        return Py_BuildValue("N(N)", iter, empty);
    }

    return Py_BuildValue("N(O)n", iter, it->it_seq, it->it_index);
}
14592
14593
PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14594
14595
/* __setstate__: set the iterator position from an integer `state`.
   The index is clamped to [0, len(seq)]; if the iterator is already
   exhausted the value is ignored.  Returns None, or NULL if `state`
   cannot be converted to Py_ssize_t. */
static PyObject *
unicodeiter_setstate(PyObject *op, PyObject *state)
{
    unicodeiterobject *it = (unicodeiterobject *)op;

    Py_ssize_t index = PyLong_AsSsize_t(state);
    if (index == -1 && PyErr_Occurred()) {
        return NULL;
    }

    if (it->it_seq != NULL) {
        Py_ssize_t limit = PyUnicode_GET_LENGTH(it->it_seq);
        if (index < 0) {
            index = 0;
        }
        else if (index > limit) {
            index = limit; /* iterator truncated */
        }
        it->it_index = index;
    }
    Py_RETURN_NONE;
}
14611
14612
PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14613
14614
static PyMethodDef unicodeiter_methods[] = {
14615
    {"__length_hint__", unicodeiter_len, METH_NOARGS, length_hint_doc},
14616
    {"__reduce__",      unicodeiter_reduce, METH_NOARGS, reduce_doc},
14617
    {"__setstate__",    unicodeiter_setstate, METH_O, setstate_doc},
14618
    {NULL,      NULL}       /* sentinel */
14619
};
14620
14621
PyTypeObject PyUnicodeIter_Type = {
14622
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
14623
    "str_iterator",         /* tp_name */
14624
    sizeof(unicodeiterobject),      /* tp_basicsize */
14625
    0,                  /* tp_itemsize */
14626
    /* methods */
14627
    unicodeiter_dealloc,/* tp_dealloc */
14628
    0,                  /* tp_vectorcall_offset */
14629
    0,                  /* tp_getattr */
14630
    0,                  /* tp_setattr */
14631
    0,                  /* tp_as_async */
14632
    0,                  /* tp_repr */
14633
    0,                  /* tp_as_number */
14634
    0,                  /* tp_as_sequence */
14635
    0,                  /* tp_as_mapping */
14636
    0,                  /* tp_hash */
14637
    0,                  /* tp_call */
14638
    0,                  /* tp_str */
14639
    PyObject_GenericGetAttr,        /* tp_getattro */
14640
    0,                  /* tp_setattro */
14641
    0,                  /* tp_as_buffer */
14642
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14643
    0,                  /* tp_doc */
14644
    unicodeiter_traverse, /* tp_traverse */
14645
    0,                  /* tp_clear */
14646
    0,                  /* tp_richcompare */
14647
    0,                  /* tp_weaklistoffset */
14648
    PyObject_SelfIter,          /* tp_iter */
14649
    unicodeiter_next,   /* tp_iternext */
14650
    unicodeiter_methods,            /* tp_methods */
14651
    0,
14652
};
14653
14654
PyTypeObject _PyUnicodeASCIIIter_Type = {
14655
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
14656
    .tp_name = "str_ascii_iterator",
14657
    .tp_basicsize = sizeof(unicodeiterobject),
14658
    .tp_dealloc = unicodeiter_dealloc,
14659
    .tp_getattro = PyObject_GenericGetAttr,
14660
    .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,
14661
    .tp_traverse = unicodeiter_traverse,
14662
    .tp_iter = PyObject_SelfIter,
14663
    .tp_iternext = unicode_ascii_iter_next,
14664
    .tp_methods = unicodeiter_methods,
14665
};
14666
14667
/* Create a new iterator over the str object `seq`.

   Compact ASCII strings get an instance of _PyUnicodeASCIIIter_Type, every
   other string an instance of PyUnicodeIter_Type.  The iterator holds a
   strong reference to `seq` and starts at index 0.

   Returns the new iterator, or NULL if `seq` is not a str (after calling
   PyErr_BadInternalCall()) or if the allocation fails. */
static PyObject *
unicode_iter(PyObject *seq)
{
    if (!PyUnicode_Check(seq)) {
        PyErr_BadInternalCall();
        return NULL;
    }

    /* Pick the iterator flavour first so that there is one allocation site. */
    PyTypeObject *iter_type = PyUnicode_IS_COMPACT_ASCII(seq)
                                  ? &_PyUnicodeASCIIIter_Type
                                  : &PyUnicodeIter_Type;

    unicodeiterobject *iter = PyObject_GC_New(unicodeiterobject, iter_type);
    if (iter == NULL) {
        return NULL;
    }

    iter->it_index = 0;
    iter->it_seq = Py_NewRef(seq);
    /* Only start GC tracking once every field is initialized. */
    _PyObject_GC_TRACK(iter);
    return (PyObject *)iter;
}
14689
14690
static int
14691
encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
14692
88
{
14693
88
    int res;
14694
88
    res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
14695
88
    if (res == -2) {
14696
0
        PyErr_Format(PyExc_RuntimeError, "cannot encode %s", name);
14697
0
        return -1;
14698
0
    }
14699
88
    if (res < 0) {
14700
0
        PyErr_NoMemory();
14701
0
        return -1;
14702
0
    }
14703
88
    return 0;
14704
88
}
14705
14706
14707
static int
14708
config_get_codec_name(wchar_t **config_encoding)
14709
44
{
14710
44
    char *encoding;
14711
44
    if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
14712
0
        return -1;
14713
0
    }
14714
14715
44
    PyObject *name_obj = NULL;
14716
44
    PyObject *codec = _PyCodec_Lookup(encoding);
14717
44
    PyMem_RawFree(encoding);
14718
14719
44
    if (!codec)
14720
0
        goto error;
14721
14722
44
    name_obj = PyObject_GetAttrString(codec, "name");
14723
44
    Py_CLEAR(codec);
14724
44
    if (!name_obj) {
14725
0
        goto error;
14726
0
    }
14727
14728
44
    wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
14729
44
    Py_DECREF(name_obj);
14730
44
    if (wname == NULL) {
14731
0
        goto error;
14732
0
    }
14733
14734
44
    wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
14735
44
    if (raw_wname == NULL) {
14736
0
        PyMem_Free(wname);
14737
0
        PyErr_NoMemory();
14738
0
        goto error;
14739
0
    }
14740
14741
44
    PyMem_RawFree(*config_encoding);
14742
44
    *config_encoding = raw_wname;
14743
14744
44
    PyMem_Free(wname);
14745
44
    return 0;
14746
14747
0
error:
14748
0
    Py_XDECREF(codec);
14749
0
    Py_XDECREF(name_obj);
14750
0
    return -1;
14751
44
}
14752
14753
14754
static PyStatus
14755
init_stdio_encoding(PyInterpreterState *interp)
14756
22
{
14757
    /* Update the stdio encoding to the normalized Python codec name. */
14758
22
    PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
14759
22
    if (config_get_codec_name(&config->stdio_encoding) < 0) {
14760
0
        return _PyStatus_ERR("failed to get the Python codec name "
14761
0
                             "of the stdio encoding");
14762
0
    }
14763
22
    return _PyStatus_OK();
14764
22
}
14765
14766
14767
static int
14768
init_fs_codec(PyInterpreterState *interp)
14769
22
{
14770
22
    const PyConfig *config = _PyInterpreterState_GetConfig(interp);
14771
14772
22
    _Py_error_handler error_handler;
14773
22
    error_handler = get_error_handler_wide(config->filesystem_errors);
14774
22
    if (error_handler == _Py_ERROR_UNKNOWN) {
14775
0
        PyErr_SetString(PyExc_RuntimeError, "unknown filesystem error handler");
14776
0
        return -1;
14777
0
    }
14778
14779
22
    char *encoding, *errors;
14780
22
    if (encode_wstr_utf8(config->filesystem_encoding,
14781
22
                         &encoding,
14782
22
                         "filesystem_encoding") < 0) {
14783
0
        return -1;
14784
0
    }
14785
14786
22
    if (encode_wstr_utf8(config->filesystem_errors,
14787
22
                         &errors,
14788
22
                         "filesystem_errors") < 0) {
14789
0
        PyMem_RawFree(encoding);
14790
0
        return -1;
14791
0
    }
14792
14793
22
    struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
14794
22
    PyMem_RawFree(fs_codec->encoding);
14795
22
    fs_codec->encoding = encoding;
14796
    /* encoding has been normalized by init_fs_encoding() */
14797
22
    fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
14798
22
    PyMem_RawFree(fs_codec->errors);
14799
22
    fs_codec->errors = errors;
14800
22
    fs_codec->error_handler = error_handler;
14801
14802
#ifdef _Py_FORCE_UTF8_FS_ENCODING
14803
    assert(fs_codec->utf8 == 1);
14804
#endif
14805
14806
    /* At this point, PyUnicode_EncodeFSDefault() and
14807
       PyUnicode_DecodeFSDefault() can now use the Python codec rather than
14808
       the C implementation of the filesystem encoding. */
14809
14810
    /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
14811
       global configuration variables. */
14812
22
    if (_Py_IsMainInterpreter(interp)) {
14813
14814
22
        if (_Py_SetFileSystemEncoding(fs_codec->encoding,
14815
22
                                      fs_codec->errors) < 0) {
14816
0
            PyErr_NoMemory();
14817
0
            return -1;
14818
0
        }
14819
22
    }
14820
22
    return 0;
14821
22
}
14822
14823
14824
static PyStatus
14825
init_fs_encoding(PyThreadState *tstate)
14826
22
{
14827
22
    PyInterpreterState *interp = tstate->interp;
14828
14829
    /* Update the filesystem encoding to the normalized Python codec name.
14830
       For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
14831
       (Python codec name). */
14832
22
    PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
14833
22
    if (config_get_codec_name(&config->filesystem_encoding) < 0) {
14834
0
        _Py_DumpPathConfig(tstate);
14835
0
        return _PyStatus_ERR("failed to get the Python codec "
14836
0
                             "of the filesystem encoding");
14837
0
    }
14838
14839
22
    if (init_fs_codec(interp) < 0) {
14840
0
        return _PyStatus_ERR("cannot initialize filesystem codec");
14841
0
    }
14842
22
    return _PyStatus_OK();
14843
22
}
14844
14845
14846
PyStatus
14847
_PyUnicode_InitEncodings(PyThreadState *tstate)
14848
22
{
14849
22
    PyStatus status = _PyCodec_InitRegistry(tstate->interp);
14850
22
    if (_PyStatus_EXCEPTION(status)) {
14851
0
        return status;
14852
0
    }
14853
22
    status = init_fs_encoding(tstate);
14854
22
    if (_PyStatus_EXCEPTION(status)) {
14855
0
        return status;
14856
0
    }
14857
14858
22
    return init_stdio_encoding(tstate->interp);
14859
22
}
14860
14861
14862
static void
14863
_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
14864
0
{
14865
0
    PyMem_RawFree(fs_codec->encoding);
14866
0
    fs_codec->encoding = NULL;
14867
0
    fs_codec->utf8 = 0;
14868
0
    PyMem_RawFree(fs_codec->errors);
14869
0
    fs_codec->errors = NULL;
14870
0
    fs_codec->error_handler = _Py_ERROR_UNKNOWN;
14871
0
}
14872
14873
14874
#ifdef MS_WINDOWS
/* Switch the filesystem encoding of the current interpreter to
   mbcs/replace (PEP 529) and re-initialize the filesystem codec.

   Returns the result of init_fs_codec() on success, or -1 after calling
   PyErr_NoMemory() if one of the new strings cannot be allocated; in that
   case the configuration is left unchanged. */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    PyConfig *config = (PyConfig *)_PyInterpreterState_GetConfig(interp);

    /* Allocate both replacement strings up front so that the configuration
       is only modified once nothing can fail anymore. */
    wchar_t *new_encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *new_errors = _PyMem_RawWcsdup(L"replace");
    if (!(new_encoding != NULL && new_errors != NULL)) {
        PyMem_RawFree(new_encoding);
        PyMem_RawFree(new_errors);
        PyErr_NoMemory();
        return -1;
    }

    PyMem_RawFree(config->filesystem_encoding);
    config->filesystem_encoding = new_encoding;

    PyMem_RawFree(config->filesystem_errors);
    config->filesystem_errors = new_errors;

    return init_fs_codec(interp);
}
#endif
14899
14900
14901
#ifdef Py_DEBUG
/* Debug-build helper: return 1 if get_interned_dict() yields NULL for the
   main interpreter, 0 otherwise. */
static inline int
unicode_is_finalizing(void)
{
    PyInterpreterState *main_interp = _PyInterpreterState_Main();
    return get_interned_dict(main_interp) == NULL;
}
#endif
14908
14909
14910
void
14911
_PyUnicode_FiniTypes(PyInterpreterState *interp)
14912
0
{
14913
0
    _PyStaticType_FiniBuiltin(interp, &EncodingMapType);
14914
0
    _PyStaticType_FiniBuiltin(interp, &PyFieldNameIter_Type);
14915
0
    _PyStaticType_FiniBuiltin(interp, &PyFormatterIter_Type);
14916
0
}
14917
14918
14919
void
14920
_PyUnicode_Fini(PyInterpreterState *interp)
14921
0
{
14922
0
    struct _Py_unicode_state *state = &interp->unicode;
14923
14924
0
    if (!has_shared_intern_dict(interp)) {
14925
        // _PyUnicode_ClearInterned() must be called before _PyUnicode_Fini()
14926
0
        assert(get_interned_dict(interp) == NULL);
14927
0
    }
14928
14929
0
    _PyUnicode_FiniEncodings(&state->fs_codec);
14930
14931
    // bpo-47182: force a unicodedata CAPI capsule re-import on
14932
    // subsequent initialization of interpreter.
14933
0
    interp->unicode.ucnhash_capi = NULL;
14934
14935
0
    unicode_clear_identifiers(state);
14936
0
}
14937
14938
/* A _string module, to export formatter_parser and formatter_field_name_split
14939
   to the string.Formatter class implemented in Python. */
14940
14941
static PyMethodDef _string_methods[] = {
14942
    {"formatter_field_name_split", formatter_field_name_split,
14943
     METH_O, PyDoc_STR("split the argument as a field name")},
14944
    {"formatter_parser", formatter_parser,
14945
     METH_O, PyDoc_STR("parse the argument as a format string")},
14946
    {NULL, NULL}
14947
};
14948
14949
static PyModuleDef_Slot module_slots[] = {
14950
    _Py_ABI_SLOT,
14951
    {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
14952
    {Py_mod_gil, Py_MOD_GIL_NOT_USED},
14953
    {0, NULL}
14954
};
14955
14956
static struct PyModuleDef _string_module = {
14957
    PyModuleDef_HEAD_INIT,
14958
    .m_name = "_string",
14959
    .m_doc = PyDoc_STR("string helper module"),
14960
    .m_size = 0,
14961
    .m_methods = _string_methods,
14962
    .m_slots = module_slots,
14963
};
14964
14965
PyMODINIT_FUNC
14966
PyInit__string(void)
14967
0
{
14968
0
    return PyModuleDef_Init(&_string_module);
14969
0
}
14970
14971
14972
#undef PyUnicode_KIND
14973
int PyUnicode_KIND(PyObject *op)
14974
0
{
14975
0
    if (!PyUnicode_Check(op)) {
14976
0
        PyErr_Format(PyExc_TypeError, "expect str, got %T", op);
14977
0
        return -1;
14978
0
    }
14979
0
    return _PyASCIIObject_CAST(op)->state.kind;
14980
0
}
14981
14982
#undef PyUnicode_DATA
14983
void* PyUnicode_DATA(PyObject *op)
14984
0
{
14985
0
    if (!PyUnicode_Check(op)) {
14986
0
        PyErr_Format(PyExc_TypeError, "expect str, got %T", op);
14987
0
        return NULL;
14988
0
    }
14989
0
    return _PyUnicode_DATA(op);
14990
0
}