Coverage Report

Created: 2025-10-10 06:57

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/cpython3/Objects/unicodeobject.c
Line
Count
Source
1
/*
2
3
Unicode implementation based on original code by Fredrik Lundh,
4
modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6
Major speed upgrades to the method implementations at the Reykjavik
7
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9
Copyright (c) Corporation for National Research Initiatives.
10
11
--------------------------------------------------------------------
12
The original string type implementation is:
13
14
  Copyright (c) 1999 by Secret Labs AB
15
  Copyright (c) 1999 by Fredrik Lundh
16
17
By obtaining, using, and/or copying this software and/or its
18
associated documentation, you agree that you have read, understood,
19
and will comply with the following terms and conditions:
20
21
Permission to use, copy, modify, and distribute this software and its
22
associated documentation for any purpose and without fee is hereby
23
granted, provided that the above copyright notice appears in all
24
copies, and that both that copyright notice and this permission notice
25
appear in supporting documentation, and that the name of Secret Labs
26
AB or the author not be used in advertising or publicity pertaining to
27
distribution of the software without specific, written prior
28
permission.
29
30
SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32
FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37
--------------------------------------------------------------------
38
39
*/
40
41
#include "Python.h"
42
#include "pycore_abstract.h"      // _PyIndex_Check()
43
#include "pycore_bytes_methods.h" // _Py_bytes_lower()
44
#include "pycore_bytesobject.h"   // _PyBytes_Repeat()
45
#include "pycore_ceval.h"         // _PyEval_GetBuiltin()
46
#include "pycore_codecs.h"        // _PyCodec_Lookup()
47
#include "pycore_critical_section.h" // Py_*_CRITICAL_SECTION_SEQUENCE_FAST
48
#include "pycore_format.h"        // F_LJUST
49
#include "pycore_freelist.h"      // _Py_FREELIST_FREE(), _Py_FREELIST_POP()
50
#include "pycore_initconfig.h"    // _PyStatus_OK()
51
#include "pycore_interp.h"        // PyInterpreterState.fs_codec
52
#include "pycore_long.h"          // _PyLong_FormatWriter()
53
#include "pycore_object.h"        // _PyObject_GC_TRACK(), _Py_FatalRefcountError()
54
#include "pycore_pathconfig.h"    // _Py_DumpPathConfig()
55
#include "pycore_pyerrors.h"      // _PyUnicodeTranslateError_Create()
56
#include "pycore_pyhash.h"        // _Py_HashSecret_t
57
#include "pycore_pylifecycle.h"   // _Py_SetFileSystemEncoding()
58
#include "pycore_pystate.h"       // _PyInterpreterState_GET()
59
#include "pycore_tuple.h"         // _PyTuple_FromArray()
60
#include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
61
#include "pycore_unicodeobject.h" // struct _Py_unicode_state
62
#include "pycore_unicodeobject_generated.h"  // _PyUnicode_InitStaticStrings()
63
64
#include "stringlib/eq.h"         // unicode_eq()
65
#include <stddef.h>               // ptrdiff_t
66
67
#ifdef MS_WINDOWS
68
#include <windows.h>
69
#endif
70
71
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
72
#  include "pycore_fileutils.h"   // _Py_LocaleUsesNonUnicodeWchar()
73
#endif
74
75
/* Uncomment to display statistics on interned strings at exit
76
   in _PyUnicode_ClearInterned(). */
77
/* #define INTERNED_STATS 1 */
78
79
80
/*[clinic input]
81
class str "PyObject *" "&PyUnicode_Type"
82
[clinic start generated code]*/
83
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
84
85
/*[python input]
86
class Py_UCS4_converter(CConverter):
87
    type = 'Py_UCS4'
88
    converter = 'convert_uc'
89
90
    def converter_init(self):
91
        if self.default is not unspecified:
92
            self.c_default = ascii(self.default)
93
            if len(self.c_default) > 4 or self.c_default[0] != "'":
94
                self.c_default = hex(ord(self.default))
95
96
[python start generated code]*/
97
/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
98
99
/* --- Globals ------------------------------------------------------------
100
101
NOTE: In the interpreter's initialization phase, some globals are currently
102
      initialized dynamically as needed. In the process Unicode objects may
103
      be created before the Unicode type is ready.
104
105
*/
106
107
4.93M
/* File-local shorthand for _Py_MAX_UNICODE. */
#define MAX_UNICODE _Py_MAX_UNICODE
108
109
/* Check used inside assert(): a plain type check in release builds, a
   header consistency check (without content scan) in Py_DEBUG builds. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
114
115
/* Read the `utf8` pointer stored in the compact-unicode header of `op`
   through FT_ATOMIC_LOAD_PTR_ACQUIRE. No compact-ASCII special casing is
   done here; see PyUnicode_UTF8() for that. */
static inline char *
_PyUnicode_UTF8(PyObject *op)
{
    PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
    return FT_ATOMIC_LOAD_PTR_ACQUIRE(compact->utf8);
}
119
120
/* Return the UTF-8 buffer of `op`.

   For a compact ASCII string this is the memory located immediately after
   the PyASCIIObject header; for every other string it is whatever
   _PyUnicode_UTF8() reads from the object. */
static inline char *
PyUnicode_UTF8(PyObject *op)
{
    assert(_PyUnicode_CHECK(op));
    if (!PyUnicode_IS_COMPACT_ASCII(op)) {
        return _PyUnicode_UTF8(op);
    }
    return (char *)(_PyASCIIObject_CAST(op) + 1);
}
130
131
/* Store `utf8` into the `utf8` field of the compact-unicode header of `op`
   through FT_ATOMIC_STORE_PTR_RELEASE. */
static inline void
PyUnicode_SET_UTF8(PyObject *op, char *utf8)
{
    PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
    FT_ATOMIC_STORE_PTR_RELEASE(compact->utf8, utf8);
}
135
136
/* Return the length of the UTF-8 representation of `op`: the `length` field
   for compact ASCII strings, the `utf8_length` field otherwise. */
static inline Py_ssize_t
PyUnicode_UTF8_LENGTH(PyObject *op)
{
    assert(_PyUnicode_CHECK(op));
    return PyUnicode_IS_COMPACT_ASCII(op)
        ? _PyASCIIObject_CAST(op)->length
        : _PyCompactUnicodeObject_CAST(op)->utf8_length;
}
146
147
/* Overwrite the `utf8_length` field of the compact-unicode header of `op`
   with a plain (non-atomic) store. */
static inline void
PyUnicode_SET_UTF8_LENGTH(PyObject *op, Py_ssize_t length)
{
    PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
    compact->utf8_length = length;
}
151
152
/* Accessors for the `length`, `state` and `hash` fields of the
   PyASCIIObject header of `op`. */
#define _PyUnicode_LENGTH(op) (_PyASCIIObject_CAST(op)->length)
#define _PyUnicode_STATE(op)  (_PyASCIIObject_CAST(op)->state)
#define _PyUnicode_HASH(op)   (_PyASCIIObject_CAST(op)->hash)
158
159
26.1M
/* File-local shorthand for PyUnstable_Unicode_GET_CACHED_HASH. */
#define PyUnicode_HASH PyUnstable_Unicode_GET_CACHED_HASH
160
161
/* Store `hash` into the `hash` field of the PyASCIIObject header of `op`
   through FT_ATOMIC_STORE_SSIZE_RELAXED. */
static inline void
PyUnicode_SET_HASH(PyObject *op, Py_hash_t hash)
{
    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
    FT_ATOMIC_STORE_SSIZE_RELAXED(ascii->hash, hash);
}
165
166
/* Accessor for the `data.any` pointer of a (non-compact) PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op) (_PyUnicodeObject_CAST(op)->data.any)
168
169
/* Return non-zero if the UTF-8 pointer of `op` is the very same address as
   its character data, i.e. no separate UTF-8 buffer is in use.
   Must not be called on compact ASCII strings (asserted). */
static inline int
_PyUnicode_SHARE_UTF8(PyObject *op)
{
    assert(_PyUnicode_CHECK(op));
    assert(!PyUnicode_IS_COMPACT_ASCII(op));

    char *utf8 = _PyUnicode_UTF8(op);
    return utf8 == PyUnicode_DATA(op);
}
175
176
/* true if the Unicode object has an allocated UTF-8 memory block
177
   (not shared with other data) */
178
static inline int _PyUnicode_HAS_UTF8_MEMORY(PyObject *op)
179
51.9M
{
180
51.9M
    return (!PyUnicode_IS_COMPACT_ASCII(op)
181
30.8M
            && _PyUnicode_UTF8(op) != NULL
182
7.36k
            && _PyUnicode_UTF8(op) != PyUnicode_DATA(op));
183
51.9M
}
184
185
186
/* Generic helper macro that copies a run of characters while converting
   each one from `from_type` to `to_type`.

   `from_type` and `to_type` must be valid type names.  `begin` and `end`
   delimit the source characters and should be of type "from_type *".
   `to` points to the destination buffer ("to_type *") that receives the
   converted characters.

   The bulk of the input is processed four characters per iteration; the
   remaining zero to three characters are handled one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                             \
        to_type *_to = (to_type *)(to);                              \
        const from_type *_iter = (const from_type *)(begin);         \
        const from_type *_end = (const from_type *)(end);            \
        Py_ssize_t n = (_end) - (_iter);                             \
        const from_type *_unrolled_end =                             \
            _iter + _Py_SIZE_ROUND_DOWN(n, 4);                       \
        for (; _iter < (_unrolled_end); _iter += 4, _to += 4) {      \
            _to[0] = (to_type) _iter[0];                             \
            _to[1] = (to_type) _iter[1];                             \
            _to[2] = (to_type) _iter[2];                             \
            _to[3] = (to_type) _iter[3];                             \
        }                                                            \
        for (; _iter < (_end); _iter++, _to++) {                     \
            *_to = (to_type) *_iter;                                 \
        }                                                            \
    } while (0)
209
210
162M
/* File-local shorthand for _Py_LATIN1_CHR. */
#define LATIN1 _Py_LATIN1_CHR
211
212
#ifndef MS_WINDOWS
   /* On Linux (and any other non-Windows platform, which shares this
      branch), overallocating by 25% is the best factor. */
#  define OVERALLOCATE_FACTOR 4
#else
   /* On Windows, overallocating by 50% is the best factor. */
#  define OVERALLOCATE_FACTOR 2
#endif
219
220
/* Forward declaration */
221
static inline int
222
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
223
static inline void
224
_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
225
static PyObject *
226
unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
227
                    const char *errors);
228
static PyObject *
229
unicode_decode_utf8(const char *s, Py_ssize_t size,
230
                    _Py_error_handler error_handler, const char *errors,
231
                    Py_ssize_t *consumed);
232
static int
233
unicode_decode_utf8_writer(_PyUnicodeWriter *writer,
234
                           const char *s, Py_ssize_t size,
235
                           _Py_error_handler error_handler, const char *errors,
236
                           Py_ssize_t *consumed);
237
#ifdef Py_DEBUG
238
static inline int unicode_is_finalizing(void);
239
static int unicode_is_singleton(PyObject *unicode);
240
#endif
241
242
243
// Return a reference to the immortal empty string singleton.
244
static inline PyObject* unicode_get_empty(void)
245
38.4M
{
246
38.4M
    _Py_DECLARE_STR(empty, "");
247
38.4M
    return &_Py_STR(empty);
248
38.4M
}
249
250
/* This dictionary holds per-interpreter interned strings.
251
 * See InternalDocs/string_interning.md for details.
252
 */
253
static inline PyObject *get_interned_dict(PyInterpreterState *interp)
254
3.29M
{
255
3.29M
    return _Py_INTERP_CACHED_OBJECT(interp, interned_strings);
256
3.29M
}
257
258
/* This hashtable holds statically allocated interned strings.
259
 * See InternalDocs/string_interning.md for details.
260
 */
261
3.27M
/* Stored in _PyRuntime rather than in an interpreter state. */
#define INTERNED_STRINGS _PyRuntime.cached_objects.interned_strings
262
263
/* Get number of all interned strings for the current interpreter. */
264
Py_ssize_t
265
_PyUnicode_InternedSize(void)
266
0
{
267
0
    PyObject *dict = get_interned_dict(_PyInterpreterState_GET());
268
0
    return _Py_hashtable_len(INTERNED_STRINGS) + PyDict_GET_SIZE(dict);
269
0
}
270
271
/* Get number of immortal interned strings for the current interpreter. */
272
Py_ssize_t
273
_PyUnicode_InternedSize_Immortal(void)
274
0
{
275
0
    PyObject *dict = get_interned_dict(_PyInterpreterState_GET());
276
0
    PyObject *key, *value;
277
0
    Py_ssize_t pos = 0;
278
0
    Py_ssize_t count = 0;
279
280
    // It's tempting to keep a count and avoid a loop here. But, this function
281
    // is intended for refleak tests. It spends extra work to report the true
282
    // value, to help detect bugs in optimizations.
283
284
0
    while (PyDict_Next(dict, &pos, &key, &value)) {
285
0
        assert(PyUnicode_CHECK_INTERNED(key) != SSTATE_INTERNED_IMMORTAL_STATIC);
286
0
        if (PyUnicode_CHECK_INTERNED(key) == SSTATE_INTERNED_IMMORTAL) {
287
0
           count++;
288
0
       }
289
0
    }
290
0
    return _Py_hashtable_len(INTERNED_STRINGS) + count;
291
0
}
292
293
static Py_hash_t unicode_hash(PyObject *);
294
295
static Py_uhash_t
296
hashtable_unicode_hash(const void *key)
297
3.34M
{
298
3.34M
    return unicode_hash((PyObject *)key);
299
3.34M
}
300
301
static int
302
hashtable_unicode_compare(const void *key1, const void *key2)
303
170k
{
304
170k
    PyObject *obj1 = (PyObject *)key1;
305
170k
    PyObject *obj2 = (PyObject *)key2;
306
170k
    if (obj1 != NULL && obj2 != NULL) {
307
170k
        return unicode_eq(obj1, obj2);
308
170k
    }
309
0
    else {
310
0
        return obj1 == obj2;
311
0
    }
312
170k
}
313
314
/* Return true if this interpreter should share the main interpreter's
315
   intern_dict.  That's important for interpreters which load basic
316
   single-phase init extension modules (m_size == -1).  There could be interned
317
   immortal strings that are shared between interpreters, due to the
318
   PyDict_Update(mdict, m_copy) call in import_find_extension().
319
320
   It's not safe to deallocate those strings until all interpreters that
321
   potentially use them are freed.  By storing them in the main interpreter, we
322
   ensure they get freed after all other interpreters are freed.
323
*/
324
static bool
325
has_shared_intern_dict(PyInterpreterState *interp)
326
22
{
327
22
    PyInterpreterState *main_interp = _PyInterpreterState_Main();
328
22
    return interp != main_interp  && interp->feature_flags & Py_RTFLAGS_USE_MAIN_OBMALLOC;
329
22
}
330
331
static int
332
init_interned_dict(PyInterpreterState *interp)
333
22
{
334
22
    assert(get_interned_dict(interp) == NULL);
335
22
    PyObject *interned;
336
22
    if (has_shared_intern_dict(interp)) {
337
0
        interned = get_interned_dict(_PyInterpreterState_Main());
338
0
        Py_INCREF(interned);
339
0
    }
340
22
    else {
341
22
        interned = PyDict_New();
342
22
        if (interned == NULL) {
343
0
            return -1;
344
0
        }
345
22
    }
346
22
    _Py_INTERP_CACHED_OBJECT(interp, interned_strings) = interned;
347
22
    return 0;
348
22
}
349
350
static void
351
clear_interned_dict(PyInterpreterState *interp)
352
0
{
353
0
    PyObject *interned = get_interned_dict(interp);
354
0
    if (interned != NULL) {
355
0
        if (!has_shared_intern_dict(interp)) {
356
            // only clear if the dict belongs to this interpreter
357
0
            PyDict_Clear(interned);
358
0
        }
359
0
        Py_DECREF(interned);
360
0
        _Py_INTERP_CACHED_OBJECT(interp, interned_strings) = NULL;
361
0
    }
362
0
}
363
364
static PyStatus
365
init_global_interned_strings(PyInterpreterState *interp)
366
22
{
367
22
    assert(INTERNED_STRINGS == NULL);
368
22
    _Py_hashtable_allocator_t hashtable_alloc = {PyMem_RawMalloc, PyMem_RawFree};
369
370
22
    INTERNED_STRINGS = _Py_hashtable_new_full(
371
22
        hashtable_unicode_hash,
372
22
        hashtable_unicode_compare,
373
        // Objects stored here are immortal and statically allocated,
374
        // so we don't need key_destroy_func & value_destroy_func:
375
22
        NULL,
376
22
        NULL,
377
22
        &hashtable_alloc
378
22
    );
379
22
    if (INTERNED_STRINGS == NULL) {
380
0
        PyErr_Clear();
381
0
        return _PyStatus_ERR("failed to create global interned dict");
382
0
    }
383
384
    /* Intern statically allocated string identifiers, deepfreeze strings,
385
        * and one-byte latin-1 strings.
386
        * This must be done before any module initialization so that statically
387
        * allocated string identifiers are used instead of heap allocated strings.
388
        * Deepfreeze uses the interned identifiers if present to save space
389
        * else generates them and they are interned to speed up dict lookups.
390
    */
391
22
    _PyUnicode_InitStaticStrings(interp);
392
393
5.65k
    for (int i = 0; i < 256; i++) {
394
5.63k
        PyObject *s = LATIN1(i);
395
5.63k
        _PyUnicode_InternStatic(interp, &s);
396
5.63k
        assert(s == LATIN1(i));
397
5.63k
    }
398
#ifdef Py_DEBUG
399
    assert(_PyUnicode_CheckConsistency(&_Py_STR(empty), 1));
400
401
    for (int i = 0; i < 256; i++) {
402
        assert(_PyUnicode_CheckConsistency(LATIN1(i), 1));
403
    }
404
#endif
405
22
    return _PyStatus_OK();
406
22
}
407
408
static void clear_global_interned_strings(void)
409
0
{
410
0
    if (INTERNED_STRINGS != NULL) {
411
0
        _Py_hashtable_destroy(INTERNED_STRINGS);
412
0
        INTERNED_STRINGS = NULL;
413
0
    }
414
0
}
415
416
/* Statement macro: return the empty string singleton from the enclosing
   function. */
#define _Py_RETURN_UNICODE_EMPTY() \
    do { return unicode_get_empty(); } while (0)
420
421
422
/* Fast detection of the most frequent whitespace characters.
   Lookup table indexed by a 7-bit character code; an entry is 1 when the
   character is whitespace and 0 otherwise (unlisted entries are zero). */
const unsigned char _Py_ascii_whitespace[128] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1,     /* SPACE */
};
452
453
/* forward */
454
static PyObject* get_latin1_char(unsigned char ch);
455
static int unicode_modifiable(PyObject *unicode);
456
457
458
static PyObject *
459
_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
460
static PyObject *
461
_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
462
static PyObject *
463
_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
464
465
static PyObject *
466
unicode_encode_call_errorhandler(const char *errors,
467
       PyObject **errorHandler,const char *encoding, const char *reason,
468
       PyObject *unicode, PyObject **exceptionObject,
469
       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
470
471
static void
472
raise_encode_exception(PyObject **exceptionObject,
473
                       const char *encoding,
474
                       PyObject *unicode,
475
                       Py_ssize_t startpos, Py_ssize_t endpos,
476
                       const char *reason);
477
478
/* Fast detection of line break characters in the 7-bit range.
   Lookup table indexed by a 7-bit character code; an entry is 1 when the
   character is a line break and 0 otherwise (unlisted entries are zero). */
static const unsigned char ascii_linebreak[128] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
};
505
506
static int convert_uc(PyObject *obj, void *addr);
507
508
struct encoding_map;
509
#include "clinic/unicodeobject.c.h"
510
511
_Py_error_handler
512
_Py_GetErrorHandler(const char *errors)
513
47.4k
{
514
47.4k
    if (errors == NULL || strcmp(errors, "strict") == 0) {
515
34.7k
        return _Py_ERROR_STRICT;
516
34.7k
    }
517
12.6k
    if (strcmp(errors, "surrogateescape") == 0) {
518
8.17k
        return _Py_ERROR_SURROGATEESCAPE;
519
8.17k
    }
520
4.48k
    if (strcmp(errors, "replace") == 0) {
521
3.64k
        return _Py_ERROR_REPLACE;
522
3.64k
    }
523
841
    if (strcmp(errors, "ignore") == 0) {
524
0
        return _Py_ERROR_IGNORE;
525
0
    }
526
841
    if (strcmp(errors, "backslashreplace") == 0) {
527
299
        return _Py_ERROR_BACKSLASHREPLACE;
528
299
    }
529
542
    if (strcmp(errors, "surrogatepass") == 0) {
530
542
        return _Py_ERROR_SURROGATEPASS;
531
542
    }
532
0
    if (strcmp(errors, "xmlcharrefreplace") == 0) {
533
0
        return _Py_ERROR_XMLCHARREFREPLACE;
534
0
    }
535
0
    return _Py_ERROR_OTHER;
536
0
}
537
538
539
static _Py_error_handler
540
get_error_handler_wide(const wchar_t *errors)
541
7.61k
{
542
7.61k
    if (errors == NULL || wcscmp(errors, L"strict") == 0) {
543
0
        return _Py_ERROR_STRICT;
544
0
    }
545
7.61k
    if (wcscmp(errors, L"surrogateescape") == 0) {
546
7.61k
        return _Py_ERROR_SURROGATEESCAPE;
547
7.61k
    }
548
0
    if (wcscmp(errors, L"replace") == 0) {
549
0
        return _Py_ERROR_REPLACE;
550
0
    }
551
0
    if (wcscmp(errors, L"ignore") == 0) {
552
0
        return _Py_ERROR_IGNORE;
553
0
    }
554
0
    if (wcscmp(errors, L"backslashreplace") == 0) {
555
0
        return _Py_ERROR_BACKSLASHREPLACE;
556
0
    }
557
0
    if (wcscmp(errors, L"surrogatepass") == 0) {
558
0
        return _Py_ERROR_SURROGATEPASS;
559
0
    }
560
0
    if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
561
0
        return _Py_ERROR_XMLCHARREFREPLACE;
562
0
    }
563
0
    return _Py_ERROR_OTHER;
564
0
}
565
566
567
static inline int
568
unicode_check_encoding_errors(const char *encoding, const char *errors)
569
176k
{
570
176k
    if (encoding == NULL && errors == NULL) {
571
0
        return 0;
572
0
    }
573
574
176k
    PyInterpreterState *interp = _PyInterpreterState_GET();
575
176k
#ifndef Py_DEBUG
576
    /* In release mode, only check in development mode (-X dev) */
577
176k
    if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
578
176k
        return 0;
579
176k
    }
580
#else
581
    /* Always check in debug mode */
582
#endif
583
584
    /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
585
       codec registry is ready: before_PyUnicode_InitEncodings() is called. */
586
0
    if (!interp->unicode.fs_codec.encoding) {
587
0
        return 0;
588
0
    }
589
590
    /* Disable checks during Python finalization. For example, it allows to
591
       call _PyObject_Dump() during finalization for debugging purpose. */
592
0
    if (_PyInterpreterState_GetFinalizing(interp) != NULL) {
593
0
        return 0;
594
0
    }
595
596
0
    if (encoding != NULL
597
        // Fast path for the most common built-in encodings. Even if the codec
598
        // is cached, _PyCodec_Lookup() decodes the bytes string from UTF-8 to
599
        // create a temporary Unicode string (the key in the cache).
600
0
        && strcmp(encoding, "utf-8") != 0
601
0
        && strcmp(encoding, "utf8") != 0
602
0
        && strcmp(encoding, "ascii") != 0)
603
0
    {
604
0
        PyObject *handler = _PyCodec_Lookup(encoding);
605
0
        if (handler == NULL) {
606
0
            return -1;
607
0
        }
608
0
        Py_DECREF(handler);
609
0
    }
610
611
0
    if (errors != NULL
612
        // Fast path for the most common built-in error handlers.
613
0
        && strcmp(errors, "strict") != 0
614
0
        && strcmp(errors, "ignore") != 0
615
0
        && strcmp(errors, "replace") != 0
616
0
        && strcmp(errors, "surrogateescape") != 0
617
0
        && strcmp(errors, "surrogatepass") != 0)
618
0
    {
619
0
        PyObject *handler = PyCodec_LookupError(errors);
620
0
        if (handler == NULL) {
621
0
            return -1;
622
0
        }
623
0
        Py_DECREF(handler);
624
0
    }
625
0
    return 0;
626
0
}
627
628
629
int
630
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
631
103M
{
632
103M
#define CHECK(expr) \
633
524M
    do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
634
635
103M
    assert(op != NULL);
636
103M
    CHECK(PyUnicode_Check(op));
637
638
103M
    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
639
0
    int kind = ascii->state.kind;
640
641
103M
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
642
42.5M
        CHECK(kind == PyUnicode_1BYTE_KIND);
643
42.5M
    }
644
61.1M
    else {
645
61.1M
        PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
646
0
        void *data;
647
648
61.1M
        if (ascii->state.compact == 1) {
649
61.1M
            data = compact + 1;
650
61.1M
            CHECK(kind == PyUnicode_1BYTE_KIND
651
61.1M
                                 || kind == PyUnicode_2BYTE_KIND
652
61.1M
                                 || kind == PyUnicode_4BYTE_KIND);
653
61.1M
            CHECK(ascii->state.ascii == 0);
654
61.1M
            CHECK(_PyUnicode_UTF8(op) != data);
655
61.1M
        }
656
49
        else {
657
49
            PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
658
659
0
            data = unicode->data.any;
660
49
            CHECK(kind == PyUnicode_1BYTE_KIND
661
49
                     || kind == PyUnicode_2BYTE_KIND
662
49
                     || kind == PyUnicode_4BYTE_KIND);
663
49
            CHECK(ascii->state.compact == 0);
664
49
            CHECK(data != NULL);
665
49
            if (ascii->state.ascii) {
666
49
                CHECK(_PyUnicode_UTF8(op) == data);
667
49
                CHECK(compact->utf8_length == ascii->length);
668
49
            }
669
0
            else {
670
0
                CHECK(_PyUnicode_UTF8(op) != data);
671
0
            }
672
49
        }
673
61.1M
#ifndef Py_GIL_DISABLED
674
61.1M
        if (_PyUnicode_UTF8(op) == NULL)
675
61.1M
            CHECK(compact->utf8_length == 0);
676
61.1M
#endif
677
61.1M
    }
678
679
    /* check that the best kind is used: O(n) operation */
680
103M
    if (check_content) {
681
51.3M
        Py_ssize_t i;
682
51.3M
        Py_UCS4 maxchar = 0;
683
51.3M
        const void *data;
684
51.3M
        Py_UCS4 ch;
685
686
51.3M
        data = PyUnicode_DATA(ascii);
687
27.5G
        for (i=0; i < ascii->length; i++)
688
27.5G
        {
689
27.5G
            ch = PyUnicode_READ(kind, data, i);
690
27.5G
            if (ch > maxchar)
691
67.1M
                maxchar = ch;
692
27.5G
        }
693
51.3M
        if (kind == PyUnicode_1BYTE_KIND) {
694
22.5M
            if (ascii->state.ascii == 0) {
695
1.49M
                CHECK(maxchar >= 128);
696
1.49M
                CHECK(maxchar <= 255);
697
1.49M
            }
698
21.0M
            else
699
21.0M
                CHECK(maxchar < 128);
700
22.5M
        }
701
28.8M
        else if (kind == PyUnicode_2BYTE_KIND) {
702
24.5M
            CHECK(maxchar >= 0x100);
703
24.5M
            CHECK(maxchar <= 0xFFFF);
704
24.5M
        }
705
4.22M
        else {
706
4.22M
            CHECK(maxchar >= 0x10000);
707
4.22M
            CHECK(maxchar <= MAX_UNICODE);
708
4.22M
        }
709
51.3M
        CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
710
51.3M
    }
711
712
    /* Check interning state */
713
#ifdef Py_DEBUG
714
    // Note that we do not check `_Py_IsImmortal(op)`, since stable ABI
715
    // extensions can make immortal strings mortal (but with a high enough
716
    // refcount).
717
    // The other way is extremely unlikely (worth a potential failed assertion
718
    // in a debug build), so we do check `!_Py_IsImmortal(op)`.
719
    switch (PyUnicode_CHECK_INTERNED(op)) {
720
        case SSTATE_NOT_INTERNED:
721
            if (ascii->state.statically_allocated) {
722
                // This state is for two exceptions:
723
                // - strings are currently checked before they're interned
724
                // - the 256 one-latin1-character strings
725
                //   are static but use SSTATE_NOT_INTERNED
726
            }
727
            else {
728
                CHECK(!_Py_IsImmortal(op));
729
            }
730
            break;
731
        case SSTATE_INTERNED_MORTAL:
732
            CHECK(!ascii->state.statically_allocated);
733
            CHECK(!_Py_IsImmortal(op));
734
            break;
735
        case SSTATE_INTERNED_IMMORTAL:
736
            CHECK(!ascii->state.statically_allocated);
737
            break;
738
        case SSTATE_INTERNED_IMMORTAL_STATIC:
739
            CHECK(ascii->state.statically_allocated);
740
            break;
741
        default:
742
            Py_UNREACHABLE();
743
    }
744
#endif
745
746
103M
    return 1;
747
748
103M
#undef CHECK
749
103M
}
750
751
/* Normalize a freshly built string before handing it out.

   An empty string is replaced by the empty singleton and a one-character
   1-byte-kind string by the matching LATIN1() singleton; in both cases the
   reference to `unicode` is released unless it already is that singleton.
   Any other string is returned unchanged. */
static PyObject*
unicode_result(PyObject *unicode)
{
    assert(_PyUnicode_CHECK(unicode));

    const Py_ssize_t length = PyUnicode_GET_LENGTH(unicode);

    if (length == 0) {
        PyObject *empty = unicode_get_empty();
        if (unicode != empty) {
            Py_DECREF(unicode);
        }
        return empty;
    }

    if (length == 1 && PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
        Py_UCS1 ch = PyUnicode_1BYTE_DATA(unicode)[0];
        PyObject *latin1_char = LATIN1(ch);
        if (unicode != latin1_char) {
            Py_DECREF(unicode);
        }
        return latin1_char;
    }

    assert(_PyUnicode_CheckConsistency(unicode, 1));
    return unicode;
}
781
782
/* Return `unicode` itself (as a new reference) when it is an exact str;
   for an instance of a subtype, return a genuine unicode string with the
   same value instead. */
static PyObject*
unicode_result_unchanged(PyObject *unicode)
{
    if (!PyUnicode_CheckExact(unicode)) {
        /* Subtype -- return genuine unicode string with the same value. */
        return _PyUnicode_Copy(unicode);
    }
    return Py_NewRef(unicode);
}
792
793
/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
794
   ASCII, Latin1, UTF-8, etc. */
795
static char*
796
backslashreplace(PyBytesWriter *writer, char *str,
797
                 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
798
10.6k
{
799
10.6k
    Py_ssize_t size, i;
800
10.6k
    Py_UCS4 ch;
801
10.6k
    int kind;
802
10.6k
    const void *data;
803
804
10.6k
    kind = PyUnicode_KIND(unicode);
805
10.6k
    data = PyUnicode_DATA(unicode);
806
807
10.6k
    size = 0;
808
    /* determine replacement size */
809
149k
    for (i = collstart; i < collend; ++i) {
810
138k
        Py_ssize_t incr;
811
812
138k
        ch = PyUnicode_READ(kind, data, i);
813
138k
        if (ch < 0x100)
814
138k
            incr = 2+2;
815
0
        else if (ch < 0x10000)
816
0
            incr = 2+4;
817
0
        else {
818
0
            assert(ch <= MAX_UNICODE);
819
0
            incr = 2+8;
820
0
        }
821
138k
        if (size > PY_SSIZE_T_MAX - incr) {
822
0
            PyErr_SetString(PyExc_OverflowError,
823
0
                            "encoded result is too long for a Python string");
824
0
            return NULL;
825
0
        }
826
138k
        size += incr;
827
138k
    }
828
829
10.6k
    str = PyBytesWriter_GrowAndUpdatePointer(writer, size, str);
830
10.6k
    if (str == NULL) {
831
0
        return NULL;
832
0
    }
833
834
    /* generate replacement */
835
149k
    for (i = collstart; i < collend; ++i) {
836
138k
        ch = PyUnicode_READ(kind, data, i);
837
138k
        *str++ = '\\';
838
138k
        if (ch >= 0x00010000) {
839
0
            *str++ = 'U';
840
0
            *str++ = Py_hexdigits[(ch>>28)&0xf];
841
0
            *str++ = Py_hexdigits[(ch>>24)&0xf];
842
0
            *str++ = Py_hexdigits[(ch>>20)&0xf];
843
0
            *str++ = Py_hexdigits[(ch>>16)&0xf];
844
0
            *str++ = Py_hexdigits[(ch>>12)&0xf];
845
0
            *str++ = Py_hexdigits[(ch>>8)&0xf];
846
0
        }
847
138k
        else if (ch >= 0x100) {
848
0
            *str++ = 'u';
849
0
            *str++ = Py_hexdigits[(ch>>12)&0xf];
850
0
            *str++ = Py_hexdigits[(ch>>8)&0xf];
851
0
        }
852
138k
        else
853
138k
            *str++ = 'x';
854
138k
        *str++ = Py_hexdigits[(ch>>4)&0xf];
855
138k
        *str++ = Py_hexdigits[ch&0xf];
856
138k
    }
857
10.6k
    return str;
858
10.6k
}
859
860
/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
861
   ASCII, Latin1, UTF-8, etc. */
862
static char*
863
xmlcharrefreplace(PyBytesWriter *writer, char *str,
864
                  PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
865
0
{
866
0
    Py_ssize_t size, i;
867
0
    Py_UCS4 ch;
868
0
    int kind;
869
0
    const void *data;
870
871
0
    kind = PyUnicode_KIND(unicode);
872
0
    data = PyUnicode_DATA(unicode);
873
874
0
    size = 0;
875
    /* determine replacement size */
876
0
    for (i = collstart; i < collend; ++i) {
877
0
        Py_ssize_t incr;
878
879
0
        ch = PyUnicode_READ(kind, data, i);
880
0
        if (ch < 10)
881
0
            incr = 2+1+1;
882
0
        else if (ch < 100)
883
0
            incr = 2+2+1;
884
0
        else if (ch < 1000)
885
0
            incr = 2+3+1;
886
0
        else if (ch < 10000)
887
0
            incr = 2+4+1;
888
0
        else if (ch < 100000)
889
0
            incr = 2+5+1;
890
0
        else if (ch < 1000000)
891
0
            incr = 2+6+1;
892
0
        else {
893
0
            assert(ch <= MAX_UNICODE);
894
0
            incr = 2+7+1;
895
0
        }
896
0
        if (size > PY_SSIZE_T_MAX - incr) {
897
0
            PyErr_SetString(PyExc_OverflowError,
898
0
                            "encoded result is too long for a Python string");
899
0
            return NULL;
900
0
        }
901
0
        size += incr;
902
0
    }
903
904
0
    str = PyBytesWriter_GrowAndUpdatePointer(writer, size, str);
905
0
    if (str == NULL) {
906
0
        return NULL;
907
0
    }
908
909
    /* generate replacement */
910
0
    for (i = collstart; i < collend; ++i) {
911
0
        size = sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
912
0
        if (size < 0) {
913
0
            return NULL;
914
0
        }
915
0
        str += size;
916
0
    }
917
0
    return str;
918
0
}
919
920
/* --- Bloom Filters ----------------------------------------------------- */
921
922
/* Stuff to implement simple "bloom filters" for Unicode characters.
923
   To keep things simple, we use a single bitmask, using the low-order
924
   bits of each Unicode character (ch & (BLOOM_WIDTH - 1)) as the bit index. */
925
926
/* the linebreak mask is set up by _PyUnicode_Init() below */
927
928
/* Number of bits used in a bloom mask: the largest of 128, 64 or 32 that
   does not exceed LONG_BIT. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

/* Integer type holding a bloom mask. */
#define BLOOM_MASK unsigned long

/* Starts with every bit set, so BLOOM(bloom_linebreak, ch) is non-zero for
   any ch until a narrower mask is stored. */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;

/* Non-zero if the bit selected by the low bits of ch is set in mask.
   A zero result means ch was not among the characters used to build the
   mask; a non-zero result must be confirmed by an exact test.
   Both arguments are parenthesized so any expression can be passed. */
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Line break test: table lookup below 128, otherwise bloom pre-filter
   followed by the exact Py_UNICODE_ISLINEBREAK() check. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
947
948
static inline BLOOM_MASK
949
make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
950
10.6k
{
951
10.6k
#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN)             \
952
10.6k
    do {                                               \
953
10.6k
        TYPE *data = (TYPE *)PTR;                      \
954
10.6k
        TYPE *end = data + LEN;                        \
955
10.6k
        Py_UCS4 ch;                                    \
956
21.4k
        for (; data != end; data++) {                  \
957
10.7k
            ch = *data;                                \
958
10.7k
            MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
959
10.7k
        }                                              \
960
10.6k
        break;                                         \
961
10.6k
    } while (0)
962
963
    /* calculate simple bloom-style bitmask for a given unicode string */
964
965
10.6k
    BLOOM_MASK mask;
966
967
10.6k
    mask = 0;
968
10.6k
    switch (kind) {
969
10.6k
    case PyUnicode_1BYTE_KIND:
970
10.6k
        BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
971
10.6k
        break;
972
22
    case PyUnicode_2BYTE_KIND:
973
22
        BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
974
22
        break;
975
0
    case PyUnicode_4BYTE_KIND:
976
0
        BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
977
0
        break;
978
0
    default:
979
0
        Py_UNREACHABLE();
980
10.6k
    }
981
10.6k
    return mask;
982
983
10.6k
#undef BLOOM_UPDATE
984
10.6k
}
985
986
static int
987
ensure_unicode(PyObject *obj)
988
310M
{
989
310M
    if (!PyUnicode_Check(obj)) {
990
0
        PyErr_Format(PyExc_TypeError,
991
0
                     "must be str, not %.100s",
992
0
                     Py_TYPE(obj)->tp_name);
993
0
        return -1;
994
0
    }
995
310M
    return 0;
996
310M
}
997
998
/* Compilation of templated routines */
999
1000
1.78k
#define STRINGLIB_GET_EMPTY() unicode_get_empty()
1001
1002
#include "stringlib/asciilib.h"
1003
#include "stringlib/fastsearch.h"
1004
#include "stringlib/partition.h"
1005
#include "stringlib/split.h"
1006
#include "stringlib/count.h"
1007
#include "stringlib/find.h"
1008
#include "stringlib/find_max_char.h"
1009
#include "stringlib/undef.h"
1010
1011
#include "stringlib/ucs1lib.h"
1012
#include "stringlib/fastsearch.h"
1013
#include "stringlib/partition.h"
1014
#include "stringlib/split.h"
1015
#include "stringlib/count.h"
1016
#include "stringlib/find.h"
1017
#include "stringlib/replace.h"
1018
#include "stringlib/repr.h"
1019
#include "stringlib/find_max_char.h"
1020
#include "stringlib/undef.h"
1021
1022
#include "stringlib/ucs2lib.h"
1023
#include "stringlib/fastsearch.h"
1024
#include "stringlib/partition.h"
1025
#include "stringlib/split.h"
1026
#include "stringlib/count.h"
1027
#include "stringlib/find.h"
1028
#include "stringlib/replace.h"
1029
#include "stringlib/repr.h"
1030
#include "stringlib/find_max_char.h"
1031
#include "stringlib/undef.h"
1032
1033
#include "stringlib/ucs4lib.h"
1034
#include "stringlib/fastsearch.h"
1035
#include "stringlib/partition.h"
1036
#include "stringlib/split.h"
1037
#include "stringlib/count.h"
1038
#include "stringlib/find.h"
1039
#include "stringlib/replace.h"
1040
#include "stringlib/repr.h"
1041
#include "stringlib/find_max_char.h"
1042
#include "stringlib/undef.h"
1043
1044
#undef STRINGLIB_GET_EMPTY
1045
1046
/* --- Unicode Object ----------------------------------------------------- */
1047
1048
static inline Py_ssize_t
1049
findchar(const void *s, int kind,
1050
         Py_ssize_t size, Py_UCS4 ch,
1051
         int direction)
1052
26.9M
{
1053
26.9M
    switch (kind) {
1054
26.9M
    case PyUnicode_1BYTE_KIND:
1055
26.9M
        if ((Py_UCS1) ch != ch)
1056
1.69k
            return -1;
1057
26.8M
        if (direction > 0)
1058
26.8M
            return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
1059
4.08k
        else
1060
4.08k
            return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
1061
6.78k
    case PyUnicode_2BYTE_KIND:
1062
6.78k
        if ((Py_UCS2) ch != ch)
1063
0
            return -1;
1064
6.78k
        if (direction > 0)
1065
5.74k
            return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
1066
1.04k
        else
1067
1.04k
            return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
1068
5.20k
    case PyUnicode_4BYTE_KIND:
1069
5.20k
        if (direction > 0)
1070
3.95k
            return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
1071
1.24k
        else
1072
1.24k
            return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
1073
0
    default:
1074
0
        Py_UNREACHABLE();
1075
26.9M
    }
1076
26.9M
}
1077
1078
#ifdef Py_DEBUG
/* Debug helper: overwrite the characters added to a string that has just
   grown with 0xff bytes, so that code which forgets to initialize them is
   detected earlier.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0.

   old_length is the length before the resize; nothing is written when the
   string did not grow. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        /* the kind value is used as the character size in bytes */
        int char_size = PyUnicode_KIND(unicode);
        Py_UCS1 *base = PyUnicode_1BYTE_DATA(unicode);

        memset(base + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
1096
1097
static PyObject*
1098
resize_copy(PyObject *unicode, Py_ssize_t length)
1099
0
{
1100
0
    Py_ssize_t copy_length;
1101
0
    PyObject *copy;
1102
1103
0
    copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1104
0
    if (copy == NULL)
1105
0
        return NULL;
1106
1107
0
    copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
1108
0
    _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
1109
0
    return copy;
1110
0
}
1111
1112
static PyObject*
1113
resize_compact(PyObject *unicode, Py_ssize_t length)
1114
8.13M
{
1115
8.13M
    Py_ssize_t char_size;
1116
8.13M
    Py_ssize_t struct_size;
1117
8.13M
    Py_ssize_t new_size;
1118
8.13M
    PyObject *new_unicode;
1119
#ifdef Py_DEBUG
1120
    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1121
#endif
1122
1123
8.13M
    if (!unicode_modifiable(unicode)) {
1124
0
        PyObject *copy = resize_copy(unicode, length);
1125
0
        if (copy == NULL) {
1126
0
            return NULL;
1127
0
        }
1128
0
        Py_DECREF(unicode);
1129
0
        return copy;
1130
0
    }
1131
8.13M
    assert(PyUnicode_IS_COMPACT(unicode));
1132
1133
8.13M
    char_size = PyUnicode_KIND(unicode);
1134
8.13M
    if (PyUnicode_IS_ASCII(unicode))
1135
5.90M
        struct_size = sizeof(PyASCIIObject);
1136
2.23M
    else
1137
2.23M
        struct_size = sizeof(PyCompactUnicodeObject);
1138
1139
8.13M
    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1140
0
        PyErr_NoMemory();
1141
0
        return NULL;
1142
0
    }
1143
8.13M
    new_size = (struct_size + (length + 1) * char_size);
1144
1145
8.13M
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1146
0
        PyMem_Free(_PyUnicode_UTF8(unicode));
1147
0
        PyUnicode_SET_UTF8_LENGTH(unicode, 0);
1148
0
        PyUnicode_SET_UTF8(unicode, NULL);
1149
0
    }
1150
#ifdef Py_TRACE_REFS
1151
    _Py_ForgetReference(unicode);
1152
#endif
1153
8.13M
    _PyReftracerTrack(unicode, PyRefTracer_DESTROY);
1154
1155
8.13M
    new_unicode = (PyObject *)PyObject_Realloc(unicode, new_size);
1156
8.13M
    if (new_unicode == NULL) {
1157
0
        _Py_NewReferenceNoTotal(unicode);
1158
0
        PyErr_NoMemory();
1159
0
        return NULL;
1160
0
    }
1161
8.13M
    unicode = new_unicode;
1162
8.13M
    _Py_NewReferenceNoTotal(unicode);
1163
1164
8.13M
    _PyUnicode_LENGTH(unicode) = length;
1165
#ifdef Py_DEBUG
1166
    unicode_fill_invalid(unicode, old_length);
1167
#endif
1168
8.13M
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1169
8.13M
                    length, 0);
1170
8.13M
    assert(_PyUnicode_CheckConsistency(unicode, 0));
1171
8.13M
    return unicode;
1172
8.13M
}
1173
1174
static int
1175
resize_inplace(PyObject *unicode, Py_ssize_t length)
1176
0
{
1177
0
    assert(!PyUnicode_IS_COMPACT(unicode));
1178
0
    assert(Py_REFCNT(unicode) == 1);
1179
1180
0
    Py_ssize_t new_size;
1181
0
    Py_ssize_t char_size;
1182
0
    int share_utf8;
1183
0
    void *data;
1184
#ifdef Py_DEBUG
1185
    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1186
#endif
1187
1188
0
    data = _PyUnicode_DATA_ANY(unicode);
1189
0
    char_size = PyUnicode_KIND(unicode);
1190
0
    share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
1191
1192
0
    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1193
0
        PyErr_NoMemory();
1194
0
        return -1;
1195
0
    }
1196
0
    new_size = (length + 1) * char_size;
1197
1198
0
    if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1199
0
    {
1200
0
        PyMem_Free(_PyUnicode_UTF8(unicode));
1201
0
        PyUnicode_SET_UTF8_LENGTH(unicode, 0);
1202
0
        PyUnicode_SET_UTF8(unicode, NULL);
1203
0
    }
1204
1205
0
    data = (PyObject *)PyObject_Realloc(data, new_size);
1206
0
    if (data == NULL) {
1207
0
        PyErr_NoMemory();
1208
0
        return -1;
1209
0
    }
1210
0
    _PyUnicode_DATA_ANY(unicode) = data;
1211
0
    if (share_utf8) {
1212
0
        PyUnicode_SET_UTF8_LENGTH(unicode, length);
1213
0
        PyUnicode_SET_UTF8(unicode, data);
1214
0
    }
1215
0
    _PyUnicode_LENGTH(unicode) = length;
1216
0
    PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
1217
#ifdef Py_DEBUG
1218
    unicode_fill_invalid(unicode, old_length);
1219
#endif
1220
1221
    /* check for integer overflow */
1222
0
    if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
1223
0
        PyErr_NoMemory();
1224
0
        return -1;
1225
0
    }
1226
0
    assert(_PyUnicode_CheckConsistency(unicode, 0));
1227
0
    return 0;
1228
0
}
1229
1230
static const char*
1231
unicode_kind_name(PyObject *unicode)
1232
0
{
1233
    /* don't check consistency: unicode_kind_name() is called from
1234
       _PyUnicode_Dump() */
1235
0
    if (!PyUnicode_IS_COMPACT(unicode))
1236
0
    {
1237
0
        switch (PyUnicode_KIND(unicode))
1238
0
        {
1239
0
        case PyUnicode_1BYTE_KIND:
1240
0
            if (PyUnicode_IS_ASCII(unicode))
1241
0
                return "legacy ascii";
1242
0
            else
1243
0
                return "legacy latin1";
1244
0
        case PyUnicode_2BYTE_KIND:
1245
0
            return "legacy UCS2";
1246
0
        case PyUnicode_4BYTE_KIND:
1247
0
            return "legacy UCS4";
1248
0
        default:
1249
0
            return "<legacy invalid kind>";
1250
0
        }
1251
0
    }
1252
0
    switch (PyUnicode_KIND(unicode)) {
1253
0
    case PyUnicode_1BYTE_KIND:
1254
0
        if (PyUnicode_IS_ASCII(unicode))
1255
0
            return "ascii";
1256
0
        else
1257
0
            return "latin1";
1258
0
    case PyUnicode_2BYTE_KIND:
1259
0
        return "UCS2";
1260
0
    case PyUnicode_4BYTE_KIND:
1261
0
        return "UCS4";
1262
0
    default:
1263
0
        return "<invalid compact kind>";
1264
0
    }
1265
0
}
1266
1267
#ifdef Py_DEBUG
/* Functions wrapping macros for use in debugger */

/* Return PyUnicode_UTF8() of the object. */
const char *_PyUnicode_utf8(void *unicode_raw){
    PyObject *unicode = _PyObject_CAST(unicode_raw);
    return PyUnicode_UTF8(unicode);
}

/* Return _PyUnicode_COMPACT_DATA() of the object. */
const void *_PyUnicode_compact_data(void *unicode_raw) {
    PyObject *unicode = _PyObject_CAST(unicode_raw);
    return _PyUnicode_COMPACT_DATA(unicode);
}

/* Print the addresses and flags that determine where the character data of
   the object lives, then return PyUnicode_DATA() of the object. */
const void *_PyUnicode_data(void *unicode_raw) {
    PyObject *unicode = _PyObject_CAST(unicode_raw);
    printf("obj %p\n", (void*)unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", (void*)(_PyASCIIObject_CAST(unicode) + 1));
    printf("compact op %p\n", (void*)(_PyCompactUnicodeObject_CAST(unicode) + 1));
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}

/* Print a one-line summary of a string object to stdout: kind name, length,
   UTF-8 buffer and its length (non-ASCII strings only) and data pointer.
   The fields are read directly from the structures. */
void
_PyUnicode_Dump(PyObject *op)
{
    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
    PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
    PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
    const void *data;

    if (ascii->state.compact)
    {
        /* compact strings store their characters right after the header */
        if (ascii->state.ascii)
            data = (ascii + 1);
        else
            data = (compact + 1);
    }
    else
        data = unicode->data.any;
    /* lengths are signed Py_ssize_t values: print them with %zd, not %zu */
    printf("%s: len=%zd", unicode_kind_name(op), ascii->length);

    if (!ascii->state.ascii) {
        printf(", utf8=%p (%zd)", (void *)compact->utf8, compact->utf8_length);
    }
    /* each field brings its own separator, so ASCII strings no longer
       print a doubled ", , " */
    printf(", data=%p\n", data);
}
#endif
1314
1315
1316
PyObject *
1317
PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1318
55.3M
{
1319
    /* Optimization for empty strings */
1320
55.3M
    if (size == 0) {
1321
11.1M
        return unicode_get_empty();
1322
11.1M
    }
1323
1324
44.2M
    PyObject *obj;
1325
44.2M
    PyCompactUnicodeObject *unicode;
1326
44.2M
    void *data;
1327
44.2M
    int kind;
1328
44.2M
    int is_ascii;
1329
44.2M
    Py_ssize_t char_size;
1330
44.2M
    Py_ssize_t struct_size;
1331
1332
44.2M
    is_ascii = 0;
1333
44.2M
    struct_size = sizeof(PyCompactUnicodeObject);
1334
44.2M
    if (maxchar < 128) {
1335
15.6M
        kind = PyUnicode_1BYTE_KIND;
1336
15.6M
        char_size = 1;
1337
15.6M
        is_ascii = 1;
1338
15.6M
        struct_size = sizeof(PyASCIIObject);
1339
15.6M
    }
1340
28.6M
    else if (maxchar < 256) {
1341
610k
        kind = PyUnicode_1BYTE_KIND;
1342
610k
        char_size = 1;
1343
610k
    }
1344
27.9M
    else if (maxchar < 65536) {
1345
24.2M
        kind = PyUnicode_2BYTE_KIND;
1346
24.2M
        char_size = 2;
1347
24.2M
    }
1348
3.69M
    else {
1349
3.69M
        if (maxchar > MAX_UNICODE) {
1350
0
            PyErr_SetString(PyExc_SystemError,
1351
0
                            "invalid maximum character passed to PyUnicode_New");
1352
0
            return NULL;
1353
0
        }
1354
3.69M
        kind = PyUnicode_4BYTE_KIND;
1355
3.69M
        char_size = 4;
1356
3.69M
    }
1357
1358
    /* Ensure we won't overflow the size. */
1359
44.2M
    if (size < 0) {
1360
0
        PyErr_SetString(PyExc_SystemError,
1361
0
                        "Negative size passed to PyUnicode_New");
1362
0
        return NULL;
1363
0
    }
1364
44.2M
    if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1365
0
        return PyErr_NoMemory();
1366
1367
    /* Duplicated allocation code from _PyObject_New() instead of a call to
1368
     * PyObject_New() so we are able to allocate space for the object and
1369
     * it's data buffer.
1370
     */
1371
44.2M
    obj = (PyObject *) PyObject_Malloc(struct_size + (size + 1) * char_size);
1372
44.2M
    if (obj == NULL) {
1373
0
        return PyErr_NoMemory();
1374
0
    }
1375
44.2M
    _PyObject_Init(obj, &PyUnicode_Type);
1376
1377
44.2M
    unicode = (PyCompactUnicodeObject *)obj;
1378
44.2M
    if (is_ascii)
1379
15.6M
        data = ((PyASCIIObject*)obj) + 1;
1380
28.6M
    else
1381
28.6M
        data = unicode + 1;
1382
44.2M
    _PyUnicode_LENGTH(unicode) = size;
1383
44.2M
    _PyUnicode_HASH(unicode) = -1;
1384
44.2M
    _PyUnicode_STATE(unicode).interned = 0;
1385
44.2M
    _PyUnicode_STATE(unicode).kind = kind;
1386
44.2M
    _PyUnicode_STATE(unicode).compact = 1;
1387
44.2M
    _PyUnicode_STATE(unicode).ascii = is_ascii;
1388
44.2M
    _PyUnicode_STATE(unicode).statically_allocated = 0;
1389
44.2M
    if (is_ascii) {
1390
15.6M
        ((char*)data)[size] = 0;
1391
15.6M
    }
1392
28.6M
    else if (kind == PyUnicode_1BYTE_KIND) {
1393
610k
        ((char*)data)[size] = 0;
1394
610k
        unicode->utf8 = NULL;
1395
610k
        unicode->utf8_length = 0;
1396
610k
    }
1397
27.9M
    else {
1398
27.9M
        unicode->utf8 = NULL;
1399
27.9M
        unicode->utf8_length = 0;
1400
27.9M
        if (kind == PyUnicode_2BYTE_KIND)
1401
24.2M
            ((Py_UCS2*)data)[size] = 0;
1402
3.69M
        else /* kind == PyUnicode_4BYTE_KIND */
1403
3.69M
            ((Py_UCS4*)data)[size] = 0;
1404
27.9M
    }
1405
#ifdef Py_DEBUG
1406
    unicode_fill_invalid((PyObject*)unicode, 0);
1407
#endif
1408
44.2M
    assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1409
44.2M
    return obj;
1410
44.2M
}
1411
1412
static int
1413
unicode_check_modifiable(PyObject *unicode)
1414
691
{
1415
691
    if (!unicode_modifiable(unicode)) {
1416
0
        PyErr_SetString(PyExc_SystemError,
1417
0
                        "Cannot modify a string currently used");
1418
0
        return -1;
1419
0
    }
1420
691
    return 0;
1421
691
}
1422
1423
static int
1424
_copy_characters(PyObject *to, Py_ssize_t to_start,
1425
                 PyObject *from, Py_ssize_t from_start,
1426
                 Py_ssize_t how_many, int check_maxchar)
1427
42.7M
{
1428
42.7M
    int from_kind, to_kind;
1429
42.7M
    const void *from_data;
1430
42.7M
    void *to_data;
1431
1432
42.7M
    assert(0 <= how_many);
1433
42.7M
    assert(0 <= from_start);
1434
42.7M
    assert(0 <= to_start);
1435
42.7M
    assert(PyUnicode_Check(from));
1436
42.7M
    assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1437
1438
42.7M
    assert(to == NULL || PyUnicode_Check(to));
1439
1440
42.7M
    if (how_many == 0) {
1441
498k
        return 0;
1442
498k
    }
1443
1444
42.7M
    assert(to != NULL);
1445
42.3M
    assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1446
1447
42.3M
    from_kind = PyUnicode_KIND(from);
1448
42.3M
    from_data = PyUnicode_DATA(from);
1449
42.3M
    to_kind = PyUnicode_KIND(to);
1450
42.3M
    to_data = PyUnicode_DATA(to);
1451
1452
#ifdef Py_DEBUG
1453
    if (!check_maxchar
1454
        && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1455
    {
1456
        Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1457
        Py_UCS4 ch;
1458
        Py_ssize_t i;
1459
        for (i=0; i < how_many; i++) {
1460
            ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1461
            assert(ch <= to_maxchar);
1462
        }
1463
    }
1464
#endif
1465
1466
42.3M
    if (from_kind == to_kind) {
1467
12.0M
        if (check_maxchar
1468
0
            && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1469
0
        {
1470
            /* Writing Latin-1 characters into an ASCII string requires to
1471
               check that all written characters are pure ASCII */
1472
0
            Py_UCS4 max_char;
1473
0
            max_char = ucs1lib_find_max_char(from_data,
1474
0
                                             (const Py_UCS1*)from_data + how_many);
1475
0
            if (max_char >= 128)
1476
0
                return -1;
1477
0
        }
1478
12.0M
        memcpy((char*)to_data + to_kind * to_start,
1479
12.0M
                  (const char*)from_data + from_kind * from_start,
1480
12.0M
                  to_kind * how_many);
1481
12.0M
    }
1482
30.2M
    else if (from_kind == PyUnicode_1BYTE_KIND
1483
29.8M
             && to_kind == PyUnicode_2BYTE_KIND)
1484
16.9M
    {
1485
16.9M
        _PyUnicode_CONVERT_BYTES(
1486
16.9M
            Py_UCS1, Py_UCS2,
1487
16.9M
            PyUnicode_1BYTE_DATA(from) + from_start,
1488
16.9M
            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1489
16.9M
            PyUnicode_2BYTE_DATA(to) + to_start
1490
16.9M
            );
1491
16.9M
    }
1492
13.3M
    else if (from_kind == PyUnicode_1BYTE_KIND
1493
12.8M
             && to_kind == PyUnicode_4BYTE_KIND)
1494
12.8M
    {
1495
12.8M
        _PyUnicode_CONVERT_BYTES(
1496
12.8M
            Py_UCS1, Py_UCS4,
1497
12.8M
            PyUnicode_1BYTE_DATA(from) + from_start,
1498
12.8M
            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1499
12.8M
            PyUnicode_4BYTE_DATA(to) + to_start
1500
12.8M
            );
1501
12.8M
    }
1502
421k
    else if (from_kind == PyUnicode_2BYTE_KIND
1503
299k
             && to_kind == PyUnicode_4BYTE_KIND)
1504
290k
    {
1505
290k
        _PyUnicode_CONVERT_BYTES(
1506
290k
            Py_UCS2, Py_UCS4,
1507
290k
            PyUnicode_2BYTE_DATA(from) + from_start,
1508
290k
            PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1509
290k
            PyUnicode_4BYTE_DATA(to) + to_start
1510
290k
            );
1511
290k
    }
1512
131k
    else {
1513
131k
        assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1514
1515
131k
        if (!check_maxchar) {
1516
131k
            if (from_kind == PyUnicode_2BYTE_KIND
1517
9.53k
                && to_kind == PyUnicode_1BYTE_KIND)
1518
9.53k
            {
1519
9.53k
                _PyUnicode_CONVERT_BYTES(
1520
9.53k
                    Py_UCS2, Py_UCS1,
1521
9.53k
                    PyUnicode_2BYTE_DATA(from) + from_start,
1522
9.53k
                    PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1523
9.53k
                    PyUnicode_1BYTE_DATA(to) + to_start
1524
9.53k
                    );
1525
9.53k
            }
1526
121k
            else if (from_kind == PyUnicode_4BYTE_KIND
1527
121k
                     && to_kind == PyUnicode_1BYTE_KIND)
1528
94.5k
            {
1529
94.5k
                _PyUnicode_CONVERT_BYTES(
1530
94.5k
                    Py_UCS4, Py_UCS1,
1531
94.5k
                    PyUnicode_4BYTE_DATA(from) + from_start,
1532
94.5k
                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1533
94.5k
                    PyUnicode_1BYTE_DATA(to) + to_start
1534
94.5k
                    );
1535
94.5k
            }
1536
26.9k
            else if (from_kind == PyUnicode_4BYTE_KIND
1537
26.9k
                     && to_kind == PyUnicode_2BYTE_KIND)
1538
26.9k
            {
1539
26.9k
                _PyUnicode_CONVERT_BYTES(
1540
26.9k
                    Py_UCS4, Py_UCS2,
1541
26.9k
                    PyUnicode_4BYTE_DATA(from) + from_start,
1542
26.9k
                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1543
26.9k
                    PyUnicode_2BYTE_DATA(to) + to_start
1544
26.9k
                    );
1545
26.9k
            }
1546
0
            else {
1547
0
                Py_UNREACHABLE();
1548
0
            }
1549
131k
        }
1550
0
        else {
1551
0
            const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1552
0
            Py_UCS4 ch;
1553
0
            Py_ssize_t i;
1554
1555
0
            for (i=0; i < how_many; i++) {
1556
0
                ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1557
0
                if (ch > to_maxchar)
1558
0
                    return -1;
1559
0
                PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1560
0
            }
1561
0
        }
1562
131k
    }
1563
42.3M
    return 0;
1564
42.3M
}
1565
1566
void
1567
_PyUnicode_FastCopyCharacters(
1568
    PyObject *to, Py_ssize_t to_start,
1569
    PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1570
42.7M
{
1571
42.7M
    (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1572
42.7M
}
1573
1574
Py_ssize_t
1575
PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1576
                         PyObject *from, Py_ssize_t from_start,
1577
                         Py_ssize_t how_many)
1578
0
{
1579
0
    int err;
1580
1581
0
    if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1582
0
        PyErr_BadInternalCall();
1583
0
        return -1;
1584
0
    }
1585
1586
0
    if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
1587
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
1588
0
        return -1;
1589
0
    }
1590
0
    if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
1591
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
1592
0
        return -1;
1593
0
    }
1594
0
    if (how_many < 0) {
1595
0
        PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1596
0
        return -1;
1597
0
    }
1598
0
    how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
1599
0
    if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1600
0
        PyErr_Format(PyExc_SystemError,
1601
0
                     "Cannot write %zi characters at %zi "
1602
0
                     "in a string of %zi characters",
1603
0
                     how_many, to_start, PyUnicode_GET_LENGTH(to));
1604
0
        return -1;
1605
0
    }
1606
1607
0
    if (how_many == 0)
1608
0
        return 0;
1609
1610
0
    if (unicode_check_modifiable(to))
1611
0
        return -1;
1612
1613
0
    err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1614
0
    if (err) {
1615
0
        PyErr_Format(PyExc_SystemError,
1616
0
                     "Cannot copy %s characters "
1617
0
                     "into a string of %s characters",
1618
0
                     unicode_kind_name(from),
1619
0
                     unicode_kind_name(to));
1620
0
        return -1;
1621
0
    }
1622
0
    return how_many;
1623
0
}
1624
1625
/* Find the maximum code point and count the number of surrogate pairs so a
1626
   correct string length can be computed before converting a string to UCS4.
1627
   This function counts single surrogates as a character and not as a pair.
1628
1629
   Return 0 on success, or -1 on error. */
1630
static int
1631
find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1632
                        Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1633
16.3k
{
1634
16.3k
    const wchar_t *iter;
1635
16.3k
    Py_UCS4 ch;
1636
1637
16.3k
    assert(num_surrogates != NULL && maxchar != NULL);
1638
16.3k
    *num_surrogates = 0;
1639
16.3k
    *maxchar = 0;
1640
1641
347k
    for (iter = begin; iter < end; ) {
1642
#if SIZEOF_WCHAR_T == 2
1643
        if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1644
            && (iter+1) < end
1645
            && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1646
        {
1647
            ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1648
            ++(*num_surrogates);
1649
            iter += 2;
1650
        }
1651
        else
1652
#endif
1653
330k
        {
1654
330k
            ch = *iter;
1655
330k
            iter++;
1656
330k
        }
1657
330k
        if (ch > *maxchar) {
1658
60.2k
            *maxchar = ch;
1659
60.2k
            if (*maxchar > MAX_UNICODE) {
1660
0
                PyErr_Format(PyExc_ValueError,
1661
0
                             "character U+%x is not in range [U+0000; U+%x]",
1662
0
                             ch, MAX_UNICODE);
1663
0
                return -1;
1664
0
            }
1665
60.2k
        }
1666
330k
    }
1667
16.3k
    return 0;
1668
16.3k
}
1669
1670
/* Destroy a string object.

   Statically allocated strings, and strings whose interned state is neither
   "not interned" nor "interned mortal", are never freed: they are made
   immortal again and the function returns.  A mortal interned string is
   first removed from the interned dict.  Then the UTF-8 buffer (when
   _PyUnicode_HAS_UTF8_MEMORY() is true) and the separately allocated data
   of a non-compact string are released, and the object is handed to the
   tp_free slot of its type. */
static void
1671
unicode_dealloc(PyObject *unicode)
1672
43.8M
{
1673
#ifdef Py_DEBUG
1674
    if (!unicode_is_finalizing() && unicode_is_singleton(unicode)) {
1675
        _Py_FatalRefcountError("deallocating an Unicode singleton");
1676
    }
1677
#endif
1678
87.6M
    if (_PyUnicode_STATE(unicode).statically_allocated) {
1679
        /* This should never get called, but we also don't want to SEGV if
1680
        * we accidentally decref an immortal string out of existence. Since
1681
        * the string is an immortal object, just re-set the reference count.
1682
        */
1683
#ifdef Py_DEBUG
1684
        Py_UNREACHABLE();
1685
#endif
1686
0
        _Py_SetImmortal(unicode);
1687
0
        return;
1688
0
    }
1689
43.8M
    switch (_PyUnicode_STATE(unicode).interned) {
1690
43.5M
        case SSTATE_NOT_INTERNED:
1691
43.5M
            break;
1692
238k
        case SSTATE_INTERNED_MORTAL:
1693
            /* Remove the object from the intern dict.
1694
             * Before doing so, we set the refcount to 2: the key and value
1695
             * in the interned_dict.
1696
             */
1697
238k
            assert(Py_REFCNT(unicode) == 0);
1698
238k
            Py_SET_REFCNT(unicode, 2);
1699
#ifdef Py_REF_DEBUG
1700
            /* let's be pedantic with the ref total */
1701
            _Py_IncRefTotal(_PyThreadState_GET());
1702
            _Py_IncRefTotal(_PyThreadState_GET());
1703
#endif
1704
238k
            PyInterpreterState *interp = _PyInterpreterState_GET();
1705
238k
            PyObject *interned = get_interned_dict(interp);
1706
238k
            assert(interned != NULL);
1707
238k
            PyObject *popped;
1708
            /* r is tested below: -1 is an error, 0 means the key was not
               found, any other value means the entry was removed and
               stored in popped */
238k
            int r = PyDict_Pop(interned, unicode, &popped);
1709
238k
            if (r == -1) {
1710
0
                PyErr_FormatUnraisable("Exception ignored while "
1711
0
                                       "removing an interned string %R",
1712
0
                                       unicode);
1713
                // We don't know what happened to the string. It's probably
1714
                // best to leak it:
1715
                // - if it was popped, there are no more references to it
1716
                //   so it can't cause trouble (except wasted memory)
1717
                // - if it wasn't popped, it'll remain interned
1718
0
                _Py_SetImmortal(unicode);
1719
0
                _PyUnicode_STATE(unicode).interned = SSTATE_INTERNED_IMMORTAL;
1720
0
                return;
1721
0
            }
1722
238k
            if (r == 0) {
1723
                // The interned string was not found in the interned_dict.
1724
#ifdef Py_DEBUG
1725
                Py_UNREACHABLE();
1726
#endif
1727
0
                _Py_SetImmortal(unicode);
1728
0
                return;
1729
0
            }
1730
            // Successfully popped.
1731
238k
            assert(popped == unicode);
1732
            // Only our `popped` reference should be left; remove it too.
1733
238k
            assert(Py_REFCNT(unicode) == 1);
1734
238k
            Py_SET_REFCNT(unicode, 0);
1735
#ifdef Py_REF_DEBUG
1736
            /* let's be pedantic with the ref total */
1737
            _Py_DecRefTotal(_PyThreadState_GET());
1738
#endif
1739
238k
            break;
1740
0
        default:
1741
            // As with `statically_allocated` above.
            // NOTE(review): this guard tests Py_REF_DEBUG whereas the two
            // similar Py_UNREACHABLE() guards above test Py_DEBUG; confirm
            // whether the difference is intended.
1742
#ifdef Py_REF_DEBUG
1743
            Py_UNREACHABLE();
1744
#endif
1745
0
            _Py_SetImmortal(unicode);
1746
0
            return;
1747
43.8M
    }
1748
    /* release the UTF-8 buffer, if the macro reports one to free */
43.8M
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1749
7.36k
        PyMem_Free(_PyUnicode_UTF8(unicode));
1750
7.36k
    }
1751
    /* non-compact strings keep their characters in a separate buffer */
43.8M
    if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
1752
0
        PyMem_Free(_PyUnicode_DATA_ANY(unicode));
1753
0
    }
1754
1755
    /* finally release the object itself through its type */
43.8M
    Py_TYPE(unicode)->tp_free(unicode);
1756
43.8M
}
1757
1758
#ifdef Py_DEBUG
/* Debug-only predicate: return 1 if `unicode` is the statically allocated
   empty string, or a one-character string below U+0100 that is the very
   object LATIN1() yields for that character; return 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    if (unicode == &_Py_STR(empty)) {
        return 1;
    }

    if (_PyASCIIObject_CAST(unicode)->length != 1) {
        return 0;
    }

    /* Identity comparison: only the shared object counts, not an equal copy. */
    Py_UCS4 first = PyUnicode_READ_CHAR(unicode, 0);
    return (first < 256 && LATIN1(first) == unicode);
}
#endif
1776
1777
static int
1778
unicode_modifiable(PyObject *unicode)
1779
18.8M
{
1780
18.8M
    assert(_PyUnicode_CHECK(unicode));
1781
18.8M
    if (!_PyObject_IsUniquelyReferenced(unicode))
1782
287k
        return 0;
1783
18.5M
    if (PyUnicode_HASH(unicode) != -1)
1784
0
        return 0;
1785
18.5M
    if (PyUnicode_CHECK_INTERNED(unicode))
1786
0
        return 0;
1787
18.5M
    if (!PyUnicode_CheckExact(unicode))
1788
0
        return 0;
1789
#ifdef Py_DEBUG
1790
    /* singleton refcount is greater than 1 */
1791
    assert(!unicode_is_singleton(unicode));
1792
#endif
1793
18.5M
    return 1;
1794
18.5M
}
1795
1796
static int
1797
unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1798
5.22M
{
1799
5.22M
    PyObject *unicode;
1800
5.22M
    Py_ssize_t old_length;
1801
1802
5.22M
    assert(p_unicode != NULL);
1803
5.22M
    unicode = *p_unicode;
1804
1805
5.22M
    assert(unicode != NULL);
1806
5.22M
    assert(PyUnicode_Check(unicode));
1807
5.22M
    assert(0 <= length);
1808
1809
5.22M
    old_length = PyUnicode_GET_LENGTH(unicode);
1810
5.22M
    if (old_length == length)
1811
0
        return 0;
1812
1813
5.22M
    if (length == 0) {
1814
0
        PyObject *empty = unicode_get_empty();
1815
0
        Py_SETREF(*p_unicode, empty);
1816
0
        return 0;
1817
0
    }
1818
1819
5.22M
    if (!unicode_modifiable(unicode)) {
1820
0
        PyObject *copy = resize_copy(unicode, length);
1821
0
        if (copy == NULL)
1822
0
            return -1;
1823
0
        Py_SETREF(*p_unicode, copy);
1824
0
        return 0;
1825
0
    }
1826
1827
5.22M
    if (PyUnicode_IS_COMPACT(unicode)) {
1828
5.22M
        PyObject *new_unicode = resize_compact(unicode, length);
1829
5.22M
        if (new_unicode == NULL)
1830
0
            return -1;
1831
5.22M
        *p_unicode = new_unicode;
1832
5.22M
        return 0;
1833
5.22M
    }
1834
0
    return resize_inplace(unicode, length);
1835
5.22M
}
1836
1837
int
1838
PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1839
0
{
1840
0
    PyObject *unicode;
1841
0
    if (p_unicode == NULL) {
1842
0
        PyErr_BadInternalCall();
1843
0
        return -1;
1844
0
    }
1845
0
    unicode = *p_unicode;
1846
0
    if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
1847
0
    {
1848
0
        PyErr_BadInternalCall();
1849
0
        return -1;
1850
0
    }
1851
0
    return unicode_resize(p_unicode, length);
1852
0
}
1853
1854
/* Copy an ASCII or latin1 char* string into a Python Unicode string.
1855
1856
   WARNING: The function doesn't copy the terminating null character and
1857
   doesn't check the maximum character (may write a latin1 character in an
1858
   ASCII string). */
1859
static void
1860
unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1861
                   const char *str, Py_ssize_t len)
1862
0
{
1863
0
    int kind = PyUnicode_KIND(unicode);
1864
0
    const void *data = PyUnicode_DATA(unicode);
1865
0
    const char *end = str + len;
1866
1867
0
    assert(index + len <= PyUnicode_GET_LENGTH(unicode));
1868
0
    switch (kind) {
1869
0
    case PyUnicode_1BYTE_KIND: {
1870
#ifdef Py_DEBUG
1871
        if (PyUnicode_IS_ASCII(unicode)) {
1872
            Py_UCS4 maxchar = ucs1lib_find_max_char(
1873
                (const Py_UCS1*)str,
1874
                (const Py_UCS1*)str + len);
1875
            assert(maxchar < 128);
1876
        }
1877
#endif
1878
0
        memcpy((char *) data + index, str, len);
1879
0
        break;
1880
0
    }
1881
0
    case PyUnicode_2BYTE_KIND: {
1882
0
        Py_UCS2 *start = (Py_UCS2 *)data + index;
1883
0
        Py_UCS2 *ucs2 = start;
1884
1885
0
        for (; str < end; ++ucs2, ++str)
1886
0
            *ucs2 = (Py_UCS2)*str;
1887
1888
0
        assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
1889
0
        break;
1890
0
    }
1891
0
    case PyUnicode_4BYTE_KIND: {
1892
0
        Py_UCS4 *start = (Py_UCS4 *)data + index;
1893
0
        Py_UCS4 *ucs4 = start;
1894
1895
0
        for (; str < end; ++ucs4, ++str)
1896
0
            *ucs4 = (Py_UCS4)*str;
1897
1898
0
        assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
1899
0
        break;
1900
0
    }
1901
0
    default:
1902
0
        Py_UNREACHABLE();
1903
0
    }
1904
0
}
1905
1906
static PyObject*
1907
get_latin1_char(Py_UCS1 ch)
1908
162M
{
1909
162M
    PyObject *o = LATIN1(ch);
1910
162M
    return o;
1911
162M
}
1912
1913
static PyObject*
1914
unicode_char(Py_UCS4 ch)
1915
112M
{
1916
112M
    PyObject *unicode;
1917
1918
112M
    assert(ch <= MAX_UNICODE);
1919
1920
112M
    if (ch < 256) {
1921
87.0M
        return get_latin1_char(ch);
1922
87.0M
    }
1923
1924
25.1M
    unicode = PyUnicode_New(1, ch);
1925
25.1M
    if (unicode == NULL)
1926
0
        return NULL;
1927
1928
25.1M
    assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
1929
50.3M
    if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
1930
22.1M
        PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1931
22.1M
    } else {
1932
2.98M
        assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1933
2.98M
        PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1934
2.98M
    }
1935
25.1M
    assert(_PyUnicode_CheckConsistency(unicode, 1));
1936
25.1M
    return unicode;
1937
25.1M
}
1938
1939
1940
static inline void
1941
unicode_write_widechar(int kind, void *data,
1942
                       const wchar_t *u, Py_ssize_t size,
1943
                       Py_ssize_t num_surrogates)
1944
16.3k
{
1945
16.3k
    switch (kind) {
1946
16.3k
    case PyUnicode_1BYTE_KIND:
1947
16.3k
        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char, u, u + size, data);
1948
16.3k
        break;
1949
1950
0
    case PyUnicode_2BYTE_KIND:
1951
#if SIZEOF_WCHAR_T == 2
1952
        memcpy(data, u, size * 2);
1953
#else
1954
0
        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2, u, u + size, data);
1955
0
#endif
1956
0
        break;
1957
1958
0
    case PyUnicode_4BYTE_KIND:
1959
0
    {
1960
#if SIZEOF_WCHAR_T == 2
1961
        // Convert a 16-bits wchar_t representation to UCS4, this will decode
1962
        // surrogate pairs.
1963
        const wchar_t *end = u + size;
1964
        Py_UCS4 *ucs4_out = (Py_UCS4*)data;
1965
#  ifndef NDEBUG
1966
        Py_UCS4 *ucs4_end = (Py_UCS4*)data + (size - num_surrogates);
1967
#  endif
1968
        for (const wchar_t *iter = u; iter < end; ) {
1969
            assert(ucs4_out < ucs4_end);
1970
            if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1971
                && (iter+1) < end
1972
                && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1973
            {
1974
                *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1975
                iter += 2;
1976
            }
1977
            else {
1978
                *ucs4_out++ = *iter;
1979
                iter++;
1980
            }
1981
        }
1982
        assert(ucs4_out == ucs4_end);
1983
#else
1984
0
        assert(num_surrogates == 0);
1985
0
        memcpy(data, u, size * 4);
1986
0
#endif
1987
0
        break;
1988
0
    }
1989
0
    default:
1990
0
        Py_UNREACHABLE();
1991
16.3k
    }
1992
16.3k
}
1993
1994
1995
PyObject *
1996
PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
1997
16.3k
{
1998
16.3k
    PyObject *unicode;
1999
16.3k
    Py_UCS4 maxchar = 0;
2000
16.3k
    Py_ssize_t num_surrogates;
2001
2002
16.3k
    if (u == NULL && size != 0) {
2003
0
        PyErr_BadInternalCall();
2004
0
        return NULL;
2005
0
    }
2006
2007
16.3k
    if (size == -1) {
2008
924
        size = wcslen(u);
2009
924
    }
2010
2011
    /* If the Unicode data is known at construction time, we can apply
2012
       some optimizations which share commonly used objects. */
2013
2014
    /* Optimization for empty strings */
2015
16.3k
    if (size == 0)
2016
44
        _Py_RETURN_UNICODE_EMPTY();
2017
2018
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
2019
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
2020
       non-Unicode locales and hence needs conversion to UCS-4 first. */
2021
    if (_Py_LocaleUsesNonUnicodeWchar()) {
2022
        wchar_t* converted = _Py_DecodeNonUnicodeWchar(u, size);
2023
        if (!converted) {
2024
            return NULL;
2025
        }
2026
        PyObject *unicode = _PyUnicode_FromUCS4(converted, size);
2027
        PyMem_Free(converted);
2028
        return unicode;
2029
    }
2030
#endif
2031
2032
    /* Single character Unicode objects in the Latin-1 range are
2033
       shared when using this constructor */
2034
16.3k
    if (size == 1 && (Py_UCS4)*u < 256)
2035
0
        return get_latin1_char((unsigned char)*u);
2036
2037
    /* If not empty and not single character, copy the Unicode data
2038
       into the new object */
2039
16.3k
    if (find_maxchar_surrogates(u, u + size,
2040
16.3k
                                &maxchar, &num_surrogates) == -1)
2041
0
        return NULL;
2042
2043
16.3k
    unicode = PyUnicode_New(size - num_surrogates, maxchar);
2044
16.3k
    if (!unicode)
2045
0
        return NULL;
2046
2047
16.3k
    unicode_write_widechar(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
2048
16.3k
                           u, size, num_surrogates);
2049
2050
0
    return unicode_result(unicode);
2051
16.3k
}
2052
2053
2054
int
2055
PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *pub_writer,
2056
                              const wchar_t *str,
2057
                              Py_ssize_t size)
2058
0
{
2059
0
    _PyUnicodeWriter *writer = (_PyUnicodeWriter *)pub_writer;
2060
2061
0
    if (size < 0) {
2062
0
        size = wcslen(str);
2063
0
    }
2064
2065
0
    if (size == 0) {
2066
0
        return 0;
2067
0
    }
2068
2069
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
2070
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
2071
       non-Unicode locales and hence needs conversion to UCS-4 first. */
2072
    if (_Py_LocaleUsesNonUnicodeWchar()) {
2073
        wchar_t* converted = _Py_DecodeNonUnicodeWchar(str, size);
2074
        if (!converted) {
2075
            return -1;
2076
        }
2077
2078
        int res = PyUnicodeWriter_WriteUCS4(pub_writer, converted, size);
2079
        PyMem_Free(converted);
2080
        return res;
2081
    }
2082
#endif
2083
2084
0
    Py_UCS4 maxchar = 0;
2085
0
    Py_ssize_t num_surrogates;
2086
0
    if (find_maxchar_surrogates(str, str + size,
2087
0
                                &maxchar, &num_surrogates) == -1) {
2088
0
        return -1;
2089
0
    }
2090
2091
0
    if (_PyUnicodeWriter_Prepare(writer, size - num_surrogates, maxchar) < 0) {
2092
0
        return -1;
2093
0
    }
2094
2095
0
    int kind = writer->kind;
2096
0
    void *data = (Py_UCS1*)writer->data + writer->pos * kind;
2097
0
    unicode_write_widechar(kind, data, str, size, num_surrogates);
2098
2099
0
    writer->pos += size - num_surrogates;
2100
0
    return 0;
2101
0
}
2102
2103
2104
PyObject *
2105
PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
2106
120k
{
2107
120k
    if (size < 0) {
2108
0
        PyErr_SetString(PyExc_SystemError,
2109
0
                        "Negative size passed to PyUnicode_FromStringAndSize");
2110
0
        return NULL;
2111
0
    }
2112
120k
    if (u != NULL) {
2113
120k
        return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2114
120k
    }
2115
0
    if (size > 0) {
2116
0
        PyErr_SetString(PyExc_SystemError,
2117
0
            "NULL string with positive size with NULL passed to PyUnicode_FromStringAndSize");
2118
0
        return NULL;
2119
0
    }
2120
0
    return unicode_get_empty();
2121
0
}
2122
2123
PyObject *
2124
PyUnicode_FromString(const char *u)
2125
4.70M
{
2126
4.70M
    size_t size = strlen(u);
2127
4.70M
    if (size > PY_SSIZE_T_MAX) {
2128
0
        PyErr_SetString(PyExc_OverflowError, "input too long");
2129
0
        return NULL;
2130
0
    }
2131
4.70M
    return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
2132
4.70M
}
2133
2134
2135
PyObject *
2136
_PyUnicode_FromId(_Py_Identifier *id)
2137
0
{
2138
0
    PyMutex_Lock((PyMutex *)&id->mutex);
2139
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
2140
0
    struct _Py_unicode_ids *ids = &interp->unicode.ids;
2141
2142
0
    Py_ssize_t index = _Py_atomic_load_ssize(&id->index);
2143
0
    if (index < 0) {
2144
0
        struct _Py_unicode_runtime_ids *rt_ids = &interp->runtime->unicode_state.ids;
2145
2146
0
        PyMutex_Lock(&rt_ids->mutex);
2147
        // Check again to detect concurrent access. Another thread can have
2148
        // initialized the index while this thread waited for the lock.
2149
0
        index = _Py_atomic_load_ssize(&id->index);
2150
0
        if (index < 0) {
2151
0
            assert(rt_ids->next_index < PY_SSIZE_T_MAX);
2152
0
            index = rt_ids->next_index;
2153
0
            rt_ids->next_index++;
2154
0
            _Py_atomic_store_ssize(&id->index, index);
2155
0
        }
2156
0
        PyMutex_Unlock(&rt_ids->mutex);
2157
0
    }
2158
0
    assert(index >= 0);
2159
2160
0
    PyObject *obj;
2161
0
    if (index < ids->size) {
2162
0
        obj = ids->array[index];
2163
0
        if (obj) {
2164
            // Return a borrowed reference
2165
0
            goto end;
2166
0
        }
2167
0
    }
2168
2169
0
    obj = PyUnicode_DecodeUTF8Stateful(id->string, strlen(id->string),
2170
0
                                       NULL, NULL);
2171
0
    if (!obj) {
2172
0
        goto end;
2173
0
    }
2174
0
    _PyUnicode_InternImmortal(interp, &obj);
2175
2176
0
    if (index >= ids->size) {
2177
        // Overallocate to reduce the number of realloc
2178
0
        Py_ssize_t new_size = Py_MAX(index * 2, 16);
2179
0
        Py_ssize_t item_size = sizeof(ids->array[0]);
2180
0
        PyObject **new_array = PyMem_Realloc(ids->array, new_size * item_size);
2181
0
        if (new_array == NULL) {
2182
0
            PyErr_NoMemory();
2183
0
            obj = NULL;
2184
0
            goto end;
2185
0
        }
2186
0
        memset(&new_array[ids->size], 0, (new_size - ids->size) * item_size);
2187
0
        ids->array = new_array;
2188
0
        ids->size = new_size;
2189
0
    }
2190
2191
    // The array stores a strong reference
2192
0
    ids->array[index] = obj;
2193
2194
0
end:
2195
0
    PyMutex_Unlock((PyMutex *)&id->mutex);
2196
    // Return a borrowed reference
2197
0
    return obj;
2198
0
}
2199
2200
2201
static void
2202
unicode_clear_identifiers(struct _Py_unicode_state *state)
2203
0
{
2204
0
    struct _Py_unicode_ids *ids = &state->ids;
2205
0
    for (Py_ssize_t i=0; i < ids->size; i++) {
2206
0
        Py_XDECREF(ids->array[i]);
2207
0
    }
2208
0
    ids->size = 0;
2209
0
    PyMem_Free(ids->array);
2210
0
    ids->array = NULL;
2211
    // Don't reset _PyRuntime next_index: _Py_Identifier.id remains valid
2212
    // after Py_Finalize().
2213
0
}
2214
2215
2216
/* Internal function, doesn't check maximum character */
2217
2218
PyObject*
2219
_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
2220
11.5M
{
2221
11.5M
    const unsigned char *s = (const unsigned char *)buffer;
2222
11.5M
    PyObject *unicode;
2223
11.5M
    if (size == 1) {
2224
#ifdef Py_DEBUG
2225
        assert((unsigned char)s[0] < 128);
2226
#endif
2227
93.7k
        return get_latin1_char(s[0]);
2228
93.7k
    }
2229
11.4M
    unicode = PyUnicode_New(size, 127);
2230
11.4M
    if (!unicode)
2231
0
        return NULL;
2232
11.4M
    memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2233
11.4M
    assert(_PyUnicode_CheckConsistency(unicode, 1));
2234
11.4M
    return unicode;
2235
11.4M
}
2236
2237
static Py_UCS4
2238
kind_maxchar_limit(int kind)
2239
850
{
2240
850
    switch (kind) {
2241
278
    case PyUnicode_1BYTE_KIND:
2242
278
        return 0x80;
2243
281
    case PyUnicode_2BYTE_KIND:
2244
281
        return 0x100;
2245
291
    case PyUnicode_4BYTE_KIND:
2246
291
        return 0x10000;
2247
0
    default:
2248
0
        Py_UNREACHABLE();
2249
850
    }
2250
850
}
2251
2252
static PyObject*
2253
_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
2254
4.98M
{
2255
4.98M
    PyObject *res;
2256
4.98M
    unsigned char max_char;
2257
2258
4.98M
    if (size == 0) {
2259
4.76M
        _Py_RETURN_UNICODE_EMPTY();
2260
4.76M
    }
2261
4.98M
    assert(size > 0);
2262
224k
    if (size == 1) {
2263
26.8k
        return get_latin1_char(u[0]);
2264
26.8k
    }
2265
2266
197k
    max_char = ucs1lib_find_max_char(u, u + size);
2267
197k
    res = PyUnicode_New(size, max_char);
2268
197k
    if (!res)
2269
0
        return NULL;
2270
197k
    memcpy(PyUnicode_1BYTE_DATA(res), u, size);
2271
197k
    assert(_PyUnicode_CheckConsistency(res, 1));
2272
197k
    return res;
2273
197k
}
2274
2275
static PyObject*
2276
_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
2277
6.20M
{
2278
6.20M
    PyObject *res;
2279
6.20M
    Py_UCS2 max_char;
2280
2281
6.20M
    if (size == 0)
2282
5.86M
        _Py_RETURN_UNICODE_EMPTY();
2283
6.20M
    assert(size > 0);
2284
336k
    if (size == 1)
2285
22.1k
        return unicode_char(u[0]);
2286
2287
314k
    max_char = ucs2lib_find_max_char(u, u + size);
2288
314k
    res = PyUnicode_New(size, max_char);
2289
314k
    if (!res)
2290
0
        return NULL;
2291
314k
    if (max_char >= 256)
2292
266k
        memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
2293
47.8k
    else {
2294
47.8k
        _PyUnicode_CONVERT_BYTES(
2295
47.8k
            Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2296
47.8k
    }
2297
314k
    assert(_PyUnicode_CheckConsistency(res, 1));
2298
314k
    return res;
2299
314k
}
2300
2301
static PyObject*
2302
_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
2303
10.5M
{
2304
10.5M
    PyObject *res;
2305
10.5M
    Py_UCS4 max_char;
2306
2307
10.5M
    if (size == 0)
2308
9.45M
        _Py_RETURN_UNICODE_EMPTY();
2309
10.5M
    assert(size > 0);
2310
1.09M
    if (size == 1)
2311
576k
        return unicode_char(u[0]);
2312
2313
523k
    max_char = ucs4lib_find_max_char(u, u + size);
2314
523k
    res = PyUnicode_New(size, max_char);
2315
523k
    if (!res)
2316
0
        return NULL;
2317
523k
    if (max_char < 256)
2318
357k
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2319
523k
                                 PyUnicode_1BYTE_DATA(res));
2320
165k
    else if (max_char < 0x10000)
2321
120k
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2322
165k
                                 PyUnicode_2BYTE_DATA(res));
2323
44.7k
    else
2324
44.7k
        memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
2325
523k
    assert(_PyUnicode_CheckConsistency(res, 1));
2326
523k
    return res;
2327
523k
}
2328
2329
2330
int
2331
PyUnicodeWriter_WriteUCS4(PyUnicodeWriter *pub_writer,
2332
                          Py_UCS4 *str,
2333
                          Py_ssize_t size)
2334
0
{
2335
0
    _PyUnicodeWriter *writer = (_PyUnicodeWriter*)pub_writer;
2336
2337
0
    if (size < 0) {
2338
0
        PyErr_SetString(PyExc_ValueError,
2339
0
                        "size must be positive");
2340
0
        return -1;
2341
0
    }
2342
2343
0
    if (size == 0) {
2344
0
        return 0;
2345
0
    }
2346
2347
0
    Py_UCS4 max_char = ucs4lib_find_max_char(str, str + size);
2348
2349
0
    if (_PyUnicodeWriter_Prepare(writer, size, max_char) < 0) {
2350
0
        return -1;
2351
0
    }
2352
2353
0
    int kind = writer->kind;
2354
0
    void *data = (Py_UCS1*)writer->data + writer->pos * kind;
2355
0
    if (kind == PyUnicode_1BYTE_KIND) {
2356
0
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1,
2357
0
                                 str, str + size,
2358
0
                                 data);
2359
0
    }
2360
0
    else if (kind == PyUnicode_2BYTE_KIND) {
2361
0
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2,
2362
0
                                 str, str + size,
2363
0
                                 data);
2364
0
    }
2365
0
    else {
2366
0
        memcpy(data, str, size * sizeof(Py_UCS4));
2367
0
    }
2368
0
    writer->pos += size;
2369
2370
0
    return 0;
2371
0
}
2372
2373
2374
PyObject*
2375
PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2376
2.46M
{
2377
2.46M
    if (size < 0) {
2378
0
        PyErr_SetString(PyExc_ValueError, "size must be positive");
2379
0
        return NULL;
2380
0
    }
2381
2.46M
    switch (kind) {
2382
217k
    case PyUnicode_1BYTE_KIND:
2383
217k
        return _PyUnicode_FromUCS1(buffer, size);
2384
371k
    case PyUnicode_2BYTE_KIND:
2385
371k
        return _PyUnicode_FromUCS2(buffer, size);
2386
1.87M
    case PyUnicode_4BYTE_KIND:
2387
1.87M
        return _PyUnicode_FromUCS4(buffer, size);
2388
0
    default:
2389
0
        PyErr_SetString(PyExc_SystemError, "invalid kind");
2390
0
        return NULL;
2391
2.46M
    }
2392
2.46M
}
2393
2394
Py_UCS4
2395
_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2396
1.53M
{
2397
1.53M
    int kind;
2398
1.53M
    const void *startptr, *endptr;
2399
2400
1.53M
    assert(0 <= start);
2401
1.53M
    assert(end <= PyUnicode_GET_LENGTH(unicode));
2402
1.53M
    assert(start <= end);
2403
2404
1.53M
    if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2405
0
        return PyUnicode_MAX_CHAR_VALUE(unicode);
2406
2407
1.53M
    if (start == end)
2408
0
        return 127;
2409
2410
1.53M
    if (PyUnicode_IS_ASCII(unicode))
2411
1.39M
        return 127;
2412
2413
141k
    kind = PyUnicode_KIND(unicode);
2414
141k
    startptr = PyUnicode_DATA(unicode);
2415
141k
    endptr = (char *)startptr + end * kind;
2416
141k
    startptr = (char *)startptr + start * kind;
2417
141k
    switch(kind) {
2418
4.23k
    case PyUnicode_1BYTE_KIND:
2419
4.23k
        return ucs1lib_find_max_char(startptr, endptr);
2420
12.8k
    case PyUnicode_2BYTE_KIND:
2421
12.8k
        return ucs2lib_find_max_char(startptr, endptr);
2422
124k
    case PyUnicode_4BYTE_KIND:
2423
124k
        return ucs4lib_find_max_char(startptr, endptr);
2424
0
    default:
2425
0
        Py_UNREACHABLE();
2426
141k
    }
2427
141k
}
2428
2429
/* Ensure that a string uses the most efficient storage, if it is not the
2430
   case: create a new string with of the right kind. Write NULL into *p_unicode
2431
   on error. */
2432
static void
2433
unicode_adjust_maxchar(PyObject **p_unicode)
2434
0
{
2435
0
    PyObject *unicode, *copy;
2436
0
    Py_UCS4 max_char;
2437
0
    Py_ssize_t len;
2438
0
    int kind;
2439
2440
0
    assert(p_unicode != NULL);
2441
0
    unicode = *p_unicode;
2442
0
    if (PyUnicode_IS_ASCII(unicode))
2443
0
        return;
2444
2445
0
    len = PyUnicode_GET_LENGTH(unicode);
2446
0
    kind = PyUnicode_KIND(unicode);
2447
0
    if (kind == PyUnicode_1BYTE_KIND) {
2448
0
        const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2449
0
        max_char = ucs1lib_find_max_char(u, u + len);
2450
0
        if (max_char >= 128)
2451
0
            return;
2452
0
    }
2453
0
    else if (kind == PyUnicode_2BYTE_KIND) {
2454
0
        const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2455
0
        max_char = ucs2lib_find_max_char(u, u + len);
2456
0
        if (max_char >= 256)
2457
0
            return;
2458
0
    }
2459
0
    else if (kind == PyUnicode_4BYTE_KIND) {
2460
0
        const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2461
0
        max_char = ucs4lib_find_max_char(u, u + len);
2462
0
        if (max_char >= 0x10000)
2463
0
            return;
2464
0
    }
2465
0
    else
2466
0
        Py_UNREACHABLE();
2467
2468
0
    copy = PyUnicode_New(len, max_char);
2469
0
    if (copy != NULL)
2470
0
        _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2471
0
    Py_DECREF(unicode);
2472
0
    *p_unicode = copy;
2473
0
}
2474
2475
/* Return a new string with the same length, maximum character value and
   character data as `unicode`.

   Raises via PyErr_BadInternalCall() when `unicode` is not a str.
   Return NULL with an exception set on failure. */
PyObject*
_PyUnicode_Copy(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadInternalCall();
        return NULL;
    }

    Py_ssize_t nchars = PyUnicode_GET_LENGTH(unicode);
    PyObject *dup = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(unicode));
    if (dup == NULL) {
        return NULL;
    }
    assert(PyUnicode_KIND(dup) == PyUnicode_KIND(unicode));

    /* Same kind on both sides, so the raw bytes can be copied as they are. */
    memcpy(PyUnicode_DATA(dup), PyUnicode_DATA(unicode),
           nchars * PyUnicode_KIND(unicode));
    assert(_PyUnicode_CheckConsistency(dup, 1));
    return dup;
}
2497
2498
2499
/* Widen Unicode objects to larger buffers. Don't write terminating null
2500
   character. Return NULL on error. */
2501
2502
static void*
2503
unicode_askind(int skind, void const *data, Py_ssize_t len, int kind)
2504
17.3k
{
2505
17.3k
    void *result;
2506
2507
17.3k
    assert(skind < kind);
2508
17.3k
    switch (kind) {
2509
11.0k
    case PyUnicode_2BYTE_KIND:
2510
11.0k
        result = PyMem_New(Py_UCS2, len);
2511
11.0k
        if (!result)
2512
0
            return PyErr_NoMemory();
2513
11.0k
        assert(skind == PyUnicode_1BYTE_KIND);
2514
11.0k
        _PyUnicode_CONVERT_BYTES(
2515
11.0k
            Py_UCS1, Py_UCS2,
2516
11.0k
            (const Py_UCS1 *)data,
2517
11.0k
            ((const Py_UCS1 *)data) + len,
2518
11.0k
            result);
2519
11.0k
        return result;
2520
6.37k
    case PyUnicode_4BYTE_KIND:
2521
6.37k
        result = PyMem_New(Py_UCS4, len);
2522
6.37k
        if (!result)
2523
0
            return PyErr_NoMemory();
2524
6.37k
        if (skind == PyUnicode_2BYTE_KIND) {
2525
0
            _PyUnicode_CONVERT_BYTES(
2526
0
                Py_UCS2, Py_UCS4,
2527
0
                (const Py_UCS2 *)data,
2528
0
                ((const Py_UCS2 *)data) + len,
2529
0
                result);
2530
0
        }
2531
6.37k
        else {
2532
6.37k
            assert(skind == PyUnicode_1BYTE_KIND);
2533
6.37k
            _PyUnicode_CONVERT_BYTES(
2534
6.37k
                Py_UCS1, Py_UCS4,
2535
6.37k
                (const Py_UCS1 *)data,
2536
6.37k
                ((const Py_UCS1 *)data) + len,
2537
6.37k
                result);
2538
6.37k
        }
2539
6.37k
        return result;
2540
0
    default:
2541
0
        Py_UNREACHABLE();
2542
0
        return NULL;
2543
17.3k
    }
2544
17.3k
}
2545
2546
static Py_UCS4*
2547
as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2548
        int copy_null)
2549
0
{
2550
0
    int kind;
2551
0
    const void *data;
2552
0
    Py_ssize_t len, targetlen;
2553
0
    kind = PyUnicode_KIND(string);
2554
0
    data = PyUnicode_DATA(string);
2555
0
    len = PyUnicode_GET_LENGTH(string);
2556
0
    targetlen = len;
2557
0
    if (copy_null)
2558
0
        targetlen++;
2559
0
    if (!target) {
2560
0
        target = PyMem_New(Py_UCS4, targetlen);
2561
0
        if (!target) {
2562
0
            PyErr_NoMemory();
2563
0
            return NULL;
2564
0
        }
2565
0
    }
2566
0
    else {
2567
0
        if (targetsize < targetlen) {
2568
0
            PyErr_Format(PyExc_SystemError,
2569
0
                         "string is longer than the buffer");
2570
0
            if (copy_null && 0 < targetsize)
2571
0
                target[0] = 0;
2572
0
            return NULL;
2573
0
        }
2574
0
    }
2575
0
    if (kind == PyUnicode_1BYTE_KIND) {
2576
0
        const Py_UCS1 *start = (const Py_UCS1 *) data;
2577
0
        _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2578
0
    }
2579
0
    else if (kind == PyUnicode_2BYTE_KIND) {
2580
0
        const Py_UCS2 *start = (const Py_UCS2 *) data;
2581
0
        _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2582
0
    }
2583
0
    else if (kind == PyUnicode_4BYTE_KIND) {
2584
0
        memcpy(target, data, len * sizeof(Py_UCS4));
2585
0
    }
2586
0
    else {
2587
0
        Py_UNREACHABLE();
2588
0
    }
2589
0
    if (copy_null)
2590
0
        target[len] = 0;
2591
0
    return target;
2592
0
}
2593
2594
Py_UCS4*
2595
PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2596
                 int copy_null)
2597
0
{
2598
0
    if (target == NULL || targetsize < 0) {
2599
0
        PyErr_BadInternalCall();
2600
0
        return NULL;
2601
0
    }
2602
0
    return as_ucs4(string, target, targetsize, copy_null);
2603
0
}
2604
2605
Py_UCS4*
2606
PyUnicode_AsUCS4Copy(PyObject *string)
2607
0
{
2608
0
    return as_ucs4(string, NULL, 0, 1);
2609
0
}
2610
2611
/* maximum number of characters required for output of %jo or %jd or %p.
2612
   We need at most ceil(log8(256)*sizeof(intmax_t)) digits,
2613
   plus 1 for the sign, plus 2 for the 0x prefix (for %p),
2614
   plus 1 for the terminal NUL. */
2615
/* (bits-1)/3 + 1 equals ceil(bits/3), the octal digit count.  The constant 5
   is therefore that +1, plus 1 for the sign, 2 for a "0x" prefix and 1 for
   the terminating NUL.  With an 8-byte intmax_t this gives 5 + 63/3 = 26. */
#define MAX_INTMAX_CHARS (5 + (sizeof(intmax_t)*8-1) / 3)
2616
2617
static int
2618
unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2619
                             Py_ssize_t width, Py_ssize_t precision, int flags)
2620
85.2k
{
2621
85.2k
    Py_ssize_t length, fill, arglen;
2622
85.2k
    Py_UCS4 maxchar;
2623
2624
85.2k
    length = PyUnicode_GET_LENGTH(str);
2625
85.2k
    if ((precision == -1 || precision >= length)
2626
84.6k
        && width <= length)
2627
84.6k
        return _PyUnicodeWriter_WriteStr(writer, str);
2628
2629
690
    if (precision != -1)
2630
690
        length = Py_MIN(precision, length);
2631
2632
690
    arglen = Py_MAX(length, width);
2633
690
    if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2634
326
        maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2635
364
    else
2636
364
        maxchar = writer->maxchar;
2637
2638
690
    if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2639
0
        return -1;
2640
2641
690
    fill = Py_MAX(width - length, 0);
2642
690
    if (fill && !(flags & F_LJUST)) {
2643
0
        if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2644
0
            return -1;
2645
0
        writer->pos += fill;
2646
0
    }
2647
2648
690
    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2649
690
                                  str, 0, length);
2650
690
    writer->pos += length;
2651
2652
690
    if (fill && (flags & F_LJUST)) {
2653
0
        if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2654
0
            return -1;
2655
0
        writer->pos += fill;
2656
0
    }
2657
2658
690
    return 0;
2659
690
}
2660
2661
static int
2662
unicode_fromformat_write_utf8(_PyUnicodeWriter *writer, const char *str,
2663
                              Py_ssize_t width, Py_ssize_t precision, int flags)
2664
233k
{
2665
    /* UTF-8 */
2666
233k
    Py_ssize_t *pconsumed = NULL;
2667
233k
    Py_ssize_t length;
2668
233k
    if (precision == -1) {
2669
89.1k
        length = strlen(str);
2670
89.1k
    }
2671
144k
    else {
2672
144k
        length = 0;
2673
2.25M
        while (length < precision && str[length]) {
2674
2.10M
            length++;
2675
2.10M
        }
2676
144k
        if (length == precision) {
2677
            /* The input string is not NUL-terminated.  If it ends with an
2678
             * incomplete UTF-8 sequence, truncate the string just before it.
2679
             * Incomplete sequences in the middle and sequences which cannot
2680
             * be valid prefixes are still treated as errors and replaced
2681
             * with \xfffd. */
2682
2.51k
            pconsumed = &length;
2683
2.51k
        }
2684
144k
    }
2685
2686
233k
    if (width < 0) {
2687
233k
        return unicode_decode_utf8_writer(writer, str, length,
2688
233k
                                          _Py_ERROR_REPLACE, "replace", pconsumed);
2689
233k
    }
2690
2691
0
    PyObject *unicode = PyUnicode_DecodeUTF8Stateful(str, length,
2692
0
                                                     "replace", pconsumed);
2693
0
    if (unicode == NULL)
2694
0
        return -1;
2695
2696
0
    int res = unicode_fromformat_write_str(writer, unicode,
2697
0
                                           width, -1, flags);
2698
0
    Py_DECREF(unicode);
2699
0
    return res;
2700
0
}
2701
2702
static int
2703
unicode_fromformat_write_wcstr(_PyUnicodeWriter *writer, const wchar_t *str,
2704
                              Py_ssize_t width, Py_ssize_t precision, int flags)
2705
0
{
2706
0
    Py_ssize_t length;
2707
0
    if (precision == -1) {
2708
0
        length = wcslen(str);
2709
0
    }
2710
0
    else {
2711
0
        length = 0;
2712
0
        while (length < precision && str[length]) {
2713
0
            length++;
2714
0
        }
2715
0
    }
2716
2717
0
    if (width < 0) {
2718
0
        return PyUnicodeWriter_WriteWideChar((PyUnicodeWriter*)writer,
2719
0
                                             str, length);
2720
0
    }
2721
2722
0
    PyObject *unicode = PyUnicode_FromWideChar(str, length);
2723
0
    if (unicode == NULL)
2724
0
        return -1;
2725
2726
0
    int res = unicode_fromformat_write_str(writer, unicode, width, -1, flags);
2727
0
    Py_DECREF(unicode);
2728
0
    return res;
2729
0
}
2730
2731
0
/* Size modifiers that may precede the conversion character.
   unicode_fromformat_arg() stores one of these in `sizemod`;
   0 means that no size modifier was given. */
#define F_LONG      1   /* "l"  */
#define F_LONGLONG  2   /* "ll" */
#define F_SIZE      3   /* "z"  */
#define F_PTRDIFF   4   /* "t"  */
#define F_INTMAX    5   /* "j"  */
2736
2737
static const char*
2738
unicode_fromformat_arg(_PyUnicodeWriter *writer,
2739
                       const char *f, va_list *vargs)
2740
388k
{
2741
388k
    const char *p;
2742
388k
    Py_ssize_t len;
2743
388k
    int flags = 0;
2744
388k
    Py_ssize_t width;
2745
388k
    Py_ssize_t precision;
2746
2747
388k
    p = f;
2748
388k
    f++;
2749
388k
    if (*f == '%') {
2750
0
        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
2751
0
            return NULL;
2752
0
        f++;
2753
0
        return f;
2754
0
    }
2755
2756
    /* Parse flags. Example: "%-i" => flags=F_LJUST. */
2757
    /* Flags '+', ' ' and '#' are not particularly useful.
2758
     * They are not worth the implementation and maintenance costs.
2759
     * In addition, '#' should add "0" for "o" conversions for compatibility
2760
     * with printf, but it would confuse Python users. */
2761
390k
    while (1) {
2762
390k
        switch (*f++) {
2763
0
        case '-': flags |= F_LJUST; continue;
2764
2.41k
        case '0': flags |= F_ZERO; continue;
2765
0
        case '#': flags |= F_ALT; continue;
2766
390k
        }
2767
388k
        f--;
2768
388k
        break;
2769
390k
    }
2770
2771
    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2772
388k
    width = -1;
2773
388k
    if (*f == '*') {
2774
0
        width = va_arg(*vargs, int);
2775
0
        if (width < 0) {
2776
0
            flags |= F_LJUST;
2777
0
            width = -width;
2778
0
        }
2779
0
        f++;
2780
0
    }
2781
388k
    else if (Py_ISDIGIT((unsigned)*f)) {
2782
2.41k
        width = *f - '0';
2783
2.41k
        f++;
2784
2.41k
        while (Py_ISDIGIT((unsigned)*f)) {
2785
0
            if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2786
0
                PyErr_SetString(PyExc_ValueError,
2787
0
                                "width too big");
2788
0
                return NULL;
2789
0
            }
2790
0
            width = (width * 10) + (*f - '0');
2791
0
            f++;
2792
0
        }
2793
2.41k
    }
2794
388k
    precision = -1;
2795
388k
    if (*f == '.') {
2796
147k
        f++;
2797
147k
        if (*f == '*') {
2798
0
            precision = va_arg(*vargs, int);
2799
0
            if (precision < 0) {
2800
0
                precision = -2;
2801
0
            }
2802
0
            f++;
2803
0
        }
2804
147k
        else if (Py_ISDIGIT((unsigned)*f)) {
2805
147k
            precision = (*f - '0');
2806
147k
            f++;
2807
437k
            while (Py_ISDIGIT((unsigned)*f)) {
2808
290k
                if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2809
0
                    PyErr_SetString(PyExc_ValueError,
2810
0
                                    "precision too big");
2811
0
                    return NULL;
2812
0
                }
2813
290k
                precision = (precision * 10) + (*f - '0');
2814
290k
                f++;
2815
290k
            }
2816
147k
        }
2817
147k
    }
2818
2819
388k
    int sizemod = 0;
2820
388k
    if (*f == 'l') {
2821
0
        if (f[1] == 'l') {
2822
0
            sizemod = F_LONGLONG;
2823
0
            f += 2;
2824
0
        }
2825
0
        else {
2826
0
            sizemod = F_LONG;
2827
0
            ++f;
2828
0
        }
2829
0
    }
2830
388k
    else if (*f == 'z') {
2831
36.3k
        sizemod = F_SIZE;
2832
36.3k
        ++f;
2833
36.3k
    }
2834
352k
    else if (*f == 't') {
2835
0
        sizemod = F_PTRDIFF;
2836
0
        ++f;
2837
0
    }
2838
352k
    else if (*f == 'j') {
2839
0
        sizemod = F_INTMAX;
2840
0
        ++f;
2841
0
    }
2842
388k
    if (f[0] != '\0' && f[1] == '\0')
2843
51.1k
        writer->overallocate = 0;
2844
2845
388k
    switch (*f) {
2846
49.1k
    case 'd': case 'i': case 'o': case 'u': case 'x': case 'X':
2847
49.1k
        break;
2848
20.7k
    case 'c': case 'p':
2849
20.7k
        if (sizemod || width >= 0 || precision >= 0) goto invalid_format;
2850
20.7k
        break;
2851
233k
    case 's':
2852
233k
    case 'V':
2853
233k
        if (sizemod && sizemod != F_LONG) goto invalid_format;
2854
233k
        break;
2855
233k
    default:
2856
85.2k
        if (sizemod) goto invalid_format;
2857
85.2k
        break;
2858
388k
    }
2859
2860
388k
    switch (*f) {
2861
20.7k
    case 'c':
2862
20.7k
    {
2863
20.7k
        int ordinal = va_arg(*vargs, int);
2864
20.7k
        if (ordinal < 0 || ordinal > MAX_UNICODE) {
2865
0
            PyErr_SetString(PyExc_OverflowError,
2866
0
                            "character argument not in range(0x110000)");
2867
0
            return NULL;
2868
0
        }
2869
20.7k
        if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
2870
0
            return NULL;
2871
20.7k
        break;
2872
20.7k
    }
2873
2874
46.6k
    case 'd': case 'i':
2875
49.1k
    case 'o': case 'u': case 'x': case 'X':
2876
49.1k
    {
2877
49.1k
        char buffer[MAX_INTMAX_CHARS];
2878
2879
        // Fill buffer using sprinf, with one of many possible format
2880
        // strings, like "%llX" for `long long` in hexadecimal.
2881
        // The type/size is in `sizemod`; the format is in `*f`.
2882
2883
        // Use macros with nested switches to keep the sprintf format strings
2884
        // as compile-time literals, avoiding warnings and maybe allowing
2885
        // optimizations.
2886
2887
        // `SPRINT` macro does one sprintf
2888
        // Example usage: SPRINT("l", "X", unsigned long) expands to
2889
        // sprintf(buffer, "%" "l" "X", va_arg(*vargs, unsigned long))
2890
49.1k
        #define SPRINT(SIZE_SPEC, FMT_CHAR, TYPE) \
2891
49.1k
            sprintf(buffer, "%" SIZE_SPEC FMT_CHAR, va_arg(*vargs, TYPE))
2892
2893
        // One inner switch to handle all format variants
2894
49.1k
        #define DO_SPRINTS(SIZE_SPEC, SIGNED_TYPE, UNSIGNED_TYPE)             \
2895
49.1k
            switch (*f) {                                                     \
2896
0
                case 'o': len = SPRINT(SIZE_SPEC, "o", UNSIGNED_TYPE); break; \
2897
0
                case 'u': len = SPRINT(SIZE_SPEC, "u", UNSIGNED_TYPE); break; \
2898
1.51k
                case 'x': len = SPRINT(SIZE_SPEC, "x", UNSIGNED_TYPE); break; \
2899
895
                case 'X': len = SPRINT(SIZE_SPEC, "X", UNSIGNED_TYPE); break; \
2900
46.6k
                default:  len = SPRINT(SIZE_SPEC, "d", SIGNED_TYPE); break;   \
2901
49.1k
            }
2902
2903
        // Outer switch to handle all the sizes/types
2904
49.1k
        switch (sizemod) {
2905
0
            case F_LONG:     DO_SPRINTS("l", long, unsigned long); break;
2906
0
            case F_LONGLONG: DO_SPRINTS("ll", long long, unsigned long long); break;
2907
36.3k
            case F_SIZE:     DO_SPRINTS("z", Py_ssize_t, size_t); break;
2908
0
            case F_PTRDIFF:  DO_SPRINTS("t", ptrdiff_t, ptrdiff_t); break;
2909
0
            case F_INTMAX:   DO_SPRINTS("j", intmax_t, uintmax_t); break;
2910
12.7k
            default:         DO_SPRINTS("", int, unsigned int); break;
2911
49.1k
        }
2912
49.1k
        #undef SPRINT
2913
49.1k
        #undef DO_SPRINTS
2914
2915
49.1k
        assert(len >= 0);
2916
2917
49.1k
        int sign = (buffer[0] == '-');
2918
49.1k
        len -= sign;
2919
2920
49.1k
        precision = Py_MAX(precision, len);
2921
49.1k
        width = Py_MAX(width, precision + sign);
2922
49.1k
        if ((flags & F_ZERO) && !(flags & F_LJUST)) {
2923
2.41k
            precision = width - sign;
2924
2.41k
        }
2925
2926
49.1k
        Py_ssize_t spacepad = Py_MAX(width - precision - sign, 0);
2927
49.1k
        Py_ssize_t zeropad = Py_MAX(precision - len, 0);
2928
2929
49.1k
        if (_PyUnicodeWriter_Prepare(writer, width, 127) == -1)
2930
0
            return NULL;
2931
2932
49.1k
        if (spacepad && !(flags & F_LJUST)) {
2933
0
            if (PyUnicode_Fill(writer->buffer, writer->pos, spacepad, ' ') == -1)
2934
0
                return NULL;
2935
0
            writer->pos += spacepad;
2936
0
        }
2937
2938
49.1k
        if (sign) {
2939
0
            if (_PyUnicodeWriter_WriteChar(writer, '-') == -1)
2940
0
                return NULL;
2941
0
        }
2942
2943
49.1k
        if (zeropad) {
2944
691
            if (PyUnicode_Fill(writer->buffer, writer->pos, zeropad, '0') == -1)
2945
0
                return NULL;
2946
691
            writer->pos += zeropad;
2947
691
        }
2948
2949
49.1k
        if (_PyUnicodeWriter_WriteASCIIString(writer, &buffer[sign], len) < 0)
2950
0
            return NULL;
2951
2952
49.1k
        if (spacepad && (flags & F_LJUST)) {
2953
0
            if (PyUnicode_Fill(writer->buffer, writer->pos, spacepad, ' ') == -1)
2954
0
                return NULL;
2955
0
            writer->pos += spacepad;
2956
0
        }
2957
49.1k
        break;
2958
49.1k
    }
2959
2960
49.1k
    case 'p':
2961
0
    {
2962
0
        char number[MAX_INTMAX_CHARS];
2963
2964
0
        len = sprintf(number, "%p", va_arg(*vargs, void*));
2965
0
        assert(len >= 0);
2966
2967
        /* %p is ill-defined:  ensure leading 0x. */
2968
0
        if (number[1] == 'X')
2969
0
            number[1] = 'x';
2970
0
        else if (number[1] != 'x') {
2971
0
            memmove(number + 2, number,
2972
0
                    strlen(number) + 1);
2973
0
            number[0] = '0';
2974
0
            number[1] = 'x';
2975
0
            len += 2;
2976
0
        }
2977
2978
0
        if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
2979
0
            return NULL;
2980
0
        break;
2981
0
    }
2982
2983
233k
    case 's':
2984
233k
    {
2985
233k
        if (sizemod) {
2986
0
            const wchar_t *s = va_arg(*vargs, const wchar_t*);
2987
0
            if (unicode_fromformat_write_wcstr(writer, s, width, precision, flags) < 0)
2988
0
                return NULL;
2989
0
        }
2990
233k
        else {
2991
            /* UTF-8 */
2992
233k
            const char *s = va_arg(*vargs, const char*);
2993
233k
            if (unicode_fromformat_write_utf8(writer, s, width, precision, flags) < 0)
2994
0
                return NULL;
2995
233k
        }
2996
233k
        break;
2997
233k
    }
2998
2999
233k
    case 'U':
3000
79.8k
    {
3001
79.8k
        PyObject *obj = va_arg(*vargs, PyObject *);
3002
79.8k
        assert(obj && _PyUnicode_CHECK(obj));
3003
3004
79.8k
        if (unicode_fromformat_write_str(writer, obj, width, precision, flags) == -1)
3005
0
            return NULL;
3006
79.8k
        break;
3007
79.8k
    }
3008
3009
79.8k
    case 'V':
3010
0
    {
3011
0
        PyObject *obj = va_arg(*vargs, PyObject *);
3012
0
        const char *str;
3013
0
        const wchar_t *wstr;
3014
0
        if (sizemod) {
3015
0
            wstr = va_arg(*vargs, const wchar_t*);
3016
0
        }
3017
0
        else {
3018
0
            str = va_arg(*vargs, const char *);
3019
0
        }
3020
0
        if (obj) {
3021
0
            assert(_PyUnicode_CHECK(obj));
3022
0
            if (unicode_fromformat_write_str(writer, obj, width, precision, flags) == -1)
3023
0
                return NULL;
3024
0
        }
3025
0
        else if (sizemod) {
3026
0
            assert(wstr != NULL);
3027
0
            if (unicode_fromformat_write_wcstr(writer, wstr, width, precision, flags) < 0)
3028
0
                return NULL;
3029
0
        }
3030
0
        else {
3031
0
            assert(str != NULL);
3032
0
            if (unicode_fromformat_write_utf8(writer, str, width, precision, flags) < 0)
3033
0
                return NULL;
3034
0
        }
3035
0
        break;
3036
0
    }
3037
3038
3
    case 'S':
3039
3
    {
3040
3
        PyObject *obj = va_arg(*vargs, PyObject *);
3041
3
        PyObject *str;
3042
3
        assert(obj);
3043
3
        str = PyObject_Str(obj);
3044
3
        if (!str)
3045
0
            return NULL;
3046
3
        if (unicode_fromformat_write_str(writer, str, width, precision, flags) == -1) {
3047
0
            Py_DECREF(str);
3048
0
            return NULL;
3049
0
        }
3050
3
        Py_DECREF(str);
3051
3
        break;
3052
3
    }
3053
3054
5.42k
    case 'R':
3055
5.42k
    {
3056
5.42k
        PyObject *obj = va_arg(*vargs, PyObject *);
3057
5.42k
        PyObject *repr;
3058
5.42k
        assert(obj);
3059
5.42k
        repr = PyObject_Repr(obj);
3060
5.42k
        if (!repr)
3061
0
            return NULL;
3062
5.42k
        if (unicode_fromformat_write_str(writer, repr, width, precision, flags) == -1) {
3063
0
            Py_DECREF(repr);
3064
0
            return NULL;
3065
0
        }
3066
5.42k
        Py_DECREF(repr);
3067
5.42k
        break;
3068
5.42k
    }
3069
3070
0
    case 'A':
3071
0
    {
3072
0
        PyObject *obj = va_arg(*vargs, PyObject *);
3073
0
        PyObject *ascii;
3074
0
        assert(obj);
3075
0
        ascii = PyObject_ASCII(obj);
3076
0
        if (!ascii)
3077
0
            return NULL;
3078
0
        if (unicode_fromformat_write_str(writer, ascii, width, precision, flags) == -1) {
3079
0
            Py_DECREF(ascii);
3080
0
            return NULL;
3081
0
        }
3082
0
        Py_DECREF(ascii);
3083
0
        break;
3084
0
    }
3085
3086
0
    case 'T':
3087
0
    {
3088
0
        PyObject *obj = va_arg(*vargs, PyObject *);
3089
0
        PyTypeObject *type = (PyTypeObject *)Py_NewRef(Py_TYPE(obj));
3090
3091
0
        PyObject *type_name;
3092
0
        if (flags & F_ALT) {
3093
0
            type_name = _PyType_GetFullyQualifiedName(type, ':');
3094
0
        }
3095
0
        else {
3096
0
            type_name = PyType_GetFullyQualifiedName(type);
3097
0
        }
3098
0
        Py_DECREF(type);
3099
0
        if (!type_name) {
3100
0
            return NULL;
3101
0
        }
3102
3103
0
        if (unicode_fromformat_write_str(writer, type_name,
3104
0
                                         width, precision, flags) == -1) {
3105
0
            Py_DECREF(type_name);
3106
0
            return NULL;
3107
0
        }
3108
0
        Py_DECREF(type_name);
3109
0
        break;
3110
0
    }
3111
3112
0
    case 'N':
3113
0
    {
3114
0
        PyObject *type_raw = va_arg(*vargs, PyObject *);
3115
0
        assert(type_raw != NULL);
3116
3117
0
        if (!PyType_Check(type_raw)) {
3118
0
            PyErr_SetString(PyExc_TypeError, "%N argument must be a type");
3119
0
            return NULL;
3120
0
        }
3121
0
        PyTypeObject *type = (PyTypeObject*)type_raw;
3122
3123
0
        PyObject *type_name;
3124
0
        if (flags & F_ALT) {
3125
0
            type_name = _PyType_GetFullyQualifiedName(type, ':');
3126
0
        }
3127
0
        else {
3128
0
            type_name = PyType_GetFullyQualifiedName(type);
3129
0
        }
3130
0
        if (!type_name) {
3131
0
            return NULL;
3132
0
        }
3133
0
        if (unicode_fromformat_write_str(writer, type_name,
3134
0
                                         width, precision, flags) == -1) {
3135
0
            Py_DECREF(type_name);
3136
0
            return NULL;
3137
0
        }
3138
0
        Py_DECREF(type_name);
3139
0
        break;
3140
0
    }
3141
3142
0
    default:
3143
0
    invalid_format:
3144
0
        PyErr_Format(PyExc_SystemError, "invalid format string: %s", p);
3145
0
        return NULL;
3146
388k
    }
3147
3148
388k
    f++;
3149
388k
    return f;
3150
388k
}
3151
3152
static int
3153
unicode_from_format(_PyUnicodeWriter *writer, const char *format, va_list vargs)
3154
230k
{
3155
230k
    Py_ssize_t len = strlen(format);
3156
230k
    writer->min_length += len + 100;
3157
230k
    writer->overallocate = 1;
3158
3159
    // Copy varags to be able to pass a reference to a subfunction.
3160
230k
    va_list vargs2;
3161
230k
    va_copy(vargs2, vargs);
3162
3163
    // _PyUnicodeWriter_WriteASCIIString() below requires the format string
3164
    // to be encoded to ASCII.
3165
230k
    int is_ascii = (ucs1lib_find_max_char((Py_UCS1*)format, (Py_UCS1*)format + len) < 128);
3166
230k
    if (!is_ascii) {
3167
0
        Py_ssize_t i;
3168
0
        for (i=0; i < len && (unsigned char)format[i] <= 127; i++);
3169
0
        PyErr_Format(PyExc_ValueError,
3170
0
            "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3171
0
            "string, got a non-ASCII byte: 0x%02x",
3172
0
            (unsigned char)format[i]);
3173
0
        goto fail;
3174
0
    }
3175
3176
1.12M
    for (const char *f = format; *f; ) {
3177
895k
        if (*f == '%') {
3178
388k
            f = unicode_fromformat_arg(writer, f, &vargs2);
3179
388k
            if (f == NULL)
3180
0
                goto fail;
3181
388k
        }
3182
507k
        else {
3183
507k
            const char *p = strchr(f, '%');
3184
507k
            if (p != NULL) {
3185
328k
                len = p - f;
3186
328k
            }
3187
179k
            else {
3188
179k
                len = strlen(f);
3189
179k
                writer->overallocate = 0;
3190
179k
            }
3191
3192
507k
            if (_PyUnicodeWriter_WriteASCIIString(writer, f, len) < 0) {
3193
0
                goto fail;
3194
0
            }
3195
507k
            f += len;
3196
507k
        }
3197
895k
    }
3198
230k
    va_end(vargs2);
3199
230k
    return 0;
3200
3201
0
  fail:
3202
0
    va_end(vargs2);
3203
0
    return -1;
3204
230k
}
3205
3206
PyObject *
3207
PyUnicode_FromFormatV(const char *format, va_list vargs)
3208
230k
{
3209
230k
    _PyUnicodeWriter writer;
3210
230k
    _PyUnicodeWriter_Init(&writer);
3211
3212
230k
    if (unicode_from_format(&writer, format, vargs) < 0) {
3213
0
        _PyUnicodeWriter_Dealloc(&writer);
3214
0
        return NULL;
3215
0
    }
3216
230k
    return _PyUnicodeWriter_Finish(&writer);
3217
230k
}
3218
3219
/* Variadic front end of PyUnicode_FromFormatV(): build a new str object
   from a printf-like `format` and the arguments that follow it. */
PyObject *
PyUnicode_FromFormat(const char *format, ...)
{
    va_list args;

    va_start(args, format);
    PyObject *result = PyUnicode_FromFormatV(format, args);
    va_end(args);

    return result;
}
3230
3231
int
3232
PyUnicodeWriter_Format(PyUnicodeWriter *writer, const char *format, ...)
3233
0
{
3234
0
    va_list vargs;
3235
0
    va_start(vargs, format);
3236
0
    int res = _PyUnicodeWriter_FormatV(writer, format, vargs);
3237
0
    va_end(vargs);
3238
0
    return res;
3239
0
}
3240
3241
int
3242
_PyUnicodeWriter_FormatV(PyUnicodeWriter *writer, const char *format,
3243
                         va_list vargs)
3244
0
{
3245
0
    _PyUnicodeWriter *_writer = (_PyUnicodeWriter*)writer;
3246
0
    Py_ssize_t old_pos = _writer->pos;
3247
3248
0
    int res = unicode_from_format(_writer, format, vargs);
3249
3250
0
    if (res < 0) {
3251
0
        _writer->pos = old_pos;
3252
0
    }
3253
0
    return res;
3254
0
}
3255
3256
static Py_ssize_t
3257
unicode_get_widechar_size(PyObject *unicode)
3258
2.23k
{
3259
2.23k
    Py_ssize_t res;
3260
3261
2.23k
    assert(unicode != NULL);
3262
2.23k
    assert(_PyUnicode_CHECK(unicode));
3263
3264
2.23k
    res = _PyUnicode_LENGTH(unicode);
3265
#if SIZEOF_WCHAR_T == 2
3266
    if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3267
        const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3268
        const Py_UCS4 *end = s + res;
3269
        for (; s < end; ++s) {
3270
            if (*s > 0xFFFF) {
3271
                ++res;
3272
            }
3273
        }
3274
    }
3275
#endif
3276
0
    return res;
3277
2.23k
}
3278
3279
static void
3280
unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3281
2.23k
{
3282
2.23k
    assert(unicode != NULL);
3283
2.23k
    assert(_PyUnicode_CHECK(unicode));
3284
3285
4.47k
    if (PyUnicode_KIND(unicode) == sizeof(wchar_t)) {
3286
0
        memcpy(w, PyUnicode_DATA(unicode), size * sizeof(wchar_t));
3287
0
        return;
3288
0
    }
3289
3290
4.47k
    if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3291
2.23k
        const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3292
115k
        for (; size--; ++s, ++w) {
3293
113k
            *w = *s;
3294
113k
        }
3295
2.23k
    }
3296
0
    else {
3297
0
#if SIZEOF_WCHAR_T == 4
3298
0
        assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3299
0
        const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3300
0
        for (; size--; ++s, ++w) {
3301
0
            *w = *s;
3302
0
        }
3303
#else
3304
        assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3305
        const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3306
        for (; size--; ++s, ++w) {
3307
            Py_UCS4 ch = *s;
3308
            if (ch > 0xFFFF) {
3309
                assert(ch <= MAX_UNICODE);
3310
                /* encode surrogate pair in this case */
3311
                *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3312
                if (!size--)
3313
                    break;
3314
                *w = Py_UNICODE_LOW_SURROGATE(ch);
3315
            }
3316
            else {
3317
                *w = ch;
3318
            }
3319
        }
3320
#endif
3321
0
    }
3322
2.23k
}
3323
3324
#ifdef HAVE_WCHAR_H
3325
3326
/* Convert a Unicode object to a wide character string.
3327
3328
   - If w is NULL: return the number of wide characters (including the null
3329
     character) required to convert the unicode object. Ignore size argument.
3330
3331
   - Otherwise: return the number of wide characters (excluding the null
3332
     character) written into w. Write at most size wide characters (including
3333
     the null character). */
3334
Py_ssize_t
3335
PyUnicode_AsWideChar(PyObject *unicode,
3336
                     wchar_t *w,
3337
                     Py_ssize_t size)
3338
279
{
3339
279
    Py_ssize_t res;
3340
3341
279
    if (unicode == NULL) {
3342
0
        PyErr_BadInternalCall();
3343
0
        return -1;
3344
0
    }
3345
279
    if (!PyUnicode_Check(unicode)) {
3346
0
        PyErr_BadArgument();
3347
0
        return -1;
3348
0
    }
3349
3350
279
    res = unicode_get_widechar_size(unicode);
3351
279
    if (w == NULL) {
3352
0
        return res + 1;
3353
0
    }
3354
3355
279
    if (size > res) {
3356
279
        size = res + 1;
3357
279
    }
3358
0
    else {
3359
0
        res = size;
3360
0
    }
3361
279
    unicode_copy_as_widechar(unicode, w, size);
3362
3363
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3364
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
3365
       non-Unicode locales and hence needs conversion first. */
3366
    if (_Py_LocaleUsesNonUnicodeWchar()) {
3367
        if (_Py_EncodeNonUnicodeWchar_InPlace(w, size) < 0) {
3368
            return -1;
3369
        }
3370
    }
3371
#endif
3372
3373
279
    return res;
3374
279
}
3375
3376
wchar_t*
3377
PyUnicode_AsWideCharString(PyObject *unicode,
3378
                           Py_ssize_t *size)
3379
1.95k
{
3380
1.95k
    wchar_t *buffer;
3381
1.95k
    Py_ssize_t buflen;
3382
3383
1.95k
    if (unicode == NULL) {
3384
0
        PyErr_BadInternalCall();
3385
0
        return NULL;
3386
0
    }
3387
1.95k
    if (!PyUnicode_Check(unicode)) {
3388
0
        PyErr_BadArgument();
3389
0
        return NULL;
3390
0
    }
3391
3392
1.95k
    buflen = unicode_get_widechar_size(unicode);
3393
1.95k
    buffer = (wchar_t *) PyMem_New(wchar_t, (buflen + 1));
3394
1.95k
    if (buffer == NULL) {
3395
0
        PyErr_NoMemory();
3396
0
        return NULL;
3397
0
    }
3398
1.95k
    unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3399
3400
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3401
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
3402
       non-Unicode locales and hence needs conversion first. */
3403
    if (_Py_LocaleUsesNonUnicodeWchar()) {
3404
        if (_Py_EncodeNonUnicodeWchar_InPlace(buffer, (buflen + 1)) < 0) {
3405
            return NULL;
3406
        }
3407
    }
3408
#endif
3409
3410
1.95k
    if (size != NULL) {
3411
1.23k
        *size = buflen;
3412
1.23k
    }
3413
726
    else if (wcslen(buffer) != (size_t)buflen) {
3414
0
        PyMem_Free(buffer);
3415
0
        PyErr_SetString(PyExc_ValueError,
3416
0
                        "embedded null character");
3417
0
        return NULL;
3418
0
    }
3419
1.95k
    return buffer;
3420
1.95k
}
3421
3422
#endif /* HAVE_WCHAR_H */
3423
3424
int
3425
_PyUnicode_WideCharString_Converter(PyObject *obj, void *ptr)
3426
0
{
3427
0
    wchar_t **p = (wchar_t **)ptr;
3428
0
    if (obj == NULL) {
3429
0
        PyMem_Free(*p);
3430
0
        *p = NULL;
3431
0
        return 1;
3432
0
    }
3433
0
    if (PyUnicode_Check(obj)) {
3434
0
        *p = PyUnicode_AsWideCharString(obj, NULL);
3435
0
        if (*p == NULL) {
3436
0
            return 0;
3437
0
        }
3438
0
        return Py_CLEANUP_SUPPORTED;
3439
0
    }
3440
0
    PyErr_Format(PyExc_TypeError,
3441
0
                 "argument must be str, not %.50s",
3442
0
                 Py_TYPE(obj)->tp_name);
3443
0
    return 0;
3444
0
}
3445
3446
int
3447
_PyUnicode_WideCharString_Opt_Converter(PyObject *obj, void *ptr)
3448
0
{
3449
0
    wchar_t **p = (wchar_t **)ptr;
3450
0
    if (obj == NULL) {
3451
0
        PyMem_Free(*p);
3452
0
        *p = NULL;
3453
0
        return 1;
3454
0
    }
3455
0
    if (obj == Py_None) {
3456
0
        *p = NULL;
3457
0
        return 1;
3458
0
    }
3459
0
    if (PyUnicode_Check(obj)) {
3460
0
        *p = PyUnicode_AsWideCharString(obj, NULL);
3461
0
        if (*p == NULL) {
3462
0
            return 0;
3463
0
        }
3464
0
        return Py_CLEANUP_SUPPORTED;
3465
0
    }
3466
0
    PyErr_Format(PyExc_TypeError,
3467
0
                 "argument must be str or None, not %.50s",
3468
0
                 Py_TYPE(obj)->tp_name);
3469
0
    return 0;
3470
0
}
3471
3472
PyObject *
3473
PyUnicode_FromOrdinal(int ordinal)
3474
426k
{
3475
426k
    if (ordinal < 0 || ordinal > MAX_UNICODE) {
3476
0
        PyErr_SetString(PyExc_ValueError,
3477
0
                        "chr() arg not in range(0x110000)");
3478
0
        return NULL;
3479
0
    }
3480
3481
426k
    return unicode_char((Py_UCS4)ordinal);
3482
426k
}
3483
3484
/* Return a str for `obj` without any implicit conversion.

   An exact str is returned as a new reference to the same object; an
   instance of a str subtype is copied with _PyUnicode_Copy().  Any other
   object raises TypeError and yields NULL. */
PyObject *
PyUnicode_FromObject(PyObject *obj)
{
    /* XXX Perhaps we should make this API an alias of
       PyObject_Str() instead ?! */
    if (PyUnicode_CheckExact(obj)) {
        return Py_NewRef(obj);
    }

    if (!PyUnicode_Check(obj)) {
        PyErr_Format(PyExc_TypeError,
                     "Can't convert '%.100s' object to str implicitly",
                     Py_TYPE(obj)->tp_name);
        return NULL;
    }

    /* A str subtype instance: hand back a true str object holding the
       same data. */
    return _PyUnicode_Copy(obj);
}
3502
3503
/* Decode a bytes or bytes-like object `obj` into a str.

   encoding/errors are forwarded to PyUnicode_Decode().  Empty input skips
   decoding and returns the empty string, but the encoding and errors
   arguments are still validated with unicode_check_encoding_errors().
   A str argument, or an object that does not provide a simple buffer,
   raises TypeError.  Returns NULL with an exception set on failure. */
PyObject *
PyUnicode_FromEncodedObject(PyObject *obj,
                            const char *encoding,
                            const char *errors)
{
    if (obj == NULL) {
        PyErr_BadInternalCall();
        return NULL;
    }

    /* Decoding bytes objects is the most common case and should be fast */
    if (PyBytes_Check(obj)) {
        if (PyBytes_GET_SIZE(obj) != 0) {
            return PyUnicode_Decode(
                    PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
                    encoding, errors);
        }
        if (unicode_check_encoding_errors(encoding, errors) < 0) {
            return NULL;
        }
        _Py_RETURN_UNICODE_EMPTY();
    }

    if (PyUnicode_Check(obj)) {
        PyErr_SetString(PyExc_TypeError,
                        "decoding str is not supported");
        return NULL;
    }

    /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
    Py_buffer view;
    if (PyObject_GetBuffer(obj, &view, PyBUF_SIMPLE) < 0) {
        PyErr_Format(PyExc_TypeError,
                     "decoding to str: need a bytes-like object, %.80s found",
                     Py_TYPE(obj)->tp_name);
        return NULL;
    }

    if (view.len == 0) {
        /* The view is released before validating the arguments, so no
           exit below has to release it again. */
        PyBuffer_Release(&view);
        if (unicode_check_encoding_errors(encoding, errors) < 0) {
            return NULL;
        }
        _Py_RETURN_UNICODE_EMPTY();
    }

    PyObject *decoded = PyUnicode_Decode((char*) view.buf, view.len,
                                         encoding, errors);
    PyBuffer_Release(&view);
    return decoded;
}
3555
3556
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied to `lower` in lowercase.  Any
   run of other characters is collapsed into a single '_', which is only
   emitted between two copied characters (never at the start or end).

   Return 1 on success, or 0 on error (the normalized name does not fit in
   lower_len-1 characters plus the terminating NUL). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    assert(encoding != NULL);

    char *out = lower;
    /* Last slot of the buffer; it is reserved for the terminating NUL. */
    char *out_last = &lower[lower_len - 1];
    int pending_sep = 0;

    for (const char *in = encoding; *in != 0; in++) {
        char c = *in;

        if (!(Py_ISALNUM(c) || c == '.')) {
            /* Remember the separator; it is written lazily, and only if
               another copied character follows. */
            pending_sep = 1;
            continue;
        }

        if (pending_sep && out != lower) {
            if (out == out_last) {
                return 0;
            }
            *out++ = '_';
        }
        pending_sep = 0;

        if (out == out_last) {
            return 0;
        }
        *out++ = Py_TOLOWER(c);
    }

    *out = '\0';
    return 1;
}
3604
3605
PyObject *
3606
PyUnicode_Decode(const char *s,
3607
                 Py_ssize_t size,
3608
                 const char *encoding,
3609
                 const char *errors)
3610
94.6k
{
3611
94.6k
    PyObject *buffer = NULL, *unicode;
3612
94.6k
    Py_buffer info;
3613
94.6k
    char buflower[11];   /* strlen("iso-8859-1\0") == 11, longest shortcut */
3614
3615
94.6k
    if (unicode_check_encoding_errors(encoding, errors) < 0) {
3616
0
        return NULL;
3617
0
    }
3618
3619
94.6k
    if (size == 0) {
3620
1
        _Py_RETURN_UNICODE_EMPTY();
3621
1
    }
3622
3623
94.6k
    if (encoding == NULL) {
3624
0
        return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3625
0
    }
3626
3627
    /* Shortcuts for common default encodings */
3628
94.6k
    if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3629
94.0k
        char *lower = buflower;
3630
3631
        /* Fast paths */
3632
94.0k
        if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3633
8.75k
            lower += 3;
3634
8.75k
            if (*lower == '_') {
3635
                /* Match "utf8" and "utf_8" */
3636
8.71k
                lower++;
3637
8.71k
            }
3638
3639
8.75k
            if (lower[0] == '8' && lower[1] == 0) {
3640
7.20k
                return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3641
7.20k
            }
3642
1.55k
            else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3643
75
                return PyUnicode_DecodeUTF16(s, size, errors, 0);
3644
75
            }
3645
1.47k
            else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3646
26
                return PyUnicode_DecodeUTF32(s, size, errors, 0);
3647
26
            }
3648
8.75k
        }
3649
85.2k
        else {
3650
85.2k
            if (strcmp(lower, "ascii") == 0
3651
58.0k
                || strcmp(lower, "us_ascii") == 0) {
3652
58.0k
                return PyUnicode_DecodeASCII(s, size, errors);
3653
58.0k
            }
3654
    #ifdef MS_WINDOWS
3655
            else if (strcmp(lower, "mbcs") == 0) {
3656
                return PyUnicode_DecodeMBCS(s, size, errors);
3657
            }
3658
    #endif
3659
27.2k
            else if (strcmp(lower, "latin1") == 0
3660
22.6k
                     || strcmp(lower, "latin_1") == 0
3661
22.6k
                     || strcmp(lower, "iso_8859_1") == 0
3662
22.6k
                     || strcmp(lower, "iso8859_1") == 0) {
3663
4.60k
                return PyUnicode_DecodeLatin1(s, size, errors);
3664
4.60k
            }
3665
85.2k
        }
3666
94.0k
    }
3667
3668
    /* Decode via the codec registry */
3669
24.7k
    buffer = NULL;
3670
24.7k
    if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
3671
0
        goto onError;
3672
24.7k
    buffer = PyMemoryView_FromBuffer(&info);
3673
24.7k
    if (buffer == NULL)
3674
0
        goto onError;
3675
24.7k
    unicode = _PyCodec_DecodeText(buffer, encoding, errors);
3676
24.7k
    if (unicode == NULL)
3677
2.13k
        goto onError;
3678
22.6k
    if (!PyUnicode_Check(unicode)) {
3679
0
        PyErr_Format(PyExc_TypeError,
3680
0
                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
3681
0
                     "use codecs.decode() to decode to arbitrary types",
3682
0
                     encoding,
3683
0
                     Py_TYPE(unicode)->tp_name);
3684
0
        Py_DECREF(unicode);
3685
0
        goto onError;
3686
0
    }
3687
22.6k
    Py_DECREF(buffer);
3688
22.6k
    return unicode_result(unicode);
3689
3690
2.13k
  onError:
3691
2.13k
    Py_XDECREF(buffer);
3692
2.13k
    return NULL;
3693
22.6k
}
3694
3695
/* Decode the str `unicode` through the codec registry and return whatever
   object the codec produces.

   A NULL `encoding` selects PyUnicode_GetDefaultEncoding().  A non-str
   argument is reported with PyErr_BadArgument() and NULL is returned. */
PyAPI_FUNC(PyObject *)
PyUnicode_AsDecodedObject(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    const char *codec = encoding;
    if (codec == NULL) {
        codec = PyUnicode_GetDefaultEncoding();
    }

    /* Decode via the codec registry */
    return PyCodec_Decode(unicode, codec, errors);
}
3711
3712
/* Decode the str `unicode` through the codec registry and require the
   codec to return a str.

   unicode:  object to decode; must be a str (or subclass) instance,
             otherwise PyErr_BadArgument() is raised.
   encoding: codec name; NULL selects PyUnicode_GetDefaultEncoding().
   errors:   error handler name, passed through to PyCodec_Decode().

   Returns the decoded str passed through unicode_result(), or NULL with an
   exception set.  If the codec returns something other than a str,
   TypeError is raised naming the type the decoder actually returned. */
PyAPI_FUNC(PyObject *)
PyUnicode_AsDecodedUnicode(PyObject *unicode,
                           const char *encoding,
                           const char *errors)
{
    PyObject *v;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (encoding == NULL)
        encoding = PyUnicode_GetDefaultEncoding();

    /* Decode via the codec registry */
    v = PyCodec_Decode(unicode, encoding, errors);
    if (v == NULL)
        return NULL;

    if (!PyUnicode_Check(v)) {
        /* Report the type of the decoder's result (v), not of the input:
           the input already passed PyUnicode_Check() above, so naming its
           type would say nothing about what the decoder returned. */
        PyErr_Format(PyExc_TypeError,
                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
                     "use codecs.decode() to decode to arbitrary types",
                     encoding,
                     Py_TYPE(v)->tp_name);
        Py_DECREF(v);
        return NULL;
    }
    return unicode_result(v);
}
3745
3746
/* Encode the str `unicode` through the codec registry and return whatever
   object the codec produces.

   A NULL `encoding` selects PyUnicode_GetDefaultEncoding().  Returns NULL
   with an exception set if `unicode` is not a str or the codec fails. */
PyAPI_FUNC(PyObject *)
PyUnicode_AsEncodedObject(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    const char *codec = encoding;
    if (codec == NULL) {
        codec = PyUnicode_GetDefaultEncoding();
    }

    /* Encode via the codec registry; a NULL result is passed straight
       back to the caller. */
    return PyCodec_Encode(unicode, codec, errors);
}
3770
3771
3772
static PyObject *
3773
unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
3774
                      int current_locale)
3775
528
{
3776
528
    Py_ssize_t wlen;
3777
528
    wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3778
528
    if (wstr == NULL) {
3779
0
        return NULL;
3780
0
    }
3781
3782
528
    if ((size_t)wlen != wcslen(wstr)) {
3783
0
        PyErr_SetString(PyExc_ValueError, "embedded null character");
3784
0
        PyMem_Free(wstr);
3785
0
        return NULL;
3786
0
    }
3787
3788
528
    char *str;
3789
528
    size_t error_pos;
3790
528
    const char *reason;
3791
528
    int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
3792
528
                                 current_locale, error_handler);
3793
528
    PyMem_Free(wstr);
3794
3795
528
    if (res != 0) {
3796
0
        if (res == -2) {
3797
0
            PyObject *exc;
3798
0
            exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3799
0
                    "locale", unicode,
3800
0
                    (Py_ssize_t)error_pos,
3801
0
                    (Py_ssize_t)(error_pos+1),
3802
0
                    reason);
3803
0
            if (exc != NULL) {
3804
0
                PyCodec_StrictErrors(exc);
3805
0
                Py_DECREF(exc);
3806
0
            }
3807
0
        }
3808
0
        else if (res == -3) {
3809
0
            PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3810
0
        }
3811
0
        else {
3812
0
            PyErr_NoMemory();
3813
0
        }
3814
0
        return NULL;
3815
0
    }
3816
3817
528
    PyObject *bytes = PyBytes_FromString(str);
3818
528
    PyMem_RawFree(str);
3819
528
    return bytes;
3820
528
}
3821
3822
/* Encode `unicode` with unicode_encode_locale(), passing 1 as its
   current_locale argument.  `errors` is translated to an error handler
   with _Py_GetErrorHandler(). */
PyObject *
PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
{
    _Py_error_handler handler = _Py_GetErrorHandler(errors);

    return unicode_encode_locale(unicode, handler, 1);
}
3828
3829
/* Encode `unicode` with the interpreter's filesystem codec.

   The choice is driven by interp->unicode.fs_codec: UTF-8 when its utf8
   flag is set; otherwise (unless _Py_FORCE_UTF8_FS_ENCODING is defined)
   the named encoding when one is set; otherwise a fallback that takes the
   error handler from config->filesystem_errors. */
PyObject *
PyUnicode_EncodeFSDefault(PyObject *unicode)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    struct _Py_unicode_fs_codec *codec = &interp->unicode.fs_codec;

    if (codec->utf8) {
        return unicode_encode_utf8(unicode,
                                   codec->error_handler,
                                   codec->errors);
    }
#ifndef _Py_FORCE_UTF8_FS_ENCODING
    if (codec->encoding) {
        return PyUnicode_AsEncodedString(unicode,
                                         codec->encoding,
                                         codec->errors);
    }
#endif

    /* Before _PyUnicode_InitEncodings() is called, the Python codec
       machinery is not ready and so cannot be used:
       use wcstombs() in this case. */
    const PyConfig *config = _PyInterpreterState_GetConfig(interp);
    const wchar_t *fs_errors = config->filesystem_errors;
    assert(fs_errors != NULL);
    _Py_error_handler handler = get_error_handler_wide(fs_errors);
    assert(handler != _Py_ERROR_UNKNOWN);
#ifdef _Py_FORCE_UTF8_FS_ENCODING
    return unicode_encode_utf8(unicode, handler, NULL);
#else
    return unicode_encode_locale(unicode, handler, 0);
#endif
}
3862
3863
/* Encode the str `unicode` to a bytes object using `encoding`/`errors`.

   A NULL encoding means UTF-8.  A few common encoding names (utf-8/16/32,
   ascii, latin-1 and, on Windows, mbcs) are dispatched directly to their
   encoders; everything else goes through the codec registry.  A codec
   returning a bytearray triggers a RuntimeWarning and the data is copied
   into a bytes object; any other non-bytes result raises TypeError.
   Returns NULL with an exception set on failure. */
PyObject *
PyUnicode_AsEncodedString(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    /* Room for "iso_8859_1", the longest shortcut name, plus its NUL. */
    char buflower[11];

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (unicode_check_encoding_errors(encoding, errors) < 0) {
        return NULL;
    }

    if (encoding == NULL) {
        return _PyUnicode_AsUTF8String(unicode, errors);
    }

    /* Shortcuts for common default encodings */
    if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
        const char *name = buflower;

        if (name[0] == 'u' && name[1] == 't' && name[2] == 'f') {
            const char *width = name + 3;
            if (*width == '_') {
                /* Match "utf8" and "utf_8" */
                width++;
            }

            if (strcmp(width, "8") == 0) {
                return _PyUnicode_AsUTF8String(unicode, errors);
            }
            if (strcmp(width, "16") == 0) {
                return _PyUnicode_EncodeUTF16(unicode, errors, 0);
            }
            if (strcmp(width, "32") == 0) {
                return _PyUnicode_EncodeUTF32(unicode, errors, 0);
            }
        }
        else {
            if (strcmp(name, "ascii") == 0
                || strcmp(name, "us_ascii") == 0) {
                return _PyUnicode_AsASCIIString(unicode, errors);
            }
#ifdef MS_WINDOWS
            if (strcmp(name, "mbcs") == 0) {
                return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
            }
#endif
            if (strcmp(name, "latin1") == 0 ||
                strcmp(name, "latin_1") == 0 ||
                strcmp(name, "iso_8859_1") == 0 ||
                strcmp(name, "iso8859_1") == 0) {
                return _PyUnicode_AsLatin1String(unicode, errors);
            }
        }
    }

    /* Encode via the codec registry */
    PyObject *encoded = _PyCodec_EncodeText(unicode, encoding, errors);
    if (encoded == NULL) {
        return NULL;
    }

    /* The normal path */
    if (PyBytes_Check(encoded)) {
        return encoded;
    }

    if (!PyByteArray_Check(encoded)) {
        PyErr_Format(PyExc_TypeError,
                     "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
                     "use codecs.encode() to encode to arbitrary types",
                     encoding,
                     Py_TYPE(encoded)->tp_name);
        Py_DECREF(encoded);
        return NULL;
    }

    /* The codec returned a bytearray: raise a warning and convert to bytes */
    int warn_failed = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
        "encoder %s returned bytearray instead of bytes; "
        "use codecs.encode() to encode to arbitrary types",
        encoding);
    if (warn_failed) {
        Py_DECREF(encoded);
        return NULL;
    }

    PyObject *bytes = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(encoded),
                                                PyByteArray_GET_SIZE(encoded));
    Py_DECREF(encoded);
    return bytes;
}
3962
3963
/* Encode the str `unicode` through the codec registry and require the
   codec to return a str.

   A NULL `encoding` selects PyUnicode_GetDefaultEncoding().  A non-str
   codec result raises TypeError naming the returned type.  Returns NULL
   with an exception set on failure. */
PyAPI_FUNC(PyObject *)
PyUnicode_AsEncodedUnicode(PyObject *unicode,
                           const char *encoding,
                           const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    const char *codec = encoding;
    if (codec == NULL) {
        codec = PyUnicode_GetDefaultEncoding();
    }

    /* Encode via the codec registry */
    PyObject *encoded = PyCodec_Encode(unicode, codec, errors);
    if (encoded == NULL) {
        return NULL;
    }

    if (PyUnicode_Check(encoded)) {
        return encoded;
    }

    PyErr_Format(PyExc_TypeError,
                 "'%.400s' encoder returned '%.400s' instead of 'str'; "
                 "use codecs.encode() to encode to arbitrary types",
                 codec,
                 Py_TYPE(encoded)->tp_name);
    Py_DECREF(encoded);
    return NULL;
}
3996
3997
static PyObject*
3998
unicode_decode_locale(const char *str, Py_ssize_t len,
3999
                      _Py_error_handler errors, int current_locale)
4000
15.2k
{
4001
15.2k
    if (str[len] != '\0' || (size_t)len != strlen(str))  {
4002
0
        PyErr_SetString(PyExc_ValueError, "embedded null byte");
4003
0
        return NULL;
4004
0
    }
4005
4006
15.2k
    wchar_t *wstr;
4007
15.2k
    size_t wlen;
4008
15.2k
    const char *reason;
4009
15.2k
    int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
4010
15.2k
                                 current_locale, errors);
4011
15.2k
    if (res != 0) {
4012
0
        if (res == -2) {
4013
0
            PyObject *exc;
4014
0
            exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
4015
0
                                        "locale", str, len,
4016
0
                                        (Py_ssize_t)wlen,
4017
0
                                        (Py_ssize_t)(wlen + 1),
4018
0
                                        reason);
4019
0
            if (exc != NULL) {
4020
0
                PyCodec_StrictErrors(exc);
4021
0
                Py_DECREF(exc);
4022
0
            }
4023
0
        }
4024
0
        else if (res == -3) {
4025
0
            PyErr_SetString(PyExc_ValueError, "unsupported error handler");
4026
0
        }
4027
0
        else {
4028
0
            PyErr_NoMemory();
4029
0
        }
4030
0
        return NULL;
4031
0
    }
4032
4033
15.2k
    PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
4034
15.2k
    PyMem_RawFree(wstr);
4035
15.2k
    return unicode;
4036
15.2k
}
4037
4038
PyObject*
4039
PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
4040
                              const char *errors)
4041
0
{
4042
0
    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
4043
0
    return unicode_decode_locale(str, len, error_handler, 1);
4044
0
}
4045
4046
PyObject*
4047
PyUnicode_DecodeLocale(const char *str, const char *errors)
4048
8.17k
{
4049
8.17k
    Py_ssize_t size = (Py_ssize_t)strlen(str);
4050
8.17k
    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
4051
8.17k
    return unicode_decode_locale(str, size, error_handler, 1);
4052
8.17k
}
4053
4054
4055
PyObject*
4056
27.0k
PyUnicode_DecodeFSDefault(const char *s) {
4057
27.0k
    Py_ssize_t size = (Py_ssize_t)strlen(s);
4058
27.0k
    return PyUnicode_DecodeFSDefaultAndSize(s, size);
4059
27.0k
}
4060
4061
PyObject*
4062
PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
4063
35.9k
{
4064
35.9k
    PyInterpreterState *interp = _PyInterpreterState_GET();
4065
35.9k
    struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
4066
35.9k
    if (fs_codec->utf8) {
4067
28.8k
        return unicode_decode_utf8(s, size,
4068
28.8k
                                   fs_codec->error_handler,
4069
28.8k
                                   fs_codec->errors,
4070
28.8k
                                   NULL);
4071
28.8k
    }
4072
7.06k
#ifndef _Py_FORCE_UTF8_FS_ENCODING
4073
7.06k
    else if (fs_codec->encoding) {
4074
0
        return PyUnicode_Decode(s, size,
4075
0
                                fs_codec->encoding,
4076
0
                                fs_codec->errors);
4077
0
    }
4078
7.06k
#endif
4079
7.06k
    else {
4080
        /* Before _PyUnicode_InitEncodings() is called, the Python codec
4081
           machinery is not ready and so cannot be used:
4082
           use mbstowcs() in this case. */
4083
7.06k
        const PyConfig *config = _PyInterpreterState_GetConfig(interp);
4084
7.06k
        const wchar_t *filesystem_errors = config->filesystem_errors;
4085
7.06k
        assert(filesystem_errors != NULL);
4086
7.06k
        _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
4087
7.06k
        assert(errors != _Py_ERROR_UNKNOWN);
4088
#ifdef _Py_FORCE_UTF8_FS_ENCODING
4089
        return unicode_decode_utf8(s, size, errors, NULL, NULL);
4090
#else
4091
7.06k
        return unicode_decode_locale(s, size, errors, 0);
4092
7.06k
#endif
4093
7.06k
    }
4094
35.9k
}
4095
4096
4097
int
4098
PyUnicode_FSConverter(PyObject* arg, void* addr)
4099
6.30k
{
4100
6.30k
    PyObject *path = NULL;
4101
6.30k
    PyObject *output = NULL;
4102
6.30k
    Py_ssize_t size;
4103
6.30k
    const char *data;
4104
6.30k
    if (arg == NULL) {
4105
0
        Py_DECREF(*(PyObject**)addr);
4106
0
        *(PyObject**)addr = NULL;
4107
0
        return 1;
4108
0
    }
4109
6.30k
    path = PyOS_FSPath(arg);
4110
6.30k
    if (path == NULL) {
4111
0
        return 0;
4112
0
    }
4113
6.30k
    if (PyBytes_Check(path)) {
4114
0
        output = path;
4115
0
    }
4116
6.30k
    else {  // PyOS_FSPath() guarantees its returned value is bytes or str.
4117
6.30k
        output = PyUnicode_EncodeFSDefault(path);
4118
6.30k
        Py_DECREF(path);
4119
6.30k
        if (!output) {
4120
0
            return 0;
4121
0
        }
4122
6.30k
        assert(PyBytes_Check(output));
4123
6.30k
    }
4124
4125
6.30k
    size = PyBytes_GET_SIZE(output);
4126
6.30k
    data = PyBytes_AS_STRING(output);
4127
6.30k
    if ((size_t)size != strlen(data)) {
4128
0
        PyErr_SetString(PyExc_ValueError, "embedded null byte");
4129
0
        Py_DECREF(output);
4130
0
        return 0;
4131
0
    }
4132
6.30k
    *(PyObject**)addr = output;
4133
6.30k
    return Py_CLEANUP_SUPPORTED;
4134
6.30k
}
4135
4136
4137
int
4138
PyUnicode_FSDecoder(PyObject* arg, void* addr)
4139
20
{
4140
20
    if (arg == NULL) {
4141
0
        Py_DECREF(*(PyObject**)addr);
4142
0
        *(PyObject**)addr = NULL;
4143
0
        return 1;
4144
0
    }
4145
4146
20
    PyObject *path = PyOS_FSPath(arg);
4147
20
    if (path == NULL) {
4148
0
        return 0;
4149
0
    }
4150
4151
20
    PyObject *output = NULL;
4152
20
    if (PyUnicode_Check(path)) {
4153
20
        output = path;
4154
20
    }
4155
0
    else if (PyBytes_Check(path)) {
4156
0
        output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path),
4157
0
                                                  PyBytes_GET_SIZE(path));
4158
0
        Py_DECREF(path);
4159
0
        if (!output) {
4160
0
            return 0;
4161
0
        }
4162
0
    }
4163
0
    else {
4164
0
        PyErr_Format(PyExc_TypeError,
4165
0
                     "path should be string, bytes, or os.PathLike, not %.200s",
4166
0
                     Py_TYPE(arg)->tp_name);
4167
0
        Py_DECREF(path);
4168
0
        return 0;
4169
0
    }
4170
4171
20
    if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
4172
20
                 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
4173
0
        PyErr_SetString(PyExc_ValueError, "embedded null character");
4174
0
        Py_DECREF(output);
4175
0
        return 0;
4176
0
    }
4177
20
    *(PyObject**)addr = output;
4178
20
    return Py_CLEANUP_SUPPORTED;
4179
20
}
4180
4181
4182
static int unicode_fill_utf8(PyObject *unicode);
4183
4184
4185
static int
4186
unicode_ensure_utf8(PyObject *unicode)
4187
591k
{
4188
591k
    int err = 0;
4189
591k
    if (PyUnicode_UTF8(unicode) == NULL) {
4190
7.37k
        Py_BEGIN_CRITICAL_SECTION(unicode);
4191
7.37k
        if (PyUnicode_UTF8(unicode) == NULL) {
4192
7.37k
            err = unicode_fill_utf8(unicode);
4193
7.37k
        }
4194
7.37k
        Py_END_CRITICAL_SECTION();
4195
7.37k
    }
4196
591k
    return err;
4197
591k
}
4198
4199
const char *
4200
PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
4201
591k
{
4202
591k
    if (!PyUnicode_Check(unicode)) {
4203
0
        PyErr_BadArgument();
4204
0
        if (psize) {
4205
0
            *psize = -1;
4206
0
        }
4207
0
        return NULL;
4208
0
    }
4209
4210
591k
    if (unicode_ensure_utf8(unicode) == -1) {
4211
5
        if (psize) {
4212
5
            *psize = -1;
4213
5
        }
4214
5
        return NULL;
4215
5
    }
4216
4217
591k
    if (psize) {
4218
343k
        *psize = PyUnicode_UTF8_LENGTH(unicode);
4219
343k
    }
4220
591k
    return PyUnicode_UTF8(unicode);
4221
591k
}
4222
4223
const char *
4224
PyUnicode_AsUTF8(PyObject *unicode)
4225
248k
{
4226
248k
    return PyUnicode_AsUTF8AndSize(unicode, NULL);
4227
248k
}
4228
4229
const char *
4230
_PyUnicode_AsUTF8NoNUL(PyObject *unicode)
4231
92.1k
{
4232
92.1k
    Py_ssize_t size;
4233
92.1k
    const char *s = PyUnicode_AsUTF8AndSize(unicode, &size);
4234
92.1k
    if (s && strlen(s) != (size_t)size) {
4235
0
        PyErr_SetString(PyExc_ValueError, "embedded null character");
4236
0
        return NULL;
4237
0
    }
4238
92.1k
    return s;
4239
92.1k
}
4240
4241
/*
4242
PyUnicode_GetSize() has been deprecated since Python 3.3
4243
because it returned length of Py_UNICODE.
4244
4245
But this function is part of stable abi, because it doesn't
4246
include Py_UNICODE in signature and it was not excluded from
4247
stable ABI in PEP 384.
4248
*/
4249
PyAPI_FUNC(Py_ssize_t)
4250
PyUnicode_GetSize(PyObject *unicode)
4251
0
{
4252
0
    PyErr_SetString(PyExc_RuntimeError,
4253
0
                    "PyUnicode_GetSize has been removed.");
4254
0
    return -1;
4255
0
}
4256
4257
Py_ssize_t
4258
PyUnicode_GetLength(PyObject *unicode)
4259
37.4k
{
4260
37.4k
    if (!PyUnicode_Check(unicode)) {
4261
0
        PyErr_BadArgument();
4262
0
        return -1;
4263
0
    }
4264
37.4k
    return PyUnicode_GET_LENGTH(unicode);
4265
37.4k
}
4266
4267
Py_UCS4
4268
PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4269
16
{
4270
16
    const void *data;
4271
16
    int kind;
4272
4273
16
    if (!PyUnicode_Check(unicode)) {
4274
0
        PyErr_BadArgument();
4275
0
        return (Py_UCS4)-1;
4276
0
    }
4277
16
    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4278
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
4279
0
        return (Py_UCS4)-1;
4280
0
    }
4281
16
    data = PyUnicode_DATA(unicode);
4282
16
    kind = PyUnicode_KIND(unicode);
4283
16
    return PyUnicode_READ(kind, data, index);
4284
16
}
4285
4286
int
4287
PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4288
0
{
4289
0
    if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
4290
0
        PyErr_BadArgument();
4291
0
        return -1;
4292
0
    }
4293
0
    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4294
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
4295
0
        return -1;
4296
0
    }
4297
0
    if (unicode_check_modifiable(unicode))
4298
0
        return -1;
4299
0
    if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4300
0
        PyErr_SetString(PyExc_ValueError, "character out of range");
4301
0
        return -1;
4302
0
    }
4303
0
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4304
0
                    index, ch);
4305
0
    return 0;
4306
0
}
4307
4308
/* Return the name of the default encoding: the constant string "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4313
4314
/* create or adjust a UnicodeDecodeError */
4315
static void
4316
make_decode_exception(PyObject **exceptionObject,
4317
                      const char *encoding,
4318
                      const char *input, Py_ssize_t length,
4319
                      Py_ssize_t startpos, Py_ssize_t endpos,
4320
                      const char *reason)
4321
202k
{
4322
202k
    if (*exceptionObject == NULL) {
4323
7.36k
        *exceptionObject = PyUnicodeDecodeError_Create(
4324
7.36k
            encoding, input, length, startpos, endpos, reason);
4325
7.36k
    }
4326
195k
    else {
4327
195k
        if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4328
0
            goto onError;
4329
195k
        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4330
0
            goto onError;
4331
195k
        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4332
0
            goto onError;
4333
195k
    }
4334
202k
    return;
4335
4336
202k
onError:
4337
0
    Py_CLEAR(*exceptionObject);
4338
0
}
4339
4340
#ifdef MS_WINDOWS
4341
static int
4342
widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4343
{
4344
    if (newsize > *size) {
4345
        wchar_t *newbuf = *buf;
4346
        if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4347
            PyErr_NoMemory();
4348
            return -1;
4349
        }
4350
        *buf = newbuf;
4351
    }
4352
    *size = newsize;
4353
    return 0;
4354
}
4355
4356
/* error handling callback helper:
4357
   build arguments, call the callback and check the arguments,
4358
   if no exception occurred, copy the replacement to the output
4359
   and adjust various state variables.
4360
   return 0 on success, -1 on error
4361
*/
4362
4363
static int
4364
unicode_decode_call_errorhandler_wchar(
4365
    const char *errors, PyObject **errorHandler,
4366
    const char *encoding, const char *reason,
4367
    const char **input, const char **inend, Py_ssize_t *startinpos,
4368
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4369
    wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
4370
{
4371
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4372
4373
    PyObject *restuple = NULL;
4374
    PyObject *repunicode = NULL;
4375
    Py_ssize_t outsize;
4376
    Py_ssize_t insize;
4377
    Py_ssize_t requiredsize;
4378
    Py_ssize_t newpos;
4379
    PyObject *inputobj = NULL;
4380
    Py_ssize_t repwlen;
4381
4382
    if (*errorHandler == NULL) {
4383
        *errorHandler = PyCodec_LookupError(errors);
4384
        if (*errorHandler == NULL)
4385
            goto onError;
4386
    }
4387
4388
    make_decode_exception(exceptionObject,
4389
        encoding,
4390
        *input, *inend - *input,
4391
        *startinpos, *endinpos,
4392
        reason);
4393
    if (*exceptionObject == NULL)
4394
        goto onError;
4395
4396
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
4397
    if (restuple == NULL)
4398
        goto onError;
4399
    if (!PyTuple_Check(restuple)) {
4400
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
4401
        goto onError;
4402
    }
4403
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4404
        goto onError;
4405
4406
    /* Copy back the bytes variables, which might have been modified by the
4407
       callback */
4408
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4409
    if (!inputobj)
4410
        goto onError;
4411
    *input = PyBytes_AS_STRING(inputobj);
4412
    insize = PyBytes_GET_SIZE(inputobj);
4413
    *inend = *input + insize;
4414
    /* we can DECREF safely, as the exception has another reference,
4415
       so the object won't go away. */
4416
    Py_DECREF(inputobj);
4417
4418
    if (newpos<0)
4419
        newpos = insize+newpos;
4420
    if (newpos<0 || newpos>insize) {
4421
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4422
        goto onError;
4423
    }
4424
4425
    repwlen = PyUnicode_AsWideChar(repunicode, NULL, 0);
4426
    if (repwlen < 0)
4427
        goto onError;
4428
    repwlen--;
4429
    /* need more space? (at least enough for what we
4430
       have+the replacement+the rest of the string (starting
4431
       at the new input position), so we won't have to check space
4432
       when there are no errors in the rest of the string) */
4433
    requiredsize = *outpos;
4434
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4435
        goto overflow;
4436
    requiredsize += repwlen;
4437
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4438
        goto overflow;
4439
    requiredsize += insize - newpos;
4440
    outsize = *bufsize;
4441
    if (requiredsize > outsize) {
4442
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
4443
            requiredsize = 2*outsize;
4444
        if (widechar_resize(buf, bufsize, requiredsize) < 0) {
4445
            goto onError;
4446
        }
4447
    }
4448
    PyUnicode_AsWideChar(repunicode, *buf + *outpos, repwlen);
4449
    *outpos += repwlen;
4450
    *endinpos = newpos;
4451
    *inptr = *input + newpos;
4452
4453
    /* we made it! */
4454
    Py_DECREF(restuple);
4455
    return 0;
4456
4457
  overflow:
4458
    PyErr_SetString(PyExc_OverflowError,
4459
                    "decoded result is too long for a Python string");
4460
4461
  onError:
4462
    Py_XDECREF(restuple);
4463
    return -1;
4464
}
4465
#endif   /* MS_WINDOWS */
4466
4467
static int
4468
unicode_decode_call_errorhandler_writer(
4469
    const char *errors, PyObject **errorHandler,
4470
    const char *encoding, const char *reason,
4471
    const char **input, const char **inend, Py_ssize_t *startinpos,
4472
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4473
    _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4474
202k
{
4475
202k
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4476
4477
202k
    PyObject *restuple = NULL;
4478
202k
    PyObject *repunicode = NULL;
4479
202k
    Py_ssize_t insize;
4480
202k
    Py_ssize_t newpos;
4481
202k
    Py_ssize_t replen;
4482
202k
    Py_ssize_t remain;
4483
202k
    PyObject *inputobj = NULL;
4484
202k
    int need_to_grow = 0;
4485
202k
    const char *new_inptr;
4486
4487
202k
    if (*errorHandler == NULL) {
4488
7.36k
        *errorHandler = PyCodec_LookupError(errors);
4489
7.36k
        if (*errorHandler == NULL)
4490
0
            goto onError;
4491
7.36k
    }
4492
4493
202k
    make_decode_exception(exceptionObject,
4494
202k
        encoding,
4495
202k
        *input, *inend - *input,
4496
202k
        *startinpos, *endinpos,
4497
202k
        reason);
4498
202k
    if (*exceptionObject == NULL)
4499
0
        goto onError;
4500
4501
202k
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
4502
202k
    if (restuple == NULL)
4503
6.60k
        goto onError;
4504
195k
    if (!PyTuple_Check(restuple)) {
4505
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
4506
0
        goto onError;
4507
0
    }
4508
195k
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4509
0
        goto onError;
4510
4511
    /* Copy back the bytes variables, which might have been modified by the
4512
       callback */
4513
195k
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4514
195k
    if (!inputobj)
4515
0
        goto onError;
4516
195k
    remain = *inend - *input - *endinpos;
4517
195k
    *input = PyBytes_AS_STRING(inputobj);
4518
195k
    insize = PyBytes_GET_SIZE(inputobj);
4519
195k
    *inend = *input + insize;
4520
    /* we can DECREF safely, as the exception has another reference,
4521
       so the object won't go away. */
4522
195k
    Py_DECREF(inputobj);
4523
4524
195k
    if (newpos<0)
4525
0
        newpos = insize+newpos;
4526
195k
    if (newpos<0 || newpos>insize) {
4527
0
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4528
0
        goto onError;
4529
0
    }
4530
4531
195k
    replen = PyUnicode_GET_LENGTH(repunicode);
4532
195k
    if (replen > 1) {
4533
0
        writer->min_length += replen - 1;
4534
0
        need_to_grow = 1;
4535
0
    }
4536
195k
    new_inptr = *input + newpos;
4537
195k
    if (*inend - new_inptr > remain) {
4538
        /* We don't know the decoding algorithm here so we make the worst
4539
           assumption that one byte decodes to one unicode character.
4540
           If unfortunately one byte could decode to more unicode characters,
4541
           the decoder may write out-of-bound then.  Is it possible for the
4542
           algorithms using this function? */
4543
58
        writer->min_length += *inend - new_inptr - remain;
4544
58
        need_to_grow = 1;
4545
58
    }
4546
195k
    if (need_to_grow) {
4547
58
        writer->overallocate = 1;
4548
58
        if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
4549
58
                            PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4550
0
            goto onError;
4551
58
    }
4552
195k
    if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
4553
0
        goto onError;
4554
4555
195k
    *endinpos = newpos;
4556
195k
    *inptr = new_inptr;
4557
4558
    /* we made it! */
4559
195k
    Py_DECREF(restuple);
4560
195k
    return 0;
4561
4562
6.60k
  onError:
4563
6.60k
    Py_XDECREF(restuple);
4564
6.60k
    return -1;
4565
195k
}
4566
4567
/* --- UTF-7 Codec -------------------------------------------------------- */
4568
4569
/* See RFC2152 for details.  We encode conservatively and decode liberally. */
4570
4571
/* Three simple macros defining base-64. */
4572
4573
/* Is c a base-64 character? */
4574
4575
#define IS_BASE64(c)                        \
    ((c) == '+' || (c) == '/' ||            \
     ('0' <= (c) && (c) <= '9') ||          \
     ('A' <= (c) && (c) <= 'Z') ||          \
     ('a' <= (c) && (c) <= 'z'))

/* Map a base-64 character c to its 6-bit value.  The argument must already
   be known to be a base-64 character: anything that is not a letter, a
   digit or '+' is mapped to 63. */

#define FROM_BASE64(c)                                      \
    (('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :               \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :          \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :          \
     (c) == '+' ? 62 : 63)

/* Base-64 character for the low 6 bits of n; higher bits are ignored. */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: true when byte c found in a UTF-7 string decodes as
 * itself.  Decoding is permissive: every ASCII byte qualifies except '+',
 * which starts a base64 section. */

#define DECODE_DIRECT(c)                                \
    ((c) != '+' && (c) <= 127)
4601
4602
/* The UTF-7 encoder treats ASCII characters differently according to
4603
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
4604
 * the above).  See RFC2152.  This array identifies these different
4605
 * sets:
4606
 * 0 : "Set D"
4607
 *     alphanumeric and '(),-./:?
4608
 * 1 : "Set O"
4609
 *     !"#$%&*;<=>@[]^_`{|}
4610
 * 2 : "whitespace"
4611
 *     ht nl cr sp
4612
 * 3 : special (must be base64 encoded)
4613
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
4614
 */
4615
4616
/* Read-only lookup table indexed by ASCII code (0..127); each entry is the
   UTF-7 category (0..3) of that character.  Declared const because it is
   never written. */
static const
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC 2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * As written, a character is encoded directly when it is a non-NUL ASCII
 * character whose category is not 3 ("special").  The range checks come
 * first so the table is never indexed out of bounds. */

#define ENCODE_DIRECT(c) \
    ((c) < 128 && (c) > 0 && ((utf7_category[(c)] != 3)))
4644
4645
PyObject *
4646
PyUnicode_DecodeUTF7(const char *s,
4647
                     Py_ssize_t size,
4648
                     const char *errors)
4649
0
{
4650
0
    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4651
0
}
4652
4653
/* The decoder.  The only state we preserve is our read position,
4654
 * i.e. how many characters we have consumed.  So if we end in the
4655
 * middle of a shift sequence we have to back off the read position
4656
 * and the output to the beginning of the sequence, otherwise we lose
4657
 * all the shift state (seen bits, number of bits seen, high
4658
 * surrogate). */
4659
4660
/* Decode `size` bytes of UTF-7 starting at `s` into a new str object.

   s        - input bytes
   size     - number of input bytes; 0 returns the empty string immediately
   errors   - error handler name, passed through to
              unicode_decode_call_errorhandler_writer() on malformed input
   consumed - when NULL, input that ends in an inconsistent shift sequence
              is reported as an "unterminated shift sequence" error.  When
              non-NULL it receives the number of bytes consumed; if the
              input ends inside a shift sequence it receives the position
              of that sequence's '+' and the output is backed off to what
              had been written before it.

   Returns the decoded string, or NULL if the error handler call or a
   writer operation fails. */
PyObject *
4661
PyUnicode_DecodeUTF7Stateful(const char *s,
4662
                             Py_ssize_t size,
4663
                             const char *errors,
4664
                             Py_ssize_t *consumed)
4665
252
{
4666
252
    const char *starts = s;
4667
252
    Py_ssize_t startinpos;
4668
252
    Py_ssize_t endinpos;
4669
252
    const char *e;
4670
252
    _PyUnicodeWriter writer;
4671
252
    const char *errmsg = "";
4672
252
    int inShift = 0;
4673
252
    Py_ssize_t shiftOutStart;
4674
252
    unsigned int base64bits = 0;
4675
252
    unsigned long base64buffer = 0;
4676
252
    Py_UCS4 surrogate = 0;
4677
252
    PyObject *errorHandler = NULL;
4678
252
    PyObject *exc = NULL;
4679
4680
252
    if (size == 0) {
4681
0
        if (consumed)
4682
0
            *consumed = 0;
4683
0
        _Py_RETURN_UNICODE_EMPTY();
4684
0
    }
4685
4686
    /* Start off assuming it's all ASCII. Widen later as necessary. */
4687
252
    _PyUnicodeWriter_Init(&writer);
4688
252
    writer.min_length = size;
4689
4690
252
    shiftOutStart = 0;
4691
252
    e = s + size;
4692
4693
108k
    while (s < e) {
4694
108k
        Py_UCS4 ch;
4695
108k
      /* Also entered by `goto restart` from the end-of-string handling
         below, after the error handler has moved `s` back before `e`. */
      restart:
4696
108k
        ch = (unsigned char) *s;
4697
4698
108k
        if (inShift) { /* in a base-64 section */
4699
29.8k
            if (IS_BASE64(ch)) { /* consume a base-64 character */
4700
25.5k
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4701
25.5k
                base64bits += 6;
4702
25.5k
                s++;
4703
25.5k
                if (base64bits >= 16) {
4704
                    /* we have enough bits for a UTF-16 value */
4705
9.16k
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
4706
9.16k
                    base64bits -= 16;
4707
9.16k
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4708
9.16k
                    assert(outCh <= 0xffff);
4709
9.16k
                    if (surrogate) {
4710
                        /* expecting a second surrogate */
4711
1.34k
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4712
833
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
4713
833
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
4714
0
                                goto onError;
4715
833
                            surrogate = 0;
4716
833
                            continue;
4717
833
                        }
4718
511
                        else {
                            /* not a low surrogate: emit the pending high
                               surrogate on its own, then handle outCh
                               below like any other value */
4719
511
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4720
0
                                goto onError;
4721
511
                            surrogate = 0;
4722
511
                        }
4723
1.34k
                    }
4724
8.33k
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
4725
                        /* first surrogate */
4726
2.02k
                        surrogate = outCh;
4727
2.02k
                    }
4728
6.30k
                    else {
4729
6.30k
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
4730
0
                            goto onError;
4731
6.30k
                    }
4732
8.33k
                }
4733
25.5k
            }
4734
4.20k
            else { /* now leaving a base-64 section */
4735
4.20k
                inShift = 0;
4736
4.20k
                if (base64bits > 0) { /* left-over bits */
4737
2.69k
                    if (base64bits >= 6) {
4738
                        /* We've seen at least one base-64 character */
4739
37
                        s++;
4740
37
                        errmsg = "partial character in shift sequence";
4741
37
                        goto utf7Error;
4742
37
                    }
4743
2.66k
                    else {
4744
                        /* Some bits remain; they should be zero */
4745
2.66k
                        if (base64buffer != 0) {
4746
12
                            s++;
4747
12
                            errmsg = "non-zero padding bits in shift sequence";
4748
12
                            goto utf7Error;
4749
12
                        }
4750
2.66k
                    }
4751
2.69k
                }
4752
4.15k
                if (surrogate && DECODE_DIRECT(ch)) {
4753
669
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4754
0
                        goto onError;
4755
669
                }
4756
4.15k
                surrogate = 0;
4757
4.15k
                if (ch == '-') {
4758
                    /* '-' is absorbed; other terminating
4759
                       characters are preserved */
4760
106
                    s++;
4761
106
                }
4762
4.15k
            }
4763
29.8k
        }
4764
78.8k
        else if ( ch == '+' ) {
4765
5.94k
            /* remember where the sequence starts: used as the error start
               position and as *consumed if the input ends inside it */
            startinpos = s-starts;
4766
5.94k
            s++; /* consume '+' */
4767
5.94k
            if (s < e && *s == '-') { /* '+-' encodes '+' */
4768
1.71k
                s++;
4769
1.71k
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
4770
0
                    goto onError;
4771
1.71k
            }
4772
4.23k
            else if (s < e && !IS_BASE64(*s)) {
4773
11
                s++;
4774
11
                errmsg = "ill-formed sequence";
4775
11
                goto utf7Error;
4776
11
            }
4777
4.22k
            else { /* begin base64-encoded section */
4778
4.22k
                inShift = 1;
4779
4.22k
                surrogate = 0;
4780
4.22k
                shiftOutStart = writer.pos;
4781
4.22k
                base64bits = 0;
4782
4.22k
                base64buffer = 0;
4783
4.22k
            }
4784
5.94k
        }
4785
72.8k
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
4786
72.0k
            s++;
4787
72.0k
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
4788
0
                goto onError;
4789
72.0k
        }
4790
779
        else {
4791
779
            startinpos = s-starts;
4792
779
            s++;
4793
779
            errmsg = "unexpected special character";
4794
779
            goto utf7Error;
4795
779
        }
4796
106k
        continue;
4797
106k
/* Shared error path, only reached by goto: `s` already points past the
   offending byte and `errmsg` holds the reason.  The handler may update
   starts, e and s. */
utf7Error:
4798
839
        endinpos = s-starts;
4799
839
        if (unicode_decode_call_errorhandler_writer(
4800
839
                errors, &errorHandler,
4801
839
                "utf7", errmsg,
4802
839
                &starts, &e, &startinpos, &endinpos, &exc, &s,
4803
839
                &writer))
4804
65
            goto onError;
4805
839
    }
4806
4807
    /* end of string */
4808
4809
187
    if (inShift && !consumed) { /* in shift sequence, no more to follow */
4810
        /* if we're in an inconsistent state, that's an error */
4811
21
        inShift = 0;
4812
21
        if (surrogate ||
4813
20
                (base64bits >= 6) ||
4814
19
                (base64bits > 0 && base64buffer != 0)) {
4815
3
            endinpos = size;
4816
3
            if (unicode_decode_call_errorhandler_writer(
4817
3
                    errors, &errorHandler,
4818
3
                    "utf7", "unterminated shift sequence",
4819
3
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
4820
3
                    &writer))
4821
3
                goto onError;
4822
0
            /* the handler may have moved s back before e; if so, resume
               decoding inside the loop */
            if (s < e)
4823
0
                goto restart;
4824
0
        }
4825
21
    }
4826
4827
    /* return state */
4828
184
    if (consumed) {
4829
0
        if (inShift) {
4830
0
            *consumed = startinpos;
4831
0
            /* If characters were written since the shift sequence began
               and the writer's maxchar is above 127, return a string built
               from only the first shiftOutStart characters instead of
               finishing the writer. */
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
4832
0
                PyObject *result = PyUnicode_FromKindAndData(
4833
0
                        writer.kind, writer.data, shiftOutStart);
4834
0
                Py_XDECREF(errorHandler);
4835
0
                Py_XDECREF(exc);
4836
0
                _PyUnicodeWriter_Dealloc(&writer);
4837
0
                return result;
4838
0
            }
4839
0
            writer.pos = shiftOutStart; /* back off output */
4840
0
        }
4841
0
        else {
4842
0
            *consumed = s-starts;
4843
0
        }
4844
0
    }
4845
4846
184
    Py_XDECREF(errorHandler);
4847
184
    Py_XDECREF(exc);
4848
184
    return _PyUnicodeWriter_Finish(&writer);
4849
4850
68
  /* Failure exit: release the handler, the exception and the writer. */
  onError:
4851
68
    Py_XDECREF(errorHandler);
4852
68
    Py_XDECREF(exc);
4853
68
    _PyUnicodeWriter_Dealloc(&writer);
4854
68
    return NULL;
4855
184
}
4856
4857
4858
/* Encode the str object `str` as UTF-7 and return the object produced by
   the PyBytesWriter.  `errors` is accepted but never consulted here.

   An empty string yields the empty-bytes constant.  The output buffer is
   sized for 8 bytes per input character; if that size would exceed
   PY_SSIZE_T_MAX the result of PyErr_NoMemory() is returned, and NULL is
   returned when the writer cannot be created. */
PyObject *
_PyUnicode_EncodeUTF7(PyObject *str,
                      const char *errors)
{
    const Py_ssize_t len = PyUnicode_GET_LENGTH(str);
    if (len == 0) {
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
    }

    /* It might be possible to tighten this worst case */
    if (len > PY_SSIZE_T_MAX / 8) {
        return PyErr_NoMemory();
    }
    PyBytesWriter *writer = PyBytesWriter_Create(len * 8);
    if (writer == NULL) {
        return NULL;
    }
    char *out = PyBytesWriter_GetData(writer);

    const int kind = PyUnicode_KIND(str);
    const void *data = PyUnicode_DATA(str);

    int shifted = 0;            /* non-zero while inside a base64 section */
    unsigned int nbits = 0;     /* number of pending bits in `bitbuf` */
    unsigned long bitbuf = 0;   /* bits not yet emitted as base64 */

    for (Py_ssize_t i = 0; i < len; ++i) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);

        if (shifted) {
            if (ENCODE_DIRECT(ch)) {
                /* Shifting out: emit the remaining bits, padded on the
                   right to a full base64 character. */
                if (nbits) {
                    *out++ = TO_BASE64(bitbuf << (6-nbits));
                    bitbuf = 0;
                    nbits = 0;
                }
                shifted = 0;
                /* Characters not in the BASE64 set implicitly unshift the
                   sequence so no '-' is required, except if the character
                   is itself a '-' */
                if (IS_BASE64(ch) || ch == '-') {
                    *out++ = '-';
                }
                *out++ = (char) ch;
                continue;
            }
            /* Otherwise stay in the section; ch is base64-encoded below. */
        }
        else {
            if (ch == '+') {
                /* a literal '+' is written as "+-" */
                *out++ = '+';
                *out++ = '-';
                continue;
            }
            if (ENCODE_DIRECT(ch)) {
                *out++ = (char) ch;
                continue;
            }
            /* Open a base64 section; ch is base64-encoded below. */
            *out++ = '+';
            shifted = 1;
        }

        /* Append ch to the bit buffer as one or two 16-bit units and emit
           every complete group of 6 bits. */
        if (ch >= 0x10000) {
            assert(ch <= MAX_UNICODE);

            /* code first surrogate */
            nbits += 16;
            bitbuf = (bitbuf << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
            while (nbits >= 6) {
                *out++ = TO_BASE64(bitbuf >> (nbits-6));
                nbits -= 6;
            }
            /* prepare second surrogate */
            ch = Py_UNICODE_LOW_SURROGATE(ch);
        }
        nbits += 16;
        bitbuf = (bitbuf << 16) | ch;
        while (nbits >= 6) {
            *out++ = TO_BASE64(bitbuf >> (nbits-6));
            nbits -= 6;
        }
    }

    /* Emit any leftover bits, then close a section that is still open. */
    if (nbits) {
        *out++ = TO_BASE64(bitbuf << (6-nbits));
    }
    if (shifted) {
        *out++ = '-';
    }
    return PyBytesWriter_FinishWithPointer(writer, out);
}
4947
4948
#undef IS_BASE64
4949
#undef FROM_BASE64
4950
#undef TO_BASE64
4951
#undef DECODE_DIRECT
4952
#undef ENCODE_DIRECT
4953
4954
/* --- UTF-8 Codec -------------------------------------------------------- */
4955
4956
PyObject *
4957
PyUnicode_DecodeUTF8(const char *s,
4958
                     Py_ssize_t size,
4959
                     const char *errors)
4960
83.8M
{
4961
83.8M
    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4962
83.8M
}
4963
4964
#include "stringlib/asciilib.h"
4965
#include "stringlib/codecs.h"
4966
#include "stringlib/undef.h"
4967
4968
#include "stringlib/ucs1lib.h"
4969
#include "stringlib/codecs.h"
4970
#include "stringlib/undef.h"
4971
4972
#include "stringlib/ucs2lib.h"
4973
#include "stringlib/codecs.h"
4974
#include "stringlib/undef.h"
4975
4976
#include "stringlib/ucs4lib.h"
4977
#include "stringlib/codecs.h"
4978
#include "stringlib/undef.h"
4979
4980
/* Word-sized bit patterns for the UTF-8 fast paths, chosen to match the
   width of size_t. */
#if SIZEOF_SIZE_T == 8
/* High bit of every byte: a word ANDed with this mask is non-zero exactly
   when it contains a non-ASCII, UTF8-encoded byte. */
# define ASCII_CHAR_MASK 0x8080808080808080ULL
/* Lane masks used when counting code points in a UTF-8 string. */
# define VECTOR_0101     0x0101010101010101ULL
# define VECTOR_00FF     0x00ff00ff00ff00ffULL
#elif SIZEOF_SIZE_T == 4
# define ASCII_CHAR_MASK 0x80808080U
# define VECTOR_0101     0x01010101U
# define VECTOR_00FF     0x00ff00ffU
#else
# error C 'size_t' size should be either 4 or 8!
#endif
4994
4995
/* ctz(v): index of the lowest set bit of v.  Provided for clang/GCC (via
   __builtin_ctzll) and for MSVC (via _BitScanForward / _BitScanForward64);
   HAVE_CTZ tells the rest of the file whether it is available.  Callers
   in this file only pass non-zero values. */
#if defined(__clang__) || defined(__GNUC__) || defined(_MSC_VER)
#define HAVE_CTZ 1
static inline unsigned int
ctz(size_t v)
{
#if defined(__clang__) || defined(__GNUC__)
    return __builtin_ctzll((unsigned long long)v);
#else /* _MSC_VER */
    unsigned long pos;
#if SIZEOF_SIZE_T == 4
    _BitScanForward(&pos, v);
#else
    _BitScanForward64(&pos, v);
#endif /* SIZEOF_SIZE_T */
    return pos;
#endif
}
#else
#define HAVE_CTZ 0
#endif
5018
5019
#if HAVE_CTZ && PY_LITTLE_ENDIAN
/* Pack the bytes p[0]..p[size-1] into a size_t without unaligned access
   and without reading past p[size-1].  Bytes of the result that are not
   covered by `size` are zero.  A `size` that matches no explicit case is
   handled like the largest case (the whole word is read). */
static size_t
load_unaligned(const unsigned char *p, size_t size)
{
    union {
        size_t word;
        unsigned char bytes[SIZEOF_SIZE_T];
    } pack;
    pack.word = 0;
    // This switch statement assumes little endian because:
    // * union is faster than bitwise or and shift.
    // * big endian machine is rare and hard to maintain.
    // Each case copies one byte and falls through to the next lower index.
    switch (size) {
    default:
#if SIZEOF_SIZE_T == 8
    case 8:
        pack.bytes[7] = p[7];
        _Py_FALLTHROUGH;
    case 7:
        pack.bytes[6] = p[6];
        _Py_FALLTHROUGH;
    case 6:
        pack.bytes[5] = p[5];
        _Py_FALLTHROUGH;
    case 5:
        pack.bytes[4] = p[4];
        _Py_FALLTHROUGH;
#endif
    case 4:
        pack.bytes[3] = p[3];
        _Py_FALLTHROUGH;
    case 3:
        pack.bytes[2] = p[2];
        _Py_FALLTHROUGH;
    case 2:
        pack.bytes[1] = p[1];
        _Py_FALLTHROUGH;
    case 1:
        pack.bytes[0] = p[0];
        break;
    case 0:
        /* nothing to copy; the result stays zero */
        break;
    }
    return pack.word;
}
#endif
5066
5067
/*
5068
 * Find the first non-ASCII character in a byte sequence.
5069
 *
5070
 * This function scans a range of bytes from `start` to `end` and returns the
5071
 * index of the first byte that is not an ASCII character (i.e., has the most
5072
 * significant bit set). If all characters in the range are ASCII, it returns
5073
 * `end - start`.
5074
 */
5075
static Py_ssize_t
5076
find_first_nonascii(const unsigned char *start, const unsigned char *end)
5077
12.6M
{
5078
    // The search is done in `size_t` chunks.
5079
    // The start and end might not be aligned at `size_t` boundaries,
5080
    // so they're handled specially.
5081
5082
12.6M
    const unsigned char *p = start;
5083
5084
12.6M
    if (end - start >= SIZEOF_SIZE_T) {
5085
        // Avoid unaligned read.
5086
3.50M
#if PY_LITTLE_ENDIAN && HAVE_CTZ
5087
3.50M
        size_t u;
5088
3.50M
        memcpy(&u, p, sizeof(size_t));
5089
3.50M
        u &= ASCII_CHAR_MASK;
5090
3.50M
        if (u) {
5091
869k
            return (ctz(u) - 7) / 8;
5092
869k
        }
5093
2.63M
        p = _Py_ALIGN_DOWN(p + SIZEOF_SIZE_T, SIZEOF_SIZE_T);
5094
#else /* PY_LITTLE_ENDIAN && HAVE_CTZ */
5095
        const unsigned char *p2 = _Py_ALIGN_UP(p, SIZEOF_SIZE_T);
5096
        while (p < p2) {
5097
            if (*p & 0x80) {
5098
                return p - start;
5099
            }
5100
            p++;
5101
        }
5102
#endif
5103
5104
2.63M
        const unsigned char *e = end - SIZEOF_SIZE_T;
5105
64.4M
        while (p <= e) {
5106
61.8M
            size_t u = (*(const size_t *)p) & ASCII_CHAR_MASK;
5107
61.8M
            if (u) {
5108
99.3k
#if PY_LITTLE_ENDIAN && HAVE_CTZ
5109
99.3k
                return p - start + (ctz(u) - 7) / 8;
5110
#else
5111
                // big endian and minor compilers are difficult to test.
5112
                // fallback to per byte check.
5113
                break;
5114
#endif
5115
99.3k
            }
5116
61.7M
            p += SIZEOF_SIZE_T;
5117
61.7M
        }
5118
2.63M
    }
5119
11.6M
#if PY_LITTLE_ENDIAN && HAVE_CTZ
5120
12.6M
    assert((end - p) < SIZEOF_SIZE_T);
5121
    // we can not use *(const size_t*)p to avoid buffer overrun.
5122
11.6M
    size_t u = load_unaligned(p, end - p) & ASCII_CHAR_MASK;
5123
11.6M
    if (u) {
5124
580k
        return p - start + (ctz(u) - 7) / 8;
5125
580k
    }
5126
11.0M
    return end - start;
5127
#else
5128
    while (p < end) {
5129
        if (*p & 0x80) {
5130
            break;
5131
        }
5132
        p++;
5133
    }
5134
    return p - start;
5135
#endif
5136
11.6M
}
5137
5138
static inline int
scalar_utf8_start_char(unsigned int ch)
{
    // Returns 1 when the byte opens a code point, i.e. it has the form
    // 0xxxxxxx (bit 7 clear) or 11xxxxxx (bit 6 set); returns 0 for a
    // continuation byte 10xxxxxx.
    unsigned int bit7_clear = ~ch >> 7;
    unsigned int bit6 = ch >> 6;
    return (bit7_clear | bit6) & 1;
}
5144
5145
static inline size_t
5146
vector_utf8_start_chars(size_t v)
5147
23.6M
{
5148
23.6M
    return ((~v >> 7) | (v >> 6)) & VECTOR_0101;
5149
23.6M
}
5150
5151
5152
// Count the number of UTF-8 code points in a given byte sequence.
5153
static Py_ssize_t
5154
utf8_count_codepoints(const unsigned char *s, const unsigned char *end)
5155
409k
{
5156
409k
    Py_ssize_t len = 0;
5157
5158
409k
    if (end - s >= SIZEOF_SIZE_T) {
5159
377k
        while (!_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) {
5160
217k
            len += scalar_utf8_start_char(*s++);
5161
217k
        }
5162
5163
395k
        while (s + SIZEOF_SIZE_T <= end) {
5164
234k
            const unsigned char *e = end;
5165
234k
            if (e - s > SIZEOF_SIZE_T * 255) {
5166
89.7k
                e = s + SIZEOF_SIZE_T * 255;
5167
89.7k
            }
5168
234k
            Py_ssize_t vstart = 0;
5169
23.8M
            while (s + SIZEOF_SIZE_T <= e) {
5170
23.6M
                size_t v = *(size_t*)s;
5171
23.6M
                size_t vs = vector_utf8_start_chars(v);
5172
23.6M
                vstart += vs;
5173
23.6M
                s += SIZEOF_SIZE_T;
5174
23.6M
            }
5175
234k
            vstart = (vstart & VECTOR_00FF) + ((vstart >> 8) & VECTOR_00FF);
5176
234k
            vstart += vstart >> 16;
5177
234k
#if SIZEOF_SIZE_T == 8
5178
234k
            vstart += vstart >> 32;
5179
234k
#endif
5180
234k
            len += vstart & 0x7ff;
5181
234k
        }
5182
160k
    }
5183
1.75M
    while (s < end) {
5184
1.34M
        len += scalar_utf8_start_char(*s++);
5185
1.34M
    }
5186
409k
    return len;
5187
409k
}
5188
5189
static Py_ssize_t
5190
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
5191
416k
{
5192
416k
#if SIZEOF_SIZE_T <= SIZEOF_VOID_P
5193
416k
    if (_Py_IS_ALIGNED(start, ALIGNOF_SIZE_T)
5194
136k
        && _Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T))
5195
69.6k
    {
5196
        /* Fast path, see in STRINGLIB(utf8_decode) for
5197
           an explanation. */
5198
69.6k
        const char *p = start;
5199
69.6k
        Py_UCS1 *q = dest;
5200
1.23M
        while (p + SIZEOF_SIZE_T <= end) {
5201
1.16M
            size_t value = *(const size_t *) p;
5202
1.16M
            if (value & ASCII_CHAR_MASK)
5203
281
                break;
5204
1.16M
            *((size_t *)q) = value;
5205
1.16M
            p += SIZEOF_SIZE_T;
5206
1.16M
            q += SIZEOF_SIZE_T;
5207
1.16M
        }
5208
296k
        while (p < end) {
5209
227k
            if ((unsigned char)*p & 0x80)
5210
438
                break;
5211
227k
            *q++ = *p++;
5212
227k
        }
5213
69.6k
        return p - start;
5214
69.6k
    }
5215
346k
#endif
5216
346k
    Py_ssize_t pos = find_first_nonascii((const unsigned char*)start,
5217
346k
                                         (const unsigned char*)end);
5218
346k
    memcpy(dest, start, pos);
5219
346k
    return pos;
5220
416k
}
5221
5222
static int
5223
unicode_decode_utf8_impl(_PyUnicodeWriter *writer,
5224
                         const char *starts, const char *s, const char *end,
5225
                         _Py_error_handler error_handler,
5226
                         const char *errors,
5227
                         Py_ssize_t *consumed)
5228
1.58M
{
5229
1.58M
    Py_ssize_t startinpos, endinpos;
5230
1.58M
    const char *errmsg = "";
5231
1.58M
    PyObject *error_handler_obj = NULL;
5232
1.58M
    PyObject *exc = NULL;
5233
5234
3.11M
    while (s < end) {
5235
2.90M
        Py_UCS4 ch;
5236
2.90M
        int kind = writer->kind;
5237
5238
2.90M
        if (kind == PyUnicode_1BYTE_KIND) {
5239
1.58M
            if (PyUnicode_IS_ASCII(writer->buffer))
5240
1.13M
                ch = asciilib_utf8_decode(&s, end, writer->data, &writer->pos);
5241
444k
            else
5242
444k
                ch = ucs1lib_utf8_decode(&s, end, writer->data, &writer->pos);
5243
1.58M
        } else if (kind == PyUnicode_2BYTE_KIND) {
5244
818k
            ch = ucs2lib_utf8_decode(&s, end, writer->data, &writer->pos);
5245
818k
        } else {
5246
506k
            assert(kind == PyUnicode_4BYTE_KIND);
5247
506k
            ch = ucs4lib_utf8_decode(&s, end, writer->data, &writer->pos);
5248
506k
        }
5249
5250
2.90M
        switch (ch) {
5251
1.37M
        case 0:
5252
1.37M
            if (s == end || consumed)
5253
1.37M
                goto End;
5254
3.09k
            errmsg = "unexpected end of data";
5255
3.09k
            startinpos = s - starts;
5256
3.09k
            endinpos = end - starts;
5257
3.09k
            break;
5258
33.2k
        case 1:
5259
33.2k
            errmsg = "invalid start byte";
5260
33.2k
            startinpos = s - starts;
5261
33.2k
            endinpos = startinpos + 1;
5262
33.2k
            break;
5263
41.0k
        case 2:
5264
41.0k
            if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5265
12
                && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5266
7
            {
5267
                /* Truncated surrogate code in range D800-DFFF */
5268
7
                goto End;
5269
7
            }
5270
41.0k
            _Py_FALLTHROUGH;
5271
46.8k
        case 3:
5272
48.6k
        case 4:
5273
48.6k
            errmsg = "invalid continuation byte";
5274
48.6k
            startinpos = s - starts;
5275
48.6k
            endinpos = startinpos + ch - 1;
5276
48.6k
            break;
5277
1.45M
        default:
5278
            // ch doesn't fit into kind, so change the buffer kind to write
5279
            // the character
5280
1.45M
            if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
5281
0
                goto onError;
5282
1.45M
            continue;
5283
2.90M
        }
5284
5285
84.9k
        if (error_handler == _Py_ERROR_UNKNOWN)
5286
4.20k
            error_handler = _Py_GetErrorHandler(errors);
5287
5288
84.9k
        switch (error_handler) {
5289
0
        case _Py_ERROR_IGNORE:
5290
0
            s += (endinpos - startinpos);
5291
0
            break;
5292
5293
68.1k
        case _Py_ERROR_REPLACE:
5294
68.1k
            if (_PyUnicodeWriter_WriteCharInline(writer, 0xfffd) < 0)
5295
0
                goto onError;
5296
68.1k
            s += (endinpos - startinpos);
5297
68.1k
            break;
5298
5299
0
        case _Py_ERROR_SURROGATEESCAPE:
5300
0
        {
5301
0
            Py_ssize_t i;
5302
5303
0
            if (_PyUnicodeWriter_PrepareKind(writer, PyUnicode_2BYTE_KIND) < 0)
5304
0
                goto onError;
5305
0
            for (i=startinpos; i<endinpos; i++) {
5306
0
                ch = (Py_UCS4)(unsigned char)(starts[i]);
5307
0
                PyUnicode_WRITE(writer->kind, writer->data, writer->pos,
5308
0
                                ch + 0xdc00);
5309
0
                writer->pos++;
5310
0
            }
5311
0
            s += (endinpos - startinpos);
5312
0
            break;
5313
0
        }
5314
5315
16.8k
        default:
5316
16.8k
            if (unicode_decode_call_errorhandler_writer(
5317
16.8k
                    errors, &error_handler_obj,
5318
16.8k
                    "utf-8", errmsg,
5319
16.8k
                    &starts, &end, &startinpos, &endinpos, &exc, &s,
5320
16.8k
                    writer)) {
5321
5.84k
                goto onError;
5322
5.84k
            }
5323
5324
11.0k
            if (_PyUnicodeWriter_Prepare(writer, end - s, 127) < 0) {
5325
0
                return -1;
5326
0
            }
5327
84.9k
        }
5328
84.9k
    }
5329
5330
1.57M
End:
5331
1.57M
    if (consumed)
5332
56
        *consumed = s - starts;
5333
5334
1.57M
    Py_XDECREF(error_handler_obj);
5335
1.57M
    Py_XDECREF(exc);
5336
1.57M
    return 0;
5337
5338
5.84k
onError:
5339
5.84k
    Py_XDECREF(error_handler_obj);
5340
5.84k
    Py_XDECREF(exc);
5341
5.84k
    return -1;
5342
1.58M
}
5343
5344
5345
static PyObject *
5346
unicode_decode_utf8(const char *s, Py_ssize_t size,
5347
                    _Py_error_handler error_handler, const char *errors,
5348
                    Py_ssize_t *consumed)
5349
88.8M
{
5350
88.8M
    if (size == 0) {
5351
1.00M
        if (consumed) {
5352
0
            *consumed = 0;
5353
0
        }
5354
1.00M
        _Py_RETURN_UNICODE_EMPTY();
5355
1.00M
    }
5356
5357
    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
5358
87.8M
    if (size == 1 && (unsigned char)s[0] < 128) {
5359
75.5M
        if (consumed) {
5360
0
            *consumed = 1;
5361
0
        }
5362
75.5M
        return get_latin1_char((unsigned char)s[0]);
5363
75.5M
    }
5364
5365
    // I don't know this check is necessary or not. But there is a test
5366
    // case that requires size=PY_SSIZE_T_MAX cause MemoryError.
5367
12.2M
    if (PY_SSIZE_T_MAX - sizeof(PyCompactUnicodeObject) < (size_t)size) {
5368
0
        PyErr_NoMemory();
5369
0
        return NULL;
5370
0
    }
5371
5372
12.2M
    const char *starts = s;
5373
12.2M
    const char *end = s + size;
5374
5375
12.2M
    Py_ssize_t pos = find_first_nonascii((const unsigned char*)starts, (const unsigned char*)end);
5376
12.2M
    if (pos == size) {  // fast path: ASCII string.
5377
10.7M
        PyObject *u = PyUnicode_New(size, 127);
5378
10.7M
        if (u == NULL) {
5379
0
            return NULL;
5380
0
        }
5381
10.7M
        memcpy(PyUnicode_1BYTE_DATA(u), s, size);
5382
10.7M
        if (consumed) {
5383
0
            *consumed = size;
5384
0
        }
5385
10.7M
        return u;
5386
10.7M
    }
5387
5388
1.54M
    int maxchr = 127;
5389
1.54M
    Py_ssize_t maxsize = size;
5390
5391
1.54M
    unsigned char ch = (unsigned char)(s[pos]);
5392
    // error handler other than strict may remove/replace the invalid byte.
5393
    // consumed != NULL allows 1~3 bytes remainings.
5394
    // 0x80 <= ch < 0xc2 is invalid start byte that cause UnicodeDecodeError.
5395
    // otherwise: check the input and decide the maxchr and maxsize to reduce
5396
    // reallocation and copy.
5397
1.54M
    if (error_handler == _Py_ERROR_STRICT && !consumed && ch >= 0xc2) {
5398
        // we only calculate the number of codepoints and don't determine the exact maxchr.
5399
        // This is because writing fast and portable SIMD code to find maxchr is difficult.
5400
        // If reallocation occurs for a larger maxchar, knowing the exact number of codepoints
5401
        // means that it is no longer necessary to allocate several times the required amount
5402
        // of memory.
5403
409k
        maxsize = utf8_count_codepoints((const unsigned char *)s, (const unsigned char *)end);
5404
409k
        if (ch < 0xc4) { // latin1
5405
176k
            maxchr = 0xff;
5406
176k
        }
5407
233k
        else if (ch < 0xf0) { // ucs2
5408
197k
            maxchr = 0xffff;
5409
197k
        }
5410
35.9k
        else { // ucs4
5411
35.9k
            maxchr = 0x10ffff;
5412
35.9k
        }
5413
409k
    }
5414
1.54M
    PyObject *u = PyUnicode_New(maxsize, maxchr);
5415
1.54M
    if (!u) {
5416
0
        return NULL;
5417
0
    }
5418
5419
    // Use _PyUnicodeWriter after fast path is failed.
5420
1.54M
    _PyUnicodeWriter writer;
5421
1.54M
    _PyUnicodeWriter_InitWithBuffer(&writer, u);
5422
1.54M
    if (maxchr <= 255) {
5423
1.31M
        memcpy(PyUnicode_1BYTE_DATA(u), s, pos);
5424
1.31M
        s += pos;
5425
1.31M
        size -= pos;
5426
1.31M
        writer.pos = pos;
5427
1.31M
    }
5428
5429
1.54M
    if (unicode_decode_utf8_impl(&writer, starts, s, end,
5430
1.54M
                                 error_handler, errors,
5431
1.54M
                                 consumed) < 0) {
5432
5.84k
        _PyUnicodeWriter_Dealloc(&writer);
5433
5.84k
        return NULL;
5434
5.84k
    }
5435
1.54M
    return _PyUnicodeWriter_Finish(&writer);
5436
1.54M
}
5437
5438
5439
// Used by PyUnicodeWriter_WriteUTF8() implementation
5440
static int
5441
unicode_decode_utf8_writer(_PyUnicodeWriter *writer,
5442
                           const char *s, Py_ssize_t size,
5443
                           _Py_error_handler error_handler, const char *errors,
5444
                           Py_ssize_t *consumed)
5445
420k
{
5446
420k
    if (size == 0) {
5447
29.4k
        if (consumed) {
5448
0
            *consumed = 0;
5449
0
        }
5450
29.4k
        return 0;
5451
29.4k
    }
5452
5453
    // fast path: try ASCII string.
5454
390k
    if (_PyUnicodeWriter_Prepare(writer, size, 127) < 0) {
5455
0
        return -1;
5456
0
    }
5457
5458
390k
    const char *starts = s;
5459
390k
    const char *end = s + size;
5460
390k
    Py_ssize_t decoded = 0;
5461
390k
    Py_UCS1 *dest = (Py_UCS1*)writer->data + writer->pos * writer->kind;
5462
390k
    if (writer->kind == PyUnicode_1BYTE_KIND) {
5463
358k
        decoded = ascii_decode(s, end, dest);
5464
358k
        writer->pos += decoded;
5465
5466
358k
        if (decoded == size) {
5467
357k
            if (consumed) {
5468
2.45k
                *consumed = size;
5469
2.45k
            }
5470
357k
            return 0;
5471
357k
        }
5472
1.85k
        s += decoded;
5473
1.85k
        size -= decoded;
5474
1.85k
    }
5475
5476
33.9k
    return unicode_decode_utf8_impl(writer, starts, s, end,
5477
33.9k
                                    error_handler, errors, consumed);
5478
390k
}
5479
5480
5481
PyObject *
5482
PyUnicode_DecodeUTF8Stateful(const char *s,
5483
                             Py_ssize_t size,
5484
                             const char *errors,
5485
                             Py_ssize_t *consumed)
5486
88.8M
{
5487
88.8M
    return unicode_decode_utf8(s, size,
5488
88.8M
                               errors ? _Py_ERROR_UNKNOWN : _Py_ERROR_STRICT,
5489
88.8M
                               errors, consumed);
5490
88.8M
}
5491
5492
5493
/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5494
   non-zero, use strict error handler otherwise.
5495
5496
   On success, write a pointer to a newly allocated wide character string into
5497
   *wstr (use PyMem_RawFree() to free the memory) and write the output length
5498
   (in number of wchar_t units) into *wlen (if wlen is set).
5499
5500
   On memory allocation failure, return -1.
5501
5502
   On decoding error (if surrogateescape is zero), return -2. If wlen is
5503
   non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5504
   is not NULL, write the decoding error message into *reason. */
5505
int
5506
_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
5507
                 const char **reason, _Py_error_handler errors)
5508
7.19k
{
5509
7.19k
    const char *orig_s = s;
5510
7.19k
    const char *e;
5511
7.19k
    wchar_t *unicode;
5512
7.19k
    Py_ssize_t outpos;
5513
5514
7.19k
    int surrogateescape = 0;
5515
7.19k
    int surrogatepass = 0;
5516
7.19k
    switch (errors)
5517
7.19k
    {
5518
0
    case _Py_ERROR_STRICT:
5519
0
        break;
5520
7.19k
    case _Py_ERROR_SURROGATEESCAPE:
5521
7.19k
        surrogateescape = 1;
5522
7.19k
        break;
5523
0
    case _Py_ERROR_SURROGATEPASS:
5524
0
        surrogatepass = 1;
5525
0
        break;
5526
0
    default:
5527
0
        return -3;
5528
7.19k
    }
5529
5530
    /* Note: size will always be longer than the resulting Unicode
5531
       character count */
5532
7.19k
    if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1 < size) {
5533
0
        return -1;
5534
0
    }
5535
5536
7.19k
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
5537
7.19k
    if (!unicode) {
5538
0
        return -1;
5539
0
    }
5540
5541
    /* Unpack UTF-8 encoded data */
5542
7.19k
    e = s + size;
5543
7.19k
    outpos = 0;
5544
7.19k
    while (s < e) {
5545
7.19k
        Py_UCS4 ch;
5546
7.19k
#if SIZEOF_WCHAR_T == 4
5547
7.19k
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
5548
#else
5549
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
5550
#endif
5551
7.19k
        if (ch > 0xFF) {
5552
0
#if SIZEOF_WCHAR_T == 4
5553
0
            Py_UNREACHABLE();
5554
#else
5555
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
5556
            /* write a surrogate pair */
5557
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5558
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5559
#endif
5560
0
        }
5561
7.19k
        else {
5562
7.19k
            if (!ch && s == e) {
5563
7.19k
                break;
5564
7.19k
            }
5565
5566
0
            if (surrogateescape) {
5567
0
                unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5568
0
            }
5569
0
            else {
5570
                /* Is it a valid three-byte code? */
5571
0
                if (surrogatepass
5572
0
                    && (e - s) >= 3
5573
0
                    && (s[0] & 0xf0) == 0xe0
5574
0
                    && (s[1] & 0xc0) == 0x80
5575
0
                    && (s[2] & 0xc0) == 0x80)
5576
0
                {
5577
0
                    ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5578
0
                    s += 3;
5579
0
                    unicode[outpos++] = ch;
5580
0
                }
5581
0
                else {
5582
0
                    PyMem_RawFree(unicode );
5583
0
                    if (reason != NULL) {
5584
0
                        switch (ch) {
5585
0
                        case 0:
5586
0
                            *reason = "unexpected end of data";
5587
0
                            break;
5588
0
                        case 1:
5589
0
                            *reason = "invalid start byte";
5590
0
                            break;
5591
                        /* 2, 3, 4 */
5592
0
                        default:
5593
0
                            *reason = "invalid continuation byte";
5594
0
                            break;
5595
0
                        }
5596
0
                    }
5597
0
                    if (wlen != NULL) {
5598
0
                        *wlen = s - orig_s;
5599
0
                    }
5600
0
                    return -2;
5601
0
                }
5602
0
            }
5603
0
        }
5604
7.19k
    }
5605
7.19k
    unicode[outpos] = L'\0';
5606
7.19k
    if (wlen) {
5607
7.19k
        *wlen = outpos;
5608
7.19k
    }
5609
7.19k
    *wstr = unicode;
5610
7.19k
    return 0;
5611
7.19k
}
5612
5613
5614
wchar_t*
5615
_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5616
                               size_t *wlen)
5617
0
{
5618
0
    wchar_t *wstr;
5619
0
    int res = _Py_DecodeUTF8Ex(arg, arglen,
5620
0
                               &wstr, wlen,
5621
0
                               NULL, _Py_ERROR_SURROGATEESCAPE);
5622
0
    if (res != 0) {
5623
        /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5624
0
        assert(res != -3);
5625
0
        if (wlen) {
5626
0
            *wlen = (size_t)res;
5627
0
        }
5628
0
        return NULL;
5629
0
    }
5630
0
    return wstr;
5631
0
}
5632
5633
5634
/* UTF-8 encoder.
5635
5636
   On success, return 0 and write the newly allocated character string (use
5637
   PyMem_Free() to free the memory) into *str.
5638
5639
   On encoding failure, return -2 and write the position of the invalid
5640
   surrogate character into *error_pos (if error_pos is set) and the decoding
5641
   error message into *reason (if reason is set).
5642
5643
   On memory allocation failure, return -1. */
5644
int
5645
_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
5646
                 const char **reason, int raw_malloc, _Py_error_handler errors)
5647
902
{
5648
902
    const Py_ssize_t max_char_size = 4;
5649
902
    Py_ssize_t len = wcslen(text);
5650
5651
902
    assert(len >= 0);
5652
5653
902
    int surrogateescape = 0;
5654
902
    int surrogatepass = 0;
5655
902
    switch (errors)
5656
902
    {
5657
88
    case _Py_ERROR_STRICT:
5658
88
        break;
5659
814
    case _Py_ERROR_SURROGATEESCAPE:
5660
814
        surrogateescape = 1;
5661
814
        break;
5662
0
    case _Py_ERROR_SURROGATEPASS:
5663
0
        surrogatepass = 1;
5664
0
        break;
5665
0
    default:
5666
0
        return -3;
5667
902
    }
5668
5669
902
    if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5670
0
        return -1;
5671
0
    }
5672
902
    char *bytes;
5673
902
    if (raw_malloc) {
5674
902
        bytes = PyMem_RawMalloc((len + 1) * max_char_size);
5675
902
    }
5676
0
    else {
5677
0
        bytes = PyMem_Malloc((len + 1) * max_char_size);
5678
0
    }
5679
902
    if (bytes == NULL) {
5680
0
        return -1;
5681
0
    }
5682
5683
902
    char *p = bytes;
5684
902
    Py_ssize_t i;
5685
54.7k
    for (i = 0; i < len; ) {
5686
53.8k
        Py_ssize_t ch_pos = i;
5687
53.8k
        Py_UCS4 ch = text[i];
5688
53.8k
        i++;
5689
#if Py_UNICODE_SIZE == 2
5690
        if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5691
            && i < len
5692
            && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5693
        {
5694
            ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5695
            i++;
5696
        }
5697
#endif
5698
5699
53.8k
        if (ch < 0x80) {
5700
            /* Encode ASCII */
5701
53.8k
            *p++ = (char) ch;
5702
5703
53.8k
        }
5704
0
        else if (ch < 0x0800) {
5705
            /* Encode Latin-1 */
5706
0
            *p++ = (char)(0xc0 | (ch >> 6));
5707
0
            *p++ = (char)(0x80 | (ch & 0x3f));
5708
0
        }
5709
0
        else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
5710
            /* surrogateescape error handler */
5711
0
            if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
5712
0
                if (error_pos != NULL) {
5713
0
                    *error_pos = (size_t)ch_pos;
5714
0
                }
5715
0
                if (reason != NULL) {
5716
0
                    *reason = "encoding error";
5717
0
                }
5718
0
                if (raw_malloc) {
5719
0
                    PyMem_RawFree(bytes);
5720
0
                }
5721
0
                else {
5722
0
                    PyMem_Free(bytes);
5723
0
                }
5724
0
                return -2;
5725
0
            }
5726
0
            *p++ = (char)(ch & 0xff);
5727
0
        }
5728
0
        else if (ch < 0x10000) {
5729
0
            *p++ = (char)(0xe0 | (ch >> 12));
5730
0
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5731
0
            *p++ = (char)(0x80 | (ch & 0x3f));
5732
0
        }
5733
0
        else {  /* ch >= 0x10000 */
5734
0
            assert(ch <= MAX_UNICODE);
5735
            /* Encode UCS4 Unicode ordinals */
5736
0
            *p++ = (char)(0xf0 | (ch >> 18));
5737
0
            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5738
0
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5739
0
            *p++ = (char)(0x80 | (ch & 0x3f));
5740
0
        }
5741
53.8k
    }
5742
902
    *p++ = '\0';
5743
5744
902
    size_t final_size = (p - bytes);
5745
902
    char *bytes2;
5746
902
    if (raw_malloc) {
5747
902
        bytes2 = PyMem_RawRealloc(bytes, final_size);
5748
902
    }
5749
0
    else {
5750
0
        bytes2 = PyMem_Realloc(bytes, final_size);
5751
0
    }
5752
902
    if (bytes2 == NULL) {
5753
0
        if (error_pos != NULL) {
5754
0
            *error_pos = (size_t)-1;
5755
0
        }
5756
0
        if (raw_malloc) {
5757
0
            PyMem_RawFree(bytes);
5758
0
        }
5759
0
        else {
5760
0
            PyMem_Free(bytes);
5761
0
        }
5762
0
        return -1;
5763
0
    }
5764
902
    *str = bytes2;
5765
902
    return 0;
5766
902
}
5767
5768
5769
/* Primary internal function which creates utf8 encoded bytes objects.
5770
5771
   Allocation strategy:  if the string is short, convert into a stack buffer
5772
   and allocate exactly as much space needed at the end.  Else allocate the
5773
   maximum possible needed (4 result bytes per Unicode character), and return
5774
   the excess memory at the end.
5775
*/
5776
static PyObject *
5777
unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5778
                    const char *errors)
5779
10.8k
{
5780
10.8k
    if (!PyUnicode_Check(unicode)) {
5781
0
        PyErr_BadArgument();
5782
0
        return NULL;
5783
0
    }
5784
5785
10.8k
    if (PyUnicode_UTF8(unicode))
5786
8.26k
        return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5787
8.26k
                                         PyUnicode_UTF8_LENGTH(unicode));
5788
5789
2.61k
    int kind = PyUnicode_KIND(unicode);
5790
2.61k
    const void *data = PyUnicode_DATA(unicode);
5791
2.61k
    Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5792
5793
2.61k
    PyBytesWriter *writer;
5794
2.61k
    char *end;
5795
5796
2.61k
    switch (kind) {
5797
0
    default:
5798
0
        Py_UNREACHABLE();
5799
639
    case PyUnicode_1BYTE_KIND:
5800
        /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5801
639
        assert(!PyUnicode_IS_ASCII(unicode));
5802
639
        writer = ucs1lib_utf8_encoder(unicode, data, size,
5803
639
                                      error_handler, errors, &end);
5804
639
        break;
5805
1.74k
    case PyUnicode_2BYTE_KIND:
5806
1.74k
        writer = ucs2lib_utf8_encoder(unicode, data, size,
5807
1.74k
                                      error_handler, errors, &end);
5808
1.74k
        break;
5809
228
    case PyUnicode_4BYTE_KIND:
5810
228
        writer = ucs4lib_utf8_encoder(unicode, data, size,
5811
228
                                      error_handler, errors, &end);
5812
228
        break;
5813
2.61k
    }
5814
5815
2.61k
    if (writer == NULL) {
5816
31
        PyBytesWriter_Discard(writer);
5817
31
        return NULL;
5818
31
    }
5819
2.58k
    return PyBytesWriter_FinishWithPointer(writer, end);
5820
2.61k
}
5821
5822
static int
5823
unicode_fill_utf8(PyObject *unicode)
5824
7.37k
{
5825
7.37k
    _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(unicode);
5826
    /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5827
7.37k
    assert(!PyUnicode_IS_ASCII(unicode));
5828
5829
7.37k
    int kind = PyUnicode_KIND(unicode);
5830
7.37k
    const void *data = PyUnicode_DATA(unicode);
5831
7.37k
    Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5832
5833
7.37k
    PyBytesWriter *writer;
5834
7.37k
    char *end;
5835
5836
7.37k
    switch (kind) {
5837
0
    default:
5838
0
        Py_UNREACHABLE();
5839
2.76k
    case PyUnicode_1BYTE_KIND:
5840
2.76k
        writer = ucs1lib_utf8_encoder(unicode, data, size,
5841
2.76k
                                      _Py_ERROR_STRICT, NULL, &end);
5842
2.76k
        break;
5843
3.46k
    case PyUnicode_2BYTE_KIND:
5844
3.46k
        writer = ucs2lib_utf8_encoder(unicode, data, size,
5845
3.46k
                                      _Py_ERROR_STRICT, NULL, &end);
5846
3.46k
        break;
5847
1.15k
    case PyUnicode_4BYTE_KIND:
5848
1.15k
        writer = ucs4lib_utf8_encoder(unicode, data, size,
5849
1.15k
                                      _Py_ERROR_STRICT, NULL, &end);
5850
1.15k
        break;
5851
7.37k
    }
5852
7.37k
    if (writer == NULL) {
5853
5
        return -1;
5854
5
    }
5855
5856
7.36k
    const char *start = PyBytesWriter_GetData(writer);
5857
7.36k
    Py_ssize_t len = end - start;
5858
5859
7.36k
    char *cache = PyMem_Malloc(len + 1);
5860
7.36k
    if (cache == NULL) {
5861
0
        PyBytesWriter_Discard(writer);
5862
0
        PyErr_NoMemory();
5863
0
        return -1;
5864
0
    }
5865
7.36k
    memcpy(cache, start, len);
5866
7.36k
    cache[len] = '\0';
5867
7.36k
    PyUnicode_SET_UTF8_LENGTH(unicode, len);
5868
7.36k
    PyUnicode_SET_UTF8(unicode, cache);
5869
7.36k
    PyBytesWriter_Discard(writer);
5870
7.36k
    return 0;
5871
7.36k
}
5872
5873
/* Encode `unicode` to a UTF-8 bytes object.  The error handler is left
   unresolved (_Py_ERROR_UNKNOWN) and `errors` is passed through. */
PyObject *
_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
{
    PyObject *encoded = unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN,
                                            errors);
    return encoded;
}
5878
5879
5880
/* Encode `unicode` to a UTF-8 bytes object with a NULL `errors` argument. */
PyObject *
PyUnicode_AsUTF8String(PyObject *unicode)
{
    /* Same call that _PyUnicode_AsUTF8String(unicode, NULL) performs. */
    return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, NULL);
}
5885
5886
/* --- UTF-32 Codec ------------------------------------------------------- */
5887
5888
PyObject *
5889
PyUnicode_DecodeUTF32(const char *s,
5890
                      Py_ssize_t size,
5891
                      const char *errors,
5892
                      int *byteorder)
5893
26
{
5894
26
    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5895
26
}
5896
5897
PyObject *
5898
PyUnicode_DecodeUTF32Stateful(const char *s,
5899
                              Py_ssize_t size,
5900
                              const char *errors,
5901
                              int *byteorder,
5902
                              Py_ssize_t *consumed)
5903
374
{
5904
374
    const char *starts = s;
5905
374
    Py_ssize_t startinpos;
5906
374
    Py_ssize_t endinpos;
5907
374
    _PyUnicodeWriter writer;
5908
374
    const unsigned char *q, *e;
5909
374
    int le, bo = 0;       /* assume native ordering by default */
5910
374
    const char *encoding;
5911
374
    const char *errmsg = "";
5912
374
    PyObject *errorHandler = NULL;
5913
374
    PyObject *exc = NULL;
5914
5915
374
    q = (const unsigned char *)s;
5916
374
    e = q + size;
5917
5918
374
    if (byteorder)
5919
348
        bo = *byteorder;
5920
5921
    /* Check for BOM marks (U+FEFF) in the input and adjust current
5922
       byte order setting accordingly. In native mode, the leading BOM
5923
       mark is skipped, in all other modes, it is copied to the output
5924
       stream as-is (giving a ZWNBSP character). */
5925
374
    if (bo == 0 && size >= 4) {
5926
34
        Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5927
34
        if (bom == 0x0000FEFF) {
5928
8
            bo = -1;
5929
8
            q += 4;
5930
8
        }
5931
26
        else if (bom == 0xFFFE0000) {
5932
14
            bo = 1;
5933
14
            q += 4;
5934
14
        }
5935
34
        if (byteorder)
5936
8
            *byteorder = bo;
5937
34
    }
5938
5939
374
    if (q == e) {
5940
2
        if (consumed)
5941
0
            *consumed = size;
5942
2
        _Py_RETURN_UNICODE_EMPTY();
5943
2
    }
5944
5945
#ifdef WORDS_BIGENDIAN
5946
    le = bo < 0;
5947
#else
5948
372
    le = bo <= 0;
5949
372
#endif
5950
372
    encoding = le ? "utf-32-le" : "utf-32-be";
5951
5952
372
    _PyUnicodeWriter_Init(&writer);
5953
372
    writer.min_length = (e - q + 3) / 4;
5954
372
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
5955
0
        goto onError;
5956
5957
3.36k
    while (1) {
5958
3.36k
        Py_UCS4 ch = 0;
5959
3.36k
        Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
5960
5961
3.36k
        if (e - q >= 4) {
5962
3.26k
            int kind = writer.kind;
5963
3.26k
            void *data = writer.data;
5964
3.26k
            const unsigned char *last = e - 4;
5965
3.26k
            Py_ssize_t pos = writer.pos;
5966
3.26k
            if (le) {
5967
251k
                do {
5968
251k
                    ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5969
251k
                    if (ch > maxch)
5970
527
                        break;
5971
250k
                    if (kind != PyUnicode_1BYTE_KIND &&
5972
248k
                        Py_UNICODE_IS_SURROGATE(ch))
5973
1.60k
                        break;
5974
249k
                    PyUnicode_WRITE(kind, data, pos++, ch);
5975
249k
                    q += 4;
5976
249k
                } while (q <= last);
5977
2.18k
            }
5978
1.07k
            else {
5979
15.1k
                do {
5980
15.1k
                    ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5981
15.1k
                    if (ch > maxch)
5982
407
                        break;
5983
14.6k
                    if (kind != PyUnicode_1BYTE_KIND &&
5984
9.92k
                        Py_UNICODE_IS_SURROGATE(ch))
5985
628
                        break;
5986
14.0k
                    PyUnicode_WRITE(kind, data, pos++, ch);
5987
14.0k
                    q += 4;
5988
14.0k
                } while (q <= last);
5989
1.07k
            }
5990
3.26k
            writer.pos = pos;
5991
3.26k
        }
5992
5993
3.36k
        if (Py_UNICODE_IS_SURROGATE(ch)) {
5994
2.30k
            errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
5995
2.30k
            startinpos = ((const char *)q) - starts;
5996
2.30k
            endinpos = startinpos + 4;
5997
2.30k
        }
5998
1.06k
        else if (ch <= maxch) {
5999
198
            if (q == e || consumed)
6000
174
                break;
6001
            /* remaining bytes at the end? (size should be divisible by 4) */
6002
24
            errmsg = "truncated data";
6003
24
            startinpos = ((const char *)q) - starts;
6004
24
            endinpos = ((const char *)e) - starts;
6005
24
        }
6006
870
        else {
6007
870
            if (ch < 0x110000) {
6008
314
                if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
6009
0
                    goto onError;
6010
314
                q += 4;
6011
314
                continue;
6012
314
            }
6013
556
            errmsg = "code point not in range(0x110000)";
6014
556
            startinpos = ((const char *)q) - starts;
6015
556
            endinpos = startinpos + 4;
6016
556
        }
6017
6018
        /* The remaining input chars are ignored if the callback
6019
           chooses to skip the input */
6020
2.88k
        if (unicode_decode_call_errorhandler_writer(
6021
2.88k
                errors, &errorHandler,
6022
2.88k
                encoding, errmsg,
6023
2.88k
                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
6024
2.88k
                &writer))
6025
198
            goto onError;
6026
2.88k
    }
6027
6028
174
    if (consumed)
6029
0
        *consumed = (const char *)q-starts;
6030
6031
174
    Py_XDECREF(errorHandler);
6032
174
    Py_XDECREF(exc);
6033
174
    return _PyUnicodeWriter_Finish(&writer);
6034
6035
198
  onError:
6036
198
    _PyUnicodeWriter_Dealloc(&writer);
6037
198
    Py_XDECREF(errorHandler);
6038
198
    Py_XDECREF(exc);
6039
198
    return NULL;
6040
372
}
6041
6042
PyObject *
_PyUnicode_EncodeUTF32(PyObject *str,
                       const char *errors,
                       int byteorder)
{
    /* Encode the string `str` as UTF-32 and return a bytes object.

       byteorder: -1 selects "utf-32-le", 1 selects "utf-32-be"; 0 writes a
           BOM followed by data in native byte order.
       errors:    error handler name used for code points the low-level
           encoder stops on (reported as "surrogates not allowed").

       Returns NULL with an exception set if `str` is not a str, the output
       would be too large, or the error handler fails. */
    if (!PyUnicode_Check(str)) {
        PyErr_BadArgument();
        return NULL;
    }

    const int kind = PyUnicode_KIND(str);
    const void *data = PyUnicode_DATA(str);
    const Py_ssize_t len = PyUnicode_GET_LENGTH(str);
    const int with_bom = (byteorder == 0);

    /* Guard the `nsize * 4` multiplications below against overflow. */
    if (len > PY_SSIZE_T_MAX / 4 - with_bom) {
        return PyErr_NoMemory();
    }
    const Py_ssize_t nsize = len + with_bom;

#if PY_LITTLE_ENDIAN
    const int native_ordering = (byteorder <= 0);
#else
    const int native_ordering = (byteorder >= 0);
#endif

    if (kind == PyUnicode_1BYTE_KIND) {
        // gh-139156: Don't use PyBytesWriter API here since it has an overhead
        // on short strings
        PyObject *bytes = PyBytes_FromStringAndSize(NULL, nsize * 4);
        if (bytes == NULL) {
            return NULL;
        }

        /* output buffer is 4-bytes aligned */
        assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(bytes), 4));
        uint32_t *dst = (uint32_t *)PyBytes_AS_STRING(bytes);
        if (with_bom) {
            *dst++ = 0xFEFF;
        }
        if (len > 0) {
            ucs1lib_utf32_encode((const Py_UCS1 *)data, len,
                                 &dst, native_ordering);
        }
        return bytes;
    }

    PyBytesWriter *writer = PyBytesWriter_Create(nsize * 4);
    if (writer == NULL) {
        return NULL;
    }

    /* output buffer is 4-bytes aligned */
    assert(_Py_IS_ALIGNED(PyBytesWriter_GetData(writer), 4));
    uint32_t *out = (uint32_t *)PyBytesWriter_GetData(writer);
    if (with_bom) {
        *out++ = 0xFEFF;
    }
    if (len == 0) {
        return PyBytesWriter_Finish(writer);
    }

    /* Codec name reported in encode exceptions. */
    const char *encoding = (byteorder == -1) ? "utf-32-le"
                         : (byteorder == 1)  ? "utf-32-be"
                         :                     "utf-32";

    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *rep = NULL;

    Py_ssize_t pos = 0;
    while (pos < len) {
        /* Encode as far as possible; the helper returns how many input
           characters it handled before stopping. */
        if (kind == PyUnicode_2BYTE_KIND) {
            pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        else {
            assert(kind == PyUnicode_4BYTE_KIND);
            pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        if (pos == len) {
            break;
        }

        /* The character at `pos` was rejected: ask the error handler for a
           replacement and for the position to resume from. */
        Py_ssize_t newpos;
        rep = unicode_encode_call_errorhandler(
                errors, &errorHandler,
                encoding, "surrogates not allowed",
                str, &exc, pos, pos + 1, &newpos);
        if (rep == NULL) {
            goto error;
        }

        const int rep_is_bytes = PyBytes_Check(rep);
        Py_ssize_t repsize, moreunits;
        int rep_rejected;
        if (rep_is_bytes) {
            /* bytes replacements are copied verbatim, so they must be a
               whole number of 4-byte units */
            repsize = PyBytes_GET_SIZE(rep);
            rep_rejected = (repsize & 3) != 0;
            moreunits = repsize / 4;
        }
        else {
            /* str replacements are re-encoded and must be pure ASCII */
            assert(PyUnicode_Check(rep));
            repsize = PyUnicode_GET_LENGTH(rep);
            moreunits = repsize;
            rep_rejected = !PyUnicode_IS_ASCII(rep);
        }
        if (rep_rejected) {
            raise_encode_exception(&exc, encoding,
                                   str, pos, pos + 1,
                                   "surrogates not allowed");
            goto error;
        }
        moreunits += pos - newpos;
        pos = newpos;

        /* four bytes are reserved for each surrogate */
        if (moreunits > 0) {
            out = PyBytesWriter_GrowAndUpdatePointer(writer, 4 * moreunits, out);
            if (out == NULL) {
                goto error;
            }
        }

        if (rep_is_bytes) {
            memcpy(out, PyBytes_AS_STRING(rep), repsize);
            out += repsize / 4;
        }
        else {
            /* rep is unicode */
            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
            ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
                                 &out, native_ordering);
        }

        Py_CLEAR(rep);
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);

    /* Trim the result to what was actually written.  The output can be
       shorter than reserved, for example when isolated surrogates are
       dropped by the 'ignore' handler. */
    return PyBytesWriter_FinishWithPointer(writer, out);

  error:
    Py_XDECREF(rep);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    PyBytesWriter_Discard(writer);
    return NULL;
}
6195
6196
PyObject *
PyUnicode_AsUTF32String(PyObject *unicode)
{
    /* Encode `unicode` as UTF-32 with no explicit error handler (NULL) and
       byteorder 0, i.e. BOM-prefixed output in native byte order.
       Returns whatever _PyUnicode_EncodeUTF32() returns (NULL on error). */
    PyObject *encoded = _PyUnicode_EncodeUTF32(unicode, NULL, 0);
    return encoded;
}
6201
6202
/* --- UTF-16 Codec ------------------------------------------------------- */
6203
6204
PyObject *
6205
PyUnicode_DecodeUTF16(const char *s,
6206
                      Py_ssize_t size,
6207
                      const char *errors,
6208
                      int *byteorder)
6209
75
{
6210
75
    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
6211
75
}
6212
6213
PyObject *
6214
PyUnicode_DecodeUTF16Stateful(const char *s,
6215
                              Py_ssize_t size,
6216
                              const char *errors,
6217
                              int *byteorder,
6218
                              Py_ssize_t *consumed)
6219
767
{
6220
767
    const char *starts = s;
6221
767
    Py_ssize_t startinpos;
6222
767
    Py_ssize_t endinpos;
6223
767
    _PyUnicodeWriter writer;
6224
767
    const unsigned char *q, *e;
6225
767
    int bo = 0;       /* assume native ordering by default */
6226
767
    int native_ordering;
6227
767
    const char *errmsg = "";
6228
767
    PyObject *errorHandler = NULL;
6229
767
    PyObject *exc = NULL;
6230
767
    const char *encoding;
6231
6232
767
    q = (const unsigned char *)s;
6233
767
    e = q + size;
6234
6235
767
    if (byteorder)
6236
692
        bo = *byteorder;
6237
6238
    /* Check for BOM marks (U+FEFF) in the input and adjust current
6239
       byte order setting accordingly. In native mode, the leading BOM
6240
       mark is skipped, in all other modes, it is copied to the output
6241
       stream as-is (giving a ZWNBSP character). */
6242
767
    if (bo == 0 && size >= 2) {
6243
175
        const Py_UCS4 bom = (q[1] << 8) | q[0];
6244
175
        if (bom == 0xFEFF) {
6245
30
            q += 2;
6246
30
            bo = -1;
6247
30
        }
6248
145
        else if (bom == 0xFFFE) {
6249
14
            q += 2;
6250
14
            bo = 1;
6251
14
        }
6252
175
        if (byteorder)
6253
100
            *byteorder = bo;
6254
175
    }
6255
6256
767
    if (q == e) {
6257
2
        if (consumed)
6258
0
            *consumed = size;
6259
2
        _Py_RETURN_UNICODE_EMPTY();
6260
2
    }
6261
6262
765
#if PY_LITTLE_ENDIAN
6263
765
    native_ordering = bo <= 0;
6264
765
    encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
6265
#else
6266
    native_ordering = bo >= 0;
6267
    encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
6268
#endif
6269
6270
    /* Note: size will always be longer than the resulting Unicode
6271
       character count normally.  Error handler will take care of
6272
       resizing when needed. */
6273
765
    _PyUnicodeWriter_Init(&writer);
6274
765
    writer.min_length = (e - q + 1) / 2;
6275
765
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
6276
0
        goto onError;
6277
6278
179k
    while (1) {
6279
179k
        Py_UCS4 ch = 0;
6280
179k
        if (e - q >= 2) {
6281
179k
            int kind = writer.kind;
6282
179k
            if (kind == PyUnicode_1BYTE_KIND) {
6283
967
                if (PyUnicode_IS_ASCII(writer.buffer))
6284
763
                    ch = asciilib_utf16_decode(&q, e,
6285
763
                            (Py_UCS1*)writer.data, &writer.pos,
6286
763
                            native_ordering);
6287
204
                else
6288
204
                    ch = ucs1lib_utf16_decode(&q, e,
6289
204
                            (Py_UCS1*)writer.data, &writer.pos,
6290
204
                            native_ordering);
6291
178k
            } else if (kind == PyUnicode_2BYTE_KIND) {
6292
35.2k
                ch = ucs2lib_utf16_decode(&q, e,
6293
35.2k
                        (Py_UCS2*)writer.data, &writer.pos,
6294
35.2k
                        native_ordering);
6295
143k
            } else {
6296
143k
                assert(kind == PyUnicode_4BYTE_KIND);
6297
143k
                ch = ucs4lib_utf16_decode(&q, e,
6298
143k
                        (Py_UCS4*)writer.data, &writer.pos,
6299
143k
                        native_ordering);
6300
143k
            }
6301
179k
        }
6302
6303
179k
        switch (ch)
6304
179k
        {
6305
754
        case 0:
6306
            /* remaining byte at the end? (size should be even) */
6307
754
            if (q == e || consumed)
6308
506
                goto End;
6309
248
            errmsg = "truncated data";
6310
248
            startinpos = ((const char *)q) - starts;
6311
248
            endinpos = ((const char *)e) - starts;
6312
248
            break;
6313
            /* The remaining input chars are ignored if the callback
6314
               chooses to skip the input */
6315
129
        case 1:
6316
129
            q -= 2;
6317
129
            if (consumed)
6318
0
                goto End;
6319
129
            errmsg = "unexpected end of data";
6320
129
            startinpos = ((const char *)q) - starts;
6321
129
            endinpos = ((const char *)e) - starts;
6322
129
            break;
6323
126k
        case 2:
6324
126k
            errmsg = "illegal encoding";
6325
126k
            startinpos = ((const char *)q) - 2 - starts;
6326
126k
            endinpos = startinpos + 2;
6327
126k
            break;
6328
51.7k
        case 3:
6329
51.7k
            errmsg = "illegal UTF-16 surrogate";
6330
51.7k
            startinpos = ((const char *)q) - 4 - starts;
6331
51.7k
            endinpos = startinpos + 2;
6332
51.7k
            break;
6333
831
        default:
6334
831
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
6335
0
                goto onError;
6336
831
            continue;
6337
179k
        }
6338
6339
178k
        if (unicode_decode_call_errorhandler_writer(
6340
178k
                errors,
6341
178k
                &errorHandler,
6342
178k
                encoding, errmsg,
6343
178k
                &starts,
6344
178k
                (const char **)&e,
6345
178k
                &startinpos,
6346
178k
                &endinpos,
6347
178k
                &exc,
6348
178k
                (const char **)&q,
6349
178k
                &writer))
6350
259
            goto onError;
6351
178k
    }
6352
6353
506
End:
6354
506
    if (consumed)
6355
0
        *consumed = (const char *)q-starts;
6356
6357
506
    Py_XDECREF(errorHandler);
6358
506
    Py_XDECREF(exc);
6359
506
    return _PyUnicodeWriter_Finish(&writer);
6360
6361
259
  onError:
6362
259
    _PyUnicodeWriter_Dealloc(&writer);
6363
259
    Py_XDECREF(errorHandler);
6364
259
    Py_XDECREF(exc);
6365
259
    return NULL;
6366
765
}
6367
6368
PyObject *
_PyUnicode_EncodeUTF16(PyObject *str,
                       const char *errors,
                       int byteorder)
{
    /* Encode the string `str` as UTF-16 and return a bytes object.

       byteorder: <0 selects "utf-16-le", >0 selects "utf-16-be"; 0 writes
           a BOM followed by data in native byte order.
       errors:    error handler name used for code points the low-level
           encoder stops on (reported as "surrogates not allowed").

       Returns NULL with an exception set if `str` is not a str, the output
       would be too large, or the error handler fails. */
    if (!PyUnicode_Check(str)) {
        PyErr_BadArgument();
        return NULL;
    }

    const int kind = PyUnicode_KIND(str);
    const void *data = PyUnicode_DATA(str);
    const Py_ssize_t len = PyUnicode_GET_LENGTH(str);
    const int with_bom = (byteorder == 0);

    /* Each code point >= 0x10000 takes one extra 16-bit unit; such code
       points can only occur in 4-byte strings. */
    Py_ssize_t pairs = 0;
    if (kind == PyUnicode_4BYTE_KIND) {
        const Py_UCS4 *ucs4 = (const Py_UCS4 *)data;
        for (Py_ssize_t i = 0; i < len; i++) {
            if (ucs4[i] >= 0x10000) {
                pairs++;
            }
        }
    }

    /* Guard the `nsize * 2` multiplications below against overflow. */
    if (len > PY_SSIZE_T_MAX / 2 - pairs - with_bom) {
        return PyErr_NoMemory();
    }
    const Py_ssize_t nsize = len + pairs + with_bom;

#if PY_BIG_ENDIAN
    const int native_ordering = (byteorder >= 0);
#else
    const int native_ordering = (byteorder <= 0);
#endif

    if (kind == PyUnicode_1BYTE_KIND) {
        // gh-139156: Don't use PyBytesWriter API here since it has an overhead
        // on short strings
        PyObject *bytes = PyBytes_FromStringAndSize(NULL, nsize * 2);
        if (bytes == NULL) {
            return NULL;
        }

        /* output buffer is 2-bytes aligned */
        assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(bytes), 2));
        unsigned short *dst = (unsigned short *)PyBytes_AS_STRING(bytes);
        if (with_bom) {
            *dst++ = 0xFEFF;
        }
        if (len > 0) {
            ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &dst, native_ordering);
        }
        return bytes;
    }

    PyBytesWriter *writer = PyBytesWriter_Create(nsize * 2);
    if (writer == NULL) {
        return NULL;
    }

    /* output buffer is 2-bytes aligned */
    assert(_Py_IS_ALIGNED(PyBytesWriter_GetData(writer), 2));
    unsigned short *out = PyBytesWriter_GetData(writer);
    if (with_bom) {
        *out++ = 0xFEFF;
    }
    if (len == 0) {
        return PyBytesWriter_Finish(writer);
    }

    /* Codec name reported in encode exceptions. */
    const char *encoding = (byteorder < 0) ? "utf-16-le"
                         : (byteorder > 0) ? "utf-16-be"
                         :                   "utf-16";

    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *rep = NULL;

    Py_ssize_t pos = 0;
    while (pos < len) {
        /* Encode as far as possible; the helper returns how many input
           characters it handled before stopping. */
        if (kind == PyUnicode_2BYTE_KIND) {
            pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        else {
            assert(kind == PyUnicode_4BYTE_KIND);
            pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        if (pos == len) {
            break;
        }

        /* The character at `pos` was rejected: ask the error handler for a
           replacement and for the position to resume from. */
        Py_ssize_t newpos;
        rep = unicode_encode_call_errorhandler(
                errors, &errorHandler,
                encoding, "surrogates not allowed",
                str, &exc, pos, pos + 1, &newpos);
        if (rep == NULL) {
            goto error;
        }

        const int rep_is_bytes = PyBytes_Check(rep);
        Py_ssize_t repsize, moreunits;
        int rep_rejected;
        if (rep_is_bytes) {
            /* bytes replacements are copied verbatim, so they must be a
               whole number of 2-byte units */
            repsize = PyBytes_GET_SIZE(rep);
            rep_rejected = (repsize & 1) != 0;
            moreunits = repsize / 2;
        }
        else {
            /* str replacements are re-encoded and must be pure ASCII */
            assert(PyUnicode_Check(rep));
            repsize = PyUnicode_GET_LENGTH(rep);
            moreunits = repsize;
            rep_rejected = !PyUnicode_IS_ASCII(rep);
        }
        if (rep_rejected) {
            raise_encode_exception(&exc, encoding,
                                   str, pos, pos + 1,
                                   "surrogates not allowed");
            goto error;
        }
        moreunits += pos - newpos;
        pos = newpos;

        /* two bytes are reserved for each surrogate */
        if (moreunits > 0) {
            out = PyBytesWriter_GrowAndUpdatePointer(writer, 2 * moreunits, out);
            if (out == NULL) {
                goto error;
            }
        }

        if (rep_is_bytes) {
            memcpy(out, PyBytes_AS_STRING(rep), repsize);
            out += repsize / 2;
        }
        else {
            /* rep is unicode */
            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
            ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
                                 &out, native_ordering);
        }

        Py_CLEAR(rep);
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);

    /* Trim the result to what was actually written.  The output can be
       shorter than reserved, for example when isolated surrogates are
       dropped by the 'ignore' handler. */
    return PyBytesWriter_FinishWithPointer(writer, out);

  error:
    Py_XDECREF(rep);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    PyBytesWriter_Discard(writer);
    return NULL;
}
6533
6534
PyObject *
PyUnicode_AsUTF16String(PyObject *unicode)
{
    /* Encode `unicode` as UTF-16 with no explicit error handler (NULL) and
       byteorder 0, i.e. BOM-prefixed output in native byte order.
       Returns whatever _PyUnicode_EncodeUTF16() returns (NULL on error). */
    PyObject *encoded = _PyUnicode_EncodeUTF16(unicode, NULL, 0);
    return encoded;
}
6539
6540
_PyUnicode_Name_CAPI *
6541
_PyUnicode_GetNameCAPI(void)
6542
5.18k
{
6543
5.18k
    PyInterpreterState *interp = _PyInterpreterState_GET();
6544
5.18k
    _PyUnicode_Name_CAPI *ucnhash_capi;
6545
6546
5.18k
    ucnhash_capi = _Py_atomic_load_ptr(&interp->unicode.ucnhash_capi);
6547
5.18k
    if (ucnhash_capi == NULL) {
6548
2
        ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6549
2
                PyUnicodeData_CAPSULE_NAME, 1);
6550
6551
        // It's fine if we overwrite the value here. It's always the same value.
6552
2
        _Py_atomic_store_ptr(&interp->unicode.ucnhash_capi, ucnhash_capi);
6553
2
    }
6554
5.18k
    return ucnhash_capi;
6555
5.18k
}
6556
6557
/* --- Unicode Escape Codec ----------------------------------------------- */
6558
6559
PyObject *
6560
_PyUnicode_DecodeUnicodeEscapeInternal2(const char *s,
6561
                               Py_ssize_t size,
6562
                               const char *errors,
6563
                               Py_ssize_t *consumed,
6564
                               int *first_invalid_escape_char,
6565
                               const char **first_invalid_escape_ptr)
6566
34.1k
{
6567
34.1k
    const char *starts = s;
6568
34.1k
    const char *initial_starts = starts;
6569
34.1k
    _PyUnicodeWriter writer;
6570
34.1k
    const char *end;
6571
34.1k
    PyObject *errorHandler = NULL;
6572
34.1k
    PyObject *exc = NULL;
6573
34.1k
    _PyUnicode_Name_CAPI *ucnhash_capi;
6574
6575
    // so we can remember if we've seen an invalid escape char or not
6576
34.1k
    *first_invalid_escape_char = -1;
6577
34.1k
    *first_invalid_escape_ptr = NULL;
6578
6579
34.1k
    if (size == 0) {
6580
3.95k
        if (consumed) {
6581
0
            *consumed = 0;
6582
0
        }
6583
3.95k
        _Py_RETURN_UNICODE_EMPTY();
6584
3.95k
    }
6585
    /* Escaped strings will always be longer than the resulting
6586
       Unicode string, so we start with size here and then reduce the
6587
       length after conversion to the true value.
6588
       (but if the error callback returns a long replacement string
6589
       we'll have to allocate more space) */
6590
30.1k
    _PyUnicodeWriter_Init(&writer);
6591
30.1k
    writer.min_length = size;
6592
30.1k
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6593
0
        goto onError;
6594
0
    }
6595
6596
30.1k
    end = s + size;
6597
8.11M
    while (s < end) {
6598
8.08M
        unsigned char c = (unsigned char) *s++;
6599
8.08M
        Py_UCS4 ch;
6600
8.08M
        int count;
6601
8.08M
        const char *message;
6602
6603
8.08M
#define WRITE_ASCII_CHAR(ch)                                                  \
6604
8.08M
            do {                                                              \
6605
1.79M
                assert(ch <= 127);                                            \
6606
1.79M
                assert(writer.pos < writer.size);                             \
6607
1.79M
                PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch);  \
6608
1.79M
            } while(0)
6609
6610
8.08M
#define WRITE_CHAR(ch)                                                        \
6611
8.08M
            do {                                                              \
6612
6.54M
                if (ch <= writer.maxchar) {                                   \
6613
6.53M
                    assert(writer.pos < writer.size);                         \
6614
6.53M
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6615
6.53M
                }                                                             \
6616
6.54M
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6617
0
                    goto onError;                                             \
6618
0
                }                                                             \
6619
6.54M
            } while(0)
6620
6621
        /* Non-escape characters are interpreted as Unicode ordinals */
6622
8.08M
        if (c != '\\') {
6623
5.69M
            WRITE_CHAR(c);
6624
5.69M
            continue;
6625
5.69M
        }
6626
6627
2.38M
        Py_ssize_t startinpos = s - starts - 1;
6628
        /* \ - Escapes */
6629
2.38M
        if (s >= end) {
6630
0
            message = "\\ at end of string";
6631
0
            goto incomplete;
6632
0
        }
6633
2.38M
        c = (unsigned char) *s++;
6634
6635
2.38M
        assert(writer.pos < writer.size);
6636
2.38M
        switch (c) {
6637
6638
            /* \x escapes */
6639
2.21k
        case '\n': continue;
6640
184k
        case '\\': WRITE_ASCII_CHAR('\\'); continue;
6641
184k
        case '\'': WRITE_ASCII_CHAR('\''); continue;
6642
190k
        case '\"': WRITE_ASCII_CHAR('\"'); continue;
6643
190k
        case 'b': WRITE_ASCII_CHAR('\b'); continue;
6644
        /* FF */
6645
274k
        case 'f': WRITE_ASCII_CHAR('\014'); continue;
6646
274k
        case 't': WRITE_ASCII_CHAR('\t'); continue;
6647
453k
        case 'n': WRITE_ASCII_CHAR('\n'); continue;
6648
453k
        case 'r': WRITE_ASCII_CHAR('\r'); continue;
6649
        /* VT */
6650
261k
        case 'v': WRITE_ASCII_CHAR('\013'); continue;
6651
        /* BEL, not classic C */
6652
22.3k
        case 'a': WRITE_ASCII_CHAR('\007'); continue;
6653
6654
            /* \OOO (octal) escapes */
6655
52.8k
        case '0': case '1': case '2': case '3':
6656
65.1k
        case '4': case '5': case '6': case '7':
6657
65.1k
            ch = c - '0';
6658
65.1k
            if (s < end && '0' <= *s && *s <= '7') {
6659
6.49k
                ch = (ch<<3) + *s++ - '0';
6660
6.49k
                if (s < end && '0' <= *s && *s <= '7') {
6661
5.23k
                    ch = (ch<<3) + *s++ - '0';
6662
5.23k
                }
6663
6.49k
            }
6664
65.1k
            if (ch > 0377) {
6665
1.39k
                if (*first_invalid_escape_char == -1) {
6666
520
                    *first_invalid_escape_char = ch;
6667
520
                    if (starts == initial_starts) {
6668
                        /* Back up 3 chars, since we've already incremented s. */
6669
520
                        *first_invalid_escape_ptr = s - 3;
6670
520
                    }
6671
520
                }
6672
1.39k
            }
6673
65.1k
            WRITE_CHAR(ch);
6674
65.1k
            continue;
6675
6676
            /* hex escapes */
6677
            /* \xXX */
6678
65.1k
        case 'x':
6679
337
            count = 2;
6680
337
            message = "truncated \\xXX escape";
6681
337
            goto hexescape;
6682
6683
            /* \uXXXX */
6684
5.18k
        case 'u':
6685
5.18k
            count = 4;
6686
5.18k
            message = "truncated \\uXXXX escape";
6687
5.18k
            goto hexescape;
6688
6689
            /* \UXXXXXXXX */
6690
509k
        case 'U':
6691
509k
            count = 8;
6692
509k
            message = "truncated \\UXXXXXXXX escape";
6693
515k
        hexescape:
6694
4.61M
            for (ch = 0; count; ++s, --count) {
6695
4.09M
                if (s >= end) {
6696
5
                    goto incomplete;
6697
5
                }
6698
4.09M
                c = (unsigned char)*s;
6699
4.09M
                ch <<= 4;
6700
4.09M
                if (c >= '0' && c <= '9') {
6701
3.13M
                    ch += c - '0';
6702
3.13M
                }
6703
962k
                else if (c >= 'a' && c <= 'f') {
6704
962k
                    ch += c - ('a' - 10);
6705
962k
                }
6706
239
                else if (c >= 'A' && c <= 'F') {
6707
233
                    ch += c - ('A' - 10);
6708
233
                }
6709
6
                else {
6710
6
                    goto error;
6711
6
                }
6712
4.09M
            }
6713
6714
            /* when we get here, ch is a 32-bit unicode character */
6715
515k
            if (ch > MAX_UNICODE) {
6716
0
                message = "illegal Unicode character";
6717
0
                goto error;
6718
0
            }
6719
6720
515k
            WRITE_CHAR(ch);
6721
515k
            continue;
6722
6723
            /* \N{name} */
6724
515k
        case 'N':
6725
5.18k
            ucnhash_capi = _PyUnicode_GetNameCAPI();
6726
5.18k
            if (ucnhash_capi == NULL) {
6727
0
                PyErr_SetString(
6728
0
                        PyExc_UnicodeError,
6729
0
                        "\\N escapes not supported (can't load unicodedata module)"
6730
0
                );
6731
0
                goto onError;
6732
0
            }
6733
6734
5.18k
            message = "malformed \\N character escape";
6735
5.18k
            if (s >= end) {
6736
4
                goto incomplete;
6737
4
            }
6738
5.18k
            if (*s == '{') {
6739
5.17k
                const char *start = ++s;
6740
5.17k
                size_t namelen;
6741
                /* look for the closing brace */
6742
6.44M
                while (s < end && *s != '}')
6743
6.44M
                    s++;
6744
5.17k
                if (s >= end) {
6745
9
                    goto incomplete;
6746
9
                }
6747
5.17k
                namelen = s - start;
6748
5.17k
                if (namelen) {
6749
                    /* found a name.  look it up in the unicode database */
6750
5.16k
                    s++;
6751
5.16k
                    ch = 0xffffffff; /* in case 'getcode' messes up */
6752
5.16k
                    if (namelen <= INT_MAX &&
6753
5.16k
                        ucnhash_capi->getcode(start, (int)namelen,
6754
5.16k
                                              &ch, 0)) {
6755
5.07k
                        assert(ch <= MAX_UNICODE);
6756
5.07k
                        WRITE_CHAR(ch);
6757
5.07k
                        continue;
6758
5.07k
                    }
6759
94
                    message = "unknown Unicode character name";
6760
94
                }
6761
5.17k
            }
6762
97
            goto error;
6763
6764
262k
        default:
6765
262k
            if (*first_invalid_escape_char == -1) {
6766
9.52k
                *first_invalid_escape_char = c;
6767
9.52k
                if (starts == initial_starts) {
6768
                    /* Back up one char, since we've already incremented s. */
6769
9.52k
                    *first_invalid_escape_ptr = s - 1;
6770
9.52k
                }
6771
9.52k
            }
6772
262k
            WRITE_ASCII_CHAR('\\');
6773
262k
            WRITE_CHAR(c);
6774
262k
            continue;
6775
2.38M
        }
6776
6777
18
      incomplete:
6778
18
        if (consumed) {
6779
0
            *consumed = startinpos;
6780
0
            break;
6781
0
        }
6782
121
      error:;
6783
121
        Py_ssize_t endinpos = s-starts;
6784
121
        writer.min_length = end - s + writer.pos;
6785
121
        if (unicode_decode_call_errorhandler_writer(
6786
121
                errors, &errorHandler,
6787
121
                "unicodeescape", message,
6788
121
                &starts, &end, &startinpos, &endinpos, &exc, &s,
6789
121
                &writer)) {
6790
121
            goto onError;
6791
121
        }
6792
121
        assert(end - s <= writer.size - writer.pos);
6793
6794
0
#undef WRITE_ASCII_CHAR
6795
0
#undef WRITE_CHAR
6796
0
    }
6797
6798
30.0k
    Py_XDECREF(errorHandler);
6799
30.0k
    Py_XDECREF(exc);
6800
30.0k
    return _PyUnicodeWriter_Finish(&writer);
6801
6802
121
  onError:
6803
121
    _PyUnicodeWriter_Dealloc(&writer);
6804
121
    Py_XDECREF(errorHandler);
6805
121
    Py_XDECREF(exc);
6806
121
    return NULL;
6807
30.1k
}
6808
6809
PyObject *
6810
_PyUnicode_DecodeUnicodeEscapeStateful(const char *s,
6811
                              Py_ssize_t size,
6812
                              const char *errors,
6813
                              Py_ssize_t *consumed)
6814
0
{
6815
0
    int first_invalid_escape_char;
6816
0
    const char *first_invalid_escape_ptr;
6817
0
    PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal2(s, size, errors,
6818
0
                                                      consumed,
6819
0
                                                      &first_invalid_escape_char,
6820
0
                                                      &first_invalid_escape_ptr);
6821
0
    if (result == NULL)
6822
0
        return NULL;
6823
0
    if (first_invalid_escape_char != -1) {
6824
0
        if (first_invalid_escape_char > 0xff) {
6825
0
            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6826
0
                                 "\"\\%o\" is an invalid octal escape sequence. "
6827
0
                                 "Such sequences will not work in the future. ",
6828
0
                                 first_invalid_escape_char) < 0)
6829
0
            {
6830
0
                Py_DECREF(result);
6831
0
                return NULL;
6832
0
            }
6833
0
        }
6834
0
        else {
6835
0
            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6836
0
                                 "\"\\%c\" is an invalid escape sequence. "
6837
0
                                 "Such sequences will not work in the future. ",
6838
0
                                 first_invalid_escape_char) < 0)
6839
0
            {
6840
0
                Py_DECREF(result);
6841
0
                return NULL;
6842
0
            }
6843
0
        }
6844
0
    }
6845
0
    return result;
6846
0
}
6847
6848
PyObject *
6849
PyUnicode_DecodeUnicodeEscape(const char *s,
6850
                              Py_ssize_t size,
6851
                              const char *errors)
6852
0
{
6853
0
    return _PyUnicode_DecodeUnicodeEscapeStateful(s, size, errors, NULL);
6854
0
}
6855
6856
/* Return a Unicode-Escape string version of the Unicode object. */
6857
6858
PyObject *
6859
PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
6860
0
{
6861
0
    if (!PyUnicode_Check(unicode)) {
6862
0
        PyErr_BadArgument();
6863
0
        return NULL;
6864
0
    }
6865
6866
0
    Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
6867
0
    if (len == 0) {
6868
0
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
6869
0
    }
6870
0
    int kind = PyUnicode_KIND(unicode);
6871
0
    const void *data = PyUnicode_DATA(unicode);
6872
6873
    /* Initial allocation is based on the longest-possible character
6874
     * escape.
6875
     *
6876
     * For UCS1 strings it's '\xxx', 4 bytes per source character.
6877
     * For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6878
     * For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character. */
6879
0
    Py_ssize_t expandsize = kind * 2 + 2;
6880
0
    if (len > PY_SSIZE_T_MAX / expandsize) {
6881
0
        return PyErr_NoMemory();
6882
0
    }
6883
6884
0
    PyBytesWriter *writer = PyBytesWriter_Create(expandsize * len);
6885
0
    if (writer == NULL) {
6886
0
        return NULL;
6887
0
    }
6888
0
    char *p = PyBytesWriter_GetData(writer);
6889
6890
0
    for (Py_ssize_t i = 0; i < len; i++) {
6891
0
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6892
6893
        /* U+0000-U+00ff range */
6894
0
        if (ch < 0x100) {
6895
0
            if (ch >= ' ' && ch < 127) {
6896
0
                if (ch != '\\') {
6897
                    /* Copy printable US ASCII as-is */
6898
0
                    *p++ = (char) ch;
6899
0
                }
6900
                /* Escape backslashes */
6901
0
                else {
6902
0
                    *p++ = '\\';
6903
0
                    *p++ = '\\';
6904
0
                }
6905
0
            }
6906
6907
            /* Map special whitespace to '\t', \n', '\r' */
6908
0
            else if (ch == '\t') {
6909
0
                *p++ = '\\';
6910
0
                *p++ = 't';
6911
0
            }
6912
0
            else if (ch == '\n') {
6913
0
                *p++ = '\\';
6914
0
                *p++ = 'n';
6915
0
            }
6916
0
            else if (ch == '\r') {
6917
0
                *p++ = '\\';
6918
0
                *p++ = 'r';
6919
0
            }
6920
6921
            /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6922
0
            else {
6923
0
                *p++ = '\\';
6924
0
                *p++ = 'x';
6925
0
                *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6926
0
                *p++ = Py_hexdigits[ch & 0x000F];
6927
0
            }
6928
0
        }
6929
        /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
6930
0
        else if (ch < 0x10000) {
6931
0
            *p++ = '\\';
6932
0
            *p++ = 'u';
6933
0
            *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6934
0
            *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6935
0
            *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6936
0
            *p++ = Py_hexdigits[ch & 0x000F];
6937
0
        }
6938
        /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6939
0
        else {
6940
6941
            /* Make sure that the first two digits are zero */
6942
0
            assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6943
0
            *p++ = '\\';
6944
0
            *p++ = 'U';
6945
0
            *p++ = '0';
6946
0
            *p++ = '0';
6947
0
            *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6948
0
            *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6949
0
            *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6950
0
            *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6951
0
            *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6952
0
            *p++ = Py_hexdigits[ch & 0x0000000F];
6953
0
        }
6954
0
    }
6955
6956
0
    return PyBytesWriter_FinishWithPointer(writer, p);
6957
0
}
6958
6959
/* --- Raw Unicode Escape Codec ------------------------------------------- */
6960
6961
PyObject *
6962
_PyUnicode_DecodeRawUnicodeEscapeStateful(const char *s,
6963
                                          Py_ssize_t size,
6964
                                          const char *errors,
6965
                                          Py_ssize_t *consumed)
6966
0
{
6967
0
    const char *starts = s;
6968
0
    _PyUnicodeWriter writer;
6969
0
    const char *end;
6970
0
    PyObject *errorHandler = NULL;
6971
0
    PyObject *exc = NULL;
6972
6973
0
    if (size == 0) {
6974
0
        if (consumed) {
6975
0
            *consumed = 0;
6976
0
        }
6977
0
        _Py_RETURN_UNICODE_EMPTY();
6978
0
    }
6979
6980
    /* Escaped strings will always be longer than the resulting
6981
       Unicode string, so we start with size here and then reduce the
6982
       length after conversion to the true value. (But decoding error
6983
       handler might have to resize the string) */
6984
0
    _PyUnicodeWriter_Init(&writer);
6985
0
    writer.min_length = size;
6986
0
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6987
0
        goto onError;
6988
0
    }
6989
6990
0
    end = s + size;
6991
0
    while (s < end) {
6992
0
        unsigned char c = (unsigned char) *s++;
6993
0
        Py_UCS4 ch;
6994
0
        int count;
6995
0
        const char *message;
6996
6997
0
#define WRITE_CHAR(ch)                                                        \
6998
0
            do {                                                              \
6999
0
                if (ch <= writer.maxchar) {                                   \
7000
0
                    assert(writer.pos < writer.size);                         \
7001
0
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
7002
0
                }                                                             \
7003
0
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
7004
0
                    goto onError;                                             \
7005
0
                }                                                             \
7006
0
            } while(0)
7007
7008
        /* Non-escape characters are interpreted as Unicode ordinals */
7009
0
        if (c != '\\' || (s >= end && !consumed)) {
7010
0
            WRITE_CHAR(c);
7011
0
            continue;
7012
0
        }
7013
7014
0
        Py_ssize_t startinpos = s - starts - 1;
7015
        /* \ - Escapes */
7016
0
        if (s >= end) {
7017
0
            assert(consumed);
7018
            // Set message to silent compiler warning.
7019
            // Actually it is never used.
7020
0
            message = "\\ at end of string";
7021
0
            goto incomplete;
7022
0
        }
7023
7024
0
        c = (unsigned char) *s++;
7025
0
        if (c == 'u') {
7026
0
            count = 4;
7027
0
            message = "truncated \\uXXXX escape";
7028
0
        }
7029
0
        else if (c == 'U') {
7030
0
            count = 8;
7031
0
            message = "truncated \\UXXXXXXXX escape";
7032
0
        }
7033
0
        else {
7034
0
            assert(writer.pos < writer.size);
7035
0
            PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
7036
0
            WRITE_CHAR(c);
7037
0
            continue;
7038
0
        }
7039
7040
        /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
7041
0
        for (ch = 0; count; ++s, --count) {
7042
0
            if (s >= end) {
7043
0
                goto incomplete;
7044
0
            }
7045
0
            c = (unsigned char)*s;
7046
0
            ch <<= 4;
7047
0
            if (c >= '0' && c <= '9') {
7048
0
                ch += c - '0';
7049
0
            }
7050
0
            else if (c >= 'a' && c <= 'f') {
7051
0
                ch += c - ('a' - 10);
7052
0
            }
7053
0
            else if (c >= 'A' && c <= 'F') {
7054
0
                ch += c - ('A' - 10);
7055
0
            }
7056
0
            else {
7057
0
                goto error;
7058
0
            }
7059
0
        }
7060
0
        if (ch > MAX_UNICODE) {
7061
0
            message = "\\Uxxxxxxxx out of range";
7062
0
            goto error;
7063
0
        }
7064
0
        WRITE_CHAR(ch);
7065
0
        continue;
7066
7067
0
      incomplete:
7068
0
        if (consumed) {
7069
0
            *consumed = startinpos;
7070
0
            break;
7071
0
        }
7072
0
      error:;
7073
0
        Py_ssize_t endinpos = s-starts;
7074
0
        writer.min_length = end - s + writer.pos;
7075
0
        if (unicode_decode_call_errorhandler_writer(
7076
0
                errors, &errorHandler,
7077
0
                "rawunicodeescape", message,
7078
0
                &starts, &end, &startinpos, &endinpos, &exc, &s,
7079
0
                &writer)) {
7080
0
            goto onError;
7081
0
        }
7082
0
        assert(end - s <= writer.size - writer.pos);
7083
7084
0
#undef WRITE_CHAR
7085
0
    }
7086
0
    Py_XDECREF(errorHandler);
7087
0
    Py_XDECREF(exc);
7088
0
    return _PyUnicodeWriter_Finish(&writer);
7089
7090
0
  onError:
7091
0
    _PyUnicodeWriter_Dealloc(&writer);
7092
0
    Py_XDECREF(errorHandler);
7093
0
    Py_XDECREF(exc);
7094
0
    return NULL;
7095
0
}
7096
7097
PyObject *
7098
PyUnicode_DecodeRawUnicodeEscape(const char *s,
7099
                                 Py_ssize_t size,
7100
                                 const char *errors)
7101
0
{
7102
0
    return _PyUnicode_DecodeRawUnicodeEscapeStateful(s, size, errors, NULL);
7103
0
}
7104
7105
7106
/* Return a Raw-Unicode-Escape encoded bytes version of the Unicode object.

   A non-str argument triggers PyErr_BadArgument() and a NULL return.
   Characters below U+0100 are copied as single bytes; everything else
   becomes '\uHHHH' or '\U00HHHHHH'. */
PyObject *
PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    const int kind = PyUnicode_KIND(unicode);
    const void *data = PyUnicode_DATA(unicode);
    const Py_ssize_t length = PyUnicode_GET_LENGTH(unicode);

    if (length == 0) {
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
    }
    if (kind == PyUnicode_1BYTE_KIND) {
        /* Every character is copied as-is, so the raw buffer already is
           the encoded form. */
        return PyBytes_FromStringAndSize(data, length);
    }

    /* Worst case per character: 6 bytes ('\uHHHH') for 2-byte strings and
       10 bytes ('\U00HHHHHH') for 4-byte strings, i.e. kind * 2 + 2. */
    const Py_ssize_t worst_case = kind * 2 + 2;
    if (length > PY_SSIZE_T_MAX / worst_case) {
        /* worst_case * length would overflow Py_ssize_t */
        return PyErr_NoMemory();
    }

    PyBytesWriter *writer = PyBytesWriter_Create(worst_case * length);
    if (writer == NULL) {
        return NULL;
    }
    char *out = PyBytesWriter_GetData(writer);

    for (Py_ssize_t index = 0; index < length; index++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, index);

        if (ch < 0x100) {
            /* U+0000-U+00ff range: copy 8-bit characters as-is */
            *out++ = (char) ch;
            continue;
        }

        *out++ = '\\';
        if (ch < 0x10000) {
            /* U+0100-U+ffff range: '\uHHHH' */
            *out++ = 'u';
            for (int shift = 12; shift >= 0; shift -= 4) {
                *out++ = Py_hexdigits[(ch >> shift) & 0xf];
            }
        }
        else {
            /* U+010000-U+10ffff range: '\U00HHHHHH' */
            assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
            *out++ = 'U';
            *out++ = '0';
            *out++ = '0';
            for (int shift = 20; shift >= 0; shift -= 4) {
                *out++ = Py_hexdigits[(ch >> shift) & 0xf];
            }
        }
    }

    return PyBytesWriter_FinishWithPointer(writer, out);
}
7170
7171
/* --- Latin-1 Codec ------------------------------------------------------ */
7172
7173
PyObject *
7174
PyUnicode_DecodeLatin1(const char *s,
7175
                       Py_ssize_t size,
7176
                       const char *errors)
7177
5.28k
{
7178
    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
7179
5.28k
    return _PyUnicode_FromUCS1((const unsigned char*)s, size);
7180
5.28k
}
7181
7182
/* create or adjust a UnicodeEncodeError */
7183
static void
7184
make_encode_exception(PyObject **exceptionObject,
7185
                      const char *encoding,
7186
                      PyObject *unicode,
7187
                      Py_ssize_t startpos, Py_ssize_t endpos,
7188
                      const char *reason)
7189
34.6k
{
7190
34.6k
    if (*exceptionObject == NULL) {
7191
34.6k
        *exceptionObject = PyObject_CallFunction(
7192
34.6k
            PyExc_UnicodeEncodeError, "sOnns",
7193
34.6k
            encoding, unicode, startpos, endpos, reason);
7194
34.6k
    }
7195
0
    else {
7196
0
        if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
7197
0
            goto onError;
7198
0
        if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
7199
0
            goto onError;
7200
0
        if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
7201
0
            goto onError;
7202
0
        return;
7203
0
      onError:
7204
0
        Py_CLEAR(*exceptionObject);
7205
0
    }
7206
34.6k
}
7207
7208
/* raises a UnicodeEncodeError */
7209
static void
7210
raise_encode_exception(PyObject **exceptionObject,
7211
                       const char *encoding,
7212
                       PyObject *unicode,
7213
                       Py_ssize_t startpos, Py_ssize_t endpos,
7214
                       const char *reason)
7215
34.5k
{
7216
34.5k
    make_encode_exception(exceptionObject,
7217
34.5k
                          encoding, unicode, startpos, endpos, reason);
7218
34.5k
    if (*exceptionObject != NULL)
7219
34.5k
        PyCodec_StrictErrors(*exceptionObject);
7220
34.5k
}
7221
7222
/* error handling callback helper:
7223
   build arguments, call the callback and check the arguments,
7224
   put the result into newpos and return the replacement string, which
7225
   has to be freed by the caller */
7226
static PyObject *
7227
unicode_encode_call_errorhandler(const char *errors,
7228
                                 PyObject **errorHandler,
7229
                                 const char *encoding, const char *reason,
7230
                                 PyObject *unicode, PyObject **exceptionObject,
7231
                                 Py_ssize_t startpos, Py_ssize_t endpos,
7232
                                 Py_ssize_t *newpos)
7233
36
{
7234
36
    static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
7235
36
    Py_ssize_t len;
7236
36
    PyObject *restuple;
7237
36
    PyObject *resunicode;
7238
7239
36
    if (*errorHandler == NULL) {
7240
36
        *errorHandler = PyCodec_LookupError(errors);
7241
36
        if (*errorHandler == NULL)
7242
0
            return NULL;
7243
36
    }
7244
7245
36
    len = PyUnicode_GET_LENGTH(unicode);
7246
7247
36
    make_encode_exception(exceptionObject,
7248
36
                          encoding, unicode, startpos, endpos, reason);
7249
36
    if (*exceptionObject == NULL)
7250
0
        return NULL;
7251
7252
36
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
7253
36
    if (restuple == NULL)
7254
36
        return NULL;
7255
0
    if (!PyTuple_Check(restuple)) {
7256
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
7257
0
        Py_DECREF(restuple);
7258
0
        return NULL;
7259
0
    }
7260
0
    if (!PyArg_ParseTuple(restuple, argparse,
7261
0
                          &resunicode, newpos)) {
7262
0
        Py_DECREF(restuple);
7263
0
        return NULL;
7264
0
    }
7265
0
    if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
7266
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
7267
0
        Py_DECREF(restuple);
7268
0
        return NULL;
7269
0
    }
7270
0
    if (*newpos<0)
7271
0
        *newpos = len + *newpos;
7272
0
    if (*newpos<0 || *newpos>len) {
7273
0
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7274
0
        Py_DECREF(restuple);
7275
0
        return NULL;
7276
0
    }
7277
0
    Py_INCREF(resunicode);
7278
0
    Py_DECREF(restuple);
7279
0
    return resunicode;
7280
0
}
7281
7282
static PyObject *
7283
unicode_encode_ucs1(PyObject *unicode,
7284
                    const char *errors,
7285
                    const Py_UCS4 limit)
7286
34.8k
{
7287
    /* input state */
7288
34.8k
    Py_ssize_t pos=0, size;
7289
34.8k
    int kind;
7290
34.8k
    const void *data;
7291
34.8k
    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
7292
34.8k
    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
7293
34.8k
    PyObject *error_handler_obj = NULL;
7294
34.8k
    PyObject *exc = NULL;
7295
34.8k
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
7296
34.8k
    PyObject *rep = NULL;
7297
7298
34.8k
    size = PyUnicode_GET_LENGTH(unicode);
7299
34.8k
    kind = PyUnicode_KIND(unicode);
7300
34.8k
    data = PyUnicode_DATA(unicode);
7301
    /* allocate enough for a simple encoding without
7302
       replacements, if we need more, we'll resize */
7303
34.8k
    if (size == 0)
7304
0
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
7305
7306
    /* output object */
7307
34.8k
    PyBytesWriter *writer = PyBytesWriter_Create(size);
7308
34.8k
    if (writer == NULL) {
7309
0
        return NULL;
7310
0
    }
7311
    /* pointer into the output */
7312
34.8k
    char *str = PyBytesWriter_GetData(writer);
7313
7314
3.40M
    while (pos < size) {
7315
3.40M
        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
7316
7317
        /* can we encode this? */
7318
3.40M
        if (ch < limit) {
7319
            /* no overflow check, because we know that the space is enough */
7320
3.35M
            *str++ = (char)ch;
7321
3.35M
            ++pos;
7322
3.35M
        }
7323
45.1k
        else {
7324
45.1k
            Py_ssize_t newpos, i;
7325
            /* startpos for collecting unencodable chars */
7326
45.1k
            Py_ssize_t collstart = pos;
7327
45.1k
            Py_ssize_t collend = collstart + 1;
7328
            /* find all unecodable characters */
7329
7330
564k
            while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
7331
519k
                ++collend;
7332
7333
            /* Only overallocate the buffer if it's not the last write */
7334
45.1k
            writer->overallocate = (collend < size);
7335
7336
            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
7337
45.1k
            if (error_handler == _Py_ERROR_UNKNOWN)
7338
34.8k
                error_handler = _Py_GetErrorHandler(errors);
7339
7340
45.1k
            switch (error_handler) {
7341
34.5k
            case _Py_ERROR_STRICT:
7342
34.5k
                raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
7343
34.5k
                goto onError;
7344
7345
0
            case _Py_ERROR_REPLACE:
7346
0
                memset(str, '?', collend - collstart);
7347
0
                str += (collend - collstart);
7348
0
                _Py_FALLTHROUGH;
7349
0
            case _Py_ERROR_IGNORE:
7350
0
                pos = collend;
7351
0
                break;
7352
7353
10.6k
            case _Py_ERROR_BACKSLASHREPLACE:
7354
                /* subtract preallocated bytes */
7355
10.6k
                writer->size -= (collend - collstart);
7356
10.6k
                str = backslashreplace(writer, str,
7357
10.6k
                                       unicode, collstart, collend);
7358
10.6k
                if (str == NULL)
7359
0
                    goto onError;
7360
10.6k
                pos = collend;
7361
10.6k
                break;
7362
7363
0
            case _Py_ERROR_XMLCHARREFREPLACE:
7364
                /* subtract preallocated bytes */
7365
0
                writer->size -= (collend - collstart);
7366
0
                str = xmlcharrefreplace(writer, str,
7367
0
                                        unicode, collstart, collend);
7368
0
                if (str == NULL)
7369
0
                    goto onError;
7370
0
                pos = collend;
7371
0
                break;
7372
7373
0
            case _Py_ERROR_SURROGATEESCAPE:
7374
0
                for (i = collstart; i < collend; ++i) {
7375
0
                    ch = PyUnicode_READ(kind, data, i);
7376
0
                    if (ch < 0xdc80 || 0xdcff < ch) {
7377
                        /* Not a UTF-8b surrogate */
7378
0
                        break;
7379
0
                    }
7380
0
                    *str++ = (char)(ch - 0xdc00);
7381
0
                    ++pos;
7382
0
                }
7383
0
                if (i >= collend)
7384
0
                    break;
7385
0
                collstart = pos;
7386
0
                assert(collstart != collend);
7387
0
                _Py_FALLTHROUGH;
7388
7389
0
            default:
7390
0
                rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
7391
0
                                                       encoding, reason, unicode, &exc,
7392
0
                                                       collstart, collend, &newpos);
7393
0
                if (rep == NULL)
7394
0
                    goto onError;
7395
7396
0
                if (newpos < collstart) {
7397
0
                    writer->overallocate = 1;
7398
0
                    str = PyBytesWriter_GrowAndUpdatePointer(writer,
7399
0
                                                             collstart - newpos,
7400
0
                                                             str);
7401
0
                    if (str == NULL) {
7402
0
                        goto onError;
7403
0
                    }
7404
0
                }
7405
0
                else {
7406
                    /* subtract preallocated bytes */
7407
0
                    writer->size -= newpos - collstart;
7408
                    /* Only overallocate the buffer if it's not the last write */
7409
0
                    writer->overallocate = (newpos < size);
7410
0
                }
7411
7412
0
                char *rep_str;
7413
0
                Py_ssize_t rep_len;
7414
0
                if (PyBytes_Check(rep)) {
7415
                    /* Directly copy bytes result to output. */
7416
0
                    rep_str = PyBytes_AS_STRING(rep);
7417
0
                    rep_len = PyBytes_GET_SIZE(rep);
7418
0
                }
7419
0
                else {
7420
0
                    assert(PyUnicode_Check(rep));
7421
7422
0
                    if (limit == 256 ?
7423
0
                        PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
7424
0
                        !PyUnicode_IS_ASCII(rep))
7425
0
                    {
7426
                        /* Not all characters are smaller than limit */
7427
0
                        raise_encode_exception(&exc, encoding, unicode,
7428
0
                                               collstart, collend, reason);
7429
0
                        goto onError;
7430
0
                    }
7431
0
                    assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
7432
0
                    rep_str = PyUnicode_DATA(rep);
7433
0
                    rep_len = PyUnicode_GET_LENGTH(rep);
7434
0
                }
7435
7436
0
                str = PyBytesWriter_GrowAndUpdatePointer(writer, rep_len, str);
7437
0
                if (str == NULL) {
7438
0
                    goto onError;
7439
0
                }
7440
0
                memcpy(str, rep_str, rep_len);
7441
0
                str += rep_len;
7442
7443
0
                pos = newpos;
7444
0
                Py_CLEAR(rep);
7445
45.1k
            }
7446
7447
            /* If overallocation was disabled, ensure that it was the last
7448
               write. Otherwise, we missed an optimization */
7449
45.1k
            assert(writer->overallocate || pos == size);
7450
10.6k
        }
7451
3.40M
    }
7452
7453
299
    Py_XDECREF(error_handler_obj);
7454
299
    Py_XDECREF(exc);
7455
299
    return PyBytesWriter_FinishWithPointer(writer, str);
7456
7457
34.5k
  onError:
7458
34.5k
    Py_XDECREF(rep);
7459
34.5k
    PyBytesWriter_Discard(writer);
7460
34.5k
    Py_XDECREF(error_handler_obj);
7461
34.5k
    Py_XDECREF(exc);
7462
34.5k
    return NULL;
7463
34.8k
}
7464
7465
/* Encode `unicode` as Latin-1 using the error handler named by `errors`.

   A non-str argument triggers PyErr_BadArgument() and a NULL return. */
PyObject *
_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND) {
        /* Non-Latin-1 characters present: the generic encoder applies
           the error handler or raises the exception. */
        return unicode_encode_ucs1(unicode, errors, 256);
    }

    /* Fast path: a one-byte string's buffer is already the Latin-1
       encoding, so build the bytes object from it directly. */
    return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
                                     PyUnicode_GET_LENGTH(unicode));
}
7481
7482
/* Encode `unicode` as Latin-1, passing NULL as the error handler name to
   _PyUnicode_AsLatin1String(). */
PyObject*
PyUnicode_AsLatin1String(PyObject *unicode)
{
    const char *default_errors = NULL;

    return _PyUnicode_AsLatin1String(unicode, default_errors);
}
7487
7488
/* --- 7-bit ASCII Codec -------------------------------------------------- */
7489
7490
PyObject *
7491
PyUnicode_DecodeASCII(const char *s,
7492
                      Py_ssize_t size,
7493
                      const char *errors)
7494
60.7k
{
7495
60.7k
    const char *starts = s;
7496
60.7k
    const char *e = s + size;
7497
60.7k
    PyObject *error_handler_obj = NULL;
7498
60.7k
    PyObject *exc = NULL;
7499
60.7k
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
7500
7501
60.7k
    if (size == 0)
7502
0
        _Py_RETURN_UNICODE_EMPTY();
7503
7504
    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
7505
60.7k
    if (size == 1 && (unsigned char)s[0] < 128) {
7506
3.34k
        return get_latin1_char((unsigned char)s[0]);
7507
3.34k
    }
7508
7509
    // Shortcut for simple case
7510
57.3k
    PyObject *u = PyUnicode_New(size, 127);
7511
57.3k
    if (u == NULL) {
7512
0
        return NULL;
7513
0
    }
7514
57.3k
    Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
7515
57.3k
    if (outpos == size) {
7516
57.2k
        return u;
7517
57.2k
    }
7518
7519
123
    _PyUnicodeWriter writer;
7520
123
    _PyUnicodeWriter_InitWithBuffer(&writer, u);
7521
123
    writer.pos = outpos;
7522
7523
123
    s += outpos;
7524
123
    int kind = writer.kind;
7525
123
    void *data = writer.data;
7526
123
    Py_ssize_t startinpos, endinpos;
7527
7528
1.27k
    while (s < e) {
7529
1.26k
        unsigned char c = (unsigned char)*s;
7530
1.26k
        if (c < 128) {
7531
0
            PyUnicode_WRITE(kind, data, writer.pos, c);
7532
0
            writer.pos++;
7533
0
            ++s;
7534
0
            continue;
7535
0
        }
7536
7537
        /* byte outsize range 0x00..0x7f: call the error handler */
7538
7539
1.26k
        if (error_handler == _Py_ERROR_UNKNOWN)
7540
123
            error_handler = _Py_GetErrorHandler(errors);
7541
7542
1.26k
        switch (error_handler)
7543
1.26k
        {
7544
1.15k
        case _Py_ERROR_REPLACE:
7545
1.15k
        case _Py_ERROR_SURROGATEESCAPE:
7546
            /* Fast-path: the error handler only writes one character,
7547
               but we may switch to UCS2 at the first write */
7548
1.15k
            if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7549
0
                goto onError;
7550
1.15k
            kind = writer.kind;
7551
1.15k
            data = writer.data;
7552
7553
1.15k
            if (error_handler == _Py_ERROR_REPLACE)
7554
1.15k
                PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7555
0
            else
7556
0
                PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7557
1.15k
            writer.pos++;
7558
1.15k
            ++s;
7559
1.15k
            break;
7560
7561
0
        case _Py_ERROR_IGNORE:
7562
0
            ++s;
7563
0
            break;
7564
7565
114
        default:
7566
114
            startinpos = s-starts;
7567
114
            endinpos = startinpos + 1;
7568
114
            if (unicode_decode_call_errorhandler_writer(
7569
114
                    errors, &error_handler_obj,
7570
114
                    "ascii", "ordinal not in range(128)",
7571
114
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
7572
114
                    &writer))
7573
114
                goto onError;
7574
0
            kind = writer.kind;
7575
0
            data = writer.data;
7576
1.26k
        }
7577
1.26k
    }
7578
9
    Py_XDECREF(error_handler_obj);
7579
9
    Py_XDECREF(exc);
7580
9
    return _PyUnicodeWriter_Finish(&writer);
7581
7582
114
  onError:
7583
114
    _PyUnicodeWriter_Dealloc(&writer);
7584
114
    Py_XDECREF(error_handler_obj);
7585
114
    Py_XDECREF(exc);
7586
114
    return NULL;
7587
123
}
7588
7589
/* Encode `unicode` as ASCII using the error handler named by `errors`.

   A non-str argument triggers PyErr_BadArgument() and a NULL return. */
PyObject *
_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (!PyUnicode_IS_ASCII(unicode)) {
        /* Non-ASCII characters present: the generic encoder applies the
           error handler or raises the exception. */
        return unicode_encode_ucs1(unicode, errors, 128);
    }

    /* Fast path: an ASCII-only string's buffer is already the encoded
       form, so build the bytes object from it directly. */
    return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
                                     PyUnicode_GET_LENGTH(unicode));
}
7603
7604
/* Encode `unicode` as ASCII, passing NULL as the error handler name to
   _PyUnicode_AsASCIIString(). */
PyObject *
PyUnicode_AsASCIIString(PyObject *unicode)
{
    const char *default_errors = NULL;

    return _PyUnicode_AsASCIIString(unicode, default_errors);
}
7609
7610
#ifdef MS_WINDOWS
7611
7612
/* --- MBCS codecs for Windows -------------------------------------------- */
7613
7614
#if SIZEOF_INT < SIZEOF_SIZE_T
7615
#define NEED_RETRY
7616
#endif
7617
7618
/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases and also avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows. */
7622
#define DECODING_CHUNK_SIZE (INT_MAX/4)
7623
7624
#ifndef WC_ERR_INVALID_CHARS
7625
#  define WC_ERR_INVALID_CHARS 0x0080
7626
#endif
7627
7628
static const char*
7629
code_page_name(UINT code_page, PyObject **obj)
7630
{
7631
    *obj = NULL;
7632
    if (code_page == CP_ACP)
7633
        return "mbcs";
7634
7635
    *obj = PyBytes_FromFormat("cp%u", code_page);
7636
    if (*obj == NULL)
7637
        return NULL;
7638
    return PyBytes_AS_STRING(*obj);
7639
}
7640
7641
static DWORD
7642
decode_code_page_flags(UINT code_page)
7643
{
7644
    if (code_page == CP_UTF7) {
7645
        /* The CP_UTF7 decoder only supports flags=0 */
7646
        return 0;
7647
    }
7648
    else
7649
        return MB_ERR_INVALID_CHARS;
7650
}
7651
7652
/*
7653
 * Decode a byte string from a Windows code page into unicode object in strict
7654
 * mode.
7655
 *
7656
 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7657
 * OSError and returns -1 on other error.
7658
 */
7659
static int
7660
decode_code_page_strict(UINT code_page,
7661
                        wchar_t **buf,
7662
                        Py_ssize_t *bufsize,
7663
                        const char *in,
7664
                        int insize)
7665
{
7666
    DWORD flags = MB_ERR_INVALID_CHARS;
7667
    wchar_t *out;
7668
    DWORD outsize;
7669
7670
    /* First get the size of the result */
7671
    assert(insize > 0);
7672
    while ((outsize = MultiByteToWideChar(code_page, flags,
7673
                                          in, insize, NULL, 0)) <= 0)
7674
    {
7675
        if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7676
            goto error;
7677
        }
7678
        /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7679
        flags = 0;
7680
    }
7681
7682
    /* Extend a wchar_t* buffer */
7683
    Py_ssize_t n = *bufsize;   /* Get the current length */
7684
    if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7685
        return -1;
7686
    }
7687
    out = *buf + n;
7688
7689
    /* Do the conversion */
7690
    outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7691
    if (outsize <= 0)
7692
        goto error;
7693
    return insize;
7694
7695
error:
7696
    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7697
        return -2;
7698
    PyErr_SetFromWindowsErr(0);
7699
    return -1;
7700
}
7701
7702
/*
7703
 * Decode a byte string from a code page into unicode object with an error
7704
 * handler.
7705
 *
7706
 * Returns consumed size if succeed, or raise an OSError or
7707
 * UnicodeDecodeError exception and returns -1 on error.
7708
 */
7709
static int
7710
decode_code_page_errors(UINT code_page,
7711
                        wchar_t **buf,
7712
                        Py_ssize_t *bufsize,
7713
                        const char *in, const int size,
7714
                        const char *errors, int final)
7715
{
7716
    const char *startin = in;
7717
    const char *endin = in + size;
7718
    DWORD flags = MB_ERR_INVALID_CHARS;
7719
    /* Ideally, we should get reason from FormatMessage. This is the Windows
7720
       2000 English version of the message. */
7721
    const char *reason = "No mapping for the Unicode character exists "
7722
                         "in the target code page.";
7723
    /* each step cannot decode more than 1 character, but a character can be
7724
       represented as a surrogate pair */
7725
    wchar_t buffer[2], *out;
7726
    int insize;
7727
    Py_ssize_t outsize;
7728
    PyObject *errorHandler = NULL;
7729
    PyObject *exc = NULL;
7730
    PyObject *encoding_obj = NULL;
7731
    const char *encoding;
7732
    DWORD err;
7733
    int ret = -1;
7734
7735
    assert(size > 0);
7736
7737
    encoding = code_page_name(code_page, &encoding_obj);
7738
    if (encoding == NULL)
7739
        return -1;
7740
7741
    if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
7742
        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7743
           UnicodeDecodeError. */
7744
        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7745
        if (exc != NULL) {
7746
            PyCodec_StrictErrors(exc);
7747
            Py_CLEAR(exc);
7748
        }
7749
        goto error;
7750
    }
7751
7752
    /* Extend a wchar_t* buffer */
7753
    Py_ssize_t n = *bufsize;   /* Get the current length */
7754
    if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7755
        PyErr_NoMemory();
7756
        goto error;
7757
    }
7758
    if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7759
        goto error;
7760
    }
7761
    out = *buf + n;
7762
7763
    /* Decode the byte string character per character */
7764
    while (in < endin)
7765
    {
7766
        /* Decode a character */
7767
        insize = 1;
7768
        do
7769
        {
7770
            outsize = MultiByteToWideChar(code_page, flags,
7771
                                          in, insize,
7772
                                          buffer, Py_ARRAY_LENGTH(buffer));
7773
            if (outsize > 0)
7774
                break;
7775
            err = GetLastError();
7776
            if (err == ERROR_INVALID_FLAGS && flags) {
7777
                /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7778
                flags = 0;
7779
                continue;
7780
            }
7781
            if (err != ERROR_NO_UNICODE_TRANSLATION
7782
                && err != ERROR_INSUFFICIENT_BUFFER)
7783
            {
7784
                PyErr_SetFromWindowsErr(err);
7785
                goto error;
7786
            }
7787
            insize++;
7788
        }
7789
        /* 4=maximum length of a UTF-8 sequence */
7790
        while (insize <= 4 && (in + insize) <= endin);
7791
7792
        if (outsize <= 0) {
7793
            Py_ssize_t startinpos, endinpos, outpos;
7794
7795
            /* last character in partial decode? */
7796
            if (in + insize >= endin && !final)
7797
                break;
7798
7799
            startinpos = in - startin;
7800
            endinpos = startinpos + 1;
7801
            outpos = out - *buf;
7802
            if (unicode_decode_call_errorhandler_wchar(
7803
                    errors, &errorHandler,
7804
                    encoding, reason,
7805
                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
7806
                    buf, bufsize, &outpos))
7807
            {
7808
                goto error;
7809
            }
7810
            out = *buf + outpos;
7811
        }
7812
        else {
7813
            in += insize;
7814
            memcpy(out, buffer, outsize * sizeof(wchar_t));
7815
            out += outsize;
7816
        }
7817
    }
7818
7819
    /* Shrink the buffer */
7820
    assert(out - *buf <= *bufsize);
7821
    *bufsize = out - *buf;
7822
    /* (in - startin) <= size and size is an int */
7823
    ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
7824
7825
error:
7826
    Py_XDECREF(encoding_obj);
7827
    Py_XDECREF(errorHandler);
7828
    Py_XDECREF(exc);
7829
    return ret;
7830
}
7831
7832
static PyObject *
7833
decode_code_page_stateful(int code_page,
7834
                          const char *s, Py_ssize_t size,
7835
                          const char *errors, Py_ssize_t *consumed)
7836
{
7837
    wchar_t *buf = NULL;
7838
    Py_ssize_t bufsize = 0;
7839
    int chunk_size, final, converted, done;
7840
7841
    if (code_page < 0) {
7842
        PyErr_SetString(PyExc_ValueError, "invalid code page number");
7843
        return NULL;
7844
    }
7845
    if (size < 0) {
7846
        PyErr_BadInternalCall();
7847
        return NULL;
7848
    }
7849
7850
    if (consumed)
7851
        *consumed = 0;
7852
7853
    do
7854
    {
7855
#ifdef NEED_RETRY
7856
        if (size > DECODING_CHUNK_SIZE) {
7857
            chunk_size = DECODING_CHUNK_SIZE;
7858
            final = 0;
7859
            done = 0;
7860
        }
7861
        else
7862
#endif
7863
        {
7864
            chunk_size = (int)size;
7865
            final = (consumed == NULL);
7866
            done = 1;
7867
        }
7868
7869
        if (chunk_size == 0 && done) {
7870
            if (buf != NULL)
7871
                break;
7872
            _Py_RETURN_UNICODE_EMPTY();
7873
        }
7874
7875
        converted = decode_code_page_strict(code_page, &buf, &bufsize,
7876
                                            s, chunk_size);
7877
        if (converted == -2)
7878
            converted = decode_code_page_errors(code_page, &buf, &bufsize,
7879
                                                s, chunk_size,
7880
                                                errors, final);
7881
        assert(converted != 0 || done);
7882
7883
        if (converted < 0) {
7884
            PyMem_Free(buf);
7885
            return NULL;
7886
        }
7887
7888
        if (consumed)
7889
            *consumed += converted;
7890
7891
        s += converted;
7892
        size -= converted;
7893
    } while (!done);
7894
7895
    PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7896
    PyMem_Free(buf);
7897
    return v;
7898
}
7899
7900
PyObject *
7901
PyUnicode_DecodeCodePageStateful(int code_page,
7902
                                 const char *s,
7903
                                 Py_ssize_t size,
7904
                                 const char *errors,
7905
                                 Py_ssize_t *consumed)
7906
{
7907
    return decode_code_page_stateful(code_page, s, size, errors, consumed);
7908
}
7909
7910
PyObject *
7911
PyUnicode_DecodeMBCSStateful(const char *s,
7912
                             Py_ssize_t size,
7913
                             const char *errors,
7914
                             Py_ssize_t *consumed)
7915
{
7916
    return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7917
}
7918
7919
PyObject *
7920
PyUnicode_DecodeMBCS(const char *s,
7921
                     Py_ssize_t size,
7922
                     const char *errors)
7923
{
7924
    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7925
}
7926
7927
static DWORD
7928
encode_code_page_flags(UINT code_page, const char *errors)
7929
{
7930
    if (code_page == CP_UTF8) {
7931
        return WC_ERR_INVALID_CHARS;
7932
    }
7933
    else if (code_page == CP_UTF7) {
7934
        /* CP_UTF7 only supports flags=0 */
7935
        return 0;
7936
    }
7937
    else {
7938
        if (errors != NULL && strcmp(errors, "replace") == 0)
7939
            return 0;
7940
        else
7941
            return WC_NO_BEST_FIT_CHARS;
7942
    }
7943
}
7944
7945
/*
7946
 * Encode a Unicode string to a Windows code page into a byte string in strict
7947
 * mode.
7948
 *
7949
 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7950
 * an OSError and returns -1 on other error.
7951
 */
7952
static int
7953
encode_code_page_strict(UINT code_page, PyBytesWriter **writer,
7954
                        PyObject *unicode, Py_ssize_t offset, int len,
7955
                        const char* errors)
7956
{
7957
    BOOL usedDefaultChar = FALSE;
7958
    BOOL *pusedDefaultChar = &usedDefaultChar;
7959
    int outsize;
7960
    wchar_t *p;
7961
    Py_ssize_t size;
7962
    const DWORD flags = encode_code_page_flags(code_page, NULL);
7963
    char *out;
7964
    /* Create a substring so that we can get the UTF-16 representation
7965
       of just the slice under consideration. */
7966
    PyObject *substring;
7967
    int ret = -1;
7968
7969
    assert(len > 0);
7970
7971
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
7972
        pusedDefaultChar = &usedDefaultChar;
7973
    else
7974
        pusedDefaultChar = NULL;
7975
7976
    substring = PyUnicode_Substring(unicode, offset, offset+len);
7977
    if (substring == NULL)
7978
        return -1;
7979
    p = PyUnicode_AsWideCharString(substring, &size);
7980
    Py_CLEAR(substring);
7981
    if (p == NULL) {
7982
        return -1;
7983
    }
7984
    assert(size <= INT_MAX);
7985
7986
    /* First get the size of the result */
7987
    outsize = WideCharToMultiByte(code_page, flags,
7988
                                  p, (int)size,
7989
                                  NULL, 0,
7990
                                  NULL, pusedDefaultChar);
7991
    if (outsize <= 0)
7992
        goto error;
7993
    /* If we used a default char, then we failed! */
7994
    if (pusedDefaultChar && *pusedDefaultChar) {
7995
        ret = -2;
7996
        goto done;
7997
    }
7998
7999
    if (*writer == NULL) {
8000
        /* Create string object */
8001
        *writer = PyBytesWriter_Create(outsize);
8002
        if (*writer == NULL) {
8003
            goto done;
8004
        }
8005
        out = PyBytesWriter_GetData(*writer);
8006
    }
8007
    else {
8008
        /* Extend string object */
8009
        Py_ssize_t n = PyBytesWriter_GetSize(*writer);
8010
        if (PyBytesWriter_Grow(*writer, outsize) < 0) {
8011
            goto done;
8012
        }
8013
        out = (char*)PyBytesWriter_GetData(*writer) + n;
8014
    }
8015
8016
    /* Do the conversion */
8017
    outsize = WideCharToMultiByte(code_page, flags,
8018
                                  p, (int)size,
8019
                                  out, outsize,
8020
                                  NULL, pusedDefaultChar);
8021
    if (outsize <= 0)
8022
        goto error;
8023
    if (pusedDefaultChar && *pusedDefaultChar) {
8024
        ret = -2;
8025
        goto done;
8026
    }
8027
    ret = 0;
8028
8029
done:
8030
    PyMem_Free(p);
8031
    return ret;
8032
8033
error:
8034
    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
8035
        ret = -2;
8036
        goto done;
8037
    }
8038
    PyErr_SetFromWindowsErr(0);
8039
    goto done;
8040
}
8041
8042
/*
8043
 * Encode a Unicode string to a Windows code page into a byte string using an
8044
 * error handler.
8045
 *
8046
 * Returns consumed characters if succeed, or raise an OSError and returns
8047
 * -1 on other error.
8048
 */
8049
static int
8050
encode_code_page_errors(UINT code_page, PyBytesWriter **writer,
8051
                        PyObject *unicode, Py_ssize_t unicode_offset,
8052
                        Py_ssize_t insize, const char* errors)
8053
{
8054
    const DWORD flags = encode_code_page_flags(code_page, errors);
8055
    Py_ssize_t pos = unicode_offset;
8056
    Py_ssize_t endin = unicode_offset + insize;
8057
    /* Ideally, we should get reason from FormatMessage. This is the Windows
8058
       2000 English version of the message. */
8059
    const char *reason = "invalid character";
8060
    /* 4=maximum length of a UTF-8 sequence */
8061
    char buffer[4];
8062
    BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
8063
    Py_ssize_t outsize;
8064
    char *out;
8065
    PyObject *errorHandler = NULL;
8066
    PyObject *exc = NULL;
8067
    PyObject *encoding_obj = NULL;
8068
    const char *encoding;
8069
    Py_ssize_t newpos;
8070
    PyObject *rep;
8071
    int ret = -1;
8072
8073
    assert(insize > 0);
8074
8075
    encoding = code_page_name(code_page, &encoding_obj);
8076
    if (encoding == NULL)
8077
        return -1;
8078
8079
    if (errors == NULL || strcmp(errors, "strict") == 0) {
8080
        /* The last error was ERROR_NO_UNICODE_TRANSLATION,
8081
           then we raise a UnicodeEncodeError. */
8082
        make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
8083
        if (exc != NULL) {
8084
            PyCodec_StrictErrors(exc);
8085
            Py_DECREF(exc);
8086
        }
8087
        Py_XDECREF(encoding_obj);
8088
        return -1;
8089
    }
8090
8091
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
8092
        pusedDefaultChar = &usedDefaultChar;
8093
    else
8094
        pusedDefaultChar = NULL;
8095
8096
    if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
8097
        PyErr_NoMemory();
8098
        goto error;
8099
    }
8100
    outsize = insize * Py_ARRAY_LENGTH(buffer);
8101
8102
    if (*writer == NULL) {
8103
        /* Create string object */
8104
        *writer = PyBytesWriter_Create(outsize);
8105
        if (*writer == NULL) {
8106
            goto error;
8107
        }
8108
        out = PyBytesWriter_GetData(*writer);
8109
    }
8110
    else {
8111
        /* Extend string object */
8112
        Py_ssize_t n = PyBytesWriter_GetSize(*writer);
8113
        if (PyBytesWriter_Grow(*writer, outsize) < 0) {
8114
            goto error;
8115
        }
8116
        out = (char*)PyBytesWriter_GetData(*writer) + n;
8117
    }
8118
8119
    /* Encode the string character per character */
8120
    while (pos < endin)
8121
    {
8122
        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
8123
        wchar_t chars[2];
8124
        int charsize;
8125
        if (ch < 0x10000) {
8126
            chars[0] = (wchar_t)ch;
8127
            charsize = 1;
8128
        }
8129
        else {
8130
            chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
8131
            chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
8132
            charsize = 2;
8133
        }
8134
8135
        outsize = WideCharToMultiByte(code_page, flags,
8136
                                      chars, charsize,
8137
                                      buffer, Py_ARRAY_LENGTH(buffer),
8138
                                      NULL, pusedDefaultChar);
8139
        if (outsize > 0) {
8140
            if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
8141
            {
8142
                pos++;
8143
                memcpy(out, buffer, outsize);
8144
                out += outsize;
8145
                continue;
8146
            }
8147
        }
8148
        else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
8149
            PyErr_SetFromWindowsErr(0);
8150
            goto error;
8151
        }
8152
8153
        rep = unicode_encode_call_errorhandler(
8154
                  errors, &errorHandler, encoding, reason,
8155
                  unicode, &exc,
8156
                  pos, pos + 1, &newpos);
8157
        if (rep == NULL)
8158
            goto error;
8159
8160
        Py_ssize_t morebytes = pos - newpos;
8161
        if (PyBytes_Check(rep)) {
8162
            outsize = PyBytes_GET_SIZE(rep);
8163
            morebytes += outsize;
8164
            if (morebytes > 0) {
8165
                out = PyBytesWriter_GrowAndUpdatePointer(*writer, morebytes, out);
8166
                if (out == NULL) {
8167
                    Py_DECREF(rep);
8168
                    goto error;
8169
                }
8170
            }
8171
            memcpy(out, PyBytes_AS_STRING(rep), outsize);
8172
            out += outsize;
8173
        }
8174
        else {
8175
            Py_ssize_t i;
8176
            int kind;
8177
            const void *data;
8178
8179
            outsize = PyUnicode_GET_LENGTH(rep);
8180
            morebytes += outsize;
8181
            if (morebytes > 0) {
8182
                out = PyBytesWriter_GrowAndUpdatePointer(*writer, morebytes, out);
8183
                if (out == NULL) {
8184
                    Py_DECREF(rep);
8185
                    goto error;
8186
                }
8187
            }
8188
            kind = PyUnicode_KIND(rep);
8189
            data = PyUnicode_DATA(rep);
8190
            for (i=0; i < outsize; i++) {
8191
                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8192
                if (ch > 127) {
8193
                    raise_encode_exception(&exc,
8194
                        encoding, unicode,
8195
                        pos, pos + 1,
8196
                        "unable to encode error handler result to ASCII");
8197
                    Py_DECREF(rep);
8198
                    goto error;
8199
                }
8200
                *out = (unsigned char)ch;
8201
                out++;
8202
            }
8203
        }
8204
        pos = newpos;
8205
        Py_DECREF(rep);
8206
    }
8207
    /* write a NUL byte */
8208
    *out = 0;
8209
    outsize = out - (char*)PyBytesWriter_GetData(*writer);
8210
    assert(outsize <= PyBytesWriter_GetSize(*writer));
8211
    if (PyBytesWriter_Resize(*writer, outsize) < 0) {
8212
        goto error;
8213
    }
8214
    ret = 0;
8215
8216
error:
8217
    Py_XDECREF(encoding_obj);
8218
    Py_XDECREF(errorHandler);
8219
    Py_XDECREF(exc);
8220
    return ret;
8221
}
8222
8223
8224
/* Encode a str object to a bytes object using a Windows code page.
 *
 * When NEED_RETRY is defined, the string is processed in chunks of at most
 * DECODING_CHUNK_SIZE characters.  Each chunk is first encoded in strict
 * mode; if that reports an encode error (-2) the chunk is encoded again with
 * the error handler named by `errors`.
 *
 * Returns a new reference, or NULL with an exception set. */
PyObject *
PyUnicode_EncodeCodePage(int code_page,
                         PyObject *unicode,
                         const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (code_page < 0) {
        PyErr_SetString(PyExc_ValueError, "invalid code page number");
        return NULL;
    }

    Py_ssize_t remaining = PyUnicode_GET_LENGTH(unicode);
    if (remaining == 0) {
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
    }

    PyBytesWriter *writer = NULL;
    Py_ssize_t offset = 0;
    int done;

    do {
        int chunk_len;

#ifdef NEED_RETRY
        if (remaining > DECODING_CHUNK_SIZE) {
            /* More characters remain after this chunk. */
            chunk_len = DECODING_CHUNK_SIZE;
            done = 0;
        }
        else
#endif
        {
            /* Last (or only) chunk. */
            chunk_len = (int)remaining;
            done = 1;
        }

        int ret = encode_code_page_strict(code_page, &writer,
                                          unicode, offset, chunk_len,
                                          errors);
        if (ret == -2) {
            ret = encode_code_page_errors(code_page, &writer,
                                          unicode, offset,
                                          chunk_len, errors);
        }
        if (ret < 0) {
            PyBytesWriter_Discard(writer);
            return NULL;
        }

        offset += chunk_len;
        remaining -= chunk_len;
    } while (!done);

    return PyBytesWriter_Finish(writer);
}
8282
8283
8284
/* Encode a str object with the CP_ACP code page and no explicit error
   handler (errors == NULL). */
PyObject *
PyUnicode_AsMBCSString(PyObject *unicode)
{
    const char *errors = NULL;
    return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
}
8289
8290
#undef NEED_RETRY
8291
8292
#endif /* MS_WINDOWS */
8293
8294
/* --- Character Mapping Codec -------------------------------------------- */
8295
8296
static int
8297
charmap_decode_string(const char *s,
8298
                      Py_ssize_t size,
8299
                      PyObject *mapping,
8300
                      const char *errors,
8301
                      _PyUnicodeWriter *writer)
8302
1.42k
{
8303
1.42k
    const char *starts = s;
8304
1.42k
    const char *e;
8305
1.42k
    Py_ssize_t startinpos, endinpos;
8306
1.42k
    PyObject *errorHandler = NULL, *exc = NULL;
8307
1.42k
    Py_ssize_t maplen;
8308
1.42k
    int mapkind;
8309
1.42k
    const void *mapdata;
8310
1.42k
    Py_UCS4 x;
8311
1.42k
    unsigned char ch;
8312
8313
1.42k
    maplen = PyUnicode_GET_LENGTH(mapping);
8314
1.42k
    mapdata = PyUnicode_DATA(mapping);
8315
1.42k
    mapkind = PyUnicode_KIND(mapping);
8316
8317
0
    e = s + size;
8318
8319
1.42k
    if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
8320
        /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
8321
         * is disabled in encoding aliases, latin1 is preferred because
8322
         * its implementation is faster. */
8323
24
        const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
8324
24
        Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8325
24
        Py_UCS4 maxchar = writer->maxchar;
8326
8327
24
        assert (writer->kind == PyUnicode_1BYTE_KIND);
8328
1.72k
        while (s < e) {
8329
1.70k
            ch = *s;
8330
1.70k
            x = mapdata_ucs1[ch];
8331
1.70k
            if (x > maxchar) {
8332
24
                if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
8333
0
                    goto onError;
8334
24
                maxchar = writer->maxchar;
8335
24
                outdata = (Py_UCS1 *)writer->data;
8336
24
            }
8337
1.70k
            outdata[writer->pos] = x;
8338
1.70k
            writer->pos++;
8339
1.70k
            ++s;
8340
1.70k
        }
8341
24
        return 0;
8342
24
    }
8343
8344
7.02k
    while (s < e) {
8345
6.67k
        if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
8346
6.67k
            int outkind = writer->kind;
8347
6.67k
            const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
8348
6.67k
            if (outkind == PyUnicode_1BYTE_KIND) {
8349
2.59k
                Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8350
2.59k
                Py_UCS4 maxchar = writer->maxchar;
8351
401k
                while (s < e) {
8352
401k
                    ch = *s;
8353
401k
                    x = mapdata_ucs2[ch];
8354
401k
                    if (x > maxchar)
8355
2.46k
                        goto Error;
8356
399k
                    outdata[writer->pos] = x;
8357
399k
                    writer->pos++;
8358
399k
                    ++s;
8359
399k
                }
8360
136
                break;
8361
2.59k
            }
8362
4.07k
            else if (outkind == PyUnicode_2BYTE_KIND) {
8363
4.07k
                Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
8364
1.59M
                while (s < e) {
8365
1.59M
                    ch = *s;
8366
1.59M
                    x = mapdata_ucs2[ch];
8367
1.59M
                    if (x == 0xFFFE)
8368
3.16k
                        goto Error;
8369
1.59M
                    outdata[writer->pos] = x;
8370
1.59M
                    writer->pos++;
8371
1.59M
                    ++s;
8372
1.59M
                }
8373
917
                break;
8374
4.07k
            }
8375
6.67k
        }
8376
0
        ch = *s;
8377
8378
0
        if (ch < maplen)
8379
0
            x = PyUnicode_READ(mapkind, mapdata, ch);
8380
0
        else
8381
0
            x = 0xfffe; /* invalid value */
8382
5.62k
Error:
8383
5.62k
        if (x == 0xfffe)
8384
3.17k
        {
8385
            /* undefined mapping */
8386
3.17k
            startinpos = s-starts;
8387
3.17k
            endinpos = startinpos+1;
8388
3.17k
            if (unicode_decode_call_errorhandler_writer(
8389
3.17k
                    errors, &errorHandler,
8390
3.17k
                    "charmap", "character maps to <undefined>",
8391
3.17k
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
8392
3.17k
                    writer)) {
8393
2
                goto onError;
8394
2
            }
8395
3.16k
            continue;
8396
3.17k
        }
8397
8398
2.45k
        if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8399
0
            goto onError;
8400
2.45k
        ++s;
8401
2.45k
    }
8402
1.39k
    Py_XDECREF(errorHandler);
8403
1.39k
    Py_XDECREF(exc);
8404
1.39k
    return 0;
8405
8406
2
onError:
8407
2
    Py_XDECREF(errorHandler);
8408
2
    Py_XDECREF(exc);
8409
2
    return -1;
8410
1.40k
}
8411
8412
static int
8413
charmap_decode_mapping(const char *s,
8414
                       Py_ssize_t size,
8415
                       PyObject *mapping,
8416
                       const char *errors,
8417
                       _PyUnicodeWriter *writer)
8418
0
{
8419
0
    const char *starts = s;
8420
0
    const char *e;
8421
0
    Py_ssize_t startinpos, endinpos;
8422
0
    PyObject *errorHandler = NULL, *exc = NULL;
8423
0
    unsigned char ch;
8424
0
    PyObject *key, *item = NULL;
8425
8426
0
    e = s + size;
8427
8428
0
    while (s < e) {
8429
0
        ch = *s;
8430
8431
        /* Get mapping (char ordinal -> integer, Unicode char or None) */
8432
0
        key = PyLong_FromLong((long)ch);
8433
0
        if (key == NULL)
8434
0
            goto onError;
8435
8436
0
        int rc = PyMapping_GetOptionalItem(mapping, key, &item);
8437
0
        Py_DECREF(key);
8438
0
        if (rc == 0) {
8439
            /* No mapping found means: mapping is undefined. */
8440
0
            goto Undefined;
8441
0
        }
8442
0
        if (item == NULL) {
8443
0
            if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8444
                /* No mapping found means: mapping is undefined. */
8445
0
                PyErr_Clear();
8446
0
                goto Undefined;
8447
0
            } else
8448
0
                goto onError;
8449
0
        }
8450
8451
        /* Apply mapping */
8452
0
        if (item == Py_None)
8453
0
            goto Undefined;
8454
0
        if (PyLong_Check(item)) {
8455
0
            long value = PyLong_AsLong(item);
8456
0
            if (value == 0xFFFE)
8457
0
                goto Undefined;
8458
0
            if (value < 0 || value > MAX_UNICODE) {
8459
0
                PyErr_Format(PyExc_TypeError,
8460
0
                             "character mapping must be in range(0x%x)",
8461
0
                             (unsigned long)MAX_UNICODE + 1);
8462
0
                goto onError;
8463
0
            }
8464
8465
0
            if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8466
0
                goto onError;
8467
0
        }
8468
0
        else if (PyUnicode_Check(item)) {
8469
0
            if (PyUnicode_GET_LENGTH(item) == 1) {
8470
0
                Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8471
0
                if (value == 0xFFFE)
8472
0
                    goto Undefined;
8473
0
                if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8474
0
                    goto onError;
8475
0
            }
8476
0
            else {
8477
0
                writer->overallocate = 1;
8478
0
                if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8479
0
                    goto onError;
8480
0
            }
8481
0
        }
8482
0
        else {
8483
            /* wrong return value */
8484
0
            PyErr_SetString(PyExc_TypeError,
8485
0
                            "character mapping must return integer, None or str");
8486
0
            goto onError;
8487
0
        }
8488
0
        Py_CLEAR(item);
8489
0
        ++s;
8490
0
        continue;
8491
8492
0
Undefined:
8493
        /* undefined mapping */
8494
0
        Py_CLEAR(item);
8495
0
        startinpos = s-starts;
8496
0
        endinpos = startinpos+1;
8497
0
        if (unicode_decode_call_errorhandler_writer(
8498
0
                errors, &errorHandler,
8499
0
                "charmap", "character maps to <undefined>",
8500
0
                &starts, &e, &startinpos, &endinpos, &exc, &s,
8501
0
                writer)) {
8502
0
            goto onError;
8503
0
        }
8504
0
    }
8505
0
    Py_XDECREF(errorHandler);
8506
0
    Py_XDECREF(exc);
8507
0
    return 0;
8508
8509
0
onError:
8510
0
    Py_XDECREF(item);
8511
0
    Py_XDECREF(errorHandler);
8512
0
    Py_XDECREF(exc);
8513
0
    return -1;
8514
0
}
8515
8516
PyObject *
8517
PyUnicode_DecodeCharmap(const char *s,
8518
                        Py_ssize_t size,
8519
                        PyObject *mapping,
8520
                        const char *errors)
8521
1.42k
{
8522
1.42k
    _PyUnicodeWriter writer;
8523
8524
    /* Default to Latin-1 */
8525
1.42k
    if (mapping == NULL)
8526
0
        return PyUnicode_DecodeLatin1(s, size, errors);
8527
8528
1.42k
    if (size == 0)
8529
0
        _Py_RETURN_UNICODE_EMPTY();
8530
1.42k
    _PyUnicodeWriter_Init(&writer);
8531
1.42k
    writer.min_length = size;
8532
1.42k
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
8533
0
        goto onError;
8534
8535
1.42k
    if (PyUnicode_CheckExact(mapping)) {
8536
1.42k
        if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8537
2
            goto onError;
8538
1.42k
    }
8539
0
    else {
8540
0
        if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8541
0
            goto onError;
8542
0
    }
8543
1.42k
    return _PyUnicodeWriter_Finish(&writer);
8544
8545
2
  onError:
8546
2
    _PyUnicodeWriter_Dealloc(&writer);
8547
2
    return NULL;
8548
1.42k
}
8549
8550
/* Charmap encoding: the lookup table */
8551
8552
/*[clinic input]
8553
class EncodingMap "struct encoding_map *" "&EncodingMapType"
8554
[clinic start generated code]*/
8555
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=14e46bbb6c522d22]*/
8556
8557
/* Three-level lookup table from a BMP code point to a byte value, as built
   by PyUnicode_BuildEncodingMap() and read by encoding_map_lookup().
   A code point c is split into c>>11 (level 1), (c>>7) & 0xF (level 2)
   and c & 0x7F (level 3). */
struct encoding_map {
8558
    PyObject_HEAD
8559
    /* Indexed by c>>11; 0xFF marks an unused slot, any other value is the
       number of the 16-byte level-2 block to use. */
    unsigned char level1[32];
8560
    /* Number of 16-byte level-2 blocks and of 128-byte level-3 blocks
       stored in level23. */
    int count2, count3;
8561
    /* Start of the trailing storage: 16*count2 bytes of level-2 entries
       (0xFF = unused) followed by 128*count3 bytes of level-3 entries
       (0 = unmapped).  Declared with one element; the object is allocated
       with the extra space it needs. */
    unsigned char level23[1];
8562
};
8563
8564
/*[clinic input]
8565
EncodingMap.size
8566
8567
Return the size (in bytes) of this object.
8568
[clinic start generated code]*/
8569
8570
static PyObject *
8571
EncodingMap_size_impl(struct encoding_map *self)
8572
/*[clinic end generated code: output=c4c969e4c99342a4 input=004ff13f26bb5366]*/
8573
0
{
8574
0
    /* Same formula as the allocation in PyUnicode_BuildEncodingMap():
       the struct minus its one-byte level23 placeholder, plus 16 bytes per
       level-2 block and 128 bytes per level-3 block. */
    return PyLong_FromLong((sizeof(*self) - 1) + 16*self->count2 +
8575
0
                           128*self->count3);
8576
0
}
8577
8578
/* Method table of EncodingMapType: the single entry supplied by the
   ENCODINGMAP_SIZE_METHODDEF macro, followed by the terminating sentinel. */
static PyMethodDef encoding_map_methods[] = {
8579
    ENCODINGMAP_SIZE_METHODDEF
8580
    {NULL, NULL}
8581
};
8582
8583
/* Type object for struct encoding_map instances.  The charmap encoder
   compares a mapping's type against this object (Py_IS_TYPE) to choose the
   trie lookup over the generic mapping lookup. */
static PyTypeObject EncodingMapType = {
8584
    PyVarObject_HEAD_INIT(NULL, 0)
8585
    .tp_name = "EncodingMap",
8586
    /* Size of the fixed part only; PyUnicode_BuildEncodingMap() adds the
       level-2/level-3 tables when it allocates an instance. */
    .tp_basicsize = sizeof(struct encoding_map),
8587
    /* methods */
8588
    .tp_flags = Py_TPFLAGS_DEFAULT,
8589
    .tp_methods = encoding_map_methods,
8590
};
8591
8592
/* Build an encoding lookup object from a decoding table.

   `string` is read as a table in which position i holds the character
   that byte value i decodes to; only the first 256 positions are used and
   0xFFFE marks an unmapped position.

   Returns a new EncodingMap object (three-level trie) when the table fits
   that representation, otherwise a new dict mapping each character (as an
   int key) to its position.  Returns NULL with an error set when `string`
   is not a str or is empty, or when an allocation fails. */
PyObject*
8593
PyUnicode_BuildEncodingMap(PyObject* string)
8594
53
{
8595
53
    PyObject *result;
8596
53
    struct encoding_map *mresult;
8597
53
    int i;
8598
53
    int need_dict = 0;
8599
53
    /* Scratch tables for the first pass: which level-1 slots (ch>>11) and
       which level-2 slots (ch>>7) are needed; 0xFF means "not used yet". */
    unsigned char level1[32];
8600
53
    unsigned char level2[512];
8601
53
    unsigned char *mlevel1, *mlevel2, *mlevel3;
8602
53
    int count2 = 0, count3 = 0;
8603
53
    int kind;
8604
53
    const void *data;
8605
53
    int length;
8606
53
    Py_UCS4 ch;
8607
8608
53
    if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
8609
0
        PyErr_BadArgument();
8610
0
        return NULL;
8611
0
    }
8612
53
    kind = PyUnicode_KIND(string);
8613
53
    data = PyUnicode_DATA(string);
8614
53
    /* Positions beyond 255 cannot be byte values and are ignored. */
    length = (int)Py_MIN(PyUnicode_GET_LENGTH(string), 256);
8615
53
    memset(level1, 0xFF, sizeof level1);
8616
53
    memset(level2, 0xFF, sizeof level2);
8617
8618
    /* If there isn't a one-to-one mapping of NULL to \0,
8619
       or if there are non-BMP characters, we need to use
8620
       a mapping dictionary. */
8621
53
    if (PyUnicode_READ(kind, data, 0) != 0)
8622
0
        need_dict = 1;
8623
13.5k
    /* First pass: count the level-2 and level-3 blocks the trie needs.
       Position 0 was checked above, so the scan starts at 1. */
    for (i = 1; i < length; i++) {
8624
13.5k
        int l1, l2;
8625
13.5k
        ch = PyUnicode_READ(kind, data, i);
8626
13.5k
        if (ch == 0 || ch > 0xFFFF) {
8627
0
            need_dict = 1;
8628
0
            break;
8629
0
        }
8630
13.5k
        if (ch == 0xFFFE)
8631
            /* unmapped character */
8632
377
            continue;
8633
13.1k
        l1 = ch >> 11;
8634
13.1k
        l2 = ch >> 7;
8635
13.1k
        if (level1[l1] == 0xFF)
8636
95
            level1[l1] = count2++;
8637
13.1k
        if (level2[l2] == 0xFF)
8638
270
            level2[l2] = count3++;
8639
13.1k
    }
8640
8641
53
    /* Block numbers are stored in one byte and 0xFF is the "unused"
       marker, so the trie cannot hold 0xFF or more blocks per level. */
    if (count2 >= 0xFF || count3 >= 0xFF)
8642
0
        need_dict = 1;
8643
8644
53
    if (need_dict) {
8645
0
        /* Fallback: plain dict {character: position}.  Note that this
           local `result` shadows the function-level variable. */
        PyObject *result = PyDict_New();
8646
0
        if (!result)
8647
0
            return NULL;
8648
0
        for (i = 0; i < length; i++) {
8649
0
            Py_UCS4 c = PyUnicode_READ(kind, data, i);
8650
0
            PyObject *key = PyLong_FromLong(c);
8651
0
            if (key == NULL) {
8652
0
                Py_DECREF(result);
8653
0
                return NULL;
8654
0
            }
8655
0
            PyObject *value = PyLong_FromLong(i);
8656
0
            if (value == NULL) {
8657
0
                Py_DECREF(key);
8658
0
                Py_DECREF(result);
8659
0
                return NULL;
8660
0
            }
8661
0
            int rc = PyDict_SetItem(result, key, value);
8662
0
            Py_DECREF(key);
8663
0
            Py_DECREF(value);
8664
0
            if (rc < 0) {
8665
0
                Py_DECREF(result);
8666
0
                return NULL;
8667
0
            }
8668
0
        }
8669
0
        return result;
8670
0
    }
8671
8672
    /* Create a three-level trie */
8673
53
    /* The "- 1" accounts for the one-byte level23 placeholder that is
       already part of sizeof(struct encoding_map). */
    result = PyObject_Malloc(sizeof(struct encoding_map) +
8674
53
                             16*count2 + 128*count3 - 1);
8675
53
    if (!result) {
8676
0
        return PyErr_NoMemory();
8677
0
    }
8678
8679
53
    _PyObject_Init(result, &EncodingMapType);
8680
53
    mresult = (struct encoding_map*)result;
8681
53
    mresult->count2 = count2;
8682
53
    mresult->count3 = count3;
8683
53
    mlevel1 = mresult->level1;
8684
53
    mlevel2 = mresult->level23;
8685
53
    mlevel3 = mresult->level23 + 16*count2;
8686
53
    memcpy(mlevel1, level1, 32);
8687
53
    /* Level 2 starts out all "unused" (0xFF), level 3 all "unmapped" (0). */
    memset(mlevel2, 0xFF, 16*count2);
8688
53
    memset(mlevel3, 0, 128*count3);
8689
53
    /* Second pass: count3 is reused as the running number of the next
       level-3 block to hand out. */
    count3 = 0;
8690
13.5k
    for (i = 1; i < length; i++) {
8691
13.5k
        int o1, o2, o3, i2, i3;
8692
13.5k
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8693
13.5k
        if (ch == 0xFFFE)
8694
            /* unmapped character */
8695
377
            continue;
8696
13.1k
        o1 = ch>>11;
8697
13.1k
        o2 = (ch>>7) & 0xF;
8698
13.1k
        i2 = 16*mlevel1[o1] + o2;
8699
13.1k
        if (mlevel2[i2] == 0xFF)
8700
270
            mlevel2[i2] = count3++;
8701
13.1k
        o3 = ch & 0x7F;
8702
13.1k
        i3 = 128*mlevel2[i2] + o3;
8703
13.1k
        /* Store the byte value (the table position) for this character. */
        mlevel3[i3] = i;
8704
13.1k
    }
8705
53
    return result;
8706
53
}
8707
8708
/* Look up character c in the EncodingMap object `mapping`.
   Returns the byte value c encodes to, or -1 if c has no mapping.
   `mapping` is cast to struct encoding_map without a type check, so the
   caller has to pass an EncodingMap instance. */
static int
8709
encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
8710
0
{
8711
0
    struct encoding_map *map = (struct encoding_map*)mapping;
8712
0
    /* The three indices are computed up front but only used after the
       range check on c below. */
    int l1 = c>>11;
8713
0
    int l2 = (c>>7) & 0xF;
8714
0
    int l3 = c & 0x7F;
8715
0
    int i;
8716
8717
0
    if (c > 0xFFFF)
8718
0
        return -1;
8719
0
    /* Character 0 always encodes to byte 0 and is not stored in the trie;
       this is why a level-3 entry of 0 can mean "unmapped". */
    if (c == 0)
8720
0
        return 0;
8721
    /* level 1*/
8722
0
    i = map->level1[l1];
8723
0
    if (i == 0xFF) {
8724
0
        return -1;
8725
0
    }
8726
    /* level 2*/
8727
0
    i = map->level23[16*i+l2];
8728
0
    if (i == 0xFF) {
8729
0
        return -1;
8730
0
    }
8731
    /* level 3 */
8732
0
    /* Level-3 blocks are stored after the 16*count2 level-2 bytes. */
    i = map->level23[16*map->count2 + 128*i + l3];
8733
0
    if (i == 0) {
8734
0
        return -1;
8735
0
    }
8736
0
    return i;
8737
0
}
8738
8739
/* Look up character c in mapping, using c as an int key.
8740
   On success, return PyLong, PyBytes or None (if the character can't be found:
   the key is missing or the lookup raised LookupError).
8741
   If the result is PyLong, its value must be in range(256) and is put in
   *replace.
8742
   On error, return NULL; this includes an out-of-range int and a result of
   any other type (both reported as TypeError).
   The caller owns the returned reference.
8743
   */
8744
static PyObject *
8745
charmapencode_lookup(Py_UCS4 c, PyObject *mapping, unsigned char *replace)
8746
0
{
8747
0
    PyObject *w = PyLong_FromLong((long)c);
8748
0
    PyObject *x;
8749
8750
0
    if (w == NULL)
8751
0
        return NULL;
8752
0
    int rc = PyMapping_GetOptionalItem(mapping, w, &x);
8753
0
    Py_DECREF(w);
8754
0
    if (rc == 0) {
8755
        /* No mapping found means: mapping is undefined. */
8756
0
        Py_RETURN_NONE;
8757
0
    }
8758
0
    if (x == NULL) {
8759
0
        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8760
            /* No mapping found means: mapping is undefined. */
8761
0
            PyErr_Clear();
8762
0
            Py_RETURN_NONE;
8763
0
        } else
8764
0
            return NULL;
8765
0
    }
8766
0
    else if (x == Py_None)
8767
0
        return x;
8768
0
    else if (PyLong_Check(x)) {
8769
0
        long value = PyLong_AsLong(x);
8770
0
        if (value < 0 || value > 255) {
8771
0
            PyErr_SetString(PyExc_TypeError,
8772
0
                            "character mapping must be in range(256)");
8773
0
            Py_DECREF(x);
8774
0
            return NULL;
8775
0
        }
8776
0
        *replace = (unsigned char)value;
8777
0
        return x;
8778
0
    }
8779
0
    else if (PyBytes_Check(x))
8780
0
        return x;
8781
0
    else {
8782
        /* wrong return value */
8783
0
        PyErr_Format(PyExc_TypeError,
8784
0
                     "character mapping must return integer, bytes or None, not %.400s",
8785
0
                     Py_TYPE(x)->tp_name);
8786
0
        Py_DECREF(x);
8787
0
        return NULL;
8788
0
    }
8789
0
}
8790
8791
/* Grow `writer` so that it can hold at least `requiredsize` bytes.
   Returns the result of PyBytesWriter_Resize(); callers in this file treat
   any non-zero value as failure.  `outpos` is accepted but not used. */
static int
8792
charmapencode_resize(PyBytesWriter *writer, Py_ssize_t *outpos, Py_ssize_t requiredsize)
8793
0
{
8794
0
    Py_ssize_t outsize = PyBytesWriter_GetSize(writer);
8795
    /* exponentially overallocate to minimize reallocations */
8796
0
    if (requiredsize < 2 * outsize)
8797
0
        requiredsize = 2 * outsize;
8798
0
    return PyBytesWriter_Resize(writer, requiredsize);
8799
0
}
8800
8801
/* Outcome of charmapencode_output():
   enc_SUCCESS   - the character was encoded and written,
   enc_FAILED    - the mapping has no entry for it, nothing was written,
   enc_EXCEPTION - the lookup or a buffer resize failed. */
typedef enum charmapencode_result {
8802
    enc_SUCCESS, enc_FAILED, enc_EXCEPTION
8803
} charmapencode_result;
8804
/* Look up c in mapping, append its encoding to the writer at *outpos and
8805
   advance *outpos past the bytes written. The writer is grown through
8806
   charmapencode_resize() if not enough space is available.
8807
   Return enc_SUCCESS if output was written, enc_FAILED if the mapping was
8808
   undefined for c (in which case no character was written) or
8809
   enc_EXCEPTION if the lookup or the reallocation failed. */
8810
static charmapencode_result
8811
charmapencode_output(Py_UCS4 c, PyObject *mapping,
8812
                     PyBytesWriter *writer, Py_ssize_t *outpos)
8813
0
{
8814
0
    PyObject *rep;
8815
0
    unsigned char replace;
8816
0
    char *outstart;
8817
0
    Py_ssize_t outsize = _PyBytesWriter_GetSize(writer);
8818
8819
0
    /* EncodingMap objects are queried directly through the trie and
       always produce exactly one byte. */
    if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8820
0
        int res = encoding_map_lookup(c, mapping);
8821
0
        Py_ssize_t requiredsize = *outpos+1;
8822
0
        if (res == -1) {
8823
0
            return enc_FAILED;
8824
0
        }
8825
8826
0
        if (outsize<requiredsize) {
8827
0
            if (charmapencode_resize(writer, outpos, requiredsize)) {
8828
0
                return enc_EXCEPTION;
8829
0
            }
8830
0
        }
8831
0
        /* Fetch the data pointer only after the possible resize. */
        outstart = _PyBytesWriter_GetData(writer);
8832
0
        outstart[(*outpos)++] = (char)res;
8833
0
        return enc_SUCCESS;
8834
0
    }
8835
8836
0
    /* Generic mapping: rep is an owned reference to an int, bytes or None
       and is released on every path below. */
    rep = charmapencode_lookup(c, mapping, &replace);
8837
0
    if (rep==NULL)
8838
0
        return enc_EXCEPTION;
8839
0
    else if (rep==Py_None) {
8840
0
        Py_DECREF(rep);
8841
0
        return enc_FAILED;
8842
0
    } else {
8843
0
        if (PyLong_Check(rep)) {
8844
0
            /* int result: a single byte whose value is in `replace`. */
            Py_ssize_t requiredsize = *outpos+1;
8845
0
            if (outsize<requiredsize)
8846
0
                if (charmapencode_resize(writer, outpos, requiredsize)) {
8847
0
                    Py_DECREF(rep);
8848
0
                    return enc_EXCEPTION;
8849
0
                }
8850
0
            outstart = _PyBytesWriter_GetData(writer);
8851
0
            outstart[(*outpos)++] = (char)replace;
8852
0
        }
8853
0
        else {
8854
0
            /* bytes result: copy all of its bytes to the output. */
            const char *repchars = PyBytes_AS_STRING(rep);
8855
0
            Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8856
0
            Py_ssize_t requiredsize = *outpos+repsize;
8857
0
            if (outsize<requiredsize)
8858
0
                if (charmapencode_resize(writer, outpos, requiredsize)) {
8859
0
                    Py_DECREF(rep);
8860
0
                    return enc_EXCEPTION;
8861
0
                }
8862
0
            outstart = _PyBytesWriter_GetData(writer);
8863
0
            memcpy(outstart + *outpos, repchars, repsize);
8864
0
            *outpos += repsize;
8865
0
        }
8866
0
    }
8867
0
    Py_DECREF(rep);
8868
0
    return enc_SUCCESS;
8869
0
}
8870
8871
/* handle an error in _PyUnicode_EncodeCharmap()
8872
   The character at *inpos could not be encoded.  The run of consecutive
   unencodable characters starting there is collected, then handled
   according to the error handler named by errors (resolved on the first
   call and cached in *error_handler).  Replacement output is appended to
   writer at *respos and *inpos is set to where encoding should resume.
   Return 0 on success, -1 on error */
8873
static int
8874
charmap_encoding_error(
8875
    PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
8876
    PyObject **exceptionObject,
8877
    _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
8878
    PyBytesWriter *writer, Py_ssize_t *respos)
8879
0
{
8880
0
    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8881
0
    Py_ssize_t size, repsize;
8882
0
    Py_ssize_t newpos;
8883
0
    int kind;
8884
0
    const void *data;
8885
0
    Py_ssize_t index;
8886
    /* startpos for collecting unencodable chars */
8887
0
    Py_ssize_t collstartpos = *inpos;
8888
0
    Py_ssize_t collendpos = *inpos+1;
8889
0
    Py_ssize_t collpos;
8890
0
    const char *encoding = "charmap";
8891
0
    const char *reason = "character maps to <undefined>";
8892
0
    charmapencode_result x;
8893
0
    Py_UCS4 ch;
8894
0
    int val;
8895
8896
0
    size = PyUnicode_GET_LENGTH(unicode);
8897
    /* find all unencodable characters */
8898
0
    while (collendpos < size) {
8899
0
        PyObject *rep;
8900
0
        unsigned char replace;
8901
0
        if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8902
0
            ch = PyUnicode_READ_CHAR(unicode, collendpos);
8903
0
            val = encoding_map_lookup(ch, mapping);
8904
0
            if (val != -1)
8905
0
                break;
8906
0
            ++collendpos;
8907
0
            continue;
8908
0
        }
8909
8910
0
        ch = PyUnicode_READ_CHAR(unicode, collendpos);
8911
0
        rep = charmapencode_lookup(ch, mapping, &replace);
8912
0
        if (rep==NULL)
8913
0
            return -1;
8914
0
        else if (rep!=Py_None) {
8915
0
            /* First encodable character: the run ends here. */
            Py_DECREF(rep);
8916
0
            break;
8917
0
        }
8918
0
        Py_DECREF(rep);
8919
0
        ++collendpos;
8920
0
    }
8921
    /* cache callback name lookup
8922
     * (if not done yet, i.e. it's the first error) */
8923
0
    if (*error_handler == _Py_ERROR_UNKNOWN)
8924
0
        *error_handler = _Py_GetErrorHandler(errors);
8925
8926
0
    switch (*error_handler) {
8927
0
    case _Py_ERROR_STRICT:
8928
0
        raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8929
0
        return -1;
8930
8931
0
    case _Py_ERROR_REPLACE:
8932
0
        /* One '?' per unencodable character; the '?' itself is encoded
           through the mapping and raises if the mapping lacks it. */
        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
8933
0
            x = charmapencode_output('?', mapping, writer, respos);
8934
0
            if (x==enc_EXCEPTION) {
8935
0
                return -1;
8936
0
            }
8937
0
            else if (x==enc_FAILED) {
8938
0
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8939
0
                return -1;
8940
0
            }
8941
0
        }
8942
0
        _Py_FALLTHROUGH;
8943
0
    case _Py_ERROR_IGNORE:
8944
0
        *inpos = collendpos;
8945
0
        break;
8946
8947
0
    case _Py_ERROR_XMLCHARREFREPLACE:
8948
        /* generate replacement: "&#N;" for each collected character, each
           output character being encoded through the mapping */
8949
0
        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
8950
0
            char buffer[2+29+1+1];
8951
0
            char *cp;
8952
0
            sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
8953
0
            for (cp = buffer; *cp; ++cp) {
8954
0
                x = charmapencode_output(*cp, mapping, writer, respos);
8955
0
                if (x==enc_EXCEPTION)
8956
0
                    return -1;
8957
0
                else if (x==enc_FAILED) {
8958
0
                    raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8959
0
                    return -1;
8960
0
                }
8961
0
            }
8962
0
        }
8963
0
        *inpos = collendpos;
8964
0
        break;
8965
8966
0
    default:
8967
0
        /* Any other handler: call the registered callback, which returns
           the replacement (bytes or str) and the position to resume at. */
        repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
8968
0
                                                      encoding, reason, unicode, exceptionObject,
8969
0
                                                      collstartpos, collendpos, &newpos);
8970
0
        if (repunicode == NULL)
8971
0
            return -1;
8972
0
        if (PyBytes_Check(repunicode)) {
8973
            /* Directly copy bytes result to output. */
8974
0
            Py_ssize_t outsize = PyBytesWriter_GetSize(writer);
8975
0
            Py_ssize_t requiredsize;
8976
0
            repsize = PyBytes_Size(repunicode);
8977
0
            requiredsize = *respos + repsize;
8978
0
            if (requiredsize > outsize)
8979
                /* Make room for all additional bytes. */
8980
0
                if (charmapencode_resize(writer, respos, requiredsize)) {
8981
0
                    Py_DECREF(repunicode);
8982
0
                    return -1;
8983
0
                }
8984
0
            memcpy((char*)PyBytesWriter_GetData(writer) + *respos,
8985
0
                   PyBytes_AsString(repunicode),  repsize);
8986
0
            *respos += repsize;
8987
0
            *inpos = newpos;
8988
0
            Py_DECREF(repunicode);
8989
0
            break;
8990
0
        }
8991
        /* generate replacement  */
8992
0
        /* str result: every replacement character must itself be encodable
           through the mapping, otherwise the original error is raised. */
        repsize = PyUnicode_GET_LENGTH(repunicode);
8993
0
        data = PyUnicode_DATA(repunicode);
8994
0
        kind = PyUnicode_KIND(repunicode);
8995
0
        for (index = 0; index < repsize; index++) {
8996
0
            Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8997
0
            x = charmapencode_output(repch, mapping, writer, respos);
8998
0
            if (x==enc_EXCEPTION) {
8999
0
                Py_DECREF(repunicode);
9000
0
                return -1;
9001
0
            }
9002
0
            else if (x==enc_FAILED) {
9003
0
                Py_DECREF(repunicode);
9004
0
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
9005
0
                return -1;
9006
0
            }
9007
0
        }
9008
0
        *inpos = newpos;
9009
0
        Py_DECREF(repunicode);
9010
0
    }
9011
0
    return 0;
9012
0
}
9013
9014
/* Encode the str `unicode` to bytes through `mapping`.

   A NULL mapping is delegated to unicode_encode_ucs1() with a limit of 256.
   An EncodingMap object is queried inline through encoding_map_lookup();
   any other mapping goes through charmapencode_output().  Characters
   without a mapping are passed to charmap_encoding_error(), which applies
   the error handler named by `errors`.

   Returns the result of PyBytesWriter_FinishWithSize() on success, or NULL
   on error. */
PyObject *
9015
_PyUnicode_EncodeCharmap(PyObject *unicode,
9016
                         PyObject *mapping,
9017
                         const char *errors)
9018
0
{
9019
    /* Default to Latin-1 */
9020
0
    if (mapping == NULL) {
9021
0
        return unicode_encode_ucs1(unicode, errors, 256);
9022
0
    }
9023
9024
0
    Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
9025
0
    if (size == 0) {
9026
0
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
9027
0
    }
9028
0
    const void *data = PyUnicode_DATA(unicode);
9029
0
    int kind = PyUnicode_KIND(unicode);
9030
9031
0
    /* Both are filled in lazily by the error path and released before
       returning. */
    PyObject *error_handler_obj = NULL;
9032
0
    PyObject *exc = NULL;
9033
9034
    /* output object */
9035
0
    PyBytesWriter *writer;
9036
    /* allocate enough for a simple encoding without
9037
       replacements, if we need more, we'll resize */
9038
0
    writer = PyBytesWriter_Create(size);
9039
0
    if (writer == NULL) {
9040
0
        /* NOTE(review): onError passes this NULL writer to
           PyBytesWriter_Discard() - confirm that it accepts NULL. */
        goto onError;
9041
0
    }
9042
9043
    /* current input position */
9044
0
    Py_ssize_t inpos = 0;
9045
    /* current output position */
9046
0
    Py_ssize_t respos = 0;
9047
0
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
9048
9049
0
    if (Py_IS_TYPE(mapping, &EncodingMapType)) {
9050
0
        /* outstart/outsize cache the writer's buffer and size; they are
           refreshed after every call that may resize the writer. */
        char *outstart = _PyBytesWriter_GetData(writer);
9051
0
        Py_ssize_t outsize = _PyBytesWriter_GetSize(writer);
9052
9053
0
        while (inpos<size) {
9054
0
            Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
9055
9056
            /* try to encode it */
9057
0
            int res = encoding_map_lookup(ch, mapping);
9058
0
            Py_ssize_t requiredsize = respos+1;
9059
0
            if (res == -1) {
9060
0
                goto enc_FAILED;
9061
0
            }
9062
9063
0
            if (outsize<requiredsize) {
9064
0
                if (charmapencode_resize(writer, &respos, requiredsize)) {
9065
0
                    goto onError;
9066
0
                }
9067
0
                outstart = _PyBytesWriter_GetData(writer);
9068
0
                outsize = _PyBytesWriter_GetSize(writer);
9069
0
            }
9070
0
            outstart[respos++] = (char)res;
9071
9072
            /* done with this character => adjust input position */
9073
0
            ++inpos;
9074
0
            continue;
9075
9076
0
            /* Label inside the loop body, reached only through the goto
               above; the error helper advances inpos itself. */
enc_FAILED:
9077
0
            if (charmap_encoding_error(unicode, &inpos, mapping,
9078
0
                                       &exc,
9079
0
                                       &error_handler, &error_handler_obj, errors,
9080
0
                                       writer, &respos)) {
9081
0
                goto onError;
9082
0
            }
9083
0
            outstart = _PyBytesWriter_GetData(writer);
9084
0
            outsize = _PyBytesWriter_GetSize(writer);
9085
0
        }
9086
0
    }
9087
0
    else {
9088
0
        while (inpos<size) {
9089
0
            Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
9090
            /* try to encode it */
9091
0
            charmapencode_result x = charmapencode_output(ch, mapping, writer, &respos);
9092
0
            if (x==enc_EXCEPTION) { /* error */
9093
0
                goto onError;
9094
0
            }
9095
0
            if (x==enc_FAILED) { /* unencodable character */
9096
0
                if (charmap_encoding_error(unicode, &inpos, mapping,
9097
0
                                           &exc,
9098
0
                                           &error_handler, &error_handler_obj, errors,
9099
0
                                           writer, &respos)) {
9100
0
                    goto onError;
9101
0
                }
9102
0
            }
9103
0
            else {
9104
                /* done with this character => adjust input position */
9105
0
                ++inpos;
9106
0
            }
9107
0
        }
9108
0
    }
9109
9110
0
    Py_XDECREF(exc);
9111
0
    Py_XDECREF(error_handler_obj);
9112
9113
    /* Resize if we allocated too much */
9114
0
    return PyBytesWriter_FinishWithSize(writer, respos);
9115
9116
0
  onError:
9117
0
    PyBytesWriter_Discard(writer);
9118
0
    Py_XDECREF(exc);
9119
0
    Py_XDECREF(error_handler_obj);
9120
0
    return NULL;
9121
0
}
9122
9123
/* Encode `unicode` through `mapping` with errors == NULL by calling
   _PyUnicode_EncodeCharmap().  Unlike that function, a NULL mapping is
   rejected here: a non-str `unicode` or a NULL `mapping` reports
   PyErr_BadArgument() and returns NULL. */
PyObject *
9124
PyUnicode_AsCharmapString(PyObject *unicode,
9125
                          PyObject *mapping)
9126
0
{
9127
0
    if (!PyUnicode_Check(unicode) || mapping == NULL) {
9128
0
        PyErr_BadArgument();
9129
0
        return NULL;
9130
0
    }
9131
0
    return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
9132
0
}
9133
9134
/* create or adjust a UnicodeTranslateError
   If *exceptionObject is NULL a new exception object is created and stored
   there; otherwise the start, end and reason of the existing object are
   updated.  On failure *exceptionObject is NULL when the function returns
   (the existing object is cleared if one of the setters fails). */
9135
static void
9136
make_translate_exception(PyObject **exceptionObject,
9137
                         PyObject *unicode,
9138
                         Py_ssize_t startpos, Py_ssize_t endpos,
9139
                         const char *reason)
9140
0
{
9141
0
    if (*exceptionObject == NULL) {
9142
0
        *exceptionObject = _PyUnicodeTranslateError_Create(
9143
0
            unicode, startpos, endpos, reason);
9144
0
    }
9145
0
    else {
9146
0
        if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
9147
0
            goto onError;
9148
0
        if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
9149
0
            goto onError;
9150
0
        if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
9151
0
            goto onError;
9152
0
        return;
9153
0
      onError:
9154
0
        Py_CLEAR(*exceptionObject);
9155
0
    }
9156
0
}
9157
9158
/* error handling callback helper:
9159
   build arguments, call the callback and check the arguments,
9160
   put the result into newpos and return the replacement string, which
9161
   has to be freed by the caller.
   The callback is looked up from errors on first use and cached in
   *errorHandler.  A negative position returned by the callback is counted
   from the end of unicode; a position outside [0, len(unicode)] raises
   IndexError.  Returns NULL on error. */
9162
static PyObject *
9163
unicode_translate_call_errorhandler(const char *errors,
9164
                                    PyObject **errorHandler,
9165
                                    const char *reason,
9166
                                    PyObject *unicode, PyObject **exceptionObject,
9167
                                    Py_ssize_t startpos, Py_ssize_t endpos,
9168
                                    Py_ssize_t *newpos)
9169
0
{
9170
0
    /* "Un" is the PyArg_ParseTuple format; the text after ';' is the
       error message, which starts at offset 3 of this string. */
    static const char *argparse = "Un;translating error handler must return (str, int) tuple";
9171
9172
0
    Py_ssize_t i_newpos;
9173
0
    PyObject *restuple;
9174
0
    PyObject *resunicode;
9175
9176
0
    if (*errorHandler == NULL) {
9177
0
        *errorHandler = PyCodec_LookupError(errors);
9178
0
        if (*errorHandler == NULL)
9179
0
            return NULL;
9180
0
    }
9181
9182
0
    make_translate_exception(exceptionObject,
9183
0
                             unicode, startpos, endpos, reason);
9184
0
    if (*exceptionObject == NULL)
9185
0
        return NULL;
9186
9187
0
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
9188
0
    if (restuple == NULL)
9189
0
        return NULL;
9190
0
    if (!PyTuple_Check(restuple)) {
9191
0
        /* &argparse[3] skips the "Un;" prefix and yields the message. */
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
9192
0
        Py_DECREF(restuple);
9193
0
        return NULL;
9194
0
    }
9195
0
    if (!PyArg_ParseTuple(restuple, argparse,
9196
0
                          &resunicode, &i_newpos)) {
9197
0
        Py_DECREF(restuple);
9198
0
        return NULL;
9199
0
    }
9200
0
    if (i_newpos<0)
9201
0
        *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
9202
0
    else
9203
0
        *newpos = i_newpos;
9204
0
    if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
9205
0
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
9206
0
        Py_DECREF(restuple);
9207
0
        return NULL;
9208
0
    }
9209
0
    /* Take our own reference to resunicode before the tuple is released. */
    Py_INCREF(resunicode);
9210
0
    Py_DECREF(restuple);
9211
0
    return resunicode;
9212
0
}
9213
9214
/* Lookup the character c in the mapping and put the result in *result,
9215
   which must be decrefed by the caller (when it is not NULL).
9216
   The result can be PyLong, PyUnicode, None or NULL; NULL means that no
   mapping was found (missing key or LookupError) and the caller should
   use the 1:1 mapping.
9217
   If the result is PyLong, its value must be in [0, MAX_UNICODE] and is
   put in *replace.
9218
   Return 0 on success, -1 on error (including an out-of-range int and a
   result of any other type) */
9219
static int
9220
charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result, Py_UCS4 *replace)
9221
282
{
9222
282
    PyObject *w = PyLong_FromLong((long)c);
9223
282
    PyObject *x;
9224
9225
282
    if (w == NULL)
9226
0
        return -1;
9227
282
    int rc = PyMapping_GetOptionalItem(mapping, w, &x);
9228
282
    Py_DECREF(w);
9229
282
    if (rc == 0) {
9230
        /* No mapping found means: use 1:1 mapping. */
9231
126
        *result = NULL;
9232
126
        return 0;
9233
126
    }
9234
156
    if (x == NULL) {
9235
0
        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
9236
            /* No mapping found means: use 1:1 mapping. */
9237
0
            PyErr_Clear();
9238
0
            *result = NULL;
9239
0
            return 0;
9240
0
        } else
9241
0
            return -1;
9242
0
    }
9243
156
    else if (x == Py_None) {
9244
0
        *result = x;
9245
0
        return 0;
9246
0
    }
9247
156
    else if (PyLong_Check(x)) {
9248
0
        long value = PyLong_AsLong(x);
9249
0
        if (value < 0 || value > MAX_UNICODE) {
9250
0
            PyErr_Format(PyExc_ValueError,
9251
0
                         "character mapping must be in range(0x%x)",
9252
0
                         MAX_UNICODE+1);
9253
0
            Py_DECREF(x);
9254
0
            return -1;
9255
0
        }
9256
0
        *result = x;
9257
0
        *replace = (Py_UCS4)value;
9258
0
        return 0;
9259
0
    }
9260
156
    else if (PyUnicode_Check(x)) {
9261
156
        *result = x;
9262
156
        return 0;
9263
156
    }
9264
0
    else {
9265
        /* wrong return value */
9266
0
        PyErr_SetString(PyExc_TypeError,
9267
0
                        "character mapping must return integer, None or str");
9268
0
        Py_DECREF(x);
9269
0
        return -1;
9270
0
    }
9271
156
}
9272
9273
/* Look up the character ch in mapping and write its translation into the
9274
   writer.  Return 1 if something was written into the writer (ch itself
   when no mapping was found, the mapped character, or the mapped str),
9275
   return 0 if the mapping returned None (nothing is written), and
   return -1 on error. */
9276
static int
9277
charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
9278
                        _PyUnicodeWriter *writer)
9279
108
{
9280
108
    PyObject *item;
9281
108
    Py_UCS4 replace;
9282
9283
108
    if (charmaptranslate_lookup(ch, mapping, &item, &replace))
9284
0
        return -1;
9285
9286
108
    if (item == NULL) {
9287
        /* not found => default to 1:1 mapping */
9288
24
        if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
9289
0
            return -1;
9290
0
        }
9291
24
        return 1;
9292
24
    }
9293
9294
84
    if (item == Py_None) {
9295
0
        Py_DECREF(item);
9296
0
        return 0;
9297
0
    }
9298
9299
84
    if (PyLong_Check(item)) {
9300
0
        /* `replace` holds the int's value, set by the lookup. */
        if (_PyUnicodeWriter_WriteCharInline(writer, replace) < 0) {
9301
0
            Py_DECREF(item);
9302
0
            return -1;
9303
0
        }
9304
0
        Py_DECREF(item);
9305
0
        return 1;
9306
0
    }
9307
9308
84
    /* NOTE(review): this branch returns -1 without setting an exception
       here; charmaptranslate_lookup() already rejects every type other
       than int, str and None, so it looks unreachable - confirm. */
    if (!PyUnicode_Check(item)) {
9309
0
        Py_DECREF(item);
9310
0
        return -1;
9311
0
    }
9312
9313
84
    if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
9314
0
        Py_DECREF(item);
9315
0
        return -1;
9316
0
    }
9317
9318
84
    Py_DECREF(item);
9319
84
    return 1;
9320
84
}
9321
9322
/* Resolve the translation of the ASCII character ch for the fast path and
   cache it in translate[ch]: the replacement character itself, or 0xfe if
   the mapping deletes ch.
   Return 1 if translate[ch] was filled in, 0 if the mapping's result cannot
   be handled by the fast path (a non-ASCII or invalid replacement, a str
   whose length is not 1, or an unsupported type; translate[ch] is then
   left untouched), and -1 on error. */
static int
9323
unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
9324
                              Py_UCS1 *translate)
9325
174
{
9326
174
    PyObject *item = NULL;
9327
174
    Py_UCS4 replace;
9328
174
    int ret = 0;
9329
9330
174
    if (charmaptranslate_lookup(ch, mapping, &item, &replace)) {
9331
0
        return -1;
9332
0
    }
9333
9334
174
    if (item == Py_None) {
9335
        /* deletion */
9336
0
        translate[ch] = 0xfe;
9337
0
    }
9338
174
    else if (item == NULL) {
9339
        /* not found => default to 1:1 mapping */
9340
102
        translate[ch] = ch;
9341
102
        /* Returns directly: there is no item reference to release. */
        return 1;
9342
102
    }
9343
72
    else if (PyLong_Check(item)) {
9344
0
        if (replace > 127) {
9345
            /* invalid character or character outside ASCII:
9346
               skip the fast translate */
9347
0
            goto exit;
9348
0
        }
9349
0
        translate[ch] = (Py_UCS1)replace;
9350
0
    }
9351
72
    else if (PyUnicode_Check(item)) {
9352
72
        if (PyUnicode_GET_LENGTH(item) != 1)
9353
72
            goto exit;
9354
9355
0
        replace = PyUnicode_READ_CHAR(item, 0);
9356
0
        if (replace > 127)
9357
0
            goto exit;
9358
0
        translate[ch] = (Py_UCS1)replace;
9359
0
    }
9360
0
    else {
9361
        /* not None, NULL, long or unicode */
9362
0
        goto exit;
9363
0
    }
9364
0
    ret = 1;
9365
9366
72
  exit:
9367
72
    Py_DECREF(item);
9368
72
    return ret;
9369
0
}
9370
9371
/* Fast path for ascii => ascii translation. Return 1 if the whole string
9372
   was translated into writer, return 0 if the input string was partially
9373
   translated into writer, raise an exception and return -1 on error.
   When 1 or 0 is returned, writer->pos is set to the number of characters
   written and *input_pos to the number of input characters consumed, so
   the caller can continue from there; neither is updated on the -1 path.
   If ignore is false, a character that the mapping deletes also stops the
   fast path with 0. */
9374
static int
9375
unicode_fast_translate(PyObject *input, PyObject *mapping,
9376
                       _PyUnicodeWriter *writer, int ignore,
9377
                       Py_ssize_t *input_pos)
9378
144
{
9379
144
    /* Per-call cache of translations: 0xff = not looked up yet,
       0xfe = deleted by the mapping, anything else = replacement. */
    Py_UCS1 ascii_table[128], ch, ch2;
9380
144
    Py_ssize_t len;
9381
144
    const Py_UCS1 *in, *end;
9382
144
    Py_UCS1 *out;
9383
144
    int res = 0;
9384
9385
144
    len = PyUnicode_GET_LENGTH(input);
9386
9387
144
    memset(ascii_table, 0xff, 128);
9388
9389
144
    in = PyUnicode_1BYTE_DATA(input);
9390
144
    end = in + len;
9391
9392
144
    assert(PyUnicode_IS_ASCII(writer->buffer));
9393
144
    assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9394
144
    out = PyUnicode_1BYTE_DATA(writer->buffer);
9395
9396
267
    for (; in < end; in++) {
9397
195
        ch = *in;
9398
195
        ch2 = ascii_table[ch];
9399
195
        if (ch2 == 0xff) {
9400
174
            /* First occurrence of ch: ask the mapping and cache the answer. */
            int translate = unicode_fast_translate_lookup(mapping, ch,
9401
174
                                                          ascii_table);
9402
174
            if (translate < 0)
9403
0
                return -1;
9404
174
            if (translate == 0)
9405
72
                goto exit;
9406
102
            ch2 = ascii_table[ch];
9407
102
        }
9408
123
        if (ch2 == 0xfe) {
9409
0
            if (ignore)
9410
0
                continue;
9411
0
            goto exit;
9412
0
        }
9413
123
        assert(ch2 < 128);
9414
123
        *out = ch2;
9415
123
        out++;
9416
123
    }
9417
72
    res = 1;
9418
9419
144
exit:
9420
144
    writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
9421
144
    *input_pos = in - PyUnicode_1BYTE_DATA(input);
9422
144
    return res;
9423
72
}
9424
9425
/* Translate `input` through `mapping` and return the result as a new
   string, or NULL with an exception set.

   ASCII input is first handed to unicode_fast_translate(); whatever that
   leaves untranslated is processed one character at a time with
   charmaptranslate_output().  A run of characters that cannot be
   translated is either skipped (errors == "ignore") or passed to the
   error handler selected by `errors`, whose replacement is appended to the
   output. */
static PyObject *
_PyUnicode_TranslateCharmap(PyObject *input,
                            PyObject *mapping,
                            const char *errors)
{
    /* error handling state */
    const char *reason = "character maps to <undefined>";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    /* output buffer */
    _PyUnicodeWriter writer;
    /* input object */
    const void *data;
    Py_ssize_t size;
    Py_ssize_t pos = 0;
    int kind;
    int ignore;

    if (mapping == NULL) {
        PyErr_BadArgument();
        return NULL;
    }

    data = PyUnicode_DATA(input);
    kind = PyUnicode_KIND(input);
    size = PyUnicode_GET_LENGTH(input);
    if (size == 0) {
        return PyUnicode_FromObject(input);
    }

    /* Reserve room for a plain 1:1 translation; the writer grows on
       demand if replacements need more. */
    _PyUnicodeWriter_Init(&writer);
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1) {
        goto onError;
    }

    ignore = (errors != NULL && strcmp(errors, "ignore") == 0);

    if (PyUnicode_IS_ASCII(input)) {
        int status = unicode_fast_translate(input, mapping, &writer,
                                            ignore, &pos);
        if (status < 0) {
            /* exc and errorHandler are still NULL at this point */
            goto onError;
        }
        if (status == 1) {
            return _PyUnicodeWriter_Finish(&writer);
        }
        /* status == 0: continue below from pos */
    }

    while (pos < size) {
        Py_ssize_t collstart;   /* first untranslatable character */
        Py_ssize_t collend;     /* one past the last untranslatable one */
        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        int translated = charmaptranslate_output(ch, mapping, &writer);

        if (translated < 0) {
            goto onError;
        }
        if (translated != 0) {
            /* it worked => move on to the next character */
            ++pos;
            continue;
        }

        /* Collect the whole run of untranslatable characters. */
        collstart = pos;
        for (collend = pos + 1; collend < size; ++collend) {
            PyObject *x;
            Py_UCS4 replace;

            ch = PyUnicode_READ(kind, data, collend);
            if (charmaptranslate_lookup(ch, mapping, &x, &replace)) {
                goto onError;
            }
            Py_XDECREF(x);
            /* Only the pointer value is compared after the decref. */
            if (x != Py_None) {
                break;
            }
        }

        if (ignore) {
            pos = collend;
            continue;
        }

        {
            Py_ssize_t newpos;
            int status;
            PyObject *repunicode = unicode_translate_call_errorhandler(
                errors, &errorHandler, reason, input, &exc,
                collstart, collend, &newpos);

            if (repunicode == NULL) {
                goto onError;
            }
            status = _PyUnicodeWriter_WriteStr(&writer, repunicode);
            Py_DECREF(repunicode);
            if (status < 0) {
                goto onError;
            }
            pos = newpos;
        }
    }

    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return NULL;
}
9541
9542
/* Public entry point: check that `str` passes ensure_unicode(), then
   delegate to _PyUnicode_TranslateCharmap().  Returns NULL when the check
   fails or the translation fails. */
PyObject *
PyUnicode_Translate(PyObject *str,
                    PyObject *mapping,
                    const char *errors)
{
    if (ensure_unicode(str) >= 0) {
        return _PyUnicode_TranslateCharmap(str, mapping, errors);
    }
    return NULL;
}
9551
9552
/* Build an ASCII string from `unicode`.

   An ASCII input is returned unchanged (new reference).  Otherwise a new
   string of the same length is filled character by character:
     - code points below 127 are copied,
     - code points for which Py_UNICODE_ISSPACE() is true become ' ',
     - code points with a decimal value become the digit '0'..'9',
     - the first other code point becomes '?', and the result is truncated
       right after it.

   Returns NULL with an exception set if `unicode` is not a str or the
   result cannot be allocated. */
PyObject *
_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadInternalCall();
        return NULL;
    }
    if (PyUnicode_IS_ASCII(unicode)) {
        /* Already ASCII: hand back the very same object. */
        return Py_NewRef(unicode);
    }

    const Py_ssize_t length = PyUnicode_GET_LENGTH(unicode);
    PyObject *ascii = PyUnicode_New(length, 127);
    if (ascii == NULL) {
        return NULL;
    }

    Py_UCS1 *dst = PyUnicode_1BYTE_DATA(ascii);
    const int kind = PyUnicode_KIND(unicode);
    const void *src = PyUnicode_DATA(unicode);

    for (Py_ssize_t pos = 0; pos < length; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, src, pos);

        /* NOTE(review): the bound is 127, not 128, so U+007F itself is
           classified by the branches below rather than copied. */
        if (ch < 127) {
            dst[pos] = ch;
            continue;
        }
        if (Py_UNICODE_ISSPACE(ch)) {
            dst[pos] = ' ';
            continue;
        }

        const int digit = Py_UNICODE_TODECIMAL(ch);
        if (digit >= 0) {
            dst[pos] = '0' + digit;
            continue;
        }

        /* Neither space nor decimal: mark the spot and cut the result. */
        dst[pos] = '?';
        dst[pos + 1] = '\0';
        _PyUnicode_LENGTH(ascii) = pos + 1;
        break;
    }

    assert(_PyUnicode_CheckConsistency(ascii, 1));
    return ascii;
}
9597
9598
/* --- Helpers ------------------------------------------------------------ */
9599
9600
/* Helper macro to fix up start/end slice values in place.

   `end` is clamped to `len` when too large; a negative `end` or `start`
   is taken relative to `len` and then clamped to 0.

   `start` and `end` must be modifiable lvalues.  All three arguments are
   expanded more than once, so they must be free of side effects.  Every
   use is parenthesized so that an argument containing a low-precedence
   operator (for example `cond ? a : b`) is still compared, assigned and
   added as a whole. */
#define ADJUST_INDICES(start, end, len) \
    do {                                \
        if ((end) > (len)) {            \
            (end) = (len);              \
        }                               \
        else if ((end) < 0) {           \
            (end) += (len);             \
            if ((end) < 0) {            \
                (end) = 0;              \
            }                           \
        }                               \
        if ((start) < 0) {              \
            (start) += (len);           \
            if ((start) < 0) {          \
                (start) = 0;            \
            }                           \
        }                               \
    } while (0)
9619
9620
static Py_ssize_t
9621
any_find_slice(PyObject* s1, PyObject* s2,
9622
               Py_ssize_t start,
9623
               Py_ssize_t end,
9624
               int direction)
9625
12.2k
{
9626
12.2k
    int kind1, kind2;
9627
12.2k
    const void *buf1, *buf2;
9628
12.2k
    Py_ssize_t len1, len2, result;
9629
9630
12.2k
    kind1 = PyUnicode_KIND(s1);
9631
12.2k
    kind2 = PyUnicode_KIND(s2);
9632
12.2k
    if (kind1 < kind2)
9633
0
        return -1;
9634
9635
12.2k
    len1 = PyUnicode_GET_LENGTH(s1);
9636
12.2k
    len2 = PyUnicode_GET_LENGTH(s2);
9637
12.2k
    ADJUST_INDICES(start, end, len1);
9638
12.2k
    if (end - start < len2)
9639
1.21k
        return -1;
9640
9641
11.0k
    buf1 = PyUnicode_DATA(s1);
9642
11.0k
    buf2 = PyUnicode_DATA(s2);
9643
11.0k
    if (len2 == 1) {
9644
11.0k
        Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9645
11.0k
        result = findchar((const char *)buf1 + kind1*start,
9646
11.0k
                          kind1, end - start, ch, direction);
9647
11.0k
        if (result == -1)
9648
9.34k
            return -1;
9649
1.65k
        else
9650
1.65k
            return start + result;
9651
11.0k
    }
9652
9653
0
    if (kind2 != kind1) {
9654
0
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
9655
0
        if (!buf2)
9656
0
            return -2;
9657
0
    }
9658
9659
0
    if (direction > 0) {
9660
0
        switch (kind1) {
9661
0
        case PyUnicode_1BYTE_KIND:
9662
0
            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9663
0
                result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9664
0
            else
9665
0
                result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9666
0
            break;
9667
0
        case PyUnicode_2BYTE_KIND:
9668
0
            result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9669
0
            break;
9670
0
        case PyUnicode_4BYTE_KIND:
9671
0
            result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9672
0
            break;
9673
0
        default:
9674
0
            Py_UNREACHABLE();
9675
0
        }
9676
0
    }
9677
0
    else {
9678
0
        switch (kind1) {
9679
0
        case PyUnicode_1BYTE_KIND:
9680
0
            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9681
0
                result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9682
0
            else
9683
0
                result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9684
0
            break;
9685
0
        case PyUnicode_2BYTE_KIND:
9686
0
            result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9687
0
            break;
9688
0
        case PyUnicode_4BYTE_KIND:
9689
0
            result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9690
0
            break;
9691
0
        default:
9692
0
            Py_UNREACHABLE();
9693
0
        }
9694
0
    }
9695
9696
0
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
9697
0
    if (kind2 != kind1)
9698
0
        PyMem_Free((void *)buf2);
9699
9700
0
    return result;
9701
0
}
9702
9703
9704
Py_ssize_t
9705
PyUnicode_Count(PyObject *str,
9706
                PyObject *substr,
9707
                Py_ssize_t start,
9708
                Py_ssize_t end)
9709
0
{
9710
0
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9711
0
        return -1;
9712
9713
0
    return unicode_count_impl(str, substr, start, end);
9714
0
}
9715
9716
Py_ssize_t
9717
PyUnicode_Find(PyObject *str,
9718
               PyObject *substr,
9719
               Py_ssize_t start,
9720
               Py_ssize_t end,
9721
               int direction)
9722
5.07k
{
9723
5.07k
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9724
0
        return -2;
9725
9726
5.07k
    return any_find_slice(str, substr, start, end, direction);
9727
5.07k
}
9728
9729
Py_ssize_t
9730
PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9731
                   Py_ssize_t start, Py_ssize_t end,
9732
                   int direction)
9733
3.00M
{
9734
3.00M
    int kind;
9735
3.00M
    Py_ssize_t len, result;
9736
3.00M
    len = PyUnicode_GET_LENGTH(str);
9737
3.00M
    ADJUST_INDICES(start, end, len);
9738
3.00M
    if (end - start < 1)
9739
0
        return -1;
9740
3.00M
    kind = PyUnicode_KIND(str);
9741
3.00M
    result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9742
3.00M
                      kind, end-start, ch, direction);
9743
3.00M
    if (result == -1)
9744
2.74M
        return -1;
9745
265k
    else
9746
265k
        return start + result;
9747
3.00M
}
9748
9749
static int
9750
tailmatch(PyObject *self,
9751
          PyObject *substring,
9752
          Py_ssize_t start,
9753
          Py_ssize_t end,
9754
          int direction)
9755
736k
{
9756
736k
    int kind_self;
9757
736k
    int kind_sub;
9758
736k
    const void *data_self;
9759
736k
    const void *data_sub;
9760
736k
    Py_ssize_t offset;
9761
736k
    Py_ssize_t i;
9762
736k
    Py_ssize_t end_sub;
9763
9764
736k
    ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9765
736k
    end -= PyUnicode_GET_LENGTH(substring);
9766
736k
    if (end < start)
9767
202k
        return 0;
9768
9769
534k
    if (PyUnicode_GET_LENGTH(substring) == 0)
9770
0
        return 1;
9771
9772
534k
    kind_self = PyUnicode_KIND(self);
9773
534k
    data_self = PyUnicode_DATA(self);
9774
534k
    kind_sub = PyUnicode_KIND(substring);
9775
534k
    data_sub = PyUnicode_DATA(substring);
9776
534k
    end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9777
9778
534k
    if (direction > 0)
9779
239k
        offset = end;
9780
294k
    else
9781
294k
        offset = start;
9782
9783
534k
    if (PyUnicode_READ(kind_self, data_self, offset) ==
9784
534k
        PyUnicode_READ(kind_sub, data_sub, 0) &&
9785
519k
        PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9786
519k
        PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9787
        /* If both are of the same kind, memcmp is sufficient */
9788
479k
        if (kind_self == kind_sub) {
9789
478k
            return ! memcmp((char *)data_self +
9790
478k
                                (offset * PyUnicode_KIND(substring)),
9791
0
                            data_sub,
9792
478k
                            PyUnicode_GET_LENGTH(substring) *
9793
478k
                                PyUnicode_KIND(substring));
9794
478k
        }
9795
        /* otherwise we have to compare each character by first accessing it */
9796
1.11k
        else {
9797
            /* We do not need to compare 0 and len(substring)-1 because
9798
               the if statement above ensured already that they are equal
9799
               when we end up here. */
9800
1.94k
            for (i = 1; i < end_sub; ++i) {
9801
1.94k
                if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9802
1.94k
                    PyUnicode_READ(kind_sub, data_sub, i))
9803
1.11k
                    return 0;
9804
1.94k
            }
9805
1
            return 1;
9806
1.11k
        }
9807
479k
    }
9808
9809
54.6k
    return 0;
9810
534k
}
9811
9812
Py_ssize_t
9813
PyUnicode_Tailmatch(PyObject *str,
9814
                    PyObject *substr,
9815
                    Py_ssize_t start,
9816
                    Py_ssize_t end,
9817
                    int direction)
9818
0
{
9819
0
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9820
0
        return -1;
9821
9822
0
    return tailmatch(str, substr, start, end, direction);
9823
0
}
9824
9825
/* Return a new string (allocated with max char 127) holding the data of
   `self` converted with _Py_bytes_lower() when `lower` is non-zero and
   with _Py_bytes_upper() otherwise.  Returns NULL if the allocation
   fails. */
static PyObject *
ascii_upper_or_lower(PyObject *self, int lower)
{
    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
    const char *src = PyUnicode_DATA(self);
    PyObject *converted = PyUnicode_New(len, 127);

    if (converted == NULL) {
        return NULL;
    }

    char *dst = PyUnicode_DATA(converted);
    if (lower) {
        _Py_bytes_lower(dst, src, len);
    }
    else {
        _Py_bytes_upper(dst, src, len);
    }
    return converted;
}
9843
9844
static Py_UCS4
9845
handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
9846
75
{
9847
75
    Py_ssize_t j;
9848
75
    int final_sigma;
9849
75
    Py_UCS4 c = 0;   /* initialize to prevent gcc warning */
9850
    /* U+03A3 is in the Final_Sigma context when, it is found like this:
9851
9852
     \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9853
9854
    where ! is a negation and \p{xxx} is a character with property xxx.
9855
    */
9856
75
    for (j = i - 1; j >= 0; j--) {
9857
0
        c = PyUnicode_READ(kind, data, j);
9858
0
        if (!_PyUnicode_IsCaseIgnorable(c))
9859
0
            break;
9860
0
    }
9861
75
    final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9862
75
    if (final_sigma) {
9863
0
        for (j = i + 1; j < length; j++) {
9864
0
            c = PyUnicode_READ(kind, data, j);
9865
0
            if (!_PyUnicode_IsCaseIgnorable(c))
9866
0
                break;
9867
0
        }
9868
0
        final_sigma = j == length || !_PyUnicode_IsCased(c);
9869
0
    }
9870
75
    return (final_sigma) ? 0x3C2 : 0x3C3;
9871
75
}
9872
9873
static int
9874
lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
9875
           Py_UCS4 c, Py_UCS4 *mapped)
9876
775k
{
9877
    /* Obscure special case. */
9878
775k
    if (c == 0x3A3) {
9879
75
        mapped[0] = handle_capital_sigma(kind, data, length, i);
9880
75
        return 1;
9881
75
    }
9882
775k
    return _PyUnicode_ToLowerFull(c, mapped);
9883
775k
}
9884
9885
static Py_ssize_t
9886
do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9887
0
{
9888
0
    Py_ssize_t i, k = 0;
9889
0
    int n_res, j;
9890
0
    Py_UCS4 c, mapped[3];
9891
9892
0
    c = PyUnicode_READ(kind, data, 0);
9893
0
    n_res = _PyUnicode_ToTitleFull(c, mapped);
9894
0
    for (j = 0; j < n_res; j++) {
9895
0
        *maxchar = Py_MAX(*maxchar, mapped[j]);
9896
0
        res[k++] = mapped[j];
9897
0
    }
9898
0
    for (i = 1; i < length; i++) {
9899
0
        c = PyUnicode_READ(kind, data, i);
9900
0
        n_res = lower_ucs4(kind, data, length, i, c, mapped);
9901
0
        for (j = 0; j < n_res; j++) {
9902
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9903
0
            res[k++] = mapped[j];
9904
0
        }
9905
0
    }
9906
0
    return k;
9907
0
}
9908
9909
static Py_ssize_t
9910
0
do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9911
0
    Py_ssize_t i, k = 0;
9912
9913
0
    for (i = 0; i < length; i++) {
9914
0
        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9915
0
        int n_res, j;
9916
0
        if (Py_UNICODE_ISUPPER(c)) {
9917
0
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9918
0
        }
9919
0
        else if (Py_UNICODE_ISLOWER(c)) {
9920
0
            n_res = _PyUnicode_ToUpperFull(c, mapped);
9921
0
        }
9922
0
        else {
9923
0
            n_res = 1;
9924
0
            mapped[0] = c;
9925
0
        }
9926
0
        for (j = 0; j < n_res; j++) {
9927
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9928
0
            res[k++] = mapped[j];
9929
0
        }
9930
0
    }
9931
0
    return k;
9932
0
}
9933
9934
static Py_ssize_t
9935
do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
9936
                  Py_UCS4 *maxchar, int lower)
9937
498k
{
9938
498k
    Py_ssize_t i, k = 0;
9939
9940
1.27M
    for (i = 0; i < length; i++) {
9941
775k
        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9942
775k
        int n_res, j;
9943
775k
        if (lower)
9944
775k
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9945
0
        else
9946
0
            n_res = _PyUnicode_ToUpperFull(c, mapped);
9947
1.55M
        for (j = 0; j < n_res; j++) {
9948
775k
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9949
775k
            res[k++] = mapped[j];
9950
775k
        }
9951
775k
    }
9952
498k
    return k;
9953
498k
}
9954
9955
static Py_ssize_t
9956
do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9957
0
{
9958
0
    return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9959
0
}
9960
9961
static Py_ssize_t
9962
do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9963
498k
{
9964
498k
    return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9965
498k
}
9966
9967
static Py_ssize_t
9968
do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9969
0
{
9970
0
    Py_ssize_t i, k = 0;
9971
9972
0
    for (i = 0; i < length; i++) {
9973
0
        Py_UCS4 c = PyUnicode_READ(kind, data, i);
9974
0
        Py_UCS4 mapped[3];
9975
0
        int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9976
0
        for (j = 0; j < n_res; j++) {
9977
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9978
0
            res[k++] = mapped[j];
9979
0
        }
9980
0
    }
9981
0
    return k;
9982
0
}
9983
9984
static Py_ssize_t
9985
do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9986
0
{
9987
0
    Py_ssize_t i, k = 0;
9988
0
    int previous_is_cased;
9989
9990
0
    previous_is_cased = 0;
9991
0
    for (i = 0; i < length; i++) {
9992
0
        const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9993
0
        Py_UCS4 mapped[3];
9994
0
        int n_res, j;
9995
9996
0
        if (previous_is_cased)
9997
0
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9998
0
        else
9999
0
            n_res = _PyUnicode_ToTitleFull(c, mapped);
10000
10001
0
        for (j = 0; j < n_res; j++) {
10002
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
10003
0
            res[k++] = mapped[j];
10004
0
        }
10005
10006
0
        previous_is_cased = _PyUnicode_IsCased(c);
10007
0
    }
10008
0
    return k;
10009
0
}
10010
10011
static PyObject *
10012
case_operation(PyObject *self,
10013
               Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
10014
498k
{
10015
498k
    PyObject *res = NULL;
10016
498k
    Py_ssize_t length, newlength = 0;
10017
498k
    int kind, outkind;
10018
498k
    const void *data;
10019
498k
    void *outdata;
10020
498k
    Py_UCS4 maxchar = 0, *tmp, *tmpend;
10021
10022
498k
    kind = PyUnicode_KIND(self);
10023
498k
    data = PyUnicode_DATA(self);
10024
498k
    length = PyUnicode_GET_LENGTH(self);
10025
498k
    if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
10026
0
        PyErr_SetString(PyExc_OverflowError, "string is too long");
10027
0
        return NULL;
10028
0
    }
10029
498k
    tmp = PyMem_Malloc(sizeof(Py_UCS4) * 3 * length);
10030
498k
    if (tmp == NULL)
10031
0
        return PyErr_NoMemory();
10032
498k
    newlength = perform(kind, data, length, tmp, &maxchar);
10033
498k
    res = PyUnicode_New(newlength, maxchar);
10034
498k
    if (res == NULL)
10035
0
        goto leave;
10036
498k
    tmpend = tmp + newlength;
10037
498k
    outdata = PyUnicode_DATA(res);
10038
498k
    outkind = PyUnicode_KIND(res);
10039
0
    switch (outkind) {
10040
35.6k
    case PyUnicode_1BYTE_KIND:
10041
35.6k
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
10042
35.6k
        break;
10043
441k
    case PyUnicode_2BYTE_KIND:
10044
441k
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
10045
441k
        break;
10046
21.5k
    case PyUnicode_4BYTE_KIND:
10047
21.5k
        memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
10048
21.5k
        break;
10049
0
    default:
10050
0
        Py_UNREACHABLE();
10051
498k
    }
10052
498k
  leave:
10053
498k
    PyMem_Free(tmp);
10054
498k
    return res;
10055
498k
}
10056
10057
/* Join the items of `seq` with `separator`.

   `seq` is first converted with PySequence_Fast(); its item array is then
   passed to _PyUnicode_JoinArray() inside the critical section opened on
   `seq`.  Returns the joined string, or NULL if the conversion or the
   join fails. */
PyObject *
PyUnicode_Join(PyObject *separator, PyObject *seq)
{
    PyObject *joined;
    PyObject *fast = PySequence_Fast(seq, "can only join an iterable");

    if (fast == NULL) {
        return NULL;
    }

    Py_BEGIN_CRITICAL_SECTION_SEQUENCE_FAST(seq);

    joined = _PyUnicode_JoinArray(separator,
                                  PySequence_Fast_ITEMS(fast),
                                  PySequence_Fast_GET_SIZE(fast));

    Py_END_CRITICAL_SECTION_SEQUENCE_FAST();

    Py_DECREF(fast);
    return joined;
}
10081
10082
PyObject *
10083
_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
10084
610k
{
10085
610k
    PyObject *res = NULL; /* the result */
10086
610k
    PyObject *sep = NULL;
10087
610k
    Py_ssize_t seplen;
10088
610k
    PyObject *item;
10089
610k
    Py_ssize_t sz, i, res_offset;
10090
610k
    Py_UCS4 maxchar;
10091
610k
    Py_UCS4 item_maxchar;
10092
610k
    int use_memcpy;
10093
610k
    unsigned char *res_data = NULL, *sep_data = NULL;
10094
610k
    PyObject *last_obj;
10095
610k
    int kind = 0;
10096
10097
    /* If empty sequence, return u"". */
10098
610k
    if (seqlen == 0) {
10099
5
        _Py_RETURN_UNICODE_EMPTY();
10100
5
    }
10101
10102
    /* If singleton sequence with an exact Unicode, return that. */
10103
610k
    last_obj = NULL;
10104
610k
    if (seqlen == 1) {
10105
252k
        if (PyUnicode_CheckExact(items[0])) {
10106
252k
            res = items[0];
10107
252k
            return Py_NewRef(res);
10108
252k
        }
10109
0
        seplen = 0;
10110
0
        maxchar = 0;
10111
0
    }
10112
358k
    else {
10113
        /* Set up sep and seplen */
10114
358k
        if (separator == NULL) {
10115
            /* fall back to a blank space separator */
10116
0
            sep = PyUnicode_FromOrdinal(' ');
10117
0
            if (!sep)
10118
0
                goto onError;
10119
0
            seplen = 1;
10120
0
            maxchar = 32;
10121
0
        }
10122
358k
        else {
10123
358k
            if (!PyUnicode_Check(separator)) {
10124
0
                PyErr_Format(PyExc_TypeError,
10125
0
                             "separator: expected str instance,"
10126
0
                             " %.80s found",
10127
0
                             Py_TYPE(separator)->tp_name);
10128
0
                goto onError;
10129
0
            }
10130
358k
            sep = separator;
10131
358k
            seplen = PyUnicode_GET_LENGTH(separator);
10132
358k
            maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10133
            /* inc refcount to keep this code path symmetric with the
10134
               above case of a blank separator */
10135
358k
            Py_INCREF(sep);
10136
358k
        }
10137
358k
        last_obj = sep;
10138
358k
    }
10139
10140
    /* There are at least two things to join, or else we have a subclass
10141
     * of str in the sequence.
10142
     * Do a pre-pass to figure out the total amount of space we'll
10143
     * need (sz), and see whether all argument are strings.
10144
     */
10145
358k
    sz = 0;
10146
#ifdef Py_DEBUG
10147
    use_memcpy = 0;
10148
#else
10149
358k
    use_memcpy = 1;
10150
358k
#endif
10151
45.3M
    for (i = 0; i < seqlen; i++) {
10152
44.9M
        size_t add_sz;
10153
44.9M
        item = items[i];
10154
44.9M
        if (!PyUnicode_Check(item)) {
10155
0
            PyErr_Format(PyExc_TypeError,
10156
0
                         "sequence item %zd: expected str instance,"
10157
0
                         " %.80s found",
10158
0
                         i, Py_TYPE(item)->tp_name);
10159
0
            goto onError;
10160
0
        }
10161
44.9M
        add_sz = PyUnicode_GET_LENGTH(item);
10162
44.9M
        item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
10163
44.9M
        maxchar = Py_MAX(maxchar, item_maxchar);
10164
44.9M
        if (i != 0) {
10165
44.5M
            add_sz += seplen;
10166
44.5M
        }
10167
44.9M
        if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
10168
0
            PyErr_SetString(PyExc_OverflowError,
10169
0
                            "join() result is too long for a Python string");
10170
0
            goto onError;
10171
0
        }
10172
44.9M
        sz += add_sz;
10173
44.9M
        if (use_memcpy && last_obj != NULL) {
10174
30.9M
            if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10175
62.1k
                use_memcpy = 0;
10176
15.4M
        }
10177
0
        last_obj = item;
10178
44.9M
    }
10179
10180
358k
    res = PyUnicode_New(sz, maxchar);
10181
358k
    if (res == NULL)
10182
0
        goto onError;
10183
10184
    /* Catenate everything. */
10185
#ifdef Py_DEBUG
10186
    use_memcpy = 0;
10187
#else
10188
358k
    if (use_memcpy) {
10189
296k
        res_data = PyUnicode_1BYTE_DATA(res);
10190
296k
        kind = PyUnicode_KIND(res);
10191
296k
        if (seplen != 0)
10192
5.12k
            sep_data = PyUnicode_1BYTE_DATA(sep);
10193
296k
    }
10194
358k
#endif
10195
358k
    if (use_memcpy) {
10196
15.3M
        for (i = 0; i < seqlen; ++i) {
10197
15.0M
            Py_ssize_t itemlen;
10198
15.0M
            item = items[i];
10199
10200
            /* Copy item, and maybe the separator. */
10201
15.0M
            if (i && seplen != 0) {
10202
10.2k
                memcpy(res_data,
10203
10.2k
                          sep_data,
10204
10.2k
                          kind * seplen);
10205
10.2k
                res_data += kind * seplen;
10206
10.2k
            }
10207
10208
15.0M
            itemlen = PyUnicode_GET_LENGTH(item);
10209
15.0M
            if (itemlen != 0) {
10210
15.0M
                memcpy(res_data,
10211
15.0M
                          PyUnicode_DATA(item),
10212
15.0M
                          kind * itemlen);
10213
15.0M
                res_data += kind * itemlen;
10214
15.0M
            }
10215
15.0M
        }
10216
296k
        assert(res_data == PyUnicode_1BYTE_DATA(res)
10217
296k
                           + kind * PyUnicode_GET_LENGTH(res));
10218
296k
    }
10219
62.1k
    else {
10220
29.9M
        for (i = 0, res_offset = 0; i < seqlen; ++i) {
10221
29.8M
            Py_ssize_t itemlen;
10222
29.8M
            item = items[i];
10223
10224
            /* Copy item, and maybe the separator. */
10225
29.8M
            if (i && seplen != 0) {
10226
38.8k
                _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10227
38.8k
                res_offset += seplen;
10228
38.8k
            }
10229
10230
29.8M
            itemlen = PyUnicode_GET_LENGTH(item);
10231
29.8M
            if (itemlen != 0) {
10232
29.8M
                _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
10233
29.8M
                res_offset += itemlen;
10234
29.8M
            }
10235
29.8M
        }
10236
62.1k
        assert(res_offset == PyUnicode_GET_LENGTH(res));
10237
62.1k
    }
10238
10239
358k
    Py_XDECREF(sep);
10240
358k
    assert(_PyUnicode_CheckConsistency(res, 1));
10241
358k
    return res;
10242
10243
0
  onError:
10244
0
    Py_XDECREF(sep);
10245
0
    Py_XDECREF(res);
10246
0
    return NULL;
10247
358k
}
10248
10249
void
10250
_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10251
                    Py_UCS4 fill_char)
10252
691
{
10253
691
    const int kind = PyUnicode_KIND(unicode);
10254
691
    void *data = PyUnicode_DATA(unicode);
10255
691
    assert(unicode_modifiable(unicode));
10256
691
    assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10257
691
    assert(start >= 0);
10258
691
    assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10259
691
    _PyUnicode_Fill(kind, data, fill_char, start, length);
10260
691
}
10261
10262
Py_ssize_t
10263
PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10264
               Py_UCS4 fill_char)
10265
691
{
10266
691
    Py_ssize_t maxlen;
10267
10268
691
    if (!PyUnicode_Check(unicode)) {
10269
0
        PyErr_BadInternalCall();
10270
0
        return -1;
10271
0
    }
10272
691
    if (unicode_check_modifiable(unicode))
10273
0
        return -1;
10274
10275
691
    if (start < 0) {
10276
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
10277
0
        return -1;
10278
0
    }
10279
691
    if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10280
0
        PyErr_SetString(PyExc_ValueError,
10281
0
                         "fill character is bigger than "
10282
0
                         "the string maximum character");
10283
0
        return -1;
10284
0
    }
10285
10286
691
    maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10287
691
    length = Py_MIN(maxlen, length);
10288
691
    if (length <= 0)
10289
0
        return 0;
10290
10291
691
    _PyUnicode_FastFill(unicode, start, length, fill_char);
10292
691
    return length;
10293
691
}
10294
10295
static PyObject *
10296
pad(PyObject *self,
10297
    Py_ssize_t left,
10298
    Py_ssize_t right,
10299
    Py_UCS4 fill)
10300
0
{
10301
0
    PyObject *u;
10302
0
    Py_UCS4 maxchar;
10303
0
    int kind;
10304
0
    void *data;
10305
10306
0
    if (left < 0)
10307
0
        left = 0;
10308
0
    if (right < 0)
10309
0
        right = 0;
10310
10311
0
    if (left == 0 && right == 0)
10312
0
        return unicode_result_unchanged(self);
10313
10314
0
    if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10315
0
        right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
10316
0
        PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10317
0
        return NULL;
10318
0
    }
10319
0
    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10320
0
    maxchar = Py_MAX(maxchar, fill);
10321
0
    u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
10322
0
    if (!u)
10323
0
        return NULL;
10324
10325
0
    kind = PyUnicode_KIND(u);
10326
0
    data = PyUnicode_DATA(u);
10327
0
    if (left)
10328
0
        _PyUnicode_Fill(kind, data, fill, 0, left);
10329
0
    if (right)
10330
0
        _PyUnicode_Fill(kind, data, fill,
10331
0
                        left + _PyUnicode_LENGTH(self), right);
10332
0
    _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
10333
0
    assert(_PyUnicode_CheckConsistency(u, 1));
10334
0
    return u;
10335
0
}
10336
10337
/* Split `string` at line boundaries by dispatching to the splitlines
   implementation that matches the string's storage kind (with a dedicated
   variant for pure-ASCII strings).  `keepends` is forwarded unchanged to
   that implementation.  Returns NULL if ensure_unicode() rejects `string`;
   otherwise returns whatever the selected implementation returns. */
PyObject *
PyUnicode_Splitlines(PyObject *string, int keepends)
{
    if (ensure_unicode(string) < 0) {
        return NULL;
    }

    const Py_ssize_t length = PyUnicode_GET_LENGTH(string);

    switch (PyUnicode_KIND(string)) {
    case PyUnicode_1BYTE_KIND:
        if (PyUnicode_IS_ASCII(string)) {
            return asciilib_splitlines(
                string, PyUnicode_1BYTE_DATA(string), length, keepends);
        }
        return ucs1lib_splitlines(
            string, PyUnicode_1BYTE_DATA(string), length, keepends);
    case PyUnicode_2BYTE_KIND:
        return ucs2lib_splitlines(
            string, PyUnicode_2BYTE_DATA(string), length, keepends);
    case PyUnicode_4BYTE_KIND:
        return ucs4lib_splitlines(
            string, PyUnicode_4BYTE_DATA(string), length, keepends);
    default:
        Py_UNREACHABLE();
    }
}
10371
10372
static PyObject *
10373
split(PyObject *self,
10374
      PyObject *substring,
10375
      Py_ssize_t maxcount)
10376
1.69k
{
10377
1.69k
    int kind1, kind2;
10378
1.69k
    const void *buf1, *buf2;
10379
1.69k
    Py_ssize_t len1, len2;
10380
1.69k
    PyObject* out;
10381
1.69k
    len1 = PyUnicode_GET_LENGTH(self);
10382
1.69k
    kind1 = PyUnicode_KIND(self);
10383
10384
1.69k
    if (substring == NULL) {
10385
7
        if (maxcount < 0) {
10386
7
            maxcount = (len1 - 1) / 2 + 1;
10387
7
        }
10388
7
        switch (kind1) {
10389
7
        case PyUnicode_1BYTE_KIND:
10390
7
            if (PyUnicode_IS_ASCII(self))
10391
7
                return asciilib_split_whitespace(
10392
7
                    self,  PyUnicode_1BYTE_DATA(self),
10393
7
                    len1, maxcount
10394
7
                    );
10395
0
            else
10396
0
                return ucs1lib_split_whitespace(
10397
0
                    self,  PyUnicode_1BYTE_DATA(self),
10398
0
                    len1, maxcount
10399
0
                    );
10400
0
        case PyUnicode_2BYTE_KIND:
10401
0
            return ucs2lib_split_whitespace(
10402
0
                self,  PyUnicode_2BYTE_DATA(self),
10403
0
                len1, maxcount
10404
0
                );
10405
0
        case PyUnicode_4BYTE_KIND:
10406
0
            return ucs4lib_split_whitespace(
10407
0
                self,  PyUnicode_4BYTE_DATA(self),
10408
0
                len1, maxcount
10409
0
                );
10410
0
        default:
10411
0
            Py_UNREACHABLE();
10412
7
        }
10413
7
    }
10414
10415
1.69k
    kind2 = PyUnicode_KIND(substring);
10416
1.69k
    len2 = PyUnicode_GET_LENGTH(substring);
10417
1.69k
    if (maxcount < 0) {
10418
        // if len2 == 0, it will raise ValueError.
10419
1.69k
        maxcount = len2 == 0 ? 0 : (len1 / len2) + 1;
10420
        // handle expected overflow case: (Py_SSIZE_T_MAX / 1) + 1
10421
1.69k
        maxcount = maxcount < 0 ? len1 : maxcount;
10422
1.69k
    }
10423
1.69k
    if (kind1 < kind2 || len1 < len2) {
10424
3
        out = PyList_New(1);
10425
3
        if (out == NULL)
10426
0
            return NULL;
10427
3
        PyList_SET_ITEM(out, 0, Py_NewRef(self));
10428
3
        return out;
10429
3
    }
10430
1.68k
    buf1 = PyUnicode_DATA(self);
10431
1.68k
    buf2 = PyUnicode_DATA(substring);
10432
1.68k
    if (kind2 != kind1) {
10433
1.01k
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
10434
1.01k
        if (!buf2)
10435
0
            return NULL;
10436
1.01k
    }
10437
10438
1.68k
    switch (kind1) {
10439
674
    case PyUnicode_1BYTE_KIND:
10440
674
        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10441
416
            out = asciilib_split(
10442
416
                self,  buf1, len1, buf2, len2, maxcount);
10443
258
        else
10444
258
            out = ucs1lib_split(
10445
258
                self,  buf1, len1, buf2, len2, maxcount);
10446
674
        break;
10447
517
    case PyUnicode_2BYTE_KIND:
10448
517
        out = ucs2lib_split(
10449
517
            self,  buf1, len1, buf2, len2, maxcount);
10450
517
        break;
10451
498
    case PyUnicode_4BYTE_KIND:
10452
498
        out = ucs4lib_split(
10453
498
            self,  buf1, len1, buf2, len2, maxcount);
10454
498
        break;
10455
0
    default:
10456
0
        out = NULL;
10457
1.68k
    }
10458
1.68k
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10459
1.68k
    if (kind2 != kind1)
10460
1.01k
        PyMem_Free((void *)buf2);
10461
1.68k
    return out;
10462
1.68k
}
10463
10464
static PyObject *
10465
rsplit(PyObject *self,
10466
       PyObject *substring,
10467
       Py_ssize_t maxcount)
10468
0
{
10469
0
    int kind1, kind2;
10470
0
    const void *buf1, *buf2;
10471
0
    Py_ssize_t len1, len2;
10472
0
    PyObject* out;
10473
10474
0
    len1 = PyUnicode_GET_LENGTH(self);
10475
0
    kind1 = PyUnicode_KIND(self);
10476
10477
0
    if (substring == NULL) {
10478
0
        if (maxcount < 0) {
10479
0
            maxcount = (len1 - 1) / 2 + 1;
10480
0
        }
10481
0
        switch (kind1) {
10482
0
        case PyUnicode_1BYTE_KIND:
10483
0
            if (PyUnicode_IS_ASCII(self))
10484
0
                return asciilib_rsplit_whitespace(
10485
0
                    self,  PyUnicode_1BYTE_DATA(self),
10486
0
                    len1, maxcount
10487
0
                    );
10488
0
            else
10489
0
                return ucs1lib_rsplit_whitespace(
10490
0
                    self,  PyUnicode_1BYTE_DATA(self),
10491
0
                    len1, maxcount
10492
0
                    );
10493
0
        case PyUnicode_2BYTE_KIND:
10494
0
            return ucs2lib_rsplit_whitespace(
10495
0
                self,  PyUnicode_2BYTE_DATA(self),
10496
0
                len1, maxcount
10497
0
                );
10498
0
        case PyUnicode_4BYTE_KIND:
10499
0
            return ucs4lib_rsplit_whitespace(
10500
0
                self,  PyUnicode_4BYTE_DATA(self),
10501
0
                len1, maxcount
10502
0
                );
10503
0
        default:
10504
0
            Py_UNREACHABLE();
10505
0
        }
10506
0
    }
10507
0
    kind2 = PyUnicode_KIND(substring);
10508
0
    len2 = PyUnicode_GET_LENGTH(substring);
10509
0
    if (maxcount < 0) {
10510
        // if len2 == 0, it will raise ValueError.
10511
0
        maxcount = len2 == 0 ? 0 : (len1 / len2) + 1;
10512
        // handle expected overflow case: (Py_SSIZE_T_MAX / 1) + 1
10513
0
        maxcount = maxcount < 0 ? len1 : maxcount;
10514
0
    }
10515
0
    if (kind1 < kind2 || len1 < len2) {
10516
0
        out = PyList_New(1);
10517
0
        if (out == NULL)
10518
0
            return NULL;
10519
0
        PyList_SET_ITEM(out, 0, Py_NewRef(self));
10520
0
        return out;
10521
0
    }
10522
0
    buf1 = PyUnicode_DATA(self);
10523
0
    buf2 = PyUnicode_DATA(substring);
10524
0
    if (kind2 != kind1) {
10525
0
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
10526
0
        if (!buf2)
10527
0
            return NULL;
10528
0
    }
10529
10530
0
    switch (kind1) {
10531
0
    case PyUnicode_1BYTE_KIND:
10532
0
        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10533
0
            out = asciilib_rsplit(
10534
0
                self,  buf1, len1, buf2, len2, maxcount);
10535
0
        else
10536
0
            out = ucs1lib_rsplit(
10537
0
                self,  buf1, len1, buf2, len2, maxcount);
10538
0
        break;
10539
0
    case PyUnicode_2BYTE_KIND:
10540
0
        out = ucs2lib_rsplit(
10541
0
            self,  buf1, len1, buf2, len2, maxcount);
10542
0
        break;
10543
0
    case PyUnicode_4BYTE_KIND:
10544
0
        out = ucs4lib_rsplit(
10545
0
            self,  buf1, len1, buf2, len2, maxcount);
10546
0
        break;
10547
0
    default:
10548
0
        out = NULL;
10549
0
    }
10550
0
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10551
0
    if (kind2 != kind1)
10552
0
        PyMem_Free((void *)buf2);
10553
0
    return out;
10554
0
}
10555
10556
static Py_ssize_t
10557
anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10558
            PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
10559
44.4k
{
10560
44.4k
    switch (kind) {
10561
15.9k
    case PyUnicode_1BYTE_KIND:
10562
15.9k
        if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10563
10.1k
            return asciilib_find(buf1, len1, buf2, len2, offset);
10564
5.81k
        else
10565
5.81k
            return ucs1lib_find(buf1, len1, buf2, len2, offset);
10566
17.3k
    case PyUnicode_2BYTE_KIND:
10567
17.3k
        return ucs2lib_find(buf1, len1, buf2, len2, offset);
10568
11.1k
    case PyUnicode_4BYTE_KIND:
10569
11.1k
        return ucs4lib_find(buf1, len1, buf2, len2, offset);
10570
44.4k
    }
10571
44.4k
    Py_UNREACHABLE();
10572
44.4k
}
10573
10574
static Py_ssize_t
10575
anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10576
             PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
10577
55.4k
{
10578
55.4k
    switch (kind) {
10579
46.7k
    case PyUnicode_1BYTE_KIND:
10580
46.7k
        return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10581
5.65k
    case PyUnicode_2BYTE_KIND:
10582
5.65k
        return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10583
3.06k
    case PyUnicode_4BYTE_KIND:
10584
3.06k
        return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10585
55.4k
    }
10586
55.4k
    Py_UNREACHABLE();
10587
55.4k
}
10588
10589
static void
10590
replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10591
                      Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10592
176
{
10593
176
    int kind = PyUnicode_KIND(u);
10594
176
    void *data = PyUnicode_DATA(u);
10595
176
    Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10596
176
    if (kind == PyUnicode_1BYTE_KIND) {
10597
176
        ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10598
176
                                      (Py_UCS1 *)data + len,
10599
176
                                      u1, u2, maxcount);
10600
176
    }
10601
0
    else if (kind == PyUnicode_2BYTE_KIND) {
10602
0
        ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10603
0
                                      (Py_UCS2 *)data + len,
10604
0
                                      u1, u2, maxcount);
10605
0
    }
10606
0
    else {
10607
0
        assert(kind == PyUnicode_4BYTE_KIND);
10608
0
        ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10609
0
                                      (Py_UCS4 *)data + len,
10610
0
                                      u1, u2, maxcount);
10611
0
    }
10612
176
}
10613
10614
static PyObject *
10615
replace(PyObject *self, PyObject *str1,
10616
        PyObject *str2, Py_ssize_t maxcount)
10617
58.8k
{
10618
58.8k
    PyObject *u;
10619
58.8k
    const char *sbuf = PyUnicode_DATA(self);
10620
58.8k
    const void *buf1 = PyUnicode_DATA(str1);
10621
58.8k
    const void *buf2 = PyUnicode_DATA(str2);
10622
58.8k
    int srelease = 0, release1 = 0, release2 = 0;
10623
58.8k
    int skind = PyUnicode_KIND(self);
10624
58.8k
    int kind1 = PyUnicode_KIND(str1);
10625
58.8k
    int kind2 = PyUnicode_KIND(str2);
10626
58.8k
    Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10627
58.8k
    Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10628
58.8k
    Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
10629
58.8k
    int mayshrink;
10630
58.8k
    Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
10631
10632
58.8k
    if (slen < len1)
10633
2.43k
        goto nothing;
10634
10635
56.3k
    if (maxcount < 0)
10636
56.3k
        maxcount = PY_SSIZE_T_MAX;
10637
0
    else if (maxcount == 0)
10638
0
        goto nothing;
10639
10640
56.3k
    if (str1 == str2)
10641
0
        goto nothing;
10642
10643
56.3k
    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10644
56.3k
    maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10645
56.3k
    if (maxchar < maxchar_str1)
10646
        /* substring too wide to be present */
10647
0
        goto nothing;
10648
56.3k
    maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10649
    /* Replacing str1 with str2 may cause a maxchar reduction in the
10650
       result string. */
10651
56.3k
    mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
10652
56.3k
    maxchar = Py_MAX(maxchar, maxchar_str2);
10653
10654
56.3k
    if (len1 == len2) {
10655
        /* same length */
10656
917
        if (len1 == 0)
10657
0
            goto nothing;
10658
917
        if (len1 == 1) {
10659
            /* replace characters */
10660
917
            Py_UCS4 u1, u2;
10661
917
            Py_ssize_t pos;
10662
10663
917
            u1 = PyUnicode_READ(kind1, buf1, 0);
10664
917
            pos = findchar(sbuf, skind, slen, u1, 1);
10665
917
            if (pos < 0)
10666
741
                goto nothing;
10667
176
            u2 = PyUnicode_READ(kind2, buf2, 0);
10668
176
            u = PyUnicode_New(slen, maxchar);
10669
176
            if (!u)
10670
0
                goto error;
10671
10672
176
            _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10673
176
            replace_1char_inplace(u, pos, u1, u2, maxcount);
10674
176
        }
10675
0
        else {
10676
0
            int rkind = skind;
10677
0
            char *res;
10678
0
            Py_ssize_t i;
10679
10680
0
            if (kind1 < rkind) {
10681
                /* widen substring */
10682
0
                buf1 = unicode_askind(kind1, buf1, len1, rkind);
10683
0
                if (!buf1) goto error;
10684
0
                release1 = 1;
10685
0
            }
10686
0
            i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
10687
0
            if (i < 0)
10688
0
                goto nothing;
10689
0
            if (rkind > kind2) {
10690
                /* widen replacement */
10691
0
                buf2 = unicode_askind(kind2, buf2, len2, rkind);
10692
0
                if (!buf2) goto error;
10693
0
                release2 = 1;
10694
0
            }
10695
0
            else if (rkind < kind2) {
10696
                /* widen self and buf1 */
10697
0
                rkind = kind2;
10698
0
                if (release1) {
10699
0
                    assert(buf1 != PyUnicode_DATA(str1));
10700
0
                    PyMem_Free((void *)buf1);
10701
0
                    buf1 = PyUnicode_DATA(str1);
10702
0
                    release1 = 0;
10703
0
                }
10704
0
                sbuf = unicode_askind(skind, sbuf, slen, rkind);
10705
0
                if (!sbuf) goto error;
10706
0
                srelease = 1;
10707
0
                buf1 = unicode_askind(kind1, buf1, len1, rkind);
10708
0
                if (!buf1) goto error;
10709
0
                release1 = 1;
10710
0
            }
10711
0
            u = PyUnicode_New(slen, maxchar);
10712
0
            if (!u)
10713
0
                goto error;
10714
0
            assert(PyUnicode_KIND(u) == rkind);
10715
0
            res = PyUnicode_DATA(u);
10716
10717
0
            memcpy(res, sbuf, rkind * slen);
10718
            /* change everything in-place, starting with this one */
10719
0
            memcpy(res + rkind * i,
10720
0
                   buf2,
10721
0
                   rkind * len2);
10722
0
            i += len1;
10723
10724
0
            while ( --maxcount > 0) {
10725
0
                i = anylib_find(rkind, self,
10726
0
                                sbuf+rkind*i, slen-i,
10727
0
                                str1, buf1, len1, i);
10728
0
                if (i == -1)
10729
0
                    break;
10730
0
                memcpy(res + rkind * i,
10731
0
                       buf2,
10732
0
                       rkind * len2);
10733
0
                i += len1;
10734
0
            }
10735
0
        }
10736
917
    }
10737
55.4k
    else {
10738
55.4k
        Py_ssize_t n, i, j, ires;
10739
55.4k
        Py_ssize_t new_size;
10740
55.4k
        int rkind = skind;
10741
55.4k
        char *res;
10742
10743
55.4k
        if (kind1 < rkind) {
10744
            /* widen substring */
10745
8.72k
            buf1 = unicode_askind(kind1, buf1, len1, rkind);
10746
8.72k
            if (!buf1) goto error;
10747
8.72k
            release1 = 1;
10748
8.72k
        }
10749
55.4k
        n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
10750
55.4k
        if (n == 0)
10751
44.0k
            goto nothing;
10752
11.3k
        if (kind2 < rkind) {
10753
            /* widen replacement */
10754
5.36k
            buf2 = unicode_askind(kind2, buf2, len2, rkind);
10755
5.36k
            if (!buf2) goto error;
10756
5.36k
            release2 = 1;
10757
5.36k
        }
10758
6.02k
        else if (kind2 > rkind) {
10759
            /* widen self and buf1 */
10760
0
            rkind = kind2;
10761
0
            sbuf = unicode_askind(skind, sbuf, slen, rkind);
10762
0
            if (!sbuf) goto error;
10763
0
            srelease = 1;
10764
0
            if (release1) {
10765
0
                assert(buf1 != PyUnicode_DATA(str1));
10766
0
                PyMem_Free((void *)buf1);
10767
0
                buf1 = PyUnicode_DATA(str1);
10768
0
                release1 = 0;
10769
0
            }
10770
0
            buf1 = unicode_askind(kind1, buf1, len1, rkind);
10771
0
            if (!buf1) goto error;
10772
0
            release1 = 1;
10773
0
        }
10774
        /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10775
           PyUnicode_GET_LENGTH(str1)); */
10776
11.3k
        if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
10777
0
                PyErr_SetString(PyExc_OverflowError,
10778
0
                                "replace string is too long");
10779
0
                goto error;
10780
0
        }
10781
11.3k
        new_size = slen + n * (len2 - len1);
10782
11.3k
        if (new_size == 0) {
10783
0
            u = unicode_get_empty();
10784
0
            goto done;
10785
0
        }
10786
11.3k
        if (new_size > (PY_SSIZE_T_MAX / rkind)) {
10787
0
            PyErr_SetString(PyExc_OverflowError,
10788
0
                            "replace string is too long");
10789
0
            goto error;
10790
0
        }
10791
11.3k
        u = PyUnicode_New(new_size, maxchar);
10792
11.3k
        if (!u)
10793
0
            goto error;
10794
11.3k
        assert(PyUnicode_KIND(u) == rkind);
10795
11.3k
        res = PyUnicode_DATA(u);
10796
11.3k
        ires = i = 0;
10797
11.3k
        if (len1 > 0) {
10798
55.8k
            while (n-- > 0) {
10799
                /* look for next match */
10800
44.4k
                j = anylib_find(rkind, self,
10801
44.4k
                                sbuf + rkind * i, slen-i,
10802
44.4k
                                str1, buf1, len1, i);
10803
44.4k
                if (j == -1)
10804
0
                    break;
10805
44.4k
                else if (j > i) {
10806
                    /* copy unchanged part [i:j] */
10807
15.7k
                    memcpy(res + rkind * ires,
10808
15.7k
                           sbuf + rkind * i,
10809
15.7k
                           rkind * (j-i));
10810
15.7k
                    ires += j - i;
10811
15.7k
                }
10812
                /* copy substitution string */
10813
44.4k
                if (len2 > 0) {
10814
44.1k
                    memcpy(res + rkind * ires,
10815
44.1k
                           buf2,
10816
44.1k
                           rkind * len2);
10817
44.1k
                    ires += len2;
10818
44.1k
                }
10819
44.4k
                i = j + len1;
10820
44.4k
            }
10821
11.3k
            if (i < slen)
10822
                /* copy tail [i:] */
10823
7.70k
                memcpy(res + rkind * ires,
10824
7.70k
                       sbuf + rkind * i,
10825
7.70k
                       rkind * (slen-i));
10826
11.3k
        }
10827
0
        else {
10828
            /* interleave */
10829
0
            while (n > 0) {
10830
0
                memcpy(res + rkind * ires,
10831
0
                       buf2,
10832
0
                       rkind * len2);
10833
0
                ires += len2;
10834
0
                if (--n <= 0)
10835
0
                    break;
10836
0
                memcpy(res + rkind * ires,
10837
0
                       sbuf + rkind * i,
10838
0
                       rkind);
10839
0
                ires++;
10840
0
                i++;
10841
0
            }
10842
0
            memcpy(res + rkind * ires,
10843
0
                   sbuf + rkind * i,
10844
0
                   rkind * (slen-i));
10845
0
        }
10846
11.3k
    }
10847
10848
11.5k
    if (mayshrink) {
10849
0
        unicode_adjust_maxchar(&u);
10850
0
        if (u == NULL)
10851
0
            goto error;
10852
0
    }
10853
10854
11.5k
  done:
10855
11.5k
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
10856
11.5k
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10857
11.5k
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10858
11.5k
    if (srelease)
10859
0
        PyMem_Free((void *)sbuf);
10860
11.5k
    if (release1)
10861
5.36k
        PyMem_Free((void *)buf1);
10862
11.5k
    if (release2)
10863
5.36k
        PyMem_Free((void *)buf2);
10864
11.5k
    assert(_PyUnicode_CheckConsistency(u, 1));
10865
11.5k
    return u;
10866
10867
47.2k
  nothing:
10868
    /* nothing to replace; return original string (when possible) */
10869
47.2k
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
10870
47.2k
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10871
47.2k
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10872
47.2k
    if (srelease)
10873
0
        PyMem_Free((void *)sbuf);
10874
47.2k
    if (release1)
10875
3.35k
        PyMem_Free((void *)buf1);
10876
47.2k
    if (release2)
10877
0
        PyMem_Free((void *)buf2);
10878
47.2k
    return unicode_result_unchanged(self);
10879
10880
0
  error:
10881
0
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
10882
0
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10883
0
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10884
0
    if (srelease)
10885
0
        PyMem_Free((void *)sbuf);
10886
0
    if (release1)
10887
0
        PyMem_Free((void *)buf1);
10888
0
    if (release2)
10889
0
        PyMem_Free((void *)buf2);
10890
0
    return NULL;
10891
0
}
10892
10893
/* --- Unicode Object Methods --------------------------------------------- */
10894
10895
/*[clinic input]
10896
@permit_long_docstring_body
10897
str.title as unicode_title
10898
10899
Return a version of the string where each word is titlecased.
10900
10901
More specifically, words start with uppercased characters and all remaining
10902
cased characters have lower case.
10903
[clinic start generated code]*/
10904
10905
static PyObject *
unicode_title_impl(PyObject *self)
/*[clinic end generated code: output=c75ae03809574902 input=533ce0eb6a7f5d1b]*/
{
    /* All the work is done by the shared case_operation() driver, with
       do_title supplied as the conversion routine. */
    PyObject *titled = case_operation(self, do_title);
    return titled;
}
10911
10912
/*[clinic input]
10913
@permit_long_docstring_body
10914
str.capitalize as unicode_capitalize
10915
10916
Return a capitalized version of the string.
10917
10918
More specifically, make the first character have upper case and the rest lower
10919
case.
10920
[clinic start generated code]*/
10921
10922
static PyObject *
unicode_capitalize_impl(PyObject *self)
/*[clinic end generated code: output=e49a4c333cdb7667 input=a4a15ade41f6f9e9]*/
{
    /* Only a non-empty string needs converting; an empty one is handed
       back through unicode_result_unchanged(). */
    if (PyUnicode_GET_LENGTH(self) != 0) {
        return case_operation(self, do_capitalize);
    }
    return unicode_result_unchanged(self);
}
10930
10931
/*[clinic input]
10932
str.casefold as unicode_casefold
10933
10934
Return a version of the string suitable for caseless comparisons.
10935
[clinic start generated code]*/
10936
10937
static PyObject *
unicode_casefold_impl(PyObject *self)
/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
{
    /* Non-ASCII strings go through the general case_operation() driver
       with do_casefold as the conversion routine. */
    if (!PyUnicode_IS_ASCII(self)) {
        return case_operation(self, do_casefold);
    }
    /* ASCII strings take the dedicated ASCII helper.
       NOTE(review): the flag value 1 presumably selects lowercasing --
       confirm against ascii_upper_or_lower(). */
    return ascii_upper_or_lower(self, 1);
}
10945
10946
10947
/* Argument converter. Accepts a single Unicode character. */
10948
10949
static int
10950
convert_uc(PyObject *obj, void *addr)
10951
0
{
10952
0
    Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
10953
10954
0
    if (!PyUnicode_Check(obj)) {
10955
0
        PyErr_Format(PyExc_TypeError,
10956
0
                     "The fill character must be a unicode character, "
10957
0
                     "not %.100s", Py_TYPE(obj)->tp_name);
10958
0
        return 0;
10959
0
    }
10960
0
    if (PyUnicode_GET_LENGTH(obj) != 1) {
10961
0
        PyErr_SetString(PyExc_TypeError,
10962
0
                        "The fill character must be exactly one character long");
10963
0
        return 0;
10964
0
    }
10965
0
    *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
10966
0
    return 1;
10967
0
}
10968
10969
/*[clinic input]
10970
str.center as unicode_center
10971
10972
    width: Py_ssize_t
10973
    fillchar: Py_UCS4 = ' '
10974
    /
10975
10976
Return a centered string of length width.
10977
10978
Padding is done using the specified fill character (default is a space).
10979
[clinic start generated code]*/
10980
10981
static PyObject *
10982
unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10983
/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
10984
0
{
10985
0
    Py_ssize_t marg, left;
10986
10987
0
    if (PyUnicode_GET_LENGTH(self) >= width)
10988
0
        return unicode_result_unchanged(self);
10989
10990
0
    marg = width - PyUnicode_GET_LENGTH(self);
10991
0
    left = marg / 2 + (marg & width & 1);
10992
10993
0
    return pad(self, left, marg - left, fillchar);
10994
0
}
10995
10996
/* This function assumes that str1 and str2 are readied by the caller. */
10997
10998
static int
10999
unicode_compare(PyObject *str1, PyObject *str2)
11000
14.2M
{
11001
14.2M
#define COMPARE(TYPE1, TYPE2) \
11002
14.2M
    do { \
11003
11.3M
        TYPE1* p1 = (TYPE1 *)data1; \
11004
11.3M
        TYPE2* p2 = (TYPE2 *)data2; \
11005
11.3M
        TYPE1* end = p1 + len; \
11006
11.3M
        Py_UCS4 c1, c2; \
11007
11.5M
        for (; p1 != end; p1++, p2++) { \
11008
11.5M
            c1 = *p1; \
11009
11.5M
            c2 = *p2; \
11010
11.5M
            if (c1 != c2) \
11011
11.5M
                return (c1 < c2) ? -1 : 1; \
11012
11.5M
        } \
11013
11.3M
    } \
11014
11.3M
    while (0)
11015
11016
14.2M
    int kind1, kind2;
11017
14.2M
    const void *data1, *data2;
11018
14.2M
    Py_ssize_t len1, len2, len;
11019
11020
14.2M
    kind1 = PyUnicode_KIND(str1);
11021
14.2M
    kind2 = PyUnicode_KIND(str2);
11022
14.2M
    data1 = PyUnicode_DATA(str1);
11023
14.2M
    data2 = PyUnicode_DATA(str2);
11024
14.2M
    len1 = PyUnicode_GET_LENGTH(str1);
11025
14.2M
    len2 = PyUnicode_GET_LENGTH(str2);
11026
14.2M
    len = Py_MIN(len1, len2);
11027
11028
14.2M
    switch(kind1) {
11029
3.03M
    case PyUnicode_1BYTE_KIND:
11030
3.03M
    {
11031
3.03M
        switch(kind2) {
11032
1.83M
        case PyUnicode_1BYTE_KIND:
11033
1.83M
        {
11034
1.83M
            int cmp = memcmp(data1, data2, len);
11035
            /* normalize result of memcmp() into the range [-1; 1] */
11036
1.83M
            if (cmp < 0)
11037
1.01M
                return -1;
11038
817k
            if (cmp > 0)
11039
718k
                return 1;
11040
99.0k
            break;
11041
817k
        }
11042
1.13M
        case PyUnicode_2BYTE_KIND:
11043
1.13M
            COMPARE(Py_UCS1, Py_UCS2);
11044
2.43k
            break;
11045
66.5k
        case PyUnicode_4BYTE_KIND:
11046
66.5k
            COMPARE(Py_UCS1, Py_UCS4);
11047
257
            break;
11048
257
        default:
11049
0
            Py_UNREACHABLE();
11050
3.03M
        }
11051
101k
        break;
11052
3.03M
    }
11053
9.93M
    case PyUnicode_2BYTE_KIND:
11054
9.93M
    {
11055
9.93M
        switch(kind2) {
11056
37.9k
        case PyUnicode_1BYTE_KIND:
11057
37.9k
            COMPARE(Py_UCS2, Py_UCS1);
11058
1.57k
            break;
11059
9.62M
        case PyUnicode_2BYTE_KIND:
11060
9.62M
        {
11061
9.62M
            COMPARE(Py_UCS2, Py_UCS2);
11062
1.68k
            break;
11063
9.62M
        }
11064
275k
        case PyUnicode_4BYTE_KIND:
11065
275k
            COMPARE(Py_UCS2, Py_UCS4);
11066
78
            break;
11067
78
        default:
11068
0
            Py_UNREACHABLE();
11069
9.93M
        }
11070
3.33k
        break;
11071
9.93M
    }
11072
1.29M
    case PyUnicode_4BYTE_KIND:
11073
1.29M
    {
11074
1.29M
        switch(kind2) {
11075
9.14k
        case PyUnicode_1BYTE_KIND:
11076
9.14k
            COMPARE(Py_UCS4, Py_UCS1);
11077
508
            break;
11078
250k
        case PyUnicode_2BYTE_KIND:
11079
250k
            COMPARE(Py_UCS4, Py_UCS2);
11080
92
            break;
11081
1.03M
        case PyUnicode_4BYTE_KIND:
11082
1.03M
        {
11083
1.03M
#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
11084
1.03M
            int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
11085
            /* normalize result of wmemcmp() into the range [-1; 1] */
11086
1.03M
            if (cmp < 0)
11087
515k
                return -1;
11088
522k
            if (cmp > 0)
11089
521k
                return 1;
11090
#else
11091
            COMPARE(Py_UCS4, Py_UCS4);
11092
#endif
11093
731
            break;
11094
522k
        }
11095
731
        default:
11096
0
            Py_UNREACHABLE();
11097
1.29M
        }
11098
1.33k
        break;
11099
1.29M
    }
11100
1.33k
    default:
11101
0
        Py_UNREACHABLE();
11102
14.2M
    }
11103
11104
106k
    if (len1 == len2)
11105
347
        return 0;
11106
106k
    if (len1 < len2)
11107
48.2k
        return -1;
11108
57.8k
    else
11109
57.8k
        return 1;
11110
11111
106k
#undef COMPARE
11112
106k
}
11113
11114
11115
int
11116
_PyUnicode_Equal(PyObject *str1, PyObject *str2)
11117
77.2M
{
11118
77.2M
    assert(PyUnicode_Check(str1));
11119
77.2M
    assert(PyUnicode_Check(str2));
11120
77.2M
    if (str1 == str2) {
11121
9.25M
        return 1;
11122
9.25M
    }
11123
68.0M
    return unicode_eq(str1, str2);
11124
77.2M
}
11125
11126
11127
int
11128
PyUnicode_Equal(PyObject *str1, PyObject *str2)
11129
0
{
11130
0
    if (!PyUnicode_Check(str1)) {
11131
0
        PyErr_Format(PyExc_TypeError,
11132
0
                     "first argument must be str, not %T", str1);
11133
0
        return -1;
11134
0
    }
11135
0
    if (!PyUnicode_Check(str2)) {
11136
0
        PyErr_Format(PyExc_TypeError,
11137
0
                     "second argument must be str, not %T", str2);
11138
0
        return -1;
11139
0
    }
11140
11141
0
    return _PyUnicode_Equal(str1, str2);
11142
0
}
11143
11144
11145
int
11146
PyUnicode_Compare(PyObject *left, PyObject *right)
11147
260k
{
11148
260k
    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11149
        /* a string is equal to itself */
11150
260k
        if (left == right)
11151
89
            return 0;
11152
11153
260k
        return unicode_compare(left, right);
11154
260k
    }
11155
0
    PyErr_Format(PyExc_TypeError,
11156
0
                 "Can't compare %.100s and %.100s",
11157
0
                 Py_TYPE(left)->tp_name,
11158
0
                 Py_TYPE(right)->tp_name);
11159
0
    return -1;
11160
260k
}
11161
11162
int
11163
PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11164
1.50M
{
11165
1.50M
    Py_ssize_t i;
11166
1.50M
    int kind;
11167
1.50M
    Py_UCS4 chr;
11168
11169
1.50M
    assert(_PyUnicode_CHECK(uni));
11170
1.50M
    kind = PyUnicode_KIND(uni);
11171
1.50M
    if (kind == PyUnicode_1BYTE_KIND) {
11172
1.50M
        const void *data = PyUnicode_1BYTE_DATA(uni);
11173
1.50M
        size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
11174
1.50M
        size_t len, len2 = strlen(str);
11175
1.50M
        int cmp;
11176
11177
1.50M
        len = Py_MIN(len1, len2);
11178
1.50M
        cmp = memcmp(data, str, len);
11179
1.50M
        if (cmp != 0) {
11180
773k
            if (cmp < 0)
11181
19.5k
                return -1;
11182
754k
            else
11183
754k
                return 1;
11184
773k
        }
11185
732k
        if (len1 > len2)
11186
87
            return 1; /* uni is longer */
11187
732k
        if (len1 < len2)
11188
762
            return -1; /* str is longer */
11189
731k
        return 0;
11190
732k
    }
11191
1.91k
    else {
11192
1.91k
        const void *data = PyUnicode_DATA(uni);
11193
        /* Compare Unicode string and source character set string */
11194
3.28k
        for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
11195
3.17k
            if (chr != (unsigned char)str[i])
11196
1.80k
                return (chr < (unsigned char)(str[i])) ? -1 : 1;
11197
        /* This check keeps Python strings that end in '\0' from comparing equal
11198
         to C strings identical up to that point. */
11199
113
        if (PyUnicode_GET_LENGTH(uni) != i || chr)
11200
113
            return 1; /* uni is longer */
11201
0
        if (str[i])
11202
0
            return -1; /* str is longer */
11203
0
        return 0;
11204
0
    }
11205
1.50M
}
11206
11207
int
11208
PyUnicode_EqualToUTF8(PyObject *unicode, const char *str)
11209
0
{
11210
0
    return PyUnicode_EqualToUTF8AndSize(unicode, str, strlen(str));
11211
0
}
11212
11213
int
11214
PyUnicode_EqualToUTF8AndSize(PyObject *unicode, const char *str, Py_ssize_t size)
11215
0
{
11216
0
    assert(_PyUnicode_CHECK(unicode));
11217
0
    assert(str);
11218
11219
0
    if (PyUnicode_IS_ASCII(unicode)) {
11220
0
        Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
11221
0
        return size == len &&
11222
0
            memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11223
0
    }
11224
0
    if (PyUnicode_UTF8(unicode) != NULL) {
11225
0
        Py_ssize_t len = PyUnicode_UTF8_LENGTH(unicode);
11226
0
        return size == len &&
11227
0
            memcmp(PyUnicode_UTF8(unicode), str, len) == 0;
11228
0
    }
11229
11230
0
    Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
11231
0
    if ((size_t)len >= (size_t)size || (size_t)len < (size_t)size / 4) {
11232
0
        return 0;
11233
0
    }
11234
0
    const unsigned char *s = (const unsigned char *)str;
11235
0
    const unsigned char *ends = s + (size_t)size;
11236
0
    int kind = PyUnicode_KIND(unicode);
11237
0
    const void *data = PyUnicode_DATA(unicode);
11238
    /* Compare Unicode string and UTF-8 string */
11239
0
    for (Py_ssize_t i = 0; i < len; i++) {
11240
0
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11241
0
        if (ch < 0x80) {
11242
0
            if (ends == s || s[0] != ch) {
11243
0
                return 0;
11244
0
            }
11245
0
            s += 1;
11246
0
        }
11247
0
        else if (ch < 0x800) {
11248
0
            if ((ends - s) < 2 ||
11249
0
                s[0] != (0xc0 | (ch >> 6)) ||
11250
0
                s[1] != (0x80 | (ch & 0x3f)))
11251
0
            {
11252
0
                return 0;
11253
0
            }
11254
0
            s += 2;
11255
0
        }
11256
0
        else if (ch < 0x10000) {
11257
0
            if (Py_UNICODE_IS_SURROGATE(ch) ||
11258
0
                (ends - s) < 3 ||
11259
0
                s[0] != (0xe0 | (ch >> 12)) ||
11260
0
                s[1] != (0x80 | ((ch >> 6) & 0x3f)) ||
11261
0
                s[2] != (0x80 | (ch & 0x3f)))
11262
0
            {
11263
0
                return 0;
11264
0
            }
11265
0
            s += 3;
11266
0
        }
11267
0
        else {
11268
0
            assert(ch <= MAX_UNICODE);
11269
0
            if ((ends - s) < 4 ||
11270
0
                s[0] != (0xf0 | (ch >> 18)) ||
11271
0
                s[1] != (0x80 | ((ch >> 12) & 0x3f)) ||
11272
0
                s[2] != (0x80 | ((ch >> 6) & 0x3f)) ||
11273
0
                s[3] != (0x80 | (ch & 0x3f)))
11274
0
            {
11275
0
                return 0;
11276
0
            }
11277
0
            s += 4;
11278
0
        }
11279
0
    }
11280
0
    return s == ends;
11281
0
}
11282
11283
int
11284
_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11285
17.2M
{
11286
17.2M
    size_t len;
11287
17.2M
    assert(_PyUnicode_CHECK(unicode));
11288
17.2M
    assert(str);
11289
17.2M
#ifndef NDEBUG
11290
96.8M
    for (const char *p = str; *p; p++) {
11291
79.6M
        assert((unsigned char)*p < 128);
11292
79.6M
    }
11293
17.2M
#endif
11294
17.2M
    if (!PyUnicode_IS_ASCII(unicode))
11295
676k
        return 0;
11296
16.5M
    len = (size_t)PyUnicode_GET_LENGTH(unicode);
11297
16.5M
    return strlen(str) == len &&
11298
1.22M
           memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11299
17.2M
}
11300
11301
int
11302
_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11303
0
{
11304
0
    PyObject *right_uni;
11305
11306
0
    assert(_PyUnicode_CHECK(left));
11307
0
    assert(right->string);
11308
0
#ifndef NDEBUG
11309
0
    for (const char *p = right->string; *p; p++) {
11310
0
        assert((unsigned char)*p < 128);
11311
0
    }
11312
0
#endif
11313
11314
0
    if (!PyUnicode_IS_ASCII(left))
11315
0
        return 0;
11316
11317
0
    right_uni = _PyUnicode_FromId(right);       /* borrowed */
11318
0
    if (right_uni == NULL) {
11319
        /* memory error or bad data */
11320
0
        PyErr_Clear();
11321
0
        return _PyUnicode_EqualToASCIIString(left, right->string);
11322
0
    }
11323
11324
0
    if (left == right_uni)
11325
0
        return 1;
11326
11327
0
    assert(PyUnicode_CHECK_INTERNED(right_uni));
11328
0
    if (PyUnicode_CHECK_INTERNED(left)) {
11329
0
        return 0;
11330
0
    }
11331
11332
0
    Py_hash_t right_hash = PyUnicode_HASH(right_uni);
11333
0
    assert(right_hash != -1);
11334
0
    Py_hash_t hash = PyUnicode_HASH(left);
11335
0
    if (hash != -1 && hash != right_hash) {
11336
0
        return 0;
11337
0
    }
11338
11339
0
    return unicode_eq(left, right_uni);
11340
0
}
11341
11342
/* Rich comparison of two objects as strings.

   Returns NotImplemented unless both operands are str instances.
   Otherwise returns True or False according to op (one of Py_EQ, Py_NE,
   Py_LT, Py_LE, Py_GT, Py_GE).  When both operands are the same object
   and op is not one of those six values, PyErr_BadArgument() is called
   and NULL is returned. */
PyObject *
PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
{
    if (!PyUnicode_Check(left) || !PyUnicode_Check(right)) {
        Py_RETURN_NOTIMPLEMENTED;
    }

    if (left == right) {
        /* a string is equal to itself */
        if (op == Py_EQ || op == Py_LE || op == Py_GE) {
            Py_RETURN_TRUE;
        }
        if (op == Py_NE || op == Py_LT || op == Py_GT) {
            Py_RETURN_FALSE;
        }
        PyErr_BadArgument();
        return NULL;
    }

    if (op == Py_EQ || op == Py_NE) {
        /* Only equality is needed here; flip the answer for "!=". */
        const int outcome = unicode_eq(left, right) ^ (op == Py_NE);
        return PyBool_FromLong(outcome);
    }

    /* Ordering comparison: turn the three-way result into a bool. */
    const int order = unicode_compare(left, right);
    Py_RETURN_RICHCOMPARE(order, 0, op);
}
11376
11377
int
11378
PyUnicode_Contains(PyObject *str, PyObject *substr)
11379
308M
{
11380
308M
    int kind1, kind2;
11381
308M
    const void *buf1, *buf2;
11382
308M
    Py_ssize_t len1, len2;
11383
308M
    int result;
11384
11385
308M
    if (!PyUnicode_Check(substr)) {
11386
0
        PyErr_Format(PyExc_TypeError,
11387
0
                     "'in <string>' requires string as left operand, not %.100s",
11388
0
                     Py_TYPE(substr)->tp_name);
11389
0
        return -1;
11390
0
    }
11391
308M
    if (ensure_unicode(str) < 0)
11392
0
        return -1;
11393
11394
308M
    kind1 = PyUnicode_KIND(str);
11395
308M
    kind2 = PyUnicode_KIND(substr);
11396
308M
    if (kind1 < kind2)
11397
3
        return 0;
11398
308M
    len1 = PyUnicode_GET_LENGTH(str);
11399
308M
    len2 = PyUnicode_GET_LENGTH(substr);
11400
308M
    if (len1 < len2)
11401
44
        return 0;
11402
308M
    buf1 = PyUnicode_DATA(str);
11403
308M
    buf2 = PyUnicode_DATA(substr);
11404
308M
    if (len2 == 1) {
11405
23.8M
        Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11406
23.8M
        result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
11407
23.8M
        return result;
11408
23.8M
    }
11409
284M
    if (kind2 != kind1) {
11410
0
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
11411
0
        if (!buf2)
11412
0
            return -1;
11413
0
    }
11414
11415
284M
    switch (kind1) {
11416
284M
    case PyUnicode_1BYTE_KIND:
11417
284M
        result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11418
284M
        break;
11419
0
    case PyUnicode_2BYTE_KIND:
11420
0
        result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11421
0
        break;
11422
0
    case PyUnicode_4BYTE_KIND:
11423
0
        result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11424
0
        break;
11425
0
    default:
11426
0
        Py_UNREACHABLE();
11427
284M
    }
11428
11429
284M
    assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
11430
284M
    if (kind2 != kind1)
11431
0
        PyMem_Free((void *)buf2);
11432
11433
284M
    return result;
11434
284M
}
11435
11436
/* Concat to string or Unicode object giving a new Unicode object. */
11437
11438
PyObject *
11439
PyUnicode_Concat(PyObject *left, PyObject *right)
11440
572k
{
11441
572k
    PyObject *result;
11442
572k
    Py_UCS4 maxchar, maxchar2;
11443
572k
    Py_ssize_t left_len, right_len, new_len;
11444
11445
572k
    if (ensure_unicode(left) < 0)
11446
0
        return NULL;
11447
11448
572k
    if (!PyUnicode_Check(right)) {
11449
350
        PyErr_Format(PyExc_TypeError,
11450
350
            "can only concatenate str (not \"%.200s\") to str",
11451
350
            Py_TYPE(right)->tp_name);
11452
350
        return NULL;
11453
350
    }
11454
11455
    /* Shortcuts */
11456
571k
    PyObject *empty = unicode_get_empty();  // Borrowed reference
11457
571k
    if (left == empty) {
11458
42.3k
        return PyUnicode_FromObject(right);
11459
42.3k
    }
11460
529k
    if (right == empty) {
11461
50.6k
        return PyUnicode_FromObject(left);
11462
50.6k
    }
11463
11464
478k
    left_len = PyUnicode_GET_LENGTH(left);
11465
478k
    right_len = PyUnicode_GET_LENGTH(right);
11466
478k
    if (left_len > PY_SSIZE_T_MAX - right_len) {
11467
0
        PyErr_SetString(PyExc_OverflowError,
11468
0
                        "strings are too large to concat");
11469
0
        return NULL;
11470
0
    }
11471
478k
    new_len = left_len + right_len;
11472
11473
478k
    maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11474
478k
    maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11475
478k
    maxchar = Py_MAX(maxchar, maxchar2);
11476
11477
    /* Concat the two Unicode strings */
11478
478k
    result = PyUnicode_New(new_len, maxchar);
11479
478k
    if (result == NULL)
11480
0
        return NULL;
11481
478k
    _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11482
478k
    _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11483
478k
    assert(_PyUnicode_CheckConsistency(result, 1));
11484
478k
    return result;
11485
478k
}
11486
11487
void
11488
PyUnicode_Append(PyObject **p_left, PyObject *right)
11489
5.54M
{
11490
5.54M
    PyObject *left, *res;
11491
5.54M
    Py_UCS4 maxchar, maxchar2;
11492
5.54M
    Py_ssize_t left_len, right_len, new_len;
11493
11494
5.54M
    if (p_left == NULL) {
11495
0
        if (!PyErr_Occurred())
11496
0
            PyErr_BadInternalCall();
11497
0
        return;
11498
0
    }
11499
5.54M
    left = *p_left;
11500
5.54M
    if (right == NULL || left == NULL
11501
5.54M
        || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
11502
0
        if (!PyErr_Occurred())
11503
0
            PyErr_BadInternalCall();
11504
0
        goto error;
11505
0
    }
11506
11507
    /* Shortcuts */
11508
5.54M
    PyObject *empty = unicode_get_empty();  // Borrowed reference
11509
5.54M
    if (left == empty) {
11510
37.2k
        Py_DECREF(left);
11511
37.2k
        *p_left = Py_NewRef(right);
11512
37.2k
        return;
11513
37.2k
    }
11514
5.51M
    if (right == empty) {
11515
2.22k
        return;
11516
2.22k
    }
11517
11518
5.51M
    left_len = PyUnicode_GET_LENGTH(left);
11519
5.51M
    right_len = PyUnicode_GET_LENGTH(right);
11520
5.51M
    if (left_len > PY_SSIZE_T_MAX - right_len) {
11521
0
        PyErr_SetString(PyExc_OverflowError,
11522
0
                        "strings are too large to concat");
11523
0
        goto error;
11524
0
    }
11525
5.51M
    new_len = left_len + right_len;
11526
11527
5.51M
    if (unicode_modifiable(left)
11528
5.51M
        && PyUnicode_CheckExact(right)
11529
10.4M
        && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
11530
        /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11531
           to change the structure size, but characters are stored just after
11532
           the structure, and so it requires to move all characters which is
11533
           not so different than duplicating the string. */
11534
5.22M
        && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11535
5.22M
    {
11536
        /* append inplace */
11537
5.22M
        if (unicode_resize(p_left, new_len) != 0)
11538
0
            goto error;
11539
11540
        /* copy 'right' into the newly allocated area of 'left' */
11541
5.22M
        _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
11542
5.22M
    }
11543
288k
    else {
11544
288k
        maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11545
288k
        maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11546
288k
        maxchar = Py_MAX(maxchar, maxchar2);
11547
11548
        /* Concat the two Unicode strings */
11549
288k
        res = PyUnicode_New(new_len, maxchar);
11550
288k
        if (res == NULL)
11551
0
            goto error;
11552
288k
        _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11553
288k
        _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
11554
288k
        Py_DECREF(left);
11555
288k
        *p_left = res;
11556
288k
    }
11557
5.51M
    assert(_PyUnicode_CheckConsistency(*p_left, 1));
11558
5.51M
    return;
11559
11560
5.51M
error:
11561
0
    Py_CLEAR(*p_left);
11562
0
}
11563
11564
void
11565
PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11566
0
{
11567
0
    PyUnicode_Append(pleft, right);
11568
0
    Py_XDECREF(right);
11569
0
}
11570
11571
/*[clinic input]
11572
@permit_long_summary
11573
@text_signature "($self, sub[, start[, end]], /)"
11574
str.count as unicode_count -> Py_ssize_t
11575
11576
    self as str: self
11577
    sub as substr: unicode
11578
    start: slice_index(accept={int, NoneType}, c_default='0') = None
11579
    end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
11580
    /
11581
11582
Return the number of non-overlapping occurrences of substring sub in string S[start:end].
11583
11584
Optional arguments start and end are interpreted as in slice notation.
11585
[clinic start generated code]*/
11586
11587
static Py_ssize_t
11588
unicode_count_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
11589
                   Py_ssize_t end)
11590
/*[clinic end generated code: output=8fcc3aef0b18edbf input=8590716ee228b935]*/
11591
6.36k
{
11592
6.36k
    assert(PyUnicode_Check(str));
11593
6.36k
    assert(PyUnicode_Check(substr));
11594
11595
6.36k
    Py_ssize_t result;
11596
6.36k
    int kind1, kind2;
11597
6.36k
    const void *buf1 = NULL, *buf2 = NULL;
11598
6.36k
    Py_ssize_t len1, len2;
11599
11600
6.36k
    kind1 = PyUnicode_KIND(str);
11601
6.36k
    kind2 = PyUnicode_KIND(substr);
11602
6.36k
    if (kind1 < kind2)
11603
0
        return 0;
11604
11605
6.36k
    len1 = PyUnicode_GET_LENGTH(str);
11606
6.36k
    len2 = PyUnicode_GET_LENGTH(substr);
11607
6.36k
    ADJUST_INDICES(start, end, len1);
11608
6.36k
    if (end - start < len2)
11609
1.21k
        return 0;
11610
11611
5.14k
    buf1 = PyUnicode_DATA(str);
11612
5.14k
    buf2 = PyUnicode_DATA(substr);
11613
5.14k
    if (kind2 != kind1) {
11614
2.29k
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
11615
2.29k
        if (!buf2)
11616
0
            goto onError;
11617
2.29k
    }
11618
11619
    // We don't reuse `anylib_count` here because of the explicit casts.
11620
5.14k
    switch (kind1) {
11621
2.85k
    case PyUnicode_1BYTE_KIND:
11622
2.85k
        result = ucs1lib_count(
11623
2.85k
            ((const Py_UCS1*)buf1) + start, end - start,
11624
2.85k
            buf2, len2, PY_SSIZE_T_MAX
11625
2.85k
            );
11626
2.85k
        break;
11627
1.04k
    case PyUnicode_2BYTE_KIND:
11628
1.04k
        result = ucs2lib_count(
11629
1.04k
            ((const Py_UCS2*)buf1) + start, end - start,
11630
1.04k
            buf2, len2, PY_SSIZE_T_MAX
11631
1.04k
            );
11632
1.04k
        break;
11633
1.24k
    case PyUnicode_4BYTE_KIND:
11634
1.24k
        result = ucs4lib_count(
11635
1.24k
            ((const Py_UCS4*)buf1) + start, end - start,
11636
1.24k
            buf2, len2, PY_SSIZE_T_MAX
11637
1.24k
            );
11638
1.24k
        break;
11639
0
    default:
11640
0
        Py_UNREACHABLE();
11641
5.14k
    }
11642
11643
5.14k
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
11644
5.14k
    if (kind2 != kind1)
11645
2.29k
        PyMem_Free((void *)buf2);
11646
11647
5.14k
    return result;
11648
0
  onError:
11649
0
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
11650
0
    if (kind2 != kind1)
11651
0
        PyMem_Free((void *)buf2);
11652
0
    return -1;
11653
0
}
11654
11655
/*[clinic input]
11656
str.encode as unicode_encode
11657
11658
    encoding: str(c_default="NULL") = 'utf-8'
11659
        The encoding in which to encode the string.
11660
    errors: str(c_default="NULL") = 'strict'
11661
        The error handling scheme to use for encoding errors.
11662
        The default is 'strict' meaning that encoding errors raise a
11663
        UnicodeEncodeError.  Other possible values are 'ignore', 'replace' and
11664
        'xmlcharrefreplace' as well as any other name registered with
11665
        codecs.register_error that can handle UnicodeEncodeErrors.
11666
11667
Encode the string using the codec registered for encoding.
11668
[clinic start generated code]*/
11669
11670
static PyObject *
unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
{
    /* str.encode() is a thin wrapper: the encoding and errors arguments
       (NULL when omitted, per the clinic c_default) are passed straight
       through to PyUnicode_AsEncodedString(). */
    PyObject *encoded = PyUnicode_AsEncodedString(self, encoding, errors);
    return encoded;
}
11676
11677
/*[clinic input]
11678
str.expandtabs as unicode_expandtabs
11679
11680
    tabsize: int = 8
11681
11682
Return a copy where all tab characters are expanded using spaces.
11683
11684
If tabsize is not given, a tab size of 8 characters is assumed.
11685
[clinic start generated code]*/
11686
11687
static PyObject *
unicode_expandtabs_impl(PyObject *self, int tabsize)
/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
{
    /* Two passes over self: the first computes the length of the result
       (with overflow checks), the second fills a new string of that
       length.  The column counter restarts after '\n' and '\r'.  With
       tabsize <= 0 tabs are dropped.  A string without any tab is
       returned through unicode_result_unchanged(). */
    const Py_ssize_t src_len = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *src_data = PyUnicode_DATA(self);
    Py_ssize_t pos, out_len, column, pad;
    Py_UCS4 ch;
    PyObject *u;
    void *dest_data;
    int saw_tab = 0;

    /* First pass: determine size of output string */
    out_len = 0;
    column = 0;
    for (pos = 0; pos < src_len; pos++) {
        ch = PyUnicode_READ(kind, src_data, pos);
        if (ch != '\t') {
            if (out_len > PY_SSIZE_T_MAX - 1) {
                goto overflow;
            }
            out_len++;
            column = (ch == '\n' || ch == '\r') ? 0 : column + 1;
            continue;
        }

        saw_tab = 1;
        if (tabsize > 0) {
            /* distance to the next tab stop; cannot overflow */
            pad = tabsize - (column % tabsize);
            if (out_len > PY_SSIZE_T_MAX - pad) {
                goto overflow;
            }
            column += pad;
            out_len += pad;
        }
    }
    if (!saw_tab) {
        return unicode_result_unchanged(self);
    }

    /* Second pass: create output string and fill it */
    u = PyUnicode_New(out_len, PyUnicode_MAX_CHAR_VALUE(self));
    if (u == NULL) {
        return NULL;
    }
    dest_data = PyUnicode_DATA(u);

    out_len = 0;    /* reused as the write position in the result */
    column = 0;
    for (pos = 0; pos < src_len; pos++) {
        ch = PyUnicode_READ(kind, src_data, pos);
        if (ch != '\t') {
            PyUnicode_WRITE(kind, dest_data, out_len, ch);
            out_len++;
            column = (ch == '\n' || ch == '\r') ? 0 : column + 1;
            continue;
        }

        if (tabsize > 0) {
            pad = tabsize - (column % tabsize);
            column += pad;
            _PyUnicode_Fill(kind, dest_data, ' ', out_len, pad);
            out_len += pad;
        }
    }
    assert (out_len == PyUnicode_GET_LENGTH(u));
    return unicode_result(u);

  overflow:
    PyErr_SetString(PyExc_OverflowError, "new string is too long");
    return NULL;
}
11762
11763
/*[clinic input]
11764
@permit_long_summary
11765
str.find as unicode_find = str.count
11766
11767
Return the lowest index in S where substring sub is found, such that sub is contained within S[start:end].
11768
11769
Optional arguments start and end are interpreted as in slice notation.
11770
Return -1 on failure.
11771
[clinic start generated code]*/
11772
11773
static Py_ssize_t
11774
unicode_find_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
11775
                  Py_ssize_t end)
11776
/*[clinic end generated code: output=51dbe6255712e278 input=3a9d650fe4c24695]*/
11777
22
{
11778
22
    Py_ssize_t result = any_find_slice(str, substr, start, end, 1);
11779
22
    if (result < 0) {
11780
0
        return -1;
11781
0
    }
11782
22
    return result;
11783
22
}
11784
11785
static PyObject *
11786
unicode_getitem(PyObject *self, Py_ssize_t index)
11787
14.5M
{
11788
14.5M
    const void *data;
11789
14.5M
    int kind;
11790
14.5M
    Py_UCS4 ch;
11791
11792
14.5M
    if (!PyUnicode_Check(self)) {
11793
0
        PyErr_BadArgument();
11794
0
        return NULL;
11795
0
    }
11796
14.5M
    if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11797
7.24k
        PyErr_SetString(PyExc_IndexError, "string index out of range");
11798
7.24k
        return NULL;
11799
7.24k
    }
11800
14.5M
    kind = PyUnicode_KIND(self);
11801
14.5M
    data = PyUnicode_DATA(self);
11802
14.5M
    ch = PyUnicode_READ(kind, data, index);
11803
14.5M
    return unicode_char(ch);
11804
14.5M
}
11805
11806
/* Believe it or not, this produces the same value for ASCII strings
11807
   as bytes_hash(). */
11808
static Py_hash_t
11809
unicode_hash(PyObject *self)
11810
7.53M
{
11811
7.53M
    Py_uhash_t x;  /* Unsigned for defined overflow behavior. */
11812
11813
#ifdef Py_DEBUG
11814
    assert(_Py_HashSecret_Initialized);
11815
#endif
11816
7.53M
    Py_hash_t hash = PyUnicode_HASH(self);
11817
7.53M
    if (hash != -1) {
11818
1.26M
        return hash;
11819
1.26M
    }
11820
6.26M
    x = Py_HashBuffer(PyUnicode_DATA(self),
11821
6.26M
                      PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
11822
11823
0
    PyUnicode_SET_HASH(self, x);
11824
6.26M
    return x;
11825
6.26M
}
11826
11827
/*[clinic input]
11828
@permit_long_summary
11829
str.index as unicode_index = str.count
11830
11831
Return the lowest index in S where substring sub is found, such that sub is contained within S[start:end].
11832
11833
Optional arguments start and end are interpreted as in slice notation.
11834
Raises ValueError when the substring is not found.
11835
[clinic start generated code]*/
11836
11837
static Py_ssize_t
11838
unicode_index_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
11839
                   Py_ssize_t end)
11840
/*[clinic end generated code: output=77558288837cdf40 input=ae5e48f69ed75b06]*/
11841
0
{
11842
0
    Py_ssize_t result = any_find_slice(str, substr, start, end, 1);
11843
0
    if (result == -1) {
11844
0
        PyErr_SetString(PyExc_ValueError, "substring not found");
11845
0
    }
11846
0
    else if (result < 0) {
11847
0
        return -1;
11848
0
    }
11849
0
    return result;
11850
0
}
11851
11852
/*[clinic input]
11853
str.isascii as unicode_isascii
11854
11855
Return True if all characters in the string are ASCII, False otherwise.
11856
11857
ASCII characters have code points in the range U+0000-U+007F.
11858
Empty string is ASCII too.
11859
[clinic start generated code]*/
11860
11861
static PyObject *
unicode_isascii_impl(PyObject *self)
/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
{
    /* The answer is read from the object through PyUnicode_IS_ASCII();
       no characters are scanned here. */
    const long is_ascii = PyUnicode_IS_ASCII(self);
    return PyBool_FromLong(is_ascii);
}
11867
11868
/*[clinic input]
11869
@permit_long_docstring_body
11870
str.islower as unicode_islower
11871
11872
Return True if the string is a lowercase string, False otherwise.
11873
11874
A string is lowercase if all cased characters in the string are lowercase and
11875
there is at least one cased character in the string.
11876
[clinic start generated code]*/
11877
11878
static PyObject *
unicode_islower_impl(PyObject *self)
/*[clinic end generated code: output=dbd41995bd005b81 input=c6fc0295241a1aaa]*/
{
    /* True when no character is uppercase or titlecase and at least one
       character is lowercase. */
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (length == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISLOWER(only));
    }

    int seen_lower = 0;
    for (Py_ssize_t pos = 0; pos < length; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
            Py_RETURN_FALSE;
        }
        /* Once a lowercase character was seen, later ones need no
           classification. */
        if (!seen_lower && Py_UNICODE_ISLOWER(ch)) {
            seen_lower = 1;
        }
    }
    return PyBool_FromLong(seen_lower);
}
11911
11912
/*[clinic input]
11913
@permit_long_docstring_body
11914
str.isupper as unicode_isupper
11915
11916
Return True if the string is an uppercase string, False otherwise.
11917
11918
A string is uppercase if all cased characters in the string are uppercase and
11919
there is at least one cased character in the string.
11920
[clinic start generated code]*/
11921
11922
static PyObject *
unicode_isupper_impl(PyObject *self)
/*[clinic end generated code: output=049209c8e7f15f59 input=8d5cb33e67efde72]*/
{
    /* True when no character is lowercase or titlecase and at least one
       character is uppercase. */
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (length == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISUPPER(only) != 0);
    }

    int seen_upper = 0;
    for (Py_ssize_t pos = 0; pos < length; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) {
            Py_RETURN_FALSE;
        }
        /* Once an uppercase character was seen, later ones need no
           classification. */
        if (!seen_upper && Py_UNICODE_ISUPPER(ch)) {
            seen_upper = 1;
        }
    }
    return PyBool_FromLong(seen_upper);
}
11955
11956
/*[clinic input]
11957
str.istitle as unicode_istitle
11958
11959
Return True if the string is a title-cased string, False otherwise.
11960
11961
In a title-cased string, upper- and title-case characters may only
11962
follow uncased characters and lowercase characters only cased ones.
11963
[clinic start generated code]*/
11964
11965
static PyObject *
unicode_istitle_impl(PyObject *self)
/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
{
    /* Returns a new reference to True when upper/titlecase characters only
       follow uncased ones, lowercase characters only follow cased ones, and
       at least one cased character exists; False otherwise. */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* An empty string is never title-cased. */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }

    /* A lone character qualifies when it is titlecase or uppercase. */
    if (nchars == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong((Py_UNICODE_ISTITLE(only) != 0) ||
                               (Py_UNICODE_ISUPPER(only) != 0));
    }

    int any_cased = 0;      /* at least one cased character was seen */
    int after_cased = 0;    /* the previous character was cased */
    for (Py_ssize_t pos = 0; pos < nchars; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
            /* A word-initial form must not follow a cased character. */
            if (after_cased) {
                Py_RETURN_FALSE;
            }
        }
        else if (Py_UNICODE_ISLOWER(ch)) {
            /* A lowercase character must follow a cased character. */
            if (!after_cased) {
                Py_RETURN_FALSE;
            }
        }
        else {
            /* Uncased character: the next cased one starts a new word. */
            after_cased = 0;
            continue;
        }
        after_cased = 1;
        any_cased = 1;
    }
    return PyBool_FromLong(any_cased);
}
12011
12012
/*[clinic input]
12013
@permit_long_docstring_body
12014
str.isspace as unicode_isspace
12015
12016
Return True if the string is a whitespace string, False otherwise.
12017
12018
A string is whitespace if all characters in the string are whitespace and there
12019
is at least one character in the string.
12020
[clinic start generated code]*/
12021
12022
static PyObject *
unicode_isspace_impl(PyObject *self)
/*[clinic end generated code: output=163a63bfa08ac2b9 input=44fe05e248c6e159]*/
{
    /* Returns a new reference to True when the string is non-empty and
       every character satisfies Py_UNICODE_ISSPACE, else False. */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Fast path: a single character decides the answer directly. */
    if (nchars == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISSPACE(only));
    }

    /* The empty string is not a whitespace string. */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }

    /* Advance over the leading run of whitespace; the string qualifies
       only when that run covers all of it. */
    Py_ssize_t pos = 0;
    while (pos < nchars
           && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, pos))) {
        pos++;
    }
    if (pos < nchars) {
        Py_RETURN_FALSE;
    }
    Py_RETURN_TRUE;
}
12050
12051
/*[clinic input]
12052
@permit_long_docstring_body
12053
str.isalpha as unicode_isalpha
12054
12055
Return True if the string is an alphabetic string, False otherwise.
12056
12057
A string is alphabetic if all characters in the string are alphabetic and there
12058
is at least one character in the string.
12059
[clinic start generated code]*/
12060
12061
static PyObject *
unicode_isalpha_impl(PyObject *self)
/*[clinic end generated code: output=cc81b9ac3883ec4f input=c233000624a56e0d]*/
{
    /* Returns a new reference to True when the string is non-empty and
       every character satisfies Py_UNICODE_ISALPHA, else False. */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Fast path: a single character decides the answer directly. */
    if (nchars == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISALPHA(only));
    }

    /* The empty string is not alphabetic. */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }

    for (Py_ssize_t pos = 0; pos < nchars; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISALPHA(ch)) {
            Py_RETURN_FALSE;
        }
    }
    Py_RETURN_TRUE;
}
12088
12089
/*[clinic input]
12090
@permit_long_docstring_body
12091
str.isalnum as unicode_isalnum
12092
12093
Return True if the string is an alpha-numeric string, False otherwise.
12094
12095
A string is alpha-numeric if all characters in the string are alpha-numeric and
12096
there is at least one character in the string.
12097
[clinic start generated code]*/
12098
12099
static PyObject *
unicode_isalnum_impl(PyObject *self)
/*[clinic end generated code: output=a5a23490ffc3660c input=5d63ba9c9bafdb6b]*/
{
    /* Returns a new reference to True when the string is non-empty and
       every character satisfies Py_UNICODE_ISALNUM, else False. */
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);

    /* Fast path: a single character decides the answer directly. */
    if (nchars == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISALNUM(PyUnicode_READ(kind, data, 0)));
    }

    /* The empty string is not alpha-numeric. */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }

    /* Stop at the first character that is neither letter nor number. */
    Py_ssize_t pos = 0;
    while (pos < nchars
           && Py_UNICODE_ISALNUM(PyUnicode_READ(kind, data, pos))) {
        pos++;
    }
    if (pos < nchars) {
        Py_RETURN_FALSE;
    }
    Py_RETURN_TRUE;
}
12128
12129
/*[clinic input]
12130
@permit_long_docstring_body
12131
str.isdecimal as unicode_isdecimal
12132
12133
Return True if the string is a decimal string, False otherwise.
12134
12135
A string is a decimal string if all characters in the string are decimal and
12136
there is at least one character in the string.
12137
[clinic start generated code]*/
12138
12139
static PyObject *
unicode_isdecimal_impl(PyObject *self)
/*[clinic end generated code: output=fb2dcdb62d3fc548 input=8e84a58b414935a3]*/
{
    /* Returns a new reference to True when the string is non-empty and
       every character satisfies Py_UNICODE_ISDECIMAL, else False. */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Fast path: a single character decides the answer directly. */
    if (nchars == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISDECIMAL(only));
    }

    /* The empty string is not a decimal string. */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }

    for (Py_ssize_t pos = 0; pos < nchars; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISDECIMAL(ch)) {
            Py_RETURN_FALSE;
        }
    }
    Py_RETURN_TRUE;
}
12166
12167
/*[clinic input]
12168
@permit_long_docstring_body
12169
str.isdigit as unicode_isdigit
12170
12171
Return True if the string is a digit string, False otherwise.
12172
12173
A string is a digit string if all characters in the string are digits and there
12174
is at least one character in the string.
12175
[clinic start generated code]*/
12176
12177
static PyObject *
unicode_isdigit_impl(PyObject *self)
/*[clinic end generated code: output=10a6985311da6858 input=99e284affb54d4a0]*/
{
    /* Returns a new reference to True when the string is non-empty and
       every character satisfies Py_UNICODE_ISDIGIT, else False. */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Fast path: a single character decides the answer directly. */
    if (nchars == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, 0)));
    }

    /* The empty string is not a digit string. */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }

    /* Stop at the first character that is not a digit. */
    Py_ssize_t pos = 0;
    while (pos < nchars
           && Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, pos))) {
        pos++;
    }
    if (pos < nchars) {
        Py_RETURN_FALSE;
    }
    Py_RETURN_TRUE;
}
12205
12206
/*[clinic input]
12207
@permit_long_docstring_body
12208
str.isnumeric as unicode_isnumeric
12209
12210
Return True if the string is a numeric string, False otherwise.
12211
12212
A string is numeric if all characters in the string are numeric and there is at
12213
least one character in the string.
12214
[clinic start generated code]*/
12215
12216
static PyObject *
unicode_isnumeric_impl(PyObject *self)
/*[clinic end generated code: output=9172a32d9013051a input=e9f5b6b8b29b0ee6]*/
{
    /* Returns a new reference to True when the string is non-empty and
       every character satisfies Py_UNICODE_ISNUMERIC, else False. */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Fast path: a single character decides the answer directly. */
    if (nchars == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISNUMERIC(only));
    }

    /* The empty string is not numeric. */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }

    for (Py_ssize_t pos = 0; pos < nchars; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISNUMERIC(ch)) {
            Py_RETURN_FALSE;
        }
    }
    Py_RETURN_TRUE;
}
12243
12244
Py_ssize_t
12245
_PyUnicode_ScanIdentifier(PyObject *self)
12246
48.3k
{
12247
48.3k
    Py_ssize_t i;
12248
48.3k
    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12249
48.3k
    if (len == 0) {
12250
        /* an empty string is not a valid identifier */
12251
0
        return 0;
12252
0
    }
12253
12254
48.3k
    int kind = PyUnicode_KIND(self);
12255
48.3k
    const void *data = PyUnicode_DATA(self);
12256
48.3k
    Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12257
    /* PEP 3131 says that the first character must be in
12258
       XID_Start and subsequent characters in XID_Continue,
12259
       and for the ASCII range, the 2.x rules apply (i.e
12260
       start with letters and underscore, continue with
12261
       letters, digits, underscore). However, given the current
12262
       definition of XID_Start and XID_Continue, it is sufficient
12263
       to check just for these, except that _ must be allowed
12264
       as starting an identifier.  */
12265
48.3k
    if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12266
364
        return 0;
12267
364
    }
12268
12269
7.95M
    for (i = 1; i < len; i++) {
12270
7.90M
        ch = PyUnicode_READ(kind, data, i);
12271
7.90M
        if (!_PyUnicode_IsXidContinue(ch)) {
12272
150
            return i;
12273
150
        }
12274
7.90M
    }
12275
47.7k
    return i;
12276
47.9k
}
12277
12278
int
12279
PyUnicode_IsIdentifier(PyObject *self)
12280
1.25k
{
12281
1.25k
    Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
12282
1.25k
    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12283
    /* an empty string is not a valid identifier */
12284
1.25k
    return len && i == len;
12285
1.25k
}
12286
12287
/*[clinic input]
12288
@permit_long_docstring_body
12289
str.isidentifier as unicode_isidentifier
12290
12291
Return True if the string is a valid Python identifier, False otherwise.
12292
12293
Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
12294
such as "def" or "class".
12295
[clinic start generated code]*/
12296
12297
static PyObject *
unicode_isidentifier_impl(PyObject *self)
/*[clinic end generated code: output=fe585a9666572905 input=86315dd889d7bd04]*/
{
    /* Wrap the C-level predicate's int result in a Python bool. */
    const int is_identifier = PyUnicode_IsIdentifier(self);
    return PyBool_FromLong(is_identifier);
}
12303
12304
/*[clinic input]
12305
@permit_long_summary
12306
str.isprintable as unicode_isprintable
12307
12308
Return True if all characters in the string are printable, False otherwise.
12309
12310
A character is printable if repr() may use it in its output.
12311
[clinic start generated code]*/
12312
12313
static PyObject *
unicode_isprintable_impl(PyObject *self)
/*[clinic end generated code: output=3ab9626cd32dd1a0 input=18345ba847084ec5]*/
{
    /* Returns a new reference to True when every character satisfies
       Py_UNICODE_ISPRINTABLE, else False.  There is no empty-string
       special case here: with zero characters the scan below finds no
       offender and the result is True. */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Fast path: a single character decides the answer directly. */
    if (nchars == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISPRINTABLE(only));
    }

    /* Stop at the first non-printable character. */
    Py_ssize_t pos = 0;
    while (pos < nchars
           && Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, pos))) {
        pos++;
    }
    if (pos < nchars) {
        Py_RETURN_FALSE;
    }
    Py_RETURN_TRUE;
}
12337
12338
/*[clinic input]
12339
@permit_long_docstring_body
12340
str.join as unicode_join
12341
12342
    iterable: object
12343
    /
12344
12345
Concatenate any number of strings.
12346
12347
The string whose method is called is inserted in between each given string.
12348
The result is returned as a new string.
12349
12350
Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12351
[clinic start generated code]*/
12352
12353
static PyObject *
unicode_join(PyObject *self, PyObject *iterable)
/*[clinic end generated code: output=6857e7cecfe7bf98 input=bac724ed412ef3f8]*/
{
    /* The method is a direct forward to the public C API, with `self`
       passed as the separator argument. */
    PyObject *joined = PyUnicode_Join(self, iterable);
    return joined;
}
12359
12360
static Py_ssize_t
12361
unicode_length(PyObject *self)
12362
6.93M
{
12363
6.93M
    return PyUnicode_GET_LENGTH(self);
12364
6.93M
}
12365
12366
/*[clinic input]
12367
str.ljust as unicode_ljust
12368
12369
    width: Py_ssize_t
12370
    fillchar: Py_UCS4 = ' '
12371
    /
12372
12373
Return a left-justified string of length width.
12374
12375
Padding is done using the specified fill character (default is a space).
12376
[clinic start generated code]*/
12377
12378
static PyObject *
12379
unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12380
/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
12381
0
{
12382
0
    if (PyUnicode_GET_LENGTH(self) >= width)
12383
0
        return unicode_result_unchanged(self);
12384
12385
0
    return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
12386
0
}
12387
12388
/*[clinic input]
12389
str.lower as unicode_lower
12390
12391
Return a copy of the string converted to lowercase.
12392
[clinic start generated code]*/
12393
12394
static PyObject *
unicode_lower_impl(PyObject *self)
/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
{
    /* Non-ASCII strings go through the generic case-mapping driver. */
    if (!PyUnicode_IS_ASCII(self)) {
        return case_operation(self, do_lower);
    }

    /* ASCII-only strings take the dedicated helper; the flag value 1
       presumably selects lowercase (helper defined elsewhere). */
    return ascii_upper_or_lower(self, 1);
}
12402
12403
31.4k
/* Strip modes passed as `striptype` to the strip helpers in this file. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names indexed by the strip modes above.  Both the strings and the
   table slots are const, so no entry can be reassigned at run time. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Map a strip mode to the method name used in error messages. */
#define STRIPNAME(i) (stripfuncnames[i])
12411
12412
/* externally visible for str.strip(unicode) */
12413
PyObject *
_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
{
    /* Strip from `self` the characters that occur in `sepobj`, on the
       side(s) selected by `striptype`, and return the remaining slice via
       PyUnicode_Substring(). */
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);
    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
    const Py_ssize_t seplen = PyUnicode_GET_LENGTH(sepobj);
    const BLOOM_MASK sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
                                               PyUnicode_DATA(sepobj),
                                               seplen);

    /* `left` is the index of the first character that is kept. */
    Py_ssize_t left = 0;
    if (striptype != RIGHTSTRIP) {
        for (; left < len; left++) {
            const Py_UCS4 ch = PyUnicode_READ(kind, data, left);
            /* The bloom mask is tested first; PyUnicode_FindChar() is
               only called when the mask does not rule the character out. */
            if (!BLOOM(sepmask, ch)
                || PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0) {
                break;
            }
        }
    }

    /* `right` is one past the index of the last character that is kept. */
    Py_ssize_t right = len;
    if (striptype != LEFTSTRIP) {
        while (right > left) {
            const Py_UCS4 ch = PyUnicode_READ(kind, data, right - 1);
            if (!BLOOM(sepmask, ch)
                || PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0) {
                break;
            }
            right--;
        }
    }

    return PyUnicode_Substring(self, left, right);
}
12459
12460
PyObject*
12461
PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12462
1.33M
{
12463
1.33M
    const unsigned char *data;
12464
1.33M
    int kind;
12465
1.33M
    Py_ssize_t length;
12466
12467
1.33M
    length = PyUnicode_GET_LENGTH(self);
12468
1.33M
    end = Py_MIN(end, length);
12469
12470
1.33M
    if (start == 0 && end == length)
12471
37.8k
        return unicode_result_unchanged(self);
12472
12473
1.30M
    if (start < 0 || end < 0) {
12474
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
12475
0
        return NULL;
12476
0
    }
12477
1.30M
    if (start >= length || end < start)
12478
8
        _Py_RETURN_UNICODE_EMPTY();
12479
12480
1.30M
    length = end - start;
12481
1.30M
    if (PyUnicode_IS_ASCII(self)) {
12482
506k
        data = PyUnicode_1BYTE_DATA(self);
12483
506k
        return _PyUnicode_FromASCII((const char*)(data + start), length);
12484
506k
    }
12485
794k
    else {
12486
794k
        kind = PyUnicode_KIND(self);
12487
794k
        data = PyUnicode_1BYTE_DATA(self);
12488
794k
        return PyUnicode_FromKindAndData(kind,
12489
794k
                                         data + kind * start,
12490
794k
                                         length);
12491
794k
    }
12492
1.30M
}
12493
12494
/* Strip whitespace from `self` on the side(s) selected by `striptype`
   (LEFTSTRIP, RIGHTSTRIP or BOTHSTRIP) and return the remaining slice via
   PyUnicode_Substring(). */
static PyObject *
do_strip(PyObject *self, int striptype)
{
    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
    const int trim_left = (striptype != RIGHTSTRIP);
    const int trim_right = (striptype != LEFTSTRIP);

    /* The kept range is [left, right). */
    Py_ssize_t left = 0;
    Py_ssize_t right = len;

    if (PyUnicode_IS_ASCII(self)) {
        /* ASCII strings are classified with the _Py_ascii_whitespace
           lookup table instead of Py_UNICODE_ISSPACE(). */
        const Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(self);

        if (trim_left) {
            while (left < len && _Py_ascii_whitespace[bytes[left]]) {
                left++;
            }
        }
        if (trim_right) {
            while (right > left && _Py_ascii_whitespace[bytes[right - 1]]) {
                right--;
            }
        }
    }
    else {
        const int kind = PyUnicode_KIND(self);
        const void *data = PyUnicode_DATA(self);

        if (trim_left) {
            while (left < len
                   && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, left))) {
                left++;
            }
        }
        if (trim_right) {
            while (right > left
                   && Py_UNICODE_ISSPACE(
                          PyUnicode_READ(kind, data, right - 1))) {
                right--;
            }
        }
    }

    return PyUnicode_Substring(self, left, right);
}
12555
12556
12557
/* Shared body of str.strip/lstrip/rstrip: with `sep` being None strip
   whitespace, with a str strip the characters it contains, and for any
   other object set TypeError and return NULL. */
static PyObject *
do_argstrip(PyObject *self, int striptype, PyObject *sep)
{
    /* No explicit character set: strip whitespace. */
    if (sep == Py_None) {
        return do_strip(self, striptype);
    }

    if (!PyUnicode_Check(sep)) {
        PyErr_Format(PyExc_TypeError,
                     "%s arg must be None or str",
                     STRIPNAME(striptype));
        return NULL;
    }

    return _PyUnicode_XStrip(self, striptype, sep);
}
12573
12574
12575
/*[clinic input]
12576
@permit_long_summary
12577
str.strip as unicode_strip
12578
12579
    chars: object = None
12580
    /
12581
12582
Return a copy of the string with leading and trailing whitespace removed.
12583
12584
If chars is given and not None, remove characters in chars instead.
12585
[clinic start generated code]*/
12586
12587
static PyObject *
unicode_strip_impl(PyObject *self, PyObject *chars)
/*[clinic end generated code: output=ca19018454345d57 input=8bc6353450345fbd]*/
{
    /* Forward to the shared strip body, selecting both ends. */
    PyObject *stripped = do_argstrip(self, BOTHSTRIP, chars);
    return stripped;
}
12593
12594
12595
/*[clinic input]
12596
str.lstrip as unicode_lstrip
12597
12598
    chars: object = None
12599
    /
12600
12601
Return a copy of the string with leading whitespace removed.
12602
12603
If chars is given and not None, remove characters in chars instead.
12604
[clinic start generated code]*/
12605
12606
static PyObject *
unicode_lstrip_impl(PyObject *self, PyObject *chars)
/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
{
    /* Forward to the shared strip body, selecting the leading end only. */
    PyObject *stripped = do_argstrip(self, LEFTSTRIP, chars);
    return stripped;
}
12612
12613
12614
/*[clinic input]
12615
str.rstrip as unicode_rstrip
12616
12617
    chars: object = None
12618
    /
12619
12620
Return a copy of the string with trailing whitespace removed.
12621
12622
If chars is given and not None, remove characters in chars instead.
12623
[clinic start generated code]*/
12624
12625
static PyObject *
unicode_rstrip_impl(PyObject *self, PyObject *chars)
/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
{
    /* Forward to the shared strip body, selecting the trailing end only. */
    PyObject *stripped = do_argstrip(self, RIGHTSTRIP, chars);
    return stripped;
}
12631
12632
12633
static PyObject*
12634
unicode_repeat(PyObject *str, Py_ssize_t len)
12635
12.7k
{
12636
12.7k
    PyObject *u;
12637
12.7k
    Py_ssize_t nchars, n;
12638
12639
12.7k
    if (len < 1)
12640
2.08k
        _Py_RETURN_UNICODE_EMPTY();
12641
12642
    /* no repeat, return original string */
12643
10.6k
    if (len == 1)
12644
2.88k
        return unicode_result_unchanged(str);
12645
12646
7.74k
    if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
12647
0
        PyErr_SetString(PyExc_OverflowError,
12648
0
                        "repeated string is too long");
12649
0
        return NULL;
12650
0
    }
12651
7.74k
    nchars = len * PyUnicode_GET_LENGTH(str);
12652
12653
7.74k
    u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
12654
7.74k
    if (!u)
12655
0
        return NULL;
12656
7.74k
    assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
12657
12658
7.74k
    if (PyUnicode_GET_LENGTH(str) == 1) {
12659
1.35k
        int kind = PyUnicode_KIND(str);
12660
1.35k
        Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
12661
1.35k
        if (kind == PyUnicode_1BYTE_KIND) {
12662
975
            void *to = PyUnicode_DATA(u);
12663
975
            memset(to, (unsigned char)fill_char, len);
12664
975
        }
12665
381
        else if (kind == PyUnicode_2BYTE_KIND) {
12666
323
            Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
12667
2.49k
            for (n = 0; n < len; ++n)
12668
2.16k
                ucs2[n] = fill_char;
12669
323
        } else {
12670
58
            Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12671
58
            assert(kind == PyUnicode_4BYTE_KIND);
12672
4.59k
            for (n = 0; n < len; ++n)
12673
4.53k
                ucs4[n] = fill_char;
12674
58
        }
12675
1.35k
    }
12676
6.38k
    else {
12677
6.38k
        Py_ssize_t char_size = PyUnicode_KIND(str);
12678
6.38k
        char *to = (char *) PyUnicode_DATA(u);
12679
6.38k
        _PyBytes_Repeat(to, nchars * char_size, PyUnicode_DATA(str),
12680
6.38k
            PyUnicode_GET_LENGTH(str) * char_size);
12681
6.38k
    }
12682
12683
7.74k
    assert(_PyUnicode_CheckConsistency(u, 1));
12684
7.74k
    return u;
12685
7.74k
}
12686
12687
PyObject *
12688
PyUnicode_Replace(PyObject *str,
12689
                  PyObject *substr,
12690
                  PyObject *replstr,
12691
                  Py_ssize_t maxcount)
12692
36.8k
{
12693
36.8k
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12694
36.8k
            ensure_unicode(replstr) < 0)
12695
0
        return NULL;
12696
36.8k
    return replace(str, substr, replstr, maxcount);
12697
36.8k
}
12698
12699
/*[clinic input]
12700
@permit_long_docstring_body
12701
str.replace as unicode_replace
12702
12703
    old: unicode
12704
    new: unicode
12705
    /
12706
    count: Py_ssize_t = -1
12707
        Maximum number of occurrences to replace.
12708
        -1 (the default value) means replace all occurrences.
12709
12710
Return a copy with all occurrences of substring old replaced by new.
12711
12712
If the optional argument count is given, only the first count occurrences are
12713
replaced.
12714
[clinic start generated code]*/
12715
12716
static PyObject *
12717
unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12718
                     Py_ssize_t count)
12719
/*[clinic end generated code: output=b63f1a8b5eebf448 input=f27ca92ac46b65a1]*/
12720
21.9k
{
12721
21.9k
    return replace(self, old, new, count);
12722
21.9k
}
12723
12724
/*[clinic input]
12725
@permit_long_docstring_body
12726
str.removeprefix as unicode_removeprefix
12727
12728
    prefix: unicode
12729
    /
12730
12731
Return a str with the given prefix string removed if present.
12732
12733
If the string starts with the prefix string, return string[len(prefix):].
12734
Otherwise, return a copy of the original string.
12735
[clinic start generated code]*/
12736
12737
static PyObject *
unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
/*[clinic end generated code: output=f1e5945e9763bcb9 input=1989a856dbb813f1]*/
{
    /* tailmatch() is defined elsewhere; the direction argument -1 is
       presumably "match at the start".  A result of -1 is an error. */
    const int matched = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);
    if (matched == -1) {
        return NULL;
    }

    /* No match: hand the string back unchanged. */
    if (!matched) {
        return unicode_result_unchanged(self);
    }

    /* Keep everything after the first len(prefix) characters. */
    const Py_ssize_t skip = PyUnicode_GET_LENGTH(prefix);
    return PyUnicode_Substring(self, skip, PyUnicode_GET_LENGTH(self));
}
12751
12752
/*[clinic input]
12753
str.removesuffix as unicode_removesuffix
12754
12755
    suffix: unicode
12756
    /
12757
12758
Return a str with the given suffix string removed if present.
12759
12760
If the string ends with the suffix string and that suffix is not empty,
12761
return string[:-len(suffix)]. Otherwise, return a copy of the original
12762
string.
12763
[clinic start generated code]*/
12764
12765
static PyObject *
unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
/*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
{
    /* tailmatch() is defined elsewhere; the direction argument +1 is
       presumably "match at the end".  A result of -1 is an error. */
    const int matched = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);
    if (matched == -1) {
        return NULL;
    }

    /* No match: hand the string back unchanged. */
    if (!matched) {
        return unicode_result_unchanged(self);
    }

    /* Keep everything before the last len(suffix) characters. */
    const Py_ssize_t keep = PyUnicode_GET_LENGTH(self)
                            - PyUnicode_GET_LENGTH(suffix);
    return PyUnicode_Substring(self, 0, keep);
}
12779
12780
/* Build the repr() of `unicode` as a new string object.  A first pass
   computes the output length, the quote counts and the largest character
   that is copied through unescaped; a second step allocates the result
   and fills it.  Returns NULL with OverflowError set when the output
   length would exceed PY_SSIZE_T_MAX, or NULL when allocation fails. */
static PyObject *
unicode_repr(PyObject *unicode)
{
    const Py_ssize_t isize = PyUnicode_GET_LENGTH(unicode);
    const void *idata = PyUnicode_DATA(unicode);
    const int ikind = PyUnicode_KIND(unicode);

    Py_ssize_t osize = 0;       /* output length, quotes excluded */
    Py_UCS4 maxch = 127;        /* largest character in the output */
    Py_ssize_t squote = 0;      /* number of ' in the input */
    Py_ssize_t dquote = 0;      /* number of " in the input */

    for (Py_ssize_t i = 0; i < isize; i++) {
        const Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
        Py_ssize_t incr = 1;    /* output characters produced by ch */

        if (ch == '\'') {
            squote++;
        }
        else if (ch == '"') {
            dquote++;
        }
        else if (ch == '\\' || ch == '\t' || ch == '\r' || ch == '\n') {
            incr = 2;           /* backslash + one character */
        }
        else if (ch < ' ' || ch == 0x7f) {
            incr = 4;           /* \xHH */
        }
        else if (ch < 0x7f) {
            /* Fast path: printable ASCII is copied as is. */
        }
        else if (Py_UNICODE_ISPRINTABLE(ch)) {
            /* Copied as is; may widen the result's character range. */
            if (ch > maxch) {
                maxch = ch;
            }
        }
        else if (ch < 0x100) {
            incr = 4;           /* \xHH */
        }
        else if (ch < 0x10000) {
            incr = 6;           /* \uHHHH */
        }
        else {
            incr = 10;          /* \UHHHHHHHH */
        }

        if (osize > PY_SSIZE_T_MAX - incr) {
            PyErr_SetString(PyExc_OverflowError,
                            "string is too long to generate repr");
            return NULL;
        }
        osize += incr;
    }

    /* The input can be copied verbatim only when nothing was escaped and
       it contains no single quote. */
    const int changed = (osize != isize) || (squote != 0);

    Py_UCS4 quote = '\'';
    if (squote != 0) {
        if (dquote == 0) {
            /* Only single quotes present: delimit with double quotes. */
            quote = '"';
        }
        else {
            /* Both kinds present: keep ' as delimiter and escape each '. */
            osize += squote;
        }
    }
    osize += 2;   /* quotes */

    PyObject *repr = PyUnicode_New(osize, maxch);
    if (repr == NULL) {
        return NULL;
    }
    const int okind = PyUnicode_KIND(repr);
    void *odata = PyUnicode_DATA(repr);

    if (changed) {
        /* Escaping is needed: dispatch on the width of the output. */
        if (okind == PyUnicode_1BYTE_KIND) {
            ucs1lib_repr(unicode, quote, odata);
        }
        else if (okind == PyUnicode_2BYTE_KIND) {
            ucs2lib_repr(unicode, quote, odata);
        }
        else {
            assert(okind == PyUnicode_4BYTE_KIND);
            ucs4lib_repr(unicode, quote, odata);
        }
    }
    else {
        /* Nothing to escape: quote, verbatim copy, quote. */
        PyUnicode_WRITE(okind, odata, 0, quote);
        _PyUnicode_FastCopyCharacters(repr, 1,
                                      unicode, 0,
                                      isize);
        PyUnicode_WRITE(okind, odata, osize-1, quote);
    }

    assert(_PyUnicode_CheckConsistency(repr, 1));
    return repr;
}
12870
12871
/*[clinic input]
12872
@permit_long_summary
12873
str.rfind as unicode_rfind = str.count
12874
12875
Return the highest index in S where substring sub is found, such that sub is contained within S[start:end].
12876
12877
Optional arguments start and end are interpreted as in slice notation.
12878
Return -1 on failure.
12879
[clinic start generated code]*/
12880
12881
static Py_ssize_t
12882
unicode_rfind_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
12883
                   Py_ssize_t end)
12884
/*[clinic end generated code: output=880b29f01dd014c8 input=7f7e97d5cd3299a2]*/
12885
7.11k
{
12886
7.11k
    Py_ssize_t result = any_find_slice(str, substr, start, end, -1);
12887
7.11k
    if (result < 0) {
12888
5.51k
        return -1;
12889
5.51k
    }
12890
1.60k
    return result;
12891
7.11k
}
12892
12893
/*[clinic input]
12894
@permit_long_summary
12895
str.rindex as unicode_rindex = str.count
12896
12897
Return the highest index in S where substring sub is found, such that sub is contained within S[start:end].
12898
12899
Optional arguments start and end are interpreted as in slice notation.
12900
Raises ValueError when the substring is not found.
12901
[clinic start generated code]*/
12902
12903
static Py_ssize_t
12904
unicode_rindex_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
12905
                    Py_ssize_t end)
12906
/*[clinic end generated code: output=5f3aef124c867fe1 input=0363a324740b3e62]*/
12907
0
{
12908
0
    Py_ssize_t result = any_find_slice(str, substr, start, end, -1);
12909
0
    if (result == -1) {
12910
0
        PyErr_SetString(PyExc_ValueError, "substring not found");
12911
0
    }
12912
0
    else if (result < 0) {
12913
0
        return -1;
12914
0
    }
12915
0
    return result;
12916
0
}
12917
12918
/*[clinic input]
12919
str.rjust as unicode_rjust
12920
12921
    width: Py_ssize_t
12922
    fillchar: Py_UCS4 = ' '
12923
    /
12924
12925
Return a right-justified string of length width.
12926
12927
Padding is done using the specified fill character (default is a space).
12928
[clinic start generated code]*/
12929
12930
static PyObject *
12931
unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12932
/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
12933
0
{
12934
0
    if (PyUnicode_GET_LENGTH(self) >= width)
12935
0
        return unicode_result_unchanged(self);
12936
12937
0
    return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
12938
0
}
12939
12940
PyObject *
12941
PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12942
0
{
12943
0
    if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
12944
0
        return NULL;
12945
12946
0
    return split(s, sep, maxsplit);
12947
0
}
12948
12949
/*[clinic input]
12950
@permit_long_summary
12951
str.split as unicode_split
12952
12953
    sep: object = None
12954
        The separator used to split the string.
12955
12956
        When set to None (the default value), will split on any whitespace
12957
        character (including \n \r \t \f and spaces) and will discard
12958
        empty strings from the result.
12959
    maxsplit: Py_ssize_t = -1
12960
        Maximum number of splits.
12961
        -1 (the default value) means no limit.
12962
12963
Return a list of the substrings in the string, using sep as the separator string.
12964
12965
Splitting starts at the front of the string and works to the end.
12966
12967
Note, str.split() is mainly useful for data that has been intentionally
12968
delimited.  With natural text that includes punctuation, consider using
12969
the regular expression module.
12970
12971
[clinic start generated code]*/
12972
12973
static PyObject *
12974
unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12975
/*[clinic end generated code: output=3a65b1db356948dc input=2c1fd08a78e038b8]*/
12976
1.69k
{
12977
1.69k
    if (sep == Py_None)
12978
7
        return split(self, NULL, maxsplit);
12979
1.69k
    if (PyUnicode_Check(sep))
12980
1.69k
        return split(self, sep, maxsplit);
12981
12982
0
    PyErr_Format(PyExc_TypeError,
12983
0
                 "must be str or None, not %.100s",
12984
0
                 Py_TYPE(sep)->tp_name);
12985
0
    return NULL;
12986
1.69k
}
12987
12988
/* Partition str_obj around sep_obj by dispatching to the *_partition()
   helper that matches the kind of str_obj.

   Both arguments are validated with ensure_unicode().  Returns NULL when
   validation or unicode_askind() fails; otherwise returns whatever the
   selected helper (or the early-exit PyTuple_Pack()) produces. */
PyObject *
PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
{
    if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0) {
        return NULL;
    }

    const int str_kind = PyUnicode_KIND(str_obj);
    const int sep_kind = PyUnicode_KIND(sep_obj);
    const Py_ssize_t str_len = PyUnicode_GET_LENGTH(str_obj);
    const Py_ssize_t sep_len = PyUnicode_GET_LENGTH(sep_obj);

    /* Separator has a larger kind or is longer than the string: answer
       directly with (str_obj, "", ""), the "separator not found" shape. */
    if (str_kind < sep_kind || str_len < sep_len) {
        PyObject *empty = unicode_get_empty();  // Borrowed reference
        return PyTuple_Pack(3, str_obj, empty, empty);
    }

    const void *str_buf = PyUnicode_DATA(str_obj);
    const void *sep_buf = PyUnicode_DATA(sep_obj);

    /* When the kinds differ, the separator data is replaced by a buffer
       obtained from unicode_askind(); it is released with PyMem_Free()
       before returning. */
    const int own_sep_buf = (sep_kind != str_kind);
    if (own_sep_buf) {
        sep_buf = unicode_askind(sep_kind, sep_buf, sep_len, str_kind);
        if (sep_buf == NULL) {
            return NULL;
        }
    }

    PyObject *parts;
    switch (str_kind) {
    case PyUnicode_1BYTE_KIND:
        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) {
            parts = asciilib_partition(str_obj, str_buf, str_len,
                                       sep_obj, sep_buf, sep_len);
        }
        else {
            parts = ucs1lib_partition(str_obj, str_buf, str_len,
                                      sep_obj, sep_buf, sep_len);
        }
        break;
    case PyUnicode_2BYTE_KIND:
        parts = ucs2lib_partition(str_obj, str_buf, str_len,
                                  sep_obj, sep_buf, sep_len);
        break;
    case PyUnicode_4BYTE_KIND:
        parts = ucs4lib_partition(str_obj, str_buf, str_len,
                                  sep_obj, sep_buf, sep_len);
        break;
    default:
        Py_UNREACHABLE();
    }

    assert((sep_kind == str_kind) == (sep_buf == PyUnicode_DATA(sep_obj)));
    if (own_sep_buf) {
        PyMem_Free((void *)sep_buf);
    }

    return parts;
}
13038
13039
13040
/* Partition str_obj around sep_obj by dispatching to the *_rpartition()
   helper that matches the kind of str_obj.

   Both arguments are validated with ensure_unicode().  Returns NULL when
   validation or unicode_askind() fails; otherwise returns whatever the
   selected helper (or the early-exit PyTuple_Pack()) produces. */
PyObject *
PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
{
    if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0) {
        return NULL;
    }

    const int str_kind = PyUnicode_KIND(str_obj);
    const int sep_kind = PyUnicode_KIND(sep_obj);
    const Py_ssize_t str_len = PyUnicode_GET_LENGTH(str_obj);
    const Py_ssize_t sep_len = PyUnicode_GET_LENGTH(sep_obj);

    /* Separator has a larger kind or is longer than the string: answer
       directly with ("", "", str_obj), the "separator not found" shape. */
    if (str_kind < sep_kind || str_len < sep_len) {
        PyObject *empty = unicode_get_empty();  // Borrowed reference
        return PyTuple_Pack(3, empty, empty, str_obj);
    }

    const void *str_buf = PyUnicode_DATA(str_obj);
    const void *sep_buf = PyUnicode_DATA(sep_obj);

    /* When the kinds differ, the separator data is replaced by a buffer
       obtained from unicode_askind(); it is released with PyMem_Free()
       before returning. */
    const int own_sep_buf = (sep_kind != str_kind);
    if (own_sep_buf) {
        sep_buf = unicode_askind(sep_kind, sep_buf, sep_len, str_kind);
        if (sep_buf == NULL) {
            return NULL;
        }
    }

    PyObject *parts;
    switch (str_kind) {
    case PyUnicode_1BYTE_KIND:
        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) {
            parts = asciilib_rpartition(str_obj, str_buf, str_len,
                                        sep_obj, sep_buf, sep_len);
        }
        else {
            parts = ucs1lib_rpartition(str_obj, str_buf, str_len,
                                       sep_obj, sep_buf, sep_len);
        }
        break;
    case PyUnicode_2BYTE_KIND:
        parts = ucs2lib_rpartition(str_obj, str_buf, str_len,
                                   sep_obj, sep_buf, sep_len);
        break;
    case PyUnicode_4BYTE_KIND:
        parts = ucs4lib_rpartition(str_obj, str_buf, str_len,
                                   sep_obj, sep_buf, sep_len);
        break;
    default:
        Py_UNREACHABLE();
    }

    assert((sep_kind == str_kind) == (sep_buf == PyUnicode_DATA(sep_obj)));
    if (own_sep_buf) {
        PyMem_Free((void *)sep_buf);
    }

    return parts;
}
13090
13091
/*[clinic input]
13092
@permit_long_docstring_body
13093
str.partition as unicode_partition
13094
13095
    sep: object
13096
    /
13097
13098
Partition the string into three parts using the given separator.
13099
13100
This will search for the separator in the string.  If the separator is found,
13101
returns a 3-tuple containing the part before the separator, the separator
13102
itself, and the part after it.
13103
13104
If the separator is not found, returns a 3-tuple containing the original string
13105
and two empty strings.
13106
[clinic start generated code]*/
13107
13108
static PyObject *
unicode_partition(PyObject *self, PyObject *sep)
/*[clinic end generated code: output=e4ced7bd253ca3c4 input=4d854b520d7b0e97]*/
{
    /* Method entry point: all the work happens in PyUnicode_Partition(). */
    PyObject *parts = PyUnicode_Partition(self, sep);
    return parts;
}
13114
13115
/*[clinic input]
13116
@permit_long_docstring_body
13117
str.rpartition as unicode_rpartition = str.partition
13118
13119
Partition the string into three parts using the given separator.
13120
13121
This will search for the separator in the string, starting at the end. If
13122
the separator is found, returns a 3-tuple containing the part before the
13123
separator, the separator itself, and the part after it.
13124
13125
If the separator is not found, returns a 3-tuple containing two empty strings
13126
and the original string.
13127
[clinic start generated code]*/
13128
13129
static PyObject *
unicode_rpartition(PyObject *self, PyObject *sep)
/*[clinic end generated code: output=1aa13cf1156572aa input=a6adabe91e75b486]*/
{
    /* Method entry point: all the work happens in PyUnicode_RPartition(). */
    PyObject *parts = PyUnicode_RPartition(self, sep);
    return parts;
}
13135
13136
PyObject *
13137
PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
13138
0
{
13139
0
    if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
13140
0
        return NULL;
13141
13142
0
    return rsplit(s, sep, maxsplit);
13143
0
}
13144
13145
/*[clinic input]
13146
@permit_long_summary
13147
str.rsplit as unicode_rsplit = str.split
13148
13149
Return a list of the substrings in the string, using sep as the separator string.
13150
13151
Splitting starts at the end of the string and works to the front.
13152
[clinic start generated code]*/
13153
13154
static PyObject *
13155
unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13156
/*[clinic end generated code: output=c2b815c63bcabffc input=0f762e30d267fa83]*/
13157
0
{
13158
0
    if (sep == Py_None)
13159
0
        return rsplit(self, NULL, maxsplit);
13160
0
    if (PyUnicode_Check(sep))
13161
0
        return rsplit(self, sep, maxsplit);
13162
13163
0
    PyErr_Format(PyExc_TypeError,
13164
0
                 "must be str or None, not %.100s",
13165
0
                 Py_TYPE(sep)->tp_name);
13166
0
    return NULL;
13167
0
}
13168
13169
/*[clinic input]
13170
@permit_long_docstring_body
13171
str.splitlines as unicode_splitlines
13172
13173
    keepends: bool = False
13174
13175
Return a list of the lines in the string, breaking at line boundaries.
13176
13177
Line breaks are not included in the resulting list unless keepends is given and
13178
true.
13179
[clinic start generated code]*/
13180
13181
static PyObject *
unicode_splitlines_impl(PyObject *self, int keepends)
/*[clinic end generated code: output=f664dcdad153ec40 input=39eeafbfef61c827]*/
{
    /* Method entry point: forwards both arguments to
       PyUnicode_Splitlines() unchanged. */
    PyObject *lines = PyUnicode_Splitlines(self, keepends);
    return lines;
}
13187
13188
static
13189
PyObject *unicode_str(PyObject *self)
13190
0
{
13191
0
    return unicode_result_unchanged(self);
13192
0
}
13193
13194
/*[clinic input]
13195
@permit_long_summary
13196
str.swapcase as unicode_swapcase
13197
13198
Convert uppercase characters to lowercase and lowercase characters to uppercase.
13199
[clinic start generated code]*/
13200
13201
static PyObject *
unicode_swapcase_impl(PyObject *self)
/*[clinic end generated code: output=5d28966bf6d7b2af input=85bc39a9b4e8ee91]*/
{
    /* Run the generic case_operation() driver with do_swapcase as the
       per-string callback. */
    PyObject *swapped = case_operation(self, do_swapcase);
    return swapped;
}
13207
13208
/*[clinic input]
13209
13210
@staticmethod
13211
str.maketrans as unicode_maketrans
13212
13213
  x: object
13214
13215
  y: unicode=NULL
13216
13217
  z: unicode=NULL
13218
13219
  /
13220
13221
Return a translation table usable for str.translate().
13222
13223
If there is only one argument, it must be a dictionary mapping Unicode
13224
ordinals (integers) or characters to Unicode ordinals, strings or None.
13225
Character keys will be then converted to ordinals.
13226
If there are two arguments, they must be strings of equal length, and
13227
in the resulting dictionary, each character in x will be mapped to the
13228
character at the same position in y. If there is a third argument, it
13229
must be a string, whose characters will be mapped to None in the result.
13230
[clinic start generated code]*/
13231
13232
static PyObject *
unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
{
    /* The result is built in a fresh dict.  Every failure after this
       point goes through the err label, which releases that dict. */
    PyObject *table = PyDict_New();
    if (table == NULL) {
        return NULL;
    }

    if (y == NULL) {
        /* One-argument form: x must be exactly a dict. */
        if (!PyDict_CheckExact(x)) {
            PyErr_SetString(PyExc_TypeError, "if you give only one argument "
                            "to maketrans it must be a dict");
            goto err;
        }

        /* Copy the entries of x, turning 1-character str keys into int
           keys and keeping int keys as they are. */
        PyObject *src_key, *src_value;
        Py_ssize_t cursor = 0;
        while (PyDict_Next(x, &cursor, &src_key, &src_value)) {
            if (PyUnicode_Check(src_key)) {
                if (PyUnicode_GET_LENGTH(src_key) != 1) {
                    PyErr_SetString(PyExc_ValueError, "string keys in translate "
                                    "table must be of length 1");
                    goto err;
                }
                int key_kind = PyUnicode_KIND(src_key);
                const void *key_data = PyUnicode_DATA(src_key);
                PyObject *ordinal =
                    PyLong_FromLong(PyUnicode_READ(key_kind, key_data, 0));
                if (ordinal == NULL) {
                    goto err;
                }
                int status = PyDict_SetItem(table, ordinal, src_value);
                Py_DECREF(ordinal);
                if (status < 0) {
                    goto err;
                }
            }
            else if (PyLong_Check(src_key)) {
                if (PyDict_SetItem(table, src_key, src_value) < 0) {
                    goto err;
                }
            }
            else {
                PyErr_SetString(PyExc_TypeError, "keys in translate table must "
                                "be strings or integers");
                goto err;
            }
        }
    }
    else {
        /* Two/three-argument form: x has to be a str as long as y. */
        if (!PyUnicode_Check(x)) {
            PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
                            "be a string if there is a second argument");
            goto err;
        }
        Py_ssize_t x_len = PyUnicode_GET_LENGTH(x);
        if (x_len != PyUnicode_GET_LENGTH(y)) {
            PyErr_SetString(PyExc_ValueError, "the first two maketrans "
                            "arguments must have equal length");
            goto err;
        }

        /* Map the character at each index of x to the character at the
           same index of y. */
        int x_kind = PyUnicode_KIND(x);
        int y_kind = PyUnicode_KIND(y);
        const void *x_data = PyUnicode_DATA(x);
        const void *y_data = PyUnicode_DATA(y);
        for (Py_ssize_t idx = 0; idx < x_len; idx++) {
            PyObject *from_ord =
                PyLong_FromLong(PyUnicode_READ(x_kind, x_data, idx));
            if (from_ord == NULL) {
                goto err;
            }
            PyObject *to_ord =
                PyLong_FromLong(PyUnicode_READ(y_kind, y_data, idx));
            if (to_ord == NULL) {
                Py_DECREF(from_ord);
                goto err;
            }
            int status = PyDict_SetItem(table, from_ord, to_ord);
            Py_DECREF(from_ord);
            Py_DECREF(to_ord);
            if (status < 0) {
                goto err;
            }
        }

        /* Every character of z is mapped to None. */
        if (z != NULL) {
            int z_kind = PyUnicode_KIND(z);
            const void *z_data = PyUnicode_DATA(z);
            Py_ssize_t z_len = PyUnicode_GET_LENGTH(z);
            for (Py_ssize_t idx = 0; idx < z_len; idx++) {
                PyObject *del_ord =
                    PyLong_FromLong(PyUnicode_READ(z_kind, z_data, idx));
                if (del_ord == NULL) {
                    goto err;
                }
                int status = PyDict_SetItem(table, del_ord, Py_None);
                Py_DECREF(del_ord);
                if (status < 0) {
                    goto err;
                }
            }
        }
    }
    return table;

  err:
    Py_DECREF(table);
    return NULL;
}
13337
13338
/*[clinic input]
13339
@permit_long_docstring_body
13340
str.translate as unicode_translate
13341
13342
    table: object
13343
        Translation table, which must be a mapping of Unicode ordinals to
13344
        Unicode ordinals, strings, or None.
13345
    /
13346
13347
Replace each character in the string using the given translation table.
13348
13349
The table must implement lookup/indexing via __getitem__, for instance a
13350
dictionary or list.  If this operation raises LookupError, the character is
13351
left untouched.  Characters mapped to None are deleted.
13352
[clinic start generated code]*/
13353
13354
static PyObject *
unicode_translate(PyObject *self, PyObject *table)
/*[clinic end generated code: output=3cb448ff2fd96bf3 input=699e5fa0ebf9f5e9]*/
{
    /* The translation itself is done by _PyUnicode_TranslateCharmap(),
       always called with the "ignore" error handler name. */
    PyObject *translated = _PyUnicode_TranslateCharmap(self, table, "ignore");
    return translated;
}
13360
13361
/*[clinic input]
13362
str.upper as unicode_upper
13363
13364
Return a copy of the string converted to uppercase.
13365
[clinic start generated code]*/
13366
13367
static PyObject *
unicode_upper_impl(PyObject *self)
/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
{
    /* Non-ASCII strings take the generic case_operation() path. */
    if (!PyUnicode_IS_ASCII(self)) {
        return case_operation(self, do_upper);
    }

    /* ASCII strings are handled by ascii_upper_or_lower() (second
       argument 0). */
    return ascii_upper_or_lower(self, 0);
}
13375
13376
/*[clinic input]
13377
@permit_long_summary
13378
str.zfill as unicode_zfill
13379
13380
    width: Py_ssize_t
13381
    /
13382
13383
Pad a numeric string with zeros on the left, to fill a field of the given width.
13384
13385
The string is never truncated.
13386
[clinic start generated code]*/
13387
13388
static PyObject *
13389
unicode_zfill_impl(PyObject *self, Py_ssize_t width)
13390
/*[clinic end generated code: output=e13fb6bdf8e3b9df input=25a4ee0ea3e58ce0]*/
13391
0
{
13392
0
    Py_ssize_t fill;
13393
0
    PyObject *u;
13394
0
    int kind;
13395
0
    const void *data;
13396
0
    Py_UCS4 chr;
13397
13398
0
    if (PyUnicode_GET_LENGTH(self) >= width)
13399
0
        return unicode_result_unchanged(self);
13400
13401
0
    fill = width - PyUnicode_GET_LENGTH(self);
13402
13403
0
    u = pad(self, fill, 0, '0');
13404
13405
0
    if (u == NULL)
13406
0
        return NULL;
13407
13408
0
    kind = PyUnicode_KIND(u);
13409
0
    data = PyUnicode_DATA(u);
13410
0
    chr = PyUnicode_READ(kind, data, fill);
13411
13412
0
    if (chr == '+' || chr == '-') {
13413
        /* move sign to beginning of string */
13414
0
        PyUnicode_WRITE(kind, data, 0, chr);
13415
0
        PyUnicode_WRITE(kind, data, fill, '0');
13416
0
    }
13417
13418
0
    assert(_PyUnicode_CheckConsistency(u, 1));
13419
0
    return u;
13420
0
}
13421
13422
/*[clinic input]
13423
@permit_long_summary
13424
@text_signature "($self, prefix[, start[, end]], /)"
13425
str.startswith as unicode_startswith
13426
13427
    prefix as subobj: object
13428
        A string or a tuple of strings to try.
13429
    start: slice_index(accept={int, NoneType}, c_default='0') = None
13430
        Optional start position. Default: start of the string.
13431
    end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
13432
        Optional stop position. Default: end of the string.
13433
    /
13434
13435
Return True if the string starts with the specified prefix, False otherwise.
13436
[clinic start generated code]*/
13437
13438
static PyObject *
13439
unicode_startswith_impl(PyObject *self, PyObject *subobj, Py_ssize_t start,
13440
                        Py_ssize_t end)
13441
/*[clinic end generated code: output=4bd7cfd0803051d4 input=766bdbd33df251dc]*/
13442
496k
{
13443
496k
    if (PyTuple_Check(subobj)) {
13444
120
        Py_ssize_t i;
13445
840
        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13446
720
            PyObject *substring = PyTuple_GET_ITEM(subobj, i);
13447
720
            if (!PyUnicode_Check(substring)) {
13448
0
                PyErr_Format(PyExc_TypeError,
13449
0
                             "tuple for startswith must only contain str, "
13450
0
                             "not %.100s",
13451
0
                             Py_TYPE(substring)->tp_name);
13452
0
                return NULL;
13453
0
            }
13454
720
            int result = tailmatch(self, substring, start, end, -1);
13455
720
            if (result < 0) {
13456
0
                return NULL;
13457
0
            }
13458
720
            if (result) {
13459
0
                Py_RETURN_TRUE;
13460
0
            }
13461
720
        }
13462
        /* nothing matched */
13463
120
        Py_RETURN_FALSE;
13464
120
    }
13465
496k
    if (!PyUnicode_Check(subobj)) {
13466
0
        PyErr_Format(PyExc_TypeError,
13467
0
                     "startswith first arg must be str or "
13468
0
                     "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13469
0
        return NULL;
13470
0
    }
13471
496k
    int result = tailmatch(self, subobj, start, end, -1);
13472
496k
    if (result < 0) {
13473
0
        return NULL;
13474
0
    }
13475
496k
    return PyBool_FromLong(result);
13476
496k
}
13477
13478
13479
/*[clinic input]
13480
@permit_long_summary
13481
@text_signature "($self, suffix[, start[, end]], /)"
13482
str.endswith as unicode_endswith
13483
13484
    suffix as subobj: object
13485
        A string or a tuple of strings to try.
13486
    start: slice_index(accept={int, NoneType}, c_default='0') = None
13487
        Optional start position. Default: start of the string.
13488
    end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
13489
        Optional stop position. Default: end of the string.
13490
    /
13491
13492
Return True if the string ends with the specified suffix, False otherwise.
13493
[clinic start generated code]*/
13494
13495
static PyObject *
13496
unicode_endswith_impl(PyObject *self, PyObject *subobj, Py_ssize_t start,
13497
                      Py_ssize_t end)
13498
/*[clinic end generated code: output=cce6f8ceb0102ca9 input=b66bf6d5547ba1aa]*/
13499
239k
{
13500
239k
    if (PyTuple_Check(subobj)) {
13501
394
        Py_ssize_t i;
13502
442
        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13503
394
            PyObject *substring = PyTuple_GET_ITEM(subobj, i);
13504
394
            if (!PyUnicode_Check(substring)) {
13505
0
                PyErr_Format(PyExc_TypeError,
13506
0
                             "tuple for endswith must only contain str, "
13507
0
                             "not %.100s",
13508
0
                             Py_TYPE(substring)->tp_name);
13509
0
                return NULL;
13510
0
            }
13511
394
            int result = tailmatch(self, substring, start, end, +1);
13512
394
            if (result < 0) {
13513
0
                return NULL;
13514
0
            }
13515
394
            if (result) {
13516
346
                Py_RETURN_TRUE;
13517
346
            }
13518
394
        }
13519
394
        Py_RETURN_FALSE;
13520
394
    }
13521
238k
    if (!PyUnicode_Check(subobj)) {
13522
0
        PyErr_Format(PyExc_TypeError,
13523
0
                     "endswith first arg must be str or "
13524
0
                     "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13525
0
        return NULL;
13526
0
    }
13527
238k
    int result = tailmatch(self, subobj, start, end, +1);
13528
238k
    if (result < 0) {
13529
0
        return NULL;
13530
0
    }
13531
238k
    return PyBool_FromLong(result);
13532
238k
}
13533
13534
13535
static inline void
13536
_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
13537
4.96M
{
13538
4.96M
    writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13539
4.96M
    writer->data = PyUnicode_DATA(writer->buffer);
13540
13541
4.96M
    if (!writer->readonly) {
13542
4.93M
        writer->kind = PyUnicode_KIND(writer->buffer);
13543
4.93M
        writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13544
4.93M
    }
13545
27.4k
    else {
13546
        /* use a value smaller than PyUnicode_1BYTE_KIND() so
13547
           _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13548
27.4k
        writer->kind = 0;
13549
27.4k
        assert(writer->kind <= PyUnicode_1BYTE_KIND);
13550
13551
        /* Copy-on-write mode: set buffer size to 0 so
13552
         * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13553
         * next write. */
13554
27.4k
        writer->size = 0;
13555
27.4k
    }
13556
4.96M
}
13557
13558
13559
void
13560
_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
13561
1.78M
{
13562
1.78M
    memset(writer, 0, sizeof(*writer));
13563
13564
    /* ASCII is the bare minimum */
13565
1.78M
    writer->min_char = 127;
13566
13567
    /* use a kind value smaller than PyUnicode_1BYTE_KIND so
13568
       _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13569
1.78M
    assert(writer->kind == 0);
13570
1.78M
    assert(writer->kind < PyUnicode_1BYTE_KIND);
13571
1.78M
}
13572
13573
13574
PyUnicodeWriter*
13575
PyUnicodeWriter_Create(Py_ssize_t length)
13576
122k
{
13577
122k
    if (length < 0) {
13578
0
        PyErr_SetString(PyExc_ValueError,
13579
0
                        "length must be positive");
13580
0
        return NULL;
13581
0
    }
13582
13583
122k
    const size_t size = sizeof(_PyUnicodeWriter);
13584
122k
    PyUnicodeWriter *pub_writer;
13585
122k
    pub_writer = _Py_FREELIST_POP_MEM(unicode_writers);
13586
122k
    if (pub_writer == NULL) {
13587
2.12k
        pub_writer = (PyUnicodeWriter *)PyMem_Malloc(size);
13588
2.12k
        if (pub_writer == NULL) {
13589
0
            return (PyUnicodeWriter *)PyErr_NoMemory();
13590
0
        }
13591
2.12k
    }
13592
122k
    _PyUnicodeWriter *writer = (_PyUnicodeWriter *)pub_writer;
13593
13594
122k
    _PyUnicodeWriter_Init(writer);
13595
122k
    if (_PyUnicodeWriter_Prepare(writer, length, 127) < 0) {
13596
0
        PyUnicodeWriter_Discard(pub_writer);
13597
0
        return NULL;
13598
0
    }
13599
122k
    writer->overallocate = 1;
13600
13601
122k
    return pub_writer;
13602
122k
}
13603
13604
13605
/* Dispose of a writer: run _PyUnicodeWriter_Dealloc() on it and give its
   memory to the unicode_writers free list (with PyMem_Free as the free
   function).  A NULL writer is accepted and ignored. */
void
PyUnicodeWriter_Discard(PyUnicodeWriter *writer)
{
    if (writer != NULL) {
        _PyUnicodeWriter_Dealloc((_PyUnicodeWriter*)writer);
        _Py_FREELIST_FREE(unicode_writers, writer, PyMem_Free);
    }
}
13613
13614
13615
// Initialize _PyUnicodeWriter with initial buffer
13616
static inline void
13617
_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
13618
1.54M
{
13619
1.54M
    memset(writer, 0, sizeof(*writer));
13620
1.54M
    writer->buffer = buffer;
13621
1.54M
    _PyUnicodeWriter_Update(writer);
13622
1.54M
    writer->min_length = writer->size;
13623
1.54M
}
13624
13625
13626
int
13627
_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13628
                                 Py_ssize_t length, Py_UCS4 maxchar)
13629
3.39M
{
13630
3.39M
    Py_ssize_t newlen;
13631
3.39M
    PyObject *newbuffer;
13632
13633
3.39M
    assert(length >= 0);
13634
3.39M
    assert(maxchar <= MAX_UNICODE);
13635
13636
    /* ensure that the _PyUnicodeWriter_Prepare macro was used */
13637
3.39M
    assert((maxchar > writer->maxchar && length >= 0)
13638
3.39M
           || length > 0);
13639
13640
3.39M
    if (length > PY_SSIZE_T_MAX - writer->pos) {
13641
0
        PyErr_NoMemory();
13642
0
        return -1;
13643
0
    }
13644
3.39M
    newlen = writer->pos + length;
13645
13646
3.39M
    maxchar = Py_MAX(maxchar, writer->min_char);
13647
13648
3.39M
    if (writer->buffer == NULL) {
13649
1.75M
        assert(!writer->readonly);
13650
1.75M
        if (writer->overallocate
13651
1.68M
            && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13652
            /* overallocate to limit the number of realloc() */
13653
1.68M
            newlen += newlen / OVERALLOCATE_FACTOR;
13654
1.68M
        }
13655
1.75M
        if (newlen < writer->min_length)
13656
1.60M
            newlen = writer->min_length;
13657
13658
1.75M
        writer->buffer = PyUnicode_New(newlen, maxchar);
13659
1.75M
        if (writer->buffer == NULL)
13660
0
            return -1;
13661
1.75M
    }
13662
1.63M
    else if (newlen > writer->size) {
13663
131k
        if (writer->overallocate
13664
129k
            && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13665
            /* overallocate to limit the number of realloc() */
13666
129k
            newlen += newlen / OVERALLOCATE_FACTOR;
13667
129k
        }
13668
131k
        if (newlen < writer->min_length)
13669
0
            newlen = writer->min_length;
13670
13671
131k
        if (maxchar > writer->maxchar || writer->readonly) {
13672
            /* resize + widen */
13673
34.9k
            maxchar = Py_MAX(maxchar, writer->maxchar);
13674
34.9k
            newbuffer = PyUnicode_New(newlen, maxchar);
13675
34.9k
            if (newbuffer == NULL)
13676
0
                return -1;
13677
34.9k
            _PyUnicode_FastCopyCharacters(newbuffer, 0,
13678
34.9k
                                          writer->buffer, 0, writer->pos);
13679
34.9k
            Py_DECREF(writer->buffer);
13680
34.9k
            writer->readonly = 0;
13681
34.9k
        }
13682
96.9k
        else {
13683
96.9k
            newbuffer = resize_compact(writer->buffer, newlen);
13684
96.9k
            if (newbuffer == NULL)
13685
0
                return -1;
13686
96.9k
        }
13687
131k
        writer->buffer = newbuffer;
13688
131k
    }
13689
1.50M
    else if (maxchar > writer->maxchar) {
13690
1.50M
        assert(!writer->readonly);
13691
1.50M
        newbuffer = PyUnicode_New(writer->size, maxchar);
13692
1.50M
        if (newbuffer == NULL)
13693
0
            return -1;
13694
1.50M
        _PyUnicode_FastCopyCharacters(newbuffer, 0,
13695
1.50M
                                      writer->buffer, 0, writer->pos);
13696
1.50M
        Py_SETREF(writer->buffer, newbuffer);
13697
1.50M
    }
13698
3.39M
    _PyUnicodeWriter_Update(writer);
13699
3.39M
    return 0;
13700
13701
3.39M
#undef OVERALLOCATE_FACTOR
13702
3.39M
}
13703
13704
int
13705
_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13706
                                     int kind)
13707
9
{
13708
9
    Py_UCS4 maxchar;
13709
13710
    /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13711
9
    assert(writer->kind < kind);
13712
13713
9
    switch (kind)
13714
9
    {
13715
0
    case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13716
9
    case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13717
0
    case PyUnicode_4BYTE_KIND: maxchar = MAX_UNICODE; break;
13718
0
    default:
13719
0
        Py_UNREACHABLE();
13720
9
    }
13721
13722
9
    return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13723
9
}
13724
13725
static inline int
13726
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
13727
5.56M
{
13728
5.56M
    assert(ch <= MAX_UNICODE);
13729
5.56M
    if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13730
0
        return -1;
13731
5.56M
    PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13732
5.56M
    writer->pos++;
13733
5.56M
    return 0;
13734
5.56M
}
13735
13736
int
13737
_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13738
3.92M
{
13739
3.92M
    return _PyUnicodeWriter_WriteCharInline(writer, ch);
13740
3.92M
}
13741
13742
int
13743
PyUnicodeWriter_WriteChar(PyUnicodeWriter *writer, Py_UCS4 ch)
13744
216k
{
13745
216k
    if (ch > MAX_UNICODE) {
13746
0
        PyErr_SetString(PyExc_ValueError,
13747
0
                        "character must be in range(0x110000)");
13748
0
        return -1;
13749
0
    }
13750
13751
216k
    return _PyUnicodeWriter_WriteChar((_PyUnicodeWriter*)writer, ch);
13752
216k
}
13753
13754
int
13755
_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13756
1.75M
{
13757
1.75M
    assert(PyUnicode_Check(str));
13758
13759
1.75M
    Py_UCS4 maxchar;
13760
1.75M
    Py_ssize_t len;
13761
13762
1.75M
    len = PyUnicode_GET_LENGTH(str);
13763
1.75M
    if (len == 0)
13764
4.96k
        return 0;
13765
1.75M
    maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13766
1.75M
    if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
13767
53.7k
        if (writer->buffer == NULL && !writer->overallocate) {
13768
0
            assert(_PyUnicode_CheckConsistency(str, 1));
13769
0
            writer->readonly = 1;
13770
0
            writer->buffer = Py_NewRef(str);
13771
0
            _PyUnicodeWriter_Update(writer);
13772
0
            writer->pos += len;
13773
0
            return 0;
13774
0
        }
13775
53.7k
        if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13776
0
            return -1;
13777
53.7k
    }
13778
1.75M
    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13779
1.75M
                                  str, 0, len);
13780
1.75M
    writer->pos += len;
13781
1.75M
    return 0;
13782
1.75M
}
13783
13784
int
13785
PyUnicodeWriter_WriteStr(PyUnicodeWriter *writer, PyObject *obj)
13786
233k
{
13787
233k
    PyTypeObject *type = Py_TYPE(obj);
13788
233k
    if (type == &PyUnicode_Type) {
13789
233k
        return _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, obj);
13790
233k
    }
13791
13792
0
    if (type == &PyLong_Type) {
13793
0
        return _PyLong_FormatWriter((_PyUnicodeWriter*)writer, obj, 10, 0);
13794
0
    }
13795
13796
0
    PyObject *str = PyObject_Str(obj);
13797
0
    if (str == NULL) {
13798
0
        return -1;
13799
0
    }
13800
13801
0
    int res = _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, str);
13802
0
    Py_DECREF(str);
13803
0
    return res;
13804
0
}
13805
13806
13807
int
13808
PyUnicodeWriter_WriteRepr(PyUnicodeWriter *writer, PyObject *obj)
13809
0
{
13810
0
    if (Py_TYPE(obj) == &PyLong_Type) {
13811
0
        return _PyLong_FormatWriter((_PyUnicodeWriter*)writer, obj, 10, 0);
13812
0
    }
13813
13814
0
    PyObject *repr = PyObject_Repr(obj);
13815
0
    if (repr == NULL) {
13816
0
        return -1;
13817
0
    }
13818
13819
0
    int res = _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, repr);
13820
0
    Py_DECREF(repr);
13821
0
    return res;
13822
0
}
13823
13824
13825
int
13826
_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13827
                                Py_ssize_t start, Py_ssize_t end)
13828
2.84M
{
13829
2.84M
    assert(0 <= start);
13830
2.84M
    assert(end <= PyUnicode_GET_LENGTH(str));
13831
2.84M
    assert(start <= end);
13832
13833
2.84M
    if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13834
17.3k
        return _PyUnicodeWriter_WriteStr(writer, str);
13835
13836
2.82M
    Py_ssize_t len = end - start;
13837
2.82M
    if (len == 0) {
13838
0
        return 0;
13839
0
    }
13840
13841
2.82M
    Py_UCS4 maxchar;
13842
2.82M
    if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar) {
13843
1.53M
        maxchar = _PyUnicode_FindMaxChar(str, start, end);
13844
1.53M
    }
13845
1.28M
    else {
13846
1.28M
        maxchar = writer->maxchar;
13847
1.28M
    }
13848
2.82M
    if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0) {
13849
0
        return -1;
13850
0
    }
13851
13852
2.82M
    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13853
2.82M
                                  str, start, len);
13854
2.82M
    writer->pos += len;
13855
2.82M
    return 0;
13856
2.82M
}
13857
13858
13859
int
13860
PyUnicodeWriter_WriteSubstring(PyUnicodeWriter *writer, PyObject *str,
13861
                               Py_ssize_t start, Py_ssize_t end)
13862
204k
{
13863
204k
    if (!PyUnicode_Check(str)) {
13864
0
        PyErr_Format(PyExc_TypeError, "expect str, not %T", str);
13865
0
        return -1;
13866
0
    }
13867
204k
    if (start < 0 || start > end) {
13868
0
        PyErr_Format(PyExc_ValueError, "invalid start argument");
13869
0
        return -1;
13870
0
    }
13871
204k
    if (end > PyUnicode_GET_LENGTH(str)) {
13872
0
        PyErr_Format(PyExc_ValueError, "invalid end argument");
13873
0
        return -1;
13874
0
    }
13875
13876
204k
    return _PyUnicodeWriter_WriteSubstring((_PyUnicodeWriter*)writer, str,
13877
204k
                                           start, end);
13878
204k
}
13879
13880
13881
int
13882
_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13883
                                  const char *ascii, Py_ssize_t len)
13884
556k
{
13885
556k
    if (len == -1)
13886
0
        len = strlen(ascii);
13887
13888
556k
    assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
13889
13890
556k
    if (writer->buffer == NULL && !writer->overallocate) {
13891
27.4k
        PyObject *str;
13892
13893
27.4k
        str = _PyUnicode_FromASCII(ascii, len);
13894
27.4k
        if (str == NULL)
13895
0
            return -1;
13896
13897
27.4k
        writer->readonly = 1;
13898
27.4k
        writer->buffer = str;
13899
27.4k
        _PyUnicodeWriter_Update(writer);
13900
27.4k
        writer->pos += len;
13901
27.4k
        return 0;
13902
27.4k
    }
13903
13904
529k
    if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13905
0
        return -1;
13906
13907
529k
    switch (writer->kind)
13908
529k
    {
13909
523k
    case PyUnicode_1BYTE_KIND:
13910
523k
    {
13911
523k
        const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13912
523k
        Py_UCS1 *data = writer->data;
13913
13914
523k
        memcpy(data + writer->pos, str, len);
13915
523k
        break;
13916
0
    }
13917
4.70k
    case PyUnicode_2BYTE_KIND:
13918
4.70k
    {
13919
4.70k
        _PyUnicode_CONVERT_BYTES(
13920
4.70k
            Py_UCS1, Py_UCS2,
13921
4.70k
            ascii, ascii + len,
13922
4.70k
            (Py_UCS2 *)writer->data + writer->pos);
13923
4.70k
        break;
13924
0
    }
13925
1.08k
    case PyUnicode_4BYTE_KIND:
13926
1.08k
    {
13927
1.08k
        _PyUnicode_CONVERT_BYTES(
13928
1.08k
            Py_UCS1, Py_UCS4,
13929
1.08k
            ascii, ascii + len,
13930
1.08k
            (Py_UCS4 *)writer->data + writer->pos);
13931
1.08k
        break;
13932
0
    }
13933
0
    default:
13934
0
        Py_UNREACHABLE();
13935
529k
    }
13936
13937
529k
    writer->pos += len;
13938
529k
    return 0;
13939
529k
}
13940
13941
13942
int
13943
PyUnicodeWriter_WriteASCII(PyUnicodeWriter *writer,
13944
                           const char *str,
13945
                           Py_ssize_t size)
13946
25
{
13947
25
    assert(writer != NULL);
13948
25
    _Py_AssertHoldsTstate();
13949
13950
25
    _PyUnicodeWriter *priv_writer = (_PyUnicodeWriter*)writer;
13951
25
    return _PyUnicodeWriter_WriteASCIIString(priv_writer, str, size);
13952
25
}
13953
13954
13955
int
13956
PyUnicodeWriter_WriteUTF8(PyUnicodeWriter *writer,
13957
                          const char *str,
13958
                          Py_ssize_t size)
13959
187k
{
13960
187k
    if (size < 0) {
13961
187k
        size = strlen(str);
13962
187k
    }
13963
13964
187k
    _PyUnicodeWriter *_writer = (_PyUnicodeWriter*)writer;
13965
187k
    Py_ssize_t old_pos = _writer->pos;
13966
187k
    int res = unicode_decode_utf8_writer(_writer, str, size,
13967
187k
                                         _Py_ERROR_STRICT, NULL, NULL);
13968
187k
    if (res < 0) {
13969
0
        _writer->pos = old_pos;
13970
0
    }
13971
187k
    return res;
13972
187k
}
13973
13974
13975
int
13976
PyUnicodeWriter_DecodeUTF8Stateful(PyUnicodeWriter *writer,
13977
                                   const char *string,
13978
                                   Py_ssize_t length,
13979
                                   const char *errors,
13980
                                   Py_ssize_t *consumed)
13981
0
{
13982
0
    if (length < 0) {
13983
0
        length = strlen(string);
13984
0
    }
13985
13986
0
    _PyUnicodeWriter *_writer = (_PyUnicodeWriter*)writer;
13987
0
    Py_ssize_t old_pos = _writer->pos;
13988
0
    int res = unicode_decode_utf8_writer(_writer, string, length,
13989
0
                                         _Py_ERROR_UNKNOWN, errors, consumed);
13990
0
    if (res < 0) {
13991
0
        _writer->pos = old_pos;
13992
0
        if (consumed) {
13993
0
            *consumed = 0;
13994
0
        }
13995
0
    }
13996
0
    return res;
13997
0
}
13998
13999
14000
int
14001
_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
14002
                                   const char *str, Py_ssize_t len)
14003
0
{
14004
0
    Py_UCS4 maxchar;
14005
14006
0
    maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
14007
0
    if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
14008
0
        return -1;
14009
0
    unicode_write_cstr(writer->buffer, writer->pos, str, len);
14010
0
    writer->pos += len;
14011
0
    return 0;
14012
0
}
14013
14014
PyObject *
14015
_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
14016
3.32M
{
14017
3.32M
    PyObject *str;
14018
14019
3.32M
    if (writer->pos == 0) {
14020
2.47k
        Py_CLEAR(writer->buffer);
14021
2.47k
        _Py_RETURN_UNICODE_EMPTY();
14022
2.47k
    }
14023
14024
3.32M
    str = writer->buffer;
14025
3.32M
    writer->buffer = NULL;
14026
14027
3.32M
    if (writer->readonly) {
14028
27.4k
        assert(PyUnicode_GET_LENGTH(str) == writer->pos);
14029
27.4k
        return str;
14030
27.4k
    }
14031
14032
3.29M
    if (PyUnicode_GET_LENGTH(str) != writer->pos) {
14033
2.81M
        PyObject *str2;
14034
2.81M
        str2 = resize_compact(str, writer->pos);
14035
2.81M
        if (str2 == NULL) {
14036
0
            Py_DECREF(str);
14037
0
            return NULL;
14038
0
        }
14039
2.81M
        str = str2;
14040
2.81M
    }
14041
14042
3.29M
    assert(_PyUnicode_CheckConsistency(str, 1));
14043
3.29M
    return unicode_result(str);
14044
3.29M
}
14045
14046
14047
PyObject*
14048
PyUnicodeWriter_Finish(PyUnicodeWriter *writer)
14049
121k
{
14050
121k
    PyObject *str = _PyUnicodeWriter_Finish((_PyUnicodeWriter*)writer);
14051
121k
    assert(((_PyUnicodeWriter*)writer)->buffer == NULL);
14052
121k
    _Py_FREELIST_FREE(unicode_writers, writer, PyMem_Free);
14053
121k
    return str;
14054
121k
}
14055
14056
14057
void
14058
_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
14059
7.67k
{
14060
7.67k
    Py_CLEAR(writer->buffer);
14061
7.67k
}
14062
14063
#include "stringlib/unicode_format.h"
14064
14065
PyDoc_STRVAR(format__doc__,
14066
             "format($self, /, *args, **kwargs)\n\
14067
--\n\
14068
\n\
14069
Return a formatted version of the string, using substitutions from args and kwargs.\n\
14070
The substitutions are identified by braces ('{' and '}').");
14071
14072
PyDoc_STRVAR(format_map__doc__,
14073
             "format_map($self, mapping, /)\n\
14074
--\n\
14075
\n\
14076
Return a formatted version of the string, using substitutions from mapping.\n\
14077
The substitutions are identified by braces ('{' and '}').");
14078
14079
/*[clinic input]
14080
str.__format__ as unicode___format__
14081
14082
    format_spec: unicode
14083
    /
14084
14085
Return a formatted version of the string as described by format_spec.
14086
[clinic start generated code]*/
14087
14088
static PyObject *
unicode___format___impl(PyObject *self, PyObject *format_spec)
/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
{
    /* Format `self` according to the whole of `format_spec` into a
       temporary writer and return the finished string, or NULL (after
       releasing the writer's buffer) if formatting failed. */
    _PyUnicodeWriter writer;
    _PyUnicodeWriter_Init(&writer);

    const Py_ssize_t spec_len = PyUnicode_GET_LENGTH(format_spec);
    int ret = _PyUnicode_FormatAdvancedWriter(&writer,
                                              self, format_spec, 0,
                                              spec_len);
    if (ret != -1) {
        return _PyUnicodeWriter_Finish(&writer);
    }

    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
14105
14106
/*[clinic input]
14107
str.__sizeof__ as unicode_sizeof
14108
14109
Return the size of the string in memory, in bytes.
14110
[clinic start generated code]*/
14111
14112
static PyObject *
unicode_sizeof_impl(PyObject *self)
/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
{
    /* Every character block is counted as length + 1 characters. */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self) + 1;
    Py_ssize_t total;

    if (PyUnicode_IS_COMPACT_ASCII(self)) {
        /* Compact ASCII: base structure + one byte per character. */
        total = sizeof(PyASCIIObject) + nchars;
    }
    else if (PyUnicode_IS_COMPACT(self)) {
        /* Compact non-ASCII: base structure + character data. */
        total = sizeof(PyCompactUnicodeObject)
            + nchars * PyUnicode_KIND(self);
    }
    else {
        /* Two-block object: base object, plus the character block if
           one is present. */
        total = sizeof(PyUnicodeObject);
        if (_PyUnicode_DATA_ANY(self)) {
            total += nchars * PyUnicode_KIND(self);
        }
    }

    /* A separately held UTF-8 buffer is counted on top of that. */
    if (_PyUnicode_HAS_UTF8_MEMORY(self)) {
        total += PyUnicode_UTF8_LENGTH(self) + 1;
    }

    return PyLong_FromSsize_t(total);
}
14140
14141
/* __getnewargs__: return a 1-tuple holding a copy of the string made by
   _PyUnicode_Copy(), or NULL if the copy fails.  The "N" format hands
   the copy's reference over to Py_BuildValue(). */
static PyObject *
unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
{
    PyObject *duplicate = _PyUnicode_Copy(v);
    if (duplicate == NULL) {
        return NULL;
    }
    return Py_BuildValue("(N)", duplicate);
}
14149
14150
/*
14151
This function searchs the longest common leading whitespace
14152
of all lines in the [src, end).
14153
It returns the length of the common leading whitespace and sets `output` to
14154
point to the beginning of the common leading whitespace if length > 0.
14155
*/
14156
static Py_ssize_t
14157
search_longest_common_leading_whitespace(
14158
    const char *const src,
14159
    const char *const end,
14160
    const char **output)
14161
0
{
14162
    // [_start, _start + _len)
14163
    // describes the current longest common leading whitespace
14164
0
    const char *_start = NULL;
14165
0
    Py_ssize_t _len = 0;
14166
14167
0
    for (const char *iter = src; iter < end; ++iter) {
14168
0
        const char *line_start = iter;
14169
0
        const char *leading_whitespace_end = NULL;
14170
14171
        // scan the whole line
14172
0
        while (iter < end && *iter != '\n') {
14173
0
            if (!leading_whitespace_end && *iter != ' ' && *iter != '\t') {
14174
                /* `iter` points to the first non-whitespace character
14175
                   in this line */
14176
0
                if (iter == line_start) {
14177
                    // some line has no indent, fast exit!
14178
0
                    return 0;
14179
0
                }
14180
0
                leading_whitespace_end = iter;
14181
0
            }
14182
0
            ++iter;
14183
0
        }
14184
14185
        // if this line has all white space, skip it
14186
0
        if (!leading_whitespace_end) {
14187
0
            continue;
14188
0
        }
14189
14190
0
        if (!_start) {
14191
            // update the first leading whitespace
14192
0
            _start = line_start;
14193
0
            _len = leading_whitespace_end - line_start;
14194
0
            assert(_len > 0);
14195
0
        }
14196
0
        else {
14197
            /* We then compare with the current longest leading whitespace.
14198
14199
               [line_start, leading_whitespace_end) is the leading
14200
               whitespace of this line,
14201
14202
               [_start, _start + _len) is the leading whitespace of the
14203
               current longest leading whitespace. */
14204
0
            Py_ssize_t new_len = 0;
14205
0
            const char *_iter = _start, *line_iter = line_start;
14206
14207
0
            while (_iter < _start + _len && line_iter < leading_whitespace_end
14208
0
                   && *_iter == *line_iter)
14209
0
            {
14210
0
                ++_iter;
14211
0
                ++line_iter;
14212
0
                ++new_len;
14213
0
            }
14214
14215
0
            _len = new_len;
14216
0
            if (_len == 0) {
14217
                // No common things now, fast exit!
14218
0
                return 0;
14219
0
            }
14220
0
        }
14221
0
    }
14222
14223
0
    assert(_len >= 0);
14224
0
    if (_len > 0) {
14225
0
        *output = _start;
14226
0
    }
14227
0
    return _len;
14228
0
}
14229
14230
/* Dedent a string.
14231
   Behaviour is expected to be an exact match of `textwrap.dedent`.
14232
   Return a new reference on success, NULL with exception set on error.
14233
   */
14234
PyObject *
14235
_PyUnicode_Dedent(PyObject *unicode)
14236
0
{
14237
0
    Py_ssize_t src_len = 0;
14238
0
    const char *src = PyUnicode_AsUTF8AndSize(unicode, &src_len);
14239
0
    if (!src) {
14240
0
        return NULL;
14241
0
    }
14242
0
    assert(src_len >= 0);
14243
0
    if (src_len == 0) {
14244
0
        return Py_NewRef(unicode);
14245
0
    }
14246
14247
0
    const char *const end = src + src_len;
14248
14249
    // [whitespace_start, whitespace_start + whitespace_len)
14250
    // describes the current longest common leading whitespace
14251
0
    const char *whitespace_start = NULL;
14252
0
    Py_ssize_t whitespace_len = search_longest_common_leading_whitespace(
14253
0
        src, end, &whitespace_start);
14254
14255
0
    if (whitespace_len == 0) {
14256
0
        return Py_NewRef(unicode);
14257
0
    }
14258
14259
    // now we should trigger a dedent
14260
0
    char *dest = PyMem_Malloc(src_len);
14261
0
    if (!dest) {
14262
0
        PyErr_NoMemory();
14263
0
        return NULL;
14264
0
    }
14265
0
    char *dest_iter = dest;
14266
14267
0
    for (const char *iter = src; iter < end; ++iter) {
14268
0
        const char *line_start = iter;
14269
0
        bool in_leading_space = true;
14270
14271
        // iterate over a line to find the end of a line
14272
0
        while (iter < end && *iter != '\n') {
14273
0
            if (in_leading_space && *iter != ' ' && *iter != '\t') {
14274
0
                in_leading_space = false;
14275
0
            }
14276
0
            ++iter;
14277
0
        }
14278
14279
        // invariant: *iter == '\n' or iter == end
14280
0
        bool append_newline = iter < end;
14281
14282
        // if this line has all white space, write '\n' and continue
14283
0
        if (in_leading_space && append_newline) {
14284
0
            *dest_iter++ = '\n';
14285
0
            continue;
14286
0
        }
14287
14288
        /* copy [new_line_start + whitespace_len, iter) to buffer, then
14289
            conditionally append '\n' */
14290
14291
0
        Py_ssize_t new_line_len = iter - line_start - whitespace_len;
14292
0
        assert(new_line_len >= 0);
14293
0
        memcpy(dest_iter, line_start + whitespace_len, new_line_len);
14294
14295
0
        dest_iter += new_line_len;
14296
14297
0
        if (append_newline) {
14298
0
            *dest_iter++ = '\n';
14299
0
        }
14300
0
    }
14301
14302
0
    PyObject *res = PyUnicode_FromStringAndSize(dest, dest_iter - dest);
14303
0
    PyMem_Free(dest);
14304
0
    return res;
14305
0
}
14306
14307
/* Method table for the str type.

   Most entries are supplied by UNICODE_*_METHODDEF macros defined
   elsewhere (presumably the Argument Clinic generated header -- confirm).
   "format", "format_map" and "__getnewargs__" are spelled out by hand.
   The {NULL, NULL} entry terminates the table. */
static PyMethodDef unicode_methods[] = {
14308
    UNICODE_ENCODE_METHODDEF
14309
    UNICODE_REPLACE_METHODDEF
14310
    UNICODE_SPLIT_METHODDEF
14311
    UNICODE_RSPLIT_METHODDEF
14312
    UNICODE_JOIN_METHODDEF
14313
    UNICODE_CAPITALIZE_METHODDEF
14314
    UNICODE_CASEFOLD_METHODDEF
14315
    UNICODE_TITLE_METHODDEF
14316
    UNICODE_CENTER_METHODDEF
14317
    UNICODE_COUNT_METHODDEF
14318
    UNICODE_EXPANDTABS_METHODDEF
14319
    UNICODE_FIND_METHODDEF
14320
    UNICODE_PARTITION_METHODDEF
14321
    UNICODE_INDEX_METHODDEF
14322
    UNICODE_LJUST_METHODDEF
14323
    UNICODE_LOWER_METHODDEF
14324
    UNICODE_LSTRIP_METHODDEF
14325
    UNICODE_RFIND_METHODDEF
14326
    UNICODE_RINDEX_METHODDEF
14327
    UNICODE_RJUST_METHODDEF
14328
    UNICODE_RSTRIP_METHODDEF
14329
    UNICODE_RPARTITION_METHODDEF
14330
    UNICODE_SPLITLINES_METHODDEF
14331
    UNICODE_STRIP_METHODDEF
14332
    UNICODE_SWAPCASE_METHODDEF
14333
    UNICODE_TRANSLATE_METHODDEF
14334
    UNICODE_UPPER_METHODDEF
14335
    UNICODE_STARTSWITH_METHODDEF
14336
    UNICODE_ENDSWITH_METHODDEF
14337
    UNICODE_REMOVEPREFIX_METHODDEF
14338
    UNICODE_REMOVESUFFIX_METHODDEF
14339
    UNICODE_ISASCII_METHODDEF
14340
    UNICODE_ISLOWER_METHODDEF
14341
    UNICODE_ISUPPER_METHODDEF
14342
    UNICODE_ISTITLE_METHODDEF
14343
    UNICODE_ISSPACE_METHODDEF
14344
    UNICODE_ISDECIMAL_METHODDEF
14345
    UNICODE_ISDIGIT_METHODDEF
14346
    UNICODE_ISNUMERIC_METHODDEF
14347
    UNICODE_ISALPHA_METHODDEF
14348
    UNICODE_ISALNUM_METHODDEF
14349
    UNICODE_ISIDENTIFIER_METHODDEF
14350
    UNICODE_ISPRINTABLE_METHODDEF
14351
    UNICODE_ZFILL_METHODDEF
14352
    {"format", _PyCFunction_CAST(do_string_format), METH_VARARGS | METH_KEYWORDS, format__doc__},
14353
    {"format_map", do_string_format_map, METH_O, format_map__doc__},
14354
    UNICODE___FORMAT___METHODDEF
14355
    UNICODE_MAKETRANS_METHODDEF
14356
    UNICODE_SIZEOF_METHODDEF
14357
    {"__getnewargs__",  unicode_getnewargs, METH_NOARGS},
14358
    {NULL, NULL}
14359
};
14360
14361
/* nb_remainder slot: `v % w`.

   When the left operand is a str, the work is done by
   PyUnicode_Format(); otherwise NotImplemented is returned. */
static PyObject *
unicode_mod(PyObject *v, PyObject *w)
{
    if (PyUnicode_Check(v)) {
        return PyUnicode_Format(v, w);
    }
    Py_RETURN_NOTIMPLEMENTED;
}
14368
14369
static PyNumberMethods unicode_as_number = {
14370
    0,              /*nb_add*/
14371
    0,              /*nb_subtract*/
14372
    0,              /*nb_multiply*/
14373
    unicode_mod,            /*nb_remainder*/
14374
};
14375
14376
static PySequenceMethods unicode_as_sequence = {
14377
    unicode_length,     /* sq_length */
14378
    PyUnicode_Concat,   /* sq_concat */
14379
    unicode_repeat,     /* sq_repeat */
14380
    unicode_getitem,    /* sq_item */
14381
    0,                  /* sq_slice */
14382
    0,                  /* sq_ass_item */
14383
    0,                  /* sq_ass_slice */
14384
    PyUnicode_Contains, /* sq_contains */
14385
};
14386
14387
/* mp_subscript slot: self[item].

   - Index-like `item`: a negative index has the length added once, then
     unicode_getitem() produces the result.
   - Slice: an empty result, the unchanged string, or a contiguous
     substring are handled by dedicated helpers; any other step builds a
     new string character by character.
   - Anything else raises TypeError.

   Returns a new reference, or NULL with an exception set. */
static PyObject*
unicode_subscript(PyObject* self, PyObject* item)
{
    if (_PyIndex_Check(item)) {
        Py_ssize_t index = PyNumber_AsSsize_t(item, PyExc_IndexError);
        if (index == -1 && PyErr_Occurred()) {
            return NULL;
        }
        if (index < 0) {
            index += PyUnicode_GET_LENGTH(self);
        }
        return unicode_getitem(self, index);
    }

    if (!PySlice_Check(item)) {
        PyErr_Format(PyExc_TypeError, "string indices must be integers, not '%.200s'",
                     Py_TYPE(item)->tp_name);
        return NULL;
    }

    Py_ssize_t start, stop, step;
    if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
        return NULL;
    }
    const Py_ssize_t count = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
                                                   &start, &stop, step);

    if (count <= 0) {
        _Py_RETURN_UNICODE_EMPTY();
    }
    if (step == 1) {
        if (start == 0 && count == PyUnicode_GET_LENGTH(self)) {
            return unicode_result_unchanged(self);
        }
        return PyUnicode_Substring(self,
                                   start, start + count);
    }

    /* General case */
    const int src_kind = PyUnicode_KIND(self);
    const void *src_data = PyUnicode_DATA(self);

    /* First pass: find the largest selected character so the result gets
       the narrowest storage that fits. */
    Py_UCS4 max_char;
    if (PyUnicode_IS_ASCII(self)) {
        max_char = 127;
    }
    else {
        const Py_UCS4 kind_limit = kind_maxchar_limit(src_kind);
        size_t pos = start;

        max_char = 0;
        for (Py_ssize_t k = 0; k < count; k++, pos += step) {
            Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, pos);
            if (ch > max_char) {
                max_char = ch;
                /* Reaching the limit for this kind means the scan cannot
                   change the outcome any further. */
                if (max_char >= kind_limit) {
                    break;
                }
            }
        }
    }

    PyObject *result = PyUnicode_New(count, max_char);
    if (result == NULL) {
        return NULL;
    }
    const int dest_kind = PyUnicode_KIND(result);
    void *dest_data = PyUnicode_DATA(result);

    /* Second pass: copy the selected characters. */
    size_t pos = start;
    for (Py_ssize_t k = 0; k < count; k++, pos += step) {
        Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, pos);
        PyUnicode_WRITE(dest_kind, dest_data, k, ch);
    }
    assert(_PyUnicode_CheckConsistency(result, 1));
    return result;
}
14456
14457
static PyMappingMethods unicode_as_mapping = {
14458
    unicode_length,     /* mp_length */
14459
    unicode_subscript,  /* mp_subscript */
14460
    0,                  /* mp_ass_subscript */
14461
};
14462
14463
14464
/* Helpers for PyUnicode_Format() */
14465
14466
struct unicode_formatter_t {
14467
    PyObject *args;
14468
    int args_owned;
14469
    Py_ssize_t arglen, argidx;
14470
    PyObject *dict;
14471
14472
    int fmtkind;
14473
    Py_ssize_t fmtcnt, fmtpos;
14474
    const void *fmtdata;
14475
    PyObject *fmtstr;
14476
14477
    _PyUnicodeWriter writer;
14478
};
14479
14480
struct unicode_format_arg_t {
14481
    Py_UCS4 ch;
14482
    int flags;
14483
    Py_ssize_t width;
14484
    int prec;
14485
    int sign;
14486
};
14487
14488
static PyObject *
14489
unicode_format_getnextarg(struct unicode_formatter_t *ctx)
14490
2.63M
{
14491
2.63M
    Py_ssize_t argidx = ctx->argidx;
14492
14493
2.63M
    if (argidx < ctx->arglen) {
14494
2.63M
        ctx->argidx++;
14495
2.63M
        if (ctx->arglen < 0)
14496
170k
            return ctx->args;
14497
2.46M
        else
14498
2.46M
            return PyTuple_GetItem(ctx->args, argidx);
14499
2.63M
    }
14500
0
    PyErr_SetString(PyExc_TypeError,
14501
0
                    "not enough arguments for format string");
14502
0
    return NULL;
14503
2.63M
}
14504
14505
/* Returns a new reference to a PyUnicode object, or NULL on failure. */
14506
14507
/* Format a float into the writer if the writer is not NULL, or into *p_output
14508
   otherwise.
14509
14510
   Return 0 on success, raise an exception and return -1 on error. */
14511
static int
14512
formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14513
            PyObject **p_output,
14514
            _PyUnicodeWriter *writer)
14515
0
{
14516
0
    char *p;
14517
0
    double x;
14518
0
    Py_ssize_t len;
14519
0
    int prec;
14520
0
    int dtoa_flags = 0;
14521
14522
0
    x = PyFloat_AsDouble(v);
14523
0
    if (x == -1.0 && PyErr_Occurred())
14524
0
        return -1;
14525
14526
0
    prec = arg->prec;
14527
0
    if (prec < 0)
14528
0
        prec = 6;
14529
14530
0
    if (arg->flags & F_ALT)
14531
0
        dtoa_flags |= Py_DTSF_ALT;
14532
0
    p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
14533
0
    if (p == NULL)
14534
0
        return -1;
14535
0
    len = strlen(p);
14536
0
    if (writer) {
14537
0
        if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
14538
0
            PyMem_Free(p);
14539
0
            return -1;
14540
0
        }
14541
0
    }
14542
0
    else
14543
0
        *p_output = _PyUnicode_FromASCII(p, len);
14544
0
    PyMem_Free(p);
14545
0
    return 0;
14546
0
}
14547
14548
/* formatlong() emulates the format codes d, u, o, x and X, and
14549
 * the F_ALT flag, for Python's long (unbounded) ints.  It's not used for
14550
 * Python's regular ints.
14551
 * Return value:  a new PyUnicodeObject*, or NULL if error.
14552
 *     The output string is of the form
14553
 *         "-"? ("0x" | "0X")? digit+
14554
 *     "0x"/"0X" are present only for x and X conversions, with F_ALT
14555
 *         set in flags.  The case of hex digits will be correct,
14556
 *     There will be at least prec digits, zero-filled on the left if
14557
 *         necessary to get that many.
14558
 * val          object to be converted
14559
 * flags        bitmask of format flags; only F_ALT is looked at
14560
 * prec         minimum number of digits; 0-fill on left if needed
14561
 * type         a character in [duoxX]; u acts the same as d
14562
 *
14563
 * CAUTION:  o, x and X conversions on regular ints can never
14564
 * produce a '-' sign, but can for Python's unbounded ints.
14565
 */
14566
PyObject *
_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
{
    /* Avoid exceeding SSIZE_T_MAX */
    if (prec > INT_MAX-3) {
        PyErr_SetString(PyExc_OverflowError,
                        "precision too large");
        return NULL;
    }

    assert(PyLong_Check(val));

    /* Pick the base and the width of the base marker ("0o"/"0x") that
       PyNumber_ToBase() will emit for it. */
    int base;
    int numnondigits = 0;
    switch (type) {
    default:
        Py_UNREACHABLE();
    case 'd':
    case 'i':
    case 'u':
        /* int and int subclasses should print numerically when a numeric */
        /* format code is used (see issue18780) */
        base = 10;
        break;
    case 'o':
        base = 8;
        numnondigits = 2;
        break;
    case 'x':
    case 'X':
        base = 16;
        numnondigits = 2;
        break;
    }

    PyObject *result = PyNumber_ToBase(val, base);
    if (!result) {
        return NULL;
    }

    assert(unicode_modifiable(result));
    assert(PyUnicode_IS_ASCII(result));

    /* To modify the string in-place, there can only be one reference. */
    if (!_PyObject_IsUniquelyReferenced(result)) {
        Py_DECREF(result);
        PyErr_BadInternalCall();
        return NULL;
    }

    char *buf = PyUnicode_DATA(result);
    Py_ssize_t llen = PyUnicode_GET_LENGTH(result);
    if (llen > INT_MAX) {
        Py_DECREF(result);
        PyErr_SetString(PyExc_ValueError,
                        "string too large in _PyUnicode_FormatLong");
        return NULL;
    }

    int len = (int)llen;                /* number of characters */
    int sign = buf[0] == '-';           /* 1 if '-', else 0 */
    numnondigits += sign;
    int numdigits = len - numnondigits; /* len == numnondigits + numdigits */
    assert(numdigits > 0);

    /* Get rid of base marker unless F_ALT */
    const int has_marker = (type == 'o' || type == 'x' || type == 'X');
    if (alt == 0 && has_marker) {
        assert(buf[sign] == '0');
        assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
               buf[sign+1] == 'o');
        numnondigits -= 2;
        buf += 2;
        len -= 2;
        /* The sign sat in front of the marker; put it back in front of
           the digits. */
        if (sign) {
            buf[0] = '-';
        }
        assert(len == numnondigits + numdigits);
        assert(numdigits > 0);
    }

    /* Fill with leading zeroes to meet minimum width. */
    if (prec > numdigits) {
        PyObject *padded = PyBytes_FromStringAndSize(NULL,
                                numnondigits + prec);
        if (!padded) {
            Py_DECREF(result);
            return NULL;
        }

        /* Layout: <non-digit prefix> <zero fill> <digits> NUL */
        char *out = PyBytes_AS_STRING(padded);
        const int fill = prec - numdigits;
        memcpy(out, buf, numnondigits);
        memset(out + numnondigits, '0', fill);
        memcpy(out + numnondigits + fill, buf + numnondigits, numdigits);
        out[numnondigits + prec] = '\0';

        Py_SETREF(result, padded);
        buf = PyBytes_AS_STRING(result);
        len = numnondigits + prec;
    }

    /* Fix up case for hex conversions. */
    if (type == 'X') {
        /* Need to convert all lower case letters to upper case.
           and need to convert 0x to 0X (and -0x to -0X). */
        for (Py_ssize_t i = 0; i < len; i++) {
            if (buf[i] >= 'a' && buf[i] <= 'x') {
                buf[i] -= 'a'-'A';
            }
        }
    }

    if (!PyUnicode_Check(result)
        || buf != PyUnicode_DATA(result)) {
        /* The text no longer starts at the beginning of a str object
           (padded bytes buffer, or marker skipped): build a new str. */
        PyObject *unicode = _PyUnicode_FromASCII(buf, len);
        Py_SETREF(result, unicode);
    }
    else if (len != PyUnicode_GET_LENGTH(result)) {
        if (PyUnicode_Resize(&result, len) < 0) {
            Py_CLEAR(result);
        }
    }
    return result;
}
14690
14691
/* Format an integer or a float as an integer.
14692
 * Return 1 if the number has been formatted into the writer,
14693
 *        0 if the number has been formatted into *p_output
14694
 *       -1 and raise an exception on error */
14695
static int
14696
mainformatlong(PyObject *v,
14697
               struct unicode_format_arg_t *arg,
14698
               PyObject **p_output,
14699
               _PyUnicodeWriter *writer)
14700
1.41M
{
14701
1.41M
    PyObject *iobj, *res;
14702
1.41M
    char type = (char)arg->ch;
14703
14704
1.41M
    if (!PyNumber_Check(v))
14705
0
        goto wrongtype;
14706
14707
    /* make sure number is a type of integer for o, x, and X */
14708
1.41M
    if (!PyLong_Check(v)) {
14709
0
        if (type == 'o' || type == 'x' || type == 'X') {
14710
0
            iobj = _PyNumber_Index(v);
14711
0
        }
14712
0
        else {
14713
0
            iobj = PyNumber_Long(v);
14714
0
        }
14715
0
        if (iobj == NULL ) {
14716
0
            if (PyErr_ExceptionMatches(PyExc_TypeError))
14717
0
                goto wrongtype;
14718
0
            return -1;
14719
0
        }
14720
0
        assert(PyLong_Check(iobj));
14721
0
    }
14722
1.41M
    else {
14723
1.41M
        iobj = Py_NewRef(v);
14724
1.41M
    }
14725
14726
1.41M
    if (PyLong_CheckExact(v)
14727
1.41M
        && arg->width == -1 && arg->prec == -1
14728
1.41M
        && !(arg->flags & (F_SIGN | F_BLANK))
14729
1.41M
        && type != 'X')
14730
1.41M
    {
14731
        /* Fast path */
14732
1.41M
        int alternate = arg->flags & F_ALT;
14733
1.41M
        int base;
14734
14735
1.41M
        switch(type)
14736
1.41M
        {
14737
0
            default:
14738
0
                Py_UNREACHABLE();
14739
1.41M
            case 'd':
14740
1.41M
            case 'i':
14741
1.41M
            case 'u':
14742
1.41M
                base = 10;
14743
1.41M
                break;
14744
0
            case 'o':
14745
0
                base = 8;
14746
0
                break;
14747
0
            case 'x':
14748
0
            case 'X':
14749
0
                base = 16;
14750
0
                break;
14751
1.41M
        }
14752
14753
1.41M
        if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14754
0
            Py_DECREF(iobj);
14755
0
            return -1;
14756
0
        }
14757
1.41M
        Py_DECREF(iobj);
14758
1.41M
        return 1;
14759
1.41M
    }
14760
14761
0
    res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
14762
0
    Py_DECREF(iobj);
14763
0
    if (res == NULL)
14764
0
        return -1;
14765
0
    *p_output = res;
14766
0
    return 0;
14767
14768
0
wrongtype:
14769
0
    switch(type)
14770
0
    {
14771
0
        case 'o':
14772
0
        case 'x':
14773
0
        case 'X':
14774
0
            PyErr_Format(PyExc_TypeError,
14775
0
                    "%%%c format: an integer is required, "
14776
0
                    "not %.200s",
14777
0
                    type, Py_TYPE(v)->tp_name);
14778
0
            break;
14779
0
        default:
14780
0
            PyErr_Format(PyExc_TypeError,
14781
0
                    "%%%c format: a real number is required, "
14782
0
                    "not %.200s",
14783
0
                    type, Py_TYPE(v)->tp_name);
14784
0
            break;
14785
0
    }
14786
0
    return -1;
14787
0
}
14788
14789
static Py_UCS4
14790
formatchar(PyObject *v)
14791
0
{
14792
    /* presume that the buffer is at least 3 characters long */
14793
0
    if (PyUnicode_Check(v)) {
14794
0
        if (PyUnicode_GET_LENGTH(v) == 1) {
14795
0
            return PyUnicode_READ_CHAR(v, 0);
14796
0
        }
14797
0
        PyErr_Format(PyExc_TypeError,
14798
0
                     "%%c requires an int or a unicode character, "
14799
0
                     "not a string of length %zd",
14800
0
                     PyUnicode_GET_LENGTH(v));
14801
0
        return (Py_UCS4) -1;
14802
0
    }
14803
0
    else {
14804
0
        int overflow;
14805
0
        long x = PyLong_AsLongAndOverflow(v, &overflow);
14806
0
        if (x == -1 && PyErr_Occurred()) {
14807
0
            if (PyErr_ExceptionMatches(PyExc_TypeError)) {
14808
0
                PyErr_Format(PyExc_TypeError,
14809
0
                             "%%c requires an int or a unicode character, not %T",
14810
0
                             v);
14811
0
                return (Py_UCS4) -1;
14812
0
            }
14813
0
            return (Py_UCS4) -1;
14814
0
        }
14815
14816
0
        if (x < 0 || x > MAX_UNICODE) {
14817
            /* this includes an overflow in converting to C long */
14818
0
            PyErr_SetString(PyExc_OverflowError,
14819
0
                            "%c arg not in range(0x110000)");
14820
0
            return (Py_UCS4) -1;
14821
0
        }
14822
14823
0
        return (Py_UCS4) x;
14824
0
    }
14825
0
}
14826
14827
/* Parse options of an argument: flags, width, precision.
14828
   Handle also "%(name)" syntax.
14829
14830
   Return 0 if the argument has been formatted into arg->str.
14831
   Return 1 if the argument has been written into ctx->writer,
14832
   Raise an exception and return -1 on error. */
14833
static int
14834
unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14835
                         struct unicode_format_arg_t *arg)
14836
2.63M
{
14837
2.63M
#define FORMAT_READ(ctx) \
14838
2.63M
        PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14839
14840
2.63M
    PyObject *v;
14841
14842
2.63M
    if (arg->ch == '(') {
14843
        /* Get argument value from a dictionary. Example: "%(name)s". */
14844
0
        Py_ssize_t keystart;
14845
0
        Py_ssize_t keylen;
14846
0
        PyObject *key;
14847
0
        int pcount = 1;
14848
14849
0
        if (ctx->dict == NULL) {
14850
0
            PyErr_SetString(PyExc_TypeError,
14851
0
                            "format requires a mapping");
14852
0
            return -1;
14853
0
        }
14854
0
        ++ctx->fmtpos;
14855
0
        --ctx->fmtcnt;
14856
0
        keystart = ctx->fmtpos;
14857
        /* Skip over balanced parentheses */
14858
0
        while (pcount > 0 && --ctx->fmtcnt >= 0) {
14859
0
            arg->ch = FORMAT_READ(ctx);
14860
0
            if (arg->ch == ')')
14861
0
                --pcount;
14862
0
            else if (arg->ch == '(')
14863
0
                ++pcount;
14864
0
            ctx->fmtpos++;
14865
0
        }
14866
0
        keylen = ctx->fmtpos - keystart - 1;
14867
0
        if (ctx->fmtcnt < 0 || pcount > 0) {
14868
0
            PyErr_SetString(PyExc_ValueError,
14869
0
                            "incomplete format key");
14870
0
            return -1;
14871
0
        }
14872
0
        key = PyUnicode_Substring(ctx->fmtstr,
14873
0
                                  keystart, keystart + keylen);
14874
0
        if (key == NULL)
14875
0
            return -1;
14876
0
        if (ctx->args_owned) {
14877
0
            ctx->args_owned = 0;
14878
0
            Py_DECREF(ctx->args);
14879
0
        }
14880
0
        ctx->args = PyObject_GetItem(ctx->dict, key);
14881
0
        Py_DECREF(key);
14882
0
        if (ctx->args == NULL)
14883
0
            return -1;
14884
0
        ctx->args_owned = 1;
14885
0
        ctx->arglen = -1;
14886
0
        ctx->argidx = -2;
14887
0
    }
14888
14889
    /* Parse flags. Example: "%+i" => flags=F_SIGN. */
14890
2.63M
    while (--ctx->fmtcnt >= 0) {
14891
2.63M
        arg->ch = FORMAT_READ(ctx);
14892
2.63M
        ctx->fmtpos++;
14893
2.63M
        switch (arg->ch) {
14894
0
        case '-': arg->flags |= F_LJUST; continue;
14895
0
        case '+': arg->flags |= F_SIGN; continue;
14896
0
        case ' ': arg->flags |= F_BLANK; continue;
14897
0
        case '#': arg->flags |= F_ALT; continue;
14898
0
        case '0': arg->flags |= F_ZERO; continue;
14899
2.63M
        }
14900
2.63M
        break;
14901
2.63M
    }
14902
14903
    /* Parse width. Example: "%10s" => width=10 */
14904
2.63M
    if (arg->ch == '*') {
14905
0
        v = unicode_format_getnextarg(ctx);
14906
0
        if (v == NULL)
14907
0
            return -1;
14908
0
        if (!PyLong_Check(v)) {
14909
0
            PyErr_SetString(PyExc_TypeError,
14910
0
                            "* wants int");
14911
0
            return -1;
14912
0
        }
14913
0
        arg->width = PyLong_AsSsize_t(v);
14914
0
        if (arg->width == -1 && PyErr_Occurred())
14915
0
            return -1;
14916
0
        if (arg->width < 0) {
14917
0
            arg->flags |= F_LJUST;
14918
0
            arg->width = -arg->width;
14919
0
        }
14920
0
        if (--ctx->fmtcnt >= 0) {
14921
0
            arg->ch = FORMAT_READ(ctx);
14922
0
            ctx->fmtpos++;
14923
0
        }
14924
0
    }
14925
2.63M
    else if (arg->ch >= '0' && arg->ch <= '9') {
14926
0
        arg->width = arg->ch - '0';
14927
0
        while (--ctx->fmtcnt >= 0) {
14928
0
            arg->ch = FORMAT_READ(ctx);
14929
0
            ctx->fmtpos++;
14930
0
            if (arg->ch < '0' || arg->ch > '9')
14931
0
                break;
14932
            /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14933
               mixing signed and unsigned comparison. Since arg->ch is between
14934
               '0' and '9', casting to int is safe. */
14935
0
            if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14936
0
                PyErr_SetString(PyExc_ValueError,
14937
0
                                "width too big");
14938
0
                return -1;
14939
0
            }
14940
0
            arg->width = arg->width*10 + (arg->ch - '0');
14941
0
        }
14942
0
    }
14943
14944
    /* Parse precision. Example: "%.3f" => prec=3 */
14945
2.63M
    if (arg->ch == '.') {
14946
0
        arg->prec = 0;
14947
0
        if (--ctx->fmtcnt >= 0) {
14948
0
            arg->ch = FORMAT_READ(ctx);
14949
0
            ctx->fmtpos++;
14950
0
        }
14951
0
        if (arg->ch == '*') {
14952
0
            v = unicode_format_getnextarg(ctx);
14953
0
            if (v == NULL)
14954
0
                return -1;
14955
0
            if (!PyLong_Check(v)) {
14956
0
                PyErr_SetString(PyExc_TypeError,
14957
0
                                "* wants int");
14958
0
                return -1;
14959
0
            }
14960
0
            arg->prec = PyLong_AsInt(v);
14961
0
            if (arg->prec == -1 && PyErr_Occurred())
14962
0
                return -1;
14963
0
            if (arg->prec < 0)
14964
0
                arg->prec = 0;
14965
0
            if (--ctx->fmtcnt >= 0) {
14966
0
                arg->ch = FORMAT_READ(ctx);
14967
0
                ctx->fmtpos++;
14968
0
            }
14969
0
        }
14970
0
        else if (arg->ch >= '0' && arg->ch <= '9') {
14971
0
            arg->prec = arg->ch - '0';
14972
0
            while (--ctx->fmtcnt >= 0) {
14973
0
                arg->ch = FORMAT_READ(ctx);
14974
0
                ctx->fmtpos++;
14975
0
                if (arg->ch < '0' || arg->ch > '9')
14976
0
                    break;
14977
0
                if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14978
0
                    PyErr_SetString(PyExc_ValueError,
14979
0
                                    "precision too big");
14980
0
                    return -1;
14981
0
                }
14982
0
                arg->prec = arg->prec*10 + (arg->ch - '0');
14983
0
            }
14984
0
        }
14985
0
    }
14986
14987
    /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14988
2.63M
    if (ctx->fmtcnt >= 0) {
14989
2.63M
        if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14990
0
            if (--ctx->fmtcnt >= 0) {
14991
0
                arg->ch = FORMAT_READ(ctx);
14992
0
                ctx->fmtpos++;
14993
0
            }
14994
0
        }
14995
2.63M
    }
14996
2.63M
    if (ctx->fmtcnt < 0) {
14997
0
        PyErr_SetString(PyExc_ValueError,
14998
0
                        "incomplete format");
14999
0
        return -1;
15000
0
    }
15001
2.63M
    return 0;
15002
15003
2.63M
#undef FORMAT_READ
15004
2.63M
}
15005
15006
/* Format one argument. Supported conversion specifiers:
15007
15008
   - "s", "r", "a": any type
15009
   - "i", "d", "u": int or float
15010
   - "o", "x", "X": int
15011
   - "e", "E", "f", "F", "g", "G": float
15012
   - "c": int or str (1 character)
15013
15014
   When possible, the output is written directly into the Unicode writer
15015
   (ctx->writer). A string is created when padding is required.
15016
15017
   Return 0 if the argument has been formatted into *p_str,
15018
          1 if the argument has been written into ctx->writer,
15019
         -1 on error. */
15020
static int
15021
unicode_format_arg_format(struct unicode_formatter_t *ctx,
15022
                          struct unicode_format_arg_t *arg,
15023
                          PyObject **p_str)
15024
2.63M
{
15025
2.63M
    PyObject *v;
15026
2.63M
    _PyUnicodeWriter *writer = &ctx->writer;
15027
15028
2.63M
    if (ctx->fmtcnt == 0)
15029
1.39M
        ctx->writer.overallocate = 0;
15030
15031
2.63M
    v = unicode_format_getnextarg(ctx);
15032
2.63M
    if (v == NULL)
15033
0
        return -1;
15034
15035
15036
2.63M
    switch (arg->ch) {
15037
1.22M
    case 's':
15038
1.22M
    case 'r':
15039
1.22M
    case 'a':
15040
1.22M
        if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
15041
            /* Fast path */
15042
0
            if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
15043
0
                return -1;
15044
0
            return 1;
15045
0
        }
15046
15047
1.22M
        if (PyUnicode_CheckExact(v) && arg->ch == 's') {
15048
1.22M
            *p_str = Py_NewRef(v);
15049
1.22M
        }
15050
419
        else {
15051
419
            if (arg->ch == 's')
15052
0
                *p_str = PyObject_Str(v);
15053
419
            else if (arg->ch == 'r')
15054
129
                *p_str = PyObject_Repr(v);
15055
290
            else
15056
290
                *p_str = PyObject_ASCII(v);
15057
419
        }
15058
1.22M
        break;
15059
15060
0
    case 'i':
15061
1.41M
    case 'd':
15062
1.41M
    case 'u':
15063
1.41M
    case 'o':
15064
1.41M
    case 'x':
15065
1.41M
    case 'X':
15066
1.41M
    {
15067
1.41M
        int ret = mainformatlong(v, arg, p_str, writer);
15068
1.41M
        if (ret != 0)
15069
1.41M
            return ret;
15070
0
        arg->sign = 1;
15071
0
        break;
15072
1.41M
    }
15073
15074
0
    case 'e':
15075
0
    case 'E':
15076
0
    case 'f':
15077
0
    case 'F':
15078
0
    case 'g':
15079
0
    case 'G':
15080
0
        if (arg->width == -1 && arg->prec == -1
15081
0
            && !(arg->flags & (F_SIGN | F_BLANK)))
15082
0
        {
15083
            /* Fast path */
15084
0
            if (formatfloat(v, arg, NULL, writer) == -1)
15085
0
                return -1;
15086
0
            return 1;
15087
0
        }
15088
15089
0
        arg->sign = 1;
15090
0
        if (formatfloat(v, arg, p_str, NULL) == -1)
15091
0
            return -1;
15092
0
        break;
15093
15094
0
    case 'c':
15095
0
    {
15096
0
        Py_UCS4 ch = formatchar(v);
15097
0
        if (ch == (Py_UCS4) -1)
15098
0
            return -1;
15099
0
        if (arg->width == -1 && arg->prec == -1) {
15100
            /* Fast path */
15101
0
            if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
15102
0
                return -1;
15103
0
            return 1;
15104
0
        }
15105
0
        *p_str = PyUnicode_FromOrdinal(ch);
15106
0
        break;
15107
0
    }
15108
15109
0
    default:
15110
0
        PyErr_Format(PyExc_ValueError,
15111
0
                     "unsupported format character '%c' (0x%x) "
15112
0
                     "at index %zd",
15113
0
                     (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
15114
0
                     (int)arg->ch,
15115
0
                     ctx->fmtpos - 1);
15116
0
        return -1;
15117
2.63M
    }
15118
1.22M
    if (*p_str == NULL)
15119
0
        return -1;
15120
1.22M
    assert (PyUnicode_Check(*p_str));
15121
1.22M
    return 0;
15122
1.22M
}
15123
15124
static int
15125
unicode_format_arg_output(struct unicode_formatter_t *ctx,
15126
                          struct unicode_format_arg_t *arg,
15127
                          PyObject *str)
15128
1.22M
{
15129
1.22M
    Py_ssize_t len;
15130
1.22M
    int kind;
15131
1.22M
    const void *pbuf;
15132
1.22M
    Py_ssize_t pindex;
15133
1.22M
    Py_UCS4 signchar;
15134
1.22M
    Py_ssize_t buflen;
15135
1.22M
    Py_UCS4 maxchar;
15136
1.22M
    Py_ssize_t sublen;
15137
1.22M
    _PyUnicodeWriter *writer = &ctx->writer;
15138
1.22M
    Py_UCS4 fill;
15139
15140
1.22M
    fill = ' ';
15141
1.22M
    if (arg->sign && arg->flags & F_ZERO)
15142
0
        fill = '0';
15143
15144
1.22M
    len = PyUnicode_GET_LENGTH(str);
15145
1.22M
    if ((arg->width == -1 || arg->width <= len)
15146
1.22M
        && (arg->prec == -1 || arg->prec >= len)
15147
1.22M
        && !(arg->flags & (F_SIGN | F_BLANK)))
15148
1.22M
    {
15149
        /* Fast path */
15150
1.22M
        if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
15151
0
            return -1;
15152
1.22M
        return 0;
15153
1.22M
    }
15154
15155
    /* Truncate the string for "s", "r" and "a" formats
15156
       if the precision is set */
15157
0
    if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
15158
0
        if (arg->prec >= 0 && len > arg->prec)
15159
0
            len = arg->prec;
15160
0
    }
15161
15162
    /* Adjust sign and width */
15163
0
    kind = PyUnicode_KIND(str);
15164
0
    pbuf = PyUnicode_DATA(str);
15165
0
    pindex = 0;
15166
0
    signchar = '\0';
15167
0
    if (arg->sign) {
15168
0
        Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
15169
0
        if (ch == '-' || ch == '+') {
15170
0
            signchar = ch;
15171
0
            len--;
15172
0
            pindex++;
15173
0
        }
15174
0
        else if (arg->flags & F_SIGN)
15175
0
            signchar = '+';
15176
0
        else if (arg->flags & F_BLANK)
15177
0
            signchar = ' ';
15178
0
        else
15179
0
            arg->sign = 0;
15180
0
    }
15181
0
    if (arg->width < len)
15182
0
        arg->width = len;
15183
15184
    /* Prepare the writer */
15185
0
    maxchar = writer->maxchar;
15186
0
    if (!(arg->flags & F_LJUST)) {
15187
0
        if (arg->sign) {
15188
0
            if ((arg->width-1) > len)
15189
0
                maxchar = Py_MAX(maxchar, fill);
15190
0
        }
15191
0
        else {
15192
0
            if (arg->width > len)
15193
0
                maxchar = Py_MAX(maxchar, fill);
15194
0
        }
15195
0
    }
15196
0
    if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
15197
0
        Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
15198
0
        maxchar = Py_MAX(maxchar, strmaxchar);
15199
0
    }
15200
15201
0
    buflen = arg->width;
15202
0
    if (arg->sign && len == arg->width)
15203
0
        buflen++;
15204
0
    if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
15205
0
        return -1;
15206
15207
    /* Write the sign if needed */
15208
0
    if (arg->sign) {
15209
0
        if (fill != ' ') {
15210
0
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15211
0
            writer->pos += 1;
15212
0
        }
15213
0
        if (arg->width > len)
15214
0
            arg->width--;
15215
0
    }
15216
15217
    /* Write the numeric prefix for "x", "X" and "o" formats
15218
       if the alternate form is used.
15219
       For example, write "0x" for the "%#x" format. */
15220
0
    if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15221
0
        assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15222
0
        assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
15223
0
        if (fill != ' ') {
15224
0
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15225
0
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15226
0
            writer->pos += 2;
15227
0
            pindex += 2;
15228
0
        }
15229
0
        arg->width -= 2;
15230
0
        if (arg->width < 0)
15231
0
            arg->width = 0;
15232
0
        len -= 2;
15233
0
    }
15234
15235
    /* Pad left with the fill character if needed */
15236
0
    if (arg->width > len && !(arg->flags & F_LJUST)) {
15237
0
        sublen = arg->width - len;
15238
0
        _PyUnicode_Fill(writer->kind, writer->data, fill, writer->pos, sublen);
15239
0
        writer->pos += sublen;
15240
0
        arg->width = len;
15241
0
    }
15242
15243
    /* If padding with spaces: write sign if needed and/or numeric prefix if
15244
       the alternate form is used */
15245
0
    if (fill == ' ') {
15246
0
        if (arg->sign) {
15247
0
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15248
0
            writer->pos += 1;
15249
0
        }
15250
0
        if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15251
0
            assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15252
0
            assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
15253
0
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15254
0
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15255
0
            writer->pos += 2;
15256
0
            pindex += 2;
15257
0
        }
15258
0
    }
15259
15260
    /* Write characters */
15261
0
    if (len) {
15262
0
        _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
15263
0
                                      str, pindex, len);
15264
0
        writer->pos += len;
15265
0
    }
15266
15267
    /* Pad right with the fill character if needed */
15268
0
    if (arg->width > len) {
15269
0
        sublen = arg->width - len;
15270
0
        _PyUnicode_Fill(writer->kind, writer->data, ' ', writer->pos, sublen);
15271
0
        writer->pos += sublen;
15272
0
    }
15273
0
    return 0;
15274
0
}
15275
15276
/* Helper of PyUnicode_Format(): format one arg.
15277
   Return 0 on success, raise an exception and return -1 on error. */
15278
static int
15279
unicode_format_arg(struct unicode_formatter_t *ctx)
15280
2.63M
{
15281
2.63M
    struct unicode_format_arg_t arg;
15282
2.63M
    PyObject *str;
15283
2.63M
    int ret;
15284
15285
2.63M
    arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
15286
2.63M
    if (arg.ch == '%') {
15287
0
        ctx->fmtpos++;
15288
0
        ctx->fmtcnt--;
15289
0
        if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
15290
0
            return -1;
15291
0
        return 0;
15292
0
    }
15293
2.63M
    arg.flags = 0;
15294
2.63M
    arg.width = -1;
15295
2.63M
    arg.prec = -1;
15296
2.63M
    arg.sign = 0;
15297
2.63M
    str = NULL;
15298
15299
2.63M
    ret = unicode_format_arg_parse(ctx, &arg);
15300
2.63M
    if (ret == -1)
15301
0
        return -1;
15302
15303
2.63M
    ret = unicode_format_arg_format(ctx, &arg, &str);
15304
2.63M
    if (ret == -1)
15305
0
        return -1;
15306
15307
2.63M
    if (ret != 1) {
15308
1.22M
        ret = unicode_format_arg_output(ctx, &arg, str);
15309
1.22M
        Py_DECREF(str);
15310
1.22M
        if (ret == -1)
15311
0
            return -1;
15312
1.22M
    }
15313
15314
2.63M
    if (ctx->dict && (ctx->argidx < ctx->arglen)) {
15315
0
        PyErr_SetString(PyExc_TypeError,
15316
0
                        "not all arguments converted during string formatting");
15317
0
        return -1;
15318
0
    }
15319
2.63M
    return 0;
15320
2.63M
}
15321
15322
/* Build a new string from a printf-style format string and arguments.

   format is the format object (checked with ensure_unicode()); args is a
   tuple of arguments, a mapping (for "%(name)s" specifiers) or a single
   value.

   Return the formatted string, or NULL with an exception set on error. */
PyObject *
PyUnicode_Format(PyObject *format, PyObject *args)
{
    struct unicode_formatter_t ctx;

    if (format == NULL || args == NULL) {
        PyErr_BadInternalCall();
        return NULL;
    }
    if (ensure_unicode(format) < 0) {
        return NULL;
    }

    /* Format string state: fmtcnt counts the characters left to read */
    ctx.fmtstr = format;
    ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
    ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
    ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
    ctx.fmtpos = 0;

    _PyUnicodeWriter_Init(&ctx.writer);
    ctx.writer.min_length = ctx.fmtcnt + 100;
    ctx.writer.overallocate = 1;

    /* Argument state */
    const int args_is_tuple = PyTuple_Check(args);
    if (args_is_tuple) {
        ctx.arglen = PyTuple_Size(args);
        ctx.argidx = 0;
    }
    else {
        ctx.arglen = -1;
        ctx.argidx = -2;
    }
    ctx.args_owned = 0;
    ctx.dict = (PyMapping_Check(args) && !args_is_tuple
                && !PyUnicode_Check(args)) ? args : NULL;
    ctx.args = args;

    while (--ctx.fmtcnt >= 0) {
        if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) == '%') {
            /* Format specifier: skip the '%' and process it */
            ctx.fmtpos++;
            if (unicode_format_arg(&ctx) == -1) {
                goto onError;
            }
            continue;
        }

        /* Literal text: copy everything up to the next '%' or the end */
        Py_ssize_t literal_start = ctx.fmtpos++;
        while (ctx.fmtcnt >= 0 &&
               PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
            ctx.fmtpos++;
            ctx.fmtcnt--;
        }
        if (ctx.fmtcnt < 0) {
            /* Reached the end of the format string: this is the last
               write, so overallocation is no longer useful. */
            ctx.fmtpos--;
            ctx.writer.overallocate = 0;
        }
        if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
                                            literal_start, ctx.fmtpos) < 0) {
            goto onError;
        }
    }

    if (ctx.argidx < ctx.arglen && !ctx.dict) {
        PyErr_SetString(PyExc_TypeError,
                        "not all arguments converted during string formatting");
        goto onError;
    }

    if (ctx.args_owned) {
        Py_DECREF(ctx.args);
    }
    return _PyUnicodeWriter_Finish(&ctx.writer);

  onError:
    _PyUnicodeWriter_Dealloc(&ctx.writer);
    if (ctx.args_owned) {
        Py_DECREF(ctx.args);
    }
    return NULL;
}
15404
15405
static PyObject *
15406
unicode_subtype_new(PyTypeObject *type, PyObject *unicode);
15407
15408
/*[clinic input]
15409
@classmethod
15410
str.__new__ as unicode_new
15411
15412
    object as x: object = NULL
15413
    encoding: str = NULL
15414
    errors: str = NULL
15415
15416
[clinic start generated code]*/
15417
15418
static PyObject *
15419
unicode_new_impl(PyTypeObject *type, PyObject *x, const char *encoding,
15420
                 const char *errors)
15421
/*[clinic end generated code: output=fc72d4878b0b57e9 input=e81255e5676d174e]*/
15422
49
{
15423
49
    PyObject *unicode;
15424
49
    if (x == NULL) {
15425
0
        unicode = unicode_get_empty();
15426
0
    }
15427
49
    else if (encoding == NULL && errors == NULL) {
15428
49
        unicode = PyObject_Str(x);
15429
49
    }
15430
0
    else {
15431
0
        unicode = PyUnicode_FromEncodedObject(x, encoding, errors);
15432
0
    }
15433
15434
49
    if (unicode != NULL && type != &PyUnicode_Type) {
15435
49
        Py_SETREF(unicode, unicode_subtype_new(type, unicode));
15436
49
    }
15437
49
    return unicode;
15438
49
}
15439
15440
static const char *
15441
arg_as_utf8(PyObject *obj, const char *name)
15442
91.2k
{
15443
91.2k
    if (!PyUnicode_Check(obj)) {
15444
0
        PyErr_Format(PyExc_TypeError,
15445
0
                     "str() argument '%s' must be str, not %T",
15446
0
                     name, obj);
15447
0
        return NULL;
15448
0
    }
15449
91.2k
    return _PyUnicode_AsUTF8NoNUL(obj);
15450
91.2k
}
15451
15452
/* Vectorcall implementation of str(): str(), str(object),
   str(object, encoding) and str(object, encoding, errors).
   Calls with keyword arguments are forwarded to unicode_new().
   Return a new reference, or NULL with an exception set. */
static PyObject *
unicode_vectorcall(PyObject *type, PyObject *const *args,
                   size_t nargsf, PyObject *kwnames)
{
    assert(Py_Is(_PyType_CAST(type), &PyUnicode_Type));

    Py_ssize_t nargs = PyVectorcall_NARGS(nargsf);

    if (kwnames != NULL && PyTuple_GET_SIZE(kwnames) != 0) {
        // Fallback to unicode_new(): pack the positional arguments into a
        // tuple and the keyword arguments into a dict.
        PyObject *result = NULL;
        PyObject *tuple = _PyTuple_FromArray(args, nargs);
        if (tuple == NULL) {
            return NULL;
        }
        PyObject *dict = _PyStack_AsDict(args + nargs, kwnames);
        if (dict != NULL) {
            result = unicode_new(_PyType_CAST(type), tuple, dict);
        }
        Py_DECREF(tuple);
        Py_XDECREF(dict);
        return result;
    }

    if (!_PyArg_CheckPositional("str", nargs, 0, 3)) {
        return NULL;
    }

    switch (nargs) {
    case 0:
        return unicode_get_empty();
    case 1:
        return PyObject_Str(args[0]);
    default:
        break;
    }

    // Two or three arguments: decode the object.
    const char *encoding = arg_as_utf8(args[1], "encoding");
    if (encoding == NULL) {
        return NULL;
    }

    const char *errors = NULL;
    if (nargs == 3) {
        errors = arg_as_utf8(args[2], "errors");
        if (errors == NULL) {
            return NULL;
        }
    }
    return PyUnicode_FromEncodedObject(args[0], encoding, errors);
}
15498
15499
static PyObject *
15500
unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
15501
49
{
15502
49
    PyObject *self;
15503
49
    Py_ssize_t length, char_size;
15504
49
    int share_utf8;
15505
49
    int kind;
15506
49
    void *data;
15507
15508
49
    assert(PyType_IsSubtype(type, &PyUnicode_Type));
15509
49
    assert(_PyUnicode_CHECK(unicode));
15510
15511
49
    self = type->tp_alloc(type, 0);
15512
49
    if (self == NULL) {
15513
0
        return NULL;
15514
0
    }
15515
49
    kind = PyUnicode_KIND(unicode);
15516
49
    length = PyUnicode_GET_LENGTH(unicode);
15517
15518
49
    _PyUnicode_LENGTH(self) = length;
15519
#ifdef Py_DEBUG
15520
    _PyUnicode_HASH(self) = -1;
15521
#else
15522
49
    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15523
0
#endif
15524
49
    _PyUnicode_STATE(self).interned = 0;
15525
49
    _PyUnicode_STATE(self).kind = kind;
15526
49
    _PyUnicode_STATE(self).compact = 0;
15527
49
    _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
15528
49
    _PyUnicode_STATE(self).statically_allocated = 0;
15529
0
    PyUnicode_SET_UTF8_LENGTH(self, 0);
15530
49
    PyUnicode_SET_UTF8(self, NULL);
15531
49
    _PyUnicode_DATA_ANY(self) = NULL;
15532
15533
0
    share_utf8 = 0;
15534
49
    if (kind == PyUnicode_1BYTE_KIND) {
15535
49
        char_size = 1;
15536
49
        if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15537
49
            share_utf8 = 1;
15538
49
    }
15539
0
    else if (kind == PyUnicode_2BYTE_KIND) {
15540
0
        char_size = 2;
15541
0
    }
15542
0
    else {
15543
0
        assert(kind == PyUnicode_4BYTE_KIND);
15544
0
        char_size = 4;
15545
0
    }
15546
15547
    /* Ensure we won't overflow the length. */
15548
49
    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15549
0
        PyErr_NoMemory();
15550
0
        goto onError;
15551
0
    }
15552
49
    data = PyMem_Malloc((length + 1) * char_size);
15553
49
    if (data == NULL) {
15554
0
        PyErr_NoMemory();
15555
0
        goto onError;
15556
0
    }
15557
15558
98
    _PyUnicode_DATA_ANY(self) = data;
15559
49
    if (share_utf8) {
15560
49
        PyUnicode_SET_UTF8_LENGTH(self, length);
15561
49
        PyUnicode_SET_UTF8(self, data);
15562
49
    }
15563
15564
98
    memcpy(data, PyUnicode_DATA(unicode), kind * (length + 1));
15565
98
    assert(_PyUnicode_CheckConsistency(self, 1));
15566
#ifdef Py_DEBUG
15567
    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15568
#endif
15569
49
    return self;
15570
15571
0
onError:
15572
0
    Py_DECREF(self);
15573
0
    return NULL;
15574
98
}
15575
15576
void
15577
_PyUnicode_ExactDealloc(PyObject *op)
15578
1.84M
{
15579
1.84M
    assert(PyUnicode_CheckExact(op));
15580
1.84M
    unicode_dealloc(op);
15581
1.84M
}
15582
15583
PyDoc_STRVAR(unicode_doc,
15584
"str(object='') -> str\n\
15585
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
15586
\n\
15587
Create a new string object from the given object. If encoding or\n\
15588
errors is specified, then the object must expose a data buffer\n\
15589
that will be decoded using the given encoding and error handler.\n\
15590
Otherwise, returns the result of object.__str__() (if defined)\n\
15591
or repr(object).\n\
15592
encoding defaults to 'utf-8'.\n\
15593
errors defaults to 'strict'.");
15594
15595
static PyObject *unicode_iter(PyObject *seq);
15596
15597
PyTypeObject PyUnicode_Type = {
15598
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
15599
    "str",                        /* tp_name */
15600
    sizeof(PyUnicodeObject),      /* tp_basicsize */
15601
    0,                            /* tp_itemsize */
15602
    /* Slots */
15603
    unicode_dealloc,              /* tp_dealloc */
15604
    0,                            /* tp_vectorcall_offset */
15605
    0,                            /* tp_getattr */
15606
    0,                            /* tp_setattr */
15607
    0,                            /* tp_as_async */
15608
    unicode_repr,                 /* tp_repr */
15609
    &unicode_as_number,           /* tp_as_number */
15610
    &unicode_as_sequence,         /* tp_as_sequence */
15611
    &unicode_as_mapping,          /* tp_as_mapping */
15612
    unicode_hash,                 /* tp_hash*/
15613
    0,                            /* tp_call*/
15614
    unicode_str,                  /* tp_str */
15615
    PyObject_GenericGetAttr,      /* tp_getattro */
15616
    0,                            /* tp_setattro */
15617
    0,                            /* tp_as_buffer */
15618
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
15619
        Py_TPFLAGS_UNICODE_SUBCLASS |
15620
        _Py_TPFLAGS_MATCH_SELF, /* tp_flags */
15621
    unicode_doc,                  /* tp_doc */
15622
    0,                            /* tp_traverse */
15623
    0,                            /* tp_clear */
15624
    PyUnicode_RichCompare,        /* tp_richcompare */
15625
    0,                            /* tp_weaklistoffset */
15626
    unicode_iter,                 /* tp_iter */
15627
    0,                            /* tp_iternext */
15628
    unicode_methods,              /* tp_methods */
15629
    0,                            /* tp_members */
15630
    0,                            /* tp_getset */
15631
    0,                            /* tp_base */
15632
    0,                            /* tp_dict */
15633
    0,                            /* tp_descr_get */
15634
    0,                            /* tp_descr_set */
15635
    0,                            /* tp_dictoffset */
15636
    0,                            /* tp_init */
15637
    0,                            /* tp_alloc */
15638
    unicode_new,                  /* tp_new */
15639
    PyObject_Free,                /* tp_free */
15640
    .tp_vectorcall = unicode_vectorcall,
15641
};
15642
15643
/* Initialize the Unicode implementation */
15644
15645
static void
15646
_init_global_state(void)
15647
22
{
15648
22
    static int initialized = 0;
15649
22
    if (initialized) {
15650
0
        return;
15651
0
    }
15652
22
    initialized = 1;
15653
15654
    /* initialize the linebreak bloom filter */
15655
22
    const Py_UCS2 linebreak[] = {
15656
22
        0x000A, /* LINE FEED */
15657
22
        0x000D, /* CARRIAGE RETURN */
15658
22
        0x001C, /* FILE SEPARATOR */
15659
22
        0x001D, /* GROUP SEPARATOR */
15660
22
        0x001E, /* RECORD SEPARATOR */
15661
22
        0x0085, /* NEXT LINE */
15662
22
        0x2028, /* LINE SEPARATOR */
15663
22
        0x2029, /* PARAGRAPH SEPARATOR */
15664
22
    };
15665
22
    bloom_linebreak = make_bloom_mask(
15666
22
        PyUnicode_2BYTE_KIND, linebreak,
15667
22
        Py_ARRAY_LENGTH(linebreak));
15668
22
}
15669
15670
void
15671
_PyUnicode_InitState(PyInterpreterState *interp)
15672
22
{
15673
22
    if (!_Py_IsMainInterpreter(interp)) {
15674
0
        return;
15675
0
    }
15676
22
    _init_global_state();
15677
22
}
15678
15679
15680
PyStatus
15681
_PyUnicode_InitGlobalObjects(PyInterpreterState *interp)
15682
22
{
15683
22
    if (_Py_IsMainInterpreter(interp)) {
15684
22
        PyStatus status = init_global_interned_strings(interp);
15685
22
        if (_PyStatus_EXCEPTION(status)) {
15686
0
            return status;
15687
0
        }
15688
22
    }
15689
22
    assert(INTERNED_STRINGS);
15690
15691
22
    if (init_interned_dict(interp)) {
15692
0
        PyErr_Clear();
15693
0
        return _PyStatus_ERR("failed to create interned dict");
15694
0
    }
15695
15696
22
    return _PyStatus_OK();
15697
22
}
15698
15699
15700
PyStatus
15701
_PyUnicode_InitTypes(PyInterpreterState *interp)
15702
22
{
15703
22
    if (_PyStaticType_InitBuiltin(interp, &EncodingMapType) < 0) {
15704
0
        goto error;
15705
0
    }
15706
22
    if (_PyStaticType_InitBuiltin(interp, &PyFieldNameIter_Type) < 0) {
15707
0
        goto error;
15708
0
    }
15709
22
    if (_PyStaticType_InitBuiltin(interp, &PyFormatterIter_Type) < 0) {
15710
0
        goto error;
15711
0
    }
15712
22
    return _PyStatus_OK();
15713
15714
0
error:
15715
0
    return _PyStatus_ERR("Can't initialize unicode types");
15716
22
}
15717
15718
static /* non-null */ PyObject*
15719
intern_static(PyInterpreterState *interp, PyObject *s /* stolen */)
15720
23.6k
{
15721
    // Note that this steals a reference to `s`, but in many cases that
15722
    // stolen ref is returned, requiring no decref/incref.
15723
15724
23.6k
    assert(s != NULL);
15725
23.6k
    assert(_PyUnicode_CHECK(s));
15726
23.6k
    assert(_PyUnicode_STATE(s).statically_allocated);
15727
23.6k
    assert(!PyUnicode_CHECK_INTERNED(s));
15728
15729
#ifdef Py_DEBUG
15730
    /* We must not add process-global interned string if there's already a
15731
     * per-interpreter interned_dict, which might contain duplicates.
15732
     */
15733
    PyObject *interned = get_interned_dict(interp);
15734
    assert(interned == NULL);
15735
#endif
15736
15737
    /* Look in the global cache first. */
15738
23.6k
    PyObject *r = (PyObject *)_Py_hashtable_get(INTERNED_STRINGS, s);
15739
    /* We should only init each string once */
15740
23.6k
    assert(r == NULL);
15741
    /* but just in case (for the non-debug build), handle this */
15742
23.6k
    if (r != NULL && r != s) {
15743
0
        assert(_PyUnicode_STATE(r).interned == SSTATE_INTERNED_IMMORTAL_STATIC);
15744
0
        assert(_PyUnicode_CHECK(r));
15745
0
        Py_DECREF(s);
15746
0
        return Py_NewRef(r);
15747
0
    }
15748
15749
23.6k
    if (_Py_hashtable_set(INTERNED_STRINGS, s, s) < -1) {
15750
0
        Py_FatalError("failed to intern static string");
15751
0
    }
15752
15753
23.6k
    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_IMMORTAL_STATIC;
15754
0
    return s;
15755
23.6k
}
15756
15757
void
15758
_PyUnicode_InternStatic(PyInterpreterState *interp, PyObject **p)
15759
23.6k
{
15760
    // This should only be called as part of runtime initialization
15761
23.6k
    assert(!Py_IsInitialized());
15762
15763
23.6k
    *p = intern_static(interp, *p);
15764
23.6k
    assert(*p);
15765
23.6k
}
15766
15767
static void
15768
immortalize_interned(PyObject *s)
15769
106k
{
15770
106k
    assert(PyUnicode_CHECK_INTERNED(s) == SSTATE_INTERNED_MORTAL);
15771
106k
    assert(!_Py_IsImmortal(s));
15772
#ifdef Py_REF_DEBUG
15773
    /* The reference count value should be excluded from the RefTotal.
15774
       The decrements to these objects will not be registered so they
15775
       need to be accounted for in here. */
15776
    for (Py_ssize_t i = 0; i < Py_REFCNT(s); i++) {
15777
        _Py_DecRefTotal(_PyThreadState_GET());
15778
    }
15779
#endif
15780
213k
    FT_ATOMIC_STORE_UINT8_RELAXED(_PyUnicode_STATE(s).interned, SSTATE_INTERNED_IMMORTAL);
15781
0
    _Py_SetImmortal(s);
15782
213k
}
15783
15784
static /* non-null */ PyObject*
15785
intern_common(PyInterpreterState *interp, PyObject *s /* stolen */,
15786
              bool immortalize)
15787
6.17M
{
15788
    // Note that this steals a reference to `s`, but in many cases that
15789
    // stolen ref is returned, requiring no decref/incref.
15790
15791
#ifdef Py_DEBUG
15792
    assert(s != NULL);
15793
    assert(_PyUnicode_CHECK(s));
15794
#else
15795
6.17M
    if (s == NULL || !PyUnicode_Check(s)) {
15796
0
        return s;
15797
0
    }
15798
6.17M
#endif
15799
15800
    /* If it's a subclass, we don't really know what putting
15801
       it in the interned dict might do. */
15802
6.17M
    if (!PyUnicode_CheckExact(s)) {
15803
0
        return s;
15804
0
    }
15805
15806
    /* Is it already interned? */
15807
6.17M
    switch (PyUnicode_CHECK_INTERNED(s)) {
15808
3.22M
        case SSTATE_NOT_INTERNED:
15809
            // no, go on
15810
3.22M
            break;
15811
77.3k
        case SSTATE_INTERNED_MORTAL:
15812
            // yes but we might need to make it immortal
15813
77.3k
            if (immortalize) {
15814
106
                immortalize_interned(s);
15815
106
            }
15816
77.3k
            return s;
15817
2.87M
        default:
15818
            // all done
15819
2.87M
            return s;
15820
6.17M
    }
15821
15822
    /* Statically allocated strings must be already interned. */
15823
6.17M
    assert(!_PyUnicode_STATE(s).statically_allocated);
15824
15825
#if Py_GIL_DISABLED
15826
    /* In the free-threaded build, all interned strings are immortal */
15827
    immortalize = 1;
15828
#endif
15829
15830
    /* If it's already immortal, intern it as such */
15831
3.22M
    if (_Py_IsImmortal(s)) {
15832
0
        immortalize = 1;
15833
0
    }
15834
15835
    /* if it's a short string, get the singleton */
15836
3.22M
    if (PyUnicode_GET_LENGTH(s) == 1 &&
15837
25.7k
                PyUnicode_KIND(s) == PyUnicode_1BYTE_KIND) {
15838
59
        PyObject *r = LATIN1(*(unsigned char*)PyUnicode_DATA(s));
15839
59
        assert(PyUnicode_CHECK_INTERNED(r));
15840
59
        Py_DECREF(s);
15841
59
        return r;
15842
59
    }
15843
#ifdef Py_DEBUG
15844
    assert(!unicode_is_singleton(s));
15845
#endif
15846
15847
    /* Look in the global cache now. */
15848
3.22M
    {
15849
3.22M
        PyObject *r = (PyObject *)_Py_hashtable_get(INTERNED_STRINGS, s);
15850
3.22M
        if (r != NULL) {
15851
170k
            assert(_PyUnicode_STATE(r).statically_allocated);
15852
170k
            assert(r != s);  // r must be statically_allocated; s is not
15853
170k
            Py_DECREF(s);
15854
170k
            return Py_NewRef(r);
15855
170k
        }
15856
3.22M
    }
15857
15858
    /* Do a setdefault on the per-interpreter cache. */
15859
3.05M
    PyObject *interned = get_interned_dict(interp);
15860
3.05M
    assert(interned != NULL);
15861
#ifdef Py_GIL_DISABLED
15862
#  define INTERN_MUTEX &_Py_INTERP_CACHED_OBJECT(interp, interned_mutex)
15863
#endif
15864
3.05M
    FT_MUTEX_LOCK(INTERN_MUTEX);
15865
3.05M
    PyObject *t;
15866
3.05M
    {
15867
3.05M
        int res = PyDict_SetDefaultRef(interned, s, s, &t);
15868
3.05M
        if (res < 0) {
15869
0
            PyErr_Clear();
15870
0
            FT_MUTEX_UNLOCK(INTERN_MUTEX);
15871
0
            return s;
15872
0
        }
15873
3.05M
        else if (res == 1) {
15874
            // value was already present (not inserted)
15875
2.69M
            Py_DECREF(s);
15876
2.69M
            if (immortalize &&
15877
2.32M
                    PyUnicode_CHECK_INTERNED(t) == SSTATE_INTERNED_MORTAL) {
15878
3.68k
                immortalize_interned(t);
15879
3.68k
            }
15880
2.69M
            FT_MUTEX_UNLOCK(INTERN_MUTEX);
15881
2.69M
            return t;
15882
2.69M
        }
15883
358k
        else {
15884
            // value was newly inserted
15885
358k
            assert (s == t);
15886
358k
            Py_DECREF(t);
15887
358k
        }
15888
3.05M
    }
15889
15890
    /* NOT_INTERNED -> INTERNED_MORTAL */
15891
15892
3.05M
    assert(_PyUnicode_STATE(s).interned == SSTATE_NOT_INTERNED);
15893
15894
358k
    if (!_Py_IsImmortal(s)) {
15895
        /* The two references in interned dict (key and value) are not counted.
15896
        unicode_dealloc() and _PyUnicode_ClearInterned() take care of this. */
15897
358k
        Py_DECREF(s);
15898
358k
        Py_DECREF(s);
15899
358k
    }
15900
358k
    FT_ATOMIC_STORE_UINT8_RELAXED(_PyUnicode_STATE(s).interned, SSTATE_INTERNED_MORTAL);
15901
15902
    /* INTERNED_MORTAL -> INTERNED_IMMORTAL (if needed) */
15903
15904
#ifdef Py_DEBUG
15905
    if (_Py_IsImmortal(s)) {
15906
        assert(immortalize);
15907
    }
15908
#endif
15909
358k
    if (immortalize) {
15910
103k
        immortalize_interned(s);
15911
103k
    }
15912
15913
358k
    FT_MUTEX_UNLOCK(INTERN_MUTEX);
15914
358k
    return s;
15915
358k
}
15916
15917
void
15918
_PyUnicode_InternImmortal(PyInterpreterState *interp, PyObject **p)
15919
4.89M
{
15920
4.89M
    *p = intern_common(interp, *p, 1);
15921
4.89M
    assert(*p);
15922
4.89M
}
15923
15924
void
15925
_PyUnicode_InternMortal(PyInterpreterState *interp, PyObject **p)
15926
1.28M
{
15927
1.28M
    *p = intern_common(interp, *p, 0);
15928
1.28M
    assert(*p);
15929
1.28M
}
15930
15931
15932
void
15933
_PyUnicode_InternInPlace(PyInterpreterState *interp, PyObject **p)
15934
0
{
15935
0
    _PyUnicode_InternImmortal(interp, p);
15936
0
    return;
15937
0
}
15938
15939
void
15940
PyUnicode_InternInPlace(PyObject **p)
15941
0
{
15942
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
15943
0
    _PyUnicode_InternMortal(interp, p);
15944
0
}
15945
15946
// Public-looking name kept for the stable ABI; user should not call this:
15947
PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
15948
void
15949
PyUnicode_InternImmortal(PyObject **p)
15950
0
{
15951
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
15952
0
    _PyUnicode_InternImmortal(interp, p);
15953
0
}
15954
15955
PyObject *
15956
PyUnicode_InternFromString(const char *cp)
15957
596k
{
15958
596k
    PyObject *s = PyUnicode_FromString(cp);
15959
596k
    if (s == NULL) {
15960
0
        return NULL;
15961
0
    }
15962
596k
    PyInterpreterState *interp = _PyInterpreterState_GET();
15963
596k
    _PyUnicode_InternMortal(interp, &s);
15964
596k
    return s;
15965
596k
}
15966
15967
15968
void
15969
_PyUnicode_ClearInterned(PyInterpreterState *interp)
15970
0
{
15971
0
    PyObject *interned = get_interned_dict(interp);
15972
0
    if (interned == NULL) {
15973
0
        return;
15974
0
    }
15975
0
    assert(PyDict_CheckExact(interned));
15976
15977
0
    if (has_shared_intern_dict(interp)) {
15978
        // the dict doesn't belong to this interpreter, skip the debug
15979
        // checks on it and just clear the pointer to it
15980
0
        clear_interned_dict(interp);
15981
0
        return;
15982
0
    }
15983
15984
#ifdef INTERNED_STATS
15985
    fprintf(stderr, "releasing %zd interned strings\n",
15986
            PyDict_GET_SIZE(interned));
15987
15988
    Py_ssize_t total_length = 0;
15989
#endif
15990
0
    Py_ssize_t pos = 0;
15991
0
    PyObject *s, *ignored_value;
15992
0
    while (PyDict_Next(interned, &pos, &s, &ignored_value)) {
15993
0
        int shared = 0;
15994
0
        switch (PyUnicode_CHECK_INTERNED(s)) {
15995
0
        case SSTATE_INTERNED_IMMORTAL:
15996
            /* Make immortal interned strings mortal again. */
15997
            // Skip the Immortal Instance check and restore
15998
            // the two references (key and value) ignored
15999
            // by PyUnicode_InternInPlace().
16000
0
            _Py_SetMortal(s, 2);
16001
#ifdef Py_REF_DEBUG
16002
            /* let's be pedantic with the ref total */
16003
            _Py_IncRefTotal(_PyThreadState_GET());
16004
            _Py_IncRefTotal(_PyThreadState_GET());
16005
#endif
16006
#ifdef INTERNED_STATS
16007
            total_length += PyUnicode_GET_LENGTH(s);
16008
#endif
16009
0
            break;
16010
0
        case SSTATE_INTERNED_IMMORTAL_STATIC:
16011
            /* It is shared between interpreters, so we should unmark it
16012
               only when this is the last interpreter in which it's
16013
               interned.  We immortalize all the statically initialized
16014
               strings during startup, so we can rely on the
16015
               main interpreter to be the last one. */
16016
0
            if (!_Py_IsMainInterpreter(interp)) {
16017
0
                shared = 1;
16018
0
            }
16019
0
            break;
16020
0
        case SSTATE_INTERNED_MORTAL:
16021
            // Restore 2 references held by the interned dict; these will
16022
            // be decref'd by clear_interned_dict's PyDict_Clear.
16023
0
            _Py_RefcntAdd(s, 2);
16024
#ifdef Py_REF_DEBUG
16025
            /* let's be pedantic with the ref total */
16026
            _Py_IncRefTotal(_PyThreadState_GET());
16027
            _Py_IncRefTotal(_PyThreadState_GET());
16028
#endif
16029
0
            break;
16030
0
        case SSTATE_NOT_INTERNED:
16031
0
            _Py_FALLTHROUGH;
16032
0
        default:
16033
0
            Py_UNREACHABLE();
16034
0
        }
16035
0
        if (!shared) {
16036
0
            FT_ATOMIC_STORE_UINT8_RELAXED(_PyUnicode_STATE(s).interned, SSTATE_NOT_INTERNED);
16037
0
        }
16038
0
    }
16039
#ifdef INTERNED_STATS
16040
    fprintf(stderr,
16041
            "total length of all interned strings: %zd characters\n",
16042
            total_length);
16043
#endif
16044
16045
0
    struct _Py_unicode_state *state = &interp->unicode;
16046
0
    struct _Py_unicode_ids *ids = &state->ids;
16047
0
    for (Py_ssize_t i=0; i < ids->size; i++) {
16048
0
        Py_XINCREF(ids->array[i]);
16049
0
    }
16050
0
    clear_interned_dict(interp);
16051
0
    if (_Py_IsMainInterpreter(interp)) {
16052
0
        clear_global_interned_strings();
16053
0
    }
16054
0
}
16055
16056
16057
/********************* Unicode Iterator **************************/
16058
16059
typedef struct {
16060
    PyObject_HEAD
16061
    Py_ssize_t it_index;
16062
    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
16063
} unicodeiterobject;
16064
16065
static void
16066
unicodeiter_dealloc(PyObject *op)
16067
799k
{
16068
799k
    unicodeiterobject *it = (unicodeiterobject *)op;
16069
799k
    _PyObject_GC_UNTRACK(it);
16070
799k
    Py_XDECREF(it->it_seq);
16071
799k
    PyObject_GC_Del(it);
16072
799k
}
16073
16074
static int
16075
unicodeiter_traverse(PyObject *op, visitproc visit, void *arg)
16076
0
{
16077
0
    unicodeiterobject *it = (unicodeiterobject *)op;
16078
0
    Py_VISIT(it->it_seq);
16079
0
    return 0;
16080
0
}
16081
16082
/* tp_iternext of the generic str iterator.

   Returns the next character via unicode_char(), or NULL without setting an
   exception once the end is reached.  On reaching the end, the reference to
   the string is released and it_seq is set to NULL. */
static PyObject *
unicodeiter_next(PyObject *op)
{
    unicodeiterobject *it = (unicodeiterobject *)op;
    assert(it != NULL);

    PyObject *seq = it->it_seq;
    if (seq == NULL) {
        return NULL;
    }
    assert(_PyUnicode_CHECK(seq));

    if (it->it_index >= PyUnicode_GET_LENGTH(seq)) {
        /* exhausted: drop the string */
        it->it_seq = NULL;
        Py_DECREF(seq);
        return NULL;
    }

    int kind = PyUnicode_KIND(seq);
    const void *data = PyUnicode_DATA(seq);
    Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
    it->it_index++;
    return unicode_char(chr);
}
16106
16107
/* tp_iternext of the compact-ASCII str iterator.

   Reads the next byte from the data stored right after the
   PyASCIIObject header and returns the matching entry of the static ASCII
   singleton table.  Returns NULL without setting an exception at the end,
   releasing the string and setting it_seq to NULL. */
static PyObject *
unicode_ascii_iter_next(PyObject *op)
{
    unicodeiterobject *it = (unicodeiterobject *)op;
    assert(it != NULL);

    PyObject *seq = it->it_seq;
    if (seq == NULL) {
        return NULL;
    }
    assert(_PyUnicode_CHECK(seq));
    assert(PyUnicode_IS_COMPACT_ASCII(seq));

    if (it->it_index >= PyUnicode_GET_LENGTH(seq)) {
        /* exhausted: drop the string */
        it->it_seq = NULL;
        Py_DECREF(seq);
        return NULL;
    }

    const void *data = ((void*)(_PyASCIIObject_CAST(seq) + 1));
    Py_UCS1 chr = (Py_UCS1)PyUnicode_READ(PyUnicode_1BYTE_KIND,
                                          data, it->it_index);
    it->it_index++;
    return (PyObject*)&_Py_SINGLETON(strings).ascii[chr];
}
16129
16130
/* __length_hint__: number of characters left to iterate, 0 once the
   iterator is exhausted. */
static PyObject *
unicodeiter_len(PyObject *op, PyObject *Py_UNUSED(ignored))
{
    unicodeiterobject *it = (unicodeiterobject *)op;

    if (it->it_seq == NULL) {
        return PyLong_FromSsize_t(0);
    }
    Py_ssize_t remaining = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
    return PyLong_FromSsize_t(remaining);
}
16139
16140
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
16141
16142
/* __reduce__: returns (iter, (seq,), index) for a live iterator, or
   (iter, ('',)) for an exhausted one, where `iter` is the builtin. */
static PyObject *
unicodeiter_reduce(PyObject *op, PyObject *Py_UNUSED(ignored))
{
    unicodeiterobject *it = (unicodeiterobject *)op;

    /* _PyEval_GetBuiltin can invoke arbitrary code, so this call must come
     * before any access to the iterator's fields.
     * see issue #101765 */
    PyObject *iter = _PyEval_GetBuiltin(&_Py_ID(iter));

    if (it->it_seq != NULL) {
        return Py_BuildValue("N(O)n", iter, it->it_seq, it->it_index);
    }

    PyObject *empty = unicode_get_empty();
    if (empty == NULL) {
        Py_XDECREF(iter);
        return NULL;
    }
    return Py_BuildValue("N(N)", iter, empty);
}
16163
16164
PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
16165
16166
/* __setstate__: set the iterator position from the integer `state`,
   clamped to [0, len(seq)].  An exhausted iterator is left unchanged.
   Returns None, or NULL if `state` cannot be converted to Py_ssize_t. */
static PyObject *
unicodeiter_setstate(PyObject *op, PyObject *state)
{
    unicodeiterobject *it = (unicodeiterobject *)op;

    Py_ssize_t index = PyLong_AsSsize_t(state);
    if (index == -1 && PyErr_Occurred()) {
        return NULL;
    }

    PyObject *seq = it->it_seq;
    if (seq != NULL) {
        Py_ssize_t length = PyUnicode_GET_LENGTH(seq);
        if (index < 0) {
            index = 0;
        }
        else if (index > length) {
            index = length; /* iterator truncated */
        }
        it->it_index = index;
    }
    Py_RETURN_NONE;
}
16182
16183
PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
16184
16185
static PyMethodDef unicodeiter_methods[] = {
16186
    {"__length_hint__", unicodeiter_len, METH_NOARGS, length_hint_doc},
16187
    {"__reduce__",      unicodeiter_reduce, METH_NOARGS, reduce_doc},
16188
    {"__setstate__",    unicodeiter_setstate, METH_O, setstate_doc},
16189
    {NULL,      NULL}       /* sentinel */
16190
};
16191
16192
PyTypeObject PyUnicodeIter_Type = {
16193
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
16194
    "str_iterator",         /* tp_name */
16195
    sizeof(unicodeiterobject),      /* tp_basicsize */
16196
    0,                  /* tp_itemsize */
16197
    /* methods */
16198
    unicodeiter_dealloc,/* tp_dealloc */
16199
    0,                  /* tp_vectorcall_offset */
16200
    0,                  /* tp_getattr */
16201
    0,                  /* tp_setattr */
16202
    0,                  /* tp_as_async */
16203
    0,                  /* tp_repr */
16204
    0,                  /* tp_as_number */
16205
    0,                  /* tp_as_sequence */
16206
    0,                  /* tp_as_mapping */
16207
    0,                  /* tp_hash */
16208
    0,                  /* tp_call */
16209
    0,                  /* tp_str */
16210
    PyObject_GenericGetAttr,        /* tp_getattro */
16211
    0,                  /* tp_setattro */
16212
    0,                  /* tp_as_buffer */
16213
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
16214
    0,                  /* tp_doc */
16215
    unicodeiter_traverse, /* tp_traverse */
16216
    0,                  /* tp_clear */
16217
    0,                  /* tp_richcompare */
16218
    0,                  /* tp_weaklistoffset */
16219
    PyObject_SelfIter,          /* tp_iter */
16220
    unicodeiter_next,   /* tp_iternext */
16221
    unicodeiter_methods,            /* tp_methods */
16222
    0,
16223
};
16224
16225
PyTypeObject _PyUnicodeASCIIIter_Type = {
16226
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
16227
    .tp_name = "str_ascii_iterator",
16228
    .tp_basicsize = sizeof(unicodeiterobject),
16229
    .tp_dealloc = unicodeiter_dealloc,
16230
    .tp_getattro = PyObject_GenericGetAttr,
16231
    .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,
16232
    .tp_traverse = unicodeiter_traverse,
16233
    .tp_iter = PyObject_SelfIter,
16234
    .tp_iternext = unicode_ascii_iter_next,
16235
    .tp_methods = unicodeiter_methods,
16236
};
16237
16238
/* tp_iter of str: create a new iterator over `seq`, positioned at index 0.
   Compact ASCII strings get the ASCII iterator type, all others the generic
   one.  Returns NULL with an exception set if `seq` is not a str or the
   allocation fails. */
static PyObject *
unicode_iter(PyObject *seq)
{
    if (!PyUnicode_Check(seq)) {
        PyErr_BadInternalCall();
        return NULL;
    }

    PyTypeObject *iter_type = PyUnicode_IS_COMPACT_ASCII(seq)
                                  ? &_PyUnicodeASCIIIter_Type
                                  : &PyUnicodeIter_Type;
    unicodeiterobject *it = PyObject_GC_New(unicodeiterobject, iter_type);
    if (it == NULL) {
        return NULL;
    }

    it->it_index = 0;
    it->it_seq = Py_NewRef(seq);
    _PyObject_GC_TRACK(it);
    return (PyObject *)it;
}
16260
16261
static int
16262
encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
16263
88
{
16264
88
    int res;
16265
88
    res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
16266
88
    if (res == -2) {
16267
0
        PyErr_Format(PyExc_RuntimeError, "cannot encode %s", name);
16268
0
        return -1;
16269
0
    }
16270
88
    if (res < 0) {
16271
0
        PyErr_NoMemory();
16272
0
        return -1;
16273
0
    }
16274
88
    return 0;
16275
88
}
16276
16277
16278
static int
16279
config_get_codec_name(wchar_t **config_encoding)
16280
44
{
16281
44
    char *encoding;
16282
44
    if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
16283
0
        return -1;
16284
0
    }
16285
16286
44
    PyObject *name_obj = NULL;
16287
44
    PyObject *codec = _PyCodec_Lookup(encoding);
16288
44
    PyMem_RawFree(encoding);
16289
16290
44
    if (!codec)
16291
0
        goto error;
16292
16293
44
    name_obj = PyObject_GetAttrString(codec, "name");
16294
44
    Py_CLEAR(codec);
16295
44
    if (!name_obj) {
16296
0
        goto error;
16297
0
    }
16298
16299
44
    wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
16300
44
    Py_DECREF(name_obj);
16301
44
    if (wname == NULL) {
16302
0
        goto error;
16303
0
    }
16304
16305
44
    wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
16306
44
    if (raw_wname == NULL) {
16307
0
        PyMem_Free(wname);
16308
0
        PyErr_NoMemory();
16309
0
        goto error;
16310
0
    }
16311
16312
44
    PyMem_RawFree(*config_encoding);
16313
44
    *config_encoding = raw_wname;
16314
16315
44
    PyMem_Free(wname);
16316
44
    return 0;
16317
16318
0
error:
16319
0
    Py_XDECREF(codec);
16320
0
    Py_XDECREF(name_obj);
16321
0
    return -1;
16322
44
}
16323
16324
16325
static PyStatus
16326
init_stdio_encoding(PyInterpreterState *interp)
16327
22
{
16328
    /* Update the stdio encoding to the normalized Python codec name. */
16329
22
    PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
16330
22
    if (config_get_codec_name(&config->stdio_encoding) < 0) {
16331
0
        return _PyStatus_ERR("failed to get the Python codec name "
16332
0
                             "of the stdio encoding");
16333
0
    }
16334
22
    return _PyStatus_OK();
16335
22
}
16336
16337
16338
static int
16339
init_fs_codec(PyInterpreterState *interp)
16340
22
{
16341
22
    const PyConfig *config = _PyInterpreterState_GetConfig(interp);
16342
16343
22
    _Py_error_handler error_handler;
16344
22
    error_handler = get_error_handler_wide(config->filesystem_errors);
16345
22
    if (error_handler == _Py_ERROR_UNKNOWN) {
16346
0
        PyErr_SetString(PyExc_RuntimeError, "unknown filesystem error handler");
16347
0
        return -1;
16348
0
    }
16349
16350
22
    char *encoding, *errors;
16351
22
    if (encode_wstr_utf8(config->filesystem_encoding,
16352
22
                         &encoding,
16353
22
                         "filesystem_encoding") < 0) {
16354
0
        return -1;
16355
0
    }
16356
16357
22
    if (encode_wstr_utf8(config->filesystem_errors,
16358
22
                         &errors,
16359
22
                         "filesystem_errors") < 0) {
16360
0
        PyMem_RawFree(encoding);
16361
0
        return -1;
16362
0
    }
16363
16364
22
    struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
16365
22
    PyMem_RawFree(fs_codec->encoding);
16366
22
    fs_codec->encoding = encoding;
16367
    /* encoding has been normalized by init_fs_encoding() */
16368
22
    fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
16369
22
    PyMem_RawFree(fs_codec->errors);
16370
22
    fs_codec->errors = errors;
16371
22
    fs_codec->error_handler = error_handler;
16372
16373
#ifdef _Py_FORCE_UTF8_FS_ENCODING
16374
    assert(fs_codec->utf8 == 1);
16375
#endif
16376
16377
    /* At this point, PyUnicode_EncodeFSDefault() and
16378
       PyUnicode_DecodeFSDefault() can now use the Python codec rather than
16379
       the C implementation of the filesystem encoding. */
16380
16381
    /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
16382
       global configuration variables. */
16383
22
    if (_Py_IsMainInterpreter(interp)) {
16384
16385
22
        if (_Py_SetFileSystemEncoding(fs_codec->encoding,
16386
22
                                      fs_codec->errors) < 0) {
16387
0
            PyErr_NoMemory();
16388
0
            return -1;
16389
0
        }
16390
22
    }
16391
22
    return 0;
16392
22
}
16393
16394
16395
static PyStatus
16396
init_fs_encoding(PyThreadState *tstate)
16397
22
{
16398
22
    PyInterpreterState *interp = tstate->interp;
16399
16400
    /* Update the filesystem encoding to the normalized Python codec name.
16401
       For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
16402
       (Python codec name). */
16403
22
    PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
16404
22
    if (config_get_codec_name(&config->filesystem_encoding) < 0) {
16405
0
        _Py_DumpPathConfig(tstate);
16406
0
        return _PyStatus_ERR("failed to get the Python codec "
16407
0
                             "of the filesystem encoding");
16408
0
    }
16409
16410
22
    if (init_fs_codec(interp) < 0) {
16411
0
        return _PyStatus_ERR("cannot initialize filesystem codec");
16412
0
    }
16413
22
    return _PyStatus_OK();
16414
22
}
16415
16416
16417
PyStatus
16418
_PyUnicode_InitEncodings(PyThreadState *tstate)
16419
22
{
16420
22
    PyStatus status = _PyCodec_InitRegistry(tstate->interp);
16421
22
    if (_PyStatus_EXCEPTION(status)) {
16422
0
        return status;
16423
0
    }
16424
22
    status = init_fs_encoding(tstate);
16425
22
    if (_PyStatus_EXCEPTION(status)) {
16426
0
        return status;
16427
0
    }
16428
16429
22
    return init_stdio_encoding(tstate->interp);
16430
22
}
16431
16432
16433
/* Release the strings held by fs_codec and reset the structure.

   The encoding and errors strings are freed with PyMem_RawFree() and the
   pointers are set to NULL right after, so the struct holds no dangling
   pointers.  The utf8 flag is cleared and error_handler is set to
   _Py_ERROR_UNKNOWN. */
static void
16434
_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
16435
0
{
16436
0
    PyMem_RawFree(fs_codec->encoding);
16437
0
    fs_codec->encoding = NULL;
16438
0
    fs_codec->utf8 = 0;
16439
0
    PyMem_RawFree(fs_codec->errors);
16440
0
    fs_codec->errors = NULL;
16441
0
    fs_codec->error_handler = _Py_ERROR_UNKNOWN;
16442
0
}
16443
16444
16445
#ifdef MS_WINDOWS
16446
/* Switch the filesystem encoding of the current interpreter's config to
   "mbcs" with the "replace" error handler (PEP 529), then re-run
   init_fs_codec() for that interpreter.

   Returns -1 after calling PyErr_NoMemory() if either string could not
   be duplicated; otherwise returns the result of init_fs_codec().
   Only compiled when MS_WINDOWS is defined. */
int
16447
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
16448
{
16449
    PyInterpreterState *interp = _PyInterpreterState_GET();
16450
    PyConfig *config = (PyConfig *)_PyInterpreterState_GetConfig(interp);
16451
16452
    /* Set the filesystem encoding to mbcs/replace (PEP 529) */
16453
    /* Both copies are made before config is touched, so an allocation
       failure leaves the existing configuration unchanged. */
    wchar_t *encoding = _PyMem_RawWcsdup(L"mbcs");
16454
    wchar_t *errors = _PyMem_RawWcsdup(L"replace");
16455
    if (encoding == NULL || errors == NULL) {
16456
        /* One of the two may be NULL here; both are passed to
           PyMem_RawFree(). */
        PyMem_RawFree(encoding);
16457
        PyMem_RawFree(errors);
16458
        PyErr_NoMemory();
16459
        return -1;
16460
    }
16461
16462
    /* Free the previous strings and store the new ones in config. */
    PyMem_RawFree(config->filesystem_encoding);
16463
    config->filesystem_encoding = encoding;
16464
    PyMem_RawFree(config->filesystem_errors);
16465
    config->filesystem_errors = errors;
16466
16467
    return init_fs_codec(interp);
16468
}
16469
#endif
16470
16471
16472
#ifdef Py_DEBUG
16473
/* Debug-build helper (only compiled when Py_DEBUG is defined).

   Returns non-zero when get_interned_dict() yields NULL for the main
   interpreter, and zero otherwise.
   NOTE(review): presumably a NULL interned dict is used as the signal
   that unicode state is being torn down -- confirm against callers. */
static inline int
16474
unicode_is_finalizing(void)
16475
{
16476
    return (get_interned_dict(_PyInterpreterState_Main()) == NULL);
16477
}
16478
#endif
16479
16480
16481
/* Finalize, for interp, the static types passed here to
   _PyStaticType_FiniBuiltin(): EncodingMapType, PyFieldNameIter_Type
   and PyFormatterIter_Type, in that order. */
void
16482
_PyUnicode_FiniTypes(PyInterpreterState *interp)
16483
0
{
16484
0
    _PyStaticType_FiniBuiltin(interp, &EncodingMapType);
16485
0
    _PyStaticType_FiniBuiltin(interp, &PyFieldNameIter_Type);
16486
0
    _PyStaticType_FiniBuiltin(interp, &PyFormatterIter_Type);
16487
0
}
16488
16489
16490
/* Tear down the unicode state stored in interp->unicode.

   In order:
     - unless has_shared_intern_dict(interp) is true, assert that the
       interpreter's interned dict has already been cleared;
     - release the filesystem codec strings (_PyUnicode_FiniEncodings);
     - reset the cached ucnhash_capi pointer to NULL;
     - clear identifiers via unicode_clear_identifiers(). */
void
16491
_PyUnicode_Fini(PyInterpreterState *interp)
16492
0
{
16493
0
    struct _Py_unicode_state *state = &interp->unicode;
16494
16495
0
    if (!has_shared_intern_dict(interp)) {
16496
        // _PyUnicode_ClearInterned() must be called before _PyUnicode_Fini()
16497
0
        assert(get_interned_dict(interp) == NULL);
16498
0
    }
16499
16500
0
    _PyUnicode_FiniEncodings(&state->fs_codec);
16501
16502
    // bpo-47182: force a unicodedata CAPI capsule re-import on
16503
    // subsequent initialization of interpreter.
16504
0
    interp->unicode.ucnhash_capi = NULL;
16505
16506
0
    unicode_clear_identifiers(state);
16507
0
}
16508
16509
/* A _string module, to export formatter_parser and formatter_field_name_split
16510
   to the string.Formatter class implemented in Python. */
16511
16512
/* Method table of the _string module.  Both entries use METH_O (the
   function receives a single object argument); the table ends with a
   NULL sentinel entry. */
static PyMethodDef _string_methods[] = {
16513
    {"formatter_field_name_split", formatter_field_name_split,
16514
     METH_O, PyDoc_STR("split the argument as a field name")},
16515
    {"formatter_parser", formatter_parser,
16516
     METH_O, PyDoc_STR("parse the argument as a format string")},
16517
    {NULL, NULL}
16518
};
16519
16520
/* Slot table referenced by _string_module.m_slots.  It sets
   Py_mod_multiple_interpreters to Py_MOD_PER_INTERPRETER_GIL_SUPPORTED
   and Py_mod_gil to Py_MOD_GIL_NOT_USED; the {0, NULL} entry terminates
   the table. */
static PyModuleDef_Slot module_slots[] = {
16521
    {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
16522
    {Py_mod_gil, Py_MOD_GIL_NOT_USED},
16523
    {0, NULL}
16524
};
16525
16526
/* Definition of the "_string" module: name, docstring, method table
   (_string_methods) and slot table (module_slots).  m_size is 0, i.e.
   no per-module state is requested. */
static struct PyModuleDef _string_module = {
16527
    PyModuleDef_HEAD_INIT,
16528
    .m_name = "_string",
16529
    .m_doc = PyDoc_STR("string helper module"),
16530
    .m_size = 0,
16531
    .m_methods = _string_methods,
16532
    .m_slots = module_slots,
16533
};
16534
16535
/* Init entry point of the _string module.  Returns the result of
   PyModuleDef_Init() applied to _string_module. */
PyMODINIT_FUNC
16536
PyInit__string(void)
16537
0
{
16538
0
    return PyModuleDef_Init(&_string_module);
16539
0
}
16540
16541
16542
/* Function form of PyUnicode_KIND.  The #undef removes any macro of the
   same name so that the definition below is not macro-expanded.

   Returns the state.kind field of op.  If op is not a str
   (PyUnicode_Check() fails), sets TypeError and returns -1. */
#undef PyUnicode_KIND
16543
int PyUnicode_KIND(PyObject *op)
16544
0
{
16545
0
    if (!PyUnicode_Check(op)) {
16546
0
        PyErr_Format(PyExc_TypeError, "expect str, got %T", op);
16547
0
        return -1;
16548
0
    }
16549
0
    return _PyASCIIObject_CAST(op)->state.kind;
16550
0
}
16551
16552
/* Function form of PyUnicode_DATA.  The #undef removes any macro of the
   same name so that the definition below is not macro-expanded.

   Returns the result of _PyUnicode_DATA(op).  If op is not a str
   (PyUnicode_Check() fails), sets TypeError and returns NULL. */
#undef PyUnicode_DATA
16553
void* PyUnicode_DATA(PyObject *op)
16554
0
{
16555
0
    if (!PyUnicode_Check(op)) {
16556
0
        PyErr_Format(PyExc_TypeError, "expect str, got %T", op);
16557
0
        return NULL;
16558
0
    }
16559
0
    return _PyUnicode_DATA(op);
16560
0
}