Coverage Report

Created: 2025-11-24 06:11

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/cpython-install/include/python3.15/cpython/unicodeobject.h
Line
Count
Source
1
#ifndef Py_CPYTHON_UNICODEOBJECT_H
2
#  error "this header file must not be included directly"
3
#endif
4
5
/* Py_UNICODE was the native Unicode storage format (code unit) used by
6
   Python and represents a single Unicode element in the Unicode type.
7
   With PEP 393, Py_UNICODE is deprecated and replaced with a
8
   typedef to wchar_t. */
9
Py_DEPRECATED(3.13) typedef wchar_t PY_UNICODE_TYPE;
10
Py_DEPRECATED(3.13) typedef wchar_t Py_UNICODE;
11
12
13
/* --- Internal Unicode Operations ---------------------------------------- */
14
15
// Static inline functions to work with surrogates
16
0
/* Return 1 when `ch` falls inside the surrogate range U+D800..U+DFFF
   (either half of a pair), 0 otherwise. */
static inline int Py_UNICODE_IS_SURROGATE(Py_UCS4 ch)
{
    return (ch >= 0xD800 && ch <= 0xDFFF);
}
19
0
/* Return 1 when `ch` is a high (leading) surrogate, i.e. lies in
   U+D800..U+DBFF, 0 otherwise. */
static inline int Py_UNICODE_IS_HIGH_SURROGATE(Py_UCS4 ch)
{
    return (ch >= 0xD800 && ch <= 0xDBFF);
}
22
0
/* Return 1 when `ch` is a low (trailing) surrogate, i.e. lies in
   U+DC00..U+DFFF, 0 otherwise. */
static inline int Py_UNICODE_IS_LOW_SURROGATE(Py_UCS4 ch)
{
    return (ch >= 0xDC00 && ch <= 0xDFFF);
}
25
26
// Join two surrogate characters and return a single Py_UCS4 value.
27
0
/* Combine a high/low surrogate pair into the code point it encodes.
   Both arguments are checked with assert() only; the low 10 bits of each
   surrogate are concatenated and offset by 0x10000. */
static inline Py_UCS4 Py_UNICODE_JOIN_SURROGATES(Py_UCS4 high, Py_UCS4 low)
{
    Py_UCS4 upper_bits;
    Py_UCS4 lower_bits;

    assert(Py_UNICODE_IS_HIGH_SURROGATE(high));
    assert(Py_UNICODE_IS_LOW_SURROGATE(low));

    upper_bits = (high & 0x03FF) << 10;   /* payload of the high half */
    lower_bits = (low & 0x03FF);          /* payload of the low half */
    return 0x10000 + (upper_bits | lower_bits);
}
32
33
// High surrogate = top 10 bits added to 0xD800.
34
// The character must be in the range [U+10000; U+10ffff].
35
0
/* Compute the high surrogate for a code point in [U+10000; U+10ffff].
   The range is only enforced through assert(). */
static inline Py_UCS4 Py_UNICODE_HIGH_SURROGATE(Py_UCS4 ch)
{
    assert(ch >= 0x10000 && ch <= 0x10ffff);
    /* Everything above the low 10 bits, rebased so that U+10000 maps
       to 0xD800. */
    return ((0xD800 - (0x10000 >> 10)) + (ch >> 10));
}
39
40
// Low surrogate = bottom 10 bits added to 0xDC00.
41
// The character must be in the range [U+10000; U+10ffff].
42
0
/* Compute the low surrogate for a code point in [U+10000; U+10ffff].
   The range is only enforced through assert(). */
static inline Py_UCS4 Py_UNICODE_LOW_SURROGATE(Py_UCS4 ch)
{
    Py_UCS4 bottom_bits;

    assert(ch >= 0x10000 && ch <= 0x10ffff);
    bottom_bits = (ch & 0x3FF);   /* low 10 bits of the code point */
    return (0xDC00 + bottom_bits);
}
46
47
48
/* --- Unicode Type ------------------------------------------------------- */
49
50
/* Per-string state flags.  An instance of this struct is embedded as the
   `state` member of PyASCIIObject. */
struct _PyUnicodeObject_state {
    /* If interned is non-zero, the two references from the
       dictionary to this object are *not* counted in ob_refcnt.
       The possible values here are (see the SSTATE_* constants):
           0: Not Interned
           1: Interned
           2: Interned and Immortal
           3: Interned, Immortal, and Static
       This categorization allows the runtime to determine the right
       cleanup mechanism at runtime shutdown. */
#ifdef Py_GIL_DISABLED
    // Needs to be accessed atomically, so can't be a bit field.
    unsigned char interned;
#else
    unsigned int interned:2;
#endif
    /* Character size:

       - PyUnicode_1BYTE_KIND (1):

         * character type = Py_UCS1 (8 bits, unsigned)
         * all characters are in the range U+0000-U+00FF (latin1)
         * if ascii is set, all characters are in the range U+0000-U+007F
           (ASCII), otherwise at least one character is in the range
           U+0080-U+00FF

       - PyUnicode_2BYTE_KIND (2):

         * character type = Py_UCS2 (16 bits, unsigned)
         * all characters are in the range U+0000-U+FFFF (BMP)
         * at least one character is in the range U+0100-U+FFFF

       - PyUnicode_4BYTE_KIND (4):

         * character type = Py_UCS4 (32 bits, unsigned)
         * all characters are in the range U+0000-U+10FFFF
         * at least one character is in the range U+10000-U+10FFFF
       */
    unsigned int kind:3;
    /* Compact is with respect to the allocation scheme. Compact unicode
       objects only require one memory block while non-compact objects use
       one block for the PyUnicodeObject struct and another for its data
       buffer. */
    unsigned int compact:1;
    /* The string only contains characters in the range U+0000-U+007F (ASCII)
       and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
       set, use the PyASCIIObject structure. */
    unsigned int ascii:1;
    /* The object is statically allocated. */
    unsigned int statically_allocated:1;
#ifndef Py_GIL_DISABLED
    /* Historical: padding to ensure that PyUnicode_DATA() is always aligned to
       4 bytes (see issue gh-63736 on m68k).  The named bit fields above add
       up to 8 bits (2 + 3 + 1 + 1 + 1); these 24 unnamed bits bring the
       total to 32. */
    unsigned int :24;
#endif
};
106
107
/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
108
   structure. state.ascii and state.compact are set, and the data
109
   immediately follow the structure. utf8_length can be found
110
   in the length field; the utf8 pointer is equal to the data pointer. */
111
typedef struct {
    /* There are 3 forms of Unicode strings:

       - compact ascii:

         * structure = PyASCIIObject
         * test: PyUnicode_IS_COMPACT_ASCII(op)
         * kind = PyUnicode_1BYTE_KIND
         * compact = 1
         * ascii = 1
         * (length is the length of the utf8)
         * (data starts just after the structure)
         * (since ASCII is decoded from UTF-8, the utf8 string is the data)

       - compact:

         * structure = PyCompactUnicodeObject
         * test: PyUnicode_IS_COMPACT(op) && !PyUnicode_IS_ASCII(op)
         * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
           PyUnicode_4BYTE_KIND
         * compact = 1
         * ascii = 0
         * utf8 is not shared with data
         * utf8_length = 0 if utf8 is NULL
         * (data starts just after the structure)

       - legacy string:

         * structure = PyUnicodeObject structure
         * test: !PyUnicode_IS_COMPACT(op)
         * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
           PyUnicode_4BYTE_KIND
         * compact = 0
         * data.any is not NULL
         * if ascii = 1, utf8 is shared with data.any and
           utf8_length = length
         * utf8_length = 0 if utf8 is NULL

       Compact strings use only one memory block (structure + characters),
       whereas legacy strings use one block for the structure and one block
       for characters.

       Legacy strings are created by subclasses of Unicode.

       See also _PyUnicode_CheckConsistency().
    */
    PyObject_HEAD
    Py_ssize_t length;          /* Number of code points in the string */
    Py_hash_t hash;             /* Hash value; -1 if not set */
    /* Ensure 4 byte alignment for PyUnicode_DATA(), see gh-63736 on m68k.
       For compact ASCII strings the character data starts right after
       this member, which is why its alignment matters. */
   _Py_ALIGNED_DEF(4, struct _PyUnicodeObject_state) state;
} PyASCIIObject;
162
163
/* Non-ASCII strings allocated through PyUnicode_New use the
164
   PyCompactUnicodeObject structure. state.compact is set, and the data
165
   immediately follow the structure. */
166
typedef struct {
    PyASCIIObject _base;        /* Shared header: length, hash and state */
    Py_ssize_t utf8_length;     /* Number of bytes in utf8, excluding the
                                 * terminating \0; 0 if utf8 is NULL. */
    char *utf8;                 /* UTF-8 representation (null-terminated) */
} PyCompactUnicodeObject;
172
173
/* Object format for Unicode subclasses. */
174
typedef struct {
    PyCompactUnicodeObject _base;   /* Header shared with compact strings */
    /* Pointer to the separately allocated character buffer used by
       non-compact (legacy) strings.  All members alias the same pointer;
       pick the one matching the string's kind. */
    union {
        void *any;
        Py_UCS1 *latin1;
        Py_UCS2 *ucs2;
        Py_UCS4 *ucs4;
    } data;                     /* Canonical, smallest-form Unicode buffer */
} PyUnicodeObject;
183
184
185
/* Cast helpers for the three unicode object layouts.  Each one asserts
   PyUnicode_Check(op) first (the check disappears when assert() is compiled
   out) and then yields `op` cast to the requested pointer type.  Note that
   `op` appears twice in each expansion. */

/* Cast `op` to PyASCIIObject*. */
#define _PyASCIIObject_CAST(op) \
    (assert(PyUnicode_Check(op)), \
     _Py_CAST(PyASCIIObject*, (op)))
/* Cast `op` to PyCompactUnicodeObject*. */
#define _PyCompactUnicodeObject_CAST(op) \
    (assert(PyUnicode_Check(op)), \
     _Py_CAST(PyCompactUnicodeObject*, (op)))
/* Cast `op` to PyUnicodeObject*. */
#define _PyUnicodeObject_CAST(op) \
    (assert(PyUnicode_Check(op)), \
     _Py_CAST(PyUnicodeObject*, (op)))
194
195
196
/* --- Flexible String Representation Helper Macros (PEP 393) -------------- */
197
198
/* Values for PyASCIIObject.state: */
199
200
/* Interning state. */
201
/* Values stored in the `interned` field of struct _PyUnicodeObject_state. */
#define SSTATE_NOT_INTERNED 0               /* Not interned */
#define SSTATE_INTERNED_MORTAL 1            /* Interned */
#define SSTATE_INTERNED_IMMORTAL 2          /* Interned and immortal */
#define SSTATE_INTERNED_IMMORTAL_STATIC 3   /* Interned, immortal, and static */
205
206
/* Use only if you know it's a string */
207
0
/* Return the interning state of `op` (one of the SSTATE_* values).
   The object is only type-checked through assert(). */
static inline unsigned int PyUnicode_CHECK_INTERNED(PyObject *op)
{
    PyASCIIObject *header = _PyASCIIObject_CAST(op);
#ifdef Py_GIL_DISABLED
    /* `interned` is a plain byte in this build and is read with a relaxed
       atomic load. */
    return _Py_atomic_load_uint8_relaxed(&header->state.interned);
#else
    return header->state.interned;
#endif
}
#define PyUnicode_CHECK_INTERNED(op) \
    PyUnicode_CHECK_INTERNED(_PyObject_CAST(op))
215
216
/* For backward compatibility. Soft-deprecated. */
217
0
/* Compatibility shim: the argument is ignored and the answer is always 1. */
static inline unsigned int PyUnicode_IS_READY(PyObject* Py_UNUSED(op))
{
    return 1;
}
#define PyUnicode_IS_READY(op) \
    PyUnicode_IS_READY(_PyObject_CAST(op))
221
222
/* Return true if the string contains only ASCII characters, or 0 if not. The
223
   string may be compact (PyUnicode_IS_COMPACT_ASCII) or not. */
224
0
/* Return the `ascii` state flag of `op`: non-zero for ASCII-only strings.
   The object is only type-checked through assert(). */
static inline unsigned int PyUnicode_IS_ASCII(PyObject *op)
{
    PyASCIIObject *header = _PyASCIIObject_CAST(op);
    return header->state.ascii;
}
#define PyUnicode_IS_ASCII(op) \
    PyUnicode_IS_ASCII(_PyObject_CAST(op))
228
229
/* Return true if the string is compact or 0 if not.
230
   No type checks are performed. */
231
0
/* Return the `compact` state flag of `op`: non-zero when the string uses
   the single-block (compact) layout. */
static inline unsigned int PyUnicode_IS_COMPACT(PyObject *op)
{
    PyASCIIObject *header = _PyASCIIObject_CAST(op);
    return header->state.compact;
}
#define PyUnicode_IS_COMPACT(op) \
    PyUnicode_IS_COMPACT(_PyObject_CAST(op))
235
236
/* Return true if the string is a compact ASCII string (use PyASCIIObject
237
   structure), or 0 if not.  No type checks are performed. */
238
0
/* Return 1 when both the `ascii` and `compact` flags of `op` are set,
   0 otherwise.  The ascii flag is tested first. */
static inline int PyUnicode_IS_COMPACT_ASCII(PyObject *op)
{
    if (!_PyASCIIObject_CAST(op)->state.ascii) {
        return 0;
    }
    return PyUnicode_IS_COMPACT(op) != 0;
}
#define PyUnicode_IS_COMPACT_ASCII(op) \
    PyUnicode_IS_COMPACT_ASCII(_PyObject_CAST(op))
242
243
/* Character storage widths.  Each constant's value equals the size in
   bytes of one stored character (Py_UCS1, Py_UCS2 or Py_UCS4). */
enum PyUnicode_Kind {
/* Return values of the PyUnicode_KIND() function: */
    PyUnicode_1BYTE_KIND = 1,
    PyUnicode_2BYTE_KIND = 2,
    PyUnicode_4BYTE_KIND = 4
};
249
250
PyAPI_FUNC(int) PyUnicode_KIND(PyObject *op);
251
252
// PyUnicode_KIND(): Return one of the PyUnicode_*_KIND values defined above.
253
//
254
// gh-89653: Converting this macro to a static inline function would introduce
255
// new compiler warnings on "kind < PyUnicode_KIND(str)" (compare signed and
256
// unsigned numbers) where kind type is an int or on
257
// "unsigned int kind = PyUnicode_KIND(str)" (cast signed to unsigned).
258
#define PyUnicode_KIND(op) _Py_RVALUE(_PyASCIIObject_CAST(op)->state.kind)
259
260
/* Return a void pointer to the raw unicode buffer. */
261
0
/* Return the character buffer of a compact string.  The buffer sits
   immediately after the object header, whose type (PyASCIIObject or
   PyCompactUnicodeObject) depends on the ascii flag. */
static inline void* _PyUnicode_COMPACT_DATA(PyObject *op)
{
    void *payload;

    if (PyUnicode_IS_ASCII(op)) {
        /* "+ 1" steps over exactly one PyASCIIObject header. */
        payload = _Py_STATIC_CAST(void*, (_PyASCIIObject_CAST(op) + 1));
    }
    else {
        /* "+ 1" steps over exactly one PyCompactUnicodeObject header. */
        payload = _Py_STATIC_CAST(void*, (_PyCompactUnicodeObject_CAST(op) + 1));
    }
    return payload;
}
267
268
0
/* Return the separately stored character buffer (data.any) of a
   non-compact string.  Both the "not compact" precondition and the
   non-NULL buffer are checked with assert() only. */
static inline void* _PyUnicode_NONCOMPACT_DATA(PyObject *op)
{
    PyUnicodeObject *legacy;
    void *buffer;

    assert(!PyUnicode_IS_COMPACT(op));
    legacy = _PyUnicodeObject_CAST(op);
    buffer = legacy->data.any;
    assert(buffer != NULL);
    return buffer;
}
275
276
PyAPI_FUNC(void*) PyUnicode_DATA(PyObject *op);
277
278
0
/* Return the character buffer of `op`, dispatching on the compact flag to
   the compact or non-compact accessor. */
static inline void* _PyUnicode_DATA(PyObject *op)
{
    return PyUnicode_IS_COMPACT(op)
        ? _PyUnicode_COMPACT_DATA(op)
        : _PyUnicode_NONCOMPACT_DATA(op);
}
#define PyUnicode_DATA(op) \
    _PyUnicode_DATA(_PyObject_CAST(op))
285
286
/* Return pointers to the canonical representation cast to unsigned char,
287
   Py_UCS2, or Py_UCS4 for direct character access.
288
   No checks are performed, use PyUnicode_KIND() before to ensure
289
   these will work correctly. */
290
291
#define PyUnicode_1BYTE_DATA(op) _Py_STATIC_CAST(Py_UCS1*, PyUnicode_DATA(op))
292
#define PyUnicode_2BYTE_DATA(op) _Py_STATIC_CAST(Py_UCS2*, PyUnicode_DATA(op))
293
#define PyUnicode_4BYTE_DATA(op) _Py_STATIC_CAST(Py_UCS4*, PyUnicode_DATA(op))
294
295
/* Returns the length of the unicode string. */
296
0
/* Return the `length` field of `op` (number of code points).  The object
   is only type-checked through assert(). */
static inline Py_ssize_t PyUnicode_GET_LENGTH(PyObject *op)
{
    PyASCIIObject *header = _PyASCIIObject_CAST(op);
    return header->length;
}
#define PyUnicode_GET_LENGTH(op) \
    PyUnicode_GET_LENGTH(_PyObject_CAST(op))
300
301
/* Returns the cached hash, or -1 if not cached yet. */
302
static inline Py_hash_t
303
0
PyUnstable_Unicode_GET_CACHED_HASH(PyObject *op) {
304
0
#ifdef Py_GIL_DISABLED
305
0
    return _Py_atomic_load_ssize_relaxed(&_PyASCIIObject_CAST(op)->hash);
306
0
#else
307
0
    return _PyASCIIObject_CAST(op)->hash;
308
0
#endif
309
0
}
310
311
/* Write into the canonical representation, this function does not do any sanity
312
   checks and is intended for usage in loops.  The caller should cache the
313
   kind and data pointers obtained from other function calls.
314
   index is the index in the string (starts at 0) and value is the new
315
   code point value which should be written to that location. */
316
static inline void PyUnicode_WRITE(int kind, void *data,
317
                                   Py_ssize_t index, Py_UCS4 value)
318
0
{
319
0
    assert(index >= 0);
320
0
    if (kind == PyUnicode_1BYTE_KIND) {
321
0
        assert(value <= 0xffU);
322
0
        _Py_STATIC_CAST(Py_UCS1*, data)[index] = _Py_STATIC_CAST(Py_UCS1, value);
323
0
    }
324
0
    else if (kind == PyUnicode_2BYTE_KIND) {
325
0
        assert(value <= 0xffffU);
326
0
        _Py_STATIC_CAST(Py_UCS2*, data)[index] = _Py_STATIC_CAST(Py_UCS2, value);
327
0
    }
328
0
    else {
329
0
        assert(kind == PyUnicode_4BYTE_KIND);
330
0
        assert(value <= 0x10ffffU);
331
0
        _Py_STATIC_CAST(Py_UCS4*, data)[index] = value;
332
0
    }
333
0
}
334
#define PyUnicode_WRITE(kind, data, index, value) \
335
    PyUnicode_WRITE(_Py_STATIC_CAST(int, kind), _Py_CAST(void*, data), \
336
                    (index), _Py_STATIC_CAST(Py_UCS4, value))
337
338
/* Read a code point from the string's canonical representation.  No checks
339
   are performed. */
340
/* Fetch the code point at position `index` of the buffer `data`, whose
   element width is selected by `kind`.  The index and the kind are
   validated with assert() only; any kind other than 1-byte or 2-byte is
   read as 4-byte. */
static inline Py_UCS4 PyUnicode_READ(int kind,
                                     const void *data, Py_ssize_t index)
{
    assert(index >= 0);

    switch (kind) {
    case PyUnicode_1BYTE_KIND:
        return _Py_STATIC_CAST(const Py_UCS1*, data)[index];
    case PyUnicode_2BYTE_KIND:
        return _Py_STATIC_CAST(const Py_UCS2*, data)[index];
    default:
        assert(kind == PyUnicode_4BYTE_KIND);
        return _Py_STATIC_CAST(const Py_UCS4*, data)[index];
    }
}
#define PyUnicode_READ(kind, data, index) \
    PyUnicode_READ(_Py_STATIC_CAST(int, kind), \
                   _Py_STATIC_CAST(const void*, data), \
                   (index))
357
358
/* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it
359
   looks up the kind and the data pointer on every call.  For single reads, use
360
   PyUnicode_READ_CHAR, for multiple consecutive reads callers should
361
   cache kind and use PyUnicode_READ instead. */
362
/* Return the code point stored at `index` in `unicode`.  The kind and the
   data pointer are looked up on each call.  Bounds are validated with
   assert() only. */
static inline Py_UCS4 PyUnicode_READ_CHAR(PyObject *unicode, Py_ssize_t index)
{
    int width;

    assert(index >= 0);
    // index == length is accepted: that slot holds the trailing NUL.
    assert(index <= PyUnicode_GET_LENGTH(unicode));

    width = PyUnicode_KIND(unicode);
    switch (width) {
    case PyUnicode_1BYTE_KIND:
        return PyUnicode_1BYTE_DATA(unicode)[index];
    case PyUnicode_2BYTE_KIND:
        return PyUnicode_2BYTE_DATA(unicode)[index];
    default:
        assert(width == PyUnicode_4BYTE_KIND);
        return PyUnicode_4BYTE_DATA(unicode)[index];
    }
}
#define PyUnicode_READ_CHAR(unicode, index) \
    PyUnicode_READ_CHAR(_PyObject_CAST(unicode), (index))
382
383
/* Return a maximum character value which is suitable for creating another
384
   string based on op.  This is always an approximation but more efficient
385
   than iterating over the string. */
386
/* Return an upper bound for the code points of `op`, derived from its
   flags rather than its contents: 0x7f for ASCII strings, otherwise the
   largest value representable by the string's kind (0xff, 0xffff or
   0x10ffff). */
static inline Py_UCS4 PyUnicode_MAX_CHAR_VALUE(PyObject *op)
{
    int width;

    if (PyUnicode_IS_ASCII(op)) {
        return 0x7fU;
    }

    width = PyUnicode_KIND(op);
    switch (width) {
    case PyUnicode_1BYTE_KIND:
        return 0xffU;
    case PyUnicode_2BYTE_KIND:
        return 0xffffU;
    default:
        assert(width == PyUnicode_4BYTE_KIND);
        return 0x10ffffU;
    }
}
#define PyUnicode_MAX_CHAR_VALUE(op) \
    PyUnicode_MAX_CHAR_VALUE(_PyObject_CAST(op))
406
407
408
/* === Public API ========================================================= */
409
410
/* With PEP 393, this is the recommended way to allocate a new unicode object.
411
   This function will allocate the object and its buffer in a single memory
412
   block.  Objects created using this function are not resizable. */
413
PyAPI_FUNC(PyObject*) PyUnicode_New(
414
    Py_ssize_t size,            /* Number of code points in the new string */
415
    Py_UCS4 maxchar             /* maximum code point value in the string */
416
    );
417
418
/* For backward compatibility. Soft-deprecated. */
419
/* Compatibility shim: the argument is ignored and the result is always 0. */
static inline int PyUnicode_READY(PyObject* Py_UNUSED(op)) {
    return 0;
}
#define PyUnicode_READY(op) \
    PyUnicode_READY(_PyObject_CAST(op))
424
425
/* Copy characters from one unicode object into another. This function performs
426
   character conversion when necessary and falls back to memcpy() if possible.
427
428
   Fail if to is too small (smaller than *how_many* or smaller than
429
   len(from)-from_start), or if kind(from[from_start:from_start+how_many]) >
430
   kind(to), or if *to* has more than 1 reference.
431
432
   Return the number of characters written, or return -1 and raise an exception
433
   on error.
434
435
   Pseudo-code:
436
437
       how_many = min(how_many, len(from) - from_start)
438
       to[to_start:to_start+how_many] = from[from_start:from_start+how_many]
439
       return how_many
440
441
   Note: The function doesn't write a terminating null character.
442
   */
443
PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters(
444
    PyObject *to,
445
    Py_ssize_t to_start,
446
    PyObject *from,
447
    Py_ssize_t from_start,
448
    Py_ssize_t how_many
449
    );
450
451
/* Fill a string with a character: write fill_char into
452
   unicode[start:start+length].
453
454
   Fail if fill_char is bigger than the string maximum character, or if the
455
   string has more than 1 reference.
456
457
   Return the number of characters written, or return -1 and raise an exception
458
   on error. */
459
PyAPI_FUNC(Py_ssize_t) PyUnicode_Fill(
460
    PyObject *unicode,
461
    Py_ssize_t start,
462
    Py_ssize_t length,
463
    Py_UCS4 fill_char
464
    );
465
466
/* Create a new string from a buffer of Py_UCS1, Py_UCS2 or Py_UCS4 characters.
467
   Scan the string to find the maximum character. */
468
PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
469
    int kind,
470
    const void *buffer,
471
    Py_ssize_t size);
472
473
474
/* --- Public PyUnicodeWriter API ----------------------------------------- */
475
476
typedef struct PyUnicodeWriter PyUnicodeWriter;
477
478
PyAPI_FUNC(PyUnicodeWriter*) PyUnicodeWriter_Create(Py_ssize_t length);
479
PyAPI_FUNC(void) PyUnicodeWriter_Discard(PyUnicodeWriter *writer);
480
PyAPI_FUNC(PyObject*) PyUnicodeWriter_Finish(PyUnicodeWriter *writer);
481
482
PyAPI_FUNC(int) PyUnicodeWriter_WriteChar(
483
    PyUnicodeWriter *writer,
484
    Py_UCS4 ch);
485
PyAPI_FUNC(int) PyUnicodeWriter_WriteUTF8(
486
    PyUnicodeWriter *writer,
487
    const char *str,
488
    Py_ssize_t size);
489
PyAPI_FUNC(int) PyUnicodeWriter_WriteASCII(
490
    PyUnicodeWriter *writer,
491
    const char *str,
492
    Py_ssize_t size);
493
PyAPI_FUNC(int) PyUnicodeWriter_WriteWideChar(
494
    PyUnicodeWriter *writer,
495
    const wchar_t *str,
496
    Py_ssize_t size);
497
PyAPI_FUNC(int) PyUnicodeWriter_WriteUCS4(
498
    PyUnicodeWriter *writer,
499
    Py_UCS4 *str,
500
    Py_ssize_t size);
501
502
PyAPI_FUNC(int) PyUnicodeWriter_WriteStr(
503
    PyUnicodeWriter *writer,
504
    PyObject *obj);
505
PyAPI_FUNC(int) PyUnicodeWriter_WriteRepr(
506
    PyUnicodeWriter *writer,
507
    PyObject *obj);
508
PyAPI_FUNC(int) PyUnicodeWriter_WriteSubstring(
509
    PyUnicodeWriter *writer,
510
    PyObject *str,
511
    Py_ssize_t start,
512
    Py_ssize_t end);
513
PyAPI_FUNC(int) PyUnicodeWriter_Format(
514
    PyUnicodeWriter *writer,
515
    const char *format,
516
    ...);
517
PyAPI_FUNC(int) PyUnicodeWriter_DecodeUTF8Stateful(
518
    PyUnicodeWriter *writer,
519
    const char *string,         /* UTF-8 encoded string */
520
    Py_ssize_t length,          /* size of string */
521
    const char *errors,         /* error handling */
522
    Py_ssize_t *consumed);      /* bytes consumed */
523
524
525
/* --- Private _PyUnicodeWriter API --------------------------------------- */
526
527
/* State of the private Unicode writer.  The _PyUnicodeWriter_Prepare()
   macro reads maxchar, size and pos; _PyUnicodeWriter_PrepareKind() reads
   kind. */
typedef struct {
    PyObject *buffer;
    void *data;
    int kind;               /* compared against KIND by _PyUnicodeWriter_PrepareKind() */
    Py_UCS4 maxchar;        /* compared against MAXCHAR by _PyUnicodeWriter_Prepare() */
    /* _PyUnicodeWriter_Prepare() treats (size - pos) as the room still
       available for the requested length. */
    Py_ssize_t size;
    Py_ssize_t pos;

    /* minimum number of allocated characters (default: 0) */
    Py_ssize_t min_length;

    /* minimum character (default: 127, ASCII) */
    Py_UCS4 min_char;

    /* If non-zero, overallocate the buffer (default: 0). */
    unsigned char overallocate;

    /* If readonly is 1, buffer is a shared string (cannot be modified)
       and size is set to 0. */
    unsigned char readonly;
} _PyUnicodeWriter;
548
549
// Initialize a Unicode writer.
550
//
551
// By default, the minimum buffer size is 0 characters and overallocation is
552
// disabled. Set min_length, min_char and overallocate attributes to control
553
// the allocation of the buffer.
554
_Py_DEPRECATED_EXTERNALLY(3.14) PyAPI_FUNC(void) _PyUnicodeWriter_Init(
555
    _PyUnicodeWriter *writer);
556
557
/* Prepare the buffer to write 'length' characters
   with the specified maximum character.

   The expansion evaluates to 0 without any call when MAXCHAR does not
   exceed the writer's maxchar and LENGTH fits in (size - pos), and also
   when LENGTH is 0.  In every other case it calls
   _PyUnicodeWriter_PrepareInternal().

   This is a macro: WRITER, LENGTH and MAXCHAR each appear several times in
   the expansion, so pass expressions without side effects.

   Return 0 on success, raise an exception and return -1 on error. */
#define _PyUnicodeWriter_Prepare(WRITER, LENGTH, MAXCHAR)             \
    (((MAXCHAR) <= (WRITER)->maxchar                                  \
      && (LENGTH) <= (WRITER)->size - (WRITER)->pos)                  \
     ? 0                                                              \
     : (((LENGTH) == 0)                                               \
        ? 0                                                           \
        : _PyUnicodeWriter_PrepareInternal((WRITER), (LENGTH), (MAXCHAR))))
568
569
/* Don't call this function directly, use the _PyUnicodeWriter_Prepare() macro
570
   instead. */
571
_Py_DEPRECATED_EXTERNALLY(3.14) PyAPI_FUNC(int) _PyUnicodeWriter_PrepareInternal(
572
    _PyUnicodeWriter *writer,
573
    Py_ssize_t length,
574
    Py_UCS4 maxchar);
575
576
/* Prepare the buffer to have at least the kind KIND.
   For example, kind=PyUnicode_2BYTE_KIND ensures that the writer will
   support characters in range U+0000-U+FFFF.

   The expansion evaluates to 0 without any call when KIND does not exceed
   the writer's current kind; otherwise it calls
   _PyUnicodeWriter_PrepareKindInternal().

   This is a macro: WRITER and KIND each appear twice in the expansion, so
   pass expressions without side effects.

   Return 0 on success, raise an exception and return -1 on error. */
#define _PyUnicodeWriter_PrepareKind(WRITER, KIND)                    \
    ((KIND) <= (WRITER)->kind                                         \
     ? 0                                                              \
     : _PyUnicodeWriter_PrepareKindInternal((WRITER), (KIND)))
585
586
/* Don't call this function directly, use the _PyUnicodeWriter_PrepareKind()
587
   macro instead. */
588
_Py_DEPRECATED_EXTERNALLY(3.14) PyAPI_FUNC(int) _PyUnicodeWriter_PrepareKindInternal(
589
    _PyUnicodeWriter *writer,
590
    int kind);
591
592
/* Append a Unicode character.
593
   Return 0 on success, raise an exception and return -1 on error. */
594
_Py_DEPRECATED_EXTERNALLY(3.14) PyAPI_FUNC(int) _PyUnicodeWriter_WriteChar(
595
    _PyUnicodeWriter *writer,
596
    Py_UCS4 ch);
597
598
/* Append a Unicode string.
599
   Return 0 on success, raise an exception and return -1 on error. */
600
_Py_DEPRECATED_EXTERNALLY(3.14) PyAPI_FUNC(int) _PyUnicodeWriter_WriteStr(
601
    _PyUnicodeWriter *writer,
602
    PyObject *str);               /* Unicode string */
603
604
/* Append a substring of a Unicode string.
605
   Return 0 on success, raise an exception and return -1 on error. */
606
_Py_DEPRECATED_EXTERNALLY(3.14) PyAPI_FUNC(int) _PyUnicodeWriter_WriteSubstring(
607
    _PyUnicodeWriter *writer,
608
    PyObject *str,              /* Unicode string */
609
    Py_ssize_t start,
610
    Py_ssize_t end);
611
612
/* Append an ASCII-encoded byte string.
613
   Return 0 on success, raise an exception and return -1 on error. */
614
_Py_DEPRECATED_EXTERNALLY(3.14) PyAPI_FUNC(int) _PyUnicodeWriter_WriteASCIIString(
615
    _PyUnicodeWriter *writer,
616
    const char *str,           /* ASCII-encoded byte string */
617
    Py_ssize_t len);           /* number of bytes, or -1 if unknown */
618
619
/* Append a latin1-encoded byte string.
620
   Return 0 on success, raise an exception and return -1 on error. */
621
_Py_DEPRECATED_EXTERNALLY(3.14) PyAPI_FUNC(int) _PyUnicodeWriter_WriteLatin1String(
622
    _PyUnicodeWriter *writer,
623
    const char *str,           /* latin1-encoded byte string */
624
    Py_ssize_t len);           /* length in bytes */
625
626
/* Get the value of the writer as a Unicode string. Clear the
627
   buffer of the writer. Raise an exception and return NULL
628
   on error. */
629
_Py_DEPRECATED_EXTERNALLY(3.14) PyAPI_FUNC(PyObject *) _PyUnicodeWriter_Finish(
630
    _PyUnicodeWriter *writer);
631
632
/* Deallocate memory of a writer (clear its internal buffer). */
633
_Py_DEPRECATED_EXTERNALLY(3.14) PyAPI_FUNC(void) _PyUnicodeWriter_Dealloc(
634
    _PyUnicodeWriter *writer);
635
636
637
/* --- Manage the default encoding ---------------------------------------- */
638
639
/* Returns a pointer to the default encoding (UTF-8) of the
640
   Unicode object unicode.
641
642
   Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
643
   in the unicodeobject.
644
645
   _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to
646
   support the previous internal function with the same behaviour.
647
648
   Use of this API is DEPRECATED since no size information can be
649
   extracted from the returned data.
650
*/
651
652
PyAPI_FUNC(const char *) PyUnicode_AsUTF8(PyObject *unicode);
653
654
// Deprecated alias kept for backward compatibility
655
/* Deprecated spelling of PyUnicode_AsUTF8(): forwards `unicode` unchanged
   and hands back whatever that call returns. */
Py_DEPRECATED(3.14) static inline const char*
_PyUnicode_AsString(PyObject *unicode) {
    const char *utf8 = PyUnicode_AsUTF8(unicode);
    return utf8;
}
660
661
662
/* === Characters Type APIs =============================================== */
663
664
/* These should not be used directly. Use the Py_UNICODE_IS* and
665
   Py_UNICODE_TO* macros instead.
666
667
   These APIs are implemented in Objects/unicodectype.c.
668
669
*/
670
671
PyAPI_FUNC(int) _PyUnicode_IsLowercase(
672
    Py_UCS4 ch       /* Unicode character */
673
    );
674
675
PyAPI_FUNC(int) _PyUnicode_IsUppercase(
676
    Py_UCS4 ch       /* Unicode character */
677
    );
678
679
PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
680
    Py_UCS4 ch       /* Unicode character */
681
    );
682
683
PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
684
    const Py_UCS4 ch         /* Unicode character */
685
    );
686
687
PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
688
    const Py_UCS4 ch         /* Unicode character */
689
    );
690
691
PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
692
    Py_UCS4 ch       /* Unicode character */
693
    );
694
695
PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
696
    Py_UCS4 ch       /* Unicode character */
697
    );
698
699
PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
700
    Py_UCS4 ch       /* Unicode character */
701
    );
702
703
PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
704
    Py_UCS4 ch       /* Unicode character */
705
    );
706
707
PyAPI_FUNC(int) _PyUnicode_ToDigit(
708
    Py_UCS4 ch       /* Unicode character */
709
    );
710
711
PyAPI_FUNC(double) _PyUnicode_ToNumeric(
712
    Py_UCS4 ch       /* Unicode character */
713
    );
714
715
PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
716
    Py_UCS4 ch       /* Unicode character */
717
    );
718
719
PyAPI_FUNC(int) _PyUnicode_IsDigit(
720
    Py_UCS4 ch       /* Unicode character */
721
    );
722
723
PyAPI_FUNC(int) _PyUnicode_IsNumeric(
724
    Py_UCS4 ch       /* Unicode character */
725
    );
726
727
PyAPI_FUNC(int) _PyUnicode_IsPrintable(
728
    Py_UCS4 ch       /* Unicode character */
729
    );
730
731
PyAPI_FUNC(int) _PyUnicode_IsAlpha(
732
    Py_UCS4 ch       /* Unicode character */
733
    );
734
735
// Helper array used by Py_UNICODE_ISSPACE().
736
PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
737
738
// Since splitting on whitespace is an important use case, and
739
// whitespace in most situations is solely ASCII whitespace, we
740
// optimize for the common case by using a quick look-up table
741
// _Py_ascii_whitespace (declared above) with an inlined check.
742
0
/* Whitespace test.  Code points below 128 are answered from the
   _Py_ascii_whitespace lookup table; everything else is delegated to
   _PyUnicode_IsWhitespace(). */
static inline int Py_UNICODE_ISSPACE(Py_UCS4 ch)
{
    return (ch < 128)
        ? _Py_ascii_whitespace[ch]
        : _PyUnicode_IsWhitespace(ch);
}
748
749
#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
750
#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
751
#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
752
#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
753
754
#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
755
#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
756
#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
757
758
#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
759
#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
760
#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
761
#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)
762
763
#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
764
#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
765
#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
766
767
#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
768
769
0
/* Return 1 when `ch` is alphabetic, decimal, digit or numeric, 0 otherwise.
   The predicates are tried in that order and evaluation stops at the first
   one that succeeds. */
static inline int Py_UNICODE_ISALNUM(Py_UCS4 ch)
{
    if (Py_UNICODE_ISALPHA(ch)) {
        return 1;
    }
    if (Py_UNICODE_ISDECIMAL(ch)) {
        return 1;
    }
    if (Py_UNICODE_ISDIGIT(ch)) {
        return 1;
    }
    return Py_UNICODE_ISNUMERIC(ch) != 0;
}
775
776
777
/* === Misc functions ===================================================== */
778
779
// Return an interned Unicode object for an Identifier; may fail if there is no
780
// memory.
781
PyAPI_FUNC(PyObject*) _PyUnicode_FromId(_Py_Identifier*);