/src/cpython-install/include/python3.15/cpython/unicodeobject.h
Line | Count | Source (jump to first uncovered line) |
1 | | #ifndef Py_CPYTHON_UNICODEOBJECT_H |
2 | | # error "this header file must not be included directly" |
3 | | #endif |
4 | | |
5 | | /* Py_UNICODE was the native Unicode storage format (code unit) used by |
6 | | Python and represented a single Unicode element in the Unicode type. |
7 | | With PEP 393, Py_UNICODE is deprecated and replaced with a |
8 | | typedef to wchar_t. */ |
9 | | Py_DEPRECATED(3.13) typedef wchar_t PY_UNICODE_TYPE; |
10 | | Py_DEPRECATED(3.13) typedef wchar_t Py_UNICODE; |
11 | | |
12 | | |
13 | | /* --- Internal Unicode Operations ---------------------------------------- */ |
14 | | |
15 | | // Static inline functions to work with surrogates |
16 | 0 | static inline int Py_UNICODE_IS_SURROGATE(Py_UCS4 ch) { |
17 | 0 | return (0xD800 <= ch && ch <= 0xDFFF); |
18 | 0 | } |
19 | 0 | static inline int Py_UNICODE_IS_HIGH_SURROGATE(Py_UCS4 ch) { |
20 | 0 | return (0xD800 <= ch && ch <= 0xDBFF); |
21 | 0 | } |
22 | 0 | static inline int Py_UNICODE_IS_LOW_SURROGATE(Py_UCS4 ch) { |
23 | 0 | return (0xDC00 <= ch && ch <= 0xDFFF); |
24 | 0 | } |
25 | | |
26 | | // Join two surrogate characters and return a single Py_UCS4 value. |
27 | 0 | static inline Py_UCS4 Py_UNICODE_JOIN_SURROGATES(Py_UCS4 high, Py_UCS4 low) { |
28 | 0 | assert(Py_UNICODE_IS_HIGH_SURROGATE(high)); |
29 | 0 | assert(Py_UNICODE_IS_LOW_SURROGATE(low)); |
30 | 0 | return 0x10000 + (((high & 0x03FF) << 10) | (low & 0x03FF)); |
31 | 0 | } |
32 | | |
33 | | // High surrogate = top 10 bits added to 0xD800. |
34 | | // The character must be in the range [U+10000; U+10ffff]. |
35 | 0 | static inline Py_UCS4 Py_UNICODE_HIGH_SURROGATE(Py_UCS4 ch) { |
36 | 0 | assert(0x10000 <= ch && ch <= 0x10ffff); |
37 | 0 | return (0xD800 - (0x10000 >> 10) + (ch >> 10)); |
38 | 0 | } |
39 | | |
40 | | // Low surrogate = bottom 10 bits added to 0xDC00. |
41 | | // The character must be in the range [U+10000; U+10ffff]. |
42 | 0 | static inline Py_UCS4 Py_UNICODE_LOW_SURROGATE(Py_UCS4 ch) { |
43 | 0 | assert(0x10000 <= ch && ch <= 0x10ffff); |
44 | 0 | return (0xDC00 + (ch & 0x3FF)); |
45 | 0 | } |
46 | | |
47 | | |
48 | | /* --- Unicode Type ------------------------------------------------------- */ |
49 | | |
50 | | struct _PyUnicodeObject_state { |
51 | | /* If interned is non-zero, the two references from the |
52 | | dictionary to this object are *not* counted in ob_refcnt. |
53 | | The possible values here are: |
54 | | 0: Not Interned |
55 | | 1: Interned |
56 | | 2: Interned and Immortal |
57 | | 3: Interned, Immortal, and Static |
58 | | This categorization allows the runtime to determine the right |
59 | | cleanup mechanism at runtime shutdown. */ |
60 | | #ifdef Py_GIL_DISABLED |
61 | | // Needs to be accessed atomically, so can't be a bit field. |
62 | | unsigned char interned; |
63 | | #else |
64 | | unsigned int interned:2; |
65 | | #endif |
66 | | /* Character size: |
67 | | |
68 | | - PyUnicode_1BYTE_KIND (1): |
69 | | |
70 | | * character type = Py_UCS1 (8 bits, unsigned) |
71 | | * all characters are in the range U+0000-U+00FF (latin1) |
72 | | * if ascii is set, all characters are in the range U+0000-U+007F |
73 | | (ASCII), otherwise at least one character is in the range |
74 | | U+0080-U+00FF |
75 | | |
76 | | - PyUnicode_2BYTE_KIND (2): |
77 | | |
78 | | * character type = Py_UCS2 (16 bits, unsigned) |
79 | | * all characters are in the range U+0000-U+FFFF (BMP) |
80 | | * at least one character is in the range U+0100-U+FFFF |
81 | | |
82 | | - PyUnicode_4BYTE_KIND (4): |
83 | | |
84 | | * character type = Py_UCS4 (32 bits, unsigned) |
85 | | * all characters are in the range U+0000-U+10FFFF |
86 | | * at least one character is in the range U+10000-U+10FFFF |
87 | | */ |
88 | | unsigned int kind:3; |
89 | | /* Compact is with respect to the allocation scheme. Compact unicode |
90 | | objects only require one memory block while non-compact objects use |
91 | | one block for the PyUnicodeObject struct and another for its data |
92 | | buffer. */ |
93 | | unsigned int compact:1; |
94 | | /* The string only contains characters in the range U+0000-U+007F (ASCII) |
95 | | and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is |
96 | | set, use the PyASCIIObject structure. */ |
97 | | unsigned int ascii:1; |
98 | | /* The object is statically allocated. */ |
99 | | unsigned int statically_allocated:1; |
100 | | #ifndef Py_GIL_DISABLED |
101 | | /* Historical: padding to ensure that PyUnicode_DATA() is always aligned to |
102 | | 4 bytes (see issue gh-63736 on m68k) */ |
103 | | unsigned int :24; |
104 | | #endif |
105 | | }; |
106 | | |
107 | | /* ASCII-only strings created through PyUnicode_New use the PyASCIIObject |
108 | | structure. state.ascii and state.compact are set, and the data |
109 | | immediately follow the structure. utf8_length can be found |
110 | | in the length field; the utf8 pointer is equal to the data pointer. */ |
111 | | typedef struct { |
112 | | /* There are 4 forms of Unicode strings: |
113 | | |
114 | | - compact ascii: |
115 | | |
116 | | * structure = PyASCIIObject |
117 | | * test: PyUnicode_IS_COMPACT_ASCII(op) |
118 | | * kind = PyUnicode_1BYTE_KIND |
119 | | * compact = 1 |
120 | | * ascii = 1 |
121 | | * (length is the length of the utf8) |
122 | | * (data starts just after the structure) |
123 | | * (since ASCII is decoded from UTF-8, the utf8 string is the data) |
124 | | |
125 | | - compact: |
126 | | |
127 | | * structure = PyCompactUnicodeObject |
128 | | * test: PyUnicode_IS_COMPACT(op) && !PyUnicode_IS_ASCII(op) |
129 | | * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or |
130 | | PyUnicode_4BYTE_KIND |
131 | | * compact = 1 |
132 | | * ascii = 0 |
133 | | * utf8 is not shared with data |
134 | | * utf8_length = 0 if utf8 is NULL |
135 | | * (data starts just after the structure) |
136 | | |
137 | | - legacy string: |
138 | | |
139 | | * structure = PyUnicodeObject structure |
140 | | * test: !PyUnicode_IS_COMPACT(op) |
141 | | * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or |
142 | | PyUnicode_4BYTE_KIND |
143 | | * compact = 0 |
144 | | * data.any is not NULL |
145 | | * utf8 is shared with data.any and utf8_length = length if ascii = 1 |
146 | | * utf8_length = 0 if utf8 is NULL |
147 | | |
148 | | Compact strings use only one memory block (structure + characters), |
149 | | whereas legacy strings use one block for the structure and one block |
150 | | for characters. |
151 | | |
152 | | Legacy strings are created by subclasses of Unicode. |
153 | | |
154 | | See also _PyUnicode_CheckConsistency(). |
155 | | */ |
156 | | PyObject_HEAD |
157 | | Py_ssize_t length; /* Number of code points in the string */ |
158 | | Py_hash_t hash; /* Hash value; -1 if not set */ |
159 | | /* Ensure 4 byte alignment for PyUnicode_DATA(), see gh-63736 on m68k. */ |
160 | | _Py_ALIGNED_DEF(4, struct _PyUnicodeObject_state) state; |
161 | | } PyASCIIObject; |
162 | | |
163 | | /* Non-ASCII strings allocated through PyUnicode_New use the |
164 | | PyCompactUnicodeObject structure. state.compact is set, and the data |
165 | | immediately follow the structure. */ |
166 | | typedef struct { |
167 | | PyASCIIObject _base; |
168 | | Py_ssize_t utf8_length; /* Number of bytes in utf8, excluding the |
169 | | * terminating \0. */ |
170 | | char *utf8; /* UTF-8 representation (null-terminated) */ |
171 | | } PyCompactUnicodeObject; |
172 | | |
173 | | /* Object format for Unicode subclasses. */ |
174 | | typedef struct { |
175 | | PyCompactUnicodeObject _base; |
176 | | union { |
177 | | void *any; |
178 | | Py_UCS1 *latin1; |
179 | | Py_UCS2 *ucs2; |
180 | | Py_UCS4 *ucs4; |
181 | | } data; /* Canonical, smallest-form Unicode buffer */ |
182 | | } PyUnicodeObject; |
183 | | |
184 | | |
185 | | #define _PyASCIIObject_CAST(op) \ |
186 | | (assert(PyUnicode_Check(op)), \ |
187 | | _Py_CAST(PyASCIIObject*, (op))) |
188 | | #define _PyCompactUnicodeObject_CAST(op) \ |
189 | | (assert(PyUnicode_Check(op)), \ |
190 | | _Py_CAST(PyCompactUnicodeObject*, (op))) |
191 | | #define _PyUnicodeObject_CAST(op) \ |
192 | | (assert(PyUnicode_Check(op)), \ |
193 | | _Py_CAST(PyUnicodeObject*, (op))) |
194 | | |
195 | | |
196 | | /* --- Flexible String Representation Helper Macros (PEP 393) -------------- */ |
197 | | |
198 | | /* Values for PyASCIIObject.state: */ |
199 | | |
200 | | /* Interning state. */ |
201 | | #define SSTATE_NOT_INTERNED 0 |
202 | | #define SSTATE_INTERNED_MORTAL 1 |
203 | | #define SSTATE_INTERNED_IMMORTAL 2 |
204 | | #define SSTATE_INTERNED_IMMORTAL_STATIC 3 |
205 | | |
206 | | /* Use only if you know it's a string */ |
207 | 0 | static inline unsigned int PyUnicode_CHECK_INTERNED(PyObject *op) { |
208 | 0 | #ifdef Py_GIL_DISABLED |
209 | 0 | return _Py_atomic_load_uint8_relaxed(&_PyASCIIObject_CAST(op)->state.interned); |
210 | 0 | #else |
211 | 0 | return _PyASCIIObject_CAST(op)->state.interned; |
212 | 0 | #endif |
213 | 0 | } |
214 | | #define PyUnicode_CHECK_INTERNED(op) PyUnicode_CHECK_INTERNED(_PyObject_CAST(op)) |
215 | | |
216 | | /* For backward compatibility. Soft-deprecated. */ |
217 | 0 | static inline unsigned int PyUnicode_IS_READY(PyObject* Py_UNUSED(op)) { |
218 | 0 | return 1; |
219 | 0 | } |
220 | | #define PyUnicode_IS_READY(op) PyUnicode_IS_READY(_PyObject_CAST(op)) |
221 | | |
222 | | /* Return true if the string contains only ASCII characters, or 0 if not. The |
223 | | string may be compact (PyUnicode_IS_COMPACT_ASCII) or not. */ |
224 | 0 | static inline unsigned int PyUnicode_IS_ASCII(PyObject *op) { |
225 | 0 | return _PyASCIIObject_CAST(op)->state.ascii; |
226 | 0 | } |
227 | | #define PyUnicode_IS_ASCII(op) PyUnicode_IS_ASCII(_PyObject_CAST(op)) |
228 | | |
229 | | /* Return true if the string is compact or 0 if not. |
230 | | No type checks are performed. */ |
231 | 0 | static inline unsigned int PyUnicode_IS_COMPACT(PyObject *op) { |
232 | 0 | return _PyASCIIObject_CAST(op)->state.compact; |
233 | 0 | } |
234 | | #define PyUnicode_IS_COMPACT(op) PyUnicode_IS_COMPACT(_PyObject_CAST(op)) |
235 | | |
236 | | /* Return true if the string is a compact ASCII string (use PyASCIIObject |
237 | | structure), or 0 if not. No type checks are performed. */ |
238 | 0 | static inline int PyUnicode_IS_COMPACT_ASCII(PyObject *op) { |
239 | 0 | return (_PyASCIIObject_CAST(op)->state.ascii && PyUnicode_IS_COMPACT(op)); |
240 | 0 | } |
241 | | #define PyUnicode_IS_COMPACT_ASCII(op) PyUnicode_IS_COMPACT_ASCII(_PyObject_CAST(op)) |
242 | | |
243 | | enum PyUnicode_Kind { |
244 | | /* Return values of the PyUnicode_KIND() function: */ |
245 | | PyUnicode_1BYTE_KIND = 1, |
246 | | PyUnicode_2BYTE_KIND = 2, |
247 | | PyUnicode_4BYTE_KIND = 4 |
248 | | }; |
249 | | |
250 | | PyAPI_FUNC(int) PyUnicode_KIND(PyObject *op); |
251 | | |
252 | | // PyUnicode_KIND(): Return one of the PyUnicode_*_KIND values defined above. |
253 | | // |
254 | | // gh-89653: Converting this macro to a static inline function would introduce |
255 | | // new compiler warnings on "kind < PyUnicode_KIND(str)" (compare signed and |
256 | | // unsigned numbers) where kind type is an int or on |
257 | | // "unsigned int kind = PyUnicode_KIND(str)" (cast signed to unsigned). |
258 | | #define PyUnicode_KIND(op) _Py_RVALUE(_PyASCIIObject_CAST(op)->state.kind) |
259 | | |
260 | | /* Return a void pointer to the raw unicode buffer. */ |
261 | 0 | static inline void* _PyUnicode_COMPACT_DATA(PyObject *op) { |
262 | 0 | if (PyUnicode_IS_ASCII(op)) { |
263 | 0 | return _Py_STATIC_CAST(void*, (_PyASCIIObject_CAST(op) + 1)); |
264 | 0 | } |
265 | 0 | return _Py_STATIC_CAST(void*, (_PyCompactUnicodeObject_CAST(op) + 1)); |
266 | 0 | } |
267 | | |
268 | 0 | static inline void* _PyUnicode_NONCOMPACT_DATA(PyObject *op) { |
269 | 0 | void *data; |
270 | 0 | assert(!PyUnicode_IS_COMPACT(op)); |
271 | 0 | data = _PyUnicodeObject_CAST(op)->data.any; |
272 | 0 | assert(data != NULL); |
273 | 0 | return data; |
274 | 0 | } |
275 | | |
276 | | PyAPI_FUNC(void*) PyUnicode_DATA(PyObject *op); |
277 | | |
278 | 0 | static inline void* _PyUnicode_DATA(PyObject *op) { |
279 | 0 | if (PyUnicode_IS_COMPACT(op)) { |
280 | 0 | return _PyUnicode_COMPACT_DATA(op); |
281 | 0 | } |
282 | 0 | return _PyUnicode_NONCOMPACT_DATA(op); |
283 | 0 | } |
284 | | #define PyUnicode_DATA(op) _PyUnicode_DATA(_PyObject_CAST(op)) |
285 | | |
286 | | /* Return pointers to the canonical representation cast to unsigned char, |
287 | | Py_UCS2, or Py_UCS4 for direct character access. |
288 | | No checks are performed, use PyUnicode_KIND() before to ensure |
289 | | these will work correctly. */ |
290 | | |
291 | | #define PyUnicode_1BYTE_DATA(op) _Py_STATIC_CAST(Py_UCS1*, PyUnicode_DATA(op)) |
292 | | #define PyUnicode_2BYTE_DATA(op) _Py_STATIC_CAST(Py_UCS2*, PyUnicode_DATA(op)) |
293 | | #define PyUnicode_4BYTE_DATA(op) _Py_STATIC_CAST(Py_UCS4*, PyUnicode_DATA(op)) |
294 | | |
295 | | /* Returns the length of the unicode string. */ |
296 | 0 | static inline Py_ssize_t PyUnicode_GET_LENGTH(PyObject *op) { |
297 | 0 | return _PyASCIIObject_CAST(op)->length; |
298 | 0 | } |
299 | | #define PyUnicode_GET_LENGTH(op) PyUnicode_GET_LENGTH(_PyObject_CAST(op)) |
300 | | |
301 | | /* Returns the cached hash, or -1 if not cached yet. */ |
302 | | static inline Py_hash_t |
303 | 0 | PyUnstable_Unicode_GET_CACHED_HASH(PyObject *op) { |
304 | 0 | assert(PyUnicode_Check(op)); |
305 | 0 | #ifdef Py_GIL_DISABLED |
306 | 0 | return _Py_atomic_load_ssize_relaxed(&_PyASCIIObject_CAST(op)->hash); |
307 | 0 | #else |
308 | 0 | return _PyASCIIObject_CAST(op)->hash; |
309 | 0 | #endif |
310 | 0 | } |
311 | | |
312 | | /* Write into the canonical representation, this function does not do any sanity |
313 | | checks and is intended for usage in loops. The caller should cache the |
314 | | kind and data pointers obtained from other function calls. |
315 | | index is the index in the string (starts at 0) and value is the new |
316 | | code point value which should be written to that location. */ |
317 | | static inline void PyUnicode_WRITE(int kind, void *data, |
318 | | Py_ssize_t index, Py_UCS4 value) |
319 | 0 | { |
320 | 0 | assert(index >= 0); |
321 | 0 | if (kind == PyUnicode_1BYTE_KIND) { |
322 | 0 | assert(value <= 0xffU); |
323 | 0 | _Py_STATIC_CAST(Py_UCS1*, data)[index] = _Py_STATIC_CAST(Py_UCS1, value); |
324 | 0 | } |
325 | 0 | else if (kind == PyUnicode_2BYTE_KIND) { |
326 | 0 | assert(value <= 0xffffU); |
327 | 0 | _Py_STATIC_CAST(Py_UCS2*, data)[index] = _Py_STATIC_CAST(Py_UCS2, value); |
328 | 0 | } |
329 | 0 | else { |
330 | 0 | assert(kind == PyUnicode_4BYTE_KIND); |
331 | 0 | assert(value <= 0x10ffffU); |
332 | 0 | _Py_STATIC_CAST(Py_UCS4*, data)[index] = value; |
333 | 0 | } |
334 | 0 | } |
335 | | #define PyUnicode_WRITE(kind, data, index, value) \ |
336 | | PyUnicode_WRITE(_Py_STATIC_CAST(int, kind), _Py_CAST(void*, data), \ |
337 | | (index), _Py_STATIC_CAST(Py_UCS4, value)) |
338 | | |
339 | | /* Read a code point from the string's canonical representation. No checks |
340 | | are performed. */ |
341 | | static inline Py_UCS4 PyUnicode_READ(int kind, |
342 | | const void *data, Py_ssize_t index) |
343 | 0 | { |
344 | 0 | assert(index >= 0); |
345 | 0 | if (kind == PyUnicode_1BYTE_KIND) { |
346 | 0 | return _Py_STATIC_CAST(const Py_UCS1*, data)[index]; |
347 | 0 | } |
348 | 0 | if (kind == PyUnicode_2BYTE_KIND) { |
349 | 0 | return _Py_STATIC_CAST(const Py_UCS2*, data)[index]; |
350 | 0 | } |
351 | 0 | assert(kind == PyUnicode_4BYTE_KIND); |
352 | 0 | return _Py_STATIC_CAST(const Py_UCS4*, data)[index]; |
353 | 0 | } |
354 | | #define PyUnicode_READ(kind, data, index) \ |
355 | | PyUnicode_READ(_Py_STATIC_CAST(int, kind), \ |
356 | | _Py_STATIC_CAST(const void*, data), \ |
357 | | (index)) |
358 | | |
359 | | /* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it |
360 | | looks up the kind and the data pointer on every call. For single reads, |
361 | | use PyUnicode_READ_CHAR(); for multiple consecutive reads, callers should |
362 | | cache kind and data and use PyUnicode_READ() instead. */ |
363 | | static inline Py_UCS4 PyUnicode_READ_CHAR(PyObject *unicode, Py_ssize_t index) |
364 | 0 | { |
365 | 0 | int kind; |
366 | 0 |
|
367 | 0 | assert(index >= 0); |
368 | 0 | // Tolerate reading the NUL character at str[len(str)] |
369 | 0 | assert(index <= PyUnicode_GET_LENGTH(unicode)); |
370 | 0 |
|
371 | 0 | kind = PyUnicode_KIND(unicode); |
372 | 0 | if (kind == PyUnicode_1BYTE_KIND) { |
373 | 0 | return PyUnicode_1BYTE_DATA(unicode)[index]; |
374 | 0 | } |
375 | 0 | if (kind == PyUnicode_2BYTE_KIND) { |
376 | 0 | return PyUnicode_2BYTE_DATA(unicode)[index]; |
377 | 0 | } |
378 | 0 | assert(kind == PyUnicode_4BYTE_KIND); |
379 | 0 | return PyUnicode_4BYTE_DATA(unicode)[index]; |
380 | 0 | } |
381 | | #define PyUnicode_READ_CHAR(unicode, index) \ |
382 | | PyUnicode_READ_CHAR(_PyObject_CAST(unicode), (index)) |
383 | | |
384 | | /* Return a maximum character value which is suitable for creating another |
385 | | string based on op. This is always an approximation but more efficient |
386 | | than iterating over the string. */ |
387 | | static inline Py_UCS4 PyUnicode_MAX_CHAR_VALUE(PyObject *op) |
388 | 0 | { |
389 | 0 | int kind; |
390 | 0 |
|
391 | 0 | if (PyUnicode_IS_ASCII(op)) { |
392 | 0 | return 0x7fU; |
393 | 0 | } |
394 | 0 |
|
395 | 0 | kind = PyUnicode_KIND(op); |
396 | 0 | if (kind == PyUnicode_1BYTE_KIND) { |
397 | 0 | return 0xffU; |
398 | 0 | } |
399 | 0 | if (kind == PyUnicode_2BYTE_KIND) { |
400 | 0 | return 0xffffU; |
401 | 0 | } |
402 | 0 | assert(kind == PyUnicode_4BYTE_KIND); |
403 | 0 | return 0x10ffffU; |
404 | 0 | } |
405 | | #define PyUnicode_MAX_CHAR_VALUE(op) \ |
406 | | PyUnicode_MAX_CHAR_VALUE(_PyObject_CAST(op)) |
407 | | |
408 | | |
409 | | /* === Public API ========================================================= */ |
410 | | |
411 | | /* With PEP 393, this is the recommended way to allocate a new unicode object. |
412 | | This function will allocate the object and its buffer in a single memory |
413 | | block. Objects created using this function are not resizable. */ |
414 | | PyAPI_FUNC(PyObject*) PyUnicode_New( |
415 | | Py_ssize_t size, /* Number of code points in the new string */ |
416 | | Py_UCS4 maxchar /* maximum code point value in the string */ |
417 | | ); |
418 | | |
419 | | /* For backward compatibility. Soft-deprecated. */ |
420 | | static inline int PyUnicode_READY(PyObject* Py_UNUSED(op)) |
421 | 0 | { |
422 | 0 | return 0; |
423 | 0 | } |
424 | | #define PyUnicode_READY(op) PyUnicode_READY(_PyObject_CAST(op)) |
425 | | |
426 | | /* Copy characters from one unicode object into another; this function performs |
427 | | character conversion when necessary and falls back to memcpy() if possible. |
428 | | |
429 | | Fail if to is too small (smaller than *how_many* or smaller than |
430 | | len(from)-from_start), or if kind(from[from_start:from_start+how_many]) > |
431 | | kind(to), or if *to* has more than 1 reference. |
432 | | |
433 | | Return the number of characters written, or return -1 and raise an exception |
434 | | on error. |
435 | | |
436 | | Pseudo-code: |
437 | | |
438 | | how_many = min(how_many, len(from) - from_start) |
439 | | to[to_start:to_start+how_many] = from[from_start:from_start+how_many] |
440 | | return how_many |
441 | | |
442 | | Note: The function doesn't write a terminating null character. |
443 | | */ |
444 | | PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters( |
445 | | PyObject *to, |
446 | | Py_ssize_t to_start, |
447 | | PyObject *from, |
448 | | Py_ssize_t from_start, |
449 | | Py_ssize_t how_many |
450 | | ); |
451 | | |
452 | | /* Fill a string with a character: write fill_char into |
453 | | unicode[start:start+length]. |
454 | | |
455 | | Fail if fill_char is bigger than the string maximum character, or if the |
456 | | string has more than 1 reference. |
457 | | |
458 | | Return the number of characters written, or return -1 and raise an exception |
459 | | on error. */ |
460 | | PyAPI_FUNC(Py_ssize_t) PyUnicode_Fill( |
461 | | PyObject *unicode, |
462 | | Py_ssize_t start, |
463 | | Py_ssize_t length, |
464 | | Py_UCS4 fill_char |
465 | | ); |
466 | | |
467 | | /* Create a new string from a buffer of Py_UCS1, Py_UCS2 or Py_UCS4 characters. |
468 | | Scan the string to find the maximum character. */ |
469 | | PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData( |
470 | | int kind, |
471 | | const void *buffer, |
472 | | Py_ssize_t size); |
473 | | |
474 | | |
475 | | /* --- Public PyUnicodeWriter API ----------------------------------------- */ |
476 | | |
477 | | typedef struct PyUnicodeWriter PyUnicodeWriter; |
478 | | |
479 | | PyAPI_FUNC(PyUnicodeWriter*) PyUnicodeWriter_Create(Py_ssize_t length); |
480 | | PyAPI_FUNC(void) PyUnicodeWriter_Discard(PyUnicodeWriter *writer); |
481 | | PyAPI_FUNC(PyObject*) PyUnicodeWriter_Finish(PyUnicodeWriter *writer); |
482 | | |
483 | | PyAPI_FUNC(int) PyUnicodeWriter_WriteChar( |
484 | | PyUnicodeWriter *writer, |
485 | | Py_UCS4 ch); |
486 | | PyAPI_FUNC(int) PyUnicodeWriter_WriteUTF8( |
487 | | PyUnicodeWriter *writer, |
488 | | const char *str, |
489 | | Py_ssize_t size); |
490 | | PyAPI_FUNC(int) PyUnicodeWriter_WriteASCII( |
491 | | PyUnicodeWriter *writer, |
492 | | const char *str, |
493 | | Py_ssize_t size); |
494 | | PyAPI_FUNC(int) PyUnicodeWriter_WriteWideChar( |
495 | | PyUnicodeWriter *writer, |
496 | | const wchar_t *str, |
497 | | Py_ssize_t size); |
498 | | PyAPI_FUNC(int) PyUnicodeWriter_WriteUCS4( |
499 | | PyUnicodeWriter *writer, |
500 | | Py_UCS4 *str, |
501 | | Py_ssize_t size); |
502 | | |
503 | | PyAPI_FUNC(int) PyUnicodeWriter_WriteStr( |
504 | | PyUnicodeWriter *writer, |
505 | | PyObject *obj); |
506 | | PyAPI_FUNC(int) PyUnicodeWriter_WriteRepr( |
507 | | PyUnicodeWriter *writer, |
508 | | PyObject *obj); |
509 | | PyAPI_FUNC(int) PyUnicodeWriter_WriteSubstring( |
510 | | PyUnicodeWriter *writer, |
511 | | PyObject *str, |
512 | | Py_ssize_t start, |
513 | | Py_ssize_t end); |
514 | | PyAPI_FUNC(int) PyUnicodeWriter_Format( |
515 | | PyUnicodeWriter *writer, |
516 | | const char *format, |
517 | | ...); |
518 | | PyAPI_FUNC(int) PyUnicodeWriter_DecodeUTF8Stateful( |
519 | | PyUnicodeWriter *writer, |
520 | | const char *string, /* UTF-8 encoded string */ |
521 | | Py_ssize_t length, /* size of string */ |
522 | | const char *errors, /* error handling */ |
523 | | Py_ssize_t *consumed); /* bytes consumed */ |
524 | | |
525 | | |
526 | | /* --- Private _PyUnicodeWriter API --------------------------------------- */ |
527 | | |
528 | | typedef struct { |
529 | | PyObject *buffer; |
530 | | void *data; |
531 | | int kind; |
532 | | Py_UCS4 maxchar; |
533 | | Py_ssize_t size; |
534 | | Py_ssize_t pos; |
535 | | |
536 | | /* minimum number of allocated characters (default: 0) */ |
537 | | Py_ssize_t min_length; |
538 | | |
539 | | /* minimum character (default: 127, ASCII) */ |
540 | | Py_UCS4 min_char; |
541 | | |
542 | | /* If non-zero, overallocate the buffer (default: 0). */ |
543 | | unsigned char overallocate; |
544 | | |
545 | | /* If readonly is 1, buffer is a shared string (cannot be modified) |
546 | | and size is set to 0. */ |
547 | | unsigned char readonly; |
548 | | } _PyUnicodeWriter; |
549 | | |
550 | | // Initialize a Unicode writer. |
551 | | // |
552 | | // By default, the minimum buffer size is 0 characters and overallocation is |
553 | | // disabled. Set min_length, min_char and overallocate attributes to control |
554 | | // the allocation of the buffer. |
555 | | _Py_DEPRECATED_EXTERNALLY(3.14) PyAPI_FUNC(void) _PyUnicodeWriter_Init( |
556 | | _PyUnicodeWriter *writer); |
557 | | |
558 | | /* Prepare the buffer to write 'length' characters |
559 | | with the specified maximum character. |
560 | | |
561 | | Return 0 on success, raise an exception and return -1 on error. */ |
562 | | #define _PyUnicodeWriter_Prepare(WRITER, LENGTH, MAXCHAR) \ |
563 | | (((MAXCHAR) <= (WRITER)->maxchar \ |
564 | | && (LENGTH) <= (WRITER)->size - (WRITER)->pos) \ |
565 | | ? 0 \ |
566 | | : (((LENGTH) == 0) \ |
567 | | ? 0 \ |
568 | | : _PyUnicodeWriter_PrepareInternal((WRITER), (LENGTH), (MAXCHAR)))) |
569 | | |
570 | | /* Don't call this function directly, use the _PyUnicodeWriter_Prepare() macro |
571 | | instead. */ |
572 | | _Py_DEPRECATED_EXTERNALLY(3.14) PyAPI_FUNC(int) _PyUnicodeWriter_PrepareInternal( |
573 | | _PyUnicodeWriter *writer, |
574 | | Py_ssize_t length, |
575 | | Py_UCS4 maxchar); |
576 | | |
577 | | /* Prepare the buffer to have at least the kind KIND. |
578 | | For example, kind=PyUnicode_2BYTE_KIND ensures that the writer will |
579 | | support characters in range U+0000-U+FFFF. |
580 | | |
581 | | Return 0 on success, raise an exception and return -1 on error. */ |
582 | | #define _PyUnicodeWriter_PrepareKind(WRITER, KIND) \ |
583 | | ((KIND) <= (WRITER)->kind \ |
584 | | ? 0 \ |
585 | | : _PyUnicodeWriter_PrepareKindInternal((WRITER), (KIND))) |
586 | | |
587 | | /* Don't call this function directly, use the _PyUnicodeWriter_PrepareKind() |
588 | | macro instead. */ |
589 | | _Py_DEPRECATED_EXTERNALLY(3.14) PyAPI_FUNC(int) _PyUnicodeWriter_PrepareKindInternal( |
590 | | _PyUnicodeWriter *writer, |
591 | | int kind); |
592 | | |
593 | | /* Append a Unicode character. |
594 | | Return 0 on success, raise an exception and return -1 on error. */ |
595 | | _Py_DEPRECATED_EXTERNALLY(3.14) PyAPI_FUNC(int) _PyUnicodeWriter_WriteChar( |
596 | | _PyUnicodeWriter *writer, |
597 | | Py_UCS4 ch); |
598 | | |
599 | | /* Append a Unicode string. |
600 | | Return 0 on success, raise an exception and return -1 on error. */ |
601 | | _Py_DEPRECATED_EXTERNALLY(3.14) PyAPI_FUNC(int) _PyUnicodeWriter_WriteStr( |
602 | | _PyUnicodeWriter *writer, |
603 | | PyObject *str); /* Unicode string */ |
604 | | |
605 | | /* Append a substring of a Unicode string. |
606 | | Return 0 on success, raise an exception and return -1 on error. */ |
607 | | _Py_DEPRECATED_EXTERNALLY(3.14) PyAPI_FUNC(int) _PyUnicodeWriter_WriteSubstring( |
608 | | _PyUnicodeWriter *writer, |
609 | | PyObject *str, /* Unicode string */ |
610 | | Py_ssize_t start, |
611 | | Py_ssize_t end); |
612 | | |
613 | | /* Append an ASCII-encoded byte string. |
614 | | Return 0 on success, raise an exception and return -1 on error. */ |
615 | | _Py_DEPRECATED_EXTERNALLY(3.14) PyAPI_FUNC(int) _PyUnicodeWriter_WriteASCIIString( |
616 | | _PyUnicodeWriter *writer, |
617 | | const char *str, /* ASCII-encoded byte string */ |
618 | | Py_ssize_t len); /* number of bytes, or -1 if unknown */ |
619 | | |
620 | | /* Append a latin1-encoded byte string. |
621 | | Return 0 on success, raise an exception and return -1 on error. */ |
622 | | _Py_DEPRECATED_EXTERNALLY(3.14) PyAPI_FUNC(int) _PyUnicodeWriter_WriteLatin1String( |
623 | | _PyUnicodeWriter *writer, |
624 | | const char *str, /* latin1-encoded byte string */ |
625 | | Py_ssize_t len); /* length in bytes */ |
626 | | |
627 | | /* Get the value of the writer as a Unicode string. Clear the |
628 | | buffer of the writer. Raise an exception and return NULL |
629 | | on error. */ |
630 | | _Py_DEPRECATED_EXTERNALLY(3.14) PyAPI_FUNC(PyObject *) _PyUnicodeWriter_Finish( |
631 | | _PyUnicodeWriter *writer); |
632 | | |
633 | | /* Deallocate memory of a writer (clear its internal buffer). */ |
634 | | _Py_DEPRECATED_EXTERNALLY(3.14) PyAPI_FUNC(void) _PyUnicodeWriter_Dealloc( |
635 | | _PyUnicodeWriter *writer); |
636 | | |
637 | | |
638 | | /* --- Manage the default encoding ---------------------------------------- */ |
639 | | |
640 | | /* Returns a pointer to the default encoding (UTF-8) of the |
641 | | Unicode object unicode. |
642 | | |
643 | | Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation |
644 | | in the unicodeobject. |
645 | | |
646 | | _PyUnicode_AsString() is a deprecated inline wrapper around PyUnicode_AsUTF8() |
647 | | that supports the previous internal function with the same behaviour. |
648 | | |
649 | | Use of this API is DEPRECATED since no size information can be |
650 | | extracted from the returned data. |
651 | | */ |
652 | | |
653 | | PyAPI_FUNC(const char *) PyUnicode_AsUTF8(PyObject *unicode); |
654 | | |
655 | | // Deprecated alias kept for backward compatibility |
656 | | Py_DEPRECATED(3.14) static inline const char* |
657 | | _PyUnicode_AsString(PyObject *unicode) |
658 | 0 | { |
659 | 0 | return PyUnicode_AsUTF8(unicode); |
660 | 0 | } |
661 | | |
662 | | |
663 | | /* === Characters Type APIs =============================================== */ |
664 | | |
665 | | /* These should not be used directly. Use the Py_UNICODE_IS* and |
666 | | Py_UNICODE_TO* macros instead. |
667 | | |
668 | | These APIs are implemented in Objects/unicodectype.c. |
669 | | |
670 | | */ |
671 | | |
672 | | PyAPI_FUNC(int) _PyUnicode_IsLowercase( |
673 | | Py_UCS4 ch /* Unicode character */ |
674 | | ); |
675 | | |
676 | | PyAPI_FUNC(int) _PyUnicode_IsUppercase( |
677 | | Py_UCS4 ch /* Unicode character */ |
678 | | ); |
679 | | |
680 | | PyAPI_FUNC(int) _PyUnicode_IsTitlecase( |
681 | | Py_UCS4 ch /* Unicode character */ |
682 | | ); |
683 | | |
684 | | PyAPI_FUNC(int) _PyUnicode_IsWhitespace( |
685 | | const Py_UCS4 ch /* Unicode character */ |
686 | | ); |
687 | | |
688 | | PyAPI_FUNC(int) _PyUnicode_IsLinebreak( |
689 | | const Py_UCS4 ch /* Unicode character */ |
690 | | ); |
691 | | |
692 | | PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase( |
693 | | Py_UCS4 ch /* Unicode character */ |
694 | | ); |
695 | | |
696 | | PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase( |
697 | | Py_UCS4 ch /* Unicode character */ |
698 | | ); |
699 | | |
700 | | PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase( |
701 | | Py_UCS4 ch /* Unicode character */ |
702 | | ); |
703 | | |
704 | | PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit( |
705 | | Py_UCS4 ch /* Unicode character */ |
706 | | ); |
707 | | |
708 | | PyAPI_FUNC(int) _PyUnicode_ToDigit( |
709 | | Py_UCS4 ch /* Unicode character */ |
710 | | ); |
711 | | |
712 | | PyAPI_FUNC(double) _PyUnicode_ToNumeric( |
713 | | Py_UCS4 ch /* Unicode character */ |
714 | | ); |
715 | | |
716 | | PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit( |
717 | | Py_UCS4 ch /* Unicode character */ |
718 | | ); |
719 | | |
720 | | PyAPI_FUNC(int) _PyUnicode_IsDigit( |
721 | | Py_UCS4 ch /* Unicode character */ |
722 | | ); |
723 | | |
724 | | PyAPI_FUNC(int) _PyUnicode_IsNumeric( |
725 | | Py_UCS4 ch /* Unicode character */ |
726 | | ); |
727 | | |
728 | | PyAPI_FUNC(int) _PyUnicode_IsPrintable( |
729 | | Py_UCS4 ch /* Unicode character */ |
730 | | ); |
731 | | |
732 | | PyAPI_FUNC(int) _PyUnicode_IsAlpha( |
733 | | Py_UCS4 ch /* Unicode character */ |
734 | | ); |
735 | | |
736 | | // Helper array used by Py_UNICODE_ISSPACE(). |
737 | | PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[]; |
738 | | |
739 | | // Since splitting on whitespace is an important use case, and |
740 | | // whitespace in most situations is solely ASCII whitespace, we |
741 | | // optimize for the common case by using a quick look-up table |
742 | | // _Py_ascii_whitespace (declared above) with an inlined check. |
743 | 0 | static inline int Py_UNICODE_ISSPACE(Py_UCS4 ch) { |
744 | 0 | if (ch < 128) { |
745 | 0 | return _Py_ascii_whitespace[ch]; |
746 | 0 | } |
747 | 0 | return _PyUnicode_IsWhitespace(ch); |
748 | 0 | } |
749 | | |
750 | | #define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch) |
751 | | #define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch) |
752 | | #define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch) |
753 | | #define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch) |
754 | | |
755 | | #define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch) |
756 | | #define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch) |
757 | | #define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch) |
758 | | |
759 | | #define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch) |
760 | | #define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch) |
761 | | #define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch) |
762 | | #define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch) |
763 | | |
764 | | #define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch) |
765 | | #define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch) |
766 | | #define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch) |
767 | | |
768 | | #define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch) |
769 | | |
770 | 0 | static inline int Py_UNICODE_ISALNUM(Py_UCS4 ch) { |
771 | 0 | return (Py_UNICODE_ISALPHA(ch) |
772 | 0 | || Py_UNICODE_ISDECIMAL(ch) |
773 | 0 | || Py_UNICODE_ISDIGIT(ch) |
774 | 0 | || Py_UNICODE_ISNUMERIC(ch)); |
775 | 0 | } |
776 | | |
777 | | |
778 | | /* === Misc functions ===================================================== */ |
779 | | |
780 | | // Return an interned Unicode object for an Identifier; may fail if there is no |
781 | | // memory. |
782 | | PyAPI_FUNC(PyObject*) _PyUnicode_FromId(_Py_Identifier*); |