/src/cpython/Objects/unicode_writer.c
Line | Count | Source |
1 | | /* |
2 | | |
3 | | Unicode implementation based on original code by Fredrik Lundh, |
4 | | modified by Marc-Andre Lemburg <mal@lemburg.com>. |
5 | | |
6 | | Major speed upgrades to the method implementations at the Reykjavik |
7 | | NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. |
8 | | |
9 | | Copyright (c) Corporation for National Research Initiatives. |
10 | | |
11 | | -------------------------------------------------------------------- |
12 | | The original string type implementation is: |
13 | | |
14 | | Copyright (c) 1999 by Secret Labs AB |
15 | | Copyright (c) 1999 by Fredrik Lundh |
16 | | |
17 | | By obtaining, using, and/or copying this software and/or its |
18 | | associated documentation, you agree that you have read, understood, |
19 | | and will comply with the following terms and conditions: |
20 | | |
21 | | Permission to use, copy, modify, and distribute this software and its |
22 | | associated documentation for any purpose and without fee is hereby |
23 | | granted, provided that the above copyright notice appears in all |
24 | | copies, and that both that copyright notice and this permission notice |
25 | | appear in supporting documentation, and that the name of Secret Labs |
26 | | AB or the author not be used in advertising or publicity pertaining to |
27 | | distribution of the software without specific, written prior |
28 | | permission. |
29 | | |
30 | | SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO |
31 | | THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND |
32 | | FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR |
33 | | ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
34 | | WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN |
35 | | ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT |
36 | | OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
37 | | -------------------------------------------------------------------- |
38 | | |
39 | | */ |
40 | | |
41 | | #include "Python.h" |
42 | | #include "pycore_freelist.h" // _Py_FREELIST_FREE() |
43 | | #include "pycore_long.h" // _PyLong_FormatWriter() |
44 | | #include "pycore_unicodeobject.h" // _PyUnicode_Result() |
45 | | |
46 | | |
47 | | #ifdef MS_WINDOWS |
48 | | /* On Windows, overallocating by 50% is the best choice */ |
49 | | # define OVERALLOCATE_FACTOR 2 |
50 | | #else |
51 | | /* On Linux, overallocating by 25% is the best choice */ |
52 | 82.3M | # define OVERALLOCATE_FACTOR 4 |
53 | | #endif |
54 | | |
55 | | |
56 | | /* Compilation of templated routines */ |
57 | | |
58 | | #define STRINGLIB_GET_EMPTY() _PyUnicode_GetEmpty() |
59 | | |
60 | | #include "stringlib/ucs1lib.h" |
61 | | #include "stringlib/find_max_char.h" |
62 | | #include "stringlib/undef.h" |
63 | | |
64 | | |
65 | | /* Copy an ASCII or latin1 char* string into a Python Unicode string. |
66 | | |
67 | | WARNING: The function doesn't copy the terminating null character and |
68 | | doesn't check the maximum character (may write a latin1 character in an |
69 | | ASCII string). */ |
70 | | static void |
71 | | unicode_write_cstr(PyObject *unicode, Py_ssize_t index, |
72 | | const char *str, Py_ssize_t len) |
73 | 0 | { |
74 | 0 | int kind = PyUnicode_KIND(unicode); |
75 | 0 | const void *data = PyUnicode_DATA(unicode); |
76 | 0 | const char *end = str + len; |
77 | |
|
78 | 0 | assert(index + len <= PyUnicode_GET_LENGTH(unicode)); |
79 | 0 | switch (kind) { |
80 | 0 | case PyUnicode_1BYTE_KIND: { |
81 | | #ifdef Py_DEBUG |
82 | | if (PyUnicode_IS_ASCII(unicode)) { |
83 | | Py_UCS4 maxchar = ucs1lib_find_max_char( |
84 | | (const Py_UCS1*)str, |
85 | | (const Py_UCS1*)str + len); |
86 | | assert(maxchar < 128); |
87 | | } |
88 | | #endif |
89 | 0 | memcpy((char *) data + index, str, len); |
90 | 0 | break; |
91 | 0 | } |
92 | 0 | case PyUnicode_2BYTE_KIND: { |
93 | 0 | Py_UCS2 *start = (Py_UCS2 *)data + index; |
94 | 0 | Py_UCS2 *ucs2 = start; |
95 | |
|
96 | 0 | for (; str < end; ++ucs2, ++str) |
97 | 0 | *ucs2 = (Py_UCS2)*str; |
98 | |
|
99 | 0 | assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode)); |
100 | 0 | break; |
101 | 0 | } |
102 | 0 | case PyUnicode_4BYTE_KIND: { |
103 | 0 | Py_UCS4 *start = (Py_UCS4 *)data + index; |
104 | 0 | Py_UCS4 *ucs4 = start; |
105 | |
|
106 | 0 | for (; str < end; ++ucs4, ++str) |
107 | 0 | *ucs4 = (Py_UCS4)*str; |
108 | |
|
109 | 0 | assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode)); |
110 | 0 | break; |
111 | 0 | } |
112 | 0 | default: |
113 | 0 | Py_UNREACHABLE(); |
114 | 0 | } |
115 | 0 | } |
116 | | |
117 | | |
118 | | static inline void |
119 | | _PyUnicodeWriter_Update(_PyUnicodeWriter *writer) |
120 | 64.8M | { |
121 | 64.8M | writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer); |
122 | 64.8M | writer->data = PyUnicode_DATA(writer->buffer); |
123 | | |
124 | 64.8M | if (!writer->readonly) { |
125 | 64.7M | writer->kind = PyUnicode_KIND(writer->buffer); |
126 | 64.7M | writer->size = PyUnicode_GET_LENGTH(writer->buffer); |
127 | 64.7M | } |
128 | 86.2k | else { |
129 | | /* use a value smaller than PyUnicode_1BYTE_KIND() so |
130 | | _PyUnicodeWriter_PrepareKind() will copy the buffer. */ |
131 | 86.2k | writer->kind = 0; |
132 | 86.2k | assert(writer->kind <= PyUnicode_1BYTE_KIND); |
133 | | |
134 | | /* Copy-on-write mode: set buffer size to 0 so |
135 | | * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on |
136 | | * next write. */ |
137 | 86.2k | writer->size = 0; |
138 | 86.2k | } |
139 | 64.8M | } |
140 | | |
141 | | |
142 | | void |
143 | | _PyUnicodeWriter_Init(_PyUnicodeWriter *writer) |
144 | 39.5M | { |
145 | 39.5M | memset(writer, 0, sizeof(*writer)); |
146 | | |
147 | | /* ASCII is the bare minimum */ |
148 | 39.5M | writer->min_char = 127; |
149 | | |
150 | | /* use a kind value smaller than PyUnicode_1BYTE_KIND so |
151 | | _PyUnicodeWriter_PrepareKind() will copy the buffer. */ |
152 | 39.5M | assert(writer->kind == 0); |
153 | 39.5M | assert(writer->kind < PyUnicode_1BYTE_KIND); |
154 | 39.5M | } |
155 | | |
156 | | |
157 | | PyUnicodeWriter* |
158 | | PyUnicodeWriter_Create(Py_ssize_t length) |
159 | 3.99M | { |
160 | 3.99M | if (length < 0) { |
161 | 0 | PyErr_SetString(PyExc_ValueError, |
162 | 0 | "length must be positive"); |
163 | 0 | return NULL; |
164 | 0 | } |
165 | | |
166 | 3.99M | const size_t size = sizeof(_PyUnicodeWriter); |
167 | 3.99M | PyUnicodeWriter *pub_writer; |
168 | 3.99M | pub_writer = _Py_FREELIST_POP_MEM(unicode_writers); |
169 | 3.99M | if (pub_writer == NULL) { |
170 | 2.26M | pub_writer = (PyUnicodeWriter *)PyMem_Malloc(size); |
171 | 2.26M | if (pub_writer == NULL) { |
172 | 0 | return (PyUnicodeWriter *)PyErr_NoMemory(); |
173 | 0 | } |
174 | 2.26M | } |
175 | 3.99M | _PyUnicodeWriter *writer = (_PyUnicodeWriter *)pub_writer; |
176 | | |
177 | 3.99M | _PyUnicodeWriter_Init(writer); |
178 | 3.99M | if (_PyUnicodeWriter_Prepare(writer, length, 127) < 0) { |
179 | 0 | PyUnicodeWriter_Discard(pub_writer); |
180 | 0 | return NULL; |
181 | 0 | } |
182 | 3.99M | writer->overallocate = 1; |
183 | | |
184 | 3.99M | return pub_writer; |
185 | 3.99M | } |
186 | | |
187 | | |
188 | | void PyUnicodeWriter_Discard(PyUnicodeWriter *writer) |
189 | 77.1k | { |
190 | 77.1k | if (writer == NULL) { |
191 | 76.4k | return; |
192 | 76.4k | } |
193 | 660 | _PyUnicodeWriter_Dealloc((_PyUnicodeWriter*)writer); |
194 | 660 | _Py_FREELIST_FREE(unicode_writers, writer, PyMem_Free); |
195 | 660 | } |
196 | | |
197 | | |
198 | | // Initialize _PyUnicodeWriter with initial buffer |
199 | | void |
200 | | _PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer) |
201 | 8.95M | { |
202 | 8.95M | memset(writer, 0, sizeof(*writer)); |
203 | 8.95M | writer->buffer = buffer; |
204 | 8.95M | _PyUnicodeWriter_Update(writer); |
205 | 8.95M | writer->min_length = writer->size; |
206 | 8.95M | } |
207 | | |
208 | | |
209 | | int |
210 | | _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer, |
211 | | Py_ssize_t length, Py_UCS4 maxchar) |
212 | 55.7M | { |
213 | 55.7M | Py_ssize_t newlen; |
214 | 55.7M | PyObject *newbuffer; |
215 | | |
216 | 55.7M | assert(length >= 0); |
217 | 55.7M | assert(maxchar <= _Py_MAX_UNICODE); |
218 | | |
219 | | /* ensure that the _PyUnicodeWriter_Prepare macro was used */ |
220 | 55.7M | assert((maxchar > writer->maxchar && length >= 0) |
221 | 55.7M | || length > 0); |
222 | | |
223 | 55.7M | if (length > PY_SSIZE_T_MAX - writer->pos) { |
224 | 0 | PyErr_NoMemory(); |
225 | 0 | return -1; |
226 | 0 | } |
227 | 55.7M | newlen = writer->pos + length; |
228 | | |
229 | 55.7M | maxchar = Py_MAX(maxchar, writer->min_char); |
230 | | |
231 | 55.7M | if (writer->buffer == NULL) { |
232 | 38.6M | assert(!writer->readonly); |
233 | 38.6M | if (writer->overallocate |
234 | 33.0M | && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) { |
235 | | /* overallocate to limit the number of realloc() */ |
236 | 33.0M | newlen += newlen / OVERALLOCATE_FACTOR; |
237 | 33.0M | } |
238 | 38.6M | if (newlen < writer->min_length) |
239 | 33.9M | newlen = writer->min_length; |
240 | | |
241 | 38.6M | writer->buffer = PyUnicode_New(newlen, maxchar); |
242 | 38.6M | if (writer->buffer == NULL) |
243 | 0 | return -1; |
244 | 38.6M | } |
245 | 17.0M | else if (newlen > writer->size) { |
246 | 8.46M | if (writer->overallocate |
247 | 8.09M | && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) { |
248 | | /* overallocate to limit the number of realloc() */ |
249 | 8.09M | newlen += newlen / OVERALLOCATE_FACTOR; |
250 | 8.09M | } |
251 | 8.46M | if (newlen < writer->min_length) |
252 | 1.29k | newlen = writer->min_length; |
253 | | |
254 | 8.46M | if (maxchar > writer->maxchar || writer->readonly) { |
255 | | /* resize + widen */ |
256 | 2.84M | maxchar = Py_MAX(maxchar, writer->maxchar); |
257 | 2.84M | newbuffer = PyUnicode_New(newlen, maxchar); |
258 | 2.84M | if (newbuffer == NULL) |
259 | 0 | return -1; |
260 | 2.84M | _PyUnicode_FastCopyCharacters(newbuffer, 0, |
261 | 2.84M | writer->buffer, 0, writer->pos); |
262 | 2.84M | Py_DECREF(writer->buffer); |
263 | 2.84M | writer->readonly = 0; |
264 | 2.84M | } |
265 | 5.61M | else { |
266 | 5.61M | newbuffer = _PyUnicode_ResizeCompact(writer->buffer, newlen); |
267 | 5.61M | if (newbuffer == NULL) |
268 | 0 | return -1; |
269 | 5.61M | } |
270 | 8.46M | writer->buffer = newbuffer; |
271 | 8.46M | } |
272 | 8.62M | else if (maxchar > writer->maxchar) { |
273 | 8.62M | assert(!writer->readonly); |
274 | 8.62M | newbuffer = PyUnicode_New(writer->size, maxchar); |
275 | 8.62M | if (newbuffer == NULL) |
276 | 0 | return -1; |
277 | 8.62M | _PyUnicode_FastCopyCharacters(newbuffer, 0, |
278 | 8.62M | writer->buffer, 0, writer->pos); |
279 | 8.62M | Py_SETREF(writer->buffer, newbuffer); |
280 | 8.62M | } |
281 | 55.7M | _PyUnicodeWriter_Update(writer); |
282 | 55.7M | return 0; |
283 | | |
284 | 55.7M | #undef OVERALLOCATE_FACTOR |
285 | 55.7M | } |
286 | | |
287 | | int |
288 | | _PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer, |
289 | | int kind) |
290 | 198k | { |
291 | 198k | Py_UCS4 maxchar; |
292 | | |
293 | | /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */ |
294 | 198k | assert(writer->kind < kind); |
295 | | |
296 | 198k | switch (kind) |
297 | 198k | { |
298 | 0 | case PyUnicode_1BYTE_KIND: maxchar = 0xff; break; |
299 | 198k | case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break; |
300 | 0 | case PyUnicode_4BYTE_KIND: maxchar = _Py_MAX_UNICODE; break; |
301 | 0 | default: |
302 | 0 | Py_UNREACHABLE(); |
303 | 198k | } |
304 | | |
305 | 198k | return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar); |
306 | 198k | } |
307 | | |
308 | | |
309 | | int |
310 | | _PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch) |
311 | 83.4M | { |
312 | 83.4M | return _PyUnicodeWriter_WriteCharInline(writer, ch); |
313 | 83.4M | } |
314 | | |
315 | | |
316 | | int |
317 | | PyUnicodeWriter_WriteChar(PyUnicodeWriter *writer, Py_UCS4 ch) |
318 | 61.5M | { |
319 | 61.5M | if (ch > _Py_MAX_UNICODE) { |
320 | 0 | PyErr_SetString(PyExc_ValueError, |
321 | 0 | "character must be in range(0x110000)"); |
322 | 0 | return -1; |
323 | 0 | } |
324 | | |
325 | 61.5M | return _PyUnicodeWriter_WriteChar((_PyUnicodeWriter*)writer, ch); |
326 | 61.5M | } |
327 | | |
328 | | |
329 | | int |
330 | | _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str) |
331 | 59.6M | { |
332 | 59.6M | assert(PyUnicode_Check(str)); |
333 | | |
334 | 59.6M | Py_UCS4 maxchar; |
335 | 59.6M | Py_ssize_t len; |
336 | | |
337 | 59.6M | len = PyUnicode_GET_LENGTH(str); |
338 | 59.6M | if (len == 0) |
339 | 4.14M | return 0; |
340 | 55.4M | maxchar = PyUnicode_MAX_CHAR_VALUE(str); |
341 | 55.4M | if (maxchar > writer->maxchar || len > writer->size - writer->pos) { |
342 | 17.4M | if (writer->buffer == NULL && !writer->overallocate) { |
343 | 5.53k | assert(_PyUnicode_CheckConsistency(str, 1)); |
344 | 5.53k | writer->readonly = 1; |
345 | 5.53k | writer->buffer = Py_NewRef(str); |
346 | 5.53k | _PyUnicodeWriter_Update(writer); |
347 | 5.53k | writer->pos += len; |
348 | 5.53k | return 0; |
349 | 5.53k | } |
350 | 17.4M | if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1) |
351 | 0 | return -1; |
352 | 17.4M | } |
353 | 55.4M | _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos, |
354 | 55.4M | str, 0, len); |
355 | 55.4M | writer->pos += len; |
356 | 55.4M | return 0; |
357 | 55.4M | } |
358 | | |
359 | | |
360 | | int |
361 | | PyUnicodeWriter_WriteStr(PyUnicodeWriter *writer, PyObject *obj) |
362 | 2.88M | { |
363 | 2.88M | PyTypeObject *type = Py_TYPE(obj); |
364 | 2.88M | if (type == &PyUnicode_Type) { |
365 | 2.88M | return _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, obj); |
366 | 2.88M | } |
367 | | |
368 | 0 | if (type == &PyLong_Type) { |
369 | 0 | return _PyLong_FormatWriter((_PyUnicodeWriter*)writer, obj, 10, 0); |
370 | 0 | } |
371 | | |
372 | 0 | PyObject *str = PyObject_Str(obj); |
373 | 0 | if (str == NULL) { |
374 | 0 | return -1; |
375 | 0 | } |
376 | | |
377 | 0 | int res = _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, str); |
378 | 0 | Py_DECREF(str); |
379 | 0 | return res; |
380 | 0 | } |
381 | | |
382 | | |
383 | | int |
384 | | PyUnicodeWriter_WriteRepr(PyUnicodeWriter *writer, PyObject *obj) |
385 | 7.67M | { |
386 | 7.67M | if (obj == NULL) { |
387 | 0 | return _PyUnicodeWriter_WriteASCIIString((_PyUnicodeWriter*)writer, "<NULL>", 6); |
388 | 0 | } |
389 | | |
390 | 7.67M | if (Py_TYPE(obj) == &PyLong_Type) { |
391 | 258k | return _PyLong_FormatWriter((_PyUnicodeWriter*)writer, obj, 10, 0); |
392 | 258k | } |
393 | | |
394 | 7.41M | PyObject *repr = PyObject_Repr(obj); |
395 | 7.41M | if (repr == NULL) { |
396 | 0 | return -1; |
397 | 0 | } |
398 | | |
399 | 7.41M | int res = _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, repr); |
400 | 7.41M | Py_DECREF(repr); |
401 | 7.41M | return res; |
402 | 7.41M | } |
403 | | |
404 | | |
405 | | int |
406 | | _PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str, |
407 | | Py_ssize_t start, Py_ssize_t end) |
408 | 45.6M | { |
409 | 45.6M | assert(0 <= start); |
410 | 45.6M | assert(end <= PyUnicode_GET_LENGTH(str)); |
411 | 45.6M | assert(start <= end); |
412 | | |
413 | 45.6M | if (start == 0 && end == PyUnicode_GET_LENGTH(str)) |
414 | 90 | return _PyUnicodeWriter_WriteStr(writer, str); |
415 | | |
416 | 45.6M | Py_ssize_t len = end - start; |
417 | 45.6M | if (len == 0) { |
418 | 192 | return 0; |
419 | 192 | } |
420 | | |
421 | 45.6M | Py_UCS4 maxchar; |
422 | 45.6M | if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar) { |
423 | 13.3M | maxchar = _PyUnicode_FindMaxChar(str, start, end); |
424 | 13.3M | } |
425 | 32.2M | else { |
426 | 32.2M | maxchar = writer->maxchar; |
427 | 32.2M | } |
428 | 45.6M | if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0) { |
429 | 0 | return -1; |
430 | 0 | } |
431 | | |
432 | 45.6M | _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos, |
433 | 45.6M | str, start, len); |
434 | 45.6M | writer->pos += len; |
435 | 45.6M | return 0; |
436 | 45.6M | } |
437 | | |
438 | | |
439 | | int |
440 | | PyUnicodeWriter_WriteSubstring(PyUnicodeWriter *writer, PyObject *str, |
441 | | Py_ssize_t start, Py_ssize_t end) |
442 | 412k | { |
443 | 412k | if (!PyUnicode_Check(str)) { |
444 | 0 | PyErr_Format(PyExc_TypeError, "expect str, not %T", str); |
445 | 0 | return -1; |
446 | 0 | } |
447 | 412k | if (start < 0 || start > end) { |
448 | 0 | PyErr_Format(PyExc_ValueError, "invalid start argument"); |
449 | 0 | return -1; |
450 | 0 | } |
451 | 412k | if (end > PyUnicode_GET_LENGTH(str)) { |
452 | 0 | PyErr_Format(PyExc_ValueError, "invalid end argument"); |
453 | 0 | return -1; |
454 | 0 | } |
455 | | |
456 | 412k | return _PyUnicodeWriter_WriteSubstring((_PyUnicodeWriter*)writer, str, |
457 | 412k | start, end); |
458 | 412k | } |
459 | | |
460 | | |
461 | | int |
462 | | _PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer, |
463 | | const char *ascii, Py_ssize_t len) |
464 | 40.4M | { |
465 | 40.4M | if (len == -1) |
466 | 0 | len = strlen(ascii); |
467 | | |
468 | 40.4M | if (len == 0) { |
469 | 0 | return 0; |
470 | 0 | } |
471 | | |
472 | 40.4M | assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128); |
473 | | |
474 | 40.4M | if (writer->buffer == NULL && !writer->overallocate) { |
475 | 80.6k | PyObject *str; |
476 | | |
477 | 80.6k | str = _PyUnicode_FromASCII(ascii, len); |
478 | 80.6k | if (str == NULL) |
479 | 0 | return -1; |
480 | | |
481 | 80.6k | writer->readonly = 1; |
482 | 80.6k | writer->buffer = str; |
483 | 80.6k | _PyUnicodeWriter_Update(writer); |
484 | 80.6k | writer->pos += len; |
485 | 80.6k | return 0; |
486 | 80.6k | } |
487 | | |
488 | 40.3M | if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1) |
489 | 0 | return -1; |
490 | | |
491 | 40.3M | switch (writer->kind) |
492 | 40.3M | { |
493 | 40.3M | case PyUnicode_1BYTE_KIND: |
494 | 40.3M | { |
495 | 40.3M | const Py_UCS1 *str = (const Py_UCS1 *)ascii; |
496 | 40.3M | Py_UCS1 *data = writer->data; |
497 | | |
498 | 40.3M | memcpy(data + writer->pos, str, len); |
499 | 40.3M | break; |
500 | 0 | } |
501 | 12.0k | case PyUnicode_2BYTE_KIND: |
502 | 12.0k | { |
503 | 12.0k | _PyUnicode_CONVERT_BYTES( |
504 | 12.0k | Py_UCS1, Py_UCS2, |
505 | 12.0k | ascii, ascii + len, |
506 | 12.0k | (Py_UCS2 *)writer->data + writer->pos); |
507 | 12.0k | break; |
508 | 0 | } |
509 | 3.43k | case PyUnicode_4BYTE_KIND: |
510 | 3.43k | { |
511 | 3.43k | _PyUnicode_CONVERT_BYTES( |
512 | 3.43k | Py_UCS1, Py_UCS4, |
513 | 3.43k | ascii, ascii + len, |
514 | 3.43k | (Py_UCS4 *)writer->data + writer->pos); |
515 | 3.43k | break; |
516 | 0 | } |
517 | 0 | default: |
518 | 0 | Py_UNREACHABLE(); |
519 | 40.3M | } |
520 | | |
521 | 40.3M | writer->pos += len; |
522 | 40.3M | return 0; |
523 | 40.3M | } |
524 | | |
525 | | |
526 | | int |
527 | | PyUnicodeWriter_WriteASCII(PyUnicodeWriter *writer, |
528 | | const char *str, |
529 | | Py_ssize_t size) |
530 | 490k | { |
531 | 490k | assert(writer != NULL); |
532 | 490k | _Py_AssertHoldsTstate(); |
533 | | |
534 | 490k | _PyUnicodeWriter *priv_writer = (_PyUnicodeWriter*)writer; |
535 | 490k | return _PyUnicodeWriter_WriteASCIIString(priv_writer, str, size); |
536 | 490k | } |
537 | | |
538 | | |
539 | | int |
540 | | PyUnicodeWriter_WriteUTF8(PyUnicodeWriter *writer, |
541 | | const char *str, |
542 | | Py_ssize_t size) |
543 | 0 | { |
544 | 0 | if (size < 0) { |
545 | 0 | size = strlen(str); |
546 | 0 | } |
547 | |
|
548 | 0 | _PyUnicodeWriter *_writer = (_PyUnicodeWriter*)writer; |
549 | 0 | Py_ssize_t old_pos = _writer->pos; |
550 | 0 | int res = _PyUnicode_DecodeUTF8Writer(_writer, str, size, |
551 | 0 | _Py_ERROR_STRICT, NULL, NULL); |
552 | 0 | if (res < 0) { |
553 | 0 | _writer->pos = old_pos; |
554 | 0 | } |
555 | 0 | return res; |
556 | 0 | } |
557 | | |
558 | | |
559 | | int |
560 | | PyUnicodeWriter_DecodeUTF8Stateful(PyUnicodeWriter *writer, |
561 | | const char *string, |
562 | | Py_ssize_t length, |
563 | | const char *errors, |
564 | | Py_ssize_t *consumed) |
565 | 0 | { |
566 | 0 | if (length < 0) { |
567 | 0 | length = strlen(string); |
568 | 0 | } |
569 | |
|
570 | 0 | _PyUnicodeWriter *_writer = (_PyUnicodeWriter*)writer; |
571 | 0 | Py_ssize_t old_pos = _writer->pos; |
572 | 0 | int res = _PyUnicode_DecodeUTF8Writer(_writer, string, length, |
573 | 0 | _Py_ERROR_UNKNOWN, errors, |
574 | 0 | consumed); |
575 | 0 | if (res < 0) { |
576 | 0 | _writer->pos = old_pos; |
577 | 0 | if (consumed) { |
578 | 0 | *consumed = 0; |
579 | 0 | } |
580 | 0 | } |
581 | 0 | return res; |
582 | 0 | } |
583 | | |
584 | | |
585 | | int |
586 | | _PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer, |
587 | | const char *str, Py_ssize_t len) |
588 | 0 | { |
589 | 0 | Py_UCS4 maxchar; |
590 | |
|
591 | 0 | maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len); |
592 | 0 | if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1) |
593 | 0 | return -1; |
594 | 0 | unicode_write_cstr(writer->buffer, writer->pos, str, len); |
595 | 0 | writer->pos += len; |
596 | 0 | return 0; |
597 | 0 | } |
598 | | |
599 | | |
600 | | PyObject * |
601 | | _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer) |
602 | 45.6M | { |
603 | 45.6M | PyObject *str; |
604 | | |
605 | 45.6M | if (writer->pos == 0) { |
606 | 829 | Py_CLEAR(writer->buffer); |
607 | 829 | return _PyUnicode_GetEmpty(); |
608 | 829 | } |
609 | | |
610 | 45.6M | str = writer->buffer; |
611 | 45.6M | writer->buffer = NULL; |
612 | | |
613 | 45.6M | if (writer->readonly) { |
614 | 84.9k | assert(PyUnicode_GET_LENGTH(str) == writer->pos); |
615 | 84.9k | return str; |
616 | 84.9k | } |
617 | | |
618 | 45.5M | if (PyUnicode_GET_LENGTH(str) != writer->pos) { |
619 | 43.8M | PyObject *str2; |
620 | 43.8M | str2 = _PyUnicode_ResizeCompact(str, writer->pos); |
621 | 43.8M | if (str2 == NULL) { |
622 | 0 | Py_DECREF(str); |
623 | 0 | return NULL; |
624 | 0 | } |
625 | 43.8M | str = str2; |
626 | 43.8M | } |
627 | | |
628 | 45.5M | assert(_PyUnicode_CheckConsistency(str, 1)); |
629 | 45.5M | return _PyUnicode_Result(str); |
630 | 45.5M | } |
631 | | |
632 | | |
633 | | PyObject* |
634 | | PyUnicodeWriter_Finish(PyUnicodeWriter *writer) |
635 | 3.99M | { |
636 | 3.99M | PyObject *str = _PyUnicodeWriter_Finish((_PyUnicodeWriter*)writer); |
637 | 3.99M | assert(((_PyUnicodeWriter*)writer)->buffer == NULL); |
638 | 3.99M | _Py_FREELIST_FREE(unicode_writers, writer, PyMem_Free); |
639 | 3.99M | return str; |
640 | 3.99M | } |
641 | | |
642 | | |
643 | | void |
644 | | _PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer) |
645 | 2.84M | { |
646 | | Py_CLEAR(writer->buffer); |
647 | 2.84M | } |