/src/cpython/Python/codecs.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* ------------------------------------------------------------------------ |
2 | | |
3 | | Python Codec Registry and support functions |
4 | | |
5 | | Written by Marc-Andre Lemburg (mal@lemburg.com). |
6 | | |
7 | | Copyright (c) Corporation for National Research Initiatives. |
8 | | |
9 | | ------------------------------------------------------------------------ */ |
10 | | |
11 | | #include "Python.h" |
12 | | #include "pycore_call.h" // _PyObject_CallNoArgs() |
13 | | #include "pycore_interp.h" // PyInterpreterState.codec_search_path |
14 | | #include "pycore_pyerrors.h" // _PyErr_FormatNote() |
15 | | #include "pycore_pystate.h" // _PyInterpreterState_GET() |
16 | | #include "pycore_runtime.h" // _Py_ID() |
17 | | #include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI |
18 | | #include "pycore_unicodeobject.h" // _PyUnicode_InternMortal() |
19 | | #include "pycore_pyatomic_ft_wrappers.h" |
20 | | |
/* Names of the error handlers that ship with the interpreter.
   _PyCodec_UnregisterError() compares against this table and refuses to
   un-register any name listed here. */
static const char *codecs_builtin_error_handlers[] = {
    "strict", "ignore", "replace",
    "xmlcharrefreplace", "backslashreplace", "namereplace",
    "surrogatepass", "surrogateescape",
};

/* Lower-case hexadecimal digits; indexed with a 4-bit value by
   codec_handler_write_unicode_hex(). */
const char *Py_hexdigits = "0123456789abcdef";
28 | | |
29 | | /* --- Codec Registry ----------------------------------------------------- */ |
30 | | |
31 | | int PyCodec_Register(PyObject *search_function) |
32 | 16 | { |
33 | 16 | PyInterpreterState *interp = _PyInterpreterState_GET(); |
34 | 16 | assert(interp->codecs.initialized); |
35 | 16 | if (search_function == NULL) { |
36 | 0 | PyErr_BadArgument(); |
37 | 0 | goto onError; |
38 | 0 | } |
39 | 16 | if (!PyCallable_Check(search_function)) { |
40 | 0 | PyErr_SetString(PyExc_TypeError, "argument must be callable"); |
41 | 0 | goto onError; |
42 | 0 | } |
43 | 16 | FT_MUTEX_LOCK(&interp->codecs.search_path_mutex); |
44 | 16 | int ret = PyList_Append(interp->codecs.search_path, search_function); |
45 | 16 | FT_MUTEX_UNLOCK(&interp->codecs.search_path_mutex); |
46 | | |
47 | 16 | return ret; |
48 | | |
49 | 0 | onError: |
50 | 0 | return -1; |
51 | 16 | } |
52 | | |
53 | | int |
54 | | PyCodec_Unregister(PyObject *search_function) |
55 | 0 | { |
56 | 0 | PyInterpreterState *interp = _PyInterpreterState_GET(); |
57 | 0 | if (interp->codecs.initialized != 1) { |
58 | | /* Do nothing if codecs state was cleared (only possible during |
59 | | interpreter shutdown). */ |
60 | 0 | return 0; |
61 | 0 | } |
62 | | |
63 | 0 | PyObject *codec_search_path = interp->codecs.search_path; |
64 | 0 | assert(PyList_CheckExact(codec_search_path)); |
65 | 0 | for (Py_ssize_t i = 0; i < PyList_GET_SIZE(codec_search_path); i++) { |
66 | 0 | FT_MUTEX_LOCK(&interp->codecs.search_path_mutex); |
67 | 0 | PyObject *item = PyList_GetItemRef(codec_search_path, i); |
68 | 0 | int ret = 1; |
69 | 0 | if (item == search_function) { |
70 | | // We hold a reference to the item, so its destructor can't run |
71 | | // while we hold search_path_mutex. |
72 | 0 | ret = PyList_SetSlice(codec_search_path, i, i+1, NULL); |
73 | 0 | } |
74 | 0 | FT_MUTEX_UNLOCK(&interp->codecs.search_path_mutex); |
75 | 0 | Py_DECREF(item); |
76 | 0 | if (ret != 1) { |
77 | 0 | assert(interp->codecs.search_cache != NULL); |
78 | 0 | assert(PyDict_CheckExact(interp->codecs.search_cache)); |
79 | 0 | PyDict_Clear(interp->codecs.search_cache); |
80 | 0 | return ret; |
81 | 0 | } |
82 | 0 | } |
83 | 0 | return 0; |
84 | 0 | } |
85 | | |
86 | | extern int _Py_normalize_encoding(const char *, char *, size_t); |
87 | | |
88 | | /* Convert a string to a normalized Python string(decoded from UTF-8): all characters are |
89 | | converted to lower case, spaces and hyphens are replaced with underscores. */ |
90 | | |
91 | | static |
92 | | PyObject *normalizestring(const char *string) |
93 | 1.26M | { |
94 | 1.26M | size_t len = strlen(string); |
95 | 1.26M | char *encoding; |
96 | 1.26M | PyObject *v; |
97 | | |
98 | 1.26M | if (len > PY_SSIZE_T_MAX) { |
99 | 0 | PyErr_SetString(PyExc_OverflowError, "string is too large"); |
100 | 0 | return NULL; |
101 | 0 | } |
102 | | |
103 | 1.26M | encoding = PyMem_Malloc(len + 1); |
104 | 1.26M | if (encoding == NULL) |
105 | 0 | return PyErr_NoMemory(); |
106 | | |
107 | 1.26M | if (!_Py_normalize_encoding(string, encoding, len + 1)) |
108 | 0 | { |
109 | 0 | PyErr_SetString(PyExc_RuntimeError, "_Py_normalize_encoding() failed"); |
110 | 0 | PyMem_Free(encoding); |
111 | 0 | return NULL; |
112 | 0 | } |
113 | | |
114 | 1.26M | v = PyUnicode_FromString(encoding); |
115 | 1.26M | PyMem_Free(encoding); |
116 | 1.26M | return v; |
117 | 1.26M | } |
118 | | |
119 | | /* Lookup the given encoding and return a tuple providing the codec |
120 | | facilities. |
121 | | |
122 | | The encoding string is looked up converted to all lower-case |
123 | | characters. This makes encodings looked up through this mechanism |
124 | | effectively case-insensitive. |
125 | | |
126 | | If no codec is found, a LookupError is set and NULL returned. |
127 | | |
128 | | As side effect, this tries to load the encodings package, if not |
129 | | yet done. This is part of the lazy load strategy for the encodings |
130 | | package. |
131 | | |
132 | | */ |
133 | | |
134 | | PyObject *_PyCodec_Lookup(const char *encoding) |
135 | 1.26M | { |
136 | 1.26M | if (encoding == NULL) { |
137 | 0 | PyErr_BadArgument(); |
138 | 0 | return NULL; |
139 | 0 | } |
140 | | |
141 | 1.26M | PyInterpreterState *interp = _PyInterpreterState_GET(); |
142 | 1.26M | assert(interp->codecs.initialized); |
143 | | |
144 | | /* Convert the encoding to a normalized Python string: all |
145 | | characters are converted to lower case, spaces and hyphens are |
146 | | replaced with underscores. */ |
147 | 1.26M | PyObject *v = normalizestring(encoding); |
148 | 1.26M | if (v == NULL) { |
149 | 0 | return NULL; |
150 | 0 | } |
151 | | |
152 | | /* Intern the string. We'll make it immortal later if lookup succeeds. */ |
153 | 1.26M | _PyUnicode_InternMortal(interp, &v); |
154 | | |
155 | | /* First, try to lookup the name in the registry dictionary */ |
156 | 1.26M | PyObject *result; |
157 | 1.26M | if (PyDict_GetItemRef(interp->codecs.search_cache, v, &result) < 0) { |
158 | 0 | goto onError; |
159 | 0 | } |
160 | 1.26M | if (result != NULL) { |
161 | 1.15M | Py_DECREF(v); |
162 | 1.15M | return result; |
163 | 1.15M | } |
164 | | |
165 | | /* Next, scan the search functions in order of registration */ |
166 | 107k | const Py_ssize_t len = PyList_Size(interp->codecs.search_path); |
167 | 107k | if (len < 0) |
168 | 0 | goto onError; |
169 | 107k | if (len == 0) { |
170 | 0 | PyErr_SetString(PyExc_LookupError, |
171 | 0 | "no codec search functions registered: " |
172 | 0 | "can't find encoding"); |
173 | 0 | goto onError; |
174 | 0 | } |
175 | | |
176 | 107k | Py_ssize_t i; |
177 | 213k | for (i = 0; i < len; i++) { |
178 | 107k | PyObject *func; |
179 | | |
180 | 107k | func = PyList_GetItemRef(interp->codecs.search_path, i); |
181 | 107k | if (func == NULL) |
182 | 0 | goto onError; |
183 | 107k | result = PyObject_CallOneArg(func, v); |
184 | 107k | Py_DECREF(func); |
185 | 107k | if (result == NULL) |
186 | 0 | goto onError; |
187 | 107k | if (result == Py_None) { |
188 | 106k | Py_CLEAR(result); |
189 | 106k | continue; |
190 | 106k | } |
191 | 461 | if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) { |
192 | 0 | PyErr_SetString(PyExc_TypeError, |
193 | 0 | "codec search functions must return 4-tuples"); |
194 | 0 | Py_DECREF(result); |
195 | 0 | goto onError; |
196 | 0 | } |
197 | 461 | break; |
198 | 461 | } |
199 | 107k | if (result == NULL) { |
200 | | /* XXX Perhaps we should cache misses too ? */ |
201 | 106k | PyErr_Format(PyExc_LookupError, |
202 | 106k | "unknown encoding: %s", encoding); |
203 | 106k | goto onError; |
204 | 106k | } |
205 | | |
206 | 461 | _PyUnicode_InternImmortal(interp, &v); |
207 | | |
208 | | /* Cache and return the result */ |
209 | 461 | if (PyDict_SetItem(interp->codecs.search_cache, v, result) < 0) { |
210 | 0 | Py_DECREF(result); |
211 | 0 | goto onError; |
212 | 0 | } |
213 | 461 | Py_DECREF(v); |
214 | 461 | return result; |
215 | | |
216 | 106k | onError: |
217 | 106k | Py_DECREF(v); |
218 | 106k | return NULL; |
219 | 461 | } |
220 | | |
221 | | /* Codec registry encoding check API. */ |
222 | | |
223 | | int PyCodec_KnownEncoding(const char *encoding) |
224 | 0 | { |
225 | 0 | PyObject *codecs; |
226 | |
|
227 | 0 | codecs = _PyCodec_Lookup(encoding); |
228 | 0 | if (!codecs) { |
229 | 0 | PyErr_Clear(); |
230 | 0 | return 0; |
231 | 0 | } |
232 | 0 | else { |
233 | 0 | Py_DECREF(codecs); |
234 | 0 | return 1; |
235 | 0 | } |
236 | 0 | } |
237 | | |
238 | | static |
239 | | PyObject *args_tuple(PyObject *object, |
240 | | const char *errors) |
241 | 1.15M | { |
242 | 1.15M | PyObject *args; |
243 | | |
244 | 1.15M | args = PyTuple_New(1 + (errors != NULL)); |
245 | 1.15M | if (args == NULL) |
246 | 0 | return NULL; |
247 | 1.15M | PyTuple_SET_ITEM(args, 0, Py_NewRef(object)); |
248 | 1.15M | if (errors) { |
249 | 182k | PyObject *v; |
250 | | |
251 | 182k | v = PyUnicode_FromString(errors); |
252 | 182k | if (v == NULL) { |
253 | 0 | Py_DECREF(args); |
254 | 0 | return NULL; |
255 | 0 | } |
256 | 182k | PyTuple_SET_ITEM(args, 1, v); |
257 | 182k | } |
258 | 1.15M | return args; |
259 | 1.15M | } |
260 | | |
261 | | /* Helper function to get a codec item */ |
262 | | |
263 | | static |
264 | | PyObject *codec_getitem(const char *encoding, int index) |
265 | 0 | { |
266 | 0 | PyObject *codecs; |
267 | 0 | PyObject *v; |
268 | |
|
269 | 0 | codecs = _PyCodec_Lookup(encoding); |
270 | 0 | if (codecs == NULL) |
271 | 0 | return NULL; |
272 | 0 | v = PyTuple_GET_ITEM(codecs, index); |
273 | 0 | Py_DECREF(codecs); |
274 | 0 | return Py_NewRef(v); |
275 | 0 | } |
276 | | |
277 | | /* Helper functions to create an incremental codec. */ |
278 | | static |
279 | | PyObject *codec_makeincrementalcodec(PyObject *codec_info, |
280 | | const char *errors, |
281 | | const char *attrname) |
282 | 48 | { |
283 | 48 | PyObject *ret, *inccodec; |
284 | | |
285 | 48 | inccodec = PyObject_GetAttrString(codec_info, attrname); |
286 | 48 | if (inccodec == NULL) |
287 | 0 | return NULL; |
288 | 48 | if (errors) |
289 | 48 | ret = PyObject_CallFunction(inccodec, "s", errors); |
290 | 0 | else |
291 | 0 | ret = _PyObject_CallNoArgs(inccodec); |
292 | 48 | Py_DECREF(inccodec); |
293 | 48 | return ret; |
294 | 48 | } |
295 | | |
296 | | static |
297 | | PyObject *codec_getincrementalcodec(const char *encoding, |
298 | | const char *errors, |
299 | | const char *attrname) |
300 | 0 | { |
301 | 0 | PyObject *codec_info, *ret; |
302 | |
|
303 | 0 | codec_info = _PyCodec_Lookup(encoding); |
304 | 0 | if (codec_info == NULL) |
305 | 0 | return NULL; |
306 | 0 | ret = codec_makeincrementalcodec(codec_info, errors, attrname); |
307 | 0 | Py_DECREF(codec_info); |
308 | 0 | return ret; |
309 | 0 | } |
310 | | |
311 | | /* Helper function to create a stream codec. */ |
312 | | |
313 | | static |
314 | | PyObject *codec_getstreamcodec(const char *encoding, |
315 | | PyObject *stream, |
316 | | const char *errors, |
317 | | const int index) |
318 | 0 | { |
319 | 0 | PyObject *codecs, *streamcodec, *codeccls; |
320 | |
|
321 | 0 | codecs = _PyCodec_Lookup(encoding); |
322 | 0 | if (codecs == NULL) |
323 | 0 | return NULL; |
324 | | |
325 | 0 | codeccls = PyTuple_GET_ITEM(codecs, index); |
326 | 0 | if (errors != NULL) |
327 | 0 | streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors); |
328 | 0 | else |
329 | 0 | streamcodec = PyObject_CallOneArg(codeccls, stream); |
330 | 0 | Py_DECREF(codecs); |
331 | 0 | return streamcodec; |
332 | 0 | } |
333 | | |
334 | | /* Helpers to work with the result of _PyCodec_Lookup |
335 | | |
336 | | */ |
337 | | PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info, |
338 | | const char *errors) |
339 | 16 | { |
340 | 16 | return codec_makeincrementalcodec(codec_info, errors, |
341 | 16 | "incrementaldecoder"); |
342 | 16 | } |
343 | | |
344 | | PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info, |
345 | | const char *errors) |
346 | 32 | { |
347 | 32 | return codec_makeincrementalcodec(codec_info, errors, |
348 | 32 | "incrementalencoder"); |
349 | 32 | } |
350 | | |
351 | | |
352 | | /* Convenience APIs to query the Codec registry. |
353 | | |
354 | | All APIs return a codec object with incremented refcount. |
355 | | |
356 | | */ |
357 | | |
358 | | PyObject *PyCodec_Encoder(const char *encoding) |
359 | 0 | { |
360 | 0 | return codec_getitem(encoding, 0); |
361 | 0 | } |
362 | | |
363 | | PyObject *PyCodec_Decoder(const char *encoding) |
364 | 0 | { |
365 | 0 | return codec_getitem(encoding, 1); |
366 | 0 | } |
367 | | |
368 | | PyObject *PyCodec_IncrementalEncoder(const char *encoding, |
369 | | const char *errors) |
370 | 0 | { |
371 | 0 | return codec_getincrementalcodec(encoding, errors, "incrementalencoder"); |
372 | 0 | } |
373 | | |
374 | | PyObject *PyCodec_IncrementalDecoder(const char *encoding, |
375 | | const char *errors) |
376 | 0 | { |
377 | 0 | return codec_getincrementalcodec(encoding, errors, "incrementaldecoder"); |
378 | 0 | } |
379 | | |
380 | | PyObject *PyCodec_StreamReader(const char *encoding, |
381 | | PyObject *stream, |
382 | | const char *errors) |
383 | 0 | { |
384 | 0 | return codec_getstreamcodec(encoding, stream, errors, 2); |
385 | 0 | } |
386 | | |
387 | | PyObject *PyCodec_StreamWriter(const char *encoding, |
388 | | PyObject *stream, |
389 | | const char *errors) |
390 | 0 | { |
391 | 0 | return codec_getstreamcodec(encoding, stream, errors, 3); |
392 | 0 | } |
393 | | |
394 | | /* Encode an object (e.g. a Unicode object) using the given encoding |
395 | | and return the resulting encoded object (usually a Python string). |
396 | | |
397 | | errors is passed to the encoder factory as argument if non-NULL. */ |
398 | | |
399 | | static PyObject * |
400 | | _PyCodec_EncodeInternal(PyObject *object, |
401 | | PyObject *encoder, |
402 | | const char *encoding, |
403 | | const char *errors) |
404 | 892k | { |
405 | 892k | PyObject *args = NULL, *result = NULL; |
406 | 892k | PyObject *v = NULL; |
407 | | |
408 | 892k | args = args_tuple(object, errors); |
409 | 892k | if (args == NULL) |
410 | 0 | goto onError; |
411 | | |
412 | 892k | result = PyObject_Call(encoder, args, NULL); |
413 | 892k | if (result == NULL) { |
414 | 0 | _PyErr_FormatNote("%s with '%s' codec failed", "encoding", encoding); |
415 | 0 | goto onError; |
416 | 0 | } |
417 | | |
418 | 892k | if (!PyTuple_Check(result) || |
419 | 892k | PyTuple_GET_SIZE(result) != 2) { |
420 | 0 | PyErr_SetString(PyExc_TypeError, |
421 | 0 | "encoder must return a tuple (object, integer)"); |
422 | 0 | goto onError; |
423 | 0 | } |
424 | 892k | v = Py_NewRef(PyTuple_GET_ITEM(result,0)); |
425 | | /* We don't check or use the second (integer) entry. */ |
426 | | |
427 | 892k | Py_DECREF(args); |
428 | 892k | Py_DECREF(encoder); |
429 | 892k | Py_DECREF(result); |
430 | 892k | return v; |
431 | | |
432 | 0 | onError: |
433 | 0 | Py_XDECREF(result); |
434 | 0 | Py_XDECREF(args); |
435 | 0 | Py_XDECREF(encoder); |
436 | 0 | return NULL; |
437 | 892k | } |
438 | | |
439 | | /* Decode an object (usually a Python string) using the given encoding |
440 | | and return an equivalent object (e.g. a Unicode object). |
441 | | |
442 | | errors is passed to the decoder factory as argument if non-NULL. */ |
443 | | |
444 | | static PyObject * |
445 | | _PyCodec_DecodeInternal(PyObject *object, |
446 | | PyObject *decoder, |
447 | | const char *encoding, |
448 | | const char *errors) |
449 | 257k | { |
450 | 257k | PyObject *args = NULL, *result = NULL; |
451 | 257k | PyObject *v; |
452 | | |
453 | 257k | args = args_tuple(object, errors); |
454 | 257k | if (args == NULL) |
455 | 0 | goto onError; |
456 | | |
457 | 257k | result = PyObject_Call(decoder, args, NULL); |
458 | 257k | if (result == NULL) { |
459 | 58.0k | _PyErr_FormatNote("%s with '%s' codec failed", "decoding", encoding); |
460 | 58.0k | goto onError; |
461 | 58.0k | } |
462 | 199k | if (!PyTuple_Check(result) || |
463 | 199k | PyTuple_GET_SIZE(result) != 2) { |
464 | 0 | PyErr_SetString(PyExc_TypeError, |
465 | 0 | "decoder must return a tuple (object,integer)"); |
466 | 0 | goto onError; |
467 | 0 | } |
468 | 199k | v = Py_NewRef(PyTuple_GET_ITEM(result,0)); |
469 | | /* We don't check or use the second (integer) entry. */ |
470 | | |
471 | 199k | Py_DECREF(args); |
472 | 199k | Py_DECREF(decoder); |
473 | 199k | Py_DECREF(result); |
474 | 199k | return v; |
475 | | |
476 | 58.0k | onError: |
477 | 58.0k | Py_XDECREF(args); |
478 | 58.0k | Py_XDECREF(decoder); |
479 | 58.0k | Py_XDECREF(result); |
480 | 58.0k | return NULL; |
481 | 199k | } |
482 | | |
483 | | /* Generic encoding/decoding API */ |
484 | | PyObject *PyCodec_Encode(PyObject *object, |
485 | | const char *encoding, |
486 | | const char *errors) |
487 | 0 | { |
488 | 0 | PyObject *encoder; |
489 | |
|
490 | 0 | encoder = PyCodec_Encoder(encoding); |
491 | 0 | if (encoder == NULL) |
492 | 0 | return NULL; |
493 | | |
494 | 0 | return _PyCodec_EncodeInternal(object, encoder, encoding, errors); |
495 | 0 | } |
496 | | |
497 | | PyObject *PyCodec_Decode(PyObject *object, |
498 | | const char *encoding, |
499 | | const char *errors) |
500 | 0 | { |
501 | 0 | PyObject *decoder; |
502 | |
|
503 | 0 | decoder = PyCodec_Decoder(encoding); |
504 | 0 | if (decoder == NULL) |
505 | 0 | return NULL; |
506 | | |
507 | 0 | return _PyCodec_DecodeInternal(object, decoder, encoding, errors); |
508 | 0 | } |
509 | | |
510 | | /* Text encoding/decoding API */ |
511 | | PyObject * _PyCodec_LookupTextEncoding(const char *encoding, |
512 | | const char *alternate_command) |
513 | 1.26M | { |
514 | 1.26M | PyObject *codec; |
515 | 1.26M | PyObject *attr; |
516 | 1.26M | int is_text_codec; |
517 | | |
518 | 1.26M | codec = _PyCodec_Lookup(encoding); |
519 | 1.26M | if (codec == NULL) |
520 | 106k | return NULL; |
521 | | |
522 | | /* Backwards compatibility: assume any raw tuple describes a text |
523 | | * encoding, and the same for anything lacking the private |
524 | | * attribute. |
525 | | */ |
526 | 1.15M | if (!PyTuple_CheckExact(codec)) { |
527 | 1.15M | if (PyObject_GetOptionalAttr(codec, &_Py_ID(_is_text_encoding), &attr) < 0) { |
528 | 0 | Py_DECREF(codec); |
529 | 0 | return NULL; |
530 | 0 | } |
531 | 1.15M | if (attr != NULL) { |
532 | 1.15M | is_text_codec = PyObject_IsTrue(attr); |
533 | 1.15M | Py_DECREF(attr); |
534 | 1.15M | if (is_text_codec <= 0) { |
535 | 3.14k | Py_DECREF(codec); |
536 | 3.14k | if (!is_text_codec) { |
537 | 3.14k | if (alternate_command != NULL) { |
538 | 3.14k | PyErr_Format(PyExc_LookupError, |
539 | 3.14k | "'%.400s' is not a text encoding; " |
540 | 3.14k | "use %s to handle arbitrary codecs", |
541 | 3.14k | encoding, alternate_command); |
542 | 3.14k | } |
543 | 0 | else { |
544 | 0 | PyErr_Format(PyExc_LookupError, |
545 | 0 | "'%.400s' is not a text encoding", |
546 | 0 | encoding); |
547 | 0 | } |
548 | 3.14k | } |
549 | 3.14k | return NULL; |
550 | 3.14k | } |
551 | 1.15M | } |
552 | 1.15M | } |
553 | | |
554 | | /* This appears to be a valid text encoding */ |
555 | 1.15M | return codec; |
556 | 1.15M | } |
557 | | |
558 | | |
559 | | static |
560 | | PyObject *codec_getitem_checked(const char *encoding, |
561 | | const char *alternate_command, |
562 | | int index) |
563 | 1.25M | { |
564 | 1.25M | PyObject *codec; |
565 | 1.25M | PyObject *v; |
566 | | |
567 | 1.25M | codec = _PyCodec_LookupTextEncoding(encoding, alternate_command); |
568 | 1.25M | if (codec == NULL) |
569 | 109k | return NULL; |
570 | | |
571 | 1.15M | v = Py_NewRef(PyTuple_GET_ITEM(codec, index)); |
572 | 1.15M | Py_DECREF(codec); |
573 | 1.15M | return v; |
574 | 1.25M | } |
575 | | |
576 | | static PyObject * _PyCodec_TextEncoder(const char *encoding) |
577 | 892k | { |
578 | 892k | return codec_getitem_checked(encoding, "codecs.encode()", 0); |
579 | 892k | } |
580 | | |
581 | | static PyObject * _PyCodec_TextDecoder(const char *encoding) |
582 | 367k | { |
583 | 367k | return codec_getitem_checked(encoding, "codecs.decode()", 1); |
584 | 367k | } |
585 | | |
586 | | PyObject *_PyCodec_EncodeText(PyObject *object, |
587 | | const char *encoding, |
588 | | const char *errors) |
589 | 892k | { |
590 | 892k | PyObject *encoder; |
591 | | |
592 | 892k | encoder = _PyCodec_TextEncoder(encoding); |
593 | 892k | if (encoder == NULL) |
594 | 0 | return NULL; |
595 | | |
596 | 892k | return _PyCodec_EncodeInternal(object, encoder, encoding, errors); |
597 | 892k | } |
598 | | |
599 | | PyObject *_PyCodec_DecodeText(PyObject *object, |
600 | | const char *encoding, |
601 | | const char *errors) |
602 | 367k | { |
603 | 367k | PyObject *decoder; |
604 | | |
605 | 367k | decoder = _PyCodec_TextDecoder(encoding); |
606 | 367k | if (decoder == NULL) |
607 | 109k | return NULL; |
608 | | |
609 | 257k | return _PyCodec_DecodeInternal(object, decoder, encoding, errors); |
610 | 367k | } |
611 | | |
612 | | /* Register the error handling callback function error under the name |
613 | | name. This function will be called by the codec when it encounters |
614 | | an unencodable characters/undecodable bytes and doesn't know the |
615 | | callback name, when name is specified as the error parameter |
616 | | in the call to the encode/decode function. |
617 | | Return 0 on success, -1 on error */ |
618 | | int PyCodec_RegisterError(const char *name, PyObject *error) |
619 | 0 | { |
620 | 0 | PyInterpreterState *interp = _PyInterpreterState_GET(); |
621 | 0 | assert(interp->codecs.initialized); |
622 | 0 | if (!PyCallable_Check(error)) { |
623 | 0 | PyErr_SetString(PyExc_TypeError, "handler must be callable"); |
624 | 0 | return -1; |
625 | 0 | } |
626 | 0 | return PyDict_SetItemString(interp->codecs.error_registry, |
627 | 0 | name, error); |
628 | 0 | } |
629 | | |
630 | | int _PyCodec_UnregisterError(const char *name) |
631 | 0 | { |
632 | 0 | for (size_t i = 0; i < Py_ARRAY_LENGTH(codecs_builtin_error_handlers); ++i) { |
633 | 0 | if (strcmp(name, codecs_builtin_error_handlers[i]) == 0) { |
634 | 0 | PyErr_Format(PyExc_ValueError, |
635 | 0 | "cannot un-register built-in error handler '%s'", name); |
636 | 0 | return -1; |
637 | 0 | } |
638 | 0 | } |
639 | 0 | PyInterpreterState *interp = _PyInterpreterState_GET(); |
640 | 0 | assert(interp->codecs.initialized); |
641 | 0 | return PyDict_PopString(interp->codecs.error_registry, name, NULL); |
642 | 0 | } |
643 | | |
644 | | /* Lookup the error handling callback function registered under the |
645 | | name error. As a special case NULL can be passed, in which case |
646 | | the error handling callback for strict encoding will be returned. */ |
647 | | PyObject *PyCodec_LookupError(const char *name) |
648 | 287k | { |
649 | 287k | PyInterpreterState *interp = _PyInterpreterState_GET(); |
650 | 287k | assert(interp->codecs.initialized); |
651 | | |
652 | 287k | if (name==NULL) |
653 | 200k | name = "strict"; |
654 | 287k | PyObject *handler; |
655 | 287k | if (PyDict_GetItemStringRef(interp->codecs.error_registry, name, &handler) < 0) { |
656 | 0 | return NULL; |
657 | 0 | } |
658 | 287k | if (handler == NULL) { |
659 | 0 | PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name); |
660 | 0 | return NULL; |
661 | 0 | } |
662 | 287k | return handler; |
663 | 287k | } |
664 | | |
665 | | |
/* Set a TypeError reporting that an error callback received an exception
   object 'exc' it does not know how to handle. The caller is responsible
   for returning NULL afterwards. */
static inline void
wrong_exception_type(PyObject *exc)
{
    PyErr_Format(PyExc_TypeError,
                 "don't know how to handle %T in error callback", exc);
}
672 | | |
673 | | |
/* Type predicates used by the error callbacks to dispatch on the kind of
   Unicode error they were given. Each expands to a PyObject_TypeCheck()
   call against the corresponding exception type. */
#define _PyIsUnicodeEncodeError(EXC)    \
    PyObject_TypeCheck(EXC, (PyTypeObject *)PyExc_UnicodeEncodeError)
#define _PyIsUnicodeDecodeError(EXC)    \
    PyObject_TypeCheck(EXC, (PyTypeObject *)PyExc_UnicodeDecodeError)
#define _PyIsUnicodeTranslateError(EXC) \
    PyObject_TypeCheck(EXC, (PyTypeObject *)PyExc_UnicodeTranslateError)
680 | | |
681 | | |
682 | | // --- codecs handlers: utilities --------------------------------------------- |
683 | | |
684 | | /* |
685 | | * Return the number of characters (including special prefixes) |
686 | | * needed to represent 'ch' by codec_handler_write_unicode_hex(). |
687 | | */ |
688 | | static inline Py_ssize_t |
689 | | codec_handler_unicode_hex_width(Py_UCS4 ch) |
690 | 0 | { |
691 | 0 | if (ch >= 0x10000) { |
692 | | // format: '\\' + 'U' + 8 hex digits |
693 | 0 | return 1 + 1 + 8; |
694 | 0 | } |
695 | 0 | else if (ch >= 0x100) { |
696 | | // format: '\\' + 'u' + 4 hex digits |
697 | 0 | return 1 + 1 + 4; |
698 | 0 | } |
699 | 0 | else { |
700 | | // format: '\\' + 'x' + 2 hex digits |
701 | 0 | return 1 + 1 + 2; |
702 | 0 | } |
703 | 0 | } |
704 | | |
705 | | |
706 | | /* |
707 | | * Write the hexadecimal representation of 'ch' to the buffer pointed by 'p' |
708 | | * using 2, 4, or 8 characters prefixed by '\x', '\u', or '\U' respectively. |
709 | | */ |
710 | | static inline void |
711 | | codec_handler_write_unicode_hex(Py_UCS1 **p, Py_UCS4 ch) |
712 | 0 | { |
713 | 0 | *(*p)++ = '\\'; |
714 | 0 | if (ch >= 0x10000) { |
715 | 0 | *(*p)++ = 'U'; |
716 | 0 | *(*p)++ = Py_hexdigits[(ch >> 28) & 0xf]; |
717 | 0 | *(*p)++ = Py_hexdigits[(ch >> 24) & 0xf]; |
718 | 0 | *(*p)++ = Py_hexdigits[(ch >> 20) & 0xf]; |
719 | 0 | *(*p)++ = Py_hexdigits[(ch >> 16) & 0xf]; |
720 | 0 | *(*p)++ = Py_hexdigits[(ch >> 12) & 0xf]; |
721 | 0 | *(*p)++ = Py_hexdigits[(ch >> 8) & 0xf]; |
722 | 0 | } |
723 | 0 | else if (ch >= 0x100) { |
724 | 0 | *(*p)++ = 'u'; |
725 | 0 | *(*p)++ = Py_hexdigits[(ch >> 12) & 0xf]; |
726 | 0 | *(*p)++ = Py_hexdigits[(ch >> 8) & 0xf]; |
727 | 0 | } |
728 | 0 | else { |
729 | 0 | *(*p)++ = 'x'; |
730 | 0 | } |
731 | 0 | *(*p)++ = Py_hexdigits[(ch >> 4) & 0xf]; |
732 | 0 | *(*p)++ = Py_hexdigits[ch & 0xf]; |
733 | 0 | } |
734 | | |
735 | | |
736 | | /* |
737 | | * Determine the number of digits for a decimal representation of Unicode |
738 | | * codepoint 'ch' (by design, Unicode codepoints are limited to 7 digits). |
739 | | */ |
740 | | static inline int |
741 | | n_decimal_digits_for_codepoint(Py_UCS4 ch) |
742 | 0 | { |
743 | 0 | if (ch < 10) return 1; |
744 | 0 | if (ch < 100) return 2; |
745 | 0 | if (ch < 1000) return 3; |
746 | 0 | if (ch < 10000) return 4; |
747 | 0 | if (ch < 100000) return 5; |
748 | 0 | if (ch < 1000000) return 6; |
749 | 0 | if (ch < 10000000) return 7; |
750 | | // Unicode codepoints are limited to 1114111 (7 decimal digits) |
751 | 0 | Py_UNREACHABLE(); |
752 | 0 | } |
753 | | |
754 | | |
755 | | /* |
756 | | * Create a Unicode string containing 'count' copies of the official |
757 | | * Unicode REPLACEMENT CHARACTER (0xFFFD). |
758 | | */ |
759 | | static PyObject * |
760 | | codec_handler_unicode_replacement_character(Py_ssize_t count) |
761 | 214k | { |
762 | 214k | PyObject *res = PyUnicode_New(count, Py_UNICODE_REPLACEMENT_CHARACTER); |
763 | 214k | if (res == NULL) { |
764 | 0 | return NULL; |
765 | 0 | } |
766 | 214k | assert(count == 0 || PyUnicode_KIND(res) == PyUnicode_2BYTE_KIND); |
767 | 214k | Py_UCS2 *outp = PyUnicode_2BYTE_DATA(res); |
768 | 429k | for (Py_ssize_t i = 0; i < count; ++i) { |
769 | 214k | outp[i] = Py_UNICODE_REPLACEMENT_CHARACTER; |
770 | 214k | } |
771 | 214k | assert(_PyUnicode_CheckConsistency(res, 1)); |
772 | 214k | return res; |
773 | 214k | } |
774 | | |
775 | | |
776 | | // --- handler: 'strict' ------------------------------------------------------ |
777 | | |
778 | | PyObject *PyCodec_StrictErrors(PyObject *exc) |
779 | 268k | { |
780 | 268k | if (PyExceptionInstance_Check(exc)) { |
781 | 268k | PyErr_SetObject(PyExceptionInstance_Class(exc), exc); |
782 | 268k | } |
783 | 0 | else { |
784 | 0 | PyErr_SetString(PyExc_TypeError, "codec must pass exception instance"); |
785 | 0 | } |
786 | 268k | return NULL; |
787 | 268k | } |
788 | | |
789 | | |
790 | | // --- handler: 'ignore' ------------------------------------------------------ |
791 | | |
792 | | static PyObject * |
793 | | _PyCodec_IgnoreError(PyObject *exc, int as_bytes) |
794 | 0 | { |
795 | 0 | Py_ssize_t end; |
796 | 0 | if (_PyUnicodeError_GetParams(exc, NULL, NULL, NULL, |
797 | 0 | &end, NULL, as_bytes) < 0) |
798 | 0 | { |
799 | 0 | return NULL; |
800 | 0 | } |
801 | 0 | return Py_BuildValue("(Nn)", Py_GetConstant(Py_CONSTANT_EMPTY_STR), end); |
802 | 0 | } |
803 | | |
804 | | |
805 | | PyObject *PyCodec_IgnoreErrors(PyObject *exc) |
806 | 0 | { |
807 | 0 | if (_PyIsUnicodeEncodeError(exc) || _PyIsUnicodeTranslateError(exc)) { |
808 | 0 | return _PyCodec_IgnoreError(exc, false); |
809 | 0 | } |
810 | 0 | else if (_PyIsUnicodeDecodeError(exc)) { |
811 | 0 | return _PyCodec_IgnoreError(exc, true); |
812 | 0 | } |
813 | 0 | else { |
814 | 0 | wrong_exception_type(exc); |
815 | 0 | return NULL; |
816 | 0 | } |
817 | 0 | } |
818 | | |
819 | | |
820 | | // --- handler: 'replace' ----------------------------------------------------- |
821 | | |
822 | | static PyObject * |
823 | | _PyCodec_ReplaceUnicodeEncodeError(PyObject *exc) |
824 | 0 | { |
825 | 0 | Py_ssize_t start, end, slen; |
826 | 0 | if (_PyUnicodeError_GetParams(exc, NULL, NULL, |
827 | 0 | &start, &end, &slen, false) < 0) |
828 | 0 | { |
829 | 0 | return NULL; |
830 | 0 | } |
831 | 0 | PyObject *res = PyUnicode_New(slen, '?'); |
832 | 0 | if (res == NULL) { |
833 | 0 | return NULL; |
834 | 0 | } |
835 | 0 | assert(PyUnicode_KIND(res) == PyUnicode_1BYTE_KIND); |
836 | 0 | Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res); |
837 | 0 | memset(outp, '?', sizeof(Py_UCS1) * slen); |
838 | 0 | assert(_PyUnicode_CheckConsistency(res, 1)); |
839 | 0 | return Py_BuildValue("(Nn)", res, end); |
840 | 0 | } |
841 | | |
842 | | |
843 | | static PyObject * |
844 | | _PyCodec_ReplaceUnicodeDecodeError(PyObject *exc) |
845 | 214k | { |
846 | 214k | Py_ssize_t end; |
847 | 214k | if (PyUnicodeDecodeError_GetEnd(exc, &end) < 0) { |
848 | 0 | return NULL; |
849 | 0 | } |
850 | 214k | PyObject *res = codec_handler_unicode_replacement_character(1); |
851 | 214k | if (res == NULL) { |
852 | 0 | return NULL; |
853 | 0 | } |
854 | 214k | return Py_BuildValue("(Nn)", res, end); |
855 | 214k | } |
856 | | |
857 | | |
858 | | static PyObject * |
859 | | _PyCodec_ReplaceUnicodeTranslateError(PyObject *exc) |
860 | 0 | { |
861 | 0 | Py_ssize_t start, end, slen; |
862 | 0 | if (_PyUnicodeError_GetParams(exc, NULL, NULL, |
863 | 0 | &start, &end, &slen, false) < 0) |
864 | 0 | { |
865 | 0 | return NULL; |
866 | 0 | } |
867 | 0 | PyObject *res = codec_handler_unicode_replacement_character(slen); |
868 | 0 | if (res == NULL) { |
869 | 0 | return NULL; |
870 | 0 | } |
871 | 0 | return Py_BuildValue("(Nn)", res, end); |
872 | 0 | } |
873 | | |
874 | | |
875 | | PyObject *PyCodec_ReplaceErrors(PyObject *exc) |
876 | 214k | { |
877 | 214k | if (_PyIsUnicodeEncodeError(exc)) { |
878 | 0 | return _PyCodec_ReplaceUnicodeEncodeError(exc); |
879 | 0 | } |
880 | 214k | else if (_PyIsUnicodeDecodeError(exc)) { |
881 | 214k | return _PyCodec_ReplaceUnicodeDecodeError(exc); |
882 | 214k | } |
883 | 0 | else if (_PyIsUnicodeTranslateError(exc)) { |
884 | 0 | return _PyCodec_ReplaceUnicodeTranslateError(exc); |
885 | 0 | } |
886 | 0 | else { |
887 | 0 | wrong_exception_type(exc); |
888 | 0 | return NULL; |
889 | 0 | } |
890 | 214k | } |
891 | | |
892 | | |
893 | | // --- handler: 'xmlcharrefreplace' ------------------------------------------- |
894 | | |
895 | | PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc) |
896 | 0 | { |
897 | 0 | if (!_PyIsUnicodeEncodeError(exc)) { |
898 | 0 | wrong_exception_type(exc); |
899 | 0 | return NULL; |
900 | 0 | } |
901 | | |
902 | 0 | PyObject *obj; |
903 | 0 | Py_ssize_t objlen, start, end, slen; |
904 | 0 | if (_PyUnicodeError_GetParams(exc, |
905 | 0 | &obj, &objlen, |
906 | 0 | &start, &end, &slen, false) < 0) |
907 | 0 | { |
908 | 0 | return NULL; |
909 | 0 | } |
910 | | |
911 | | // The number of characters that each character 'ch' contributes |
912 | | // in the result is 2 + k + 1, where k = min{t >= 1 | 10^t > ch} |
913 | | // and will be formatted as "&#" + DIGITS + ";". Since the Unicode |
914 | | // range is below 10^7, each "block" requires at most 2 + 7 + 1 |
915 | | // characters. |
916 | 0 | if (slen > PY_SSIZE_T_MAX / (2 + 7 + 1)) { |
917 | 0 | end = start + PY_SSIZE_T_MAX / (2 + 7 + 1); |
918 | 0 | end = Py_MIN(end, objlen); |
919 | 0 | slen = Py_MAX(0, end - start); |
920 | 0 | } |
921 | |
|
922 | 0 | Py_ssize_t ressize = 0; |
923 | 0 | for (Py_ssize_t i = start; i < end; ++i) { |
924 | 0 | Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i); |
925 | 0 | int k = n_decimal_digits_for_codepoint(ch); |
926 | 0 | assert(k != 0); |
927 | 0 | assert(k <= 7); |
928 | 0 | ressize += 2 + k + 1; |
929 | 0 | } |
930 | | |
931 | | /* allocate replacement */ |
932 | 0 | PyObject *res = PyUnicode_New(ressize, 127); |
933 | 0 | if (res == NULL) { |
934 | 0 | Py_DECREF(obj); |
935 | 0 | return NULL; |
936 | 0 | } |
937 | 0 | Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res); |
938 | | /* generate replacement */ |
939 | 0 | for (Py_ssize_t i = start; i < end; ++i) { |
940 | 0 | Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i); |
941 | | /* |
942 | | * Write the decimal representation of 'ch' to the buffer pointed by 'p' |
943 | | * using at most 7 characters prefixed by '&#' and suffixed by ';'. |
944 | | */ |
945 | 0 | *outp++ = '&'; |
946 | 0 | *outp++ = '#'; |
947 | 0 | Py_UCS1 *digit_end = outp + n_decimal_digits_for_codepoint(ch); |
948 | 0 | for (Py_UCS1 *p_digit = digit_end - 1; p_digit >= outp; --p_digit) { |
949 | 0 | *p_digit = '0' + (ch % 10); |
950 | 0 | ch /= 10; |
951 | 0 | } |
952 | 0 | assert(ch == 0); |
953 | 0 | outp = digit_end; |
954 | 0 | *outp++ = ';'; |
955 | 0 | } |
956 | 0 | assert(_PyUnicode_CheckConsistency(res, 1)); |
957 | 0 | PyObject *restuple = Py_BuildValue("(Nn)", res, end); |
958 | 0 | Py_DECREF(obj); |
959 | 0 | return restuple; |
960 | 0 | } |
961 | | |
962 | | |
963 | | // --- handler: 'backslashreplace' -------------------------------------------- |
964 | | |
965 | | static PyObject * |
966 | | _PyCodec_BackslashReplaceUnicodeEncodeError(PyObject *exc) |
967 | 0 | { |
968 | 0 | PyObject *obj; |
969 | 0 | Py_ssize_t objlen, start, end, slen; |
970 | 0 | if (_PyUnicodeError_GetParams(exc, |
971 | 0 | &obj, &objlen, |
972 | 0 | &start, &end, &slen, false) < 0) |
973 | 0 | { |
974 | 0 | return NULL; |
975 | 0 | } |
976 | | |
977 | | // The number of characters that each character 'ch' contributes |
978 | | // in the result is 1 + 1 + k, where k >= min{t >= 1 | 16^t > ch} |
979 | | // and will be formatted as "\\" + ('U'|'u'|'x') + HEXDIGITS, |
980 | | // where the number of hexdigits is either 2, 4, or 8 (not 6). |
981 | | // Since the Unicode range is below 10^7, we choose k = 8 whence |
982 | | // each "block" requires at most 1 + 1 + 8 characters. |
983 | 0 | if (slen > PY_SSIZE_T_MAX / (1 + 1 + 8)) { |
984 | 0 | end = start + PY_SSIZE_T_MAX / (1 + 1 + 8); |
985 | 0 | end = Py_MIN(end, objlen); |
986 | 0 | slen = Py_MAX(0, end - start); |
987 | 0 | } |
988 | |
|
989 | 0 | Py_ssize_t ressize = 0; |
990 | 0 | for (Py_ssize_t i = start; i < end; ++i) { |
991 | 0 | Py_UCS4 c = PyUnicode_READ_CHAR(obj, i); |
992 | 0 | ressize += codec_handler_unicode_hex_width(c); |
993 | 0 | } |
994 | 0 | PyObject *res = PyUnicode_New(ressize, 127); |
995 | 0 | if (res == NULL) { |
996 | 0 | Py_DECREF(obj); |
997 | 0 | return NULL; |
998 | 0 | } |
999 | 0 | Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res); |
1000 | 0 | for (Py_ssize_t i = start; i < end; ++i) { |
1001 | 0 | Py_UCS4 c = PyUnicode_READ_CHAR(obj, i); |
1002 | 0 | codec_handler_write_unicode_hex(&outp, c); |
1003 | 0 | } |
1004 | 0 | assert(_PyUnicode_CheckConsistency(res, 1)); |
1005 | 0 | Py_DECREF(obj); |
1006 | 0 | return Py_BuildValue("(Nn)", res, end); |
1007 | 0 | } |
1008 | | |
1009 | | |
1010 | | static PyObject * |
1011 | | _PyCodec_BackslashReplaceUnicodeDecodeError(PyObject *exc) |
1012 | 0 | { |
1013 | 0 | PyObject *obj; |
1014 | 0 | Py_ssize_t objlen, start, end, slen; |
1015 | 0 | if (_PyUnicodeError_GetParams(exc, |
1016 | 0 | &obj, &objlen, |
1017 | 0 | &start, &end, &slen, true) < 0) |
1018 | 0 | { |
1019 | 0 | return NULL; |
1020 | 0 | } |
1021 | | |
1022 | 0 | PyObject *res = PyUnicode_New(4 * slen, 127); |
1023 | 0 | if (res == NULL) { |
1024 | 0 | Py_DECREF(obj); |
1025 | 0 | return NULL; |
1026 | 0 | } |
1027 | | |
1028 | 0 | Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res); |
1029 | 0 | const unsigned char *p = (const unsigned char *)PyBytes_AS_STRING(obj); |
1030 | 0 | for (Py_ssize_t i = start; i < end; i++, outp += 4) { |
1031 | 0 | const unsigned char ch = p[i]; |
1032 | 0 | outp[0] = '\\'; |
1033 | 0 | outp[1] = 'x'; |
1034 | 0 | outp[2] = Py_hexdigits[(ch >> 4) & 0xf]; |
1035 | 0 | outp[3] = Py_hexdigits[ch & 0xf]; |
1036 | 0 | } |
1037 | 0 | assert(_PyUnicode_CheckConsistency(res, 1)); |
1038 | 0 | Py_DECREF(obj); |
1039 | 0 | return Py_BuildValue("(Nn)", res, end); |
1040 | 0 | } |
1041 | | |
1042 | | |
1043 | | static inline PyObject * |
1044 | | _PyCodec_BackslashReplaceUnicodeTranslateError(PyObject *exc) |
1045 | 0 | { |
1046 | | // Same implementation as for UnicodeEncodeError objects. |
1047 | 0 | return _PyCodec_BackslashReplaceUnicodeEncodeError(exc); |
1048 | 0 | } |
1049 | | |
1050 | | |
1051 | | PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) |
1052 | 0 | { |
1053 | 0 | if (_PyIsUnicodeEncodeError(exc)) { |
1054 | 0 | return _PyCodec_BackslashReplaceUnicodeEncodeError(exc); |
1055 | 0 | } |
1056 | 0 | else if (_PyIsUnicodeDecodeError(exc)) { |
1057 | 0 | return _PyCodec_BackslashReplaceUnicodeDecodeError(exc); |
1058 | 0 | } |
1059 | 0 | else if (_PyIsUnicodeTranslateError(exc)) { |
1060 | 0 | return _PyCodec_BackslashReplaceUnicodeTranslateError(exc); |
1061 | 0 | } |
1062 | 0 | else { |
1063 | 0 | wrong_exception_type(exc); |
1064 | 0 | return NULL; |
1065 | 0 | } |
1066 | 0 | } |
1067 | | |
1068 | | |
1069 | | // --- handler: 'namereplace' ------------------------------------------------- |
1070 | | |
1071 | | PyObject *PyCodec_NameReplaceErrors(PyObject *exc) |
1072 | 0 | { |
1073 | 0 | if (!_PyIsUnicodeEncodeError(exc)) { |
1074 | 0 | wrong_exception_type(exc); |
1075 | 0 | return NULL; |
1076 | 0 | } |
1077 | | |
1078 | 0 | _PyUnicode_Name_CAPI *ucnhash_capi = _PyUnicode_GetNameCAPI(); |
1079 | 0 | if (ucnhash_capi == NULL) { |
1080 | 0 | return NULL; |
1081 | 0 | } |
1082 | | |
1083 | 0 | PyObject *obj; |
1084 | 0 | Py_ssize_t start, end; |
1085 | 0 | if (_PyUnicodeError_GetParams(exc, |
1086 | 0 | &obj, NULL, |
1087 | 0 | &start, &end, NULL, false) < 0) |
1088 | 0 | { |
1089 | 0 | return NULL; |
1090 | 0 | } |
1091 | | |
1092 | 0 | char buffer[256]; /* NAME_MAXLEN in unicodename_db.h */ |
1093 | 0 | Py_ssize_t imax = start, ressize = 0, replsize; |
1094 | 0 | for (; imax < end; ++imax) { |
1095 | 0 | Py_UCS4 c = PyUnicode_READ_CHAR(obj, imax); |
1096 | 0 | if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) { |
1097 | | // If 'c' is recognized by getname(), the corresponding replacement |
1098 | | // is '\\' + 'N' + '{' + NAME + '}', i.e. 1 + 1 + 1 + len(NAME) + 1 |
1099 | | // characters. Failures of getname() are ignored by the handler. |
1100 | 0 | replsize = 1 + 1 + 1 + strlen(buffer) + 1; |
1101 | 0 | } |
1102 | 0 | else { |
1103 | 0 | replsize = codec_handler_unicode_hex_width(c); |
1104 | 0 | } |
1105 | 0 | if (ressize > PY_SSIZE_T_MAX - replsize) { |
1106 | 0 | break; |
1107 | 0 | } |
1108 | 0 | ressize += replsize; |
1109 | 0 | } |
1110 | |
|
1111 | 0 | PyObject *res = PyUnicode_New(ressize, 127); |
1112 | 0 | if (res == NULL) { |
1113 | 0 | Py_DECREF(obj); |
1114 | 0 | return NULL; |
1115 | 0 | } |
1116 | | |
1117 | 0 | Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res); |
1118 | 0 | for (Py_ssize_t i = start; i < imax; ++i) { |
1119 | 0 | Py_UCS4 c = PyUnicode_READ_CHAR(obj, i); |
1120 | 0 | if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) { |
1121 | 0 | *outp++ = '\\'; |
1122 | 0 | *outp++ = 'N'; |
1123 | 0 | *outp++ = '{'; |
1124 | 0 | (void)strcpy((char *)outp, buffer); |
1125 | 0 | outp += strlen(buffer); |
1126 | 0 | *outp++ = '}'; |
1127 | 0 | } |
1128 | 0 | else { |
1129 | 0 | codec_handler_write_unicode_hex(&outp, c); |
1130 | 0 | } |
1131 | 0 | } |
1132 | |
|
1133 | 0 | assert(outp == PyUnicode_1BYTE_DATA(res) + ressize); |
1134 | 0 | assert(_PyUnicode_CheckConsistency(res, 1)); |
1135 | 0 | PyObject *restuple = Py_BuildValue("(Nn)", res, imax); |
1136 | 0 | Py_DECREF(obj); |
1137 | 0 | return restuple; |
1138 | 0 | } |
1139 | | |
1140 | | |
#define ENC_UNKNOWN     -1
#define ENC_UTF8        0
#define ENC_UTF16BE     1
#define ENC_UTF16LE     2
#define ENC_UTF32BE     3
#define ENC_UTF32LE     4

/*
 * Parse the byte-order suffix that may follow "utf16"/"utf32": an optional
 * '-' or '_' separator followed by exactly "be" or "le" (any case).
 *
 * Return 'b' (big-endian) or 'l' (little-endian), or 0 when 'suffix' is
 * anything else.  Characters are tested left to right and the scan stops
 * at the first NUL, so nothing past the terminator is ever read.
 */
static int
parse_utf_byteorder(const char *suffix)
{
    if (*suffix == '-' || *suffix == '_') {
        suffix++;
    }
    if (suffix[0] == '\0' || suffix[1] == '\0' || suffix[2] != '\0') {
        return 0;
    }
    if (Py_TOLOWER(suffix[1]) != 'e') {
        return 0;
    }
    int order = Py_TOLOWER(suffix[0]);
    return (order == 'b' || order == 'l') ? order : 0;
}


/*
 * Map an encoding name to one of the ENC_* codes.
 *
 * Recognized spellings: "utf" (any case), an optional '-' or '_', then
 * "8", "16" or "32"; "16" and "32" may be followed by an optional '-' or
 * '_' and "be"/"le" (any case).  Without a byte-order suffix the native
 * byte order is selected through WORDS_BIGENDIAN.  "cp65001" (exact match)
 * is treated as UTF-8.
 *
 * On a match, '*bytelength' receives the number of bytes used for one
 * surrogate (3 for UTF-8, 2 for UTF-16, 4 for UTF-32).  When ENC_UNKNOWN is
 * returned, '*bytelength' may or may not have been written and must not be
 * relied upon.
 */
static int
get_standard_encoding_impl(const char *encoding, int *bytelength)
{
    if (Py_TOLOWER(encoding[0]) == 'u' &&
        Py_TOLOWER(encoding[1]) == 't' &&
        Py_TOLOWER(encoding[2]) == 'f') {
        encoding += 3;
        if (*encoding == '-' || *encoding == '_' ) {
            encoding++;
        }
        if (encoding[0] == '8' && encoding[1] == '\0') {
            *bytelength = 3;
            return ENC_UTF8;
        }
        else if (encoding[0] == '1' && encoding[1] == '6') {
            encoding += 2;
            *bytelength = 2;
            if (*encoding == '\0') {
#ifdef WORDS_BIGENDIAN
                return ENC_UTF16BE;
#else
                return ENC_UTF16LE;
#endif
            }
            // Names such as "utf-16-" used to make the suffix check read
            // one byte past the terminating NUL; the helper is bounded.
            switch (parse_utf_byteorder(encoding)) {
                case 'b':
                    return ENC_UTF16BE;
                case 'l':
                    return ENC_UTF16LE;
                default:
                    break;
            }
        }
        else if (encoding[0] == '3' && encoding[1] == '2') {
            encoding += 2;
            *bytelength = 4;
            if (*encoding == '\0') {
#ifdef WORDS_BIGENDIAN
                return ENC_UTF32BE;
#else
                return ENC_UTF32LE;
#endif
            }
            switch (parse_utf_byteorder(encoding)) {
                case 'b':
                    return ENC_UTF32BE;
                case 'l':
                    return ENC_UTF32LE;
                default:
                    break;
            }
        }
    }
    else if (strcmp(encoding, "cp65001") == 0) {
        *bytelength = 3;
        return ENC_UTF8;
    }
    return ENC_UNKNOWN;
}
1206 | | |
1207 | | |
1208 | | static int |
1209 | | get_standard_encoding(PyObject *encoding, int *code, int *bytelength) |
1210 | 0 | { |
1211 | 0 | const char *encoding_cstr = PyUnicode_AsUTF8(encoding); |
1212 | 0 | if (encoding_cstr == NULL) { |
1213 | 0 | return -1; |
1214 | 0 | } |
1215 | 0 | *code = get_standard_encoding_impl(encoding_cstr, bytelength); |
1216 | 0 | return 0; |
1217 | 0 | } |
1218 | | |
1219 | | |
1220 | | // --- handler: 'surrogatepass' ----------------------------------------------- |
1221 | | |
1222 | | static PyObject * |
1223 | | _PyCodec_SurrogatePassUnicodeEncodeError(PyObject *exc) |
1224 | 0 | { |
1225 | 0 | PyObject *encoding = PyUnicodeEncodeError_GetEncoding(exc); |
1226 | 0 | if (encoding == NULL) { |
1227 | 0 | return NULL; |
1228 | 0 | } |
1229 | 0 | int code, bytelength; |
1230 | 0 | int rc = get_standard_encoding(encoding, &code, &bytelength); |
1231 | 0 | Py_DECREF(encoding); |
1232 | 0 | if (rc < 0) { |
1233 | 0 | return NULL; |
1234 | 0 | } |
1235 | 0 | if (code == ENC_UNKNOWN) { |
1236 | 0 | goto bail; |
1237 | 0 | } |
1238 | | |
1239 | 0 | PyObject *obj; |
1240 | 0 | Py_ssize_t objlen, start, end, slen; |
1241 | 0 | if (_PyUnicodeError_GetParams(exc, |
1242 | 0 | &obj, &objlen, |
1243 | 0 | &start, &end, &slen, false) < 0) |
1244 | 0 | { |
1245 | 0 | return NULL; |
1246 | 0 | } |
1247 | | |
1248 | 0 | if (slen > PY_SSIZE_T_MAX / bytelength) { |
1249 | 0 | end = start + PY_SSIZE_T_MAX / bytelength; |
1250 | 0 | end = Py_MIN(end, objlen); |
1251 | 0 | slen = Py_MAX(0, end - start); |
1252 | 0 | } |
1253 | |
|
1254 | 0 | PyObject *res = PyBytes_FromStringAndSize(NULL, bytelength * slen); |
1255 | 0 | if (res == NULL) { |
1256 | 0 | Py_DECREF(obj); |
1257 | 0 | return NULL; |
1258 | 0 | } |
1259 | | |
1260 | 0 | unsigned char *outp = (unsigned char *)PyBytes_AsString(res); |
1261 | 0 | for (Py_ssize_t i = start; i < end; i++) { |
1262 | 0 | Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i); |
1263 | 0 | if (!Py_UNICODE_IS_SURROGATE(ch)) { |
1264 | | /* Not a surrogate, fail with original exception */ |
1265 | 0 | Py_DECREF(obj); |
1266 | 0 | Py_DECREF(res); |
1267 | 0 | goto bail; |
1268 | 0 | } |
1269 | 0 | switch (code) { |
1270 | 0 | case ENC_UTF8: { |
1271 | 0 | *outp++ = (unsigned char)(0xe0 | (ch >> 12)); |
1272 | 0 | *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f)); |
1273 | 0 | *outp++ = (unsigned char)(0x80 | (ch & 0x3f)); |
1274 | 0 | break; |
1275 | 0 | } |
1276 | 0 | case ENC_UTF16LE: { |
1277 | 0 | *outp++ = (unsigned char)ch; |
1278 | 0 | *outp++ = (unsigned char)(ch >> 8); |
1279 | 0 | break; |
1280 | 0 | } |
1281 | 0 | case ENC_UTF16BE: { |
1282 | 0 | *outp++ = (unsigned char)(ch >> 8); |
1283 | 0 | *outp++ = (unsigned char)ch; |
1284 | 0 | break; |
1285 | 0 | } |
1286 | 0 | case ENC_UTF32LE: { |
1287 | 0 | *outp++ = (unsigned char)ch; |
1288 | 0 | *outp++ = (unsigned char)(ch >> 8); |
1289 | 0 | *outp++ = (unsigned char)(ch >> 16); |
1290 | 0 | *outp++ = (unsigned char)(ch >> 24); |
1291 | 0 | break; |
1292 | 0 | } |
1293 | 0 | case ENC_UTF32BE: { |
1294 | 0 | *outp++ = (unsigned char)(ch >> 24); |
1295 | 0 | *outp++ = (unsigned char)(ch >> 16); |
1296 | 0 | *outp++ = (unsigned char)(ch >> 8); |
1297 | 0 | *outp++ = (unsigned char)ch; |
1298 | 0 | break; |
1299 | 0 | } |
1300 | 0 | } |
1301 | 0 | } |
1302 | | |
1303 | 0 | Py_DECREF(obj); |
1304 | 0 | PyObject *restuple = Py_BuildValue("(Nn)", res, end); |
1305 | 0 | return restuple; |
1306 | | |
1307 | 0 | bail: |
1308 | 0 | PyErr_SetObject(PyExceptionInstance_Class(exc), exc); |
1309 | 0 | return NULL; |
1310 | 0 | } |
1311 | | |
1312 | | |
1313 | | static PyObject * |
1314 | | _PyCodec_SurrogatePassUnicodeDecodeError(PyObject *exc) |
1315 | 0 | { |
1316 | 0 | PyObject *encoding = PyUnicodeDecodeError_GetEncoding(exc); |
1317 | 0 | if (encoding == NULL) { |
1318 | 0 | return NULL; |
1319 | 0 | } |
1320 | 0 | int code, bytelength; |
1321 | 0 | int rc = get_standard_encoding(encoding, &code, &bytelength); |
1322 | 0 | Py_DECREF(encoding); |
1323 | 0 | if (rc < 0) { |
1324 | 0 | return NULL; |
1325 | 0 | } |
1326 | 0 | if (code == ENC_UNKNOWN) { |
1327 | 0 | goto bail; |
1328 | 0 | } |
1329 | | |
1330 | 0 | PyObject *obj; |
1331 | 0 | Py_ssize_t objlen, start, end, slen; |
1332 | 0 | if (_PyUnicodeError_GetParams(exc, |
1333 | 0 | &obj, &objlen, |
1334 | 0 | &start, &end, &slen, true) < 0) |
1335 | 0 | { |
1336 | 0 | return NULL; |
1337 | 0 | } |
1338 | | |
1339 | | /* Try decoding a single surrogate character. If |
1340 | | there are more, let the codec call us again. */ |
1341 | 0 | Py_UCS4 ch = 0; |
1342 | 0 | const unsigned char *p = (const unsigned char *)PyBytes_AS_STRING(obj); |
1343 | 0 | p += start; |
1344 | |
|
1345 | 0 | if (objlen - start >= bytelength) { |
1346 | 0 | switch (code) { |
1347 | 0 | case ENC_UTF8: { |
1348 | 0 | if ((p[0] & 0xf0) == 0xe0 && |
1349 | 0 | (p[1] & 0xc0) == 0x80 && |
1350 | 0 | (p[2] & 0xc0) == 0x80) |
1351 | 0 | { |
1352 | | /* it's a three-byte code */ |
1353 | 0 | ch = ((p[0] & 0x0f) << 12) + |
1354 | 0 | ((p[1] & 0x3f) << 6) + |
1355 | 0 | (p[2] & 0x3f); |
1356 | 0 | } |
1357 | 0 | break; |
1358 | 0 | } |
1359 | 0 | case ENC_UTF16LE: { |
1360 | 0 | ch = p[1] << 8 | p[0]; |
1361 | 0 | break; |
1362 | 0 | } |
1363 | 0 | case ENC_UTF16BE: { |
1364 | 0 | ch = p[0] << 8 | p[1]; |
1365 | 0 | break; |
1366 | 0 | } |
1367 | 0 | case ENC_UTF32LE: { |
1368 | 0 | ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0]; |
1369 | 0 | break; |
1370 | 0 | } |
1371 | 0 | case ENC_UTF32BE: { |
1372 | 0 | ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3]; |
1373 | 0 | break; |
1374 | 0 | } |
1375 | 0 | } |
1376 | 0 | } |
1377 | 0 | Py_DECREF(obj); |
1378 | 0 | if (!Py_UNICODE_IS_SURROGATE(ch)) { |
1379 | 0 | goto bail; |
1380 | 0 | } |
1381 | | |
1382 | 0 | PyObject *res = PyUnicode_FromOrdinal(ch); |
1383 | 0 | if (res == NULL) { |
1384 | 0 | return NULL; |
1385 | 0 | } |
1386 | 0 | return Py_BuildValue("(Nn)", res, start + bytelength); |
1387 | | |
1388 | 0 | bail: |
1389 | 0 | PyErr_SetObject(PyExceptionInstance_Class(exc), exc); |
1390 | 0 | return NULL; |
1391 | 0 | } |
1392 | | |
1393 | | |
1394 | | /* This handler is declared static until someone demonstrates |
1395 | | a need to call it directly. */ |
1396 | | static PyObject * |
1397 | | PyCodec_SurrogatePassErrors(PyObject *exc) |
1398 | 0 | { |
1399 | 0 | if (_PyIsUnicodeEncodeError(exc)) { |
1400 | 0 | return _PyCodec_SurrogatePassUnicodeEncodeError(exc); |
1401 | 0 | } |
1402 | 0 | else if (_PyIsUnicodeDecodeError(exc)) { |
1403 | 0 | return _PyCodec_SurrogatePassUnicodeDecodeError(exc); |
1404 | 0 | } |
1405 | 0 | else { |
1406 | 0 | wrong_exception_type(exc); |
1407 | 0 | return NULL; |
1408 | 0 | } |
1409 | 0 | } |
1410 | | |
1411 | | |
1412 | | // --- handler: 'surrogateescape' --------------------------------------------- |
1413 | | |
1414 | | static PyObject * |
1415 | | _PyCodec_SurrogateEscapeUnicodeEncodeError(PyObject *exc) |
1416 | 9.32k | { |
1417 | 9.32k | PyObject *obj; |
1418 | 9.32k | Py_ssize_t start, end, slen; |
1419 | 9.32k | if (_PyUnicodeError_GetParams(exc, |
1420 | 9.32k | &obj, NULL, |
1421 | 9.32k | &start, &end, &slen, false) < 0) |
1422 | 0 | { |
1423 | 0 | return NULL; |
1424 | 0 | } |
1425 | | |
1426 | 9.32k | PyObject *res = PyBytes_FromStringAndSize(NULL, slen); |
1427 | 9.32k | if (res == NULL) { |
1428 | 0 | Py_DECREF(obj); |
1429 | 0 | return NULL; |
1430 | 0 | } |
1431 | | |
1432 | 9.32k | char *outp = PyBytes_AsString(res); |
1433 | 9.32k | for (Py_ssize_t i = start; i < end; i++) { |
1434 | 9.32k | Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i); |
1435 | 9.32k | if (ch < 0xdc80 || ch > 0xdcff) { |
1436 | | /* Not a UTF-8b surrogate, fail with original exception. */ |
1437 | 9.32k | Py_DECREF(obj); |
1438 | 9.32k | Py_DECREF(res); |
1439 | 9.32k | PyErr_SetObject(PyExceptionInstance_Class(exc), exc); |
1440 | 9.32k | return NULL; |
1441 | 9.32k | } |
1442 | 0 | *outp++ = ch - 0xdc00; |
1443 | 0 | } |
1444 | 0 | Py_DECREF(obj); |
1445 | |
|
1446 | 0 | return Py_BuildValue("(Nn)", res, end); |
1447 | 9.32k | } |
1448 | | |
1449 | | |
1450 | | static PyObject * |
1451 | | _PyCodec_SurrogateEscapeUnicodeDecodeError(PyObject *exc) |
1452 | 52.4k | { |
1453 | 52.4k | PyObject *obj; |
1454 | 52.4k | Py_ssize_t start, end, slen; |
1455 | 52.4k | if (_PyUnicodeError_GetParams(exc, |
1456 | 52.4k | &obj, NULL, |
1457 | 52.4k | &start, &end, &slen, true) < 0) |
1458 | 0 | { |
1459 | 0 | return NULL; |
1460 | 0 | } |
1461 | | |
1462 | 52.4k | Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */ |
1463 | 52.4k | int consumed = 0; |
1464 | 52.4k | const unsigned char *p = (const unsigned char *)PyBytes_AS_STRING(obj); |
1465 | 114k | while (consumed < 4 && consumed < slen) { |
1466 | | /* Refuse to escape ASCII bytes. */ |
1467 | 93.0k | if (p[start + consumed] < 128) { |
1468 | 31.1k | break; |
1469 | 31.1k | } |
1470 | 61.8k | ch[consumed] = 0xdc00 + p[start + consumed]; |
1471 | 61.8k | consumed++; |
1472 | 61.8k | } |
1473 | 52.4k | Py_DECREF(obj); |
1474 | | |
1475 | 52.4k | if (consumed == 0) { |
1476 | | /* Codec complained about ASCII byte. */ |
1477 | 18.0k | PyErr_SetObject(PyExceptionInstance_Class(exc), exc); |
1478 | 18.0k | return NULL; |
1479 | 18.0k | } |
1480 | | |
1481 | 34.4k | PyObject *str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed); |
1482 | 34.4k | if (str == NULL) { |
1483 | 0 | return NULL; |
1484 | 0 | } |
1485 | 34.4k | return Py_BuildValue("(Nn)", str, start + consumed); |
1486 | 34.4k | } |
1487 | | |
1488 | | |
1489 | | static PyObject * |
1490 | | PyCodec_SurrogateEscapeErrors(PyObject *exc) |
1491 | 61.7k | { |
1492 | 61.7k | if (_PyIsUnicodeEncodeError(exc)) { |
1493 | 9.32k | return _PyCodec_SurrogateEscapeUnicodeEncodeError(exc); |
1494 | 9.32k | } |
1495 | 52.4k | else if (_PyIsUnicodeDecodeError(exc)) { |
1496 | 52.4k | return _PyCodec_SurrogateEscapeUnicodeDecodeError(exc); |
1497 | 52.4k | } |
1498 | 0 | else { |
1499 | 0 | wrong_exception_type(exc); |
1500 | 0 | return NULL; |
1501 | 0 | } |
1502 | 61.7k | } |
1503 | | |
1504 | | |
1505 | | // --- Codecs registry handlers ----------------------------------------------- |
1506 | | |
1507 | | static inline PyObject * |
1508 | | strict_errors(PyObject *Py_UNUSED(self), PyObject *exc) |
1509 | 228k | { |
1510 | 228k | return PyCodec_StrictErrors(exc); |
1511 | 228k | } |
1512 | | |
1513 | | |
1514 | | static inline PyObject * |
1515 | | ignore_errors(PyObject *Py_UNUSED(self), PyObject *exc) |
1516 | 0 | { |
1517 | 0 | return PyCodec_IgnoreErrors(exc); |
1518 | 0 | } |
1519 | | |
1520 | | |
1521 | | static inline PyObject * |
1522 | | replace_errors(PyObject *Py_UNUSED(self), PyObject *exc) |
1523 | 214k | { |
1524 | 214k | return PyCodec_ReplaceErrors(exc); |
1525 | 214k | } |
1526 | | |
1527 | | |
1528 | | static inline PyObject * |
1529 | | xmlcharrefreplace_errors(PyObject *Py_UNUSED(self), PyObject *exc) |
1530 | 0 | { |
1531 | 0 | return PyCodec_XMLCharRefReplaceErrors(exc); |
1532 | 0 | } |
1533 | | |
1534 | | |
1535 | | static inline PyObject * |
1536 | | backslashreplace_errors(PyObject *Py_UNUSED(self), PyObject *exc) |
1537 | 0 | { |
1538 | 0 | return PyCodec_BackslashReplaceErrors(exc); |
1539 | 0 | } |
1540 | | |
1541 | | |
1542 | | static inline PyObject * |
1543 | | namereplace_errors(PyObject *Py_UNUSED(self), PyObject *exc) |
1544 | 0 | { |
1545 | 0 | return PyCodec_NameReplaceErrors(exc); |
1546 | 0 | } |
1547 | | |
1548 | | |
1549 | | static inline PyObject * |
1550 | | surrogatepass_errors(PyObject *Py_UNUSED(self), PyObject *exc) |
1551 | 0 | { |
1552 | 0 | return PyCodec_SurrogatePassErrors(exc); |
1553 | 0 | } |
1554 | | |
1555 | | |
1556 | | static inline PyObject * |
1557 | | surrogateescape_errors(PyObject *Py_UNUSED(self), PyObject *exc) |
1558 | 61.7k | { |
1559 | 61.7k | return PyCodec_SurrogateEscapeErrors(exc); |
1560 | 61.7k | } |
1561 | | |
1562 | | |
1563 | | PyStatus |
1564 | | _PyCodec_InitRegistry(PyInterpreterState *interp) |
1565 | 16 | { |
1566 | 16 | static struct { |
1567 | 16 | const char *name; |
1568 | 16 | PyMethodDef def; |
1569 | 16 | } methods[] = |
1570 | 16 | { |
1571 | 16 | { |
1572 | 16 | "strict", |
1573 | 16 | { |
1574 | 16 | "strict_errors", |
1575 | 16 | strict_errors, |
1576 | 16 | METH_O, |
1577 | 16 | PyDoc_STR("Implements the 'strict' error handling, which " |
1578 | 16 | "raises a UnicodeError on coding errors.") |
1579 | 16 | } |
1580 | 16 | }, |
1581 | 16 | { |
1582 | 16 | "ignore", |
1583 | 16 | { |
1584 | 16 | "ignore_errors", |
1585 | 16 | ignore_errors, |
1586 | 16 | METH_O, |
1587 | 16 | PyDoc_STR("Implements the 'ignore' error handling, which " |
1588 | 16 | "ignores malformed data and continues.") |
1589 | 16 | } |
1590 | 16 | }, |
1591 | 16 | { |
1592 | 16 | "replace", |
1593 | 16 | { |
1594 | 16 | "replace_errors", |
1595 | 16 | replace_errors, |
1596 | 16 | METH_O, |
1597 | 16 | PyDoc_STR("Implements the 'replace' error handling, which " |
1598 | 16 | "replaces malformed data with a replacement marker.") |
1599 | 16 | } |
1600 | 16 | }, |
1601 | 16 | { |
1602 | 16 | "xmlcharrefreplace", |
1603 | 16 | { |
1604 | 16 | "xmlcharrefreplace_errors", |
1605 | 16 | xmlcharrefreplace_errors, |
1606 | 16 | METH_O, |
1607 | 16 | PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, " |
1608 | 16 | "which replaces an unencodable character with the " |
1609 | 16 | "appropriate XML character reference.") |
1610 | 16 | } |
1611 | 16 | }, |
1612 | 16 | { |
1613 | 16 | "backslashreplace", |
1614 | 16 | { |
1615 | 16 | "backslashreplace_errors", |
1616 | 16 | backslashreplace_errors, |
1617 | 16 | METH_O, |
1618 | 16 | PyDoc_STR("Implements the 'backslashreplace' error handling, " |
1619 | 16 | "which replaces malformed data with a backslashed " |
1620 | 16 | "escape sequence.") |
1621 | 16 | } |
1622 | 16 | }, |
1623 | 16 | { |
1624 | 16 | "namereplace", |
1625 | 16 | { |
1626 | 16 | "namereplace_errors", |
1627 | 16 | namereplace_errors, |
1628 | 16 | METH_O, |
1629 | 16 | PyDoc_STR("Implements the 'namereplace' error handling, " |
1630 | 16 | "which replaces an unencodable character with a " |
1631 | 16 | "\\N{...} escape sequence.") |
1632 | 16 | } |
1633 | 16 | }, |
1634 | 16 | { |
1635 | 16 | "surrogatepass", |
1636 | 16 | { |
1637 | 16 | "surrogatepass", |
1638 | 16 | surrogatepass_errors, |
1639 | 16 | METH_O |
1640 | 16 | } |
1641 | 16 | }, |
1642 | 16 | { |
1643 | 16 | "surrogateescape", |
1644 | 16 | { |
1645 | 16 | "surrogateescape", |
1646 | 16 | surrogateescape_errors, |
1647 | 16 | METH_O |
1648 | 16 | } |
1649 | 16 | } |
1650 | 16 | }; |
1651 | | // ensure that the built-in error handlers' names are kept in sync |
1652 | 16 | assert(Py_ARRAY_LENGTH(methods) == Py_ARRAY_LENGTH(codecs_builtin_error_handlers)); |
1653 | | |
1654 | 16 | assert(interp->codecs.initialized == 0); |
1655 | 16 | interp->codecs.search_path = PyList_New(0); |
1656 | 16 | if (interp->codecs.search_path == NULL) { |
1657 | 0 | return PyStatus_NoMemory(); |
1658 | 0 | } |
1659 | 16 | interp->codecs.search_cache = PyDict_New(); |
1660 | 16 | if (interp->codecs.search_cache == NULL) { |
1661 | 0 | return PyStatus_NoMemory(); |
1662 | 0 | } |
1663 | 16 | interp->codecs.error_registry = PyDict_New(); |
1664 | 16 | if (interp->codecs.error_registry == NULL) { |
1665 | 0 | return PyStatus_NoMemory(); |
1666 | 0 | } |
1667 | 144 | for (size_t i = 0; i < Py_ARRAY_LENGTH(methods); ++i) { |
1668 | 128 | PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL); |
1669 | 128 | if (func == NULL) { |
1670 | 0 | return PyStatus_NoMemory(); |
1671 | 0 | } |
1672 | | |
1673 | 128 | int res = PyDict_SetItemString(interp->codecs.error_registry, |
1674 | 128 | methods[i].name, func); |
1675 | 128 | Py_DECREF(func); |
1676 | 128 | if (res < 0) { |
1677 | 0 | return PyStatus_Error("Failed to insert into codec error registry"); |
1678 | 0 | } |
1679 | 128 | } |
1680 | | |
1681 | 16 | interp->codecs.initialized = 1; |
1682 | | |
1683 | | // Importing `encodings' will call back into this module to register codec |
1684 | | // search functions, so this is done after everything else is initialized. |
1685 | 16 | PyObject *mod = PyImport_ImportModule("encodings"); |
1686 | 16 | if (mod == NULL) { |
1687 | 0 | return PyStatus_Error("Failed to import encodings module"); |
1688 | 0 | } |
1689 | 16 | Py_DECREF(mod); |
1690 | | |
1691 | 16 | return PyStatus_Ok(); |
1692 | 16 | } |
1693 | | |
1694 | | void |
1695 | | _PyCodec_Fini(PyInterpreterState *interp) |
1696 | 0 | { |
1697 | 0 | Py_CLEAR(interp->codecs.search_path); |
1698 | 0 | Py_CLEAR(interp->codecs.search_cache); |
1699 | 0 | Py_CLEAR(interp->codecs.error_registry); |
1700 | 0 | interp->codecs.initialized = 0; |
1701 | 0 | } |