/src/cpython/Python/codecs.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* ------------------------------------------------------------------------ |
2 | | |
3 | | Python Codec Registry and support functions |
4 | | |
5 | | Written by Marc-Andre Lemburg (mal@lemburg.com). |
6 | | |
7 | | Copyright (c) Corporation for National Research Initiatives. |
8 | | |
9 | | ------------------------------------------------------------------------ */ |
10 | | |
11 | | #include "Python.h" |
12 | | #include "pycore_call.h" // _PyObject_CallNoArgs() |
13 | | #include "pycore_interp.h" // PyInterpreterState.codec_search_path |
14 | | #include "pycore_pyerrors.h" // _PyErr_FormatNote() |
15 | | #include "pycore_pystate.h" // _PyInterpreterState_GET() |
16 | | #include "pycore_runtime.h" // _Py_ID() |
17 | | #include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI |
18 | | #include "pycore_unicodeobject.h" // _PyUnicode_InternMortal() |
19 | | |
20 | | |
/* Names of the built-in error handlers.  _PyCodec_UnregisterError()
   refuses to remove any name listed here. */
static const char *codecs_builtin_error_handlers[] = {
    "strict",
    "ignore",
    "replace",
    "xmlcharrefreplace",
    "backslashreplace",
    "namereplace",
    "surrogatepass",
    "surrogateescape",
};

/* Lower-case hexadecimal digits, indexed by the value of a nibble. */
const char *Py_hexdigits = "0123456789abcdef";
28 | | |
29 | | /* --- Codec Registry ----------------------------------------------------- */ |
30 | | |
31 | | int PyCodec_Register(PyObject *search_function) |
32 | 16 | { |
33 | 16 | PyInterpreterState *interp = _PyInterpreterState_GET(); |
34 | 16 | assert(interp->codecs.initialized); |
35 | 16 | if (search_function == NULL) { |
36 | 0 | PyErr_BadArgument(); |
37 | 0 | goto onError; |
38 | 0 | } |
39 | 16 | if (!PyCallable_Check(search_function)) { |
40 | 0 | PyErr_SetString(PyExc_TypeError, "argument must be callable"); |
41 | 0 | goto onError; |
42 | 0 | } |
43 | | #ifdef Py_GIL_DISABLED |
44 | | PyMutex_Lock(&interp->codecs.search_path_mutex); |
45 | | #endif |
46 | 16 | int ret = PyList_Append(interp->codecs.search_path, search_function); |
47 | | #ifdef Py_GIL_DISABLED |
48 | | PyMutex_Unlock(&interp->codecs.search_path_mutex); |
49 | | #endif |
50 | 16 | return ret; |
51 | | |
52 | 0 | onError: |
53 | 0 | return -1; |
54 | 16 | } |
55 | | |
56 | | int |
57 | | PyCodec_Unregister(PyObject *search_function) |
58 | 0 | { |
59 | 0 | PyInterpreterState *interp = _PyInterpreterState_GET(); |
60 | 0 | if (interp->codecs.initialized != 1) { |
61 | | /* Do nothing if codecs state was cleared (only possible during |
62 | | interpreter shutdown). */ |
63 | 0 | return 0; |
64 | 0 | } |
65 | | |
66 | 0 | PyObject *codec_search_path = interp->codecs.search_path; |
67 | 0 | assert(PyList_CheckExact(codec_search_path)); |
68 | 0 | for (Py_ssize_t i = 0; i < PyList_GET_SIZE(codec_search_path); i++) { |
69 | | #ifdef Py_GIL_DISABLED |
70 | | PyMutex_Lock(&interp->codecs.search_path_mutex); |
71 | | #endif |
72 | 0 | PyObject *item = PyList_GetItemRef(codec_search_path, i); |
73 | 0 | int ret = 1; |
74 | 0 | if (item == search_function) { |
75 | | // We hold a reference to the item, so its destructor can't run |
76 | | // while we hold search_path_mutex. |
77 | 0 | ret = PyList_SetSlice(codec_search_path, i, i+1, NULL); |
78 | 0 | } |
79 | | #ifdef Py_GIL_DISABLED |
80 | | PyMutex_Unlock(&interp->codecs.search_path_mutex); |
81 | | #endif |
82 | 0 | Py_DECREF(item); |
83 | 0 | if (ret != 1) { |
84 | 0 | assert(interp->codecs.search_cache != NULL); |
85 | 0 | assert(PyDict_CheckExact(interp->codecs.search_cache)); |
86 | 0 | PyDict_Clear(interp->codecs.search_cache); |
87 | 0 | return ret; |
88 | 0 | } |
89 | 0 | } |
90 | 0 | return 0; |
91 | 0 | } |
92 | | |
93 | | extern int _Py_normalize_encoding(const char *, char *, size_t); |
94 | | |
95 | | /* Convert a string to a normalized Python string(decoded from UTF-8): all characters are |
96 | | converted to lower case, spaces and hyphens are replaced with underscores. */ |
97 | | |
98 | | static |
99 | | PyObject *normalizestring(const char *string) |
100 | 966k | { |
101 | 966k | size_t len = strlen(string); |
102 | 966k | char *encoding; |
103 | 966k | PyObject *v; |
104 | | |
105 | 966k | if (len > PY_SSIZE_T_MAX) { |
106 | 0 | PyErr_SetString(PyExc_OverflowError, "string is too large"); |
107 | 0 | return NULL; |
108 | 0 | } |
109 | | |
110 | 966k | encoding = PyMem_Malloc(len + 1); |
111 | 966k | if (encoding == NULL) |
112 | 0 | return PyErr_NoMemory(); |
113 | | |
114 | 966k | if (!_Py_normalize_encoding(string, encoding, len + 1)) |
115 | 0 | { |
116 | 0 | PyErr_SetString(PyExc_RuntimeError, "_Py_normalize_encoding() failed"); |
117 | 0 | PyMem_Free(encoding); |
118 | 0 | return NULL; |
119 | 0 | } |
120 | | |
121 | 966k | v = PyUnicode_FromString(encoding); |
122 | 966k | PyMem_Free(encoding); |
123 | 966k | return v; |
124 | 966k | } |
125 | | |
126 | | /* Lookup the given encoding and return a tuple providing the codec |
127 | | facilities. |
128 | | |
129 | | The encoding string is looked up converted to all lower-case |
130 | | characters. This makes encodings looked up through this mechanism |
131 | | effectively case-insensitive. |
132 | | |
133 | | If no codec is found, a LookupError is set and NULL returned. |
134 | | |
135 | | As side effect, this tries to load the encodings package, if not |
136 | | yet done. This is part of the lazy load strategy for the encodings |
137 | | package. |
138 | | |
139 | | */ |
140 | | |
141 | | PyObject *_PyCodec_Lookup(const char *encoding) |
142 | 966k | { |
143 | 966k | if (encoding == NULL) { |
144 | 0 | PyErr_BadArgument(); |
145 | 0 | return NULL; |
146 | 0 | } |
147 | | |
148 | 966k | PyInterpreterState *interp = _PyInterpreterState_GET(); |
149 | 966k | assert(interp->codecs.initialized); |
150 | | |
151 | | /* Convert the encoding to a normalized Python string: all |
152 | | characters are converted to lower case, spaces and hyphens are |
153 | | replaced with underscores. */ |
154 | 966k | PyObject *v = normalizestring(encoding); |
155 | 966k | if (v == NULL) { |
156 | 0 | return NULL; |
157 | 0 | } |
158 | | |
159 | | /* Intern the string. We'll make it immortal later if lookup succeeds. */ |
160 | 966k | _PyUnicode_InternMortal(interp, &v); |
161 | | |
162 | | /* First, try to lookup the name in the registry dictionary */ |
163 | 966k | PyObject *result; |
164 | 966k | if (PyDict_GetItemRef(interp->codecs.search_cache, v, &result) < 0) { |
165 | 0 | goto onError; |
166 | 0 | } |
167 | 966k | if (result != NULL) { |
168 | 885k | Py_DECREF(v); |
169 | 885k | return result; |
170 | 885k | } |
171 | | |
172 | | /* Next, scan the search functions in order of registration */ |
173 | 81.5k | const Py_ssize_t len = PyList_Size(interp->codecs.search_path); |
174 | 81.5k | if (len < 0) |
175 | 0 | goto onError; |
176 | 81.5k | if (len == 0) { |
177 | 0 | PyErr_SetString(PyExc_LookupError, |
178 | 0 | "no codec search functions registered: " |
179 | 0 | "can't find encoding"); |
180 | 0 | goto onError; |
181 | 0 | } |
182 | | |
183 | 81.5k | Py_ssize_t i; |
184 | 162k | for (i = 0; i < len; i++) { |
185 | 81.5k | PyObject *func; |
186 | | |
187 | 81.5k | func = PyList_GetItemRef(interp->codecs.search_path, i); |
188 | 81.5k | if (func == NULL) |
189 | 0 | goto onError; |
190 | 81.5k | result = PyObject_CallOneArg(func, v); |
191 | 81.5k | Py_DECREF(func); |
192 | 81.5k | if (result == NULL) |
193 | 0 | goto onError; |
194 | 81.5k | if (result == Py_None) { |
195 | 81.0k | Py_CLEAR(result); |
196 | 81.0k | continue; |
197 | 81.0k | } |
198 | 488 | if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) { |
199 | 0 | PyErr_SetString(PyExc_TypeError, |
200 | 0 | "codec search functions must return 4-tuples"); |
201 | 0 | Py_DECREF(result); |
202 | 0 | goto onError; |
203 | 0 | } |
204 | 488 | break; |
205 | 488 | } |
206 | 81.5k | if (result == NULL) { |
207 | | /* XXX Perhaps we should cache misses too ? */ |
208 | 81.0k | PyErr_Format(PyExc_LookupError, |
209 | 81.0k | "unknown encoding: %s", encoding); |
210 | 81.0k | goto onError; |
211 | 81.0k | } |
212 | | |
213 | 488 | _PyUnicode_InternImmortal(interp, &v); |
214 | | |
215 | | /* Cache and return the result */ |
216 | 488 | if (PyDict_SetItem(interp->codecs.search_cache, v, result) < 0) { |
217 | 0 | Py_DECREF(result); |
218 | 0 | goto onError; |
219 | 0 | } |
220 | 488 | Py_DECREF(v); |
221 | 488 | return result; |
222 | | |
223 | 81.0k | onError: |
224 | 81.0k | Py_DECREF(v); |
225 | 81.0k | return NULL; |
226 | 488 | } |
227 | | |
228 | | /* Codec registry encoding check API. */ |
229 | | |
230 | | int PyCodec_KnownEncoding(const char *encoding) |
231 | 0 | { |
232 | 0 | PyObject *codecs; |
233 | |
|
234 | 0 | codecs = _PyCodec_Lookup(encoding); |
235 | 0 | if (!codecs) { |
236 | 0 | PyErr_Clear(); |
237 | 0 | return 0; |
238 | 0 | } |
239 | 0 | else { |
240 | 0 | Py_DECREF(codecs); |
241 | 0 | return 1; |
242 | 0 | } |
243 | 0 | } |
244 | | |
245 | | static |
246 | | PyObject *args_tuple(PyObject *object, |
247 | | const char *errors) |
248 | 883k | { |
249 | 883k | PyObject *args; |
250 | | |
251 | 883k | args = PyTuple_New(1 + (errors != NULL)); |
252 | 883k | if (args == NULL) |
253 | 0 | return NULL; |
254 | 883k | PyTuple_SET_ITEM(args, 0, Py_NewRef(object)); |
255 | 883k | if (errors) { |
256 | 148k | PyObject *v; |
257 | | |
258 | 148k | v = PyUnicode_FromString(errors); |
259 | 148k | if (v == NULL) { |
260 | 0 | Py_DECREF(args); |
261 | 0 | return NULL; |
262 | 0 | } |
263 | 148k | PyTuple_SET_ITEM(args, 1, v); |
264 | 148k | } |
265 | 883k | return args; |
266 | 883k | } |
267 | | |
268 | | /* Helper function to get a codec item */ |
269 | | |
270 | | static |
271 | | PyObject *codec_getitem(const char *encoding, int index) |
272 | 0 | { |
273 | 0 | PyObject *codecs; |
274 | 0 | PyObject *v; |
275 | |
|
276 | 0 | codecs = _PyCodec_Lookup(encoding); |
277 | 0 | if (codecs == NULL) |
278 | 0 | return NULL; |
279 | 0 | v = PyTuple_GET_ITEM(codecs, index); |
280 | 0 | Py_DECREF(codecs); |
281 | 0 | return Py_NewRef(v); |
282 | 0 | } |
283 | | |
284 | | /* Helper functions to create an incremental codec. */ |
285 | | static |
286 | | PyObject *codec_makeincrementalcodec(PyObject *codec_info, |
287 | | const char *errors, |
288 | | const char *attrname) |
289 | 48 | { |
290 | 48 | PyObject *ret, *inccodec; |
291 | | |
292 | 48 | inccodec = PyObject_GetAttrString(codec_info, attrname); |
293 | 48 | if (inccodec == NULL) |
294 | 0 | return NULL; |
295 | 48 | if (errors) |
296 | 48 | ret = PyObject_CallFunction(inccodec, "s", errors); |
297 | 0 | else |
298 | 0 | ret = _PyObject_CallNoArgs(inccodec); |
299 | 48 | Py_DECREF(inccodec); |
300 | 48 | return ret; |
301 | 48 | } |
302 | | |
303 | | static |
304 | | PyObject *codec_getincrementalcodec(const char *encoding, |
305 | | const char *errors, |
306 | | const char *attrname) |
307 | 0 | { |
308 | 0 | PyObject *codec_info, *ret; |
309 | |
|
310 | 0 | codec_info = _PyCodec_Lookup(encoding); |
311 | 0 | if (codec_info == NULL) |
312 | 0 | return NULL; |
313 | 0 | ret = codec_makeincrementalcodec(codec_info, errors, attrname); |
314 | 0 | Py_DECREF(codec_info); |
315 | 0 | return ret; |
316 | 0 | } |
317 | | |
318 | | /* Helper function to create a stream codec. */ |
319 | | |
320 | | static |
321 | | PyObject *codec_getstreamcodec(const char *encoding, |
322 | | PyObject *stream, |
323 | | const char *errors, |
324 | | const int index) |
325 | 0 | { |
326 | 0 | PyObject *codecs, *streamcodec, *codeccls; |
327 | |
|
328 | 0 | codecs = _PyCodec_Lookup(encoding); |
329 | 0 | if (codecs == NULL) |
330 | 0 | return NULL; |
331 | | |
332 | 0 | codeccls = PyTuple_GET_ITEM(codecs, index); |
333 | 0 | if (errors != NULL) |
334 | 0 | streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors); |
335 | 0 | else |
336 | 0 | streamcodec = PyObject_CallOneArg(codeccls, stream); |
337 | 0 | Py_DECREF(codecs); |
338 | 0 | return streamcodec; |
339 | 0 | } |
340 | | |
341 | | /* Helpers to work with the result of _PyCodec_Lookup |
342 | | |
343 | | */ |
344 | | PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info, |
345 | | const char *errors) |
346 | 16 | { |
347 | 16 | return codec_makeincrementalcodec(codec_info, errors, |
348 | 16 | "incrementaldecoder"); |
349 | 16 | } |
350 | | |
351 | | PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info, |
352 | | const char *errors) |
353 | 32 | { |
354 | 32 | return codec_makeincrementalcodec(codec_info, errors, |
355 | 32 | "incrementalencoder"); |
356 | 32 | } |
357 | | |
358 | | |
359 | | /* Convenience APIs to query the Codec registry. |
360 | | |
361 | | All APIs return a codec object with incremented refcount. |
362 | | |
363 | | */ |
364 | | |
365 | | PyObject *PyCodec_Encoder(const char *encoding) |
366 | 0 | { |
367 | 0 | return codec_getitem(encoding, 0); |
368 | 0 | } |
369 | | |
370 | | PyObject *PyCodec_Decoder(const char *encoding) |
371 | 0 | { |
372 | 0 | return codec_getitem(encoding, 1); |
373 | 0 | } |
374 | | |
375 | | PyObject *PyCodec_IncrementalEncoder(const char *encoding, |
376 | | const char *errors) |
377 | 0 | { |
378 | 0 | return codec_getincrementalcodec(encoding, errors, "incrementalencoder"); |
379 | 0 | } |
380 | | |
381 | | PyObject *PyCodec_IncrementalDecoder(const char *encoding, |
382 | | const char *errors) |
383 | 0 | { |
384 | 0 | return codec_getincrementalcodec(encoding, errors, "incrementaldecoder"); |
385 | 0 | } |
386 | | |
387 | | PyObject *PyCodec_StreamReader(const char *encoding, |
388 | | PyObject *stream, |
389 | | const char *errors) |
390 | 0 | { |
391 | 0 | return codec_getstreamcodec(encoding, stream, errors, 2); |
392 | 0 | } |
393 | | |
394 | | PyObject *PyCodec_StreamWriter(const char *encoding, |
395 | | PyObject *stream, |
396 | | const char *errors) |
397 | 0 | { |
398 | 0 | return codec_getstreamcodec(encoding, stream, errors, 3); |
399 | 0 | } |
400 | | |
401 | | /* Encode an object (e.g. a Unicode object) using the given encoding |
402 | | and return the resulting encoded object (usually a Python string). |
403 | | |
404 | | errors is passed to the encoder factory as argument if non-NULL. */ |
405 | | |
406 | | static PyObject * |
407 | | _PyCodec_EncodeInternal(PyObject *object, |
408 | | PyObject *encoder, |
409 | | const char *encoding, |
410 | | const char *errors) |
411 | 670k | { |
412 | 670k | PyObject *args = NULL, *result = NULL; |
413 | 670k | PyObject *v = NULL; |
414 | | |
415 | 670k | args = args_tuple(object, errors); |
416 | 670k | if (args == NULL) |
417 | 0 | goto onError; |
418 | | |
419 | 670k | result = PyObject_Call(encoder, args, NULL); |
420 | 670k | if (result == NULL) { |
421 | 0 | _PyErr_FormatNote("%s with '%s' codec failed", "encoding", encoding); |
422 | 0 | goto onError; |
423 | 0 | } |
424 | | |
425 | 670k | if (!PyTuple_Check(result) || |
426 | 670k | PyTuple_GET_SIZE(result) != 2) { |
427 | 0 | PyErr_SetString(PyExc_TypeError, |
428 | 0 | "encoder must return a tuple (object, integer)"); |
429 | 0 | goto onError; |
430 | 0 | } |
431 | 670k | v = Py_NewRef(PyTuple_GET_ITEM(result,0)); |
432 | | /* We don't check or use the second (integer) entry. */ |
433 | | |
434 | 670k | Py_DECREF(args); |
435 | 670k | Py_DECREF(encoder); |
436 | 670k | Py_DECREF(result); |
437 | 670k | return v; |
438 | | |
439 | 0 | onError: |
440 | 0 | Py_XDECREF(result); |
441 | 0 | Py_XDECREF(args); |
442 | 0 | Py_XDECREF(encoder); |
443 | 0 | return NULL; |
444 | 670k | } |
445 | | |
446 | | /* Decode an object (usually a Python string) using the given encoding |
447 | | and return an equivalent object (e.g. a Unicode object). |
448 | | |
449 | | errors is passed to the decoder factory as argument if non-NULL. */ |
450 | | |
451 | | static PyObject * |
452 | | _PyCodec_DecodeInternal(PyObject *object, |
453 | | PyObject *decoder, |
454 | | const char *encoding, |
455 | | const char *errors) |
456 | 212k | { |
457 | 212k | PyObject *args = NULL, *result = NULL; |
458 | 212k | PyObject *v; |
459 | | |
460 | 212k | args = args_tuple(object, errors); |
461 | 212k | if (args == NULL) |
462 | 0 | goto onError; |
463 | | |
464 | 212k | result = PyObject_Call(decoder, args, NULL); |
465 | 212k | if (result == NULL) { |
466 | 46.2k | _PyErr_FormatNote("%s with '%s' codec failed", "decoding", encoding); |
467 | 46.2k | goto onError; |
468 | 46.2k | } |
469 | 166k | if (!PyTuple_Check(result) || |
470 | 166k | PyTuple_GET_SIZE(result) != 2) { |
471 | 0 | PyErr_SetString(PyExc_TypeError, |
472 | 0 | "decoder must return a tuple (object,integer)"); |
473 | 0 | goto onError; |
474 | 0 | } |
475 | 166k | v = Py_NewRef(PyTuple_GET_ITEM(result,0)); |
476 | | /* We don't check or use the second (integer) entry. */ |
477 | | |
478 | 166k | Py_DECREF(args); |
479 | 166k | Py_DECREF(decoder); |
480 | 166k | Py_DECREF(result); |
481 | 166k | return v; |
482 | | |
483 | 46.2k | onError: |
484 | 46.2k | Py_XDECREF(args); |
485 | 46.2k | Py_XDECREF(decoder); |
486 | 46.2k | Py_XDECREF(result); |
487 | 46.2k | return NULL; |
488 | 166k | } |
489 | | |
490 | | /* Generic encoding/decoding API */ |
491 | | PyObject *PyCodec_Encode(PyObject *object, |
492 | | const char *encoding, |
493 | | const char *errors) |
494 | 0 | { |
495 | 0 | PyObject *encoder; |
496 | |
|
497 | 0 | encoder = PyCodec_Encoder(encoding); |
498 | 0 | if (encoder == NULL) |
499 | 0 | return NULL; |
500 | | |
501 | 0 | return _PyCodec_EncodeInternal(object, encoder, encoding, errors); |
502 | 0 | } |
503 | | |
504 | | PyObject *PyCodec_Decode(PyObject *object, |
505 | | const char *encoding, |
506 | | const char *errors) |
507 | 0 | { |
508 | 0 | PyObject *decoder; |
509 | |
|
510 | 0 | decoder = PyCodec_Decoder(encoding); |
511 | 0 | if (decoder == NULL) |
512 | 0 | return NULL; |
513 | | |
514 | 0 | return _PyCodec_DecodeInternal(object, decoder, encoding, errors); |
515 | 0 | } |
516 | | |
517 | | /* Text encoding/decoding API */ |
518 | | PyObject * _PyCodec_LookupTextEncoding(const char *encoding, |
519 | | const char *alternate_command) |
520 | 966k | { |
521 | 966k | PyObject *codec; |
522 | 966k | PyObject *attr; |
523 | 966k | int is_text_codec; |
524 | | |
525 | 966k | codec = _PyCodec_Lookup(encoding); |
526 | 966k | if (codec == NULL) |
527 | 81.0k | return NULL; |
528 | | |
529 | | /* Backwards compatibility: assume any raw tuple describes a text |
530 | | * encoding, and the same for anything lacking the private |
531 | | * attribute. |
532 | | */ |
533 | 885k | if (!PyTuple_CheckExact(codec)) { |
534 | 885k | if (PyObject_GetOptionalAttr(codec, &_Py_ID(_is_text_encoding), &attr) < 0) { |
535 | 0 | Py_DECREF(codec); |
536 | 0 | return NULL; |
537 | 0 | } |
538 | 885k | if (attr != NULL) { |
539 | 885k | is_text_codec = PyObject_IsTrue(attr); |
540 | 885k | Py_DECREF(attr); |
541 | 885k | if (is_text_codec <= 0) { |
542 | 2.52k | Py_DECREF(codec); |
543 | 2.52k | if (!is_text_codec) { |
544 | 2.52k | if (alternate_command != NULL) { |
545 | 2.52k | PyErr_Format(PyExc_LookupError, |
546 | 2.52k | "'%.400s' is not a text encoding; " |
547 | 2.52k | "use %s to handle arbitrary codecs", |
548 | 2.52k | encoding, alternate_command); |
549 | 2.52k | } |
550 | 0 | else { |
551 | 0 | PyErr_Format(PyExc_LookupError, |
552 | 0 | "'%.400s' is not a text encoding", |
553 | 0 | encoding); |
554 | 0 | } |
555 | 2.52k | } |
556 | 2.52k | return NULL; |
557 | 2.52k | } |
558 | 885k | } |
559 | 885k | } |
560 | | |
561 | | /* This appears to be a valid text encoding */ |
562 | 883k | return codec; |
563 | 885k | } |
564 | | |
565 | | |
566 | | static |
567 | | PyObject *codec_getitem_checked(const char *encoding, |
568 | | const char *alternate_command, |
569 | | int index) |
570 | 966k | { |
571 | 966k | PyObject *codec; |
572 | 966k | PyObject *v; |
573 | | |
574 | 966k | codec = _PyCodec_LookupTextEncoding(encoding, alternate_command); |
575 | 966k | if (codec == NULL) |
576 | 83.6k | return NULL; |
577 | | |
578 | 883k | v = Py_NewRef(PyTuple_GET_ITEM(codec, index)); |
579 | 883k | Py_DECREF(codec); |
580 | 883k | return v; |
581 | 966k | } |
582 | | |
583 | | static PyObject * _PyCodec_TextEncoder(const char *encoding) |
584 | 670k | { |
585 | 670k | return codec_getitem_checked(encoding, "codecs.encode()", 0); |
586 | 670k | } |
587 | | |
588 | | static PyObject * _PyCodec_TextDecoder(const char *encoding) |
589 | 296k | { |
590 | 296k | return codec_getitem_checked(encoding, "codecs.decode()", 1); |
591 | 296k | } |
592 | | |
593 | | PyObject *_PyCodec_EncodeText(PyObject *object, |
594 | | const char *encoding, |
595 | | const char *errors) |
596 | 670k | { |
597 | 670k | PyObject *encoder; |
598 | | |
599 | 670k | encoder = _PyCodec_TextEncoder(encoding); |
600 | 670k | if (encoder == NULL) |
601 | 0 | return NULL; |
602 | | |
603 | 670k | return _PyCodec_EncodeInternal(object, encoder, encoding, errors); |
604 | 670k | } |
605 | | |
606 | | PyObject *_PyCodec_DecodeText(PyObject *object, |
607 | | const char *encoding, |
608 | | const char *errors) |
609 | 296k | { |
610 | 296k | PyObject *decoder; |
611 | | |
612 | 296k | decoder = _PyCodec_TextDecoder(encoding); |
613 | 296k | if (decoder == NULL) |
614 | 83.6k | return NULL; |
615 | | |
616 | 212k | return _PyCodec_DecodeInternal(object, decoder, encoding, errors); |
617 | 296k | } |
618 | | |
619 | | /* Register the error handling callback function error under the name |
620 | | name. This function will be called by the codec when it encounters |
621 | | an unencodable characters/undecodable bytes and doesn't know the |
622 | | callback name, when name is specified as the error parameter |
623 | | in the call to the encode/decode function. |
624 | | Return 0 on success, -1 on error */ |
625 | | int PyCodec_RegisterError(const char *name, PyObject *error) |
626 | 0 | { |
627 | 0 | PyInterpreterState *interp = _PyInterpreterState_GET(); |
628 | 0 | assert(interp->codecs.initialized); |
629 | 0 | if (!PyCallable_Check(error)) { |
630 | 0 | PyErr_SetString(PyExc_TypeError, "handler must be callable"); |
631 | 0 | return -1; |
632 | 0 | } |
633 | 0 | return PyDict_SetItemString(interp->codecs.error_registry, |
634 | 0 | name, error); |
635 | 0 | } |
636 | | |
637 | | int _PyCodec_UnregisterError(const char *name) |
638 | 0 | { |
639 | 0 | for (size_t i = 0; i < Py_ARRAY_LENGTH(codecs_builtin_error_handlers); ++i) { |
640 | 0 | if (strcmp(name, codecs_builtin_error_handlers[i]) == 0) { |
641 | 0 | PyErr_Format(PyExc_ValueError, |
642 | 0 | "cannot un-register built-in error handler '%s'", name); |
643 | 0 | return -1; |
644 | 0 | } |
645 | 0 | } |
646 | 0 | PyInterpreterState *interp = _PyInterpreterState_GET(); |
647 | 0 | assert(interp->codecs.initialized); |
648 | 0 | return PyDict_PopString(interp->codecs.error_registry, name, NULL); |
649 | 0 | } |
650 | | |
651 | | /* Lookup the error handling callback function registered under the |
652 | | name error. As a special case NULL can be passed, in which case |
653 | | the error handling callback for strict encoding will be returned. */ |
654 | | PyObject *PyCodec_LookupError(const char *name) |
655 | 203k | { |
656 | 203k | PyInterpreterState *interp = _PyInterpreterState_GET(); |
657 | 203k | assert(interp->codecs.initialized); |
658 | | |
659 | 203k | if (name==NULL) |
660 | 139k | name = "strict"; |
661 | 203k | PyObject *handler; |
662 | 203k | if (PyDict_GetItemStringRef(interp->codecs.error_registry, name, &handler) < 0) { |
663 | 0 | return NULL; |
664 | 0 | } |
665 | 203k | if (handler == NULL) { |
666 | 0 | PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name); |
667 | 0 | return NULL; |
668 | 0 | } |
669 | 203k | return handler; |
670 | 203k | } |
671 | | |
672 | | |
673 | | static inline void |
674 | | wrong_exception_type(PyObject *exc) |
675 | 0 | { |
676 | 0 | PyErr_Format(PyExc_TypeError, |
677 | 0 | "don't know how to handle %T in error callback", exc); |
678 | 0 | } |
679 | | |
680 | | |
/* Type tests (via PyObject_TypeCheck()) used by the error handlers to
   dispatch on the kind of Unicode error they were given. */
#define _PyIsUnicodeEncodeError(EXC)                                        \
    (PyObject_TypeCheck((EXC), (PyTypeObject *)PyExc_UnicodeEncodeError))
#define _PyIsUnicodeDecodeError(EXC)                                        \
    (PyObject_TypeCheck((EXC), (PyTypeObject *)PyExc_UnicodeDecodeError))
#define _PyIsUnicodeTranslateError(EXC)                                     \
    (PyObject_TypeCheck((EXC), (PyTypeObject *)PyExc_UnicodeTranslateError))
687 | | |
688 | | |
689 | | // --- codecs handlers: utilities --------------------------------------------- |
690 | | |
691 | | /* |
692 | | * Return the number of characters (including special prefixes) |
693 | | * needed to represent 'ch' by codec_handler_write_unicode_hex(). |
694 | | */ |
695 | | static inline Py_ssize_t |
696 | | codec_handler_unicode_hex_width(Py_UCS4 ch) |
697 | 0 | { |
698 | 0 | if (ch >= 0x10000) { |
699 | | // format: '\\' + 'U' + 8 hex digits |
700 | 0 | return 1 + 1 + 8; |
701 | 0 | } |
702 | 0 | else if (ch >= 0x100) { |
703 | | // format: '\\' + 'u' + 4 hex digits |
704 | 0 | return 1 + 1 + 4; |
705 | 0 | } |
706 | 0 | else { |
707 | | // format: '\\' + 'x' + 2 hex digits |
708 | 0 | return 1 + 1 + 2; |
709 | 0 | } |
710 | 0 | } |
711 | | |
712 | | |
713 | | /* |
714 | | * Write the hexadecimal representation of 'ch' to the buffer pointed by 'p' |
715 | | * using 2, 4, or 8 characters prefixed by '\x', '\u', or '\U' respectively. |
716 | | */ |
717 | | static inline void |
718 | | codec_handler_write_unicode_hex(Py_UCS1 **p, Py_UCS4 ch) |
719 | 0 | { |
720 | 0 | *(*p)++ = '\\'; |
721 | 0 | if (ch >= 0x10000) { |
722 | 0 | *(*p)++ = 'U'; |
723 | 0 | *(*p)++ = Py_hexdigits[(ch >> 28) & 0xf]; |
724 | 0 | *(*p)++ = Py_hexdigits[(ch >> 24) & 0xf]; |
725 | 0 | *(*p)++ = Py_hexdigits[(ch >> 20) & 0xf]; |
726 | 0 | *(*p)++ = Py_hexdigits[(ch >> 16) & 0xf]; |
727 | 0 | *(*p)++ = Py_hexdigits[(ch >> 12) & 0xf]; |
728 | 0 | *(*p)++ = Py_hexdigits[(ch >> 8) & 0xf]; |
729 | 0 | } |
730 | 0 | else if (ch >= 0x100) { |
731 | 0 | *(*p)++ = 'u'; |
732 | 0 | *(*p)++ = Py_hexdigits[(ch >> 12) & 0xf]; |
733 | 0 | *(*p)++ = Py_hexdigits[(ch >> 8) & 0xf]; |
734 | 0 | } |
735 | 0 | else { |
736 | 0 | *(*p)++ = 'x'; |
737 | 0 | } |
738 | 0 | *(*p)++ = Py_hexdigits[(ch >> 4) & 0xf]; |
739 | 0 | *(*p)++ = Py_hexdigits[ch & 0xf]; |
740 | 0 | } |
741 | | |
742 | | |
743 | | /* |
744 | | * Determine the number of digits for a decimal representation of Unicode |
745 | | * codepoint 'ch' (by design, Unicode codepoints are limited to 7 digits). |
746 | | */ |
747 | | static inline int |
748 | | n_decimal_digits_for_codepoint(Py_UCS4 ch) |
749 | 0 | { |
750 | 0 | if (ch < 10) return 1; |
751 | 0 | if (ch < 100) return 2; |
752 | 0 | if (ch < 1000) return 3; |
753 | 0 | if (ch < 10000) return 4; |
754 | 0 | if (ch < 100000) return 5; |
755 | 0 | if (ch < 1000000) return 6; |
756 | 0 | if (ch < 10000000) return 7; |
757 | | // Unicode codepoints are limited to 1114111 (7 decimal digits) |
758 | 0 | Py_UNREACHABLE(); |
759 | 0 | } |
760 | | |
761 | | |
762 | | /* |
763 | | * Create a Unicode string containing 'count' copies of the official |
764 | | * Unicode REPLACEMENT CHARACTER (0xFFFD). |
765 | | */ |
766 | | static PyObject * |
767 | | codec_handler_unicode_replacement_character(Py_ssize_t count) |
768 | 194k | { |
769 | 194k | PyObject *res = PyUnicode_New(count, Py_UNICODE_REPLACEMENT_CHARACTER); |
770 | 194k | if (res == NULL) { |
771 | 0 | return NULL; |
772 | 0 | } |
773 | 194k | assert(count == 0 || PyUnicode_KIND(res) == PyUnicode_2BYTE_KIND); |
774 | 194k | Py_UCS2 *outp = PyUnicode_2BYTE_DATA(res); |
775 | 388k | for (Py_ssize_t i = 0; i < count; ++i) { |
776 | 194k | outp[i] = Py_UNICODE_REPLACEMENT_CHARACTER; |
777 | 194k | } |
778 | 194k | assert(_PyUnicode_CheckConsistency(res, 1)); |
779 | 194k | return res; |
780 | 194k | } |
781 | | |
782 | | |
783 | | // --- handler: 'strict' ------------------------------------------------------ |
784 | | |
785 | | PyObject *PyCodec_StrictErrors(PyObject *exc) |
786 | 198k | { |
787 | 198k | if (PyExceptionInstance_Check(exc)) { |
788 | 198k | PyErr_SetObject(PyExceptionInstance_Class(exc), exc); |
789 | 198k | } |
790 | 0 | else { |
791 | 0 | PyErr_SetString(PyExc_TypeError, "codec must pass exception instance"); |
792 | 0 | } |
793 | 198k | return NULL; |
794 | 198k | } |
795 | | |
796 | | |
797 | | // --- handler: 'ignore' ------------------------------------------------------ |
798 | | |
799 | | static PyObject * |
800 | | _PyCodec_IgnoreError(PyObject *exc, int as_bytes) |
801 | 0 | { |
802 | 0 | Py_ssize_t end; |
803 | 0 | if (_PyUnicodeError_GetParams(exc, NULL, NULL, NULL, |
804 | 0 | &end, NULL, as_bytes) < 0) |
805 | 0 | { |
806 | 0 | return NULL; |
807 | 0 | } |
808 | 0 | return Py_BuildValue("(Nn)", Py_GetConstant(Py_CONSTANT_EMPTY_STR), end); |
809 | 0 | } |
810 | | |
811 | | |
812 | | PyObject *PyCodec_IgnoreErrors(PyObject *exc) |
813 | 0 | { |
814 | 0 | if (_PyIsUnicodeEncodeError(exc) || _PyIsUnicodeTranslateError(exc)) { |
815 | 0 | return _PyCodec_IgnoreError(exc, false); |
816 | 0 | } |
817 | 0 | else if (_PyIsUnicodeDecodeError(exc)) { |
818 | 0 | return _PyCodec_IgnoreError(exc, true); |
819 | 0 | } |
820 | 0 | else { |
821 | 0 | wrong_exception_type(exc); |
822 | 0 | return NULL; |
823 | 0 | } |
824 | 0 | } |
825 | | |
826 | | |
827 | | // --- handler: 'replace' ----------------------------------------------------- |
828 | | |
829 | | static PyObject * |
830 | | _PyCodec_ReplaceUnicodeEncodeError(PyObject *exc) |
831 | 0 | { |
832 | 0 | Py_ssize_t start, end, slen; |
833 | 0 | if (_PyUnicodeError_GetParams(exc, NULL, NULL, |
834 | 0 | &start, &end, &slen, false) < 0) |
835 | 0 | { |
836 | 0 | return NULL; |
837 | 0 | } |
838 | 0 | PyObject *res = PyUnicode_New(slen, '?'); |
839 | 0 | if (res == NULL) { |
840 | 0 | return NULL; |
841 | 0 | } |
842 | 0 | assert(PyUnicode_KIND(res) == PyUnicode_1BYTE_KIND); |
843 | 0 | Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res); |
844 | 0 | memset(outp, '?', sizeof(Py_UCS1) * slen); |
845 | 0 | assert(_PyUnicode_CheckConsistency(res, 1)); |
846 | 0 | return Py_BuildValue("(Nn)", res, end); |
847 | 0 | } |
848 | | |
849 | | |
850 | | static PyObject * |
851 | | _PyCodec_ReplaceUnicodeDecodeError(PyObject *exc) |
852 | 194k | { |
853 | 194k | Py_ssize_t end; |
854 | 194k | if (PyUnicodeDecodeError_GetEnd(exc, &end) < 0) { |
855 | 0 | return NULL; |
856 | 0 | } |
857 | 194k | PyObject *res = codec_handler_unicode_replacement_character(1); |
858 | 194k | if (res == NULL) { |
859 | 0 | return NULL; |
860 | 0 | } |
861 | 194k | return Py_BuildValue("(Nn)", res, end); |
862 | 194k | } |
863 | | |
864 | | |
865 | | static PyObject * |
866 | | _PyCodec_ReplaceUnicodeTranslateError(PyObject *exc) |
867 | 0 | { |
868 | 0 | Py_ssize_t start, end, slen; |
869 | 0 | if (_PyUnicodeError_GetParams(exc, NULL, NULL, |
870 | 0 | &start, &end, &slen, false) < 0) |
871 | 0 | { |
872 | 0 | return NULL; |
873 | 0 | } |
874 | 0 | PyObject *res = codec_handler_unicode_replacement_character(slen); |
875 | 0 | if (res == NULL) { |
876 | 0 | return NULL; |
877 | 0 | } |
878 | 0 | return Py_BuildValue("(Nn)", res, end); |
879 | 0 | } |
880 | | |
881 | | |
882 | | PyObject *PyCodec_ReplaceErrors(PyObject *exc) |
883 | 194k | { |
884 | 194k | if (_PyIsUnicodeEncodeError(exc)) { |
885 | 0 | return _PyCodec_ReplaceUnicodeEncodeError(exc); |
886 | 0 | } |
887 | 194k | else if (_PyIsUnicodeDecodeError(exc)) { |
888 | 194k | return _PyCodec_ReplaceUnicodeDecodeError(exc); |
889 | 194k | } |
890 | 0 | else if (_PyIsUnicodeTranslateError(exc)) { |
891 | 0 | return _PyCodec_ReplaceUnicodeTranslateError(exc); |
892 | 0 | } |
893 | 0 | else { |
894 | 0 | wrong_exception_type(exc); |
895 | 0 | return NULL; |
896 | 0 | } |
897 | 194k | } |
898 | | |
899 | | |
900 | | // --- handler: 'xmlcharrefreplace' ------------------------------------------- |
901 | | |
902 | | PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc) |
903 | 0 | { |
904 | 0 | if (!_PyIsUnicodeEncodeError(exc)) { |
905 | 0 | wrong_exception_type(exc); |
906 | 0 | return NULL; |
907 | 0 | } |
908 | | |
909 | 0 | PyObject *obj; |
910 | 0 | Py_ssize_t objlen, start, end, slen; |
911 | 0 | if (_PyUnicodeError_GetParams(exc, |
912 | 0 | &obj, &objlen, |
913 | 0 | &start, &end, &slen, false) < 0) |
914 | 0 | { |
915 | 0 | return NULL; |
916 | 0 | } |
917 | | |
918 | | // The number of characters that each character 'ch' contributes |
919 | | // in the result is 2 + k + 1, where k = min{t >= 1 | 10^t > ch} |
920 | | // and will be formatted as "&#" + DIGITS + ";". Since the Unicode |
921 | | // range is below 10^7, each "block" requires at most 2 + 7 + 1 |
922 | | // characters. |
923 | 0 | if (slen > PY_SSIZE_T_MAX / (2 + 7 + 1)) { |
924 | 0 | end = start + PY_SSIZE_T_MAX / (2 + 7 + 1); |
925 | 0 | end = Py_MIN(end, objlen); |
926 | 0 | slen = Py_MAX(0, end - start); |
927 | 0 | } |
928 | |
|
929 | 0 | Py_ssize_t ressize = 0; |
930 | 0 | for (Py_ssize_t i = start; i < end; ++i) { |
931 | 0 | Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i); |
932 | 0 | int k = n_decimal_digits_for_codepoint(ch); |
933 | 0 | assert(k != 0); |
934 | 0 | assert(k <= 7); |
935 | 0 | ressize += 2 + k + 1; |
936 | 0 | } |
937 | | |
938 | | /* allocate replacement */ |
939 | 0 | PyObject *res = PyUnicode_New(ressize, 127); |
940 | 0 | if (res == NULL) { |
941 | 0 | Py_DECREF(obj); |
942 | 0 | return NULL; |
943 | 0 | } |
944 | 0 | Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res); |
945 | | /* generate replacement */ |
946 | 0 | for (Py_ssize_t i = start; i < end; ++i) { |
947 | 0 | Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i); |
948 | | /* |
949 | | * Write the decimal representation of 'ch' to the buffer pointed by 'p' |
950 | | * using at most 7 characters prefixed by '&#' and suffixed by ';'. |
951 | | */ |
952 | 0 | *outp++ = '&'; |
953 | 0 | *outp++ = '#'; |
954 | 0 | Py_UCS1 *digit_end = outp + n_decimal_digits_for_codepoint(ch); |
955 | 0 | for (Py_UCS1 *p_digit = digit_end - 1; p_digit >= outp; --p_digit) { |
956 | 0 | *p_digit = '0' + (ch % 10); |
957 | 0 | ch /= 10; |
958 | 0 | } |
959 | 0 | assert(ch == 0); |
960 | 0 | outp = digit_end; |
961 | 0 | *outp++ = ';'; |
962 | 0 | } |
963 | 0 | assert(_PyUnicode_CheckConsistency(res, 1)); |
964 | 0 | PyObject *restuple = Py_BuildValue("(Nn)", res, end); |
965 | 0 | Py_DECREF(obj); |
966 | 0 | return restuple; |
967 | 0 | } |
968 | | |
969 | | |
970 | | // --- handler: 'backslashreplace' -------------------------------------------- |
971 | | |
972 | | static PyObject * |
973 | | _PyCodec_BackslashReplaceUnicodeEncodeError(PyObject *exc) |
974 | 0 | { |
975 | 0 | PyObject *obj; |
976 | 0 | Py_ssize_t objlen, start, end, slen; |
977 | 0 | if (_PyUnicodeError_GetParams(exc, |
978 | 0 | &obj, &objlen, |
979 | 0 | &start, &end, &slen, false) < 0) |
980 | 0 | { |
981 | 0 | return NULL; |
982 | 0 | } |
983 | | |
984 | | // The number of characters that each character 'ch' contributes |
985 | | // in the result is 1 + 1 + k, where k >= min{t >= 1 | 16^t > ch} |
986 | | // and will be formatted as "\\" + ('U'|'u'|'x') + HEXDIGITS, |
987 | | // where the number of hexdigits is either 2, 4, or 8 (not 6). |
988 | | // Since the Unicode range is below 10^7, we choose k = 8 whence |
989 | | // each "block" requires at most 1 + 1 + 8 characters. |
990 | 0 | if (slen > PY_SSIZE_T_MAX / (1 + 1 + 8)) { |
991 | 0 | end = start + PY_SSIZE_T_MAX / (1 + 1 + 8); |
992 | 0 | end = Py_MIN(end, objlen); |
993 | 0 | slen = Py_MAX(0, end - start); |
994 | 0 | } |
995 | |
|
996 | 0 | Py_ssize_t ressize = 0; |
997 | 0 | for (Py_ssize_t i = start; i < end; ++i) { |
998 | 0 | Py_UCS4 c = PyUnicode_READ_CHAR(obj, i); |
999 | 0 | ressize += codec_handler_unicode_hex_width(c); |
1000 | 0 | } |
1001 | 0 | PyObject *res = PyUnicode_New(ressize, 127); |
1002 | 0 | if (res == NULL) { |
1003 | 0 | Py_DECREF(obj); |
1004 | 0 | return NULL; |
1005 | 0 | } |
1006 | 0 | Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res); |
1007 | 0 | for (Py_ssize_t i = start; i < end; ++i) { |
1008 | 0 | Py_UCS4 c = PyUnicode_READ_CHAR(obj, i); |
1009 | 0 | codec_handler_write_unicode_hex(&outp, c); |
1010 | 0 | } |
1011 | 0 | assert(_PyUnicode_CheckConsistency(res, 1)); |
1012 | 0 | Py_DECREF(obj); |
1013 | 0 | return Py_BuildValue("(Nn)", res, end); |
1014 | 0 | } |
1015 | | |
1016 | | |
1017 | | static PyObject * |
1018 | | _PyCodec_BackslashReplaceUnicodeDecodeError(PyObject *exc) |
1019 | 0 | { |
1020 | 0 | PyObject *obj; |
1021 | 0 | Py_ssize_t objlen, start, end, slen; |
1022 | 0 | if (_PyUnicodeError_GetParams(exc, |
1023 | 0 | &obj, &objlen, |
1024 | 0 | &start, &end, &slen, true) < 0) |
1025 | 0 | { |
1026 | 0 | return NULL; |
1027 | 0 | } |
1028 | | |
1029 | 0 | PyObject *res = PyUnicode_New(4 * slen, 127); |
1030 | 0 | if (res == NULL) { |
1031 | 0 | Py_DECREF(obj); |
1032 | 0 | return NULL; |
1033 | 0 | } |
1034 | | |
1035 | 0 | Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res); |
1036 | 0 | const unsigned char *p = (const unsigned char *)PyBytes_AS_STRING(obj); |
1037 | 0 | for (Py_ssize_t i = start; i < end; i++, outp += 4) { |
1038 | 0 | const unsigned char ch = p[i]; |
1039 | 0 | outp[0] = '\\'; |
1040 | 0 | outp[1] = 'x'; |
1041 | 0 | outp[2] = Py_hexdigits[(ch >> 4) & 0xf]; |
1042 | 0 | outp[3] = Py_hexdigits[ch & 0xf]; |
1043 | 0 | } |
1044 | 0 | assert(_PyUnicode_CheckConsistency(res, 1)); |
1045 | 0 | Py_DECREF(obj); |
1046 | 0 | return Py_BuildValue("(Nn)", res, end); |
1047 | 0 | } |
1048 | | |
1049 | | |
1050 | | static inline PyObject * |
1051 | | _PyCodec_BackslashReplaceUnicodeTranslateError(PyObject *exc) |
1052 | 0 | { |
1053 | | // Same implementation as for UnicodeEncodeError objects. |
1054 | 0 | return _PyCodec_BackslashReplaceUnicodeEncodeError(exc); |
1055 | 0 | } |
1056 | | |
1057 | | |
1058 | | PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) |
1059 | 0 | { |
1060 | 0 | if (_PyIsUnicodeEncodeError(exc)) { |
1061 | 0 | return _PyCodec_BackslashReplaceUnicodeEncodeError(exc); |
1062 | 0 | } |
1063 | 0 | else if (_PyIsUnicodeDecodeError(exc)) { |
1064 | 0 | return _PyCodec_BackslashReplaceUnicodeDecodeError(exc); |
1065 | 0 | } |
1066 | 0 | else if (_PyIsUnicodeTranslateError(exc)) { |
1067 | 0 | return _PyCodec_BackslashReplaceUnicodeTranslateError(exc); |
1068 | 0 | } |
1069 | 0 | else { |
1070 | 0 | wrong_exception_type(exc); |
1071 | 0 | return NULL; |
1072 | 0 | } |
1073 | 0 | } |
1074 | | |
1075 | | |
1076 | | // --- handler: 'namereplace' ------------------------------------------------- |
1077 | | |
1078 | | PyObject *PyCodec_NameReplaceErrors(PyObject *exc) |
1079 | 0 | { |
1080 | 0 | if (!_PyIsUnicodeEncodeError(exc)) { |
1081 | 0 | wrong_exception_type(exc); |
1082 | 0 | return NULL; |
1083 | 0 | } |
1084 | | |
1085 | 0 | _PyUnicode_Name_CAPI *ucnhash_capi = _PyUnicode_GetNameCAPI(); |
1086 | 0 | if (ucnhash_capi == NULL) { |
1087 | 0 | return NULL; |
1088 | 0 | } |
1089 | | |
1090 | 0 | PyObject *obj; |
1091 | 0 | Py_ssize_t start, end; |
1092 | 0 | if (_PyUnicodeError_GetParams(exc, |
1093 | 0 | &obj, NULL, |
1094 | 0 | &start, &end, NULL, false) < 0) |
1095 | 0 | { |
1096 | 0 | return NULL; |
1097 | 0 | } |
1098 | | |
1099 | 0 | char buffer[256]; /* NAME_MAXLEN in unicodename_db.h */ |
1100 | 0 | Py_ssize_t imax = start, ressize = 0, replsize; |
1101 | 0 | for (; imax < end; ++imax) { |
1102 | 0 | Py_UCS4 c = PyUnicode_READ_CHAR(obj, imax); |
1103 | 0 | if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) { |
1104 | | // If 'c' is recognized by getname(), the corresponding replacement |
1105 | | // is '\\' + 'N' + '{' + NAME + '}', i.e. 1 + 1 + 1 + len(NAME) + 1 |
1106 | | // characters. Failures of getname() are ignored by the handler. |
1107 | 0 | replsize = 1 + 1 + 1 + strlen(buffer) + 1; |
1108 | 0 | } |
1109 | 0 | else { |
1110 | 0 | replsize = codec_handler_unicode_hex_width(c); |
1111 | 0 | } |
1112 | 0 | if (ressize > PY_SSIZE_T_MAX - replsize) { |
1113 | 0 | break; |
1114 | 0 | } |
1115 | 0 | ressize += replsize; |
1116 | 0 | } |
1117 | |
|
1118 | 0 | PyObject *res = PyUnicode_New(ressize, 127); |
1119 | 0 | if (res == NULL) { |
1120 | 0 | Py_DECREF(obj); |
1121 | 0 | return NULL; |
1122 | 0 | } |
1123 | | |
1124 | 0 | Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res); |
1125 | 0 | for (Py_ssize_t i = start; i < imax; ++i) { |
1126 | 0 | Py_UCS4 c = PyUnicode_READ_CHAR(obj, i); |
1127 | 0 | if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) { |
1128 | 0 | *outp++ = '\\'; |
1129 | 0 | *outp++ = 'N'; |
1130 | 0 | *outp++ = '{'; |
1131 | 0 | (void)strcpy((char *)outp, buffer); |
1132 | 0 | outp += strlen(buffer); |
1133 | 0 | *outp++ = '}'; |
1134 | 0 | } |
1135 | 0 | else { |
1136 | 0 | codec_handler_write_unicode_hex(&outp, c); |
1137 | 0 | } |
1138 | 0 | } |
1139 | |
|
1140 | 0 | assert(outp == PyUnicode_1BYTE_DATA(res) + ressize); |
1141 | 0 | assert(_PyUnicode_CheckConsistency(res, 1)); |
1142 | 0 | PyObject *restuple = Py_BuildValue("(Nn)", res, imax); |
1143 | 0 | Py_DECREF(obj); |
1144 | 0 | return restuple; |
1145 | 0 | } |
1146 | | |
1147 | | |
#define ENC_UNKNOWN     -1
#define ENC_UTF8        0
#define ENC_UTF16BE     1
#define ENC_UTF16LE     2
#define ENC_UTF32BE     3
#define ENC_UTF32LE     4

/*
 * Resolve the byte-order part that may follow "utf16" or "utf32".
 *
 * 'suffix' points just past the digits of the encoding name:
 *   - ""                     -> the code matching the native byte order
 *   - [-_] "be" / [-_] "le"  -> 'be_code' / 'le_code' (letters in any case)
 *   - anything else          -> ENC_UNKNOWN
 */
static int
get_standard_encoding_byteorder(const char *suffix, int be_code, int le_code)
{
    if (*suffix == '\0') {
#ifdef WORDS_BIGENDIAN
        return be_code;
#else
        return le_code;
#endif
    }
    if (*suffix == '-' || *suffix == '_') {
        suffix++;
    }
    // Look at suffix[0] first: when it is the terminating NUL (as for
    // "utf-16-"), suffix[1] lies past the end of the string and must not
    // be read.
    const int order = Py_TOLOWER(suffix[0]);
    if (order != 'b' && order != 'l') {
        return ENC_UNKNOWN;
    }
    if (Py_TOLOWER(suffix[1]) != 'e' || suffix[2] != '\0') {
        return ENC_UNKNOWN;
    }
    return order == 'b' ? be_code : le_code;
}


/*
 * Map an encoding name to one of the ENC_* codes.
 *
 * Recognized spellings are "utf" (in any case), an optional '-' or '_',
 * then "8", "16" or "32"; the latter two may carry a byte-order suffix
 * (see get_standard_encoding_byteorder()). The exact string "CP_UTF8" is
 * accepted as well.
 *
 * On success, '*bytelength' is set to 3 for UTF-8, 2 for UTF-16 and 4
 * for UTF-32. When ENC_UNKNOWN is returned, '*bytelength' must not be
 * relied upon.
 */
static int
get_standard_encoding_impl(const char *encoding, int *bytelength)
{
    if (Py_TOLOWER(encoding[0]) == 'u' &&
        Py_TOLOWER(encoding[1]) == 't' &&
        Py_TOLOWER(encoding[2]) == 'f')
    {
        encoding += 3;
        if (*encoding == '-' || *encoding == '_') {
            encoding++;
        }
        if (encoding[0] == '8' && encoding[1] == '\0') {
            *bytelength = 3;
            return ENC_UTF8;
        }
        else if (encoding[0] == '1' && encoding[1] == '6') {
            *bytelength = 2;
            return get_standard_encoding_byteorder(encoding + 2,
                                                   ENC_UTF16BE, ENC_UTF16LE);
        }
        else if (encoding[0] == '3' && encoding[1] == '2') {
            *bytelength = 4;
            return get_standard_encoding_byteorder(encoding + 2,
                                                   ENC_UTF32BE, ENC_UTF32LE);
        }
    }
    else if (strcmp(encoding, "CP_UTF8") == 0) {
        *bytelength = 3;
        return ENC_UTF8;
    }
    return ENC_UNKNOWN;
}
1213 | | |
1214 | | |
1215 | | static int |
1216 | | get_standard_encoding(PyObject *encoding, int *code, int *bytelength) |
1217 | 0 | { |
1218 | 0 | const char *encoding_cstr = PyUnicode_AsUTF8(encoding); |
1219 | 0 | if (encoding_cstr == NULL) { |
1220 | 0 | return -1; |
1221 | 0 | } |
1222 | 0 | *code = get_standard_encoding_impl(encoding_cstr, bytelength); |
1223 | 0 | return 0; |
1224 | 0 | } |
1225 | | |
1226 | | |
1227 | | // --- handler: 'surrogatepass' ----------------------------------------------- |
1228 | | |
1229 | | static PyObject * |
1230 | | _PyCodec_SurrogatePassUnicodeEncodeError(PyObject *exc) |
1231 | 0 | { |
1232 | 0 | PyObject *encoding = PyUnicodeEncodeError_GetEncoding(exc); |
1233 | 0 | if (encoding == NULL) { |
1234 | 0 | return NULL; |
1235 | 0 | } |
1236 | 0 | int code, bytelength; |
1237 | 0 | int rc = get_standard_encoding(encoding, &code, &bytelength); |
1238 | 0 | Py_DECREF(encoding); |
1239 | 0 | if (rc < 0) { |
1240 | 0 | return NULL; |
1241 | 0 | } |
1242 | 0 | if (code == ENC_UNKNOWN) { |
1243 | 0 | goto bail; |
1244 | 0 | } |
1245 | | |
1246 | 0 | PyObject *obj; |
1247 | 0 | Py_ssize_t objlen, start, end, slen; |
1248 | 0 | if (_PyUnicodeError_GetParams(exc, |
1249 | 0 | &obj, &objlen, |
1250 | 0 | &start, &end, &slen, false) < 0) |
1251 | 0 | { |
1252 | 0 | return NULL; |
1253 | 0 | } |
1254 | | |
1255 | 0 | if (slen > PY_SSIZE_T_MAX / bytelength) { |
1256 | 0 | end = start + PY_SSIZE_T_MAX / bytelength; |
1257 | 0 | end = Py_MIN(end, objlen); |
1258 | 0 | slen = Py_MAX(0, end - start); |
1259 | 0 | } |
1260 | |
|
1261 | 0 | PyObject *res = PyBytes_FromStringAndSize(NULL, bytelength * slen); |
1262 | 0 | if (res == NULL) { |
1263 | 0 | Py_DECREF(obj); |
1264 | 0 | return NULL; |
1265 | 0 | } |
1266 | | |
1267 | 0 | unsigned char *outp = (unsigned char *)PyBytes_AsString(res); |
1268 | 0 | for (Py_ssize_t i = start; i < end; i++) { |
1269 | 0 | Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i); |
1270 | 0 | if (!Py_UNICODE_IS_SURROGATE(ch)) { |
1271 | | /* Not a surrogate, fail with original exception */ |
1272 | 0 | Py_DECREF(obj); |
1273 | 0 | Py_DECREF(res); |
1274 | 0 | goto bail; |
1275 | 0 | } |
1276 | 0 | switch (code) { |
1277 | 0 | case ENC_UTF8: { |
1278 | 0 | *outp++ = (unsigned char)(0xe0 | (ch >> 12)); |
1279 | 0 | *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f)); |
1280 | 0 | *outp++ = (unsigned char)(0x80 | (ch & 0x3f)); |
1281 | 0 | break; |
1282 | 0 | } |
1283 | 0 | case ENC_UTF16LE: { |
1284 | 0 | *outp++ = (unsigned char)ch; |
1285 | 0 | *outp++ = (unsigned char)(ch >> 8); |
1286 | 0 | break; |
1287 | 0 | } |
1288 | 0 | case ENC_UTF16BE: { |
1289 | 0 | *outp++ = (unsigned char)(ch >> 8); |
1290 | 0 | *outp++ = (unsigned char)ch; |
1291 | 0 | break; |
1292 | 0 | } |
1293 | 0 | case ENC_UTF32LE: { |
1294 | 0 | *outp++ = (unsigned char)ch; |
1295 | 0 | *outp++ = (unsigned char)(ch >> 8); |
1296 | 0 | *outp++ = (unsigned char)(ch >> 16); |
1297 | 0 | *outp++ = (unsigned char)(ch >> 24); |
1298 | 0 | break; |
1299 | 0 | } |
1300 | 0 | case ENC_UTF32BE: { |
1301 | 0 | *outp++ = (unsigned char)(ch >> 24); |
1302 | 0 | *outp++ = (unsigned char)(ch >> 16); |
1303 | 0 | *outp++ = (unsigned char)(ch >> 8); |
1304 | 0 | *outp++ = (unsigned char)ch; |
1305 | 0 | break; |
1306 | 0 | } |
1307 | 0 | } |
1308 | 0 | } |
1309 | | |
1310 | 0 | Py_DECREF(obj); |
1311 | 0 | PyObject *restuple = Py_BuildValue("(Nn)", res, end); |
1312 | 0 | return restuple; |
1313 | | |
1314 | 0 | bail: |
1315 | 0 | PyErr_SetObject(PyExceptionInstance_Class(exc), exc); |
1316 | 0 | return NULL; |
1317 | 0 | } |
1318 | | |
1319 | | |
1320 | | static PyObject * |
1321 | | _PyCodec_SurrogatePassUnicodeDecodeError(PyObject *exc) |
1322 | 0 | { |
1323 | 0 | PyObject *encoding = PyUnicodeDecodeError_GetEncoding(exc); |
1324 | 0 | if (encoding == NULL) { |
1325 | 0 | return NULL; |
1326 | 0 | } |
1327 | 0 | int code, bytelength; |
1328 | 0 | int rc = get_standard_encoding(encoding, &code, &bytelength); |
1329 | 0 | Py_DECREF(encoding); |
1330 | 0 | if (rc < 0) { |
1331 | 0 | return NULL; |
1332 | 0 | } |
1333 | 0 | if (code == ENC_UNKNOWN) { |
1334 | 0 | goto bail; |
1335 | 0 | } |
1336 | | |
1337 | 0 | PyObject *obj; |
1338 | 0 | Py_ssize_t objlen, start, end, slen; |
1339 | 0 | if (_PyUnicodeError_GetParams(exc, |
1340 | 0 | &obj, &objlen, |
1341 | 0 | &start, &end, &slen, true) < 0) |
1342 | 0 | { |
1343 | 0 | return NULL; |
1344 | 0 | } |
1345 | | |
1346 | | /* Try decoding a single surrogate character. If |
1347 | | there are more, let the codec call us again. */ |
1348 | 0 | Py_UCS4 ch = 0; |
1349 | 0 | const unsigned char *p = (const unsigned char *)PyBytes_AS_STRING(obj); |
1350 | 0 | p += start; |
1351 | |
|
1352 | 0 | if (objlen - start >= bytelength) { |
1353 | 0 | switch (code) { |
1354 | 0 | case ENC_UTF8: { |
1355 | 0 | if ((p[0] & 0xf0) == 0xe0 && |
1356 | 0 | (p[1] & 0xc0) == 0x80 && |
1357 | 0 | (p[2] & 0xc0) == 0x80) |
1358 | 0 | { |
1359 | | /* it's a three-byte code */ |
1360 | 0 | ch = ((p[0] & 0x0f) << 12) + |
1361 | 0 | ((p[1] & 0x3f) << 6) + |
1362 | 0 | (p[2] & 0x3f); |
1363 | 0 | } |
1364 | 0 | break; |
1365 | 0 | } |
1366 | 0 | case ENC_UTF16LE: { |
1367 | 0 | ch = p[1] << 8 | p[0]; |
1368 | 0 | break; |
1369 | 0 | } |
1370 | 0 | case ENC_UTF16BE: { |
1371 | 0 | ch = p[0] << 8 | p[1]; |
1372 | 0 | break; |
1373 | 0 | } |
1374 | 0 | case ENC_UTF32LE: { |
1375 | 0 | ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0]; |
1376 | 0 | break; |
1377 | 0 | } |
1378 | 0 | case ENC_UTF32BE: { |
1379 | 0 | ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3]; |
1380 | 0 | break; |
1381 | 0 | } |
1382 | 0 | } |
1383 | 0 | } |
1384 | 0 | Py_DECREF(obj); |
1385 | 0 | if (!Py_UNICODE_IS_SURROGATE(ch)) { |
1386 | 0 | goto bail; |
1387 | 0 | } |
1388 | | |
1389 | 0 | PyObject *res = PyUnicode_FromOrdinal(ch); |
1390 | 0 | if (res == NULL) { |
1391 | 0 | return NULL; |
1392 | 0 | } |
1393 | 0 | return Py_BuildValue("(Nn)", res, start + bytelength); |
1394 | | |
1395 | 0 | bail: |
1396 | 0 | PyErr_SetObject(PyExceptionInstance_Class(exc), exc); |
1397 | 0 | return NULL; |
1398 | 0 | } |
1399 | | |
1400 | | |
1401 | | /* This handler is declared static until someone demonstrates |
1402 | | a need to call it directly. */ |
1403 | | static PyObject * |
1404 | | PyCodec_SurrogatePassErrors(PyObject *exc) |
1405 | 0 | { |
1406 | 0 | if (_PyIsUnicodeEncodeError(exc)) { |
1407 | 0 | return _PyCodec_SurrogatePassUnicodeEncodeError(exc); |
1408 | 0 | } |
1409 | 0 | else if (_PyIsUnicodeDecodeError(exc)) { |
1410 | 0 | return _PyCodec_SurrogatePassUnicodeDecodeError(exc); |
1411 | 0 | } |
1412 | 0 | else { |
1413 | 0 | wrong_exception_type(exc); |
1414 | 0 | return NULL; |
1415 | 0 | } |
1416 | 0 | } |
1417 | | |
1418 | | |
1419 | | // --- handler: 'surrogateescape' --------------------------------------------- |
1420 | | |
1421 | | static PyObject * |
1422 | | _PyCodec_SurrogateEscapeUnicodeEncodeError(PyObject *exc) |
1423 | 6.61k | { |
1424 | 6.61k | PyObject *obj; |
1425 | 6.61k | Py_ssize_t start, end, slen; |
1426 | 6.61k | if (_PyUnicodeError_GetParams(exc, |
1427 | 6.61k | &obj, NULL, |
1428 | 6.61k | &start, &end, &slen, false) < 0) |
1429 | 0 | { |
1430 | 0 | return NULL; |
1431 | 0 | } |
1432 | | |
1433 | 6.61k | PyObject *res = PyBytes_FromStringAndSize(NULL, slen); |
1434 | 6.61k | if (res == NULL) { |
1435 | 0 | Py_DECREF(obj); |
1436 | 0 | return NULL; |
1437 | 0 | } |
1438 | | |
1439 | 6.61k | char *outp = PyBytes_AsString(res); |
1440 | 6.61k | for (Py_ssize_t i = start; i < end; i++) { |
1441 | 6.61k | Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i); |
1442 | 6.61k | if (ch < 0xdc80 || ch > 0xdcff) { |
1443 | | /* Not a UTF-8b surrogate, fail with original exception. */ |
1444 | 6.61k | Py_DECREF(obj); |
1445 | 6.61k | Py_DECREF(res); |
1446 | 6.61k | PyErr_SetObject(PyExceptionInstance_Class(exc), exc); |
1447 | 6.61k | return NULL; |
1448 | 6.61k | } |
1449 | 0 | *outp++ = ch - 0xdc00; |
1450 | 0 | } |
1451 | 0 | Py_DECREF(obj); |
1452 | |
|
1453 | 0 | return Py_BuildValue("(Nn)", res, end); |
1454 | 6.61k | } |
1455 | | |
1456 | | |
1457 | | static PyObject * |
1458 | | _PyCodec_SurrogateEscapeUnicodeDecodeError(PyObject *exc) |
1459 | 42.3k | { |
1460 | 42.3k | PyObject *obj; |
1461 | 42.3k | Py_ssize_t start, end, slen; |
1462 | 42.3k | if (_PyUnicodeError_GetParams(exc, |
1463 | 42.3k | &obj, NULL, |
1464 | 42.3k | &start, &end, &slen, true) < 0) |
1465 | 0 | { |
1466 | 0 | return NULL; |
1467 | 0 | } |
1468 | | |
1469 | 42.3k | Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */ |
1470 | 42.3k | int consumed = 0; |
1471 | 42.3k | const unsigned char *p = (const unsigned char *)PyBytes_AS_STRING(obj); |
1472 | 91.2k | while (consumed < 4 && consumed < slen) { |
1473 | | /* Refuse to escape ASCII bytes. */ |
1474 | 73.8k | if (p[start + consumed] < 128) { |
1475 | 24.9k | break; |
1476 | 24.9k | } |
1477 | 48.8k | ch[consumed] = 0xdc00 + p[start + consumed]; |
1478 | 48.8k | consumed++; |
1479 | 48.8k | } |
1480 | 42.3k | Py_DECREF(obj); |
1481 | | |
1482 | 42.3k | if (consumed == 0) { |
1483 | | /* Codec complained about ASCII byte. */ |
1484 | 14.5k | PyErr_SetObject(PyExceptionInstance_Class(exc), exc); |
1485 | 14.5k | return NULL; |
1486 | 14.5k | } |
1487 | | |
1488 | 27.7k | PyObject *str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed); |
1489 | 27.7k | if (str == NULL) { |
1490 | 0 | return NULL; |
1491 | 0 | } |
1492 | 27.7k | return Py_BuildValue("(Nn)", str, start + consumed); |
1493 | 27.7k | } |
1494 | | |
1495 | | |
1496 | | static PyObject * |
1497 | | PyCodec_SurrogateEscapeErrors(PyObject *exc) |
1498 | 48.9k | { |
1499 | 48.9k | if (_PyIsUnicodeEncodeError(exc)) { |
1500 | 6.61k | return _PyCodec_SurrogateEscapeUnicodeEncodeError(exc); |
1501 | 6.61k | } |
1502 | 42.3k | else if (_PyIsUnicodeDecodeError(exc)) { |
1503 | 42.3k | return _PyCodec_SurrogateEscapeUnicodeDecodeError(exc); |
1504 | 42.3k | } |
1505 | 0 | else { |
1506 | 0 | wrong_exception_type(exc); |
1507 | 0 | return NULL; |
1508 | 0 | } |
1509 | 48.9k | } |
1510 | | |
1511 | | |
1512 | | // --- Codecs registry handlers ----------------------------------------------- |
1513 | | |
1514 | | static inline PyObject * |
1515 | | strict_errors(PyObject *Py_UNUSED(self), PyObject *exc) |
1516 | 162k | { |
1517 | 162k | return PyCodec_StrictErrors(exc); |
1518 | 162k | } |
1519 | | |
1520 | | |
1521 | | static inline PyObject * |
1522 | | ignore_errors(PyObject *Py_UNUSED(self), PyObject *exc) |
1523 | 0 | { |
1524 | 0 | return PyCodec_IgnoreErrors(exc); |
1525 | 0 | } |
1526 | | |
1527 | | |
1528 | | static inline PyObject * |
1529 | | replace_errors(PyObject *Py_UNUSED(self), PyObject *exc) |
1530 | 194k | { |
1531 | 194k | return PyCodec_ReplaceErrors(exc); |
1532 | 194k | } |
1533 | | |
1534 | | |
1535 | | static inline PyObject * |
1536 | | xmlcharrefreplace_errors(PyObject *Py_UNUSED(self), PyObject *exc) |
1537 | 0 | { |
1538 | 0 | return PyCodec_XMLCharRefReplaceErrors(exc); |
1539 | 0 | } |
1540 | | |
1541 | | |
1542 | | static inline PyObject * |
1543 | | backslashreplace_errors(PyObject *Py_UNUSED(self), PyObject *exc) |
1544 | 0 | { |
1545 | 0 | return PyCodec_BackslashReplaceErrors(exc); |
1546 | 0 | } |
1547 | | |
1548 | | |
1549 | | static inline PyObject * |
1550 | | namereplace_errors(PyObject *Py_UNUSED(self), PyObject *exc) |
1551 | 0 | { |
1552 | 0 | return PyCodec_NameReplaceErrors(exc); |
1553 | 0 | } |
1554 | | |
1555 | | |
1556 | | static inline PyObject * |
1557 | | surrogatepass_errors(PyObject *Py_UNUSED(self), PyObject *exc) |
1558 | 0 | { |
1559 | 0 | return PyCodec_SurrogatePassErrors(exc); |
1560 | 0 | } |
1561 | | |
1562 | | |
1563 | | static inline PyObject * |
1564 | | surrogateescape_errors(PyObject *Py_UNUSED(self), PyObject *exc) |
1565 | 48.9k | { |
1566 | 48.9k | return PyCodec_SurrogateEscapeErrors(exc); |
1567 | 48.9k | } |
1568 | | |
1569 | | |
1570 | | PyStatus |
1571 | | _PyCodec_InitRegistry(PyInterpreterState *interp) |
1572 | 16 | { |
1573 | 16 | static struct { |
1574 | 16 | const char *name; |
1575 | 16 | PyMethodDef def; |
1576 | 16 | } methods[] = |
1577 | 16 | { |
1578 | 16 | { |
1579 | 16 | "strict", |
1580 | 16 | { |
1581 | 16 | "strict_errors", |
1582 | 16 | strict_errors, |
1583 | 16 | METH_O, |
1584 | 16 | PyDoc_STR("Implements the 'strict' error handling, which " |
1585 | 16 | "raises a UnicodeError on coding errors.") |
1586 | 16 | } |
1587 | 16 | }, |
1588 | 16 | { |
1589 | 16 | "ignore", |
1590 | 16 | { |
1591 | 16 | "ignore_errors", |
1592 | 16 | ignore_errors, |
1593 | 16 | METH_O, |
1594 | 16 | PyDoc_STR("Implements the 'ignore' error handling, which " |
1595 | 16 | "ignores malformed data and continues.") |
1596 | 16 | } |
1597 | 16 | }, |
1598 | 16 | { |
1599 | 16 | "replace", |
1600 | 16 | { |
1601 | 16 | "replace_errors", |
1602 | 16 | replace_errors, |
1603 | 16 | METH_O, |
1604 | 16 | PyDoc_STR("Implements the 'replace' error handling, which " |
1605 | 16 | "replaces malformed data with a replacement marker.") |
1606 | 16 | } |
1607 | 16 | }, |
1608 | 16 | { |
1609 | 16 | "xmlcharrefreplace", |
1610 | 16 | { |
1611 | 16 | "xmlcharrefreplace_errors", |
1612 | 16 | xmlcharrefreplace_errors, |
1613 | 16 | METH_O, |
1614 | 16 | PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, " |
1615 | 16 | "which replaces an unencodable character with the " |
1616 | 16 | "appropriate XML character reference.") |
1617 | 16 | } |
1618 | 16 | }, |
1619 | 16 | { |
1620 | 16 | "backslashreplace", |
1621 | 16 | { |
1622 | 16 | "backslashreplace_errors", |
1623 | 16 | backslashreplace_errors, |
1624 | 16 | METH_O, |
1625 | 16 | PyDoc_STR("Implements the 'backslashreplace' error handling, " |
1626 | 16 | "which replaces malformed data with a backslashed " |
1627 | 16 | "escape sequence.") |
1628 | 16 | } |
1629 | 16 | }, |
1630 | 16 | { |
1631 | 16 | "namereplace", |
1632 | 16 | { |
1633 | 16 | "namereplace_errors", |
1634 | 16 | namereplace_errors, |
1635 | 16 | METH_O, |
1636 | 16 | PyDoc_STR("Implements the 'namereplace' error handling, " |
1637 | 16 | "which replaces an unencodable character with a " |
1638 | 16 | "\\N{...} escape sequence.") |
1639 | 16 | } |
1640 | 16 | }, |
1641 | 16 | { |
1642 | 16 | "surrogatepass", |
1643 | 16 | { |
1644 | 16 | "surrogatepass", |
1645 | 16 | surrogatepass_errors, |
1646 | 16 | METH_O |
1647 | 16 | } |
1648 | 16 | }, |
1649 | 16 | { |
1650 | 16 | "surrogateescape", |
1651 | 16 | { |
1652 | 16 | "surrogateescape", |
1653 | 16 | surrogateescape_errors, |
1654 | 16 | METH_O |
1655 | 16 | } |
1656 | 16 | } |
1657 | 16 | }; |
1658 | | // ensure that the built-in error handlers' names are kept in sync |
1659 | 16 | assert(Py_ARRAY_LENGTH(methods) == Py_ARRAY_LENGTH(codecs_builtin_error_handlers)); |
1660 | | |
1661 | 16 | assert(interp->codecs.initialized == 0); |
1662 | 16 | interp->codecs.search_path = PyList_New(0); |
1663 | 16 | if (interp->codecs.search_path == NULL) { |
1664 | 0 | return PyStatus_NoMemory(); |
1665 | 0 | } |
1666 | 16 | interp->codecs.search_cache = PyDict_New(); |
1667 | 16 | if (interp->codecs.search_cache == NULL) { |
1668 | 0 | return PyStatus_NoMemory(); |
1669 | 0 | } |
1670 | 16 | interp->codecs.error_registry = PyDict_New(); |
1671 | 16 | if (interp->codecs.error_registry == NULL) { |
1672 | 0 | return PyStatus_NoMemory(); |
1673 | 0 | } |
1674 | 144 | for (size_t i = 0; i < Py_ARRAY_LENGTH(methods); ++i) { |
1675 | 128 | PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL); |
1676 | 128 | if (func == NULL) { |
1677 | 0 | return PyStatus_NoMemory(); |
1678 | 0 | } |
1679 | | |
1680 | 128 | int res = PyDict_SetItemString(interp->codecs.error_registry, |
1681 | 128 | methods[i].name, func); |
1682 | 128 | Py_DECREF(func); |
1683 | 128 | if (res < 0) { |
1684 | 0 | return PyStatus_Error("Failed to insert into codec error registry"); |
1685 | 0 | } |
1686 | 128 | } |
1687 | | |
1688 | 16 | interp->codecs.initialized = 1; |
1689 | | |
1690 | | // Importing `encodings' will call back into this module to register codec |
1691 | | // search functions, so this is done after everything else is initialized. |
1692 | 16 | PyObject *mod = PyImport_ImportModule("encodings"); |
1693 | 16 | if (mod == NULL) { |
1694 | 0 | return PyStatus_Error("Failed to import encodings module"); |
1695 | 0 | } |
1696 | 16 | Py_DECREF(mod); |
1697 | | |
1698 | 16 | return PyStatus_Ok(); |
1699 | 16 | } |
1700 | | |
1701 | | void |
1702 | | _PyCodec_Fini(PyInterpreterState *interp) |
1703 | 0 | { |
1704 | 0 | Py_CLEAR(interp->codecs.search_path); |
1705 | 0 | Py_CLEAR(interp->codecs.search_cache); |
1706 | 0 | Py_CLEAR(interp->codecs.error_registry); |
1707 | 0 | interp->codecs.initialized = 0; |
1708 | 0 | } |