/src/cpython/Python/codecs.c
Line | Count | Source |
1 | | /* ------------------------------------------------------------------------ |
2 | | |
3 | | Python Codec Registry and support functions |
4 | | |
5 | | Written by Marc-Andre Lemburg (mal@lemburg.com). |
6 | | |
7 | | Copyright (c) Corporation for National Research Initiatives. |
8 | | |
9 | | ------------------------------------------------------------------------ */ |
10 | | |
11 | | #include "Python.h" |
12 | | #include "pycore_call.h" // _PyObject_CallNoArgs() |
13 | | #include "pycore_codecs.h" // export _PyCodec_LookupTextEncoding() |
14 | | #include "pycore_initconfig.h" // _Py_DumpPathConfig() |
15 | | #include "pycore_interp.h" // PyInterpreterState.codec_search_path |
16 | | #include "pycore_pyerrors.h" // _PyErr_FormatNote() |
17 | | #include "pycore_pystate.h" // _PyInterpreterState_GET() |
18 | | #include "pycore_runtime.h" // _Py_ID() |
19 | | #include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI |
20 | | #include "pycore_unicodeobject.h" // _PyUnicode_InternMortal() |
21 | | #include "pycore_pyatomic_ft_wrappers.h" |
22 | | |
/* Names of the error handlers that _PyCodec_UnregisterError() refuses to
   remove from the error registry. */
static const char *codecs_builtin_error_handlers[] = {
    "strict",
    "ignore",
    "replace",
    "xmlcharrefreplace",
    "backslashreplace",
    "namereplace",
    "surrogatepass",
    "surrogateescape",
};

/* Lower-case hexadecimal digits, indexed by nibble value (0..15). */
const char *Py_hexdigits = "0123456789abcdef";
30 | | |
31 | | /* --- Codec Registry ----------------------------------------------------- */ |
32 | | |
33 | | int PyCodec_Register(PyObject *search_function) |
34 | 36 | { |
35 | 36 | PyInterpreterState *interp = _PyInterpreterState_GET(); |
36 | 36 | assert(interp->codecs.initialized); |
37 | 36 | if (search_function == NULL) { |
38 | 0 | PyErr_BadArgument(); |
39 | 0 | goto onError; |
40 | 0 | } |
41 | 36 | if (!PyCallable_Check(search_function)) { |
42 | 0 | PyErr_SetString(PyExc_TypeError, "argument must be callable"); |
43 | 0 | goto onError; |
44 | 0 | } |
45 | 36 | FT_MUTEX_LOCK(&interp->codecs.search_path_mutex); |
46 | 36 | int ret = PyList_Append(interp->codecs.search_path, search_function); |
47 | 36 | FT_MUTEX_UNLOCK(&interp->codecs.search_path_mutex); |
48 | | |
49 | 36 | return ret; |
50 | | |
51 | 0 | onError: |
52 | 0 | return -1; |
53 | 36 | } |
54 | | |
55 | | int |
56 | | PyCodec_Unregister(PyObject *search_function) |
57 | 0 | { |
58 | 0 | PyInterpreterState *interp = _PyInterpreterState_GET(); |
59 | 0 | if (interp->codecs.initialized != 1) { |
60 | | /* Do nothing if codecs state was cleared (only possible during |
61 | | interpreter shutdown). */ |
62 | 0 | return 0; |
63 | 0 | } |
64 | | |
65 | 0 | PyObject *codec_search_path = interp->codecs.search_path; |
66 | 0 | assert(PyList_CheckExact(codec_search_path)); |
67 | 0 | for (Py_ssize_t i = 0; i < PyList_GET_SIZE(codec_search_path); i++) { |
68 | 0 | FT_MUTEX_LOCK(&interp->codecs.search_path_mutex); |
69 | 0 | PyObject *item = PyList_GetItemRef(codec_search_path, i); |
70 | 0 | int ret = 1; |
71 | 0 | if (item == search_function) { |
72 | | // We hold a reference to the item, so its destructor can't run |
73 | | // while we hold search_path_mutex. |
74 | 0 | ret = PyList_SetSlice(codec_search_path, i, i+1, NULL); |
75 | 0 | } |
76 | 0 | FT_MUTEX_UNLOCK(&interp->codecs.search_path_mutex); |
77 | 0 | Py_DECREF(item); |
78 | 0 | if (ret != 1) { |
79 | 0 | assert(interp->codecs.search_cache != NULL); |
80 | 0 | assert(PyDict_CheckExact(interp->codecs.search_cache)); |
81 | 0 | PyDict_Clear(interp->codecs.search_cache); |
82 | 0 | return ret; |
83 | 0 | } |
84 | 0 | } |
85 | 0 | return 0; |
86 | 0 | } |
87 | | |
88 | | /* Convert a string to a normalized Python string: all ASCII letters are |
89 | | converted to lower case, spaces are replaced with hyphens. */ |
90 | | |
91 | | static PyObject* |
92 | | normalizestring(const char *string) |
93 | 2.41M | { |
94 | 2.41M | size_t i; |
95 | 2.41M | size_t len = strlen(string); |
96 | 2.41M | char *p; |
97 | 2.41M | PyObject *v; |
98 | | |
99 | 2.41M | if (len > PY_SSIZE_T_MAX) { |
100 | 0 | PyErr_SetString(PyExc_OverflowError, "string is too large"); |
101 | 0 | return NULL; |
102 | 0 | } |
103 | | |
104 | 2.41M | p = PyMem_Malloc(len + 1); |
105 | 2.41M | if (p == NULL) |
106 | 0 | return PyErr_NoMemory(); |
107 | 33.9M | for (i = 0; i < len; i++) { |
108 | 31.4M | char ch = string[i]; |
109 | 31.4M | if (ch == ' ') |
110 | 219k | ch = '-'; |
111 | 31.2M | else |
112 | 31.2M | ch = Py_TOLOWER(Py_CHARMASK(ch)); |
113 | 31.4M | p[i] = ch; |
114 | 31.4M | } |
115 | 2.41M | p[i] = '\0'; |
116 | 2.41M | v = PyUnicode_FromString(p); |
117 | 2.41M | PyMem_Free(p); |
118 | 2.41M | return v; |
119 | 2.41M | } |
120 | | |
121 | | /* Lookup the given encoding and return a tuple providing the codec |
122 | | facilities. |
123 | | |
124 | | ASCII letters in the encoding string is looked up converted to all |
125 | | lower case. This makes encodings looked up through this mechanism |
126 | | effectively case-insensitive. Spaces are replaced with hyphens for |
127 | | names like "US ASCII" and "ISO 8859-1". |
128 | | |
129 | | If no codec is found, a LookupError is set and NULL returned. |
130 | | |
131 | | As side effect, this tries to load the encodings package, if not |
132 | | yet done. This is part of the lazy load strategy for the encodings |
133 | | package. |
134 | | |
135 | | */ |
136 | | |
137 | | PyObject *_PyCodec_Lookup(const char *encoding) |
138 | 2.41M | { |
139 | 2.41M | if (encoding == NULL) { |
140 | 0 | PyErr_BadArgument(); |
141 | 0 | return NULL; |
142 | 0 | } |
143 | | |
144 | 2.41M | PyInterpreterState *interp = _PyInterpreterState_GET(); |
145 | 2.41M | assert(interp->codecs.initialized); |
146 | | |
147 | | /* Convert the encoding to a normalized Python string: all |
148 | | ASCII letters are converted to lower case, spaces are |
149 | | replaced with hyphens. */ |
150 | 2.41M | PyObject *v = normalizestring(encoding); |
151 | 2.41M | if (v == NULL) { |
152 | 0 | return NULL; |
153 | 0 | } |
154 | | |
155 | | /* Intern the string. We'll make it immortal later if lookup succeeds. */ |
156 | 2.41M | _PyUnicode_InternMortal(interp, &v); |
157 | | |
158 | | /* First, try to lookup the name in the registry dictionary */ |
159 | 2.41M | PyObject *result; |
160 | 2.41M | if (PyDict_GetItemRef(interp->codecs.search_cache, v, &result) < 0) { |
161 | 0 | goto onError; |
162 | 0 | } |
163 | 2.41M | if (result != NULL) { |
164 | 2.34M | Py_DECREF(v); |
165 | 2.34M | return result; |
166 | 2.34M | } |
167 | | |
168 | | /* Next, scan the search functions in order of registration */ |
169 | 68.3k | const Py_ssize_t len = PyList_Size(interp->codecs.search_path); |
170 | 68.3k | if (len < 0) |
171 | 0 | goto onError; |
172 | 68.3k | if (len == 0) { |
173 | 0 | PyErr_SetString(PyExc_LookupError, |
174 | 0 | "no codec search functions registered: " |
175 | 0 | "can't find encoding"); |
176 | 0 | goto onError; |
177 | 0 | } |
178 | | |
179 | 68.3k | Py_ssize_t i; |
180 | 134k | for (i = 0; i < len; i++) { |
181 | 68.3k | PyObject *func; |
182 | | |
183 | 68.3k | func = PyList_GetItemRef(interp->codecs.search_path, i); |
184 | 68.3k | if (func == NULL) |
185 | 0 | goto onError; |
186 | 68.3k | result = PyObject_CallOneArg(func, v); |
187 | 68.3k | Py_DECREF(func); |
188 | 68.3k | if (result == NULL) |
189 | 0 | goto onError; |
190 | 68.3k | if (result == Py_None) { |
191 | 66.2k | Py_CLEAR(result); |
192 | 66.2k | continue; |
193 | 66.2k | } |
194 | 2.09k | if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) { |
195 | 0 | PyErr_SetString(PyExc_TypeError, |
196 | 0 | "codec search functions must return 4-tuples"); |
197 | 0 | Py_DECREF(result); |
198 | 0 | goto onError; |
199 | 0 | } |
200 | 2.09k | break; |
201 | 2.09k | } |
202 | 68.3k | if (result == NULL) { |
203 | | /* XXX Perhaps we should cache misses too ? */ |
204 | 66.2k | PyErr_Format(PyExc_LookupError, |
205 | 66.2k | "unknown encoding: %s", encoding); |
206 | 66.2k | goto onError; |
207 | 66.2k | } |
208 | | |
209 | 2.09k | _PyUnicode_InternImmortal(interp, &v); |
210 | | |
211 | | /* Cache and return the result */ |
212 | 2.09k | if (PyDict_SetItem(interp->codecs.search_cache, v, result) < 0) { |
213 | 0 | Py_DECREF(result); |
214 | 0 | goto onError; |
215 | 0 | } |
216 | 2.09k | Py_DECREF(v); |
217 | 2.09k | return result; |
218 | | |
219 | 66.2k | onError: |
220 | 66.2k | Py_DECREF(v); |
221 | 66.2k | return NULL; |
222 | 2.09k | } |
223 | | |
224 | | /* Codec registry encoding check API. */ |
225 | | |
226 | | int PyCodec_KnownEncoding(const char *encoding) |
227 | 0 | { |
228 | 0 | PyObject *codecs; |
229 | |
|
230 | 0 | codecs = _PyCodec_Lookup(encoding); |
231 | 0 | if (!codecs) { |
232 | 0 | PyErr_Clear(); |
233 | 0 | return 0; |
234 | 0 | } |
235 | 0 | else { |
236 | 0 | Py_DECREF(codecs); |
237 | 0 | return 1; |
238 | 0 | } |
239 | 0 | } |
240 | | |
241 | | static |
242 | | PyObject *args_tuple(PyObject *object, |
243 | | const char *errors) |
244 | 2.07M | { |
245 | 2.07M | PyObject *args; |
246 | | |
247 | 2.07M | args = PyTuple_New(1 + (errors != NULL)); |
248 | 2.07M | if (args == NULL) |
249 | 0 | return NULL; |
250 | 2.07M | PyTuple_SET_ITEM(args, 0, Py_NewRef(object)); |
251 | 2.07M | if (errors) { |
252 | 187k | PyObject *v; |
253 | | |
254 | 187k | v = PyUnicode_FromString(errors); |
255 | 187k | if (v == NULL) { |
256 | 0 | Py_DECREF(args); |
257 | 0 | return NULL; |
258 | 0 | } |
259 | 187k | PyTuple_SET_ITEM(args, 1, v); |
260 | 187k | } |
261 | 2.07M | return args; |
262 | 2.07M | } |
263 | | |
264 | | /* Helper function to get a codec item */ |
265 | | |
266 | | static |
267 | | PyObject *codec_getitem(const char *encoding, int index) |
268 | 0 | { |
269 | 0 | PyObject *codecs; |
270 | 0 | PyObject *v; |
271 | |
|
272 | 0 | codecs = _PyCodec_Lookup(encoding); |
273 | 0 | if (codecs == NULL) |
274 | 0 | return NULL; |
275 | 0 | v = PyTuple_GET_ITEM(codecs, index); |
276 | 0 | Py_DECREF(codecs); |
277 | 0 | return Py_NewRef(v); |
278 | 0 | } |
279 | | |
280 | | /* Helper functions to create an incremental codec. */ |
281 | | static |
282 | | PyObject *codec_makeincrementalcodec(PyObject *codec_info, |
283 | | const char *errors, |
284 | | const char *attrname) |
285 | 127 | { |
286 | 127 | PyObject *ret, *inccodec; |
287 | | |
288 | 127 | inccodec = PyObject_GetAttrString(codec_info, attrname); |
289 | 127 | if (inccodec == NULL) |
290 | 0 | return NULL; |
291 | 127 | if (errors) |
292 | 127 | ret = PyObject_CallFunction(inccodec, "s", errors); |
293 | 0 | else |
294 | 0 | ret = _PyObject_CallNoArgs(inccodec); |
295 | 127 | Py_DECREF(inccodec); |
296 | 127 | return ret; |
297 | 127 | } |
298 | | |
299 | | static |
300 | | PyObject *codec_getincrementalcodec(const char *encoding, |
301 | | const char *errors, |
302 | | const char *attrname) |
303 | 0 | { |
304 | 0 | PyObject *codec_info, *ret; |
305 | |
|
306 | 0 | codec_info = _PyCodec_Lookup(encoding); |
307 | 0 | if (codec_info == NULL) |
308 | 0 | return NULL; |
309 | 0 | ret = codec_makeincrementalcodec(codec_info, errors, attrname); |
310 | 0 | Py_DECREF(codec_info); |
311 | 0 | return ret; |
312 | 0 | } |
313 | | |
314 | | /* Helper function to create a stream codec. */ |
315 | | |
316 | | static |
317 | | PyObject *codec_getstreamcodec(const char *encoding, |
318 | | PyObject *stream, |
319 | | const char *errors, |
320 | | const int index) |
321 | 0 | { |
322 | 0 | PyObject *codecs, *streamcodec, *codeccls; |
323 | |
|
324 | 0 | codecs = _PyCodec_Lookup(encoding); |
325 | 0 | if (codecs == NULL) |
326 | 0 | return NULL; |
327 | | |
328 | 0 | codeccls = PyTuple_GET_ITEM(codecs, index); |
329 | 0 | if (errors != NULL) |
330 | 0 | streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors); |
331 | 0 | else |
332 | 0 | streamcodec = PyObject_CallOneArg(codeccls, stream); |
333 | 0 | Py_DECREF(codecs); |
334 | 0 | return streamcodec; |
335 | 0 | } |
336 | | |
337 | | /* Helpers to work with the result of _PyCodec_Lookup |
338 | | |
339 | | */ |
340 | | PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info, |
341 | | const char *errors) |
342 | 54 | { |
343 | 54 | return codec_makeincrementalcodec(codec_info, errors, |
344 | 54 | "incrementaldecoder"); |
345 | 54 | } |
346 | | |
347 | | PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info, |
348 | | const char *errors) |
349 | 73 | { |
350 | 73 | return codec_makeincrementalcodec(codec_info, errors, |
351 | 73 | "incrementalencoder"); |
352 | 73 | } |
353 | | |
354 | | |
355 | | /* Convenience APIs to query the Codec registry. |
356 | | |
357 | | All APIs return a codec object with incremented refcount. |
358 | | |
359 | | */ |
360 | | |
361 | | PyObject *PyCodec_Encoder(const char *encoding) |
362 | 0 | { |
363 | 0 | return codec_getitem(encoding, 0); |
364 | 0 | } |
365 | | |
366 | | PyObject *PyCodec_Decoder(const char *encoding) |
367 | 0 | { |
368 | 0 | return codec_getitem(encoding, 1); |
369 | 0 | } |
370 | | |
371 | | PyObject *PyCodec_IncrementalEncoder(const char *encoding, |
372 | | const char *errors) |
373 | 0 | { |
374 | 0 | return codec_getincrementalcodec(encoding, errors, "incrementalencoder"); |
375 | 0 | } |
376 | | |
377 | | PyObject *PyCodec_IncrementalDecoder(const char *encoding, |
378 | | const char *errors) |
379 | 0 | { |
380 | 0 | return codec_getincrementalcodec(encoding, errors, "incrementaldecoder"); |
381 | 0 | } |
382 | | |
383 | | PyObject *PyCodec_StreamReader(const char *encoding, |
384 | | PyObject *stream, |
385 | | const char *errors) |
386 | 0 | { |
387 | 0 | return codec_getstreamcodec(encoding, stream, errors, 2); |
388 | 0 | } |
389 | | |
390 | | PyObject *PyCodec_StreamWriter(const char *encoding, |
391 | | PyObject *stream, |
392 | | const char *errors) |
393 | 0 | { |
394 | 0 | return codec_getstreamcodec(encoding, stream, errors, 3); |
395 | 0 | } |
396 | | |
397 | | /* Encode an object (e.g. a Unicode object) using the given encoding |
398 | | and return the resulting encoded object (usually a Python string). |
399 | | |
400 | | errors is passed to the encoder factory as argument if non-NULL. */ |
401 | | |
402 | | static PyObject * |
403 | | _PyCodec_EncodeInternal(PyObject *object, |
404 | | PyObject *encoder, |
405 | | const char *encoding, |
406 | | const char *errors) |
407 | 842k | { |
408 | 842k | PyObject *args = NULL, *result = NULL; |
409 | 842k | PyObject *v = NULL; |
410 | | |
411 | 842k | args = args_tuple(object, errors); |
412 | 842k | if (args == NULL) |
413 | 0 | goto onError; |
414 | | |
415 | 842k | result = PyObject_Call(encoder, args, NULL); |
416 | 842k | if (result == NULL) { |
417 | 0 | _PyErr_FormatNote("%s with '%s' codec failed", "encoding", encoding); |
418 | 0 | goto onError; |
419 | 0 | } |
420 | | |
421 | 842k | if (!PyTuple_Check(result) || |
422 | 842k | PyTuple_GET_SIZE(result) != 2) { |
423 | 0 | PyErr_SetString(PyExc_TypeError, |
424 | 0 | "encoder must return a tuple (object, integer)"); |
425 | 0 | goto onError; |
426 | 0 | } |
427 | 842k | v = Py_NewRef(PyTuple_GET_ITEM(result,0)); |
428 | | /* We don't check or use the second (integer) entry. */ |
429 | | |
430 | 842k | Py_DECREF(args); |
431 | 842k | Py_DECREF(encoder); |
432 | 842k | Py_DECREF(result); |
433 | 842k | return v; |
434 | | |
435 | 0 | onError: |
436 | 0 | Py_XDECREF(result); |
437 | 0 | Py_XDECREF(args); |
438 | 0 | Py_XDECREF(encoder); |
439 | 0 | return NULL; |
440 | 842k | } |
441 | | |
442 | | /* Decode an object (usually a Python string) using the given encoding |
443 | | and return an equivalent object (e.g. a Unicode object). |
444 | | |
445 | | errors is passed to the decoder factory as argument if non-NULL. */ |
446 | | |
447 | | static PyObject * |
448 | | _PyCodec_DecodeInternal(PyObject *object, |
449 | | PyObject *decoder, |
450 | | const char *encoding, |
451 | | const char *errors) |
452 | 1.23M | { |
453 | 1.23M | PyObject *args = NULL, *result = NULL; |
454 | 1.23M | PyObject *v; |
455 | | |
456 | 1.23M | args = args_tuple(object, errors); |
457 | 1.23M | if (args == NULL) |
458 | 0 | goto onError; |
459 | | |
460 | 1.23M | result = PyObject_Call(decoder, args, NULL); |
461 | 1.23M | if (result == NULL) { |
462 | 72.2k | _PyErr_FormatNote("%s with '%s' codec failed", "decoding", encoding); |
463 | 72.2k | goto onError; |
464 | 72.2k | } |
465 | 1.15M | if (!PyTuple_Check(result) || |
466 | 1.15M | PyTuple_GET_SIZE(result) != 2) { |
467 | 0 | PyErr_SetString(PyExc_TypeError, |
468 | 0 | "decoder must return a tuple (object,integer)"); |
469 | 0 | goto onError; |
470 | 0 | } |
471 | 1.15M | v = Py_NewRef(PyTuple_GET_ITEM(result,0)); |
472 | | /* We don't check or use the second (integer) entry. */ |
473 | | |
474 | 1.15M | Py_DECREF(args); |
475 | 1.15M | Py_DECREF(decoder); |
476 | 1.15M | Py_DECREF(result); |
477 | 1.15M | return v; |
478 | | |
479 | 72.2k | onError: |
480 | 72.2k | Py_XDECREF(args); |
481 | 72.2k | Py_XDECREF(decoder); |
482 | 72.2k | Py_XDECREF(result); |
483 | 72.2k | return NULL; |
484 | 1.15M | } |
485 | | |
486 | | /* Generic encoding/decoding API */ |
487 | | PyObject *PyCodec_Encode(PyObject *object, |
488 | | const char *encoding, |
489 | | const char *errors) |
490 | 0 | { |
491 | 0 | PyObject *encoder; |
492 | |
|
493 | 0 | encoder = PyCodec_Encoder(encoding); |
494 | 0 | if (encoder == NULL) |
495 | 0 | return NULL; |
496 | | |
497 | 0 | return _PyCodec_EncodeInternal(object, encoder, encoding, errors); |
498 | 0 | } |
499 | | |
500 | | PyObject *PyCodec_Decode(PyObject *object, |
501 | | const char *encoding, |
502 | | const char *errors) |
503 | 0 | { |
504 | 0 | PyObject *decoder; |
505 | |
|
506 | 0 | decoder = PyCodec_Decoder(encoding); |
507 | 0 | if (decoder == NULL) |
508 | 0 | return NULL; |
509 | | |
510 | 0 | return _PyCodec_DecodeInternal(object, decoder, encoding, errors); |
511 | 0 | } |
512 | | |
513 | | /* Text encoding/decoding API */ |
514 | | PyObject * _PyCodec_LookupTextEncoding(const char *encoding, |
515 | | const char *alternate_command) |
516 | 2.08M | { |
517 | 2.08M | PyObject *codec; |
518 | 2.08M | PyObject *attr; |
519 | 2.08M | int is_text_codec; |
520 | | |
521 | 2.08M | codec = _PyCodec_Lookup(encoding); |
522 | 2.08M | if (codec == NULL) |
523 | 11.3k | return NULL; |
524 | | |
525 | | /* Backwards compatibility: assume any raw tuple describes a text |
526 | | * encoding, and the same for anything lacking the private |
527 | | * attribute. |
528 | | */ |
529 | 2.07M | if (!PyTuple_CheckExact(codec)) { |
530 | 2.07M | if (PyObject_GetOptionalAttr(codec, &_Py_ID(_is_text_encoding), &attr) < 0) { |
531 | 0 | Py_DECREF(codec); |
532 | 0 | return NULL; |
533 | 0 | } |
534 | 2.07M | if (attr != NULL) { |
535 | 2.07M | is_text_codec = PyObject_IsTrue(attr); |
536 | 2.07M | Py_DECREF(attr); |
537 | 2.07M | if (is_text_codec <= 0) { |
538 | 3.34k | Py_DECREF(codec); |
539 | 3.34k | if (!is_text_codec) { |
540 | 3.34k | if (alternate_command != NULL) { |
541 | 3.34k | PyErr_Format(PyExc_LookupError, |
542 | 3.34k | "'%.400s' is not a text encoding; " |
543 | 3.34k | "use %s to handle arbitrary codecs", |
544 | 3.34k | encoding, alternate_command); |
545 | 3.34k | } |
546 | 0 | else { |
547 | 0 | PyErr_Format(PyExc_LookupError, |
548 | 0 | "'%.400s' is not a text encoding", |
549 | 0 | encoding); |
550 | 0 | } |
551 | 3.34k | } |
552 | 3.34k | return NULL; |
553 | 3.34k | } |
554 | 2.07M | } |
555 | 2.07M | } |
556 | | |
557 | | /* This appears to be a valid text encoding */ |
558 | 2.07M | return codec; |
559 | 2.07M | } |
560 | | |
561 | | |
562 | | static |
563 | | PyObject *codec_getitem_checked(const char *encoding, |
564 | | const char *alternate_command, |
565 | | int index) |
566 | 2.08M | { |
567 | 2.08M | PyObject *codec; |
568 | 2.08M | PyObject *v; |
569 | | |
570 | 2.08M | codec = _PyCodec_LookupTextEncoding(encoding, alternate_command); |
571 | 2.08M | if (codec == NULL) |
572 | 14.6k | return NULL; |
573 | | |
574 | 2.07M | v = Py_NewRef(PyTuple_GET_ITEM(codec, index)); |
575 | 2.07M | Py_DECREF(codec); |
576 | 2.07M | return v; |
577 | 2.08M | } |
578 | | |
579 | | static PyObject * _PyCodec_TextEncoder(const char *encoding) |
580 | 842k | { |
581 | 842k | return codec_getitem_checked(encoding, "codecs.encode()", 0); |
582 | 842k | } |
583 | | |
584 | | static PyObject * _PyCodec_TextDecoder(const char *encoding) |
585 | 1.24M | { |
586 | 1.24M | return codec_getitem_checked(encoding, "codecs.decode()", 1); |
587 | 1.24M | } |
588 | | |
589 | | PyObject *_PyCodec_EncodeText(PyObject *object, |
590 | | const char *encoding, |
591 | | const char *errors) |
592 | 842k | { |
593 | 842k | PyObject *encoder; |
594 | | |
595 | 842k | encoder = _PyCodec_TextEncoder(encoding); |
596 | 842k | if (encoder == NULL) |
597 | 0 | return NULL; |
598 | | |
599 | 842k | return _PyCodec_EncodeInternal(object, encoder, encoding, errors); |
600 | 842k | } |
601 | | |
602 | | PyObject *_PyCodec_DecodeText(PyObject *object, |
603 | | const char *encoding, |
604 | | const char *errors) |
605 | 1.24M | { |
606 | 1.24M | PyObject *decoder; |
607 | | |
608 | 1.24M | decoder = _PyCodec_TextDecoder(encoding); |
609 | 1.24M | if (decoder == NULL) |
610 | 14.6k | return NULL; |
611 | | |
612 | 1.23M | return _PyCodec_DecodeInternal(object, decoder, encoding, errors); |
613 | 1.24M | } |
614 | | |
615 | | /* Register the error handling callback function error under the name |
616 | | name. This function will be called by the codec when it encounters |
617 | | an unencodable characters/undecodable bytes and doesn't know the |
618 | | callback name, when name is specified as the error parameter |
619 | | in the call to the encode/decode function. |
620 | | Return 0 on success, -1 on error */ |
621 | | int PyCodec_RegisterError(const char *name, PyObject *error) |
622 | 0 | { |
623 | 0 | PyInterpreterState *interp = _PyInterpreterState_GET(); |
624 | 0 | assert(interp->codecs.initialized); |
625 | 0 | if (!PyCallable_Check(error)) { |
626 | 0 | PyErr_SetString(PyExc_TypeError, "handler must be callable"); |
627 | 0 | return -1; |
628 | 0 | } |
629 | 0 | return PyDict_SetItemString(interp->codecs.error_registry, |
630 | 0 | name, error); |
631 | 0 | } |
632 | | |
633 | | int _PyCodec_UnregisterError(const char *name) |
634 | 0 | { |
635 | 0 | for (size_t i = 0; i < Py_ARRAY_LENGTH(codecs_builtin_error_handlers); ++i) { |
636 | 0 | if (strcmp(name, codecs_builtin_error_handlers[i]) == 0) { |
637 | 0 | PyErr_Format(PyExc_ValueError, |
638 | 0 | "cannot un-register built-in error handler '%s'", name); |
639 | 0 | return -1; |
640 | 0 | } |
641 | 0 | } |
642 | 0 | PyInterpreterState *interp = _PyInterpreterState_GET(); |
643 | 0 | assert(interp->codecs.initialized); |
644 | 0 | return PyDict_PopString(interp->codecs.error_registry, name, NULL); |
645 | 0 | } |
646 | | |
647 | | /* Lookup the error handling callback function registered under the |
648 | | name error. As a special case NULL can be passed, in which case |
649 | | the error handling callback for strict encoding will be returned. */ |
650 | | PyObject *PyCodec_LookupError(const char *name) |
651 | 3.28M | { |
652 | 3.28M | PyInterpreterState *interp = _PyInterpreterState_GET(); |
653 | 3.28M | assert(interp->codecs.initialized); |
654 | | |
655 | 3.28M | if (name==NULL) |
656 | 161k | name = "strict"; |
657 | 3.28M | PyObject *handler; |
658 | 3.28M | if (PyDict_GetItemStringRef(interp->codecs.error_registry, name, &handler) < 0) { |
659 | 0 | return NULL; |
660 | 0 | } |
661 | 3.28M | if (handler == NULL) { |
662 | 0 | PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name); |
663 | 0 | return NULL; |
664 | 0 | } |
665 | 3.28M | return handler; |
666 | 3.28M | } |
667 | | |
668 | | |
669 | | static inline void |
670 | | wrong_exception_type(PyObject *exc) |
671 | 0 | { |
672 | 0 | PyErr_Format(PyExc_TypeError, |
673 | 0 | "don't know how to handle %T in error callback", exc); |
674 | 0 | } |
675 | | |
676 | | |
/* Type checks of EXC against the three UnicodeError subclasses handled
   by the error callbacks in this file. */
#define _PyIsUnicodeEncodeError(EXC)                                    \
    PyObject_TypeCheck((EXC), (PyTypeObject *)PyExc_UnicodeEncodeError)
#define _PyIsUnicodeDecodeError(EXC)                                    \
    PyObject_TypeCheck((EXC), (PyTypeObject *)PyExc_UnicodeDecodeError)
#define _PyIsUnicodeTranslateError(EXC)                                 \
    PyObject_TypeCheck((EXC), (PyTypeObject *)PyExc_UnicodeTranslateError)
683 | | |
684 | | |
685 | | // --- codecs handlers: utilities --------------------------------------------- |
686 | | |
687 | | /* |
688 | | * Return the number of characters (including special prefixes) |
689 | | * needed to represent 'ch' by codec_handler_write_unicode_hex(). |
690 | | */ |
691 | | static inline Py_ssize_t |
692 | | codec_handler_unicode_hex_width(Py_UCS4 ch) |
693 | 0 | { |
694 | 0 | if (ch >= 0x10000) { |
695 | | // format: '\\' + 'U' + 8 hex digits |
696 | 0 | return 1 + 1 + 8; |
697 | 0 | } |
698 | 0 | else if (ch >= 0x100) { |
699 | | // format: '\\' + 'u' + 4 hex digits |
700 | 0 | return 1 + 1 + 4; |
701 | 0 | } |
702 | 0 | else { |
703 | | // format: '\\' + 'x' + 2 hex digits |
704 | 0 | return 1 + 1 + 2; |
705 | 0 | } |
706 | 0 | } |
707 | | |
708 | | |
709 | | /* |
710 | | * Write the hexadecimal representation of 'ch' to the buffer pointed by 'p' |
711 | | * using 2, 4, or 8 characters prefixed by '\x', '\u', or '\U' respectively. |
712 | | */ |
713 | | static inline void |
714 | | codec_handler_write_unicode_hex(Py_UCS1 **p, Py_UCS4 ch) |
715 | 0 | { |
716 | 0 | *(*p)++ = '\\'; |
717 | 0 | if (ch >= 0x10000) { |
718 | 0 | *(*p)++ = 'U'; |
719 | 0 | *(*p)++ = Py_hexdigits[(ch >> 28) & 0xf]; |
720 | 0 | *(*p)++ = Py_hexdigits[(ch >> 24) & 0xf]; |
721 | 0 | *(*p)++ = Py_hexdigits[(ch >> 20) & 0xf]; |
722 | 0 | *(*p)++ = Py_hexdigits[(ch >> 16) & 0xf]; |
723 | 0 | *(*p)++ = Py_hexdigits[(ch >> 12) & 0xf]; |
724 | 0 | *(*p)++ = Py_hexdigits[(ch >> 8) & 0xf]; |
725 | 0 | } |
726 | 0 | else if (ch >= 0x100) { |
727 | 0 | *(*p)++ = 'u'; |
728 | 0 | *(*p)++ = Py_hexdigits[(ch >> 12) & 0xf]; |
729 | 0 | *(*p)++ = Py_hexdigits[(ch >> 8) & 0xf]; |
730 | 0 | } |
731 | 0 | else { |
732 | 0 | *(*p)++ = 'x'; |
733 | 0 | } |
734 | 0 | *(*p)++ = Py_hexdigits[(ch >> 4) & 0xf]; |
735 | 0 | *(*p)++ = Py_hexdigits[ch & 0xf]; |
736 | 0 | } |
737 | | |
738 | | |
739 | | /* |
740 | | * Determine the number of digits for a decimal representation of Unicode |
741 | | * codepoint 'ch' (by design, Unicode codepoints are limited to 7 digits). |
742 | | */ |
743 | | static inline int |
744 | | n_decimal_digits_for_codepoint(Py_UCS4 ch) |
745 | 0 | { |
746 | 0 | if (ch < 10) return 1; |
747 | 0 | if (ch < 100) return 2; |
748 | 0 | if (ch < 1000) return 3; |
749 | 0 | if (ch < 10000) return 4; |
750 | 0 | if (ch < 100000) return 5; |
751 | 0 | if (ch < 1000000) return 6; |
752 | 0 | if (ch < 10000000) return 7; |
753 | | // Unicode codepoints are limited to 1114111 (7 decimal digits) |
754 | 0 | Py_UNREACHABLE(); |
755 | 0 | } |
756 | | |
757 | | |
758 | | /* |
759 | | * Create a Unicode string containing 'count' copies of the official |
760 | | * Unicode REPLACEMENT CHARACTER (0xFFFD). |
761 | | */ |
762 | | static PyObject * |
763 | | codec_handler_unicode_replacement_character(Py_ssize_t count) |
764 | 228k | { |
765 | 228k | PyObject *res = PyUnicode_New(count, Py_UNICODE_REPLACEMENT_CHARACTER); |
766 | 228k | if (res == NULL) { |
767 | 0 | return NULL; |
768 | 0 | } |
769 | 228k | assert(count == 0 || PyUnicode_KIND(res) == PyUnicode_2BYTE_KIND); |
770 | 228k | Py_UCS2 *outp = PyUnicode_2BYTE_DATA(res); |
771 | 456k | for (Py_ssize_t i = 0; i < count; ++i) { |
772 | 228k | outp[i] = Py_UNICODE_REPLACEMENT_CHARACTER; |
773 | 228k | } |
774 | 228k | assert(_PyUnicode_CheckConsistency(res, 1)); |
775 | 228k | return res; |
776 | 228k | } |
777 | | |
778 | | |
779 | | // --- handler: 'strict' ------------------------------------------------------ |
780 | | |
781 | | PyObject *PyCodec_StrictErrors(PyObject *exc) |
782 | 3.78M | { |
783 | 3.78M | if (PyExceptionInstance_Check(exc)) { |
784 | 3.78M | PyErr_SetObject(PyExceptionInstance_Class(exc), exc); |
785 | 3.78M | } |
786 | 0 | else { |
787 | 0 | PyErr_SetString(PyExc_TypeError, "codec must pass exception instance"); |
788 | 0 | } |
789 | 3.78M | return NULL; |
790 | 3.78M | } |
791 | | |
792 | | |
793 | | // --- handler: 'ignore' ------------------------------------------------------ |
794 | | |
795 | | static PyObject * |
796 | | _PyCodec_IgnoreError(PyObject *exc, int as_bytes) |
797 | 0 | { |
798 | 0 | Py_ssize_t end; |
799 | 0 | if (_PyUnicodeError_GetParams(exc, NULL, NULL, NULL, |
800 | 0 | &end, NULL, as_bytes) < 0) |
801 | 0 | { |
802 | 0 | return NULL; |
803 | 0 | } |
804 | 0 | return Py_BuildValue("(Nn)", Py_GetConstant(Py_CONSTANT_EMPTY_STR), end); |
805 | 0 | } |
806 | | |
807 | | |
808 | | PyObject *PyCodec_IgnoreErrors(PyObject *exc) |
809 | 0 | { |
810 | 0 | if (_PyIsUnicodeEncodeError(exc) || _PyIsUnicodeTranslateError(exc)) { |
811 | 0 | return _PyCodec_IgnoreError(exc, false); |
812 | 0 | } |
813 | 0 | else if (_PyIsUnicodeDecodeError(exc)) { |
814 | 0 | return _PyCodec_IgnoreError(exc, true); |
815 | 0 | } |
816 | 0 | else { |
817 | 0 | wrong_exception_type(exc); |
818 | 0 | return NULL; |
819 | 0 | } |
820 | 0 | } |
821 | | |
822 | | |
823 | | // --- handler: 'replace' ----------------------------------------------------- |
824 | | |
825 | | static PyObject * |
826 | | _PyCodec_ReplaceUnicodeEncodeError(PyObject *exc) |
827 | 0 | { |
828 | 0 | Py_ssize_t start, end, slen; |
829 | 0 | if (_PyUnicodeError_GetParams(exc, NULL, NULL, |
830 | 0 | &start, &end, &slen, false) < 0) |
831 | 0 | { |
832 | 0 | return NULL; |
833 | 0 | } |
834 | 0 | PyObject *res = PyUnicode_New(slen, '?'); |
835 | 0 | if (res == NULL) { |
836 | 0 | return NULL; |
837 | 0 | } |
838 | 0 | assert(PyUnicode_KIND(res) == PyUnicode_1BYTE_KIND); |
839 | 0 | Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res); |
840 | 0 | memset(outp, '?', sizeof(Py_UCS1) * slen); |
841 | 0 | assert(_PyUnicode_CheckConsistency(res, 1)); |
842 | 0 | return Py_BuildValue("(Nn)", res, end); |
843 | 0 | } |
844 | | |
845 | | |
846 | | static PyObject * |
847 | | _PyCodec_ReplaceUnicodeDecodeError(PyObject *exc) |
848 | 228k | { |
849 | 228k | Py_ssize_t end; |
850 | 228k | if (PyUnicodeDecodeError_GetEnd(exc, &end) < 0) { |
851 | 0 | return NULL; |
852 | 0 | } |
853 | 228k | PyObject *res = codec_handler_unicode_replacement_character(1); |
854 | 228k | if (res == NULL) { |
855 | 0 | return NULL; |
856 | 0 | } |
857 | 228k | return Py_BuildValue("(Nn)", res, end); |
858 | 228k | } |
859 | | |
860 | | |
861 | | static PyObject * |
862 | | _PyCodec_ReplaceUnicodeTranslateError(PyObject *exc) |
863 | 0 | { |
864 | 0 | Py_ssize_t start, end, slen; |
865 | 0 | if (_PyUnicodeError_GetParams(exc, NULL, NULL, |
866 | 0 | &start, &end, &slen, false) < 0) |
867 | 0 | { |
868 | 0 | return NULL; |
869 | 0 | } |
870 | 0 | PyObject *res = codec_handler_unicode_replacement_character(slen); |
871 | 0 | if (res == NULL) { |
872 | 0 | return NULL; |
873 | 0 | } |
874 | 0 | return Py_BuildValue("(Nn)", res, end); |
875 | 0 | } |
876 | | |
877 | | |
878 | | PyObject *PyCodec_ReplaceErrors(PyObject *exc) |
879 | 228k | { |
880 | 228k | if (_PyIsUnicodeEncodeError(exc)) { |
881 | 0 | return _PyCodec_ReplaceUnicodeEncodeError(exc); |
882 | 0 | } |
883 | 228k | else if (_PyIsUnicodeDecodeError(exc)) { |
884 | 228k | return _PyCodec_ReplaceUnicodeDecodeError(exc); |
885 | 228k | } |
886 | 0 | else if (_PyIsUnicodeTranslateError(exc)) { |
887 | 0 | return _PyCodec_ReplaceUnicodeTranslateError(exc); |
888 | 0 | } |
889 | 0 | else { |
890 | 0 | wrong_exception_type(exc); |
891 | 0 | return NULL; |
892 | 0 | } |
893 | 228k | } |
894 | | |
895 | | |
896 | | // --- handler: 'xmlcharrefreplace' ------------------------------------------- |
897 | | |
898 | | PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc) |
899 | 0 | { |
900 | 0 | if (!_PyIsUnicodeEncodeError(exc)) { |
901 | 0 | wrong_exception_type(exc); |
902 | 0 | return NULL; |
903 | 0 | } |
904 | | |
905 | 0 | PyObject *obj; |
906 | 0 | Py_ssize_t objlen, start, end, slen; |
907 | 0 | if (_PyUnicodeError_GetParams(exc, |
908 | 0 | &obj, &objlen, |
909 | 0 | &start, &end, &slen, false) < 0) |
910 | 0 | { |
911 | 0 | return NULL; |
912 | 0 | } |
913 | | |
914 | | // The number of characters that each character 'ch' contributes |
915 | | // in the result is 2 + k + 1, where k = min{t >= 1 | 10^t > ch} |
916 | | // and will be formatted as "&#" + DIGITS + ";". Since the Unicode |
917 | | // range is below 10^7, each "block" requires at most 2 + 7 + 1 |
918 | | // characters. |
919 | 0 | if (slen > PY_SSIZE_T_MAX / (2 + 7 + 1)) { |
920 | 0 | end = start + PY_SSIZE_T_MAX / (2 + 7 + 1); |
921 | 0 | end = Py_MIN(end, objlen); |
922 | 0 | slen = Py_MAX(0, end - start); |
923 | 0 | } |
924 | |
|
925 | 0 | Py_ssize_t ressize = 0; |
926 | 0 | for (Py_ssize_t i = start; i < end; ++i) { |
927 | 0 | Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i); |
928 | 0 | int k = n_decimal_digits_for_codepoint(ch); |
929 | 0 | assert(k != 0); |
930 | 0 | assert(k <= 7); |
931 | 0 | ressize += 2 + k + 1; |
932 | 0 | } |
933 | | |
934 | | /* allocate replacement */ |
935 | 0 | PyObject *res = PyUnicode_New(ressize, 127); |
936 | 0 | if (res == NULL) { |
937 | 0 | Py_DECREF(obj); |
938 | 0 | return NULL; |
939 | 0 | } |
940 | 0 | Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res); |
941 | | /* generate replacement */ |
942 | 0 | for (Py_ssize_t i = start; i < end; ++i) { |
943 | 0 | Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i); |
944 | | /* |
945 | | * Write the decimal representation of 'ch' to the buffer pointed by 'p' |
946 | | * using at most 7 characters prefixed by '&#' and suffixed by ';'. |
947 | | */ |
948 | 0 | *outp++ = '&'; |
949 | 0 | *outp++ = '#'; |
950 | 0 | Py_UCS1 *digit_end = outp + n_decimal_digits_for_codepoint(ch); |
951 | 0 | for (Py_UCS1 *p_digit = digit_end - 1; p_digit >= outp; --p_digit) { |
952 | 0 | *p_digit = '0' + (ch % 10); |
953 | 0 | ch /= 10; |
954 | 0 | } |
955 | 0 | assert(ch == 0); |
956 | 0 | outp = digit_end; |
957 | 0 | *outp++ = ';'; |
958 | 0 | } |
959 | 0 | assert(_PyUnicode_CheckConsistency(res, 1)); |
960 | 0 | PyObject *restuple = Py_BuildValue("(Nn)", res, end); |
961 | 0 | Py_DECREF(obj); |
962 | 0 | return restuple; |
963 | 0 | } |
964 | | |
965 | | |
966 | | // --- handler: 'backslashreplace' -------------------------------------------- |
967 | | |
968 | | static PyObject * |
969 | | _PyCodec_BackslashReplaceUnicodeEncodeError(PyObject *exc) |
970 | 0 | { |
971 | 0 | PyObject *obj; |
972 | 0 | Py_ssize_t objlen, start, end, slen; |
973 | 0 | if (_PyUnicodeError_GetParams(exc, |
974 | 0 | &obj, &objlen, |
975 | 0 | &start, &end, &slen, false) < 0) |
976 | 0 | { |
977 | 0 | return NULL; |
978 | 0 | } |
979 | | |
980 | | // The number of characters that each character 'ch' contributes |
981 | | // in the result is 1 + 1 + k, where k >= min{t >= 1 | 16^t > ch} |
982 | | // and will be formatted as "\\" + ('U'|'u'|'x') + HEXDIGITS, |
983 | | // where the number of hexdigits is either 2, 4, or 8 (not 6). |
984 | | // Since the Unicode range is below 10^7, we choose k = 8 whence |
985 | | // each "block" requires at most 1 + 1 + 8 characters. |
986 | 0 | if (slen > PY_SSIZE_T_MAX / (1 + 1 + 8)) { |
987 | 0 | end = start + PY_SSIZE_T_MAX / (1 + 1 + 8); |
988 | 0 | end = Py_MIN(end, objlen); |
989 | 0 | slen = Py_MAX(0, end - start); |
990 | 0 | } |
991 | |
|
992 | 0 | Py_ssize_t ressize = 0; |
993 | 0 | for (Py_ssize_t i = start; i < end; ++i) { |
994 | 0 | Py_UCS4 c = PyUnicode_READ_CHAR(obj, i); |
995 | 0 | ressize += codec_handler_unicode_hex_width(c); |
996 | 0 | } |
997 | 0 | PyObject *res = PyUnicode_New(ressize, 127); |
998 | 0 | if (res == NULL) { |
999 | 0 | Py_DECREF(obj); |
1000 | 0 | return NULL; |
1001 | 0 | } |
1002 | 0 | Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res); |
1003 | 0 | for (Py_ssize_t i = start; i < end; ++i) { |
1004 | 0 | Py_UCS4 c = PyUnicode_READ_CHAR(obj, i); |
1005 | 0 | codec_handler_write_unicode_hex(&outp, c); |
1006 | 0 | } |
1007 | 0 | assert(_PyUnicode_CheckConsistency(res, 1)); |
1008 | 0 | Py_DECREF(obj); |
1009 | 0 | return Py_BuildValue("(Nn)", res, end); |
1010 | 0 | } |
1011 | | |
1012 | | |
1013 | | static PyObject * |
1014 | | _PyCodec_BackslashReplaceUnicodeDecodeError(PyObject *exc) |
1015 | 0 | { |
1016 | 0 | PyObject *obj; |
1017 | 0 | Py_ssize_t objlen, start, end, slen; |
1018 | 0 | if (_PyUnicodeError_GetParams(exc, |
1019 | 0 | &obj, &objlen, |
1020 | 0 | &start, &end, &slen, true) < 0) |
1021 | 0 | { |
1022 | 0 | return NULL; |
1023 | 0 | } |
1024 | | |
1025 | 0 | PyObject *res = PyUnicode_New(4 * slen, 127); |
1026 | 0 | if (res == NULL) { |
1027 | 0 | Py_DECREF(obj); |
1028 | 0 | return NULL; |
1029 | 0 | } |
1030 | | |
1031 | 0 | Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res); |
1032 | 0 | const unsigned char *p = (const unsigned char *)PyBytes_AS_STRING(obj); |
1033 | 0 | for (Py_ssize_t i = start; i < end; i++, outp += 4) { |
1034 | 0 | const unsigned char ch = p[i]; |
1035 | 0 | outp[0] = '\\'; |
1036 | 0 | outp[1] = 'x'; |
1037 | 0 | outp[2] = Py_hexdigits[(ch >> 4) & 0xf]; |
1038 | 0 | outp[3] = Py_hexdigits[ch & 0xf]; |
1039 | 0 | } |
1040 | 0 | assert(_PyUnicode_CheckConsistency(res, 1)); |
1041 | 0 | Py_DECREF(obj); |
1042 | 0 | return Py_BuildValue("(Nn)", res, end); |
1043 | 0 | } |
1044 | | |
1045 | | |
/*
 * 'backslashreplace' handling for a UnicodeTranslateError.
 *
 * Forwards 'exc' unchanged to the UnicodeEncodeError implementation and
 * returns whatever that helper returns.
 */
static inline PyObject *
_PyCodec_BackslashReplaceUnicodeTranslateError(PyObject *exc)
{
    // Same implementation as for UnicodeEncodeError objects.
    return _PyCodec_BackslashReplaceUnicodeEncodeError(exc);
}
1052 | | |
1053 | | |
1054 | | PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) |
1055 | 0 | { |
1056 | 0 | if (_PyIsUnicodeEncodeError(exc)) { |
1057 | 0 | return _PyCodec_BackslashReplaceUnicodeEncodeError(exc); |
1058 | 0 | } |
1059 | 0 | else if (_PyIsUnicodeDecodeError(exc)) { |
1060 | 0 | return _PyCodec_BackslashReplaceUnicodeDecodeError(exc); |
1061 | 0 | } |
1062 | 0 | else if (_PyIsUnicodeTranslateError(exc)) { |
1063 | 0 | return _PyCodec_BackslashReplaceUnicodeTranslateError(exc); |
1064 | 0 | } |
1065 | 0 | else { |
1066 | 0 | wrong_exception_type(exc); |
1067 | 0 | return NULL; |
1068 | 0 | } |
1069 | 0 | } |
1070 | | |
1071 | | |
1072 | | // --- handler: 'namereplace' ------------------------------------------------- |
1073 | | |
1074 | | PyObject *PyCodec_NameReplaceErrors(PyObject *exc) |
1075 | 0 | { |
1076 | 0 | if (!_PyIsUnicodeEncodeError(exc)) { |
1077 | 0 | wrong_exception_type(exc); |
1078 | 0 | return NULL; |
1079 | 0 | } |
1080 | | |
1081 | 0 | _PyUnicode_Name_CAPI *ucnhash_capi = _PyUnicode_GetNameCAPI(); |
1082 | 0 | if (ucnhash_capi == NULL) { |
1083 | 0 | return NULL; |
1084 | 0 | } |
1085 | | |
1086 | 0 | PyObject *obj; |
1087 | 0 | Py_ssize_t start, end; |
1088 | 0 | if (_PyUnicodeError_GetParams(exc, |
1089 | 0 | &obj, NULL, |
1090 | 0 | &start, &end, NULL, false) < 0) |
1091 | 0 | { |
1092 | 0 | return NULL; |
1093 | 0 | } |
1094 | | |
1095 | 0 | char buffer[256]; /* NAME_MAXLEN in unicodename_db.h */ |
1096 | 0 | Py_ssize_t imax = start, ressize = 0, replsize; |
1097 | 0 | for (; imax < end; ++imax) { |
1098 | 0 | Py_UCS4 c = PyUnicode_READ_CHAR(obj, imax); |
1099 | 0 | if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) { |
1100 | | // If 'c' is recognized by getname(), the corresponding replacement |
1101 | | // is '\\' + 'N' + '{' + NAME + '}', i.e. 1 + 1 + 1 + len(NAME) + 1 |
1102 | | // characters. Failures of getname() are ignored by the handler. |
1103 | 0 | replsize = 1 + 1 + 1 + strlen(buffer) + 1; |
1104 | 0 | } |
1105 | 0 | else { |
1106 | 0 | replsize = codec_handler_unicode_hex_width(c); |
1107 | 0 | } |
1108 | 0 | if (ressize > PY_SSIZE_T_MAX - replsize) { |
1109 | 0 | break; |
1110 | 0 | } |
1111 | 0 | ressize += replsize; |
1112 | 0 | } |
1113 | |
|
1114 | 0 | PyObject *res = PyUnicode_New(ressize, 127); |
1115 | 0 | if (res == NULL) { |
1116 | 0 | Py_DECREF(obj); |
1117 | 0 | return NULL; |
1118 | 0 | } |
1119 | | |
1120 | 0 | Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res); |
1121 | 0 | for (Py_ssize_t i = start; i < imax; ++i) { |
1122 | 0 | Py_UCS4 c = PyUnicode_READ_CHAR(obj, i); |
1123 | 0 | if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) { |
1124 | 0 | *outp++ = '\\'; |
1125 | 0 | *outp++ = 'N'; |
1126 | 0 | *outp++ = '{'; |
1127 | 0 | (void)strcpy((char *)outp, buffer); |
1128 | 0 | outp += strlen(buffer); |
1129 | 0 | *outp++ = '}'; |
1130 | 0 | } |
1131 | 0 | else { |
1132 | 0 | codec_handler_write_unicode_hex(&outp, c); |
1133 | 0 | } |
1134 | 0 | } |
1135 | |
|
1136 | 0 | assert(outp == PyUnicode_1BYTE_DATA(res) + ressize); |
1137 | 0 | assert(_PyUnicode_CheckConsistency(res, 1)); |
1138 | 0 | PyObject *restuple = Py_BuildValue("(Nn)", res, imax); |
1139 | 0 | Py_DECREF(obj); |
1140 | 0 | return restuple; |
1141 | 0 | } |
1142 | | |
1143 | | |
#define ENC_UNKNOWN     -1
#define ENC_UTF8        0
#define ENC_UTF16BE     1
#define ENC_UTF16LE     2
#define ENC_UTF32BE     3
#define ENC_UTF32LE     4

/*
 * Map an encoding name to one of the ENC_* codes.
 *
 * Recognized spellings (the "utf" prefix and the "be"/"le" suffix are
 * matched case-insensitively, '-' and '_' are interchangeable separators
 * and may be omitted):
 *
 *   utf-8, cp65001              -> ENC_UTF8,               *bytelength = 3
 *   utf-16, utf-16-be/-le       -> ENC_UTF16BE/ENC_UTF16LE, *bytelength = 2
 *   utf-32, utf-32-be/-le       -> ENC_UTF32BE/ENC_UTF32LE, *bytelength = 4
 *
 * Without an explicit suffix, utf-16 and utf-32 resolve to the native
 * byte order (WORDS_BIGENDIAN selects the BE variant).
 *
 * Returns ENC_UNKNOWN for anything else. In that case '*bytelength' must
 * not be relied upon: it is left untouched for most names but is already
 * written for names starting with "utf-16"/"utf-32" that carry an
 * unrecognized suffix.
 *
 * 'encoding' must be a NUL-terminated string.
 */
static int
get_standard_encoding_impl(const char *encoding, int *bytelength)
{
    if (Py_TOLOWER(encoding[0]) == 'u' &&
        Py_TOLOWER(encoding[1]) == 't' &&
        Py_TOLOWER(encoding[2]) == 'f') {
        encoding += 3;
        if (*encoding == '-' || *encoding == '_' )
            encoding++;
        if (encoding[0] == '8' && encoding[1] == '\0') {
            *bytelength = 3;
            return ENC_UTF8;
        }
        else if (encoding[0] == '1' && encoding[1] == '6') {
            encoding += 2;
            *bytelength = 2;
            if (*encoding == '\0') {
#ifdef WORDS_BIGENDIAN
                return ENC_UTF16BE;
#else
                return ENC_UTF16LE;
#endif
            }
            if (*encoding == '-' || *encoding == '_' )
                encoding++;
            // Check encoding[0] first: for a name such as "utf-16-" we are
            // now sitting on the terminating NUL and encoding[1] would be
            // a read past the end of the string.
            if (encoding[0] != '\0' &&
                Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
                if (Py_TOLOWER(encoding[0]) == 'b')
                    return ENC_UTF16BE;
                if (Py_TOLOWER(encoding[0]) == 'l')
                    return ENC_UTF16LE;
            }
        }
        else if (encoding[0] == '3' && encoding[1] == '2') {
            encoding += 2;
            *bytelength = 4;
            if (*encoding == '\0') {
#ifdef WORDS_BIGENDIAN
                return ENC_UTF32BE;
#else
                return ENC_UTF32LE;
#endif
            }
            if (*encoding == '-' || *encoding == '_' )
                encoding++;
            // Same guard as for utf-16: do not look past the terminator.
            if (encoding[0] != '\0' &&
                Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
                if (Py_TOLOWER(encoding[0]) == 'b')
                    return ENC_UTF32BE;
                if (Py_TOLOWER(encoding[0]) == 'l')
                    return ENC_UTF32LE;
            }
        }
    }
    else if (strcmp(encoding, "cp65001") == 0) {
        *bytelength = 3;
        return ENC_UTF8;
    }
    return ENC_UNKNOWN;
}
1209 | | |
1210 | | |
1211 | | static int |
1212 | | get_standard_encoding(PyObject *encoding, int *code, int *bytelength) |
1213 | 8 | { |
1214 | 8 | const char *encoding_cstr = PyUnicode_AsUTF8(encoding); |
1215 | 8 | if (encoding_cstr == NULL) { |
1216 | 0 | return -1; |
1217 | 0 | } |
1218 | 8 | *code = get_standard_encoding_impl(encoding_cstr, bytelength); |
1219 | 8 | return 0; |
1220 | 8 | } |
1221 | | |
1222 | | |
1223 | | // --- handler: 'surrogatepass' ----------------------------------------------- |
1224 | | |
1225 | | static PyObject * |
1226 | | _PyCodec_SurrogatePassUnicodeEncodeError(PyObject *exc) |
1227 | 0 | { |
1228 | 0 | PyObject *encoding = PyUnicodeEncodeError_GetEncoding(exc); |
1229 | 0 | if (encoding == NULL) { |
1230 | 0 | return NULL; |
1231 | 0 | } |
1232 | 0 | int code, bytelength; |
1233 | 0 | int rc = get_standard_encoding(encoding, &code, &bytelength); |
1234 | 0 | Py_DECREF(encoding); |
1235 | 0 | if (rc < 0) { |
1236 | 0 | return NULL; |
1237 | 0 | } |
1238 | 0 | if (code == ENC_UNKNOWN) { |
1239 | 0 | goto bail; |
1240 | 0 | } |
1241 | | |
1242 | 0 | PyObject *obj; |
1243 | 0 | Py_ssize_t objlen, start, end, slen; |
1244 | 0 | if (_PyUnicodeError_GetParams(exc, |
1245 | 0 | &obj, &objlen, |
1246 | 0 | &start, &end, &slen, false) < 0) |
1247 | 0 | { |
1248 | 0 | return NULL; |
1249 | 0 | } |
1250 | | |
1251 | 0 | if (slen > PY_SSIZE_T_MAX / bytelength) { |
1252 | 0 | end = start + PY_SSIZE_T_MAX / bytelength; |
1253 | 0 | end = Py_MIN(end, objlen); |
1254 | 0 | slen = Py_MAX(0, end - start); |
1255 | 0 | } |
1256 | |
|
1257 | 0 | PyObject *res = PyBytes_FromStringAndSize(NULL, bytelength * slen); |
1258 | 0 | if (res == NULL) { |
1259 | 0 | Py_DECREF(obj); |
1260 | 0 | return NULL; |
1261 | 0 | } |
1262 | | |
1263 | 0 | unsigned char *outp = (unsigned char *)PyBytes_AsString(res); |
1264 | 0 | for (Py_ssize_t i = start; i < end; i++) { |
1265 | 0 | Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i); |
1266 | 0 | if (!Py_UNICODE_IS_SURROGATE(ch)) { |
1267 | | /* Not a surrogate, fail with original exception */ |
1268 | 0 | Py_DECREF(obj); |
1269 | 0 | Py_DECREF(res); |
1270 | 0 | goto bail; |
1271 | 0 | } |
1272 | 0 | switch (code) { |
1273 | 0 | case ENC_UTF8: { |
1274 | 0 | *outp++ = (unsigned char)(0xe0 | (ch >> 12)); |
1275 | 0 | *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f)); |
1276 | 0 | *outp++ = (unsigned char)(0x80 | (ch & 0x3f)); |
1277 | 0 | break; |
1278 | 0 | } |
1279 | 0 | case ENC_UTF16LE: { |
1280 | 0 | *outp++ = (unsigned char)ch; |
1281 | 0 | *outp++ = (unsigned char)(ch >> 8); |
1282 | 0 | break; |
1283 | 0 | } |
1284 | 0 | case ENC_UTF16BE: { |
1285 | 0 | *outp++ = (unsigned char)(ch >> 8); |
1286 | 0 | *outp++ = (unsigned char)ch; |
1287 | 0 | break; |
1288 | 0 | } |
1289 | 0 | case ENC_UTF32LE: { |
1290 | 0 | *outp++ = (unsigned char)ch; |
1291 | 0 | *outp++ = (unsigned char)(ch >> 8); |
1292 | 0 | *outp++ = (unsigned char)(ch >> 16); |
1293 | 0 | *outp++ = (unsigned char)(ch >> 24); |
1294 | 0 | break; |
1295 | 0 | } |
1296 | 0 | case ENC_UTF32BE: { |
1297 | 0 | *outp++ = (unsigned char)(ch >> 24); |
1298 | 0 | *outp++ = (unsigned char)(ch >> 16); |
1299 | 0 | *outp++ = (unsigned char)(ch >> 8); |
1300 | 0 | *outp++ = (unsigned char)ch; |
1301 | 0 | break; |
1302 | 0 | } |
1303 | 0 | } |
1304 | 0 | } |
1305 | | |
1306 | 0 | Py_DECREF(obj); |
1307 | 0 | PyObject *restuple = Py_BuildValue("(Nn)", res, end); |
1308 | 0 | return restuple; |
1309 | | |
1310 | 0 | bail: |
1311 | 0 | PyErr_SetObject(PyExceptionInstance_Class(exc), exc); |
1312 | 0 | return NULL; |
1313 | 0 | } |
1314 | | |
1315 | | |
1316 | | static PyObject * |
1317 | | _PyCodec_SurrogatePassUnicodeDecodeError(PyObject *exc) |
1318 | 8 | { |
1319 | 8 | PyObject *encoding = PyUnicodeDecodeError_GetEncoding(exc); |
1320 | 8 | if (encoding == NULL) { |
1321 | 0 | return NULL; |
1322 | 0 | } |
1323 | 8 | int code, bytelength; |
1324 | 8 | int rc = get_standard_encoding(encoding, &code, &bytelength); |
1325 | 8 | Py_DECREF(encoding); |
1326 | 8 | if (rc < 0) { |
1327 | 0 | return NULL; |
1328 | 0 | } |
1329 | 8 | if (code == ENC_UNKNOWN) { |
1330 | 0 | goto bail; |
1331 | 0 | } |
1332 | | |
1333 | 8 | PyObject *obj; |
1334 | 8 | Py_ssize_t objlen, start, end, slen; |
1335 | 8 | if (_PyUnicodeError_GetParams(exc, |
1336 | 8 | &obj, &objlen, |
1337 | 8 | &start, &end, &slen, true) < 0) |
1338 | 0 | { |
1339 | 0 | return NULL; |
1340 | 0 | } |
1341 | | |
1342 | | /* Try decoding a single surrogate character. If |
1343 | | there are more, let the codec call us again. */ |
1344 | 8 | Py_UCS4 ch = 0; |
1345 | 8 | const unsigned char *p = (const unsigned char *)PyBytes_AS_STRING(obj); |
1346 | 8 | p += start; |
1347 | | |
1348 | 8 | if (objlen - start >= bytelength) { |
1349 | 8 | switch (code) { |
1350 | 8 | case ENC_UTF8: { |
1351 | 8 | if ((p[0] & 0xf0) == 0xe0 && |
1352 | 8 | (p[1] & 0xc0) == 0x80 && |
1353 | 8 | (p[2] & 0xc0) == 0x80) |
1354 | 8 | { |
1355 | | /* it's a three-byte code */ |
1356 | 8 | ch = ((p[0] & 0x0f) << 12) + |
1357 | 8 | ((p[1] & 0x3f) << 6) + |
1358 | 8 | (p[2] & 0x3f); |
1359 | 8 | } |
1360 | 8 | break; |
1361 | 0 | } |
1362 | 0 | case ENC_UTF16LE: { |
1363 | 0 | ch = p[1] << 8 | p[0]; |
1364 | 0 | break; |
1365 | 0 | } |
1366 | 0 | case ENC_UTF16BE: { |
1367 | 0 | ch = p[0] << 8 | p[1]; |
1368 | 0 | break; |
1369 | 0 | } |
1370 | 0 | case ENC_UTF32LE: { |
1371 | 0 | ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0]; |
1372 | 0 | break; |
1373 | 0 | } |
1374 | 0 | case ENC_UTF32BE: { |
1375 | 0 | ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3]; |
1376 | 0 | break; |
1377 | 0 | } |
1378 | 8 | } |
1379 | 8 | } |
1380 | 8 | Py_DECREF(obj); |
1381 | 8 | if (!Py_UNICODE_IS_SURROGATE(ch)) { |
1382 | 0 | goto bail; |
1383 | 0 | } |
1384 | | |
1385 | 8 | PyObject *res = PyUnicode_FromOrdinal(ch); |
1386 | 8 | if (res == NULL) { |
1387 | 0 | return NULL; |
1388 | 0 | } |
1389 | 8 | return Py_BuildValue("(Nn)", res, start + bytelength); |
1390 | | |
1391 | 0 | bail: |
1392 | 0 | PyErr_SetObject(PyExceptionInstance_Class(exc), exc); |
1393 | 0 | return NULL; |
1394 | 8 | } |
1395 | | |
1396 | | |
1397 | | /* This handler is declared static until someone demonstrates |
1398 | | a need to call it directly. */ |
1399 | | static PyObject * |
1400 | | PyCodec_SurrogatePassErrors(PyObject *exc) |
1401 | 8 | { |
1402 | 8 | if (_PyIsUnicodeEncodeError(exc)) { |
1403 | 0 | return _PyCodec_SurrogatePassUnicodeEncodeError(exc); |
1404 | 0 | } |
1405 | 8 | else if (_PyIsUnicodeDecodeError(exc)) { |
1406 | 8 | return _PyCodec_SurrogatePassUnicodeDecodeError(exc); |
1407 | 8 | } |
1408 | 0 | else { |
1409 | 0 | wrong_exception_type(exc); |
1410 | 0 | return NULL; |
1411 | 0 | } |
1412 | 8 | } |
1413 | | |
1414 | | |
1415 | | // --- handler: 'surrogateescape' --------------------------------------------- |
1416 | | |
1417 | | static PyObject * |
1418 | | _PyCodec_SurrogateEscapeUnicodeEncodeError(PyObject *exc) |
1419 | 11.6k | { |
1420 | 11.6k | PyObject *obj; |
1421 | 11.6k | Py_ssize_t start, end, slen; |
1422 | 11.6k | if (_PyUnicodeError_GetParams(exc, |
1423 | 11.6k | &obj, NULL, |
1424 | 11.6k | &start, &end, &slen, false) < 0) |
1425 | 0 | { |
1426 | 0 | return NULL; |
1427 | 0 | } |
1428 | | |
1429 | 11.6k | PyObject *res = PyBytes_FromStringAndSize(NULL, slen); |
1430 | 11.6k | if (res == NULL) { |
1431 | 0 | Py_DECREF(obj); |
1432 | 0 | return NULL; |
1433 | 0 | } |
1434 | | |
1435 | 11.6k | char *outp = PyBytes_AsString(res); |
1436 | 11.6k | for (Py_ssize_t i = start; i < end; i++) { |
1437 | 11.6k | Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i); |
1438 | 11.6k | if (ch < 0xdc80 || ch > 0xdcff) { |
1439 | | /* Not a UTF-8b surrogate, fail with original exception. */ |
1440 | 11.6k | Py_DECREF(obj); |
1441 | 11.6k | Py_DECREF(res); |
1442 | 11.6k | PyErr_SetObject(PyExceptionInstance_Class(exc), exc); |
1443 | 11.6k | return NULL; |
1444 | 11.6k | } |
1445 | 0 | *outp++ = ch - 0xdc00; |
1446 | 0 | } |
1447 | 0 | Py_DECREF(obj); |
1448 | |
|
1449 | 0 | return Py_BuildValue("(Nn)", res, end); |
1450 | 11.6k | } |
1451 | | |
1452 | | |
1453 | | static PyObject * |
1454 | | _PyCodec_SurrogateEscapeUnicodeDecodeError(PyObject *exc) |
1455 | 72.0k | { |
1456 | 72.0k | PyObject *obj; |
1457 | 72.0k | Py_ssize_t start, end, slen; |
1458 | 72.0k | if (_PyUnicodeError_GetParams(exc, |
1459 | 72.0k | &obj, NULL, |
1460 | 72.0k | &start, &end, &slen, true) < 0) |
1461 | 0 | { |
1462 | 0 | return NULL; |
1463 | 0 | } |
1464 | | |
1465 | 72.0k | Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */ |
1466 | 72.0k | int consumed = 0; |
1467 | 72.0k | const unsigned char *p = (const unsigned char *)PyBytes_AS_STRING(obj); |
1468 | 162k | while (consumed < 4 && consumed < slen) { |
1469 | | /* Refuse to escape ASCII bytes. */ |
1470 | 132k | if (p[start + consumed] < 128) { |
1471 | 41.9k | break; |
1472 | 41.9k | } |
1473 | 90.8k | ch[consumed] = 0xdc00 + p[start + consumed]; |
1474 | 90.8k | consumed++; |
1475 | 90.8k | } |
1476 | 72.0k | Py_DECREF(obj); |
1477 | | |
1478 | 72.0k | if (consumed == 0) { |
1479 | | /* Codec complained about ASCII byte. */ |
1480 | 23.6k | PyErr_SetObject(PyExceptionInstance_Class(exc), exc); |
1481 | 23.6k | return NULL; |
1482 | 23.6k | } |
1483 | | |
1484 | 48.3k | PyObject *str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed); |
1485 | 48.3k | if (str == NULL) { |
1486 | 0 | return NULL; |
1487 | 0 | } |
1488 | 48.3k | return Py_BuildValue("(Nn)", str, start + consumed); |
1489 | 48.3k | } |
1490 | | |
1491 | | |
1492 | | static PyObject * |
1493 | | PyCodec_SurrogateEscapeErrors(PyObject *exc) |
1494 | 83.7k | { |
1495 | 83.7k | if (_PyIsUnicodeEncodeError(exc)) { |
1496 | 11.6k | return _PyCodec_SurrogateEscapeUnicodeEncodeError(exc); |
1497 | 11.6k | } |
1498 | 72.0k | else if (_PyIsUnicodeDecodeError(exc)) { |
1499 | 72.0k | return _PyCodec_SurrogateEscapeUnicodeDecodeError(exc); |
1500 | 72.0k | } |
1501 | 0 | else { |
1502 | 0 | wrong_exception_type(exc); |
1503 | 0 | return NULL; |
1504 | 0 | } |
1505 | 83.7k | } |
1506 | | |
1507 | | |
1508 | | // --- Codecs registry handlers ----------------------------------------------- |
1509 | | |
/* Method-table trampoline: ignores 'self' and forwards 'exc' to
   PyCodec_StrictErrors(). */
static inline PyObject *
strict_errors(PyObject *Py_UNUSED(self), PyObject *exc)
{
    return PyCodec_StrictErrors(exc);
}
1515 | | |
1516 | | |
/* Method-table trampoline: ignores 'self' and forwards 'exc' to
   PyCodec_IgnoreErrors(). */
static inline PyObject *
ignore_errors(PyObject *Py_UNUSED(self), PyObject *exc)
{
    return PyCodec_IgnoreErrors(exc);
}
1522 | | |
1523 | | |
/* Method-table trampoline: ignores 'self' and forwards 'exc' to
   PyCodec_ReplaceErrors(). */
static inline PyObject *
replace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
{
    return PyCodec_ReplaceErrors(exc);
}
1529 | | |
1530 | | |
/* Method-table trampoline: ignores 'self' and forwards 'exc' to
   PyCodec_XMLCharRefReplaceErrors(). */
static inline PyObject *
xmlcharrefreplace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
{
    return PyCodec_XMLCharRefReplaceErrors(exc);
}
1536 | | |
1537 | | |
/* Method-table trampoline: ignores 'self' and forwards 'exc' to
   PyCodec_BackslashReplaceErrors(). */
static inline PyObject *
backslashreplace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
{
    return PyCodec_BackslashReplaceErrors(exc);
}
1543 | | |
1544 | | |
/* Method-table trampoline: ignores 'self' and forwards 'exc' to
   PyCodec_NameReplaceErrors(). */
static inline PyObject *
namereplace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
{
    return PyCodec_NameReplaceErrors(exc);
}
1550 | | |
1551 | | |
/* Method-table trampoline: ignores 'self' and forwards 'exc' to
   PyCodec_SurrogatePassErrors(). */
static inline PyObject *
surrogatepass_errors(PyObject *Py_UNUSED(self), PyObject *exc)
{
    return PyCodec_SurrogatePassErrors(exc);
}
1557 | | |
1558 | | |
/* Method-table trampoline: ignores 'self' and forwards 'exc' to
   PyCodec_SurrogateEscapeErrors(). */
static inline PyObject *
surrogateescape_errors(PyObject *Py_UNUSED(self), PyObject *exc)
{
    return PyCodec_SurrogateEscapeErrors(exc);
}
1564 | | |
1565 | | |
1566 | | PyStatus |
1567 | | _PyCodec_InitRegistry(PyInterpreterState *interp) |
1568 | 36 | { |
1569 | 36 | static struct { |
1570 | 36 | const char *name; |
1571 | 36 | PyMethodDef def; |
1572 | 36 | } methods[] = |
1573 | 36 | { |
1574 | 36 | { |
1575 | 36 | "strict", |
1576 | 36 | { |
1577 | 36 | "strict_errors", |
1578 | 36 | strict_errors, |
1579 | 36 | METH_O, |
1580 | 36 | PyDoc_STR("Implements the 'strict' error handling, which " |
1581 | 36 | "raises a UnicodeError on coding errors.") |
1582 | 36 | } |
1583 | 36 | }, |
1584 | 36 | { |
1585 | 36 | "ignore", |
1586 | 36 | { |
1587 | 36 | "ignore_errors", |
1588 | 36 | ignore_errors, |
1589 | 36 | METH_O, |
1590 | 36 | PyDoc_STR("Implements the 'ignore' error handling, which " |
1591 | 36 | "ignores malformed data and continues.") |
1592 | 36 | } |
1593 | 36 | }, |
1594 | 36 | { |
1595 | 36 | "replace", |
1596 | 36 | { |
1597 | 36 | "replace_errors", |
1598 | 36 | replace_errors, |
1599 | 36 | METH_O, |
1600 | 36 | PyDoc_STR("Implements the 'replace' error handling, which " |
1601 | 36 | "replaces malformed data with a replacement marker.") |
1602 | 36 | } |
1603 | 36 | }, |
1604 | 36 | { |
1605 | 36 | "xmlcharrefreplace", |
1606 | 36 | { |
1607 | 36 | "xmlcharrefreplace_errors", |
1608 | 36 | xmlcharrefreplace_errors, |
1609 | 36 | METH_O, |
1610 | 36 | PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, " |
1611 | 36 | "which replaces an unencodable character with the " |
1612 | 36 | "appropriate XML character reference.") |
1613 | 36 | } |
1614 | 36 | }, |
1615 | 36 | { |
1616 | 36 | "backslashreplace", |
1617 | 36 | { |
1618 | 36 | "backslashreplace_errors", |
1619 | 36 | backslashreplace_errors, |
1620 | 36 | METH_O, |
1621 | 36 | PyDoc_STR("Implements the 'backslashreplace' error handling, " |
1622 | 36 | "which replaces malformed data with a backslashed " |
1623 | 36 | "escape sequence.") |
1624 | 36 | } |
1625 | 36 | }, |
1626 | 36 | { |
1627 | 36 | "namereplace", |
1628 | 36 | { |
1629 | 36 | "namereplace_errors", |
1630 | 36 | namereplace_errors, |
1631 | 36 | METH_O, |
1632 | 36 | PyDoc_STR("Implements the 'namereplace' error handling, " |
1633 | 36 | "which replaces an unencodable character with a " |
1634 | 36 | "\\N{...} escape sequence.") |
1635 | 36 | } |
1636 | 36 | }, |
1637 | 36 | { |
1638 | 36 | "surrogatepass", |
1639 | 36 | { |
1640 | 36 | "surrogatepass", |
1641 | 36 | surrogatepass_errors, |
1642 | 36 | METH_O |
1643 | 36 | } |
1644 | 36 | }, |
1645 | 36 | { |
1646 | 36 | "surrogateescape", |
1647 | 36 | { |
1648 | 36 | "surrogateescape", |
1649 | 36 | surrogateescape_errors, |
1650 | 36 | METH_O |
1651 | 36 | } |
1652 | 36 | } |
1653 | 36 | }; |
1654 | | // ensure that the built-in error handlers' names are kept in sync |
1655 | 36 | assert(Py_ARRAY_LENGTH(methods) == Py_ARRAY_LENGTH(codecs_builtin_error_handlers)); |
1656 | | |
1657 | 36 | assert(interp->codecs.initialized == 0); |
1658 | 36 | interp->codecs.search_path = PyList_New(0); |
1659 | 36 | if (interp->codecs.search_path == NULL) { |
1660 | 0 | return PyStatus_NoMemory(); |
1661 | 0 | } |
1662 | 36 | interp->codecs.search_cache = PyDict_New(); |
1663 | 36 | if (interp->codecs.search_cache == NULL) { |
1664 | 0 | return PyStatus_NoMemory(); |
1665 | 0 | } |
1666 | 36 | interp->codecs.error_registry = PyDict_New(); |
1667 | 36 | if (interp->codecs.error_registry == NULL) { |
1668 | 0 | return PyStatus_NoMemory(); |
1669 | 0 | } |
1670 | 324 | for (size_t i = 0; i < Py_ARRAY_LENGTH(methods); ++i) { |
1671 | 288 | PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL); |
1672 | 288 | if (func == NULL) { |
1673 | 0 | return PyStatus_NoMemory(); |
1674 | 0 | } |
1675 | | |
1676 | 288 | int res = PyDict_SetItemString(interp->codecs.error_registry, |
1677 | 288 | methods[i].name, func); |
1678 | 288 | Py_DECREF(func); |
1679 | 288 | if (res < 0) { |
1680 | 0 | return PyStatus_Error("Failed to insert into codec error registry"); |
1681 | 0 | } |
1682 | 288 | } |
1683 | | |
1684 | 36 | interp->codecs.initialized = 1; |
1685 | | |
1686 | | // Importing `encodings' will call back into this module to register codec |
1687 | | // search functions, so this is done after everything else is initialized. |
1688 | 36 | PyObject *mod = PyImport_ImportModule("encodings"); |
1689 | 36 | if (mod == NULL) { |
1690 | 0 | PyThreadState *tstate = _PyThreadState_GET(); |
1691 | 0 | _Py_DumpPathConfig(tstate); |
1692 | 0 | return PyStatus_Error("Failed to import encodings module"); |
1693 | 0 | } |
1694 | 36 | Py_DECREF(mod); |
1695 | | |
1696 | 36 | return PyStatus_Ok(); |
1697 | 36 | } |
1698 | | |
/*
 * Tear down the codec state of 'interp': release the search path, the
 * search cache and the error registry (each pointer is reset by
 * Py_CLEAR), then mark the state as uninitialized.
 */
void
_PyCodec_Fini(PyInterpreterState *interp)
{
    Py_CLEAR(interp->codecs.search_path);
    Py_CLEAR(interp->codecs.search_cache);
    Py_CLEAR(interp->codecs.error_registry);
    interp->codecs.initialized = 0;
}