/src/cpython/Python/codecs.c
Line | Count | Source |
1 | | /* ------------------------------------------------------------------------ |
2 | | |
3 | | Python Codec Registry and support functions |
4 | | |
5 | | Written by Marc-Andre Lemburg (mal@lemburg.com). |
6 | | |
7 | | Copyright (c) Corporation for National Research Initiatives. |
8 | | |
9 | | ------------------------------------------------------------------------ */ |
10 | | |
11 | | #include "Python.h" |
12 | | #include "pycore_call.h" // _PyObject_CallNoArgs() |
13 | | #include "pycore_codecs.h" // export _PyCodec_LookupTextEncoding() |
14 | | #include "pycore_interp.h" // PyInterpreterState.codec_search_path |
15 | | #include "pycore_pyerrors.h" // _PyErr_FormatNote() |
16 | | #include "pycore_pystate.h" // _PyInterpreterState_GET() |
17 | | #include "pycore_runtime.h" // _Py_ID() |
18 | | #include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI |
19 | | #include "pycore_unicodeobject.h" // _PyUnicode_InternMortal() |
20 | | #include "pycore_pyatomic_ft_wrappers.h" |
21 | | |
/* Names of the error handlers that are treated as built in.
   _PyCodec_UnregisterError() refuses to remove any name listed here. */
static const char *codecs_builtin_error_handlers[] = {
    "strict",
    "ignore",
    "replace",
    "xmlcharrefreplace",
    "backslashreplace",
    "namereplace",
    "surrogatepass",
    "surrogateescape",
};

/* Lower-case hexadecimal digits, indexed by nibble value (0..15). */
const char *Py_hexdigits = "0123456789abcdef";
29 | | |
30 | | /* --- Codec Registry ----------------------------------------------------- */ |
31 | | |
32 | | int PyCodec_Register(PyObject *search_function) |
33 | 37 | { |
34 | 37 | PyInterpreterState *interp = _PyInterpreterState_GET(); |
35 | 37 | assert(interp->codecs.initialized); |
36 | 37 | if (search_function == NULL) { |
37 | 0 | PyErr_BadArgument(); |
38 | 0 | goto onError; |
39 | 0 | } |
40 | 37 | if (!PyCallable_Check(search_function)) { |
41 | 0 | PyErr_SetString(PyExc_TypeError, "argument must be callable"); |
42 | 0 | goto onError; |
43 | 0 | } |
44 | 37 | FT_MUTEX_LOCK(&interp->codecs.search_path_mutex); |
45 | 37 | int ret = PyList_Append(interp->codecs.search_path, search_function); |
46 | 37 | FT_MUTEX_UNLOCK(&interp->codecs.search_path_mutex); |
47 | | |
48 | 37 | return ret; |
49 | | |
50 | 0 | onError: |
51 | 0 | return -1; |
52 | 37 | } |
53 | | |
54 | | int |
55 | | PyCodec_Unregister(PyObject *search_function) |
56 | 0 | { |
57 | 0 | PyInterpreterState *interp = _PyInterpreterState_GET(); |
58 | 0 | if (interp->codecs.initialized != 1) { |
59 | | /* Do nothing if codecs state was cleared (only possible during |
60 | | interpreter shutdown). */ |
61 | 0 | return 0; |
62 | 0 | } |
63 | | |
64 | 0 | PyObject *codec_search_path = interp->codecs.search_path; |
65 | 0 | assert(PyList_CheckExact(codec_search_path)); |
66 | 0 | for (Py_ssize_t i = 0; i < PyList_GET_SIZE(codec_search_path); i++) { |
67 | 0 | FT_MUTEX_LOCK(&interp->codecs.search_path_mutex); |
68 | 0 | PyObject *item = PyList_GetItemRef(codec_search_path, i); |
69 | 0 | int ret = 1; |
70 | 0 | if (item == search_function) { |
71 | | // We hold a reference to the item, so its destructor can't run |
72 | | // while we hold search_path_mutex. |
73 | 0 | ret = PyList_SetSlice(codec_search_path, i, i+1, NULL); |
74 | 0 | } |
75 | 0 | FT_MUTEX_UNLOCK(&interp->codecs.search_path_mutex); |
76 | 0 | Py_DECREF(item); |
77 | 0 | if (ret != 1) { |
78 | 0 | assert(interp->codecs.search_cache != NULL); |
79 | 0 | assert(PyDict_CheckExact(interp->codecs.search_cache)); |
80 | 0 | PyDict_Clear(interp->codecs.search_cache); |
81 | 0 | return ret; |
82 | 0 | } |
83 | 0 | } |
84 | 0 | return 0; |
85 | 0 | } |
86 | | |
87 | | /* Convert a string to a normalized Python string: all ASCII letters are |
88 | | converted to lower case, spaces are replaced with hyphens. */ |
89 | | |
90 | | static PyObject* |
91 | | normalizestring(const char *string) |
92 | 2.36M | { |
93 | 2.36M | size_t i; |
94 | 2.36M | size_t len = strlen(string); |
95 | 2.36M | char *p; |
96 | 2.36M | PyObject *v; |
97 | | |
98 | 2.36M | if (len > PY_SSIZE_T_MAX) { |
99 | 0 | PyErr_SetString(PyExc_OverflowError, "string is too large"); |
100 | 0 | return NULL; |
101 | 0 | } |
102 | | |
103 | 2.36M | p = PyMem_Malloc(len + 1); |
104 | 2.36M | if (p == NULL) |
105 | 0 | return PyErr_NoMemory(); |
106 | 33.7M | for (i = 0; i < len; i++) { |
107 | 31.4M | char ch = string[i]; |
108 | 31.4M | if (ch == ' ') |
109 | 237k | ch = '-'; |
110 | 31.1M | else |
111 | 31.1M | ch = Py_TOLOWER(Py_CHARMASK(ch)); |
112 | 31.4M | p[i] = ch; |
113 | 31.4M | } |
114 | 2.36M | p[i] = '\0'; |
115 | 2.36M | v = PyUnicode_FromString(p); |
116 | 2.36M | PyMem_Free(p); |
117 | 2.36M | return v; |
118 | 2.36M | } |
119 | | |
120 | | /* Lookup the given encoding and return a tuple providing the codec |
121 | | facilities. |
122 | | |
123 | | ASCII letters in the encoding string is looked up converted to all |
124 | | lower case. This makes encodings looked up through this mechanism |
125 | | effectively case-insensitive. Spaces are replaced with hyphens for |
126 | | names like "US ASCII" and "ISO 8859-1". |
127 | | |
128 | | If no codec is found, a LookupError is set and NULL returned. |
129 | | |
130 | | As side effect, this tries to load the encodings package, if not |
131 | | yet done. This is part of the lazy load strategy for the encodings |
132 | | package. |
133 | | |
134 | | */ |
135 | | |
136 | | PyObject *_PyCodec_Lookup(const char *encoding) |
137 | 2.36M | { |
138 | 2.36M | if (encoding == NULL) { |
139 | 0 | PyErr_BadArgument(); |
140 | 0 | return NULL; |
141 | 0 | } |
142 | | |
143 | 2.36M | PyInterpreterState *interp = _PyInterpreterState_GET(); |
144 | 2.36M | assert(interp->codecs.initialized); |
145 | | |
146 | | /* Convert the encoding to a normalized Python string: all |
147 | | ASCII letters are converted to lower case, spaces are |
148 | | replaced with hyphens. */ |
149 | 2.36M | PyObject *v = normalizestring(encoding); |
150 | 2.36M | if (v == NULL) { |
151 | 0 | return NULL; |
152 | 0 | } |
153 | | |
154 | | /* Intern the string. We'll make it immortal later if lookup succeeds. */ |
155 | 2.36M | _PyUnicode_InternMortal(interp, &v); |
156 | | |
157 | | /* First, try to lookup the name in the registry dictionary */ |
158 | 2.36M | PyObject *result; |
159 | 2.36M | if (PyDict_GetItemRef(interp->codecs.search_cache, v, &result) < 0) { |
160 | 0 | goto onError; |
161 | 0 | } |
162 | 2.36M | if (result != NULL) { |
163 | 2.28M | Py_DECREF(v); |
164 | 2.28M | return result; |
165 | 2.28M | } |
166 | | |
167 | | /* Next, scan the search functions in order of registration */ |
168 | 77.8k | const Py_ssize_t len = PyList_Size(interp->codecs.search_path); |
169 | 77.8k | if (len < 0) |
170 | 0 | goto onError; |
171 | 77.8k | if (len == 0) { |
172 | 0 | PyErr_SetString(PyExc_LookupError, |
173 | 0 | "no codec search functions registered: " |
174 | 0 | "can't find encoding"); |
175 | 0 | goto onError; |
176 | 0 | } |
177 | | |
178 | 77.8k | Py_ssize_t i; |
179 | 153k | for (i = 0; i < len; i++) { |
180 | 77.8k | PyObject *func; |
181 | | |
182 | 77.8k | func = PyList_GetItemRef(interp->codecs.search_path, i); |
183 | 77.8k | if (func == NULL) |
184 | 0 | goto onError; |
185 | 77.8k | result = PyObject_CallOneArg(func, v); |
186 | 77.8k | Py_DECREF(func); |
187 | 77.8k | if (result == NULL) |
188 | 0 | goto onError; |
189 | 77.8k | if (result == Py_None) { |
190 | 75.7k | Py_CLEAR(result); |
191 | 75.7k | continue; |
192 | 75.7k | } |
193 | 2.08k | if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) { |
194 | 0 | PyErr_SetString(PyExc_TypeError, |
195 | 0 | "codec search functions must return 4-tuples"); |
196 | 0 | Py_DECREF(result); |
197 | 0 | goto onError; |
198 | 0 | } |
199 | 2.08k | break; |
200 | 2.08k | } |
201 | 77.8k | if (result == NULL) { |
202 | | /* XXX Perhaps we should cache misses too ? */ |
203 | 75.7k | PyErr_Format(PyExc_LookupError, |
204 | 75.7k | "unknown encoding: %s", encoding); |
205 | 75.7k | goto onError; |
206 | 75.7k | } |
207 | | |
208 | 2.08k | _PyUnicode_InternImmortal(interp, &v); |
209 | | |
210 | | /* Cache and return the result */ |
211 | 2.08k | if (PyDict_SetItem(interp->codecs.search_cache, v, result) < 0) { |
212 | 0 | Py_DECREF(result); |
213 | 0 | goto onError; |
214 | 0 | } |
215 | 2.08k | Py_DECREF(v); |
216 | 2.08k | return result; |
217 | | |
218 | 75.7k | onError: |
219 | 75.7k | Py_DECREF(v); |
220 | 75.7k | return NULL; |
221 | 2.08k | } |
222 | | |
223 | | /* Codec registry encoding check API. */ |
224 | | |
225 | | int PyCodec_KnownEncoding(const char *encoding) |
226 | 0 | { |
227 | 0 | PyObject *codecs; |
228 | |
|
229 | 0 | codecs = _PyCodec_Lookup(encoding); |
230 | 0 | if (!codecs) { |
231 | 0 | PyErr_Clear(); |
232 | 0 | return 0; |
233 | 0 | } |
234 | 0 | else { |
235 | 0 | Py_DECREF(codecs); |
236 | 0 | return 1; |
237 | 0 | } |
238 | 0 | } |
239 | | |
240 | | static |
241 | | PyObject *args_tuple(PyObject *object, |
242 | | const char *errors) |
243 | 2.01M | { |
244 | 2.01M | PyObject *args; |
245 | | |
246 | 2.01M | args = PyTuple_New(1 + (errors != NULL)); |
247 | 2.01M | if (args == NULL) |
248 | 0 | return NULL; |
249 | 2.01M | PyTuple_SET_ITEM(args, 0, Py_NewRef(object)); |
250 | 2.01M | if (errors) { |
251 | 191k | PyObject *v; |
252 | | |
253 | 191k | v = PyUnicode_FromString(errors); |
254 | 191k | if (v == NULL) { |
255 | 0 | Py_DECREF(args); |
256 | 0 | return NULL; |
257 | 0 | } |
258 | 191k | PyTuple_SET_ITEM(args, 1, v); |
259 | 191k | } |
260 | 2.01M | return args; |
261 | 2.01M | } |
262 | | |
263 | | /* Helper function to get a codec item */ |
264 | | |
265 | | static |
266 | | PyObject *codec_getitem(const char *encoding, int index) |
267 | 0 | { |
268 | 0 | PyObject *codecs; |
269 | 0 | PyObject *v; |
270 | |
|
271 | 0 | codecs = _PyCodec_Lookup(encoding); |
272 | 0 | if (codecs == NULL) |
273 | 0 | return NULL; |
274 | 0 | v = PyTuple_GET_ITEM(codecs, index); |
275 | 0 | Py_DECREF(codecs); |
276 | 0 | return Py_NewRef(v); |
277 | 0 | } |
278 | | |
279 | | /* Helper functions to create an incremental codec. */ |
280 | | static |
281 | | PyObject *codec_makeincrementalcodec(PyObject *codec_info, |
282 | | const char *errors, |
283 | | const char *attrname) |
284 | 130 | { |
285 | 130 | PyObject *ret, *inccodec; |
286 | | |
287 | 130 | inccodec = PyObject_GetAttrString(codec_info, attrname); |
288 | 130 | if (inccodec == NULL) |
289 | 0 | return NULL; |
290 | 130 | if (errors) |
291 | 130 | ret = PyObject_CallFunction(inccodec, "s", errors); |
292 | 0 | else |
293 | 0 | ret = _PyObject_CallNoArgs(inccodec); |
294 | 130 | Py_DECREF(inccodec); |
295 | 130 | return ret; |
296 | 130 | } |
297 | | |
298 | | static |
299 | | PyObject *codec_getincrementalcodec(const char *encoding, |
300 | | const char *errors, |
301 | | const char *attrname) |
302 | 0 | { |
303 | 0 | PyObject *codec_info, *ret; |
304 | |
|
305 | 0 | codec_info = _PyCodec_Lookup(encoding); |
306 | 0 | if (codec_info == NULL) |
307 | 0 | return NULL; |
308 | 0 | ret = codec_makeincrementalcodec(codec_info, errors, attrname); |
309 | 0 | Py_DECREF(codec_info); |
310 | 0 | return ret; |
311 | 0 | } |
312 | | |
313 | | /* Helper function to create a stream codec. */ |
314 | | |
315 | | static |
316 | | PyObject *codec_getstreamcodec(const char *encoding, |
317 | | PyObject *stream, |
318 | | const char *errors, |
319 | | const int index) |
320 | 0 | { |
321 | 0 | PyObject *codecs, *streamcodec, *codeccls; |
322 | |
|
323 | 0 | codecs = _PyCodec_Lookup(encoding); |
324 | 0 | if (codecs == NULL) |
325 | 0 | return NULL; |
326 | | |
327 | 0 | codeccls = PyTuple_GET_ITEM(codecs, index); |
328 | 0 | if (errors != NULL) |
329 | 0 | streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors); |
330 | 0 | else |
331 | 0 | streamcodec = PyObject_CallOneArg(codeccls, stream); |
332 | 0 | Py_DECREF(codecs); |
333 | 0 | return streamcodec; |
334 | 0 | } |
335 | | |
336 | | /* Helpers to work with the result of _PyCodec_Lookup |
337 | | |
338 | | */ |
339 | | PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info, |
340 | | const char *errors) |
341 | 55 | { |
342 | 55 | return codec_makeincrementalcodec(codec_info, errors, |
343 | 55 | "incrementaldecoder"); |
344 | 55 | } |
345 | | |
346 | | PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info, |
347 | | const char *errors) |
348 | 75 | { |
349 | 75 | return codec_makeincrementalcodec(codec_info, errors, |
350 | 75 | "incrementalencoder"); |
351 | 75 | } |
352 | | |
353 | | |
354 | | /* Convenience APIs to query the Codec registry. |
355 | | |
356 | | All APIs return a codec object with incremented refcount. |
357 | | |
358 | | */ |
359 | | |
360 | | PyObject *PyCodec_Encoder(const char *encoding) |
361 | 0 | { |
362 | 0 | return codec_getitem(encoding, 0); |
363 | 0 | } |
364 | | |
365 | | PyObject *PyCodec_Decoder(const char *encoding) |
366 | 0 | { |
367 | 0 | return codec_getitem(encoding, 1); |
368 | 0 | } |
369 | | |
370 | | PyObject *PyCodec_IncrementalEncoder(const char *encoding, |
371 | | const char *errors) |
372 | 0 | { |
373 | 0 | return codec_getincrementalcodec(encoding, errors, "incrementalencoder"); |
374 | 0 | } |
375 | | |
376 | | PyObject *PyCodec_IncrementalDecoder(const char *encoding, |
377 | | const char *errors) |
378 | 0 | { |
379 | 0 | return codec_getincrementalcodec(encoding, errors, "incrementaldecoder"); |
380 | 0 | } |
381 | | |
382 | | PyObject *PyCodec_StreamReader(const char *encoding, |
383 | | PyObject *stream, |
384 | | const char *errors) |
385 | 0 | { |
386 | 0 | return codec_getstreamcodec(encoding, stream, errors, 2); |
387 | 0 | } |
388 | | |
389 | | PyObject *PyCodec_StreamWriter(const char *encoding, |
390 | | PyObject *stream, |
391 | | const char *errors) |
392 | 0 | { |
393 | 0 | return codec_getstreamcodec(encoding, stream, errors, 3); |
394 | 0 | } |
395 | | |
396 | | /* Encode an object (e.g. a Unicode object) using the given encoding |
397 | | and return the resulting encoded object (usually a Python string). |
398 | | |
399 | | errors is passed to the encoder factory as argument if non-NULL. */ |
400 | | |
401 | | static PyObject * |
402 | | _PyCodec_EncodeInternal(PyObject *object, |
403 | | PyObject *encoder, |
404 | | const char *encoding, |
405 | | const char *errors) |
406 | 855k | { |
407 | 855k | PyObject *args = NULL, *result = NULL; |
408 | 855k | PyObject *v = NULL; |
409 | | |
410 | 855k | args = args_tuple(object, errors); |
411 | 855k | if (args == NULL) |
412 | 0 | goto onError; |
413 | | |
414 | 855k | result = PyObject_Call(encoder, args, NULL); |
415 | 855k | if (result == NULL) { |
416 | 0 | _PyErr_FormatNote("%s with '%s' codec failed", "encoding", encoding); |
417 | 0 | goto onError; |
418 | 0 | } |
419 | | |
420 | 855k | if (!PyTuple_Check(result) || |
421 | 855k | PyTuple_GET_SIZE(result) != 2) { |
422 | 0 | PyErr_SetString(PyExc_TypeError, |
423 | 0 | "encoder must return a tuple (object, integer)"); |
424 | 0 | goto onError; |
425 | 0 | } |
426 | 855k | v = Py_NewRef(PyTuple_GET_ITEM(result,0)); |
427 | | /* We don't check or use the second (integer) entry. */ |
428 | | |
429 | 855k | Py_DECREF(args); |
430 | 855k | Py_DECREF(encoder); |
431 | 855k | Py_DECREF(result); |
432 | 855k | return v; |
433 | | |
434 | 0 | onError: |
435 | 0 | Py_XDECREF(result); |
436 | 0 | Py_XDECREF(args); |
437 | 0 | Py_XDECREF(encoder); |
438 | 0 | return NULL; |
439 | 855k | } |
440 | | |
441 | | /* Decode an object (usually a Python string) using the given encoding |
442 | | and return an equivalent object (e.g. a Unicode object). |
443 | | |
444 | | errors is passed to the decoder factory as argument if non-NULL. */ |
445 | | |
446 | | static PyObject * |
447 | | _PyCodec_DecodeInternal(PyObject *object, |
448 | | PyObject *decoder, |
449 | | const char *encoding, |
450 | | const char *errors) |
451 | 1.15M | { |
452 | 1.15M | PyObject *args = NULL, *result = NULL; |
453 | 1.15M | PyObject *v; |
454 | | |
455 | 1.15M | args = args_tuple(object, errors); |
456 | 1.15M | if (args == NULL) |
457 | 0 | goto onError; |
458 | | |
459 | 1.15M | result = PyObject_Call(decoder, args, NULL); |
460 | 1.15M | if (result == NULL) { |
461 | 79.5k | _PyErr_FormatNote("%s with '%s' codec failed", "decoding", encoding); |
462 | 79.5k | goto onError; |
463 | 79.5k | } |
464 | 1.07M | if (!PyTuple_Check(result) || |
465 | 1.07M | PyTuple_GET_SIZE(result) != 2) { |
466 | 0 | PyErr_SetString(PyExc_TypeError, |
467 | 0 | "decoder must return a tuple (object,integer)"); |
468 | 0 | goto onError; |
469 | 0 | } |
470 | 1.07M | v = Py_NewRef(PyTuple_GET_ITEM(result,0)); |
471 | | /* We don't check or use the second (integer) entry. */ |
472 | | |
473 | 1.07M | Py_DECREF(args); |
474 | 1.07M | Py_DECREF(decoder); |
475 | 1.07M | Py_DECREF(result); |
476 | 1.07M | return v; |
477 | | |
478 | 79.5k | onError: |
479 | 79.5k | Py_XDECREF(args); |
480 | 79.5k | Py_XDECREF(decoder); |
481 | 79.5k | Py_XDECREF(result); |
482 | 79.5k | return NULL; |
483 | 1.07M | } |
484 | | |
485 | | /* Generic encoding/decoding API */ |
486 | | PyObject *PyCodec_Encode(PyObject *object, |
487 | | const char *encoding, |
488 | | const char *errors) |
489 | 0 | { |
490 | 0 | PyObject *encoder; |
491 | |
|
492 | 0 | encoder = PyCodec_Encoder(encoding); |
493 | 0 | if (encoder == NULL) |
494 | 0 | return NULL; |
495 | | |
496 | 0 | return _PyCodec_EncodeInternal(object, encoder, encoding, errors); |
497 | 0 | } |
498 | | |
499 | | PyObject *PyCodec_Decode(PyObject *object, |
500 | | const char *encoding, |
501 | | const char *errors) |
502 | 0 | { |
503 | 0 | PyObject *decoder; |
504 | |
|
505 | 0 | decoder = PyCodec_Decoder(encoding); |
506 | 0 | if (decoder == NULL) |
507 | 0 | return NULL; |
508 | | |
509 | 0 | return _PyCodec_DecodeInternal(object, decoder, encoding, errors); |
510 | 0 | } |
511 | | |
512 | | /* Text encoding/decoding API */ |
513 | | PyObject * _PyCodec_LookupTextEncoding(const char *encoding, |
514 | | const char *alternate_command) |
515 | 2.02M | { |
516 | 2.02M | PyObject *codec; |
517 | 2.02M | PyObject *attr; |
518 | 2.02M | int is_text_codec; |
519 | | |
520 | 2.02M | codec = _PyCodec_Lookup(encoding); |
521 | 2.02M | if (codec == NULL) |
522 | 11.3k | return NULL; |
523 | | |
524 | | /* Backwards compatibility: assume any raw tuple describes a text |
525 | | * encoding, and the same for anything lacking the private |
526 | | * attribute. |
527 | | */ |
528 | 2.01M | if (!PyTuple_CheckExact(codec)) { |
529 | 2.01M | if (PyObject_GetOptionalAttr(codec, &_Py_ID(_is_text_encoding), &attr) < 0) { |
530 | 0 | Py_DECREF(codec); |
531 | 0 | return NULL; |
532 | 0 | } |
533 | 2.01M | if (attr != NULL) { |
534 | 2.01M | is_text_codec = PyObject_IsTrue(attr); |
535 | 2.01M | Py_DECREF(attr); |
536 | 2.01M | if (is_text_codec <= 0) { |
537 | 3.10k | Py_DECREF(codec); |
538 | 3.10k | if (!is_text_codec) { |
539 | 3.10k | if (alternate_command != NULL) { |
540 | 3.10k | PyErr_Format(PyExc_LookupError, |
541 | 3.10k | "'%.400s' is not a text encoding; " |
542 | 3.10k | "use %s to handle arbitrary codecs", |
543 | 3.10k | encoding, alternate_command); |
544 | 3.10k | } |
545 | 0 | else { |
546 | 0 | PyErr_Format(PyExc_LookupError, |
547 | 0 | "'%.400s' is not a text encoding", |
548 | 0 | encoding); |
549 | 0 | } |
550 | 3.10k | } |
551 | 3.10k | return NULL; |
552 | 3.10k | } |
553 | 2.01M | } |
554 | 2.01M | } |
555 | | |
556 | | /* This appears to be a valid text encoding */ |
557 | 2.01M | return codec; |
558 | 2.01M | } |
559 | | |
560 | | |
561 | | static |
562 | | PyObject *codec_getitem_checked(const char *encoding, |
563 | | const char *alternate_command, |
564 | | int index) |
565 | 2.02M | { |
566 | 2.02M | PyObject *codec; |
567 | 2.02M | PyObject *v; |
568 | | |
569 | 2.02M | codec = _PyCodec_LookupTextEncoding(encoding, alternate_command); |
570 | 2.02M | if (codec == NULL) |
571 | 14.4k | return NULL; |
572 | | |
573 | 2.01M | v = Py_NewRef(PyTuple_GET_ITEM(codec, index)); |
574 | 2.01M | Py_DECREF(codec); |
575 | 2.01M | return v; |
576 | 2.02M | } |
577 | | |
578 | | static PyObject * _PyCodec_TextEncoder(const char *encoding) |
579 | 855k | { |
580 | 855k | return codec_getitem_checked(encoding, "codecs.encode()", 0); |
581 | 855k | } |
582 | | |
583 | | static PyObject * _PyCodec_TextDecoder(const char *encoding) |
584 | 1.17M | { |
585 | 1.17M | return codec_getitem_checked(encoding, "codecs.decode()", 1); |
586 | 1.17M | } |
587 | | |
588 | | PyObject *_PyCodec_EncodeText(PyObject *object, |
589 | | const char *encoding, |
590 | | const char *errors) |
591 | 855k | { |
592 | 855k | PyObject *encoder; |
593 | | |
594 | 855k | encoder = _PyCodec_TextEncoder(encoding); |
595 | 855k | if (encoder == NULL) |
596 | 0 | return NULL; |
597 | | |
598 | 855k | return _PyCodec_EncodeInternal(object, encoder, encoding, errors); |
599 | 855k | } |
600 | | |
601 | | PyObject *_PyCodec_DecodeText(PyObject *object, |
602 | | const char *encoding, |
603 | | const char *errors) |
604 | 1.17M | { |
605 | 1.17M | PyObject *decoder; |
606 | | |
607 | 1.17M | decoder = _PyCodec_TextDecoder(encoding); |
608 | 1.17M | if (decoder == NULL) |
609 | 14.4k | return NULL; |
610 | | |
611 | 1.15M | return _PyCodec_DecodeInternal(object, decoder, encoding, errors); |
612 | 1.17M | } |
613 | | |
614 | | /* Register the error handling callback function error under the name |
615 | | name. This function will be called by the codec when it encounters |
616 | | an unencodable characters/undecodable bytes and doesn't know the |
617 | | callback name, when name is specified as the error parameter |
618 | | in the call to the encode/decode function. |
619 | | Return 0 on success, -1 on error */ |
620 | | int PyCodec_RegisterError(const char *name, PyObject *error) |
621 | 0 | { |
622 | 0 | PyInterpreterState *interp = _PyInterpreterState_GET(); |
623 | 0 | assert(interp->codecs.initialized); |
624 | 0 | if (!PyCallable_Check(error)) { |
625 | 0 | PyErr_SetString(PyExc_TypeError, "handler must be callable"); |
626 | 0 | return -1; |
627 | 0 | } |
628 | 0 | return PyDict_SetItemString(interp->codecs.error_registry, |
629 | 0 | name, error); |
630 | 0 | } |
631 | | |
632 | | int _PyCodec_UnregisterError(const char *name) |
633 | 0 | { |
634 | 0 | for (size_t i = 0; i < Py_ARRAY_LENGTH(codecs_builtin_error_handlers); ++i) { |
635 | 0 | if (strcmp(name, codecs_builtin_error_handlers[i]) == 0) { |
636 | 0 | PyErr_Format(PyExc_ValueError, |
637 | 0 | "cannot un-register built-in error handler '%s'", name); |
638 | 0 | return -1; |
639 | 0 | } |
640 | 0 | } |
641 | 0 | PyInterpreterState *interp = _PyInterpreterState_GET(); |
642 | 0 | assert(interp->codecs.initialized); |
643 | 0 | return PyDict_PopString(interp->codecs.error_registry, name, NULL); |
644 | 0 | } |
645 | | |
646 | | /* Lookup the error handling callback function registered under the |
647 | | name error. As a special case NULL can be passed, in which case |
648 | | the error handling callback for strict encoding will be returned. */ |
649 | | PyObject *PyCodec_LookupError(const char *name) |
650 | 2.53M | { |
651 | 2.53M | PyInterpreterState *interp = _PyInterpreterState_GET(); |
652 | 2.53M | assert(interp->codecs.initialized); |
653 | | |
654 | 2.53M | if (name==NULL) |
655 | 182k | name = "strict"; |
656 | 2.53M | PyObject *handler; |
657 | 2.53M | if (PyDict_GetItemStringRef(interp->codecs.error_registry, name, &handler) < 0) { |
658 | 0 | return NULL; |
659 | 0 | } |
660 | 2.53M | if (handler == NULL) { |
661 | 0 | PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name); |
662 | 0 | return NULL; |
663 | 0 | } |
664 | 2.53M | return handler; |
665 | 2.53M | } |
666 | | |
667 | | |
668 | | static inline void |
669 | | wrong_exception_type(PyObject *exc) |
670 | 0 | { |
671 | 0 | PyErr_Format(PyExc_TypeError, |
672 | 0 | "don't know how to handle %T in error callback", exc); |
673 | 0 | } |
674 | | |
675 | | |
/* Type checks used by the error handlers to dispatch on the kind of
   Unicode error they were given. */
#define _PyIsUnicodeEncodeError(EXC)                        \
    PyObject_TypeCheck(                                     \
        EXC, (PyTypeObject *)PyExc_UnicodeEncodeError)
#define _PyIsUnicodeDecodeError(EXC)                        \
    PyObject_TypeCheck(                                     \
        EXC, (PyTypeObject *)PyExc_UnicodeDecodeError)
#define _PyIsUnicodeTranslateError(EXC)                     \
    PyObject_TypeCheck(                                     \
        EXC, (PyTypeObject *)PyExc_UnicodeTranslateError)
682 | | |
683 | | |
684 | | // --- codecs handlers: utilities --------------------------------------------- |
685 | | |
686 | | /* |
687 | | * Return the number of characters (including special prefixes) |
688 | | * needed to represent 'ch' by codec_handler_write_unicode_hex(). |
689 | | */ |
690 | | static inline Py_ssize_t |
691 | | codec_handler_unicode_hex_width(Py_UCS4 ch) |
692 | 0 | { |
693 | 0 | if (ch >= 0x10000) { |
694 | | // format: '\\' + 'U' + 8 hex digits |
695 | 0 | return 1 + 1 + 8; |
696 | 0 | } |
697 | 0 | else if (ch >= 0x100) { |
698 | | // format: '\\' + 'u' + 4 hex digits |
699 | 0 | return 1 + 1 + 4; |
700 | 0 | } |
701 | 0 | else { |
702 | | // format: '\\' + 'x' + 2 hex digits |
703 | 0 | return 1 + 1 + 2; |
704 | 0 | } |
705 | 0 | } |
706 | | |
707 | | |
708 | | /* |
709 | | * Write the hexadecimal representation of 'ch' to the buffer pointed by 'p' |
710 | | * using 2, 4, or 8 characters prefixed by '\x', '\u', or '\U' respectively. |
711 | | */ |
712 | | static inline void |
713 | | codec_handler_write_unicode_hex(Py_UCS1 **p, Py_UCS4 ch) |
714 | 0 | { |
715 | 0 | *(*p)++ = '\\'; |
716 | 0 | if (ch >= 0x10000) { |
717 | 0 | *(*p)++ = 'U'; |
718 | 0 | *(*p)++ = Py_hexdigits[(ch >> 28) & 0xf]; |
719 | 0 | *(*p)++ = Py_hexdigits[(ch >> 24) & 0xf]; |
720 | 0 | *(*p)++ = Py_hexdigits[(ch >> 20) & 0xf]; |
721 | 0 | *(*p)++ = Py_hexdigits[(ch >> 16) & 0xf]; |
722 | 0 | *(*p)++ = Py_hexdigits[(ch >> 12) & 0xf]; |
723 | 0 | *(*p)++ = Py_hexdigits[(ch >> 8) & 0xf]; |
724 | 0 | } |
725 | 0 | else if (ch >= 0x100) { |
726 | 0 | *(*p)++ = 'u'; |
727 | 0 | *(*p)++ = Py_hexdigits[(ch >> 12) & 0xf]; |
728 | 0 | *(*p)++ = Py_hexdigits[(ch >> 8) & 0xf]; |
729 | 0 | } |
730 | 0 | else { |
731 | 0 | *(*p)++ = 'x'; |
732 | 0 | } |
733 | 0 | *(*p)++ = Py_hexdigits[(ch >> 4) & 0xf]; |
734 | 0 | *(*p)++ = Py_hexdigits[ch & 0xf]; |
735 | 0 | } |
736 | | |
737 | | |
738 | | /* |
739 | | * Determine the number of digits for a decimal representation of Unicode |
740 | | * codepoint 'ch' (by design, Unicode codepoints are limited to 7 digits). |
741 | | */ |
742 | | static inline int |
743 | | n_decimal_digits_for_codepoint(Py_UCS4 ch) |
744 | 0 | { |
745 | 0 | if (ch < 10) return 1; |
746 | 0 | if (ch < 100) return 2; |
747 | 0 | if (ch < 1000) return 3; |
748 | 0 | if (ch < 10000) return 4; |
749 | 0 | if (ch < 100000) return 5; |
750 | 0 | if (ch < 1000000) return 6; |
751 | 0 | if (ch < 10000000) return 7; |
752 | | // Unicode codepoints are limited to 1114111 (7 decimal digits) |
753 | 0 | Py_UNREACHABLE(); |
754 | 0 | } |
755 | | |
756 | | |
757 | | /* |
758 | | * Create a Unicode string containing 'count' copies of the official |
759 | | * Unicode REPLACEMENT CHARACTER (0xFFFD). |
760 | | */ |
761 | | static PyObject * |
762 | | codec_handler_unicode_replacement_character(Py_ssize_t count) |
763 | 228k | { |
764 | 228k | PyObject *res = PyUnicode_New(count, Py_UNICODE_REPLACEMENT_CHARACTER); |
765 | 228k | if (res == NULL) { |
766 | 0 | return NULL; |
767 | 0 | } |
768 | 228k | assert(count == 0 || PyUnicode_KIND(res) == PyUnicode_2BYTE_KIND); |
769 | 228k | Py_UCS2 *outp = PyUnicode_2BYTE_DATA(res); |
770 | 456k | for (Py_ssize_t i = 0; i < count; ++i) { |
771 | 228k | outp[i] = Py_UNICODE_REPLACEMENT_CHARACTER; |
772 | 228k | } |
773 | 228k | assert(_PyUnicode_CheckConsistency(res, 1)); |
774 | 228k | return res; |
775 | 228k | } |
776 | | |
777 | | |
778 | | // --- handler: 'strict' ------------------------------------------------------ |
779 | | |
780 | | PyObject *PyCodec_StrictErrors(PyObject *exc) |
781 | 3.02M | { |
782 | 3.02M | if (PyExceptionInstance_Check(exc)) { |
783 | 3.02M | PyErr_SetObject(PyExceptionInstance_Class(exc), exc); |
784 | 3.02M | } |
785 | 0 | else { |
786 | 0 | PyErr_SetString(PyExc_TypeError, "codec must pass exception instance"); |
787 | 0 | } |
788 | 3.02M | return NULL; |
789 | 3.02M | } |
790 | | |
791 | | |
792 | | // --- handler: 'ignore' ------------------------------------------------------ |
793 | | |
794 | | static PyObject * |
795 | | _PyCodec_IgnoreError(PyObject *exc, int as_bytes) |
796 | 0 | { |
797 | 0 | Py_ssize_t end; |
798 | 0 | if (_PyUnicodeError_GetParams(exc, NULL, NULL, NULL, |
799 | 0 | &end, NULL, as_bytes) < 0) |
800 | 0 | { |
801 | 0 | return NULL; |
802 | 0 | } |
803 | 0 | return Py_BuildValue("(Nn)", Py_GetConstant(Py_CONSTANT_EMPTY_STR), end); |
804 | 0 | } |
805 | | |
806 | | |
807 | | PyObject *PyCodec_IgnoreErrors(PyObject *exc) |
808 | 0 | { |
809 | 0 | if (_PyIsUnicodeEncodeError(exc) || _PyIsUnicodeTranslateError(exc)) { |
810 | 0 | return _PyCodec_IgnoreError(exc, false); |
811 | 0 | } |
812 | 0 | else if (_PyIsUnicodeDecodeError(exc)) { |
813 | 0 | return _PyCodec_IgnoreError(exc, true); |
814 | 0 | } |
815 | 0 | else { |
816 | 0 | wrong_exception_type(exc); |
817 | 0 | return NULL; |
818 | 0 | } |
819 | 0 | } |
820 | | |
821 | | |
822 | | // --- handler: 'replace' ----------------------------------------------------- |
823 | | |
824 | | static PyObject * |
825 | | _PyCodec_ReplaceUnicodeEncodeError(PyObject *exc) |
826 | 0 | { |
827 | 0 | Py_ssize_t start, end, slen; |
828 | 0 | if (_PyUnicodeError_GetParams(exc, NULL, NULL, |
829 | 0 | &start, &end, &slen, false) < 0) |
830 | 0 | { |
831 | 0 | return NULL; |
832 | 0 | } |
833 | 0 | PyObject *res = PyUnicode_New(slen, '?'); |
834 | 0 | if (res == NULL) { |
835 | 0 | return NULL; |
836 | 0 | } |
837 | 0 | assert(PyUnicode_KIND(res) == PyUnicode_1BYTE_KIND); |
838 | 0 | Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res); |
839 | 0 | memset(outp, '?', sizeof(Py_UCS1) * slen); |
840 | 0 | assert(_PyUnicode_CheckConsistency(res, 1)); |
841 | 0 | return Py_BuildValue("(Nn)", res, end); |
842 | 0 | } |
843 | | |
844 | | |
845 | | static PyObject * |
846 | | _PyCodec_ReplaceUnicodeDecodeError(PyObject *exc) |
847 | 228k | { |
848 | 228k | Py_ssize_t end; |
849 | 228k | if (PyUnicodeDecodeError_GetEnd(exc, &end) < 0) { |
850 | 0 | return NULL; |
851 | 0 | } |
852 | 228k | PyObject *res = codec_handler_unicode_replacement_character(1); |
853 | 228k | if (res == NULL) { |
854 | 0 | return NULL; |
855 | 0 | } |
856 | 228k | return Py_BuildValue("(Nn)", res, end); |
857 | 228k | } |
858 | | |
859 | | |
860 | | static PyObject * |
861 | | _PyCodec_ReplaceUnicodeTranslateError(PyObject *exc) |
862 | 0 | { |
863 | 0 | Py_ssize_t start, end, slen; |
864 | 0 | if (_PyUnicodeError_GetParams(exc, NULL, NULL, |
865 | 0 | &start, &end, &slen, false) < 0) |
866 | 0 | { |
867 | 0 | return NULL; |
868 | 0 | } |
869 | 0 | PyObject *res = codec_handler_unicode_replacement_character(slen); |
870 | 0 | if (res == NULL) { |
871 | 0 | return NULL; |
872 | 0 | } |
873 | 0 | return Py_BuildValue("(Nn)", res, end); |
874 | 0 | } |
875 | | |
876 | | |
877 | | PyObject *PyCodec_ReplaceErrors(PyObject *exc) |
878 | 228k | { |
879 | 228k | if (_PyIsUnicodeEncodeError(exc)) { |
880 | 0 | return _PyCodec_ReplaceUnicodeEncodeError(exc); |
881 | 0 | } |
882 | 228k | else if (_PyIsUnicodeDecodeError(exc)) { |
883 | 228k | return _PyCodec_ReplaceUnicodeDecodeError(exc); |
884 | 228k | } |
885 | 0 | else if (_PyIsUnicodeTranslateError(exc)) { |
886 | 0 | return _PyCodec_ReplaceUnicodeTranslateError(exc); |
887 | 0 | } |
888 | 0 | else { |
889 | 0 | wrong_exception_type(exc); |
890 | 0 | return NULL; |
891 | 0 | } |
892 | 228k | } |
893 | | |
894 | | |
895 | | // --- handler: 'xmlcharrefreplace' ------------------------------------------- |
896 | | |
897 | | PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc) |
898 | 0 | { |
899 | 0 | if (!_PyIsUnicodeEncodeError(exc)) { |
900 | 0 | wrong_exception_type(exc); |
901 | 0 | return NULL; |
902 | 0 | } |
903 | | |
904 | 0 | PyObject *obj; |
905 | 0 | Py_ssize_t objlen, start, end, slen; |
906 | 0 | if (_PyUnicodeError_GetParams(exc, |
907 | 0 | &obj, &objlen, |
908 | 0 | &start, &end, &slen, false) < 0) |
909 | 0 | { |
910 | 0 | return NULL; |
911 | 0 | } |
912 | | |
913 | | // The number of characters that each character 'ch' contributes |
914 | | // in the result is 2 + k + 1, where k = min{t >= 1 | 10^t > ch} |
915 | | // and will be formatted as "&#" + DIGITS + ";". Since the Unicode |
916 | | // range is below 10^7, each "block" requires at most 2 + 7 + 1 |
917 | | // characters. |
918 | 0 | if (slen > PY_SSIZE_T_MAX / (2 + 7 + 1)) { |
919 | 0 | end = start + PY_SSIZE_T_MAX / (2 + 7 + 1); |
920 | 0 | end = Py_MIN(end, objlen); |
921 | 0 | slen = Py_MAX(0, end - start); |
922 | 0 | } |
923 | |
|
924 | 0 | Py_ssize_t ressize = 0; |
925 | 0 | for (Py_ssize_t i = start; i < end; ++i) { |
926 | 0 | Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i); |
927 | 0 | int k = n_decimal_digits_for_codepoint(ch); |
928 | 0 | assert(k != 0); |
929 | 0 | assert(k <= 7); |
930 | 0 | ressize += 2 + k + 1; |
931 | 0 | } |
932 | | |
933 | | /* allocate replacement */ |
934 | 0 | PyObject *res = PyUnicode_New(ressize, 127); |
935 | 0 | if (res == NULL) { |
936 | 0 | Py_DECREF(obj); |
937 | 0 | return NULL; |
938 | 0 | } |
939 | 0 | Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res); |
940 | | /* generate replacement */ |
941 | 0 | for (Py_ssize_t i = start; i < end; ++i) { |
942 | 0 | Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i); |
943 | | /* |
944 | | * Write the decimal representation of 'ch' to the buffer pointed by 'p' |
945 | | * using at most 7 characters prefixed by '&#' and suffixed by ';'. |
946 | | */ |
947 | 0 | *outp++ = '&'; |
948 | 0 | *outp++ = '#'; |
949 | 0 | Py_UCS1 *digit_end = outp + n_decimal_digits_for_codepoint(ch); |
950 | 0 | for (Py_UCS1 *p_digit = digit_end - 1; p_digit >= outp; --p_digit) { |
951 | 0 | *p_digit = '0' + (ch % 10); |
952 | 0 | ch /= 10; |
953 | 0 | } |
954 | 0 | assert(ch == 0); |
955 | 0 | outp = digit_end; |
956 | 0 | *outp++ = ';'; |
957 | 0 | } |
958 | 0 | assert(_PyUnicode_CheckConsistency(res, 1)); |
959 | 0 | PyObject *restuple = Py_BuildValue("(Nn)", res, end); |
960 | 0 | Py_DECREF(obj); |
961 | 0 | return restuple; |
962 | 0 | } |
963 | | |
964 | | |
965 | | // --- handler: 'backslashreplace' -------------------------------------------- |
966 | | |
967 | | static PyObject * |
968 | | _PyCodec_BackslashReplaceUnicodeEncodeError(PyObject *exc) |
969 | 0 | { |
970 | 0 | PyObject *obj; |
971 | 0 | Py_ssize_t objlen, start, end, slen; |
972 | 0 | if (_PyUnicodeError_GetParams(exc, |
973 | 0 | &obj, &objlen, |
974 | 0 | &start, &end, &slen, false) < 0) |
975 | 0 | { |
976 | 0 | return NULL; |
977 | 0 | } |
978 | | |
979 | | // The number of characters that each character 'ch' contributes |
980 | | // in the result is 1 + 1 + k, where k >= min{t >= 1 | 16^t > ch} |
981 | | // and will be formatted as "\\" + ('U'|'u'|'x') + HEXDIGITS, |
982 | | // where the number of hexdigits is either 2, 4, or 8 (not 6). |
983 | | // Since the Unicode range is below 10^7, we choose k = 8 whence |
984 | | // each "block" requires at most 1 + 1 + 8 characters. |
985 | 0 | if (slen > PY_SSIZE_T_MAX / (1 + 1 + 8)) { |
986 | 0 | end = start + PY_SSIZE_T_MAX / (1 + 1 + 8); |
987 | 0 | end = Py_MIN(end, objlen); |
988 | 0 | slen = Py_MAX(0, end - start); |
989 | 0 | } |
990 | |
|
991 | 0 | Py_ssize_t ressize = 0; |
992 | 0 | for (Py_ssize_t i = start; i < end; ++i) { |
993 | 0 | Py_UCS4 c = PyUnicode_READ_CHAR(obj, i); |
994 | 0 | ressize += codec_handler_unicode_hex_width(c); |
995 | 0 | } |
996 | 0 | PyObject *res = PyUnicode_New(ressize, 127); |
997 | 0 | if (res == NULL) { |
998 | 0 | Py_DECREF(obj); |
999 | 0 | return NULL; |
1000 | 0 | } |
1001 | 0 | Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res); |
1002 | 0 | for (Py_ssize_t i = start; i < end; ++i) { |
1003 | 0 | Py_UCS4 c = PyUnicode_READ_CHAR(obj, i); |
1004 | 0 | codec_handler_write_unicode_hex(&outp, c); |
1005 | 0 | } |
1006 | 0 | assert(_PyUnicode_CheckConsistency(res, 1)); |
1007 | 0 | Py_DECREF(obj); |
1008 | 0 | return Py_BuildValue("(Nn)", res, end); |
1009 | 0 | } |
1010 | | |
1011 | | |
1012 | | static PyObject * |
1013 | | _PyCodec_BackslashReplaceUnicodeDecodeError(PyObject *exc) |
1014 | 0 | { |
1015 | 0 | PyObject *obj; |
1016 | 0 | Py_ssize_t objlen, start, end, slen; |
1017 | 0 | if (_PyUnicodeError_GetParams(exc, |
1018 | 0 | &obj, &objlen, |
1019 | 0 | &start, &end, &slen, true) < 0) |
1020 | 0 | { |
1021 | 0 | return NULL; |
1022 | 0 | } |
1023 | | |
1024 | 0 | PyObject *res = PyUnicode_New(4 * slen, 127); |
1025 | 0 | if (res == NULL) { |
1026 | 0 | Py_DECREF(obj); |
1027 | 0 | return NULL; |
1028 | 0 | } |
1029 | | |
1030 | 0 | Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res); |
1031 | 0 | const unsigned char *p = (const unsigned char *)PyBytes_AS_STRING(obj); |
1032 | 0 | for (Py_ssize_t i = start; i < end; i++, outp += 4) { |
1033 | 0 | const unsigned char ch = p[i]; |
1034 | 0 | outp[0] = '\\'; |
1035 | 0 | outp[1] = 'x'; |
1036 | 0 | outp[2] = Py_hexdigits[(ch >> 4) & 0xf]; |
1037 | 0 | outp[3] = Py_hexdigits[ch & 0xf]; |
1038 | 0 | } |
1039 | 0 | assert(_PyUnicode_CheckConsistency(res, 1)); |
1040 | 0 | Py_DECREF(obj); |
1041 | 0 | return Py_BuildValue("(Nn)", res, end); |
1042 | 0 | } |
1043 | | |
1044 | | |
1045 | | static inline PyObject * |
1046 | | _PyCodec_BackslashReplaceUnicodeTranslateError(PyObject *exc) |
1047 | 0 | { |
1048 | | // Same implementation as for UnicodeEncodeError objects. |
1049 | 0 | return _PyCodec_BackslashReplaceUnicodeEncodeError(exc); |
1050 | 0 | } |
1051 | | |
1052 | | |
1053 | | PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) |
1054 | 0 | { |
1055 | 0 | if (_PyIsUnicodeEncodeError(exc)) { |
1056 | 0 | return _PyCodec_BackslashReplaceUnicodeEncodeError(exc); |
1057 | 0 | } |
1058 | 0 | else if (_PyIsUnicodeDecodeError(exc)) { |
1059 | 0 | return _PyCodec_BackslashReplaceUnicodeDecodeError(exc); |
1060 | 0 | } |
1061 | 0 | else if (_PyIsUnicodeTranslateError(exc)) { |
1062 | 0 | return _PyCodec_BackslashReplaceUnicodeTranslateError(exc); |
1063 | 0 | } |
1064 | 0 | else { |
1065 | 0 | wrong_exception_type(exc); |
1066 | 0 | return NULL; |
1067 | 0 | } |
1068 | 0 | } |
1069 | | |
1070 | | |
1071 | | // --- handler: 'namereplace' ------------------------------------------------- |
1072 | | |
1073 | | PyObject *PyCodec_NameReplaceErrors(PyObject *exc) |
1074 | 0 | { |
1075 | 0 | if (!_PyIsUnicodeEncodeError(exc)) { |
1076 | 0 | wrong_exception_type(exc); |
1077 | 0 | return NULL; |
1078 | 0 | } |
1079 | | |
1080 | 0 | _PyUnicode_Name_CAPI *ucnhash_capi = _PyUnicode_GetNameCAPI(); |
1081 | 0 | if (ucnhash_capi == NULL) { |
1082 | 0 | return NULL; |
1083 | 0 | } |
1084 | | |
1085 | 0 | PyObject *obj; |
1086 | 0 | Py_ssize_t start, end; |
1087 | 0 | if (_PyUnicodeError_GetParams(exc, |
1088 | 0 | &obj, NULL, |
1089 | 0 | &start, &end, NULL, false) < 0) |
1090 | 0 | { |
1091 | 0 | return NULL; |
1092 | 0 | } |
1093 | | |
1094 | 0 | char buffer[256]; /* NAME_MAXLEN in unicodename_db.h */ |
1095 | 0 | Py_ssize_t imax = start, ressize = 0, replsize; |
1096 | 0 | for (; imax < end; ++imax) { |
1097 | 0 | Py_UCS4 c = PyUnicode_READ_CHAR(obj, imax); |
1098 | 0 | if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) { |
1099 | | // If 'c' is recognized by getname(), the corresponding replacement |
1100 | | // is '\\' + 'N' + '{' + NAME + '}', i.e. 1 + 1 + 1 + len(NAME) + 1 |
1101 | | // characters. Failures of getname() are ignored by the handler. |
1102 | 0 | replsize = 1 + 1 + 1 + strlen(buffer) + 1; |
1103 | 0 | } |
1104 | 0 | else { |
1105 | 0 | replsize = codec_handler_unicode_hex_width(c); |
1106 | 0 | } |
1107 | 0 | if (ressize > PY_SSIZE_T_MAX - replsize) { |
1108 | 0 | break; |
1109 | 0 | } |
1110 | 0 | ressize += replsize; |
1111 | 0 | } |
1112 | |
|
1113 | 0 | PyObject *res = PyUnicode_New(ressize, 127); |
1114 | 0 | if (res == NULL) { |
1115 | 0 | Py_DECREF(obj); |
1116 | 0 | return NULL; |
1117 | 0 | } |
1118 | | |
1119 | 0 | Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res); |
1120 | 0 | for (Py_ssize_t i = start; i < imax; ++i) { |
1121 | 0 | Py_UCS4 c = PyUnicode_READ_CHAR(obj, i); |
1122 | 0 | if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) { |
1123 | 0 | *outp++ = '\\'; |
1124 | 0 | *outp++ = 'N'; |
1125 | 0 | *outp++ = '{'; |
1126 | 0 | (void)strcpy((char *)outp, buffer); |
1127 | 0 | outp += strlen(buffer); |
1128 | 0 | *outp++ = '}'; |
1129 | 0 | } |
1130 | 0 | else { |
1131 | 0 | codec_handler_write_unicode_hex(&outp, c); |
1132 | 0 | } |
1133 | 0 | } |
1134 | |
|
1135 | 0 | assert(outp == PyUnicode_1BYTE_DATA(res) + ressize); |
1136 | 0 | assert(_PyUnicode_CheckConsistency(res, 1)); |
1137 | 0 | PyObject *restuple = Py_BuildValue("(Nn)", res, imax); |
1138 | 0 | Py_DECREF(obj); |
1139 | 0 | return restuple; |
1140 | 0 | } |
1141 | | |
1142 | | |
// Encoding codes returned by get_standard_encoding_impl() and consumed by
// the 'surrogatepass' handlers.
#define ENC_UNKNOWN -1  // the encoding name was not recognized
#define ENC_UTF8 0
#define ENC_UTF16BE 1   // UTF-16, big-endian
#define ENC_UTF16LE 2   // UTF-16, little-endian
#define ENC_UTF32BE 3   // UTF-32, big-endian
#define ENC_UTF32LE 4   // UTF-32, little-endian
1149 | | |
1150 | | static int |
1151 | | get_standard_encoding_impl(const char *encoding, int *bytelength) |
1152 | 8 | { |
1153 | 8 | if (Py_TOLOWER(encoding[0]) == 'u' && |
1154 | 8 | Py_TOLOWER(encoding[1]) == 't' && |
1155 | 8 | Py_TOLOWER(encoding[2]) == 'f') { |
1156 | 8 | encoding += 3; |
1157 | 8 | if (*encoding == '-' || *encoding == '_' ) |
1158 | 8 | encoding++; |
1159 | 8 | if (encoding[0] == '8' && encoding[1] == '\0') { |
1160 | 8 | *bytelength = 3; |
1161 | 8 | return ENC_UTF8; |
1162 | 8 | } |
1163 | 0 | else if (encoding[0] == '1' && encoding[1] == '6') { |
1164 | 0 | encoding += 2; |
1165 | 0 | *bytelength = 2; |
1166 | 0 | if (*encoding == '\0') { |
1167 | | #ifdef WORDS_BIGENDIAN |
1168 | | return ENC_UTF16BE; |
1169 | | #else |
1170 | 0 | return ENC_UTF16LE; |
1171 | 0 | #endif |
1172 | 0 | } |
1173 | 0 | if (*encoding == '-' || *encoding == '_' ) |
1174 | 0 | encoding++; |
1175 | 0 | if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') { |
1176 | 0 | if (Py_TOLOWER(encoding[0]) == 'b') |
1177 | 0 | return ENC_UTF16BE; |
1178 | 0 | if (Py_TOLOWER(encoding[0]) == 'l') |
1179 | 0 | return ENC_UTF16LE; |
1180 | 0 | } |
1181 | 0 | } |
1182 | 0 | else if (encoding[0] == '3' && encoding[1] == '2') { |
1183 | 0 | encoding += 2; |
1184 | 0 | *bytelength = 4; |
1185 | 0 | if (*encoding == '\0') { |
1186 | | #ifdef WORDS_BIGENDIAN |
1187 | | return ENC_UTF32BE; |
1188 | | #else |
1189 | 0 | return ENC_UTF32LE; |
1190 | 0 | #endif |
1191 | 0 | } |
1192 | 0 | if (*encoding == '-' || *encoding == '_' ) |
1193 | 0 | encoding++; |
1194 | 0 | if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') { |
1195 | 0 | if (Py_TOLOWER(encoding[0]) == 'b') |
1196 | 0 | return ENC_UTF32BE; |
1197 | 0 | if (Py_TOLOWER(encoding[0]) == 'l') |
1198 | 0 | return ENC_UTF32LE; |
1199 | 0 | } |
1200 | 0 | } |
1201 | 8 | } |
1202 | 0 | else if (strcmp(encoding, "cp65001") == 0) { |
1203 | 0 | *bytelength = 3; |
1204 | 0 | return ENC_UTF8; |
1205 | 0 | } |
1206 | 0 | return ENC_UNKNOWN; |
1207 | 8 | } |
1208 | | |
1209 | | |
1210 | | static int |
1211 | | get_standard_encoding(PyObject *encoding, int *code, int *bytelength) |
1212 | 8 | { |
1213 | 8 | const char *encoding_cstr = PyUnicode_AsUTF8(encoding); |
1214 | 8 | if (encoding_cstr == NULL) { |
1215 | 0 | return -1; |
1216 | 0 | } |
1217 | 8 | *code = get_standard_encoding_impl(encoding_cstr, bytelength); |
1218 | 8 | return 0; |
1219 | 8 | } |
1220 | | |
1221 | | |
1222 | | // --- handler: 'surrogatepass' ----------------------------------------------- |
1223 | | |
1224 | | static PyObject * |
1225 | | _PyCodec_SurrogatePassUnicodeEncodeError(PyObject *exc) |
1226 | 0 | { |
1227 | 0 | PyObject *encoding = PyUnicodeEncodeError_GetEncoding(exc); |
1228 | 0 | if (encoding == NULL) { |
1229 | 0 | return NULL; |
1230 | 0 | } |
1231 | 0 | int code, bytelength; |
1232 | 0 | int rc = get_standard_encoding(encoding, &code, &bytelength); |
1233 | 0 | Py_DECREF(encoding); |
1234 | 0 | if (rc < 0) { |
1235 | 0 | return NULL; |
1236 | 0 | } |
1237 | 0 | if (code == ENC_UNKNOWN) { |
1238 | 0 | goto bail; |
1239 | 0 | } |
1240 | | |
1241 | 0 | PyObject *obj; |
1242 | 0 | Py_ssize_t objlen, start, end, slen; |
1243 | 0 | if (_PyUnicodeError_GetParams(exc, |
1244 | 0 | &obj, &objlen, |
1245 | 0 | &start, &end, &slen, false) < 0) |
1246 | 0 | { |
1247 | 0 | return NULL; |
1248 | 0 | } |
1249 | | |
1250 | 0 | if (slen > PY_SSIZE_T_MAX / bytelength) { |
1251 | 0 | end = start + PY_SSIZE_T_MAX / bytelength; |
1252 | 0 | end = Py_MIN(end, objlen); |
1253 | 0 | slen = Py_MAX(0, end - start); |
1254 | 0 | } |
1255 | |
|
1256 | 0 | PyObject *res = PyBytes_FromStringAndSize(NULL, bytelength * slen); |
1257 | 0 | if (res == NULL) { |
1258 | 0 | Py_DECREF(obj); |
1259 | 0 | return NULL; |
1260 | 0 | } |
1261 | | |
1262 | 0 | unsigned char *outp = (unsigned char *)PyBytes_AsString(res); |
1263 | 0 | for (Py_ssize_t i = start; i < end; i++) { |
1264 | 0 | Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i); |
1265 | 0 | if (!Py_UNICODE_IS_SURROGATE(ch)) { |
1266 | | /* Not a surrogate, fail with original exception */ |
1267 | 0 | Py_DECREF(obj); |
1268 | 0 | Py_DECREF(res); |
1269 | 0 | goto bail; |
1270 | 0 | } |
1271 | 0 | switch (code) { |
1272 | 0 | case ENC_UTF8: { |
1273 | 0 | *outp++ = (unsigned char)(0xe0 | (ch >> 12)); |
1274 | 0 | *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f)); |
1275 | 0 | *outp++ = (unsigned char)(0x80 | (ch & 0x3f)); |
1276 | 0 | break; |
1277 | 0 | } |
1278 | 0 | case ENC_UTF16LE: { |
1279 | 0 | *outp++ = (unsigned char)ch; |
1280 | 0 | *outp++ = (unsigned char)(ch >> 8); |
1281 | 0 | break; |
1282 | 0 | } |
1283 | 0 | case ENC_UTF16BE: { |
1284 | 0 | *outp++ = (unsigned char)(ch >> 8); |
1285 | 0 | *outp++ = (unsigned char)ch; |
1286 | 0 | break; |
1287 | 0 | } |
1288 | 0 | case ENC_UTF32LE: { |
1289 | 0 | *outp++ = (unsigned char)ch; |
1290 | 0 | *outp++ = (unsigned char)(ch >> 8); |
1291 | 0 | *outp++ = (unsigned char)(ch >> 16); |
1292 | 0 | *outp++ = (unsigned char)(ch >> 24); |
1293 | 0 | break; |
1294 | 0 | } |
1295 | 0 | case ENC_UTF32BE: { |
1296 | 0 | *outp++ = (unsigned char)(ch >> 24); |
1297 | 0 | *outp++ = (unsigned char)(ch >> 16); |
1298 | 0 | *outp++ = (unsigned char)(ch >> 8); |
1299 | 0 | *outp++ = (unsigned char)ch; |
1300 | 0 | break; |
1301 | 0 | } |
1302 | 0 | } |
1303 | 0 | } |
1304 | | |
1305 | 0 | Py_DECREF(obj); |
1306 | 0 | PyObject *restuple = Py_BuildValue("(Nn)", res, end); |
1307 | 0 | return restuple; |
1308 | | |
1309 | 0 | bail: |
1310 | 0 | PyErr_SetObject(PyExceptionInstance_Class(exc), exc); |
1311 | 0 | return NULL; |
1312 | 0 | } |
1313 | | |
1314 | | |
1315 | | static PyObject * |
1316 | | _PyCodec_SurrogatePassUnicodeDecodeError(PyObject *exc) |
1317 | 8 | { |
1318 | 8 | PyObject *encoding = PyUnicodeDecodeError_GetEncoding(exc); |
1319 | 8 | if (encoding == NULL) { |
1320 | 0 | return NULL; |
1321 | 0 | } |
1322 | 8 | int code, bytelength; |
1323 | 8 | int rc = get_standard_encoding(encoding, &code, &bytelength); |
1324 | 8 | Py_DECREF(encoding); |
1325 | 8 | if (rc < 0) { |
1326 | 0 | return NULL; |
1327 | 0 | } |
1328 | 8 | if (code == ENC_UNKNOWN) { |
1329 | 0 | goto bail; |
1330 | 0 | } |
1331 | | |
1332 | 8 | PyObject *obj; |
1333 | 8 | Py_ssize_t objlen, start, end, slen; |
1334 | 8 | if (_PyUnicodeError_GetParams(exc, |
1335 | 8 | &obj, &objlen, |
1336 | 8 | &start, &end, &slen, true) < 0) |
1337 | 0 | { |
1338 | 0 | return NULL; |
1339 | 0 | } |
1340 | | |
1341 | | /* Try decoding a single surrogate character. If |
1342 | | there are more, let the codec call us again. */ |
1343 | 8 | Py_UCS4 ch = 0; |
1344 | 8 | const unsigned char *p = (const unsigned char *)PyBytes_AS_STRING(obj); |
1345 | 8 | p += start; |
1346 | | |
1347 | 8 | if (objlen - start >= bytelength) { |
1348 | 8 | switch (code) { |
1349 | 8 | case ENC_UTF8: { |
1350 | 8 | if ((p[0] & 0xf0) == 0xe0 && |
1351 | 8 | (p[1] & 0xc0) == 0x80 && |
1352 | 8 | (p[2] & 0xc0) == 0x80) |
1353 | 8 | { |
1354 | | /* it's a three-byte code */ |
1355 | 8 | ch = ((p[0] & 0x0f) << 12) + |
1356 | 8 | ((p[1] & 0x3f) << 6) + |
1357 | 8 | (p[2] & 0x3f); |
1358 | 8 | } |
1359 | 8 | break; |
1360 | 0 | } |
1361 | 0 | case ENC_UTF16LE: { |
1362 | 0 | ch = p[1] << 8 | p[0]; |
1363 | 0 | break; |
1364 | 0 | } |
1365 | 0 | case ENC_UTF16BE: { |
1366 | 0 | ch = p[0] << 8 | p[1]; |
1367 | 0 | break; |
1368 | 0 | } |
1369 | 0 | case ENC_UTF32LE: { |
1370 | 0 | ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0]; |
1371 | 0 | break; |
1372 | 0 | } |
1373 | 0 | case ENC_UTF32BE: { |
1374 | 0 | ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3]; |
1375 | 0 | break; |
1376 | 0 | } |
1377 | 8 | } |
1378 | 8 | } |
1379 | 8 | Py_DECREF(obj); |
1380 | 8 | if (!Py_UNICODE_IS_SURROGATE(ch)) { |
1381 | 0 | goto bail; |
1382 | 0 | } |
1383 | | |
1384 | 8 | PyObject *res = PyUnicode_FromOrdinal(ch); |
1385 | 8 | if (res == NULL) { |
1386 | 0 | return NULL; |
1387 | 0 | } |
1388 | 8 | return Py_BuildValue("(Nn)", res, start + bytelength); |
1389 | | |
1390 | 0 | bail: |
1391 | 0 | PyErr_SetObject(PyExceptionInstance_Class(exc), exc); |
1392 | 0 | return NULL; |
1393 | 8 | } |
1394 | | |
1395 | | |
1396 | | /* This handler is declared static until someone demonstrates |
1397 | | a need to call it directly. */ |
1398 | | static PyObject * |
1399 | | PyCodec_SurrogatePassErrors(PyObject *exc) |
1400 | 8 | { |
1401 | 8 | if (_PyIsUnicodeEncodeError(exc)) { |
1402 | 0 | return _PyCodec_SurrogatePassUnicodeEncodeError(exc); |
1403 | 0 | } |
1404 | 8 | else if (_PyIsUnicodeDecodeError(exc)) { |
1405 | 8 | return _PyCodec_SurrogatePassUnicodeDecodeError(exc); |
1406 | 8 | } |
1407 | 0 | else { |
1408 | 0 | wrong_exception_type(exc); |
1409 | 0 | return NULL; |
1410 | 0 | } |
1411 | 8 | } |
1412 | | |
1413 | | |
1414 | | // --- handler: 'surrogateescape' --------------------------------------------- |
1415 | | |
1416 | | static PyObject * |
1417 | | _PyCodec_SurrogateEscapeUnicodeEncodeError(PyObject *exc) |
1418 | 12.8k | { |
1419 | 12.8k | PyObject *obj; |
1420 | 12.8k | Py_ssize_t start, end, slen; |
1421 | 12.8k | if (_PyUnicodeError_GetParams(exc, |
1422 | 12.8k | &obj, NULL, |
1423 | 12.8k | &start, &end, &slen, false) < 0) |
1424 | 0 | { |
1425 | 0 | return NULL; |
1426 | 0 | } |
1427 | | |
1428 | 12.8k | PyObject *res = PyBytes_FromStringAndSize(NULL, slen); |
1429 | 12.8k | if (res == NULL) { |
1430 | 0 | Py_DECREF(obj); |
1431 | 0 | return NULL; |
1432 | 0 | } |
1433 | | |
1434 | 12.8k | char *outp = PyBytes_AsString(res); |
1435 | 12.8k | for (Py_ssize_t i = start; i < end; i++) { |
1436 | 12.8k | Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i); |
1437 | 12.8k | if (ch < 0xdc80 || ch > 0xdcff) { |
1438 | | /* Not a UTF-8b surrogate, fail with original exception. */ |
1439 | 12.8k | Py_DECREF(obj); |
1440 | 12.8k | Py_DECREF(res); |
1441 | 12.8k | PyErr_SetObject(PyExceptionInstance_Class(exc), exc); |
1442 | 12.8k | return NULL; |
1443 | 12.8k | } |
1444 | 0 | *outp++ = ch - 0xdc00; |
1445 | 0 | } |
1446 | 0 | Py_DECREF(obj); |
1447 | |
|
1448 | 0 | return Py_BuildValue("(Nn)", res, end); |
1449 | 12.8k | } |
1450 | | |
1451 | | |
1452 | | static PyObject * |
1453 | | _PyCodec_SurrogateEscapeUnicodeDecodeError(PyObject *exc) |
1454 | 77.5k | { |
1455 | 77.5k | PyObject *obj; |
1456 | 77.5k | Py_ssize_t start, end, slen; |
1457 | 77.5k | if (_PyUnicodeError_GetParams(exc, |
1458 | 77.5k | &obj, NULL, |
1459 | 77.5k | &start, &end, &slen, true) < 0) |
1460 | 0 | { |
1461 | 0 | return NULL; |
1462 | 0 | } |
1463 | | |
1464 | 77.5k | Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */ |
1465 | 77.5k | int consumed = 0; |
1466 | 77.5k | const unsigned char *p = (const unsigned char *)PyBytes_AS_STRING(obj); |
1467 | 173k | while (consumed < 4 && consumed < slen) { |
1468 | | /* Refuse to escape ASCII bytes. */ |
1469 | 144k | if (p[start + consumed] < 128) { |
1470 | 48.6k | break; |
1471 | 48.6k | } |
1472 | 96.1k | ch[consumed] = 0xdc00 + p[start + consumed]; |
1473 | 96.1k | consumed++; |
1474 | 96.1k | } |
1475 | 77.5k | Py_DECREF(obj); |
1476 | | |
1477 | 77.5k | if (consumed == 0) { |
1478 | | /* Codec complained about ASCII byte. */ |
1479 | 27.0k | PyErr_SetObject(PyExceptionInstance_Class(exc), exc); |
1480 | 27.0k | return NULL; |
1481 | 27.0k | } |
1482 | | |
1483 | 50.5k | PyObject *str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed); |
1484 | 50.5k | if (str == NULL) { |
1485 | 0 | return NULL; |
1486 | 0 | } |
1487 | 50.5k | return Py_BuildValue("(Nn)", str, start + consumed); |
1488 | 50.5k | } |
1489 | | |
1490 | | |
1491 | | static PyObject * |
1492 | | PyCodec_SurrogateEscapeErrors(PyObject *exc) |
1493 | 90.4k | { |
1494 | 90.4k | if (_PyIsUnicodeEncodeError(exc)) { |
1495 | 12.8k | return _PyCodec_SurrogateEscapeUnicodeEncodeError(exc); |
1496 | 12.8k | } |
1497 | 77.5k | else if (_PyIsUnicodeDecodeError(exc)) { |
1498 | 77.5k | return _PyCodec_SurrogateEscapeUnicodeDecodeError(exc); |
1499 | 77.5k | } |
1500 | 0 | else { |
1501 | 0 | wrong_exception_type(exc); |
1502 | 0 | return NULL; |
1503 | 0 | } |
1504 | 90.4k | } |
1505 | | |
1506 | | |
1507 | | // --- Codecs registry handlers ----------------------------------------------- |
1508 | | |
1509 | | static inline PyObject * |
1510 | | strict_errors(PyObject *Py_UNUSED(self), PyObject *exc) |
1511 | 2.45M | { |
1512 | 2.45M | return PyCodec_StrictErrors(exc); |
1513 | 2.45M | } |
1514 | | |
1515 | | |
1516 | | static inline PyObject * |
1517 | | ignore_errors(PyObject *Py_UNUSED(self), PyObject *exc) |
1518 | 0 | { |
1519 | 0 | return PyCodec_IgnoreErrors(exc); |
1520 | 0 | } |
1521 | | |
1522 | | |
1523 | | static inline PyObject * |
1524 | | replace_errors(PyObject *Py_UNUSED(self), PyObject *exc) |
1525 | 228k | { |
1526 | 228k | return PyCodec_ReplaceErrors(exc); |
1527 | 228k | } |
1528 | | |
1529 | | |
1530 | | static inline PyObject * |
1531 | | xmlcharrefreplace_errors(PyObject *Py_UNUSED(self), PyObject *exc) |
1532 | 0 | { |
1533 | 0 | return PyCodec_XMLCharRefReplaceErrors(exc); |
1534 | 0 | } |
1535 | | |
1536 | | |
1537 | | static inline PyObject * |
1538 | | backslashreplace_errors(PyObject *Py_UNUSED(self), PyObject *exc) |
1539 | 0 | { |
1540 | 0 | return PyCodec_BackslashReplaceErrors(exc); |
1541 | 0 | } |
1542 | | |
1543 | | |
1544 | | static inline PyObject * |
1545 | | namereplace_errors(PyObject *Py_UNUSED(self), PyObject *exc) |
1546 | 0 | { |
1547 | 0 | return PyCodec_NameReplaceErrors(exc); |
1548 | 0 | } |
1549 | | |
1550 | | |
1551 | | static inline PyObject * |
1552 | | surrogatepass_errors(PyObject *Py_UNUSED(self), PyObject *exc) |
1553 | 8 | { |
1554 | 8 | return PyCodec_SurrogatePassErrors(exc); |
1555 | 8 | } |
1556 | | |
1557 | | |
1558 | | static inline PyObject * |
1559 | | surrogateescape_errors(PyObject *Py_UNUSED(self), PyObject *exc) |
1560 | 90.4k | { |
1561 | 90.4k | return PyCodec_SurrogateEscapeErrors(exc); |
1562 | 90.4k | } |
1563 | | |
1564 | | |
1565 | | PyStatus |
1566 | | _PyCodec_InitRegistry(PyInterpreterState *interp) |
1567 | 37 | { |
1568 | 37 | static struct { |
1569 | 37 | const char *name; |
1570 | 37 | PyMethodDef def; |
1571 | 37 | } methods[] = |
1572 | 37 | { |
1573 | 37 | { |
1574 | 37 | "strict", |
1575 | 37 | { |
1576 | 37 | "strict_errors", |
1577 | 37 | strict_errors, |
1578 | 37 | METH_O, |
1579 | 37 | PyDoc_STR("Implements the 'strict' error handling, which " |
1580 | 37 | "raises a UnicodeError on coding errors.") |
1581 | 37 | } |
1582 | 37 | }, |
1583 | 37 | { |
1584 | 37 | "ignore", |
1585 | 37 | { |
1586 | 37 | "ignore_errors", |
1587 | 37 | ignore_errors, |
1588 | 37 | METH_O, |
1589 | 37 | PyDoc_STR("Implements the 'ignore' error handling, which " |
1590 | 37 | "ignores malformed data and continues.") |
1591 | 37 | } |
1592 | 37 | }, |
1593 | 37 | { |
1594 | 37 | "replace", |
1595 | 37 | { |
1596 | 37 | "replace_errors", |
1597 | 37 | replace_errors, |
1598 | 37 | METH_O, |
1599 | 37 | PyDoc_STR("Implements the 'replace' error handling, which " |
1600 | 37 | "replaces malformed data with a replacement marker.") |
1601 | 37 | } |
1602 | 37 | }, |
1603 | 37 | { |
1604 | 37 | "xmlcharrefreplace", |
1605 | 37 | { |
1606 | 37 | "xmlcharrefreplace_errors", |
1607 | 37 | xmlcharrefreplace_errors, |
1608 | 37 | METH_O, |
1609 | 37 | PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, " |
1610 | 37 | "which replaces an unencodable character with the " |
1611 | 37 | "appropriate XML character reference.") |
1612 | 37 | } |
1613 | 37 | }, |
1614 | 37 | { |
1615 | 37 | "backslashreplace", |
1616 | 37 | { |
1617 | 37 | "backslashreplace_errors", |
1618 | 37 | backslashreplace_errors, |
1619 | 37 | METH_O, |
1620 | 37 | PyDoc_STR("Implements the 'backslashreplace' error handling, " |
1621 | 37 | "which replaces malformed data with a backslashed " |
1622 | 37 | "escape sequence.") |
1623 | 37 | } |
1624 | 37 | }, |
1625 | 37 | { |
1626 | 37 | "namereplace", |
1627 | 37 | { |
1628 | 37 | "namereplace_errors", |
1629 | 37 | namereplace_errors, |
1630 | 37 | METH_O, |
1631 | 37 | PyDoc_STR("Implements the 'namereplace' error handling, " |
1632 | 37 | "which replaces an unencodable character with a " |
1633 | 37 | "\\N{...} escape sequence.") |
1634 | 37 | } |
1635 | 37 | }, |
1636 | 37 | { |
1637 | 37 | "surrogatepass", |
1638 | 37 | { |
1639 | 37 | "surrogatepass", |
1640 | 37 | surrogatepass_errors, |
1641 | 37 | METH_O |
1642 | 37 | } |
1643 | 37 | }, |
1644 | 37 | { |
1645 | 37 | "surrogateescape", |
1646 | 37 | { |
1647 | 37 | "surrogateescape", |
1648 | 37 | surrogateescape_errors, |
1649 | 37 | METH_O |
1650 | 37 | } |
1651 | 37 | } |
1652 | 37 | }; |
1653 | | // ensure that the built-in error handlers' names are kept in sync |
1654 | 37 | assert(Py_ARRAY_LENGTH(methods) == Py_ARRAY_LENGTH(codecs_builtin_error_handlers)); |
1655 | | |
1656 | 37 | assert(interp->codecs.initialized == 0); |
1657 | 37 | interp->codecs.search_path = PyList_New(0); |
1658 | 37 | if (interp->codecs.search_path == NULL) { |
1659 | 0 | return PyStatus_NoMemory(); |
1660 | 0 | } |
1661 | 37 | interp->codecs.search_cache = PyDict_New(); |
1662 | 37 | if (interp->codecs.search_cache == NULL) { |
1663 | 0 | return PyStatus_NoMemory(); |
1664 | 0 | } |
1665 | 37 | interp->codecs.error_registry = PyDict_New(); |
1666 | 37 | if (interp->codecs.error_registry == NULL) { |
1667 | 0 | return PyStatus_NoMemory(); |
1668 | 0 | } |
1669 | 333 | for (size_t i = 0; i < Py_ARRAY_LENGTH(methods); ++i) { |
1670 | 296 | PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL); |
1671 | 296 | if (func == NULL) { |
1672 | 0 | return PyStatus_NoMemory(); |
1673 | 0 | } |
1674 | | |
1675 | 296 | int res = PyDict_SetItemString(interp->codecs.error_registry, |
1676 | 296 | methods[i].name, func); |
1677 | 296 | Py_DECREF(func); |
1678 | 296 | if (res < 0) { |
1679 | 0 | return PyStatus_Error("Failed to insert into codec error registry"); |
1680 | 0 | } |
1681 | 296 | } |
1682 | | |
1683 | 37 | interp->codecs.initialized = 1; |
1684 | | |
1685 | | // Importing `encodings' will call back into this module to register codec |
1686 | | // search functions, so this is done after everything else is initialized. |
1687 | 37 | PyObject *mod = PyImport_ImportModule("encodings"); |
1688 | 37 | if (mod == NULL) { |
1689 | 0 | return PyStatus_Error("Failed to import encodings module"); |
1690 | 0 | } |
1691 | 37 | Py_DECREF(mod); |
1692 | | |
1693 | 37 | return PyStatus_Ok(); |
1694 | 37 | } |
1695 | | |
1696 | | void |
1697 | | _PyCodec_Fini(PyInterpreterState *interp) |
1698 | 0 | { |
1699 | 0 | Py_CLEAR(interp->codecs.search_path); |
1700 | 0 | Py_CLEAR(interp->codecs.search_cache); |
1701 | | Py_CLEAR(interp->codecs.error_registry); |
1702 | 0 | interp->codecs.initialized = 0; |
1703 | 0 | } |