/src/Python-3.8.3/Python/codecs.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* ------------------------------------------------------------------------ |
2 | | |
3 | | Python Codec Registry and support functions |
4 | | |
5 | | Written by Marc-Andre Lemburg (mal@lemburg.com). |
6 | | |
7 | | Copyright (c) Corporation for National Research Initiatives. |
8 | | |
9 | | ------------------------------------------------------------------------ */ |
10 | | |
11 | | #include "Python.h" |
12 | | #include "pycore_pystate.h" |
13 | | #include "ucnhash.h" |
14 | | #include <ctype.h> |
15 | | |
16 | | const char *Py_hexdigits = "0123456789abcdef"; |
17 | | |
18 | | /* --- Codec Registry ----------------------------------------------------- */ |
19 | | |
20 | | /* Import the standard encodings package which will register the first |
21 | | codec search function. |
22 | | |
23 | | This is done in a lazy way so that the Unicode implementation does |
24 | | not downgrade startup time of scripts not needing it. |
25 | | |
26 | | ImportErrors are silently ignored by this function. Only one try is |
27 | | made. |
28 | | |
29 | | */ |
30 | | |
31 | | static int _PyCodecRegistry_Init(void); /* Forward */ |
32 | | |
33 | | int PyCodec_Register(PyObject *search_function) |
34 | 14 | { |
35 | 14 | PyInterpreterState *interp = _PyInterpreterState_Get(); |
36 | 14 | if (interp->codec_search_path == NULL && _PyCodecRegistry_Init()) |
37 | 0 | goto onError; |
38 | 14 | if (search_function == NULL) { |
39 | 0 | PyErr_BadArgument(); |
40 | 0 | goto onError; |
41 | 0 | } |
42 | 14 | if (!PyCallable_Check(search_function)) { |
43 | 0 | PyErr_SetString(PyExc_TypeError, "argument must be callable"); |
44 | 0 | goto onError; |
45 | 0 | } |
46 | 14 | return PyList_Append(interp->codec_search_path, search_function); |
47 | | |
48 | 0 | onError: |
49 | 0 | return -1; |
50 | 14 | } |
51 | | |
52 | | /* Convert a string to a normalized Python string: all characters are |
53 | | converted to lower case, spaces are replaced with underscores. */ |
54 | | |
55 | | static |
56 | | PyObject *normalizestring(const char *string) |
57 | 85 | { |
58 | 85 | size_t i; |
59 | 85 | size_t len = strlen(string); |
60 | 85 | char *p; |
61 | 85 | PyObject *v; |
62 | | |
63 | 85 | if (len > PY_SSIZE_T_MAX) { |
64 | 0 | PyErr_SetString(PyExc_OverflowError, "string is too large"); |
65 | 0 | return NULL; |
66 | 0 | } |
67 | | |
68 | 85 | p = PyMem_Malloc(len + 1); |
69 | 85 | if (p == NULL) |
70 | 0 | return PyErr_NoMemory(); |
71 | 762 | for (i = 0; i < len; i++) { |
72 | 677 | char ch = string[i]; |
73 | 677 | if (ch == ' ') |
74 | 0 | ch = '-'; |
75 | 677 | else |
76 | 677 | ch = Py_TOLOWER(Py_CHARMASK(ch)); |
77 | 677 | p[i] = ch; |
78 | 677 | } |
79 | 85 | p[i] = '\0'; |
80 | 85 | v = PyUnicode_FromString(p); |
81 | 85 | PyMem_Free(p); |
82 | 85 | return v; |
83 | 85 | } |
84 | | |
85 | | /* Lookup the given encoding and return a tuple providing the codec |
86 | | facilities. |
87 | | |
88 | | The encoding string is looked up converted to all lower-case |
89 | | characters. This makes encodings looked up through this mechanism |
90 | | effectively case-insensitive. |
91 | | |
92 | | If no codec is found, a LookupError is set and NULL returned. |
93 | | |
94 | | As side effect, this tries to load the encodings package, if not |
95 | | yet done. This is part of the lazy load strategy for the encodings |
96 | | package. |
97 | | |
98 | | */ |
99 | | |
100 | | PyObject *_PyCodec_Lookup(const char *encoding) |
101 | 85 | { |
102 | 85 | PyObject *result, *args = NULL, *v; |
103 | 85 | Py_ssize_t i, len; |
104 | | |
105 | 85 | if (encoding == NULL) { |
106 | 0 | PyErr_BadArgument(); |
107 | 0 | goto onError; |
108 | 0 | } |
109 | | |
110 | 85 | PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE(); |
111 | 85 | if (interp->codec_search_path == NULL && _PyCodecRegistry_Init()) |
112 | 0 | goto onError; |
113 | | |
114 | | /* Convert the encoding to a normalized Python string: all |
115 | | characters are converted to lower case, spaces and hyphens are |
116 | | replaced with underscores. */ |
117 | 85 | v = normalizestring(encoding); |
118 | 85 | if (v == NULL) |
119 | 0 | goto onError; |
120 | 85 | PyUnicode_InternInPlace(&v); |
121 | | |
122 | | /* First, try to lookup the name in the registry dictionary */ |
123 | 85 | result = PyDict_GetItemWithError(interp->codec_search_cache, v); |
124 | 85 | if (result != NULL) { |
125 | 56 | Py_INCREF(result); |
126 | 56 | Py_DECREF(v); |
127 | 56 | return result; |
128 | 56 | } |
129 | 29 | else if (PyErr_Occurred()) { |
130 | 0 | Py_DECREF(v); |
131 | 0 | return NULL; |
132 | 0 | } |
133 | | |
134 | | /* Next, scan the search functions in order of registration */ |
135 | 29 | args = PyTuple_New(1); |
136 | 29 | if (args == NULL) { |
137 | 0 | Py_DECREF(v); |
138 | 0 | return NULL; |
139 | 0 | } |
140 | 29 | PyTuple_SET_ITEM(args,0,v); |
141 | | |
142 | 29 | len = PyList_Size(interp->codec_search_path); |
143 | 29 | if (len < 0) |
144 | 0 | goto onError; |
145 | 29 | if (len == 0) { |
146 | 0 | PyErr_SetString(PyExc_LookupError, |
147 | 0 | "no codec search functions registered: " |
148 | 0 | "can't find encoding"); |
149 | 0 | goto onError; |
150 | 0 | } |
151 | | |
152 | 29 | for (i = 0; i < len; i++) { |
153 | 29 | PyObject *func; |
154 | | |
155 | 29 | func = PyList_GetItem(interp->codec_search_path, i); |
156 | 29 | if (func == NULL) |
157 | 0 | goto onError; |
158 | 29 | result = PyEval_CallObject(func, args); |
159 | 29 | if (result == NULL) |
160 | 0 | goto onError; |
161 | 29 | if (result == Py_None) { |
162 | 0 | Py_DECREF(result); |
163 | 0 | continue; |
164 | 0 | } |
165 | 29 | if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) { |
166 | 0 | PyErr_SetString(PyExc_TypeError, |
167 | 0 | "codec search functions must return 4-tuples"); |
168 | 0 | Py_DECREF(result); |
169 | 0 | goto onError; |
170 | 0 | } |
171 | 29 | break; |
172 | 29 | } |
173 | 29 | if (i == len) { |
174 | | /* XXX Perhaps we should cache misses too ? */ |
175 | 0 | PyErr_Format(PyExc_LookupError, |
176 | 0 | "unknown encoding: %s", encoding); |
177 | 0 | goto onError; |
178 | 0 | } |
179 | | |
180 | | /* Cache and return the result */ |
181 | 29 | if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) { |
182 | 0 | Py_DECREF(result); |
183 | 0 | goto onError; |
184 | 0 | } |
185 | 29 | Py_DECREF(args); |
186 | 29 | return result; |
187 | | |
188 | 0 | onError: |
189 | 0 | Py_XDECREF(args); |
190 | 0 | return NULL; |
191 | 29 | } |
192 | | |
193 | | int _PyCodec_Forget(const char *encoding) |
194 | 0 | { |
195 | 0 | PyObject *v; |
196 | 0 | int result; |
197 | |
|
198 | 0 | PyInterpreterState *interp = _PyInterpreterState_Get(); |
199 | 0 | if (interp->codec_search_path == NULL) { |
200 | 0 | return -1; |
201 | 0 | } |
202 | | |
203 | | /* Convert the encoding to a normalized Python string: all |
204 | | characters are converted to lower case, spaces and hyphens are |
205 | | replaced with underscores. */ |
206 | 0 | v = normalizestring(encoding); |
207 | 0 | if (v == NULL) { |
208 | 0 | return -1; |
209 | 0 | } |
210 | | |
211 | | /* Drop the named codec from the internal cache */ |
212 | 0 | result = PyDict_DelItem(interp->codec_search_cache, v); |
213 | 0 | Py_DECREF(v); |
214 | |
|
215 | 0 | return result; |
216 | 0 | } |
217 | | |
218 | | /* Codec registry encoding check API. */ |
219 | | |
220 | | int PyCodec_KnownEncoding(const char *encoding) |
221 | 0 | { |
222 | 0 | PyObject *codecs; |
223 | |
|
224 | 0 | codecs = _PyCodec_Lookup(encoding); |
225 | 0 | if (!codecs) { |
226 | 0 | PyErr_Clear(); |
227 | 0 | return 0; |
228 | 0 | } |
229 | 0 | else { |
230 | 0 | Py_DECREF(codecs); |
231 | 0 | return 1; |
232 | 0 | } |
233 | 0 | } |
234 | | |
235 | | static |
236 | | PyObject *args_tuple(PyObject *object, |
237 | | const char *errors) |
238 | 0 | { |
239 | 0 | PyObject *args; |
240 | |
|
241 | 0 | args = PyTuple_New(1 + (errors != NULL)); |
242 | 0 | if (args == NULL) |
243 | 0 | return NULL; |
244 | 0 | Py_INCREF(object); |
245 | 0 | PyTuple_SET_ITEM(args,0,object); |
246 | 0 | if (errors) { |
247 | 0 | PyObject *v; |
248 | |
|
249 | 0 | v = PyUnicode_FromString(errors); |
250 | 0 | if (v == NULL) { |
251 | 0 | Py_DECREF(args); |
252 | 0 | return NULL; |
253 | 0 | } |
254 | 0 | PyTuple_SET_ITEM(args, 1, v); |
255 | 0 | } |
256 | 0 | return args; |
257 | 0 | } |
258 | | |
259 | | /* Helper function to get a codec item */ |
260 | | |
261 | | static |
262 | | PyObject *codec_getitem(const char *encoding, int index) |
263 | 0 | { |
264 | 0 | PyObject *codecs; |
265 | 0 | PyObject *v; |
266 | |
|
267 | 0 | codecs = _PyCodec_Lookup(encoding); |
268 | 0 | if (codecs == NULL) |
269 | 0 | return NULL; |
270 | 0 | v = PyTuple_GET_ITEM(codecs, index); |
271 | 0 | Py_DECREF(codecs); |
272 | 0 | Py_INCREF(v); |
273 | 0 | return v; |
274 | 0 | } |
275 | | |
276 | | /* Helper functions to create an incremental codec. */ |
277 | | static |
278 | | PyObject *codec_makeincrementalcodec(PyObject *codec_info, |
279 | | const char *errors, |
280 | | const char *attrname) |
281 | 43 | { |
282 | 43 | PyObject *ret, *inccodec; |
283 | | |
284 | 43 | inccodec = PyObject_GetAttrString(codec_info, attrname); |
285 | 43 | if (inccodec == NULL) |
286 | 0 | return NULL; |
287 | 43 | if (errors) |
288 | 43 | ret = PyObject_CallFunction(inccodec, "s", errors); |
289 | 0 | else |
290 | 0 | ret = _PyObject_CallNoArg(inccodec); |
291 | 43 | Py_DECREF(inccodec); |
292 | 43 | return ret; |
293 | 43 | } |
294 | | |
295 | | static |
296 | | PyObject *codec_getincrementalcodec(const char *encoding, |
297 | | const char *errors, |
298 | | const char *attrname) |
299 | 0 | { |
300 | 0 | PyObject *codec_info, *ret; |
301 | |
|
302 | 0 | codec_info = _PyCodec_Lookup(encoding); |
303 | 0 | if (codec_info == NULL) |
304 | 0 | return NULL; |
305 | 0 | ret = codec_makeincrementalcodec(codec_info, errors, attrname); |
306 | 0 | Py_DECREF(codec_info); |
307 | 0 | return ret; |
308 | 0 | } |
309 | | |
310 | | /* Helper function to create a stream codec. */ |
311 | | |
312 | | static |
313 | | PyObject *codec_getstreamcodec(const char *encoding, |
314 | | PyObject *stream, |
315 | | const char *errors, |
316 | | const int index) |
317 | 0 | { |
318 | 0 | PyObject *codecs, *streamcodec, *codeccls; |
319 | |
|
320 | 0 | codecs = _PyCodec_Lookup(encoding); |
321 | 0 | if (codecs == NULL) |
322 | 0 | return NULL; |
323 | | |
324 | 0 | codeccls = PyTuple_GET_ITEM(codecs, index); |
325 | 0 | if (errors != NULL) |
326 | 0 | streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors); |
327 | 0 | else |
328 | 0 | streamcodec = PyObject_CallFunctionObjArgs(codeccls, stream, NULL); |
329 | 0 | Py_DECREF(codecs); |
330 | 0 | return streamcodec; |
331 | 0 | } |
332 | | |
333 | | /* Helpers to work with the result of _PyCodec_Lookup |
334 | | |
335 | | */ |
336 | | PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info, |
337 | | const char *errors) |
338 | 15 | { |
339 | 15 | return codec_makeincrementalcodec(codec_info, errors, |
340 | 15 | "incrementaldecoder"); |
341 | 15 | } |
342 | | |
343 | | PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info, |
344 | | const char *errors) |
345 | 28 | { |
346 | 28 | return codec_makeincrementalcodec(codec_info, errors, |
347 | 28 | "incrementalencoder"); |
348 | 28 | } |
349 | | |
350 | | |
351 | | /* Convenience APIs to query the Codec registry. |
352 | | |
353 | | All APIs return a codec object with incremented refcount. |
354 | | |
355 | | */ |
356 | | |
357 | | PyObject *PyCodec_Encoder(const char *encoding) |
358 | 0 | { |
359 | 0 | return codec_getitem(encoding, 0); |
360 | 0 | } |
361 | | |
362 | | PyObject *PyCodec_Decoder(const char *encoding) |
363 | 0 | { |
364 | 0 | return codec_getitem(encoding, 1); |
365 | 0 | } |
366 | | |
367 | | PyObject *PyCodec_IncrementalEncoder(const char *encoding, |
368 | | const char *errors) |
369 | 0 | { |
370 | 0 | return codec_getincrementalcodec(encoding, errors, "incrementalencoder"); |
371 | 0 | } |
372 | | |
373 | | PyObject *PyCodec_IncrementalDecoder(const char *encoding, |
374 | | const char *errors) |
375 | 0 | { |
376 | 0 | return codec_getincrementalcodec(encoding, errors, "incrementaldecoder"); |
377 | 0 | } |
378 | | |
379 | | PyObject *PyCodec_StreamReader(const char *encoding, |
380 | | PyObject *stream, |
381 | | const char *errors) |
382 | 0 | { |
383 | 0 | return codec_getstreamcodec(encoding, stream, errors, 2); |
384 | 0 | } |
385 | | |
386 | | PyObject *PyCodec_StreamWriter(const char *encoding, |
387 | | PyObject *stream, |
388 | | const char *errors) |
389 | 0 | { |
390 | 0 | return codec_getstreamcodec(encoding, stream, errors, 3); |
391 | 0 | } |
392 | | |
/* Helper that tries to make the reported exception chain name the codec
 * and the operation ("encoding" or "decoding") that triggered a failure,
 * without changing the type of the exception raised.
 */
static void
wrap_codec_error(const char *operation,
                 const char *encoding)
{
    /* _PyErr_TrySetFromCause swaps the active exception for a suitably
     * updated clone when it is able to; when it is not, the original
     * exception is left untouched.
     */
    _PyErr_TrySetFromCause("%s with '%s' codec failed",
                           operation,
                           encoding);
}
408 | | |
409 | | /* Encode an object (e.g. a Unicode object) using the given encoding |
410 | | and return the resulting encoded object (usually a Python string). |
411 | | |
412 | | errors is passed to the encoder factory as argument if non-NULL. */ |
413 | | |
414 | | static PyObject * |
415 | | _PyCodec_EncodeInternal(PyObject *object, |
416 | | PyObject *encoder, |
417 | | const char *encoding, |
418 | | const char *errors) |
419 | 0 | { |
420 | 0 | PyObject *args = NULL, *result = NULL; |
421 | 0 | PyObject *v = NULL; |
422 | |
|
423 | 0 | args = args_tuple(object, errors); |
424 | 0 | if (args == NULL) |
425 | 0 | goto onError; |
426 | | |
427 | 0 | result = PyEval_CallObject(encoder, args); |
428 | 0 | if (result == NULL) { |
429 | 0 | wrap_codec_error("encoding", encoding); |
430 | 0 | goto onError; |
431 | 0 | } |
432 | | |
433 | 0 | if (!PyTuple_Check(result) || |
434 | 0 | PyTuple_GET_SIZE(result) != 2) { |
435 | 0 | PyErr_SetString(PyExc_TypeError, |
436 | 0 | "encoder must return a tuple (object, integer)"); |
437 | 0 | goto onError; |
438 | 0 | } |
439 | 0 | v = PyTuple_GET_ITEM(result,0); |
440 | 0 | Py_INCREF(v); |
441 | | /* We don't check or use the second (integer) entry. */ |
442 | |
|
443 | 0 | Py_DECREF(args); |
444 | 0 | Py_DECREF(encoder); |
445 | 0 | Py_DECREF(result); |
446 | 0 | return v; |
447 | | |
448 | 0 | onError: |
449 | 0 | Py_XDECREF(result); |
450 | 0 | Py_XDECREF(args); |
451 | 0 | Py_XDECREF(encoder); |
452 | 0 | return NULL; |
453 | 0 | } |
454 | | |
455 | | /* Decode an object (usually a Python string) using the given encoding |
456 | | and return an equivalent object (e.g. a Unicode object). |
457 | | |
458 | | errors is passed to the decoder factory as argument if non-NULL. */ |
459 | | |
460 | | static PyObject * |
461 | | _PyCodec_DecodeInternal(PyObject *object, |
462 | | PyObject *decoder, |
463 | | const char *encoding, |
464 | | const char *errors) |
465 | 0 | { |
466 | 0 | PyObject *args = NULL, *result = NULL; |
467 | 0 | PyObject *v; |
468 | |
|
469 | 0 | args = args_tuple(object, errors); |
470 | 0 | if (args == NULL) |
471 | 0 | goto onError; |
472 | | |
473 | 0 | result = PyEval_CallObject(decoder,args); |
474 | 0 | if (result == NULL) { |
475 | 0 | wrap_codec_error("decoding", encoding); |
476 | 0 | goto onError; |
477 | 0 | } |
478 | 0 | if (!PyTuple_Check(result) || |
479 | 0 | PyTuple_GET_SIZE(result) != 2) { |
480 | 0 | PyErr_SetString(PyExc_TypeError, |
481 | 0 | "decoder must return a tuple (object,integer)"); |
482 | 0 | goto onError; |
483 | 0 | } |
484 | 0 | v = PyTuple_GET_ITEM(result,0); |
485 | 0 | Py_INCREF(v); |
486 | | /* We don't check or use the second (integer) entry. */ |
487 | |
|
488 | 0 | Py_DECREF(args); |
489 | 0 | Py_DECREF(decoder); |
490 | 0 | Py_DECREF(result); |
491 | 0 | return v; |
492 | | |
493 | 0 | onError: |
494 | 0 | Py_XDECREF(args); |
495 | 0 | Py_XDECREF(decoder); |
496 | 0 | Py_XDECREF(result); |
497 | 0 | return NULL; |
498 | 0 | } |
499 | | |
500 | | /* Generic encoding/decoding API */ |
501 | | PyObject *PyCodec_Encode(PyObject *object, |
502 | | const char *encoding, |
503 | | const char *errors) |
504 | 0 | { |
505 | 0 | PyObject *encoder; |
506 | |
|
507 | 0 | encoder = PyCodec_Encoder(encoding); |
508 | 0 | if (encoder == NULL) |
509 | 0 | return NULL; |
510 | | |
511 | 0 | return _PyCodec_EncodeInternal(object, encoder, encoding, errors); |
512 | 0 | } |
513 | | |
514 | | PyObject *PyCodec_Decode(PyObject *object, |
515 | | const char *encoding, |
516 | | const char *errors) |
517 | 0 | { |
518 | 0 | PyObject *decoder; |
519 | |
|
520 | 0 | decoder = PyCodec_Decoder(encoding); |
521 | 0 | if (decoder == NULL) |
522 | 0 | return NULL; |
523 | | |
524 | 0 | return _PyCodec_DecodeInternal(object, decoder, encoding, errors); |
525 | 0 | } |
526 | | |
527 | | /* Text encoding/decoding API */ |
528 | | PyObject * _PyCodec_LookupTextEncoding(const char *encoding, |
529 | | const char *alternate_command) |
530 | 43 | { |
531 | 43 | _Py_IDENTIFIER(_is_text_encoding); |
532 | 43 | PyObject *codec; |
533 | 43 | PyObject *attr; |
534 | 43 | int is_text_codec; |
535 | | |
536 | 43 | codec = _PyCodec_Lookup(encoding); |
537 | 43 | if (codec == NULL) |
538 | 0 | return NULL; |
539 | | |
540 | | /* Backwards compatibility: assume any raw tuple describes a text |
541 | | * encoding, and the same for anything lacking the private |
542 | | * attribute. |
543 | | */ |
544 | 43 | if (!PyTuple_CheckExact(codec)) { |
545 | 43 | if (_PyObject_LookupAttrId(codec, &PyId__is_text_encoding, &attr) < 0) { |
546 | 0 | Py_DECREF(codec); |
547 | 0 | return NULL; |
548 | 0 | } |
549 | 43 | if (attr != NULL) { |
550 | 43 | is_text_codec = PyObject_IsTrue(attr); |
551 | 43 | Py_DECREF(attr); |
552 | 43 | if (is_text_codec <= 0) { |
553 | 0 | Py_DECREF(codec); |
554 | 0 | if (!is_text_codec) |
555 | 0 | PyErr_Format(PyExc_LookupError, |
556 | 0 | "'%.400s' is not a text encoding; " |
557 | 0 | "use %s to handle arbitrary codecs", |
558 | 0 | encoding, alternate_command); |
559 | 0 | return NULL; |
560 | 0 | } |
561 | 43 | } |
562 | 43 | } |
563 | | |
564 | | /* This appears to be a valid text encoding */ |
565 | 43 | return codec; |
566 | 43 | } |
567 | | |
568 | | |
569 | | static |
570 | | PyObject *codec_getitem_checked(const char *encoding, |
571 | | const char *alternate_command, |
572 | | int index) |
573 | 0 | { |
574 | 0 | PyObject *codec; |
575 | 0 | PyObject *v; |
576 | |
|
577 | 0 | codec = _PyCodec_LookupTextEncoding(encoding, alternate_command); |
578 | 0 | if (codec == NULL) |
579 | 0 | return NULL; |
580 | | |
581 | 0 | v = PyTuple_GET_ITEM(codec, index); |
582 | 0 | Py_INCREF(v); |
583 | 0 | Py_DECREF(codec); |
584 | 0 | return v; |
585 | 0 | } |
586 | | |
587 | | static PyObject * _PyCodec_TextEncoder(const char *encoding) |
588 | 0 | { |
589 | 0 | return codec_getitem_checked(encoding, "codecs.encode()", 0); |
590 | 0 | } |
591 | | |
592 | | static PyObject * _PyCodec_TextDecoder(const char *encoding) |
593 | 0 | { |
594 | 0 | return codec_getitem_checked(encoding, "codecs.decode()", 1); |
595 | 0 | } |
596 | | |
597 | | PyObject *_PyCodec_EncodeText(PyObject *object, |
598 | | const char *encoding, |
599 | | const char *errors) |
600 | 0 | { |
601 | 0 | PyObject *encoder; |
602 | |
|
603 | 0 | encoder = _PyCodec_TextEncoder(encoding); |
604 | 0 | if (encoder == NULL) |
605 | 0 | return NULL; |
606 | | |
607 | 0 | return _PyCodec_EncodeInternal(object, encoder, encoding, errors); |
608 | 0 | } |
609 | | |
610 | | PyObject *_PyCodec_DecodeText(PyObject *object, |
611 | | const char *encoding, |
612 | | const char *errors) |
613 | 0 | { |
614 | 0 | PyObject *decoder; |
615 | |
|
616 | 0 | decoder = _PyCodec_TextDecoder(encoding); |
617 | 0 | if (decoder == NULL) |
618 | 0 | return NULL; |
619 | | |
620 | 0 | return _PyCodec_DecodeInternal(object, decoder, encoding, errors); |
621 | 0 | } |
622 | | |
623 | | /* Register the error handling callback function error under the name |
624 | | name. This function will be called by the codec when it encounters |
625 | | an unencodable characters/undecodable bytes and doesn't know the |
626 | | callback name, when name is specified as the error parameter |
627 | | in the call to the encode/decode function. |
628 | | Return 0 on success, -1 on error */ |
629 | | int PyCodec_RegisterError(const char *name, PyObject *error) |
630 | 112 | { |
631 | 112 | PyInterpreterState *interp = _PyInterpreterState_Get(); |
632 | 112 | if (interp->codec_search_path == NULL && _PyCodecRegistry_Init()) |
633 | 0 | return -1; |
634 | 112 | if (!PyCallable_Check(error)) { |
635 | 0 | PyErr_SetString(PyExc_TypeError, "handler must be callable"); |
636 | 0 | return -1; |
637 | 0 | } |
638 | 112 | return PyDict_SetItemString(interp->codec_error_registry, |
639 | 112 | name, error); |
640 | 112 | } |
641 | | |
642 | | /* Lookup the error handling callback function registered under the |
643 | | name error. As a special case NULL can be passed, in which case |
644 | | the error handling callback for strict encoding will be returned. */ |
645 | | PyObject *PyCodec_LookupError(const char *name) |
646 | 84 | { |
647 | 84 | PyObject *handler = NULL; |
648 | | |
649 | 84 | PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE(); |
650 | 84 | if (interp->codec_search_path == NULL && _PyCodecRegistry_Init()) |
651 | 0 | return NULL; |
652 | | |
653 | 84 | if (name==NULL) |
654 | 0 | name = "strict"; |
655 | 84 | handler = _PyDict_GetItemStringWithError(interp->codec_error_registry, name); |
656 | 84 | if (handler) { |
657 | 84 | Py_INCREF(handler); |
658 | 84 | } |
659 | 0 | else if (!PyErr_Occurred()) { |
660 | 0 | PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name); |
661 | 0 | } |
662 | 84 | return handler; |
663 | 84 | } |
664 | | |
665 | | static void wrong_exception_type(PyObject *exc) |
666 | 0 | { |
667 | 0 | PyErr_Format(PyExc_TypeError, |
668 | 0 | "don't know how to handle %.200s in error callback", |
669 | 0 | exc->ob_type->tp_name); |
670 | 0 | } |
671 | | |
672 | | PyObject *PyCodec_StrictErrors(PyObject *exc) |
673 | 0 | { |
674 | 0 | if (PyExceptionInstance_Check(exc)) |
675 | 0 | PyErr_SetObject(PyExceptionInstance_Class(exc), exc); |
676 | 0 | else |
677 | 0 | PyErr_SetString(PyExc_TypeError, "codec must pass exception instance"); |
678 | 0 | return NULL; |
679 | 0 | } |
680 | | |
681 | | |
682 | | PyObject *PyCodec_IgnoreErrors(PyObject *exc) |
683 | 0 | { |
684 | 0 | Py_ssize_t end; |
685 | |
|
686 | 0 | if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) { |
687 | 0 | if (PyUnicodeEncodeError_GetEnd(exc, &end)) |
688 | 0 | return NULL; |
689 | 0 | } |
690 | 0 | else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) { |
691 | 0 | if (PyUnicodeDecodeError_GetEnd(exc, &end)) |
692 | 0 | return NULL; |
693 | 0 | } |
694 | 0 | else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) { |
695 | 0 | if (PyUnicodeTranslateError_GetEnd(exc, &end)) |
696 | 0 | return NULL; |
697 | 0 | } |
698 | 0 | else { |
699 | 0 | wrong_exception_type(exc); |
700 | 0 | return NULL; |
701 | 0 | } |
702 | 0 | return Py_BuildValue("(Nn)", PyUnicode_New(0, 0), end); |
703 | 0 | } |
704 | | |
705 | | |
706 | | PyObject *PyCodec_ReplaceErrors(PyObject *exc) |
707 | 0 | { |
708 | 0 | Py_ssize_t start, end, i, len; |
709 | |
|
710 | 0 | if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) { |
711 | 0 | PyObject *res; |
712 | 0 | int kind; |
713 | 0 | void *data; |
714 | 0 | if (PyUnicodeEncodeError_GetStart(exc, &start)) |
715 | 0 | return NULL; |
716 | 0 | if (PyUnicodeEncodeError_GetEnd(exc, &end)) |
717 | 0 | return NULL; |
718 | 0 | len = end - start; |
719 | 0 | res = PyUnicode_New(len, '?'); |
720 | 0 | if (res == NULL) |
721 | 0 | return NULL; |
722 | 0 | kind = PyUnicode_KIND(res); |
723 | 0 | data = PyUnicode_DATA(res); |
724 | 0 | for (i = 0; i < len; ++i) |
725 | 0 | PyUnicode_WRITE(kind, data, i, '?'); |
726 | 0 | assert(_PyUnicode_CheckConsistency(res, 1)); |
727 | 0 | return Py_BuildValue("(Nn)", res, end); |
728 | 0 | } |
729 | 0 | else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) { |
730 | 0 | if (PyUnicodeDecodeError_GetEnd(exc, &end)) |
731 | 0 | return NULL; |
732 | 0 | return Py_BuildValue("(Cn)", |
733 | 0 | (int)Py_UNICODE_REPLACEMENT_CHARACTER, |
734 | 0 | end); |
735 | 0 | } |
736 | 0 | else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) { |
737 | 0 | PyObject *res; |
738 | 0 | int kind; |
739 | 0 | void *data; |
740 | 0 | if (PyUnicodeTranslateError_GetStart(exc, &start)) |
741 | 0 | return NULL; |
742 | 0 | if (PyUnicodeTranslateError_GetEnd(exc, &end)) |
743 | 0 | return NULL; |
744 | 0 | len = end - start; |
745 | 0 | res = PyUnicode_New(len, Py_UNICODE_REPLACEMENT_CHARACTER); |
746 | 0 | if (res == NULL) |
747 | 0 | return NULL; |
748 | 0 | kind = PyUnicode_KIND(res); |
749 | 0 | data = PyUnicode_DATA(res); |
750 | 0 | for (i=0; i < len; i++) |
751 | 0 | PyUnicode_WRITE(kind, data, i, Py_UNICODE_REPLACEMENT_CHARACTER); |
752 | 0 | assert(_PyUnicode_CheckConsistency(res, 1)); |
753 | 0 | return Py_BuildValue("(Nn)", res, end); |
754 | 0 | } |
755 | 0 | else { |
756 | 0 | wrong_exception_type(exc); |
757 | 0 | return NULL; |
758 | 0 | } |
759 | 0 | } |
760 | | |
761 | | PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc) |
762 | 0 | { |
763 | 0 | if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) { |
764 | 0 | PyObject *restuple; |
765 | 0 | PyObject *object; |
766 | 0 | Py_ssize_t i; |
767 | 0 | Py_ssize_t start; |
768 | 0 | Py_ssize_t end; |
769 | 0 | PyObject *res; |
770 | 0 | unsigned char *outp; |
771 | 0 | Py_ssize_t ressize; |
772 | 0 | Py_UCS4 ch; |
773 | 0 | if (PyUnicodeEncodeError_GetStart(exc, &start)) |
774 | 0 | return NULL; |
775 | 0 | if (PyUnicodeEncodeError_GetEnd(exc, &end)) |
776 | 0 | return NULL; |
777 | 0 | if (!(object = PyUnicodeEncodeError_GetObject(exc))) |
778 | 0 | return NULL; |
779 | 0 | if (end - start > PY_SSIZE_T_MAX / (2+7+1)) |
780 | 0 | end = start + PY_SSIZE_T_MAX / (2+7+1); |
781 | 0 | for (i = start, ressize = 0; i < end; ++i) { |
782 | | /* object is guaranteed to be "ready" */ |
783 | 0 | ch = PyUnicode_READ_CHAR(object, i); |
784 | 0 | if (ch<10) |
785 | 0 | ressize += 2+1+1; |
786 | 0 | else if (ch<100) |
787 | 0 | ressize += 2+2+1; |
788 | 0 | else if (ch<1000) |
789 | 0 | ressize += 2+3+1; |
790 | 0 | else if (ch<10000) |
791 | 0 | ressize += 2+4+1; |
792 | 0 | else if (ch<100000) |
793 | 0 | ressize += 2+5+1; |
794 | 0 | else if (ch<1000000) |
795 | 0 | ressize += 2+6+1; |
796 | 0 | else |
797 | 0 | ressize += 2+7+1; |
798 | 0 | } |
799 | | /* allocate replacement */ |
800 | 0 | res = PyUnicode_New(ressize, 127); |
801 | 0 | if (res == NULL) { |
802 | 0 | Py_DECREF(object); |
803 | 0 | return NULL; |
804 | 0 | } |
805 | 0 | outp = PyUnicode_1BYTE_DATA(res); |
806 | | /* generate replacement */ |
807 | 0 | for (i = start; i < end; ++i) { |
808 | 0 | int digits; |
809 | 0 | int base; |
810 | 0 | ch = PyUnicode_READ_CHAR(object, i); |
811 | 0 | *outp++ = '&'; |
812 | 0 | *outp++ = '#'; |
813 | 0 | if (ch<10) { |
814 | 0 | digits = 1; |
815 | 0 | base = 1; |
816 | 0 | } |
817 | 0 | else if (ch<100) { |
818 | 0 | digits = 2; |
819 | 0 | base = 10; |
820 | 0 | } |
821 | 0 | else if (ch<1000) { |
822 | 0 | digits = 3; |
823 | 0 | base = 100; |
824 | 0 | } |
825 | 0 | else if (ch<10000) { |
826 | 0 | digits = 4; |
827 | 0 | base = 1000; |
828 | 0 | } |
829 | 0 | else if (ch<100000) { |
830 | 0 | digits = 5; |
831 | 0 | base = 10000; |
832 | 0 | } |
833 | 0 | else if (ch<1000000) { |
834 | 0 | digits = 6; |
835 | 0 | base = 100000; |
836 | 0 | } |
837 | 0 | else { |
838 | 0 | digits = 7; |
839 | 0 | base = 1000000; |
840 | 0 | } |
841 | 0 | while (digits-->0) { |
842 | 0 | *outp++ = '0' + ch/base; |
843 | 0 | ch %= base; |
844 | 0 | base /= 10; |
845 | 0 | } |
846 | 0 | *outp++ = ';'; |
847 | 0 | } |
848 | 0 | assert(_PyUnicode_CheckConsistency(res, 1)); |
849 | 0 | restuple = Py_BuildValue("(Nn)", res, end); |
850 | 0 | Py_DECREF(object); |
851 | 0 | return restuple; |
852 | 0 | } |
853 | 0 | else { |
854 | 0 | wrong_exception_type(exc); |
855 | 0 | return NULL; |
856 | 0 | } |
857 | 0 | } |
858 | | |
859 | | PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) |
860 | 0 | { |
861 | 0 | PyObject *object; |
862 | 0 | Py_ssize_t i; |
863 | 0 | Py_ssize_t start; |
864 | 0 | Py_ssize_t end; |
865 | 0 | PyObject *res; |
866 | 0 | unsigned char *outp; |
867 | 0 | int ressize; |
868 | 0 | Py_UCS4 c; |
869 | |
|
870 | 0 | if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) { |
871 | 0 | const unsigned char *p; |
872 | 0 | if (PyUnicodeDecodeError_GetStart(exc, &start)) |
873 | 0 | return NULL; |
874 | 0 | if (PyUnicodeDecodeError_GetEnd(exc, &end)) |
875 | 0 | return NULL; |
876 | 0 | if (!(object = PyUnicodeDecodeError_GetObject(exc))) |
877 | 0 | return NULL; |
878 | 0 | p = (const unsigned char*)PyBytes_AS_STRING(object); |
879 | 0 | res = PyUnicode_New(4 * (end - start), 127); |
880 | 0 | if (res == NULL) { |
881 | 0 | Py_DECREF(object); |
882 | 0 | return NULL; |
883 | 0 | } |
884 | 0 | outp = PyUnicode_1BYTE_DATA(res); |
885 | 0 | for (i = start; i < end; i++, outp += 4) { |
886 | 0 | unsigned char c = p[i]; |
887 | 0 | outp[0] = '\\'; |
888 | 0 | outp[1] = 'x'; |
889 | 0 | outp[2] = Py_hexdigits[(c>>4)&0xf]; |
890 | 0 | outp[3] = Py_hexdigits[c&0xf]; |
891 | 0 | } |
892 | |
|
893 | 0 | assert(_PyUnicode_CheckConsistency(res, 1)); |
894 | 0 | Py_DECREF(object); |
895 | 0 | return Py_BuildValue("(Nn)", res, end); |
896 | 0 | } |
897 | 0 | if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) { |
898 | 0 | if (PyUnicodeEncodeError_GetStart(exc, &start)) |
899 | 0 | return NULL; |
900 | 0 | if (PyUnicodeEncodeError_GetEnd(exc, &end)) |
901 | 0 | return NULL; |
902 | 0 | if (!(object = PyUnicodeEncodeError_GetObject(exc))) |
903 | 0 | return NULL; |
904 | 0 | } |
905 | 0 | else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) { |
906 | 0 | if (PyUnicodeTranslateError_GetStart(exc, &start)) |
907 | 0 | return NULL; |
908 | 0 | if (PyUnicodeTranslateError_GetEnd(exc, &end)) |
909 | 0 | return NULL; |
910 | 0 | if (!(object = PyUnicodeTranslateError_GetObject(exc))) |
911 | 0 | return NULL; |
912 | 0 | } |
913 | 0 | else { |
914 | 0 | wrong_exception_type(exc); |
915 | 0 | return NULL; |
916 | 0 | } |
917 | | |
918 | 0 | if (end - start > PY_SSIZE_T_MAX / (1+1+8)) |
919 | 0 | end = start + PY_SSIZE_T_MAX / (1+1+8); |
920 | 0 | for (i = start, ressize = 0; i < end; ++i) { |
921 | | /* object is guaranteed to be "ready" */ |
922 | 0 | c = PyUnicode_READ_CHAR(object, i); |
923 | 0 | if (c >= 0x10000) { |
924 | 0 | ressize += 1+1+8; |
925 | 0 | } |
926 | 0 | else if (c >= 0x100) { |
927 | 0 | ressize += 1+1+4; |
928 | 0 | } |
929 | 0 | else |
930 | 0 | ressize += 1+1+2; |
931 | 0 | } |
932 | 0 | res = PyUnicode_New(ressize, 127); |
933 | 0 | if (res == NULL) { |
934 | 0 | Py_DECREF(object); |
935 | 0 | return NULL; |
936 | 0 | } |
937 | 0 | outp = PyUnicode_1BYTE_DATA(res); |
938 | 0 | for (i = start; i < end; ++i) { |
939 | 0 | c = PyUnicode_READ_CHAR(object, i); |
940 | 0 | *outp++ = '\\'; |
941 | 0 | if (c >= 0x00010000) { |
942 | 0 | *outp++ = 'U'; |
943 | 0 | *outp++ = Py_hexdigits[(c>>28)&0xf]; |
944 | 0 | *outp++ = Py_hexdigits[(c>>24)&0xf]; |
945 | 0 | *outp++ = Py_hexdigits[(c>>20)&0xf]; |
946 | 0 | *outp++ = Py_hexdigits[(c>>16)&0xf]; |
947 | 0 | *outp++ = Py_hexdigits[(c>>12)&0xf]; |
948 | 0 | *outp++ = Py_hexdigits[(c>>8)&0xf]; |
949 | 0 | } |
950 | 0 | else if (c >= 0x100) { |
951 | 0 | *outp++ = 'u'; |
952 | 0 | *outp++ = Py_hexdigits[(c>>12)&0xf]; |
953 | 0 | *outp++ = Py_hexdigits[(c>>8)&0xf]; |
954 | 0 | } |
955 | 0 | else |
956 | 0 | *outp++ = 'x'; |
957 | 0 | *outp++ = Py_hexdigits[(c>>4)&0xf]; |
958 | 0 | *outp++ = Py_hexdigits[c&0xf]; |
959 | 0 | } |
960 | |
|
961 | 0 | assert(_PyUnicode_CheckConsistency(res, 1)); |
962 | 0 | Py_DECREF(object); |
963 | 0 | return Py_BuildValue("(Nn)", res, end); |
964 | 0 | } |
965 | | |
966 | | static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; |
967 | | |
968 | | PyObject *PyCodec_NameReplaceErrors(PyObject *exc) |
969 | 0 | { |
970 | 0 | if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) { |
971 | 0 | PyObject *restuple; |
972 | 0 | PyObject *object; |
973 | 0 | Py_ssize_t i; |
974 | 0 | Py_ssize_t start; |
975 | 0 | Py_ssize_t end; |
976 | 0 | PyObject *res; |
977 | 0 | unsigned char *outp; |
978 | 0 | Py_ssize_t ressize; |
979 | 0 | int replsize; |
980 | 0 | Py_UCS4 c; |
981 | 0 | char buffer[256]; /* NAME_MAXLEN */ |
982 | 0 | if (PyUnicodeEncodeError_GetStart(exc, &start)) |
983 | 0 | return NULL; |
984 | 0 | if (PyUnicodeEncodeError_GetEnd(exc, &end)) |
985 | 0 | return NULL; |
986 | 0 | if (!(object = PyUnicodeEncodeError_GetObject(exc))) |
987 | 0 | return NULL; |
988 | 0 | if (!ucnhash_CAPI) { |
989 | | /* load the unicode data module */ |
990 | 0 | ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import( |
991 | 0 | PyUnicodeData_CAPSULE_NAME, 1); |
992 | 0 | if (!ucnhash_CAPI) |
993 | 0 | return NULL; |
994 | 0 | } |
995 | 0 | for (i = start, ressize = 0; i < end; ++i) { |
996 | | /* object is guaranteed to be "ready" */ |
997 | 0 | c = PyUnicode_READ_CHAR(object, i); |
998 | 0 | if (ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) { |
999 | 0 | replsize = 1+1+1+(int)strlen(buffer)+1; |
1000 | 0 | } |
1001 | 0 | else if (c >= 0x10000) { |
1002 | 0 | replsize = 1+1+8; |
1003 | 0 | } |
1004 | 0 | else if (c >= 0x100) { |
1005 | 0 | replsize = 1+1+4; |
1006 | 0 | } |
1007 | 0 | else |
1008 | 0 | replsize = 1+1+2; |
1009 | 0 | if (ressize > PY_SSIZE_T_MAX - replsize) |
1010 | 0 | break; |
1011 | 0 | ressize += replsize; |
1012 | 0 | } |
1013 | 0 | end = i; |
1014 | 0 | res = PyUnicode_New(ressize, 127); |
1015 | 0 | if (res==NULL) |
1016 | 0 | return NULL; |
1017 | 0 | for (i = start, outp = PyUnicode_1BYTE_DATA(res); |
1018 | 0 | i < end; ++i) { |
1019 | 0 | c = PyUnicode_READ_CHAR(object, i); |
1020 | 0 | *outp++ = '\\'; |
1021 | 0 | if (ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) { |
1022 | 0 | *outp++ = 'N'; |
1023 | 0 | *outp++ = '{'; |
1024 | 0 | strcpy((char *)outp, buffer); |
1025 | 0 | outp += strlen(buffer); |
1026 | 0 | *outp++ = '}'; |
1027 | 0 | continue; |
1028 | 0 | } |
1029 | 0 | if (c >= 0x00010000) { |
1030 | 0 | *outp++ = 'U'; |
1031 | 0 | *outp++ = Py_hexdigits[(c>>28)&0xf]; |
1032 | 0 | *outp++ = Py_hexdigits[(c>>24)&0xf]; |
1033 | 0 | *outp++ = Py_hexdigits[(c>>20)&0xf]; |
1034 | 0 | *outp++ = Py_hexdigits[(c>>16)&0xf]; |
1035 | 0 | *outp++ = Py_hexdigits[(c>>12)&0xf]; |
1036 | 0 | *outp++ = Py_hexdigits[(c>>8)&0xf]; |
1037 | 0 | } |
1038 | 0 | else if (c >= 0x100) { |
1039 | 0 | *outp++ = 'u'; |
1040 | 0 | *outp++ = Py_hexdigits[(c>>12)&0xf]; |
1041 | 0 | *outp++ = Py_hexdigits[(c>>8)&0xf]; |
1042 | 0 | } |
1043 | 0 | else |
1044 | 0 | *outp++ = 'x'; |
1045 | 0 | *outp++ = Py_hexdigits[(c>>4)&0xf]; |
1046 | 0 | *outp++ = Py_hexdigits[c&0xf]; |
1047 | 0 | } |
1048 | |
|
1049 | 0 | assert(outp == PyUnicode_1BYTE_DATA(res) + ressize); |
1050 | 0 | assert(_PyUnicode_CheckConsistency(res, 1)); |
1051 | 0 | restuple = Py_BuildValue("(Nn)", res, end); |
1052 | 0 | Py_DECREF(object); |
1053 | 0 | return restuple; |
1054 | 0 | } |
1055 | 0 | else { |
1056 | 0 | wrong_exception_type(exc); |
1057 | 0 | return NULL; |
1058 | 0 | } |
1059 | 0 | } |
1060 | | |
/* Encoding identifiers returned by get_standard_encoding(). */
#define ENC_UNKNOWN     -1
#define ENC_UTF8        0
#define ENC_UTF16BE     1
#define ENC_UTF16LE     2
#define ENC_UTF32BE     3
#define ENC_UTF32LE     4

/* Classify an encoding name as one of the UTF encodings handled by the
   surrogatepass error handler.

   Recognised spellings: "utf" (any letter case), an optional '-' or '_',
   then "8", "16" or "32"; for 16/32 an optional '-' or '_' followed by
   "be" or "le" (any letter case).  A bare "utf16"/"utf32" selects the
   native byte order according to WORDS_BIGENDIAN.  The exact string
   "CP_UTF8" is also accepted.

   On a match, *bytelength receives the number of bytes used for one
   surrogate (3, 2 or 4) and the matching ENC_* constant is returned.
   Otherwise ENC_UNKNOWN is returned; *bytelength may already have been
   written in that case and must not be relied upon.

   `encoding` must be a NUL-terminated string; no byte past the terminator
   is read. */
static int
get_standard_encoding(const char *encoding, int *bytelength)
{
    if (Py_TOLOWER(encoding[0]) == 'u' &&
        Py_TOLOWER(encoding[1]) == 't' &&
        Py_TOLOWER(encoding[2]) == 'f') {
        encoding += 3;
        if (*encoding == '-' || *encoding == '_' )
            encoding++;
        if (encoding[0] == '8' && encoding[1] == '\0') {
            *bytelength = 3;
            return ENC_UTF8;
        }
        else if (encoding[0] == '1' && encoding[1] == '6') {
            encoding += 2;
            *bytelength = 2;
            if (*encoding == '\0') {
#ifdef WORDS_BIGENDIAN
                return ENC_UTF16BE;
#else
                return ENC_UTF16LE;
#endif
            }
            if (*encoding == '-' || *encoding == '_' )
                encoding++;
            /* Check encoding[0] first: after skipping a separator it may be
               the terminator, in which case encoding[1] is out of bounds. */
            if (encoding[0] != '\0' &&
                Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
                if (Py_TOLOWER(encoding[0]) == 'b')
                    return ENC_UTF16BE;
                if (Py_TOLOWER(encoding[0]) == 'l')
                    return ENC_UTF16LE;
            }
        }
        else if (encoding[0] == '3' && encoding[1] == '2') {
            encoding += 2;
            *bytelength = 4;
            if (*encoding == '\0') {
#ifdef WORDS_BIGENDIAN
                return ENC_UTF32BE;
#else
                return ENC_UTF32LE;
#endif
            }
            if (*encoding == '-' || *encoding == '_' )
                encoding++;
            /* Same guard as for UTF-16 above. */
            if (encoding[0] != '\0' &&
                Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
                if (Py_TOLOWER(encoding[0]) == 'b')
                    return ENC_UTF32BE;
                if (Py_TOLOWER(encoding[0]) == 'l')
                    return ENC_UTF32LE;
            }
        }
    }
    else if (strcmp(encoding, "CP_UTF8") == 0) {
        *bytelength = 3;
        return ENC_UTF8;
    }
    return ENC_UNKNOWN;
}
1126 | | |
1127 | | /* This handler is declared static until someone demonstrates |
1128 | | a need to call it directly. */ |
1129 | | static PyObject * |
1130 | | PyCodec_SurrogatePassErrors(PyObject *exc) |
1131 | 0 | { |
1132 | 0 | PyObject *restuple; |
1133 | 0 | PyObject *object; |
1134 | 0 | PyObject *encode; |
1135 | 0 | const char *encoding; |
1136 | 0 | int code; |
1137 | 0 | int bytelength; |
1138 | 0 | Py_ssize_t i; |
1139 | 0 | Py_ssize_t start; |
1140 | 0 | Py_ssize_t end; |
1141 | 0 | PyObject *res; |
1142 | |
|
1143 | 0 | if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) { |
1144 | 0 | unsigned char *outp; |
1145 | 0 | if (PyUnicodeEncodeError_GetStart(exc, &start)) |
1146 | 0 | return NULL; |
1147 | 0 | if (PyUnicodeEncodeError_GetEnd(exc, &end)) |
1148 | 0 | return NULL; |
1149 | 0 | if (!(object = PyUnicodeEncodeError_GetObject(exc))) |
1150 | 0 | return NULL; |
1151 | 0 | if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) { |
1152 | 0 | Py_DECREF(object); |
1153 | 0 | return NULL; |
1154 | 0 | } |
1155 | 0 | if (!(encoding = PyUnicode_AsUTF8(encode))) { |
1156 | 0 | Py_DECREF(object); |
1157 | 0 | Py_DECREF(encode); |
1158 | 0 | return NULL; |
1159 | 0 | } |
1160 | 0 | code = get_standard_encoding(encoding, &bytelength); |
1161 | 0 | Py_DECREF(encode); |
1162 | 0 | if (code == ENC_UNKNOWN) { |
1163 | | /* Not supported, fail with original exception */ |
1164 | 0 | PyErr_SetObject(PyExceptionInstance_Class(exc), exc); |
1165 | 0 | Py_DECREF(object); |
1166 | 0 | return NULL; |
1167 | 0 | } |
1168 | | |
1169 | 0 | if (end - start > PY_SSIZE_T_MAX / bytelength) |
1170 | 0 | end = start + PY_SSIZE_T_MAX / bytelength; |
1171 | 0 | res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start)); |
1172 | 0 | if (!res) { |
1173 | 0 | Py_DECREF(object); |
1174 | 0 | return NULL; |
1175 | 0 | } |
1176 | 0 | outp = (unsigned char*)PyBytes_AsString(res); |
1177 | 0 | for (i = start; i < end; i++) { |
1178 | | /* object is guaranteed to be "ready" */ |
1179 | 0 | Py_UCS4 ch = PyUnicode_READ_CHAR(object, i); |
1180 | 0 | if (!Py_UNICODE_IS_SURROGATE(ch)) { |
1181 | | /* Not a surrogate, fail with original exception */ |
1182 | 0 | PyErr_SetObject(PyExceptionInstance_Class(exc), exc); |
1183 | 0 | Py_DECREF(res); |
1184 | 0 | Py_DECREF(object); |
1185 | 0 | return NULL; |
1186 | 0 | } |
1187 | 0 | switch (code) { |
1188 | 0 | case ENC_UTF8: |
1189 | 0 | *outp++ = (unsigned char)(0xe0 | (ch >> 12)); |
1190 | 0 | *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f)); |
1191 | 0 | *outp++ = (unsigned char)(0x80 | (ch & 0x3f)); |
1192 | 0 | break; |
1193 | 0 | case ENC_UTF16LE: |
1194 | 0 | *outp++ = (unsigned char) ch; |
1195 | 0 | *outp++ = (unsigned char)(ch >> 8); |
1196 | 0 | break; |
1197 | 0 | case ENC_UTF16BE: |
1198 | 0 | *outp++ = (unsigned char)(ch >> 8); |
1199 | 0 | *outp++ = (unsigned char) ch; |
1200 | 0 | break; |
1201 | 0 | case ENC_UTF32LE: |
1202 | 0 | *outp++ = (unsigned char) ch; |
1203 | 0 | *outp++ = (unsigned char)(ch >> 8); |
1204 | 0 | *outp++ = (unsigned char)(ch >> 16); |
1205 | 0 | *outp++ = (unsigned char)(ch >> 24); |
1206 | 0 | break; |
1207 | 0 | case ENC_UTF32BE: |
1208 | 0 | *outp++ = (unsigned char)(ch >> 24); |
1209 | 0 | *outp++ = (unsigned char)(ch >> 16); |
1210 | 0 | *outp++ = (unsigned char)(ch >> 8); |
1211 | 0 | *outp++ = (unsigned char) ch; |
1212 | 0 | break; |
1213 | 0 | } |
1214 | 0 | } |
1215 | 0 | restuple = Py_BuildValue("(On)", res, end); |
1216 | 0 | Py_DECREF(res); |
1217 | 0 | Py_DECREF(object); |
1218 | 0 | return restuple; |
1219 | 0 | } |
1220 | 0 | else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) { |
1221 | 0 | const unsigned char *p; |
1222 | 0 | Py_UCS4 ch = 0; |
1223 | 0 | if (PyUnicodeDecodeError_GetStart(exc, &start)) |
1224 | 0 | return NULL; |
1225 | 0 | if (PyUnicodeDecodeError_GetEnd(exc, &end)) |
1226 | 0 | return NULL; |
1227 | 0 | if (!(object = PyUnicodeDecodeError_GetObject(exc))) |
1228 | 0 | return NULL; |
1229 | 0 | p = (const unsigned char*)PyBytes_AS_STRING(object); |
1230 | 0 | if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) { |
1231 | 0 | Py_DECREF(object); |
1232 | 0 | return NULL; |
1233 | 0 | } |
1234 | 0 | if (!(encoding = PyUnicode_AsUTF8(encode))) { |
1235 | 0 | Py_DECREF(object); |
1236 | 0 | Py_DECREF(encode); |
1237 | 0 | return NULL; |
1238 | 0 | } |
1239 | 0 | code = get_standard_encoding(encoding, &bytelength); |
1240 | 0 | Py_DECREF(encode); |
1241 | 0 | if (code == ENC_UNKNOWN) { |
1242 | | /* Not supported, fail with original exception */ |
1243 | 0 | PyErr_SetObject(PyExceptionInstance_Class(exc), exc); |
1244 | 0 | Py_DECREF(object); |
1245 | 0 | return NULL; |
1246 | 0 | } |
1247 | | |
1248 | | /* Try decoding a single surrogate character. If |
1249 | | there are more, let the codec call us again. */ |
1250 | 0 | p += start; |
1251 | 0 | if (PyBytes_GET_SIZE(object) - start >= bytelength) { |
1252 | 0 | switch (code) { |
1253 | 0 | case ENC_UTF8: |
1254 | 0 | if ((p[0] & 0xf0) == 0xe0 && |
1255 | 0 | (p[1] & 0xc0) == 0x80 && |
1256 | 0 | (p[2] & 0xc0) == 0x80) { |
1257 | | /* it's a three-byte code */ |
1258 | 0 | ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f); |
1259 | 0 | } |
1260 | 0 | break; |
1261 | 0 | case ENC_UTF16LE: |
1262 | 0 | ch = p[1] << 8 | p[0]; |
1263 | 0 | break; |
1264 | 0 | case ENC_UTF16BE: |
1265 | 0 | ch = p[0] << 8 | p[1]; |
1266 | 0 | break; |
1267 | 0 | case ENC_UTF32LE: |
1268 | 0 | ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0]; |
1269 | 0 | break; |
1270 | 0 | case ENC_UTF32BE: |
1271 | 0 | ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3]; |
1272 | 0 | break; |
1273 | 0 | } |
1274 | 0 | } |
1275 | | |
1276 | 0 | Py_DECREF(object); |
1277 | 0 | if (!Py_UNICODE_IS_SURROGATE(ch)) { |
1278 | | /* it's not a surrogate - fail */ |
1279 | 0 | PyErr_SetObject(PyExceptionInstance_Class(exc), exc); |
1280 | 0 | return NULL; |
1281 | 0 | } |
1282 | 0 | res = PyUnicode_FromOrdinal(ch); |
1283 | 0 | if (res == NULL) |
1284 | 0 | return NULL; |
1285 | 0 | return Py_BuildValue("(Nn)", res, start + bytelength); |
1286 | 0 | } |
1287 | 0 | else { |
1288 | 0 | wrong_exception_type(exc); |
1289 | 0 | return NULL; |
1290 | 0 | } |
1291 | 0 | } |
1292 | | |
1293 | | static PyObject * |
1294 | | PyCodec_SurrogateEscapeErrors(PyObject *exc) |
1295 | 0 | { |
1296 | 0 | PyObject *restuple; |
1297 | 0 | PyObject *object; |
1298 | 0 | Py_ssize_t i; |
1299 | 0 | Py_ssize_t start; |
1300 | 0 | Py_ssize_t end; |
1301 | 0 | PyObject *res; |
1302 | |
|
1303 | 0 | if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) { |
1304 | 0 | char *outp; |
1305 | 0 | if (PyUnicodeEncodeError_GetStart(exc, &start)) |
1306 | 0 | return NULL; |
1307 | 0 | if (PyUnicodeEncodeError_GetEnd(exc, &end)) |
1308 | 0 | return NULL; |
1309 | 0 | if (!(object = PyUnicodeEncodeError_GetObject(exc))) |
1310 | 0 | return NULL; |
1311 | 0 | res = PyBytes_FromStringAndSize(NULL, end-start); |
1312 | 0 | if (!res) { |
1313 | 0 | Py_DECREF(object); |
1314 | 0 | return NULL; |
1315 | 0 | } |
1316 | 0 | outp = PyBytes_AsString(res); |
1317 | 0 | for (i = start; i < end; i++) { |
1318 | | /* object is guaranteed to be "ready" */ |
1319 | 0 | Py_UCS4 ch = PyUnicode_READ_CHAR(object, i); |
1320 | 0 | if (ch < 0xdc80 || ch > 0xdcff) { |
1321 | | /* Not a UTF-8b surrogate, fail with original exception */ |
1322 | 0 | PyErr_SetObject(PyExceptionInstance_Class(exc), exc); |
1323 | 0 | Py_DECREF(res); |
1324 | 0 | Py_DECREF(object); |
1325 | 0 | return NULL; |
1326 | 0 | } |
1327 | 0 | *outp++ = ch - 0xdc00; |
1328 | 0 | } |
1329 | 0 | restuple = Py_BuildValue("(On)", res, end); |
1330 | 0 | Py_DECREF(res); |
1331 | 0 | Py_DECREF(object); |
1332 | 0 | return restuple; |
1333 | 0 | } |
1334 | 0 | else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) { |
1335 | 0 | PyObject *str; |
1336 | 0 | const unsigned char *p; |
1337 | 0 | Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */ |
1338 | 0 | int consumed = 0; |
1339 | 0 | if (PyUnicodeDecodeError_GetStart(exc, &start)) |
1340 | 0 | return NULL; |
1341 | 0 | if (PyUnicodeDecodeError_GetEnd(exc, &end)) |
1342 | 0 | return NULL; |
1343 | 0 | if (!(object = PyUnicodeDecodeError_GetObject(exc))) |
1344 | 0 | return NULL; |
1345 | 0 | p = (const unsigned char*)PyBytes_AS_STRING(object); |
1346 | 0 | while (consumed < 4 && consumed < end-start) { |
1347 | | /* Refuse to escape ASCII bytes. */ |
1348 | 0 | if (p[start+consumed] < 128) |
1349 | 0 | break; |
1350 | 0 | ch[consumed] = 0xdc00 + p[start+consumed]; |
1351 | 0 | consumed++; |
1352 | 0 | } |
1353 | 0 | Py_DECREF(object); |
1354 | 0 | if (!consumed) { |
1355 | | /* codec complained about ASCII byte. */ |
1356 | 0 | PyErr_SetObject(PyExceptionInstance_Class(exc), exc); |
1357 | 0 | return NULL; |
1358 | 0 | } |
1359 | 0 | str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed); |
1360 | 0 | if (str == NULL) |
1361 | 0 | return NULL; |
1362 | 0 | return Py_BuildValue("(Nn)", str, start+consumed); |
1363 | 0 | } |
1364 | 0 | else { |
1365 | 0 | wrong_exception_type(exc); |
1366 | 0 | return NULL; |
1367 | 0 | } |
1368 | 0 | } |
1369 | | |
1370 | | |
1371 | | static PyObject *strict_errors(PyObject *self, PyObject *exc) |
1372 | 0 | { |
1373 | 0 | return PyCodec_StrictErrors(exc); |
1374 | 0 | } |
1375 | | |
1376 | | |
1377 | | static PyObject *ignore_errors(PyObject *self, PyObject *exc) |
1378 | 0 | { |
1379 | 0 | return PyCodec_IgnoreErrors(exc); |
1380 | 0 | } |
1381 | | |
1382 | | |
1383 | | static PyObject *replace_errors(PyObject *self, PyObject *exc) |
1384 | 0 | { |
1385 | 0 | return PyCodec_ReplaceErrors(exc); |
1386 | 0 | } |
1387 | | |
1388 | | |
1389 | | static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc) |
1390 | 0 | { |
1391 | 0 | return PyCodec_XMLCharRefReplaceErrors(exc); |
1392 | 0 | } |
1393 | | |
1394 | | |
1395 | | static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc) |
1396 | 0 | { |
1397 | 0 | return PyCodec_BackslashReplaceErrors(exc); |
1398 | 0 | } |
1399 | | |
1400 | | static PyObject *namereplace_errors(PyObject *self, PyObject *exc) |
1401 | 0 | { |
1402 | 0 | return PyCodec_NameReplaceErrors(exc); |
1403 | 0 | } |
1404 | | |
1405 | | static PyObject *surrogatepass_errors(PyObject *self, PyObject *exc) |
1406 | 0 | { |
1407 | 0 | return PyCodec_SurrogatePassErrors(exc); |
1408 | 0 | } |
1409 | | |
1410 | | static PyObject *surrogateescape_errors(PyObject *self, PyObject *exc) |
1411 | 0 | { |
1412 | 0 | return PyCodec_SurrogateEscapeErrors(exc); |
1413 | 0 | } |
1414 | | |
1415 | | static int _PyCodecRegistry_Init(void) |
1416 | 14 | { |
1417 | 14 | static struct { |
1418 | 14 | const char *name; |
1419 | 14 | PyMethodDef def; |
1420 | 14 | } methods[] = |
1421 | 14 | { |
1422 | 14 | { |
1423 | 14 | "strict", |
1424 | 14 | { |
1425 | 14 | "strict_errors", |
1426 | 14 | strict_errors, |
1427 | 14 | METH_O, |
1428 | 14 | PyDoc_STR("Implements the 'strict' error handling, which " |
1429 | 14 | "raises a UnicodeError on coding errors.") |
1430 | 14 | } |
1431 | 14 | }, |
1432 | 14 | { |
1433 | 14 | "ignore", |
1434 | 14 | { |
1435 | 14 | "ignore_errors", |
1436 | 14 | ignore_errors, |
1437 | 14 | METH_O, |
1438 | 14 | PyDoc_STR("Implements the 'ignore' error handling, which " |
1439 | 14 | "ignores malformed data and continues.") |
1440 | 14 | } |
1441 | 14 | }, |
1442 | 14 | { |
1443 | 14 | "replace", |
1444 | 14 | { |
1445 | 14 | "replace_errors", |
1446 | 14 | replace_errors, |
1447 | 14 | METH_O, |
1448 | 14 | PyDoc_STR("Implements the 'replace' error handling, which " |
1449 | 14 | "replaces malformed data with a replacement marker.") |
1450 | 14 | } |
1451 | 14 | }, |
1452 | 14 | { |
1453 | 14 | "xmlcharrefreplace", |
1454 | 14 | { |
1455 | 14 | "xmlcharrefreplace_errors", |
1456 | 14 | xmlcharrefreplace_errors, |
1457 | 14 | METH_O, |
1458 | 14 | PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, " |
1459 | 14 | "which replaces an unencodable character with the " |
1460 | 14 | "appropriate XML character reference.") |
1461 | 14 | } |
1462 | 14 | }, |
1463 | 14 | { |
1464 | 14 | "backslashreplace", |
1465 | 14 | { |
1466 | 14 | "backslashreplace_errors", |
1467 | 14 | backslashreplace_errors, |
1468 | 14 | METH_O, |
1469 | 14 | PyDoc_STR("Implements the 'backslashreplace' error handling, " |
1470 | 14 | "which replaces malformed data with a backslashed " |
1471 | 14 | "escape sequence.") |
1472 | 14 | } |
1473 | 14 | }, |
1474 | 14 | { |
1475 | 14 | "namereplace", |
1476 | 14 | { |
1477 | 14 | "namereplace_errors", |
1478 | 14 | namereplace_errors, |
1479 | 14 | METH_O, |
1480 | 14 | PyDoc_STR("Implements the 'namereplace' error handling, " |
1481 | 14 | "which replaces an unencodable character with a " |
1482 | 14 | "\\N{...} escape sequence.") |
1483 | 14 | } |
1484 | 14 | }, |
1485 | 14 | { |
1486 | 14 | "surrogatepass", |
1487 | 14 | { |
1488 | 14 | "surrogatepass", |
1489 | 14 | surrogatepass_errors, |
1490 | 14 | METH_O |
1491 | 14 | } |
1492 | 14 | }, |
1493 | 14 | { |
1494 | 14 | "surrogateescape", |
1495 | 14 | { |
1496 | 14 | "surrogateescape", |
1497 | 14 | surrogateescape_errors, |
1498 | 14 | METH_O |
1499 | 14 | } |
1500 | 14 | } |
1501 | 14 | }; |
1502 | | |
1503 | 14 | PyInterpreterState *interp = _PyInterpreterState_Get(); |
1504 | 14 | PyObject *mod; |
1505 | 14 | unsigned i; |
1506 | | |
1507 | 14 | if (interp->codec_search_path != NULL) |
1508 | 0 | return 0; |
1509 | | |
1510 | 14 | interp->codec_search_path = PyList_New(0); |
1511 | 14 | interp->codec_search_cache = PyDict_New(); |
1512 | 14 | interp->codec_error_registry = PyDict_New(); |
1513 | | |
1514 | 14 | if (interp->codec_error_registry) { |
1515 | 126 | for (i = 0; i < Py_ARRAY_LENGTH(methods); ++i) { |
1516 | 112 | PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL); |
1517 | 112 | int res; |
1518 | 112 | if (!func) |
1519 | 0 | Py_FatalError("can't initialize codec error registry"); |
1520 | 112 | res = PyCodec_RegisterError(methods[i].name, func); |
1521 | 112 | Py_DECREF(func); |
1522 | 112 | if (res) |
1523 | 0 | Py_FatalError("can't initialize codec error registry"); |
1524 | 112 | } |
1525 | 14 | } |
1526 | | |
1527 | 14 | if (interp->codec_search_path == NULL || |
1528 | 14 | interp->codec_search_cache == NULL || |
1529 | 14 | interp->codec_error_registry == NULL) |
1530 | 0 | Py_FatalError("can't initialize codec registry"); |
1531 | | |
1532 | 14 | mod = PyImport_ImportModuleNoBlock("encodings"); |
1533 | 14 | if (mod == NULL) { |
1534 | 0 | return -1; |
1535 | 0 | } |
1536 | 14 | Py_DECREF(mod); |
1537 | 14 | interp->codecs_initialized = 1; |
1538 | 14 | return 0; |
1539 | 14 | } |