Coverage Report

Created: 2026-06-14 06:25

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/Python-3.8.3/Python/codecs.c
Line
Count
Source
1
/* ------------------------------------------------------------------------
2
3
   Python Codec Registry and support functions
4
5
Written by Marc-Andre Lemburg (mal@lemburg.com).
6
7
Copyright (c) Corporation for National Research Initiatives.
8
9
   ------------------------------------------------------------------------ */
10
11
#include "Python.h"
12
#include "pycore_pystate.h"
13
#include "ucnhash.h"
14
#include <ctype.h>
15
16
const char *Py_hexdigits = "0123456789abcdef";
17
18
/* --- Codec Registry ----------------------------------------------------- */
19
20
/* Import the standard encodings package which will register the first
21
   codec search function.
22
23
   This is done in a lazy way so that the Unicode implementation does
24
   not downgrade startup time of scripts not needing it.
25
26
   ImportErrors are silently ignored by this function. Only one try is
27
   made.
28
29
*/
30
31
static int _PyCodecRegistry_Init(void); /* Forward */
32
33
int PyCodec_Register(PyObject *search_function)
34
13
{
35
13
    PyInterpreterState *interp = _PyInterpreterState_Get();
36
13
    if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
37
0
        goto onError;
38
13
    if (search_function == NULL) {
39
0
        PyErr_BadArgument();
40
0
        goto onError;
41
0
    }
42
13
    if (!PyCallable_Check(search_function)) {
43
0
        PyErr_SetString(PyExc_TypeError, "argument must be callable");
44
0
        goto onError;
45
0
    }
46
13
    return PyList_Append(interp->codec_search_path, search_function);
47
48
0
 onError:
49
0
    return -1;
50
13
}
51
52
/* Convert a string to a normalized Python string: all characters are
53
   converted to lower case, spaces are replaced with underscores. */
54
55
static
56
PyObject *normalizestring(const char *string)
57
79
{
58
79
    size_t i;
59
79
    size_t len = strlen(string);
60
79
    char *p;
61
79
    PyObject *v;
62
63
79
    if (len > PY_SSIZE_T_MAX) {
64
0
        PyErr_SetString(PyExc_OverflowError, "string is too large");
65
0
        return NULL;
66
0
    }
67
68
79
    p = PyMem_Malloc(len + 1);
69
79
    if (p == NULL)
70
0
        return PyErr_NoMemory();
71
708
    for (i = 0; i < len; i++) {
72
629
        char ch = string[i];
73
629
        if (ch == ' ')
74
0
            ch = '-';
75
629
        else
76
629
            ch = Py_TOLOWER(Py_CHARMASK(ch));
77
629
        p[i] = ch;
78
629
    }
79
79
    p[i] = '\0';
80
79
    v = PyUnicode_FromString(p);
81
79
    PyMem_Free(p);
82
79
    return v;
83
79
}
84
85
/* Lookup the given encoding and return a tuple providing the codec
86
   facilities.
87
88
   The encoding string is looked up converted to all lower-case
89
   characters. This makes encodings looked up through this mechanism
90
   effectively case-insensitive.
91
92
   If no codec is found, a LookupError is set and NULL returned.
93
94
   As side effect, this tries to load the encodings package, if not
95
   yet done. This is part of the lazy load strategy for the encodings
96
   package.
97
98
*/
99
100
PyObject *_PyCodec_Lookup(const char *encoding)
101
79
{
102
79
    PyObject *result, *args = NULL, *v;
103
79
    Py_ssize_t i, len;
104
105
79
    if (encoding == NULL) {
106
0
        PyErr_BadArgument();
107
0
        goto onError;
108
0
    }
109
110
79
    PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
111
79
    if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
112
0
        goto onError;
113
114
    /* Convert the encoding to a normalized Python string: all
115
       characters are converted to lower case, spaces and hyphens are
116
       replaced with underscores. */
117
79
    v = normalizestring(encoding);
118
79
    if (v == NULL)
119
0
        goto onError;
120
79
    PyUnicode_InternInPlace(&v);
121
122
    /* First, try to lookup the name in the registry dictionary */
123
79
    result = PyDict_GetItemWithError(interp->codec_search_cache, v);
124
79
    if (result != NULL) {
125
52
        Py_INCREF(result);
126
52
        Py_DECREF(v);
127
52
        return result;
128
52
    }
129
27
    else if (PyErr_Occurred()) {
130
0
        Py_DECREF(v);
131
0
        return NULL;
132
0
    }
133
134
    /* Next, scan the search functions in order of registration */
135
27
    args = PyTuple_New(1);
136
27
    if (args == NULL) {
137
0
        Py_DECREF(v);
138
0
        return NULL;
139
0
    }
140
27
    PyTuple_SET_ITEM(args,0,v);
141
142
27
    len = PyList_Size(interp->codec_search_path);
143
27
    if (len < 0)
144
0
        goto onError;
145
27
    if (len == 0) {
146
0
        PyErr_SetString(PyExc_LookupError,
147
0
                        "no codec search functions registered: "
148
0
                        "can't find encoding");
149
0
        goto onError;
150
0
    }
151
152
27
    for (i = 0; i < len; i++) {
153
27
        PyObject *func;
154
155
27
        func = PyList_GetItem(interp->codec_search_path, i);
156
27
        if (func == NULL)
157
0
            goto onError;
158
27
        result = PyEval_CallObject(func, args);
159
27
        if (result == NULL)
160
0
            goto onError;
161
27
        if (result == Py_None) {
162
0
            Py_DECREF(result);
163
0
            continue;
164
0
        }
165
27
        if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
166
0
            PyErr_SetString(PyExc_TypeError,
167
0
                            "codec search functions must return 4-tuples");
168
0
            Py_DECREF(result);
169
0
            goto onError;
170
0
        }
171
27
        break;
172
27
    }
173
27
    if (i == len) {
174
        /* XXX Perhaps we should cache misses too ? */
175
0
        PyErr_Format(PyExc_LookupError,
176
0
                     "unknown encoding: %s", encoding);
177
0
        goto onError;
178
0
    }
179
180
    /* Cache and return the result */
181
27
    if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) {
182
0
        Py_DECREF(result);
183
0
        goto onError;
184
0
    }
185
27
    Py_DECREF(args);
186
27
    return result;
187
188
0
 onError:
189
0
    Py_XDECREF(args);
190
0
    return NULL;
191
27
}
192
193
int _PyCodec_Forget(const char *encoding)
194
0
{
195
0
    PyObject *v;
196
0
    int result;
197
198
0
    PyInterpreterState *interp = _PyInterpreterState_Get();
199
0
    if (interp->codec_search_path == NULL) {
200
0
        return -1;
201
0
    }
202
203
    /* Convert the encoding to a normalized Python string: all
204
       characters are converted to lower case, spaces and hyphens are
205
       replaced with underscores. */
206
0
    v = normalizestring(encoding);
207
0
    if (v == NULL) {
208
0
        return -1;
209
0
    }
210
211
    /* Drop the named codec from the internal cache */
212
0
    result = PyDict_DelItem(interp->codec_search_cache, v);
213
0
    Py_DECREF(v);
214
215
0
    return result;
216
0
}
217
218
/* Codec registry encoding check API. */
219
220
int PyCodec_KnownEncoding(const char *encoding)
221
0
{
222
0
    PyObject *codecs;
223
224
0
    codecs = _PyCodec_Lookup(encoding);
225
0
    if (!codecs) {
226
0
        PyErr_Clear();
227
0
        return 0;
228
0
    }
229
0
    else {
230
0
        Py_DECREF(codecs);
231
0
        return 1;
232
0
    }
233
0
}
234
235
static
236
PyObject *args_tuple(PyObject *object,
237
                     const char *errors)
238
0
{
239
0
    PyObject *args;
240
241
0
    args = PyTuple_New(1 + (errors != NULL));
242
0
    if (args == NULL)
243
0
        return NULL;
244
0
    Py_INCREF(object);
245
0
    PyTuple_SET_ITEM(args,0,object);
246
0
    if (errors) {
247
0
        PyObject *v;
248
249
0
        v = PyUnicode_FromString(errors);
250
0
        if (v == NULL) {
251
0
            Py_DECREF(args);
252
0
            return NULL;
253
0
        }
254
0
        PyTuple_SET_ITEM(args, 1, v);
255
0
    }
256
0
    return args;
257
0
}
258
259
/* Helper function to get a codec item */
260
261
static
262
PyObject *codec_getitem(const char *encoding, int index)
263
0
{
264
0
    PyObject *codecs;
265
0
    PyObject *v;
266
267
0
    codecs = _PyCodec_Lookup(encoding);
268
0
    if (codecs == NULL)
269
0
        return NULL;
270
0
    v = PyTuple_GET_ITEM(codecs, index);
271
0
    Py_DECREF(codecs);
272
0
    Py_INCREF(v);
273
0
    return v;
274
0
}
275
276
/* Helper functions to create an incremental codec. */
277
static
278
PyObject *codec_makeincrementalcodec(PyObject *codec_info,
279
                                     const char *errors,
280
                                     const char *attrname)
281
40
{
282
40
    PyObject *ret, *inccodec;
283
284
40
    inccodec = PyObject_GetAttrString(codec_info, attrname);
285
40
    if (inccodec == NULL)
286
0
        return NULL;
287
40
    if (errors)
288
40
        ret = PyObject_CallFunction(inccodec, "s", errors);
289
0
    else
290
0
        ret = _PyObject_CallNoArg(inccodec);
291
40
    Py_DECREF(inccodec);
292
40
    return ret;
293
40
}
294
295
static
296
PyObject *codec_getincrementalcodec(const char *encoding,
297
                                    const char *errors,
298
                                    const char *attrname)
299
0
{
300
0
    PyObject *codec_info, *ret;
301
302
0
    codec_info = _PyCodec_Lookup(encoding);
303
0
    if (codec_info == NULL)
304
0
        return NULL;
305
0
    ret = codec_makeincrementalcodec(codec_info, errors, attrname);
306
0
    Py_DECREF(codec_info);
307
0
    return ret;
308
0
}
309
310
/* Helper function to create a stream codec. */
311
312
static
313
PyObject *codec_getstreamcodec(const char *encoding,
314
                               PyObject *stream,
315
                               const char *errors,
316
                               const int index)
317
0
{
318
0
    PyObject *codecs, *streamcodec, *codeccls;
319
320
0
    codecs = _PyCodec_Lookup(encoding);
321
0
    if (codecs == NULL)
322
0
        return NULL;
323
324
0
    codeccls = PyTuple_GET_ITEM(codecs, index);
325
0
    if (errors != NULL)
326
0
        streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
327
0
    else
328
0
        streamcodec = PyObject_CallFunctionObjArgs(codeccls, stream, NULL);
329
0
    Py_DECREF(codecs);
330
0
    return streamcodec;
331
0
}
332
333
/* Helpers to work with the result of _PyCodec_Lookup
334
335
 */
336
PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info,
337
                                             const char *errors)
338
14
{
339
14
    return codec_makeincrementalcodec(codec_info, errors,
340
14
                                      "incrementaldecoder");
341
14
}
342
343
PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info,
344
                                             const char *errors)
345
26
{
346
26
    return codec_makeincrementalcodec(codec_info, errors,
347
26
                                      "incrementalencoder");
348
26
}
349
350
351
/* Convenience APIs to query the Codec registry.
352
353
   All APIs return a codec object with incremented refcount.
354
355
 */
356
357
PyObject *PyCodec_Encoder(const char *encoding)
358
0
{
359
0
    return codec_getitem(encoding, 0);
360
0
}
361
362
PyObject *PyCodec_Decoder(const char *encoding)
363
0
{
364
0
    return codec_getitem(encoding, 1);
365
0
}
366
367
PyObject *PyCodec_IncrementalEncoder(const char *encoding,
368
                                     const char *errors)
369
0
{
370
0
    return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
371
0
}
372
373
PyObject *PyCodec_IncrementalDecoder(const char *encoding,
374
                                     const char *errors)
375
0
{
376
0
    return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
377
0
}
378
379
PyObject *PyCodec_StreamReader(const char *encoding,
380
                               PyObject *stream,
381
                               const char *errors)
382
0
{
383
0
    return codec_getstreamcodec(encoding, stream, errors, 2);
384
0
}
385
386
PyObject *PyCodec_StreamWriter(const char *encoding,
387
                               PyObject *stream,
388
                               const char *errors)
389
0
{
390
0
    return codec_getstreamcodec(encoding, stream, errors, 3);
391
0
}
392
393
/* Helper that tries to ensure the reported exception chain indicates the
394
 * codec that was invoked to trigger the failure without changing the type
395
 * of the exception raised.
396
 */
397
static void
398
wrap_codec_error(const char *operation,
399
                 const char *encoding)
400
0
{
401
    /* TrySetFromCause will replace the active exception with a suitably
402
     * updated clone if it can, otherwise it will leave the original
403
     * exception alone.
404
     */
405
0
    _PyErr_TrySetFromCause("%s with '%s' codec failed",
406
0
                           operation, encoding);
407
0
}
408
409
/* Encode an object (e.g. a Unicode object) using the given encoding
410
   and return the resulting encoded object (usually a Python string).
411
412
   errors is passed to the encoder factory as argument if non-NULL. */
413
414
static PyObject *
415
_PyCodec_EncodeInternal(PyObject *object,
416
                        PyObject *encoder,
417
                        const char *encoding,
418
                        const char *errors)
419
0
{
420
0
    PyObject *args = NULL, *result = NULL;
421
0
    PyObject *v = NULL;
422
423
0
    args = args_tuple(object, errors);
424
0
    if (args == NULL)
425
0
        goto onError;
426
427
0
    result = PyEval_CallObject(encoder, args);
428
0
    if (result == NULL) {
429
0
        wrap_codec_error("encoding", encoding);
430
0
        goto onError;
431
0
    }
432
433
0
    if (!PyTuple_Check(result) ||
434
0
        PyTuple_GET_SIZE(result) != 2) {
435
0
        PyErr_SetString(PyExc_TypeError,
436
0
                        "encoder must return a tuple (object, integer)");
437
0
        goto onError;
438
0
    }
439
0
    v = PyTuple_GET_ITEM(result,0);
440
0
    Py_INCREF(v);
441
    /* We don't check or use the second (integer) entry. */
442
443
0
    Py_DECREF(args);
444
0
    Py_DECREF(encoder);
445
0
    Py_DECREF(result);
446
0
    return v;
447
448
0
 onError:
449
0
    Py_XDECREF(result);
450
0
    Py_XDECREF(args);
451
0
    Py_XDECREF(encoder);
452
0
    return NULL;
453
0
}
454
455
/* Decode an object (usually a Python string) using the given encoding
456
   and return an equivalent object (e.g. a Unicode object).
457
458
   errors is passed to the decoder factory as argument if non-NULL. */
459
460
static PyObject *
461
_PyCodec_DecodeInternal(PyObject *object,
462
                        PyObject *decoder,
463
                        const char *encoding,
464
                        const char *errors)
465
0
{
466
0
    PyObject *args = NULL, *result = NULL;
467
0
    PyObject *v;
468
469
0
    args = args_tuple(object, errors);
470
0
    if (args == NULL)
471
0
        goto onError;
472
473
0
    result = PyEval_CallObject(decoder,args);
474
0
    if (result == NULL) {
475
0
        wrap_codec_error("decoding", encoding);
476
0
        goto onError;
477
0
    }
478
0
    if (!PyTuple_Check(result) ||
479
0
        PyTuple_GET_SIZE(result) != 2) {
480
0
        PyErr_SetString(PyExc_TypeError,
481
0
                        "decoder must return a tuple (object,integer)");
482
0
        goto onError;
483
0
    }
484
0
    v = PyTuple_GET_ITEM(result,0);
485
0
    Py_INCREF(v);
486
    /* We don't check or use the second (integer) entry. */
487
488
0
    Py_DECREF(args);
489
0
    Py_DECREF(decoder);
490
0
    Py_DECREF(result);
491
0
    return v;
492
493
0
 onError:
494
0
    Py_XDECREF(args);
495
0
    Py_XDECREF(decoder);
496
0
    Py_XDECREF(result);
497
0
    return NULL;
498
0
}
499
500
/* Generic encoding/decoding API */
501
PyObject *PyCodec_Encode(PyObject *object,
502
                         const char *encoding,
503
                         const char *errors)
504
0
{
505
0
    PyObject *encoder;
506
507
0
    encoder = PyCodec_Encoder(encoding);
508
0
    if (encoder == NULL)
509
0
        return NULL;
510
511
0
    return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
512
0
}
513
514
PyObject *PyCodec_Decode(PyObject *object,
515
                         const char *encoding,
516
                         const char *errors)
517
0
{
518
0
    PyObject *decoder;
519
520
0
    decoder = PyCodec_Decoder(encoding);
521
0
    if (decoder == NULL)
522
0
        return NULL;
523
524
0
    return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
525
0
}
526
527
/* Text encoding/decoding API */
528
PyObject * _PyCodec_LookupTextEncoding(const char *encoding,
529
                                       const char *alternate_command)
530
40
{
531
40
    _Py_IDENTIFIER(_is_text_encoding);
532
40
    PyObject *codec;
533
40
    PyObject *attr;
534
40
    int is_text_codec;
535
536
40
    codec = _PyCodec_Lookup(encoding);
537
40
    if (codec == NULL)
538
0
        return NULL;
539
540
    /* Backwards compatibility: assume any raw tuple describes a text
541
     * encoding, and the same for anything lacking the private
542
     * attribute.
543
     */
544
40
    if (!PyTuple_CheckExact(codec)) {
545
40
        if (_PyObject_LookupAttrId(codec, &PyId__is_text_encoding, &attr) < 0) {
546
0
            Py_DECREF(codec);
547
0
            return NULL;
548
0
        }
549
40
        if (attr != NULL) {
550
40
            is_text_codec = PyObject_IsTrue(attr);
551
40
            Py_DECREF(attr);
552
40
            if (is_text_codec <= 0) {
553
0
                Py_DECREF(codec);
554
0
                if (!is_text_codec)
555
0
                    PyErr_Format(PyExc_LookupError,
556
0
                                 "'%.400s' is not a text encoding; "
557
0
                                 "use %s to handle arbitrary codecs",
558
0
                                 encoding, alternate_command);
559
0
                return NULL;
560
0
            }
561
40
        }
562
40
    }
563
564
    /* This appears to be a valid text encoding */
565
40
    return codec;
566
40
}
567
568
569
static
570
PyObject *codec_getitem_checked(const char *encoding,
571
                                const char *alternate_command,
572
                                int index)
573
0
{
574
0
    PyObject *codec;
575
0
    PyObject *v;
576
577
0
    codec = _PyCodec_LookupTextEncoding(encoding, alternate_command);
578
0
    if (codec == NULL)
579
0
        return NULL;
580
581
0
    v = PyTuple_GET_ITEM(codec, index);
582
0
    Py_INCREF(v);
583
0
    Py_DECREF(codec);
584
0
    return v;
585
0
}
586
587
static PyObject * _PyCodec_TextEncoder(const char *encoding)
588
0
{
589
0
    return codec_getitem_checked(encoding, "codecs.encode()", 0);
590
0
}
591
592
static PyObject * _PyCodec_TextDecoder(const char *encoding)
593
0
{
594
0
    return codec_getitem_checked(encoding, "codecs.decode()", 1);
595
0
}
596
597
PyObject *_PyCodec_EncodeText(PyObject *object,
598
                              const char *encoding,
599
                              const char *errors)
600
0
{
601
0
    PyObject *encoder;
602
603
0
    encoder = _PyCodec_TextEncoder(encoding);
604
0
    if (encoder == NULL)
605
0
        return NULL;
606
607
0
    return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
608
0
}
609
610
PyObject *_PyCodec_DecodeText(PyObject *object,
611
                              const char *encoding,
612
                              const char *errors)
613
0
{
614
0
    PyObject *decoder;
615
616
0
    decoder = _PyCodec_TextDecoder(encoding);
617
0
    if (decoder == NULL)
618
0
        return NULL;
619
620
0
    return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
621
0
}
622
623
/* Register the error handling callback function error under the name
624
   name. This function will be called by the codec when it encounters
625
   an unencodable characters/undecodable bytes and doesn't know the
626
   callback name, when name is specified as the error parameter
627
   in the call to the encode/decode function.
628
   Return 0 on success, -1 on error */
629
int PyCodec_RegisterError(const char *name, PyObject *error)
630
104
{
631
104
    PyInterpreterState *interp = _PyInterpreterState_Get();
632
104
    if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
633
0
        return -1;
634
104
    if (!PyCallable_Check(error)) {
635
0
        PyErr_SetString(PyExc_TypeError, "handler must be callable");
636
0
        return -1;
637
0
    }
638
104
    return PyDict_SetItemString(interp->codec_error_registry,
639
104
                                name, error);
640
104
}
641
642
/* Lookup the error handling callback function registered under the
643
   name error. As a special case NULL can be passed, in which case
644
   the error handling callback for strict encoding will be returned. */
645
PyObject *PyCodec_LookupError(const char *name)
646
78
{
647
78
    PyObject *handler = NULL;
648
649
78
    PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
650
78
    if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
651
0
        return NULL;
652
653
78
    if (name==NULL)
654
0
        name = "strict";
655
78
    handler = _PyDict_GetItemStringWithError(interp->codec_error_registry, name);
656
78
    if (handler) {
657
78
        Py_INCREF(handler);
658
78
    }
659
0
    else if (!PyErr_Occurred()) {
660
0
        PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
661
0
    }
662
78
    return handler;
663
78
}
664
665
static void wrong_exception_type(PyObject *exc)
666
0
{
667
0
    PyErr_Format(PyExc_TypeError,
668
0
                 "don't know how to handle %.200s in error callback",
669
0
                 exc->ob_type->tp_name);
670
0
}
671
672
PyObject *PyCodec_StrictErrors(PyObject *exc)
673
0
{
674
0
    if (PyExceptionInstance_Check(exc))
675
0
        PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
676
0
    else
677
0
        PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
678
0
    return NULL;
679
0
}
680
681
682
PyObject *PyCodec_IgnoreErrors(PyObject *exc)
683
0
{
684
0
    Py_ssize_t end;
685
686
0
    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
687
0
        if (PyUnicodeEncodeError_GetEnd(exc, &end))
688
0
            return NULL;
689
0
    }
690
0
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
691
0
        if (PyUnicodeDecodeError_GetEnd(exc, &end))
692
0
            return NULL;
693
0
    }
694
0
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
695
0
        if (PyUnicodeTranslateError_GetEnd(exc, &end))
696
0
            return NULL;
697
0
    }
698
0
    else {
699
0
        wrong_exception_type(exc);
700
0
        return NULL;
701
0
    }
702
0
    return Py_BuildValue("(Nn)", PyUnicode_New(0, 0), end);
703
0
}
704
705
706
PyObject *PyCodec_ReplaceErrors(PyObject *exc)
707
0
{
708
0
    Py_ssize_t start, end, i, len;
709
710
0
    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
711
0
        PyObject *res;
712
0
        int kind;
713
0
        void *data;
714
0
        if (PyUnicodeEncodeError_GetStart(exc, &start))
715
0
            return NULL;
716
0
        if (PyUnicodeEncodeError_GetEnd(exc, &end))
717
0
            return NULL;
718
0
        len = end - start;
719
0
        res = PyUnicode_New(len, '?');
720
0
        if (res == NULL)
721
0
            return NULL;
722
0
        kind = PyUnicode_KIND(res);
723
0
        data = PyUnicode_DATA(res);
724
0
        for (i = 0; i < len; ++i)
725
0
            PyUnicode_WRITE(kind, data, i, '?');
726
0
        assert(_PyUnicode_CheckConsistency(res, 1));
727
0
        return Py_BuildValue("(Nn)", res, end);
728
0
    }
729
0
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
730
0
        if (PyUnicodeDecodeError_GetEnd(exc, &end))
731
0
            return NULL;
732
0
        return Py_BuildValue("(Cn)",
733
0
                             (int)Py_UNICODE_REPLACEMENT_CHARACTER,
734
0
                             end);
735
0
    }
736
0
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
737
0
        PyObject *res;
738
0
        int kind;
739
0
        void *data;
740
0
        if (PyUnicodeTranslateError_GetStart(exc, &start))
741
0
            return NULL;
742
0
        if (PyUnicodeTranslateError_GetEnd(exc, &end))
743
0
            return NULL;
744
0
        len = end - start;
745
0
        res = PyUnicode_New(len, Py_UNICODE_REPLACEMENT_CHARACTER);
746
0
        if (res == NULL)
747
0
            return NULL;
748
0
        kind = PyUnicode_KIND(res);
749
0
        data = PyUnicode_DATA(res);
750
0
        for (i=0; i < len; i++)
751
0
            PyUnicode_WRITE(kind, data, i, Py_UNICODE_REPLACEMENT_CHARACTER);
752
0
        assert(_PyUnicode_CheckConsistency(res, 1));
753
0
        return Py_BuildValue("(Nn)", res, end);
754
0
    }
755
0
    else {
756
0
        wrong_exception_type(exc);
757
0
        return NULL;
758
0
    }
759
0
}
760
761
PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
762
0
{
763
0
    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
764
0
        PyObject *restuple;
765
0
        PyObject *object;
766
0
        Py_ssize_t i;
767
0
        Py_ssize_t start;
768
0
        Py_ssize_t end;
769
0
        PyObject *res;
770
0
        unsigned char *outp;
771
0
        Py_ssize_t ressize;
772
0
        Py_UCS4 ch;
773
0
        if (PyUnicodeEncodeError_GetStart(exc, &start))
774
0
            return NULL;
775
0
        if (PyUnicodeEncodeError_GetEnd(exc, &end))
776
0
            return NULL;
777
0
        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
778
0
            return NULL;
779
0
        if (end - start > PY_SSIZE_T_MAX / (2+7+1))
780
0
            end = start + PY_SSIZE_T_MAX / (2+7+1);
781
0
        for (i = start, ressize = 0; i < end; ++i) {
782
            /* object is guaranteed to be "ready" */
783
0
            ch = PyUnicode_READ_CHAR(object, i);
784
0
            if (ch<10)
785
0
                ressize += 2+1+1;
786
0
            else if (ch<100)
787
0
                ressize += 2+2+1;
788
0
            else if (ch<1000)
789
0
                ressize += 2+3+1;
790
0
            else if (ch<10000)
791
0
                ressize += 2+4+1;
792
0
            else if (ch<100000)
793
0
                ressize += 2+5+1;
794
0
            else if (ch<1000000)
795
0
                ressize += 2+6+1;
796
0
            else
797
0
                ressize += 2+7+1;
798
0
        }
799
        /* allocate replacement */
800
0
        res = PyUnicode_New(ressize, 127);
801
0
        if (res == NULL) {
802
0
            Py_DECREF(object);
803
0
            return NULL;
804
0
        }
805
0
        outp = PyUnicode_1BYTE_DATA(res);
806
        /* generate replacement */
807
0
        for (i = start; i < end; ++i) {
808
0
            int digits;
809
0
            int base;
810
0
            ch = PyUnicode_READ_CHAR(object, i);
811
0
            *outp++ = '&';
812
0
            *outp++ = '#';
813
0
            if (ch<10) {
814
0
                digits = 1;
815
0
                base = 1;
816
0
            }
817
0
            else if (ch<100) {
818
0
                digits = 2;
819
0
                base = 10;
820
0
            }
821
0
            else if (ch<1000) {
822
0
                digits = 3;
823
0
                base = 100;
824
0
            }
825
0
            else if (ch<10000) {
826
0
                digits = 4;
827
0
                base = 1000;
828
0
            }
829
0
            else if (ch<100000) {
830
0
                digits = 5;
831
0
                base = 10000;
832
0
            }
833
0
            else if (ch<1000000) {
834
0
                digits = 6;
835
0
                base = 100000;
836
0
            }
837
0
            else {
838
0
                digits = 7;
839
0
                base = 1000000;
840
0
            }
841
0
            while (digits-->0) {
842
0
                *outp++ = '0' + ch/base;
843
0
                ch %= base;
844
0
                base /= 10;
845
0
            }
846
0
            *outp++ = ';';
847
0
        }
848
0
        assert(_PyUnicode_CheckConsistency(res, 1));
849
0
        restuple = Py_BuildValue("(Nn)", res, end);
850
0
        Py_DECREF(object);
851
0
        return restuple;
852
0
    }
853
0
    else {
854
0
        wrong_exception_type(exc);
855
0
        return NULL;
856
0
    }
857
0
}
858
859
PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
860
0
{
861
0
    PyObject *object;
862
0
    Py_ssize_t i;
863
0
    Py_ssize_t start;
864
0
    Py_ssize_t end;
865
0
    PyObject *res;
866
0
    unsigned char *outp;
867
0
    int ressize;
868
0
    Py_UCS4 c;
869
870
0
    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
871
0
        const unsigned char *p;
872
0
        if (PyUnicodeDecodeError_GetStart(exc, &start))
873
0
            return NULL;
874
0
        if (PyUnicodeDecodeError_GetEnd(exc, &end))
875
0
            return NULL;
876
0
        if (!(object = PyUnicodeDecodeError_GetObject(exc)))
877
0
            return NULL;
878
0
        p = (const unsigned char*)PyBytes_AS_STRING(object);
879
0
        res = PyUnicode_New(4 * (end - start), 127);
880
0
        if (res == NULL) {
881
0
            Py_DECREF(object);
882
0
            return NULL;
883
0
        }
884
0
        outp = PyUnicode_1BYTE_DATA(res);
885
0
        for (i = start; i < end; i++, outp += 4) {
886
0
            unsigned char c = p[i];
887
0
            outp[0] = '\\';
888
0
            outp[1] = 'x';
889
0
            outp[2] = Py_hexdigits[(c>>4)&0xf];
890
0
            outp[3] = Py_hexdigits[c&0xf];
891
0
        }
892
893
0
        assert(_PyUnicode_CheckConsistency(res, 1));
894
0
        Py_DECREF(object);
895
0
        return Py_BuildValue("(Nn)", res, end);
896
0
    }
897
0
    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
898
0
        if (PyUnicodeEncodeError_GetStart(exc, &start))
899
0
            return NULL;
900
0
        if (PyUnicodeEncodeError_GetEnd(exc, &end))
901
0
            return NULL;
902
0
        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
903
0
            return NULL;
904
0
    }
905
0
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
906
0
        if (PyUnicodeTranslateError_GetStart(exc, &start))
907
0
            return NULL;
908
0
        if (PyUnicodeTranslateError_GetEnd(exc, &end))
909
0
            return NULL;
910
0
        if (!(object = PyUnicodeTranslateError_GetObject(exc)))
911
0
            return NULL;
912
0
    }
913
0
    else {
914
0
        wrong_exception_type(exc);
915
0
        return NULL;
916
0
    }
917
918
0
    if (end - start > PY_SSIZE_T_MAX / (1+1+8))
919
0
        end = start + PY_SSIZE_T_MAX / (1+1+8);
920
0
    for (i = start, ressize = 0; i < end; ++i) {
921
        /* object is guaranteed to be "ready" */
922
0
        c = PyUnicode_READ_CHAR(object, i);
923
0
        if (c >= 0x10000) {
924
0
            ressize += 1+1+8;
925
0
        }
926
0
        else if (c >= 0x100) {
927
0
            ressize += 1+1+4;
928
0
        }
929
0
        else
930
0
            ressize += 1+1+2;
931
0
    }
932
0
    res = PyUnicode_New(ressize, 127);
933
0
    if (res == NULL) {
934
0
        Py_DECREF(object);
935
0
        return NULL;
936
0
    }
937
0
    outp = PyUnicode_1BYTE_DATA(res);
938
0
    for (i = start; i < end; ++i) {
939
0
        c = PyUnicode_READ_CHAR(object, i);
940
0
        *outp++ = '\\';
941
0
        if (c >= 0x00010000) {
942
0
            *outp++ = 'U';
943
0
            *outp++ = Py_hexdigits[(c>>28)&0xf];
944
0
            *outp++ = Py_hexdigits[(c>>24)&0xf];
945
0
            *outp++ = Py_hexdigits[(c>>20)&0xf];
946
0
            *outp++ = Py_hexdigits[(c>>16)&0xf];
947
0
            *outp++ = Py_hexdigits[(c>>12)&0xf];
948
0
            *outp++ = Py_hexdigits[(c>>8)&0xf];
949
0
        }
950
0
        else if (c >= 0x100) {
951
0
            *outp++ = 'u';
952
0
            *outp++ = Py_hexdigits[(c>>12)&0xf];
953
0
            *outp++ = Py_hexdigits[(c>>8)&0xf];
954
0
        }
955
0
        else
956
0
            *outp++ = 'x';
957
0
        *outp++ = Py_hexdigits[(c>>4)&0xf];
958
0
        *outp++ = Py_hexdigits[c&0xf];
959
0
    }
960
961
0
    assert(_PyUnicode_CheckConsistency(res, 1));
962
0
    Py_DECREF(object);
963
0
    return Py_BuildValue("(Nn)", res, end);
964
0
}
965
966
/* Cached pointer to the unicodedata name C API.  It stays NULL until
   PyCodec_NameReplaceErrors() imports the capsule on first use. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
967
968
/* Error handler implementing the 'namereplace' scheme.

   Only UnicodeEncodeError instances are handled.  Every character in the
   exception's [start, end) range is replaced by a \N{NAME} escape when the
   unicodedata name lookup succeeds, and otherwise by a \xHH, \uHHHH or
   \UHHHHHHHH escape chosen from the size of the code point.

   Returns a new (replacement, end) tuple, or NULL with an exception set.
   For any other exception type wrong_exception_type() is called and NULL
   is returned. */
PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
{
    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        PyObject *restuple;
        PyObject *object;
        Py_ssize_t i;
        Py_ssize_t start;
        Py_ssize_t end;
        PyObject *res;
        unsigned char *outp;
        Py_ssize_t ressize;
        int replsize;
        Py_UCS4 c;
        char buffer[256]; /* NAME_MAXLEN */
        if (PyUnicodeEncodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeEncodeError_GetEnd(exc, &end))
            return NULL;
        /* 'object' is an owned reference from here on; every exit path
           below has to release it. */
        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
            return NULL;
        if (!ucnhash_CAPI) {
            /* load the unicode data module */
            ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
                                            PyUnicodeData_CAPSULE_NAME, 1);
            if (!ucnhash_CAPI) {
                Py_DECREF(object);
                return NULL;
            }
        }
        /* First pass: compute the size of the replacement string. */
        for (i = start, ressize = 0; i < end; ++i) {
            /* object is guaranteed to be "ready" */
            c = PyUnicode_READ_CHAR(object, i);
            if (ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) {
                /* backslash + 'N' + '{' + name + '}' */
                replsize = 1+1+1+(int)strlen(buffer)+1;
            }
            else if (c >= 0x10000) {
                replsize = 1+1+8;
            }
            else if (c >= 0x100) {
                replsize = 1+1+4;
            }
            else
                replsize = 1+1+2;
            /* Stop early instead of overflowing ressize; the characters
               not covered are left for a later call. */
            if (ressize > PY_SSIZE_T_MAX - replsize)
                break;
            ressize += replsize;
        }
        end = i;
        res = PyUnicode_New(ressize, 127);
        if (res == NULL) {
            Py_DECREF(object);
            return NULL;
        }
        /* Second pass: write the escapes into the ASCII result buffer. */
        for (i = start, outp = PyUnicode_1BYTE_DATA(res);
            i < end; ++i) {
            c = PyUnicode_READ_CHAR(object, i);
            *outp++ = '\\';
            if (ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) {
                *outp++ = 'N';
                *outp++ = '{';
                /* The NUL written by strcpy() is overwritten by '}'. */
                strcpy((char *)outp, buffer);
                outp += strlen(buffer);
                *outp++ = '}';
                continue;
            }
            if (c >= 0x00010000) {
                *outp++ = 'U';
                *outp++ = Py_hexdigits[(c>>28)&0xf];
                *outp++ = Py_hexdigits[(c>>24)&0xf];
                *outp++ = Py_hexdigits[(c>>20)&0xf];
                *outp++ = Py_hexdigits[(c>>16)&0xf];
                *outp++ = Py_hexdigits[(c>>12)&0xf];
                *outp++ = Py_hexdigits[(c>>8)&0xf];
            }
            else if (c >= 0x100) {
                *outp++ = 'u';
                *outp++ = Py_hexdigits[(c>>12)&0xf];
                *outp++ = Py_hexdigits[(c>>8)&0xf];
            }
            else
                *outp++ = 'x';
            /* The two low hex digits are common to all three forms. */
            *outp++ = Py_hexdigits[(c>>4)&0xf];
            *outp++ = Py_hexdigits[c&0xf];
        }

        assert(outp == PyUnicode_1BYTE_DATA(res) + ressize);
        assert(_PyUnicode_CheckConsistency(res, 1));
        /* "N" hands the reference to res over to the tuple. */
        restuple = Py_BuildValue("(Nn)", res, end);
        Py_DECREF(object);
        return restuple;
    }
    else {
        wrong_exception_type(exc);
        return NULL;
    }
}
1060
1061
0
/* Encoding identifiers returned by get_standard_encoding(). */
#define ENC_UNKNOWN     (-1)
#define ENC_UTF8        0
#define ENC_UTF16BE     1
#define ENC_UTF16LE     2
#define ENC_UTF32BE     3
#define ENC_UTF32LE     4

/* Map an encoding name onto one of the ENC_* identifiers.

   Recognised spellings are "utf" (any letter case), an optional '-' or '_',
   then "8", "16" or "32"; the 16/32 forms may be followed by an optional
   '-' or '_' and "be" or "le" (any letter case).  Without a byte-order
   suffix the variant is chosen by WORDS_BIGENDIAN.  The exact string
   "CP_UTF8" is accepted as UTF-8 as well.

   On a successful match *bytelength is set to 3 for UTF-8, 2 for UTF-16 and
   4 for UTF-32.  When ENC_UNKNOWN is returned *bytelength may or may not
   have been written and must not be relied upon.

   encoding must be a NUL-terminated string. */
static int
get_standard_encoding(const char *encoding, int *bytelength)
{
    if (Py_TOLOWER(encoding[0]) == 'u' &&
        Py_TOLOWER(encoding[1]) == 't' &&
        Py_TOLOWER(encoding[2]) == 'f') {
        encoding += 3;
        if (*encoding == '-' || *encoding == '_' )
            encoding++;
        if (encoding[0] == '8' && encoding[1] == '\0') {
            *bytelength = 3;
            return ENC_UTF8;
        }
        else if (encoding[0] == '1' && encoding[1] == '6') {
            encoding += 2;
            *bytelength = 2;
            if (*encoding == '\0') {
#ifdef WORDS_BIGENDIAN
                return ENC_UTF16BE;
#else
                return ENC_UTF16LE;
#endif
            }
            if (*encoding == '-' || *encoding == '_' )
                encoding++;
            /* Check encoding[0] first: for a name such as "utf-16-" it is
               the terminator, and encoding[1] would lie past the end of
               the string. */
            if (encoding[0] != '\0' &&
                Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
                if (Py_TOLOWER(encoding[0]) == 'b')
                    return ENC_UTF16BE;
                if (Py_TOLOWER(encoding[0]) == 'l')
                    return ENC_UTF16LE;
            }
        }
        else if (encoding[0] == '3' && encoding[1] == '2') {
            encoding += 2;
            *bytelength = 4;
            if (*encoding == '\0') {
#ifdef WORDS_BIGENDIAN
                return ENC_UTF32BE;
#else
                return ENC_UTF32LE;
#endif
            }
            if (*encoding == '-' || *encoding == '_' )
                encoding++;
            /* Same guard as for UTF-16: never look beyond the terminator. */
            if (encoding[0] != '\0' &&
                Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
                if (Py_TOLOWER(encoding[0]) == 'b')
                    return ENC_UTF32BE;
                if (Py_TOLOWER(encoding[0]) == 'l')
                    return ENC_UTF32LE;
            }
        }
    }
    else if (strcmp(encoding, "CP_UTF8") == 0) {
        *bytelength = 3;
        return ENC_UTF8;
    }
    return ENC_UNKNOWN;
}
1126
1127
/* This handler is declared static until someone demonstrates
1128
   a need to call it directly. */
1129
static PyObject *
1130
PyCodec_SurrogatePassErrors(PyObject *exc)
1131
0
{
1132
0
    PyObject *restuple;
1133
0
    PyObject *object;
1134
0
    PyObject *encode;
1135
0
    const char *encoding;
1136
0
    int code;
1137
0
    int bytelength;
1138
0
    Py_ssize_t i;
1139
0
    Py_ssize_t start;
1140
0
    Py_ssize_t end;
1141
0
    PyObject *res;
1142
1143
0
    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
1144
0
        unsigned char *outp;
1145
0
        if (PyUnicodeEncodeError_GetStart(exc, &start))
1146
0
            return NULL;
1147
0
        if (PyUnicodeEncodeError_GetEnd(exc, &end))
1148
0
            return NULL;
1149
0
        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1150
0
            return NULL;
1151
0
        if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) {
1152
0
            Py_DECREF(object);
1153
0
            return NULL;
1154
0
        }
1155
0
        if (!(encoding = PyUnicode_AsUTF8(encode))) {
1156
0
            Py_DECREF(object);
1157
0
            Py_DECREF(encode);
1158
0
            return NULL;
1159
0
        }
1160
0
        code = get_standard_encoding(encoding, &bytelength);
1161
0
        Py_DECREF(encode);
1162
0
        if (code == ENC_UNKNOWN) {
1163
            /* Not supported, fail with original exception */
1164
0
            PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1165
0
            Py_DECREF(object);
1166
0
            return NULL;
1167
0
        }
1168
1169
0
        if (end - start > PY_SSIZE_T_MAX / bytelength)
1170
0
            end = start + PY_SSIZE_T_MAX / bytelength;
1171
0
        res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start));
1172
0
        if (!res) {
1173
0
            Py_DECREF(object);
1174
0
            return NULL;
1175
0
        }
1176
0
        outp = (unsigned char*)PyBytes_AsString(res);
1177
0
        for (i = start; i < end; i++) {
1178
            /* object is guaranteed to be "ready" */
1179
0
            Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
1180
0
            if (!Py_UNICODE_IS_SURROGATE(ch)) {
1181
                /* Not a surrogate, fail with original exception */
1182
0
                PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1183
0
                Py_DECREF(res);
1184
0
                Py_DECREF(object);
1185
0
                return NULL;
1186
0
            }
1187
0
            switch (code) {
1188
0
            case ENC_UTF8:
1189
0
                *outp++ = (unsigned char)(0xe0 | (ch >> 12));
1190
0
                *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
1191
0
                *outp++ = (unsigned char)(0x80 | (ch & 0x3f));
1192
0
                break;
1193
0
            case ENC_UTF16LE:
1194
0
                *outp++ = (unsigned char) ch;
1195
0
                *outp++ = (unsigned char)(ch >> 8);
1196
0
                break;
1197
0
            case ENC_UTF16BE:
1198
0
                *outp++ = (unsigned char)(ch >> 8);
1199
0
                *outp++ = (unsigned char) ch;
1200
0
                break;
1201
0
            case ENC_UTF32LE:
1202
0
                *outp++ = (unsigned char) ch;
1203
0
                *outp++ = (unsigned char)(ch >> 8);
1204
0
                *outp++ = (unsigned char)(ch >> 16);
1205
0
                *outp++ = (unsigned char)(ch >> 24);
1206
0
                break;
1207
0
            case ENC_UTF32BE:
1208
0
                *outp++ = (unsigned char)(ch >> 24);
1209
0
                *outp++ = (unsigned char)(ch >> 16);
1210
0
                *outp++ = (unsigned char)(ch >> 8);
1211
0
                *outp++ = (unsigned char) ch;
1212
0
                break;
1213
0
            }
1214
0
        }
1215
0
        restuple = Py_BuildValue("(On)", res, end);
1216
0
        Py_DECREF(res);
1217
0
        Py_DECREF(object);
1218
0
        return restuple;
1219
0
    }
1220
0
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
1221
0
        const unsigned char *p;
1222
0
        Py_UCS4 ch = 0;
1223
0
        if (PyUnicodeDecodeError_GetStart(exc, &start))
1224
0
            return NULL;
1225
0
        if (PyUnicodeDecodeError_GetEnd(exc, &end))
1226
0
            return NULL;
1227
0
        if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1228
0
            return NULL;
1229
0
        p = (const unsigned char*)PyBytes_AS_STRING(object);
1230
0
        if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) {
1231
0
            Py_DECREF(object);
1232
0
            return NULL;
1233
0
        }
1234
0
        if (!(encoding = PyUnicode_AsUTF8(encode))) {
1235
0
            Py_DECREF(object);
1236
0
            Py_DECREF(encode);
1237
0
            return NULL;
1238
0
        }
1239
0
        code = get_standard_encoding(encoding, &bytelength);
1240
0
        Py_DECREF(encode);
1241
0
        if (code == ENC_UNKNOWN) {
1242
            /* Not supported, fail with original exception */
1243
0
            PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1244
0
            Py_DECREF(object);
1245
0
            return NULL;
1246
0
        }
1247
1248
        /* Try decoding a single surrogate character. If
1249
           there are more, let the codec call us again. */
1250
0
        p += start;
1251
0
        if (PyBytes_GET_SIZE(object) - start >= bytelength) {
1252
0
            switch (code) {
1253
0
            case ENC_UTF8:
1254
0
                if ((p[0] & 0xf0) == 0xe0 &&
1255
0
                    (p[1] & 0xc0) == 0x80 &&
1256
0
                    (p[2] & 0xc0) == 0x80) {
1257
                    /* it's a three-byte code */
1258
0
                    ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
1259
0
                }
1260
0
                break;
1261
0
            case ENC_UTF16LE:
1262
0
                ch = p[1] << 8 | p[0];
1263
0
                break;
1264
0
            case ENC_UTF16BE:
1265
0
                ch = p[0] << 8 | p[1];
1266
0
                break;
1267
0
            case ENC_UTF32LE:
1268
0
                ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
1269
0
                break;
1270
0
            case ENC_UTF32BE:
1271
0
                ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
1272
0
                break;
1273
0
            }
1274
0
        }
1275
1276
0
        Py_DECREF(object);
1277
0
        if (!Py_UNICODE_IS_SURROGATE(ch)) {
1278
            /* it's not a surrogate - fail */
1279
0
            PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1280
0
            return NULL;
1281
0
        }
1282
0
        res = PyUnicode_FromOrdinal(ch);
1283
0
        if (res == NULL)
1284
0
            return NULL;
1285
0
        return Py_BuildValue("(Nn)", res, start + bytelength);
1286
0
    }
1287
0
    else {
1288
0
        wrong_exception_type(exc);
1289
0
        return NULL;
1290
0
    }
1291
0
}
1292
1293
/* Error handler implementing the 'surrogateescape' scheme.

   UnicodeEncodeError: every character in [start, end) must lie in
   U+DC80..U+DCFF and is turned back into the single byte (ch - 0xDC00).
   Returns a new (bytes, end) tuple.

   UnicodeDecodeError: up to four non-ASCII bytes starting at 'start' are
   mapped to 0xDC00 + byte.  Returns a new (str, start + consumed) tuple.

   When nothing can be escaped the original exception is raised again and
   NULL is returned.  Any other exception type is reported through
   wrong_exception_type(). */
static PyObject *
PyCodec_SurrogateEscapeErrors(PyObject *exc)
{
    PyObject *input;
    Py_ssize_t first;
    Py_ssize_t stop;

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        PyObject *bytes;
        PyObject *result;
        Py_ssize_t pos;
        char *dst;

        if (PyUnicodeEncodeError_GetStart(exc, &first))
            return NULL;
        if (PyUnicodeEncodeError_GetEnd(exc, &stop))
            return NULL;
        input = PyUnicodeEncodeError_GetObject(exc);
        if (input == NULL)
            return NULL;

        /* One output byte per escaped character. */
        bytes = PyBytes_FromStringAndSize(NULL, stop - first);
        if (bytes == NULL) {
            Py_DECREF(input);
            return NULL;
        }
        dst = PyBytes_AsString(bytes);
        for (pos = first; pos < stop; pos++) {
            /* object is guaranteed to be "ready" */
            Py_UCS4 ch = PyUnicode_READ_CHAR(input, pos);
            if (!(0xdc80 <= ch && ch <= 0xdcff)) {
                /* Not a UTF-8b surrogate, fail with original exception */
                PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
                Py_DECREF(bytes);
                Py_DECREF(input);
                return NULL;
            }
            *dst++ = ch - 0xdc00;
        }
        result = Py_BuildValue("(On)", bytes, stop);
        Py_DECREF(bytes);
        Py_DECREF(input);
        return result;
    }

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
        const unsigned char *src;
        Py_UCS2 escaped[4]; /* decode up to 4 bad bytes. */
        PyObject *text;
        int consumed = 0;

        if (PyUnicodeDecodeError_GetStart(exc, &first))
            return NULL;
        if (PyUnicodeDecodeError_GetEnd(exc, &stop))
            return NULL;
        input = PyUnicodeDecodeError_GetObject(exc);
        if (input == NULL)
            return NULL;

        src = (const unsigned char*)PyBytes_AS_STRING(input);
        while (consumed < 4 && consumed < stop - first) {
            unsigned char byte = src[first + consumed];
            /* Refuse to escape ASCII bytes. */
            if (byte < 128)
                break;
            escaped[consumed] = 0xdc00 + byte;
            consumed++;
        }
        Py_DECREF(input);

        if (consumed == 0) {
            /* codec complained about ASCII byte. */
            PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
            return NULL;
        }
        text = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, escaped, consumed);
        if (text == NULL)
            return NULL;
        return Py_BuildValue("(Nn)", text, first + consumed);
    }

    wrong_exception_type(exc);
    return NULL;
}
1369
1370
1371
/* METH_O trampoline registered under the name "strict": hands the
   exception object to PyCodec_StrictErrors().  'self' is unused. */
static PyObject *
strict_errors(PyObject *self, PyObject *exc)
{
    PyObject *result = PyCodec_StrictErrors(exc);
    return result;
}
1375
1376
1377
/* METH_O trampoline registered under the name "ignore": hands the
   exception object to PyCodec_IgnoreErrors().  'self' is unused. */
static PyObject *
ignore_errors(PyObject *self, PyObject *exc)
{
    PyObject *result = PyCodec_IgnoreErrors(exc);
    return result;
}
1381
1382
1383
/* METH_O trampoline registered under the name "replace": hands the
   exception object to PyCodec_ReplaceErrors().  'self' is unused. */
static PyObject *
replace_errors(PyObject *self, PyObject *exc)
{
    PyObject *result = PyCodec_ReplaceErrors(exc);
    return result;
}
1387
1388
1389
/* METH_O trampoline registered under the name "xmlcharrefreplace": hands
   the exception object to PyCodec_XMLCharRefReplaceErrors().  'self' is
   unused. */
static PyObject *
xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
{
    PyObject *result = PyCodec_XMLCharRefReplaceErrors(exc);
    return result;
}
1393
1394
1395
/* METH_O trampoline registered under the name "backslashreplace": hands
   the exception object to PyCodec_BackslashReplaceErrors().  'self' is
   unused. */
static PyObject *
backslashreplace_errors(PyObject *self, PyObject *exc)
{
    PyObject *result = PyCodec_BackslashReplaceErrors(exc);
    return result;
}
1399
1400
/* METH_O trampoline registered under the name "namereplace": hands the
   exception object to PyCodec_NameReplaceErrors().  'self' is unused. */
static PyObject *
namereplace_errors(PyObject *self, PyObject *exc)
{
    PyObject *result = PyCodec_NameReplaceErrors(exc);
    return result;
}
1404
1405
/* METH_O trampoline registered under the name "surrogatepass": hands the
   exception object to PyCodec_SurrogatePassErrors().  'self' is unused. */
static PyObject *
surrogatepass_errors(PyObject *self, PyObject *exc)
{
    PyObject *result = PyCodec_SurrogatePassErrors(exc);
    return result;
}
1409
1410
/* METH_O trampoline registered under the name "surrogateescape": hands the
   exception object to PyCodec_SurrogateEscapeErrors().  'self' is unused. */
static PyObject *
surrogateescape_errors(PyObject *self, PyObject *exc)
{
    PyObject *result = PyCodec_SurrogateEscapeErrors(exc);
    return result;
}
1414
1415
static int _PyCodecRegistry_Init(void)
1416
13
{
1417
13
    static struct {
1418
13
        const char *name;
1419
13
        PyMethodDef def;
1420
13
    } methods[] =
1421
13
    {
1422
13
        {
1423
13
            "strict",
1424
13
            {
1425
13
                "strict_errors",
1426
13
                strict_errors,
1427
13
                METH_O,
1428
13
                PyDoc_STR("Implements the 'strict' error handling, which "
1429
13
                          "raises a UnicodeError on coding errors.")
1430
13
            }
1431
13
        },
1432
13
        {
1433
13
            "ignore",
1434
13
            {
1435
13
                "ignore_errors",
1436
13
                ignore_errors,
1437
13
                METH_O,
1438
13
                PyDoc_STR("Implements the 'ignore' error handling, which "
1439
13
                          "ignores malformed data and continues.")
1440
13
            }
1441
13
        },
1442
13
        {
1443
13
            "replace",
1444
13
            {
1445
13
                "replace_errors",
1446
13
                replace_errors,
1447
13
                METH_O,
1448
13
                PyDoc_STR("Implements the 'replace' error handling, which "
1449
13
                          "replaces malformed data with a replacement marker.")
1450
13
            }
1451
13
        },
1452
13
        {
1453
13
            "xmlcharrefreplace",
1454
13
            {
1455
13
                "xmlcharrefreplace_errors",
1456
13
                xmlcharrefreplace_errors,
1457
13
                METH_O,
1458
13
                PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
1459
13
                          "which replaces an unencodable character with the "
1460
13
                          "appropriate XML character reference.")
1461
13
            }
1462
13
        },
1463
13
        {
1464
13
            "backslashreplace",
1465
13
            {
1466
13
                "backslashreplace_errors",
1467
13
                backslashreplace_errors,
1468
13
                METH_O,
1469
13
                PyDoc_STR("Implements the 'backslashreplace' error handling, "
1470
13
                          "which replaces malformed data with a backslashed "
1471
13
                          "escape sequence.")
1472
13
            }
1473
13
        },
1474
13
        {
1475
13
            "namereplace",
1476
13
            {
1477
13
                "namereplace_errors",
1478
13
                namereplace_errors,
1479
13
                METH_O,
1480
13
                PyDoc_STR("Implements the 'namereplace' error handling, "
1481
13
                          "which replaces an unencodable character with a "
1482
13
                          "\\N{...} escape sequence.")
1483
13
            }
1484
13
        },
1485
13
        {
1486
13
            "surrogatepass",
1487
13
            {
1488
13
                "surrogatepass",
1489
13
                surrogatepass_errors,
1490
13
                METH_O
1491
13
            }
1492
13
        },
1493
13
        {
1494
13
            "surrogateescape",
1495
13
            {
1496
13
                "surrogateescape",
1497
13
                surrogateescape_errors,
1498
13
                METH_O
1499
13
            }
1500
13
        }
1501
13
    };
1502
1503
13
    PyInterpreterState *interp = _PyInterpreterState_Get();
1504
13
    PyObject *mod;
1505
13
    unsigned i;
1506
1507
13
    if (interp->codec_search_path != NULL)
1508
0
        return 0;
1509
1510
13
    interp->codec_search_path = PyList_New(0);
1511
13
    interp->codec_search_cache = PyDict_New();
1512
13
    interp->codec_error_registry = PyDict_New();
1513
1514
13
    if (interp->codec_error_registry) {
1515
117
        for (i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
1516
104
            PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL);
1517
104
            int res;
1518
104
            if (!func)
1519
0
                Py_FatalError("can't initialize codec error registry");
1520
104
            res = PyCodec_RegisterError(methods[i].name, func);
1521
104
            Py_DECREF(func);
1522
104
            if (res)
1523
0
                Py_FatalError("can't initialize codec error registry");
1524
104
        }
1525
13
    }
1526
1527
13
    if (interp->codec_search_path == NULL ||
1528
13
        interp->codec_search_cache == NULL ||
1529
13
        interp->codec_error_registry == NULL)
1530
0
        Py_FatalError("can't initialize codec registry");
1531
1532
13
    mod = PyImport_ImportModuleNoBlock("encodings");
1533
13
    if (mod == NULL) {
1534
0
        return -1;
1535
0
    }
1536
13
    Py_DECREF(mod);
1537
13
    interp->codecs_initialized = 1;
1538
13
    return 0;
1539
13
}