Coverage Report

Created: 2025-07-11 06:59

/src/Python-3.8.3/Python/codecs.c
Line
Count
Source (jump to first uncovered line)
1
/* ------------------------------------------------------------------------
2
3
   Python Codec Registry and support functions
4
5
Written by Marc-Andre Lemburg (mal@lemburg.com).
6
7
Copyright (c) Corporation for National Research Initiatives.
8
9
   ------------------------------------------------------------------------ */
10
11
#include "Python.h"
12
#include "pycore_pystate.h"
13
#include "ucnhash.h"
14
#include <ctype.h>
15
16
const char *Py_hexdigits = "0123456789abcdef";
17
18
/* --- Codec Registry ----------------------------------------------------- */
19
20
/* Import the standard encodings package which will register the first
21
   codec search function.
22
23
   This is done in a lazy way so that the Unicode implementation does
24
   not downgrade startup time of scripts not needing it.
25
26
   ImportErrors are silently ignored by this function. Only one try is
27
   made.
28
29
*/
30
31
static int _PyCodecRegistry_Init(void); /* Forward */
32
33
/* Append search_function to the interpreter's list of codec search
   functions, initializing the codec registry first when it has not been
   set up yet.

   Returns -1 if the registry initialization fails, if search_function is
   NULL (PyErr_BadArgument) or if it is not callable (TypeError);
   otherwise returns the result of PyList_Append(). */
int PyCodec_Register(PyObject *search_function)
{
    PyInterpreterState *interp = _PyInterpreterState_Get();

    if (interp->codec_search_path == NULL && _PyCodecRegistry_Init() != 0) {
        return -1;
    }
    if (search_function == NULL) {
        PyErr_BadArgument();
        return -1;
    }
    if (!PyCallable_Check(search_function)) {
        PyErr_SetString(PyExc_TypeError, "argument must be callable");
        return -1;
    }
    return PyList_Append(interp->codec_search_path, search_function);
}
51
52
/* Convert a string to a normalized Python string: all characters are
53
   converted to lower case, spaces are replaced with underscores. */
54
55
static
56
PyObject *normalizestring(const char *string)
57
85
{
58
85
    size_t i;
59
85
    size_t len = strlen(string);
60
85
    char *p;
61
85
    PyObject *v;
62
63
85
    if (len > PY_SSIZE_T_MAX) {
64
0
        PyErr_SetString(PyExc_OverflowError, "string is too large");
65
0
        return NULL;
66
0
    }
67
68
85
    p = PyMem_Malloc(len + 1);
69
85
    if (p == NULL)
70
0
        return PyErr_NoMemory();
71
762
    for (i = 0; i < len; i++) {
72
677
        char ch = string[i];
73
677
        if (ch == ' ')
74
0
            ch = '-';
75
677
        else
76
677
            ch = Py_TOLOWER(Py_CHARMASK(ch));
77
677
        p[i] = ch;
78
677
    }
79
85
    p[i] = '\0';
80
85
    v = PyUnicode_FromString(p);
81
85
    PyMem_Free(p);
82
85
    return v;
83
85
}
84
85
/* Lookup the given encoding and return a tuple providing the codec
86
   facilities.
87
88
   The encoding string is looked up converted to all lower-case
89
   characters. This makes encodings looked up through this mechanism
90
   effectively case-insensitive.
91
92
   If no codec is found, a LookupError is set and NULL returned.
93
94
   As side effect, this tries to load the encodings package, if not
95
   yet done. This is part of the lazy load strategy for the encodings
96
   package.
97
98
*/
99
100
PyObject *_PyCodec_Lookup(const char *encoding)
101
85
{
102
85
    PyObject *result, *args = NULL, *v;
103
85
    Py_ssize_t i, len;
104
105
85
    if (encoding == NULL) {
106
0
        PyErr_BadArgument();
107
0
        goto onError;
108
0
    }
109
110
85
    PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
111
85
    if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
112
0
        goto onError;
113
114
    /* Convert the encoding to a normalized Python string: all
115
       characters are converted to lower case, spaces and hyphens are
116
       replaced with underscores. */
117
85
    v = normalizestring(encoding);
118
85
    if (v == NULL)
119
0
        goto onError;
120
85
    PyUnicode_InternInPlace(&v);
121
122
    /* First, try to lookup the name in the registry dictionary */
123
85
    result = PyDict_GetItemWithError(interp->codec_search_cache, v);
124
85
    if (result != NULL) {
125
56
        Py_INCREF(result);
126
56
        Py_DECREF(v);
127
56
        return result;
128
56
    }
129
29
    else if (PyErr_Occurred()) {
130
0
        Py_DECREF(v);
131
0
        return NULL;
132
0
    }
133
134
    /* Next, scan the search functions in order of registration */
135
29
    args = PyTuple_New(1);
136
29
    if (args == NULL) {
137
0
        Py_DECREF(v);
138
0
        return NULL;
139
0
    }
140
29
    PyTuple_SET_ITEM(args,0,v);
141
142
29
    len = PyList_Size(interp->codec_search_path);
143
29
    if (len < 0)
144
0
        goto onError;
145
29
    if (len == 0) {
146
0
        PyErr_SetString(PyExc_LookupError,
147
0
                        "no codec search functions registered: "
148
0
                        "can't find encoding");
149
0
        goto onError;
150
0
    }
151
152
29
    for (i = 0; i < len; i++) {
153
29
        PyObject *func;
154
155
29
        func = PyList_GetItem(interp->codec_search_path, i);
156
29
        if (func == NULL)
157
0
            goto onError;
158
29
        result = PyEval_CallObject(func, args);
159
29
        if (result == NULL)
160
0
            goto onError;
161
29
        if (result == Py_None) {
162
0
            Py_DECREF(result);
163
0
            continue;
164
0
        }
165
29
        if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
166
0
            PyErr_SetString(PyExc_TypeError,
167
0
                            "codec search functions must return 4-tuples");
168
0
            Py_DECREF(result);
169
0
            goto onError;
170
0
        }
171
29
        break;
172
29
    }
173
29
    if (i == len) {
174
        /* XXX Perhaps we should cache misses too ? */
175
0
        PyErr_Format(PyExc_LookupError,
176
0
                     "unknown encoding: %s", encoding);
177
0
        goto onError;
178
0
    }
179
180
    /* Cache and return the result */
181
29
    if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) {
182
0
        Py_DECREF(result);
183
0
        goto onError;
184
0
    }
185
29
    Py_DECREF(args);
186
29
    return result;
187
188
0
 onError:
189
0
    Py_XDECREF(args);
190
0
    return NULL;
191
29
}
192
193
int _PyCodec_Forget(const char *encoding)
194
0
{
195
0
    PyObject *v;
196
0
    int result;
197
198
0
    PyInterpreterState *interp = _PyInterpreterState_Get();
199
0
    if (interp->codec_search_path == NULL) {
200
0
        return -1;
201
0
    }
202
203
    /* Convert the encoding to a normalized Python string: all
204
       characters are converted to lower case, spaces and hyphens are
205
       replaced with underscores. */
206
0
    v = normalizestring(encoding);
207
0
    if (v == NULL) {
208
0
        return -1;
209
0
    }
210
211
    /* Drop the named codec from the internal cache */
212
0
    result = PyDict_DelItem(interp->codec_search_cache, v);
213
0
    Py_DECREF(v);
214
215
0
    return result;
216
0
}
217
218
/* Codec registry encoding check API. */
219
220
int PyCodec_KnownEncoding(const char *encoding)
221
0
{
222
0
    PyObject *codecs;
223
224
0
    codecs = _PyCodec_Lookup(encoding);
225
0
    if (!codecs) {
226
0
        PyErr_Clear();
227
0
        return 0;
228
0
    }
229
0
    else {
230
0
        Py_DECREF(codecs);
231
0
        return 1;
232
0
    }
233
0
}
234
235
static
236
PyObject *args_tuple(PyObject *object,
237
                     const char *errors)
238
0
{
239
0
    PyObject *args;
240
241
0
    args = PyTuple_New(1 + (errors != NULL));
242
0
    if (args == NULL)
243
0
        return NULL;
244
0
    Py_INCREF(object);
245
0
    PyTuple_SET_ITEM(args,0,object);
246
0
    if (errors) {
247
0
        PyObject *v;
248
249
0
        v = PyUnicode_FromString(errors);
250
0
        if (v == NULL) {
251
0
            Py_DECREF(args);
252
0
            return NULL;
253
0
        }
254
0
        PyTuple_SET_ITEM(args, 1, v);
255
0
    }
256
0
    return args;
257
0
}
258
259
/* Helper function to get a codec item */
260
261
static
262
PyObject *codec_getitem(const char *encoding, int index)
263
0
{
264
0
    PyObject *codecs;
265
0
    PyObject *v;
266
267
0
    codecs = _PyCodec_Lookup(encoding);
268
0
    if (codecs == NULL)
269
0
        return NULL;
270
0
    v = PyTuple_GET_ITEM(codecs, index);
271
0
    Py_DECREF(codecs);
272
0
    Py_INCREF(v);
273
0
    return v;
274
0
}
275
276
/* Helper functions to create an incremental codec. */
277
static
278
PyObject *codec_makeincrementalcodec(PyObject *codec_info,
279
                                     const char *errors,
280
                                     const char *attrname)
281
43
{
282
43
    PyObject *ret, *inccodec;
283
284
43
    inccodec = PyObject_GetAttrString(codec_info, attrname);
285
43
    if (inccodec == NULL)
286
0
        return NULL;
287
43
    if (errors)
288
43
        ret = PyObject_CallFunction(inccodec, "s", errors);
289
0
    else
290
0
        ret = _PyObject_CallNoArg(inccodec);
291
43
    Py_DECREF(inccodec);
292
43
    return ret;
293
43
}
294
295
static
296
PyObject *codec_getincrementalcodec(const char *encoding,
297
                                    const char *errors,
298
                                    const char *attrname)
299
0
{
300
0
    PyObject *codec_info, *ret;
301
302
0
    codec_info = _PyCodec_Lookup(encoding);
303
0
    if (codec_info == NULL)
304
0
        return NULL;
305
0
    ret = codec_makeincrementalcodec(codec_info, errors, attrname);
306
0
    Py_DECREF(codec_info);
307
0
    return ret;
308
0
}
309
310
/* Helper function to create a stream codec. */
311
312
static
313
PyObject *codec_getstreamcodec(const char *encoding,
314
                               PyObject *stream,
315
                               const char *errors,
316
                               const int index)
317
0
{
318
0
    PyObject *codecs, *streamcodec, *codeccls;
319
320
0
    codecs = _PyCodec_Lookup(encoding);
321
0
    if (codecs == NULL)
322
0
        return NULL;
323
324
0
    codeccls = PyTuple_GET_ITEM(codecs, index);
325
0
    if (errors != NULL)
326
0
        streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
327
0
    else
328
0
        streamcodec = PyObject_CallFunctionObjArgs(codeccls, stream, NULL);
329
0
    Py_DECREF(codecs);
330
0
    return streamcodec;
331
0
}
332
333
/* Helpers to work with the result of _PyCodec_Lookup
334
335
 */
336
/* Create an incremental decoder by calling the "incrementaldecoder"
   attribute of codec_info; errors is forwarded when non-NULL. */
PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info,
                                             const char *errors)
{
    static const char attrname[] = "incrementaldecoder";

    return codec_makeincrementalcodec(codec_info, errors, attrname);
}
342
343
/* Create an incremental encoder by calling the "incrementalencoder"
   attribute of codec_info; errors is forwarded when non-NULL. */
PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info,
                                             const char *errors)
{
    static const char attrname[] = "incrementalencoder";

    return codec_makeincrementalcodec(codec_info, errors, attrname);
}
349
350
351
/* Convenience APIs to query the Codec registry.
352
353
   All APIs return a codec object with incremented refcount.
354
355
 */
356
357
/* Return a new reference to element 0 of the codec tuple registered for
   encoding, or NULL with an exception set. */
PyObject *PyCodec_Encoder(const char *encoding)
{
    PyObject *encoder = codec_getitem(encoding, 0);

    return encoder;
}
361
362
/* Return a new reference to element 1 of the codec tuple registered for
   encoding, or NULL with an exception set. */
PyObject *PyCodec_Decoder(const char *encoding)
{
    PyObject *decoder = codec_getitem(encoding, 1);

    return decoder;
}
366
367
/* Look up encoding and create an incremental encoder from the codec's
   "incrementalencoder" attribute; errors is forwarded when non-NULL. */
PyObject *PyCodec_IncrementalEncoder(const char *encoding,
                                     const char *errors)
{
    static const char attrname[] = "incrementalencoder";

    return codec_getincrementalcodec(encoding, errors, attrname);
}
372
373
/* Look up encoding and create an incremental decoder from the codec's
   "incrementaldecoder" attribute; errors is forwarded when non-NULL. */
PyObject *PyCodec_IncrementalDecoder(const char *encoding,
                                     const char *errors)
{
    static const char attrname[] = "incrementaldecoder";

    return codec_getincrementalcodec(encoding, errors, attrname);
}
378
379
/* Create a stream reader for stream by calling element 2 of the codec
   tuple registered for encoding; errors is forwarded when non-NULL. */
PyObject *PyCodec_StreamReader(const char *encoding,
                               PyObject *stream,
                               const char *errors)
{
    const int reader_index = 2;

    return codec_getstreamcodec(encoding, stream, errors, reader_index);
}
385
386
/* Create a stream writer for stream by calling element 3 of the codec
   tuple registered for encoding; errors is forwarded when non-NULL. */
PyObject *PyCodec_StreamWriter(const char *encoding,
                               PyObject *stream,
                               const char *errors)
{
    const int writer_index = 3;

    return codec_getstreamcodec(encoding, stream, errors, writer_index);
}
392
393
/* Helper that tries to ensure the reported exception chain indicates the
 * codec that was invoked to trigger the failure without changing the type
 * of the exception raised.
 *
 * operation and encoding are substituted into the message
 * "<operation> with '<encoding>' codec failed".
 */
static void
wrap_codec_error(const char *operation,
                 const char *encoding)
{
    /* _PyErr_TrySetFromCause swaps the active exception for a suitably
     * updated clone when it is able to; when it is not, the original
     * exception stays in place untouched.
     */
    _PyErr_TrySetFromCause("%s with '%s' codec failed", operation, encoding);
}
408
409
/* Encode an object (e.g. a Unicode object) using the given encoding
410
   and return the resulting encoded object (usually a Python string).
411
412
   errors is passed to the encoder factory as argument if non-NULL. */
413
414
static PyObject *
415
_PyCodec_EncodeInternal(PyObject *object,
416
                        PyObject *encoder,
417
                        const char *encoding,
418
                        const char *errors)
419
0
{
420
0
    PyObject *args = NULL, *result = NULL;
421
0
    PyObject *v = NULL;
422
423
0
    args = args_tuple(object, errors);
424
0
    if (args == NULL)
425
0
        goto onError;
426
427
0
    result = PyEval_CallObject(encoder, args);
428
0
    if (result == NULL) {
429
0
        wrap_codec_error("encoding", encoding);
430
0
        goto onError;
431
0
    }
432
433
0
    if (!PyTuple_Check(result) ||
434
0
        PyTuple_GET_SIZE(result) != 2) {
435
0
        PyErr_SetString(PyExc_TypeError,
436
0
                        "encoder must return a tuple (object, integer)");
437
0
        goto onError;
438
0
    }
439
0
    v = PyTuple_GET_ITEM(result,0);
440
0
    Py_INCREF(v);
441
    /* We don't check or use the second (integer) entry. */
442
443
0
    Py_DECREF(args);
444
0
    Py_DECREF(encoder);
445
0
    Py_DECREF(result);
446
0
    return v;
447
448
0
 onError:
449
0
    Py_XDECREF(result);
450
0
    Py_XDECREF(args);
451
0
    Py_XDECREF(encoder);
452
0
    return NULL;
453
0
}
454
455
/* Decode an object (usually a Python string) using the given encoding
456
   and return an equivalent object (e.g. a Unicode object).
457
458
   errors is passed to the decoder factory as argument if non-NULL. */
459
460
static PyObject *
461
_PyCodec_DecodeInternal(PyObject *object,
462
                        PyObject *decoder,
463
                        const char *encoding,
464
                        const char *errors)
465
0
{
466
0
    PyObject *args = NULL, *result = NULL;
467
0
    PyObject *v;
468
469
0
    args = args_tuple(object, errors);
470
0
    if (args == NULL)
471
0
        goto onError;
472
473
0
    result = PyEval_CallObject(decoder,args);
474
0
    if (result == NULL) {
475
0
        wrap_codec_error("decoding", encoding);
476
0
        goto onError;
477
0
    }
478
0
    if (!PyTuple_Check(result) ||
479
0
        PyTuple_GET_SIZE(result) != 2) {
480
0
        PyErr_SetString(PyExc_TypeError,
481
0
                        "decoder must return a tuple (object,integer)");
482
0
        goto onError;
483
0
    }
484
0
    v = PyTuple_GET_ITEM(result,0);
485
0
    Py_INCREF(v);
486
    /* We don't check or use the second (integer) entry. */
487
488
0
    Py_DECREF(args);
489
0
    Py_DECREF(decoder);
490
0
    Py_DECREF(result);
491
0
    return v;
492
493
0
 onError:
494
0
    Py_XDECREF(args);
495
0
    Py_XDECREF(decoder);
496
0
    Py_XDECREF(result);
497
0
    return NULL;
498
0
}
499
500
/* Generic encoding/decoding API */
501
/* Encode object with the encoder registered for encoding.  errors is
   forwarded to the encoder when non-NULL.

   Returns the encoded object, or NULL with an exception set. */
PyObject *PyCodec_Encode(PyObject *object,
                         const char *encoding,
                         const char *errors)
{
    PyObject *encoder = PyCodec_Encoder(encoding);

    if (encoder != NULL) {
        /* _PyCodec_EncodeInternal releases the encoder reference. */
        return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
    }
    return NULL;
}
513
514
/* Decode object with the decoder registered for encoding.  errors is
   forwarded to the decoder when non-NULL.

   Returns the decoded object, or NULL with an exception set. */
PyObject *PyCodec_Decode(PyObject *object,
                         const char *encoding,
                         const char *errors)
{
    PyObject *decoder = PyCodec_Decoder(encoding);

    if (decoder != NULL) {
        /* _PyCodec_DecodeInternal releases the decoder reference. */
        return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
    }
    return NULL;
}
526
527
/* Text encoding/decoding API */
528
PyObject * _PyCodec_LookupTextEncoding(const char *encoding,
529
                                       const char *alternate_command)
530
43
{
531
43
    _Py_IDENTIFIER(_is_text_encoding);
532
43
    PyObject *codec;
533
43
    PyObject *attr;
534
43
    int is_text_codec;
535
536
43
    codec = _PyCodec_Lookup(encoding);
537
43
    if (codec == NULL)
538
0
        return NULL;
539
540
    /* Backwards compatibility: assume any raw tuple describes a text
541
     * encoding, and the same for anything lacking the private
542
     * attribute.
543
     */
544
43
    if (!PyTuple_CheckExact(codec)) {
545
43
        if (_PyObject_LookupAttrId(codec, &PyId__is_text_encoding, &attr) < 0) {
546
0
            Py_DECREF(codec);
547
0
            return NULL;
548
0
        }
549
43
        if (attr != NULL) {
550
43
            is_text_codec = PyObject_IsTrue(attr);
551
43
            Py_DECREF(attr);
552
43
            if (is_text_codec <= 0) {
553
0
                Py_DECREF(codec);
554
0
                if (!is_text_codec)
555
0
                    PyErr_Format(PyExc_LookupError,
556
0
                                 "'%.400s' is not a text encoding; "
557
0
                                 "use %s to handle arbitrary codecs",
558
0
                                 encoding, alternate_command);
559
0
                return NULL;
560
0
            }
561
43
        }
562
43
    }
563
564
    /* This appears to be a valid text encoding */
565
43
    return codec;
566
43
}
567
568
569
static
570
PyObject *codec_getitem_checked(const char *encoding,
571
                                const char *alternate_command,
572
                                int index)
573
0
{
574
0
    PyObject *codec;
575
0
    PyObject *v;
576
577
0
    codec = _PyCodec_LookupTextEncoding(encoding, alternate_command);
578
0
    if (codec == NULL)
579
0
        return NULL;
580
581
0
    v = PyTuple_GET_ITEM(codec, index);
582
0
    Py_INCREF(v);
583
0
    Py_DECREF(codec);
584
0
    return v;
585
0
}
586
587
/* Return element 0 of the text codec registered for encoding; a
   rejection message points the user at codecs.encode(). */
static PyObject * _PyCodec_TextEncoder(const char *encoding)
{
    static const char alternate[] = "codecs.encode()";

    return codec_getitem_checked(encoding, alternate, 0);
}
591
592
/* Return element 1 of the text codec registered for encoding; a
   rejection message points the user at codecs.decode(). */
static PyObject * _PyCodec_TextDecoder(const char *encoding)
{
    static const char alternate[] = "codecs.decode()";

    return codec_getitem_checked(encoding, alternate, 1);
}
596
597
/* Encode object with the text encoder registered for encoding; codecs
   flagged as non-text are rejected by the lookup.  errors is forwarded
   to the encoder when non-NULL.

   Returns the encoded object, or NULL with an exception set. */
PyObject *_PyCodec_EncodeText(PyObject *object,
                              const char *encoding,
                              const char *errors)
{
    PyObject *encoder = _PyCodec_TextEncoder(encoding);

    if (encoder != NULL) {
        /* _PyCodec_EncodeInternal releases the encoder reference. */
        return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
    }
    return NULL;
}
609
610
/* Decode object with the text decoder registered for encoding; codecs
   flagged as non-text are rejected by the lookup.  errors is forwarded
   to the decoder when non-NULL.

   Returns the decoded object, or NULL with an exception set. */
PyObject *_PyCodec_DecodeText(PyObject *object,
                              const char *encoding,
                              const char *errors)
{
    PyObject *decoder = _PyCodec_TextDecoder(encoding);

    if (decoder != NULL) {
        /* _PyCodec_DecodeInternal releases the decoder reference. */
        return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
    }
    return NULL;
}
622
623
/* Register the error handling callback function error under the name
624
   name. This function will be called by the codec when it encounters
625
   an unencodable characters/undecodable bytes and doesn't know the
626
   callback name, when name is specified as the error parameter
627
   in the call to the encode/decode function.
628
   Return 0 on success, -1 on error */
629
int PyCodec_RegisterError(const char *name, PyObject *error)
630
112
{
631
112
    PyInterpreterState *interp = _PyInterpreterState_Get();
632
112
    if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
633
0
        return -1;
634
112
    if (!PyCallable_Check(error)) {
635
0
        PyErr_SetString(PyExc_TypeError, "handler must be callable");
636
0
        return -1;
637
0
    }
638
112
    return PyDict_SetItemString(interp->codec_error_registry,
639
112
                                name, error);
640
112
}
641
642
/* Lookup the error handling callback function registered under the
643
   name error. As a special case NULL can be passed, in which case
644
   the error handling callback for strict encoding will be returned. */
645
PyObject *PyCodec_LookupError(const char *name)
646
84
{
647
84
    PyObject *handler = NULL;
648
649
84
    PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
650
84
    if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
651
0
        return NULL;
652
653
84
    if (name==NULL)
654
0
        name = "strict";
655
84
    handler = _PyDict_GetItemStringWithError(interp->codec_error_registry, name);
656
84
    if (handler) {
657
84
        Py_INCREF(handler);
658
84
    }
659
0
    else if (!PyErr_Occurred()) {
660
0
        PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
661
0
    }
662
84
    return handler;
663
84
}
664
665
/* Set a TypeError reporting that an error callback was handed an
   exception object of a type it cannot handle; the message names the
   type of exc. */
static void wrong_exception_type(PyObject *exc)
{
    const char *type_name = Py_TYPE(exc)->tp_name;

    PyErr_Format(PyExc_TypeError,
                 "don't know how to handle %.200s in error callback",
                 type_name);
}
671
672
/* "strict" error callback: re-raise exc as the current exception.
   A TypeError is set instead when exc is not an exception instance.
   Always returns NULL. */
PyObject *PyCodec_StrictErrors(PyObject *exc)
{
    if (!PyExceptionInstance_Check(exc)) {
        PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
        return NULL;
    }
    PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
    return NULL;
}
680
681
682
/* "ignore" error callback: return the tuple (empty string, end), where
   end is read from the Unicode encode, decode or translate error exc.
   Any other exception type results in a TypeError and a NULL return. */
PyObject *PyCodec_IgnoreErrors(PyObject *exc)
{
    Py_ssize_t end;
    int failed;

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        failed = PyUnicodeEncodeError_GetEnd(exc, &end);
    }
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
        failed = PyUnicodeDecodeError_GetEnd(exc, &end);
    }
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
        failed = PyUnicodeTranslateError_GetEnd(exc, &end);
    }
    else {
        wrong_exception_type(exc);
        return NULL;
    }

    if (failed)
        return NULL;
    return Py_BuildValue("(Nn)", PyUnicode_New(0, 0), end);
}
704
705
706
PyObject *PyCodec_ReplaceErrors(PyObject *exc)
707
0
{
708
0
    Py_ssize_t start, end, i, len;
709
710
0
    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
711
0
        PyObject *res;
712
0
        int kind;
713
0
        void *data;
714
0
        if (PyUnicodeEncodeError_GetStart(exc, &start))
715
0
            return NULL;
716
0
        if (PyUnicodeEncodeError_GetEnd(exc, &end))
717
0
            return NULL;
718
0
        len = end - start;
719
0
        res = PyUnicode_New(len, '?');
720
0
        if (res == NULL)
721
0
            return NULL;
722
0
        kind = PyUnicode_KIND(res);
723
0
        data = PyUnicode_DATA(res);
724
0
        for (i = 0; i < len; ++i)
725
0
            PyUnicode_WRITE(kind, data, i, '?');
726
0
        assert(_PyUnicode_CheckConsistency(res, 1));
727
0
        return Py_BuildValue("(Nn)", res, end);
728
0
    }
729
0
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
730
0
        if (PyUnicodeDecodeError_GetEnd(exc, &end))
731
0
            return NULL;
732
0
        return Py_BuildValue("(Cn)",
733
0
                             (int)Py_UNICODE_REPLACEMENT_CHARACTER,
734
0
                             end);
735
0
    }
736
0
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
737
0
        PyObject *res;
738
0
        int kind;
739
0
        void *data;
740
0
        if (PyUnicodeTranslateError_GetStart(exc, &start))
741
0
            return NULL;
742
0
        if (PyUnicodeTranslateError_GetEnd(exc, &end))
743
0
            return NULL;
744
0
        len = end - start;
745
0
        res = PyUnicode_New(len, Py_UNICODE_REPLACEMENT_CHARACTER);
746
0
        if (res == NULL)
747
0
            return NULL;
748
0
        kind = PyUnicode_KIND(res);
749
0
        data = PyUnicode_DATA(res);
750
0
        for (i=0; i < len; i++)
751
0
            PyUnicode_WRITE(kind, data, i, Py_UNICODE_REPLACEMENT_CHARACTER);
752
0
        assert(_PyUnicode_CheckConsistency(res, 1));
753
0
        return Py_BuildValue("(Nn)", res, end);
754
0
    }
755
0
    else {
756
0
        wrong_exception_type(exc);
757
0
        return NULL;
758
0
    }
759
0
}
760
761
PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
762
0
{
763
0
    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
764
0
        PyObject *restuple;
765
0
        PyObject *object;
766
0
        Py_ssize_t i;
767
0
        Py_ssize_t start;
768
0
        Py_ssize_t end;
769
0
        PyObject *res;
770
0
        unsigned char *outp;
771
0
        Py_ssize_t ressize;
772
0
        Py_UCS4 ch;
773
0
        if (PyUnicodeEncodeError_GetStart(exc, &start))
774
0
            return NULL;
775
0
        if (PyUnicodeEncodeError_GetEnd(exc, &end))
776
0
            return NULL;
777
0
        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
778
0
            return NULL;
779
0
        if (end - start > PY_SSIZE_T_MAX / (2+7+1))
780
0
            end = start + PY_SSIZE_T_MAX / (2+7+1);
781
0
        for (i = start, ressize = 0; i < end; ++i) {
782
            /* object is guaranteed to be "ready" */
783
0
            ch = PyUnicode_READ_CHAR(object, i);
784
0
            if (ch<10)
785
0
                ressize += 2+1+1;
786
0
            else if (ch<100)
787
0
                ressize += 2+2+1;
788
0
            else if (ch<1000)
789
0
                ressize += 2+3+1;
790
0
            else if (ch<10000)
791
0
                ressize += 2+4+1;
792
0
            else if (ch<100000)
793
0
                ressize += 2+5+1;
794
0
            else if (ch<1000000)
795
0
                ressize += 2+6+1;
796
0
            else
797
0
                ressize += 2+7+1;
798
0
        }
799
        /* allocate replacement */
800
0
        res = PyUnicode_New(ressize, 127);
801
0
        if (res == NULL) {
802
0
            Py_DECREF(object);
803
0
            return NULL;
804
0
        }
805
0
        outp = PyUnicode_1BYTE_DATA(res);
806
        /* generate replacement */
807
0
        for (i = start; i < end; ++i) {
808
0
            int digits;
809
0
            int base;
810
0
            ch = PyUnicode_READ_CHAR(object, i);
811
0
            *outp++ = '&';
812
0
            *outp++ = '#';
813
0
            if (ch<10) {
814
0
                digits = 1;
815
0
                base = 1;
816
0
            }
817
0
            else if (ch<100) {
818
0
                digits = 2;
819
0
                base = 10;
820
0
            }
821
0
            else if (ch<1000) {
822
0
                digits = 3;
823
0
                base = 100;
824
0
            }
825
0
            else if (ch<10000) {
826
0
                digits = 4;
827
0
                base = 1000;
828
0
            }
829
0
            else if (ch<100000) {
830
0
                digits = 5;
831
0
                base = 10000;
832
0
            }
833
0
            else if (ch<1000000) {
834
0
                digits = 6;
835
0
                base = 100000;
836
0
            }
837
0
            else {
838
0
                digits = 7;
839
0
                base = 1000000;
840
0
            }
841
0
            while (digits-->0) {
842
0
                *outp++ = '0' + ch/base;
843
0
                ch %= base;
844
0
                base /= 10;
845
0
            }
846
0
            *outp++ = ';';
847
0
        }
848
0
        assert(_PyUnicode_CheckConsistency(res, 1));
849
0
        restuple = Py_BuildValue("(Nn)", res, end);
850
0
        Py_DECREF(object);
851
0
        return restuple;
852
0
    }
853
0
    else {
854
0
        wrong_exception_type(exc);
855
0
        return NULL;
856
0
    }
857
0
}
858
859
/* "backslashreplace" error callback.

   - UnicodeDecodeError: every byte in [start, end) of the error's bytes
     object is written as "\xNN".
   - UnicodeEncodeError / UnicodeTranslateError: every character in
     [start, end) of the error's string is written as "\xNN" (below
     0x100), "\uNNNN" (below 0x10000) or "\UNNNNNNNN".

   Returns (replacement, end).  The range is shortened when the
   replacement would not fit in Py_ssize_t, and the returned end reflects
   that.  Any other exception type results in a TypeError and a NULL
   return. */
PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
{
    PyObject *object;
    Py_ssize_t i;
    Py_ssize_t start;
    Py_ssize_t end;
    PyObject *res;
    unsigned char *outp;
    /* Must be Py_ssize_t: the range cap below allows totals far beyond
       INT_MAX. */
    Py_ssize_t ressize;
    Py_UCS4 c;

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
        const unsigned char *p;
        if (PyUnicodeDecodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeDecodeError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeDecodeError_GetObject(exc)))
            return NULL;
        /* Four output characters per byte: cap the range so that the
           multiplication below cannot overflow. */
        if (end - start > PY_SSIZE_T_MAX / 4)
            end = start + PY_SSIZE_T_MAX / 4;
        p = (const unsigned char*)PyBytes_AS_STRING(object);
        res = PyUnicode_New(4 * (end - start), 127);
        if (res == NULL) {
            Py_DECREF(object);
            return NULL;
        }
        outp = PyUnicode_1BYTE_DATA(res);
        for (i = start; i < end; i++, outp += 4) {
            unsigned char byte = p[i];
            outp[0] = '\\';
            outp[1] = 'x';
            outp[2] = Py_hexdigits[(byte>>4)&0xf];
            outp[3] = Py_hexdigits[byte&0xf];
        }

        assert(_PyUnicode_CheckConsistency(res, 1));
        Py_DECREF(object);
        return Py_BuildValue("(Nn)", res, end);
    }
    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        if (PyUnicodeEncodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeEncodeError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
            return NULL;
    }
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
        if (PyUnicodeTranslateError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeTranslateError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeTranslateError_GetObject(exc)))
            return NULL;
    }
    else {
        wrong_exception_type(exc);
        return NULL;
    }

    /* At most 1+1+8 output characters per input character. */
    if (end - start > PY_SSIZE_T_MAX / (1+1+8))
        end = start + PY_SSIZE_T_MAX / (1+1+8);

    /* First pass: compute the size of the replacement. */
    for (i = start, ressize = 0; i < end; ++i) {
        /* object is guaranteed to be "ready" */
        c = PyUnicode_READ_CHAR(object, i);
        if (c >= 0x10000) {
            ressize += 1+1+8;
        }
        else if (c >= 0x100) {
            ressize += 1+1+4;
        }
        else
            ressize += 1+1+2;
    }
    res = PyUnicode_New(ressize, 127);
    if (res == NULL) {
        Py_DECREF(object);
        return NULL;
    }
    outp = PyUnicode_1BYTE_DATA(res);

    /* Second pass: write the escapes.  The two lowest hex digits are
       common to all three forms and are emitted after the branch. */
    for (i = start; i < end; ++i) {
        c = PyUnicode_READ_CHAR(object, i);
        *outp++ = '\\';
        if (c >= 0x00010000) {
            *outp++ = 'U';
            *outp++ = Py_hexdigits[(c>>28)&0xf];
            *outp++ = Py_hexdigits[(c>>24)&0xf];
            *outp++ = Py_hexdigits[(c>>20)&0xf];
            *outp++ = Py_hexdigits[(c>>16)&0xf];
            *outp++ = Py_hexdigits[(c>>12)&0xf];
            *outp++ = Py_hexdigits[(c>>8)&0xf];
        }
        else if (c >= 0x100) {
            *outp++ = 'u';
            *outp++ = Py_hexdigits[(c>>12)&0xf];
            *outp++ = Py_hexdigits[(c>>8)&0xf];
        }
        else
            *outp++ = 'x';
        *outp++ = Py_hexdigits[(c>>4)&0xf];
        *outp++ = Py_hexdigits[c&0xf];
    }

    assert(_PyUnicode_CheckConsistency(res, 1));
    Py_DECREF(object);
    return Py_BuildValue("(Nn)", res, end);
}
965
966
/* Cached pointer to the unicodedata name-lookup C API.  It stays NULL
   until PyCodec_NameReplaceErrors() first needs it and loads it with
   PyCapsule_Import(). */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
967
968
/* Error handler implementing the "namereplace" policy.

   exc must be a UnicodeEncodeError instance.  Every character in the
   exception's [start, end) range is replaced by "\N{NAME}" when the
   unicodedata C API knows a name for it, and otherwise by a "\xHH",
   "\uHHHH" or "\UHHHHHHHH" escape depending on the code point value.

   Returns a new (replacement str, resume position) tuple, or NULL with an
   exception set.  If the replacement would grow beyond PY_SSIZE_T_MAX the
   range is cut short and the returned position reflects that.  For any
   other exception type wrong_exception_type() is called and NULL is
   returned. */
PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
{
    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        PyObject *restuple;
        PyObject *object;
        Py_ssize_t i;
        Py_ssize_t start;
        Py_ssize_t end;
        PyObject *res;
        unsigned char *outp;
        Py_ssize_t ressize;
        int replsize;
        Py_UCS4 c;
        char buffer[256]; /* NAME_MAXLEN */
        if (PyUnicodeEncodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeEncodeError_GetEnd(exc, &end))
            return NULL;
        /* From here on we own a reference to object; every exit path
           below has to release it. */
        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
            return NULL;
        if (!ucnhash_CAPI) {
            /* load the unicode data module */
            ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
                                            PyUnicodeData_CAPSULE_NAME, 1);
            if (!ucnhash_CAPI) {
                Py_DECREF(object);
                return NULL;
            }
        }
        /* First pass: compute the exact size of the replacement string. */
        for (i = start, ressize = 0; i < end; ++i) {
            /* object is guaranteed to be "ready" */
            c = PyUnicode_READ_CHAR(object, i);
            if (ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) {
                /* backslash + 'N' + '{' + name + '}' */
                replsize = 1+1+1+(int)strlen(buffer)+1;
            }
            else if (c >= 0x10000) {
                replsize = 1+1+8;
            }
            else if (c >= 0x100) {
                replsize = 1+1+4;
            }
            else
                replsize = 1+1+2;
            /* Stop before the total size overflows Py_ssize_t. */
            if (ressize > PY_SSIZE_T_MAX - replsize)
                break;
            ressize += replsize;
        }
        /* Only the characters that fit are replaced. */
        end = i;
        res = PyUnicode_New(ressize, 127);
        if (res == NULL) {
            Py_DECREF(object);
            return NULL;
        }
        /* Second pass: write the escapes into the ASCII-only result. */
        for (i = start, outp = PyUnicode_1BYTE_DATA(res);
            i < end; ++i) {
            c = PyUnicode_READ_CHAR(object, i);
            *outp++ = '\\';
            if (ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) {
                *outp++ = 'N';
                *outp++ = '{';
                strcpy((char *)outp, buffer);
                outp += strlen(buffer);
                *outp++ = '}';
                continue;
            }
            if (c >= 0x00010000) {
                *outp++ = 'U';
                *outp++ = Py_hexdigits[(c>>28)&0xf];
                *outp++ = Py_hexdigits[(c>>24)&0xf];
                *outp++ = Py_hexdigits[(c>>20)&0xf];
                *outp++ = Py_hexdigits[(c>>16)&0xf];
                *outp++ = Py_hexdigits[(c>>12)&0xf];
                *outp++ = Py_hexdigits[(c>>8)&0xf];
            }
            else if (c >= 0x100) {
                *outp++ = 'u';
                *outp++ = Py_hexdigits[(c>>12)&0xf];
                *outp++ = Py_hexdigits[(c>>8)&0xf];
            }
            else
                *outp++ = 'x';
            /* The two low hex digits are common to all three escapes. */
            *outp++ = Py_hexdigits[(c>>4)&0xf];
            *outp++ = Py_hexdigits[c&0xf];
        }

        assert(outp == PyUnicode_1BYTE_DATA(res) + ressize);
        assert(_PyUnicode_CheckConsistency(res, 1));
        restuple = Py_BuildValue("(Nn)", res, end);
        Py_DECREF(object);
        return restuple;
    }
    else {
        wrong_exception_type(exc);
        return NULL;
    }
}
1060
1061
0
/* Encoding identifiers returned by get_standard_encoding(). */
enum {
    ENC_UNKNOWN = -1,
    ENC_UTF8    = 0,
    ENC_UTF16BE = 1,
    ENC_UTF16LE = 2,
    ENC_UTF32BE = 3,
    ENC_UTF32LE = 4
};
1067
1068
static int
1069
get_standard_encoding(const char *encoding, int *bytelength)
1070
0
{
1071
0
    if (Py_TOLOWER(encoding[0]) == 'u' &&
1072
0
        Py_TOLOWER(encoding[1]) == 't' &&
1073
0
        Py_TOLOWER(encoding[2]) == 'f') {
1074
0
        encoding += 3;
1075
0
        if (*encoding == '-' || *encoding == '_' )
1076
0
            encoding++;
1077
0
        if (encoding[0] == '8' && encoding[1] == '\0') {
1078
0
            *bytelength = 3;
1079
0
            return ENC_UTF8;
1080
0
        }
1081
0
        else if (encoding[0] == '1' && encoding[1] == '6') {
1082
0
            encoding += 2;
1083
0
            *bytelength = 2;
1084
0
            if (*encoding == '\0') {
1085
#ifdef WORDS_BIGENDIAN
1086
                return ENC_UTF16BE;
1087
#else
1088
0
                return ENC_UTF16LE;
1089
0
#endif
1090
0
            }
1091
0
            if (*encoding == '-' || *encoding == '_' )
1092
0
                encoding++;
1093
0
            if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
1094
0
                if (Py_TOLOWER(encoding[0]) == 'b')
1095
0
                    return ENC_UTF16BE;
1096
0
                if (Py_TOLOWER(encoding[0]) == 'l')
1097
0
                    return ENC_UTF16LE;
1098
0
            }
1099
0
        }
1100
0
        else if (encoding[0] == '3' && encoding[1] == '2') {
1101
0
            encoding += 2;
1102
0
            *bytelength = 4;
1103
0
            if (*encoding == '\0') {
1104
#ifdef WORDS_BIGENDIAN
1105
                return ENC_UTF32BE;
1106
#else
1107
0
                return ENC_UTF32LE;
1108
0
#endif
1109
0
            }
1110
0
            if (*encoding == '-' || *encoding == '_' )
1111
0
                encoding++;
1112
0
            if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
1113
0
                if (Py_TOLOWER(encoding[0]) == 'b')
1114
0
                    return ENC_UTF32BE;
1115
0
                if (Py_TOLOWER(encoding[0]) == 'l')
1116
0
                    return ENC_UTF32LE;
1117
0
            }
1118
0
        }
1119
0
    }
1120
0
    else if (strcmp(encoding, "CP_UTF8") == 0) {
1121
0
        *bytelength = 3;
1122
0
        return ENC_UTF8;
1123
0
    }
1124
0
    return ENC_UNKNOWN;
1125
0
}
1126
1127
/* This handler is declared static until someone demonstrates
1128
   a need to call it directly. */
1129
static PyObject *
1130
PyCodec_SurrogatePassErrors(PyObject *exc)
1131
0
{
1132
0
    PyObject *restuple;
1133
0
    PyObject *object;
1134
0
    PyObject *encode;
1135
0
    const char *encoding;
1136
0
    int code;
1137
0
    int bytelength;
1138
0
    Py_ssize_t i;
1139
0
    Py_ssize_t start;
1140
0
    Py_ssize_t end;
1141
0
    PyObject *res;
1142
1143
0
    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
1144
0
        unsigned char *outp;
1145
0
        if (PyUnicodeEncodeError_GetStart(exc, &start))
1146
0
            return NULL;
1147
0
        if (PyUnicodeEncodeError_GetEnd(exc, &end))
1148
0
            return NULL;
1149
0
        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1150
0
            return NULL;
1151
0
        if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) {
1152
0
            Py_DECREF(object);
1153
0
            return NULL;
1154
0
        }
1155
0
        if (!(encoding = PyUnicode_AsUTF8(encode))) {
1156
0
            Py_DECREF(object);
1157
0
            Py_DECREF(encode);
1158
0
            return NULL;
1159
0
        }
1160
0
        code = get_standard_encoding(encoding, &bytelength);
1161
0
        Py_DECREF(encode);
1162
0
        if (code == ENC_UNKNOWN) {
1163
            /* Not supported, fail with original exception */
1164
0
            PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1165
0
            Py_DECREF(object);
1166
0
            return NULL;
1167
0
        }
1168
1169
0
        if (end - start > PY_SSIZE_T_MAX / bytelength)
1170
0
            end = start + PY_SSIZE_T_MAX / bytelength;
1171
0
        res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start));
1172
0
        if (!res) {
1173
0
            Py_DECREF(object);
1174
0
            return NULL;
1175
0
        }
1176
0
        outp = (unsigned char*)PyBytes_AsString(res);
1177
0
        for (i = start; i < end; i++) {
1178
            /* object is guaranteed to be "ready" */
1179
0
            Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
1180
0
            if (!Py_UNICODE_IS_SURROGATE(ch)) {
1181
                /* Not a surrogate, fail with original exception */
1182
0
                PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1183
0
                Py_DECREF(res);
1184
0
                Py_DECREF(object);
1185
0
                return NULL;
1186
0
            }
1187
0
            switch (code) {
1188
0
            case ENC_UTF8:
1189
0
                *outp++ = (unsigned char)(0xe0 | (ch >> 12));
1190
0
                *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
1191
0
                *outp++ = (unsigned char)(0x80 | (ch & 0x3f));
1192
0
                break;
1193
0
            case ENC_UTF16LE:
1194
0
                *outp++ = (unsigned char) ch;
1195
0
                *outp++ = (unsigned char)(ch >> 8);
1196
0
                break;
1197
0
            case ENC_UTF16BE:
1198
0
                *outp++ = (unsigned char)(ch >> 8);
1199
0
                *outp++ = (unsigned char) ch;
1200
0
                break;
1201
0
            case ENC_UTF32LE:
1202
0
                *outp++ = (unsigned char) ch;
1203
0
                *outp++ = (unsigned char)(ch >> 8);
1204
0
                *outp++ = (unsigned char)(ch >> 16);
1205
0
                *outp++ = (unsigned char)(ch >> 24);
1206
0
                break;
1207
0
            case ENC_UTF32BE:
1208
0
                *outp++ = (unsigned char)(ch >> 24);
1209
0
                *outp++ = (unsigned char)(ch >> 16);
1210
0
                *outp++ = (unsigned char)(ch >> 8);
1211
0
                *outp++ = (unsigned char) ch;
1212
0
                break;
1213
0
            }
1214
0
        }
1215
0
        restuple = Py_BuildValue("(On)", res, end);
1216
0
        Py_DECREF(res);
1217
0
        Py_DECREF(object);
1218
0
        return restuple;
1219
0
    }
1220
0
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
1221
0
        const unsigned char *p;
1222
0
        Py_UCS4 ch = 0;
1223
0
        if (PyUnicodeDecodeError_GetStart(exc, &start))
1224
0
            return NULL;
1225
0
        if (PyUnicodeDecodeError_GetEnd(exc, &end))
1226
0
            return NULL;
1227
0
        if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1228
0
            return NULL;
1229
0
        p = (const unsigned char*)PyBytes_AS_STRING(object);
1230
0
        if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) {
1231
0
            Py_DECREF(object);
1232
0
            return NULL;
1233
0
        }
1234
0
        if (!(encoding = PyUnicode_AsUTF8(encode))) {
1235
0
            Py_DECREF(object);
1236
0
            Py_DECREF(encode);
1237
0
            return NULL;
1238
0
        }
1239
0
        code = get_standard_encoding(encoding, &bytelength);
1240
0
        Py_DECREF(encode);
1241
0
        if (code == ENC_UNKNOWN) {
1242
            /* Not supported, fail with original exception */
1243
0
            PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1244
0
            Py_DECREF(object);
1245
0
            return NULL;
1246
0
        }
1247
1248
        /* Try decoding a single surrogate character. If
1249
           there are more, let the codec call us again. */
1250
0
        p += start;
1251
0
        if (PyBytes_GET_SIZE(object) - start >= bytelength) {
1252
0
            switch (code) {
1253
0
            case ENC_UTF8:
1254
0
                if ((p[0] & 0xf0) == 0xe0 &&
1255
0
                    (p[1] & 0xc0) == 0x80 &&
1256
0
                    (p[2] & 0xc0) == 0x80) {
1257
                    /* it's a three-byte code */
1258
0
                    ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
1259
0
                }
1260
0
                break;
1261
0
            case ENC_UTF16LE:
1262
0
                ch = p[1] << 8 | p[0];
1263
0
                break;
1264
0
            case ENC_UTF16BE:
1265
0
                ch = p[0] << 8 | p[1];
1266
0
                break;
1267
0
            case ENC_UTF32LE:
1268
0
                ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
1269
0
                break;
1270
0
            case ENC_UTF32BE:
1271
0
                ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
1272
0
                break;
1273
0
            }
1274
0
        }
1275
1276
0
        Py_DECREF(object);
1277
0
        if (!Py_UNICODE_IS_SURROGATE(ch)) {
1278
            /* it's not a surrogate - fail */
1279
0
            PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1280
0
            return NULL;
1281
0
        }
1282
0
        res = PyUnicode_FromOrdinal(ch);
1283
0
        if (res == NULL)
1284
0
            return NULL;
1285
0
        return Py_BuildValue("(Nn)", res, start + bytelength);
1286
0
    }
1287
0
    else {
1288
0
        wrong_exception_type(exc);
1289
0
        return NULL;
1290
0
    }
1291
0
}
1292
1293
/* Error handler implementing the "surrogateescape" policy.

   UnicodeEncodeError: each code point in [start, end) must lie in
   U+DC80..U+DCFF and is turned back into the byte (code point - 0xDC00);
   returns a new (bytes, end) tuple.

   UnicodeDecodeError: up to four non-ASCII bytes starting at `start` are
   mapped to 0xDC00 + byte; returns a new (str, start + consumed) tuple.

   When the input does not qualify, the original exception is set again
   and NULL is returned.  Other exception types go to
   wrong_exception_type(). */
static PyObject *
PyCodec_SurrogateEscapeErrors(PyObject *exc)
{
    Py_ssize_t start;
    Py_ssize_t end;
    PyObject *object;

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        PyObject *bytes;
        PyObject *result;
        char *dst;
        Py_ssize_t pos;

        if (PyUnicodeEncodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeEncodeError_GetEnd(exc, &end))
            return NULL;
        object = PyUnicodeEncodeError_GetObject(exc);
        if (object == NULL)
            return NULL;
        bytes = PyBytes_FromStringAndSize(NULL, end - start);
        if (bytes == NULL) {
            Py_DECREF(object);
            return NULL;
        }
        dst = PyBytes_AsString(bytes);
        for (pos = start; pos < end; pos++) {
            /* object is guaranteed to be "ready" */
            Py_UCS4 ch = PyUnicode_READ_CHAR(object, pos);
            if (!(ch >= 0xdc80 && ch <= 0xdcff)) {
                /* Not a UTF-8b surrogate, fail with original exception */
                PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
                Py_DECREF(bytes);
                Py_DECREF(object);
                return NULL;
            }
            dst[pos - start] = ch - 0xdc00;
        }
        result = Py_BuildValue("(On)", bytes, end);
        Py_DECREF(bytes);
        Py_DECREF(object);
        return result;
    }

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
        PyObject *text;
        const unsigned char *raw;
        Py_UCS2 escaped[4]; /* decode up to 4 bad bytes. */
        int count;

        if (PyUnicodeDecodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeDecodeError_GetEnd(exc, &end))
            return NULL;
        object = PyUnicodeDecodeError_GetObject(exc);
        if (object == NULL)
            return NULL;
        raw = (const unsigned char *)PyBytes_AS_STRING(object);
        for (count = 0; count < 4 && count < end - start; count++) {
            /* Refuse to escape ASCII bytes. */
            if (raw[start + count] < 128)
                break;
            escaped[count] = 0xdc00 + raw[start + count];
        }
        Py_DECREF(object);
        if (count == 0) {
            /* codec complained about ASCII byte. */
            PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
            return NULL;
        }
        text = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, escaped, count);
        if (text == NULL)
            return NULL;
        return Py_BuildValue("(Nn)", text, start + count);
    }

    wrong_exception_type(exc);
    return NULL;
}
1369
1370
1371
/* METH_O entry point for the "strict" handler: hands exc to
   PyCodec_StrictErrors(); self is not used. */
static PyObject *
strict_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_StrictErrors(exc);
}
1375
1376
1377
/* METH_O entry point for the "ignore" handler: hands exc to
   PyCodec_IgnoreErrors(); self is not used. */
static PyObject *
ignore_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_IgnoreErrors(exc);
}
1381
1382
1383
/* METH_O entry point for the "replace" handler: hands exc to
   PyCodec_ReplaceErrors(); self is not used. */
static PyObject *
replace_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_ReplaceErrors(exc);
}
1387
1388
1389
/* METH_O entry point for the "xmlcharrefreplace" handler: hands exc to
   PyCodec_XMLCharRefReplaceErrors(); self is not used. */
static PyObject *
xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_XMLCharRefReplaceErrors(exc);
}
1393
1394
1395
/* METH_O entry point for the "backslashreplace" handler: hands exc to
   PyCodec_BackslashReplaceErrors(); self is not used. */
static PyObject *
backslashreplace_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_BackslashReplaceErrors(exc);
}
1399
1400
/* METH_O entry point for the "namereplace" handler: hands exc to
   PyCodec_NameReplaceErrors(); self is not used. */
static PyObject *
namereplace_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_NameReplaceErrors(exc);
}
1404
1405
/* METH_O entry point for the "surrogatepass" handler: hands exc to
   PyCodec_SurrogatePassErrors(); self is not used. */
static PyObject *
surrogatepass_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_SurrogatePassErrors(exc);
}
1409
1410
/* METH_O entry point for the "surrogateescape" handler: hands exc to
   PyCodec_SurrogateEscapeErrors(); self is not used. */
static PyObject *
surrogateescape_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_SurrogateEscapeErrors(exc);
}
1414
1415
static int _PyCodecRegistry_Init(void)
1416
14
{
1417
14
    static struct {
1418
14
        const char *name;
1419
14
        PyMethodDef def;
1420
14
    } methods[] =
1421
14
    {
1422
14
        {
1423
14
            "strict",
1424
14
            {
1425
14
                "strict_errors",
1426
14
                strict_errors,
1427
14
                METH_O,
1428
14
                PyDoc_STR("Implements the 'strict' error handling, which "
1429
14
                          "raises a UnicodeError on coding errors.")
1430
14
            }
1431
14
        },
1432
14
        {
1433
14
            "ignore",
1434
14
            {
1435
14
                "ignore_errors",
1436
14
                ignore_errors,
1437
14
                METH_O,
1438
14
                PyDoc_STR("Implements the 'ignore' error handling, which "
1439
14
                          "ignores malformed data and continues.")
1440
14
            }
1441
14
        },
1442
14
        {
1443
14
            "replace",
1444
14
            {
1445
14
                "replace_errors",
1446
14
                replace_errors,
1447
14
                METH_O,
1448
14
                PyDoc_STR("Implements the 'replace' error handling, which "
1449
14
                          "replaces malformed data with a replacement marker.")
1450
14
            }
1451
14
        },
1452
14
        {
1453
14
            "xmlcharrefreplace",
1454
14
            {
1455
14
                "xmlcharrefreplace_errors",
1456
14
                xmlcharrefreplace_errors,
1457
14
                METH_O,
1458
14
                PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
1459
14
                          "which replaces an unencodable character with the "
1460
14
                          "appropriate XML character reference.")
1461
14
            }
1462
14
        },
1463
14
        {
1464
14
            "backslashreplace",
1465
14
            {
1466
14
                "backslashreplace_errors",
1467
14
                backslashreplace_errors,
1468
14
                METH_O,
1469
14
                PyDoc_STR("Implements the 'backslashreplace' error handling, "
1470
14
                          "which replaces malformed data with a backslashed "
1471
14
                          "escape sequence.")
1472
14
            }
1473
14
        },
1474
14
        {
1475
14
            "namereplace",
1476
14
            {
1477
14
                "namereplace_errors",
1478
14
                namereplace_errors,
1479
14
                METH_O,
1480
14
                PyDoc_STR("Implements the 'namereplace' error handling, "
1481
14
                          "which replaces an unencodable character with a "
1482
14
                          "\\N{...} escape sequence.")
1483
14
            }
1484
14
        },
1485
14
        {
1486
14
            "surrogatepass",
1487
14
            {
1488
14
                "surrogatepass",
1489
14
                surrogatepass_errors,
1490
14
                METH_O
1491
14
            }
1492
14
        },
1493
14
        {
1494
14
            "surrogateescape",
1495
14
            {
1496
14
                "surrogateescape",
1497
14
                surrogateescape_errors,
1498
14
                METH_O
1499
14
            }
1500
14
        }
1501
14
    };
1502
1503
14
    PyInterpreterState *interp = _PyInterpreterState_Get();
1504
14
    PyObject *mod;
1505
14
    unsigned i;
1506
1507
14
    if (interp->codec_search_path != NULL)
1508
0
        return 0;
1509
1510
14
    interp->codec_search_path = PyList_New(0);
1511
14
    interp->codec_search_cache = PyDict_New();
1512
14
    interp->codec_error_registry = PyDict_New();
1513
1514
14
    if (interp->codec_error_registry) {
1515
126
        for (i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
1516
112
            PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL);
1517
112
            int res;
1518
112
            if (!func)
1519
0
                Py_FatalError("can't initialize codec error registry");
1520
112
            res = PyCodec_RegisterError(methods[i].name, func);
1521
112
            Py_DECREF(func);
1522
112
            if (res)
1523
0
                Py_FatalError("can't initialize codec error registry");
1524
112
        }
1525
14
    }
1526
1527
14
    if (interp->codec_search_path == NULL ||
1528
14
        interp->codec_search_cache == NULL ||
1529
14
        interp->codec_error_registry == NULL)
1530
0
        Py_FatalError("can't initialize codec registry");
1531
1532
14
    mod = PyImport_ImportModuleNoBlock("encodings");
1533
14
    if (mod == NULL) {
1534
0
        return -1;
1535
0
    }
1536
14
    Py_DECREF(mod);
1537
14
    interp->codecs_initialized = 1;
1538
14
    return 0;
1539
14
}