Coverage Report

Created: 2025-07-04 06:49

/src/cpython/Python/codecs.c
Line
Count
Source (jump to first uncovered line)
1
/* ------------------------------------------------------------------------
2
3
   Python Codec Registry and support functions
4
5
Written by Marc-Andre Lemburg (mal@lemburg.com).
6
7
Copyright (c) Corporation for National Research Initiatives.
8
9
   ------------------------------------------------------------------------ */
10
11
#include "Python.h"
12
#include "pycore_call.h"          // _PyObject_CallNoArgs()
13
#include "pycore_interp.h"        // PyInterpreterState.codec_search_path
14
#include "pycore_pyerrors.h"      // _PyErr_FormatNote()
15
#include "pycore_pystate.h"       // _PyInterpreterState_GET()
16
#include "pycore_runtime.h"       // _Py_ID()
17
#include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
18
#include "pycore_unicodeobject.h" // _PyUnicode_InternMortal()
19
20
21
/* Names of the error handlers that _PyCodec_UnregisterError() refuses to
   remove from the error registry. */
static const char *codecs_builtin_error_handlers[] = {
    "strict",
    "ignore",
    "replace",
    "xmlcharrefreplace",
    "backslashreplace",
    "namereplace",
    "surrogatepass",
    "surrogateescape",
};
26
27
/* Lowercase hexadecimal digit characters, indexed by nibble value (0-15). */
const char *Py_hexdigits = "0123456789abcdef";
28
29
/* --- Codec Registry ----------------------------------------------------- */
30
31
int PyCodec_Register(PyObject *search_function)
{
    /* Append search_function to the interpreter's codec search path.
       Returns the PyList_Append() status, or -1 with an exception set when
       the argument is NULL or not callable. */
    PyInterpreterState *interp = _PyInterpreterState_GET();
    assert(interp->codecs.initialized);

    if (search_function == NULL) {
        PyErr_BadArgument();
        return -1;
    }
    if (!PyCallable_Check(search_function)) {
        PyErr_SetString(PyExc_TypeError, "argument must be callable");
        return -1;
    }

    int status;
#ifdef Py_GIL_DISABLED
    PyMutex_Lock(&interp->codecs.search_path_mutex);
#endif
    status = PyList_Append(interp->codecs.search_path, search_function);
#ifdef Py_GIL_DISABLED
    PyMutex_Unlock(&interp->codecs.search_path_mutex);
#endif
    return status;
}
55
56
int
57
PyCodec_Unregister(PyObject *search_function)
58
0
{
59
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
60
0
    if (interp->codecs.initialized != 1) {
61
        /* Do nothing if codecs state was cleared (only possible during
62
           interpreter shutdown). */
63
0
        return 0;
64
0
    }
65
66
0
    PyObject *codec_search_path = interp->codecs.search_path;
67
0
    assert(PyList_CheckExact(codec_search_path));
68
0
    for (Py_ssize_t i = 0; i < PyList_GET_SIZE(codec_search_path); i++) {
69
#ifdef Py_GIL_DISABLED
70
        PyMutex_Lock(&interp->codecs.search_path_mutex);
71
#endif
72
0
        PyObject *item = PyList_GetItemRef(codec_search_path, i);
73
0
        int ret = 1;
74
0
        if (item == search_function) {
75
            // We hold a reference to the item, so its destructor can't run
76
            // while we hold search_path_mutex.
77
0
            ret = PyList_SetSlice(codec_search_path, i, i+1, NULL);
78
0
        }
79
#ifdef Py_GIL_DISABLED
80
        PyMutex_Unlock(&interp->codecs.search_path_mutex);
81
#endif
82
0
        Py_DECREF(item);
83
0
        if (ret != 1) {
84
0
            assert(interp->codecs.search_cache != NULL);
85
0
            assert(PyDict_CheckExact(interp->codecs.search_cache));
86
0
            PyDict_Clear(interp->codecs.search_cache);
87
0
            return ret;
88
0
        }
89
0
    }
90
0
    return 0;
91
0
}
92
93
/* normalizestring() below treats a zero return value as failure. */
extern int _Py_normalize_encoding(const char *, char *, size_t);
94
95
/* Convert a string to a normalized Python string (decoded from UTF-8): all characters are
96
   converted to lower case, spaces and hyphens are replaced with underscores. */
97
98
static
99
PyObject *normalizestring(const char *string)
100
966k
{
101
966k
    size_t len = strlen(string);
102
966k
    char *encoding;
103
966k
    PyObject *v;
104
105
966k
    if (len > PY_SSIZE_T_MAX) {
106
0
        PyErr_SetString(PyExc_OverflowError, "string is too large");
107
0
        return NULL;
108
0
    }
109
110
966k
    encoding = PyMem_Malloc(len + 1);
111
966k
    if (encoding == NULL)
112
0
        return PyErr_NoMemory();
113
114
966k
    if (!_Py_normalize_encoding(string, encoding, len + 1))
115
0
    {
116
0
        PyErr_SetString(PyExc_RuntimeError, "_Py_normalize_encoding() failed");
117
0
        PyMem_Free(encoding);
118
0
        return NULL;
119
0
    }
120
121
966k
    v = PyUnicode_FromString(encoding);
122
966k
    PyMem_Free(encoding);
123
966k
    return v;
124
966k
}
125
126
/* Lookup the given encoding and return a tuple providing the codec
127
   facilities.
128
129
   The encoding string is looked up converted to all lower-case
130
   characters. This makes encodings looked up through this mechanism
131
   effectively case-insensitive.
132
133
   If no codec is found, a LookupError is set and NULL returned.
134
135
   As side effect, this tries to load the encodings package, if not
136
   yet done. This is part of the lazy load strategy for the encodings
137
   package.
138
139
*/
140
141
PyObject *_PyCodec_Lookup(const char *encoding)
142
966k
{
143
966k
    if (encoding == NULL) {
144
0
        PyErr_BadArgument();
145
0
        return NULL;
146
0
    }
147
148
966k
    PyInterpreterState *interp = _PyInterpreterState_GET();
149
966k
    assert(interp->codecs.initialized);
150
151
    /* Convert the encoding to a normalized Python string: all
152
       characters are converted to lower case, spaces and hyphens are
153
       replaced with underscores. */
154
966k
    PyObject *v = normalizestring(encoding);
155
966k
    if (v == NULL) {
156
0
        return NULL;
157
0
    }
158
159
    /* Intern the string. We'll make it immortal later if lookup succeeds. */
160
966k
    _PyUnicode_InternMortal(interp, &v);
161
162
    /* First, try to lookup the name in the registry dictionary */
163
966k
    PyObject *result;
164
966k
    if (PyDict_GetItemRef(interp->codecs.search_cache, v, &result) < 0) {
165
0
        goto onError;
166
0
    }
167
966k
    if (result != NULL) {
168
885k
        Py_DECREF(v);
169
885k
        return result;
170
885k
    }
171
172
    /* Next, scan the search functions in order of registration */
173
81.5k
    const Py_ssize_t len = PyList_Size(interp->codecs.search_path);
174
81.5k
    if (len < 0)
175
0
        goto onError;
176
81.5k
    if (len == 0) {
177
0
        PyErr_SetString(PyExc_LookupError,
178
0
                        "no codec search functions registered: "
179
0
                        "can't find encoding");
180
0
        goto onError;
181
0
    }
182
183
81.5k
    Py_ssize_t i;
184
162k
    for (i = 0; i < len; i++) {
185
81.5k
        PyObject *func;
186
187
81.5k
        func = PyList_GetItemRef(interp->codecs.search_path, i);
188
81.5k
        if (func == NULL)
189
0
            goto onError;
190
81.5k
        result = PyObject_CallOneArg(func, v);
191
81.5k
        Py_DECREF(func);
192
81.5k
        if (result == NULL)
193
0
            goto onError;
194
81.5k
        if (result == Py_None) {
195
81.0k
            Py_CLEAR(result);
196
81.0k
            continue;
197
81.0k
        }
198
488
        if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
199
0
            PyErr_SetString(PyExc_TypeError,
200
0
                            "codec search functions must return 4-tuples");
201
0
            Py_DECREF(result);
202
0
            goto onError;
203
0
        }
204
488
        break;
205
488
    }
206
81.5k
    if (result == NULL) {
207
        /* XXX Perhaps we should cache misses too ? */
208
81.0k
        PyErr_Format(PyExc_LookupError,
209
81.0k
                     "unknown encoding: %s", encoding);
210
81.0k
        goto onError;
211
81.0k
    }
212
213
488
    _PyUnicode_InternImmortal(interp, &v);
214
215
    /* Cache and return the result */
216
488
    if (PyDict_SetItem(interp->codecs.search_cache, v, result) < 0) {
217
0
        Py_DECREF(result);
218
0
        goto onError;
219
0
    }
220
488
    Py_DECREF(v);
221
488
    return result;
222
223
81.0k
 onError:
224
81.0k
    Py_DECREF(v);
225
81.0k
    return NULL;
226
488
}
227
228
/* Codec registry encoding check API. */
229
230
int PyCodec_KnownEncoding(const char *encoding)
231
0
{
232
0
    PyObject *codecs;
233
234
0
    codecs = _PyCodec_Lookup(encoding);
235
0
    if (!codecs) {
236
0
        PyErr_Clear();
237
0
        return 0;
238
0
    }
239
0
    else {
240
0
        Py_DECREF(codecs);
241
0
        return 1;
242
0
    }
243
0
}
244
245
static
246
PyObject *args_tuple(PyObject *object,
247
                     const char *errors)
248
883k
{
249
883k
    PyObject *args;
250
251
883k
    args = PyTuple_New(1 + (errors != NULL));
252
883k
    if (args == NULL)
253
0
        return NULL;
254
883k
    PyTuple_SET_ITEM(args, 0, Py_NewRef(object));
255
883k
    if (errors) {
256
148k
        PyObject *v;
257
258
148k
        v = PyUnicode_FromString(errors);
259
148k
        if (v == NULL) {
260
0
            Py_DECREF(args);
261
0
            return NULL;
262
0
        }
263
148k
        PyTuple_SET_ITEM(args, 1, v);
264
148k
    }
265
883k
    return args;
266
883k
}
267
268
/* Helper function to get a codec item */
269
270
static
271
PyObject *codec_getitem(const char *encoding, int index)
272
0
{
273
0
    PyObject *codecs;
274
0
    PyObject *v;
275
276
0
    codecs = _PyCodec_Lookup(encoding);
277
0
    if (codecs == NULL)
278
0
        return NULL;
279
0
    v = PyTuple_GET_ITEM(codecs, index);
280
0
    Py_DECREF(codecs);
281
0
    return Py_NewRef(v);
282
0
}
283
284
/* Helper functions to create an incremental codec. */
285
static
286
PyObject *codec_makeincrementalcodec(PyObject *codec_info,
287
                                     const char *errors,
288
                                     const char *attrname)
289
48
{
290
48
    PyObject *ret, *inccodec;
291
292
48
    inccodec = PyObject_GetAttrString(codec_info, attrname);
293
48
    if (inccodec == NULL)
294
0
        return NULL;
295
48
    if (errors)
296
48
        ret = PyObject_CallFunction(inccodec, "s", errors);
297
0
    else
298
0
        ret = _PyObject_CallNoArgs(inccodec);
299
48
    Py_DECREF(inccodec);
300
48
    return ret;
301
48
}
302
303
static
304
PyObject *codec_getincrementalcodec(const char *encoding,
305
                                    const char *errors,
306
                                    const char *attrname)
307
0
{
308
0
    PyObject *codec_info, *ret;
309
310
0
    codec_info = _PyCodec_Lookup(encoding);
311
0
    if (codec_info == NULL)
312
0
        return NULL;
313
0
    ret = codec_makeincrementalcodec(codec_info, errors, attrname);
314
0
    Py_DECREF(codec_info);
315
0
    return ret;
316
0
}
317
318
/* Helper function to create a stream codec. */
319
320
static
321
PyObject *codec_getstreamcodec(const char *encoding,
322
                               PyObject *stream,
323
                               const char *errors,
324
                               const int index)
325
0
{
326
0
    PyObject *codecs, *streamcodec, *codeccls;
327
328
0
    codecs = _PyCodec_Lookup(encoding);
329
0
    if (codecs == NULL)
330
0
        return NULL;
331
332
0
    codeccls = PyTuple_GET_ITEM(codecs, index);
333
0
    if (errors != NULL)
334
0
        streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
335
0
    else
336
0
        streamcodec = PyObject_CallOneArg(codeccls, stream);
337
0
    Py_DECREF(codecs);
338
0
    return streamcodec;
339
0
}
340
341
/* Helpers to work with the result of _PyCodec_Lookup
342
343
 */
344
PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info,
                                             const char *errors)
{
    /* Instantiate codec_info's "incrementaldecoder" factory. */
    PyObject *decoder;
    decoder = codec_makeincrementalcodec(codec_info, errors,
                                         "incrementaldecoder");
    return decoder;
}
350
351
PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info,
                                             const char *errors)
{
    /* Instantiate codec_info's "incrementalencoder" factory. */
    PyObject *encoder;
    encoder = codec_makeincrementalcodec(codec_info, errors,
                                         "incrementalencoder");
    return encoder;
}
357
358
359
/* Convenience APIs to query the Codec registry.
360
361
   All APIs return a codec object with incremented refcount.
362
363
 */
364
365
PyObject *PyCodec_Encoder(const char *encoding)
{
    /* Entry 0 of the looked-up codec is returned as the encoder. */
    PyObject *encoder = codec_getitem(encoding, 0);
    return encoder;
}
369
370
PyObject *PyCodec_Decoder(const char *encoding)
{
    /* Entry 1 of the looked-up codec is returned as the decoder. */
    PyObject *decoder = codec_getitem(encoding, 1);
    return decoder;
}
374
375
PyObject *PyCodec_IncrementalEncoder(const char *encoding,
                                     const char *errors)
{
    /* Look up encoding and instantiate its "incrementalencoder" factory. */
    PyObject *encoder;
    encoder = codec_getincrementalcodec(encoding, errors,
                                        "incrementalencoder");
    return encoder;
}
380
381
PyObject *PyCodec_IncrementalDecoder(const char *encoding,
                                     const char *errors)
{
    /* Look up encoding and instantiate its "incrementaldecoder" factory. */
    PyObject *decoder;
    decoder = codec_getincrementalcodec(encoding, errors,
                                        "incrementaldecoder");
    return decoder;
}
386
387
PyObject *PyCodec_StreamReader(const char *encoding,
                               PyObject *stream,
                               const char *errors)
{
    /* Entry 2 of the looked-up codec is called to wrap stream. */
    PyObject *reader = codec_getstreamcodec(encoding, stream, errors, 2);
    return reader;
}
393
394
PyObject *PyCodec_StreamWriter(const char *encoding,
                               PyObject *stream,
                               const char *errors)
{
    /* Entry 3 of the looked-up codec is called to wrap stream. */
    PyObject *writer = codec_getstreamcodec(encoding, stream, errors, 3);
    return writer;
}
400
401
/* Encode an object (e.g. a Unicode object) using the given encoding
402
   and return the resulting encoded object (usually a Python string).
403
404
   errors is passed to the encoder factory as argument if non-NULL. */
405
406
static PyObject *
_PyCodec_EncodeInternal(PyObject *object,
                        PyObject *encoder,
                        const char *encoding,
                        const char *errors)
{
    /* Calls encoder(object[, errors]) and returns a new reference to the
       first element of the resulting 2-tuple.  The reference to encoder
       passed in by the caller is released on every path. */
    PyObject *call_args = args_tuple(object, errors);
    if (call_args == NULL) {
        Py_XDECREF(encoder);
        return NULL;
    }

    PyObject *outcome = PyObject_Call(encoder, call_args, NULL);
    if (outcome == NULL) {
        _PyErr_FormatNote("%s with '%s' codec failed", "encoding", encoding);
        Py_DECREF(call_args);
        Py_XDECREF(encoder);
        return NULL;
    }

    if (!PyTuple_Check(outcome) || PyTuple_GET_SIZE(outcome) != 2) {
        PyErr_SetString(PyExc_TypeError,
                        "encoder must return a tuple (object, integer)");
        Py_DECREF(outcome);
        Py_DECREF(call_args);
        Py_XDECREF(encoder);
        return NULL;
    }

    /* We don't check or use the second (integer) entry. */
    PyObject *encoded = Py_NewRef(PyTuple_GET_ITEM(outcome, 0));

    Py_DECREF(call_args);
    Py_DECREF(encoder);
    Py_DECREF(outcome);
    return encoded;
}
445
446
/* Decode an object (usually a Python string) using the given encoding
447
   and return an equivalent object (e.g. a Unicode object).
448
449
   errors is passed to the decoder factory as argument if non-NULL. */
450
451
static PyObject *
_PyCodec_DecodeInternal(PyObject *object,
                        PyObject *decoder,
                        const char *encoding,
                        const char *errors)
{
    /* Calls decoder(object[, errors]) and returns a new reference to the
       first element of the resulting 2-tuple.  The reference to decoder
       passed in by the caller is released on every path. */
    PyObject *decoded = NULL;
    PyObject *outcome = NULL;

    PyObject *call_args = args_tuple(object, errors);
    if (call_args == NULL) {
        goto done;
    }

    outcome = PyObject_Call(decoder, call_args, NULL);
    if (outcome == NULL) {
        _PyErr_FormatNote("%s with '%s' codec failed", "decoding", encoding);
        goto done;
    }

    if (PyTuple_Check(outcome) && PyTuple_GET_SIZE(outcome) == 2) {
        /* We don't check or use the second (integer) entry. */
        decoded = Py_NewRef(PyTuple_GET_ITEM(outcome, 0));
    }
    else {
        PyErr_SetString(PyExc_TypeError,
                        "decoder must return a tuple (object,integer)");
    }

 done:
    /* decoded is still NULL here on every failure path. */
    Py_XDECREF(call_args);
    Py_XDECREF(decoder);
    Py_XDECREF(outcome);
    return decoded;
}
489
490
/* Generic encoding/decoding API */
491
PyObject *PyCodec_Encode(PyObject *object,
                         const char *encoding,
                         const char *errors)
{
    /* Fetch the encoder for encoding and hand it to
       _PyCodec_EncodeInternal() together with object. */
    PyObject *encoder = PyCodec_Encoder(encoding);
    if (encoder != NULL) {
        return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
    }
    return NULL;
}
503
504
PyObject *PyCodec_Decode(PyObject *object,
                         const char *encoding,
                         const char *errors)
{
    /* Fetch the decoder for encoding and hand it to
       _PyCodec_DecodeInternal() together with object. */
    PyObject *decoder = PyCodec_Decoder(encoding);
    if (decoder != NULL) {
        return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
    }
    return NULL;
}
516
517
/* Text encoding/decoding API */
518
PyObject * _PyCodec_LookupTextEncoding(const char *encoding,
519
                                       const char *alternate_command)
520
966k
{
521
966k
    PyObject *codec;
522
966k
    PyObject *attr;
523
966k
    int is_text_codec;
524
525
966k
    codec = _PyCodec_Lookup(encoding);
526
966k
    if (codec == NULL)
527
81.0k
        return NULL;
528
529
    /* Backwards compatibility: assume any raw tuple describes a text
530
     * encoding, and the same for anything lacking the private
531
     * attribute.
532
     */
533
885k
    if (!PyTuple_CheckExact(codec)) {
534
885k
        if (PyObject_GetOptionalAttr(codec, &_Py_ID(_is_text_encoding), &attr) < 0) {
535
0
            Py_DECREF(codec);
536
0
            return NULL;
537
0
        }
538
885k
        if (attr != NULL) {
539
885k
            is_text_codec = PyObject_IsTrue(attr);
540
885k
            Py_DECREF(attr);
541
885k
            if (is_text_codec <= 0) {
542
2.52k
                Py_DECREF(codec);
543
2.52k
                if (!is_text_codec) {
544
2.52k
                    if (alternate_command != NULL) {
545
2.52k
                        PyErr_Format(PyExc_LookupError,
546
2.52k
                                     "'%.400s' is not a text encoding; "
547
2.52k
                                     "use %s to handle arbitrary codecs",
548
2.52k
                                     encoding, alternate_command);
549
2.52k
                    }
550
0
                    else {
551
0
                        PyErr_Format(PyExc_LookupError,
552
0
                                     "'%.400s' is not a text encoding",
553
0
                                     encoding);
554
0
                    }
555
2.52k
                }
556
2.52k
                return NULL;
557
2.52k
            }
558
885k
        }
559
885k
    }
560
561
    /* This appears to be a valid text encoding */
562
883k
    return codec;
563
885k
}
564
565
566
static
567
PyObject *codec_getitem_checked(const char *encoding,
568
                                const char *alternate_command,
569
                                int index)
570
966k
{
571
966k
    PyObject *codec;
572
966k
    PyObject *v;
573
574
966k
    codec = _PyCodec_LookupTextEncoding(encoding, alternate_command);
575
966k
    if (codec == NULL)
576
83.6k
        return NULL;
577
578
883k
    v = Py_NewRef(PyTuple_GET_ITEM(codec, index));
579
883k
    Py_DECREF(codec);
580
883k
    return v;
581
966k
}
582
583
static PyObject * _PyCodec_TextEncoder(const char *encoding)
{
    /* Entry 0 of a text codec; "codecs.encode()" is the alternative named
       in the error message for non-text codecs. */
    PyObject *encoder = codec_getitem_checked(encoding, "codecs.encode()", 0);
    return encoder;
}
587
588
static PyObject * _PyCodec_TextDecoder(const char *encoding)
{
    /* Entry 1 of a text codec; "codecs.decode()" is the alternative named
       in the error message for non-text codecs. */
    PyObject *decoder = codec_getitem_checked(encoding, "codecs.decode()", 1);
    return decoder;
}
592
593
PyObject *_PyCodec_EncodeText(PyObject *object,
                              const char *encoding,
                              const char *errors)
{
    /* Same as PyCodec_Encode(), but restricted to text encodings. */
    PyObject *encoder = _PyCodec_TextEncoder(encoding);
    if (encoder != NULL) {
        return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
    }
    return NULL;
}
605
606
PyObject *_PyCodec_DecodeText(PyObject *object,
                              const char *encoding,
                              const char *errors)
{
    /* Same as PyCodec_Decode(), but restricted to text encodings. */
    PyObject *decoder = _PyCodec_TextDecoder(encoding);
    if (decoder != NULL) {
        return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
    }
    return NULL;
}
618
619
/* Register the error handling callback function error under the name
620
   name. This function will be called by the codec when it encounters
621
   unencodable characters/undecodable bytes and doesn't know the
622
   callback name, when name is specified as the error parameter
623
   in the call to the encode/decode function.
624
   Return 0 on success, -1 on error */
625
int PyCodec_RegisterError(const char *name, PyObject *error)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    assert(interp->codecs.initialized);

    /* Only callables may be stored in the error registry. */
    if (PyCallable_Check(error)) {
        return PyDict_SetItemString(interp->codecs.error_registry,
                                    name, error);
    }
    PyErr_SetString(PyExc_TypeError, "handler must be callable");
    return -1;
}
636
637
int _PyCodec_UnregisterError(const char *name)
638
0
{
639
0
    for (size_t i = 0; i < Py_ARRAY_LENGTH(codecs_builtin_error_handlers); ++i) {
640
0
        if (strcmp(name, codecs_builtin_error_handlers[i]) == 0) {
641
0
            PyErr_Format(PyExc_ValueError,
642
0
                         "cannot un-register built-in error handler '%s'", name);
643
0
            return -1;
644
0
        }
645
0
    }
646
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
647
0
    assert(interp->codecs.initialized);
648
0
    return PyDict_PopString(interp->codecs.error_registry, name, NULL);
649
0
}
650
651
/* Lookup the error handling callback function registered under the
652
   name error. As a special case NULL can be passed, in which case
653
   the error handling callback for strict encoding will be returned. */
654
PyObject *PyCodec_LookupError(const char *name)
655
203k
{
656
203k
    PyInterpreterState *interp = _PyInterpreterState_GET();
657
203k
    assert(interp->codecs.initialized);
658
659
203k
    if (name==NULL)
660
139k
        name = "strict";
661
203k
    PyObject *handler;
662
203k
    if (PyDict_GetItemStringRef(interp->codecs.error_registry, name, &handler) < 0) {
663
0
        return NULL;
664
0
    }
665
203k
    if (handler == NULL) {
666
0
        PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
667
0
        return NULL;
668
0
    }
669
203k
    return handler;
670
203k
}
671
672
673
static inline void
674
wrong_exception_type(PyObject *exc)
675
0
{
676
0
    PyErr_Format(PyExc_TypeError,
677
0
                 "don't know how to handle %T in error callback", exc);
678
0
}
679
680
681
/* PyObject_TypeCheck() wrappers, one per UnicodeError subclass that the
   error handlers in this file dispatch on. */
#define _PyIsUnicodeEncodeError(EXC) \
    PyObject_TypeCheck((EXC), (PyTypeObject *)PyExc_UnicodeEncodeError)
#define _PyIsUnicodeDecodeError(EXC) \
    PyObject_TypeCheck((EXC), (PyTypeObject *)PyExc_UnicodeDecodeError)
#define _PyIsUnicodeTranslateError(EXC) \
    PyObject_TypeCheck((EXC), (PyTypeObject *)PyExc_UnicodeTranslateError)
687
688
689
// --- codecs handlers: utilities ---------------------------------------------
690
691
/*
692
 * Return the number of characters (including special prefixes)
693
 * needed to represent 'ch' by codec_handler_write_unicode_hex().
694
 */
695
static inline Py_ssize_t
696
codec_handler_unicode_hex_width(Py_UCS4 ch)
697
0
{
698
0
    if (ch >= 0x10000) {
699
        // format: '\\' + 'U' + 8 hex digits
700
0
        return 1 + 1 + 8;
701
0
    }
702
0
    else if (ch >= 0x100) {
703
        // format: '\\' + 'u' + 4 hex digits
704
0
        return 1 + 1 + 4;
705
0
    }
706
0
    else {
707
        // format: '\\' + 'x' + 2 hex digits
708
0
        return 1 + 1 + 2;
709
0
    }
710
0
}
711
712
713
/*
714
 * Write the hexadecimal representation of 'ch' to the buffer pointed by 'p'
715
 * using 2, 4, or 8 characters prefixed by '\x', '\u', or '\U' respectively.
716
 */
717
static inline void
718
codec_handler_write_unicode_hex(Py_UCS1 **p, Py_UCS4 ch)
719
0
{
720
0
    *(*p)++ = '\\';
721
0
    if (ch >= 0x10000) {
722
0
        *(*p)++ = 'U';
723
0
        *(*p)++ = Py_hexdigits[(ch >> 28) & 0xf];
724
0
        *(*p)++ = Py_hexdigits[(ch >> 24) & 0xf];
725
0
        *(*p)++ = Py_hexdigits[(ch >> 20) & 0xf];
726
0
        *(*p)++ = Py_hexdigits[(ch >> 16) & 0xf];
727
0
        *(*p)++ = Py_hexdigits[(ch >> 12) & 0xf];
728
0
        *(*p)++ = Py_hexdigits[(ch >> 8) & 0xf];
729
0
    }
730
0
    else if (ch >= 0x100) {
731
0
        *(*p)++ = 'u';
732
0
        *(*p)++ = Py_hexdigits[(ch >> 12) & 0xf];
733
0
        *(*p)++ = Py_hexdigits[(ch >> 8) & 0xf];
734
0
    }
735
0
    else {
736
0
        *(*p)++ = 'x';
737
0
    }
738
0
    *(*p)++ = Py_hexdigits[(ch >> 4) & 0xf];
739
0
    *(*p)++ = Py_hexdigits[ch & 0xf];
740
0
}
741
742
743
/*
744
 * Determine the number of digits for a decimal representation of Unicode
745
 * codepoint 'ch' (by design, Unicode codepoints are limited to 7 digits).
746
 */
747
static inline int
748
n_decimal_digits_for_codepoint(Py_UCS4 ch)
749
0
{
750
0
    if (ch < 10) return 1;
751
0
    if (ch < 100) return 2;
752
0
    if (ch < 1000) return 3;
753
0
    if (ch < 10000) return 4;
754
0
    if (ch < 100000) return 5;
755
0
    if (ch < 1000000) return 6;
756
0
    if (ch < 10000000) return 7;
757
    // Unicode codepoints are limited to 1114111 (7 decimal digits)
758
0
    Py_UNREACHABLE();
759
0
}
760
761
762
/*
763
 * Create a Unicode string containing 'count' copies of the official
764
 * Unicode REPLACEMENT CHARACTER (0xFFFD).
765
 */
766
static PyObject *
767
codec_handler_unicode_replacement_character(Py_ssize_t count)
768
194k
{
769
194k
    PyObject *res = PyUnicode_New(count, Py_UNICODE_REPLACEMENT_CHARACTER);
770
194k
    if (res == NULL) {
771
0
        return NULL;
772
0
    }
773
194k
    assert(count == 0 || PyUnicode_KIND(res) == PyUnicode_2BYTE_KIND);
774
194k
    Py_UCS2 *outp = PyUnicode_2BYTE_DATA(res);
775
388k
    for (Py_ssize_t i = 0; i < count; ++i) {
776
194k
        outp[i] = Py_UNICODE_REPLACEMENT_CHARACTER;
777
194k
    }
778
194k
    assert(_PyUnicode_CheckConsistency(res, 1));
779
194k
    return res;
780
194k
}
781
782
783
// --- handler: 'strict' ------------------------------------------------------
784
785
PyObject *PyCodec_StrictErrors(PyObject *exc)
{
    /* Always returns NULL: either exc itself is raised, or a TypeError
       when exc is not an exception instance. */
    if (!PyExceptionInstance_Check(exc)) {
        PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
        return NULL;
    }
    PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
    return NULL;
}
795
796
797
// --- handler: 'ignore' ------------------------------------------------------
798
799
static PyObject *
800
_PyCodec_IgnoreError(PyObject *exc, int as_bytes)
801
0
{
802
0
    Py_ssize_t end;
803
0
    if (_PyUnicodeError_GetParams(exc, NULL, NULL, NULL,
804
0
                                  &end, NULL, as_bytes) < 0)
805
0
    {
806
0
        return NULL;
807
0
    }
808
0
    return Py_BuildValue("(Nn)", Py_GetConstant(Py_CONSTANT_EMPTY_STR), end);
809
0
}
810
811
812
PyObject *PyCodec_IgnoreErrors(PyObject *exc)
{
    /* Encode and translate errors are handled with as_bytes false,
       decode errors with as_bytes true. */
    int as_bytes;
    if (_PyIsUnicodeEncodeError(exc) || _PyIsUnicodeTranslateError(exc)) {
        as_bytes = false;
    }
    else if (_PyIsUnicodeDecodeError(exc)) {
        as_bytes = true;
    }
    else {
        wrong_exception_type(exc);
        return NULL;
    }
    return _PyCodec_IgnoreError(exc, as_bytes);
}
825
826
827
// --- handler: 'replace' -----------------------------------------------------
828
829
static PyObject *
830
_PyCodec_ReplaceUnicodeEncodeError(PyObject *exc)
831
0
{
832
0
    Py_ssize_t start, end, slen;
833
0
    if (_PyUnicodeError_GetParams(exc, NULL, NULL,
834
0
                                  &start, &end, &slen, false) < 0)
835
0
    {
836
0
        return NULL;
837
0
    }
838
0
    PyObject *res = PyUnicode_New(slen, '?');
839
0
    if (res == NULL) {
840
0
        return NULL;
841
0
    }
842
0
    assert(PyUnicode_KIND(res) == PyUnicode_1BYTE_KIND);
843
0
    Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
844
0
    memset(outp, '?', sizeof(Py_UCS1) * slen);
845
0
    assert(_PyUnicode_CheckConsistency(res, 1));
846
0
    return Py_BuildValue("(Nn)", res, end);
847
0
}
848
849
850
static PyObject *
851
_PyCodec_ReplaceUnicodeDecodeError(PyObject *exc)
852
194k
{
853
194k
    Py_ssize_t end;
854
194k
    if (PyUnicodeDecodeError_GetEnd(exc, &end) < 0) {
855
0
        return NULL;
856
0
    }
857
194k
    PyObject *res = codec_handler_unicode_replacement_character(1);
858
194k
    if (res == NULL) {
859
0
        return NULL;
860
0
    }
861
194k
    return Py_BuildValue("(Nn)", res, end);
862
194k
}
863
864
865
static PyObject *
866
_PyCodec_ReplaceUnicodeTranslateError(PyObject *exc)
867
0
{
868
0
    Py_ssize_t start, end, slen;
869
0
    if (_PyUnicodeError_GetParams(exc, NULL, NULL,
870
0
                                  &start, &end, &slen, false) < 0)
871
0
    {
872
0
        return NULL;
873
0
    }
874
0
    PyObject *res = codec_handler_unicode_replacement_character(slen);
875
0
    if (res == NULL) {
876
0
        return NULL;
877
0
    }
878
0
    return Py_BuildValue("(Nn)", res, end);
879
0
}
880
881
882
PyObject *PyCodec_ReplaceErrors(PyObject *exc)
{
    /* Dispatch on the exception type; the checks run in the order
       encode, decode, translate. */
    if (_PyIsUnicodeEncodeError(exc)) {
        return _PyCodec_ReplaceUnicodeEncodeError(exc);
    }
    if (_PyIsUnicodeDecodeError(exc)) {
        return _PyCodec_ReplaceUnicodeDecodeError(exc);
    }
    if (_PyIsUnicodeTranslateError(exc)) {
        return _PyCodec_ReplaceUnicodeTranslateError(exc);
    }
    wrong_exception_type(exc);
    return NULL;
}
898
899
900
// --- handler: 'xmlcharrefreplace' -------------------------------------------
901
902
/*
 * Public entry point of the 'xmlcharrefreplace' error handler.
 *
 * Only UnicodeEncodeError objects are accepted. Every character in the
 * [start, end) range of the error is replaced by "&#" + DECIMAL + ";".
 * Returns a new "(replacement, end)" tuple, or NULL with an exception set.
 */
PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
{
    if (!_PyIsUnicodeEncodeError(exc)) {
        wrong_exception_type(exc);
        return NULL;
    }

    PyObject *obj;
    Py_ssize_t objlen, start, end, slen;
    int rc = _PyUnicodeError_GetParams(exc,
                                       &obj, &objlen,
                                       &start, &end, &slen, false);
    if (rc < 0) {
        return NULL;
    }

    // A character 'ch' expands to 2 + k + 1 characters, where k is its
    // number of decimal digits: "&#" + DIGITS + ";". Code points are below
    // 10^7, hence k <= 7 and a character never needs more than 2 + 7 + 1
    // output characters. Shrink the handled range so that the total size
    // cannot exceed PY_SSIZE_T_MAX.
    const Py_ssize_t max_chars = PY_SSIZE_T_MAX / (2 + 7 + 1);
    if (slen > max_chars) {
        end = start + max_chars;
        end = Py_MIN(end, objlen);
        slen = Py_MAX(0, end - start);
    }

    /* first pass: compute the exact size of the replacement */
    Py_ssize_t total = 0;
    for (Py_ssize_t pos = start; pos < end; pos++) {
        Py_UCS4 ch = PyUnicode_READ_CHAR(obj, pos);
        int ndigits = n_decimal_digits_for_codepoint(ch);
        assert(ndigits != 0);
        assert(ndigits <= 7);
        total += 2 + ndigits + 1;
    }

    /* allocate the (pure ASCII) replacement */
    PyObject *res = PyUnicode_New(total, 127);
    if (res == NULL) {
        Py_DECREF(obj);
        return NULL;
    }

    /* second pass: write one "&#DIGITS;" group per character */
    Py_UCS1 *cursor = PyUnicode_1BYTE_DATA(res);
    for (Py_ssize_t pos = start; pos < end; pos++) {
        Py_UCS4 ch = PyUnicode_READ_CHAR(obj, pos);
        *cursor++ = '&';
        *cursor++ = '#';
        // The digits are produced least-significant first, so the digit
        // area is filled from its last slot down to its first one.
        int ndigits = n_decimal_digits_for_codepoint(ch);
        for (int d = ndigits - 1; d >= 0; d--) {
            cursor[d] = '0' + (ch % 10);
            ch /= 10;
        }
        assert(ch == 0);
        cursor += ndigits;
        *cursor++ = ';';
    }
    assert(_PyUnicode_CheckConsistency(res, 1));

    PyObject *restuple = Py_BuildValue("(Nn)", res, end);
    Py_DECREF(obj);
    return restuple;
}
968
969
970
// --- handler: 'backslashreplace' --------------------------------------------
971
972
/*
 * 'backslashreplace' handler for UnicodeEncodeError objects (also used
 * for UnicodeTranslateError objects).
 *
 * Each character in the [start, end) range of the error is written by
 * codec_handler_write_unicode_hex(); the space it needs is given by
 * codec_handler_unicode_hex_width(). Returns a new "(replacement, end)"
 * tuple, or NULL with an exception set.
 */
static PyObject *
_PyCodec_BackslashReplaceUnicodeEncodeError(PyObject *exc)
{
    PyObject *obj;
    Py_ssize_t objlen, start, end, slen;
    int rc = _PyUnicodeError_GetParams(exc,
                                       &obj, &objlen,
                                       &start, &end, &slen, false);
    if (rc < 0) {
        return NULL;
    }

    // An escape is "\\" + ('U'|'u'|'x') + HEXDIGITS with 2, 4 or 8 hex
    // digits, i.e. at most 1 + 1 + 8 characters per input character.
    // Shrink the handled range so that the total size cannot exceed
    // PY_SSIZE_T_MAX.
    const Py_ssize_t max_chars = PY_SSIZE_T_MAX / (1 + 1 + 8);
    if (slen > max_chars) {
        end = start + max_chars;
        end = Py_MIN(end, objlen);
        slen = Py_MAX(0, end - start);
    }

    /* first pass: compute the exact size of the replacement */
    Py_ssize_t total = 0;
    for (Py_ssize_t pos = start; pos < end; pos++) {
        Py_UCS4 ch = PyUnicode_READ_CHAR(obj, pos);
        total += codec_handler_unicode_hex_width(ch);
    }

    PyObject *res = PyUnicode_New(total, 127);
    if (res == NULL) {
        Py_DECREF(obj);
        return NULL;
    }

    /* second pass: emit the escapes; the helper advances 'cursor' */
    Py_UCS1 *cursor = PyUnicode_1BYTE_DATA(res);
    for (Py_ssize_t pos = start; pos < end; pos++) {
        Py_UCS4 ch = PyUnicode_READ_CHAR(obj, pos);
        codec_handler_write_unicode_hex(&cursor, ch);
    }
    assert(_PyUnicode_CheckConsistency(res, 1));

    Py_DECREF(obj);
    return Py_BuildValue("(Nn)", res, end);
}
1015
1016
1017
/*
 * 'backslashreplace' handler for UnicodeDecodeError objects.
 *
 * Each byte in the [start, end) range of the error is replaced by the four
 * ASCII characters "\xNN" (NN = lowercase hexadecimal value of the byte).
 * Returns a new "(replacement, end)" tuple, or NULL with an exception set.
 *
 * If the range is so large that 4 * slen would overflow Py_ssize_t, only a
 * prefix of the range is handled and the returned position is adjusted
 * accordingly, like the other size-expanding handlers in this file do.
 */
static PyObject *
_PyCodec_BackslashReplaceUnicodeDecodeError(PyObject *exc)
{
    PyObject *obj;
    Py_ssize_t objlen, start, end, slen;
    if (_PyUnicodeError_GetParams(exc,
                                  &obj, &objlen,
                                  &start, &end, &slen, true) < 0)
    {
        return NULL;
    }

    // Every byte expands to exactly 4 characters ("\\" + "x" + 2 hex
    // digits). Clamp the range first so that '4 * slen' below cannot
    // overflow (signed overflow is undefined behavior).
    if (slen > PY_SSIZE_T_MAX / 4) {
        end = start + PY_SSIZE_T_MAX / 4;
        end = Py_MIN(end, objlen);
        slen = Py_MAX(0, end - start);
    }

    PyObject *res = PyUnicode_New(4 * slen, 127);
    if (res == NULL) {
        Py_DECREF(obj);
        return NULL;
    }

    Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
    const unsigned char *p = (const unsigned char *)PyBytes_AS_STRING(obj);
    for (Py_ssize_t i = start; i < end; i++, outp += 4) {
        const unsigned char ch = p[i];
        outp[0] = '\\';
        outp[1] = 'x';
        outp[2] = Py_hexdigits[(ch >> 4) & 0xf];    // high nibble
        outp[3] = Py_hexdigits[ch & 0xf];           // low nibble
    }
    assert(_PyUnicode_CheckConsistency(res, 1));
    Py_DECREF(obj);
    // The "N" format unit hands our reference to 'res' over to the tuple.
    return Py_BuildValue("(Nn)", res, end);
}
1048
1049
1050
/*
 * 'backslashreplace' handler for UnicodeTranslateError objects.
 *
 * Translate errors are handled exactly like encode errors, so this simply
 * forwards to the UnicodeEncodeError implementation.
 */
static inline PyObject *
_PyCodec_BackslashReplaceUnicodeTranslateError(PyObject *exc)
{
    PyObject *restuple = _PyCodec_BackslashReplaceUnicodeEncodeError(exc);
    return restuple;
}
1056
1057
1058
/*
 * Public entry point of the 'backslashreplace' error handler.
 *
 * Dispatches on the concrete Unicode error type of 'exc' (encode, decode
 * or translate). For any other object, wrong_exception_type() is called
 * and NULL is returned.
 */
PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
{
    if (_PyIsUnicodeEncodeError(exc)) {
        return _PyCodec_BackslashReplaceUnicodeEncodeError(exc);
    }
    if (_PyIsUnicodeDecodeError(exc)) {
        return _PyCodec_BackslashReplaceUnicodeDecodeError(exc);
    }
    if (_PyIsUnicodeTranslateError(exc)) {
        return _PyCodec_BackslashReplaceUnicodeTranslateError(exc);
    }

    // Not a Unicode error we know how to handle.
    wrong_exception_type(exc);
    return NULL;
}
1074
1075
1076
// --- handler: 'namereplace' -------------------------------------------------
1077
1078
/*
 * Public entry point of the 'namereplace' error handler.
 *
 * Only UnicodeEncodeError objects are accepted. Each character in the
 * handled range becomes "\N{NAME}" when getname() knows a name for it;
 * otherwise it is written by codec_handler_write_unicode_hex().
 * Returns a new "(replacement, position)" tuple, or NULL with an exception
 * set. The position is the index of the first character NOT handled, which
 * is smaller than 'end' when the result would exceed PY_SSIZE_T_MAX.
 */
PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
{
    if (!_PyIsUnicodeEncodeError(exc)) {
        wrong_exception_type(exc);
        return NULL;
    }

    _PyUnicode_Name_CAPI *ucnhash_capi = _PyUnicode_GetNameCAPI();
    if (ucnhash_capi == NULL) {
        return NULL;
    }

    PyObject *obj;
    Py_ssize_t start, end;
    int rc = _PyUnicodeError_GetParams(exc,
                                       &obj, NULL,
                                       &start, &end, NULL, false);
    if (rc < 0) {
        return NULL;
    }

    char buffer[256]; /* NAME_MAXLEN in unicodename_db.h */

    /* first pass: find how many characters fit and the size they need */
    Py_ssize_t stop = start;
    Py_ssize_t total = 0;
    while (stop < end) {
        Py_UCS4 ch = PyUnicode_READ_CHAR(obj, stop);
        Py_ssize_t needed;
        if (ucnhash_capi->getname(ch, buffer, sizeof(buffer), 1)) {
            // Named character: '\\' + 'N' + '{' + NAME + '}', that is
            // 1 + 1 + 1 + len(NAME) + 1 characters.
            needed = 1 + 1 + 1 + strlen(buffer) + 1;
        }
        else {
            // getname() failures are not errors: fall back to a hex escape.
            needed = codec_handler_unicode_hex_width(ch);
        }
        if (total > PY_SSIZE_T_MAX - needed) {
            // Adding this character would overflow: stop before it.
            break;
        }
        total += needed;
        stop++;
    }

    PyObject *res = PyUnicode_New(total, 127);
    if (res == NULL) {
        Py_DECREF(obj);
        return NULL;
    }

    /* second pass: write the replacement for characters [start, stop) */
    Py_UCS1 *cursor = PyUnicode_1BYTE_DATA(res);
    for (Py_ssize_t pos = start; pos < stop; pos++) {
        Py_UCS4 ch = PyUnicode_READ_CHAR(obj, pos);
        if (!ucnhash_capi->getname(ch, buffer, sizeof(buffer), 1)) {
            codec_handler_write_unicode_hex(&cursor, ch);
            continue;
        }
        *cursor++ = '\\';
        *cursor++ = 'N';
        *cursor++ = '{';
        (void)strcpy((char *)cursor, buffer);
        cursor += strlen(buffer);
        // Overwrites the NUL terminator that strcpy() just stored.
        *cursor++ = '}';
    }

    assert(cursor == PyUnicode_1BYTE_DATA(res) + total);
    assert(_PyUnicode_CheckConsistency(res, 1));
    PyObject *restuple = Py_BuildValue("(Nn)", res, stop);
    Py_DECREF(obj);
    return restuple;
}
1146
1147
1148
0
/* Encoding codes returned by get_standard_encoding_impl().
   ENC_UNKNOWN means that the encoding name was not recognized.
   The negative value is parenthesized so that the macro always expands
   to a single primary expression. */
#define ENC_UNKNOWN     (-1)
#define ENC_UTF8        0
#define ENC_UTF16BE     1
#define ENC_UTF16LE     2
#define ENC_UTF32BE     3
#define ENC_UTF32LE     4
1154
1155
static int
1156
get_standard_encoding_impl(const char *encoding, int *bytelength)
1157
0
{
1158
0
    if (Py_TOLOWER(encoding[0]) == 'u' &&
1159
0
        Py_TOLOWER(encoding[1]) == 't' &&
1160
0
        Py_TOLOWER(encoding[2]) == 'f') {
1161
0
        encoding += 3;
1162
0
        if (*encoding == '-' || *encoding == '_' )
1163
0
            encoding++;
1164
0
        if (encoding[0] == '8' && encoding[1] == '\0') {
1165
0
            *bytelength = 3;
1166
0
            return ENC_UTF8;
1167
0
        }
1168
0
        else if (encoding[0] == '1' && encoding[1] == '6') {
1169
0
            encoding += 2;
1170
0
            *bytelength = 2;
1171
0
            if (*encoding == '\0') {
1172
#ifdef WORDS_BIGENDIAN
1173
                return ENC_UTF16BE;
1174
#else
1175
0
                return ENC_UTF16LE;
1176
0
#endif
1177
0
            }
1178
0
            if (*encoding == '-' || *encoding == '_' )
1179
0
                encoding++;
1180
0
            if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
1181
0
                if (Py_TOLOWER(encoding[0]) == 'b')
1182
0
                    return ENC_UTF16BE;
1183
0
                if (Py_TOLOWER(encoding[0]) == 'l')
1184
0
                    return ENC_UTF16LE;
1185
0
            }
1186
0
        }
1187
0
        else if (encoding[0] == '3' && encoding[1] == '2') {
1188
0
            encoding += 2;
1189
0
            *bytelength = 4;
1190
0
            if (*encoding == '\0') {
1191
#ifdef WORDS_BIGENDIAN
1192
                return ENC_UTF32BE;
1193
#else
1194
0
                return ENC_UTF32LE;
1195
0
#endif
1196
0
            }
1197
0
            if (*encoding == '-' || *encoding == '_' )
1198
0
                encoding++;
1199
0
            if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
1200
0
                if (Py_TOLOWER(encoding[0]) == 'b')
1201
0
                    return ENC_UTF32BE;
1202
0
                if (Py_TOLOWER(encoding[0]) == 'l')
1203
0
                    return ENC_UTF32LE;
1204
0
            }
1205
0
        }
1206
0
    }
1207
0
    else if (strcmp(encoding, "CP_UTF8") == 0) {
1208
0
        *bytelength = 3;
1209
0
        return ENC_UTF8;
1210
0
    }
1211
0
    return ENC_UNKNOWN;
1212
0
}
1213
1214
1215
static int
1216
get_standard_encoding(PyObject *encoding, int *code, int *bytelength)
1217
0
{
1218
0
    const char *encoding_cstr = PyUnicode_AsUTF8(encoding);
1219
0
    if (encoding_cstr == NULL) {
1220
0
        return -1;
1221
0
    }
1222
0
    *code = get_standard_encoding_impl(encoding_cstr, bytelength);
1223
0
    return 0;
1224
0
}
1225
1226
1227
// --- handler: 'surrogatepass' -----------------------------------------------
1228
1229
/*
 * 'surrogatepass' handler for UnicodeEncodeError objects.
 *
 * For the encodings recognized by get_standard_encoding(), every surrogate
 * code point in the [start, end) range of the error is encoded as if it
 * were a regular code point. Returns a new "(bytes, end)" tuple.
 *
 * If the encoding is not recognized, or if the range contains a character
 * that is not a surrogate, the original exception 'exc' is raised again
 * and NULL is returned.
 */
static PyObject *
_PyCodec_SurrogatePassUnicodeEncodeError(PyObject *exc)
{
    PyObject *encoding = PyUnicodeEncodeError_GetEncoding(exc);
    if (encoding == NULL) {
        return NULL;
    }
    int code, bytelength;
    int status = get_standard_encoding(encoding, &code, &bytelength);
    Py_DECREF(encoding);
    if (status < 0) {
        return NULL;
    }
    if (code == ENC_UNKNOWN) {
        /* Unsupported encoding, fail with original exception */
        PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
        return NULL;
    }

    PyObject *obj;
    Py_ssize_t objlen, start, end, slen;
    status = _PyUnicodeError_GetParams(exc,
                                       &obj, &objlen,
                                       &start, &end, &slen, false);
    if (status < 0) {
        return NULL;
    }

    // Each character produces exactly 'bytelength' bytes: shrink the
    // handled range so that the total size cannot exceed PY_SSIZE_T_MAX.
    const Py_ssize_t max_chars = PY_SSIZE_T_MAX / bytelength;
    if (slen > max_chars) {
        end = start + max_chars;
        end = Py_MIN(end, objlen);
        slen = Py_MAX(0, end - start);
    }

    PyObject *res = PyBytes_FromStringAndSize(NULL, bytelength * slen);
    if (res == NULL) {
        Py_DECREF(obj);
        return NULL;
    }

    unsigned char *cursor = (unsigned char *)PyBytes_AsString(res);
    for (Py_ssize_t pos = start; pos < end; pos++) {
        Py_UCS4 ch = PyUnicode_READ_CHAR(obj, pos);
        if (!Py_UNICODE_IS_SURROGATE(ch)) {
            /* Not a surrogate, fail with original exception */
            Py_DECREF(obj);
            Py_DECREF(res);
            PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
            return NULL;
        }
        switch (code) {
            case ENC_UTF8:
                /* three-byte UTF-8 sequence */
                cursor[0] = (unsigned char)(0xe0 | (ch >> 12));
                cursor[1] = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
                cursor[2] = (unsigned char)(0x80 | (ch & 0x3f));
                cursor += 3;
                break;
            case ENC_UTF16BE:
                cursor[0] = (unsigned char)(ch >> 8);
                cursor[1] = (unsigned char)ch;
                cursor += 2;
                break;
            case ENC_UTF16LE:
                cursor[0] = (unsigned char)ch;
                cursor[1] = (unsigned char)(ch >> 8);
                cursor += 2;
                break;
            case ENC_UTF32BE:
                cursor[0] = (unsigned char)(ch >> 24);
                cursor[1] = (unsigned char)(ch >> 16);
                cursor[2] = (unsigned char)(ch >> 8);
                cursor[3] = (unsigned char)ch;
                cursor += 4;
                break;
            case ENC_UTF32LE:
                cursor[0] = (unsigned char)ch;
                cursor[1] = (unsigned char)(ch >> 8);
                cursor[2] = (unsigned char)(ch >> 16);
                cursor[3] = (unsigned char)(ch >> 24);
                cursor += 4;
                break;
            default:
                break;
        }
    }

    Py_DECREF(obj);
    // The "N" format unit hands our reference to 'res' over to the tuple.
    return Py_BuildValue("(Nn)", res, end);
}
1318
1319
1320
/*
 * 'surrogatepass' handler for UnicodeDecodeError objects.
 *
 * For the encodings recognized by get_standard_encoding(), tries to decode
 * ONE code point of 'bytelength' bytes at position 'start' of the input.
 * If it is a surrogate, returns a new "(str, start + bytelength)" tuple
 * holding that single character.
 *
 * If the encoding is not recognized, if there are not enough bytes left,
 * or if the decoded value is not a surrogate, the original exception 'exc'
 * is raised again and NULL is returned.
 */
static PyObject *
_PyCodec_SurrogatePassUnicodeDecodeError(PyObject *exc)
{
    PyObject *encoding = PyUnicodeDecodeError_GetEncoding(exc);
    if (encoding == NULL) {
        return NULL;
    }
    int code, bytelength;
    int rc = get_standard_encoding(encoding, &code, &bytelength);
    Py_DECREF(encoding);
    if (rc < 0) {
        return NULL;
    }
    if (code == ENC_UNKNOWN) {
        goto bail;
    }

    PyObject *obj;
    Py_ssize_t objlen, start, end, slen;
    if (_PyUnicodeError_GetParams(exc,
                                  &obj, &objlen,
                                  &start, &end, &slen, true) < 0)
    {
        return NULL;
    }

    /* Try decoding a single surrogate character. If
       there are more, let the codec call us again. */
    Py_UCS4 ch = 0;
    const unsigned char *p = (const unsigned char *)PyBytes_AS_STRING(obj);
    p += start;

    // 'ch' stays 0 (not a surrogate) when fewer than 'bytelength' bytes
    // remain or when the UTF-8 bytes do not form a three-byte sequence.
    if (objlen - start >= bytelength) {
        switch (code) {
            case ENC_UTF8: {
                if ((p[0] & 0xf0) == 0xe0 &&
                    (p[1] & 0xc0) == 0x80 &&
                    (p[2] & 0xc0) == 0x80)
                {
                    /* it's a three-byte code */
                    ch = ((p[0] & 0x0f) << 12) +
                         ((p[1] & 0x3f) << 6)  +
                          (p[2] & 0x3f);
                }
                break;
            }
            case ENC_UTF16LE: {
                ch = p[1] << 8 | p[0];
                break;
            }
            case ENC_UTF16BE: {
                ch = p[0] << 8 | p[1];
                break;
            }
            // For UTF-32 the bytes are widened to Py_UCS4 before being
            // shifted: an 'unsigned char' is promoted to (signed) 'int',
            // and shifting a value >= 0x80 left by 24 bits would overflow
            // it, which is undefined behavior.
            case ENC_UTF32LE: {
                ch = ((Py_UCS4)p[3] << 24) | ((Py_UCS4)p[2] << 16) |
                     ((Py_UCS4)p[1] << 8)  |  (Py_UCS4)p[0];
                break;
            }
            case ENC_UTF32BE: {
                ch = ((Py_UCS4)p[0] << 24) | ((Py_UCS4)p[1] << 16) |
                     ((Py_UCS4)p[2] << 8)  |  (Py_UCS4)p[3];
                break;
            }
        }
    }
    Py_DECREF(obj);
    if (!Py_UNICODE_IS_SURROGATE(ch)) {
        goto bail;
    }

    PyObject *res = PyUnicode_FromOrdinal(ch);
    if (res == NULL) {
        return NULL;
    }
    // The "N" format unit hands our reference to 'res' over to the tuple.
    return Py_BuildValue("(Nn)", res, start + bytelength);

bail:
    /* Cannot handle this error, fail with original exception */
    PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
    return NULL;
}
1399
1400
1401
/* This handler is declared static until someone demonstrates
1402
   a need to call it directly. */
1403
static PyObject *
1404
PyCodec_SurrogatePassErrors(PyObject *exc)
1405
0
{
1406
0
    if (_PyIsUnicodeEncodeError(exc)) {
1407
0
        return _PyCodec_SurrogatePassUnicodeEncodeError(exc);
1408
0
    }
1409
0
    else if (_PyIsUnicodeDecodeError(exc)) {
1410
0
        return _PyCodec_SurrogatePassUnicodeDecodeError(exc);
1411
0
    }
1412
0
    else {
1413
0
        wrong_exception_type(exc);
1414
0
        return NULL;
1415
0
    }
1416
0
}
1417
1418
1419
// --- handler: 'surrogateescape' ---------------------------------------------
1420
1421
/*
 * 'surrogateescape' handler for UnicodeEncodeError objects.
 *
 * Every character in the [start, end) range of the error must lie in
 * U+DC80..U+DCFF; it is turned back into the single byte 'ch - 0xdc00'.
 * Returns a new "(bytes, end)" tuple.
 *
 * If any character is outside that range, the original exception 'exc' is
 * raised again and NULL is returned.
 */
static PyObject *
_PyCodec_SurrogateEscapeUnicodeEncodeError(PyObject *exc)
{
    PyObject *obj;
    Py_ssize_t start, end, slen;
    int rc = _PyUnicodeError_GetParams(exc,
                                       &obj, NULL,
                                       &start, &end, &slen, false);
    if (rc < 0) {
        return NULL;
    }

    /* one output byte per input character */
    PyObject *res = PyBytes_FromStringAndSize(NULL, slen);
    if (res == NULL) {
        Py_DECREF(obj);
        return NULL;
    }

    char *cursor = PyBytes_AsString(res);
    for (Py_ssize_t pos = start; pos < end; pos++) {
        Py_UCS4 ch = PyUnicode_READ_CHAR(obj, pos);
        if (!(ch >= 0xdc80 && ch <= 0xdcff)) {
            /* Not a UTF-8b surrogate, fail with original exception. */
            Py_DECREF(obj);
            Py_DECREF(res);
            PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
            return NULL;
        }
        *cursor = ch - 0xdc00;
        cursor++;
    }
    Py_DECREF(obj);

    // The "N" format unit hands our reference to 'res' over to the tuple.
    return Py_BuildValue("(Nn)", res, end);
}
1455
1456
1457
/*
 * 'surrogateescape' handler for UnicodeDecodeError objects.
 *
 * Starting at 'start', up to 4 consecutive bytes >= 128 (and at most
 * 'slen' bytes) are each mapped to the code point 0xdc00 + byte.
 * Returns a new "(str, start + consumed)" tuple.
 *
 * If no byte could be escaped, the original exception 'exc' is raised
 * again and NULL is returned.
 */
static PyObject *
_PyCodec_SurrogateEscapeUnicodeDecodeError(PyObject *exc)
{
    PyObject *obj;
    Py_ssize_t start, end, slen;
    int rc = _PyUnicodeError_GetParams(exc,
                                       &obj, NULL,
                                       &start, &end, &slen, true);
    if (rc < 0) {
        return NULL;
    }

    Py_UCS2 escaped[4]; /* decode up to 4 bad bytes. */
    const unsigned char *bytes = (const unsigned char *)PyBytes_AS_STRING(obj);
    int consumed;
    for (consumed = 0; consumed < 4 && consumed < slen; consumed++) {
        const unsigned char byte = bytes[start + consumed];
        /* Refuse to escape ASCII bytes. */
        if (byte < 128) {
            break;
        }
        escaped[consumed] = 0xdc00 + byte;
    }
    Py_DECREF(obj);

    if (consumed == 0) {
        /* Codec complained about ASCII byte. */
        PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
        return NULL;
    }

    PyObject *str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
                                              escaped, consumed);
    if (str != NULL) {
        // The "N" format unit hands our reference over to the tuple.
        return Py_BuildValue("(Nn)", str, start + consumed);
    }
    return NULL;
}
1494
1495
1496
/*
 * Entry point of the 'surrogateescape' error handler.
 *
 * Dispatches on the concrete Unicode error type of 'exc' (encode or
 * decode). For any other object, wrong_exception_type() is called and
 * NULL is returned.
 */
static PyObject *
PyCodec_SurrogateEscapeErrors(PyObject *exc)
{
    if (_PyIsUnicodeEncodeError(exc)) {
        return _PyCodec_SurrogateEscapeUnicodeEncodeError(exc);
    }
    if (_PyIsUnicodeDecodeError(exc)) {
        return _PyCodec_SurrogateEscapeUnicodeDecodeError(exc);
    }

    // Neither an encode nor a decode error.
    wrong_exception_type(exc);
    return NULL;
}
1510
1511
1512
// --- Codecs registry handlers -----------------------------------------------
1513
1514
/* Registry callback for "strict": forwards 'exc' to PyCodec_StrictErrors(). */
static inline PyObject *
strict_errors(PyObject *Py_UNUSED(self), PyObject *exc)
{
    PyObject *result = PyCodec_StrictErrors(exc);
    return result;
}
1519
1520
1521
/* Registry callback for "ignore": forwards 'exc' to PyCodec_IgnoreErrors(). */
static inline PyObject *
ignore_errors(PyObject *Py_UNUSED(self), PyObject *exc)
{
    PyObject *result = PyCodec_IgnoreErrors(exc);
    return result;
}
1526
1527
1528
/* Registry callback for "replace": forwards 'exc' to PyCodec_ReplaceErrors(). */
static inline PyObject *
replace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
{
    PyObject *result = PyCodec_ReplaceErrors(exc);
    return result;
}
1533
1534
1535
/* Registry callback for "xmlcharrefreplace": forwards 'exc' to
   PyCodec_XMLCharRefReplaceErrors(). */
static inline PyObject *
xmlcharrefreplace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
{
    PyObject *result = PyCodec_XMLCharRefReplaceErrors(exc);
    return result;
}
1540
1541
1542
/* Registry callback for "backslashreplace": forwards 'exc' to
   PyCodec_BackslashReplaceErrors(). */
static inline PyObject *
backslashreplace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
{
    PyObject *result = PyCodec_BackslashReplaceErrors(exc);
    return result;
}
1547
1548
1549
/* Registry callback for "namereplace": forwards 'exc' to
   PyCodec_NameReplaceErrors(). */
static inline PyObject *
namereplace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
{
    PyObject *result = PyCodec_NameReplaceErrors(exc);
    return result;
}
1554
1555
1556
/* Registry callback for "surrogatepass": forwards 'exc' to
   PyCodec_SurrogatePassErrors(). */
static inline PyObject *
surrogatepass_errors(PyObject *Py_UNUSED(self), PyObject *exc)
{
    PyObject *result = PyCodec_SurrogatePassErrors(exc);
    return result;
}
1561
1562
1563
/* Registry callback for "surrogateescape": forwards 'exc' to
   PyCodec_SurrogateEscapeErrors(). */
static inline PyObject *
surrogateescape_errors(PyObject *Py_UNUSED(self), PyObject *exc)
{
    PyObject *result = PyCodec_SurrogateEscapeErrors(exc);
    return result;
}
1568
1569
1570
PyStatus
1571
_PyCodec_InitRegistry(PyInterpreterState *interp)
1572
16
{
1573
16
    static struct {
1574
16
        const char *name;
1575
16
        PyMethodDef def;
1576
16
    } methods[] =
1577
16
    {
1578
16
        {
1579
16
            "strict",
1580
16
            {
1581
16
                "strict_errors",
1582
16
                strict_errors,
1583
16
                METH_O,
1584
16
                PyDoc_STR("Implements the 'strict' error handling, which "
1585
16
                          "raises a UnicodeError on coding errors.")
1586
16
            }
1587
16
        },
1588
16
        {
1589
16
            "ignore",
1590
16
            {
1591
16
                "ignore_errors",
1592
16
                ignore_errors,
1593
16
                METH_O,
1594
16
                PyDoc_STR("Implements the 'ignore' error handling, which "
1595
16
                          "ignores malformed data and continues.")
1596
16
            }
1597
16
        },
1598
16
        {
1599
16
            "replace",
1600
16
            {
1601
16
                "replace_errors",
1602
16
                replace_errors,
1603
16
                METH_O,
1604
16
                PyDoc_STR("Implements the 'replace' error handling, which "
1605
16
                          "replaces malformed data with a replacement marker.")
1606
16
            }
1607
16
        },
1608
16
        {
1609
16
            "xmlcharrefreplace",
1610
16
            {
1611
16
                "xmlcharrefreplace_errors",
1612
16
                xmlcharrefreplace_errors,
1613
16
                METH_O,
1614
16
                PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
1615
16
                          "which replaces an unencodable character with the "
1616
16
                          "appropriate XML character reference.")
1617
16
            }
1618
16
        },
1619
16
        {
1620
16
            "backslashreplace",
1621
16
            {
1622
16
                "backslashreplace_errors",
1623
16
                backslashreplace_errors,
1624
16
                METH_O,
1625
16
                PyDoc_STR("Implements the 'backslashreplace' error handling, "
1626
16
                          "which replaces malformed data with a backslashed "
1627
16
                          "escape sequence.")
1628
16
            }
1629
16
        },
1630
16
        {
1631
16
            "namereplace",
1632
16
            {
1633
16
                "namereplace_errors",
1634
16
                namereplace_errors,
1635
16
                METH_O,
1636
16
                PyDoc_STR("Implements the 'namereplace' error handling, "
1637
16
                          "which replaces an unencodable character with a "
1638
16
                          "\\N{...} escape sequence.")
1639
16
            }
1640
16
        },
1641
16
        {
1642
16
            "surrogatepass",
1643
16
            {
1644
16
                "surrogatepass",
1645
16
                surrogatepass_errors,
1646
16
                METH_O
1647
16
            }
1648
16
        },
1649
16
        {
1650
16
            "surrogateescape",
1651
16
            {
1652
16
                "surrogateescape",
1653
16
                surrogateescape_errors,
1654
16
                METH_O
1655
16
            }
1656
16
        }
1657
16
    };
1658
    // ensure that the built-in error handlers' names are kept in sync
1659
16
    assert(Py_ARRAY_LENGTH(methods) == Py_ARRAY_LENGTH(codecs_builtin_error_handlers));
1660
1661
16
    assert(interp->codecs.initialized == 0);
1662
16
    interp->codecs.search_path = PyList_New(0);
1663
16
    if (interp->codecs.search_path == NULL) {
1664
0
        return PyStatus_NoMemory();
1665
0
    }
1666
16
    interp->codecs.search_cache = PyDict_New();
1667
16
    if (interp->codecs.search_cache == NULL) {
1668
0
        return PyStatus_NoMemory();
1669
0
    }
1670
16
    interp->codecs.error_registry = PyDict_New();
1671
16
    if (interp->codecs.error_registry == NULL) {
1672
0
        return PyStatus_NoMemory();
1673
0
    }
1674
144
    for (size_t i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
1675
128
        PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL);
1676
128
        if (func == NULL) {
1677
0
            return PyStatus_NoMemory();
1678
0
        }
1679
1680
128
        int res = PyDict_SetItemString(interp->codecs.error_registry,
1681
128
                                       methods[i].name, func);
1682
128
        Py_DECREF(func);
1683
128
        if (res < 0) {
1684
0
            return PyStatus_Error("Failed to insert into codec error registry");
1685
0
        }
1686
128
    }
1687
1688
16
    interp->codecs.initialized = 1;
1689
1690
    // Importing `encodings' will call back into this module to register codec
1691
    // search functions, so this is done after everything else is initialized.
1692
16
    PyObject *mod = PyImport_ImportModule("encodings");
1693
16
    if (mod == NULL) {
1694
0
        return PyStatus_Error("Failed to import encodings module");
1695
0
    }
1696
16
    Py_DECREF(mod);
1697
1698
16
    return PyStatus_Ok();
1699
16
}
1700
1701
void
1702
_PyCodec_Fini(PyInterpreterState *interp)
1703
0
{
1704
0
    Py_CLEAR(interp->codecs.search_path);
1705
0
    Py_CLEAR(interp->codecs.search_cache);
1706
0
    Py_CLEAR(interp->codecs.error_registry);
1707
0
    interp->codecs.initialized = 0;
1708
0
}