Coverage Report

Created: 2026-06-09 06:31

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/cpython3/Python/codecs.c
Line
Count
Source
1
/* ------------------------------------------------------------------------
2
3
   Python Codec Registry and support functions
4
5
Written by Marc-Andre Lemburg (mal@lemburg.com).
6
7
Copyright (c) Corporation for National Research Initiatives.
8
9
   ------------------------------------------------------------------------ */
10
11
#include "Python.h"
12
#include "pycore_call.h"          // _PyObject_CallNoArgs()
13
#include "pycore_codecs.h"        // export _PyCodec_LookupTextEncoding()
14
#include "pycore_interp.h"        // PyInterpreterState.codec_search_path
15
#include "pycore_pyerrors.h"      // _PyErr_FormatNote()
16
#include "pycore_pystate.h"       // _PyInterpreterState_GET()
17
#include "pycore_runtime.h"       // _Py_ID()
18
#include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
19
#include "pycore_unicodeobject.h" // _PyUnicode_InternMortal()
20
#include "pycore_pyatomic_ft_wrappers.h"
21
22
static const char *codecs_builtin_error_handlers[] = {
23
    "strict", "ignore", "replace",
24
    "xmlcharrefreplace", "backslashreplace", "namereplace",
25
    "surrogatepass", "surrogateescape",
26
};
27
28
const char *Py_hexdigits = "0123456789abcdef";
29
30
/* --- Codec Registry ----------------------------------------------------- */
31
32
int PyCodec_Register(PyObject *search_function)
33
19
{
34
19
    PyInterpreterState *interp = _PyInterpreterState_GET();
35
19
    assert(interp->codecs.initialized);
36
19
    if (search_function == NULL) {
37
0
        PyErr_BadArgument();
38
0
        goto onError;
39
0
    }
40
19
    if (!PyCallable_Check(search_function)) {
41
0
        PyErr_SetString(PyExc_TypeError, "argument must be callable");
42
0
        goto onError;
43
0
    }
44
19
    FT_MUTEX_LOCK(&interp->codecs.search_path_mutex);
45
19
    int ret = PyList_Append(interp->codecs.search_path, search_function);
46
19
    FT_MUTEX_UNLOCK(&interp->codecs.search_path_mutex);
47
48
19
    return ret;
49
50
0
 onError:
51
0
    return -1;
52
19
}
53
54
int
55
PyCodec_Unregister(PyObject *search_function)
56
0
{
57
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
58
0
    if (interp->codecs.initialized != 1) {
59
        /* Do nothing if codecs state was cleared (only possible during
60
           interpreter shutdown). */
61
0
        return 0;
62
0
    }
63
64
0
    PyObject *codec_search_path = interp->codecs.search_path;
65
0
    assert(PyList_CheckExact(codec_search_path));
66
0
    for (Py_ssize_t i = 0; i < PyList_GET_SIZE(codec_search_path); i++) {
67
0
        FT_MUTEX_LOCK(&interp->codecs.search_path_mutex);
68
0
        PyObject *item = PyList_GetItemRef(codec_search_path, i);
69
0
        int ret = 1;
70
0
        if (item == search_function) {
71
            // We hold a reference to the item, so its destructor can't run
72
            // while we hold search_path_mutex.
73
0
            ret = PyList_SetSlice(codec_search_path, i, i+1, NULL);
74
0
        }
75
0
        FT_MUTEX_UNLOCK(&interp->codecs.search_path_mutex);
76
0
        Py_DECREF(item);
77
0
        if (ret != 1) {
78
0
            assert(interp->codecs.search_cache != NULL);
79
0
            assert(PyDict_CheckExact(interp->codecs.search_cache));
80
0
            PyDict_Clear(interp->codecs.search_cache);
81
0
            return ret;
82
0
        }
83
0
    }
84
0
    return 0;
85
0
}
86
87
/* Convert a string to a normalized Python string: all ASCII letters are
88
   converted to lower case, spaces are replaced with hyphens. */
89
90
static PyObject*
91
normalizestring(const char *string)
92
6.80k
{
93
6.80k
    size_t i;
94
6.80k
    size_t len = strlen(string);
95
6.80k
    char *p;
96
6.80k
    PyObject *v;
97
98
6.80k
    if (len > PY_SSIZE_T_MAX) {
99
0
        PyErr_SetString(PyExc_OverflowError, "string is too large");
100
0
        return NULL;
101
0
    }
102
103
6.80k
    p = PyMem_Malloc(len + 1);
104
6.80k
    if (p == NULL)
105
0
        return PyErr_NoMemory();
106
74.8k
    for (i = 0; i < len; i++) {
107
68.0k
        char ch = string[i];
108
68.0k
        if (ch == ' ')
109
0
            ch = '-';
110
68.0k
        else
111
68.0k
            ch = Py_TOLOWER(Py_CHARMASK(ch));
112
68.0k
        p[i] = ch;
113
68.0k
    }
114
6.80k
    p[i] = '\0';
115
6.80k
    v = PyUnicode_FromString(p);
116
6.80k
    PyMem_Free(p);
117
6.80k
    return v;
118
6.80k
}
119
120
/* Lookup the given encoding and return a tuple providing the codec
121
   facilities.
122
123
   ASCII letters in the encoding string is looked up converted to all
124
   lower case. This makes encodings looked up through this mechanism
125
   effectively case-insensitive. Spaces are replaced with hyphens for
126
   names like "US ASCII" and "ISO 8859-1".
127
128
   If no codec is found, a LookupError is set and NULL returned.
129
130
   As side effect, this tries to load the encodings package, if not
131
   yet done. This is part of the lazy load strategy for the encodings
132
   package.
133
134
*/
135
136
PyObject *_PyCodec_Lookup(const char *encoding)
137
6.80k
{
138
6.80k
    if (encoding == NULL) {
139
0
        PyErr_BadArgument();
140
0
        return NULL;
141
0
    }
142
143
6.80k
    PyInterpreterState *interp = _PyInterpreterState_GET();
144
6.80k
    assert(interp->codecs.initialized);
145
146
    /* Convert the encoding to a normalized Python string: all
147
       ASCII letters are converted to lower case, spaces are
148
       replaced with hyphens. */
149
6.80k
    PyObject *v = normalizestring(encoding);
150
6.80k
    if (v == NULL) {
151
0
        return NULL;
152
0
    }
153
154
    /* Intern the string. We'll make it immortal later if lookup succeeds. */
155
6.80k
    _PyUnicode_InternMortal(interp, &v);
156
157
    /* First, try to lookup the name in the registry dictionary */
158
6.80k
    PyObject *result;
159
6.80k
    if (PyDict_GetItemRef(interp->codecs.search_cache, v, &result) < 0) {
160
0
        goto onError;
161
0
    }
162
6.80k
    if (result != NULL) {
163
6.59k
        Py_DECREF(v);
164
6.59k
        return result;
165
6.59k
    }
166
167
    /* Next, scan the search functions in order of registration */
168
211
    const Py_ssize_t len = PyList_Size(interp->codecs.search_path);
169
211
    if (len < 0)
170
0
        goto onError;
171
211
    if (len == 0) {
172
0
        PyErr_SetString(PyExc_LookupError,
173
0
                        "no codec search functions registered: "
174
0
                        "can't find encoding");
175
0
        goto onError;
176
0
    }
177
178
211
    Py_ssize_t i;
179
322
    for (i = 0; i < len; i++) {
180
211
        PyObject *func;
181
182
211
        func = PyList_GetItemRef(interp->codecs.search_path, i);
183
211
        if (func == NULL)
184
0
            goto onError;
185
211
        result = PyObject_CallOneArg(func, v);
186
211
        Py_DECREF(func);
187
211
        if (result == NULL)
188
0
            goto onError;
189
211
        if (result == Py_None) {
190
111
            Py_CLEAR(result);
191
111
            continue;
192
111
        }
193
100
        if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
194
0
            PyErr_SetString(PyExc_TypeError,
195
0
                            "codec search functions must return 4-tuples");
196
0
            Py_DECREF(result);
197
0
            goto onError;
198
0
        }
199
100
        break;
200
100
    }
201
211
    if (result == NULL) {
202
        /* XXX Perhaps we should cache misses too ? */
203
111
        PyErr_Format(PyExc_LookupError,
204
111
                     "unknown encoding: %s", encoding);
205
111
        goto onError;
206
111
    }
207
208
100
    _PyUnicode_InternImmortal(interp, &v);
209
210
    /* Cache and return the result */
211
100
    if (PyDict_SetItem(interp->codecs.search_cache, v, result) < 0) {
212
0
        Py_DECREF(result);
213
0
        goto onError;
214
0
    }
215
100
    Py_DECREF(v);
216
100
    return result;
217
218
111
 onError:
219
111
    Py_DECREF(v);
220
111
    return NULL;
221
100
}
222
223
/* Codec registry encoding check API. */
224
225
int PyCodec_KnownEncoding(const char *encoding)
226
0
{
227
0
    PyObject *codecs;
228
229
0
    codecs = _PyCodec_Lookup(encoding);
230
0
    if (!codecs) {
231
0
        PyErr_Clear();
232
0
        return 0;
233
0
    }
234
0
    else {
235
0
        Py_DECREF(codecs);
236
0
        return 1;
237
0
    }
238
0
}
239
240
static
241
PyObject *args_tuple(PyObject *object,
242
                     const char *errors)
243
6.58k
{
244
6.58k
    PyObject *args;
245
246
6.58k
    args = PyTuple_New(1 + (errors != NULL));
247
6.58k
    if (args == NULL)
248
0
        return NULL;
249
6.58k
    PyTuple_SET_ITEM(args, 0, Py_NewRef(object));
250
6.58k
    if (errors) {
251
1.49k
        PyObject *v;
252
253
1.49k
        v = PyUnicode_FromString(errors);
254
1.49k
        if (v == NULL) {
255
0
            Py_DECREF(args);
256
0
            return NULL;
257
0
        }
258
1.49k
        PyTuple_SET_ITEM(args, 1, v);
259
1.49k
    }
260
6.58k
    return args;
261
6.58k
}
262
263
/* Helper function to get a codec item */
264
265
static
266
PyObject *codec_getitem(const char *encoding, int index)
267
0
{
268
0
    PyObject *codecs;
269
0
    PyObject *v;
270
271
0
    codecs = _PyCodec_Lookup(encoding);
272
0
    if (codecs == NULL)
273
0
        return NULL;
274
0
    v = PyTuple_GET_ITEM(codecs, index);
275
0
    Py_DECREF(codecs);
276
0
    return Py_NewRef(v);
277
0
}
278
279
/* Helper functions to create an incremental codec. */
280
static
281
PyObject *codec_makeincrementalcodec(PyObject *codec_info,
282
                                     const char *errors,
283
                                     const char *attrname)
284
57
{
285
57
    PyObject *ret, *inccodec;
286
287
57
    inccodec = PyObject_GetAttrString(codec_info, attrname);
288
57
    if (inccodec == NULL)
289
0
        return NULL;
290
57
    if (errors)
291
57
        ret = PyObject_CallFunction(inccodec, "s", errors);
292
0
    else
293
0
        ret = _PyObject_CallNoArgs(inccodec);
294
57
    Py_DECREF(inccodec);
295
57
    return ret;
296
57
}
297
298
static
299
PyObject *codec_getincrementalcodec(const char *encoding,
300
                                    const char *errors,
301
                                    const char *attrname)
302
0
{
303
0
    PyObject *codec_info, *ret;
304
305
0
    codec_info = _PyCodec_Lookup(encoding);
306
0
    if (codec_info == NULL)
307
0
        return NULL;
308
0
    ret = codec_makeincrementalcodec(codec_info, errors, attrname);
309
0
    Py_DECREF(codec_info);
310
0
    return ret;
311
0
}
312
313
/* Helper function to create a stream codec. */
314
315
static
316
PyObject *codec_getstreamcodec(const char *encoding,
317
                               PyObject *stream,
318
                               const char *errors,
319
                               const int index)
320
0
{
321
0
    PyObject *codecs, *streamcodec, *codeccls;
322
323
0
    codecs = _PyCodec_Lookup(encoding);
324
0
    if (codecs == NULL)
325
0
        return NULL;
326
327
0
    codeccls = PyTuple_GET_ITEM(codecs, index);
328
0
    if (errors != NULL)
329
0
        streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
330
0
    else
331
0
        streamcodec = PyObject_CallOneArg(codeccls, stream);
332
0
    Py_DECREF(codecs);
333
0
    return streamcodec;
334
0
}
335
336
/* Helpers to work with the result of _PyCodec_Lookup
337
338
 */
339
PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info,
340
                                             const char *errors)
341
19
{
342
19
    return codec_makeincrementalcodec(codec_info, errors,
343
19
                                      "incrementaldecoder");
344
19
}
345
346
PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info,
347
                                             const char *errors)
348
38
{
349
38
    return codec_makeincrementalcodec(codec_info, errors,
350
38
                                      "incrementalencoder");
351
38
}
352
353
354
/* Convenience APIs to query the Codec registry.
355
356
   All APIs return a codec object with incremented refcount.
357
358
 */
359
360
PyObject *PyCodec_Encoder(const char *encoding)
361
0
{
362
0
    return codec_getitem(encoding, 0);
363
0
}
364
365
PyObject *PyCodec_Decoder(const char *encoding)
366
0
{
367
0
    return codec_getitem(encoding, 1);
368
0
}
369
370
PyObject *PyCodec_IncrementalEncoder(const char *encoding,
371
                                     const char *errors)
372
0
{
373
0
    return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
374
0
}
375
376
PyObject *PyCodec_IncrementalDecoder(const char *encoding,
377
                                     const char *errors)
378
0
{
379
0
    return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
380
0
}
381
382
PyObject *PyCodec_StreamReader(const char *encoding,
383
                               PyObject *stream,
384
                               const char *errors)
385
0
{
386
0
    return codec_getstreamcodec(encoding, stream, errors, 2);
387
0
}
388
389
PyObject *PyCodec_StreamWriter(const char *encoding,
390
                               PyObject *stream,
391
                               const char *errors)
392
0
{
393
0
    return codec_getstreamcodec(encoding, stream, errors, 3);
394
0
}
395
396
/* Encode an object (e.g. a Unicode object) using the given encoding
397
   and return the resulting encoded object (usually a Python string).
398
399
   errors is passed to the encoder factory as argument if non-NULL. */
400
401
static PyObject *
402
_PyCodec_EncodeInternal(PyObject *object,
403
                        PyObject *encoder,
404
                        const char *encoding,
405
                        const char *errors)
406
1.13k
{
407
1.13k
    PyObject *args = NULL, *result = NULL;
408
1.13k
    PyObject *v = NULL;
409
410
1.13k
    args = args_tuple(object, errors);
411
1.13k
    if (args == NULL)
412
0
        goto onError;
413
414
1.13k
    result = PyObject_Call(encoder, args, NULL);
415
1.13k
    if (result == NULL) {
416
0
        _PyErr_FormatNote("%s with '%s' codec failed", "encoding", encoding);
417
0
        goto onError;
418
0
    }
419
420
1.13k
    if (!PyTuple_Check(result) ||
421
1.13k
        PyTuple_GET_SIZE(result) != 2) {
422
0
        PyErr_SetString(PyExc_TypeError,
423
0
                        "encoder must return a tuple (object, integer)");
424
0
        goto onError;
425
0
    }
426
1.13k
    v = Py_NewRef(PyTuple_GET_ITEM(result,0));
427
    /* We don't check or use the second (integer) entry. */
428
429
1.13k
    Py_DECREF(args);
430
1.13k
    Py_DECREF(encoder);
431
1.13k
    Py_DECREF(result);
432
1.13k
    return v;
433
434
0
 onError:
435
0
    Py_XDECREF(result);
436
0
    Py_XDECREF(args);
437
0
    Py_XDECREF(encoder);
438
0
    return NULL;
439
1.13k
}
440
441
/* Decode an object (usually a Python string) using the given encoding
442
   and return an equivalent object (e.g. a Unicode object).
443
444
   errors is passed to the decoder factory as argument if non-NULL. */
445
446
static PyObject *
447
_PyCodec_DecodeInternal(PyObject *object,
448
                        PyObject *decoder,
449
                        const char *encoding,
450
                        const char *errors)
451
5.44k
{
452
5.44k
    PyObject *args = NULL, *result = NULL;
453
5.44k
    PyObject *v;
454
455
5.44k
    args = args_tuple(object, errors);
456
5.44k
    if (args == NULL)
457
0
        goto onError;
458
459
5.44k
    result = PyObject_Call(decoder, args, NULL);
460
5.44k
    if (result == NULL) {
461
1.49k
        _PyErr_FormatNote("%s with '%s' codec failed", "decoding", encoding);
462
1.49k
        goto onError;
463
1.49k
    }
464
3.95k
    if (!PyTuple_Check(result) ||
465
3.95k
        PyTuple_GET_SIZE(result) != 2) {
466
0
        PyErr_SetString(PyExc_TypeError,
467
0
                        "decoder must return a tuple (object,integer)");
468
0
        goto onError;
469
0
    }
470
3.95k
    v = Py_NewRef(PyTuple_GET_ITEM(result,0));
471
    /* We don't check or use the second (integer) entry. */
472
473
3.95k
    Py_DECREF(args);
474
3.95k
    Py_DECREF(decoder);
475
3.95k
    Py_DECREF(result);
476
3.95k
    return v;
477
478
1.49k
 onError:
479
1.49k
    Py_XDECREF(args);
480
1.49k
    Py_XDECREF(decoder);
481
1.49k
    Py_XDECREF(result);
482
1.49k
    return NULL;
483
3.95k
}
484
485
/* Generic encoding/decoding API */
486
PyObject *PyCodec_Encode(PyObject *object,
487
                         const char *encoding,
488
                         const char *errors)
489
0
{
490
0
    PyObject *encoder;
491
492
0
    encoder = PyCodec_Encoder(encoding);
493
0
    if (encoder == NULL)
494
0
        return NULL;
495
496
0
    return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
497
0
}
498
499
PyObject *PyCodec_Decode(PyObject *object,
500
                         const char *encoding,
501
                         const char *errors)
502
0
{
503
0
    PyObject *decoder;
504
505
0
    decoder = PyCodec_Decoder(encoding);
506
0
    if (decoder == NULL)
507
0
        return NULL;
508
509
0
    return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
510
0
}
511
512
/* Text encoding/decoding API */
513
PyObject * _PyCodec_LookupTextEncoding(const char *encoding,
514
                                       const char *alternate_command)
515
6.75k
{
516
6.75k
    PyObject *codec;
517
6.75k
    PyObject *attr;
518
6.75k
    int is_text_codec;
519
520
6.75k
    codec = _PyCodec_Lookup(encoding);
521
6.75k
    if (codec == NULL)
522
111
        return NULL;
523
524
    /* Backwards compatibility: assume any raw tuple describes a text
525
     * encoding, and the same for anything lacking the private
526
     * attribute.
527
     */
528
6.64k
    if (!PyTuple_CheckExact(codec)) {
529
6.64k
        if (PyObject_GetOptionalAttr(codec, &_Py_ID(_is_text_encoding), &attr) < 0) {
530
0
            Py_DECREF(codec);
531
0
            return NULL;
532
0
        }
533
6.64k
        if (attr != NULL) {
534
6.64k
            is_text_codec = PyObject_IsTrue(attr);
535
6.64k
            Py_DECREF(attr);
536
6.64k
            if (is_text_codec <= 0) {
537
1
                Py_DECREF(codec);
538
1
                if (!is_text_codec) {
539
1
                    if (alternate_command != NULL) {
540
1
                        PyErr_Format(PyExc_LookupError,
541
1
                                     "'%.400s' is not a text encoding; "
542
1
                                     "use %s to handle arbitrary codecs",
543
1
                                     encoding, alternate_command);
544
1
                    }
545
0
                    else {
546
0
                        PyErr_Format(PyExc_LookupError,
547
0
                                     "'%.400s' is not a text encoding",
548
0
                                     encoding);
549
0
                    }
550
1
                }
551
1
                return NULL;
552
1
            }
553
6.64k
        }
554
6.64k
    }
555
556
    /* This appears to be a valid text encoding */
557
6.63k
    return codec;
558
6.64k
}
559
560
561
static
562
PyObject *codec_getitem_checked(const char *encoding,
563
                                const char *alternate_command,
564
                                int index)
565
6.69k
{
566
6.69k
    PyObject *codec;
567
6.69k
    PyObject *v;
568
569
6.69k
    codec = _PyCodec_LookupTextEncoding(encoding, alternate_command);
570
6.69k
    if (codec == NULL)
571
112
        return NULL;
572
573
6.58k
    v = Py_NewRef(PyTuple_GET_ITEM(codec, index));
574
6.58k
    Py_DECREF(codec);
575
6.58k
    return v;
576
6.58k
}
577
578
static PyObject * _PyCodec_TextEncoder(const char *encoding)
579
1.13k
{
580
1.13k
    return codec_getitem_checked(encoding, "codecs.encode()", 0);
581
1.13k
}
582
583
static PyObject * _PyCodec_TextDecoder(const char *encoding)
584
5.55k
{
585
5.55k
    return codec_getitem_checked(encoding, "codecs.decode()", 1);
586
5.55k
}
587
588
PyObject *_PyCodec_EncodeText(PyObject *object,
589
                              const char *encoding,
590
                              const char *errors)
591
1.13k
{
592
1.13k
    PyObject *encoder;
593
594
1.13k
    encoder = _PyCodec_TextEncoder(encoding);
595
1.13k
    if (encoder == NULL)
596
0
        return NULL;
597
598
1.13k
    return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
599
1.13k
}
600
601
PyObject *_PyCodec_DecodeText(PyObject *object,
602
                              const char *encoding,
603
                              const char *errors)
604
5.55k
{
605
5.55k
    PyObject *decoder;
606
607
5.55k
    decoder = _PyCodec_TextDecoder(encoding);
608
5.55k
    if (decoder == NULL)
609
112
        return NULL;
610
611
5.44k
    return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
612
5.55k
}
613
614
/* Register the error handling callback function error under the name
615
   name. This function will be called by the codec when it encounters
616
   an unencodable characters/undecodable bytes and doesn't know the
617
   callback name, when name is specified as the error parameter
618
   in the call to the encode/decode function.
619
   Return 0 on success, -1 on error */
620
int PyCodec_RegisterError(const char *name, PyObject *error)
621
0
{
622
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
623
0
    assert(interp->codecs.initialized);
624
0
    if (!PyCallable_Check(error)) {
625
0
        PyErr_SetString(PyExc_TypeError, "handler must be callable");
626
0
        return -1;
627
0
    }
628
0
    return PyDict_SetItemString(interp->codecs.error_registry,
629
0
                                name, error);
630
0
}
631
632
int _PyCodec_UnregisterError(const char *name)
633
0
{
634
0
    for (size_t i = 0; i < Py_ARRAY_LENGTH(codecs_builtin_error_handlers); ++i) {
635
0
        if (strcmp(name, codecs_builtin_error_handlers[i]) == 0) {
636
0
            PyErr_Format(PyExc_ValueError,
637
0
                         "cannot un-register built-in error handler '%s'", name);
638
0
            return -1;
639
0
        }
640
0
    }
641
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
642
0
    assert(interp->codecs.initialized);
643
0
    return PyDict_PopString(interp->codecs.error_registry, name, NULL);
644
0
}
645
646
/* Lookup the error handling callback function registered under the
647
   name error. As a special case NULL can be passed, in which case
648
   the error handling callback for strict encoding will be returned. */
649
PyObject *PyCodec_LookupError(const char *name)
650
4.41k
{
651
4.41k
    PyInterpreterState *interp = _PyInterpreterState_GET();
652
4.41k
    assert(interp->codecs.initialized);
653
654
4.41k
    if (name==NULL)
655
2.86k
        name = "strict";
656
4.41k
    PyObject *handler;
657
4.41k
    if (PyDict_GetItemStringRef(interp->codecs.error_registry, name, &handler) < 0) {
658
0
        return NULL;
659
0
    }
660
4.41k
    if (handler == NULL) {
661
0
        PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
662
0
        return NULL;
663
0
    }
664
4.41k
    return handler;
665
4.41k
}
666
667
668
static inline void
669
wrong_exception_type(PyObject *exc)
670
0
{
671
0
    PyErr_Format(PyExc_TypeError,
672
0
                 "don't know how to handle %T in error callback", exc);
673
0
}
674
675
676
#define _PyIsUnicodeEncodeError(EXC)    \
677
118k
    PyObject_TypeCheck(EXC, (PyTypeObject *)PyExc_UnicodeEncodeError)
678
#define _PyIsUnicodeDecodeError(EXC)    \
679
118k
    PyObject_TypeCheck(EXC, (PyTypeObject *)PyExc_UnicodeDecodeError)
680
#define _PyIsUnicodeTranslateError(EXC) \
681
0
    PyObject_TypeCheck(EXC, (PyTypeObject *)PyExc_UnicodeTranslateError)
682
683
684
// --- codecs handlers: utilities ---------------------------------------------
685
686
/*
687
 * Return the number of characters (including special prefixes)
688
 * needed to represent 'ch' by codec_handler_write_unicode_hex().
689
 */
690
static inline Py_ssize_t
691
codec_handler_unicode_hex_width(Py_UCS4 ch)
692
0
{
693
0
    if (ch >= 0x10000) {
694
        // format: '\\' + 'U' + 8 hex digits
695
0
        return 1 + 1 + 8;
696
0
    }
697
0
    else if (ch >= 0x100) {
698
        // format: '\\' + 'u' + 4 hex digits
699
0
        return 1 + 1 + 4;
700
0
    }
701
0
    else {
702
        // format: '\\' + 'x' + 2 hex digits
703
0
        return 1 + 1 + 2;
704
0
    }
705
0
}
706
707
708
/*
709
 * Write the hexadecimal representation of 'ch' to the buffer pointed by 'p'
710
 * using 2, 4, or 8 characters prefixed by '\x', '\u', or '\U' respectively.
711
 */
712
static inline void
713
codec_handler_write_unicode_hex(Py_UCS1 **p, Py_UCS4 ch)
714
0
{
715
0
    *(*p)++ = '\\';
716
0
    if (ch >= 0x10000) {
717
0
        *(*p)++ = 'U';
718
0
        *(*p)++ = Py_hexdigits[(ch >> 28) & 0xf];
719
0
        *(*p)++ = Py_hexdigits[(ch >> 24) & 0xf];
720
0
        *(*p)++ = Py_hexdigits[(ch >> 20) & 0xf];
721
0
        *(*p)++ = Py_hexdigits[(ch >> 16) & 0xf];
722
0
        *(*p)++ = Py_hexdigits[(ch >> 12) & 0xf];
723
0
        *(*p)++ = Py_hexdigits[(ch >> 8) & 0xf];
724
0
    }
725
0
    else if (ch >= 0x100) {
726
0
        *(*p)++ = 'u';
727
0
        *(*p)++ = Py_hexdigits[(ch >> 12) & 0xf];
728
0
        *(*p)++ = Py_hexdigits[(ch >> 8) & 0xf];
729
0
    }
730
0
    else {
731
0
        *(*p)++ = 'x';
732
0
    }
733
0
    *(*p)++ = Py_hexdigits[(ch >> 4) & 0xf];
734
0
    *(*p)++ = Py_hexdigits[ch & 0xf];
735
0
}
736
737
738
/*
739
 * Determine the number of digits for a decimal representation of Unicode
740
 * codepoint 'ch' (by design, Unicode codepoints are limited to 7 digits).
741
 */
742
static inline int
743
n_decimal_digits_for_codepoint(Py_UCS4 ch)
744
0
{
745
0
    if (ch < 10) return 1;
746
0
    if (ch < 100) return 2;
747
0
    if (ch < 1000) return 3;
748
0
    if (ch < 10000) return 4;
749
0
    if (ch < 100000) return 5;
750
0
    if (ch < 1000000) return 6;
751
0
    if (ch < 10000000) return 7;
752
    // Unicode codepoints are limited to 1114111 (7 decimal digits)
753
0
    Py_UNREACHABLE();
754
0
}
755
756
757
/*
758
 * Create a Unicode string containing 'count' copies of the official
759
 * Unicode REPLACEMENT CHARACTER (0xFFFD).
760
 */
761
static PyObject *
762
codec_handler_unicode_replacement_character(Py_ssize_t count)
763
0
{
764
0
    PyObject *res = PyUnicode_New(count, Py_UNICODE_REPLACEMENT_CHARACTER);
765
0
    if (res == NULL) {
766
0
        return NULL;
767
0
    }
768
0
    assert(count == 0 || PyUnicode_KIND(res) == PyUnicode_2BYTE_KIND);
769
0
    Py_UCS2 *outp = PyUnicode_2BYTE_DATA(res);
770
0
    for (Py_ssize_t i = 0; i < count; ++i) {
771
0
        outp[i] = Py_UNICODE_REPLACEMENT_CHARACTER;
772
0
    }
773
0
    assert(_PyUnicode_CheckConsistency(res, 1));
774
0
    return res;
775
0
}
776
777
778
// --- handler: 'strict' ------------------------------------------------------
779
780
PyObject *PyCodec_StrictErrors(PyObject *exc)
781
5.60k
{
782
5.60k
    if (PyExceptionInstance_Check(exc)) {
783
5.60k
        PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
784
5.60k
    }
785
0
    else {
786
0
        PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
787
0
    }
788
5.60k
    return NULL;
789
5.60k
}
790
791
792
// --- handler: 'ignore' ------------------------------------------------------
793
794
static PyObject *
795
_PyCodec_IgnoreError(PyObject *exc, int as_bytes)
796
0
{
797
0
    Py_ssize_t end;
798
0
    if (_PyUnicodeError_GetParams(exc, NULL, NULL, NULL,
799
0
                                  &end, NULL, as_bytes) < 0)
800
0
    {
801
0
        return NULL;
802
0
    }
803
0
    return Py_BuildValue("(Nn)", Py_GetConstant(Py_CONSTANT_EMPTY_STR), end);
804
0
}
805
806
807
PyObject *PyCodec_IgnoreErrors(PyObject *exc)
808
0
{
809
0
    if (_PyIsUnicodeEncodeError(exc) || _PyIsUnicodeTranslateError(exc)) {
810
0
        return _PyCodec_IgnoreError(exc, false);
811
0
    }
812
0
    else if (_PyIsUnicodeDecodeError(exc)) {
813
0
        return _PyCodec_IgnoreError(exc, true);
814
0
    }
815
0
    else {
816
0
        wrong_exception_type(exc);
817
0
        return NULL;
818
0
    }
819
0
}
820
821
822
// --- handler: 'replace' -----------------------------------------------------
823
824
static PyObject *
825
_PyCodec_ReplaceUnicodeEncodeError(PyObject *exc)
826
0
{
827
0
    Py_ssize_t start, end, slen;
828
0
    if (_PyUnicodeError_GetParams(exc, NULL, NULL,
829
0
                                  &start, &end, &slen, false) < 0)
830
0
    {
831
0
        return NULL;
832
0
    }
833
0
    PyObject *res = PyUnicode_New(slen, '?');
834
0
    if (res == NULL) {
835
0
        return NULL;
836
0
    }
837
0
    assert(PyUnicode_KIND(res) == PyUnicode_1BYTE_KIND);
838
0
    Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
839
0
    memset(outp, '?', sizeof(Py_UCS1) * slen);
840
0
    assert(_PyUnicode_CheckConsistency(res, 1));
841
0
    return Py_BuildValue("(Nn)", res, end);
842
0
}
843
844
845
static PyObject *
846
_PyCodec_ReplaceUnicodeDecodeError(PyObject *exc)
847
0
{
848
0
    Py_ssize_t end;
849
0
    if (PyUnicodeDecodeError_GetEnd(exc, &end) < 0) {
850
0
        return NULL;
851
0
    }
852
0
    PyObject *res = codec_handler_unicode_replacement_character(1);
853
0
    if (res == NULL) {
854
0
        return NULL;
855
0
    }
856
0
    return Py_BuildValue("(Nn)", res, end);
857
0
}
858
859
860
static PyObject *
861
_PyCodec_ReplaceUnicodeTranslateError(PyObject *exc)
862
0
{
863
0
    Py_ssize_t start, end, slen;
864
0
    if (_PyUnicodeError_GetParams(exc, NULL, NULL,
865
0
                                  &start, &end, &slen, false) < 0)
866
0
    {
867
0
        return NULL;
868
0
    }
869
0
    PyObject *res = codec_handler_unicode_replacement_character(slen);
870
0
    if (res == NULL) {
871
0
        return NULL;
872
0
    }
873
0
    return Py_BuildValue("(Nn)", res, end);
874
0
}
875
876
877
PyObject *PyCodec_ReplaceErrors(PyObject *exc)
878
0
{
879
0
    if (_PyIsUnicodeEncodeError(exc)) {
880
0
        return _PyCodec_ReplaceUnicodeEncodeError(exc);
881
0
    }
882
0
    else if (_PyIsUnicodeDecodeError(exc)) {
883
0
        return _PyCodec_ReplaceUnicodeDecodeError(exc);
884
0
    }
885
0
    else if (_PyIsUnicodeTranslateError(exc)) {
886
0
        return _PyCodec_ReplaceUnicodeTranslateError(exc);
887
0
    }
888
0
    else {
889
0
        wrong_exception_type(exc);
890
0
        return NULL;
891
0
    }
892
0
}
893
894
895
// --- handler: 'xmlcharrefreplace' -------------------------------------------
896
897
PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
898
0
{
899
0
    if (!_PyIsUnicodeEncodeError(exc)) {
900
0
        wrong_exception_type(exc);
901
0
        return NULL;
902
0
    }
903
904
0
    PyObject *obj;
905
0
    Py_ssize_t objlen, start, end, slen;
906
0
    if (_PyUnicodeError_GetParams(exc,
907
0
                                  &obj, &objlen,
908
0
                                  &start, &end, &slen, false) < 0)
909
0
    {
910
0
        return NULL;
911
0
    }
912
913
    // The number of characters that each character 'ch' contributes
914
    // in the result is 2 + k + 1, where k = min{t >= 1 | 10^t > ch}
915
    // and will be formatted as "&#" + DIGITS + ";". Since the Unicode
916
    // range is below 10^7, each "block" requires at most 2 + 7 + 1
917
    // characters.
918
0
    if (slen > PY_SSIZE_T_MAX / (2 + 7 + 1)) {
919
0
        end = start + PY_SSIZE_T_MAX / (2 + 7 + 1);
920
0
        end = Py_MIN(end, objlen);
921
0
        slen = Py_MAX(0, end - start);
922
0
    }
923
924
0
    Py_ssize_t ressize = 0;
925
0
    for (Py_ssize_t i = start; i < end; ++i) {
926
0
        Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
927
0
        int k = n_decimal_digits_for_codepoint(ch);
928
0
        assert(k != 0);
929
0
        assert(k <= 7);
930
0
        ressize += 2 + k + 1;
931
0
    }
932
933
    /* allocate replacement */
934
0
    PyObject *res = PyUnicode_New(ressize, 127);
935
0
    if (res == NULL) {
936
0
        Py_DECREF(obj);
937
0
        return NULL;
938
0
    }
939
0
    Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
940
    /* generate replacement */
941
0
    for (Py_ssize_t i = start; i < end; ++i) {
942
0
        Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
943
        /*
944
         * Write the decimal representation of 'ch' to the buffer pointed by 'p'
945
         * using at most 7 characters prefixed by '&#' and suffixed by ';'.
946
         */
947
0
        *outp++ = '&';
948
0
        *outp++ = '#';
949
0
        Py_UCS1 *digit_end = outp + n_decimal_digits_for_codepoint(ch);
950
0
        for (Py_UCS1 *p_digit = digit_end - 1; p_digit >= outp; --p_digit) {
951
0
            *p_digit = '0' + (ch % 10);
952
0
            ch /= 10;
953
0
        }
954
0
        assert(ch == 0);
955
0
        outp = digit_end;
956
0
        *outp++ = ';';
957
0
    }
958
0
    assert(_PyUnicode_CheckConsistency(res, 1));
959
0
    PyObject *restuple = Py_BuildValue("(Nn)", res, end);
960
0
    Py_DECREF(obj);
961
0
    return restuple;
962
0
}
963
964
965
// --- handler: 'backslashreplace' --------------------------------------------
966
967
static PyObject *
968
_PyCodec_BackslashReplaceUnicodeEncodeError(PyObject *exc)
969
0
{
970
0
    PyObject *obj;
971
0
    Py_ssize_t objlen, start, end, slen;
972
0
    if (_PyUnicodeError_GetParams(exc,
973
0
                                  &obj, &objlen,
974
0
                                  &start, &end, &slen, false) < 0)
975
0
    {
976
0
        return NULL;
977
0
    }
978
979
    // The number of characters that each character 'ch' contributes
980
    // in the result is 1 + 1 + k, where k >= min{t >= 1 | 16^t > ch}
981
    // and will be formatted as "\\" + ('U'|'u'|'x') + HEXDIGITS,
982
    // where the number of hexdigits is either 2, 4, or 8 (not 6).
983
    // Since the Unicode range is below 10^7, we choose k = 8 whence
984
    // each "block" requires at most 1 + 1 + 8 characters.
985
0
    if (slen > PY_SSIZE_T_MAX / (1 + 1 + 8)) {
986
0
        end = start + PY_SSIZE_T_MAX / (1 + 1 + 8);
987
0
        end = Py_MIN(end, objlen);
988
0
        slen = Py_MAX(0, end - start);
989
0
    }
990
991
0
    Py_ssize_t ressize = 0;
992
0
    for (Py_ssize_t i = start; i < end; ++i) {
993
0
        Py_UCS4 c = PyUnicode_READ_CHAR(obj, i);
994
0
        ressize += codec_handler_unicode_hex_width(c);
995
0
    }
996
0
    PyObject *res = PyUnicode_New(ressize, 127);
997
0
    if (res == NULL) {
998
0
        Py_DECREF(obj);
999
0
        return NULL;
1000
0
    }
1001
0
    Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
1002
0
    for (Py_ssize_t i = start; i < end; ++i) {
1003
0
        Py_UCS4 c = PyUnicode_READ_CHAR(obj, i);
1004
0
        codec_handler_write_unicode_hex(&outp, c);
1005
0
    }
1006
0
    assert(_PyUnicode_CheckConsistency(res, 1));
1007
0
    Py_DECREF(obj);
1008
0
    return Py_BuildValue("(Nn)", res, end);
1009
0
}
1010
1011
1012
static PyObject *
1013
_PyCodec_BackslashReplaceUnicodeDecodeError(PyObject *exc)
1014
0
{
1015
0
    PyObject *obj;
1016
0
    Py_ssize_t objlen, start, end, slen;
1017
0
    if (_PyUnicodeError_GetParams(exc,
1018
0
                                  &obj, &objlen,
1019
0
                                  &start, &end, &slen, true) < 0)
1020
0
    {
1021
0
        return NULL;
1022
0
    }
1023
1024
0
    PyObject *res = PyUnicode_New(4 * slen, 127);
1025
0
    if (res == NULL) {
1026
0
        Py_DECREF(obj);
1027
0
        return NULL;
1028
0
    }
1029
1030
0
    Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
1031
0
    const unsigned char *p = (const unsigned char *)PyBytes_AS_STRING(obj);
1032
0
    for (Py_ssize_t i = start; i < end; i++, outp += 4) {
1033
0
        const unsigned char ch = p[i];
1034
0
        outp[0] = '\\';
1035
0
        outp[1] = 'x';
1036
0
        outp[2] = Py_hexdigits[(ch >> 4) & 0xf];
1037
0
        outp[3] = Py_hexdigits[ch & 0xf];
1038
0
    }
1039
0
    assert(_PyUnicode_CheckConsistency(res, 1));
1040
0
    Py_DECREF(obj);
1041
0
    return Py_BuildValue("(Nn)", res, end);
1042
0
}
1043
1044
1045
static inline PyObject *
1046
_PyCodec_BackslashReplaceUnicodeTranslateError(PyObject *exc)
1047
0
{
1048
    // Same implementation as for UnicodeEncodeError objects.
1049
0
    return _PyCodec_BackslashReplaceUnicodeEncodeError(exc);
1050
0
}
1051
1052
1053
PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
1054
0
{
1055
0
    if (_PyIsUnicodeEncodeError(exc)) {
1056
0
        return _PyCodec_BackslashReplaceUnicodeEncodeError(exc);
1057
0
    }
1058
0
    else if (_PyIsUnicodeDecodeError(exc)) {
1059
0
        return _PyCodec_BackslashReplaceUnicodeDecodeError(exc);
1060
0
    }
1061
0
    else if (_PyIsUnicodeTranslateError(exc)) {
1062
0
        return _PyCodec_BackslashReplaceUnicodeTranslateError(exc);
1063
0
    }
1064
0
    else {
1065
0
        wrong_exception_type(exc);
1066
0
        return NULL;
1067
0
    }
1068
0
}
1069
1070
1071
// --- handler: 'namereplace' -------------------------------------------------
1072
1073
PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
1074
0
{
1075
0
    if (!_PyIsUnicodeEncodeError(exc)) {
1076
0
        wrong_exception_type(exc);
1077
0
        return NULL;
1078
0
    }
1079
1080
0
    _PyUnicode_Name_CAPI *ucnhash_capi = _PyUnicode_GetNameCAPI();
1081
0
    if (ucnhash_capi == NULL) {
1082
0
        return NULL;
1083
0
    }
1084
1085
0
    PyObject *obj;
1086
0
    Py_ssize_t start, end;
1087
0
    if (_PyUnicodeError_GetParams(exc,
1088
0
                                  &obj, NULL,
1089
0
                                  &start, &end, NULL, false) < 0)
1090
0
    {
1091
0
        return NULL;
1092
0
    }
1093
1094
0
    char buffer[256]; /* NAME_MAXLEN in unicodename_db.h */
1095
0
    Py_ssize_t imax = start, ressize = 0, replsize;
1096
0
    for (; imax < end; ++imax) {
1097
0
        Py_UCS4 c = PyUnicode_READ_CHAR(obj, imax);
1098
0
        if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) {
1099
            // If 'c' is recognized by getname(), the corresponding replacement
1100
            // is '\\' + 'N' + '{' + NAME + '}', i.e. 1 + 1 + 1 + len(NAME) + 1
1101
            // characters. Failures of getname() are ignored by the handler.
1102
0
            replsize = 1 + 1 + 1 + strlen(buffer) + 1;
1103
0
        }
1104
0
        else {
1105
0
            replsize = codec_handler_unicode_hex_width(c);
1106
0
        }
1107
0
        if (ressize > PY_SSIZE_T_MAX - replsize) {
1108
0
            break;
1109
0
        }
1110
0
        ressize += replsize;
1111
0
    }
1112
1113
0
    PyObject *res = PyUnicode_New(ressize, 127);
1114
0
    if (res == NULL) {
1115
0
        Py_DECREF(obj);
1116
0
        return NULL;
1117
0
    }
1118
1119
0
    Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
1120
0
    for (Py_ssize_t i = start; i < imax; ++i) {
1121
0
        Py_UCS4 c = PyUnicode_READ_CHAR(obj, i);
1122
0
        if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) {
1123
0
            *outp++ = '\\';
1124
0
            *outp++ = 'N';
1125
0
            *outp++ = '{';
1126
0
            (void)strcpy((char *)outp, buffer);
1127
0
            outp += strlen(buffer);
1128
0
            *outp++ = '}';
1129
0
        }
1130
0
        else {
1131
0
            codec_handler_write_unicode_hex(&outp, c);
1132
0
        }
1133
0
    }
1134
1135
0
    assert(outp == PyUnicode_1BYTE_DATA(res) + ressize);
1136
0
    assert(_PyUnicode_CheckConsistency(res, 1));
1137
0
    PyObject *restuple = Py_BuildValue("(Nn)", res, imax);
1138
0
    Py_DECREF(obj);
1139
0
    return restuple;
1140
0
}
1141
1142
1143
118k
#define ENC_UNKNOWN     -1
1144
52.8k
#define ENC_UTF8        0
1145
52.9k
#define ENC_UTF16BE     1
1146
126k
#define ENC_UTF16LE     2
1147
3.56k
#define ENC_UTF32BE     3
1148
1.02k
#define ENC_UTF32LE     4
1149
1150
static int
1151
get_standard_encoding_impl(const char *encoding, int *bytelength)
1152
118k
{
1153
118k
    if (Py_TOLOWER(encoding[0]) == 'u' &&
1154
118k
        Py_TOLOWER(encoding[1]) == 't' &&
1155
118k
        Py_TOLOWER(encoding[2]) == 'f') {
1156
118k
        encoding += 3;
1157
118k
        if (*encoding == '-' || *encoding == '_' )
1158
118k
            encoding++;
1159
118k
        if (encoding[0] == '8' && encoding[1] == '\0') {
1160
26.5k
            *bytelength = 3;
1161
26.5k
            return ENC_UTF8;
1162
26.5k
        }
1163
91.9k
        else if (encoding[0] == '1' && encoding[1] == '6') {
1164
89.6k
            encoding += 2;
1165
89.6k
            *bytelength = 2;
1166
89.6k
            if (*encoding == '\0') {
1167
#ifdef WORDS_BIGENDIAN
1168
                return ENC_UTF16BE;
1169
#else
1170
0
                return ENC_UTF16LE;
1171
0
#endif
1172
0
            }
1173
89.6k
            if (*encoding == '-' || *encoding == '_' )
1174
89.6k
                encoding++;
1175
89.6k
            if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
1176
89.6k
                if (Py_TOLOWER(encoding[0]) == 'b')
1177
26.5k
                    return ENC_UTF16BE;
1178
63.1k
                if (Py_TOLOWER(encoding[0]) == 'l')
1179
63.1k
                    return ENC_UTF16LE;
1180
63.1k
            }
1181
89.6k
        }
1182
2.31k
        else if (encoding[0] == '3' && encoding[1] == '2') {
1183
2.31k
            encoding += 2;
1184
2.31k
            *bytelength = 4;
1185
2.31k
            if (*encoding == '\0') {
1186
#ifdef WORDS_BIGENDIAN
1187
                return ENC_UTF32BE;
1188
#else
1189
0
                return ENC_UTF32LE;
1190
0
#endif
1191
0
            }
1192
2.31k
            if (*encoding == '-' || *encoding == '_' )
1193
2.31k
                encoding++;
1194
2.31k
            if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
1195
2.31k
                if (Py_TOLOWER(encoding[0]) == 'b')
1196
1.78k
                    return ENC_UTF32BE;
1197
521
                if (Py_TOLOWER(encoding[0]) == 'l')
1198
521
                    return ENC_UTF32LE;
1199
521
            }
1200
2.31k
        }
1201
118k
    }
1202
0
    else if (strcmp(encoding, "cp65001") == 0) {
1203
0
        *bytelength = 3;
1204
0
        return ENC_UTF8;
1205
0
    }
1206
0
    return ENC_UNKNOWN;
1207
118k
}
1208
1209
1210
static int
1211
get_standard_encoding(PyObject *encoding, int *code, int *bytelength)
1212
118k
{
1213
118k
    const char *encoding_cstr = PyUnicode_AsUTF8(encoding);
1214
118k
    if (encoding_cstr == NULL) {
1215
0
        return -1;
1216
0
    }
1217
118k
    *code = get_standard_encoding_impl(encoding_cstr, bytelength);
1218
118k
    return 0;
1219
118k
}
1220
1221
1222
// --- handler: 'surrogatepass' -----------------------------------------------
1223
1224
static PyObject *
1225
_PyCodec_SurrogatePassUnicodeEncodeError(PyObject *exc)
1226
0
{
1227
0
    PyObject *encoding = PyUnicodeEncodeError_GetEncoding(exc);
1228
0
    if (encoding == NULL) {
1229
0
        return NULL;
1230
0
    }
1231
0
    int code, bytelength;
1232
0
    int rc = get_standard_encoding(encoding, &code, &bytelength);
1233
0
    Py_DECREF(encoding);
1234
0
    if (rc < 0) {
1235
0
        return NULL;
1236
0
    }
1237
0
    if (code == ENC_UNKNOWN) {
1238
0
        goto bail;
1239
0
    }
1240
1241
0
    PyObject *obj;
1242
0
    Py_ssize_t objlen, start, end, slen;
1243
0
    if (_PyUnicodeError_GetParams(exc,
1244
0
                                  &obj, &objlen,
1245
0
                                  &start, &end, &slen, false) < 0)
1246
0
    {
1247
0
        return NULL;
1248
0
    }
1249
1250
0
    if (slen > PY_SSIZE_T_MAX / bytelength) {
1251
0
        end = start + PY_SSIZE_T_MAX / bytelength;
1252
0
        end = Py_MIN(end, objlen);
1253
0
        slen = Py_MAX(0, end - start);
1254
0
    }
1255
1256
0
    PyObject *res = PyBytes_FromStringAndSize(NULL, bytelength * slen);
1257
0
    if (res == NULL) {
1258
0
        Py_DECREF(obj);
1259
0
        return NULL;
1260
0
    }
1261
1262
0
    unsigned char *outp = (unsigned char *)PyBytes_AsString(res);
1263
0
    for (Py_ssize_t i = start; i < end; i++) {
1264
0
        Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
1265
0
        if (!Py_UNICODE_IS_SURROGATE(ch)) {
1266
            /* Not a surrogate, fail with original exception */
1267
0
            Py_DECREF(obj);
1268
0
            Py_DECREF(res);
1269
0
            goto bail;
1270
0
        }
1271
0
        switch (code) {
1272
0
            case ENC_UTF8: {
1273
0
                *outp++ = (unsigned char)(0xe0 | (ch >> 12));
1274
0
                *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
1275
0
                *outp++ = (unsigned char)(0x80 | (ch & 0x3f));
1276
0
                break;
1277
0
            }
1278
0
            case ENC_UTF16LE: {
1279
0
                *outp++ = (unsigned char)ch;
1280
0
                *outp++ = (unsigned char)(ch >> 8);
1281
0
                break;
1282
0
            }
1283
0
            case ENC_UTF16BE: {
1284
0
                *outp++ = (unsigned char)(ch >> 8);
1285
0
                *outp++ = (unsigned char)ch;
1286
0
                break;
1287
0
            }
1288
0
            case ENC_UTF32LE: {
1289
0
                *outp++ = (unsigned char)ch;
1290
0
                *outp++ = (unsigned char)(ch >> 8);
1291
0
                *outp++ = (unsigned char)(ch >> 16);
1292
0
                *outp++ = (unsigned char)(ch >> 24);
1293
0
                break;
1294
0
            }
1295
0
            case ENC_UTF32BE: {
1296
0
                *outp++ = (unsigned char)(ch >> 24);
1297
0
                *outp++ = (unsigned char)(ch >> 16);
1298
0
                *outp++ = (unsigned char)(ch >> 8);
1299
0
                *outp++ = (unsigned char)ch;
1300
0
                break;
1301
0
            }
1302
0
        }
1303
0
    }
1304
1305
0
    Py_DECREF(obj);
1306
0
    PyObject *restuple = Py_BuildValue("(Nn)", res, end);
1307
0
    return restuple;
1308
1309
0
bail:
1310
0
    PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1311
0
    return NULL;
1312
0
}
1313
1314
1315
static PyObject *
1316
_PyCodec_SurrogatePassUnicodeDecodeError(PyObject *exc)
1317
118k
{
1318
118k
    PyObject *encoding = PyUnicodeDecodeError_GetEncoding(exc);
1319
118k
    if (encoding == NULL) {
1320
0
        return NULL;
1321
0
    }
1322
118k
    int code, bytelength;
1323
118k
    int rc = get_standard_encoding(encoding, &code, &bytelength);
1324
118k
    Py_DECREF(encoding);
1325
118k
    if (rc < 0) {
1326
0
        return NULL;
1327
0
    }
1328
118k
    if (code == ENC_UNKNOWN) {
1329
0
        goto bail;
1330
0
    }
1331
1332
118k
    PyObject *obj;
1333
118k
    Py_ssize_t objlen, start, end, slen;
1334
118k
    if (_PyUnicodeError_GetParams(exc,
1335
118k
                                  &obj, &objlen,
1336
118k
                                  &start, &end, &slen, true) < 0)
1337
0
    {
1338
0
        return NULL;
1339
0
    }
1340
1341
    /* Try decoding a single surrogate character. If
1342
       there are more, let the codec call us again. */
1343
118k
    Py_UCS4 ch = 0;
1344
118k
    const unsigned char *p = (const unsigned char *)PyBytes_AS_STRING(obj);
1345
118k
    p += start;
1346
1347
118k
    if (objlen - start >= bytelength) {
1348
117k
        switch (code) {
1349
26.3k
            case ENC_UTF8: {
1350
26.3k
                if ((p[0] & 0xf0) == 0xe0 &&
1351
26.1k
                    (p[1] & 0xc0) == 0x80 &&
1352
26.1k
                    (p[2] & 0xc0) == 0x80)
1353
26.0k
                {
1354
                    /* it's a three-byte code */
1355
26.0k
                    ch = ((p[0] & 0x0f) << 12) +
1356
26.0k
                         ((p[1] & 0x3f) << 6)  +
1357
26.0k
                          (p[2] & 0x3f);
1358
26.0k
                }
1359
26.3k
                break;
1360
0
            }
1361
62.9k
            case ENC_UTF16LE: {
1362
62.9k
                ch = p[1] << 8 | p[0];
1363
62.9k
                break;
1364
0
            }
1365
26.3k
            case ENC_UTF16BE: {
1366
26.3k
                ch = p[0] << 8 | p[1];
1367
26.3k
                break;
1368
0
            }
1369
507
            case ENC_UTF32LE: {
1370
507
                ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
1371
507
                break;
1372
0
            }
1373
1.78k
            case ENC_UTF32BE: {
1374
1.78k
                ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
1375
1.78k
                break;
1376
0
            }
1377
117k
        }
1378
117k
    }
1379
118k
    Py_DECREF(obj);
1380
118k
    if (!Py_UNICODE_IS_SURROGATE(ch)) {
1381
930
        goto bail;
1382
930
    }
1383
1384
117k
    PyObject *res = PyUnicode_FromOrdinal(ch);
1385
117k
    if (res == NULL) {
1386
0
        return NULL;
1387
0
    }
1388
117k
    return Py_BuildValue("(Nn)", res, start + bytelength);
1389
1390
930
bail:
1391
930
    PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1392
930
    return NULL;
1393
117k
}
1394
1395
1396
/* This handler is declared static until someone demonstrates
1397
   a need to call it directly. */
1398
static PyObject *
1399
PyCodec_SurrogatePassErrors(PyObject *exc)
1400
118k
{
1401
118k
    if (_PyIsUnicodeEncodeError(exc)) {
1402
0
        return _PyCodec_SurrogatePassUnicodeEncodeError(exc);
1403
0
    }
1404
118k
    else if (_PyIsUnicodeDecodeError(exc)) {
1405
118k
        return _PyCodec_SurrogatePassUnicodeDecodeError(exc);
1406
118k
    }
1407
0
    else {
1408
0
        wrong_exception_type(exc);
1409
0
        return NULL;
1410
0
    }
1411
118k
}
1412
1413
1414
// --- handler: 'surrogateescape' ---------------------------------------------
1415
1416
static PyObject *
1417
_PyCodec_SurrogateEscapeUnicodeEncodeError(PyObject *exc)
1418
0
{
1419
0
    PyObject *obj;
1420
0
    Py_ssize_t start, end, slen;
1421
0
    if (_PyUnicodeError_GetParams(exc,
1422
0
                                  &obj, NULL,
1423
0
                                  &start, &end, &slen, false) < 0)
1424
0
    {
1425
0
        return NULL;
1426
0
    }
1427
1428
0
    PyObject *res = PyBytes_FromStringAndSize(NULL, slen);
1429
0
    if (res == NULL) {
1430
0
        Py_DECREF(obj);
1431
0
        return NULL;
1432
0
    }
1433
1434
0
    char *outp = PyBytes_AsString(res);
1435
0
    for (Py_ssize_t i = start; i < end; i++) {
1436
0
        Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
1437
0
        if (ch < 0xdc80 || ch > 0xdcff) {
1438
            /* Not a UTF-8b surrogate, fail with original exception. */
1439
0
            Py_DECREF(obj);
1440
0
            Py_DECREF(res);
1441
0
            PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1442
0
            return NULL;
1443
0
        }
1444
0
        *outp++ = ch - 0xdc00;
1445
0
    }
1446
0
    Py_DECREF(obj);
1447
1448
0
    return Py_BuildValue("(Nn)", res, end);
1449
0
}
1450
1451
1452
static PyObject *
1453
_PyCodec_SurrogateEscapeUnicodeDecodeError(PyObject *exc)
1454
0
{
1455
0
    PyObject *obj;
1456
0
    Py_ssize_t start, end, slen;
1457
0
    if (_PyUnicodeError_GetParams(exc,
1458
0
                                  &obj, NULL,
1459
0
                                  &start, &end, &slen, true) < 0)
1460
0
    {
1461
0
        return NULL;
1462
0
    }
1463
1464
0
    Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */
1465
0
    int consumed = 0;
1466
0
    const unsigned char *p = (const unsigned char *)PyBytes_AS_STRING(obj);
1467
0
    while (consumed < 4 && consumed < slen) {
1468
        /* Refuse to escape ASCII bytes. */
1469
0
        if (p[start + consumed] < 128) {
1470
0
            break;
1471
0
        }
1472
0
        ch[consumed] = 0xdc00 + p[start + consumed];
1473
0
        consumed++;
1474
0
    }
1475
0
    Py_DECREF(obj);
1476
1477
0
    if (consumed == 0) {
1478
        /* Codec complained about ASCII byte. */
1479
0
        PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1480
0
        return NULL;
1481
0
    }
1482
1483
0
    PyObject *str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed);
1484
0
    if (str == NULL) {
1485
0
        return NULL;
1486
0
    }
1487
0
    return Py_BuildValue("(Nn)", str, start + consumed);
1488
0
}
1489
1490
1491
static PyObject *
1492
PyCodec_SurrogateEscapeErrors(PyObject *exc)
1493
0
{
1494
0
    if (_PyIsUnicodeEncodeError(exc)) {
1495
0
        return _PyCodec_SurrogateEscapeUnicodeEncodeError(exc);
1496
0
    }
1497
0
    else if (_PyIsUnicodeDecodeError(exc)) {
1498
0
        return _PyCodec_SurrogateEscapeUnicodeDecodeError(exc);
1499
0
    }
1500
0
    else {
1501
0
        wrong_exception_type(exc);
1502
0
        return NULL;
1503
0
    }
1504
0
}
1505
1506
1507
// --- Codecs registry handlers -----------------------------------------------
1508
1509
static inline PyObject *
1510
strict_errors(PyObject *Py_UNUSED(self), PyObject *exc)
1511
2.95k
{
1512
2.95k
    return PyCodec_StrictErrors(exc);
1513
2.95k
}
1514
1515
1516
static inline PyObject *
1517
ignore_errors(PyObject *Py_UNUSED(self), PyObject *exc)
1518
0
{
1519
0
    return PyCodec_IgnoreErrors(exc);
1520
0
}
1521
1522
1523
static inline PyObject *
1524
replace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
1525
0
{
1526
0
    return PyCodec_ReplaceErrors(exc);
1527
0
}
1528
1529
1530
static inline PyObject *
1531
xmlcharrefreplace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
1532
0
{
1533
0
    return PyCodec_XMLCharRefReplaceErrors(exc);
1534
0
}
1535
1536
1537
static inline PyObject *
1538
backslashreplace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
1539
0
{
1540
0
    return PyCodec_BackslashReplaceErrors(exc);
1541
0
}
1542
1543
1544
static inline PyObject *
1545
namereplace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
1546
0
{
1547
0
    return PyCodec_NameReplaceErrors(exc);
1548
0
}
1549
1550
1551
static inline PyObject *
1552
surrogatepass_errors(PyObject *Py_UNUSED(self), PyObject *exc)
1553
118k
{
1554
118k
    return PyCodec_SurrogatePassErrors(exc);
1555
118k
}
1556
1557
1558
static inline PyObject *
1559
surrogateescape_errors(PyObject *Py_UNUSED(self), PyObject *exc)
1560
0
{
1561
0
    return PyCodec_SurrogateEscapeErrors(exc);
1562
0
}
1563
1564
1565
PyStatus
1566
_PyCodec_InitRegistry(PyInterpreterState *interp)
1567
19
{
1568
19
    static struct {
1569
19
        const char *name;
1570
19
        PyMethodDef def;
1571
19
    } methods[] =
1572
19
    {
1573
19
        {
1574
19
            "strict",
1575
19
            {
1576
19
                "strict_errors",
1577
19
                strict_errors,
1578
19
                METH_O,
1579
19
                PyDoc_STR("Implements the 'strict' error handling, which "
1580
19
                          "raises a UnicodeError on coding errors.")
1581
19
            }
1582
19
        },
1583
19
        {
1584
19
            "ignore",
1585
19
            {
1586
19
                "ignore_errors",
1587
19
                ignore_errors,
1588
19
                METH_O,
1589
19
                PyDoc_STR("Implements the 'ignore' error handling, which "
1590
19
                          "ignores malformed data and continues.")
1591
19
            }
1592
19
        },
1593
19
        {
1594
19
            "replace",
1595
19
            {
1596
19
                "replace_errors",
1597
19
                replace_errors,
1598
19
                METH_O,
1599
19
                PyDoc_STR("Implements the 'replace' error handling, which "
1600
19
                          "replaces malformed data with a replacement marker.")
1601
19
            }
1602
19
        },
1603
19
        {
1604
19
            "xmlcharrefreplace",
1605
19
            {
1606
19
                "xmlcharrefreplace_errors",
1607
19
                xmlcharrefreplace_errors,
1608
19
                METH_O,
1609
19
                PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
1610
19
                          "which replaces an unencodable character with the "
1611
19
                          "appropriate XML character reference.")
1612
19
            }
1613
19
        },
1614
19
        {
1615
19
            "backslashreplace",
1616
19
            {
1617
19
                "backslashreplace_errors",
1618
19
                backslashreplace_errors,
1619
19
                METH_O,
1620
19
                PyDoc_STR("Implements the 'backslashreplace' error handling, "
1621
19
                          "which replaces malformed data with a backslashed "
1622
19
                          "escape sequence.")
1623
19
            }
1624
19
        },
1625
19
        {
1626
19
            "namereplace",
1627
19
            {
1628
19
                "namereplace_errors",
1629
19
                namereplace_errors,
1630
19
                METH_O,
1631
19
                PyDoc_STR("Implements the 'namereplace' error handling, "
1632
19
                          "which replaces an unencodable character with a "
1633
19
                          "\\N{...} escape sequence.")
1634
19
            }
1635
19
        },
1636
19
        {
1637
19
            "surrogatepass",
1638
19
            {
1639
19
                "surrogatepass",
1640
19
                surrogatepass_errors,
1641
19
                METH_O
1642
19
            }
1643
19
        },
1644
19
        {
1645
19
            "surrogateescape",
1646
19
            {
1647
19
                "surrogateescape",
1648
19
                surrogateescape_errors,
1649
19
                METH_O
1650
19
            }
1651
19
        }
1652
19
    };
1653
    // ensure that the built-in error handlers' names are kept in sync
1654
19
    assert(Py_ARRAY_LENGTH(methods) == Py_ARRAY_LENGTH(codecs_builtin_error_handlers));
1655
1656
19
    assert(interp->codecs.initialized == 0);
1657
19
    interp->codecs.search_path = PyList_New(0);
1658
19
    if (interp->codecs.search_path == NULL) {
1659
0
        return PyStatus_NoMemory();
1660
0
    }
1661
19
    interp->codecs.search_cache = PyDict_New();
1662
19
    if (interp->codecs.search_cache == NULL) {
1663
0
        return PyStatus_NoMemory();
1664
0
    }
1665
19
    interp->codecs.error_registry = PyDict_New();
1666
19
    if (interp->codecs.error_registry == NULL) {
1667
0
        return PyStatus_NoMemory();
1668
0
    }
1669
171
    for (size_t i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
1670
152
        PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL);
1671
152
        if (func == NULL) {
1672
0
            return PyStatus_NoMemory();
1673
0
        }
1674
1675
152
        int res = PyDict_SetItemString(interp->codecs.error_registry,
1676
152
                                       methods[i].name, func);
1677
152
        Py_DECREF(func);
1678
152
        if (res < 0) {
1679
0
            return PyStatus_Error("Failed to insert into codec error registry");
1680
0
        }
1681
152
    }
1682
1683
19
    interp->codecs.initialized = 1;
1684
1685
    // Importing `encodings' will call back into this module to register codec
1686
    // search functions, so this is done after everything else is initialized.
1687
19
    PyObject *mod = PyImport_ImportModule("encodings");
1688
19
    if (mod == NULL) {
1689
0
        return PyStatus_Error("Failed to import encodings module");
1690
0
    }
1691
19
    Py_DECREF(mod);
1692
1693
19
    return PyStatus_Ok();
1694
19
}
1695
1696
void
1697
_PyCodec_Fini(PyInterpreterState *interp)
1698
0
{
1699
0
    Py_CLEAR(interp->codecs.search_path);
1700
0
    Py_CLEAR(interp->codecs.search_cache);
1701
    Py_CLEAR(interp->codecs.error_registry);
1702
0
    interp->codecs.initialized = 0;
1703
0
}