Coverage Report

Created: 2026-06-21 06:15

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/cpython/Python/codecs.c
Line
Count
Source
1
/* ------------------------------------------------------------------------
2
3
   Python Codec Registry and support functions
4
5
Written by Marc-Andre Lemburg (mal@lemburg.com).
6
7
Copyright (c) Corporation for National Research Initiatives.
8
9
   ------------------------------------------------------------------------ */
10
11
#include "Python.h"
12
#include "pycore_call.h"          // _PyObject_CallNoArgs()
13
#include "pycore_codecs.h"        // export _PyCodec_LookupTextEncoding()
14
#include "pycore_initconfig.h"    // _Py_DumpPathConfig()
15
#include "pycore_interp.h"        // PyInterpreterState.codec_search_path
16
#include "pycore_pyerrors.h"      // _PyErr_FormatNote()
17
#include "pycore_pystate.h"       // _PyInterpreterState_GET()
18
#include "pycore_runtime.h"       // _Py_ID()
19
#include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
20
#include "pycore_unicodeobject.h" // _PyUnicode_InternMortal()
21
#include "pycore_pyatomic_ft_wrappers.h"
22
23
static const char *codecs_builtin_error_handlers[] = {
24
    "strict", "ignore", "replace",
25
    "xmlcharrefreplace", "backslashreplace", "namereplace",
26
    "surrogatepass", "surrogateescape",
27
};
28
29
const char *Py_hexdigits = "0123456789abcdef";
30
31
/* --- Codec Registry ----------------------------------------------------- */
32
33
int PyCodec_Register(PyObject *search_function)
34
36
{
35
36
    PyInterpreterState *interp = _PyInterpreterState_GET();
36
36
    assert(interp->codecs.initialized);
37
36
    if (search_function == NULL) {
38
0
        PyErr_BadArgument();
39
0
        goto onError;
40
0
    }
41
36
    if (!PyCallable_Check(search_function)) {
42
0
        PyErr_SetString(PyExc_TypeError, "argument must be callable");
43
0
        goto onError;
44
0
    }
45
36
    FT_MUTEX_LOCK(&interp->codecs.search_path_mutex);
46
36
    int ret = PyList_Append(interp->codecs.search_path, search_function);
47
36
    FT_MUTEX_UNLOCK(&interp->codecs.search_path_mutex);
48
49
36
    return ret;
50
51
0
 onError:
52
0
    return -1;
53
36
}
54
55
int
56
PyCodec_Unregister(PyObject *search_function)
57
0
{
58
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
59
0
    if (interp->codecs.initialized != 1) {
60
        /* Do nothing if codecs state was cleared (only possible during
61
           interpreter shutdown). */
62
0
        return 0;
63
0
    }
64
65
0
    PyObject *codec_search_path = interp->codecs.search_path;
66
0
    assert(PyList_CheckExact(codec_search_path));
67
0
    for (Py_ssize_t i = 0; i < PyList_GET_SIZE(codec_search_path); i++) {
68
0
        FT_MUTEX_LOCK(&interp->codecs.search_path_mutex);
69
0
        PyObject *item = PyList_GetItemRef(codec_search_path, i);
70
0
        int ret = 1;
71
0
        if (item == search_function) {
72
            // We hold a reference to the item, so its destructor can't run
73
            // while we hold search_path_mutex.
74
0
            ret = PyList_SetSlice(codec_search_path, i, i+1, NULL);
75
0
        }
76
0
        FT_MUTEX_UNLOCK(&interp->codecs.search_path_mutex);
77
0
        Py_DECREF(item);
78
0
        if (ret != 1) {
79
0
            assert(interp->codecs.search_cache != NULL);
80
0
            assert(PyDict_CheckExact(interp->codecs.search_cache));
81
0
            PyDict_Clear(interp->codecs.search_cache);
82
0
            return ret;
83
0
        }
84
0
    }
85
0
    return 0;
86
0
}
87
88
/* Convert a string to a normalized Python string: all ASCII letters are
89
   converted to lower case, spaces are replaced with hyphens. */
90
91
static PyObject*
92
normalizestring(const char *string)
93
2.41M
{
94
2.41M
    size_t i;
95
2.41M
    size_t len = strlen(string);
96
2.41M
    char *p;
97
2.41M
    PyObject *v;
98
99
2.41M
    if (len > PY_SSIZE_T_MAX) {
100
0
        PyErr_SetString(PyExc_OverflowError, "string is too large");
101
0
        return NULL;
102
0
    }
103
104
2.41M
    p = PyMem_Malloc(len + 1);
105
2.41M
    if (p == NULL)
106
0
        return PyErr_NoMemory();
107
33.9M
    for (i = 0; i < len; i++) {
108
31.4M
        char ch = string[i];
109
31.4M
        if (ch == ' ')
110
219k
            ch = '-';
111
31.2M
        else
112
31.2M
            ch = Py_TOLOWER(Py_CHARMASK(ch));
113
31.4M
        p[i] = ch;
114
31.4M
    }
115
2.41M
    p[i] = '\0';
116
2.41M
    v = PyUnicode_FromString(p);
117
2.41M
    PyMem_Free(p);
118
2.41M
    return v;
119
2.41M
}
120
121
/* Lookup the given encoding and return a tuple providing the codec
122
   facilities.
123
124
   ASCII letters in the encoding string is looked up converted to all
125
   lower case. This makes encodings looked up through this mechanism
126
   effectively case-insensitive. Spaces are replaced with hyphens for
127
   names like "US ASCII" and "ISO 8859-1".
128
129
   If no codec is found, a LookupError is set and NULL returned.
130
131
   As side effect, this tries to load the encodings package, if not
132
   yet done. This is part of the lazy load strategy for the encodings
133
   package.
134
135
*/
136
137
PyObject *_PyCodec_Lookup(const char *encoding)
138
2.41M
{
139
2.41M
    if (encoding == NULL) {
140
0
        PyErr_BadArgument();
141
0
        return NULL;
142
0
    }
143
144
2.41M
    PyInterpreterState *interp = _PyInterpreterState_GET();
145
2.41M
    assert(interp->codecs.initialized);
146
147
    /* Convert the encoding to a normalized Python string: all
148
       ASCII letters are converted to lower case, spaces are
149
       replaced with hyphens. */
150
2.41M
    PyObject *v = normalizestring(encoding);
151
2.41M
    if (v == NULL) {
152
0
        return NULL;
153
0
    }
154
155
    /* Intern the string. We'll make it immortal later if lookup succeeds. */
156
2.41M
    _PyUnicode_InternMortal(interp, &v);
157
158
    /* First, try to lookup the name in the registry dictionary */
159
2.41M
    PyObject *result;
160
2.41M
    if (PyDict_GetItemRef(interp->codecs.search_cache, v, &result) < 0) {
161
0
        goto onError;
162
0
    }
163
2.41M
    if (result != NULL) {
164
2.34M
        Py_DECREF(v);
165
2.34M
        return result;
166
2.34M
    }
167
168
    /* Next, scan the search functions in order of registration */
169
68.3k
    const Py_ssize_t len = PyList_Size(interp->codecs.search_path);
170
68.3k
    if (len < 0)
171
0
        goto onError;
172
68.3k
    if (len == 0) {
173
0
        PyErr_SetString(PyExc_LookupError,
174
0
                        "no codec search functions registered: "
175
0
                        "can't find encoding");
176
0
        goto onError;
177
0
    }
178
179
68.3k
    Py_ssize_t i;
180
134k
    for (i = 0; i < len; i++) {
181
68.3k
        PyObject *func;
182
183
68.3k
        func = PyList_GetItemRef(interp->codecs.search_path, i);
184
68.3k
        if (func == NULL)
185
0
            goto onError;
186
68.3k
        result = PyObject_CallOneArg(func, v);
187
68.3k
        Py_DECREF(func);
188
68.3k
        if (result == NULL)
189
0
            goto onError;
190
68.3k
        if (result == Py_None) {
191
66.2k
            Py_CLEAR(result);
192
66.2k
            continue;
193
66.2k
        }
194
2.09k
        if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
195
0
            PyErr_SetString(PyExc_TypeError,
196
0
                            "codec search functions must return 4-tuples");
197
0
            Py_DECREF(result);
198
0
            goto onError;
199
0
        }
200
2.09k
        break;
201
2.09k
    }
202
68.3k
    if (result == NULL) {
203
        /* XXX Perhaps we should cache misses too ? */
204
66.2k
        PyErr_Format(PyExc_LookupError,
205
66.2k
                     "unknown encoding: %s", encoding);
206
66.2k
        goto onError;
207
66.2k
    }
208
209
2.09k
    _PyUnicode_InternImmortal(interp, &v);
210
211
    /* Cache and return the result */
212
2.09k
    if (PyDict_SetItem(interp->codecs.search_cache, v, result) < 0) {
213
0
        Py_DECREF(result);
214
0
        goto onError;
215
0
    }
216
2.09k
    Py_DECREF(v);
217
2.09k
    return result;
218
219
66.2k
 onError:
220
66.2k
    Py_DECREF(v);
221
66.2k
    return NULL;
222
2.09k
}
223
224
/* Codec registry encoding check API. */
225
226
int PyCodec_KnownEncoding(const char *encoding)
227
0
{
228
0
    PyObject *codecs;
229
230
0
    codecs = _PyCodec_Lookup(encoding);
231
0
    if (!codecs) {
232
0
        PyErr_Clear();
233
0
        return 0;
234
0
    }
235
0
    else {
236
0
        Py_DECREF(codecs);
237
0
        return 1;
238
0
    }
239
0
}
240
241
static
242
PyObject *args_tuple(PyObject *object,
243
                     const char *errors)
244
2.07M
{
245
2.07M
    PyObject *args;
246
247
2.07M
    args = PyTuple_New(1 + (errors != NULL));
248
2.07M
    if (args == NULL)
249
0
        return NULL;
250
2.07M
    PyTuple_SET_ITEM(args, 0, Py_NewRef(object));
251
2.07M
    if (errors) {
252
187k
        PyObject *v;
253
254
187k
        v = PyUnicode_FromString(errors);
255
187k
        if (v == NULL) {
256
0
            Py_DECREF(args);
257
0
            return NULL;
258
0
        }
259
187k
        PyTuple_SET_ITEM(args, 1, v);
260
187k
    }
261
2.07M
    return args;
262
2.07M
}
263
264
/* Helper function to get a codec item */
265
266
static
267
PyObject *codec_getitem(const char *encoding, int index)
268
0
{
269
0
    PyObject *codecs;
270
0
    PyObject *v;
271
272
0
    codecs = _PyCodec_Lookup(encoding);
273
0
    if (codecs == NULL)
274
0
        return NULL;
275
0
    v = PyTuple_GET_ITEM(codecs, index);
276
0
    Py_DECREF(codecs);
277
0
    return Py_NewRef(v);
278
0
}
279
280
/* Helper functions to create an incremental codec. */
281
static
282
PyObject *codec_makeincrementalcodec(PyObject *codec_info,
283
                                     const char *errors,
284
                                     const char *attrname)
285
127
{
286
127
    PyObject *ret, *inccodec;
287
288
127
    inccodec = PyObject_GetAttrString(codec_info, attrname);
289
127
    if (inccodec == NULL)
290
0
        return NULL;
291
127
    if (errors)
292
127
        ret = PyObject_CallFunction(inccodec, "s", errors);
293
0
    else
294
0
        ret = _PyObject_CallNoArgs(inccodec);
295
127
    Py_DECREF(inccodec);
296
127
    return ret;
297
127
}
298
299
static
300
PyObject *codec_getincrementalcodec(const char *encoding,
301
                                    const char *errors,
302
                                    const char *attrname)
303
0
{
304
0
    PyObject *codec_info, *ret;
305
306
0
    codec_info = _PyCodec_Lookup(encoding);
307
0
    if (codec_info == NULL)
308
0
        return NULL;
309
0
    ret = codec_makeincrementalcodec(codec_info, errors, attrname);
310
0
    Py_DECREF(codec_info);
311
0
    return ret;
312
0
}
313
314
/* Helper function to create a stream codec. */
315
316
static
317
PyObject *codec_getstreamcodec(const char *encoding,
318
                               PyObject *stream,
319
                               const char *errors,
320
                               const int index)
321
0
{
322
0
    PyObject *codecs, *streamcodec, *codeccls;
323
324
0
    codecs = _PyCodec_Lookup(encoding);
325
0
    if (codecs == NULL)
326
0
        return NULL;
327
328
0
    codeccls = PyTuple_GET_ITEM(codecs, index);
329
0
    if (errors != NULL)
330
0
        streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
331
0
    else
332
0
        streamcodec = PyObject_CallOneArg(codeccls, stream);
333
0
    Py_DECREF(codecs);
334
0
    return streamcodec;
335
0
}
336
337
/* Helpers to work with the result of _PyCodec_Lookup
338
339
 */
340
PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info,
341
                                             const char *errors)
342
54
{
343
54
    return codec_makeincrementalcodec(codec_info, errors,
344
54
                                      "incrementaldecoder");
345
54
}
346
347
PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info,
348
                                             const char *errors)
349
73
{
350
73
    return codec_makeincrementalcodec(codec_info, errors,
351
73
                                      "incrementalencoder");
352
73
}
353
354
355
/* Convenience APIs to query the Codec registry.
356
357
   All APIs return a codec object with incremented refcount.
358
359
 */
360
361
PyObject *PyCodec_Encoder(const char *encoding)
362
0
{
363
0
    return codec_getitem(encoding, 0);
364
0
}
365
366
PyObject *PyCodec_Decoder(const char *encoding)
367
0
{
368
0
    return codec_getitem(encoding, 1);
369
0
}
370
371
PyObject *PyCodec_IncrementalEncoder(const char *encoding,
372
                                     const char *errors)
373
0
{
374
0
    return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
375
0
}
376
377
PyObject *PyCodec_IncrementalDecoder(const char *encoding,
378
                                     const char *errors)
379
0
{
380
0
    return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
381
0
}
382
383
PyObject *PyCodec_StreamReader(const char *encoding,
384
                               PyObject *stream,
385
                               const char *errors)
386
0
{
387
0
    return codec_getstreamcodec(encoding, stream, errors, 2);
388
0
}
389
390
PyObject *PyCodec_StreamWriter(const char *encoding,
391
                               PyObject *stream,
392
                               const char *errors)
393
0
{
394
0
    return codec_getstreamcodec(encoding, stream, errors, 3);
395
0
}
396
397
/* Encode an object (e.g. a Unicode object) using the given encoding
398
   and return the resulting encoded object (usually a Python string).
399
400
   errors is passed to the encoder factory as argument if non-NULL. */
401
402
static PyObject *
403
_PyCodec_EncodeInternal(PyObject *object,
404
                        PyObject *encoder,
405
                        const char *encoding,
406
                        const char *errors)
407
842k
{
408
842k
    PyObject *args = NULL, *result = NULL;
409
842k
    PyObject *v = NULL;
410
411
842k
    args = args_tuple(object, errors);
412
842k
    if (args == NULL)
413
0
        goto onError;
414
415
842k
    result = PyObject_Call(encoder, args, NULL);
416
842k
    if (result == NULL) {
417
0
        _PyErr_FormatNote("%s with '%s' codec failed", "encoding", encoding);
418
0
        goto onError;
419
0
    }
420
421
842k
    if (!PyTuple_Check(result) ||
422
842k
        PyTuple_GET_SIZE(result) != 2) {
423
0
        PyErr_SetString(PyExc_TypeError,
424
0
                        "encoder must return a tuple (object, integer)");
425
0
        goto onError;
426
0
    }
427
842k
    v = Py_NewRef(PyTuple_GET_ITEM(result,0));
428
    /* We don't check or use the second (integer) entry. */
429
430
842k
    Py_DECREF(args);
431
842k
    Py_DECREF(encoder);
432
842k
    Py_DECREF(result);
433
842k
    return v;
434
435
0
 onError:
436
0
    Py_XDECREF(result);
437
0
    Py_XDECREF(args);
438
0
    Py_XDECREF(encoder);
439
0
    return NULL;
440
842k
}
441
442
/* Decode an object (usually a Python string) using the given encoding
443
   and return an equivalent object (e.g. a Unicode object).
444
445
   errors is passed to the decoder factory as argument if non-NULL. */
446
447
static PyObject *
448
_PyCodec_DecodeInternal(PyObject *object,
449
                        PyObject *decoder,
450
                        const char *encoding,
451
                        const char *errors)
452
1.23M
{
453
1.23M
    PyObject *args = NULL, *result = NULL;
454
1.23M
    PyObject *v;
455
456
1.23M
    args = args_tuple(object, errors);
457
1.23M
    if (args == NULL)
458
0
        goto onError;
459
460
1.23M
    result = PyObject_Call(decoder, args, NULL);
461
1.23M
    if (result == NULL) {
462
72.2k
        _PyErr_FormatNote("%s with '%s' codec failed", "decoding", encoding);
463
72.2k
        goto onError;
464
72.2k
    }
465
1.15M
    if (!PyTuple_Check(result) ||
466
1.15M
        PyTuple_GET_SIZE(result) != 2) {
467
0
        PyErr_SetString(PyExc_TypeError,
468
0
                        "decoder must return a tuple (object,integer)");
469
0
        goto onError;
470
0
    }
471
1.15M
    v = Py_NewRef(PyTuple_GET_ITEM(result,0));
472
    /* We don't check or use the second (integer) entry. */
473
474
1.15M
    Py_DECREF(args);
475
1.15M
    Py_DECREF(decoder);
476
1.15M
    Py_DECREF(result);
477
1.15M
    return v;
478
479
72.2k
 onError:
480
72.2k
    Py_XDECREF(args);
481
72.2k
    Py_XDECREF(decoder);
482
72.2k
    Py_XDECREF(result);
483
72.2k
    return NULL;
484
1.15M
}
485
486
/* Generic encoding/decoding API */
487
PyObject *PyCodec_Encode(PyObject *object,
488
                         const char *encoding,
489
                         const char *errors)
490
0
{
491
0
    PyObject *encoder;
492
493
0
    encoder = PyCodec_Encoder(encoding);
494
0
    if (encoder == NULL)
495
0
        return NULL;
496
497
0
    return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
498
0
}
499
500
PyObject *PyCodec_Decode(PyObject *object,
501
                         const char *encoding,
502
                         const char *errors)
503
0
{
504
0
    PyObject *decoder;
505
506
0
    decoder = PyCodec_Decoder(encoding);
507
0
    if (decoder == NULL)
508
0
        return NULL;
509
510
0
    return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
511
0
}
512
513
/* Text encoding/decoding API */
514
PyObject * _PyCodec_LookupTextEncoding(const char *encoding,
515
                                       const char *alternate_command)
516
2.08M
{
517
2.08M
    PyObject *codec;
518
2.08M
    PyObject *attr;
519
2.08M
    int is_text_codec;
520
521
2.08M
    codec = _PyCodec_Lookup(encoding);
522
2.08M
    if (codec == NULL)
523
11.3k
        return NULL;
524
525
    /* Backwards compatibility: assume any raw tuple describes a text
526
     * encoding, and the same for anything lacking the private
527
     * attribute.
528
     */
529
2.07M
    if (!PyTuple_CheckExact(codec)) {
530
2.07M
        if (PyObject_GetOptionalAttr(codec, &_Py_ID(_is_text_encoding), &attr) < 0) {
531
0
            Py_DECREF(codec);
532
0
            return NULL;
533
0
        }
534
2.07M
        if (attr != NULL) {
535
2.07M
            is_text_codec = PyObject_IsTrue(attr);
536
2.07M
            Py_DECREF(attr);
537
2.07M
            if (is_text_codec <= 0) {
538
3.34k
                Py_DECREF(codec);
539
3.34k
                if (!is_text_codec) {
540
3.34k
                    if (alternate_command != NULL) {
541
3.34k
                        PyErr_Format(PyExc_LookupError,
542
3.34k
                                     "'%.400s' is not a text encoding; "
543
3.34k
                                     "use %s to handle arbitrary codecs",
544
3.34k
                                     encoding, alternate_command);
545
3.34k
                    }
546
0
                    else {
547
0
                        PyErr_Format(PyExc_LookupError,
548
0
                                     "'%.400s' is not a text encoding",
549
0
                                     encoding);
550
0
                    }
551
3.34k
                }
552
3.34k
                return NULL;
553
3.34k
            }
554
2.07M
        }
555
2.07M
    }
556
557
    /* This appears to be a valid text encoding */
558
2.07M
    return codec;
559
2.07M
}
560
561
562
static
563
PyObject *codec_getitem_checked(const char *encoding,
564
                                const char *alternate_command,
565
                                int index)
566
2.08M
{
567
2.08M
    PyObject *codec;
568
2.08M
    PyObject *v;
569
570
2.08M
    codec = _PyCodec_LookupTextEncoding(encoding, alternate_command);
571
2.08M
    if (codec == NULL)
572
14.6k
        return NULL;
573
574
2.07M
    v = Py_NewRef(PyTuple_GET_ITEM(codec, index));
575
2.07M
    Py_DECREF(codec);
576
2.07M
    return v;
577
2.08M
}
578
579
static PyObject * _PyCodec_TextEncoder(const char *encoding)
580
842k
{
581
842k
    return codec_getitem_checked(encoding, "codecs.encode()", 0);
582
842k
}
583
584
static PyObject * _PyCodec_TextDecoder(const char *encoding)
585
1.24M
{
586
1.24M
    return codec_getitem_checked(encoding, "codecs.decode()", 1);
587
1.24M
}
588
589
PyObject *_PyCodec_EncodeText(PyObject *object,
590
                              const char *encoding,
591
                              const char *errors)
592
842k
{
593
842k
    PyObject *encoder;
594
595
842k
    encoder = _PyCodec_TextEncoder(encoding);
596
842k
    if (encoder == NULL)
597
0
        return NULL;
598
599
842k
    return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
600
842k
}
601
602
PyObject *_PyCodec_DecodeText(PyObject *object,
603
                              const char *encoding,
604
                              const char *errors)
605
1.24M
{
606
1.24M
    PyObject *decoder;
607
608
1.24M
    decoder = _PyCodec_TextDecoder(encoding);
609
1.24M
    if (decoder == NULL)
610
14.6k
        return NULL;
611
612
1.23M
    return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
613
1.24M
}
614
615
/* Register the error handling callback function error under the name
616
   name. This function will be called by the codec when it encounters
617
   an unencodable characters/undecodable bytes and doesn't know the
618
   callback name, when name is specified as the error parameter
619
   in the call to the encode/decode function.
620
   Return 0 on success, -1 on error */
621
int PyCodec_RegisterError(const char *name, PyObject *error)
622
0
{
623
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
624
0
    assert(interp->codecs.initialized);
625
0
    if (!PyCallable_Check(error)) {
626
0
        PyErr_SetString(PyExc_TypeError, "handler must be callable");
627
0
        return -1;
628
0
    }
629
0
    return PyDict_SetItemString(interp->codecs.error_registry,
630
0
                                name, error);
631
0
}
632
633
int _PyCodec_UnregisterError(const char *name)
634
0
{
635
0
    for (size_t i = 0; i < Py_ARRAY_LENGTH(codecs_builtin_error_handlers); ++i) {
636
0
        if (strcmp(name, codecs_builtin_error_handlers[i]) == 0) {
637
0
            PyErr_Format(PyExc_ValueError,
638
0
                         "cannot un-register built-in error handler '%s'", name);
639
0
            return -1;
640
0
        }
641
0
    }
642
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
643
0
    assert(interp->codecs.initialized);
644
0
    return PyDict_PopString(interp->codecs.error_registry, name, NULL);
645
0
}
646
647
/* Lookup the error handling callback function registered under the
648
   name error. As a special case NULL can be passed, in which case
649
   the error handling callback for strict encoding will be returned. */
650
PyObject *PyCodec_LookupError(const char *name)
651
3.28M
{
652
3.28M
    PyInterpreterState *interp = _PyInterpreterState_GET();
653
3.28M
    assert(interp->codecs.initialized);
654
655
3.28M
    if (name==NULL)
656
161k
        name = "strict";
657
3.28M
    PyObject *handler;
658
3.28M
    if (PyDict_GetItemStringRef(interp->codecs.error_registry, name, &handler) < 0) {
659
0
        return NULL;
660
0
    }
661
3.28M
    if (handler == NULL) {
662
0
        PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
663
0
        return NULL;
664
0
    }
665
3.28M
    return handler;
666
3.28M
}
667
668
669
static inline void
670
wrong_exception_type(PyObject *exc)
671
0
{
672
0
    PyErr_Format(PyExc_TypeError,
673
0
                 "don't know how to handle %T in error callback", exc);
674
0
}
675
676
677
#define _PyIsUnicodeEncodeError(EXC)    \
678
312k
    PyObject_TypeCheck(EXC, (PyTypeObject *)PyExc_UnicodeEncodeError)
679
#define _PyIsUnicodeDecodeError(EXC)    \
680
300k
    PyObject_TypeCheck(EXC, (PyTypeObject *)PyExc_UnicodeDecodeError)
681
#define _PyIsUnicodeTranslateError(EXC) \
682
0
    PyObject_TypeCheck(EXC, (PyTypeObject *)PyExc_UnicodeTranslateError)
683
684
685
// --- codecs handlers: utilities ---------------------------------------------
686
687
/*
688
 * Return the number of characters (including special prefixes)
689
 * needed to represent 'ch' by codec_handler_write_unicode_hex().
690
 */
691
static inline Py_ssize_t
692
codec_handler_unicode_hex_width(Py_UCS4 ch)
693
0
{
694
0
    if (ch >= 0x10000) {
695
        // format: '\\' + 'U' + 8 hex digits
696
0
        return 1 + 1 + 8;
697
0
    }
698
0
    else if (ch >= 0x100) {
699
        // format: '\\' + 'u' + 4 hex digits
700
0
        return 1 + 1 + 4;
701
0
    }
702
0
    else {
703
        // format: '\\' + 'x' + 2 hex digits
704
0
        return 1 + 1 + 2;
705
0
    }
706
0
}
707
708
709
/*
710
 * Write the hexadecimal representation of 'ch' to the buffer pointed by 'p'
711
 * using 2, 4, or 8 characters prefixed by '\x', '\u', or '\U' respectively.
712
 */
713
static inline void
714
codec_handler_write_unicode_hex(Py_UCS1 **p, Py_UCS4 ch)
715
0
{
716
0
    *(*p)++ = '\\';
717
0
    if (ch >= 0x10000) {
718
0
        *(*p)++ = 'U';
719
0
        *(*p)++ = Py_hexdigits[(ch >> 28) & 0xf];
720
0
        *(*p)++ = Py_hexdigits[(ch >> 24) & 0xf];
721
0
        *(*p)++ = Py_hexdigits[(ch >> 20) & 0xf];
722
0
        *(*p)++ = Py_hexdigits[(ch >> 16) & 0xf];
723
0
        *(*p)++ = Py_hexdigits[(ch >> 12) & 0xf];
724
0
        *(*p)++ = Py_hexdigits[(ch >> 8) & 0xf];
725
0
    }
726
0
    else if (ch >= 0x100) {
727
0
        *(*p)++ = 'u';
728
0
        *(*p)++ = Py_hexdigits[(ch >> 12) & 0xf];
729
0
        *(*p)++ = Py_hexdigits[(ch >> 8) & 0xf];
730
0
    }
731
0
    else {
732
0
        *(*p)++ = 'x';
733
0
    }
734
0
    *(*p)++ = Py_hexdigits[(ch >> 4) & 0xf];
735
0
    *(*p)++ = Py_hexdigits[ch & 0xf];
736
0
}
737
738
739
/*
740
 * Determine the number of digits for a decimal representation of Unicode
741
 * codepoint 'ch' (by design, Unicode codepoints are limited to 7 digits).
742
 */
743
static inline int
744
n_decimal_digits_for_codepoint(Py_UCS4 ch)
745
0
{
746
0
    if (ch < 10) return 1;
747
0
    if (ch < 100) return 2;
748
0
    if (ch < 1000) return 3;
749
0
    if (ch < 10000) return 4;
750
0
    if (ch < 100000) return 5;
751
0
    if (ch < 1000000) return 6;
752
0
    if (ch < 10000000) return 7;
753
    // Unicode codepoints are limited to 1114111 (7 decimal digits)
754
0
    Py_UNREACHABLE();
755
0
}
756
757
758
/*
759
 * Create a Unicode string containing 'count' copies of the official
760
 * Unicode REPLACEMENT CHARACTER (0xFFFD).
761
 */
762
static PyObject *
763
codec_handler_unicode_replacement_character(Py_ssize_t count)
764
228k
{
765
228k
    PyObject *res = PyUnicode_New(count, Py_UNICODE_REPLACEMENT_CHARACTER);
766
228k
    if (res == NULL) {
767
0
        return NULL;
768
0
    }
769
228k
    assert(count == 0 || PyUnicode_KIND(res) == PyUnicode_2BYTE_KIND);
770
228k
    Py_UCS2 *outp = PyUnicode_2BYTE_DATA(res);
771
456k
    for (Py_ssize_t i = 0; i < count; ++i) {
772
228k
        outp[i] = Py_UNICODE_REPLACEMENT_CHARACTER;
773
228k
    }
774
228k
    assert(_PyUnicode_CheckConsistency(res, 1));
775
228k
    return res;
776
228k
}
777
778
779
// --- handler: 'strict' ------------------------------------------------------
780
781
PyObject *PyCodec_StrictErrors(PyObject *exc)
782
3.78M
{
783
3.78M
    if (PyExceptionInstance_Check(exc)) {
784
3.78M
        PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
785
3.78M
    }
786
0
    else {
787
0
        PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
788
0
    }
789
3.78M
    return NULL;
790
3.78M
}
791
792
793
// --- handler: 'ignore' ------------------------------------------------------
794
795
static PyObject *
796
_PyCodec_IgnoreError(PyObject *exc, int as_bytes)
797
0
{
798
0
    Py_ssize_t end;
799
0
    if (_PyUnicodeError_GetParams(exc, NULL, NULL, NULL,
800
0
                                  &end, NULL, as_bytes) < 0)
801
0
    {
802
0
        return NULL;
803
0
    }
804
0
    return Py_BuildValue("(Nn)", Py_GetConstant(Py_CONSTANT_EMPTY_STR), end);
805
0
}
806
807
808
PyObject *PyCodec_IgnoreErrors(PyObject *exc)
809
0
{
810
0
    if (_PyIsUnicodeEncodeError(exc) || _PyIsUnicodeTranslateError(exc)) {
811
0
        return _PyCodec_IgnoreError(exc, false);
812
0
    }
813
0
    else if (_PyIsUnicodeDecodeError(exc)) {
814
0
        return _PyCodec_IgnoreError(exc, true);
815
0
    }
816
0
    else {
817
0
        wrong_exception_type(exc);
818
0
        return NULL;
819
0
    }
820
0
}
821
822
823
// --- handler: 'replace' -----------------------------------------------------
824
825
static PyObject *
826
_PyCodec_ReplaceUnicodeEncodeError(PyObject *exc)
827
0
{
828
0
    Py_ssize_t start, end, slen;
829
0
    if (_PyUnicodeError_GetParams(exc, NULL, NULL,
830
0
                                  &start, &end, &slen, false) < 0)
831
0
    {
832
0
        return NULL;
833
0
    }
834
0
    PyObject *res = PyUnicode_New(slen, '?');
835
0
    if (res == NULL) {
836
0
        return NULL;
837
0
    }
838
0
    assert(PyUnicode_KIND(res) == PyUnicode_1BYTE_KIND);
839
0
    Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
840
0
    memset(outp, '?', sizeof(Py_UCS1) * slen);
841
0
    assert(_PyUnicode_CheckConsistency(res, 1));
842
0
    return Py_BuildValue("(Nn)", res, end);
843
0
}
844
845
846
static PyObject *
847
_PyCodec_ReplaceUnicodeDecodeError(PyObject *exc)
848
228k
{
849
228k
    Py_ssize_t end;
850
228k
    if (PyUnicodeDecodeError_GetEnd(exc, &end) < 0) {
851
0
        return NULL;
852
0
    }
853
228k
    PyObject *res = codec_handler_unicode_replacement_character(1);
854
228k
    if (res == NULL) {
855
0
        return NULL;
856
0
    }
857
228k
    return Py_BuildValue("(Nn)", res, end);
858
228k
}
859
860
861
static PyObject *
862
_PyCodec_ReplaceUnicodeTranslateError(PyObject *exc)
863
0
{
864
0
    Py_ssize_t start, end, slen;
865
0
    if (_PyUnicodeError_GetParams(exc, NULL, NULL,
866
0
                                  &start, &end, &slen, false) < 0)
867
0
    {
868
0
        return NULL;
869
0
    }
870
0
    PyObject *res = codec_handler_unicode_replacement_character(slen);
871
0
    if (res == NULL) {
872
0
        return NULL;
873
0
    }
874
0
    return Py_BuildValue("(Nn)", res, end);
875
0
}
876
877
878
PyObject *PyCodec_ReplaceErrors(PyObject *exc)
879
228k
{
880
228k
    if (_PyIsUnicodeEncodeError(exc)) {
881
0
        return _PyCodec_ReplaceUnicodeEncodeError(exc);
882
0
    }
883
228k
    else if (_PyIsUnicodeDecodeError(exc)) {
884
228k
        return _PyCodec_ReplaceUnicodeDecodeError(exc);
885
228k
    }
886
0
    else if (_PyIsUnicodeTranslateError(exc)) {
887
0
        return _PyCodec_ReplaceUnicodeTranslateError(exc);
888
0
    }
889
0
    else {
890
0
        wrong_exception_type(exc);
891
0
        return NULL;
892
0
    }
893
228k
}
894
895
896
// --- handler: 'xmlcharrefreplace' -------------------------------------------
897
898
PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
899
0
{
900
0
    if (!_PyIsUnicodeEncodeError(exc)) {
901
0
        wrong_exception_type(exc);
902
0
        return NULL;
903
0
    }
904
905
0
    PyObject *obj;
906
0
    Py_ssize_t objlen, start, end, slen;
907
0
    if (_PyUnicodeError_GetParams(exc,
908
0
                                  &obj, &objlen,
909
0
                                  &start, &end, &slen, false) < 0)
910
0
    {
911
0
        return NULL;
912
0
    }
913
914
    // The number of characters that each character 'ch' contributes
915
    // in the result is 2 + k + 1, where k = min{t >= 1 | 10^t > ch}
916
    // and will be formatted as "&#" + DIGITS + ";". Since the Unicode
917
    // range is below 10^7, each "block" requires at most 2 + 7 + 1
918
    // characters.
919
0
    if (slen > PY_SSIZE_T_MAX / (2 + 7 + 1)) {
920
0
        end = start + PY_SSIZE_T_MAX / (2 + 7 + 1);
921
0
        end = Py_MIN(end, objlen);
922
0
        slen = Py_MAX(0, end - start);
923
0
    }
924
925
0
    Py_ssize_t ressize = 0;
926
0
    for (Py_ssize_t i = start; i < end; ++i) {
927
0
        Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
928
0
        int k = n_decimal_digits_for_codepoint(ch);
929
0
        assert(k != 0);
930
0
        assert(k <= 7);
931
0
        ressize += 2 + k + 1;
932
0
    }
933
934
    /* allocate replacement */
935
0
    PyObject *res = PyUnicode_New(ressize, 127);
936
0
    if (res == NULL) {
937
0
        Py_DECREF(obj);
938
0
        return NULL;
939
0
    }
940
0
    Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
941
    /* generate replacement */
942
0
    for (Py_ssize_t i = start; i < end; ++i) {
943
0
        Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
944
        /*
945
         * Write the decimal representation of 'ch' to the buffer pointed by 'p'
946
         * using at most 7 characters prefixed by '&#' and suffixed by ';'.
947
         */
948
0
        *outp++ = '&';
949
0
        *outp++ = '#';
950
0
        Py_UCS1 *digit_end = outp + n_decimal_digits_for_codepoint(ch);
951
0
        for (Py_UCS1 *p_digit = digit_end - 1; p_digit >= outp; --p_digit) {
952
0
            *p_digit = '0' + (ch % 10);
953
0
            ch /= 10;
954
0
        }
955
0
        assert(ch == 0);
956
0
        outp = digit_end;
957
0
        *outp++ = ';';
958
0
    }
959
0
    assert(_PyUnicode_CheckConsistency(res, 1));
960
0
    PyObject *restuple = Py_BuildValue("(Nn)", res, end);
961
0
    Py_DECREF(obj);
962
0
    return restuple;
963
0
}
964
965
966
// --- handler: 'backslashreplace' --------------------------------------------
967
968
static PyObject *
969
_PyCodec_BackslashReplaceUnicodeEncodeError(PyObject *exc)
970
0
{
971
0
    PyObject *obj;
972
0
    Py_ssize_t objlen, start, end, slen;
973
0
    if (_PyUnicodeError_GetParams(exc,
974
0
                                  &obj, &objlen,
975
0
                                  &start, &end, &slen, false) < 0)
976
0
    {
977
0
        return NULL;
978
0
    }
979
980
    // The number of characters that each character 'ch' contributes
981
    // in the result is 1 + 1 + k, where k >= min{t >= 1 | 16^t > ch}
982
    // and will be formatted as "\\" + ('U'|'u'|'x') + HEXDIGITS,
983
    // where the number of hexdigits is either 2, 4, or 8 (not 6).
984
    // Since the Unicode range is below 10^7, we choose k = 8 whence
985
    // each "block" requires at most 1 + 1 + 8 characters.
986
0
    if (slen > PY_SSIZE_T_MAX / (1 + 1 + 8)) {
987
0
        end = start + PY_SSIZE_T_MAX / (1 + 1 + 8);
988
0
        end = Py_MIN(end, objlen);
989
0
        slen = Py_MAX(0, end - start);
990
0
    }
991
992
0
    Py_ssize_t ressize = 0;
993
0
    for (Py_ssize_t i = start; i < end; ++i) {
994
0
        Py_UCS4 c = PyUnicode_READ_CHAR(obj, i);
995
0
        ressize += codec_handler_unicode_hex_width(c);
996
0
    }
997
0
    PyObject *res = PyUnicode_New(ressize, 127);
998
0
    if (res == NULL) {
999
0
        Py_DECREF(obj);
1000
0
        return NULL;
1001
0
    }
1002
0
    Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
1003
0
    for (Py_ssize_t i = start; i < end; ++i) {
1004
0
        Py_UCS4 c = PyUnicode_READ_CHAR(obj, i);
1005
0
        codec_handler_write_unicode_hex(&outp, c);
1006
0
    }
1007
0
    assert(_PyUnicode_CheckConsistency(res, 1));
1008
0
    Py_DECREF(obj);
1009
0
    return Py_BuildValue("(Nn)", res, end);
1010
0
}
1011
1012
1013
static PyObject *
1014
_PyCodec_BackslashReplaceUnicodeDecodeError(PyObject *exc)
1015
0
{
1016
0
    PyObject *obj;
1017
0
    Py_ssize_t objlen, start, end, slen;
1018
0
    if (_PyUnicodeError_GetParams(exc,
1019
0
                                  &obj, &objlen,
1020
0
                                  &start, &end, &slen, true) < 0)
1021
0
    {
1022
0
        return NULL;
1023
0
    }
1024
1025
0
    PyObject *res = PyUnicode_New(4 * slen, 127);
1026
0
    if (res == NULL) {
1027
0
        Py_DECREF(obj);
1028
0
        return NULL;
1029
0
    }
1030
1031
0
    Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
1032
0
    const unsigned char *p = (const unsigned char *)PyBytes_AS_STRING(obj);
1033
0
    for (Py_ssize_t i = start; i < end; i++, outp += 4) {
1034
0
        const unsigned char ch = p[i];
1035
0
        outp[0] = '\\';
1036
0
        outp[1] = 'x';
1037
0
        outp[2] = Py_hexdigits[(ch >> 4) & 0xf];
1038
0
        outp[3] = Py_hexdigits[ch & 0xf];
1039
0
    }
1040
0
    assert(_PyUnicode_CheckConsistency(res, 1));
1041
0
    Py_DECREF(obj);
1042
0
    return Py_BuildValue("(Nn)", res, end);
1043
0
}
1044
1045
1046
static inline PyObject *
1047
_PyCodec_BackslashReplaceUnicodeTranslateError(PyObject *exc)
1048
0
{
1049
    // Same implementation as for UnicodeEncodeError objects.
1050
0
    return _PyCodec_BackslashReplaceUnicodeEncodeError(exc);
1051
0
}
1052
1053
1054
PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
1055
0
{
1056
0
    if (_PyIsUnicodeEncodeError(exc)) {
1057
0
        return _PyCodec_BackslashReplaceUnicodeEncodeError(exc);
1058
0
    }
1059
0
    else if (_PyIsUnicodeDecodeError(exc)) {
1060
0
        return _PyCodec_BackslashReplaceUnicodeDecodeError(exc);
1061
0
    }
1062
0
    else if (_PyIsUnicodeTranslateError(exc)) {
1063
0
        return _PyCodec_BackslashReplaceUnicodeTranslateError(exc);
1064
0
    }
1065
0
    else {
1066
0
        wrong_exception_type(exc);
1067
0
        return NULL;
1068
0
    }
1069
0
}
1070
1071
1072
// --- handler: 'namereplace' -------------------------------------------------
1073
1074
PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
1075
0
{
1076
0
    if (!_PyIsUnicodeEncodeError(exc)) {
1077
0
        wrong_exception_type(exc);
1078
0
        return NULL;
1079
0
    }
1080
1081
0
    _PyUnicode_Name_CAPI *ucnhash_capi = _PyUnicode_GetNameCAPI();
1082
0
    if (ucnhash_capi == NULL) {
1083
0
        return NULL;
1084
0
    }
1085
1086
0
    PyObject *obj;
1087
0
    Py_ssize_t start, end;
1088
0
    if (_PyUnicodeError_GetParams(exc,
1089
0
                                  &obj, NULL,
1090
0
                                  &start, &end, NULL, false) < 0)
1091
0
    {
1092
0
        return NULL;
1093
0
    }
1094
1095
0
    char buffer[256]; /* NAME_MAXLEN in unicodename_db.h */
1096
0
    Py_ssize_t imax = start, ressize = 0, replsize;
1097
0
    for (; imax < end; ++imax) {
1098
0
        Py_UCS4 c = PyUnicode_READ_CHAR(obj, imax);
1099
0
        if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) {
1100
            // If 'c' is recognized by getname(), the corresponding replacement
1101
            // is '\\' + 'N' + '{' + NAME + '}', i.e. 1 + 1 + 1 + len(NAME) + 1
1102
            // characters. Failures of getname() are ignored by the handler.
1103
0
            replsize = 1 + 1 + 1 + strlen(buffer) + 1;
1104
0
        }
1105
0
        else {
1106
0
            replsize = codec_handler_unicode_hex_width(c);
1107
0
        }
1108
0
        if (ressize > PY_SSIZE_T_MAX - replsize) {
1109
0
            break;
1110
0
        }
1111
0
        ressize += replsize;
1112
0
    }
1113
1114
0
    PyObject *res = PyUnicode_New(ressize, 127);
1115
0
    if (res == NULL) {
1116
0
        Py_DECREF(obj);
1117
0
        return NULL;
1118
0
    }
1119
1120
0
    Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
1121
0
    for (Py_ssize_t i = start; i < imax; ++i) {
1122
0
        Py_UCS4 c = PyUnicode_READ_CHAR(obj, i);
1123
0
        if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) {
1124
0
            *outp++ = '\\';
1125
0
            *outp++ = 'N';
1126
0
            *outp++ = '{';
1127
0
            (void)strcpy((char *)outp, buffer);
1128
0
            outp += strlen(buffer);
1129
0
            *outp++ = '}';
1130
0
        }
1131
0
        else {
1132
0
            codec_handler_write_unicode_hex(&outp, c);
1133
0
        }
1134
0
    }
1135
1136
0
    assert(outp == PyUnicode_1BYTE_DATA(res) + ressize);
1137
0
    assert(_PyUnicode_CheckConsistency(res, 1));
1138
0
    PyObject *restuple = Py_BuildValue("(Nn)", res, imax);
1139
0
    Py_DECREF(obj);
1140
0
    return restuple;
1141
0
}
1142
1143
1144
8
#define ENC_UNKNOWN     -1
1145
16
#define ENC_UTF8        0
1146
0
#define ENC_UTF16BE     1
1147
0
#define ENC_UTF16LE     2
1148
0
#define ENC_UTF32BE     3
1149
0
#define ENC_UTF32LE     4
1150
1151
static int
1152
get_standard_encoding_impl(const char *encoding, int *bytelength)
1153
8
{
1154
8
    if (Py_TOLOWER(encoding[0]) == 'u' &&
1155
8
        Py_TOLOWER(encoding[1]) == 't' &&
1156
8
        Py_TOLOWER(encoding[2]) == 'f') {
1157
8
        encoding += 3;
1158
8
        if (*encoding == '-' || *encoding == '_' )
1159
8
            encoding++;
1160
8
        if (encoding[0] == '8' && encoding[1] == '\0') {
1161
8
            *bytelength = 3;
1162
8
            return ENC_UTF8;
1163
8
        }
1164
0
        else if (encoding[0] == '1' && encoding[1] == '6') {
1165
0
            encoding += 2;
1166
0
            *bytelength = 2;
1167
0
            if (*encoding == '\0') {
1168
#ifdef WORDS_BIGENDIAN
1169
                return ENC_UTF16BE;
1170
#else
1171
0
                return ENC_UTF16LE;
1172
0
#endif
1173
0
            }
1174
0
            if (*encoding == '-' || *encoding == '_' )
1175
0
                encoding++;
1176
0
            if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
1177
0
                if (Py_TOLOWER(encoding[0]) == 'b')
1178
0
                    return ENC_UTF16BE;
1179
0
                if (Py_TOLOWER(encoding[0]) == 'l')
1180
0
                    return ENC_UTF16LE;
1181
0
            }
1182
0
        }
1183
0
        else if (encoding[0] == '3' && encoding[1] == '2') {
1184
0
            encoding += 2;
1185
0
            *bytelength = 4;
1186
0
            if (*encoding == '\0') {
1187
#ifdef WORDS_BIGENDIAN
1188
                return ENC_UTF32BE;
1189
#else
1190
0
                return ENC_UTF32LE;
1191
0
#endif
1192
0
            }
1193
0
            if (*encoding == '-' || *encoding == '_' )
1194
0
                encoding++;
1195
0
            if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
1196
0
                if (Py_TOLOWER(encoding[0]) == 'b')
1197
0
                    return ENC_UTF32BE;
1198
0
                if (Py_TOLOWER(encoding[0]) == 'l')
1199
0
                    return ENC_UTF32LE;
1200
0
            }
1201
0
        }
1202
8
    }
1203
0
    else if (strcmp(encoding, "cp65001") == 0) {
1204
0
        *bytelength = 3;
1205
0
        return ENC_UTF8;
1206
0
    }
1207
0
    return ENC_UNKNOWN;
1208
8
}
1209
1210
1211
static int
1212
get_standard_encoding(PyObject *encoding, int *code, int *bytelength)
1213
8
{
1214
8
    const char *encoding_cstr = PyUnicode_AsUTF8(encoding);
1215
8
    if (encoding_cstr == NULL) {
1216
0
        return -1;
1217
0
    }
1218
8
    *code = get_standard_encoding_impl(encoding_cstr, bytelength);
1219
8
    return 0;
1220
8
}
1221
1222
1223
// --- handler: 'surrogatepass' -----------------------------------------------
1224
1225
static PyObject *
1226
_PyCodec_SurrogatePassUnicodeEncodeError(PyObject *exc)
1227
0
{
1228
0
    PyObject *encoding = PyUnicodeEncodeError_GetEncoding(exc);
1229
0
    if (encoding == NULL) {
1230
0
        return NULL;
1231
0
    }
1232
0
    int code, bytelength;
1233
0
    int rc = get_standard_encoding(encoding, &code, &bytelength);
1234
0
    Py_DECREF(encoding);
1235
0
    if (rc < 0) {
1236
0
        return NULL;
1237
0
    }
1238
0
    if (code == ENC_UNKNOWN) {
1239
0
        goto bail;
1240
0
    }
1241
1242
0
    PyObject *obj;
1243
0
    Py_ssize_t objlen, start, end, slen;
1244
0
    if (_PyUnicodeError_GetParams(exc,
1245
0
                                  &obj, &objlen,
1246
0
                                  &start, &end, &slen, false) < 0)
1247
0
    {
1248
0
        return NULL;
1249
0
    }
1250
1251
0
    if (slen > PY_SSIZE_T_MAX / bytelength) {
1252
0
        end = start + PY_SSIZE_T_MAX / bytelength;
1253
0
        end = Py_MIN(end, objlen);
1254
0
        slen = Py_MAX(0, end - start);
1255
0
    }
1256
1257
0
    PyObject *res = PyBytes_FromStringAndSize(NULL, bytelength * slen);
1258
0
    if (res == NULL) {
1259
0
        Py_DECREF(obj);
1260
0
        return NULL;
1261
0
    }
1262
1263
0
    unsigned char *outp = (unsigned char *)PyBytes_AsString(res);
1264
0
    for (Py_ssize_t i = start; i < end; i++) {
1265
0
        Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
1266
0
        if (!Py_UNICODE_IS_SURROGATE(ch)) {
1267
            /* Not a surrogate, fail with original exception */
1268
0
            Py_DECREF(obj);
1269
0
            Py_DECREF(res);
1270
0
            goto bail;
1271
0
        }
1272
0
        switch (code) {
1273
0
            case ENC_UTF8: {
1274
0
                *outp++ = (unsigned char)(0xe0 | (ch >> 12));
1275
0
                *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
1276
0
                *outp++ = (unsigned char)(0x80 | (ch & 0x3f));
1277
0
                break;
1278
0
            }
1279
0
            case ENC_UTF16LE: {
1280
0
                *outp++ = (unsigned char)ch;
1281
0
                *outp++ = (unsigned char)(ch >> 8);
1282
0
                break;
1283
0
            }
1284
0
            case ENC_UTF16BE: {
1285
0
                *outp++ = (unsigned char)(ch >> 8);
1286
0
                *outp++ = (unsigned char)ch;
1287
0
                break;
1288
0
            }
1289
0
            case ENC_UTF32LE: {
1290
0
                *outp++ = (unsigned char)ch;
1291
0
                *outp++ = (unsigned char)(ch >> 8);
1292
0
                *outp++ = (unsigned char)(ch >> 16);
1293
0
                *outp++ = (unsigned char)(ch >> 24);
1294
0
                break;
1295
0
            }
1296
0
            case ENC_UTF32BE: {
1297
0
                *outp++ = (unsigned char)(ch >> 24);
1298
0
                *outp++ = (unsigned char)(ch >> 16);
1299
0
                *outp++ = (unsigned char)(ch >> 8);
1300
0
                *outp++ = (unsigned char)ch;
1301
0
                break;
1302
0
            }
1303
0
        }
1304
0
    }
1305
1306
0
    Py_DECREF(obj);
1307
0
    PyObject *restuple = Py_BuildValue("(Nn)", res, end);
1308
0
    return restuple;
1309
1310
0
bail:
1311
0
    PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1312
0
    return NULL;
1313
0
}
1314
1315
1316
static PyObject *
1317
_PyCodec_SurrogatePassUnicodeDecodeError(PyObject *exc)
1318
8
{
1319
8
    PyObject *encoding = PyUnicodeDecodeError_GetEncoding(exc);
1320
8
    if (encoding == NULL) {
1321
0
        return NULL;
1322
0
    }
1323
8
    int code, bytelength;
1324
8
    int rc = get_standard_encoding(encoding, &code, &bytelength);
1325
8
    Py_DECREF(encoding);
1326
8
    if (rc < 0) {
1327
0
        return NULL;
1328
0
    }
1329
8
    if (code == ENC_UNKNOWN) {
1330
0
        goto bail;
1331
0
    }
1332
1333
8
    PyObject *obj;
1334
8
    Py_ssize_t objlen, start, end, slen;
1335
8
    if (_PyUnicodeError_GetParams(exc,
1336
8
                                  &obj, &objlen,
1337
8
                                  &start, &end, &slen, true) < 0)
1338
0
    {
1339
0
        return NULL;
1340
0
    }
1341
1342
    /* Try decoding a single surrogate character. If
1343
       there are more, let the codec call us again. */
1344
8
    Py_UCS4 ch = 0;
1345
8
    const unsigned char *p = (const unsigned char *)PyBytes_AS_STRING(obj);
1346
8
    p += start;
1347
1348
8
    if (objlen - start >= bytelength) {
1349
8
        switch (code) {
1350
8
            case ENC_UTF8: {
1351
8
                if ((p[0] & 0xf0) == 0xe0 &&
1352
8
                    (p[1] & 0xc0) == 0x80 &&
1353
8
                    (p[2] & 0xc0) == 0x80)
1354
8
                {
1355
                    /* it's a three-byte code */
1356
8
                    ch = ((p[0] & 0x0f) << 12) +
1357
8
                         ((p[1] & 0x3f) << 6)  +
1358
8
                          (p[2] & 0x3f);
1359
8
                }
1360
8
                break;
1361
0
            }
1362
0
            case ENC_UTF16LE: {
1363
0
                ch = p[1] << 8 | p[0];
1364
0
                break;
1365
0
            }
1366
0
            case ENC_UTF16BE: {
1367
0
                ch = p[0] << 8 | p[1];
1368
0
                break;
1369
0
            }
1370
0
            case ENC_UTF32LE: {
1371
0
                ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
1372
0
                break;
1373
0
            }
1374
0
            case ENC_UTF32BE: {
1375
0
                ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
1376
0
                break;
1377
0
            }
1378
8
        }
1379
8
    }
1380
8
    Py_DECREF(obj);
1381
8
    if (!Py_UNICODE_IS_SURROGATE(ch)) {
1382
0
        goto bail;
1383
0
    }
1384
1385
8
    PyObject *res = PyUnicode_FromOrdinal(ch);
1386
8
    if (res == NULL) {
1387
0
        return NULL;
1388
0
    }
1389
8
    return Py_BuildValue("(Nn)", res, start + bytelength);
1390
1391
0
bail:
1392
0
    PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1393
0
    return NULL;
1394
8
}
1395
1396
1397
/* This handler is declared static until someone demonstrates
1398
   a need to call it directly. */
1399
static PyObject *
1400
PyCodec_SurrogatePassErrors(PyObject *exc)
1401
8
{
1402
8
    if (_PyIsUnicodeEncodeError(exc)) {
1403
0
        return _PyCodec_SurrogatePassUnicodeEncodeError(exc);
1404
0
    }
1405
8
    else if (_PyIsUnicodeDecodeError(exc)) {
1406
8
        return _PyCodec_SurrogatePassUnicodeDecodeError(exc);
1407
8
    }
1408
0
    else {
1409
0
        wrong_exception_type(exc);
1410
0
        return NULL;
1411
0
    }
1412
8
}
1413
1414
1415
// --- handler: 'surrogateescape' ---------------------------------------------
1416
1417
static PyObject *
1418
_PyCodec_SurrogateEscapeUnicodeEncodeError(PyObject *exc)
1419
11.6k
{
1420
11.6k
    PyObject *obj;
1421
11.6k
    Py_ssize_t start, end, slen;
1422
11.6k
    if (_PyUnicodeError_GetParams(exc,
1423
11.6k
                                  &obj, NULL,
1424
11.6k
                                  &start, &end, &slen, false) < 0)
1425
0
    {
1426
0
        return NULL;
1427
0
    }
1428
1429
11.6k
    PyObject *res = PyBytes_FromStringAndSize(NULL, slen);
1430
11.6k
    if (res == NULL) {
1431
0
        Py_DECREF(obj);
1432
0
        return NULL;
1433
0
    }
1434
1435
11.6k
    char *outp = PyBytes_AsString(res);
1436
11.6k
    for (Py_ssize_t i = start; i < end; i++) {
1437
11.6k
        Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
1438
11.6k
        if (ch < 0xdc80 || ch > 0xdcff) {
1439
            /* Not a UTF-8b surrogate, fail with original exception. */
1440
11.6k
            Py_DECREF(obj);
1441
11.6k
            Py_DECREF(res);
1442
11.6k
            PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1443
11.6k
            return NULL;
1444
11.6k
        }
1445
0
        *outp++ = ch - 0xdc00;
1446
0
    }
1447
0
    Py_DECREF(obj);
1448
1449
0
    return Py_BuildValue("(Nn)", res, end);
1450
11.6k
}
1451
1452
1453
static PyObject *
1454
_PyCodec_SurrogateEscapeUnicodeDecodeError(PyObject *exc)
1455
72.0k
{
1456
72.0k
    PyObject *obj;
1457
72.0k
    Py_ssize_t start, end, slen;
1458
72.0k
    if (_PyUnicodeError_GetParams(exc,
1459
72.0k
                                  &obj, NULL,
1460
72.0k
                                  &start, &end, &slen, true) < 0)
1461
0
    {
1462
0
        return NULL;
1463
0
    }
1464
1465
72.0k
    Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */
1466
72.0k
    int consumed = 0;
1467
72.0k
    const unsigned char *p = (const unsigned char *)PyBytes_AS_STRING(obj);
1468
162k
    while (consumed < 4 && consumed < slen) {
1469
        /* Refuse to escape ASCII bytes. */
1470
132k
        if (p[start + consumed] < 128) {
1471
41.9k
            break;
1472
41.9k
        }
1473
90.8k
        ch[consumed] = 0xdc00 + p[start + consumed];
1474
90.8k
        consumed++;
1475
90.8k
    }
1476
72.0k
    Py_DECREF(obj);
1477
1478
72.0k
    if (consumed == 0) {
1479
        /* Codec complained about ASCII byte. */
1480
23.6k
        PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1481
23.6k
        return NULL;
1482
23.6k
    }
1483
1484
48.3k
    PyObject *str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed);
1485
48.3k
    if (str == NULL) {
1486
0
        return NULL;
1487
0
    }
1488
48.3k
    return Py_BuildValue("(Nn)", str, start + consumed);
1489
48.3k
}
1490
1491
1492
static PyObject *
1493
PyCodec_SurrogateEscapeErrors(PyObject *exc)
1494
83.7k
{
1495
83.7k
    if (_PyIsUnicodeEncodeError(exc)) {
1496
11.6k
        return _PyCodec_SurrogateEscapeUnicodeEncodeError(exc);
1497
11.6k
    }
1498
72.0k
    else if (_PyIsUnicodeDecodeError(exc)) {
1499
72.0k
        return _PyCodec_SurrogateEscapeUnicodeDecodeError(exc);
1500
72.0k
    }
1501
0
    else {
1502
0
        wrong_exception_type(exc);
1503
0
        return NULL;
1504
0
    }
1505
83.7k
}
1506
1507
1508
// --- Codecs registry handlers -----------------------------------------------
1509
1510
static inline PyObject *
1511
strict_errors(PyObject *Py_UNUSED(self), PyObject *exc)
1512
3.20M
{
1513
3.20M
    return PyCodec_StrictErrors(exc);
1514
3.20M
}
1515
1516
1517
static inline PyObject *
1518
ignore_errors(PyObject *Py_UNUSED(self), PyObject *exc)
1519
0
{
1520
0
    return PyCodec_IgnoreErrors(exc);
1521
0
}
1522
1523
1524
static inline PyObject *
1525
replace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
1526
228k
{
1527
228k
    return PyCodec_ReplaceErrors(exc);
1528
228k
}
1529
1530
1531
static inline PyObject *
1532
xmlcharrefreplace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
1533
0
{
1534
0
    return PyCodec_XMLCharRefReplaceErrors(exc);
1535
0
}
1536
1537
1538
static inline PyObject *
1539
backslashreplace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
1540
0
{
1541
0
    return PyCodec_BackslashReplaceErrors(exc);
1542
0
}
1543
1544
1545
static inline PyObject *
1546
namereplace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
1547
0
{
1548
0
    return PyCodec_NameReplaceErrors(exc);
1549
0
}
1550
1551
1552
static inline PyObject *
1553
surrogatepass_errors(PyObject *Py_UNUSED(self), PyObject *exc)
1554
8
{
1555
8
    return PyCodec_SurrogatePassErrors(exc);
1556
8
}
1557
1558
1559
static inline PyObject *
1560
surrogateescape_errors(PyObject *Py_UNUSED(self), PyObject *exc)
1561
83.7k
{
1562
83.7k
    return PyCodec_SurrogateEscapeErrors(exc);
1563
83.7k
}
1564
1565
1566
PyStatus
1567
_PyCodec_InitRegistry(PyInterpreterState *interp)
1568
36
{
1569
36
    static struct {
1570
36
        const char *name;
1571
36
        PyMethodDef def;
1572
36
    } methods[] =
1573
36
    {
1574
36
        {
1575
36
            "strict",
1576
36
            {
1577
36
                "strict_errors",
1578
36
                strict_errors,
1579
36
                METH_O,
1580
36
                PyDoc_STR("Implements the 'strict' error handling, which "
1581
36
                          "raises a UnicodeError on coding errors.")
1582
36
            }
1583
36
        },
1584
36
        {
1585
36
            "ignore",
1586
36
            {
1587
36
                "ignore_errors",
1588
36
                ignore_errors,
1589
36
                METH_O,
1590
36
                PyDoc_STR("Implements the 'ignore' error handling, which "
1591
36
                          "ignores malformed data and continues.")
1592
36
            }
1593
36
        },
1594
36
        {
1595
36
            "replace",
1596
36
            {
1597
36
                "replace_errors",
1598
36
                replace_errors,
1599
36
                METH_O,
1600
36
                PyDoc_STR("Implements the 'replace' error handling, which "
1601
36
                          "replaces malformed data with a replacement marker.")
1602
36
            }
1603
36
        },
1604
36
        {
1605
36
            "xmlcharrefreplace",
1606
36
            {
1607
36
                "xmlcharrefreplace_errors",
1608
36
                xmlcharrefreplace_errors,
1609
36
                METH_O,
1610
36
                PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
1611
36
                          "which replaces an unencodable character with the "
1612
36
                          "appropriate XML character reference.")
1613
36
            }
1614
36
        },
1615
36
        {
1616
36
            "backslashreplace",
1617
36
            {
1618
36
                "backslashreplace_errors",
1619
36
                backslashreplace_errors,
1620
36
                METH_O,
1621
36
                PyDoc_STR("Implements the 'backslashreplace' error handling, "
1622
36
                          "which replaces malformed data with a backslashed "
1623
36
                          "escape sequence.")
1624
36
            }
1625
36
        },
1626
36
        {
1627
36
            "namereplace",
1628
36
            {
1629
36
                "namereplace_errors",
1630
36
                namereplace_errors,
1631
36
                METH_O,
1632
36
                PyDoc_STR("Implements the 'namereplace' error handling, "
1633
36
                          "which replaces an unencodable character with a "
1634
36
                          "\\N{...} escape sequence.")
1635
36
            }
1636
36
        },
1637
36
        {
1638
36
            "surrogatepass",
1639
36
            {
1640
36
                "surrogatepass",
1641
36
                surrogatepass_errors,
1642
36
                METH_O
1643
36
            }
1644
36
        },
1645
36
        {
1646
36
            "surrogateescape",
1647
36
            {
1648
36
                "surrogateescape",
1649
36
                surrogateescape_errors,
1650
36
                METH_O
1651
36
            }
1652
36
        }
1653
36
    };
1654
    // ensure that the built-in error handlers' names are kept in sync
1655
36
    assert(Py_ARRAY_LENGTH(methods) == Py_ARRAY_LENGTH(codecs_builtin_error_handlers));
1656
1657
36
    assert(interp->codecs.initialized == 0);
1658
36
    interp->codecs.search_path = PyList_New(0);
1659
36
    if (interp->codecs.search_path == NULL) {
1660
0
        return PyStatus_NoMemory();
1661
0
    }
1662
36
    interp->codecs.search_cache = PyDict_New();
1663
36
    if (interp->codecs.search_cache == NULL) {
1664
0
        return PyStatus_NoMemory();
1665
0
    }
1666
36
    interp->codecs.error_registry = PyDict_New();
1667
36
    if (interp->codecs.error_registry == NULL) {
1668
0
        return PyStatus_NoMemory();
1669
0
    }
1670
324
    for (size_t i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
1671
288
        PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL);
1672
288
        if (func == NULL) {
1673
0
            return PyStatus_NoMemory();
1674
0
        }
1675
1676
288
        int res = PyDict_SetItemString(interp->codecs.error_registry,
1677
288
                                       methods[i].name, func);
1678
288
        Py_DECREF(func);
1679
288
        if (res < 0) {
1680
0
            return PyStatus_Error("Failed to insert into codec error registry");
1681
0
        }
1682
288
    }
1683
1684
36
    interp->codecs.initialized = 1;
1685
1686
    // Importing `encodings' will call back into this module to register codec
1687
    // search functions, so this is done after everything else is initialized.
1688
36
    PyObject *mod = PyImport_ImportModule("encodings");
1689
36
    if (mod == NULL) {
1690
0
        PyThreadState *tstate = _PyThreadState_GET();
1691
0
        _Py_DumpPathConfig(tstate);
1692
0
        return PyStatus_Error("Failed to import encodings module");
1693
0
    }
1694
36
    Py_DECREF(mod);
1695
1696
36
    return PyStatus_Ok();
1697
36
}
1698
1699
void
1700
_PyCodec_Fini(PyInterpreterState *interp)
1701
0
{
1702
0
    Py_CLEAR(interp->codecs.search_path);
1703
0
    Py_CLEAR(interp->codecs.search_cache);
1704
    Py_CLEAR(interp->codecs.error_registry);
1705
0
    interp->codecs.initialized = 0;
1706
0
}