Coverage Report

Created: 2026-06-09 06:53

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/cpython/Python/codecs.c
Line
Count
Source
1
/* ------------------------------------------------------------------------
2
3
   Python Codec Registry and support functions
4
5
Written by Marc-Andre Lemburg (mal@lemburg.com).
6
7
Copyright (c) Corporation for National Research Initiatives.
8
9
   ------------------------------------------------------------------------ */
10
11
#include "Python.h"
12
#include "pycore_call.h"          // _PyObject_CallNoArgs()
13
#include "pycore_codecs.h"        // export _PyCodec_LookupTextEncoding()
14
#include "pycore_interp.h"        // PyInterpreterState.codec_search_path
15
#include "pycore_pyerrors.h"      // _PyErr_FormatNote()
16
#include "pycore_pystate.h"       // _PyInterpreterState_GET()
17
#include "pycore_runtime.h"       // _Py_ID()
18
#include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
19
#include "pycore_unicodeobject.h" // _PyUnicode_InternMortal()
20
#include "pycore_pyatomic_ft_wrappers.h"
21
22
static const char *codecs_builtin_error_handlers[] = {
23
    "strict", "ignore", "replace",
24
    "xmlcharrefreplace", "backslashreplace", "namereplace",
25
    "surrogatepass", "surrogateescape",
26
};
27
28
const char *Py_hexdigits = "0123456789abcdef";
29
30
/* --- Codec Registry ----------------------------------------------------- */
31
32
int PyCodec_Register(PyObject *search_function)
33
36
{
34
36
    PyInterpreterState *interp = _PyInterpreterState_GET();
35
36
    assert(interp->codecs.initialized);
36
36
    if (search_function == NULL) {
37
0
        PyErr_BadArgument();
38
0
        goto onError;
39
0
    }
40
36
    if (!PyCallable_Check(search_function)) {
41
0
        PyErr_SetString(PyExc_TypeError, "argument must be callable");
42
0
        goto onError;
43
0
    }
44
36
    FT_MUTEX_LOCK(&interp->codecs.search_path_mutex);
45
36
    int ret = PyList_Append(interp->codecs.search_path, search_function);
46
36
    FT_MUTEX_UNLOCK(&interp->codecs.search_path_mutex);
47
48
36
    return ret;
49
50
0
 onError:
51
0
    return -1;
52
36
}
53
54
int
55
PyCodec_Unregister(PyObject *search_function)
56
0
{
57
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
58
0
    if (interp->codecs.initialized != 1) {
59
        /* Do nothing if codecs state was cleared (only possible during
60
           interpreter shutdown). */
61
0
        return 0;
62
0
    }
63
64
0
    PyObject *codec_search_path = interp->codecs.search_path;
65
0
    assert(PyList_CheckExact(codec_search_path));
66
0
    for (Py_ssize_t i = 0; i < PyList_GET_SIZE(codec_search_path); i++) {
67
0
        FT_MUTEX_LOCK(&interp->codecs.search_path_mutex);
68
0
        PyObject *item = PyList_GetItemRef(codec_search_path, i);
69
0
        int ret = 1;
70
0
        if (item == search_function) {
71
            // We hold a reference to the item, so its destructor can't run
72
            // while we hold search_path_mutex.
73
0
            ret = PyList_SetSlice(codec_search_path, i, i+1, NULL);
74
0
        }
75
0
        FT_MUTEX_UNLOCK(&interp->codecs.search_path_mutex);
76
0
        Py_DECREF(item);
77
0
        if (ret != 1) {
78
0
            assert(interp->codecs.search_cache != NULL);
79
0
            assert(PyDict_CheckExact(interp->codecs.search_cache));
80
0
            PyDict_Clear(interp->codecs.search_cache);
81
0
            return ret;
82
0
        }
83
0
    }
84
0
    return 0;
85
0
}
86
87
/* Convert a string to a normalized Python string: all ASCII letters are
88
   converted to lower case, spaces are replaced with hyphens. */
89
90
static PyObject*
91
normalizestring(const char *string)
92
2.54M
{
93
2.54M
    size_t i;
94
2.54M
    size_t len = strlen(string);
95
2.54M
    char *p;
96
2.54M
    PyObject *v;
97
98
2.54M
    if (len > PY_SSIZE_T_MAX) {
99
0
        PyErr_SetString(PyExc_OverflowError, "string is too large");
100
0
        return NULL;
101
0
    }
102
103
2.54M
    p = PyMem_Malloc(len + 1);
104
2.54M
    if (p == NULL)
105
0
        return PyErr_NoMemory();
106
35.8M
    for (i = 0; i < len; i++) {
107
33.3M
        char ch = string[i];
108
33.3M
        if (ch == ' ')
109
238k
            ch = '-';
110
33.0M
        else
111
33.0M
            ch = Py_TOLOWER(Py_CHARMASK(ch));
112
33.3M
        p[i] = ch;
113
33.3M
    }
114
2.54M
    p[i] = '\0';
115
2.54M
    v = PyUnicode_FromString(p);
116
2.54M
    PyMem_Free(p);
117
2.54M
    return v;
118
2.54M
}
119
120
/* Lookup the given encoding and return a tuple providing the codec
121
   facilities.
122
123
   ASCII letters in the encoding string is looked up converted to all
124
   lower case. This makes encodings looked up through this mechanism
125
   effectively case-insensitive. Spaces are replaced with hyphens for
126
   names like "US ASCII" and "ISO 8859-1".
127
128
   If no codec is found, a LookupError is set and NULL returned.
129
130
   As side effect, this tries to load the encodings package, if not
131
   yet done. This is part of the lazy load strategy for the encodings
132
   package.
133
134
*/
135
136
PyObject *_PyCodec_Lookup(const char *encoding)
137
2.54M
{
138
2.54M
    if (encoding == NULL) {
139
0
        PyErr_BadArgument();
140
0
        return NULL;
141
0
    }
142
143
2.54M
    PyInterpreterState *interp = _PyInterpreterState_GET();
144
2.54M
    assert(interp->codecs.initialized);
145
146
    /* Convert the encoding to a normalized Python string: all
147
       ASCII letters are converted to lower case, spaces are
148
       replaced with hyphens. */
149
2.54M
    PyObject *v = normalizestring(encoding);
150
2.54M
    if (v == NULL) {
151
0
        return NULL;
152
0
    }
153
154
    /* Intern the string. We'll make it immortal later if lookup succeeds. */
155
2.54M
    _PyUnicode_InternMortal(interp, &v);
156
157
    /* First, try to lookup the name in the registry dictionary */
158
2.54M
    PyObject *result;
159
2.54M
    if (PyDict_GetItemRef(interp->codecs.search_cache, v, &result) < 0) {
160
0
        goto onError;
161
0
    }
162
2.54M
    if (result != NULL) {
163
2.47M
        Py_DECREF(v);
164
2.47M
        return result;
165
2.47M
    }
166
167
    /* Next, scan the search functions in order of registration */
168
68.0k
    const Py_ssize_t len = PyList_Size(interp->codecs.search_path);
169
68.0k
    if (len < 0)
170
0
        goto onError;
171
68.0k
    if (len == 0) {
172
0
        PyErr_SetString(PyExc_LookupError,
173
0
                        "no codec search functions registered: "
174
0
                        "can't find encoding");
175
0
        goto onError;
176
0
    }
177
178
68.0k
    Py_ssize_t i;
179
133k
    for (i = 0; i < len; i++) {
180
68.0k
        PyObject *func;
181
182
68.0k
        func = PyList_GetItemRef(interp->codecs.search_path, i);
183
68.0k
        if (func == NULL)
184
0
            goto onError;
185
68.0k
        result = PyObject_CallOneArg(func, v);
186
68.0k
        Py_DECREF(func);
187
68.0k
        if (result == NULL)
188
0
            goto onError;
189
68.0k
        if (result == Py_None) {
190
65.9k
            Py_CLEAR(result);
191
65.9k
            continue;
192
65.9k
        }
193
2.09k
        if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
194
0
            PyErr_SetString(PyExc_TypeError,
195
0
                            "codec search functions must return 4-tuples");
196
0
            Py_DECREF(result);
197
0
            goto onError;
198
0
        }
199
2.09k
        break;
200
2.09k
    }
201
68.0k
    if (result == NULL) {
202
        /* XXX Perhaps we should cache misses too ? */
203
65.9k
        PyErr_Format(PyExc_LookupError,
204
65.9k
                     "unknown encoding: %s", encoding);
205
65.9k
        goto onError;
206
65.9k
    }
207
208
2.09k
    _PyUnicode_InternImmortal(interp, &v);
209
210
    /* Cache and return the result */
211
2.09k
    if (PyDict_SetItem(interp->codecs.search_cache, v, result) < 0) {
212
0
        Py_DECREF(result);
213
0
        goto onError;
214
0
    }
215
2.09k
    Py_DECREF(v);
216
2.09k
    return result;
217
218
65.9k
 onError:
219
65.9k
    Py_DECREF(v);
220
65.9k
    return NULL;
221
2.09k
}
222
223
/* Codec registry encoding check API. */
224
225
int PyCodec_KnownEncoding(const char *encoding)
226
0
{
227
0
    PyObject *codecs;
228
229
0
    codecs = _PyCodec_Lookup(encoding);
230
0
    if (!codecs) {
231
0
        PyErr_Clear();
232
0
        return 0;
233
0
    }
234
0
    else {
235
0
        Py_DECREF(codecs);
236
0
        return 1;
237
0
    }
238
0
}
239
240
static
241
PyObject *args_tuple(PyObject *object,
242
                     const char *errors)
243
2.20M
{
244
2.20M
    PyObject *args;
245
246
2.20M
    args = PyTuple_New(1 + (errors != NULL));
247
2.20M
    if (args == NULL)
248
0
        return NULL;
249
2.20M
    PyTuple_SET_ITEM(args, 0, Py_NewRef(object));
250
2.20M
    if (errors) {
251
194k
        PyObject *v;
252
253
194k
        v = PyUnicode_FromString(errors);
254
194k
        if (v == NULL) {
255
0
            Py_DECREF(args);
256
0
            return NULL;
257
0
        }
258
194k
        PyTuple_SET_ITEM(args, 1, v);
259
194k
    }
260
2.20M
    return args;
261
2.20M
}
262
263
/* Helper function to get a codec item */
264
265
static
266
PyObject *codec_getitem(const char *encoding, int index)
267
0
{
268
0
    PyObject *codecs;
269
0
    PyObject *v;
270
271
0
    codecs = _PyCodec_Lookup(encoding);
272
0
    if (codecs == NULL)
273
0
        return NULL;
274
0
    v = PyTuple_GET_ITEM(codecs, index);
275
0
    Py_DECREF(codecs);
276
0
    return Py_NewRef(v);
277
0
}
278
279
/* Helper functions to create an incremental codec. */
280
static
281
PyObject *codec_makeincrementalcodec(PyObject *codec_info,
282
                                     const char *errors,
283
                                     const char *attrname)
284
127
{
285
127
    PyObject *ret, *inccodec;
286
287
127
    inccodec = PyObject_GetAttrString(codec_info, attrname);
288
127
    if (inccodec == NULL)
289
0
        return NULL;
290
127
    if (errors)
291
127
        ret = PyObject_CallFunction(inccodec, "s", errors);
292
0
    else
293
0
        ret = _PyObject_CallNoArgs(inccodec);
294
127
    Py_DECREF(inccodec);
295
127
    return ret;
296
127
}
297
298
static
299
PyObject *codec_getincrementalcodec(const char *encoding,
300
                                    const char *errors,
301
                                    const char *attrname)
302
0
{
303
0
    PyObject *codec_info, *ret;
304
305
0
    codec_info = _PyCodec_Lookup(encoding);
306
0
    if (codec_info == NULL)
307
0
        return NULL;
308
0
    ret = codec_makeincrementalcodec(codec_info, errors, attrname);
309
0
    Py_DECREF(codec_info);
310
0
    return ret;
311
0
}
312
313
/* Helper function to create a stream codec. */
314
315
static
316
PyObject *codec_getstreamcodec(const char *encoding,
317
                               PyObject *stream,
318
                               const char *errors,
319
                               const int index)
320
0
{
321
0
    PyObject *codecs, *streamcodec, *codeccls;
322
323
0
    codecs = _PyCodec_Lookup(encoding);
324
0
    if (codecs == NULL)
325
0
        return NULL;
326
327
0
    codeccls = PyTuple_GET_ITEM(codecs, index);
328
0
    if (errors != NULL)
329
0
        streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
330
0
    else
331
0
        streamcodec = PyObject_CallOneArg(codeccls, stream);
332
0
    Py_DECREF(codecs);
333
0
    return streamcodec;
334
0
}
335
336
/* Helpers to work with the result of _PyCodec_Lookup
337
338
 */
339
PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info,
340
                                             const char *errors)
341
54
{
342
54
    return codec_makeincrementalcodec(codec_info, errors,
343
54
                                      "incrementaldecoder");
344
54
}
345
346
PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info,
347
                                             const char *errors)
348
73
{
349
73
    return codec_makeincrementalcodec(codec_info, errors,
350
73
                                      "incrementalencoder");
351
73
}
352
353
354
/* Convenience APIs to query the Codec registry.
355
356
   All APIs return a codec object with incremented refcount.
357
358
 */
359
360
PyObject *PyCodec_Encoder(const char *encoding)
361
0
{
362
0
    return codec_getitem(encoding, 0);
363
0
}
364
365
PyObject *PyCodec_Decoder(const char *encoding)
366
0
{
367
0
    return codec_getitem(encoding, 1);
368
0
}
369
370
PyObject *PyCodec_IncrementalEncoder(const char *encoding,
371
                                     const char *errors)
372
0
{
373
0
    return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
374
0
}
375
376
PyObject *PyCodec_IncrementalDecoder(const char *encoding,
377
                                     const char *errors)
378
0
{
379
0
    return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
380
0
}
381
382
PyObject *PyCodec_StreamReader(const char *encoding,
383
                               PyObject *stream,
384
                               const char *errors)
385
0
{
386
0
    return codec_getstreamcodec(encoding, stream, errors, 2);
387
0
}
388
389
PyObject *PyCodec_StreamWriter(const char *encoding,
390
                               PyObject *stream,
391
                               const char *errors)
392
0
{
393
0
    return codec_getstreamcodec(encoding, stream, errors, 3);
394
0
}
395
396
/* Encode an object (e.g. a Unicode object) using the given encoding
397
   and return the resulting encoded object (usually a Python string).
398
399
   errors is passed to the encoder factory as argument if non-NULL. */
400
401
static PyObject *
402
_PyCodec_EncodeInternal(PyObject *object,
403
                        PyObject *encoder,
404
                        const char *encoding,
405
                        const char *errors)
406
960k
{
407
960k
    PyObject *args = NULL, *result = NULL;
408
960k
    PyObject *v = NULL;
409
410
960k
    args = args_tuple(object, errors);
411
960k
    if (args == NULL)
412
0
        goto onError;
413
414
960k
    result = PyObject_Call(encoder, args, NULL);
415
960k
    if (result == NULL) {
416
0
        _PyErr_FormatNote("%s with '%s' codec failed", "encoding", encoding);
417
0
        goto onError;
418
0
    }
419
420
960k
    if (!PyTuple_Check(result) ||
421
960k
        PyTuple_GET_SIZE(result) != 2) {
422
0
        PyErr_SetString(PyExc_TypeError,
423
0
                        "encoder must return a tuple (object, integer)");
424
0
        goto onError;
425
0
    }
426
960k
    v = Py_NewRef(PyTuple_GET_ITEM(result,0));
427
    /* We don't check or use the second (integer) entry. */
428
429
960k
    Py_DECREF(args);
430
960k
    Py_DECREF(encoder);
431
960k
    Py_DECREF(result);
432
960k
    return v;
433
434
0
 onError:
435
0
    Py_XDECREF(result);
436
0
    Py_XDECREF(args);
437
0
    Py_XDECREF(encoder);
438
0
    return NULL;
439
960k
}
440
441
/* Decode an object (usually a Python string) using the given encoding
442
   and return an equivalent object (e.g. a Unicode object).
443
444
   errors is passed to the decoder factory as argument if non-NULL. */
445
446
static PyObject *
447
_PyCodec_DecodeInternal(PyObject *object,
448
                        PyObject *decoder,
449
                        const char *encoding,
450
                        const char *errors)
451
1.24M
{
452
1.24M
    PyObject *args = NULL, *result = NULL;
453
1.24M
    PyObject *v;
454
455
1.24M
    args = args_tuple(object, errors);
456
1.24M
    if (args == NULL)
457
0
        goto onError;
458
459
1.24M
    result = PyObject_Call(decoder, args, NULL);
460
1.24M
    if (result == NULL) {
461
86.2k
        _PyErr_FormatNote("%s with '%s' codec failed", "decoding", encoding);
462
86.2k
        goto onError;
463
86.2k
    }
464
1.15M
    if (!PyTuple_Check(result) ||
465
1.15M
        PyTuple_GET_SIZE(result) != 2) {
466
0
        PyErr_SetString(PyExc_TypeError,
467
0
                        "decoder must return a tuple (object,integer)");
468
0
        goto onError;
469
0
    }
470
1.15M
    v = Py_NewRef(PyTuple_GET_ITEM(result,0));
471
    /* We don't check or use the second (integer) entry. */
472
473
1.15M
    Py_DECREF(args);
474
1.15M
    Py_DECREF(decoder);
475
1.15M
    Py_DECREF(result);
476
1.15M
    return v;
477
478
86.2k
 onError:
479
86.2k
    Py_XDECREF(args);
480
86.2k
    Py_XDECREF(decoder);
481
86.2k
    Py_XDECREF(result);
482
86.2k
    return NULL;
483
1.15M
}
484
485
/* Generic encoding/decoding API */
486
PyObject *PyCodec_Encode(PyObject *object,
487
                         const char *encoding,
488
                         const char *errors)
489
0
{
490
0
    PyObject *encoder;
491
492
0
    encoder = PyCodec_Encoder(encoding);
493
0
    if (encoder == NULL)
494
0
        return NULL;
495
496
0
    return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
497
0
}
498
499
PyObject *PyCodec_Decode(PyObject *object,
500
                         const char *encoding,
501
                         const char *errors)
502
0
{
503
0
    PyObject *decoder;
504
505
0
    decoder = PyCodec_Decoder(encoding);
506
0
    if (decoder == NULL)
507
0
        return NULL;
508
509
0
    return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
510
0
}
511
512
/* Text encoding/decoding API */
513
PyObject * _PyCodec_LookupTextEncoding(const char *encoding,
514
                                       const char *alternate_command)
515
2.22M
{
516
2.22M
    PyObject *codec;
517
2.22M
    PyObject *attr;
518
2.22M
    int is_text_codec;
519
520
2.22M
    codec = _PyCodec_Lookup(encoding);
521
2.22M
    if (codec == NULL)
522
11.8k
        return NULL;
523
524
    /* Backwards compatibility: assume any raw tuple describes a text
525
     * encoding, and the same for anything lacking the private
526
     * attribute.
527
     */
528
2.20M
    if (!PyTuple_CheckExact(codec)) {
529
2.20M
        if (PyObject_GetOptionalAttr(codec, &_Py_ID(_is_text_encoding), &attr) < 0) {
530
0
            Py_DECREF(codec);
531
0
            return NULL;
532
0
        }
533
2.20M
        if (attr != NULL) {
534
2.20M
            is_text_codec = PyObject_IsTrue(attr);
535
2.20M
            Py_DECREF(attr);
536
2.20M
            if (is_text_codec <= 0) {
537
3.11k
                Py_DECREF(codec);
538
3.11k
                if (!is_text_codec) {
539
3.11k
                    if (alternate_command != NULL) {
540
3.11k
                        PyErr_Format(PyExc_LookupError,
541
3.11k
                                     "'%.400s' is not a text encoding; "
542
3.11k
                                     "use %s to handle arbitrary codecs",
543
3.11k
                                     encoding, alternate_command);
544
3.11k
                    }
545
0
                    else {
546
0
                        PyErr_Format(PyExc_LookupError,
547
0
                                     "'%.400s' is not a text encoding",
548
0
                                     encoding);
549
0
                    }
550
3.11k
                }
551
3.11k
                return NULL;
552
3.11k
            }
553
2.20M
        }
554
2.20M
    }
555
556
    /* This appears to be a valid text encoding */
557
2.20M
    return codec;
558
2.20M
}
559
560
561
static
562
PyObject *codec_getitem_checked(const char *encoding,
563
                                const char *alternate_command,
564
                                int index)
565
2.22M
{
566
2.22M
    PyObject *codec;
567
2.22M
    PyObject *v;
568
569
2.22M
    codec = _PyCodec_LookupTextEncoding(encoding, alternate_command);
570
2.22M
    if (codec == NULL)
571
15.0k
        return NULL;
572
573
2.20M
    v = Py_NewRef(PyTuple_GET_ITEM(codec, index));
574
2.20M
    Py_DECREF(codec);
575
2.20M
    return v;
576
2.22M
}
577
578
static PyObject * _PyCodec_TextEncoder(const char *encoding)
579
960k
{
580
960k
    return codec_getitem_checked(encoding, "codecs.encode()", 0);
581
960k
}
582
583
static PyObject * _PyCodec_TextDecoder(const char *encoding)
584
1.26M
{
585
1.26M
    return codec_getitem_checked(encoding, "codecs.decode()", 1);
586
1.26M
}
587
588
PyObject *_PyCodec_EncodeText(PyObject *object,
589
                              const char *encoding,
590
                              const char *errors)
591
960k
{
592
960k
    PyObject *encoder;
593
594
960k
    encoder = _PyCodec_TextEncoder(encoding);
595
960k
    if (encoder == NULL)
596
0
        return NULL;
597
598
960k
    return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
599
960k
}
600
601
PyObject *_PyCodec_DecodeText(PyObject *object,
602
                              const char *encoding,
603
                              const char *errors)
604
1.26M
{
605
1.26M
    PyObject *decoder;
606
607
1.26M
    decoder = _PyCodec_TextDecoder(encoding);
608
1.26M
    if (decoder == NULL)
609
15.0k
        return NULL;
610
611
1.24M
    return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
612
1.26M
}
613
614
/* Register the error handling callback function error under the name
615
   name. This function will be called by the codec when it encounters
616
   an unencodable characters/undecodable bytes and doesn't know the
617
   callback name, when name is specified as the error parameter
618
   in the call to the encode/decode function.
619
   Return 0 on success, -1 on error */
620
int PyCodec_RegisterError(const char *name, PyObject *error)
621
0
{
622
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
623
0
    assert(interp->codecs.initialized);
624
0
    if (!PyCallable_Check(error)) {
625
0
        PyErr_SetString(PyExc_TypeError, "handler must be callable");
626
0
        return -1;
627
0
    }
628
0
    return PyDict_SetItemString(interp->codecs.error_registry,
629
0
                                name, error);
630
0
}
631
632
int _PyCodec_UnregisterError(const char *name)
633
0
{
634
0
    for (size_t i = 0; i < Py_ARRAY_LENGTH(codecs_builtin_error_handlers); ++i) {
635
0
        if (strcmp(name, codecs_builtin_error_handlers[i]) == 0) {
636
0
            PyErr_Format(PyExc_ValueError,
637
0
                         "cannot un-register built-in error handler '%s'", name);
638
0
            return -1;
639
0
        }
640
0
    }
641
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
642
0
    assert(interp->codecs.initialized);
643
0
    return PyDict_PopString(interp->codecs.error_registry, name, NULL);
644
0
}
645
646
/* Lookup the error handling callback function registered under the
647
   name error. As a special case NULL can be passed, in which case
648
   the error handling callback for strict encoding will be returned. */
649
PyObject *PyCodec_LookupError(const char *name)
650
2.57M
{
651
2.57M
    PyInterpreterState *interp = _PyInterpreterState_GET();
652
2.57M
    assert(interp->codecs.initialized);
653
654
2.57M
    if (name==NULL)
655
166k
        name = "strict";
656
2.57M
    PyObject *handler;
657
2.57M
    if (PyDict_GetItemStringRef(interp->codecs.error_registry, name, &handler) < 0) {
658
0
        return NULL;
659
0
    }
660
2.57M
    if (handler == NULL) {
661
0
        PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
662
0
        return NULL;
663
0
    }
664
2.57M
    return handler;
665
2.57M
}
666
667
668
static inline void
669
wrong_exception_type(PyObject *exc)
670
0
{
671
0
    PyErr_Format(PyExc_TypeError,
672
0
                 "don't know how to handle %T in error callback", exc);
673
0
}
674
675
676
#define _PyIsUnicodeEncodeError(EXC)    \
677
327k
    PyObject_TypeCheck(EXC, (PyTypeObject *)PyExc_UnicodeEncodeError)
678
#define _PyIsUnicodeDecodeError(EXC)    \
679
315k
    PyObject_TypeCheck(EXC, (PyTypeObject *)PyExc_UnicodeDecodeError)
680
#define _PyIsUnicodeTranslateError(EXC) \
681
0
    PyObject_TypeCheck(EXC, (PyTypeObject *)PyExc_UnicodeTranslateError)
682
683
684
// --- codecs handlers: utilities ---------------------------------------------
685
686
/*
687
 * Return the number of characters (including special prefixes)
688
 * needed to represent 'ch' by codec_handler_write_unicode_hex().
689
 */
690
static inline Py_ssize_t
691
codec_handler_unicode_hex_width(Py_UCS4 ch)
692
0
{
693
0
    if (ch >= 0x10000) {
694
        // format: '\\' + 'U' + 8 hex digits
695
0
        return 1 + 1 + 8;
696
0
    }
697
0
    else if (ch >= 0x100) {
698
        // format: '\\' + 'u' + 4 hex digits
699
0
        return 1 + 1 + 4;
700
0
    }
701
0
    else {
702
        // format: '\\' + 'x' + 2 hex digits
703
0
        return 1 + 1 + 2;
704
0
    }
705
0
}
706
707
708
/*
709
 * Write the hexadecimal representation of 'ch' to the buffer pointed by 'p'
710
 * using 2, 4, or 8 characters prefixed by '\x', '\u', or '\U' respectively.
711
 */
712
static inline void
713
codec_handler_write_unicode_hex(Py_UCS1 **p, Py_UCS4 ch)
714
0
{
715
0
    *(*p)++ = '\\';
716
0
    if (ch >= 0x10000) {
717
0
        *(*p)++ = 'U';
718
0
        *(*p)++ = Py_hexdigits[(ch >> 28) & 0xf];
719
0
        *(*p)++ = Py_hexdigits[(ch >> 24) & 0xf];
720
0
        *(*p)++ = Py_hexdigits[(ch >> 20) & 0xf];
721
0
        *(*p)++ = Py_hexdigits[(ch >> 16) & 0xf];
722
0
        *(*p)++ = Py_hexdigits[(ch >> 12) & 0xf];
723
0
        *(*p)++ = Py_hexdigits[(ch >> 8) & 0xf];
724
0
    }
725
0
    else if (ch >= 0x100) {
726
0
        *(*p)++ = 'u';
727
0
        *(*p)++ = Py_hexdigits[(ch >> 12) & 0xf];
728
0
        *(*p)++ = Py_hexdigits[(ch >> 8) & 0xf];
729
0
    }
730
0
    else {
731
0
        *(*p)++ = 'x';
732
0
    }
733
0
    *(*p)++ = Py_hexdigits[(ch >> 4) & 0xf];
734
0
    *(*p)++ = Py_hexdigits[ch & 0xf];
735
0
}
736
737
738
/*
739
 * Determine the number of digits for a decimal representation of Unicode
740
 * codepoint 'ch' (by design, Unicode codepoints are limited to 7 digits).
741
 */
742
static inline int
743
n_decimal_digits_for_codepoint(Py_UCS4 ch)
744
0
{
745
0
    if (ch < 10) return 1;
746
0
    if (ch < 100) return 2;
747
0
    if (ch < 1000) return 3;
748
0
    if (ch < 10000) return 4;
749
0
    if (ch < 100000) return 5;
750
0
    if (ch < 1000000) return 6;
751
0
    if (ch < 10000000) return 7;
752
    // Unicode codepoints are limited to 1114111 (7 decimal digits)
753
0
    Py_UNREACHABLE();
754
0
}
755
756
757
/*
758
 * Create a Unicode string containing 'count' copies of the official
759
 * Unicode REPLACEMENT CHARACTER (0xFFFD).
760
 */
761
static PyObject *
762
codec_handler_unicode_replacement_character(Py_ssize_t count)
763
228k
{
764
228k
    PyObject *res = PyUnicode_New(count, Py_UNICODE_REPLACEMENT_CHARACTER);
765
228k
    if (res == NULL) {
766
0
        return NULL;
767
0
    }
768
228k
    assert(count == 0 || PyUnicode_KIND(res) == PyUnicode_2BYTE_KIND);
769
228k
    Py_UCS2 *outp = PyUnicode_2BYTE_DATA(res);
770
457k
    for (Py_ssize_t i = 0; i < count; ++i) {
771
228k
        outp[i] = Py_UNICODE_REPLACEMENT_CHARACTER;
772
228k
    }
773
228k
    assert(_PyUnicode_CheckConsistency(res, 1));
774
228k
    return res;
775
228k
}
776
777
778
// --- handler: 'strict' ------------------------------------------------------
779
780
PyObject *PyCodec_StrictErrors(PyObject *exc)
781
3.06M
{
782
3.06M
    if (PyExceptionInstance_Check(exc)) {
783
3.06M
        PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
784
3.06M
    }
785
0
    else {
786
0
        PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
787
0
    }
788
3.06M
    return NULL;
789
3.06M
}
790
791
792
// --- handler: 'ignore' ------------------------------------------------------
793
794
static PyObject *
795
_PyCodec_IgnoreError(PyObject *exc, int as_bytes)
796
0
{
797
0
    Py_ssize_t end;
798
0
    if (_PyUnicodeError_GetParams(exc, NULL, NULL, NULL,
799
0
                                  &end, NULL, as_bytes) < 0)
800
0
    {
801
0
        return NULL;
802
0
    }
803
0
    return Py_BuildValue("(Nn)", Py_GetConstant(Py_CONSTANT_EMPTY_STR), end);
804
0
}
805
806
807
PyObject *PyCodec_IgnoreErrors(PyObject *exc)
808
0
{
809
0
    if (_PyIsUnicodeEncodeError(exc) || _PyIsUnicodeTranslateError(exc)) {
810
0
        return _PyCodec_IgnoreError(exc, false);
811
0
    }
812
0
    else if (_PyIsUnicodeDecodeError(exc)) {
813
0
        return _PyCodec_IgnoreError(exc, true);
814
0
    }
815
0
    else {
816
0
        wrong_exception_type(exc);
817
0
        return NULL;
818
0
    }
819
0
}
820
821
822
// --- handler: 'replace' -----------------------------------------------------
823
824
static PyObject *
825
_PyCodec_ReplaceUnicodeEncodeError(PyObject *exc)
826
0
{
827
0
    Py_ssize_t start, end, slen;
828
0
    if (_PyUnicodeError_GetParams(exc, NULL, NULL,
829
0
                                  &start, &end, &slen, false) < 0)
830
0
    {
831
0
        return NULL;
832
0
    }
833
0
    PyObject *res = PyUnicode_New(slen, '?');
834
0
    if (res == NULL) {
835
0
        return NULL;
836
0
    }
837
0
    assert(PyUnicode_KIND(res) == PyUnicode_1BYTE_KIND);
838
0
    Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
839
0
    memset(outp, '?', sizeof(Py_UCS1) * slen);
840
0
    assert(_PyUnicode_CheckConsistency(res, 1));
841
0
    return Py_BuildValue("(Nn)", res, end);
842
0
}
843
844
845
static PyObject *
846
_PyCodec_ReplaceUnicodeDecodeError(PyObject *exc)
847
228k
{
848
228k
    Py_ssize_t end;
849
228k
    if (PyUnicodeDecodeError_GetEnd(exc, &end) < 0) {
850
0
        return NULL;
851
0
    }
852
228k
    PyObject *res = codec_handler_unicode_replacement_character(1);
853
228k
    if (res == NULL) {
854
0
        return NULL;
855
0
    }
856
228k
    return Py_BuildValue("(Nn)", res, end);
857
228k
}
858
859
860
static PyObject *
861
_PyCodec_ReplaceUnicodeTranslateError(PyObject *exc)
862
0
{
863
0
    Py_ssize_t start, end, slen;
864
0
    if (_PyUnicodeError_GetParams(exc, NULL, NULL,
865
0
                                  &start, &end, &slen, false) < 0)
866
0
    {
867
0
        return NULL;
868
0
    }
869
0
    PyObject *res = codec_handler_unicode_replacement_character(slen);
870
0
    if (res == NULL) {
871
0
        return NULL;
872
0
    }
873
0
    return Py_BuildValue("(Nn)", res, end);
874
0
}
875
876
877
PyObject *PyCodec_ReplaceErrors(PyObject *exc)
878
228k
{
879
228k
    if (_PyIsUnicodeEncodeError(exc)) {
880
0
        return _PyCodec_ReplaceUnicodeEncodeError(exc);
881
0
    }
882
228k
    else if (_PyIsUnicodeDecodeError(exc)) {
883
228k
        return _PyCodec_ReplaceUnicodeDecodeError(exc);
884
228k
    }
885
0
    else if (_PyIsUnicodeTranslateError(exc)) {
886
0
        return _PyCodec_ReplaceUnicodeTranslateError(exc);
887
0
    }
888
0
    else {
889
0
        wrong_exception_type(exc);
890
0
        return NULL;
891
0
    }
892
228k
}
893
894
895
// --- handler: 'xmlcharrefreplace' -------------------------------------------
896
897
PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
898
0
{
899
0
    if (!_PyIsUnicodeEncodeError(exc)) {
900
0
        wrong_exception_type(exc);
901
0
        return NULL;
902
0
    }
903
904
0
    PyObject *obj;
905
0
    Py_ssize_t objlen, start, end, slen;
906
0
    if (_PyUnicodeError_GetParams(exc,
907
0
                                  &obj, &objlen,
908
0
                                  &start, &end, &slen, false) < 0)
909
0
    {
910
0
        return NULL;
911
0
    }
912
913
    // The number of characters that each character 'ch' contributes
914
    // in the result is 2 + k + 1, where k = min{t >= 1 | 10^t > ch}
915
    // and will be formatted as "&#" + DIGITS + ";". Since the Unicode
916
    // range is below 10^7, each "block" requires at most 2 + 7 + 1
917
    // characters.
918
0
    if (slen > PY_SSIZE_T_MAX / (2 + 7 + 1)) {
919
0
        end = start + PY_SSIZE_T_MAX / (2 + 7 + 1);
920
0
        end = Py_MIN(end, objlen);
921
0
        slen = Py_MAX(0, end - start);
922
0
    }
923
924
0
    Py_ssize_t ressize = 0;
925
0
    for (Py_ssize_t i = start; i < end; ++i) {
926
0
        Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
927
0
        int k = n_decimal_digits_for_codepoint(ch);
928
0
        assert(k != 0);
929
0
        assert(k <= 7);
930
0
        ressize += 2 + k + 1;
931
0
    }
932
933
    /* allocate replacement */
934
0
    PyObject *res = PyUnicode_New(ressize, 127);
935
0
    if (res == NULL) {
936
0
        Py_DECREF(obj);
937
0
        return NULL;
938
0
    }
939
0
    Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
940
    /* generate replacement */
941
0
    for (Py_ssize_t i = start; i < end; ++i) {
942
0
        Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
943
        /*
944
         * Write the decimal representation of 'ch' to the buffer pointed by 'p'
945
         * using at most 7 characters prefixed by '&#' and suffixed by ';'.
946
         */
947
0
        *outp++ = '&';
948
0
        *outp++ = '#';
949
0
        Py_UCS1 *digit_end = outp + n_decimal_digits_for_codepoint(ch);
950
0
        for (Py_UCS1 *p_digit = digit_end - 1; p_digit >= outp; --p_digit) {
951
0
            *p_digit = '0' + (ch % 10);
952
0
            ch /= 10;
953
0
        }
954
0
        assert(ch == 0);
955
0
        outp = digit_end;
956
0
        *outp++ = ';';
957
0
    }
958
0
    assert(_PyUnicode_CheckConsistency(res, 1));
959
0
    PyObject *restuple = Py_BuildValue("(Nn)", res, end);
960
0
    Py_DECREF(obj);
961
0
    return restuple;
962
0
}
963
964
965
// --- handler: 'backslashreplace' --------------------------------------------
966
967
static PyObject *
968
_PyCodec_BackslashReplaceUnicodeEncodeError(PyObject *exc)
969
0
{
970
0
    PyObject *obj;
971
0
    Py_ssize_t objlen, start, end, slen;
972
0
    if (_PyUnicodeError_GetParams(exc,
973
0
                                  &obj, &objlen,
974
0
                                  &start, &end, &slen, false) < 0)
975
0
    {
976
0
        return NULL;
977
0
    }
978
979
    // The number of characters that each character 'ch' contributes
980
    // in the result is 1 + 1 + k, where k >= min{t >= 1 | 16^t > ch}
981
    // and will be formatted as "\\" + ('U'|'u'|'x') + HEXDIGITS,
982
    // where the number of hexdigits is either 2, 4, or 8 (not 6).
983
    // Since the Unicode range is below 10^7, we choose k = 8 whence
984
    // each "block" requires at most 1 + 1 + 8 characters.
985
0
    if (slen > PY_SSIZE_T_MAX / (1 + 1 + 8)) {
986
0
        end = start + PY_SSIZE_T_MAX / (1 + 1 + 8);
987
0
        end = Py_MIN(end, objlen);
988
0
        slen = Py_MAX(0, end - start);
989
0
    }
990
991
0
    Py_ssize_t ressize = 0;
992
0
    for (Py_ssize_t i = start; i < end; ++i) {
993
0
        Py_UCS4 c = PyUnicode_READ_CHAR(obj, i);
994
0
        ressize += codec_handler_unicode_hex_width(c);
995
0
    }
996
0
    PyObject *res = PyUnicode_New(ressize, 127);
997
0
    if (res == NULL) {
998
0
        Py_DECREF(obj);
999
0
        return NULL;
1000
0
    }
1001
0
    Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
1002
0
    for (Py_ssize_t i = start; i < end; ++i) {
1003
0
        Py_UCS4 c = PyUnicode_READ_CHAR(obj, i);
1004
0
        codec_handler_write_unicode_hex(&outp, c);
1005
0
    }
1006
0
    assert(_PyUnicode_CheckConsistency(res, 1));
1007
0
    Py_DECREF(obj);
1008
0
    return Py_BuildValue("(Nn)", res, end);
1009
0
}
1010
1011
1012
static PyObject *
1013
_PyCodec_BackslashReplaceUnicodeDecodeError(PyObject *exc)
1014
0
{
1015
0
    PyObject *obj;
1016
0
    Py_ssize_t objlen, start, end, slen;
1017
0
    if (_PyUnicodeError_GetParams(exc,
1018
0
                                  &obj, &objlen,
1019
0
                                  &start, &end, &slen, true) < 0)
1020
0
    {
1021
0
        return NULL;
1022
0
    }
1023
1024
0
    PyObject *res = PyUnicode_New(4 * slen, 127);
1025
0
    if (res == NULL) {
1026
0
        Py_DECREF(obj);
1027
0
        return NULL;
1028
0
    }
1029
1030
0
    Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
1031
0
    const unsigned char *p = (const unsigned char *)PyBytes_AS_STRING(obj);
1032
0
    for (Py_ssize_t i = start; i < end; i++, outp += 4) {
1033
0
        const unsigned char ch = p[i];
1034
0
        outp[0] = '\\';
1035
0
        outp[1] = 'x';
1036
0
        outp[2] = Py_hexdigits[(ch >> 4) & 0xf];
1037
0
        outp[3] = Py_hexdigits[ch & 0xf];
1038
0
    }
1039
0
    assert(_PyUnicode_CheckConsistency(res, 1));
1040
0
    Py_DECREF(obj);
1041
0
    return Py_BuildValue("(Nn)", res, end);
1042
0
}
1043
1044
1045
static inline PyObject *
1046
_PyCodec_BackslashReplaceUnicodeTranslateError(PyObject *exc)
1047
0
{
1048
    // Same implementation as for UnicodeEncodeError objects.
1049
0
    return _PyCodec_BackslashReplaceUnicodeEncodeError(exc);
1050
0
}
1051
1052
1053
PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
1054
0
{
1055
0
    if (_PyIsUnicodeEncodeError(exc)) {
1056
0
        return _PyCodec_BackslashReplaceUnicodeEncodeError(exc);
1057
0
    }
1058
0
    else if (_PyIsUnicodeDecodeError(exc)) {
1059
0
        return _PyCodec_BackslashReplaceUnicodeDecodeError(exc);
1060
0
    }
1061
0
    else if (_PyIsUnicodeTranslateError(exc)) {
1062
0
        return _PyCodec_BackslashReplaceUnicodeTranslateError(exc);
1063
0
    }
1064
0
    else {
1065
0
        wrong_exception_type(exc);
1066
0
        return NULL;
1067
0
    }
1068
0
}
1069
1070
1071
// --- handler: 'namereplace' -------------------------------------------------
1072
1073
PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
1074
0
{
1075
0
    if (!_PyIsUnicodeEncodeError(exc)) {
1076
0
        wrong_exception_type(exc);
1077
0
        return NULL;
1078
0
    }
1079
1080
0
    _PyUnicode_Name_CAPI *ucnhash_capi = _PyUnicode_GetNameCAPI();
1081
0
    if (ucnhash_capi == NULL) {
1082
0
        return NULL;
1083
0
    }
1084
1085
0
    PyObject *obj;
1086
0
    Py_ssize_t start, end;
1087
0
    if (_PyUnicodeError_GetParams(exc,
1088
0
                                  &obj, NULL,
1089
0
                                  &start, &end, NULL, false) < 0)
1090
0
    {
1091
0
        return NULL;
1092
0
    }
1093
1094
0
    char buffer[256]; /* NAME_MAXLEN in unicodename_db.h */
1095
0
    Py_ssize_t imax = start, ressize = 0, replsize;
1096
0
    for (; imax < end; ++imax) {
1097
0
        Py_UCS4 c = PyUnicode_READ_CHAR(obj, imax);
1098
0
        if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) {
1099
            // If 'c' is recognized by getname(), the corresponding replacement
1100
            // is '\\' + 'N' + '{' + NAME + '}', i.e. 1 + 1 + 1 + len(NAME) + 1
1101
            // characters. Failures of getname() are ignored by the handler.
1102
0
            replsize = 1 + 1 + 1 + strlen(buffer) + 1;
1103
0
        }
1104
0
        else {
1105
0
            replsize = codec_handler_unicode_hex_width(c);
1106
0
        }
1107
0
        if (ressize > PY_SSIZE_T_MAX - replsize) {
1108
0
            break;
1109
0
        }
1110
0
        ressize += replsize;
1111
0
    }
1112
1113
0
    PyObject *res = PyUnicode_New(ressize, 127);
1114
0
    if (res == NULL) {
1115
0
        Py_DECREF(obj);
1116
0
        return NULL;
1117
0
    }
1118
1119
0
    Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
1120
0
    for (Py_ssize_t i = start; i < imax; ++i) {
1121
0
        Py_UCS4 c = PyUnicode_READ_CHAR(obj, i);
1122
0
        if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) {
1123
0
            *outp++ = '\\';
1124
0
            *outp++ = 'N';
1125
0
            *outp++ = '{';
1126
0
            (void)strcpy((char *)outp, buffer);
1127
0
            outp += strlen(buffer);
1128
0
            *outp++ = '}';
1129
0
        }
1130
0
        else {
1131
0
            codec_handler_write_unicode_hex(&outp, c);
1132
0
        }
1133
0
    }
1134
1135
0
    assert(outp == PyUnicode_1BYTE_DATA(res) + ressize);
1136
0
    assert(_PyUnicode_CheckConsistency(res, 1));
1137
0
    PyObject *restuple = Py_BuildValue("(Nn)", res, imax);
1138
0
    Py_DECREF(obj);
1139
0
    return restuple;
1140
0
}
1141
1142
1143
8
#define ENC_UNKNOWN     -1
1144
16
#define ENC_UTF8        0
1145
0
#define ENC_UTF16BE     1
1146
0
#define ENC_UTF16LE     2
1147
0
#define ENC_UTF32BE     3
1148
0
#define ENC_UTF32LE     4
1149
1150
static int
1151
get_standard_encoding_impl(const char *encoding, int *bytelength)
1152
8
{
1153
8
    if (Py_TOLOWER(encoding[0]) == 'u' &&
1154
8
        Py_TOLOWER(encoding[1]) == 't' &&
1155
8
        Py_TOLOWER(encoding[2]) == 'f') {
1156
8
        encoding += 3;
1157
8
        if (*encoding == '-' || *encoding == '_' )
1158
8
            encoding++;
1159
8
        if (encoding[0] == '8' && encoding[1] == '\0') {
1160
8
            *bytelength = 3;
1161
8
            return ENC_UTF8;
1162
8
        }
1163
0
        else if (encoding[0] == '1' && encoding[1] == '6') {
1164
0
            encoding += 2;
1165
0
            *bytelength = 2;
1166
0
            if (*encoding == '\0') {
1167
#ifdef WORDS_BIGENDIAN
1168
                return ENC_UTF16BE;
1169
#else
1170
0
                return ENC_UTF16LE;
1171
0
#endif
1172
0
            }
1173
0
            if (*encoding == '-' || *encoding == '_' )
1174
0
                encoding++;
1175
0
            if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
1176
0
                if (Py_TOLOWER(encoding[0]) == 'b')
1177
0
                    return ENC_UTF16BE;
1178
0
                if (Py_TOLOWER(encoding[0]) == 'l')
1179
0
                    return ENC_UTF16LE;
1180
0
            }
1181
0
        }
1182
0
        else if (encoding[0] == '3' && encoding[1] == '2') {
1183
0
            encoding += 2;
1184
0
            *bytelength = 4;
1185
0
            if (*encoding == '\0') {
1186
#ifdef WORDS_BIGENDIAN
1187
                return ENC_UTF32BE;
1188
#else
1189
0
                return ENC_UTF32LE;
1190
0
#endif
1191
0
            }
1192
0
            if (*encoding == '-' || *encoding == '_' )
1193
0
                encoding++;
1194
0
            if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
1195
0
                if (Py_TOLOWER(encoding[0]) == 'b')
1196
0
                    return ENC_UTF32BE;
1197
0
                if (Py_TOLOWER(encoding[0]) == 'l')
1198
0
                    return ENC_UTF32LE;
1199
0
            }
1200
0
        }
1201
8
    }
1202
0
    else if (strcmp(encoding, "cp65001") == 0) {
1203
0
        *bytelength = 3;
1204
0
        return ENC_UTF8;
1205
0
    }
1206
0
    return ENC_UNKNOWN;
1207
8
}
1208
1209
1210
static int
1211
get_standard_encoding(PyObject *encoding, int *code, int *bytelength)
1212
8
{
1213
8
    const char *encoding_cstr = PyUnicode_AsUTF8(encoding);
1214
8
    if (encoding_cstr == NULL) {
1215
0
        return -1;
1216
0
    }
1217
8
    *code = get_standard_encoding_impl(encoding_cstr, bytelength);
1218
8
    return 0;
1219
8
}
1220
1221
1222
// --- handler: 'surrogatepass' -----------------------------------------------
1223
1224
static PyObject *
1225
_PyCodec_SurrogatePassUnicodeEncodeError(PyObject *exc)
1226
0
{
1227
0
    PyObject *encoding = PyUnicodeEncodeError_GetEncoding(exc);
1228
0
    if (encoding == NULL) {
1229
0
        return NULL;
1230
0
    }
1231
0
    int code, bytelength;
1232
0
    int rc = get_standard_encoding(encoding, &code, &bytelength);
1233
0
    Py_DECREF(encoding);
1234
0
    if (rc < 0) {
1235
0
        return NULL;
1236
0
    }
1237
0
    if (code == ENC_UNKNOWN) {
1238
0
        goto bail;
1239
0
    }
1240
1241
0
    PyObject *obj;
1242
0
    Py_ssize_t objlen, start, end, slen;
1243
0
    if (_PyUnicodeError_GetParams(exc,
1244
0
                                  &obj, &objlen,
1245
0
                                  &start, &end, &slen, false) < 0)
1246
0
    {
1247
0
        return NULL;
1248
0
    }
1249
1250
0
    if (slen > PY_SSIZE_T_MAX / bytelength) {
1251
0
        end = start + PY_SSIZE_T_MAX / bytelength;
1252
0
        end = Py_MIN(end, objlen);
1253
0
        slen = Py_MAX(0, end - start);
1254
0
    }
1255
1256
0
    PyObject *res = PyBytes_FromStringAndSize(NULL, bytelength * slen);
1257
0
    if (res == NULL) {
1258
0
        Py_DECREF(obj);
1259
0
        return NULL;
1260
0
    }
1261
1262
0
    unsigned char *outp = (unsigned char *)PyBytes_AsString(res);
1263
0
    for (Py_ssize_t i = start; i < end; i++) {
1264
0
        Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
1265
0
        if (!Py_UNICODE_IS_SURROGATE(ch)) {
1266
            /* Not a surrogate, fail with original exception */
1267
0
            Py_DECREF(obj);
1268
0
            Py_DECREF(res);
1269
0
            goto bail;
1270
0
        }
1271
0
        switch (code) {
1272
0
            case ENC_UTF8: {
1273
0
                *outp++ = (unsigned char)(0xe0 | (ch >> 12));
1274
0
                *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
1275
0
                *outp++ = (unsigned char)(0x80 | (ch & 0x3f));
1276
0
                break;
1277
0
            }
1278
0
            case ENC_UTF16LE: {
1279
0
                *outp++ = (unsigned char)ch;
1280
0
                *outp++ = (unsigned char)(ch >> 8);
1281
0
                break;
1282
0
            }
1283
0
            case ENC_UTF16BE: {
1284
0
                *outp++ = (unsigned char)(ch >> 8);
1285
0
                *outp++ = (unsigned char)ch;
1286
0
                break;
1287
0
            }
1288
0
            case ENC_UTF32LE: {
1289
0
                *outp++ = (unsigned char)ch;
1290
0
                *outp++ = (unsigned char)(ch >> 8);
1291
0
                *outp++ = (unsigned char)(ch >> 16);
1292
0
                *outp++ = (unsigned char)(ch >> 24);
1293
0
                break;
1294
0
            }
1295
0
            case ENC_UTF32BE: {
1296
0
                *outp++ = (unsigned char)(ch >> 24);
1297
0
                *outp++ = (unsigned char)(ch >> 16);
1298
0
                *outp++ = (unsigned char)(ch >> 8);
1299
0
                *outp++ = (unsigned char)ch;
1300
0
                break;
1301
0
            }
1302
0
        }
1303
0
    }
1304
1305
0
    Py_DECREF(obj);
1306
0
    PyObject *restuple = Py_BuildValue("(Nn)", res, end);
1307
0
    return restuple;
1308
1309
0
bail:
1310
0
    PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1311
0
    return NULL;
1312
0
}
1313
1314
1315
static PyObject *
1316
_PyCodec_SurrogatePassUnicodeDecodeError(PyObject *exc)
1317
8
{
1318
8
    PyObject *encoding = PyUnicodeDecodeError_GetEncoding(exc);
1319
8
    if (encoding == NULL) {
1320
0
        return NULL;
1321
0
    }
1322
8
    int code, bytelength;
1323
8
    int rc = get_standard_encoding(encoding, &code, &bytelength);
1324
8
    Py_DECREF(encoding);
1325
8
    if (rc < 0) {
1326
0
        return NULL;
1327
0
    }
1328
8
    if (code == ENC_UNKNOWN) {
1329
0
        goto bail;
1330
0
    }
1331
1332
8
    PyObject *obj;
1333
8
    Py_ssize_t objlen, start, end, slen;
1334
8
    if (_PyUnicodeError_GetParams(exc,
1335
8
                                  &obj, &objlen,
1336
8
                                  &start, &end, &slen, true) < 0)
1337
0
    {
1338
0
        return NULL;
1339
0
    }
1340
1341
    /* Try decoding a single surrogate character. If
1342
       there are more, let the codec call us again. */
1343
8
    Py_UCS4 ch = 0;
1344
8
    const unsigned char *p = (const unsigned char *)PyBytes_AS_STRING(obj);
1345
8
    p += start;
1346
1347
8
    if (objlen - start >= bytelength) {
1348
8
        switch (code) {
1349
8
            case ENC_UTF8: {
1350
8
                if ((p[0] & 0xf0) == 0xe0 &&
1351
8
                    (p[1] & 0xc0) == 0x80 &&
1352
8
                    (p[2] & 0xc0) == 0x80)
1353
8
                {
1354
                    /* it's a three-byte code */
1355
8
                    ch = ((p[0] & 0x0f) << 12) +
1356
8
                         ((p[1] & 0x3f) << 6)  +
1357
8
                          (p[2] & 0x3f);
1358
8
                }
1359
8
                break;
1360
0
            }
1361
0
            case ENC_UTF16LE: {
1362
0
                ch = p[1] << 8 | p[0];
1363
0
                break;
1364
0
            }
1365
0
            case ENC_UTF16BE: {
1366
0
                ch = p[0] << 8 | p[1];
1367
0
                break;
1368
0
            }
1369
0
            case ENC_UTF32LE: {
1370
0
                ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
1371
0
                break;
1372
0
            }
1373
0
            case ENC_UTF32BE: {
1374
0
                ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
1375
0
                break;
1376
0
            }
1377
8
        }
1378
8
    }
1379
8
    Py_DECREF(obj);
1380
8
    if (!Py_UNICODE_IS_SURROGATE(ch)) {
1381
0
        goto bail;
1382
0
    }
1383
1384
8
    PyObject *res = PyUnicode_FromOrdinal(ch);
1385
8
    if (res == NULL) {
1386
0
        return NULL;
1387
0
    }
1388
8
    return Py_BuildValue("(Nn)", res, start + bytelength);
1389
1390
0
bail:
1391
0
    PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1392
0
    return NULL;
1393
8
}
1394
1395
1396
/* This handler is declared static until someone demonstrates
1397
   a need to call it directly. */
1398
static PyObject *
1399
PyCodec_SurrogatePassErrors(PyObject *exc)
1400
8
{
1401
8
    if (_PyIsUnicodeEncodeError(exc)) {
1402
0
        return _PyCodec_SurrogatePassUnicodeEncodeError(exc);
1403
0
    }
1404
8
    else if (_PyIsUnicodeDecodeError(exc)) {
1405
8
        return _PyCodec_SurrogatePassUnicodeDecodeError(exc);
1406
8
    }
1407
0
    else {
1408
0
        wrong_exception_type(exc);
1409
0
        return NULL;
1410
0
    }
1411
8
}
1412
1413
1414
// --- handler: 'surrogateescape' ---------------------------------------------
1415
1416
static PyObject *
1417
_PyCodec_SurrogateEscapeUnicodeEncodeError(PyObject *exc)
1418
12.2k
{
1419
12.2k
    PyObject *obj;
1420
12.2k
    Py_ssize_t start, end, slen;
1421
12.2k
    if (_PyUnicodeError_GetParams(exc,
1422
12.2k
                                  &obj, NULL,
1423
12.2k
                                  &start, &end, &slen, false) < 0)
1424
0
    {
1425
0
        return NULL;
1426
0
    }
1427
1428
12.2k
    PyObject *res = PyBytes_FromStringAndSize(NULL, slen);
1429
12.2k
    if (res == NULL) {
1430
0
        Py_DECREF(obj);
1431
0
        return NULL;
1432
0
    }
1433
1434
12.2k
    char *outp = PyBytes_AsString(res);
1435
12.2k
    for (Py_ssize_t i = start; i < end; i++) {
1436
12.2k
        Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
1437
12.2k
        if (ch < 0xdc80 || ch > 0xdcff) {
1438
            /* Not a UTF-8b surrogate, fail with original exception. */
1439
12.2k
            Py_DECREF(obj);
1440
12.2k
            Py_DECREF(res);
1441
12.2k
            PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1442
12.2k
            return NULL;
1443
12.2k
        }
1444
0
        *outp++ = ch - 0xdc00;
1445
0
    }
1446
0
    Py_DECREF(obj);
1447
1448
0
    return Py_BuildValue("(Nn)", res, end);
1449
12.2k
}
1450
1451
1452
static PyObject *
1453
_PyCodec_SurrogateEscapeUnicodeDecodeError(PyObject *exc)
1454
86.6k
{
1455
86.6k
    PyObject *obj;
1456
86.6k
    Py_ssize_t start, end, slen;
1457
86.6k
    if (_PyUnicodeError_GetParams(exc,
1458
86.6k
                                  &obj, NULL,
1459
86.6k
                                  &start, &end, &slen, true) < 0)
1460
0
    {
1461
0
        return NULL;
1462
0
    }
1463
1464
86.6k
    Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */
1465
86.6k
    int consumed = 0;
1466
86.6k
    const unsigned char *p = (const unsigned char *)PyBytes_AS_STRING(obj);
1467
193k
    while (consumed < 4 && consumed < slen) {
1468
        /* Refuse to escape ASCII bytes. */
1469
162k
        if (p[start + consumed] < 128) {
1470
54.9k
            break;
1471
54.9k
        }
1472
107k
        ch[consumed] = 0xdc00 + p[start + consumed];
1473
107k
        consumed++;
1474
107k
    }
1475
86.6k
    Py_DECREF(obj);
1476
1477
86.6k
    if (consumed == 0) {
1478
        /* Codec complained about ASCII byte. */
1479
30.1k
        PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1480
30.1k
        return NULL;
1481
30.1k
    }
1482
1483
56.4k
    PyObject *str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed);
1484
56.4k
    if (str == NULL) {
1485
0
        return NULL;
1486
0
    }
1487
56.4k
    return Py_BuildValue("(Nn)", str, start + consumed);
1488
56.4k
}
1489
1490
1491
static PyObject *
1492
PyCodec_SurrogateEscapeErrors(PyObject *exc)
1493
98.8k
{
1494
98.8k
    if (_PyIsUnicodeEncodeError(exc)) {
1495
12.2k
        return _PyCodec_SurrogateEscapeUnicodeEncodeError(exc);
1496
12.2k
    }
1497
86.6k
    else if (_PyIsUnicodeDecodeError(exc)) {
1498
86.6k
        return _PyCodec_SurrogateEscapeUnicodeDecodeError(exc);
1499
86.6k
    }
1500
0
    else {
1501
0
        wrong_exception_type(exc);
1502
0
        return NULL;
1503
0
    }
1504
98.8k
}
1505
1506
1507
// --- Codecs registry handlers -----------------------------------------------
1508
1509
static inline PyObject *
1510
strict_errors(PyObject *Py_UNUSED(self), PyObject *exc)
1511
2.48M
{
1512
2.48M
    return PyCodec_StrictErrors(exc);
1513
2.48M
}
1514
1515
1516
static inline PyObject *
1517
ignore_errors(PyObject *Py_UNUSED(self), PyObject *exc)
1518
0
{
1519
0
    return PyCodec_IgnoreErrors(exc);
1520
0
}
1521
1522
1523
static inline PyObject *
1524
replace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
1525
228k
{
1526
228k
    return PyCodec_ReplaceErrors(exc);
1527
228k
}
1528
1529
1530
static inline PyObject *
1531
xmlcharrefreplace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
1532
0
{
1533
0
    return PyCodec_XMLCharRefReplaceErrors(exc);
1534
0
}
1535
1536
1537
static inline PyObject *
1538
backslashreplace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
1539
0
{
1540
0
    return PyCodec_BackslashReplaceErrors(exc);
1541
0
}
1542
1543
1544
static inline PyObject *
1545
namereplace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
1546
0
{
1547
0
    return PyCodec_NameReplaceErrors(exc);
1548
0
}
1549
1550
1551
static inline PyObject *
1552
surrogatepass_errors(PyObject *Py_UNUSED(self), PyObject *exc)
1553
8
{
1554
8
    return PyCodec_SurrogatePassErrors(exc);
1555
8
}
1556
1557
1558
static inline PyObject *
1559
surrogateescape_errors(PyObject *Py_UNUSED(self), PyObject *exc)
1560
98.8k
{
1561
98.8k
    return PyCodec_SurrogateEscapeErrors(exc);
1562
98.8k
}
1563
1564
1565
PyStatus
1566
_PyCodec_InitRegistry(PyInterpreterState *interp)
1567
36
{
1568
36
    static struct {
1569
36
        const char *name;
1570
36
        PyMethodDef def;
1571
36
    } methods[] =
1572
36
    {
1573
36
        {
1574
36
            "strict",
1575
36
            {
1576
36
                "strict_errors",
1577
36
                strict_errors,
1578
36
                METH_O,
1579
36
                PyDoc_STR("Implements the 'strict' error handling, which "
1580
36
                          "raises a UnicodeError on coding errors.")
1581
36
            }
1582
36
        },
1583
36
        {
1584
36
            "ignore",
1585
36
            {
1586
36
                "ignore_errors",
1587
36
                ignore_errors,
1588
36
                METH_O,
1589
36
                PyDoc_STR("Implements the 'ignore' error handling, which "
1590
36
                          "ignores malformed data and continues.")
1591
36
            }
1592
36
        },
1593
36
        {
1594
36
            "replace",
1595
36
            {
1596
36
                "replace_errors",
1597
36
                replace_errors,
1598
36
                METH_O,
1599
36
                PyDoc_STR("Implements the 'replace' error handling, which "
1600
36
                          "replaces malformed data with a replacement marker.")
1601
36
            }
1602
36
        },
1603
36
        {
1604
36
            "xmlcharrefreplace",
1605
36
            {
1606
36
                "xmlcharrefreplace_errors",
1607
36
                xmlcharrefreplace_errors,
1608
36
                METH_O,
1609
36
                PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
1610
36
                          "which replaces an unencodable character with the "
1611
36
                          "appropriate XML character reference.")
1612
36
            }
1613
36
        },
1614
36
        {
1615
36
            "backslashreplace",
1616
36
            {
1617
36
                "backslashreplace_errors",
1618
36
                backslashreplace_errors,
1619
36
                METH_O,
1620
36
                PyDoc_STR("Implements the 'backslashreplace' error handling, "
1621
36
                          "which replaces malformed data with a backslashed "
1622
36
                          "escape sequence.")
1623
36
            }
1624
36
        },
1625
36
        {
1626
36
            "namereplace",
1627
36
            {
1628
36
                "namereplace_errors",
1629
36
                namereplace_errors,
1630
36
                METH_O,
1631
36
                PyDoc_STR("Implements the 'namereplace' error handling, "
1632
36
                          "which replaces an unencodable character with a "
1633
36
                          "\\N{...} escape sequence.")
1634
36
            }
1635
36
        },
1636
36
        {
1637
36
            "surrogatepass",
1638
36
            {
1639
36
                "surrogatepass",
1640
36
                surrogatepass_errors,
1641
36
                METH_O
1642
36
            }
1643
36
        },
1644
36
        {
1645
36
            "surrogateescape",
1646
36
            {
1647
36
                "surrogateescape",
1648
36
                surrogateescape_errors,
1649
36
                METH_O
1650
36
            }
1651
36
        }
1652
36
    };
1653
    // ensure that the built-in error handlers' names are kept in sync
1654
36
    assert(Py_ARRAY_LENGTH(methods) == Py_ARRAY_LENGTH(codecs_builtin_error_handlers));
1655
1656
36
    assert(interp->codecs.initialized == 0);
1657
36
    interp->codecs.search_path = PyList_New(0);
1658
36
    if (interp->codecs.search_path == NULL) {
1659
0
        return PyStatus_NoMemory();
1660
0
    }
1661
36
    interp->codecs.search_cache = PyDict_New();
1662
36
    if (interp->codecs.search_cache == NULL) {
1663
0
        return PyStatus_NoMemory();
1664
0
    }
1665
36
    interp->codecs.error_registry = PyDict_New();
1666
36
    if (interp->codecs.error_registry == NULL) {
1667
0
        return PyStatus_NoMemory();
1668
0
    }
1669
324
    for (size_t i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
1670
288
        PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL);
1671
288
        if (func == NULL) {
1672
0
            return PyStatus_NoMemory();
1673
0
        }
1674
1675
288
        int res = PyDict_SetItemString(interp->codecs.error_registry,
1676
288
                                       methods[i].name, func);
1677
288
        Py_DECREF(func);
1678
288
        if (res < 0) {
1679
0
            return PyStatus_Error("Failed to insert into codec error registry");
1680
0
        }
1681
288
    }
1682
1683
36
    interp->codecs.initialized = 1;
1684
1685
    // Importing `encodings' will call back into this module to register codec
1686
    // search functions, so this is done after everything else is initialized.
1687
36
    PyObject *mod = PyImport_ImportModule("encodings");
1688
36
    if (mod == NULL) {
1689
0
        return PyStatus_Error("Failed to import encodings module");
1690
0
    }
1691
36
    Py_DECREF(mod);
1692
1693
36
    return PyStatus_Ok();
1694
36
}
1695
1696
void
1697
_PyCodec_Fini(PyInterpreterState *interp)
1698
0
{
1699
0
    Py_CLEAR(interp->codecs.search_path);
1700
0
    Py_CLEAR(interp->codecs.search_cache);
1701
    Py_CLEAR(interp->codecs.error_registry);
1702
0
    interp->codecs.initialized = 0;
1703
0
}