Coverage Report

Created: 2026-05-30 06:18

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/cpython/Python/codecs.c
Line
Count
Source
1
/* ------------------------------------------------------------------------
2
3
   Python Codec Registry and support functions
4
5
Written by Marc-Andre Lemburg (mal@lemburg.com).
6
7
Copyright (c) Corporation for National Research Initiatives.
8
9
   ------------------------------------------------------------------------ */
10
11
#include "Python.h"
12
#include "pycore_call.h"          // _PyObject_CallNoArgs()
13
#include "pycore_codecs.h"        // export _PyCodec_LookupTextEncoding()
14
#include "pycore_interp.h"        // PyInterpreterState.codec_search_path
15
#include "pycore_pyerrors.h"      // _PyErr_FormatNote()
16
#include "pycore_pystate.h"       // _PyInterpreterState_GET()
17
#include "pycore_runtime.h"       // _Py_ID()
18
#include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
19
#include "pycore_unicodeobject.h" // _PyUnicode_InternMortal()
20
#include "pycore_pyatomic_ft_wrappers.h"
21
22
static const char *codecs_builtin_error_handlers[] = {
23
    "strict", "ignore", "replace",
24
    "xmlcharrefreplace", "backslashreplace", "namereplace",
25
    "surrogatepass", "surrogateescape",
26
};
27
28
const char *Py_hexdigits = "0123456789abcdef";
29
30
/* --- Codec Registry ----------------------------------------------------- */
31
32
int PyCodec_Register(PyObject *search_function)
33
37
{
34
37
    PyInterpreterState *interp = _PyInterpreterState_GET();
35
37
    assert(interp->codecs.initialized);
36
37
    if (search_function == NULL) {
37
0
        PyErr_BadArgument();
38
0
        goto onError;
39
0
    }
40
37
    if (!PyCallable_Check(search_function)) {
41
0
        PyErr_SetString(PyExc_TypeError, "argument must be callable");
42
0
        goto onError;
43
0
    }
44
37
    FT_MUTEX_LOCK(&interp->codecs.search_path_mutex);
45
37
    int ret = PyList_Append(interp->codecs.search_path, search_function);
46
37
    FT_MUTEX_UNLOCK(&interp->codecs.search_path_mutex);
47
48
37
    return ret;
49
50
0
 onError:
51
0
    return -1;
52
37
}
53
54
int
55
PyCodec_Unregister(PyObject *search_function)
56
0
{
57
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
58
0
    if (interp->codecs.initialized != 1) {
59
        /* Do nothing if codecs state was cleared (only possible during
60
           interpreter shutdown). */
61
0
        return 0;
62
0
    }
63
64
0
    PyObject *codec_search_path = interp->codecs.search_path;
65
0
    assert(PyList_CheckExact(codec_search_path));
66
0
    for (Py_ssize_t i = 0; i < PyList_GET_SIZE(codec_search_path); i++) {
67
0
        FT_MUTEX_LOCK(&interp->codecs.search_path_mutex);
68
0
        PyObject *item = PyList_GetItemRef(codec_search_path, i);
69
0
        int ret = 1;
70
0
        if (item == search_function) {
71
            // We hold a reference to the item, so its destructor can't run
72
            // while we hold search_path_mutex.
73
0
            ret = PyList_SetSlice(codec_search_path, i, i+1, NULL);
74
0
        }
75
0
        FT_MUTEX_UNLOCK(&interp->codecs.search_path_mutex);
76
0
        Py_DECREF(item);
77
0
        if (ret != 1) {
78
0
            assert(interp->codecs.search_cache != NULL);
79
0
            assert(PyDict_CheckExact(interp->codecs.search_cache));
80
0
            PyDict_Clear(interp->codecs.search_cache);
81
0
            return ret;
82
0
        }
83
0
    }
84
0
    return 0;
85
0
}
86
87
/* Convert a string to a normalized Python string: all ASCII letters are
88
   converted to lower case, spaces are replaced with hyphens. */
89
90
static PyObject*
91
normalizestring(const char *string)
92
2.36M
{
93
2.36M
    size_t i;
94
2.36M
    size_t len = strlen(string);
95
2.36M
    char *p;
96
2.36M
    PyObject *v;
97
98
2.36M
    if (len > PY_SSIZE_T_MAX) {
99
0
        PyErr_SetString(PyExc_OverflowError, "string is too large");
100
0
        return NULL;
101
0
    }
102
103
2.36M
    p = PyMem_Malloc(len + 1);
104
2.36M
    if (p == NULL)
105
0
        return PyErr_NoMemory();
106
33.7M
    for (i = 0; i < len; i++) {
107
31.4M
        char ch = string[i];
108
31.4M
        if (ch == ' ')
109
237k
            ch = '-';
110
31.1M
        else
111
31.1M
            ch = Py_TOLOWER(Py_CHARMASK(ch));
112
31.4M
        p[i] = ch;
113
31.4M
    }
114
2.36M
    p[i] = '\0';
115
2.36M
    v = PyUnicode_FromString(p);
116
2.36M
    PyMem_Free(p);
117
2.36M
    return v;
118
2.36M
}
119
120
/* Lookup the given encoding and return a tuple providing the codec
121
   facilities.
122
123
   ASCII letters in the encoding string is looked up converted to all
124
   lower case. This makes encodings looked up through this mechanism
125
   effectively case-insensitive. Spaces are replaced with hyphens for
126
   names like "US ASCII" and "ISO 8859-1".
127
128
   If no codec is found, a LookupError is set and NULL returned.
129
130
   As side effect, this tries to load the encodings package, if not
131
   yet done. This is part of the lazy load strategy for the encodings
132
   package.
133
134
*/
135
136
PyObject *_PyCodec_Lookup(const char *encoding)
137
2.36M
{
138
2.36M
    if (encoding == NULL) {
139
0
        PyErr_BadArgument();
140
0
        return NULL;
141
0
    }
142
143
2.36M
    PyInterpreterState *interp = _PyInterpreterState_GET();
144
2.36M
    assert(interp->codecs.initialized);
145
146
    /* Convert the encoding to a normalized Python string: all
147
       ASCII letters are converted to lower case, spaces are
148
       replaced with hyphens. */
149
2.36M
    PyObject *v = normalizestring(encoding);
150
2.36M
    if (v == NULL) {
151
0
        return NULL;
152
0
    }
153
154
    /* Intern the string. We'll make it immortal later if lookup succeeds. */
155
2.36M
    _PyUnicode_InternMortal(interp, &v);
156
157
    /* First, try to lookup the name in the registry dictionary */
158
2.36M
    PyObject *result;
159
2.36M
    if (PyDict_GetItemRef(interp->codecs.search_cache, v, &result) < 0) {
160
0
        goto onError;
161
0
    }
162
2.36M
    if (result != NULL) {
163
2.28M
        Py_DECREF(v);
164
2.28M
        return result;
165
2.28M
    }
166
167
    /* Next, scan the search functions in order of registration */
168
77.8k
    const Py_ssize_t len = PyList_Size(interp->codecs.search_path);
169
77.8k
    if (len < 0)
170
0
        goto onError;
171
77.8k
    if (len == 0) {
172
0
        PyErr_SetString(PyExc_LookupError,
173
0
                        "no codec search functions registered: "
174
0
                        "can't find encoding");
175
0
        goto onError;
176
0
    }
177
178
77.8k
    Py_ssize_t i;
179
153k
    for (i = 0; i < len; i++) {
180
77.8k
        PyObject *func;
181
182
77.8k
        func = PyList_GetItemRef(interp->codecs.search_path, i);
183
77.8k
        if (func == NULL)
184
0
            goto onError;
185
77.8k
        result = PyObject_CallOneArg(func, v);
186
77.8k
        Py_DECREF(func);
187
77.8k
        if (result == NULL)
188
0
            goto onError;
189
77.8k
        if (result == Py_None) {
190
75.7k
            Py_CLEAR(result);
191
75.7k
            continue;
192
75.7k
        }
193
2.08k
        if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
194
0
            PyErr_SetString(PyExc_TypeError,
195
0
                            "codec search functions must return 4-tuples");
196
0
            Py_DECREF(result);
197
0
            goto onError;
198
0
        }
199
2.08k
        break;
200
2.08k
    }
201
77.8k
    if (result == NULL) {
202
        /* XXX Perhaps we should cache misses too ? */
203
75.7k
        PyErr_Format(PyExc_LookupError,
204
75.7k
                     "unknown encoding: %s", encoding);
205
75.7k
        goto onError;
206
75.7k
    }
207
208
2.08k
    _PyUnicode_InternImmortal(interp, &v);
209
210
    /* Cache and return the result */
211
2.08k
    if (PyDict_SetItem(interp->codecs.search_cache, v, result) < 0) {
212
0
        Py_DECREF(result);
213
0
        goto onError;
214
0
    }
215
2.08k
    Py_DECREF(v);
216
2.08k
    return result;
217
218
75.7k
 onError:
219
75.7k
    Py_DECREF(v);
220
75.7k
    return NULL;
221
2.08k
}
222
223
/* Codec registry encoding check API. */
224
225
int PyCodec_KnownEncoding(const char *encoding)
226
0
{
227
0
    PyObject *codecs;
228
229
0
    codecs = _PyCodec_Lookup(encoding);
230
0
    if (!codecs) {
231
0
        PyErr_Clear();
232
0
        return 0;
233
0
    }
234
0
    else {
235
0
        Py_DECREF(codecs);
236
0
        return 1;
237
0
    }
238
0
}
239
240
static
241
PyObject *args_tuple(PyObject *object,
242
                     const char *errors)
243
2.01M
{
244
2.01M
    PyObject *args;
245
246
2.01M
    args = PyTuple_New(1 + (errors != NULL));
247
2.01M
    if (args == NULL)
248
0
        return NULL;
249
2.01M
    PyTuple_SET_ITEM(args, 0, Py_NewRef(object));
250
2.01M
    if (errors) {
251
191k
        PyObject *v;
252
253
191k
        v = PyUnicode_FromString(errors);
254
191k
        if (v == NULL) {
255
0
            Py_DECREF(args);
256
0
            return NULL;
257
0
        }
258
191k
        PyTuple_SET_ITEM(args, 1, v);
259
191k
    }
260
2.01M
    return args;
261
2.01M
}
262
263
/* Helper function to get a codec item */
264
265
static
266
PyObject *codec_getitem(const char *encoding, int index)
267
0
{
268
0
    PyObject *codecs;
269
0
    PyObject *v;
270
271
0
    codecs = _PyCodec_Lookup(encoding);
272
0
    if (codecs == NULL)
273
0
        return NULL;
274
0
    v = PyTuple_GET_ITEM(codecs, index);
275
0
    Py_DECREF(codecs);
276
0
    return Py_NewRef(v);
277
0
}
278
279
/* Helper functions to create an incremental codec. */
280
static
281
PyObject *codec_makeincrementalcodec(PyObject *codec_info,
282
                                     const char *errors,
283
                                     const char *attrname)
284
130
{
285
130
    PyObject *ret, *inccodec;
286
287
130
    inccodec = PyObject_GetAttrString(codec_info, attrname);
288
130
    if (inccodec == NULL)
289
0
        return NULL;
290
130
    if (errors)
291
130
        ret = PyObject_CallFunction(inccodec, "s", errors);
292
0
    else
293
0
        ret = _PyObject_CallNoArgs(inccodec);
294
130
    Py_DECREF(inccodec);
295
130
    return ret;
296
130
}
297
298
static
299
PyObject *codec_getincrementalcodec(const char *encoding,
300
                                    const char *errors,
301
                                    const char *attrname)
302
0
{
303
0
    PyObject *codec_info, *ret;
304
305
0
    codec_info = _PyCodec_Lookup(encoding);
306
0
    if (codec_info == NULL)
307
0
        return NULL;
308
0
    ret = codec_makeincrementalcodec(codec_info, errors, attrname);
309
0
    Py_DECREF(codec_info);
310
0
    return ret;
311
0
}
312
313
/* Helper function to create a stream codec. */
314
315
static
316
PyObject *codec_getstreamcodec(const char *encoding,
317
                               PyObject *stream,
318
                               const char *errors,
319
                               const int index)
320
0
{
321
0
    PyObject *codecs, *streamcodec, *codeccls;
322
323
0
    codecs = _PyCodec_Lookup(encoding);
324
0
    if (codecs == NULL)
325
0
        return NULL;
326
327
0
    codeccls = PyTuple_GET_ITEM(codecs, index);
328
0
    if (errors != NULL)
329
0
        streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
330
0
    else
331
0
        streamcodec = PyObject_CallOneArg(codeccls, stream);
332
0
    Py_DECREF(codecs);
333
0
    return streamcodec;
334
0
}
335
336
/* Helpers to work with the result of _PyCodec_Lookup
337
338
 */
339
PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info,
340
                                             const char *errors)
341
55
{
342
55
    return codec_makeincrementalcodec(codec_info, errors,
343
55
                                      "incrementaldecoder");
344
55
}
345
346
PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info,
347
                                             const char *errors)
348
75
{
349
75
    return codec_makeincrementalcodec(codec_info, errors,
350
75
                                      "incrementalencoder");
351
75
}
352
353
354
/* Convenience APIs to query the Codec registry.
355
356
   All APIs return a codec object with incremented refcount.
357
358
 */
359
360
PyObject *PyCodec_Encoder(const char *encoding)
361
0
{
362
0
    return codec_getitem(encoding, 0);
363
0
}
364
365
PyObject *PyCodec_Decoder(const char *encoding)
366
0
{
367
0
    return codec_getitem(encoding, 1);
368
0
}
369
370
PyObject *PyCodec_IncrementalEncoder(const char *encoding,
371
                                     const char *errors)
372
0
{
373
0
    return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
374
0
}
375
376
PyObject *PyCodec_IncrementalDecoder(const char *encoding,
377
                                     const char *errors)
378
0
{
379
0
    return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
380
0
}
381
382
PyObject *PyCodec_StreamReader(const char *encoding,
383
                               PyObject *stream,
384
                               const char *errors)
385
0
{
386
0
    return codec_getstreamcodec(encoding, stream, errors, 2);
387
0
}
388
389
PyObject *PyCodec_StreamWriter(const char *encoding,
390
                               PyObject *stream,
391
                               const char *errors)
392
0
{
393
0
    return codec_getstreamcodec(encoding, stream, errors, 3);
394
0
}
395
396
/* Encode an object (e.g. a Unicode object) using the given encoding
397
   and return the resulting encoded object (usually a Python string).
398
399
   errors is passed to the encoder factory as argument if non-NULL. */
400
401
static PyObject *
402
_PyCodec_EncodeInternal(PyObject *object,
403
                        PyObject *encoder,
404
                        const char *encoding,
405
                        const char *errors)
406
855k
{
407
855k
    PyObject *args = NULL, *result = NULL;
408
855k
    PyObject *v = NULL;
409
410
855k
    args = args_tuple(object, errors);
411
855k
    if (args == NULL)
412
0
        goto onError;
413
414
855k
    result = PyObject_Call(encoder, args, NULL);
415
855k
    if (result == NULL) {
416
0
        _PyErr_FormatNote("%s with '%s' codec failed", "encoding", encoding);
417
0
        goto onError;
418
0
    }
419
420
855k
    if (!PyTuple_Check(result) ||
421
855k
        PyTuple_GET_SIZE(result) != 2) {
422
0
        PyErr_SetString(PyExc_TypeError,
423
0
                        "encoder must return a tuple (object, integer)");
424
0
        goto onError;
425
0
    }
426
855k
    v = Py_NewRef(PyTuple_GET_ITEM(result,0));
427
    /* We don't check or use the second (integer) entry. */
428
429
855k
    Py_DECREF(args);
430
855k
    Py_DECREF(encoder);
431
855k
    Py_DECREF(result);
432
855k
    return v;
433
434
0
 onError:
435
0
    Py_XDECREF(result);
436
0
    Py_XDECREF(args);
437
0
    Py_XDECREF(encoder);
438
0
    return NULL;
439
855k
}
440
441
/* Decode an object (usually a Python string) using the given encoding
442
   and return an equivalent object (e.g. a Unicode object).
443
444
   errors is passed to the decoder factory as argument if non-NULL. */
445
446
static PyObject *
447
_PyCodec_DecodeInternal(PyObject *object,
448
                        PyObject *decoder,
449
                        const char *encoding,
450
                        const char *errors)
451
1.15M
{
452
1.15M
    PyObject *args = NULL, *result = NULL;
453
1.15M
    PyObject *v;
454
455
1.15M
    args = args_tuple(object, errors);
456
1.15M
    if (args == NULL)
457
0
        goto onError;
458
459
1.15M
    result = PyObject_Call(decoder, args, NULL);
460
1.15M
    if (result == NULL) {
461
79.5k
        _PyErr_FormatNote("%s with '%s' codec failed", "decoding", encoding);
462
79.5k
        goto onError;
463
79.5k
    }
464
1.07M
    if (!PyTuple_Check(result) ||
465
1.07M
        PyTuple_GET_SIZE(result) != 2) {
466
0
        PyErr_SetString(PyExc_TypeError,
467
0
                        "decoder must return a tuple (object,integer)");
468
0
        goto onError;
469
0
    }
470
1.07M
    v = Py_NewRef(PyTuple_GET_ITEM(result,0));
471
    /* We don't check or use the second (integer) entry. */
472
473
1.07M
    Py_DECREF(args);
474
1.07M
    Py_DECREF(decoder);
475
1.07M
    Py_DECREF(result);
476
1.07M
    return v;
477
478
79.5k
 onError:
479
79.5k
    Py_XDECREF(args);
480
79.5k
    Py_XDECREF(decoder);
481
79.5k
    Py_XDECREF(result);
482
79.5k
    return NULL;
483
1.07M
}
484
485
/* Generic encoding/decoding API */
486
PyObject *PyCodec_Encode(PyObject *object,
487
                         const char *encoding,
488
                         const char *errors)
489
0
{
490
0
    PyObject *encoder;
491
492
0
    encoder = PyCodec_Encoder(encoding);
493
0
    if (encoder == NULL)
494
0
        return NULL;
495
496
0
    return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
497
0
}
498
499
PyObject *PyCodec_Decode(PyObject *object,
500
                         const char *encoding,
501
                         const char *errors)
502
0
{
503
0
    PyObject *decoder;
504
505
0
    decoder = PyCodec_Decoder(encoding);
506
0
    if (decoder == NULL)
507
0
        return NULL;
508
509
0
    return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
510
0
}
511
512
/* Text encoding/decoding API */
513
PyObject * _PyCodec_LookupTextEncoding(const char *encoding,
514
                                       const char *alternate_command)
515
2.02M
{
516
2.02M
    PyObject *codec;
517
2.02M
    PyObject *attr;
518
2.02M
    int is_text_codec;
519
520
2.02M
    codec = _PyCodec_Lookup(encoding);
521
2.02M
    if (codec == NULL)
522
11.3k
        return NULL;
523
524
    /* Backwards compatibility: assume any raw tuple describes a text
525
     * encoding, and the same for anything lacking the private
526
     * attribute.
527
     */
528
2.01M
    if (!PyTuple_CheckExact(codec)) {
529
2.01M
        if (PyObject_GetOptionalAttr(codec, &_Py_ID(_is_text_encoding), &attr) < 0) {
530
0
            Py_DECREF(codec);
531
0
            return NULL;
532
0
        }
533
2.01M
        if (attr != NULL) {
534
2.01M
            is_text_codec = PyObject_IsTrue(attr);
535
2.01M
            Py_DECREF(attr);
536
2.01M
            if (is_text_codec <= 0) {
537
3.10k
                Py_DECREF(codec);
538
3.10k
                if (!is_text_codec) {
539
3.10k
                    if (alternate_command != NULL) {
540
3.10k
                        PyErr_Format(PyExc_LookupError,
541
3.10k
                                     "'%.400s' is not a text encoding; "
542
3.10k
                                     "use %s to handle arbitrary codecs",
543
3.10k
                                     encoding, alternate_command);
544
3.10k
                    }
545
0
                    else {
546
0
                        PyErr_Format(PyExc_LookupError,
547
0
                                     "'%.400s' is not a text encoding",
548
0
                                     encoding);
549
0
                    }
550
3.10k
                }
551
3.10k
                return NULL;
552
3.10k
            }
553
2.01M
        }
554
2.01M
    }
555
556
    /* This appears to be a valid text encoding */
557
2.01M
    return codec;
558
2.01M
}
559
560
561
static
562
PyObject *codec_getitem_checked(const char *encoding,
563
                                const char *alternate_command,
564
                                int index)
565
2.02M
{
566
2.02M
    PyObject *codec;
567
2.02M
    PyObject *v;
568
569
2.02M
    codec = _PyCodec_LookupTextEncoding(encoding, alternate_command);
570
2.02M
    if (codec == NULL)
571
14.4k
        return NULL;
572
573
2.01M
    v = Py_NewRef(PyTuple_GET_ITEM(codec, index));
574
2.01M
    Py_DECREF(codec);
575
2.01M
    return v;
576
2.02M
}
577
578
static PyObject * _PyCodec_TextEncoder(const char *encoding)
579
855k
{
580
855k
    return codec_getitem_checked(encoding, "codecs.encode()", 0);
581
855k
}
582
583
static PyObject * _PyCodec_TextDecoder(const char *encoding)
584
1.17M
{
585
1.17M
    return codec_getitem_checked(encoding, "codecs.decode()", 1);
586
1.17M
}
587
588
PyObject *_PyCodec_EncodeText(PyObject *object,
589
                              const char *encoding,
590
                              const char *errors)
591
855k
{
592
855k
    PyObject *encoder;
593
594
855k
    encoder = _PyCodec_TextEncoder(encoding);
595
855k
    if (encoder == NULL)
596
0
        return NULL;
597
598
855k
    return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
599
855k
}
600
601
PyObject *_PyCodec_DecodeText(PyObject *object,
602
                              const char *encoding,
603
                              const char *errors)
604
1.17M
{
605
1.17M
    PyObject *decoder;
606
607
1.17M
    decoder = _PyCodec_TextDecoder(encoding);
608
1.17M
    if (decoder == NULL)
609
14.4k
        return NULL;
610
611
1.15M
    return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
612
1.17M
}
613
614
/* Register the error handling callback function error under the name
615
   name. This function will be called by the codec when it encounters
616
   an unencodable characters/undecodable bytes and doesn't know the
617
   callback name, when name is specified as the error parameter
618
   in the call to the encode/decode function.
619
   Return 0 on success, -1 on error */
620
int PyCodec_RegisterError(const char *name, PyObject *error)
621
0
{
622
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
623
0
    assert(interp->codecs.initialized);
624
0
    if (!PyCallable_Check(error)) {
625
0
        PyErr_SetString(PyExc_TypeError, "handler must be callable");
626
0
        return -1;
627
0
    }
628
0
    return PyDict_SetItemString(interp->codecs.error_registry,
629
0
                                name, error);
630
0
}
631
632
int _PyCodec_UnregisterError(const char *name)
633
0
{
634
0
    for (size_t i = 0; i < Py_ARRAY_LENGTH(codecs_builtin_error_handlers); ++i) {
635
0
        if (strcmp(name, codecs_builtin_error_handlers[i]) == 0) {
636
0
            PyErr_Format(PyExc_ValueError,
637
0
                         "cannot un-register built-in error handler '%s'", name);
638
0
            return -1;
639
0
        }
640
0
    }
641
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
642
0
    assert(interp->codecs.initialized);
643
0
    return PyDict_PopString(interp->codecs.error_registry, name, NULL);
644
0
}
645
646
/* Lookup the error handling callback function registered under the
647
   name error. As a special case NULL can be passed, in which case
648
   the error handling callback for strict encoding will be returned. */
649
PyObject *PyCodec_LookupError(const char *name)
650
2.53M
{
651
2.53M
    PyInterpreterState *interp = _PyInterpreterState_GET();
652
2.53M
    assert(interp->codecs.initialized);
653
654
2.53M
    if (name==NULL)
655
182k
        name = "strict";
656
2.53M
    PyObject *handler;
657
2.53M
    if (PyDict_GetItemStringRef(interp->codecs.error_registry, name, &handler) < 0) {
658
0
        return NULL;
659
0
    }
660
2.53M
    if (handler == NULL) {
661
0
        PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
662
0
        return NULL;
663
0
    }
664
2.53M
    return handler;
665
2.53M
}
666
667
668
static inline void
669
wrong_exception_type(PyObject *exc)
670
0
{
671
0
    PyErr_Format(PyExc_TypeError,
672
0
                 "don't know how to handle %T in error callback", exc);
673
0
}
674
675
676
#define _PyIsUnicodeEncodeError(EXC)    \
677
318k
    PyObject_TypeCheck(EXC, (PyTypeObject *)PyExc_UnicodeEncodeError)
678
#define _PyIsUnicodeDecodeError(EXC)    \
679
306k
    PyObject_TypeCheck(EXC, (PyTypeObject *)PyExc_UnicodeDecodeError)
680
#define _PyIsUnicodeTranslateError(EXC) \
681
0
    PyObject_TypeCheck(EXC, (PyTypeObject *)PyExc_UnicodeTranslateError)
682
683
684
// --- codecs handlers: utilities ---------------------------------------------
685
686
/*
687
 * Return the number of characters (including special prefixes)
688
 * needed to represent 'ch' by codec_handler_write_unicode_hex().
689
 */
690
static inline Py_ssize_t
691
codec_handler_unicode_hex_width(Py_UCS4 ch)
692
0
{
693
0
    if (ch >= 0x10000) {
694
        // format: '\\' + 'U' + 8 hex digits
695
0
        return 1 + 1 + 8;
696
0
    }
697
0
    else if (ch >= 0x100) {
698
        // format: '\\' + 'u' + 4 hex digits
699
0
        return 1 + 1 + 4;
700
0
    }
701
0
    else {
702
        // format: '\\' + 'x' + 2 hex digits
703
0
        return 1 + 1 + 2;
704
0
    }
705
0
}
706
707
708
/*
709
 * Write the hexadecimal representation of 'ch' to the buffer pointed by 'p'
710
 * using 2, 4, or 8 characters prefixed by '\x', '\u', or '\U' respectively.
711
 */
712
static inline void
713
codec_handler_write_unicode_hex(Py_UCS1 **p, Py_UCS4 ch)
714
0
{
715
0
    *(*p)++ = '\\';
716
0
    if (ch >= 0x10000) {
717
0
        *(*p)++ = 'U';
718
0
        *(*p)++ = Py_hexdigits[(ch >> 28) & 0xf];
719
0
        *(*p)++ = Py_hexdigits[(ch >> 24) & 0xf];
720
0
        *(*p)++ = Py_hexdigits[(ch >> 20) & 0xf];
721
0
        *(*p)++ = Py_hexdigits[(ch >> 16) & 0xf];
722
0
        *(*p)++ = Py_hexdigits[(ch >> 12) & 0xf];
723
0
        *(*p)++ = Py_hexdigits[(ch >> 8) & 0xf];
724
0
    }
725
0
    else if (ch >= 0x100) {
726
0
        *(*p)++ = 'u';
727
0
        *(*p)++ = Py_hexdigits[(ch >> 12) & 0xf];
728
0
        *(*p)++ = Py_hexdigits[(ch >> 8) & 0xf];
729
0
    }
730
0
    else {
731
0
        *(*p)++ = 'x';
732
0
    }
733
0
    *(*p)++ = Py_hexdigits[(ch >> 4) & 0xf];
734
0
    *(*p)++ = Py_hexdigits[ch & 0xf];
735
0
}
736
737
738
/*
739
 * Determine the number of digits for a decimal representation of Unicode
740
 * codepoint 'ch' (by design, Unicode codepoints are limited to 7 digits).
741
 */
742
static inline int
743
n_decimal_digits_for_codepoint(Py_UCS4 ch)
744
0
{
745
0
    if (ch < 10) return 1;
746
0
    if (ch < 100) return 2;
747
0
    if (ch < 1000) return 3;
748
0
    if (ch < 10000) return 4;
749
0
    if (ch < 100000) return 5;
750
0
    if (ch < 1000000) return 6;
751
0
    if (ch < 10000000) return 7;
752
    // Unicode codepoints are limited to 1114111 (7 decimal digits)
753
0
    Py_UNREACHABLE();
754
0
}
755
756
757
/*
758
 * Create a Unicode string containing 'count' copies of the official
759
 * Unicode REPLACEMENT CHARACTER (0xFFFD).
760
 */
761
static PyObject *
762
codec_handler_unicode_replacement_character(Py_ssize_t count)
763
228k
{
764
228k
    PyObject *res = PyUnicode_New(count, Py_UNICODE_REPLACEMENT_CHARACTER);
765
228k
    if (res == NULL) {
766
0
        return NULL;
767
0
    }
768
228k
    assert(count == 0 || PyUnicode_KIND(res) == PyUnicode_2BYTE_KIND);
769
228k
    Py_UCS2 *outp = PyUnicode_2BYTE_DATA(res);
770
456k
    for (Py_ssize_t i = 0; i < count; ++i) {
771
228k
        outp[i] = Py_UNICODE_REPLACEMENT_CHARACTER;
772
228k
    }
773
228k
    assert(_PyUnicode_CheckConsistency(res, 1));
774
228k
    return res;
775
228k
}
776
777
778
// --- handler: 'strict' ------------------------------------------------------
779
780
PyObject *PyCodec_StrictErrors(PyObject *exc)
781
3.02M
{
782
3.02M
    if (PyExceptionInstance_Check(exc)) {
783
3.02M
        PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
784
3.02M
    }
785
0
    else {
786
0
        PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
787
0
    }
788
3.02M
    return NULL;
789
3.02M
}
790
791
792
// --- handler: 'ignore' ------------------------------------------------------
793
794
static PyObject *
795
_PyCodec_IgnoreError(PyObject *exc, int as_bytes)
796
0
{
797
0
    Py_ssize_t end;
798
0
    if (_PyUnicodeError_GetParams(exc, NULL, NULL, NULL,
799
0
                                  &end, NULL, as_bytes) < 0)
800
0
    {
801
0
        return NULL;
802
0
    }
803
0
    return Py_BuildValue("(Nn)", Py_GetConstant(Py_CONSTANT_EMPTY_STR), end);
804
0
}
805
806
807
PyObject *PyCodec_IgnoreErrors(PyObject *exc)
808
0
{
809
0
    if (_PyIsUnicodeEncodeError(exc) || _PyIsUnicodeTranslateError(exc)) {
810
0
        return _PyCodec_IgnoreError(exc, false);
811
0
    }
812
0
    else if (_PyIsUnicodeDecodeError(exc)) {
813
0
        return _PyCodec_IgnoreError(exc, true);
814
0
    }
815
0
    else {
816
0
        wrong_exception_type(exc);
817
0
        return NULL;
818
0
    }
819
0
}
820
821
822
// --- handler: 'replace' -----------------------------------------------------
823
824
static PyObject *
825
_PyCodec_ReplaceUnicodeEncodeError(PyObject *exc)
826
0
{
827
0
    Py_ssize_t start, end, slen;
828
0
    if (_PyUnicodeError_GetParams(exc, NULL, NULL,
829
0
                                  &start, &end, &slen, false) < 0)
830
0
    {
831
0
        return NULL;
832
0
    }
833
0
    PyObject *res = PyUnicode_New(slen, '?');
834
0
    if (res == NULL) {
835
0
        return NULL;
836
0
    }
837
0
    assert(PyUnicode_KIND(res) == PyUnicode_1BYTE_KIND);
838
0
    Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
839
0
    memset(outp, '?', sizeof(Py_UCS1) * slen);
840
0
    assert(_PyUnicode_CheckConsistency(res, 1));
841
0
    return Py_BuildValue("(Nn)", res, end);
842
0
}
843
844
845
static PyObject *
846
_PyCodec_ReplaceUnicodeDecodeError(PyObject *exc)
847
228k
{
848
228k
    Py_ssize_t end;
849
228k
    if (PyUnicodeDecodeError_GetEnd(exc, &end) < 0) {
850
0
        return NULL;
851
0
    }
852
228k
    PyObject *res = codec_handler_unicode_replacement_character(1);
853
228k
    if (res == NULL) {
854
0
        return NULL;
855
0
    }
856
228k
    return Py_BuildValue("(Nn)", res, end);
857
228k
}
858
859
860
static PyObject *
861
_PyCodec_ReplaceUnicodeTranslateError(PyObject *exc)
862
0
{
863
0
    Py_ssize_t start, end, slen;
864
0
    if (_PyUnicodeError_GetParams(exc, NULL, NULL,
865
0
                                  &start, &end, &slen, false) < 0)
866
0
    {
867
0
        return NULL;
868
0
    }
869
0
    PyObject *res = codec_handler_unicode_replacement_character(slen);
870
0
    if (res == NULL) {
871
0
        return NULL;
872
0
    }
873
0
    return Py_BuildValue("(Nn)", res, end);
874
0
}
875
876
877
PyObject *PyCodec_ReplaceErrors(PyObject *exc)
878
228k
{
879
228k
    if (_PyIsUnicodeEncodeError(exc)) {
880
0
        return _PyCodec_ReplaceUnicodeEncodeError(exc);
881
0
    }
882
228k
    else if (_PyIsUnicodeDecodeError(exc)) {
883
228k
        return _PyCodec_ReplaceUnicodeDecodeError(exc);
884
228k
    }
885
0
    else if (_PyIsUnicodeTranslateError(exc)) {
886
0
        return _PyCodec_ReplaceUnicodeTranslateError(exc);
887
0
    }
888
0
    else {
889
0
        wrong_exception_type(exc);
890
0
        return NULL;
891
0
    }
892
228k
}
893
894
895
// --- handler: 'xmlcharrefreplace' -------------------------------------------
896
897
PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
898
0
{
899
0
    if (!_PyIsUnicodeEncodeError(exc)) {
900
0
        wrong_exception_type(exc);
901
0
        return NULL;
902
0
    }
903
904
0
    PyObject *obj;
905
0
    Py_ssize_t objlen, start, end, slen;
906
0
    if (_PyUnicodeError_GetParams(exc,
907
0
                                  &obj, &objlen,
908
0
                                  &start, &end, &slen, false) < 0)
909
0
    {
910
0
        return NULL;
911
0
    }
912
913
    // The number of characters that each character 'ch' contributes
914
    // in the result is 2 + k + 1, where k = min{t >= 1 | 10^t > ch}
915
    // and will be formatted as "&#" + DIGITS + ";". Since the Unicode
916
    // range is below 10^7, each "block" requires at most 2 + 7 + 1
917
    // characters.
918
0
    if (slen > PY_SSIZE_T_MAX / (2 + 7 + 1)) {
919
0
        end = start + PY_SSIZE_T_MAX / (2 + 7 + 1);
920
0
        end = Py_MIN(end, objlen);
921
0
        slen = Py_MAX(0, end - start);
922
0
    }
923
924
0
    Py_ssize_t ressize = 0;
925
0
    for (Py_ssize_t i = start; i < end; ++i) {
926
0
        Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
927
0
        int k = n_decimal_digits_for_codepoint(ch);
928
0
        assert(k != 0);
929
0
        assert(k <= 7);
930
0
        ressize += 2 + k + 1;
931
0
    }
932
933
    /* allocate replacement */
934
0
    PyObject *res = PyUnicode_New(ressize, 127);
935
0
    if (res == NULL) {
936
0
        Py_DECREF(obj);
937
0
        return NULL;
938
0
    }
939
0
    Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
940
    /* generate replacement */
941
0
    for (Py_ssize_t i = start; i < end; ++i) {
942
0
        Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
943
        /*
944
         * Write the decimal representation of 'ch' to the buffer pointed by 'p'
945
         * using at most 7 characters prefixed by '&#' and suffixed by ';'.
946
         */
947
0
        *outp++ = '&';
948
0
        *outp++ = '#';
949
0
        Py_UCS1 *digit_end = outp + n_decimal_digits_for_codepoint(ch);
950
0
        for (Py_UCS1 *p_digit = digit_end - 1; p_digit >= outp; --p_digit) {
951
0
            *p_digit = '0' + (ch % 10);
952
0
            ch /= 10;
953
0
        }
954
0
        assert(ch == 0);
955
0
        outp = digit_end;
956
0
        *outp++ = ';';
957
0
    }
958
0
    assert(_PyUnicode_CheckConsistency(res, 1));
959
0
    PyObject *restuple = Py_BuildValue("(Nn)", res, end);
960
0
    Py_DECREF(obj);
961
0
    return restuple;
962
0
}
963
964
965
// --- handler: 'backslashreplace' --------------------------------------------
966
967
static PyObject *
968
_PyCodec_BackslashReplaceUnicodeEncodeError(PyObject *exc)
969
0
{
970
0
    PyObject *obj;
971
0
    Py_ssize_t objlen, start, end, slen;
972
0
    if (_PyUnicodeError_GetParams(exc,
973
0
                                  &obj, &objlen,
974
0
                                  &start, &end, &slen, false) < 0)
975
0
    {
976
0
        return NULL;
977
0
    }
978
979
    // The number of characters that each character 'ch' contributes
980
    // in the result is 1 + 1 + k, where k >= min{t >= 1 | 16^t > ch}
981
    // and will be formatted as "\\" + ('U'|'u'|'x') + HEXDIGITS,
982
    // where the number of hexdigits is either 2, 4, or 8 (not 6).
983
    // Since the Unicode range is below 10^7, we choose k = 8 whence
984
    // each "block" requires at most 1 + 1 + 8 characters.
985
0
    if (slen > PY_SSIZE_T_MAX / (1 + 1 + 8)) {
986
0
        end = start + PY_SSIZE_T_MAX / (1 + 1 + 8);
987
0
        end = Py_MIN(end, objlen);
988
0
        slen = Py_MAX(0, end - start);
989
0
    }
990
991
0
    Py_ssize_t ressize = 0;
992
0
    for (Py_ssize_t i = start; i < end; ++i) {
993
0
        Py_UCS4 c = PyUnicode_READ_CHAR(obj, i);
994
0
        ressize += codec_handler_unicode_hex_width(c);
995
0
    }
996
0
    PyObject *res = PyUnicode_New(ressize, 127);
997
0
    if (res == NULL) {
998
0
        Py_DECREF(obj);
999
0
        return NULL;
1000
0
    }
1001
0
    Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
1002
0
    for (Py_ssize_t i = start; i < end; ++i) {
1003
0
        Py_UCS4 c = PyUnicode_READ_CHAR(obj, i);
1004
0
        codec_handler_write_unicode_hex(&outp, c);
1005
0
    }
1006
0
    assert(_PyUnicode_CheckConsistency(res, 1));
1007
0
    Py_DECREF(obj);
1008
0
    return Py_BuildValue("(Nn)", res, end);
1009
0
}
1010
1011
1012
static PyObject *
1013
_PyCodec_BackslashReplaceUnicodeDecodeError(PyObject *exc)
1014
0
{
1015
0
    PyObject *obj;
1016
0
    Py_ssize_t objlen, start, end, slen;
1017
0
    if (_PyUnicodeError_GetParams(exc,
1018
0
                                  &obj, &objlen,
1019
0
                                  &start, &end, &slen, true) < 0)
1020
0
    {
1021
0
        return NULL;
1022
0
    }
1023
1024
0
    PyObject *res = PyUnicode_New(4 * slen, 127);
1025
0
    if (res == NULL) {
1026
0
        Py_DECREF(obj);
1027
0
        return NULL;
1028
0
    }
1029
1030
0
    Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
1031
0
    const unsigned char *p = (const unsigned char *)PyBytes_AS_STRING(obj);
1032
0
    for (Py_ssize_t i = start; i < end; i++, outp += 4) {
1033
0
        const unsigned char ch = p[i];
1034
0
        outp[0] = '\\';
1035
0
        outp[1] = 'x';
1036
0
        outp[2] = Py_hexdigits[(ch >> 4) & 0xf];
1037
0
        outp[3] = Py_hexdigits[ch & 0xf];
1038
0
    }
1039
0
    assert(_PyUnicode_CheckConsistency(res, 1));
1040
0
    Py_DECREF(obj);
1041
0
    return Py_BuildValue("(Nn)", res, end);
1042
0
}
1043
1044
1045
static inline PyObject *
1046
_PyCodec_BackslashReplaceUnicodeTranslateError(PyObject *exc)
1047
0
{
1048
    // Same implementation as for UnicodeEncodeError objects.
1049
0
    return _PyCodec_BackslashReplaceUnicodeEncodeError(exc);
1050
0
}
1051
1052
1053
PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
1054
0
{
1055
0
    if (_PyIsUnicodeEncodeError(exc)) {
1056
0
        return _PyCodec_BackslashReplaceUnicodeEncodeError(exc);
1057
0
    }
1058
0
    else if (_PyIsUnicodeDecodeError(exc)) {
1059
0
        return _PyCodec_BackslashReplaceUnicodeDecodeError(exc);
1060
0
    }
1061
0
    else if (_PyIsUnicodeTranslateError(exc)) {
1062
0
        return _PyCodec_BackslashReplaceUnicodeTranslateError(exc);
1063
0
    }
1064
0
    else {
1065
0
        wrong_exception_type(exc);
1066
0
        return NULL;
1067
0
    }
1068
0
}
1069
1070
1071
// --- handler: 'namereplace' -------------------------------------------------
1072
1073
PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
1074
0
{
1075
0
    if (!_PyIsUnicodeEncodeError(exc)) {
1076
0
        wrong_exception_type(exc);
1077
0
        return NULL;
1078
0
    }
1079
1080
0
    _PyUnicode_Name_CAPI *ucnhash_capi = _PyUnicode_GetNameCAPI();
1081
0
    if (ucnhash_capi == NULL) {
1082
0
        return NULL;
1083
0
    }
1084
1085
0
    PyObject *obj;
1086
0
    Py_ssize_t start, end;
1087
0
    if (_PyUnicodeError_GetParams(exc,
1088
0
                                  &obj, NULL,
1089
0
                                  &start, &end, NULL, false) < 0)
1090
0
    {
1091
0
        return NULL;
1092
0
    }
1093
1094
0
    char buffer[256]; /* NAME_MAXLEN in unicodename_db.h */
1095
0
    Py_ssize_t imax = start, ressize = 0, replsize;
1096
0
    for (; imax < end; ++imax) {
1097
0
        Py_UCS4 c = PyUnicode_READ_CHAR(obj, imax);
1098
0
        if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) {
1099
            // If 'c' is recognized by getname(), the corresponding replacement
1100
            // is '\\' + 'N' + '{' + NAME + '}', i.e. 1 + 1 + 1 + len(NAME) + 1
1101
            // characters. Failures of getname() are ignored by the handler.
1102
0
            replsize = 1 + 1 + 1 + strlen(buffer) + 1;
1103
0
        }
1104
0
        else {
1105
0
            replsize = codec_handler_unicode_hex_width(c);
1106
0
        }
1107
0
        if (ressize > PY_SSIZE_T_MAX - replsize) {
1108
0
            break;
1109
0
        }
1110
0
        ressize += replsize;
1111
0
    }
1112
1113
0
    PyObject *res = PyUnicode_New(ressize, 127);
1114
0
    if (res == NULL) {
1115
0
        Py_DECREF(obj);
1116
0
        return NULL;
1117
0
    }
1118
1119
0
    Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
1120
0
    for (Py_ssize_t i = start; i < imax; ++i) {
1121
0
        Py_UCS4 c = PyUnicode_READ_CHAR(obj, i);
1122
0
        if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) {
1123
0
            *outp++ = '\\';
1124
0
            *outp++ = 'N';
1125
0
            *outp++ = '{';
1126
0
            (void)strcpy((char *)outp, buffer);
1127
0
            outp += strlen(buffer);
1128
0
            *outp++ = '}';
1129
0
        }
1130
0
        else {
1131
0
            codec_handler_write_unicode_hex(&outp, c);
1132
0
        }
1133
0
    }
1134
1135
0
    assert(outp == PyUnicode_1BYTE_DATA(res) + ressize);
1136
0
    assert(_PyUnicode_CheckConsistency(res, 1));
1137
0
    PyObject *restuple = Py_BuildValue("(Nn)", res, imax);
1138
0
    Py_DECREF(obj);
1139
0
    return restuple;
1140
0
}
1141
1142
1143
8
#define ENC_UNKNOWN     -1
1144
16
#define ENC_UTF8        0
1145
0
#define ENC_UTF16BE     1
1146
0
#define ENC_UTF16LE     2
1147
0
#define ENC_UTF32BE     3
1148
0
#define ENC_UTF32LE     4
1149
1150
static int
1151
get_standard_encoding_impl(const char *encoding, int *bytelength)
1152
8
{
1153
8
    if (Py_TOLOWER(encoding[0]) == 'u' &&
1154
8
        Py_TOLOWER(encoding[1]) == 't' &&
1155
8
        Py_TOLOWER(encoding[2]) == 'f') {
1156
8
        encoding += 3;
1157
8
        if (*encoding == '-' || *encoding == '_' )
1158
8
            encoding++;
1159
8
        if (encoding[0] == '8' && encoding[1] == '\0') {
1160
8
            *bytelength = 3;
1161
8
            return ENC_UTF8;
1162
8
        }
1163
0
        else if (encoding[0] == '1' && encoding[1] == '6') {
1164
0
            encoding += 2;
1165
0
            *bytelength = 2;
1166
0
            if (*encoding == '\0') {
1167
#ifdef WORDS_BIGENDIAN
1168
                return ENC_UTF16BE;
1169
#else
1170
0
                return ENC_UTF16LE;
1171
0
#endif
1172
0
            }
1173
0
            if (*encoding == '-' || *encoding == '_' )
1174
0
                encoding++;
1175
0
            if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
1176
0
                if (Py_TOLOWER(encoding[0]) == 'b')
1177
0
                    return ENC_UTF16BE;
1178
0
                if (Py_TOLOWER(encoding[0]) == 'l')
1179
0
                    return ENC_UTF16LE;
1180
0
            }
1181
0
        }
1182
0
        else if (encoding[0] == '3' && encoding[1] == '2') {
1183
0
            encoding += 2;
1184
0
            *bytelength = 4;
1185
0
            if (*encoding == '\0') {
1186
#ifdef WORDS_BIGENDIAN
1187
                return ENC_UTF32BE;
1188
#else
1189
0
                return ENC_UTF32LE;
1190
0
#endif
1191
0
            }
1192
0
            if (*encoding == '-' || *encoding == '_' )
1193
0
                encoding++;
1194
0
            if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
1195
0
                if (Py_TOLOWER(encoding[0]) == 'b')
1196
0
                    return ENC_UTF32BE;
1197
0
                if (Py_TOLOWER(encoding[0]) == 'l')
1198
0
                    return ENC_UTF32LE;
1199
0
            }
1200
0
        }
1201
8
    }
1202
0
    else if (strcmp(encoding, "cp65001") == 0) {
1203
0
        *bytelength = 3;
1204
0
        return ENC_UTF8;
1205
0
    }
1206
0
    return ENC_UNKNOWN;
1207
8
}
1208
1209
1210
static int
1211
get_standard_encoding(PyObject *encoding, int *code, int *bytelength)
1212
8
{
1213
8
    const char *encoding_cstr = PyUnicode_AsUTF8(encoding);
1214
8
    if (encoding_cstr == NULL) {
1215
0
        return -1;
1216
0
    }
1217
8
    *code = get_standard_encoding_impl(encoding_cstr, bytelength);
1218
8
    return 0;
1219
8
}
1220
1221
1222
// --- handler: 'surrogatepass' -----------------------------------------------
1223
1224
static PyObject *
1225
_PyCodec_SurrogatePassUnicodeEncodeError(PyObject *exc)
1226
0
{
1227
0
    PyObject *encoding = PyUnicodeEncodeError_GetEncoding(exc);
1228
0
    if (encoding == NULL) {
1229
0
        return NULL;
1230
0
    }
1231
0
    int code, bytelength;
1232
0
    int rc = get_standard_encoding(encoding, &code, &bytelength);
1233
0
    Py_DECREF(encoding);
1234
0
    if (rc < 0) {
1235
0
        return NULL;
1236
0
    }
1237
0
    if (code == ENC_UNKNOWN) {
1238
0
        goto bail;
1239
0
    }
1240
1241
0
    PyObject *obj;
1242
0
    Py_ssize_t objlen, start, end, slen;
1243
0
    if (_PyUnicodeError_GetParams(exc,
1244
0
                                  &obj, &objlen,
1245
0
                                  &start, &end, &slen, false) < 0)
1246
0
    {
1247
0
        return NULL;
1248
0
    }
1249
1250
0
    if (slen > PY_SSIZE_T_MAX / bytelength) {
1251
0
        end = start + PY_SSIZE_T_MAX / bytelength;
1252
0
        end = Py_MIN(end, objlen);
1253
0
        slen = Py_MAX(0, end - start);
1254
0
    }
1255
1256
0
    PyObject *res = PyBytes_FromStringAndSize(NULL, bytelength * slen);
1257
0
    if (res == NULL) {
1258
0
        Py_DECREF(obj);
1259
0
        return NULL;
1260
0
    }
1261
1262
0
    unsigned char *outp = (unsigned char *)PyBytes_AsString(res);
1263
0
    for (Py_ssize_t i = start; i < end; i++) {
1264
0
        Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
1265
0
        if (!Py_UNICODE_IS_SURROGATE(ch)) {
1266
            /* Not a surrogate, fail with original exception */
1267
0
            Py_DECREF(obj);
1268
0
            Py_DECREF(res);
1269
0
            goto bail;
1270
0
        }
1271
0
        switch (code) {
1272
0
            case ENC_UTF8: {
1273
0
                *outp++ = (unsigned char)(0xe0 | (ch >> 12));
1274
0
                *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
1275
0
                *outp++ = (unsigned char)(0x80 | (ch & 0x3f));
1276
0
                break;
1277
0
            }
1278
0
            case ENC_UTF16LE: {
1279
0
                *outp++ = (unsigned char)ch;
1280
0
                *outp++ = (unsigned char)(ch >> 8);
1281
0
                break;
1282
0
            }
1283
0
            case ENC_UTF16BE: {
1284
0
                *outp++ = (unsigned char)(ch >> 8);
1285
0
                *outp++ = (unsigned char)ch;
1286
0
                break;
1287
0
            }
1288
0
            case ENC_UTF32LE: {
1289
0
                *outp++ = (unsigned char)ch;
1290
0
                *outp++ = (unsigned char)(ch >> 8);
1291
0
                *outp++ = (unsigned char)(ch >> 16);
1292
0
                *outp++ = (unsigned char)(ch >> 24);
1293
0
                break;
1294
0
            }
1295
0
            case ENC_UTF32BE: {
1296
0
                *outp++ = (unsigned char)(ch >> 24);
1297
0
                *outp++ = (unsigned char)(ch >> 16);
1298
0
                *outp++ = (unsigned char)(ch >> 8);
1299
0
                *outp++ = (unsigned char)ch;
1300
0
                break;
1301
0
            }
1302
0
        }
1303
0
    }
1304
1305
0
    Py_DECREF(obj);
1306
0
    PyObject *restuple = Py_BuildValue("(Nn)", res, end);
1307
0
    return restuple;
1308
1309
0
bail:
1310
0
    PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1311
0
    return NULL;
1312
0
}
1313
1314
1315
static PyObject *
1316
_PyCodec_SurrogatePassUnicodeDecodeError(PyObject *exc)
1317
8
{
1318
8
    PyObject *encoding = PyUnicodeDecodeError_GetEncoding(exc);
1319
8
    if (encoding == NULL) {
1320
0
        return NULL;
1321
0
    }
1322
8
    int code, bytelength;
1323
8
    int rc = get_standard_encoding(encoding, &code, &bytelength);
1324
8
    Py_DECREF(encoding);
1325
8
    if (rc < 0) {
1326
0
        return NULL;
1327
0
    }
1328
8
    if (code == ENC_UNKNOWN) {
1329
0
        goto bail;
1330
0
    }
1331
1332
8
    PyObject *obj;
1333
8
    Py_ssize_t objlen, start, end, slen;
1334
8
    if (_PyUnicodeError_GetParams(exc,
1335
8
                                  &obj, &objlen,
1336
8
                                  &start, &end, &slen, true) < 0)
1337
0
    {
1338
0
        return NULL;
1339
0
    }
1340
1341
    /* Try decoding a single surrogate character. If
1342
       there are more, let the codec call us again. */
1343
8
    Py_UCS4 ch = 0;
1344
8
    const unsigned char *p = (const unsigned char *)PyBytes_AS_STRING(obj);
1345
8
    p += start;
1346
1347
8
    if (objlen - start >= bytelength) {
1348
8
        switch (code) {
1349
8
            case ENC_UTF8: {
1350
8
                if ((p[0] & 0xf0) == 0xe0 &&
1351
8
                    (p[1] & 0xc0) == 0x80 &&
1352
8
                    (p[2] & 0xc0) == 0x80)
1353
8
                {
1354
                    /* it's a three-byte code */
1355
8
                    ch = ((p[0] & 0x0f) << 12) +
1356
8
                         ((p[1] & 0x3f) << 6)  +
1357
8
                          (p[2] & 0x3f);
1358
8
                }
1359
8
                break;
1360
0
            }
1361
0
            case ENC_UTF16LE: {
1362
0
                ch = p[1] << 8 | p[0];
1363
0
                break;
1364
0
            }
1365
0
            case ENC_UTF16BE: {
1366
0
                ch = p[0] << 8 | p[1];
1367
0
                break;
1368
0
            }
1369
0
            case ENC_UTF32LE: {
1370
0
                ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
1371
0
                break;
1372
0
            }
1373
0
            case ENC_UTF32BE: {
1374
0
                ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
1375
0
                break;
1376
0
            }
1377
8
        }
1378
8
    }
1379
8
    Py_DECREF(obj);
1380
8
    if (!Py_UNICODE_IS_SURROGATE(ch)) {
1381
0
        goto bail;
1382
0
    }
1383
1384
8
    PyObject *res = PyUnicode_FromOrdinal(ch);
1385
8
    if (res == NULL) {
1386
0
        return NULL;
1387
0
    }
1388
8
    return Py_BuildValue("(Nn)", res, start + bytelength);
1389
1390
0
bail:
1391
0
    PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1392
0
    return NULL;
1393
8
}
1394
1395
1396
/* This handler is declared static until someone demonstrates
1397
   a need to call it directly. */
1398
static PyObject *
1399
PyCodec_SurrogatePassErrors(PyObject *exc)
1400
8
{
1401
8
    if (_PyIsUnicodeEncodeError(exc)) {
1402
0
        return _PyCodec_SurrogatePassUnicodeEncodeError(exc);
1403
0
    }
1404
8
    else if (_PyIsUnicodeDecodeError(exc)) {
1405
8
        return _PyCodec_SurrogatePassUnicodeDecodeError(exc);
1406
8
    }
1407
0
    else {
1408
0
        wrong_exception_type(exc);
1409
0
        return NULL;
1410
0
    }
1411
8
}
1412
1413
1414
// --- handler: 'surrogateescape' ---------------------------------------------
1415
1416
static PyObject *
1417
_PyCodec_SurrogateEscapeUnicodeEncodeError(PyObject *exc)
1418
12.8k
{
1419
12.8k
    PyObject *obj;
1420
12.8k
    Py_ssize_t start, end, slen;
1421
12.8k
    if (_PyUnicodeError_GetParams(exc,
1422
12.8k
                                  &obj, NULL,
1423
12.8k
                                  &start, &end, &slen, false) < 0)
1424
0
    {
1425
0
        return NULL;
1426
0
    }
1427
1428
12.8k
    PyObject *res = PyBytes_FromStringAndSize(NULL, slen);
1429
12.8k
    if (res == NULL) {
1430
0
        Py_DECREF(obj);
1431
0
        return NULL;
1432
0
    }
1433
1434
12.8k
    char *outp = PyBytes_AsString(res);
1435
12.8k
    for (Py_ssize_t i = start; i < end; i++) {
1436
12.8k
        Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
1437
12.8k
        if (ch < 0xdc80 || ch > 0xdcff) {
1438
            /* Not a UTF-8b surrogate, fail with original exception. */
1439
12.8k
            Py_DECREF(obj);
1440
12.8k
            Py_DECREF(res);
1441
12.8k
            PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1442
12.8k
            return NULL;
1443
12.8k
        }
1444
0
        *outp++ = ch - 0xdc00;
1445
0
    }
1446
0
    Py_DECREF(obj);
1447
1448
0
    return Py_BuildValue("(Nn)", res, end);
1449
12.8k
}
1450
1451
1452
static PyObject *
1453
_PyCodec_SurrogateEscapeUnicodeDecodeError(PyObject *exc)
1454
77.5k
{
1455
77.5k
    PyObject *obj;
1456
77.5k
    Py_ssize_t start, end, slen;
1457
77.5k
    if (_PyUnicodeError_GetParams(exc,
1458
77.5k
                                  &obj, NULL,
1459
77.5k
                                  &start, &end, &slen, true) < 0)
1460
0
    {
1461
0
        return NULL;
1462
0
    }
1463
1464
77.5k
    Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */
1465
77.5k
    int consumed = 0;
1466
77.5k
    const unsigned char *p = (const unsigned char *)PyBytes_AS_STRING(obj);
1467
173k
    while (consumed < 4 && consumed < slen) {
1468
        /* Refuse to escape ASCII bytes. */
1469
144k
        if (p[start + consumed] < 128) {
1470
48.6k
            break;
1471
48.6k
        }
1472
96.1k
        ch[consumed] = 0xdc00 + p[start + consumed];
1473
96.1k
        consumed++;
1474
96.1k
    }
1475
77.5k
    Py_DECREF(obj);
1476
1477
77.5k
    if (consumed == 0) {
1478
        /* Codec complained about ASCII byte. */
1479
27.0k
        PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1480
27.0k
        return NULL;
1481
27.0k
    }
1482
1483
50.5k
    PyObject *str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed);
1484
50.5k
    if (str == NULL) {
1485
0
        return NULL;
1486
0
    }
1487
50.5k
    return Py_BuildValue("(Nn)", str, start + consumed);
1488
50.5k
}
1489
1490
1491
static PyObject *
1492
PyCodec_SurrogateEscapeErrors(PyObject *exc)
1493
90.4k
{
1494
90.4k
    if (_PyIsUnicodeEncodeError(exc)) {
1495
12.8k
        return _PyCodec_SurrogateEscapeUnicodeEncodeError(exc);
1496
12.8k
    }
1497
77.5k
    else if (_PyIsUnicodeDecodeError(exc)) {
1498
77.5k
        return _PyCodec_SurrogateEscapeUnicodeDecodeError(exc);
1499
77.5k
    }
1500
0
    else {
1501
0
        wrong_exception_type(exc);
1502
0
        return NULL;
1503
0
    }
1504
90.4k
}
1505
1506
1507
// --- Codecs registry handlers -----------------------------------------------
1508
1509
static inline PyObject *
1510
strict_errors(PyObject *Py_UNUSED(self), PyObject *exc)
1511
2.45M
{
1512
2.45M
    return PyCodec_StrictErrors(exc);
1513
2.45M
}
1514
1515
1516
static inline PyObject *
1517
ignore_errors(PyObject *Py_UNUSED(self), PyObject *exc)
1518
0
{
1519
0
    return PyCodec_IgnoreErrors(exc);
1520
0
}
1521
1522
1523
static inline PyObject *
1524
replace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
1525
228k
{
1526
228k
    return PyCodec_ReplaceErrors(exc);
1527
228k
}
1528
1529
1530
static inline PyObject *
1531
xmlcharrefreplace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
1532
0
{
1533
0
    return PyCodec_XMLCharRefReplaceErrors(exc);
1534
0
}
1535
1536
1537
static inline PyObject *
1538
backslashreplace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
1539
0
{
1540
0
    return PyCodec_BackslashReplaceErrors(exc);
1541
0
}
1542
1543
1544
static inline PyObject *
1545
namereplace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
1546
0
{
1547
0
    return PyCodec_NameReplaceErrors(exc);
1548
0
}
1549
1550
1551
static inline PyObject *
1552
surrogatepass_errors(PyObject *Py_UNUSED(self), PyObject *exc)
1553
8
{
1554
8
    return PyCodec_SurrogatePassErrors(exc);
1555
8
}
1556
1557
1558
static inline PyObject *
1559
surrogateescape_errors(PyObject *Py_UNUSED(self), PyObject *exc)
1560
90.4k
{
1561
90.4k
    return PyCodec_SurrogateEscapeErrors(exc);
1562
90.4k
}
1563
1564
1565
PyStatus
1566
_PyCodec_InitRegistry(PyInterpreterState *interp)
1567
37
{
1568
37
    static struct {
1569
37
        const char *name;
1570
37
        PyMethodDef def;
1571
37
    } methods[] =
1572
37
    {
1573
37
        {
1574
37
            "strict",
1575
37
            {
1576
37
                "strict_errors",
1577
37
                strict_errors,
1578
37
                METH_O,
1579
37
                PyDoc_STR("Implements the 'strict' error handling, which "
1580
37
                          "raises a UnicodeError on coding errors.")
1581
37
            }
1582
37
        },
1583
37
        {
1584
37
            "ignore",
1585
37
            {
1586
37
                "ignore_errors",
1587
37
                ignore_errors,
1588
37
                METH_O,
1589
37
                PyDoc_STR("Implements the 'ignore' error handling, which "
1590
37
                          "ignores malformed data and continues.")
1591
37
            }
1592
37
        },
1593
37
        {
1594
37
            "replace",
1595
37
            {
1596
37
                "replace_errors",
1597
37
                replace_errors,
1598
37
                METH_O,
1599
37
                PyDoc_STR("Implements the 'replace' error handling, which "
1600
37
                          "replaces malformed data with a replacement marker.")
1601
37
            }
1602
37
        },
1603
37
        {
1604
37
            "xmlcharrefreplace",
1605
37
            {
1606
37
                "xmlcharrefreplace_errors",
1607
37
                xmlcharrefreplace_errors,
1608
37
                METH_O,
1609
37
                PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
1610
37
                          "which replaces an unencodable character with the "
1611
37
                          "appropriate XML character reference.")
1612
37
            }
1613
37
        },
1614
37
        {
1615
37
            "backslashreplace",
1616
37
            {
1617
37
                "backslashreplace_errors",
1618
37
                backslashreplace_errors,
1619
37
                METH_O,
1620
37
                PyDoc_STR("Implements the 'backslashreplace' error handling, "
1621
37
                          "which replaces malformed data with a backslashed "
1622
37
                          "escape sequence.")
1623
37
            }
1624
37
        },
1625
37
        {
1626
37
            "namereplace",
1627
37
            {
1628
37
                "namereplace_errors",
1629
37
                namereplace_errors,
1630
37
                METH_O,
1631
37
                PyDoc_STR("Implements the 'namereplace' error handling, "
1632
37
                          "which replaces an unencodable character with a "
1633
37
                          "\\N{...} escape sequence.")
1634
37
            }
1635
37
        },
1636
37
        {
1637
37
            "surrogatepass",
1638
37
            {
1639
37
                "surrogatepass",
1640
37
                surrogatepass_errors,
1641
37
                METH_O
1642
37
            }
1643
37
        },
1644
37
        {
1645
37
            "surrogateescape",
1646
37
            {
1647
37
                "surrogateescape",
1648
37
                surrogateescape_errors,
1649
37
                METH_O
1650
37
            }
1651
37
        }
1652
37
    };
1653
    // ensure that the built-in error handlers' names are kept in sync
1654
37
    assert(Py_ARRAY_LENGTH(methods) == Py_ARRAY_LENGTH(codecs_builtin_error_handlers));
1655
1656
37
    assert(interp->codecs.initialized == 0);
1657
37
    interp->codecs.search_path = PyList_New(0);
1658
37
    if (interp->codecs.search_path == NULL) {
1659
0
        return PyStatus_NoMemory();
1660
0
    }
1661
37
    interp->codecs.search_cache = PyDict_New();
1662
37
    if (interp->codecs.search_cache == NULL) {
1663
0
        return PyStatus_NoMemory();
1664
0
    }
1665
37
    interp->codecs.error_registry = PyDict_New();
1666
37
    if (interp->codecs.error_registry == NULL) {
1667
0
        return PyStatus_NoMemory();
1668
0
    }
1669
333
    for (size_t i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
1670
296
        PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL);
1671
296
        if (func == NULL) {
1672
0
            return PyStatus_NoMemory();
1673
0
        }
1674
1675
296
        int res = PyDict_SetItemString(interp->codecs.error_registry,
1676
296
                                       methods[i].name, func);
1677
296
        Py_DECREF(func);
1678
296
        if (res < 0) {
1679
0
            return PyStatus_Error("Failed to insert into codec error registry");
1680
0
        }
1681
296
    }
1682
1683
37
    interp->codecs.initialized = 1;
1684
1685
    // Importing `encodings' will call back into this module to register codec
1686
    // search functions, so this is done after everything else is initialized.
1687
37
    PyObject *mod = PyImport_ImportModule("encodings");
1688
37
    if (mod == NULL) {
1689
0
        return PyStatus_Error("Failed to import encodings module");
1690
0
    }
1691
37
    Py_DECREF(mod);
1692
1693
37
    return PyStatus_Ok();
1694
37
}
1695
1696
void
1697
_PyCodec_Fini(PyInterpreterState *interp)
1698
0
{
1699
0
    Py_CLEAR(interp->codecs.search_path);
1700
0
    Py_CLEAR(interp->codecs.search_cache);
1701
    Py_CLEAR(interp->codecs.error_registry);
1702
0
    interp->codecs.initialized = 0;
1703
0
}