Coverage Report

Created: 2025-07-11 06:24

/src/cpython/Python/codecs.c
Line
Count
Source (jump to first uncovered line)
1
/* ------------------------------------------------------------------------
2
3
   Python Codec Registry and support functions
4
5
Written by Marc-Andre Lemburg (mal@lemburg.com).
6
7
Copyright (c) Corporation for National Research Initiatives.
8
9
   ------------------------------------------------------------------------ */
10
11
#include "Python.h"
12
#include "pycore_call.h"          // _PyObject_CallNoArgs()
13
#include "pycore_interp.h"        // PyInterpreterState.codec_search_path
14
#include "pycore_pyerrors.h"      // _PyErr_FormatNote()
15
#include "pycore_pystate.h"       // _PyInterpreterState_GET()
16
#include "pycore_runtime.h"       // _Py_ID()
17
#include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
18
#include "pycore_unicodeobject.h" // _PyUnicode_InternMortal()
19
20
21
static const char *codecs_builtin_error_handlers[] = {
22
    "strict", "ignore", "replace",
23
    "xmlcharrefreplace", "backslashreplace", "namereplace",
24
    "surrogatepass", "surrogateescape",
25
};
26
27
const char *Py_hexdigits = "0123456789abcdef";
28
29
/* --- Codec Registry ----------------------------------------------------- */
30
31
int PyCodec_Register(PyObject *search_function)
32
16
{
33
16
    PyInterpreterState *interp = _PyInterpreterState_GET();
34
16
    assert(interp->codecs.initialized);
35
16
    if (search_function == NULL) {
36
0
        PyErr_BadArgument();
37
0
        goto onError;
38
0
    }
39
16
    if (!PyCallable_Check(search_function)) {
40
0
        PyErr_SetString(PyExc_TypeError, "argument must be callable");
41
0
        goto onError;
42
0
    }
43
#ifdef Py_GIL_DISABLED
44
    PyMutex_Lock(&interp->codecs.search_path_mutex);
45
#endif
46
16
    int ret = PyList_Append(interp->codecs.search_path, search_function);
47
#ifdef Py_GIL_DISABLED
48
    PyMutex_Unlock(&interp->codecs.search_path_mutex);
49
#endif
50
16
    return ret;
51
52
0
 onError:
53
0
    return -1;
54
16
}
55
56
int
57
PyCodec_Unregister(PyObject *search_function)
58
0
{
59
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
60
0
    if (interp->codecs.initialized != 1) {
61
        /* Do nothing if codecs state was cleared (only possible during
62
           interpreter shutdown). */
63
0
        return 0;
64
0
    }
65
66
0
    PyObject *codec_search_path = interp->codecs.search_path;
67
0
    assert(PyList_CheckExact(codec_search_path));
68
0
    for (Py_ssize_t i = 0; i < PyList_GET_SIZE(codec_search_path); i++) {
69
#ifdef Py_GIL_DISABLED
70
        PyMutex_Lock(&interp->codecs.search_path_mutex);
71
#endif
72
0
        PyObject *item = PyList_GetItemRef(codec_search_path, i);
73
0
        int ret = 1;
74
0
        if (item == search_function) {
75
            // We hold a reference to the item, so its destructor can't run
76
            // while we hold search_path_mutex.
77
0
            ret = PyList_SetSlice(codec_search_path, i, i+1, NULL);
78
0
        }
79
#ifdef Py_GIL_DISABLED
80
        PyMutex_Unlock(&interp->codecs.search_path_mutex);
81
#endif
82
0
        Py_DECREF(item);
83
0
        if (ret != 1) {
84
0
            assert(interp->codecs.search_cache != NULL);
85
0
            assert(PyDict_CheckExact(interp->codecs.search_cache));
86
0
            PyDict_Clear(interp->codecs.search_cache);
87
0
            return ret;
88
0
        }
89
0
    }
90
0
    return 0;
91
0
}
92
93
extern int _Py_normalize_encoding(const char *, char *, size_t);
94
95
/* Convert a string to a normalized Python string(decoded from UTF-8): all characters are
96
   converted to lower case, spaces and hyphens are replaced with underscores. */
97
98
static
99
PyObject *normalizestring(const char *string)
100
1.00M
{
101
1.00M
    size_t len = strlen(string);
102
1.00M
    char *encoding;
103
1.00M
    PyObject *v;
104
105
1.00M
    if (len > PY_SSIZE_T_MAX) {
106
0
        PyErr_SetString(PyExc_OverflowError, "string is too large");
107
0
        return NULL;
108
0
    }
109
110
1.00M
    encoding = PyMem_Malloc(len + 1);
111
1.00M
    if (encoding == NULL)
112
0
        return PyErr_NoMemory();
113
114
1.00M
    if (!_Py_normalize_encoding(string, encoding, len + 1))
115
0
    {
116
0
        PyErr_SetString(PyExc_RuntimeError, "_Py_normalize_encoding() failed");
117
0
        PyMem_Free(encoding);
118
0
        return NULL;
119
0
    }
120
121
1.00M
    v = PyUnicode_FromString(encoding);
122
1.00M
    PyMem_Free(encoding);
123
1.00M
    return v;
124
1.00M
}
125
126
/* Lookup the given encoding and return a tuple providing the codec
127
   facilities.
128
129
   The encoding string is looked up converted to all lower-case
130
   characters. This makes encodings looked up through this mechanism
131
   effectively case-insensitive.
132
133
   If no codec is found, a LookupError is set and NULL returned.
134
135
   As side effect, this tries to load the encodings package, if not
136
   yet done. This is part of the lazy load strategy for the encodings
137
   package.
138
139
*/
140
141
PyObject *_PyCodec_Lookup(const char *encoding)
142
1.00M
{
143
1.00M
    if (encoding == NULL) {
144
0
        PyErr_BadArgument();
145
0
        return NULL;
146
0
    }
147
148
1.00M
    PyInterpreterState *interp = _PyInterpreterState_GET();
149
1.00M
    assert(interp->codecs.initialized);
150
151
    /* Convert the encoding to a normalized Python string: all
152
       characters are converted to lower case, spaces and hyphens are
153
       replaced with underscores. */
154
1.00M
    PyObject *v = normalizestring(encoding);
155
1.00M
    if (v == NULL) {
156
0
        return NULL;
157
0
    }
158
159
    /* Intern the string. We'll make it immortal later if lookup succeeds. */
160
1.00M
    _PyUnicode_InternMortal(interp, &v);
161
162
    /* First, try to lookup the name in the registry dictionary */
163
1.00M
    PyObject *result;
164
1.00M
    if (PyDict_GetItemRef(interp->codecs.search_cache, v, &result) < 0) {
165
0
        goto onError;
166
0
    }
167
1.00M
    if (result != NULL) {
168
924k
        Py_DECREF(v);
169
924k
        return result;
170
924k
    }
171
172
    /* Next, scan the search functions in order of registration */
173
78.6k
    const Py_ssize_t len = PyList_Size(interp->codecs.search_path);
174
78.6k
    if (len < 0)
175
0
        goto onError;
176
78.6k
    if (len == 0) {
177
0
        PyErr_SetString(PyExc_LookupError,
178
0
                        "no codec search functions registered: "
179
0
                        "can't find encoding");
180
0
        goto onError;
181
0
    }
182
183
78.6k
    Py_ssize_t i;
184
156k
    for (i = 0; i < len; i++) {
185
78.6k
        PyObject *func;
186
187
78.6k
        func = PyList_GetItemRef(interp->codecs.search_path, i);
188
78.6k
        if (func == NULL)
189
0
            goto onError;
190
78.6k
        result = PyObject_CallOneArg(func, v);
191
78.6k
        Py_DECREF(func);
192
78.6k
        if (result == NULL)
193
0
            goto onError;
194
78.6k
        if (result == Py_None) {
195
78.2k
            Py_CLEAR(result);
196
78.2k
            continue;
197
78.2k
        }
198
461
        if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
199
0
            PyErr_SetString(PyExc_TypeError,
200
0
                            "codec search functions must return 4-tuples");
201
0
            Py_DECREF(result);
202
0
            goto onError;
203
0
        }
204
461
        break;
205
461
    }
206
78.6k
    if (result == NULL) {
207
        /* XXX Perhaps we should cache misses too ? */
208
78.2k
        PyErr_Format(PyExc_LookupError,
209
78.2k
                     "unknown encoding: %s", encoding);
210
78.2k
        goto onError;
211
78.2k
    }
212
213
461
    _PyUnicode_InternImmortal(interp, &v);
214
215
    /* Cache and return the result */
216
461
    if (PyDict_SetItem(interp->codecs.search_cache, v, result) < 0) {
217
0
        Py_DECREF(result);
218
0
        goto onError;
219
0
    }
220
461
    Py_DECREF(v);
221
461
    return result;
222
223
78.2k
 onError:
224
78.2k
    Py_DECREF(v);
225
78.2k
    return NULL;
226
461
}
227
228
/* Codec registry encoding check API. */
229
230
int PyCodec_KnownEncoding(const char *encoding)
231
0
{
232
0
    PyObject *codecs;
233
234
0
    codecs = _PyCodec_Lookup(encoding);
235
0
    if (!codecs) {
236
0
        PyErr_Clear();
237
0
        return 0;
238
0
    }
239
0
    else {
240
0
        Py_DECREF(codecs);
241
0
        return 1;
242
0
    }
243
0
}
244
245
static
246
PyObject *args_tuple(PyObject *object,
247
                     const char *errors)
248
921k
{
249
921k
    PyObject *args;
250
251
921k
    args = PyTuple_New(1 + (errors != NULL));
252
921k
    if (args == NULL)
253
0
        return NULL;
254
921k
    PyTuple_SET_ITEM(args, 0, Py_NewRef(object));
255
921k
    if (errors) {
256
141k
        PyObject *v;
257
258
141k
        v = PyUnicode_FromString(errors);
259
141k
        if (v == NULL) {
260
0
            Py_DECREF(args);
261
0
            return NULL;
262
0
        }
263
141k
        PyTuple_SET_ITEM(args, 1, v);
264
141k
    }
265
921k
    return args;
266
921k
}
267
268
/* Helper function to get a codec item */
269
270
static
271
PyObject *codec_getitem(const char *encoding, int index)
272
0
{
273
0
    PyObject *codecs;
274
0
    PyObject *v;
275
276
0
    codecs = _PyCodec_Lookup(encoding);
277
0
    if (codecs == NULL)
278
0
        return NULL;
279
0
    v = PyTuple_GET_ITEM(codecs, index);
280
0
    Py_DECREF(codecs);
281
0
    return Py_NewRef(v);
282
0
}
283
284
/* Helper functions to create an incremental codec. */
285
static
286
PyObject *codec_makeincrementalcodec(PyObject *codec_info,
287
                                     const char *errors,
288
                                     const char *attrname)
289
48
{
290
48
    PyObject *ret, *inccodec;
291
292
48
    inccodec = PyObject_GetAttrString(codec_info, attrname);
293
48
    if (inccodec == NULL)
294
0
        return NULL;
295
48
    if (errors)
296
48
        ret = PyObject_CallFunction(inccodec, "s", errors);
297
0
    else
298
0
        ret = _PyObject_CallNoArgs(inccodec);
299
48
    Py_DECREF(inccodec);
300
48
    return ret;
301
48
}
302
303
static
304
PyObject *codec_getincrementalcodec(const char *encoding,
305
                                    const char *errors,
306
                                    const char *attrname)
307
0
{
308
0
    PyObject *codec_info, *ret;
309
310
0
    codec_info = _PyCodec_Lookup(encoding);
311
0
    if (codec_info == NULL)
312
0
        return NULL;
313
0
    ret = codec_makeincrementalcodec(codec_info, errors, attrname);
314
0
    Py_DECREF(codec_info);
315
0
    return ret;
316
0
}
317
318
/* Helper function to create a stream codec. */
319
320
static
321
PyObject *codec_getstreamcodec(const char *encoding,
322
                               PyObject *stream,
323
                               const char *errors,
324
                               const int index)
325
0
{
326
0
    PyObject *codecs, *streamcodec, *codeccls;
327
328
0
    codecs = _PyCodec_Lookup(encoding);
329
0
    if (codecs == NULL)
330
0
        return NULL;
331
332
0
    codeccls = PyTuple_GET_ITEM(codecs, index);
333
0
    if (errors != NULL)
334
0
        streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
335
0
    else
336
0
        streamcodec = PyObject_CallOneArg(codeccls, stream);
337
0
    Py_DECREF(codecs);
338
0
    return streamcodec;
339
0
}
340
341
/* Helpers to work with the result of _PyCodec_Lookup
342
343
 */
344
PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info,
345
                                             const char *errors)
346
16
{
347
16
    return codec_makeincrementalcodec(codec_info, errors,
348
16
                                      "incrementaldecoder");
349
16
}
350
351
PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info,
352
                                             const char *errors)
353
32
{
354
32
    return codec_makeincrementalcodec(codec_info, errors,
355
32
                                      "incrementalencoder");
356
32
}
357
358
359
/* Convenience APIs to query the Codec registry.
360
361
   All APIs return a codec object with incremented refcount.
362
363
 */
364
365
PyObject *PyCodec_Encoder(const char *encoding)
366
0
{
367
0
    return codec_getitem(encoding, 0);
368
0
}
369
370
PyObject *PyCodec_Decoder(const char *encoding)
371
0
{
372
0
    return codec_getitem(encoding, 1);
373
0
}
374
375
PyObject *PyCodec_IncrementalEncoder(const char *encoding,
376
                                     const char *errors)
377
0
{
378
0
    return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
379
0
}
380
381
PyObject *PyCodec_IncrementalDecoder(const char *encoding,
382
                                     const char *errors)
383
0
{
384
0
    return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
385
0
}
386
387
PyObject *PyCodec_StreamReader(const char *encoding,
388
                               PyObject *stream,
389
                               const char *errors)
390
0
{
391
0
    return codec_getstreamcodec(encoding, stream, errors, 2);
392
0
}
393
394
PyObject *PyCodec_StreamWriter(const char *encoding,
395
                               PyObject *stream,
396
                               const char *errors)
397
0
{
398
0
    return codec_getstreamcodec(encoding, stream, errors, 3);
399
0
}
400
401
/* Encode an object (e.g. a Unicode object) using the given encoding
402
   and return the resulting encoded object (usually a Python string).
403
404
   errors is passed to the encoder factory as argument if non-NULL. */
405
406
static PyObject *
407
_PyCodec_EncodeInternal(PyObject *object,
408
                        PyObject *encoder,
409
                        const char *encoding,
410
                        const char *errors)
411
705k
{
412
705k
    PyObject *args = NULL, *result = NULL;
413
705k
    PyObject *v = NULL;
414
415
705k
    args = args_tuple(object, errors);
416
705k
    if (args == NULL)
417
0
        goto onError;
418
419
705k
    result = PyObject_Call(encoder, args, NULL);
420
705k
    if (result == NULL) {
421
0
        _PyErr_FormatNote("%s with '%s' codec failed", "encoding", encoding);
422
0
        goto onError;
423
0
    }
424
425
705k
    if (!PyTuple_Check(result) ||
426
705k
        PyTuple_GET_SIZE(result) != 2) {
427
0
        PyErr_SetString(PyExc_TypeError,
428
0
                        "encoder must return a tuple (object, integer)");
429
0
        goto onError;
430
0
    }
431
705k
    v = Py_NewRef(PyTuple_GET_ITEM(result,0));
432
    /* We don't check or use the second (integer) entry. */
433
434
705k
    Py_DECREF(args);
435
705k
    Py_DECREF(encoder);
436
705k
    Py_DECREF(result);
437
705k
    return v;
438
439
0
 onError:
440
0
    Py_XDECREF(result);
441
0
    Py_XDECREF(args);
442
0
    Py_XDECREF(encoder);
443
0
    return NULL;
444
705k
}
445
446
/* Decode an object (usually a Python string) using the given encoding
447
   and return an equivalent object (e.g. a Unicode object).
448
449
   errors is passed to the decoder factory as argument if non-NULL. */
450
451
static PyObject *
452
_PyCodec_DecodeInternal(PyObject *object,
453
                        PyObject *decoder,
454
                        const char *encoding,
455
                        const char *errors)
456
215k
{
457
215k
    PyObject *args = NULL, *result = NULL;
458
215k
    PyObject *v;
459
460
215k
    args = args_tuple(object, errors);
461
215k
    if (args == NULL)
462
0
        goto onError;
463
464
215k
    result = PyObject_Call(decoder, args, NULL);
465
215k
    if (result == NULL) {
466
62.3k
        _PyErr_FormatNote("%s with '%s' codec failed", "decoding", encoding);
467
62.3k
        goto onError;
468
62.3k
    }
469
153k
    if (!PyTuple_Check(result) ||
470
153k
        PyTuple_GET_SIZE(result) != 2) {
471
0
        PyErr_SetString(PyExc_TypeError,
472
0
                        "decoder must return a tuple (object,integer)");
473
0
        goto onError;
474
0
    }
475
153k
    v = Py_NewRef(PyTuple_GET_ITEM(result,0));
476
    /* We don't check or use the second (integer) entry. */
477
478
153k
    Py_DECREF(args);
479
153k
    Py_DECREF(decoder);
480
153k
    Py_DECREF(result);
481
153k
    return v;
482
483
62.3k
 onError:
484
62.3k
    Py_XDECREF(args);
485
62.3k
    Py_XDECREF(decoder);
486
62.3k
    Py_XDECREF(result);
487
62.3k
    return NULL;
488
153k
}
489
490
/* Generic encoding/decoding API */
491
PyObject *PyCodec_Encode(PyObject *object,
492
                         const char *encoding,
493
                         const char *errors)
494
0
{
495
0
    PyObject *encoder;
496
497
0
    encoder = PyCodec_Encoder(encoding);
498
0
    if (encoder == NULL)
499
0
        return NULL;
500
501
0
    return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
502
0
}
503
504
PyObject *PyCodec_Decode(PyObject *object,
505
                         const char *encoding,
506
                         const char *errors)
507
0
{
508
0
    PyObject *decoder;
509
510
0
    decoder = PyCodec_Decoder(encoding);
511
0
    if (decoder == NULL)
512
0
        return NULL;
513
514
0
    return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
515
0
}
516
517
/* Text encoding/decoding API */
518
PyObject * _PyCodec_LookupTextEncoding(const char *encoding,
519
                                       const char *alternate_command)
520
1.00M
{
521
1.00M
    PyObject *codec;
522
1.00M
    PyObject *attr;
523
1.00M
    int is_text_codec;
524
525
1.00M
    codec = _PyCodec_Lookup(encoding);
526
1.00M
    if (codec == NULL)
527
78.2k
        return NULL;
528
529
    /* Backwards compatibility: assume any raw tuple describes a text
530
     * encoding, and the same for anything lacking the private
531
     * attribute.
532
     */
533
924k
    if (!PyTuple_CheckExact(codec)) {
534
924k
        if (PyObject_GetOptionalAttr(codec, &_Py_ID(_is_text_encoding), &attr) < 0) {
535
0
            Py_DECREF(codec);
536
0
            return NULL;
537
0
        }
538
924k
        if (attr != NULL) {
539
924k
            is_text_codec = PyObject_IsTrue(attr);
540
924k
            Py_DECREF(attr);
541
924k
            if (is_text_codec <= 0) {
542
3.08k
                Py_DECREF(codec);
543
3.08k
                if (!is_text_codec) {
544
3.08k
                    if (alternate_command != NULL) {
545
3.08k
                        PyErr_Format(PyExc_LookupError,
546
3.08k
                                     "'%.400s' is not a text encoding; "
547
3.08k
                                     "use %s to handle arbitrary codecs",
548
3.08k
                                     encoding, alternate_command);
549
3.08k
                    }
550
0
                    else {
551
0
                        PyErr_Format(PyExc_LookupError,
552
0
                                     "'%.400s' is not a text encoding",
553
0
                                     encoding);
554
0
                    }
555
3.08k
                }
556
3.08k
                return NULL;
557
3.08k
            }
558
924k
        }
559
924k
    }
560
561
    /* This appears to be a valid text encoding */
562
921k
    return codec;
563
924k
}
564
565
566
static
567
PyObject *codec_getitem_checked(const char *encoding,
568
                                const char *alternate_command,
569
                                int index)
570
1.00M
{
571
1.00M
    PyObject *codec;
572
1.00M
    PyObject *v;
573
574
1.00M
    codec = _PyCodec_LookupTextEncoding(encoding, alternate_command);
575
1.00M
    if (codec == NULL)
576
81.3k
        return NULL;
577
578
921k
    v = Py_NewRef(PyTuple_GET_ITEM(codec, index));
579
921k
    Py_DECREF(codec);
580
921k
    return v;
581
1.00M
}
582
583
static PyObject * _PyCodec_TextEncoder(const char *encoding)
584
705k
{
585
705k
    return codec_getitem_checked(encoding, "codecs.encode()", 0);
586
705k
}
587
588
static PyObject * _PyCodec_TextDecoder(const char *encoding)
589
297k
{
590
297k
    return codec_getitem_checked(encoding, "codecs.decode()", 1);
591
297k
}
592
593
PyObject *_PyCodec_EncodeText(PyObject *object,
594
                              const char *encoding,
595
                              const char *errors)
596
705k
{
597
705k
    PyObject *encoder;
598
599
705k
    encoder = _PyCodec_TextEncoder(encoding);
600
705k
    if (encoder == NULL)
601
0
        return NULL;
602
603
705k
    return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
604
705k
}
605
606
PyObject *_PyCodec_DecodeText(PyObject *object,
607
                              const char *encoding,
608
                              const char *errors)
609
297k
{
610
297k
    PyObject *decoder;
611
612
297k
    decoder = _PyCodec_TextDecoder(encoding);
613
297k
    if (decoder == NULL)
614
81.3k
        return NULL;
615
616
215k
    return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
617
297k
}
618
619
/* Register the error handling callback function error under the name
620
   name. This function will be called by the codec when it encounters
621
   an unencodable characters/undecodable bytes and doesn't know the
622
   callback name, when name is specified as the error parameter
623
   in the call to the encode/decode function.
624
   Return 0 on success, -1 on error */
625
int PyCodec_RegisterError(const char *name, PyObject *error)
626
0
{
627
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
628
0
    assert(interp->codecs.initialized);
629
0
    if (!PyCallable_Check(error)) {
630
0
        PyErr_SetString(PyExc_TypeError, "handler must be callable");
631
0
        return -1;
632
0
    }
633
0
    return PyDict_SetItemString(interp->codecs.error_registry,
634
0
                                name, error);
635
0
}
636
637
int _PyCodec_UnregisterError(const char *name)
638
0
{
639
0
    for (size_t i = 0; i < Py_ARRAY_LENGTH(codecs_builtin_error_handlers); ++i) {
640
0
        if (strcmp(name, codecs_builtin_error_handlers[i]) == 0) {
641
0
            PyErr_Format(PyExc_ValueError,
642
0
                         "cannot un-register built-in error handler '%s'", name);
643
0
            return -1;
644
0
        }
645
0
    }
646
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
647
0
    assert(interp->codecs.initialized);
648
0
    return PyDict_PopString(interp->codecs.error_registry, name, NULL);
649
0
}
650
651
/* Lookup the error handling callback function registered under the
652
   name error. As a special case NULL can be passed, in which case
653
   the error handling callback for strict encoding will be returned. */
654
PyObject *PyCodec_LookupError(const char *name)
655
232k
{
656
232k
    PyInterpreterState *interp = _PyInterpreterState_GET();
657
232k
    assert(interp->codecs.initialized);
658
659
232k
    if (name==NULL)
660
153k
        name = "strict";
661
232k
    PyObject *handler;
662
232k
    if (PyDict_GetItemStringRef(interp->codecs.error_registry, name, &handler) < 0) {
663
0
        return NULL;
664
0
    }
665
232k
    if (handler == NULL) {
666
0
        PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
667
0
        return NULL;
668
0
    }
669
232k
    return handler;
670
232k
}
671
672
673
static inline void
674
wrong_exception_type(PyObject *exc)
675
0
{
676
0
    PyErr_Format(PyExc_TypeError,
677
0
                 "don't know how to handle %T in error callback", exc);
678
0
}
679
680
681
#define _PyIsUnicodeEncodeError(EXC)    \
682
212k
    PyObject_TypeCheck(EXC, (PyTypeObject *)PyExc_UnicodeEncodeError)
683
#define _PyIsUnicodeDecodeError(EXC)    \
684
203k
    PyObject_TypeCheck(EXC, (PyTypeObject *)PyExc_UnicodeDecodeError)
685
#define _PyIsUnicodeTranslateError(EXC) \
686
0
    PyObject_TypeCheck(EXC, (PyTypeObject *)PyExc_UnicodeTranslateError)
687
688
689
// --- codecs handlers: utilities ---------------------------------------------
690
691
/*
692
 * Return the number of characters (including special prefixes)
693
 * needed to represent 'ch' by codec_handler_write_unicode_hex().
694
 */
695
static inline Py_ssize_t
696
codec_handler_unicode_hex_width(Py_UCS4 ch)
697
0
{
698
0
    if (ch >= 0x10000) {
699
        // format: '\\' + 'U' + 8 hex digits
700
0
        return 1 + 1 + 8;
701
0
    }
702
0
    else if (ch >= 0x100) {
703
        // format: '\\' + 'u' + 4 hex digits
704
0
        return 1 + 1 + 4;
705
0
    }
706
0
    else {
707
        // format: '\\' + 'x' + 2 hex digits
708
0
        return 1 + 1 + 2;
709
0
    }
710
0
}
711
712
713
/*
714
 * Write the hexadecimal representation of 'ch' to the buffer pointed by 'p'
715
 * using 2, 4, or 8 characters prefixed by '\x', '\u', or '\U' respectively.
716
 */
717
static inline void
718
codec_handler_write_unicode_hex(Py_UCS1 **p, Py_UCS4 ch)
719
0
{
720
0
    *(*p)++ = '\\';
721
0
    if (ch >= 0x10000) {
722
0
        *(*p)++ = 'U';
723
0
        *(*p)++ = Py_hexdigits[(ch >> 28) & 0xf];
724
0
        *(*p)++ = Py_hexdigits[(ch >> 24) & 0xf];
725
0
        *(*p)++ = Py_hexdigits[(ch >> 20) & 0xf];
726
0
        *(*p)++ = Py_hexdigits[(ch >> 16) & 0xf];
727
0
        *(*p)++ = Py_hexdigits[(ch >> 12) & 0xf];
728
0
        *(*p)++ = Py_hexdigits[(ch >> 8) & 0xf];
729
0
    }
730
0
    else if (ch >= 0x100) {
731
0
        *(*p)++ = 'u';
732
0
        *(*p)++ = Py_hexdigits[(ch >> 12) & 0xf];
733
0
        *(*p)++ = Py_hexdigits[(ch >> 8) & 0xf];
734
0
    }
735
0
    else {
736
0
        *(*p)++ = 'x';
737
0
    }
738
0
    *(*p)++ = Py_hexdigits[(ch >> 4) & 0xf];
739
0
    *(*p)++ = Py_hexdigits[ch & 0xf];
740
0
}
741
742
743
/*
744
 * Determine the number of digits for a decimal representation of Unicode
745
 * codepoint 'ch' (by design, Unicode codepoints are limited to 7 digits).
746
 */
747
static inline int
748
n_decimal_digits_for_codepoint(Py_UCS4 ch)
749
0
{
750
0
    if (ch < 10) return 1;
751
0
    if (ch < 100) return 2;
752
0
    if (ch < 1000) return 3;
753
0
    if (ch < 10000) return 4;
754
0
    if (ch < 100000) return 5;
755
0
    if (ch < 1000000) return 6;
756
0
    if (ch < 10000000) return 7;
757
    // Unicode codepoints are limited to 1114111 (7 decimal digits)
758
0
    Py_UNREACHABLE();
759
0
}
760
761
762
/*
763
 * Create a Unicode string containing 'count' copies of the official
764
 * Unicode REPLACEMENT CHARACTER (0xFFFD).
765
 */
766
static PyObject *
767
codec_handler_unicode_replacement_character(Py_ssize_t count)
768
147k
{
769
147k
    PyObject *res = PyUnicode_New(count, Py_UNICODE_REPLACEMENT_CHARACTER);
770
147k
    if (res == NULL) {
771
0
        return NULL;
772
0
    }
773
147k
    assert(count == 0 || PyUnicode_KIND(res) == PyUnicode_2BYTE_KIND);
774
147k
    Py_UCS2 *outp = PyUnicode_2BYTE_DATA(res);
775
294k
    for (Py_ssize_t i = 0; i < count; ++i) {
776
147k
        outp[i] = Py_UNICODE_REPLACEMENT_CHARACTER;
777
147k
    }
778
147k
    assert(_PyUnicode_CheckConsistency(res, 1));
779
147k
    return res;
780
147k
}
781
782
783
// --- handler: 'strict' ------------------------------------------------------
784
785
PyObject *PyCodec_StrictErrors(PyObject *exc)
786
219k
{
787
219k
    if (PyExceptionInstance_Check(exc)) {
788
219k
        PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
789
219k
    }
790
0
    else {
791
0
        PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
792
0
    }
793
219k
    return NULL;
794
219k
}
795
796
797
// --- handler: 'ignore' ------------------------------------------------------
798
799
static PyObject *
800
_PyCodec_IgnoreError(PyObject *exc, int as_bytes)
801
0
{
802
0
    Py_ssize_t end;
803
0
    if (_PyUnicodeError_GetParams(exc, NULL, NULL, NULL,
804
0
                                  &end, NULL, as_bytes) < 0)
805
0
    {
806
0
        return NULL;
807
0
    }
808
0
    return Py_BuildValue("(Nn)", Py_GetConstant(Py_CONSTANT_EMPTY_STR), end);
809
0
}
810
811
812
PyObject *PyCodec_IgnoreErrors(PyObject *exc)
813
0
{
814
0
    if (_PyIsUnicodeEncodeError(exc) || _PyIsUnicodeTranslateError(exc)) {
815
0
        return _PyCodec_IgnoreError(exc, false);
816
0
    }
817
0
    else if (_PyIsUnicodeDecodeError(exc)) {
818
0
        return _PyCodec_IgnoreError(exc, true);
819
0
    }
820
0
    else {
821
0
        wrong_exception_type(exc);
822
0
        return NULL;
823
0
    }
824
0
}
825
826
827
// --- handler: 'replace' -----------------------------------------------------
828
829
static PyObject *
830
_PyCodec_ReplaceUnicodeEncodeError(PyObject *exc)
831
0
{
832
0
    Py_ssize_t start, end, slen;
833
0
    if (_PyUnicodeError_GetParams(exc, NULL, NULL,
834
0
                                  &start, &end, &slen, false) < 0)
835
0
    {
836
0
        return NULL;
837
0
    }
838
0
    PyObject *res = PyUnicode_New(slen, '?');
839
0
    if (res == NULL) {
840
0
        return NULL;
841
0
    }
842
0
    assert(PyUnicode_KIND(res) == PyUnicode_1BYTE_KIND);
843
0
    Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
844
0
    memset(outp, '?', sizeof(Py_UCS1) * slen);
845
0
    assert(_PyUnicode_CheckConsistency(res, 1));
846
0
    return Py_BuildValue("(Nn)", res, end);
847
0
}
848
849
850
static PyObject *
851
_PyCodec_ReplaceUnicodeDecodeError(PyObject *exc)
852
147k
{
853
147k
    Py_ssize_t end;
854
147k
    if (PyUnicodeDecodeError_GetEnd(exc, &end) < 0) {
855
0
        return NULL;
856
0
    }
857
147k
    PyObject *res = codec_handler_unicode_replacement_character(1);
858
147k
    if (res == NULL) {
859
0
        return NULL;
860
0
    }
861
147k
    return Py_BuildValue("(Nn)", res, end);
862
147k
}
863
864
865
static PyObject *
866
_PyCodec_ReplaceUnicodeTranslateError(PyObject *exc)
867
0
{
868
0
    Py_ssize_t start, end, slen;
869
0
    if (_PyUnicodeError_GetParams(exc, NULL, NULL,
870
0
                                  &start, &end, &slen, false) < 0)
871
0
    {
872
0
        return NULL;
873
0
    }
874
0
    PyObject *res = codec_handler_unicode_replacement_character(slen);
875
0
    if (res == NULL) {
876
0
        return NULL;
877
0
    }
878
0
    return Py_BuildValue("(Nn)", res, end);
879
0
}
880
881
882
PyObject *PyCodec_ReplaceErrors(PyObject *exc)
883
147k
{
884
147k
    if (_PyIsUnicodeEncodeError(exc)) {
885
0
        return _PyCodec_ReplaceUnicodeEncodeError(exc);
886
0
    }
887
147k
    else if (_PyIsUnicodeDecodeError(exc)) {
888
147k
        return _PyCodec_ReplaceUnicodeDecodeError(exc);
889
147k
    }
890
0
    else if (_PyIsUnicodeTranslateError(exc)) {
891
0
        return _PyCodec_ReplaceUnicodeTranslateError(exc);
892
0
    }
893
0
    else {
894
0
        wrong_exception_type(exc);
895
0
        return NULL;
896
0
    }
897
147k
}
898
899
900
// --- handler: 'xmlcharrefreplace' -------------------------------------------
901
902
PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
903
0
{
904
0
    if (!_PyIsUnicodeEncodeError(exc)) {
905
0
        wrong_exception_type(exc);
906
0
        return NULL;
907
0
    }
908
909
0
    PyObject *obj;
910
0
    Py_ssize_t objlen, start, end, slen;
911
0
    if (_PyUnicodeError_GetParams(exc,
912
0
                                  &obj, &objlen,
913
0
                                  &start, &end, &slen, false) < 0)
914
0
    {
915
0
        return NULL;
916
0
    }
917
918
    // The number of characters that each character 'ch' contributes
919
    // in the result is 2 + k + 1, where k = min{t >= 1 | 10^t > ch}
920
    // and will be formatted as "&#" + DIGITS + ";". Since the Unicode
921
    // range is below 10^7, each "block" requires at most 2 + 7 + 1
922
    // characters.
923
0
    if (slen > PY_SSIZE_T_MAX / (2 + 7 + 1)) {
924
0
        end = start + PY_SSIZE_T_MAX / (2 + 7 + 1);
925
0
        end = Py_MIN(end, objlen);
926
0
        slen = Py_MAX(0, end - start);
927
0
    }
928
929
0
    Py_ssize_t ressize = 0;
930
0
    for (Py_ssize_t i = start; i < end; ++i) {
931
0
        Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
932
0
        int k = n_decimal_digits_for_codepoint(ch);
933
0
        assert(k != 0);
934
0
        assert(k <= 7);
935
0
        ressize += 2 + k + 1;
936
0
    }
937
938
    /* allocate replacement */
939
0
    PyObject *res = PyUnicode_New(ressize, 127);
940
0
    if (res == NULL) {
941
0
        Py_DECREF(obj);
942
0
        return NULL;
943
0
    }
944
0
    Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
945
    /* generate replacement */
946
0
    for (Py_ssize_t i = start; i < end; ++i) {
947
0
        Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
948
        /*
949
         * Write the decimal representation of 'ch' to the buffer pointed by 'p'
950
         * using at most 7 characters prefixed by '&#' and suffixed by ';'.
951
         */
952
0
        *outp++ = '&';
953
0
        *outp++ = '#';
954
0
        Py_UCS1 *digit_end = outp + n_decimal_digits_for_codepoint(ch);
955
0
        for (Py_UCS1 *p_digit = digit_end - 1; p_digit >= outp; --p_digit) {
956
0
            *p_digit = '0' + (ch % 10);
957
0
            ch /= 10;
958
0
        }
959
0
        assert(ch == 0);
960
0
        outp = digit_end;
961
0
        *outp++ = ';';
962
0
    }
963
0
    assert(_PyUnicode_CheckConsistency(res, 1));
964
0
    PyObject *restuple = Py_BuildValue("(Nn)", res, end);
965
0
    Py_DECREF(obj);
966
0
    return restuple;
967
0
}
968
969
970
// --- handler: 'backslashreplace' --------------------------------------------
971
972
static PyObject *
973
_PyCodec_BackslashReplaceUnicodeEncodeError(PyObject *exc)
974
0
{
975
0
    PyObject *obj;
976
0
    Py_ssize_t objlen, start, end, slen;
977
0
    if (_PyUnicodeError_GetParams(exc,
978
0
                                  &obj, &objlen,
979
0
                                  &start, &end, &slen, false) < 0)
980
0
    {
981
0
        return NULL;
982
0
    }
983
984
    // The number of characters that each character 'ch' contributes
985
    // in the result is 1 + 1 + k, where k >= min{t >= 1 | 16^t > ch}
986
    // and will be formatted as "\\" + ('U'|'u'|'x') + HEXDIGITS,
987
    // where the number of hexdigits is either 2, 4, or 8 (not 6).
988
    // Since the Unicode range is below 10^7, we choose k = 8 whence
989
    // each "block" requires at most 1 + 1 + 8 characters.
990
0
    if (slen > PY_SSIZE_T_MAX / (1 + 1 + 8)) {
991
0
        end = start + PY_SSIZE_T_MAX / (1 + 1 + 8);
992
0
        end = Py_MIN(end, objlen);
993
0
        slen = Py_MAX(0, end - start);
994
0
    }
995
996
0
    Py_ssize_t ressize = 0;
997
0
    for (Py_ssize_t i = start; i < end; ++i) {
998
0
        Py_UCS4 c = PyUnicode_READ_CHAR(obj, i);
999
0
        ressize += codec_handler_unicode_hex_width(c);
1000
0
    }
1001
0
    PyObject *res = PyUnicode_New(ressize, 127);
1002
0
    if (res == NULL) {
1003
0
        Py_DECREF(obj);
1004
0
        return NULL;
1005
0
    }
1006
0
    Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
1007
0
    for (Py_ssize_t i = start; i < end; ++i) {
1008
0
        Py_UCS4 c = PyUnicode_READ_CHAR(obj, i);
1009
0
        codec_handler_write_unicode_hex(&outp, c);
1010
0
    }
1011
0
    assert(_PyUnicode_CheckConsistency(res, 1));
1012
0
    Py_DECREF(obj);
1013
0
    return Py_BuildValue("(Nn)", res, end);
1014
0
}
1015
1016
1017
static PyObject *
1018
_PyCodec_BackslashReplaceUnicodeDecodeError(PyObject *exc)
1019
0
{
1020
0
    PyObject *obj;
1021
0
    Py_ssize_t objlen, start, end, slen;
1022
0
    if (_PyUnicodeError_GetParams(exc,
1023
0
                                  &obj, &objlen,
1024
0
                                  &start, &end, &slen, true) < 0)
1025
0
    {
1026
0
        return NULL;
1027
0
    }
1028
1029
0
    PyObject *res = PyUnicode_New(4 * slen, 127);
1030
0
    if (res == NULL) {
1031
0
        Py_DECREF(obj);
1032
0
        return NULL;
1033
0
    }
1034
1035
0
    Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
1036
0
    const unsigned char *p = (const unsigned char *)PyBytes_AS_STRING(obj);
1037
0
    for (Py_ssize_t i = start; i < end; i++, outp += 4) {
1038
0
        const unsigned char ch = p[i];
1039
0
        outp[0] = '\\';
1040
0
        outp[1] = 'x';
1041
0
        outp[2] = Py_hexdigits[(ch >> 4) & 0xf];
1042
0
        outp[3] = Py_hexdigits[ch & 0xf];
1043
0
    }
1044
0
    assert(_PyUnicode_CheckConsistency(res, 1));
1045
0
    Py_DECREF(obj);
1046
0
    return Py_BuildValue("(Nn)", res, end);
1047
0
}
1048
1049
1050
static inline PyObject *
1051
_PyCodec_BackslashReplaceUnicodeTranslateError(PyObject *exc)
1052
0
{
1053
    // Same implementation as for UnicodeEncodeError objects.
1054
0
    return _PyCodec_BackslashReplaceUnicodeEncodeError(exc);
1055
0
}
1056
1057
1058
PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
1059
0
{
1060
0
    if (_PyIsUnicodeEncodeError(exc)) {
1061
0
        return _PyCodec_BackslashReplaceUnicodeEncodeError(exc);
1062
0
    }
1063
0
    else if (_PyIsUnicodeDecodeError(exc)) {
1064
0
        return _PyCodec_BackslashReplaceUnicodeDecodeError(exc);
1065
0
    }
1066
0
    else if (_PyIsUnicodeTranslateError(exc)) {
1067
0
        return _PyCodec_BackslashReplaceUnicodeTranslateError(exc);
1068
0
    }
1069
0
    else {
1070
0
        wrong_exception_type(exc);
1071
0
        return NULL;
1072
0
    }
1073
0
}
1074
1075
1076
// --- handler: 'namereplace' -------------------------------------------------
1077
1078
PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
1079
0
{
1080
0
    if (!_PyIsUnicodeEncodeError(exc)) {
1081
0
        wrong_exception_type(exc);
1082
0
        return NULL;
1083
0
    }
1084
1085
0
    _PyUnicode_Name_CAPI *ucnhash_capi = _PyUnicode_GetNameCAPI();
1086
0
    if (ucnhash_capi == NULL) {
1087
0
        return NULL;
1088
0
    }
1089
1090
0
    PyObject *obj;
1091
0
    Py_ssize_t start, end;
1092
0
    if (_PyUnicodeError_GetParams(exc,
1093
0
                                  &obj, NULL,
1094
0
                                  &start, &end, NULL, false) < 0)
1095
0
    {
1096
0
        return NULL;
1097
0
    }
1098
1099
0
    char buffer[256]; /* NAME_MAXLEN in unicodename_db.h */
1100
0
    Py_ssize_t imax = start, ressize = 0, replsize;
1101
0
    for (; imax < end; ++imax) {
1102
0
        Py_UCS4 c = PyUnicode_READ_CHAR(obj, imax);
1103
0
        if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) {
1104
            // If 'c' is recognized by getname(), the corresponding replacement
1105
            // is '\\' + 'N' + '{' + NAME + '}', i.e. 1 + 1 + 1 + len(NAME) + 1
1106
            // characters. Failures of getname() are ignored by the handler.
1107
0
            replsize = 1 + 1 + 1 + strlen(buffer) + 1;
1108
0
        }
1109
0
        else {
1110
0
            replsize = codec_handler_unicode_hex_width(c);
1111
0
        }
1112
0
        if (ressize > PY_SSIZE_T_MAX - replsize) {
1113
0
            break;
1114
0
        }
1115
0
        ressize += replsize;
1116
0
    }
1117
1118
0
    PyObject *res = PyUnicode_New(ressize, 127);
1119
0
    if (res == NULL) {
1120
0
        Py_DECREF(obj);
1121
0
        return NULL;
1122
0
    }
1123
1124
0
    Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
1125
0
    for (Py_ssize_t i = start; i < imax; ++i) {
1126
0
        Py_UCS4 c = PyUnicode_READ_CHAR(obj, i);
1127
0
        if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) {
1128
0
            *outp++ = '\\';
1129
0
            *outp++ = 'N';
1130
0
            *outp++ = '{';
1131
0
            (void)strcpy((char *)outp, buffer);
1132
0
            outp += strlen(buffer);
1133
0
            *outp++ = '}';
1134
0
        }
1135
0
        else {
1136
0
            codec_handler_write_unicode_hex(&outp, c);
1137
0
        }
1138
0
    }
1139
1140
0
    assert(outp == PyUnicode_1BYTE_DATA(res) + ressize);
1141
0
    assert(_PyUnicode_CheckConsistency(res, 1));
1142
0
    PyObject *restuple = Py_BuildValue("(Nn)", res, imax);
1143
0
    Py_DECREF(obj);
1144
0
    return restuple;
1145
0
}
1146
1147
1148
0
#define ENC_UNKNOWN     -1
1149
0
#define ENC_UTF8        0
1150
0
#define ENC_UTF16BE     1
1151
0
#define ENC_UTF16LE     2
1152
0
#define ENC_UTF32BE     3
1153
0
#define ENC_UTF32LE     4
1154
1155
static int
1156
get_standard_encoding_impl(const char *encoding, int *bytelength)
1157
0
{
1158
0
    if (Py_TOLOWER(encoding[0]) == 'u' &&
1159
0
        Py_TOLOWER(encoding[1]) == 't' &&
1160
0
        Py_TOLOWER(encoding[2]) == 'f') {
1161
0
        encoding += 3;
1162
0
        if (*encoding == '-' || *encoding == '_' )
1163
0
            encoding++;
1164
0
        if (encoding[0] == '8' && encoding[1] == '\0') {
1165
0
            *bytelength = 3;
1166
0
            return ENC_UTF8;
1167
0
        }
1168
0
        else if (encoding[0] == '1' && encoding[1] == '6') {
1169
0
            encoding += 2;
1170
0
            *bytelength = 2;
1171
0
            if (*encoding == '\0') {
1172
#ifdef WORDS_BIGENDIAN
1173
                return ENC_UTF16BE;
1174
#else
1175
0
                return ENC_UTF16LE;
1176
0
#endif
1177
0
            }
1178
0
            if (*encoding == '-' || *encoding == '_' )
1179
0
                encoding++;
1180
0
            if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
1181
0
                if (Py_TOLOWER(encoding[0]) == 'b')
1182
0
                    return ENC_UTF16BE;
1183
0
                if (Py_TOLOWER(encoding[0]) == 'l')
1184
0
                    return ENC_UTF16LE;
1185
0
            }
1186
0
        }
1187
0
        else if (encoding[0] == '3' && encoding[1] == '2') {
1188
0
            encoding += 2;
1189
0
            *bytelength = 4;
1190
0
            if (*encoding == '\0') {
1191
#ifdef WORDS_BIGENDIAN
1192
                return ENC_UTF32BE;
1193
#else
1194
0
                return ENC_UTF32LE;
1195
0
#endif
1196
0
            }
1197
0
            if (*encoding == '-' || *encoding == '_' )
1198
0
                encoding++;
1199
0
            if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
1200
0
                if (Py_TOLOWER(encoding[0]) == 'b')
1201
0
                    return ENC_UTF32BE;
1202
0
                if (Py_TOLOWER(encoding[0]) == 'l')
1203
0
                    return ENC_UTF32LE;
1204
0
            }
1205
0
        }
1206
0
    }
1207
0
    else if (strcmp(encoding, "CP_UTF8") == 0) {
1208
0
        *bytelength = 3;
1209
0
        return ENC_UTF8;
1210
0
    }
1211
0
    return ENC_UNKNOWN;
1212
0
}
1213
1214
1215
static int
1216
get_standard_encoding(PyObject *encoding, int *code, int *bytelength)
1217
0
{
1218
0
    const char *encoding_cstr = PyUnicode_AsUTF8(encoding);
1219
0
    if (encoding_cstr == NULL) {
1220
0
        return -1;
1221
0
    }
1222
0
    *code = get_standard_encoding_impl(encoding_cstr, bytelength);
1223
0
    return 0;
1224
0
}
1225
1226
1227
// --- handler: 'surrogatepass' -----------------------------------------------
1228
1229
static PyObject *
1230
_PyCodec_SurrogatePassUnicodeEncodeError(PyObject *exc)
1231
0
{
1232
0
    PyObject *encoding = PyUnicodeEncodeError_GetEncoding(exc);
1233
0
    if (encoding == NULL) {
1234
0
        return NULL;
1235
0
    }
1236
0
    int code, bytelength;
1237
0
    int rc = get_standard_encoding(encoding, &code, &bytelength);
1238
0
    Py_DECREF(encoding);
1239
0
    if (rc < 0) {
1240
0
        return NULL;
1241
0
    }
1242
0
    if (code == ENC_UNKNOWN) {
1243
0
        goto bail;
1244
0
    }
1245
1246
0
    PyObject *obj;
1247
0
    Py_ssize_t objlen, start, end, slen;
1248
0
    if (_PyUnicodeError_GetParams(exc,
1249
0
                                  &obj, &objlen,
1250
0
                                  &start, &end, &slen, false) < 0)
1251
0
    {
1252
0
        return NULL;
1253
0
    }
1254
1255
0
    if (slen > PY_SSIZE_T_MAX / bytelength) {
1256
0
        end = start + PY_SSIZE_T_MAX / bytelength;
1257
0
        end = Py_MIN(end, objlen);
1258
0
        slen = Py_MAX(0, end - start);
1259
0
    }
1260
1261
0
    PyObject *res = PyBytes_FromStringAndSize(NULL, bytelength * slen);
1262
0
    if (res == NULL) {
1263
0
        Py_DECREF(obj);
1264
0
        return NULL;
1265
0
    }
1266
1267
0
    unsigned char *outp = (unsigned char *)PyBytes_AsString(res);
1268
0
    for (Py_ssize_t i = start; i < end; i++) {
1269
0
        Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
1270
0
        if (!Py_UNICODE_IS_SURROGATE(ch)) {
1271
            /* Not a surrogate, fail with original exception */
1272
0
            Py_DECREF(obj);
1273
0
            Py_DECREF(res);
1274
0
            goto bail;
1275
0
        }
1276
0
        switch (code) {
1277
0
            case ENC_UTF8: {
1278
0
                *outp++ = (unsigned char)(0xe0 | (ch >> 12));
1279
0
                *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
1280
0
                *outp++ = (unsigned char)(0x80 | (ch & 0x3f));
1281
0
                break;
1282
0
            }
1283
0
            case ENC_UTF16LE: {
1284
0
                *outp++ = (unsigned char)ch;
1285
0
                *outp++ = (unsigned char)(ch >> 8);
1286
0
                break;
1287
0
            }
1288
0
            case ENC_UTF16BE: {
1289
0
                *outp++ = (unsigned char)(ch >> 8);
1290
0
                *outp++ = (unsigned char)ch;
1291
0
                break;
1292
0
            }
1293
0
            case ENC_UTF32LE: {
1294
0
                *outp++ = (unsigned char)ch;
1295
0
                *outp++ = (unsigned char)(ch >> 8);
1296
0
                *outp++ = (unsigned char)(ch >> 16);
1297
0
                *outp++ = (unsigned char)(ch >> 24);
1298
0
                break;
1299
0
            }
1300
0
            case ENC_UTF32BE: {
1301
0
                *outp++ = (unsigned char)(ch >> 24);
1302
0
                *outp++ = (unsigned char)(ch >> 16);
1303
0
                *outp++ = (unsigned char)(ch >> 8);
1304
0
                *outp++ = (unsigned char)ch;
1305
0
                break;
1306
0
            }
1307
0
        }
1308
0
    }
1309
1310
0
    Py_DECREF(obj);
1311
0
    PyObject *restuple = Py_BuildValue("(Nn)", res, end);
1312
0
    return restuple;
1313
1314
0
bail:
1315
0
    PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1316
0
    return NULL;
1317
0
}
1318
1319
1320
static PyObject *
1321
_PyCodec_SurrogatePassUnicodeDecodeError(PyObject *exc)
1322
0
{
1323
0
    PyObject *encoding = PyUnicodeDecodeError_GetEncoding(exc);
1324
0
    if (encoding == NULL) {
1325
0
        return NULL;
1326
0
    }
1327
0
    int code, bytelength;
1328
0
    int rc = get_standard_encoding(encoding, &code, &bytelength);
1329
0
    Py_DECREF(encoding);
1330
0
    if (rc < 0) {
1331
0
        return NULL;
1332
0
    }
1333
0
    if (code == ENC_UNKNOWN) {
1334
0
        goto bail;
1335
0
    }
1336
1337
0
    PyObject *obj;
1338
0
    Py_ssize_t objlen, start, end, slen;
1339
0
    if (_PyUnicodeError_GetParams(exc,
1340
0
                                  &obj, &objlen,
1341
0
                                  &start, &end, &slen, true) < 0)
1342
0
    {
1343
0
        return NULL;
1344
0
    }
1345
1346
    /* Try decoding a single surrogate character. If
1347
       there are more, let the codec call us again. */
1348
0
    Py_UCS4 ch = 0;
1349
0
    const unsigned char *p = (const unsigned char *)PyBytes_AS_STRING(obj);
1350
0
    p += start;
1351
1352
0
    if (objlen - start >= bytelength) {
1353
0
        switch (code) {
1354
0
            case ENC_UTF8: {
1355
0
                if ((p[0] & 0xf0) == 0xe0 &&
1356
0
                    (p[1] & 0xc0) == 0x80 &&
1357
0
                    (p[2] & 0xc0) == 0x80)
1358
0
                {
1359
                    /* it's a three-byte code */
1360
0
                    ch = ((p[0] & 0x0f) << 12) +
1361
0
                         ((p[1] & 0x3f) << 6)  +
1362
0
                          (p[2] & 0x3f);
1363
0
                }
1364
0
                break;
1365
0
            }
1366
0
            case ENC_UTF16LE: {
1367
0
                ch = p[1] << 8 | p[0];
1368
0
                break;
1369
0
            }
1370
0
            case ENC_UTF16BE: {
1371
0
                ch = p[0] << 8 | p[1];
1372
0
                break;
1373
0
            }
1374
0
            case ENC_UTF32LE: {
1375
0
                ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
1376
0
                break;
1377
0
            }
1378
0
            case ENC_UTF32BE: {
1379
0
                ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
1380
0
                break;
1381
0
            }
1382
0
        }
1383
0
    }
1384
0
    Py_DECREF(obj);
1385
0
    if (!Py_UNICODE_IS_SURROGATE(ch)) {
1386
0
        goto bail;
1387
0
    }
1388
1389
0
    PyObject *res = PyUnicode_FromOrdinal(ch);
1390
0
    if (res == NULL) {
1391
0
        return NULL;
1392
0
    }
1393
0
    return Py_BuildValue("(Nn)", res, start + bytelength);
1394
1395
0
bail:
1396
0
    PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1397
0
    return NULL;
1398
0
}
1399
1400
1401
/* This handler is declared static until someone demonstrates
1402
   a need to call it directly. */
1403
static PyObject *
1404
PyCodec_SurrogatePassErrors(PyObject *exc)
1405
0
{
1406
0
    if (_PyIsUnicodeEncodeError(exc)) {
1407
0
        return _PyCodec_SurrogatePassUnicodeEncodeError(exc);
1408
0
    }
1409
0
    else if (_PyIsUnicodeDecodeError(exc)) {
1410
0
        return _PyCodec_SurrogatePassUnicodeDecodeError(exc);
1411
0
    }
1412
0
    else {
1413
0
        wrong_exception_type(exc);
1414
0
        return NULL;
1415
0
    }
1416
0
}
1417
1418
1419
// --- handler: 'surrogateescape' ---------------------------------------------
1420
1421
static PyObject *
1422
_PyCodec_SurrogateEscapeUnicodeEncodeError(PyObject *exc)
1423
9.38k
{
1424
9.38k
    PyObject *obj;
1425
9.38k
    Py_ssize_t start, end, slen;
1426
9.38k
    if (_PyUnicodeError_GetParams(exc,
1427
9.38k
                                  &obj, NULL,
1428
9.38k
                                  &start, &end, &slen, false) < 0)
1429
0
    {
1430
0
        return NULL;
1431
0
    }
1432
1433
9.38k
    PyObject *res = PyBytes_FromStringAndSize(NULL, slen);
1434
9.38k
    if (res == NULL) {
1435
0
        Py_DECREF(obj);
1436
0
        return NULL;
1437
0
    }
1438
1439
9.38k
    char *outp = PyBytes_AsString(res);
1440
9.38k
    for (Py_ssize_t i = start; i < end; i++) {
1441
9.38k
        Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
1442
9.38k
        if (ch < 0xdc80 || ch > 0xdcff) {
1443
            /* Not a UTF-8b surrogate, fail with original exception. */
1444
9.38k
            Py_DECREF(obj);
1445
9.38k
            Py_DECREF(res);
1446
9.38k
            PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1447
9.38k
            return NULL;
1448
9.38k
        }
1449
0
        *outp++ = ch - 0xdc00;
1450
0
    }
1451
0
    Py_DECREF(obj);
1452
1453
0
    return Py_BuildValue("(Nn)", res, end);
1454
9.38k
}
1455
1456
1457
static PyObject *
1458
_PyCodec_SurrogateEscapeUnicodeDecodeError(PyObject *exc)
1459
56.1k
{
1460
56.1k
    PyObject *obj;
1461
56.1k
    Py_ssize_t start, end, slen;
1462
56.1k
    if (_PyUnicodeError_GetParams(exc,
1463
56.1k
                                  &obj, NULL,
1464
56.1k
                                  &start, &end, &slen, true) < 0)
1465
0
    {
1466
0
        return NULL;
1467
0
    }
1468
1469
56.1k
    Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */
1470
56.1k
    int consumed = 0;
1471
56.1k
    const unsigned char *p = (const unsigned char *)PyBytes_AS_STRING(obj);
1472
117k
    while (consumed < 4 && consumed < slen) {
1473
        /* Refuse to escape ASCII bytes. */
1474
100k
        if (p[start + consumed] < 128) {
1475
38.8k
            break;
1476
38.8k
        }
1477
61.8k
        ch[consumed] = 0xdc00 + p[start + consumed];
1478
61.8k
        consumed++;
1479
61.8k
    }
1480
56.1k
    Py_DECREF(obj);
1481
1482
56.1k
    if (consumed == 0) {
1483
        /* Codec complained about ASCII byte. */
1484
21.6k
        PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1485
21.6k
        return NULL;
1486
21.6k
    }
1487
1488
34.4k
    PyObject *str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed);
1489
34.4k
    if (str == NULL) {
1490
0
        return NULL;
1491
0
    }
1492
34.4k
    return Py_BuildValue("(Nn)", str, start + consumed);
1493
34.4k
}
1494
1495
1496
static PyObject *
1497
PyCodec_SurrogateEscapeErrors(PyObject *exc)
1498
65.4k
{
1499
65.4k
    if (_PyIsUnicodeEncodeError(exc)) {
1500
9.38k
        return _PyCodec_SurrogateEscapeUnicodeEncodeError(exc);
1501
9.38k
    }
1502
56.1k
    else if (_PyIsUnicodeDecodeError(exc)) {
1503
56.1k
        return _PyCodec_SurrogateEscapeUnicodeDecodeError(exc);
1504
56.1k
    }
1505
0
    else {
1506
0
        wrong_exception_type(exc);
1507
0
        return NULL;
1508
0
    }
1509
65.4k
}
1510
1511
1512
// --- Codecs registry handlers -----------------------------------------------
1513
1514
static inline PyObject *
1515
strict_errors(PyObject *Py_UNUSED(self), PyObject *exc)
1516
183k
{
1517
183k
    return PyCodec_StrictErrors(exc);
1518
183k
}
1519
1520
1521
static inline PyObject *
1522
ignore_errors(PyObject *Py_UNUSED(self), PyObject *exc)
1523
0
{
1524
0
    return PyCodec_IgnoreErrors(exc);
1525
0
}
1526
1527
1528
static inline PyObject *
1529
replace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
1530
147k
{
1531
147k
    return PyCodec_ReplaceErrors(exc);
1532
147k
}
1533
1534
1535
static inline PyObject *
1536
xmlcharrefreplace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
1537
0
{
1538
0
    return PyCodec_XMLCharRefReplaceErrors(exc);
1539
0
}
1540
1541
1542
static inline PyObject *
1543
backslashreplace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
1544
0
{
1545
0
    return PyCodec_BackslashReplaceErrors(exc);
1546
0
}
1547
1548
1549
static inline PyObject *
1550
namereplace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
1551
0
{
1552
0
    return PyCodec_NameReplaceErrors(exc);
1553
0
}
1554
1555
1556
static inline PyObject *
1557
surrogatepass_errors(PyObject *Py_UNUSED(self), PyObject *exc)
1558
0
{
1559
0
    return PyCodec_SurrogatePassErrors(exc);
1560
0
}
1561
1562
1563
static inline PyObject *
1564
surrogateescape_errors(PyObject *Py_UNUSED(self), PyObject *exc)
1565
65.4k
{
1566
65.4k
    return PyCodec_SurrogateEscapeErrors(exc);
1567
65.4k
}
1568
1569
1570
PyStatus
1571
_PyCodec_InitRegistry(PyInterpreterState *interp)
1572
16
{
1573
16
    static struct {
1574
16
        const char *name;
1575
16
        PyMethodDef def;
1576
16
    } methods[] =
1577
16
    {
1578
16
        {
1579
16
            "strict",
1580
16
            {
1581
16
                "strict_errors",
1582
16
                strict_errors,
1583
16
                METH_O,
1584
16
                PyDoc_STR("Implements the 'strict' error handling, which "
1585
16
                          "raises a UnicodeError on coding errors.")
1586
16
            }
1587
16
        },
1588
16
        {
1589
16
            "ignore",
1590
16
            {
1591
16
                "ignore_errors",
1592
16
                ignore_errors,
1593
16
                METH_O,
1594
16
                PyDoc_STR("Implements the 'ignore' error handling, which "
1595
16
                          "ignores malformed data and continues.")
1596
16
            }
1597
16
        },
1598
16
        {
1599
16
            "replace",
1600
16
            {
1601
16
                "replace_errors",
1602
16
                replace_errors,
1603
16
                METH_O,
1604
16
                PyDoc_STR("Implements the 'replace' error handling, which "
1605
16
                          "replaces malformed data with a replacement marker.")
1606
16
            }
1607
16
        },
1608
16
        {
1609
16
            "xmlcharrefreplace",
1610
16
            {
1611
16
                "xmlcharrefreplace_errors",
1612
16
                xmlcharrefreplace_errors,
1613
16
                METH_O,
1614
16
                PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
1615
16
                          "which replaces an unencodable character with the "
1616
16
                          "appropriate XML character reference.")
1617
16
            }
1618
16
        },
1619
16
        {
1620
16
            "backslashreplace",
1621
16
            {
1622
16
                "backslashreplace_errors",
1623
16
                backslashreplace_errors,
1624
16
                METH_O,
1625
16
                PyDoc_STR("Implements the 'backslashreplace' error handling, "
1626
16
                          "which replaces malformed data with a backslashed "
1627
16
                          "escape sequence.")
1628
16
            }
1629
16
        },
1630
16
        {
1631
16
            "namereplace",
1632
16
            {
1633
16
                "namereplace_errors",
1634
16
                namereplace_errors,
1635
16
                METH_O,
1636
16
                PyDoc_STR("Implements the 'namereplace' error handling, "
1637
16
                          "which replaces an unencodable character with a "
1638
16
                          "\\N{...} escape sequence.")
1639
16
            }
1640
16
        },
1641
16
        {
1642
16
            "surrogatepass",
1643
16
            {
1644
16
                "surrogatepass",
1645
16
                surrogatepass_errors,
1646
16
                METH_O
1647
16
            }
1648
16
        },
1649
16
        {
1650
16
            "surrogateescape",
1651
16
            {
1652
16
                "surrogateescape",
1653
16
                surrogateescape_errors,
1654
16
                METH_O
1655
16
            }
1656
16
        }
1657
16
    };
1658
    // ensure that the built-in error handlers' names are kept in sync
1659
16
    assert(Py_ARRAY_LENGTH(methods) == Py_ARRAY_LENGTH(codecs_builtin_error_handlers));
1660
1661
16
    assert(interp->codecs.initialized == 0);
1662
16
    interp->codecs.search_path = PyList_New(0);
1663
16
    if (interp->codecs.search_path == NULL) {
1664
0
        return PyStatus_NoMemory();
1665
0
    }
1666
16
    interp->codecs.search_cache = PyDict_New();
1667
16
    if (interp->codecs.search_cache == NULL) {
1668
0
        return PyStatus_NoMemory();
1669
0
    }
1670
16
    interp->codecs.error_registry = PyDict_New();
1671
16
    if (interp->codecs.error_registry == NULL) {
1672
0
        return PyStatus_NoMemory();
1673
0
    }
1674
144
    for (size_t i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
1675
128
        PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL);
1676
128
        if (func == NULL) {
1677
0
            return PyStatus_NoMemory();
1678
0
        }
1679
1680
128
        int res = PyDict_SetItemString(interp->codecs.error_registry,
1681
128
                                       methods[i].name, func);
1682
128
        Py_DECREF(func);
1683
128
        if (res < 0) {
1684
0
            return PyStatus_Error("Failed to insert into codec error registry");
1685
0
        }
1686
128
    }
1687
1688
16
    interp->codecs.initialized = 1;
1689
1690
    // Importing `encodings' will call back into this module to register codec
1691
    // search functions, so this is done after everything else is initialized.
1692
16
    PyObject *mod = PyImport_ImportModule("encodings");
1693
16
    if (mod == NULL) {
1694
0
        return PyStatus_Error("Failed to import encodings module");
1695
0
    }
1696
16
    Py_DECREF(mod);
1697
1698
16
    return PyStatus_Ok();
1699
16
}
1700
1701
void
1702
_PyCodec_Fini(PyInterpreterState *interp)
1703
0
{
1704
0
    Py_CLEAR(interp->codecs.search_path);
1705
0
    Py_CLEAR(interp->codecs.search_cache);
1706
0
    Py_CLEAR(interp->codecs.error_registry);
1707
0
    interp->codecs.initialized = 0;
1708
0
}