Coverage Report

Created: 2025-08-26 06:26

/src/cpython/Python/codecs.c
Line
Count
Source (jump to first uncovered line)
1
/* ------------------------------------------------------------------------
2
3
   Python Codec Registry and support functions
4
5
Written by Marc-Andre Lemburg (mal@lemburg.com).
6
7
Copyright (c) Corporation for National Research Initiatives.
8
9
   ------------------------------------------------------------------------ */
10
11
#include "Python.h"
12
#include "pycore_call.h"          // _PyObject_CallNoArgs()
13
#include "pycore_interp.h"        // PyInterpreterState.codec_search_path
14
#include "pycore_pyerrors.h"      // _PyErr_FormatNote()
15
#include "pycore_pystate.h"       // _PyInterpreterState_GET()
16
#include "pycore_runtime.h"       // _Py_ID()
17
#include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
18
#include "pycore_unicodeobject.h" // _PyUnicode_InternMortal()
19
#include "pycore_pyatomic_ft_wrappers.h"
20
21
static const char *codecs_builtin_error_handlers[] = {
22
    "strict", "ignore", "replace",
23
    "xmlcharrefreplace", "backslashreplace", "namereplace",
24
    "surrogatepass", "surrogateescape",
25
};
26
27
const char *Py_hexdigits = "0123456789abcdef";
28
29
/* --- Codec Registry ----------------------------------------------------- */
30
31
int PyCodec_Register(PyObject *search_function)
32
16
{
33
16
    PyInterpreterState *interp = _PyInterpreterState_GET();
34
16
    assert(interp->codecs.initialized);
35
16
    if (search_function == NULL) {
36
0
        PyErr_BadArgument();
37
0
        goto onError;
38
0
    }
39
16
    if (!PyCallable_Check(search_function)) {
40
0
        PyErr_SetString(PyExc_TypeError, "argument must be callable");
41
0
        goto onError;
42
0
    }
43
16
    FT_MUTEX_LOCK(&interp->codecs.search_path_mutex);
44
16
    int ret = PyList_Append(interp->codecs.search_path, search_function);
45
16
    FT_MUTEX_UNLOCK(&interp->codecs.search_path_mutex);
46
47
16
    return ret;
48
49
0
 onError:
50
0
    return -1;
51
16
}
52
53
int
54
PyCodec_Unregister(PyObject *search_function)
55
0
{
56
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
57
0
    if (interp->codecs.initialized != 1) {
58
        /* Do nothing if codecs state was cleared (only possible during
59
           interpreter shutdown). */
60
0
        return 0;
61
0
    }
62
63
0
    PyObject *codec_search_path = interp->codecs.search_path;
64
0
    assert(PyList_CheckExact(codec_search_path));
65
0
    for (Py_ssize_t i = 0; i < PyList_GET_SIZE(codec_search_path); i++) {
66
0
        FT_MUTEX_LOCK(&interp->codecs.search_path_mutex);
67
0
        PyObject *item = PyList_GetItemRef(codec_search_path, i);
68
0
        int ret = 1;
69
0
        if (item == search_function) {
70
            // We hold a reference to the item, so its destructor can't run
71
            // while we hold search_path_mutex.
72
0
            ret = PyList_SetSlice(codec_search_path, i, i+1, NULL);
73
0
        }
74
0
        FT_MUTEX_UNLOCK(&interp->codecs.search_path_mutex);
75
0
        Py_DECREF(item);
76
0
        if (ret != 1) {
77
0
            assert(interp->codecs.search_cache != NULL);
78
0
            assert(PyDict_CheckExact(interp->codecs.search_cache));
79
0
            PyDict_Clear(interp->codecs.search_cache);
80
0
            return ret;
81
0
        }
82
0
    }
83
0
    return 0;
84
0
}
85
86
extern int _Py_normalize_encoding(const char *, char *, size_t);
87
88
/* Convert a string to a normalized Python string(decoded from UTF-8): all characters are
89
   converted to lower case, spaces and hyphens are replaced with underscores. */
90
91
static
92
PyObject *normalizestring(const char *string)
93
1.26M
{
94
1.26M
    size_t len = strlen(string);
95
1.26M
    char *encoding;
96
1.26M
    PyObject *v;
97
98
1.26M
    if (len > PY_SSIZE_T_MAX) {
99
0
        PyErr_SetString(PyExc_OverflowError, "string is too large");
100
0
        return NULL;
101
0
    }
102
103
1.26M
    encoding = PyMem_Malloc(len + 1);
104
1.26M
    if (encoding == NULL)
105
0
        return PyErr_NoMemory();
106
107
1.26M
    if (!_Py_normalize_encoding(string, encoding, len + 1))
108
0
    {
109
0
        PyErr_SetString(PyExc_RuntimeError, "_Py_normalize_encoding() failed");
110
0
        PyMem_Free(encoding);
111
0
        return NULL;
112
0
    }
113
114
1.26M
    v = PyUnicode_FromString(encoding);
115
1.26M
    PyMem_Free(encoding);
116
1.26M
    return v;
117
1.26M
}
118
119
/* Lookup the given encoding and return a tuple providing the codec
120
   facilities.
121
122
   The encoding string is looked up converted to all lower-case
123
   characters. This makes encodings looked up through this mechanism
124
   effectively case-insensitive.
125
126
   If no codec is found, a LookupError is set and NULL returned.
127
128
   As side effect, this tries to load the encodings package, if not
129
   yet done. This is part of the lazy load strategy for the encodings
130
   package.
131
132
*/
133
134
PyObject *_PyCodec_Lookup(const char *encoding)
135
1.26M
{
136
1.26M
    if (encoding == NULL) {
137
0
        PyErr_BadArgument();
138
0
        return NULL;
139
0
    }
140
141
1.26M
    PyInterpreterState *interp = _PyInterpreterState_GET();
142
1.26M
    assert(interp->codecs.initialized);
143
144
    /* Convert the encoding to a normalized Python string: all
145
       characters are converted to lower case, spaces and hyphens are
146
       replaced with underscores. */
147
1.26M
    PyObject *v = normalizestring(encoding);
148
1.26M
    if (v == NULL) {
149
0
        return NULL;
150
0
    }
151
152
    /* Intern the string. We'll make it immortal later if lookup succeeds. */
153
1.26M
    _PyUnicode_InternMortal(interp, &v);
154
155
    /* First, try to lookup the name in the registry dictionary */
156
1.26M
    PyObject *result;
157
1.26M
    if (PyDict_GetItemRef(interp->codecs.search_cache, v, &result) < 0) {
158
0
        goto onError;
159
0
    }
160
1.26M
    if (result != NULL) {
161
1.15M
        Py_DECREF(v);
162
1.15M
        return result;
163
1.15M
    }
164
165
    /* Next, scan the search functions in order of registration */
166
107k
    const Py_ssize_t len = PyList_Size(interp->codecs.search_path);
167
107k
    if (len < 0)
168
0
        goto onError;
169
107k
    if (len == 0) {
170
0
        PyErr_SetString(PyExc_LookupError,
171
0
                        "no codec search functions registered: "
172
0
                        "can't find encoding");
173
0
        goto onError;
174
0
    }
175
176
107k
    Py_ssize_t i;
177
213k
    for (i = 0; i < len; i++) {
178
107k
        PyObject *func;
179
180
107k
        func = PyList_GetItemRef(interp->codecs.search_path, i);
181
107k
        if (func == NULL)
182
0
            goto onError;
183
107k
        result = PyObject_CallOneArg(func, v);
184
107k
        Py_DECREF(func);
185
107k
        if (result == NULL)
186
0
            goto onError;
187
107k
        if (result == Py_None) {
188
106k
            Py_CLEAR(result);
189
106k
            continue;
190
106k
        }
191
461
        if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
192
0
            PyErr_SetString(PyExc_TypeError,
193
0
                            "codec search functions must return 4-tuples");
194
0
            Py_DECREF(result);
195
0
            goto onError;
196
0
        }
197
461
        break;
198
461
    }
199
107k
    if (result == NULL) {
200
        /* XXX Perhaps we should cache misses too ? */
201
106k
        PyErr_Format(PyExc_LookupError,
202
106k
                     "unknown encoding: %s", encoding);
203
106k
        goto onError;
204
106k
    }
205
206
461
    _PyUnicode_InternImmortal(interp, &v);
207
208
    /* Cache and return the result */
209
461
    if (PyDict_SetItem(interp->codecs.search_cache, v, result) < 0) {
210
0
        Py_DECREF(result);
211
0
        goto onError;
212
0
    }
213
461
    Py_DECREF(v);
214
461
    return result;
215
216
106k
 onError:
217
106k
    Py_DECREF(v);
218
106k
    return NULL;
219
461
}
220
221
/* Codec registry encoding check API. */
222
223
int PyCodec_KnownEncoding(const char *encoding)
224
0
{
225
0
    PyObject *codecs;
226
227
0
    codecs = _PyCodec_Lookup(encoding);
228
0
    if (!codecs) {
229
0
        PyErr_Clear();
230
0
        return 0;
231
0
    }
232
0
    else {
233
0
        Py_DECREF(codecs);
234
0
        return 1;
235
0
    }
236
0
}
237
238
static
239
PyObject *args_tuple(PyObject *object,
240
                     const char *errors)
241
1.15M
{
242
1.15M
    PyObject *args;
243
244
1.15M
    args = PyTuple_New(1 + (errors != NULL));
245
1.15M
    if (args == NULL)
246
0
        return NULL;
247
1.15M
    PyTuple_SET_ITEM(args, 0, Py_NewRef(object));
248
1.15M
    if (errors) {
249
182k
        PyObject *v;
250
251
182k
        v = PyUnicode_FromString(errors);
252
182k
        if (v == NULL) {
253
0
            Py_DECREF(args);
254
0
            return NULL;
255
0
        }
256
182k
        PyTuple_SET_ITEM(args, 1, v);
257
182k
    }
258
1.15M
    return args;
259
1.15M
}
260
261
/* Helper function to get a codec item */
262
263
static
264
PyObject *codec_getitem(const char *encoding, int index)
265
0
{
266
0
    PyObject *codecs;
267
0
    PyObject *v;
268
269
0
    codecs = _PyCodec_Lookup(encoding);
270
0
    if (codecs == NULL)
271
0
        return NULL;
272
0
    v = PyTuple_GET_ITEM(codecs, index);
273
0
    Py_DECREF(codecs);
274
0
    return Py_NewRef(v);
275
0
}
276
277
/* Helper functions to create an incremental codec. */
278
static
279
PyObject *codec_makeincrementalcodec(PyObject *codec_info,
280
                                     const char *errors,
281
                                     const char *attrname)
282
48
{
283
48
    PyObject *ret, *inccodec;
284
285
48
    inccodec = PyObject_GetAttrString(codec_info, attrname);
286
48
    if (inccodec == NULL)
287
0
        return NULL;
288
48
    if (errors)
289
48
        ret = PyObject_CallFunction(inccodec, "s", errors);
290
0
    else
291
0
        ret = _PyObject_CallNoArgs(inccodec);
292
48
    Py_DECREF(inccodec);
293
48
    return ret;
294
48
}
295
296
static
297
PyObject *codec_getincrementalcodec(const char *encoding,
298
                                    const char *errors,
299
                                    const char *attrname)
300
0
{
301
0
    PyObject *codec_info, *ret;
302
303
0
    codec_info = _PyCodec_Lookup(encoding);
304
0
    if (codec_info == NULL)
305
0
        return NULL;
306
0
    ret = codec_makeincrementalcodec(codec_info, errors, attrname);
307
0
    Py_DECREF(codec_info);
308
0
    return ret;
309
0
}
310
311
/* Helper function to create a stream codec. */
312
313
static
314
PyObject *codec_getstreamcodec(const char *encoding,
315
                               PyObject *stream,
316
                               const char *errors,
317
                               const int index)
318
0
{
319
0
    PyObject *codecs, *streamcodec, *codeccls;
320
321
0
    codecs = _PyCodec_Lookup(encoding);
322
0
    if (codecs == NULL)
323
0
        return NULL;
324
325
0
    codeccls = PyTuple_GET_ITEM(codecs, index);
326
0
    if (errors != NULL)
327
0
        streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
328
0
    else
329
0
        streamcodec = PyObject_CallOneArg(codeccls, stream);
330
0
    Py_DECREF(codecs);
331
0
    return streamcodec;
332
0
}
333
334
/* Helpers to work with the result of _PyCodec_Lookup
335
336
 */
337
PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info,
338
                                             const char *errors)
339
16
{
340
16
    return codec_makeincrementalcodec(codec_info, errors,
341
16
                                      "incrementaldecoder");
342
16
}
343
344
PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info,
345
                                             const char *errors)
346
32
{
347
32
    return codec_makeincrementalcodec(codec_info, errors,
348
32
                                      "incrementalencoder");
349
32
}
350
351
352
/* Convenience APIs to query the Codec registry.
353
354
   All APIs return a codec object with incremented refcount.
355
356
 */
357
358
PyObject *PyCodec_Encoder(const char *encoding)
359
0
{
360
0
    return codec_getitem(encoding, 0);
361
0
}
362
363
PyObject *PyCodec_Decoder(const char *encoding)
364
0
{
365
0
    return codec_getitem(encoding, 1);
366
0
}
367
368
PyObject *PyCodec_IncrementalEncoder(const char *encoding,
369
                                     const char *errors)
370
0
{
371
0
    return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
372
0
}
373
374
PyObject *PyCodec_IncrementalDecoder(const char *encoding,
375
                                     const char *errors)
376
0
{
377
0
    return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
378
0
}
379
380
PyObject *PyCodec_StreamReader(const char *encoding,
381
                               PyObject *stream,
382
                               const char *errors)
383
0
{
384
0
    return codec_getstreamcodec(encoding, stream, errors, 2);
385
0
}
386
387
PyObject *PyCodec_StreamWriter(const char *encoding,
388
                               PyObject *stream,
389
                               const char *errors)
390
0
{
391
0
    return codec_getstreamcodec(encoding, stream, errors, 3);
392
0
}
393
394
/* Encode an object (e.g. a Unicode object) using the given encoding
395
   and return the resulting encoded object (usually a Python string).
396
397
   errors is passed to the encoder factory as argument if non-NULL. */
398
399
static PyObject *
400
_PyCodec_EncodeInternal(PyObject *object,
401
                        PyObject *encoder,
402
                        const char *encoding,
403
                        const char *errors)
404
892k
{
405
892k
    PyObject *args = NULL, *result = NULL;
406
892k
    PyObject *v = NULL;
407
408
892k
    args = args_tuple(object, errors);
409
892k
    if (args == NULL)
410
0
        goto onError;
411
412
892k
    result = PyObject_Call(encoder, args, NULL);
413
892k
    if (result == NULL) {
414
0
        _PyErr_FormatNote("%s with '%s' codec failed", "encoding", encoding);
415
0
        goto onError;
416
0
    }
417
418
892k
    if (!PyTuple_Check(result) ||
419
892k
        PyTuple_GET_SIZE(result) != 2) {
420
0
        PyErr_SetString(PyExc_TypeError,
421
0
                        "encoder must return a tuple (object, integer)");
422
0
        goto onError;
423
0
    }
424
892k
    v = Py_NewRef(PyTuple_GET_ITEM(result,0));
425
    /* We don't check or use the second (integer) entry. */
426
427
892k
    Py_DECREF(args);
428
892k
    Py_DECREF(encoder);
429
892k
    Py_DECREF(result);
430
892k
    return v;
431
432
0
 onError:
433
0
    Py_XDECREF(result);
434
0
    Py_XDECREF(args);
435
0
    Py_XDECREF(encoder);
436
0
    return NULL;
437
892k
}
438
439
/* Decode an object (usually a Python string) using the given encoding
440
   and return an equivalent object (e.g. a Unicode object).
441
442
   errors is passed to the decoder factory as argument if non-NULL. */
443
444
static PyObject *
445
_PyCodec_DecodeInternal(PyObject *object,
446
                        PyObject *decoder,
447
                        const char *encoding,
448
                        const char *errors)
449
257k
{
450
257k
    PyObject *args = NULL, *result = NULL;
451
257k
    PyObject *v;
452
453
257k
    args = args_tuple(object, errors);
454
257k
    if (args == NULL)
455
0
        goto onError;
456
457
257k
    result = PyObject_Call(decoder, args, NULL);
458
257k
    if (result == NULL) {
459
58.0k
        _PyErr_FormatNote("%s with '%s' codec failed", "decoding", encoding);
460
58.0k
        goto onError;
461
58.0k
    }
462
199k
    if (!PyTuple_Check(result) ||
463
199k
        PyTuple_GET_SIZE(result) != 2) {
464
0
        PyErr_SetString(PyExc_TypeError,
465
0
                        "decoder must return a tuple (object,integer)");
466
0
        goto onError;
467
0
    }
468
199k
    v = Py_NewRef(PyTuple_GET_ITEM(result,0));
469
    /* We don't check or use the second (integer) entry. */
470
471
199k
    Py_DECREF(args);
472
199k
    Py_DECREF(decoder);
473
199k
    Py_DECREF(result);
474
199k
    return v;
475
476
58.0k
 onError:
477
58.0k
    Py_XDECREF(args);
478
58.0k
    Py_XDECREF(decoder);
479
58.0k
    Py_XDECREF(result);
480
58.0k
    return NULL;
481
199k
}
482
483
/* Generic encoding/decoding API */
484
PyObject *PyCodec_Encode(PyObject *object,
485
                         const char *encoding,
486
                         const char *errors)
487
0
{
488
0
    PyObject *encoder;
489
490
0
    encoder = PyCodec_Encoder(encoding);
491
0
    if (encoder == NULL)
492
0
        return NULL;
493
494
0
    return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
495
0
}
496
497
PyObject *PyCodec_Decode(PyObject *object,
498
                         const char *encoding,
499
                         const char *errors)
500
0
{
501
0
    PyObject *decoder;
502
503
0
    decoder = PyCodec_Decoder(encoding);
504
0
    if (decoder == NULL)
505
0
        return NULL;
506
507
0
    return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
508
0
}
509
510
/* Text encoding/decoding API */
511
PyObject * _PyCodec_LookupTextEncoding(const char *encoding,
512
                                       const char *alternate_command)
513
1.26M
{
514
1.26M
    PyObject *codec;
515
1.26M
    PyObject *attr;
516
1.26M
    int is_text_codec;
517
518
1.26M
    codec = _PyCodec_Lookup(encoding);
519
1.26M
    if (codec == NULL)
520
106k
        return NULL;
521
522
    /* Backwards compatibility: assume any raw tuple describes a text
523
     * encoding, and the same for anything lacking the private
524
     * attribute.
525
     */
526
1.15M
    if (!PyTuple_CheckExact(codec)) {
527
1.15M
        if (PyObject_GetOptionalAttr(codec, &_Py_ID(_is_text_encoding), &attr) < 0) {
528
0
            Py_DECREF(codec);
529
0
            return NULL;
530
0
        }
531
1.15M
        if (attr != NULL) {
532
1.15M
            is_text_codec = PyObject_IsTrue(attr);
533
1.15M
            Py_DECREF(attr);
534
1.15M
            if (is_text_codec <= 0) {
535
3.14k
                Py_DECREF(codec);
536
3.14k
                if (!is_text_codec) {
537
3.14k
                    if (alternate_command != NULL) {
538
3.14k
                        PyErr_Format(PyExc_LookupError,
539
3.14k
                                     "'%.400s' is not a text encoding; "
540
3.14k
                                     "use %s to handle arbitrary codecs",
541
3.14k
                                     encoding, alternate_command);
542
3.14k
                    }
543
0
                    else {
544
0
                        PyErr_Format(PyExc_LookupError,
545
0
                                     "'%.400s' is not a text encoding",
546
0
                                     encoding);
547
0
                    }
548
3.14k
                }
549
3.14k
                return NULL;
550
3.14k
            }
551
1.15M
        }
552
1.15M
    }
553
554
    /* This appears to be a valid text encoding */
555
1.15M
    return codec;
556
1.15M
}
557
558
559
static
560
PyObject *codec_getitem_checked(const char *encoding,
561
                                const char *alternate_command,
562
                                int index)
563
1.25M
{
564
1.25M
    PyObject *codec;
565
1.25M
    PyObject *v;
566
567
1.25M
    codec = _PyCodec_LookupTextEncoding(encoding, alternate_command);
568
1.25M
    if (codec == NULL)
569
109k
        return NULL;
570
571
1.15M
    v = Py_NewRef(PyTuple_GET_ITEM(codec, index));
572
1.15M
    Py_DECREF(codec);
573
1.15M
    return v;
574
1.25M
}
575
576
static PyObject * _PyCodec_TextEncoder(const char *encoding)
577
892k
{
578
892k
    return codec_getitem_checked(encoding, "codecs.encode()", 0);
579
892k
}
580
581
static PyObject * _PyCodec_TextDecoder(const char *encoding)
582
367k
{
583
367k
    return codec_getitem_checked(encoding, "codecs.decode()", 1);
584
367k
}
585
586
PyObject *_PyCodec_EncodeText(PyObject *object,
587
                              const char *encoding,
588
                              const char *errors)
589
892k
{
590
892k
    PyObject *encoder;
591
592
892k
    encoder = _PyCodec_TextEncoder(encoding);
593
892k
    if (encoder == NULL)
594
0
        return NULL;
595
596
892k
    return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
597
892k
}
598
599
PyObject *_PyCodec_DecodeText(PyObject *object,
600
                              const char *encoding,
601
                              const char *errors)
602
367k
{
603
367k
    PyObject *decoder;
604
605
367k
    decoder = _PyCodec_TextDecoder(encoding);
606
367k
    if (decoder == NULL)
607
109k
        return NULL;
608
609
257k
    return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
610
367k
}
611
612
/* Register the error handling callback function error under the name
613
   name. This function will be called by the codec when it encounters
614
   an unencodable characters/undecodable bytes and doesn't know the
615
   callback name, when name is specified as the error parameter
616
   in the call to the encode/decode function.
617
   Return 0 on success, -1 on error */
618
int PyCodec_RegisterError(const char *name, PyObject *error)
619
0
{
620
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
621
0
    assert(interp->codecs.initialized);
622
0
    if (!PyCallable_Check(error)) {
623
0
        PyErr_SetString(PyExc_TypeError, "handler must be callable");
624
0
        return -1;
625
0
    }
626
0
    return PyDict_SetItemString(interp->codecs.error_registry,
627
0
                                name, error);
628
0
}
629
630
int _PyCodec_UnregisterError(const char *name)
631
0
{
632
0
    for (size_t i = 0; i < Py_ARRAY_LENGTH(codecs_builtin_error_handlers); ++i) {
633
0
        if (strcmp(name, codecs_builtin_error_handlers[i]) == 0) {
634
0
            PyErr_Format(PyExc_ValueError,
635
0
                         "cannot un-register built-in error handler '%s'", name);
636
0
            return -1;
637
0
        }
638
0
    }
639
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
640
0
    assert(interp->codecs.initialized);
641
0
    return PyDict_PopString(interp->codecs.error_registry, name, NULL);
642
0
}
643
644
/* Lookup the error handling callback function registered under the
645
   name error. As a special case NULL can be passed, in which case
646
   the error handling callback for strict encoding will be returned. */
647
PyObject *PyCodec_LookupError(const char *name)
648
287k
{
649
287k
    PyInterpreterState *interp = _PyInterpreterState_GET();
650
287k
    assert(interp->codecs.initialized);
651
652
287k
    if (name==NULL)
653
200k
        name = "strict";
654
287k
    PyObject *handler;
655
287k
    if (PyDict_GetItemStringRef(interp->codecs.error_registry, name, &handler) < 0) {
656
0
        return NULL;
657
0
    }
658
287k
    if (handler == NULL) {
659
0
        PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
660
0
        return NULL;
661
0
    }
662
287k
    return handler;
663
287k
}
664
665
666
static inline void
667
wrong_exception_type(PyObject *exc)
668
0
{
669
0
    PyErr_Format(PyExc_TypeError,
670
0
                 "don't know how to handle %T in error callback", exc);
671
0
}
672
673
674
#define _PyIsUnicodeEncodeError(EXC)    \
675
276k
    PyObject_TypeCheck(EXC, (PyTypeObject *)PyExc_UnicodeEncodeError)
676
#define _PyIsUnicodeDecodeError(EXC)    \
677
267k
    PyObject_TypeCheck(EXC, (PyTypeObject *)PyExc_UnicodeDecodeError)
678
#define _PyIsUnicodeTranslateError(EXC) \
679
0
    PyObject_TypeCheck(EXC, (PyTypeObject *)PyExc_UnicodeTranslateError)
680
681
682
// --- codecs handlers: utilities ---------------------------------------------
683
684
/*
685
 * Return the number of characters (including special prefixes)
686
 * needed to represent 'ch' by codec_handler_write_unicode_hex().
687
 */
688
static inline Py_ssize_t
689
codec_handler_unicode_hex_width(Py_UCS4 ch)
690
0
{
691
0
    if (ch >= 0x10000) {
692
        // format: '\\' + 'U' + 8 hex digits
693
0
        return 1 + 1 + 8;
694
0
    }
695
0
    else if (ch >= 0x100) {
696
        // format: '\\' + 'u' + 4 hex digits
697
0
        return 1 + 1 + 4;
698
0
    }
699
0
    else {
700
        // format: '\\' + 'x' + 2 hex digits
701
0
        return 1 + 1 + 2;
702
0
    }
703
0
}
704
705
706
/*
707
 * Write the hexadecimal representation of 'ch' to the buffer pointed by 'p'
708
 * using 2, 4, or 8 characters prefixed by '\x', '\u', or '\U' respectively.
709
 */
710
static inline void
711
codec_handler_write_unicode_hex(Py_UCS1 **p, Py_UCS4 ch)
712
0
{
713
0
    *(*p)++ = '\\';
714
0
    if (ch >= 0x10000) {
715
0
        *(*p)++ = 'U';
716
0
        *(*p)++ = Py_hexdigits[(ch >> 28) & 0xf];
717
0
        *(*p)++ = Py_hexdigits[(ch >> 24) & 0xf];
718
0
        *(*p)++ = Py_hexdigits[(ch >> 20) & 0xf];
719
0
        *(*p)++ = Py_hexdigits[(ch >> 16) & 0xf];
720
0
        *(*p)++ = Py_hexdigits[(ch >> 12) & 0xf];
721
0
        *(*p)++ = Py_hexdigits[(ch >> 8) & 0xf];
722
0
    }
723
0
    else if (ch >= 0x100) {
724
0
        *(*p)++ = 'u';
725
0
        *(*p)++ = Py_hexdigits[(ch >> 12) & 0xf];
726
0
        *(*p)++ = Py_hexdigits[(ch >> 8) & 0xf];
727
0
    }
728
0
    else {
729
0
        *(*p)++ = 'x';
730
0
    }
731
0
    *(*p)++ = Py_hexdigits[(ch >> 4) & 0xf];
732
0
    *(*p)++ = Py_hexdigits[ch & 0xf];
733
0
}
734
735
736
/*
737
 * Determine the number of digits for a decimal representation of Unicode
738
 * codepoint 'ch' (by design, Unicode codepoints are limited to 7 digits).
739
 */
740
static inline int
741
n_decimal_digits_for_codepoint(Py_UCS4 ch)
742
0
{
743
0
    if (ch < 10) return 1;
744
0
    if (ch < 100) return 2;
745
0
    if (ch < 1000) return 3;
746
0
    if (ch < 10000) return 4;
747
0
    if (ch < 100000) return 5;
748
0
    if (ch < 1000000) return 6;
749
0
    if (ch < 10000000) return 7;
750
    // Unicode codepoints are limited to 1114111 (7 decimal digits)
751
0
    Py_UNREACHABLE();
752
0
}
753
754
755
/*
756
 * Create a Unicode string containing 'count' copies of the official
757
 * Unicode REPLACEMENT CHARACTER (0xFFFD).
758
 */
759
static PyObject *
760
codec_handler_unicode_replacement_character(Py_ssize_t count)
761
214k
{
762
214k
    PyObject *res = PyUnicode_New(count, Py_UNICODE_REPLACEMENT_CHARACTER);
763
214k
    if (res == NULL) {
764
0
        return NULL;
765
0
    }
766
214k
    assert(count == 0 || PyUnicode_KIND(res) == PyUnicode_2BYTE_KIND);
767
214k
    Py_UCS2 *outp = PyUnicode_2BYTE_DATA(res);
768
429k
    for (Py_ssize_t i = 0; i < count; ++i) {
769
214k
        outp[i] = Py_UNICODE_REPLACEMENT_CHARACTER;
770
214k
    }
771
214k
    assert(_PyUnicode_CheckConsistency(res, 1));
772
214k
    return res;
773
214k
}
774
775
776
// --- handler: 'strict' ------------------------------------------------------
777
778
PyObject *PyCodec_StrictErrors(PyObject *exc)
779
268k
{
780
268k
    if (PyExceptionInstance_Check(exc)) {
781
268k
        PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
782
268k
    }
783
0
    else {
784
0
        PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
785
0
    }
786
268k
    return NULL;
787
268k
}
788
789
790
// --- handler: 'ignore' ------------------------------------------------------
791
792
static PyObject *
793
_PyCodec_IgnoreError(PyObject *exc, int as_bytes)
794
0
{
795
0
    Py_ssize_t end;
796
0
    if (_PyUnicodeError_GetParams(exc, NULL, NULL, NULL,
797
0
                                  &end, NULL, as_bytes) < 0)
798
0
    {
799
0
        return NULL;
800
0
    }
801
0
    return Py_BuildValue("(Nn)", Py_GetConstant(Py_CONSTANT_EMPTY_STR), end);
802
0
}
803
804
805
PyObject *PyCodec_IgnoreErrors(PyObject *exc)
806
0
{
807
0
    if (_PyIsUnicodeEncodeError(exc) || _PyIsUnicodeTranslateError(exc)) {
808
0
        return _PyCodec_IgnoreError(exc, false);
809
0
    }
810
0
    else if (_PyIsUnicodeDecodeError(exc)) {
811
0
        return _PyCodec_IgnoreError(exc, true);
812
0
    }
813
0
    else {
814
0
        wrong_exception_type(exc);
815
0
        return NULL;
816
0
    }
817
0
}
818
819
820
// --- handler: 'replace' -----------------------------------------------------
821
822
static PyObject *
823
_PyCodec_ReplaceUnicodeEncodeError(PyObject *exc)
824
0
{
825
0
    Py_ssize_t start, end, slen;
826
0
    if (_PyUnicodeError_GetParams(exc, NULL, NULL,
827
0
                                  &start, &end, &slen, false) < 0)
828
0
    {
829
0
        return NULL;
830
0
    }
831
0
    PyObject *res = PyUnicode_New(slen, '?');
832
0
    if (res == NULL) {
833
0
        return NULL;
834
0
    }
835
0
    assert(PyUnicode_KIND(res) == PyUnicode_1BYTE_KIND);
836
0
    Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
837
0
    memset(outp, '?', sizeof(Py_UCS1) * slen);
838
0
    assert(_PyUnicode_CheckConsistency(res, 1));
839
0
    return Py_BuildValue("(Nn)", res, end);
840
0
}
841
842
843
static PyObject *
844
_PyCodec_ReplaceUnicodeDecodeError(PyObject *exc)
845
214k
{
846
214k
    Py_ssize_t end;
847
214k
    if (PyUnicodeDecodeError_GetEnd(exc, &end) < 0) {
848
0
        return NULL;
849
0
    }
850
214k
    PyObject *res = codec_handler_unicode_replacement_character(1);
851
214k
    if (res == NULL) {
852
0
        return NULL;
853
0
    }
854
214k
    return Py_BuildValue("(Nn)", res, end);
855
214k
}
856
857
858
static PyObject *
859
_PyCodec_ReplaceUnicodeTranslateError(PyObject *exc)
860
0
{
861
0
    Py_ssize_t start, end, slen;
862
0
    if (_PyUnicodeError_GetParams(exc, NULL, NULL,
863
0
                                  &start, &end, &slen, false) < 0)
864
0
    {
865
0
        return NULL;
866
0
    }
867
0
    PyObject *res = codec_handler_unicode_replacement_character(slen);
868
0
    if (res == NULL) {
869
0
        return NULL;
870
0
    }
871
0
    return Py_BuildValue("(Nn)", res, end);
872
0
}
873
874
875
PyObject *PyCodec_ReplaceErrors(PyObject *exc)
876
214k
{
877
214k
    if (_PyIsUnicodeEncodeError(exc)) {
878
0
        return _PyCodec_ReplaceUnicodeEncodeError(exc);
879
0
    }
880
214k
    else if (_PyIsUnicodeDecodeError(exc)) {
881
214k
        return _PyCodec_ReplaceUnicodeDecodeError(exc);
882
214k
    }
883
0
    else if (_PyIsUnicodeTranslateError(exc)) {
884
0
        return _PyCodec_ReplaceUnicodeTranslateError(exc);
885
0
    }
886
0
    else {
887
0
        wrong_exception_type(exc);
888
0
        return NULL;
889
0
    }
890
214k
}
891
892
893
// --- handler: 'xmlcharrefreplace' -------------------------------------------
894
895
PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
896
0
{
897
0
    if (!_PyIsUnicodeEncodeError(exc)) {
898
0
        wrong_exception_type(exc);
899
0
        return NULL;
900
0
    }
901
902
0
    PyObject *obj;
903
0
    Py_ssize_t objlen, start, end, slen;
904
0
    if (_PyUnicodeError_GetParams(exc,
905
0
                                  &obj, &objlen,
906
0
                                  &start, &end, &slen, false) < 0)
907
0
    {
908
0
        return NULL;
909
0
    }
910
911
    // The number of characters that each character 'ch' contributes
912
    // in the result is 2 + k + 1, where k = min{t >= 1 | 10^t > ch}
913
    // and will be formatted as "&#" + DIGITS + ";". Since the Unicode
914
    // range is below 10^7, each "block" requires at most 2 + 7 + 1
915
    // characters.
916
0
    if (slen > PY_SSIZE_T_MAX / (2 + 7 + 1)) {
917
0
        end = start + PY_SSIZE_T_MAX / (2 + 7 + 1);
918
0
        end = Py_MIN(end, objlen);
919
0
        slen = Py_MAX(0, end - start);
920
0
    }
921
922
0
    Py_ssize_t ressize = 0;
923
0
    for (Py_ssize_t i = start; i < end; ++i) {
924
0
        Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
925
0
        int k = n_decimal_digits_for_codepoint(ch);
926
0
        assert(k != 0);
927
0
        assert(k <= 7);
928
0
        ressize += 2 + k + 1;
929
0
    }
930
931
    /* allocate replacement */
932
0
    PyObject *res = PyUnicode_New(ressize, 127);
933
0
    if (res == NULL) {
934
0
        Py_DECREF(obj);
935
0
        return NULL;
936
0
    }
937
0
    Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
938
    /* generate replacement */
939
0
    for (Py_ssize_t i = start; i < end; ++i) {
940
0
        Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
941
        /*
942
         * Write the decimal representation of 'ch' to the buffer pointed by 'p'
943
         * using at most 7 characters prefixed by '&#' and suffixed by ';'.
944
         */
945
0
        *outp++ = '&';
946
0
        *outp++ = '#';
947
0
        Py_UCS1 *digit_end = outp + n_decimal_digits_for_codepoint(ch);
948
0
        for (Py_UCS1 *p_digit = digit_end - 1; p_digit >= outp; --p_digit) {
949
0
            *p_digit = '0' + (ch % 10);
950
0
            ch /= 10;
951
0
        }
952
0
        assert(ch == 0);
953
0
        outp = digit_end;
954
0
        *outp++ = ';';
955
0
    }
956
0
    assert(_PyUnicode_CheckConsistency(res, 1));
957
0
    PyObject *restuple = Py_BuildValue("(Nn)", res, end);
958
0
    Py_DECREF(obj);
959
0
    return restuple;
960
0
}
961
962
963
// --- handler: 'backslashreplace' --------------------------------------------
964
965
static PyObject *
966
_PyCodec_BackslashReplaceUnicodeEncodeError(PyObject *exc)
967
0
{
968
0
    PyObject *obj;
969
0
    Py_ssize_t objlen, start, end, slen;
970
0
    if (_PyUnicodeError_GetParams(exc,
971
0
                                  &obj, &objlen,
972
0
                                  &start, &end, &slen, false) < 0)
973
0
    {
974
0
        return NULL;
975
0
    }
976
977
    // The number of characters that each character 'ch' contributes
978
    // in the result is 1 + 1 + k, where k >= min{t >= 1 | 16^t > ch}
979
    // and will be formatted as "\\" + ('U'|'u'|'x') + HEXDIGITS,
980
    // where the number of hexdigits is either 2, 4, or 8 (not 6).
981
    // Since the Unicode range is below 10^7, we choose k = 8 whence
982
    // each "block" requires at most 1 + 1 + 8 characters.
983
0
    if (slen > PY_SSIZE_T_MAX / (1 + 1 + 8)) {
984
0
        end = start + PY_SSIZE_T_MAX / (1 + 1 + 8);
985
0
        end = Py_MIN(end, objlen);
986
0
        slen = Py_MAX(0, end - start);
987
0
    }
988
989
0
    Py_ssize_t ressize = 0;
990
0
    for (Py_ssize_t i = start; i < end; ++i) {
991
0
        Py_UCS4 c = PyUnicode_READ_CHAR(obj, i);
992
0
        ressize += codec_handler_unicode_hex_width(c);
993
0
    }
994
0
    PyObject *res = PyUnicode_New(ressize, 127);
995
0
    if (res == NULL) {
996
0
        Py_DECREF(obj);
997
0
        return NULL;
998
0
    }
999
0
    Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
1000
0
    for (Py_ssize_t i = start; i < end; ++i) {
1001
0
        Py_UCS4 c = PyUnicode_READ_CHAR(obj, i);
1002
0
        codec_handler_write_unicode_hex(&outp, c);
1003
0
    }
1004
0
    assert(_PyUnicode_CheckConsistency(res, 1));
1005
0
    Py_DECREF(obj);
1006
0
    return Py_BuildValue("(Nn)", res, end);
1007
0
}
1008
1009
1010
static PyObject *
1011
_PyCodec_BackslashReplaceUnicodeDecodeError(PyObject *exc)
1012
0
{
1013
0
    PyObject *obj;
1014
0
    Py_ssize_t objlen, start, end, slen;
1015
0
    if (_PyUnicodeError_GetParams(exc,
1016
0
                                  &obj, &objlen,
1017
0
                                  &start, &end, &slen, true) < 0)
1018
0
    {
1019
0
        return NULL;
1020
0
    }
1021
1022
0
    PyObject *res = PyUnicode_New(4 * slen, 127);
1023
0
    if (res == NULL) {
1024
0
        Py_DECREF(obj);
1025
0
        return NULL;
1026
0
    }
1027
1028
0
    Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
1029
0
    const unsigned char *p = (const unsigned char *)PyBytes_AS_STRING(obj);
1030
0
    for (Py_ssize_t i = start; i < end; i++, outp += 4) {
1031
0
        const unsigned char ch = p[i];
1032
0
        outp[0] = '\\';
1033
0
        outp[1] = 'x';
1034
0
        outp[2] = Py_hexdigits[(ch >> 4) & 0xf];
1035
0
        outp[3] = Py_hexdigits[ch & 0xf];
1036
0
    }
1037
0
    assert(_PyUnicode_CheckConsistency(res, 1));
1038
0
    Py_DECREF(obj);
1039
0
    return Py_BuildValue("(Nn)", res, end);
1040
0
}
1041
1042
1043
static inline PyObject *
1044
_PyCodec_BackslashReplaceUnicodeTranslateError(PyObject *exc)
1045
0
{
1046
    // Same implementation as for UnicodeEncodeError objects.
1047
0
    return _PyCodec_BackslashReplaceUnicodeEncodeError(exc);
1048
0
}
1049
1050
1051
PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
1052
0
{
1053
0
    if (_PyIsUnicodeEncodeError(exc)) {
1054
0
        return _PyCodec_BackslashReplaceUnicodeEncodeError(exc);
1055
0
    }
1056
0
    else if (_PyIsUnicodeDecodeError(exc)) {
1057
0
        return _PyCodec_BackslashReplaceUnicodeDecodeError(exc);
1058
0
    }
1059
0
    else if (_PyIsUnicodeTranslateError(exc)) {
1060
0
        return _PyCodec_BackslashReplaceUnicodeTranslateError(exc);
1061
0
    }
1062
0
    else {
1063
0
        wrong_exception_type(exc);
1064
0
        return NULL;
1065
0
    }
1066
0
}
1067
1068
1069
// --- handler: 'namereplace' -------------------------------------------------
1070
1071
PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
1072
0
{
1073
0
    if (!_PyIsUnicodeEncodeError(exc)) {
1074
0
        wrong_exception_type(exc);
1075
0
        return NULL;
1076
0
    }
1077
1078
0
    _PyUnicode_Name_CAPI *ucnhash_capi = _PyUnicode_GetNameCAPI();
1079
0
    if (ucnhash_capi == NULL) {
1080
0
        return NULL;
1081
0
    }
1082
1083
0
    PyObject *obj;
1084
0
    Py_ssize_t start, end;
1085
0
    if (_PyUnicodeError_GetParams(exc,
1086
0
                                  &obj, NULL,
1087
0
                                  &start, &end, NULL, false) < 0)
1088
0
    {
1089
0
        return NULL;
1090
0
    }
1091
1092
0
    char buffer[256]; /* NAME_MAXLEN in unicodename_db.h */
1093
0
    Py_ssize_t imax = start, ressize = 0, replsize;
1094
0
    for (; imax < end; ++imax) {
1095
0
        Py_UCS4 c = PyUnicode_READ_CHAR(obj, imax);
1096
0
        if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) {
1097
            // If 'c' is recognized by getname(), the corresponding replacement
1098
            // is '\\' + 'N' + '{' + NAME + '}', i.e. 1 + 1 + 1 + len(NAME) + 1
1099
            // characters. Failures of getname() are ignored by the handler.
1100
0
            replsize = 1 + 1 + 1 + strlen(buffer) + 1;
1101
0
        }
1102
0
        else {
1103
0
            replsize = codec_handler_unicode_hex_width(c);
1104
0
        }
1105
0
        if (ressize > PY_SSIZE_T_MAX - replsize) {
1106
0
            break;
1107
0
        }
1108
0
        ressize += replsize;
1109
0
    }
1110
1111
0
    PyObject *res = PyUnicode_New(ressize, 127);
1112
0
    if (res == NULL) {
1113
0
        Py_DECREF(obj);
1114
0
        return NULL;
1115
0
    }
1116
1117
0
    Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
1118
0
    for (Py_ssize_t i = start; i < imax; ++i) {
1119
0
        Py_UCS4 c = PyUnicode_READ_CHAR(obj, i);
1120
0
        if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) {
1121
0
            *outp++ = '\\';
1122
0
            *outp++ = 'N';
1123
0
            *outp++ = '{';
1124
0
            (void)strcpy((char *)outp, buffer);
1125
0
            outp += strlen(buffer);
1126
0
            *outp++ = '}';
1127
0
        }
1128
0
        else {
1129
0
            codec_handler_write_unicode_hex(&outp, c);
1130
0
        }
1131
0
    }
1132
1133
0
    assert(outp == PyUnicode_1BYTE_DATA(res) + ressize);
1134
0
    assert(_PyUnicode_CheckConsistency(res, 1));
1135
0
    PyObject *restuple = Py_BuildValue("(Nn)", res, imax);
1136
0
    Py_DECREF(obj);
1137
0
    return restuple;
1138
0
}
1139
1140
1141
0
#define ENC_UNKNOWN     -1
1142
0
#define ENC_UTF8        0
1143
0
#define ENC_UTF16BE     1
1144
0
#define ENC_UTF16LE     2
1145
0
#define ENC_UTF32BE     3
1146
0
#define ENC_UTF32LE     4
1147
1148
static int
1149
get_standard_encoding_impl(const char *encoding, int *bytelength)
1150
0
{
1151
0
    if (Py_TOLOWER(encoding[0]) == 'u' &&
1152
0
        Py_TOLOWER(encoding[1]) == 't' &&
1153
0
        Py_TOLOWER(encoding[2]) == 'f') {
1154
0
        encoding += 3;
1155
0
        if (*encoding == '-' || *encoding == '_' )
1156
0
            encoding++;
1157
0
        if (encoding[0] == '8' && encoding[1] == '\0') {
1158
0
            *bytelength = 3;
1159
0
            return ENC_UTF8;
1160
0
        }
1161
0
        else if (encoding[0] == '1' && encoding[1] == '6') {
1162
0
            encoding += 2;
1163
0
            *bytelength = 2;
1164
0
            if (*encoding == '\0') {
1165
#ifdef WORDS_BIGENDIAN
1166
                return ENC_UTF16BE;
1167
#else
1168
0
                return ENC_UTF16LE;
1169
0
#endif
1170
0
            }
1171
0
            if (*encoding == '-' || *encoding == '_' )
1172
0
                encoding++;
1173
0
            if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
1174
0
                if (Py_TOLOWER(encoding[0]) == 'b')
1175
0
                    return ENC_UTF16BE;
1176
0
                if (Py_TOLOWER(encoding[0]) == 'l')
1177
0
                    return ENC_UTF16LE;
1178
0
            }
1179
0
        }
1180
0
        else if (encoding[0] == '3' && encoding[1] == '2') {
1181
0
            encoding += 2;
1182
0
            *bytelength = 4;
1183
0
            if (*encoding == '\0') {
1184
#ifdef WORDS_BIGENDIAN
1185
                return ENC_UTF32BE;
1186
#else
1187
0
                return ENC_UTF32LE;
1188
0
#endif
1189
0
            }
1190
0
            if (*encoding == '-' || *encoding == '_' )
1191
0
                encoding++;
1192
0
            if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
1193
0
                if (Py_TOLOWER(encoding[0]) == 'b')
1194
0
                    return ENC_UTF32BE;
1195
0
                if (Py_TOLOWER(encoding[0]) == 'l')
1196
0
                    return ENC_UTF32LE;
1197
0
            }
1198
0
        }
1199
0
    }
1200
0
    else if (strcmp(encoding, "cp65001") == 0) {
1201
0
        *bytelength = 3;
1202
0
        return ENC_UTF8;
1203
0
    }
1204
0
    return ENC_UNKNOWN;
1205
0
}
1206
1207
1208
static int
1209
get_standard_encoding(PyObject *encoding, int *code, int *bytelength)
1210
0
{
1211
0
    const char *encoding_cstr = PyUnicode_AsUTF8(encoding);
1212
0
    if (encoding_cstr == NULL) {
1213
0
        return -1;
1214
0
    }
1215
0
    *code = get_standard_encoding_impl(encoding_cstr, bytelength);
1216
0
    return 0;
1217
0
}
1218
1219
1220
// --- handler: 'surrogatepass' -----------------------------------------------
1221
1222
static PyObject *
1223
_PyCodec_SurrogatePassUnicodeEncodeError(PyObject *exc)
1224
0
{
1225
0
    PyObject *encoding = PyUnicodeEncodeError_GetEncoding(exc);
1226
0
    if (encoding == NULL) {
1227
0
        return NULL;
1228
0
    }
1229
0
    int code, bytelength;
1230
0
    int rc = get_standard_encoding(encoding, &code, &bytelength);
1231
0
    Py_DECREF(encoding);
1232
0
    if (rc < 0) {
1233
0
        return NULL;
1234
0
    }
1235
0
    if (code == ENC_UNKNOWN) {
1236
0
        goto bail;
1237
0
    }
1238
1239
0
    PyObject *obj;
1240
0
    Py_ssize_t objlen, start, end, slen;
1241
0
    if (_PyUnicodeError_GetParams(exc,
1242
0
                                  &obj, &objlen,
1243
0
                                  &start, &end, &slen, false) < 0)
1244
0
    {
1245
0
        return NULL;
1246
0
    }
1247
1248
0
    if (slen > PY_SSIZE_T_MAX / bytelength) {
1249
0
        end = start + PY_SSIZE_T_MAX / bytelength;
1250
0
        end = Py_MIN(end, objlen);
1251
0
        slen = Py_MAX(0, end - start);
1252
0
    }
1253
1254
0
    PyObject *res = PyBytes_FromStringAndSize(NULL, bytelength * slen);
1255
0
    if (res == NULL) {
1256
0
        Py_DECREF(obj);
1257
0
        return NULL;
1258
0
    }
1259
1260
0
    unsigned char *outp = (unsigned char *)PyBytes_AsString(res);
1261
0
    for (Py_ssize_t i = start; i < end; i++) {
1262
0
        Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
1263
0
        if (!Py_UNICODE_IS_SURROGATE(ch)) {
1264
            /* Not a surrogate, fail with original exception */
1265
0
            Py_DECREF(obj);
1266
0
            Py_DECREF(res);
1267
0
            goto bail;
1268
0
        }
1269
0
        switch (code) {
1270
0
            case ENC_UTF8: {
1271
0
                *outp++ = (unsigned char)(0xe0 | (ch >> 12));
1272
0
                *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
1273
0
                *outp++ = (unsigned char)(0x80 | (ch & 0x3f));
1274
0
                break;
1275
0
            }
1276
0
            case ENC_UTF16LE: {
1277
0
                *outp++ = (unsigned char)ch;
1278
0
                *outp++ = (unsigned char)(ch >> 8);
1279
0
                break;
1280
0
            }
1281
0
            case ENC_UTF16BE: {
1282
0
                *outp++ = (unsigned char)(ch >> 8);
1283
0
                *outp++ = (unsigned char)ch;
1284
0
                break;
1285
0
            }
1286
0
            case ENC_UTF32LE: {
1287
0
                *outp++ = (unsigned char)ch;
1288
0
                *outp++ = (unsigned char)(ch >> 8);
1289
0
                *outp++ = (unsigned char)(ch >> 16);
1290
0
                *outp++ = (unsigned char)(ch >> 24);
1291
0
                break;
1292
0
            }
1293
0
            case ENC_UTF32BE: {
1294
0
                *outp++ = (unsigned char)(ch >> 24);
1295
0
                *outp++ = (unsigned char)(ch >> 16);
1296
0
                *outp++ = (unsigned char)(ch >> 8);
1297
0
                *outp++ = (unsigned char)ch;
1298
0
                break;
1299
0
            }
1300
0
        }
1301
0
    }
1302
1303
0
    Py_DECREF(obj);
1304
0
    PyObject *restuple = Py_BuildValue("(Nn)", res, end);
1305
0
    return restuple;
1306
1307
0
bail:
1308
0
    PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1309
0
    return NULL;
1310
0
}
1311
1312
1313
static PyObject *
1314
_PyCodec_SurrogatePassUnicodeDecodeError(PyObject *exc)
1315
0
{
1316
0
    PyObject *encoding = PyUnicodeDecodeError_GetEncoding(exc);
1317
0
    if (encoding == NULL) {
1318
0
        return NULL;
1319
0
    }
1320
0
    int code, bytelength;
1321
0
    int rc = get_standard_encoding(encoding, &code, &bytelength);
1322
0
    Py_DECREF(encoding);
1323
0
    if (rc < 0) {
1324
0
        return NULL;
1325
0
    }
1326
0
    if (code == ENC_UNKNOWN) {
1327
0
        goto bail;
1328
0
    }
1329
1330
0
    PyObject *obj;
1331
0
    Py_ssize_t objlen, start, end, slen;
1332
0
    if (_PyUnicodeError_GetParams(exc,
1333
0
                                  &obj, &objlen,
1334
0
                                  &start, &end, &slen, true) < 0)
1335
0
    {
1336
0
        return NULL;
1337
0
    }
1338
1339
    /* Try decoding a single surrogate character. If
1340
       there are more, let the codec call us again. */
1341
0
    Py_UCS4 ch = 0;
1342
0
    const unsigned char *p = (const unsigned char *)PyBytes_AS_STRING(obj);
1343
0
    p += start;
1344
1345
0
    if (objlen - start >= bytelength) {
1346
0
        switch (code) {
1347
0
            case ENC_UTF8: {
1348
0
                if ((p[0] & 0xf0) == 0xe0 &&
1349
0
                    (p[1] & 0xc0) == 0x80 &&
1350
0
                    (p[2] & 0xc0) == 0x80)
1351
0
                {
1352
                    /* it's a three-byte code */
1353
0
                    ch = ((p[0] & 0x0f) << 12) +
1354
0
                         ((p[1] & 0x3f) << 6)  +
1355
0
                          (p[2] & 0x3f);
1356
0
                }
1357
0
                break;
1358
0
            }
1359
0
            case ENC_UTF16LE: {
1360
0
                ch = p[1] << 8 | p[0];
1361
0
                break;
1362
0
            }
1363
0
            case ENC_UTF16BE: {
1364
0
                ch = p[0] << 8 | p[1];
1365
0
                break;
1366
0
            }
1367
0
            case ENC_UTF32LE: {
1368
0
                ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
1369
0
                break;
1370
0
            }
1371
0
            case ENC_UTF32BE: {
1372
0
                ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
1373
0
                break;
1374
0
            }
1375
0
        }
1376
0
    }
1377
0
    Py_DECREF(obj);
1378
0
    if (!Py_UNICODE_IS_SURROGATE(ch)) {
1379
0
        goto bail;
1380
0
    }
1381
1382
0
    PyObject *res = PyUnicode_FromOrdinal(ch);
1383
0
    if (res == NULL) {
1384
0
        return NULL;
1385
0
    }
1386
0
    return Py_BuildValue("(Nn)", res, start + bytelength);
1387
1388
0
bail:
1389
0
    PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1390
0
    return NULL;
1391
0
}
1392
1393
1394
/* This handler is declared static until someone demonstrates
1395
   a need to call it directly. */
1396
static PyObject *
1397
PyCodec_SurrogatePassErrors(PyObject *exc)
1398
0
{
1399
0
    if (_PyIsUnicodeEncodeError(exc)) {
1400
0
        return _PyCodec_SurrogatePassUnicodeEncodeError(exc);
1401
0
    }
1402
0
    else if (_PyIsUnicodeDecodeError(exc)) {
1403
0
        return _PyCodec_SurrogatePassUnicodeDecodeError(exc);
1404
0
    }
1405
0
    else {
1406
0
        wrong_exception_type(exc);
1407
0
        return NULL;
1408
0
    }
1409
0
}
1410
1411
1412
// --- handler: 'surrogateescape' ---------------------------------------------
1413
1414
static PyObject *
1415
_PyCodec_SurrogateEscapeUnicodeEncodeError(PyObject *exc)
1416
9.32k
{
1417
9.32k
    PyObject *obj;
1418
9.32k
    Py_ssize_t start, end, slen;
1419
9.32k
    if (_PyUnicodeError_GetParams(exc,
1420
9.32k
                                  &obj, NULL,
1421
9.32k
                                  &start, &end, &slen, false) < 0)
1422
0
    {
1423
0
        return NULL;
1424
0
    }
1425
1426
9.32k
    PyObject *res = PyBytes_FromStringAndSize(NULL, slen);
1427
9.32k
    if (res == NULL) {
1428
0
        Py_DECREF(obj);
1429
0
        return NULL;
1430
0
    }
1431
1432
9.32k
    char *outp = PyBytes_AsString(res);
1433
9.32k
    for (Py_ssize_t i = start; i < end; i++) {
1434
9.32k
        Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
1435
9.32k
        if (ch < 0xdc80 || ch > 0xdcff) {
1436
            /* Not a UTF-8b surrogate, fail with original exception. */
1437
9.32k
            Py_DECREF(obj);
1438
9.32k
            Py_DECREF(res);
1439
9.32k
            PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1440
9.32k
            return NULL;
1441
9.32k
        }
1442
0
        *outp++ = ch - 0xdc00;
1443
0
    }
1444
0
    Py_DECREF(obj);
1445
1446
0
    return Py_BuildValue("(Nn)", res, end);
1447
9.32k
}
1448
1449
1450
static PyObject *
1451
_PyCodec_SurrogateEscapeUnicodeDecodeError(PyObject *exc)
1452
52.4k
{
1453
52.4k
    PyObject *obj;
1454
52.4k
    Py_ssize_t start, end, slen;
1455
52.4k
    if (_PyUnicodeError_GetParams(exc,
1456
52.4k
                                  &obj, NULL,
1457
52.4k
                                  &start, &end, &slen, true) < 0)
1458
0
    {
1459
0
        return NULL;
1460
0
    }
1461
1462
52.4k
    Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */
1463
52.4k
    int consumed = 0;
1464
52.4k
    const unsigned char *p = (const unsigned char *)PyBytes_AS_STRING(obj);
1465
114k
    while (consumed < 4 && consumed < slen) {
1466
        /* Refuse to escape ASCII bytes. */
1467
93.0k
        if (p[start + consumed] < 128) {
1468
31.1k
            break;
1469
31.1k
        }
1470
61.8k
        ch[consumed] = 0xdc00 + p[start + consumed];
1471
61.8k
        consumed++;
1472
61.8k
    }
1473
52.4k
    Py_DECREF(obj);
1474
1475
52.4k
    if (consumed == 0) {
1476
        /* Codec complained about ASCII byte. */
1477
18.0k
        PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1478
18.0k
        return NULL;
1479
18.0k
    }
1480
1481
34.4k
    PyObject *str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed);
1482
34.4k
    if (str == NULL) {
1483
0
        return NULL;
1484
0
    }
1485
34.4k
    return Py_BuildValue("(Nn)", str, start + consumed);
1486
34.4k
}
1487
1488
1489
static PyObject *
1490
PyCodec_SurrogateEscapeErrors(PyObject *exc)
1491
61.7k
{
1492
61.7k
    if (_PyIsUnicodeEncodeError(exc)) {
1493
9.32k
        return _PyCodec_SurrogateEscapeUnicodeEncodeError(exc);
1494
9.32k
    }
1495
52.4k
    else if (_PyIsUnicodeDecodeError(exc)) {
1496
52.4k
        return _PyCodec_SurrogateEscapeUnicodeDecodeError(exc);
1497
52.4k
    }
1498
0
    else {
1499
0
        wrong_exception_type(exc);
1500
0
        return NULL;
1501
0
    }
1502
61.7k
}
1503
1504
1505
// --- Codecs registry handlers -----------------------------------------------
1506
1507
static inline PyObject *
1508
strict_errors(PyObject *Py_UNUSED(self), PyObject *exc)
1509
228k
{
1510
228k
    return PyCodec_StrictErrors(exc);
1511
228k
}
1512
1513
1514
static inline PyObject *
1515
ignore_errors(PyObject *Py_UNUSED(self), PyObject *exc)
1516
0
{
1517
0
    return PyCodec_IgnoreErrors(exc);
1518
0
}
1519
1520
1521
static inline PyObject *
1522
replace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
1523
214k
{
1524
214k
    return PyCodec_ReplaceErrors(exc);
1525
214k
}
1526
1527
1528
static inline PyObject *
1529
xmlcharrefreplace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
1530
0
{
1531
0
    return PyCodec_XMLCharRefReplaceErrors(exc);
1532
0
}
1533
1534
1535
static inline PyObject *
1536
backslashreplace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
1537
0
{
1538
0
    return PyCodec_BackslashReplaceErrors(exc);
1539
0
}
1540
1541
1542
static inline PyObject *
1543
namereplace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
1544
0
{
1545
0
    return PyCodec_NameReplaceErrors(exc);
1546
0
}
1547
1548
1549
static inline PyObject *
1550
surrogatepass_errors(PyObject *Py_UNUSED(self), PyObject *exc)
1551
0
{
1552
0
    return PyCodec_SurrogatePassErrors(exc);
1553
0
}
1554
1555
1556
static inline PyObject *
1557
surrogateescape_errors(PyObject *Py_UNUSED(self), PyObject *exc)
1558
61.7k
{
1559
61.7k
    return PyCodec_SurrogateEscapeErrors(exc);
1560
61.7k
}
1561
1562
1563
PyStatus
1564
_PyCodec_InitRegistry(PyInterpreterState *interp)
1565
16
{
1566
16
    static struct {
1567
16
        const char *name;
1568
16
        PyMethodDef def;
1569
16
    } methods[] =
1570
16
    {
1571
16
        {
1572
16
            "strict",
1573
16
            {
1574
16
                "strict_errors",
1575
16
                strict_errors,
1576
16
                METH_O,
1577
16
                PyDoc_STR("Implements the 'strict' error handling, which "
1578
16
                          "raises a UnicodeError on coding errors.")
1579
16
            }
1580
16
        },
1581
16
        {
1582
16
            "ignore",
1583
16
            {
1584
16
                "ignore_errors",
1585
16
                ignore_errors,
1586
16
                METH_O,
1587
16
                PyDoc_STR("Implements the 'ignore' error handling, which "
1588
16
                          "ignores malformed data and continues.")
1589
16
            }
1590
16
        },
1591
16
        {
1592
16
            "replace",
1593
16
            {
1594
16
                "replace_errors",
1595
16
                replace_errors,
1596
16
                METH_O,
1597
16
                PyDoc_STR("Implements the 'replace' error handling, which "
1598
16
                          "replaces malformed data with a replacement marker.")
1599
16
            }
1600
16
        },
1601
16
        {
1602
16
            "xmlcharrefreplace",
1603
16
            {
1604
16
                "xmlcharrefreplace_errors",
1605
16
                xmlcharrefreplace_errors,
1606
16
                METH_O,
1607
16
                PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
1608
16
                          "which replaces an unencodable character with the "
1609
16
                          "appropriate XML character reference.")
1610
16
            }
1611
16
        },
1612
16
        {
1613
16
            "backslashreplace",
1614
16
            {
1615
16
                "backslashreplace_errors",
1616
16
                backslashreplace_errors,
1617
16
                METH_O,
1618
16
                PyDoc_STR("Implements the 'backslashreplace' error handling, "
1619
16
                          "which replaces malformed data with a backslashed "
1620
16
                          "escape sequence.")
1621
16
            }
1622
16
        },
1623
16
        {
1624
16
            "namereplace",
1625
16
            {
1626
16
                "namereplace_errors",
1627
16
                namereplace_errors,
1628
16
                METH_O,
1629
16
                PyDoc_STR("Implements the 'namereplace' error handling, "
1630
16
                          "which replaces an unencodable character with a "
1631
16
                          "\\N{...} escape sequence.")
1632
16
            }
1633
16
        },
1634
16
        {
1635
16
            "surrogatepass",
1636
16
            {
1637
16
                "surrogatepass",
1638
16
                surrogatepass_errors,
1639
16
                METH_O
1640
16
            }
1641
16
        },
1642
16
        {
1643
16
            "surrogateescape",
1644
16
            {
1645
16
                "surrogateescape",
1646
16
                surrogateescape_errors,
1647
16
                METH_O
1648
16
            }
1649
16
        }
1650
16
    };
1651
    // ensure that the built-in error handlers' names are kept in sync
1652
16
    assert(Py_ARRAY_LENGTH(methods) == Py_ARRAY_LENGTH(codecs_builtin_error_handlers));
1653
1654
16
    assert(interp->codecs.initialized == 0);
1655
16
    interp->codecs.search_path = PyList_New(0);
1656
16
    if (interp->codecs.search_path == NULL) {
1657
0
        return PyStatus_NoMemory();
1658
0
    }
1659
16
    interp->codecs.search_cache = PyDict_New();
1660
16
    if (interp->codecs.search_cache == NULL) {
1661
0
        return PyStatus_NoMemory();
1662
0
    }
1663
16
    interp->codecs.error_registry = PyDict_New();
1664
16
    if (interp->codecs.error_registry == NULL) {
1665
0
        return PyStatus_NoMemory();
1666
0
    }
1667
144
    for (size_t i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
1668
128
        PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL);
1669
128
        if (func == NULL) {
1670
0
            return PyStatus_NoMemory();
1671
0
        }
1672
1673
128
        int res = PyDict_SetItemString(interp->codecs.error_registry,
1674
128
                                       methods[i].name, func);
1675
128
        Py_DECREF(func);
1676
128
        if (res < 0) {
1677
0
            return PyStatus_Error("Failed to insert into codec error registry");
1678
0
        }
1679
128
    }
1680
1681
16
    interp->codecs.initialized = 1;
1682
1683
    // Importing `encodings' will call back into this module to register codec
1684
    // search functions, so this is done after everything else is initialized.
1685
16
    PyObject *mod = PyImport_ImportModule("encodings");
1686
16
    if (mod == NULL) {
1687
0
        return PyStatus_Error("Failed to import encodings module");
1688
0
    }
1689
16
    Py_DECREF(mod);
1690
1691
16
    return PyStatus_Ok();
1692
16
}
1693
1694
void
1695
_PyCodec_Fini(PyInterpreterState *interp)
1696
0
{
1697
0
    Py_CLEAR(interp->codecs.search_path);
1698
0
    Py_CLEAR(interp->codecs.search_cache);
1699
0
    Py_CLEAR(interp->codecs.error_registry);
1700
0
    interp->codecs.initialized = 0;
1701
0
}