Coverage Report

Created: 2025-09-05 07:10

/src/cpython/Python/codecs.c
Line
Count
Source (jump to first uncovered line)
1
/* ------------------------------------------------------------------------
2
3
   Python Codec Registry and support functions
4
5
Written by Marc-Andre Lemburg (mal@lemburg.com).
6
7
Copyright (c) Corporation for National Research Initiatives.
8
9
   ------------------------------------------------------------------------ */
10
11
#include "Python.h"
12
#include "pycore_call.h"          // _PyObject_CallNoArgs()
13
#include "pycore_interp.h"        // PyInterpreterState.codec_search_path
14
#include "pycore_pyerrors.h"      // _PyErr_FormatNote()
15
#include "pycore_pystate.h"       // _PyInterpreterState_GET()
16
#include "pycore_runtime.h"       // _Py_ID()
17
#include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
18
#include "pycore_unicodeobject.h" // _PyUnicode_InternMortal()
19
#include "pycore_pyatomic_ft_wrappers.h"
20
21
static const char *codecs_builtin_error_handlers[] = {
22
    "strict", "ignore", "replace",
23
    "xmlcharrefreplace", "backslashreplace", "namereplace",
24
    "surrogatepass", "surrogateescape",
25
};
26
27
const char *Py_hexdigits = "0123456789abcdef";
28
29
/* --- Codec Registry ----------------------------------------------------- */
30
31
int PyCodec_Register(PyObject *search_function)
32
16
{
33
16
    PyInterpreterState *interp = _PyInterpreterState_GET();
34
16
    assert(interp->codecs.initialized);
35
16
    if (search_function == NULL) {
36
0
        PyErr_BadArgument();
37
0
        goto onError;
38
0
    }
39
16
    if (!PyCallable_Check(search_function)) {
40
0
        PyErr_SetString(PyExc_TypeError, "argument must be callable");
41
0
        goto onError;
42
0
    }
43
16
    FT_MUTEX_LOCK(&interp->codecs.search_path_mutex);
44
16
    int ret = PyList_Append(interp->codecs.search_path, search_function);
45
16
    FT_MUTEX_UNLOCK(&interp->codecs.search_path_mutex);
46
47
16
    return ret;
48
49
0
 onError:
50
0
    return -1;
51
16
}
52
53
int
54
PyCodec_Unregister(PyObject *search_function)
55
0
{
56
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
57
0
    if (interp->codecs.initialized != 1) {
58
        /* Do nothing if codecs state was cleared (only possible during
59
           interpreter shutdown). */
60
0
        return 0;
61
0
    }
62
63
0
    PyObject *codec_search_path = interp->codecs.search_path;
64
0
    assert(PyList_CheckExact(codec_search_path));
65
0
    for (Py_ssize_t i = 0; i < PyList_GET_SIZE(codec_search_path); i++) {
66
0
        FT_MUTEX_LOCK(&interp->codecs.search_path_mutex);
67
0
        PyObject *item = PyList_GetItemRef(codec_search_path, i);
68
0
        int ret = 1;
69
0
        if (item == search_function) {
70
            // We hold a reference to the item, so its destructor can't run
71
            // while we hold search_path_mutex.
72
0
            ret = PyList_SetSlice(codec_search_path, i, i+1, NULL);
73
0
        }
74
0
        FT_MUTEX_UNLOCK(&interp->codecs.search_path_mutex);
75
0
        Py_DECREF(item);
76
0
        if (ret != 1) {
77
0
            assert(interp->codecs.search_cache != NULL);
78
0
            assert(PyDict_CheckExact(interp->codecs.search_cache));
79
0
            PyDict_Clear(interp->codecs.search_cache);
80
0
            return ret;
81
0
        }
82
0
    }
83
0
    return 0;
84
0
}
85
86
extern int _Py_normalize_encoding(const char *, char *, size_t);
87
88
/* Convert a string to a normalized Python string(decoded from UTF-8): all characters are
89
   converted to lower case, spaces and hyphens are replaced with underscores. */
90
91
static
92
PyObject *normalizestring(const char *string)
93
1.06M
{
94
1.06M
    size_t len = strlen(string);
95
1.06M
    char *encoding;
96
1.06M
    PyObject *v;
97
98
1.06M
    if (len > PY_SSIZE_T_MAX) {
99
0
        PyErr_SetString(PyExc_OverflowError, "string is too large");
100
0
        return NULL;
101
0
    }
102
103
1.06M
    encoding = PyMem_Malloc(len + 1);
104
1.06M
    if (encoding == NULL)
105
0
        return PyErr_NoMemory();
106
107
1.06M
    if (!_Py_normalize_encoding(string, encoding, len + 1))
108
0
    {
109
0
        PyErr_SetString(PyExc_RuntimeError, "_Py_normalize_encoding() failed");
110
0
        PyMem_Free(encoding);
111
0
        return NULL;
112
0
    }
113
114
1.06M
    v = PyUnicode_FromString(encoding);
115
1.06M
    PyMem_Free(encoding);
116
1.06M
    return v;
117
1.06M
}
118
119
/* Lookup the given encoding and return a tuple providing the codec
120
   facilities.
121
122
   The encoding string is looked up converted to all lower-case
123
   characters. This makes encodings looked up through this mechanism
124
   effectively case-insensitive.
125
126
   If no codec is found, a LookupError is set and NULL returned.
127
128
   As side effect, this tries to load the encodings package, if not
129
   yet done. This is part of the lazy load strategy for the encodings
130
   package.
131
132
*/
133
134
PyObject *_PyCodec_Lookup(const char *encoding)
135
1.06M
{
136
1.06M
    if (encoding == NULL) {
137
0
        PyErr_BadArgument();
138
0
        return NULL;
139
0
    }
140
141
1.06M
    PyInterpreterState *interp = _PyInterpreterState_GET();
142
1.06M
    assert(interp->codecs.initialized);
143
144
    /* Convert the encoding to a normalized Python string: all
145
       characters are converted to lower case, spaces and hyphens are
146
       replaced with underscores. */
147
1.06M
    PyObject *v = normalizestring(encoding);
148
1.06M
    if (v == NULL) {
149
0
        return NULL;
150
0
    }
151
152
    /* Intern the string. We'll make it immortal later if lookup succeeds. */
153
1.06M
    _PyUnicode_InternMortal(interp, &v);
154
155
    /* First, try to lookup the name in the registry dictionary */
156
1.06M
    PyObject *result;
157
1.06M
    if (PyDict_GetItemRef(interp->codecs.search_cache, v, &result) < 0) {
158
0
        goto onError;
159
0
    }
160
1.06M
    if (result != NULL) {
161
977k
        Py_DECREF(v);
162
977k
        return result;
163
977k
    }
164
165
    /* Next, scan the search functions in order of registration */
166
91.9k
    const Py_ssize_t len = PyList_Size(interp->codecs.search_path);
167
91.9k
    if (len < 0)
168
0
        goto onError;
169
91.9k
    if (len == 0) {
170
0
        PyErr_SetString(PyExc_LookupError,
171
0
                        "no codec search functions registered: "
172
0
                        "can't find encoding");
173
0
        goto onError;
174
0
    }
175
176
91.9k
    Py_ssize_t i;
177
183k
    for (i = 0; i < len; i++) {
178
91.9k
        PyObject *func;
179
180
91.9k
        func = PyList_GetItemRef(interp->codecs.search_path, i);
181
91.9k
        if (func == NULL)
182
0
            goto onError;
183
91.9k
        result = PyObject_CallOneArg(func, v);
184
91.9k
        Py_DECREF(func);
185
91.9k
        if (result == NULL)
186
0
            goto onError;
187
91.9k
        if (result == Py_None) {
188
91.5k
            Py_CLEAR(result);
189
91.5k
            continue;
190
91.5k
        }
191
440
        if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
192
0
            PyErr_SetString(PyExc_TypeError,
193
0
                            "codec search functions must return 4-tuples");
194
0
            Py_DECREF(result);
195
0
            goto onError;
196
0
        }
197
440
        break;
198
440
    }
199
91.9k
    if (result == NULL) {
200
        /* XXX Perhaps we should cache misses too ? */
201
91.5k
        PyErr_Format(PyExc_LookupError,
202
91.5k
                     "unknown encoding: %s", encoding);
203
91.5k
        goto onError;
204
91.5k
    }
205
206
440
    _PyUnicode_InternImmortal(interp, &v);
207
208
    /* Cache and return the result */
209
440
    if (PyDict_SetItem(interp->codecs.search_cache, v, result) < 0) {
210
0
        Py_DECREF(result);
211
0
        goto onError;
212
0
    }
213
440
    Py_DECREF(v);
214
440
    return result;
215
216
91.5k
 onError:
217
91.5k
    Py_DECREF(v);
218
91.5k
    return NULL;
219
440
}
220
221
/* Codec registry encoding check API. */
222
223
int PyCodec_KnownEncoding(const char *encoding)
224
0
{
225
0
    PyObject *codecs;
226
227
0
    codecs = _PyCodec_Lookup(encoding);
228
0
    if (!codecs) {
229
0
        PyErr_Clear();
230
0
        return 0;
231
0
    }
232
0
    else {
233
0
        Py_DECREF(codecs);
234
0
        return 1;
235
0
    }
236
0
}
237
238
static
239
PyObject *args_tuple(PyObject *object,
240
                     const char *errors)
241
974k
{
242
974k
    PyObject *args;
243
244
974k
    args = PyTuple_New(1 + (errors != NULL));
245
974k
    if (args == NULL)
246
0
        return NULL;
247
974k
    PyTuple_SET_ITEM(args, 0, Py_NewRef(object));
248
974k
    if (errors) {
249
171k
        PyObject *v;
250
251
171k
        v = PyUnicode_FromString(errors);
252
171k
        if (v == NULL) {
253
0
            Py_DECREF(args);
254
0
            return NULL;
255
0
        }
256
171k
        PyTuple_SET_ITEM(args, 1, v);
257
171k
    }
258
974k
    return args;
259
974k
}
260
261
/* Helper function to get a codec item */
262
263
static
264
PyObject *codec_getitem(const char *encoding, int index)
265
0
{
266
0
    PyObject *codecs;
267
0
    PyObject *v;
268
269
0
    codecs = _PyCodec_Lookup(encoding);
270
0
    if (codecs == NULL)
271
0
        return NULL;
272
0
    v = PyTuple_GET_ITEM(codecs, index);
273
0
    Py_DECREF(codecs);
274
0
    return Py_NewRef(v);
275
0
}
276
277
/* Helper functions to create an incremental codec. */
278
static
279
PyObject *codec_makeincrementalcodec(PyObject *codec_info,
280
                                     const char *errors,
281
                                     const char *attrname)
282
48
{
283
48
    PyObject *ret, *inccodec;
284
285
48
    inccodec = PyObject_GetAttrString(codec_info, attrname);
286
48
    if (inccodec == NULL)
287
0
        return NULL;
288
48
    if (errors)
289
48
        ret = PyObject_CallFunction(inccodec, "s", errors);
290
0
    else
291
0
        ret = _PyObject_CallNoArgs(inccodec);
292
48
    Py_DECREF(inccodec);
293
48
    return ret;
294
48
}
295
296
static
297
PyObject *codec_getincrementalcodec(const char *encoding,
298
                                    const char *errors,
299
                                    const char *attrname)
300
0
{
301
0
    PyObject *codec_info, *ret;
302
303
0
    codec_info = _PyCodec_Lookup(encoding);
304
0
    if (codec_info == NULL)
305
0
        return NULL;
306
0
    ret = codec_makeincrementalcodec(codec_info, errors, attrname);
307
0
    Py_DECREF(codec_info);
308
0
    return ret;
309
0
}
310
311
/* Helper function to create a stream codec. */
312
313
static
314
PyObject *codec_getstreamcodec(const char *encoding,
315
                               PyObject *stream,
316
                               const char *errors,
317
                               const int index)
318
0
{
319
0
    PyObject *codecs, *streamcodec, *codeccls;
320
321
0
    codecs = _PyCodec_Lookup(encoding);
322
0
    if (codecs == NULL)
323
0
        return NULL;
324
325
0
    codeccls = PyTuple_GET_ITEM(codecs, index);
326
0
    if (errors != NULL)
327
0
        streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
328
0
    else
329
0
        streamcodec = PyObject_CallOneArg(codeccls, stream);
330
0
    Py_DECREF(codecs);
331
0
    return streamcodec;
332
0
}
333
334
/* Helpers to work with the result of _PyCodec_Lookup
335
336
 */
337
PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info,
338
                                             const char *errors)
339
16
{
340
16
    return codec_makeincrementalcodec(codec_info, errors,
341
16
                                      "incrementaldecoder");
342
16
}
343
344
PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info,
345
                                             const char *errors)
346
32
{
347
32
    return codec_makeincrementalcodec(codec_info, errors,
348
32
                                      "incrementalencoder");
349
32
}
350
351
352
/* Convenience APIs to query the Codec registry.
353
354
   All APIs return a codec object with incremented refcount.
355
356
 */
357
358
PyObject *PyCodec_Encoder(const char *encoding)
359
0
{
360
0
    return codec_getitem(encoding, 0);
361
0
}
362
363
PyObject *PyCodec_Decoder(const char *encoding)
364
0
{
365
0
    return codec_getitem(encoding, 1);
366
0
}
367
368
PyObject *PyCodec_IncrementalEncoder(const char *encoding,
369
                                     const char *errors)
370
0
{
371
0
    return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
372
0
}
373
374
PyObject *PyCodec_IncrementalDecoder(const char *encoding,
375
                                     const char *errors)
376
0
{
377
0
    return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
378
0
}
379
380
PyObject *PyCodec_StreamReader(const char *encoding,
381
                               PyObject *stream,
382
                               const char *errors)
383
0
{
384
0
    return codec_getstreamcodec(encoding, stream, errors, 2);
385
0
}
386
387
PyObject *PyCodec_StreamWriter(const char *encoding,
388
                               PyObject *stream,
389
                               const char *errors)
390
0
{
391
0
    return codec_getstreamcodec(encoding, stream, errors, 3);
392
0
}
393
394
/* Encode an object (e.g. a Unicode object) using the given encoding
395
   and return the resulting encoded object (usually a Python string).
396
397
   errors is passed to the encoder factory as argument if non-NULL. */
398
399
static PyObject *
400
_PyCodec_EncodeInternal(PyObject *object,
401
                        PyObject *encoder,
402
                        const char *encoding,
403
                        const char *errors)
404
737k
{
405
737k
    PyObject *args = NULL, *result = NULL;
406
737k
    PyObject *v = NULL;
407
408
737k
    args = args_tuple(object, errors);
409
737k
    if (args == NULL)
410
0
        goto onError;
411
412
737k
    result = PyObject_Call(encoder, args, NULL);
413
737k
    if (result == NULL) {
414
0
        _PyErr_FormatNote("%s with '%s' codec failed", "encoding", encoding);
415
0
        goto onError;
416
0
    }
417
418
737k
    if (!PyTuple_Check(result) ||
419
737k
        PyTuple_GET_SIZE(result) != 2) {
420
0
        PyErr_SetString(PyExc_TypeError,
421
0
                        "encoder must return a tuple (object, integer)");
422
0
        goto onError;
423
0
    }
424
737k
    v = Py_NewRef(PyTuple_GET_ITEM(result,0));
425
    /* We don't check or use the second (integer) entry. */
426
427
737k
    Py_DECREF(args);
428
737k
    Py_DECREF(encoder);
429
737k
    Py_DECREF(result);
430
737k
    return v;
431
432
0
 onError:
433
0
    Py_XDECREF(result);
434
0
    Py_XDECREF(args);
435
0
    Py_XDECREF(encoder);
436
0
    return NULL;
437
737k
}
438
439
/* Decode an object (usually a Python string) using the given encoding
440
   and return an equivalent object (e.g. a Unicode object).
441
442
   errors is passed to the decoder factory as argument if non-NULL. */
443
444
static PyObject *
445
_PyCodec_DecodeInternal(PyObject *object,
446
                        PyObject *decoder,
447
                        const char *encoding,
448
                        const char *errors)
449
237k
{
450
237k
    PyObject *args = NULL, *result = NULL;
451
237k
    PyObject *v;
452
453
237k
    args = args_tuple(object, errors);
454
237k
    if (args == NULL)
455
0
        goto onError;
456
457
237k
    result = PyObject_Call(decoder, args, NULL);
458
237k
    if (result == NULL) {
459
43.7k
        _PyErr_FormatNote("%s with '%s' codec failed", "decoding", encoding);
460
43.7k
        goto onError;
461
43.7k
    }
462
193k
    if (!PyTuple_Check(result) ||
463
193k
        PyTuple_GET_SIZE(result) != 2) {
464
0
        PyErr_SetString(PyExc_TypeError,
465
0
                        "decoder must return a tuple (object,integer)");
466
0
        goto onError;
467
0
    }
468
193k
    v = Py_NewRef(PyTuple_GET_ITEM(result,0));
469
    /* We don't check or use the second (integer) entry. */
470
471
193k
    Py_DECREF(args);
472
193k
    Py_DECREF(decoder);
473
193k
    Py_DECREF(result);
474
193k
    return v;
475
476
43.7k
 onError:
477
43.7k
    Py_XDECREF(args);
478
43.7k
    Py_XDECREF(decoder);
479
43.7k
    Py_XDECREF(result);
480
43.7k
    return NULL;
481
193k
}
482
483
/* Generic encoding/decoding API */
484
PyObject *PyCodec_Encode(PyObject *object,
485
                         const char *encoding,
486
                         const char *errors)
487
0
{
488
0
    PyObject *encoder;
489
490
0
    encoder = PyCodec_Encoder(encoding);
491
0
    if (encoder == NULL)
492
0
        return NULL;
493
494
0
    return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
495
0
}
496
497
PyObject *PyCodec_Decode(PyObject *object,
498
                         const char *encoding,
499
                         const char *errors)
500
0
{
501
0
    PyObject *decoder;
502
503
0
    decoder = PyCodec_Decoder(encoding);
504
0
    if (decoder == NULL)
505
0
        return NULL;
506
507
0
    return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
508
0
}
509
510
/* Text encoding/decoding API */
511
PyObject * _PyCodec_LookupTextEncoding(const char *encoding,
512
                                       const char *alternate_command)
513
1.06M
{
514
1.06M
    PyObject *codec;
515
1.06M
    PyObject *attr;
516
1.06M
    int is_text_codec;
517
518
1.06M
    codec = _PyCodec_Lookup(encoding);
519
1.06M
    if (codec == NULL)
520
91.5k
        return NULL;
521
522
    /* Backwards compatibility: assume any raw tuple describes a text
523
     * encoding, and the same for anything lacking the private
524
     * attribute.
525
     */
526
977k
    if (!PyTuple_CheckExact(codec)) {
527
977k
        if (PyObject_GetOptionalAttr(codec, &_Py_ID(_is_text_encoding), &attr) < 0) {
528
0
            Py_DECREF(codec);
529
0
            return NULL;
530
0
        }
531
977k
        if (attr != NULL) {
532
977k
            is_text_codec = PyObject_IsTrue(attr);
533
977k
            Py_DECREF(attr);
534
977k
            if (is_text_codec <= 0) {
535
3.13k
                Py_DECREF(codec);
536
3.13k
                if (!is_text_codec) {
537
3.13k
                    if (alternate_command != NULL) {
538
3.13k
                        PyErr_Format(PyExc_LookupError,
539
3.13k
                                     "'%.400s' is not a text encoding; "
540
3.13k
                                     "use %s to handle arbitrary codecs",
541
3.13k
                                     encoding, alternate_command);
542
3.13k
                    }
543
0
                    else {
544
0
                        PyErr_Format(PyExc_LookupError,
545
0
                                     "'%.400s' is not a text encoding",
546
0
                                     encoding);
547
0
                    }
548
3.13k
                }
549
3.13k
                return NULL;
550
3.13k
            }
551
977k
        }
552
977k
    }
553
554
    /* This appears to be a valid text encoding */
555
974k
    return codec;
556
977k
}
557
558
559
static
560
PyObject *codec_getitem_checked(const char *encoding,
561
                                const char *alternate_command,
562
                                int index)
563
1.06M
{
564
1.06M
    PyObject *codec;
565
1.06M
    PyObject *v;
566
567
1.06M
    codec = _PyCodec_LookupTextEncoding(encoding, alternate_command);
568
1.06M
    if (codec == NULL)
569
94.6k
        return NULL;
570
571
974k
    v = Py_NewRef(PyTuple_GET_ITEM(codec, index));
572
974k
    Py_DECREF(codec);
573
974k
    return v;
574
1.06M
}
575
576
static PyObject * _PyCodec_TextEncoder(const char *encoding)
577
737k
{
578
737k
    return codec_getitem_checked(encoding, "codecs.encode()", 0);
579
737k
}
580
581
static PyObject * _PyCodec_TextDecoder(const char *encoding)
582
332k
{
583
332k
    return codec_getitem_checked(encoding, "codecs.decode()", 1);
584
332k
}
585
586
PyObject *_PyCodec_EncodeText(PyObject *object,
587
                              const char *encoding,
588
                              const char *errors)
589
737k
{
590
737k
    PyObject *encoder;
591
592
737k
    encoder = _PyCodec_TextEncoder(encoding);
593
737k
    if (encoder == NULL)
594
0
        return NULL;
595
596
737k
    return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
597
737k
}
598
599
PyObject *_PyCodec_DecodeText(PyObject *object,
600
                              const char *encoding,
601
                              const char *errors)
602
332k
{
603
332k
    PyObject *decoder;
604
605
332k
    decoder = _PyCodec_TextDecoder(encoding);
606
332k
    if (decoder == NULL)
607
94.6k
        return NULL;
608
609
237k
    return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
610
332k
}
611
612
/* Register the error handling callback function error under the name
613
   name. This function will be called by the codec when it encounters
614
   an unencodable characters/undecodable bytes and doesn't know the
615
   callback name, when name is specified as the error parameter
616
   in the call to the encode/decode function.
617
   Return 0 on success, -1 on error */
618
int PyCodec_RegisterError(const char *name, PyObject *error)
619
0
{
620
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
621
0
    assert(interp->codecs.initialized);
622
0
    if (!PyCallable_Check(error)) {
623
0
        PyErr_SetString(PyExc_TypeError, "handler must be callable");
624
0
        return -1;
625
0
    }
626
0
    return PyDict_SetItemString(interp->codecs.error_registry,
627
0
                                name, error);
628
0
}
629
630
int _PyCodec_UnregisterError(const char *name)
631
0
{
632
0
    for (size_t i = 0; i < Py_ARRAY_LENGTH(codecs_builtin_error_handlers); ++i) {
633
0
        if (strcmp(name, codecs_builtin_error_handlers[i]) == 0) {
634
0
            PyErr_Format(PyExc_ValueError,
635
0
                         "cannot un-register built-in error handler '%s'", name);
636
0
            return -1;
637
0
        }
638
0
    }
639
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
640
0
    assert(interp->codecs.initialized);
641
0
    return PyDict_PopString(interp->codecs.error_registry, name, NULL);
642
0
}
643
644
/* Lookup the error handling callback function registered under the
645
   name error. As a special case NULL can be passed, in which case
646
   the error handling callback for strict encoding will be returned. */
647
PyObject *PyCodec_LookupError(const char *name)
648
243k
{
649
243k
    PyInterpreterState *interp = _PyInterpreterState_GET();
650
243k
    assert(interp->codecs.initialized);
651
652
243k
    if (name==NULL)
653
168k
        name = "strict";
654
243k
    PyObject *handler;
655
243k
    if (PyDict_GetItemStringRef(interp->codecs.error_registry, name, &handler) < 0) {
656
0
        return NULL;
657
0
    }
658
243k
    if (handler == NULL) {
659
0
        PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
660
0
        return NULL;
661
0
    }
662
243k
    return handler;
663
243k
}
664
665
666
static inline void
667
wrong_exception_type(PyObject *exc)
668
0
{
669
0
    PyErr_Format(PyExc_TypeError,
670
0
                 "don't know how to handle %T in error callback", exc);
671
0
}
672
673
674
#define _PyIsUnicodeEncodeError(EXC)    \
675
258k
    PyObject_TypeCheck(EXC, (PyTypeObject *)PyExc_UnicodeEncodeError)
676
#define _PyIsUnicodeDecodeError(EXC)    \
677
249k
    PyObject_TypeCheck(EXC, (PyTypeObject *)PyExc_UnicodeDecodeError)
678
#define _PyIsUnicodeTranslateError(EXC) \
679
0
    PyObject_TypeCheck(EXC, (PyTypeObject *)PyExc_UnicodeTranslateError)
680
681
682
// --- codecs handlers: utilities ---------------------------------------------
683
684
/*
685
 * Return the number of characters (including special prefixes)
686
 * needed to represent 'ch' by codec_handler_write_unicode_hex().
687
 */
688
static inline Py_ssize_t
689
codec_handler_unicode_hex_width(Py_UCS4 ch)
690
0
{
691
0
    if (ch >= 0x10000) {
692
        // format: '\\' + 'U' + 8 hex digits
693
0
        return 1 + 1 + 8;
694
0
    }
695
0
    else if (ch >= 0x100) {
696
        // format: '\\' + 'u' + 4 hex digits
697
0
        return 1 + 1 + 4;
698
0
    }
699
0
    else {
700
        // format: '\\' + 'x' + 2 hex digits
701
0
        return 1 + 1 + 2;
702
0
    }
703
0
}
704
705
706
/*
707
 * Write the hexadecimal representation of 'ch' to the buffer pointed by 'p'
708
 * using 2, 4, or 8 characters prefixed by '\x', '\u', or '\U' respectively.
709
 */
710
static inline void
711
codec_handler_write_unicode_hex(Py_UCS1 **p, Py_UCS4 ch)
712
0
{
713
0
    *(*p)++ = '\\';
714
0
    if (ch >= 0x10000) {
715
0
        *(*p)++ = 'U';
716
0
        *(*p)++ = Py_hexdigits[(ch >> 28) & 0xf];
717
0
        *(*p)++ = Py_hexdigits[(ch >> 24) & 0xf];
718
0
        *(*p)++ = Py_hexdigits[(ch >> 20) & 0xf];
719
0
        *(*p)++ = Py_hexdigits[(ch >> 16) & 0xf];
720
0
        *(*p)++ = Py_hexdigits[(ch >> 12) & 0xf];
721
0
        *(*p)++ = Py_hexdigits[(ch >> 8) & 0xf];
722
0
    }
723
0
    else if (ch >= 0x100) {
724
0
        *(*p)++ = 'u';
725
0
        *(*p)++ = Py_hexdigits[(ch >> 12) & 0xf];
726
0
        *(*p)++ = Py_hexdigits[(ch >> 8) & 0xf];
727
0
    }
728
0
    else {
729
0
        *(*p)++ = 'x';
730
0
    }
731
0
    *(*p)++ = Py_hexdigits[(ch >> 4) & 0xf];
732
0
    *(*p)++ = Py_hexdigits[ch & 0xf];
733
0
}
734
735
736
/*
737
 * Determine the number of digits for a decimal representation of Unicode
738
 * codepoint 'ch' (by design, Unicode codepoints are limited to 7 digits).
739
 */
740
static inline int
741
n_decimal_digits_for_codepoint(Py_UCS4 ch)
742
0
{
743
0
    if (ch < 10) return 1;
744
0
    if (ch < 100) return 2;
745
0
    if (ch < 1000) return 3;
746
0
    if (ch < 10000) return 4;
747
0
    if (ch < 100000) return 5;
748
0
    if (ch < 1000000) return 6;
749
0
    if (ch < 10000000) return 7;
750
    // Unicode codepoints are limited to 1114111 (7 decimal digits)
751
0
    Py_UNREACHABLE();
752
0
}
753
754
755
/*
756
 * Create a Unicode string containing 'count' copies of the official
757
 * Unicode REPLACEMENT CHARACTER (0xFFFD).
758
 */
759
static PyObject *
760
codec_handler_unicode_replacement_character(Py_ssize_t count)
761
207k
{
762
207k
    PyObject *res = PyUnicode_New(count, Py_UNICODE_REPLACEMENT_CHARACTER);
763
207k
    if (res == NULL) {
764
0
        return NULL;
765
0
    }
766
207k
    assert(count == 0 || PyUnicode_KIND(res) == PyUnicode_2BYTE_KIND);
767
207k
    Py_UCS2 *outp = PyUnicode_2BYTE_DATA(res);
768
415k
    for (Py_ssize_t i = 0; i < count; ++i) {
769
207k
        outp[i] = Py_UNICODE_REPLACEMENT_CHARACTER;
770
207k
    }
771
207k
    assert(_PyUnicode_CheckConsistency(res, 1));
772
207k
    return res;
773
207k
}
774
775
776
// --- handler: 'strict' ------------------------------------------------------
777
778
PyObject *PyCodec_StrictErrors(PyObject *exc)
779
230k
{
780
230k
    if (PyExceptionInstance_Check(exc)) {
781
230k
        PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
782
230k
    }
783
0
    else {
784
0
        PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
785
0
    }
786
230k
    return NULL;
787
230k
}
788
789
790
// --- handler: 'ignore' ------------------------------------------------------
791
792
static PyObject *
793
_PyCodec_IgnoreError(PyObject *exc, int as_bytes)
794
0
{
795
0
    Py_ssize_t end;
796
0
    if (_PyUnicodeError_GetParams(exc, NULL, NULL, NULL,
797
0
                                  &end, NULL, as_bytes) < 0)
798
0
    {
799
0
        return NULL;
800
0
    }
801
0
    return Py_BuildValue("(Nn)", Py_GetConstant(Py_CONSTANT_EMPTY_STR), end);
802
0
}
803
804
805
PyObject *PyCodec_IgnoreErrors(PyObject *exc)
806
0
{
807
0
    if (_PyIsUnicodeEncodeError(exc) || _PyIsUnicodeTranslateError(exc)) {
808
0
        return _PyCodec_IgnoreError(exc, false);
809
0
    }
810
0
    else if (_PyIsUnicodeDecodeError(exc)) {
811
0
        return _PyCodec_IgnoreError(exc, true);
812
0
    }
813
0
    else {
814
0
        wrong_exception_type(exc);
815
0
        return NULL;
816
0
    }
817
0
}
818
819
820
// --- handler: 'replace' -----------------------------------------------------
821
822
static PyObject *
823
_PyCodec_ReplaceUnicodeEncodeError(PyObject *exc)
824
0
{
825
0
    Py_ssize_t start, end, slen;
826
0
    if (_PyUnicodeError_GetParams(exc, NULL, NULL,
827
0
                                  &start, &end, &slen, false) < 0)
828
0
    {
829
0
        return NULL;
830
0
    }
831
0
    PyObject *res = PyUnicode_New(slen, '?');
832
0
    if (res == NULL) {
833
0
        return NULL;
834
0
    }
835
0
    assert(PyUnicode_KIND(res) == PyUnicode_1BYTE_KIND);
836
0
    Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
837
0
    memset(outp, '?', sizeof(Py_UCS1) * slen);
838
0
    assert(_PyUnicode_CheckConsistency(res, 1));
839
0
    return Py_BuildValue("(Nn)", res, end);
840
0
}
841
842
843
static PyObject *
844
_PyCodec_ReplaceUnicodeDecodeError(PyObject *exc)
845
207k
{
846
207k
    Py_ssize_t end;
847
207k
    if (PyUnicodeDecodeError_GetEnd(exc, &end) < 0) {
848
0
        return NULL;
849
0
    }
850
207k
    PyObject *res = codec_handler_unicode_replacement_character(1);
851
207k
    if (res == NULL) {
852
0
        return NULL;
853
0
    }
854
207k
    return Py_BuildValue("(Nn)", res, end);
855
207k
}
856
857
858
static PyObject *
859
_PyCodec_ReplaceUnicodeTranslateError(PyObject *exc)
860
0
{
861
0
    Py_ssize_t start, end, slen;
862
0
    if (_PyUnicodeError_GetParams(exc, NULL, NULL,
863
0
                                  &start, &end, &slen, false) < 0)
864
0
    {
865
0
        return NULL;
866
0
    }
867
0
    PyObject *res = codec_handler_unicode_replacement_character(slen);
868
0
    if (res == NULL) {
869
0
        return NULL;
870
0
    }
871
0
    return Py_BuildValue("(Nn)", res, end);
872
0
}
873
874
875
PyObject *PyCodec_ReplaceErrors(PyObject *exc)
876
207k
{
877
207k
    if (_PyIsUnicodeEncodeError(exc)) {
878
0
        return _PyCodec_ReplaceUnicodeEncodeError(exc);
879
0
    }
880
207k
    else if (_PyIsUnicodeDecodeError(exc)) {
881
207k
        return _PyCodec_ReplaceUnicodeDecodeError(exc);
882
207k
    }
883
0
    else if (_PyIsUnicodeTranslateError(exc)) {
884
0
        return _PyCodec_ReplaceUnicodeTranslateError(exc);
885
0
    }
886
0
    else {
887
0
        wrong_exception_type(exc);
888
0
        return NULL;
889
0
    }
890
207k
}
891
892
893
// --- handler: 'xmlcharrefreplace' -------------------------------------------
894
895
/*
 * Public 'xmlcharrefreplace' error handler (UnicodeEncodeError only).
 *
 * Every character of the error range [start, end) is rewritten as
 * "&#" + decimal code point + ";".  Returns a new (replacement, end)
 * tuple, or NULL on failure.  'end' may be lowered so that the size of
 * the replacement cannot overflow Py_ssize_t.
 */
PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
{
    if (!_PyIsUnicodeEncodeError(exc)) {
        wrong_exception_type(exc);
        return NULL;
    }

    PyObject *obj;
    Py_ssize_t objlen, start, end, slen;
    int rc = _PyUnicodeError_GetParams(exc,
                                       &obj, &objlen,
                                       &start, &end, &slen, false);
    if (rc < 0) {
        return NULL;
    }

    // Each input character expands to "&#" + k digits + ";" with k <= 7
    // (the code point range is below 10^7), i.e. at most 2 + 7 + 1 output
    // characters.  Clamp the range so the total cannot overflow.
    const Py_ssize_t max_chars = PY_SSIZE_T_MAX / (2 + 7 + 1);
    if (slen > max_chars) {
        end = Py_MIN(start + max_chars, objlen);
        slen = Py_MAX(0, end - start);
    }

    /* first pass: compute the exact size of the replacement */
    Py_ssize_t total = 0;
    for (Py_ssize_t pos = start; pos < end; ++pos) {
        Py_UCS4 cp = PyUnicode_READ_CHAR(obj, pos);
        int ndigits = n_decimal_digits_for_codepoint(cp);
        assert(ndigits != 0);
        assert(ndigits <= 7);
        total += 2 + ndigits + 1;
    }

    /* allocate replacement (pure ASCII) */
    PyObject *res = PyUnicode_New(total, 127);
    if (res == NULL) {
        Py_DECREF(obj);
        return NULL;
    }

    /* second pass: generate replacement */
    Py_UCS1 *cursor = PyUnicode_1BYTE_DATA(res);
    for (Py_ssize_t pos = start; pos < end; ++pos) {
        Py_UCS4 cp = PyUnicode_READ_CHAR(obj, pos);
        int ndigits = n_decimal_digits_for_codepoint(cp);

        *cursor++ = '&';
        *cursor++ = '#';
        // Fill the digit field from its least significant end.
        for (int d = ndigits - 1; d >= 0; --d) {
            cursor[d] = '0' + (cp % 10);
            cp /= 10;
        }
        assert(cp == 0);
        cursor += ndigits;
        *cursor++ = ';';
    }
    assert(_PyUnicode_CheckConsistency(res, 1));

    PyObject *restuple = Py_BuildValue("(Nn)", res, end);
    Py_DECREF(obj);
    return restuple;
}
961
962
963
// --- handler: 'backslashreplace' --------------------------------------------
964
965
/*
 * 'backslashreplace' handler for UnicodeEncodeError.
 *
 * Every character of the error range [start, end) is rewritten with
 * codec_handler_write_unicode_hex(); the output size of each escape is
 * obtained from codec_handler_unicode_hex_width().  Returns a new
 * (replacement, end) tuple, or NULL on failure.  'end' may be lowered so
 * that the size of the replacement cannot overflow Py_ssize_t.
 */
static PyObject *
_PyCodec_BackslashReplaceUnicodeEncodeError(PyObject *exc)
{
    PyObject *obj;
    Py_ssize_t objlen, start, end, slen;
    int rc = _PyUnicodeError_GetParams(exc,
                                       &obj, &objlen,
                                       &start, &end, &slen, false);
    if (rc < 0) {
        return NULL;
    }

    // An escape is "\\" + ('U'|'u'|'x') + 2, 4 or 8 hex digits, hence at
    // most 1 + 1 + 8 output characters per input character.  Clamp the
    // range so the total cannot overflow.
    const Py_ssize_t max_chars = PY_SSIZE_T_MAX / (1 + 1 + 8);
    if (slen > max_chars) {
        end = Py_MIN(start + max_chars, objlen);
        slen = Py_MAX(0, end - start);
    }

    /* first pass: compute the exact size of the replacement */
    Py_ssize_t total = 0;
    for (Py_ssize_t pos = start; pos < end; ++pos) {
        total += codec_handler_unicode_hex_width(PyUnicode_READ_CHAR(obj, pos));
    }

    PyObject *res = PyUnicode_New(total, 127);
    if (res == NULL) {
        Py_DECREF(obj);
        return NULL;
    }

    /* second pass: emit the escapes */
    Py_UCS1 *cursor = PyUnicode_1BYTE_DATA(res);
    for (Py_ssize_t pos = start; pos < end; ++pos) {
        codec_handler_write_unicode_hex(&cursor, PyUnicode_READ_CHAR(obj, pos));
    }
    assert(_PyUnicode_CheckConsistency(res, 1));

    Py_DECREF(obj);
    return Py_BuildValue("(Nn)", res, end);
}
1008
1009
1010
/*
 * 'backslashreplace' handler for UnicodeDecodeError.
 *
 * Every byte of the error range [start, end) of the input bytes object
 * is rewritten as "\xHH" (4 output characters per byte).  Returns a new
 * (replacement, end) tuple, or NULL on failure.  As in the encode
 * handler, 'end' may be lowered so that the size of the replacement
 * cannot overflow Py_ssize_t.
 */
static PyObject *
_PyCodec_BackslashReplaceUnicodeDecodeError(PyObject *exc)
{
    PyObject *obj;
    Py_ssize_t objlen, start, end, slen;
    if (_PyUnicodeError_GetParams(exc,
                                  &obj, &objlen,
                                  &start, &end, &slen, true) < 0)
    {
        return NULL;
    }

    // Each byte contributes exactly 4 characters; clamp the range so that
    // '4 * slen' below cannot overflow (signed overflow is undefined).
    if (slen > PY_SSIZE_T_MAX / 4) {
        end = start + PY_SSIZE_T_MAX / 4;
        end = Py_MIN(end, objlen);
        slen = Py_MAX(0, end - start);
    }

    PyObject *res = PyUnicode_New(4 * slen, 127);
    if (res == NULL) {
        Py_DECREF(obj);
        return NULL;
    }

    Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
    const unsigned char *p = (const unsigned char *)PyBytes_AS_STRING(obj);
    for (Py_ssize_t i = start; i < end; i++, outp += 4) {
        const unsigned char ch = p[i];
        outp[0] = '\\';
        outp[1] = 'x';
        outp[2] = Py_hexdigits[(ch >> 4) & 0xf];
        outp[3] = Py_hexdigits[ch & 0xf];
    }
    assert(_PyUnicode_CheckConsistency(res, 1));
    Py_DECREF(obj);
    return Py_BuildValue("(Nn)", res, end);
}
1041
1042
1043
/*
 * 'backslashreplace' handler for UnicodeTranslateError.
 *
 * Translate errors are handled exactly like encode errors, so this
 * simply forwards to the UnicodeEncodeError implementation.
 */
static inline PyObject *
_PyCodec_BackslashReplaceUnicodeTranslateError(PyObject *exc)
{
    PyObject *restuple = _PyCodec_BackslashReplaceUnicodeEncodeError(exc);
    return restuple;
}
1049
1050
1051
/*
 * Public 'backslashreplace' error handler: dispatch on the concrete type
 * of 'exc' (encode, decode or translate error).  Any other object is
 * reported via wrong_exception_type() and NULL is returned.
 */
PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
{
    if (_PyIsUnicodeEncodeError(exc)) {
        return _PyCodec_BackslashReplaceUnicodeEncodeError(exc);
    }
    if (_PyIsUnicodeDecodeError(exc)) {
        return _PyCodec_BackslashReplaceUnicodeDecodeError(exc);
    }
    if (_PyIsUnicodeTranslateError(exc)) {
        return _PyCodec_BackslashReplaceUnicodeTranslateError(exc);
    }

    wrong_exception_type(exc);
    return NULL;
}
1067
1068
1069
// --- handler: 'namereplace' -------------------------------------------------
1070
1071
/*
 * Public 'namereplace' error handler (UnicodeEncodeError only).
 *
 * Every character of the error range for which the ucnhash getname()
 * callback succeeds is rewritten as "\N{NAME}"; the others are written
 * with codec_handler_write_unicode_hex().  Processing stops early if the
 * replacement size would overflow Py_ssize_t, in which case the returned
 * position is the first character that was not processed.
 *
 * Returns a new (replacement, position) tuple, or NULL on failure.
 */
PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
{
    if (!_PyIsUnicodeEncodeError(exc)) {
        wrong_exception_type(exc);
        return NULL;
    }

    _PyUnicode_Name_CAPI *names = _PyUnicode_GetNameCAPI();
    if (names == NULL) {
        return NULL;
    }

    PyObject *obj;
    Py_ssize_t start, end;
    int rc = _PyUnicodeError_GetParams(exc,
                                       &obj, NULL,
                                       &start, &end, NULL, false);
    if (rc < 0) {
        return NULL;
    }

    char buffer[256]; /* NAME_MAXLEN in unicodename_db.h */

    /* first pass: find how far we can go and how much room is needed */
    Py_ssize_t total = 0;
    Py_ssize_t stop = start;
    while (stop < end) {
        Py_UCS4 c = PyUnicode_READ_CHAR(obj, stop);
        Py_ssize_t chunk;
        if (names->getname(c, buffer, sizeof(buffer), 1)) {
            // '\\' + 'N' + '{' + NAME + '}'; a getname() failure is not
            // an error, the character just falls back to a hex escape.
            chunk = 1 + 1 + 1 + strlen(buffer) + 1;
        }
        else {
            chunk = codec_handler_unicode_hex_width(c);
        }
        if (total > PY_SSIZE_T_MAX - chunk) {
            break;
        }
        total += chunk;
        ++stop;
    }

    PyObject *res = PyUnicode_New(total, 127);
    if (res == NULL) {
        Py_DECREF(obj);
        return NULL;
    }

    /* second pass: emit the escapes for [start, stop) */
    Py_UCS1 *cursor = PyUnicode_1BYTE_DATA(res);
    for (Py_ssize_t pos = start; pos < stop; ++pos) {
        Py_UCS4 c = PyUnicode_READ_CHAR(obj, pos);
        if (!names->getname(c, buffer, sizeof(buffer), 1)) {
            codec_handler_write_unicode_hex(&cursor, c);
            continue;
        }
        *cursor++ = '\\';
        *cursor++ = 'N';
        *cursor++ = '{';
        // The NUL written by strcpy() is overwritten by '}' just below.
        (void)strcpy((char *)cursor, buffer);
        cursor += strlen(buffer);
        *cursor++ = '}';
    }

    assert(cursor == PyUnicode_1BYTE_DATA(res) + total);
    assert(_PyUnicode_CheckConsistency(res, 1));
    PyObject *restuple = Py_BuildValue("(Nn)", res, stop);
    Py_DECREF(obj);
    return restuple;
}
1139
1140
1141
0
#define ENC_UNKNOWN     -1
#define ENC_UTF8        0
#define ENC_UTF16BE     1
#define ENC_UTF16LE     2
#define ENC_UTF32BE     3
#define ENC_UTF32LE     4

/*
 * Classify a NUL-terminated encoding name as one of the ENC_* codes.
 *
 * Recognized spellings: "utf" (any case), an optional '-' or '_', then
 * "8", "16" or "32"; for 16/32 an optional '-' or '_' followed by "be"
 * or "le" (any case).  Without a byte-order suffix the native byte order
 * is chosen through WORDS_BIGENDIAN.  The exact string "cp65001" is
 * treated as UTF-8.
 *
 * '*bytelength' is set to 3, 2 or 4 for the UTF-8, UTF-16 and UTF-32
 * families respectively.  Note that it is also written when the
 * "utf16"/"utf32" prefix matched but the suffix did not, in which case
 * ENC_UNKNOWN is returned; for any other unknown name it is left
 * untouched.
 */
static int
get_standard_encoding_impl(const char *encoding, int *bytelength)
{
    if (Py_TOLOWER(encoding[0]) == 'u' &&
        Py_TOLOWER(encoding[1]) == 't' &&
        Py_TOLOWER(encoding[2]) == 'f') {
        encoding += 3;
        if (*encoding == '-' || *encoding == '_') {
            encoding++;
        }
        if (encoding[0] == '8' && encoding[1] == '\0') {
            *bytelength = 3;
            return ENC_UTF8;
        }
        else if (encoding[0] == '1' && encoding[1] == '6') {
            encoding += 2;
            *bytelength = 2;
            if (*encoding == '\0') {
#ifdef WORDS_BIGENDIAN
                return ENC_UTF16BE;
#else
                return ENC_UTF16LE;
#endif
            }
            if (*encoding == '-' || *encoding == '_') {
                encoding++;
            }
            // After skipping a separator we may sit on the terminating
            // NUL (e.g. "utf-16-"): check encoding[0] first so that
            // encoding[1] is never read past the end of the string.
            if (encoding[0] != '\0' &&
                Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
                if (Py_TOLOWER(encoding[0]) == 'b') {
                    return ENC_UTF16BE;
                }
                if (Py_TOLOWER(encoding[0]) == 'l') {
                    return ENC_UTF16LE;
                }
            }
        }
        else if (encoding[0] == '3' && encoding[1] == '2') {
            encoding += 2;
            *bytelength = 4;
            if (*encoding == '\0') {
#ifdef WORDS_BIGENDIAN
                return ENC_UTF32BE;
#else
                return ENC_UTF32LE;
#endif
            }
            if (*encoding == '-' || *encoding == '_') {
                encoding++;
            }
            // Same guard as for UTF-16: do not look past a NUL.
            if (encoding[0] != '\0' &&
                Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
                if (Py_TOLOWER(encoding[0]) == 'b') {
                    return ENC_UTF32BE;
                }
                if (Py_TOLOWER(encoding[0]) == 'l') {
                    return ENC_UTF32LE;
                }
            }
        }
    }
    else if (strcmp(encoding, "cp65001") == 0) {
        *bytelength = 3;
        return ENC_UTF8;
    }
    return ENC_UNKNOWN;
}
1206
1207
1208
static int
1209
get_standard_encoding(PyObject *encoding, int *code, int *bytelength)
1210
0
{
1211
0
    const char *encoding_cstr = PyUnicode_AsUTF8(encoding);
1212
0
    if (encoding_cstr == NULL) {
1213
0
        return -1;
1214
0
    }
1215
0
    *code = get_standard_encoding_impl(encoding_cstr, bytelength);
1216
0
    return 0;
1217
0
}
1218
1219
1220
// --- handler: 'surrogatepass' -----------------------------------------------
1221
1222
/*
 * 'surrogatepass' handler for UnicodeEncodeError.
 *
 * Supported only for the encodings recognized by get_standard_encoding()
 * (UTF-8, UTF-16 and UTF-32 variants).  Every character of the error
 * range must be a surrogate; each one is serialized as a code unit of
 * the target encoding.  If the encoding is unknown or a non-surrogate is
 * found, the original exception 'exc' is re-raised.
 *
 * Returns a new (bytes, end) tuple, or NULL with an exception set.
 */
static PyObject *
_PyCodec_SurrogatePassUnicodeEncodeError(PyObject *exc)
{
    PyObject *encoding = PyUnicodeEncodeError_GetEncoding(exc);
    if (encoding == NULL) {
        return NULL;
    }
    int code, bytelength;
    const int status = get_standard_encoding(encoding, &code, &bytelength);
    Py_DECREF(encoding);
    if (status < 0) {
        return NULL;
    }
    if (code == ENC_UNKNOWN) {
        /* Unsupported encoding, fail with original exception */
        PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
        return NULL;
    }

    PyObject *obj;
    Py_ssize_t objlen, start, end, slen;
    if (_PyUnicodeError_GetParams(exc,
                                  &obj, &objlen,
                                  &start, &end, &slen, false) < 0)
    {
        return NULL;
    }

    // Clamp the range so that 'bytelength * slen' cannot overflow.
    const Py_ssize_t max_chars = PY_SSIZE_T_MAX / bytelength;
    if (slen > max_chars) {
        end = Py_MIN(start + max_chars, objlen);
        slen = Py_MAX(0, end - start);
    }

    PyObject *res = PyBytes_FromStringAndSize(NULL, bytelength * slen);
    if (res == NULL) {
        Py_DECREF(obj);
        return NULL;
    }

    unsigned char *cursor = (unsigned char *)PyBytes_AsString(res);
    for (Py_ssize_t pos = start; pos < end; pos++) {
        const Py_UCS4 ch = PyUnicode_READ_CHAR(obj, pos);
        if (!Py_UNICODE_IS_SURROGATE(ch)) {
            /* Not a surrogate, fail with original exception */
            Py_DECREF(obj);
            Py_DECREF(res);
            PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
            return NULL;
        }
        switch (code) {
            case ENC_UTF8: {
                cursor[0] = (unsigned char)(0xe0 | (ch >> 12));
                cursor[1] = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
                cursor[2] = (unsigned char)(0x80 | (ch & 0x3f));
                cursor += 3;
                break;
            }
            case ENC_UTF16LE: {
                cursor[0] = (unsigned char)ch;
                cursor[1] = (unsigned char)(ch >> 8);
                cursor += 2;
                break;
            }
            case ENC_UTF16BE: {
                cursor[0] = (unsigned char)(ch >> 8);
                cursor[1] = (unsigned char)ch;
                cursor += 2;
                break;
            }
            case ENC_UTF32LE: {
                cursor[0] = (unsigned char)ch;
                cursor[1] = (unsigned char)(ch >> 8);
                cursor[2] = (unsigned char)(ch >> 16);
                cursor[3] = (unsigned char)(ch >> 24);
                cursor += 4;
                break;
            }
            case ENC_UTF32BE: {
                cursor[0] = (unsigned char)(ch >> 24);
                cursor[1] = (unsigned char)(ch >> 16);
                cursor[2] = (unsigned char)(ch >> 8);
                cursor[3] = (unsigned char)ch;
                cursor += 4;
                break;
            }
        }
    }

    Py_DECREF(obj);
    // "N" hands our reference to 'res' over to the tuple.
    return Py_BuildValue("(Nn)", res, end);
}
1311
1312
1313
/*
 * 'surrogatepass' handler for UnicodeDecodeError.
 *
 * Supported only for the encodings recognized by get_standard_encoding()
 * (UTF-8, UTF-16 and UTF-32 variants).  Tries to decode exactly one code
 * unit of 'bytelength' bytes at 'start'; if it is a surrogate, returns a
 * new (str, start + bytelength) tuple.  Otherwise (unknown encoding, not
 * enough input, or not a surrogate) the original exception 'exc' is
 * re-raised and NULL is returned.
 */
static PyObject *
_PyCodec_SurrogatePassUnicodeDecodeError(PyObject *exc)
{
    PyObject *encoding = PyUnicodeDecodeError_GetEncoding(exc);
    if (encoding == NULL) {
        return NULL;
    }
    int code, bytelength;
    int rc = get_standard_encoding(encoding, &code, &bytelength);
    Py_DECREF(encoding);
    if (rc < 0) {
        return NULL;
    }
    if (code == ENC_UNKNOWN) {
        goto bail;
    }

    PyObject *obj;
    Py_ssize_t objlen, start, end, slen;
    if (_PyUnicodeError_GetParams(exc,
                                  &obj, &objlen,
                                  &start, &end, &slen, true) < 0)
    {
        return NULL;
    }

    /* Try decoding a single surrogate character. If
       there are more, let the codec call us again. */
    Py_UCS4 ch = 0;
    const unsigned char *p = (const unsigned char *)PyBytes_AS_STRING(obj);
    p += start;

    if (objlen - start >= bytelength) {
        // Bytes are widened to Py_UCS4 before shifting: an unsigned char
        // promotes to (signed) int, and shifting a value >= 0x80 left by
        // 24 bits would overflow it, which is undefined behavior.
        switch (code) {
            case ENC_UTF8: {
                if ((p[0] & 0xf0) == 0xe0 &&
                    (p[1] & 0xc0) == 0x80 &&
                    (p[2] & 0xc0) == 0x80)
                {
                    /* it's a three-byte code */
                    ch = ((Py_UCS4)(p[0] & 0x0f) << 12) +
                         ((Py_UCS4)(p[1] & 0x3f) << 6)  +
                          (Py_UCS4)(p[2] & 0x3f);
                }
                break;
            }
            case ENC_UTF16LE: {
                ch = ((Py_UCS4)p[1] << 8) | (Py_UCS4)p[0];
                break;
            }
            case ENC_UTF16BE: {
                ch = ((Py_UCS4)p[0] << 8) | (Py_UCS4)p[1];
                break;
            }
            case ENC_UTF32LE: {
                ch = ((Py_UCS4)p[3] << 24) | ((Py_UCS4)p[2] << 16) |
                     ((Py_UCS4)p[1] << 8)  |  (Py_UCS4)p[0];
                break;
            }
            case ENC_UTF32BE: {
                ch = ((Py_UCS4)p[0] << 24) | ((Py_UCS4)p[1] << 16) |
                     ((Py_UCS4)p[2] << 8)  |  (Py_UCS4)p[3];
                break;
            }
        }
    }
    Py_DECREF(obj);
    if (!Py_UNICODE_IS_SURROGATE(ch)) {
        goto bail;
    }

    PyObject *res = PyUnicode_FromOrdinal(ch);
    if (res == NULL) {
        return NULL;
    }
    return Py_BuildValue("(Nn)", res, start + bytelength);

bail:
    /* Fail with the original exception. */
    PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
    return NULL;
}
1392
1393
1394
/* This handler is declared static until someone demonstrates
1395
   a need to call it directly. */
1396
static PyObject *
PyCodec_SurrogatePassErrors(PyObject *exc)
{
    // Dispatch on the concrete error type; translate errors are not
    // supported by this handler.
    if (_PyIsUnicodeEncodeError(exc)) {
        return _PyCodec_SurrogatePassUnicodeEncodeError(exc);
    }
    if (_PyIsUnicodeDecodeError(exc)) {
        return _PyCodec_SurrogatePassUnicodeDecodeError(exc);
    }

    wrong_exception_type(exc);
    return NULL;
}
1410
1411
1412
// --- handler: 'surrogateescape' ---------------------------------------------
1413
1414
/*
 * 'surrogateescape' handler for UnicodeEncodeError.
 *
 * Every character of the error range must lie in U+DC80..U+DCFF; each
 * one is turned back into the single byte (ch - 0xdc00).  If any other
 * character is found, the original exception 'exc' is re-raised.
 *
 * Returns a new (bytes, end) tuple, or NULL with an exception set.
 */
static PyObject *
_PyCodec_SurrogateEscapeUnicodeEncodeError(PyObject *exc)
{
    PyObject *obj;
    Py_ssize_t start, end, slen;
    int rc = _PyUnicodeError_GetParams(exc,
                                       &obj, NULL,
                                       &start, &end, &slen, false);
    if (rc < 0) {
        return NULL;
    }

    PyObject *res = PyBytes_FromStringAndSize(NULL, slen);
    if (res == NULL) {
        Py_DECREF(obj);
        return NULL;
    }

    char *cursor = PyBytes_AsString(res);
    for (Py_ssize_t pos = start; pos < end; pos++) {
        Py_UCS4 ch = PyUnicode_READ_CHAR(obj, pos);
        if (!(ch >= 0xdc80 && ch <= 0xdcff)) {
            /* Not a UTF-8b surrogate, fail with original exception. */
            Py_DECREF(obj);
            Py_DECREF(res);
            PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
            return NULL;
        }
        *cursor++ = ch - 0xdc00;
    }
    Py_DECREF(obj);

    // "N" hands our reference to 'res' over to the tuple.
    return Py_BuildValue("(Nn)", res, end);
}
1448
1449
1450
/*
 * 'surrogateescape' handler for UnicodeDecodeError.
 *
 * Escapes up to 4 leading non-ASCII bytes of the error range, mapping
 * each byte b to the code point 0xdc00 + b.  Escaping stops at the first
 * ASCII byte; if the very first byte is ASCII, the original exception
 * 'exc' is re-raised.
 *
 * Returns a new (str, start + consumed) tuple, or NULL with an exception
 * set.
 */
static PyObject *
_PyCodec_SurrogateEscapeUnicodeDecodeError(PyObject *exc)
{
    PyObject *obj;
    Py_ssize_t start, end, slen;
    int rc = _PyUnicodeError_GetParams(exc,
                                       &obj, NULL,
                                       &start, &end, &slen, true);
    if (rc < 0) {
        return NULL;
    }

    Py_UCS2 escaped[4]; /* decode up to 4 bad bytes. */
    const unsigned char *bytes = (const unsigned char *)PyBytes_AS_STRING(obj);
    int consumed;
    for (consumed = 0; consumed < 4 && consumed < slen; consumed++) {
        const unsigned char byte = bytes[start + consumed];
        /* Refuse to escape ASCII bytes. */
        if (byte < 128) {
            break;
        }
        escaped[consumed] = 0xdc00 + byte;
    }
    Py_DECREF(obj);

    if (consumed == 0) {
        /* Codec complained about ASCII byte. */
        PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
        return NULL;
    }

    PyObject *str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
                                              escaped, consumed);
    if (str == NULL) {
        return NULL;
    }
    return Py_BuildValue("(Nn)", str, start + consumed);
}
1487
1488
1489
/*
 * 'surrogateescape' error handler: dispatch on the concrete type of
 * 'exc' (encode or decode error).  Any other object is reported via
 * wrong_exception_type() and NULL is returned.
 */
static PyObject *
PyCodec_SurrogateEscapeErrors(PyObject *exc)
{
    if (_PyIsUnicodeEncodeError(exc)) {
        return _PyCodec_SurrogateEscapeUnicodeEncodeError(exc);
    }
    if (_PyIsUnicodeDecodeError(exc)) {
        return _PyCodec_SurrogateEscapeUnicodeDecodeError(exc);
    }

    wrong_exception_type(exc);
    return NULL;
}
1503
1504
1505
// --- Codecs registry handlers -----------------------------------------------
1506
1507
/* METH_O entry point registered as "strict"; 'self' is ignored. */
static inline PyObject *
strict_errors(PyObject *Py_UNUSED(self), PyObject *exc)
{
    PyObject *result = PyCodec_StrictErrors(exc);
    return result;
}
1512
1513
1514
/* METH_O entry point registered as "ignore"; 'self' is ignored. */
static inline PyObject *
ignore_errors(PyObject *Py_UNUSED(self), PyObject *exc)
{
    PyObject *result = PyCodec_IgnoreErrors(exc);
    return result;
}
1519
1520
1521
/* METH_O entry point registered as "replace"; 'self' is ignored. */
static inline PyObject *
replace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
{
    PyObject *result = PyCodec_ReplaceErrors(exc);
    return result;
}
1526
1527
1528
/* METH_O entry point registered as "xmlcharrefreplace"; 'self' is ignored. */
static inline PyObject *
xmlcharrefreplace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
{
    PyObject *result = PyCodec_XMLCharRefReplaceErrors(exc);
    return result;
}
1533
1534
1535
/* METH_O entry point registered as "backslashreplace"; 'self' is ignored. */
static inline PyObject *
backslashreplace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
{
    PyObject *result = PyCodec_BackslashReplaceErrors(exc);
    return result;
}
1540
1541
1542
/* METH_O entry point registered as "namereplace"; 'self' is ignored. */
static inline PyObject *
namereplace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
{
    PyObject *result = PyCodec_NameReplaceErrors(exc);
    return result;
}
1547
1548
1549
/* METH_O entry point registered as "surrogatepass"; 'self' is ignored. */
static inline PyObject *
surrogatepass_errors(PyObject *Py_UNUSED(self), PyObject *exc)
{
    PyObject *result = PyCodec_SurrogatePassErrors(exc);
    return result;
}
1554
1555
1556
/* METH_O entry point registered as "surrogateescape"; 'self' is ignored. */
static inline PyObject *
surrogateescape_errors(PyObject *Py_UNUSED(self), PyObject *exc)
{
    PyObject *result = PyCodec_SurrogateEscapeErrors(exc);
    return result;
}
1561
1562
1563
PyStatus
1564
_PyCodec_InitRegistry(PyInterpreterState *interp)
1565
16
{
1566
16
    static struct {
1567
16
        const char *name;
1568
16
        PyMethodDef def;
1569
16
    } methods[] =
1570
16
    {
1571
16
        {
1572
16
            "strict",
1573
16
            {
1574
16
                "strict_errors",
1575
16
                strict_errors,
1576
16
                METH_O,
1577
16
                PyDoc_STR("Implements the 'strict' error handling, which "
1578
16
                          "raises a UnicodeError on coding errors.")
1579
16
            }
1580
16
        },
1581
16
        {
1582
16
            "ignore",
1583
16
            {
1584
16
                "ignore_errors",
1585
16
                ignore_errors,
1586
16
                METH_O,
1587
16
                PyDoc_STR("Implements the 'ignore' error handling, which "
1588
16
                          "ignores malformed data and continues.")
1589
16
            }
1590
16
        },
1591
16
        {
1592
16
            "replace",
1593
16
            {
1594
16
                "replace_errors",
1595
16
                replace_errors,
1596
16
                METH_O,
1597
16
                PyDoc_STR("Implements the 'replace' error handling, which "
1598
16
                          "replaces malformed data with a replacement marker.")
1599
16
            }
1600
16
        },
1601
16
        {
1602
16
            "xmlcharrefreplace",
1603
16
            {
1604
16
                "xmlcharrefreplace_errors",
1605
16
                xmlcharrefreplace_errors,
1606
16
                METH_O,
1607
16
                PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
1608
16
                          "which replaces an unencodable character with the "
1609
16
                          "appropriate XML character reference.")
1610
16
            }
1611
16
        },
1612
16
        {
1613
16
            "backslashreplace",
1614
16
            {
1615
16
                "backslashreplace_errors",
1616
16
                backslashreplace_errors,
1617
16
                METH_O,
1618
16
                PyDoc_STR("Implements the 'backslashreplace' error handling, "
1619
16
                          "which replaces malformed data with a backslashed "
1620
16
                          "escape sequence.")
1621
16
            }
1622
16
        },
1623
16
        {
1624
16
            "namereplace",
1625
16
            {
1626
16
                "namereplace_errors",
1627
16
                namereplace_errors,
1628
16
                METH_O,
1629
16
                PyDoc_STR("Implements the 'namereplace' error handling, "
1630
16
                          "which replaces an unencodable character with a "
1631
16
                          "\\N{...} escape sequence.")
1632
16
            }
1633
16
        },
1634
16
        {
1635
16
            "surrogatepass",
1636
16
            {
1637
16
                "surrogatepass",
1638
16
                surrogatepass_errors,
1639
16
                METH_O
1640
16
            }
1641
16
        },
1642
16
        {
1643
16
            "surrogateescape",
1644
16
            {
1645
16
                "surrogateescape",
1646
16
                surrogateescape_errors,
1647
16
                METH_O
1648
16
            }
1649
16
        }
1650
16
    };
1651
    // ensure that the built-in error handlers' names are kept in sync
1652
16
    assert(Py_ARRAY_LENGTH(methods) == Py_ARRAY_LENGTH(codecs_builtin_error_handlers));
1653
1654
16
    assert(interp->codecs.initialized == 0);
1655
16
    interp->codecs.search_path = PyList_New(0);
1656
16
    if (interp->codecs.search_path == NULL) {
1657
0
        return PyStatus_NoMemory();
1658
0
    }
1659
16
    interp->codecs.search_cache = PyDict_New();
1660
16
    if (interp->codecs.search_cache == NULL) {
1661
0
        return PyStatus_NoMemory();
1662
0
    }
1663
16
    interp->codecs.error_registry = PyDict_New();
1664
16
    if (interp->codecs.error_registry == NULL) {
1665
0
        return PyStatus_NoMemory();
1666
0
    }
1667
144
    for (size_t i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
1668
128
        PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL);
1669
128
        if (func == NULL) {
1670
0
            return PyStatus_NoMemory();
1671
0
        }
1672
1673
128
        int res = PyDict_SetItemString(interp->codecs.error_registry,
1674
128
                                       methods[i].name, func);
1675
128
        Py_DECREF(func);
1676
128
        if (res < 0) {
1677
0
            return PyStatus_Error("Failed to insert into codec error registry");
1678
0
        }
1679
128
    }
1680
1681
16
    interp->codecs.initialized = 1;
1682
1683
    // Importing `encodings' will call back into this module to register codec
1684
    // search functions, so this is done after everything else is initialized.
1685
16
    PyObject *mod = PyImport_ImportModule("encodings");
1686
16
    if (mod == NULL) {
1687
0
        return PyStatus_Error("Failed to import encodings module");
1688
0
    }
1689
16
    Py_DECREF(mod);
1690
1691
16
    return PyStatus_Ok();
1692
16
}
1693
1694
void
1695
_PyCodec_Fini(PyInterpreterState *interp)
1696
0
{
1697
0
    Py_CLEAR(interp->codecs.search_path);
1698
0
    Py_CLEAR(interp->codecs.search_cache);
1699
0
    Py_CLEAR(interp->codecs.error_registry);
1700
0
    interp->codecs.initialized = 0;
1701
0
}