Coverage Report

Created: 2025-09-04 06:25

/src/cpython/Python/codecs.c
Line
Count
Source (jump to first uncovered line)
1
/* ------------------------------------------------------------------------
2
3
   Python Codec Registry and support functions
4
5
Written by Marc-Andre Lemburg (mal@lemburg.com).
6
7
Copyright (c) Corporation for National Research Initiatives.
8
9
   ------------------------------------------------------------------------ */
10
11
#include "Python.h"
12
#include "pycore_call.h"          // _PyObject_CallNoArgs()
13
#include "pycore_interp.h"        // PyInterpreterState.codec_search_path
14
#include "pycore_pyerrors.h"      // _PyErr_FormatNote()
15
#include "pycore_pystate.h"       // _PyInterpreterState_GET()
16
#include "pycore_runtime.h"       // _Py_ID()
17
#include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
18
#include "pycore_unicodeobject.h" // _PyUnicode_InternMortal()
19
#include "pycore_pyatomic_ft_wrappers.h"
20
21
/* Names of the error handlers treated as built-in by this file;
   _PyCodec_UnregisterError() refuses to remove any of them. */
static const char *codecs_builtin_error_handlers[] = {
    "strict",
    "ignore",
    "replace",
    "xmlcharrefreplace",
    "backslashreplace",
    "namereplace",
    "surrogatepass",
    "surrogateescape",
};

/* Lower-case hexadecimal digits, indexed by nibble value (0..15). */
const char *Py_hexdigits = "0123456789abcdef";
28
29
/* --- Codec Registry ----------------------------------------------------- */
30
31
/* Append search_function to the current interpreter's codec search path.

   Returns -1 with an exception set if search_function is NULL or is not
   callable; otherwise returns the result of PyList_Append(). */
int
PyCodec_Register(PyObject *search_function)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    assert(interp->codecs.initialized);

    if (search_function == NULL) {
        PyErr_BadArgument();
        return -1;
    }
    if (!PyCallable_Check(search_function)) {
        PyErr_SetString(PyExc_TypeError, "argument must be callable");
        return -1;
    }

    /* The append is performed while holding search_path_mutex. */
    FT_MUTEX_LOCK(&interp->codecs.search_path_mutex);
    int status = PyList_Append(interp->codecs.search_path, search_function);
    FT_MUTEX_UNLOCK(&interp->codecs.search_path_mutex);

    return status;
}
52
53
int
54
PyCodec_Unregister(PyObject *search_function)
55
0
{
56
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
57
0
    if (interp->codecs.initialized != 1) {
58
        /* Do nothing if codecs state was cleared (only possible during
59
           interpreter shutdown). */
60
0
        return 0;
61
0
    }
62
63
0
    PyObject *codec_search_path = interp->codecs.search_path;
64
0
    assert(PyList_CheckExact(codec_search_path));
65
0
    for (Py_ssize_t i = 0; i < PyList_GET_SIZE(codec_search_path); i++) {
66
0
        FT_MUTEX_LOCK(&interp->codecs.search_path_mutex);
67
0
        PyObject *item = PyList_GetItemRef(codec_search_path, i);
68
0
        int ret = 1;
69
0
        if (item == search_function) {
70
            // We hold a reference to the item, so its destructor can't run
71
            // while we hold search_path_mutex.
72
0
            ret = PyList_SetSlice(codec_search_path, i, i+1, NULL);
73
0
        }
74
0
        FT_MUTEX_UNLOCK(&interp->codecs.search_path_mutex);
75
0
        Py_DECREF(item);
76
0
        if (ret != 1) {
77
0
            assert(interp->codecs.search_cache != NULL);
78
0
            assert(PyDict_CheckExact(interp->codecs.search_cache));
79
0
            PyDict_Clear(interp->codecs.search_cache);
80
0
            return ret;
81
0
        }
82
0
    }
83
0
    return 0;
84
0
}
85
86
extern int _Py_normalize_encoding(const char *, char *, size_t);
87
88
/* Convert a string to a normalized Python string(decoded from UTF-8): all characters are
89
   converted to lower case, spaces and hyphens are replaced with underscores. */
90
91
static
92
PyObject *normalizestring(const char *string)
93
1.07M
{
94
1.07M
    size_t len = strlen(string);
95
1.07M
    char *encoding;
96
1.07M
    PyObject *v;
97
98
1.07M
    if (len > PY_SSIZE_T_MAX) {
99
0
        PyErr_SetString(PyExc_OverflowError, "string is too large");
100
0
        return NULL;
101
0
    }
102
103
1.07M
    encoding = PyMem_Malloc(len + 1);
104
1.07M
    if (encoding == NULL)
105
0
        return PyErr_NoMemory();
106
107
1.07M
    if (!_Py_normalize_encoding(string, encoding, len + 1))
108
0
    {
109
0
        PyErr_SetString(PyExc_RuntimeError, "_Py_normalize_encoding() failed");
110
0
        PyMem_Free(encoding);
111
0
        return NULL;
112
0
    }
113
114
1.07M
    v = PyUnicode_FromString(encoding);
115
1.07M
    PyMem_Free(encoding);
116
1.07M
    return v;
117
1.07M
}
118
119
/* Lookup the given encoding and return a tuple providing the codec
120
   facilities.
121
122
   The encoding string is looked up converted to all lower-case
123
   characters. This makes encodings looked up through this mechanism
124
   effectively case-insensitive.
125
126
   If no codec is found, a LookupError is set and NULL returned.
127
128
   As side effect, this tries to load the encodings package, if not
129
   yet done. This is part of the lazy load strategy for the encodings
130
   package.
131
132
*/
133
134
PyObject *_PyCodec_Lookup(const char *encoding)
135
1.07M
{
136
1.07M
    if (encoding == NULL) {
137
0
        PyErr_BadArgument();
138
0
        return NULL;
139
0
    }
140
141
1.07M
    PyInterpreterState *interp = _PyInterpreterState_GET();
142
1.07M
    assert(interp->codecs.initialized);
143
144
    /* Convert the encoding to a normalized Python string: all
145
       characters are converted to lower case, spaces and hyphens are
146
       replaced with underscores. */
147
1.07M
    PyObject *v = normalizestring(encoding);
148
1.07M
    if (v == NULL) {
149
0
        return NULL;
150
0
    }
151
152
    /* Intern the string. We'll make it immortal later if lookup succeeds. */
153
1.07M
    _PyUnicode_InternMortal(interp, &v);
154
155
    /* First, try to lookup the name in the registry dictionary */
156
1.07M
    PyObject *result;
157
1.07M
    if (PyDict_GetItemRef(interp->codecs.search_cache, v, &result) < 0) {
158
0
        goto onError;
159
0
    }
160
1.07M
    if (result != NULL) {
161
987k
        Py_DECREF(v);
162
987k
        return result;
163
987k
    }
164
165
    /* Next, scan the search functions in order of registration */
166
87.4k
    const Py_ssize_t len = PyList_Size(interp->codecs.search_path);
167
87.4k
    if (len < 0)
168
0
        goto onError;
169
87.4k
    if (len == 0) {
170
0
        PyErr_SetString(PyExc_LookupError,
171
0
                        "no codec search functions registered: "
172
0
                        "can't find encoding");
173
0
        goto onError;
174
0
    }
175
176
87.4k
    Py_ssize_t i;
177
174k
    for (i = 0; i < len; i++) {
178
87.4k
        PyObject *func;
179
180
87.4k
        func = PyList_GetItemRef(interp->codecs.search_path, i);
181
87.4k
        if (func == NULL)
182
0
            goto onError;
183
87.4k
        result = PyObject_CallOneArg(func, v);
184
87.4k
        Py_DECREF(func);
185
87.4k
        if (result == NULL)
186
0
            goto onError;
187
87.4k
        if (result == Py_None) {
188
86.9k
            Py_CLEAR(result);
189
86.9k
            continue;
190
86.9k
        }
191
450
        if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
192
0
            PyErr_SetString(PyExc_TypeError,
193
0
                            "codec search functions must return 4-tuples");
194
0
            Py_DECREF(result);
195
0
            goto onError;
196
0
        }
197
450
        break;
198
450
    }
199
87.4k
    if (result == NULL) {
200
        /* XXX Perhaps we should cache misses too ? */
201
86.9k
        PyErr_Format(PyExc_LookupError,
202
86.9k
                     "unknown encoding: %s", encoding);
203
86.9k
        goto onError;
204
86.9k
    }
205
206
450
    _PyUnicode_InternImmortal(interp, &v);
207
208
    /* Cache and return the result */
209
450
    if (PyDict_SetItem(interp->codecs.search_cache, v, result) < 0) {
210
0
        Py_DECREF(result);
211
0
        goto onError;
212
0
    }
213
450
    Py_DECREF(v);
214
450
    return result;
215
216
86.9k
 onError:
217
86.9k
    Py_DECREF(v);
218
86.9k
    return NULL;
219
450
}
220
221
/* Codec registry encoding check API. */
222
223
int PyCodec_KnownEncoding(const char *encoding)
224
0
{
225
0
    PyObject *codecs;
226
227
0
    codecs = _PyCodec_Lookup(encoding);
228
0
    if (!codecs) {
229
0
        PyErr_Clear();
230
0
        return 0;
231
0
    }
232
0
    else {
233
0
        Py_DECREF(codecs);
234
0
        return 1;
235
0
    }
236
0
}
237
238
static
239
PyObject *args_tuple(PyObject *object,
240
                     const char *errors)
241
984k
{
242
984k
    PyObject *args;
243
244
984k
    args = PyTuple_New(1 + (errors != NULL));
245
984k
    if (args == NULL)
246
0
        return NULL;
247
984k
    PyTuple_SET_ITEM(args, 0, Py_NewRef(object));
248
984k
    if (errors) {
249
169k
        PyObject *v;
250
251
169k
        v = PyUnicode_FromString(errors);
252
169k
        if (v == NULL) {
253
0
            Py_DECREF(args);
254
0
            return NULL;
255
0
        }
256
169k
        PyTuple_SET_ITEM(args, 1, v);
257
169k
    }
258
984k
    return args;
259
984k
}
260
261
/* Helper function to get a codec item */
262
263
static
264
PyObject *codec_getitem(const char *encoding, int index)
265
0
{
266
0
    PyObject *codecs;
267
0
    PyObject *v;
268
269
0
    codecs = _PyCodec_Lookup(encoding);
270
0
    if (codecs == NULL)
271
0
        return NULL;
272
0
    v = PyTuple_GET_ITEM(codecs, index);
273
0
    Py_DECREF(codecs);
274
0
    return Py_NewRef(v);
275
0
}
276
277
/* Helper functions to create an incremental codec. */
278
static
279
PyObject *codec_makeincrementalcodec(PyObject *codec_info,
280
                                     const char *errors,
281
                                     const char *attrname)
282
48
{
283
48
    PyObject *ret, *inccodec;
284
285
48
    inccodec = PyObject_GetAttrString(codec_info, attrname);
286
48
    if (inccodec == NULL)
287
0
        return NULL;
288
48
    if (errors)
289
48
        ret = PyObject_CallFunction(inccodec, "s", errors);
290
0
    else
291
0
        ret = _PyObject_CallNoArgs(inccodec);
292
48
    Py_DECREF(inccodec);
293
48
    return ret;
294
48
}
295
296
static
297
PyObject *codec_getincrementalcodec(const char *encoding,
298
                                    const char *errors,
299
                                    const char *attrname)
300
0
{
301
0
    PyObject *codec_info, *ret;
302
303
0
    codec_info = _PyCodec_Lookup(encoding);
304
0
    if (codec_info == NULL)
305
0
        return NULL;
306
0
    ret = codec_makeincrementalcodec(codec_info, errors, attrname);
307
0
    Py_DECREF(codec_info);
308
0
    return ret;
309
0
}
310
311
/* Helper function to create a stream codec. */
312
313
static
314
PyObject *codec_getstreamcodec(const char *encoding,
315
                               PyObject *stream,
316
                               const char *errors,
317
                               const int index)
318
0
{
319
0
    PyObject *codecs, *streamcodec, *codeccls;
320
321
0
    codecs = _PyCodec_Lookup(encoding);
322
0
    if (codecs == NULL)
323
0
        return NULL;
324
325
0
    codeccls = PyTuple_GET_ITEM(codecs, index);
326
0
    if (errors != NULL)
327
0
        streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
328
0
    else
329
0
        streamcodec = PyObject_CallOneArg(codeccls, stream);
330
0
    Py_DECREF(codecs);
331
0
    return streamcodec;
332
0
}
333
334
/* Helpers to work with the result of _PyCodec_Lookup
335
336
 */
337
/* Create an incremental decoder by calling the "incrementaldecoder"
   attribute of codec_info (see codec_makeincrementalcodec()). */
PyObject *
_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info, const char *errors)
{
    static const char factory_name[] = "incrementaldecoder";
    return codec_makeincrementalcodec(codec_info, errors, factory_name);
}
343
344
/* Create an incremental encoder by calling the "incrementalencoder"
   attribute of codec_info (see codec_makeincrementalcodec()). */
PyObject *
_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info, const char *errors)
{
    static const char factory_name[] = "incrementalencoder";
    return codec_makeincrementalcodec(codec_info, errors, factory_name);
}
350
351
352
/* Convenience APIs to query the Codec registry.
353
354
   All APIs return a codec object with incremented refcount.
355
356
 */
357
358
/* Return element 0 of the codec tuple registered for `encoding`
   (new reference), or NULL with an exception set. */
PyObject *
PyCodec_Encoder(const char *encoding)
{
    const int encoder_index = 0;
    return codec_getitem(encoding, encoder_index);
}
362
363
/* Return element 1 of the codec tuple registered for `encoding`
   (new reference), or NULL with an exception set. */
PyObject *
PyCodec_Decoder(const char *encoding)
{
    const int decoder_index = 1;
    return codec_getitem(encoding, decoder_index);
}
367
368
/* Look up `encoding` and create an incremental encoder from the codec's
   "incrementalencoder" attribute, passing `errors` when it is non-NULL. */
PyObject *
PyCodec_IncrementalEncoder(const char *encoding, const char *errors)
{
    static const char factory_name[] = "incrementalencoder";
    return codec_getincrementalcodec(encoding, errors, factory_name);
}
373
374
/* Look up `encoding` and create an incremental decoder from the codec's
   "incrementaldecoder" attribute, passing `errors` when it is non-NULL. */
PyObject *
PyCodec_IncrementalDecoder(const char *encoding, const char *errors)
{
    static const char factory_name[] = "incrementaldecoder";
    return codec_getincrementalcodec(encoding, errors, factory_name);
}
379
380
/* Create a stream reader for `stream` by calling element 2 of the codec
   tuple registered for `encoding` (see codec_getstreamcodec()). */
PyObject *
PyCodec_StreamReader(const char *encoding,
                     PyObject *stream,
                     const char *errors)
{
    const int reader_index = 2;
    return codec_getstreamcodec(encoding, stream, errors, reader_index);
}
386
387
/* Create a stream writer for `stream` by calling element 3 of the codec
   tuple registered for `encoding` (see codec_getstreamcodec()). */
PyObject *
PyCodec_StreamWriter(const char *encoding,
                     PyObject *stream,
                     const char *errors)
{
    const int writer_index = 3;
    return codec_getstreamcodec(encoding, stream, errors, writer_index);
}
393
394
/* Encode an object (e.g. a Unicode object) using the given encoding
395
   and return the resulting encoded object (usually a Python string).
396
397
   errors is passed to the encoder factory as argument if non-NULL. */
398
399
static PyObject *
400
_PyCodec_EncodeInternal(PyObject *object,
401
                        PyObject *encoder,
402
                        const char *encoding,
403
                        const char *errors)
404
747k
{
405
747k
    PyObject *args = NULL, *result = NULL;
406
747k
    PyObject *v = NULL;
407
408
747k
    args = args_tuple(object, errors);
409
747k
    if (args == NULL)
410
0
        goto onError;
411
412
747k
    result = PyObject_Call(encoder, args, NULL);
413
747k
    if (result == NULL) {
414
0
        _PyErr_FormatNote("%s with '%s' codec failed", "encoding", encoding);
415
0
        goto onError;
416
0
    }
417
418
747k
    if (!PyTuple_Check(result) ||
419
747k
        PyTuple_GET_SIZE(result) != 2) {
420
0
        PyErr_SetString(PyExc_TypeError,
421
0
                        "encoder must return a tuple (object, integer)");
422
0
        goto onError;
423
0
    }
424
747k
    v = Py_NewRef(PyTuple_GET_ITEM(result,0));
425
    /* We don't check or use the second (integer) entry. */
426
427
747k
    Py_DECREF(args);
428
747k
    Py_DECREF(encoder);
429
747k
    Py_DECREF(result);
430
747k
    return v;
431
432
0
 onError:
433
0
    Py_XDECREF(result);
434
0
    Py_XDECREF(args);
435
0
    Py_XDECREF(encoder);
436
0
    return NULL;
437
747k
}
438
439
/* Decode an object (usually a Python string) using the given encoding
440
   and return an equivalent object (e.g. a Unicode object).
441
442
   errors is passed to the decoder factory as argument if non-NULL. */
443
444
static PyObject *
445
_PyCodec_DecodeInternal(PyObject *object,
446
                        PyObject *decoder,
447
                        const char *encoding,
448
                        const char *errors)
449
237k
{
450
237k
    PyObject *args = NULL, *result = NULL;
451
237k
    PyObject *v;
452
453
237k
    args = args_tuple(object, errors);
454
237k
    if (args == NULL)
455
0
        goto onError;
456
457
237k
    result = PyObject_Call(decoder, args, NULL);
458
237k
    if (result == NULL) {
459
43.9k
        _PyErr_FormatNote("%s with '%s' codec failed", "decoding", encoding);
460
43.9k
        goto onError;
461
43.9k
    }
462
193k
    if (!PyTuple_Check(result) ||
463
193k
        PyTuple_GET_SIZE(result) != 2) {
464
0
        PyErr_SetString(PyExc_TypeError,
465
0
                        "decoder must return a tuple (object,integer)");
466
0
        goto onError;
467
0
    }
468
193k
    v = Py_NewRef(PyTuple_GET_ITEM(result,0));
469
    /* We don't check or use the second (integer) entry. */
470
471
193k
    Py_DECREF(args);
472
193k
    Py_DECREF(decoder);
473
193k
    Py_DECREF(result);
474
193k
    return v;
475
476
43.9k
 onError:
477
43.9k
    Py_XDECREF(args);
478
43.9k
    Py_XDECREF(decoder);
479
43.9k
    Py_XDECREF(result);
480
43.9k
    return NULL;
481
193k
}
482
483
/* Generic encoding/decoding API */
484
/* Encode `object` with the encoder registered for `encoding`, passing
   `errors` to it when non-NULL.

   Returns a new reference, or NULL with an exception set. */
PyObject *
PyCodec_Encode(PyObject *object,
               const char *encoding,
               const char *errors)
{
    PyObject *encoder = PyCodec_Encoder(encoding);
    if (encoder == NULL) {
        return NULL;
    }
    /* _PyCodec_EncodeInternal() releases the encoder reference. */
    return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
}
496
497
/* Decode `object` with the decoder registered for `encoding`, passing
   `errors` to it when non-NULL.

   Returns a new reference, or NULL with an exception set. */
PyObject *
PyCodec_Decode(PyObject *object,
               const char *encoding,
               const char *errors)
{
    PyObject *decoder = PyCodec_Decoder(encoding);
    if (decoder == NULL) {
        return NULL;
    }
    /* _PyCodec_DecodeInternal() releases the decoder reference. */
    return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
}
509
510
/* Text encoding/decoding API */
511
PyObject * _PyCodec_LookupTextEncoding(const char *encoding,
512
                                       const char *alternate_command)
513
1.07M
{
514
1.07M
    PyObject *codec;
515
1.07M
    PyObject *attr;
516
1.07M
    int is_text_codec;
517
518
1.07M
    codec = _PyCodec_Lookup(encoding);
519
1.07M
    if (codec == NULL)
520
86.9k
        return NULL;
521
522
    /* Backwards compatibility: assume any raw tuple describes a text
523
     * encoding, and the same for anything lacking the private
524
     * attribute.
525
     */
526
987k
    if (!PyTuple_CheckExact(codec)) {
527
987k
        if (PyObject_GetOptionalAttr(codec, &_Py_ID(_is_text_encoding), &attr) < 0) {
528
0
            Py_DECREF(codec);
529
0
            return NULL;
530
0
        }
531
987k
        if (attr != NULL) {
532
987k
            is_text_codec = PyObject_IsTrue(attr);
533
987k
            Py_DECREF(attr);
534
987k
            if (is_text_codec <= 0) {
535
3.03k
                Py_DECREF(codec);
536
3.03k
                if (!is_text_codec) {
537
3.03k
                    if (alternate_command != NULL) {
538
3.03k
                        PyErr_Format(PyExc_LookupError,
539
3.03k
                                     "'%.400s' is not a text encoding; "
540
3.03k
                                     "use %s to handle arbitrary codecs",
541
3.03k
                                     encoding, alternate_command);
542
3.03k
                    }
543
0
                    else {
544
0
                        PyErr_Format(PyExc_LookupError,
545
0
                                     "'%.400s' is not a text encoding",
546
0
                                     encoding);
547
0
                    }
548
3.03k
                }
549
3.03k
                return NULL;
550
3.03k
            }
551
987k
        }
552
987k
    }
553
554
    /* This appears to be a valid text encoding */
555
984k
    return codec;
556
987k
}
557
558
559
static
560
PyObject *codec_getitem_checked(const char *encoding,
561
                                const char *alternate_command,
562
                                int index)
563
1.07M
{
564
1.07M
    PyObject *codec;
565
1.07M
    PyObject *v;
566
567
1.07M
    codec = _PyCodec_LookupTextEncoding(encoding, alternate_command);
568
1.07M
    if (codec == NULL)
569
90.0k
        return NULL;
570
571
984k
    v = Py_NewRef(PyTuple_GET_ITEM(codec, index));
572
984k
    Py_DECREF(codec);
573
984k
    return v;
574
1.07M
}
575
576
/* Return element 0 of the text codec registered for `encoding`; error
   messages for non-text codecs point users at codecs.encode(). */
static PyObject *
_PyCodec_TextEncoder(const char *encoding)
{
    const int encoder_index = 0;
    return codec_getitem_checked(encoding, "codecs.encode()", encoder_index);
}
580
581
/* Return element 1 of the text codec registered for `encoding`; error
   messages for non-text codecs point users at codecs.decode(). */
static PyObject *
_PyCodec_TextDecoder(const char *encoding)
{
    const int decoder_index = 1;
    return codec_getitem_checked(encoding, "codecs.decode()", decoder_index);
}
585
586
/* Encode `object` with the text encoder registered for `encoding`,
   passing `errors` to it when non-NULL.

   Returns a new reference, or NULL with an exception set. */
PyObject *
_PyCodec_EncodeText(PyObject *object,
                    const char *encoding,
                    const char *errors)
{
    PyObject *encoder = _PyCodec_TextEncoder(encoding);
    if (encoder == NULL) {
        return NULL;
    }
    /* _PyCodec_EncodeInternal() releases the encoder reference. */
    return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
}
598
599
/* Decode `object` with the text decoder registered for `encoding`,
   passing `errors` to it when non-NULL.

   Returns a new reference, or NULL with an exception set. */
PyObject *
_PyCodec_DecodeText(PyObject *object,
                    const char *encoding,
                    const char *errors)
{
    PyObject *decoder = _PyCodec_TextDecoder(encoding);
    if (decoder == NULL) {
        return NULL;
    }
    /* _PyCodec_DecodeInternal() releases the decoder reference. */
    return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
}
611
612
/* Register the error handling callback function error under the name
613
   name. This function will be called by the codec when it encounters
614
   an unencodable characters/undecodable bytes and doesn't know the
615
   callback name, when name is specified as the error parameter
616
   in the call to the encode/decode function.
617
   Return 0 on success, -1 on error */
618
int PyCodec_RegisterError(const char *name, PyObject *error)
619
0
{
620
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
621
0
    assert(interp->codecs.initialized);
622
0
    if (!PyCallable_Check(error)) {
623
0
        PyErr_SetString(PyExc_TypeError, "handler must be callable");
624
0
        return -1;
625
0
    }
626
0
    return PyDict_SetItemString(interp->codecs.error_registry,
627
0
                                name, error);
628
0
}
629
630
int _PyCodec_UnregisterError(const char *name)
631
0
{
632
0
    for (size_t i = 0; i < Py_ARRAY_LENGTH(codecs_builtin_error_handlers); ++i) {
633
0
        if (strcmp(name, codecs_builtin_error_handlers[i]) == 0) {
634
0
            PyErr_Format(PyExc_ValueError,
635
0
                         "cannot un-register built-in error handler '%s'", name);
636
0
            return -1;
637
0
        }
638
0
    }
639
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
640
0
    assert(interp->codecs.initialized);
641
0
    return PyDict_PopString(interp->codecs.error_registry, name, NULL);
642
0
}
643
644
/* Lookup the error handling callback function registered under the
645
   name error. As a special case NULL can be passed, in which case
646
   the error handling callback for strict encoding will be returned. */
647
PyObject *PyCodec_LookupError(const char *name)
648
241k
{
649
241k
    PyInterpreterState *interp = _PyInterpreterState_GET();
650
241k
    assert(interp->codecs.initialized);
651
652
241k
    if (name==NULL)
653
167k
        name = "strict";
654
241k
    PyObject *handler;
655
241k
    if (PyDict_GetItemStringRef(interp->codecs.error_registry, name, &handler) < 0) {
656
0
        return NULL;
657
0
    }
658
241k
    if (handler == NULL) {
659
0
        PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
660
0
        return NULL;
661
0
    }
662
241k
    return handler;
663
241k
}
664
665
666
static inline void
667
wrong_exception_type(PyObject *exc)
668
0
{
669
0
    PyErr_Format(PyExc_TypeError,
670
0
                 "don't know how to handle %T in error callback", exc);
671
0
}
672
673
674
/* Type checks of EXC against each of the three UnicodeError subclasses
   handled by the error callbacks in this file. */
#define _PyIsUnicodeEncodeError(EXC) \
    PyObject_TypeCheck(EXC, (PyTypeObject *)PyExc_UnicodeEncodeError)

#define _PyIsUnicodeDecodeError(EXC) \
    PyObject_TypeCheck(EXC, (PyTypeObject *)PyExc_UnicodeDecodeError)

#define _PyIsUnicodeTranslateError(EXC) \
    PyObject_TypeCheck(EXC, (PyTypeObject *)PyExc_UnicodeTranslateError)
680
681
682
// --- codecs handlers: utilities ---------------------------------------------
683
684
/*
685
 * Return the number of characters (including special prefixes)
686
 * needed to represent 'ch' by codec_handler_write_unicode_hex().
687
 */
688
static inline Py_ssize_t
689
codec_handler_unicode_hex_width(Py_UCS4 ch)
690
0
{
691
0
    if (ch >= 0x10000) {
692
        // format: '\\' + 'U' + 8 hex digits
693
0
        return 1 + 1 + 8;
694
0
    }
695
0
    else if (ch >= 0x100) {
696
        // format: '\\' + 'u' + 4 hex digits
697
0
        return 1 + 1 + 4;
698
0
    }
699
0
    else {
700
        // format: '\\' + 'x' + 2 hex digits
701
0
        return 1 + 1 + 2;
702
0
    }
703
0
}
704
705
706
/*
707
 * Write the hexadecimal representation of 'ch' to the buffer pointed by 'p'
708
 * using 2, 4, or 8 characters prefixed by '\x', '\u', or '\U' respectively.
709
 */
710
static inline void
711
codec_handler_write_unicode_hex(Py_UCS1 **p, Py_UCS4 ch)
712
0
{
713
0
    *(*p)++ = '\\';
714
0
    if (ch >= 0x10000) {
715
0
        *(*p)++ = 'U';
716
0
        *(*p)++ = Py_hexdigits[(ch >> 28) & 0xf];
717
0
        *(*p)++ = Py_hexdigits[(ch >> 24) & 0xf];
718
0
        *(*p)++ = Py_hexdigits[(ch >> 20) & 0xf];
719
0
        *(*p)++ = Py_hexdigits[(ch >> 16) & 0xf];
720
0
        *(*p)++ = Py_hexdigits[(ch >> 12) & 0xf];
721
0
        *(*p)++ = Py_hexdigits[(ch >> 8) & 0xf];
722
0
    }
723
0
    else if (ch >= 0x100) {
724
0
        *(*p)++ = 'u';
725
0
        *(*p)++ = Py_hexdigits[(ch >> 12) & 0xf];
726
0
        *(*p)++ = Py_hexdigits[(ch >> 8) & 0xf];
727
0
    }
728
0
    else {
729
0
        *(*p)++ = 'x';
730
0
    }
731
0
    *(*p)++ = Py_hexdigits[(ch >> 4) & 0xf];
732
0
    *(*p)++ = Py_hexdigits[ch & 0xf];
733
0
}
734
735
736
/*
737
 * Determine the number of digits for a decimal representation of Unicode
738
 * codepoint 'ch' (by design, Unicode codepoints are limited to 7 digits).
739
 */
740
static inline int
741
n_decimal_digits_for_codepoint(Py_UCS4 ch)
742
0
{
743
0
    if (ch < 10) return 1;
744
0
    if (ch < 100) return 2;
745
0
    if (ch < 1000) return 3;
746
0
    if (ch < 10000) return 4;
747
0
    if (ch < 100000) return 5;
748
0
    if (ch < 1000000) return 6;
749
0
    if (ch < 10000000) return 7;
750
    // Unicode codepoints are limited to 1114111 (7 decimal digits)
751
0
    Py_UNREACHABLE();
752
0
}
753
754
755
/*
756
 * Create a Unicode string containing 'count' copies of the official
757
 * Unicode REPLACEMENT CHARACTER (0xFFFD).
758
 */
759
static PyObject *
760
codec_handler_unicode_replacement_character(Py_ssize_t count)
761
205k
{
762
205k
    PyObject *res = PyUnicode_New(count, Py_UNICODE_REPLACEMENT_CHARACTER);
763
205k
    if (res == NULL) {
764
0
        return NULL;
765
0
    }
766
205k
    assert(count == 0 || PyUnicode_KIND(res) == PyUnicode_2BYTE_KIND);
767
205k
    Py_UCS2 *outp = PyUnicode_2BYTE_DATA(res);
768
411k
    for (Py_ssize_t i = 0; i < count; ++i) {
769
205k
        outp[i] = Py_UNICODE_REPLACEMENT_CHARACTER;
770
205k
    }
771
205k
    assert(_PyUnicode_CheckConsistency(res, 1));
772
205k
    return res;
773
205k
}
774
775
776
// --- handler: 'strict' ------------------------------------------------------
777
778
/* 'strict' error handler: re-raise `exc` if it is an exception instance,
   otherwise raise a TypeError.  Always returns NULL. */
PyObject *
PyCodec_StrictErrors(PyObject *exc)
{
    if (!PyExceptionInstance_Check(exc)) {
        PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
        return NULL;
    }
    PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
    return NULL;
}
788
789
790
// --- handler: 'ignore' ------------------------------------------------------
791
792
/* Shared implementation of the 'ignore' handler: return the tuple
   (empty string, end), where `end` is obtained from `exc` through
   _PyUnicodeError_GetParams().  `as_bytes` is forwarded to that helper.

   Returns a new reference, or NULL with an exception set. */
static PyObject *
_PyCodec_IgnoreError(PyObject *exc, int as_bytes)
{
    Py_ssize_t end;
    int rc = _PyUnicodeError_GetParams(exc, NULL, NULL, NULL,
                                       &end, NULL, as_bytes);
    if (rc < 0) {
        return NULL;
    }

    /* "N" hands the reference to the empty string over to the tuple. */
    PyObject *empty = Py_GetConstant(Py_CONSTANT_EMPTY_STR);
    return Py_BuildValue("(Nn)", empty, end);
}
803
804
805
/* 'ignore' error handler: dispatch on the type of `exc`.  Encode and
   translate errors are handled with as_bytes=false, decode errors with
   as_bytes=true; any other object raises a TypeError. */
PyObject *
PyCodec_IgnoreErrors(PyObject *exc)
{
    int as_bytes;
    if (_PyIsUnicodeEncodeError(exc) || _PyIsUnicodeTranslateError(exc)) {
        as_bytes = false;
    }
    else if (_PyIsUnicodeDecodeError(exc)) {
        as_bytes = true;
    }
    else {
        wrong_exception_type(exc);
        return NULL;
    }
    return _PyCodec_IgnoreError(exc, as_bytes);
}
818
819
820
// --- handler: 'replace' -----------------------------------------------------
821
822
/* 'replace' handler for UnicodeEncodeError: return the tuple
   (replacement, end) where the replacement is a string of `slen` '?'
   characters, `slen` and `end` being obtained from `exc` through
   _PyUnicodeError_GetParams().

   Returns a new reference, or NULL with an exception set. */
static PyObject *
_PyCodec_ReplaceUnicodeEncodeError(PyObject *exc)
{
    Py_ssize_t start, end, slen;
    int rc = _PyUnicodeError_GetParams(exc, NULL, NULL,
                                       &start, &end, &slen, false);
    if (rc < 0) {
        return NULL;
    }

    PyObject *replacement = PyUnicode_New(slen, '?');
    if (replacement == NULL) {
        return NULL;
    }
    assert(PyUnicode_KIND(replacement) == PyUnicode_1BYTE_KIND);

    Py_UCS1 *data = PyUnicode_1BYTE_DATA(replacement);
    memset(data, '?', sizeof(Py_UCS1) * slen);
    assert(_PyUnicode_CheckConsistency(replacement, 1));

    /* "N" hands the reference to the replacement over to the tuple. */
    return Py_BuildValue("(Nn)", replacement, end);
}
841
842
843
/* 'replace' handler for UnicodeDecodeError: return the tuple
   (replacement, end) where the replacement is a single REPLACEMENT
   CHARACTER and `end` is the end position stored in `exc`.

   Returns a new reference, or NULL with an exception set. */
static PyObject *
_PyCodec_ReplaceUnicodeDecodeError(PyObject *exc)
{
    Py_ssize_t end;
    if (PyUnicodeDecodeError_GetEnd(exc, &end) < 0) {
        return NULL;
    }

    PyObject *replacement = codec_handler_unicode_replacement_character(1);
    if (replacement == NULL) {
        return NULL;
    }
    /* "N" hands the reference to the replacement over to the tuple. */
    return Py_BuildValue("(Nn)", replacement, end);
}
856
857
858
/* 'replace' handler for UnicodeTranslateError: return the tuple
   (replacement, end) where the replacement consists of `slen` REPLACEMENT
   CHARACTERs, `slen` and `end` being obtained from `exc` through
   _PyUnicodeError_GetParams().

   Returns a new reference, or NULL with an exception set. */
static PyObject *
_PyCodec_ReplaceUnicodeTranslateError(PyObject *exc)
{
    Py_ssize_t start, end, slen;
    int rc = _PyUnicodeError_GetParams(exc, NULL, NULL,
                                       &start, &end, &slen, false);
    if (rc < 0) {
        return NULL;
    }

    PyObject *replacement = codec_handler_unicode_replacement_character(slen);
    if (replacement == NULL) {
        return NULL;
    }
    /* "N" hands the reference to the replacement over to the tuple. */
    return Py_BuildValue("(Nn)", replacement, end);
}
873
874
875
/* 'replace' error handler: dispatch to the encode, decode or translate
   variant according to the type of `exc` (checked in that order); any
   other object raises a TypeError. */
PyObject *
PyCodec_ReplaceErrors(PyObject *exc)
{
    if (_PyIsUnicodeEncodeError(exc)) {
        return _PyCodec_ReplaceUnicodeEncodeError(exc);
    }
    if (_PyIsUnicodeDecodeError(exc)) {
        return _PyCodec_ReplaceUnicodeDecodeError(exc);
    }
    if (_PyIsUnicodeTranslateError(exc)) {
        return _PyCodec_ReplaceUnicodeTranslateError(exc);
    }

    wrong_exception_type(exc);
    return NULL;
}
891
892
893
// --- handler: 'xmlcharrefreplace' -------------------------------------------
894
895
/*
 * Implements the 'xmlcharrefreplace' error handler.
 *
 * Only UnicodeEncodeError objects are accepted; anything else is reported
 * through wrong_exception_type() and NULL is returned.
 *
 * On success, returns a new (replacement, end) tuple where 'replacement'
 * is an ASCII string holding one "&#" + DIGITS + ";" reference for each
 * character of obj[start:end], and 'end' is the (possibly clamped) end of
 * the handled range. Returns NULL with an exception set on failure.
 */
PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
{
    if (!_PyIsUnicodeEncodeError(exc)) {
        wrong_exception_type(exc);
        return NULL;
    }

    PyObject *obj;
    Py_ssize_t objlen, start, end, slen;
    int rc = _PyUnicodeError_GetParams(exc,
                                       &obj, &objlen,
                                       &start, &end, &slen, false);
    if (rc < 0) {
        return NULL;
    }

    // Every character 'ch' expands to "&#" + k decimal digits + ";",
    // where k <= 7 (asserted below), i.e. to at most 2 + 7 + 1 characters.
    // Shrink the handled range if needed so that the total output size
    // cannot overflow a Py_ssize_t.
    if (slen > PY_SSIZE_T_MAX / (2 + 7 + 1)) {
        end = start + PY_SSIZE_T_MAX / (2 + 7 + 1);
        end = Py_MIN(end, objlen);
        slen = Py_MAX(0, end - start);
    }

    /* first pass: compute the exact size of the replacement */
    Py_ssize_t total = 0;
    for (Py_ssize_t pos = start; pos < end; ++pos) {
        Py_UCS4 ch = PyUnicode_READ_CHAR(obj, pos);
        int ndigits = n_decimal_digits_for_codepoint(ch);
        assert(ndigits != 0);
        assert(ndigits <= 7);
        total += 2 + ndigits + 1;
    }

    /* allocate replacement (ASCII only, hence maxchar 127) */
    PyObject *res = PyUnicode_New(total, 127);
    if (res == NULL) {
        Py_DECREF(obj);
        return NULL;
    }

    /* second pass: generate replacement */
    Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
    for (Py_ssize_t pos = start; pos < end; ++pos) {
        Py_UCS4 ch = PyUnicode_READ_CHAR(obj, pos);
        int ndigits = n_decimal_digits_for_codepoint(ch);

        *outp++ = '&';
        *outp++ = '#';
        // Fill the digits from the least significant one (rightmost slot)
        // to the most significant one (leftmost slot).
        for (int slot = ndigits - 1; slot >= 0; --slot) {
            outp[slot] = '0' + (ch % 10);
            ch /= 10;
        }
        assert(ch == 0);
        outp += ndigits;
        *outp++ = ';';
    }
    assert(_PyUnicode_CheckConsistency(res, 1));

    // "N" hands the reference to 'res' over to the tuple.
    PyObject *restuple = Py_BuildValue("(Nn)", res, end);
    Py_DECREF(obj);
    return restuple;
}
961
962
963
// --- handler: 'backslashreplace' --------------------------------------------
964
965
/*
 * 'backslashreplace' handler for UnicodeEncodeError objects.
 *
 * Returns a new (replacement, end) tuple where 'replacement' is an ASCII
 * string made of the escapes written by codec_handler_write_unicode_hex()
 * for each character of obj[start:end], and 'end' is the (possibly
 * clamped) end of the handled range. Returns NULL on failure.
 */
static PyObject *
_PyCodec_BackslashReplaceUnicodeEncodeError(PyObject *exc)
{
    PyObject *obj;
    Py_ssize_t objlen, start, end, slen;
    int rc = _PyUnicodeError_GetParams(exc,
                                       &obj, &objlen,
                                       &start, &end, &slen, false);
    if (rc < 0) {
        return NULL;
    }

    // Each character is formatted as "\\" + ('U'|'u'|'x') + HEXDIGITS
    // with 2, 4 or 8 hexadecimal digits, i.e. at most 1 + 1 + 8 output
    // characters. Shrink the handled range if needed so that the total
    // output size cannot overflow a Py_ssize_t.
    if (slen > PY_SSIZE_T_MAX / (1 + 1 + 8)) {
        end = start + PY_SSIZE_T_MAX / (1 + 1 + 8);
        end = Py_MIN(end, objlen);
        slen = Py_MAX(0, end - start);
    }

    // First pass: exact size of the replacement.
    Py_ssize_t total = 0;
    for (Py_ssize_t pos = start; pos < end; ++pos) {
        total += codec_handler_unicode_hex_width(PyUnicode_READ_CHAR(obj, pos));
    }

    PyObject *res = PyUnicode_New(total, 127);
    if (res == NULL) {
        Py_DECREF(obj);
        return NULL;
    }

    // Second pass: write the escapes; the helper advances the cursor.
    Py_UCS1 *cursor = PyUnicode_1BYTE_DATA(res);
    for (Py_ssize_t pos = start; pos < end; ++pos) {
        codec_handler_write_unicode_hex(&cursor, PyUnicode_READ_CHAR(obj, pos));
    }
    assert(_PyUnicode_CheckConsistency(res, 1));
    Py_DECREF(obj);
    // "N" hands the reference to 'res' over to the tuple.
    return Py_BuildValue("(Nn)", res, end);
}
1008
1009
1010
/*
 * 'backslashreplace' handler for UnicodeDecodeError objects.
 *
 * Every byte of obj[start:end] is replaced by the 4-character escape
 * "\xHH" (lowercase hexadecimal digits taken from Py_hexdigits).
 *
 * Returns a new (replacement, end) tuple, or NULL with an exception set
 * on failure. 'end' may be smaller than the end stored in 'exc' if the
 * range had to be clamped (see below).
 */
static PyObject *
_PyCodec_BackslashReplaceUnicodeDecodeError(PyObject *exc)
{
    PyObject *obj;
    Py_ssize_t objlen, start, end, slen;
    if (_PyUnicodeError_GetParams(exc,
                                  &obj, &objlen,
                                  &start, &end, &slen, true) < 0)
    {
        return NULL;
    }

    // Each byte contributes exactly 1 + 1 + 2 characters. Clamp the range,
    // as the other handlers do, so that '4 * slen' below cannot overflow a
    // Py_ssize_t (signed overflow is undefined behaviour).
    if (slen > PY_SSIZE_T_MAX / 4) {
        end = start + PY_SSIZE_T_MAX / 4;
        end = Py_MIN(end, objlen);
        slen = Py_MAX(0, end - start);
    }

    PyObject *res = PyUnicode_New(4 * slen, 127);
    if (res == NULL) {
        Py_DECREF(obj);
        return NULL;
    }

    Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
    const unsigned char *p = (const unsigned char *)PyBytes_AS_STRING(obj);
    for (Py_ssize_t i = start; i < end; i++, outp += 4) {
        const unsigned char ch = p[i];
        outp[0] = '\\';
        outp[1] = 'x';
        outp[2] = Py_hexdigits[(ch >> 4) & 0xf];    // high nibble
        outp[3] = Py_hexdigits[ch & 0xf];           // low nibble
    }
    assert(_PyUnicode_CheckConsistency(res, 1));
    Py_DECREF(obj);
    // "N" hands the reference to 'res' over to the tuple.
    return Py_BuildValue("(Nn)", res, end);
}
1041
1042
1043
/*
 * 'backslashreplace' handler for UnicodeTranslateError objects.
 *
 * Delegates to the UnicodeEncodeError implementation and returns its
 * result unchanged.
 */
static inline PyObject *
_PyCodec_BackslashReplaceUnicodeTranslateError(PyObject *exc)
{
    PyObject *restuple = _PyCodec_BackslashReplaceUnicodeEncodeError(exc);
    return restuple;
}
1049
1050
1051
/*
 * Implements the 'backslashreplace' error handler.
 *
 * Dispatches on the kind of Unicode error carried by 'exc' (encode,
 * decode or translate). Any other object is reported through
 * wrong_exception_type() and NULL is returned.
 */
PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
{
    if (_PyIsUnicodeEncodeError(exc)) {
        return _PyCodec_BackslashReplaceUnicodeEncodeError(exc);
    }
    if (_PyIsUnicodeDecodeError(exc)) {
        return _PyCodec_BackslashReplaceUnicodeDecodeError(exc);
    }
    if (_PyIsUnicodeTranslateError(exc)) {
        return _PyCodec_BackslashReplaceUnicodeTranslateError(exc);
    }
    // Not a supported Unicode error.
    wrong_exception_type(exc);
    return NULL;
}
1067
1068
1069
// --- handler: 'namereplace' -------------------------------------------------
1070
1071
/*
 * Implements the 'namereplace' error handler.
 *
 * Only UnicodeEncodeError objects are accepted; anything else is reported
 * through wrong_exception_type() and NULL is returned.
 *
 * Each character of obj[start:end] is replaced by "\N{NAME}" when the
 * name lookup succeeds, and by the escape written by
 * codec_handler_write_unicode_hex() otherwise.
 *
 * Returns a new (replacement, stop) tuple where 'stop' is the index of
 * the first character that was not handled (it is smaller than 'end' if
 * the output size would have exceeded PY_SSIZE_T_MAX). Returns NULL with
 * an exception set on failure.
 */
PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
{
    if (!_PyIsUnicodeEncodeError(exc)) {
        wrong_exception_type(exc);
        return NULL;
    }

    _PyUnicode_Name_CAPI *names = _PyUnicode_GetNameCAPI();
    if (names == NULL) {
        return NULL;
    }

    PyObject *obj;
    Py_ssize_t start, end;
    int rc = _PyUnicodeError_GetParams(exc,
                                       &obj, NULL,
                                       &start, &end, NULL, false);
    if (rc < 0) {
        return NULL;
    }

    char namebuf[256]; /* NAME_MAXLEN in unicodename_db.h */

    // First pass: compute the size of the replacement and find out how
    // many characters can be handled without overflowing a Py_ssize_t.
    Py_ssize_t stop = start;
    Py_ssize_t total = 0;
    while (stop < end) {
        Py_UCS4 c = PyUnicode_READ_CHAR(obj, stop);
        Py_ssize_t width;
        if (names->getname(c, namebuf, sizeof(namebuf), 1)) {
            // '\\' + 'N' + '{' + NAME + '}'
            width = 1 + 1 + 1 + strlen(namebuf) + 1;
        }
        else {
            // Name lookup failures are not errors: fall back to the
            // hexadecimal escape.
            width = codec_handler_unicode_hex_width(c);
        }
        if (total > PY_SSIZE_T_MAX - width) {
            break;
        }
        total += width;
        ++stop;
    }

    PyObject *res = PyUnicode_New(total, 127);
    if (res == NULL) {
        Py_DECREF(obj);
        return NULL;
    }

    // Second pass: write the replacement for obj[start:stop].
    Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
    for (Py_ssize_t pos = start; pos < stop; ++pos) {
        Py_UCS4 c = PyUnicode_READ_CHAR(obj, pos);
        if (!names->getname(c, namebuf, sizeof(namebuf), 1)) {
            codec_handler_write_unicode_hex(&outp, c);
            continue;
        }
        const size_t namelen = strlen(namebuf);
        *outp++ = '\\';
        *outp++ = 'N';
        *outp++ = '{';
        memcpy(outp, namebuf, namelen);
        outp += namelen;
        *outp++ = '}';
    }

    assert(outp == PyUnicode_1BYTE_DATA(res) + total);
    assert(_PyUnicode_CheckConsistency(res, 1));
    // "N" hands the reference to 'res' over to the tuple.
    PyObject *restuple = Py_BuildValue("(Nn)", res, stop);
    Py_DECREF(obj);
    return restuple;
}
1139
1140
1141
0
/* Codes returned by get_standard_encoding_impl(). */
#define ENC_UNKNOWN     -1  /* none of the encodings below */
#define ENC_UTF8        0
#define ENC_UTF16BE     1
#define ENC_UTF16LE     2
#define ENC_UTF32BE     3
#define ENC_UTF32LE     4
1147
1148
static int
1149
get_standard_encoding_impl(const char *encoding, int *bytelength)
1150
0
{
1151
0
    if (Py_TOLOWER(encoding[0]) == 'u' &&
1152
0
        Py_TOLOWER(encoding[1]) == 't' &&
1153
0
        Py_TOLOWER(encoding[2]) == 'f') {
1154
0
        encoding += 3;
1155
0
        if (*encoding == '-' || *encoding == '_' )
1156
0
            encoding++;
1157
0
        if (encoding[0] == '8' && encoding[1] == '\0') {
1158
0
            *bytelength = 3;
1159
0
            return ENC_UTF8;
1160
0
        }
1161
0
        else if (encoding[0] == '1' && encoding[1] == '6') {
1162
0
            encoding += 2;
1163
0
            *bytelength = 2;
1164
0
            if (*encoding == '\0') {
1165
#ifdef WORDS_BIGENDIAN
1166
                return ENC_UTF16BE;
1167
#else
1168
0
                return ENC_UTF16LE;
1169
0
#endif
1170
0
            }
1171
0
            if (*encoding == '-' || *encoding == '_' )
1172
0
                encoding++;
1173
0
            if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
1174
0
                if (Py_TOLOWER(encoding[0]) == 'b')
1175
0
                    return ENC_UTF16BE;
1176
0
                if (Py_TOLOWER(encoding[0]) == 'l')
1177
0
                    return ENC_UTF16LE;
1178
0
            }
1179
0
        }
1180
0
        else if (encoding[0] == '3' && encoding[1] == '2') {
1181
0
            encoding += 2;
1182
0
            *bytelength = 4;
1183
0
            if (*encoding == '\0') {
1184
#ifdef WORDS_BIGENDIAN
1185
                return ENC_UTF32BE;
1186
#else
1187
0
                return ENC_UTF32LE;
1188
0
#endif
1189
0
            }
1190
0
            if (*encoding == '-' || *encoding == '_' )
1191
0
                encoding++;
1192
0
            if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
1193
0
                if (Py_TOLOWER(encoding[0]) == 'b')
1194
0
                    return ENC_UTF32BE;
1195
0
                if (Py_TOLOWER(encoding[0]) == 'l')
1196
0
                    return ENC_UTF32LE;
1197
0
            }
1198
0
        }
1199
0
    }
1200
0
    else if (strcmp(encoding, "cp65001") == 0) {
1201
0
        *bytelength = 3;
1202
0
        return ENC_UTF8;
1203
0
    }
1204
0
    return ENC_UNKNOWN;
1205
0
}
1206
1207
1208
static int
1209
get_standard_encoding(PyObject *encoding, int *code, int *bytelength)
1210
0
{
1211
0
    const char *encoding_cstr = PyUnicode_AsUTF8(encoding);
1212
0
    if (encoding_cstr == NULL) {
1213
0
        return -1;
1214
0
    }
1215
0
    *code = get_standard_encoding_impl(encoding_cstr, bytelength);
1216
0
    return 0;
1217
0
}
1218
1219
1220
// --- handler: 'surrogatepass' -----------------------------------------------
1221
1222
/*
 * 'surrogatepass' handler for UnicodeEncodeError objects.
 *
 * If the encoding of 'exc' is one recognized by get_standard_encoding()
 * and every character of obj[start:end] is a surrogate, returns a new
 * (bytes, end) tuple holding the surrogates encoded with that encoding.
 * Otherwise 'exc' itself is set as the current exception and NULL is
 * returned. NULL is also returned on any other failure.
 */
static PyObject *
_PyCodec_SurrogatePassUnicodeEncodeError(PyObject *exc)
{
    PyObject *encname = PyUnicodeEncodeError_GetEncoding(exc);
    if (encname == NULL) {
        return NULL;
    }
    int code, bytelength;
    int status = get_standard_encoding(encname, &code, &bytelength);
    Py_DECREF(encname);
    if (status < 0) {
        return NULL;
    }
    if (code == ENC_UNKNOWN) {
        goto bail;
    }

    PyObject *obj;
    Py_ssize_t objlen, start, end, slen;
    status = _PyUnicodeError_GetParams(exc,
                                       &obj, &objlen,
                                       &start, &end, &slen, false);
    if (status < 0) {
        return NULL;
    }

    // Every character produces exactly 'bytelength' bytes. Shrink the
    // handled range if needed so that the output size cannot overflow.
    if (slen > PY_SSIZE_T_MAX / bytelength) {
        end = start + PY_SSIZE_T_MAX / bytelength;
        end = Py_MIN(end, objlen);
        slen = Py_MAX(0, end - start);
    }

    PyObject *res = PyBytes_FromStringAndSize(NULL, bytelength * slen);
    if (res == NULL) {
        Py_DECREF(obj);
        return NULL;
    }

    unsigned char *outp = (unsigned char *)PyBytes_AsString(res);
    for (Py_ssize_t pos = start; pos < end; pos++) {
        Py_UCS4 ch = PyUnicode_READ_CHAR(obj, pos);
        if (!Py_UNICODE_IS_SURROGATE(ch)) {
            /* Not a surrogate, fail with original exception */
            Py_DECREF(obj);
            Py_DECREF(res);
            goto bail;
        }
        // The four bytes of 'ch', least significant first.
        const unsigned char b0 = (unsigned char)ch;
        const unsigned char b1 = (unsigned char)(ch >> 8);
        const unsigned char b2 = (unsigned char)(ch >> 16);
        const unsigned char b3 = (unsigned char)(ch >> 24);
        switch (code) {
            case ENC_UTF8: {
                // Three-byte UTF-8 sequence: 1110xxxx 10xxxxxx 10xxxxxx.
                *outp++ = (unsigned char)(0xe0 | (ch >> 12));
                *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
                *outp++ = (unsigned char)(0x80 | (ch & 0x3f));
                break;
            }
            case ENC_UTF16LE: {
                *outp++ = b0;
                *outp++ = b1;
                break;
            }
            case ENC_UTF16BE: {
                *outp++ = b1;
                *outp++ = b0;
                break;
            }
            case ENC_UTF32LE: {
                *outp++ = b0;
                *outp++ = b1;
                *outp++ = b2;
                *outp++ = b3;
                break;
            }
            case ENC_UTF32BE: {
                *outp++ = b3;
                *outp++ = b2;
                *outp++ = b1;
                *outp++ = b0;
                break;
            }
        }
    }

    Py_DECREF(obj);
    // "N" hands the reference to 'res' over to the tuple.
    return Py_BuildValue("(Nn)", res, end);

bail:
    // Re-raise the exception that was handed to the handler.
    PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
    return NULL;
}
1311
1312
1313
/*
 * 'surrogatepass' handler for UnicodeDecodeError objects.
 *
 * If the encoding of 'exc' is one recognized by get_standard_encoding()
 * and the bytes at obj[start:start + bytelength] encode a surrogate code
 * point, returns a new (str, start + bytelength) tuple holding that
 * single surrogate. Otherwise 'exc' itself is set as the current
 * exception and NULL is returned. NULL is also returned on any other
 * failure.
 */
static PyObject *
_PyCodec_SurrogatePassUnicodeDecodeError(PyObject *exc)
{
    PyObject *encoding = PyUnicodeDecodeError_GetEncoding(exc);
    if (encoding == NULL) {
        return NULL;
    }
    int code, bytelength;
    int rc = get_standard_encoding(encoding, &code, &bytelength);
    Py_DECREF(encoding);
    if (rc < 0) {
        return NULL;
    }
    if (code == ENC_UNKNOWN) {
        goto bail;
    }

    PyObject *obj;
    Py_ssize_t objlen, start, end, slen;
    if (_PyUnicodeError_GetParams(exc,
                                  &obj, &objlen,
                                  &start, &end, &slen, true) < 0)
    {
        return NULL;
    }

    /* Try decoding a single surrogate character. If
       there are more, let the codec call us again. */
    Py_UCS4 ch = 0;
    const unsigned char *p = (const unsigned char *)PyBytes_AS_STRING(obj);
    p += start;

    // Only decode if a full code unit is available; otherwise 'ch' stays 0
    // and the original exception is re-raised below.
    if (objlen - start >= bytelength) {
        switch (code) {
            case ENC_UTF8: {
                if ((p[0] & 0xf0) == 0xe0 &&
                    (p[1] & 0xc0) == 0x80 &&
                    (p[2] & 0xc0) == 0x80)
                {
                    /* it's a three-byte code */
                    ch = ((p[0] & 0x0f) << 12) +
                         ((p[1] & 0x3f) << 6)  +
                          (p[2] & 0x3f);
                }
                break;
            }
            case ENC_UTF16LE: {
                ch = p[1] << 8 | p[0];
                break;
            }
            case ENC_UTF16BE: {
                ch = p[0] << 8 | p[1];
                break;
            }
            // For UTF-32, widen each byte to Py_UCS4 *before* shifting:
            // a byte >= 0x80 promoted to (signed) int and shifted left by
            // 24 would overflow into the sign bit, which is undefined
            // behaviour.
            case ENC_UTF32LE: {
                ch = ((Py_UCS4)p[3] << 24) | ((Py_UCS4)p[2] << 16) |
                     ((Py_UCS4)p[1] << 8)  |  (Py_UCS4)p[0];
                break;
            }
            case ENC_UTF32BE: {
                ch = ((Py_UCS4)p[0] << 24) | ((Py_UCS4)p[1] << 16) |
                     ((Py_UCS4)p[2] << 8)  |  (Py_UCS4)p[3];
                break;
            }
        }
    }
    Py_DECREF(obj);
    if (!Py_UNICODE_IS_SURROGATE(ch)) {
        goto bail;
    }

    PyObject *res = PyUnicode_FromOrdinal(ch);
    if (res == NULL) {
        return NULL;
    }
    // "N" hands the reference to 'res' over to the tuple.
    return Py_BuildValue("(Nn)", res, start + bytelength);

bail:
    // Re-raise the exception that was handed to the handler.
    PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
    return NULL;
}
1392
1393
1394
/* This handler is declared static until someone demonstrates
1395
   a need to call it directly. */
1396
static PyObject *
1397
PyCodec_SurrogatePassErrors(PyObject *exc)
1398
0
{
1399
0
    if (_PyIsUnicodeEncodeError(exc)) {
1400
0
        return _PyCodec_SurrogatePassUnicodeEncodeError(exc);
1401
0
    }
1402
0
    else if (_PyIsUnicodeDecodeError(exc)) {
1403
0
        return _PyCodec_SurrogatePassUnicodeDecodeError(exc);
1404
0
    }
1405
0
    else {
1406
0
        wrong_exception_type(exc);
1407
0
        return NULL;
1408
0
    }
1409
0
}
1410
1411
1412
// --- handler: 'surrogateescape' ---------------------------------------------
1413
1414
/*
 * 'surrogateescape' handler for UnicodeEncodeError objects.
 *
 * If every character of obj[start:end] lies in U+DC80..U+DCFF, returns a
 * new (bytes, end) tuple where each character 'ch' is turned into the
 * byte 'ch - 0xdc00'. Otherwise 'exc' itself is set as the current
 * exception and NULL is returned. NULL is also returned on any other
 * failure.
 */
static PyObject *
_PyCodec_SurrogateEscapeUnicodeEncodeError(PyObject *exc)
{
    PyObject *obj;
    Py_ssize_t start, end, slen;
    int rc = _PyUnicodeError_GetParams(exc,
                                       &obj, NULL,
                                       &start, &end, &slen, false);
    if (rc < 0) {
        return NULL;
    }

    // One output byte per input character.
    PyObject *res = PyBytes_FromStringAndSize(NULL, slen);
    if (res == NULL) {
        Py_DECREF(obj);
        return NULL;
    }

    char *dest = PyBytes_AsString(res);
    for (Py_ssize_t pos = start; pos < end; pos++) {
        Py_UCS4 ch = PyUnicode_READ_CHAR(obj, pos);
        if (!(ch >= 0xdc80 && ch <= 0xdcff)) {
            /* Not a UTF-8b surrogate, fail with original exception. */
            Py_DECREF(obj);
            Py_DECREF(res);
            PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
            return NULL;
        }
        *dest++ = ch - 0xdc00;
    }
    Py_DECREF(obj);

    // "N" hands the reference to 'res' over to the tuple.
    return Py_BuildValue("(Nn)", res, end);
}
1448
1449
1450
/*
 * 'surrogateescape' handler for UnicodeDecodeError objects.
 *
 * Escapes up to 4 leading bytes of obj[start:end], stopping at the first
 * byte below 128: each escaped byte 'b' becomes the code point
 * 0xdc00 + b. Returns a new (str, start + consumed) tuple.
 *
 * If not even one byte can be escaped, 'exc' itself is set as the current
 * exception and NULL is returned. NULL is also returned on any other
 * failure.
 */
static PyObject *
_PyCodec_SurrogateEscapeUnicodeDecodeError(PyObject *exc)
{
    PyObject *obj;
    Py_ssize_t start, end, slen;
    int rc = _PyUnicodeError_GetParams(exc,
                                       &obj, NULL,
                                       &start, &end, &slen, true);
    if (rc < 0) {
        return NULL;
    }

    Py_UCS2 escaped[4]; /* decode up to 4 bad bytes. */
    int consumed;
    const unsigned char *bytes = (const unsigned char *)PyBytes_AS_STRING(obj);
    for (consumed = 0; consumed < 4 && consumed < slen; consumed++) {
        const unsigned char byte = bytes[start + consumed];
        /* Refuse to escape ASCII bytes. */
        if (byte < 128) {
            break;
        }
        escaped[consumed] = 0xdc00 + byte;
    }
    Py_DECREF(obj);

    if (consumed == 0) {
        /* Codec complained about ASCII byte. */
        PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
        return NULL;
    }

    PyObject *str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
                                              escaped, consumed);
    if (str == NULL) {
        return NULL;
    }
    // "N" hands the reference to 'str' over to the tuple.
    return Py_BuildValue("(Nn)", str, start + consumed);
}
1487
1488
1489
/*
 * Implements the 'surrogateescape' error handler.
 *
 * Only UnicodeEncodeError and UnicodeDecodeError objects are supported;
 * anything else is reported through wrong_exception_type() and NULL is
 * returned.
 */
static PyObject *
PyCodec_SurrogateEscapeErrors(PyObject *exc)
{
    if (_PyIsUnicodeEncodeError(exc)) {
        return _PyCodec_SurrogateEscapeUnicodeEncodeError(exc);
    }
    if (_PyIsUnicodeDecodeError(exc)) {
        return _PyCodec_SurrogateEscapeUnicodeDecodeError(exc);
    }
    // Not a supported Unicode error.
    wrong_exception_type(exc);
    return NULL;
}
1503
1504
1505
// --- Codecs registry handlers -----------------------------------------------
1506
1507
/* METH_O adapter around PyCodec_StrictErrors(); 'self' is unused. */
static inline PyObject *
strict_errors(PyObject *Py_UNUSED(self), PyObject *exc)
{
    PyObject *result = PyCodec_StrictErrors(exc);
    return result;
}
1512
1513
1514
/* METH_O adapter around PyCodec_IgnoreErrors(); 'self' is unused. */
static inline PyObject *
ignore_errors(PyObject *Py_UNUSED(self), PyObject *exc)
{
    PyObject *result = PyCodec_IgnoreErrors(exc);
    return result;
}
1519
1520
1521
/* METH_O adapter around PyCodec_ReplaceErrors(); 'self' is unused. */
static inline PyObject *
replace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
{
    PyObject *result = PyCodec_ReplaceErrors(exc);
    return result;
}
1526
1527
1528
/* METH_O adapter around PyCodec_XMLCharRefReplaceErrors(); 'self' is
   unused. */
static inline PyObject *
xmlcharrefreplace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
{
    PyObject *result = PyCodec_XMLCharRefReplaceErrors(exc);
    return result;
}
1533
1534
1535
/* METH_O adapter around PyCodec_BackslashReplaceErrors(); 'self' is
   unused. */
static inline PyObject *
backslashreplace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
{
    PyObject *result = PyCodec_BackslashReplaceErrors(exc);
    return result;
}
1540
1541
1542
/* METH_O adapter around PyCodec_NameReplaceErrors(); 'self' is unused. */
static inline PyObject *
namereplace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
{
    PyObject *result = PyCodec_NameReplaceErrors(exc);
    return result;
}
1547
1548
1549
/* METH_O adapter around PyCodec_SurrogatePassErrors(); 'self' is unused. */
static inline PyObject *
surrogatepass_errors(PyObject *Py_UNUSED(self), PyObject *exc)
{
    PyObject *result = PyCodec_SurrogatePassErrors(exc);
    return result;
}
1554
1555
1556
/* METH_O adapter around PyCodec_SurrogateEscapeErrors(); 'self' is
   unused. */
static inline PyObject *
surrogateescape_errors(PyObject *Py_UNUSED(self), PyObject *exc)
{
    PyObject *result = PyCodec_SurrogateEscapeErrors(exc);
    return result;
}
1561
1562
1563
PyStatus
1564
_PyCodec_InitRegistry(PyInterpreterState *interp)
1565
16
{
1566
16
    static struct {
1567
16
        const char *name;
1568
16
        PyMethodDef def;
1569
16
    } methods[] =
1570
16
    {
1571
16
        {
1572
16
            "strict",
1573
16
            {
1574
16
                "strict_errors",
1575
16
                strict_errors,
1576
16
                METH_O,
1577
16
                PyDoc_STR("Implements the 'strict' error handling, which "
1578
16
                          "raises a UnicodeError on coding errors.")
1579
16
            }
1580
16
        },
1581
16
        {
1582
16
            "ignore",
1583
16
            {
1584
16
                "ignore_errors",
1585
16
                ignore_errors,
1586
16
                METH_O,
1587
16
                PyDoc_STR("Implements the 'ignore' error handling, which "
1588
16
                          "ignores malformed data and continues.")
1589
16
            }
1590
16
        },
1591
16
        {
1592
16
            "replace",
1593
16
            {
1594
16
                "replace_errors",
1595
16
                replace_errors,
1596
16
                METH_O,
1597
16
                PyDoc_STR("Implements the 'replace' error handling, which "
1598
16
                          "replaces malformed data with a replacement marker.")
1599
16
            }
1600
16
        },
1601
16
        {
1602
16
            "xmlcharrefreplace",
1603
16
            {
1604
16
                "xmlcharrefreplace_errors",
1605
16
                xmlcharrefreplace_errors,
1606
16
                METH_O,
1607
16
                PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
1608
16
                          "which replaces an unencodable character with the "
1609
16
                          "appropriate XML character reference.")
1610
16
            }
1611
16
        },
1612
16
        {
1613
16
            "backslashreplace",
1614
16
            {
1615
16
                "backslashreplace_errors",
1616
16
                backslashreplace_errors,
1617
16
                METH_O,
1618
16
                PyDoc_STR("Implements the 'backslashreplace' error handling, "
1619
16
                          "which replaces malformed data with a backslashed "
1620
16
                          "escape sequence.")
1621
16
            }
1622
16
        },
1623
16
        {
1624
16
            "namereplace",
1625
16
            {
1626
16
                "namereplace_errors",
1627
16
                namereplace_errors,
1628
16
                METH_O,
1629
16
                PyDoc_STR("Implements the 'namereplace' error handling, "
1630
16
                          "which replaces an unencodable character with a "
1631
16
                          "\\N{...} escape sequence.")
1632
16
            }
1633
16
        },
1634
16
        {
1635
16
            "surrogatepass",
1636
16
            {
1637
16
                "surrogatepass",
1638
16
                surrogatepass_errors,
1639
16
                METH_O
1640
16
            }
1641
16
        },
1642
16
        {
1643
16
            "surrogateescape",
1644
16
            {
1645
16
                "surrogateescape",
1646
16
                surrogateescape_errors,
1647
16
                METH_O
1648
16
            }
1649
16
        }
1650
16
    };
1651
    // ensure that the built-in error handlers' names are kept in sync
1652
16
    assert(Py_ARRAY_LENGTH(methods) == Py_ARRAY_LENGTH(codecs_builtin_error_handlers));
1653
1654
16
    assert(interp->codecs.initialized == 0);
1655
16
    interp->codecs.search_path = PyList_New(0);
1656
16
    if (interp->codecs.search_path == NULL) {
1657
0
        return PyStatus_NoMemory();
1658
0
    }
1659
16
    interp->codecs.search_cache = PyDict_New();
1660
16
    if (interp->codecs.search_cache == NULL) {
1661
0
        return PyStatus_NoMemory();
1662
0
    }
1663
16
    interp->codecs.error_registry = PyDict_New();
1664
16
    if (interp->codecs.error_registry == NULL) {
1665
0
        return PyStatus_NoMemory();
1666
0
    }
1667
144
    for (size_t i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
1668
128
        PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL);
1669
128
        if (func == NULL) {
1670
0
            return PyStatus_NoMemory();
1671
0
        }
1672
1673
128
        int res = PyDict_SetItemString(interp->codecs.error_registry,
1674
128
                                       methods[i].name, func);
1675
128
        Py_DECREF(func);
1676
128
        if (res < 0) {
1677
0
            return PyStatus_Error("Failed to insert into codec error registry");
1678
0
        }
1679
128
    }
1680
1681
16
    interp->codecs.initialized = 1;
1682
1683
    // Importing `encodings' will call back into this module to register codec
1684
    // search functions, so this is done after everything else is initialized.
1685
16
    PyObject *mod = PyImport_ImportModule("encodings");
1686
16
    if (mod == NULL) {
1687
0
        return PyStatus_Error("Failed to import encodings module");
1688
0
    }
1689
16
    Py_DECREF(mod);
1690
1691
16
    return PyStatus_Ok();
1692
16
}
1693
1694
void
1695
_PyCodec_Fini(PyInterpreterState *interp)
1696
0
{
1697
0
    Py_CLEAR(interp->codecs.search_path);
1698
0
    Py_CLEAR(interp->codecs.search_cache);
1699
0
    Py_CLEAR(interp->codecs.error_registry);
1700
0
    interp->codecs.initialized = 0;
1701
0
}