Coverage Report

Created: 2026-05-16 06:46

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/cpython/Python/codecs.c
Line
Count
Source
1
/* ------------------------------------------------------------------------
2
3
   Python Codec Registry and support functions
4
5
Written by Marc-Andre Lemburg (mal@lemburg.com).
6
7
Copyright (c) Corporation for National Research Initiatives.
8
9
   ------------------------------------------------------------------------ */
10
11
#include "Python.h"
12
#include "pycore_call.h"          // _PyObject_CallNoArgs()
13
#include "pycore_interp.h"        // PyInterpreterState.codec_search_path
14
#include "pycore_pyerrors.h"      // _PyErr_FormatNote()
15
#include "pycore_pystate.h"       // _PyInterpreterState_GET()
16
#include "pycore_runtime.h"       // _Py_ID()
17
#include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
18
#include "pycore_unicodeobject.h" // _PyUnicode_InternMortal()
19
#include "pycore_pyatomic_ft_wrappers.h"
20
21
static const char *codecs_builtin_error_handlers[] = {
22
    "strict", "ignore", "replace",
23
    "xmlcharrefreplace", "backslashreplace", "namereplace",
24
    "surrogatepass", "surrogateescape",
25
};
26
27
const char *Py_hexdigits = "0123456789abcdef";
28
29
/* --- Codec Registry ----------------------------------------------------- */
30
31
int PyCodec_Register(PyObject *search_function)
32
37
{
33
37
    PyInterpreterState *interp = _PyInterpreterState_GET();
34
37
    assert(interp->codecs.initialized);
35
37
    if (search_function == NULL) {
36
0
        PyErr_BadArgument();
37
0
        goto onError;
38
0
    }
39
37
    if (!PyCallable_Check(search_function)) {
40
0
        PyErr_SetString(PyExc_TypeError, "argument must be callable");
41
0
        goto onError;
42
0
    }
43
37
    FT_MUTEX_LOCK(&interp->codecs.search_path_mutex);
44
37
    int ret = PyList_Append(interp->codecs.search_path, search_function);
45
37
    FT_MUTEX_UNLOCK(&interp->codecs.search_path_mutex);
46
47
37
    return ret;
48
49
0
 onError:
50
0
    return -1;
51
37
}
52
53
int
54
PyCodec_Unregister(PyObject *search_function)
55
0
{
56
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
57
0
    if (interp->codecs.initialized != 1) {
58
        /* Do nothing if codecs state was cleared (only possible during
59
           interpreter shutdown). */
60
0
        return 0;
61
0
    }
62
63
0
    PyObject *codec_search_path = interp->codecs.search_path;
64
0
    assert(PyList_CheckExact(codec_search_path));
65
0
    for (Py_ssize_t i = 0; i < PyList_GET_SIZE(codec_search_path); i++) {
66
0
        FT_MUTEX_LOCK(&interp->codecs.search_path_mutex);
67
0
        PyObject *item = PyList_GetItemRef(codec_search_path, i);
68
0
        int ret = 1;
69
0
        if (item == search_function) {
70
            // We hold a reference to the item, so its destructor can't run
71
            // while we hold search_path_mutex.
72
0
            ret = PyList_SetSlice(codec_search_path, i, i+1, NULL);
73
0
        }
74
0
        FT_MUTEX_UNLOCK(&interp->codecs.search_path_mutex);
75
0
        Py_DECREF(item);
76
0
        if (ret != 1) {
77
0
            assert(interp->codecs.search_cache != NULL);
78
0
            assert(PyDict_CheckExact(interp->codecs.search_cache));
79
0
            PyDict_Clear(interp->codecs.search_cache);
80
0
            return ret;
81
0
        }
82
0
    }
83
0
    return 0;
84
0
}
85
86
/* Convert a string to a normalized Python string: all ASCII letters are
87
   converted to lower case, spaces are replaced with hyphens. */
88
89
static PyObject*
90
normalizestring(const char *string)
91
2.36M
{
92
2.36M
    size_t i;
93
2.36M
    size_t len = strlen(string);
94
2.36M
    char *p;
95
2.36M
    PyObject *v;
96
97
2.36M
    if (len > PY_SSIZE_T_MAX) {
98
0
        PyErr_SetString(PyExc_OverflowError, "string is too large");
99
0
        return NULL;
100
0
    }
101
102
2.36M
    p = PyMem_Malloc(len + 1);
103
2.36M
    if (p == NULL)
104
0
        return PyErr_NoMemory();
105
33.7M
    for (i = 0; i < len; i++) {
106
31.3M
        char ch = string[i];
107
31.3M
        if (ch == ' ')
108
239k
            ch = '-';
109
31.1M
        else
110
31.1M
            ch = Py_TOLOWER(Py_CHARMASK(ch));
111
31.3M
        p[i] = ch;
112
31.3M
    }
113
2.36M
    p[i] = '\0';
114
2.36M
    v = PyUnicode_FromString(p);
115
2.36M
    PyMem_Free(p);
116
2.36M
    return v;
117
2.36M
}
118
119
/* Lookup the given encoding and return a tuple providing the codec
120
   facilities.
121
122
   ASCII letters in the encoding string is looked up converted to all
123
   lower case. This makes encodings looked up through this mechanism
124
   effectively case-insensitive. Spaces are replaced with hyphens for
125
   names like "US ASCII" and "ISO 8859-1".
126
127
   If no codec is found, a LookupError is set and NULL returned.
128
129
   As side effect, this tries to load the encodings package, if not
130
   yet done. This is part of the lazy load strategy for the encodings
131
   package.
132
133
*/
134
135
PyObject *_PyCodec_Lookup(const char *encoding)
136
2.36M
{
137
2.36M
    if (encoding == NULL) {
138
0
        PyErr_BadArgument();
139
0
        return NULL;
140
0
    }
141
142
2.36M
    PyInterpreterState *interp = _PyInterpreterState_GET();
143
2.36M
    assert(interp->codecs.initialized);
144
145
    /* Convert the encoding to a normalized Python string: all
146
       ASCII letters are converted to lower case, spaces are
147
       replaced with hyphens. */
148
2.36M
    PyObject *v = normalizestring(encoding);
149
2.36M
    if (v == NULL) {
150
0
        return NULL;
151
0
    }
152
153
    /* Intern the string. We'll make it immortal later if lookup succeeds. */
154
2.36M
    _PyUnicode_InternMortal(interp, &v);
155
156
    /* First, try to lookup the name in the registry dictionary */
157
2.36M
    PyObject *result;
158
2.36M
    if (PyDict_GetItemRef(interp->codecs.search_cache, v, &result) < 0) {
159
0
        goto onError;
160
0
    }
161
2.36M
    if (result != NULL) {
162
2.30M
        Py_DECREF(v);
163
2.30M
        return result;
164
2.30M
    }
165
166
    /* Next, scan the search functions in order of registration */
167
65.6k
    const Py_ssize_t len = PyList_Size(interp->codecs.search_path);
168
65.6k
    if (len < 0)
169
0
        goto onError;
170
65.6k
    if (len == 0) {
171
0
        PyErr_SetString(PyExc_LookupError,
172
0
                        "no codec search functions registered: "
173
0
                        "can't find encoding");
174
0
        goto onError;
175
0
    }
176
177
65.6k
    Py_ssize_t i;
178
129k
    for (i = 0; i < len; i++) {
179
65.6k
        PyObject *func;
180
181
65.6k
        func = PyList_GetItemRef(interp->codecs.search_path, i);
182
65.6k
        if (func == NULL)
183
0
            goto onError;
184
65.6k
        result = PyObject_CallOneArg(func, v);
185
65.6k
        Py_DECREF(func);
186
65.6k
        if (result == NULL)
187
0
            goto onError;
188
65.6k
        if (result == Py_None) {
189
63.5k
            Py_CLEAR(result);
190
63.5k
            continue;
191
63.5k
        }
192
2.07k
        if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
193
0
            PyErr_SetString(PyExc_TypeError,
194
0
                            "codec search functions must return 4-tuples");
195
0
            Py_DECREF(result);
196
0
            goto onError;
197
0
        }
198
2.07k
        break;
199
2.07k
    }
200
65.6k
    if (result == NULL) {
201
        /* XXX Perhaps we should cache misses too ? */
202
63.5k
        PyErr_Format(PyExc_LookupError,
203
63.5k
                     "unknown encoding: %s", encoding);
204
63.5k
        goto onError;
205
63.5k
    }
206
207
2.07k
    _PyUnicode_InternImmortal(interp, &v);
208
209
    /* Cache and return the result */
210
2.07k
    if (PyDict_SetItem(interp->codecs.search_cache, v, result) < 0) {
211
0
        Py_DECREF(result);
212
0
        goto onError;
213
0
    }
214
2.07k
    Py_DECREF(v);
215
2.07k
    return result;
216
217
63.5k
 onError:
218
63.5k
    Py_DECREF(v);
219
63.5k
    return NULL;
220
2.07k
}
221
222
/* Codec registry encoding check API. */
223
224
int PyCodec_KnownEncoding(const char *encoding)
225
0
{
226
0
    PyObject *codecs;
227
228
0
    codecs = _PyCodec_Lookup(encoding);
229
0
    if (!codecs) {
230
0
        PyErr_Clear();
231
0
        return 0;
232
0
    }
233
0
    else {
234
0
        Py_DECREF(codecs);
235
0
        return 1;
236
0
    }
237
0
}
238
239
static
240
PyObject *args_tuple(PyObject *object,
241
                     const char *errors)
242
2.04M
{
243
2.04M
    PyObject *args;
244
245
2.04M
    args = PyTuple_New(1 + (errors != NULL));
246
2.04M
    if (args == NULL)
247
0
        return NULL;
248
2.04M
    PyTuple_SET_ITEM(args, 0, Py_NewRef(object));
249
2.04M
    if (errors) {
250
175k
        PyObject *v;
251
252
175k
        v = PyUnicode_FromString(errors);
253
175k
        if (v == NULL) {
254
0
            Py_DECREF(args);
255
0
            return NULL;
256
0
        }
257
175k
        PyTuple_SET_ITEM(args, 1, v);
258
175k
    }
259
2.04M
    return args;
260
2.04M
}
261
262
/* Helper function to get a codec item */
263
264
static
265
PyObject *codec_getitem(const char *encoding, int index)
266
0
{
267
0
    PyObject *codecs;
268
0
    PyObject *v;
269
270
0
    codecs = _PyCodec_Lookup(encoding);
271
0
    if (codecs == NULL)
272
0
        return NULL;
273
0
    v = PyTuple_GET_ITEM(codecs, index);
274
0
    Py_DECREF(codecs);
275
0
    return Py_NewRef(v);
276
0
}
277
278
/* Helper functions to create an incremental codec. */
279
static
280
PyObject *codec_makeincrementalcodec(PyObject *codec_info,
281
                                     const char *errors,
282
                                     const char *attrname)
283
130
{
284
130
    PyObject *ret, *inccodec;
285
286
130
    inccodec = PyObject_GetAttrString(codec_info, attrname);
287
130
    if (inccodec == NULL)
288
0
        return NULL;
289
130
    if (errors)
290
130
        ret = PyObject_CallFunction(inccodec, "s", errors);
291
0
    else
292
0
        ret = _PyObject_CallNoArgs(inccodec);
293
130
    Py_DECREF(inccodec);
294
130
    return ret;
295
130
}
296
297
static
298
PyObject *codec_getincrementalcodec(const char *encoding,
299
                                    const char *errors,
300
                                    const char *attrname)
301
0
{
302
0
    PyObject *codec_info, *ret;
303
304
0
    codec_info = _PyCodec_Lookup(encoding);
305
0
    if (codec_info == NULL)
306
0
        return NULL;
307
0
    ret = codec_makeincrementalcodec(codec_info, errors, attrname);
308
0
    Py_DECREF(codec_info);
309
0
    return ret;
310
0
}
311
312
/* Helper function to create a stream codec. */
313
314
static
315
PyObject *codec_getstreamcodec(const char *encoding,
316
                               PyObject *stream,
317
                               const char *errors,
318
                               const int index)
319
0
{
320
0
    PyObject *codecs, *streamcodec, *codeccls;
321
322
0
    codecs = _PyCodec_Lookup(encoding);
323
0
    if (codecs == NULL)
324
0
        return NULL;
325
326
0
    codeccls = PyTuple_GET_ITEM(codecs, index);
327
0
    if (errors != NULL)
328
0
        streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
329
0
    else
330
0
        streamcodec = PyObject_CallOneArg(codeccls, stream);
331
0
    Py_DECREF(codecs);
332
0
    return streamcodec;
333
0
}
334
335
/* Helpers to work with the result of _PyCodec_Lookup
336
337
 */
338
PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info,
339
                                             const char *errors)
340
55
{
341
55
    return codec_makeincrementalcodec(codec_info, errors,
342
55
                                      "incrementaldecoder");
343
55
}
344
345
PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info,
346
                                             const char *errors)
347
75
{
348
75
    return codec_makeincrementalcodec(codec_info, errors,
349
75
                                      "incrementalencoder");
350
75
}
351
352
353
/* Convenience APIs to query the Codec registry.
354
355
   All APIs return a codec object with incremented refcount.
356
357
 */
358
359
PyObject *PyCodec_Encoder(const char *encoding)
360
0
{
361
0
    return codec_getitem(encoding, 0);
362
0
}
363
364
PyObject *PyCodec_Decoder(const char *encoding)
365
0
{
366
0
    return codec_getitem(encoding, 1);
367
0
}
368
369
PyObject *PyCodec_IncrementalEncoder(const char *encoding,
370
                                     const char *errors)
371
0
{
372
0
    return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
373
0
}
374
375
PyObject *PyCodec_IncrementalDecoder(const char *encoding,
376
                                     const char *errors)
377
0
{
378
0
    return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
379
0
}
380
381
PyObject *PyCodec_StreamReader(const char *encoding,
382
                               PyObject *stream,
383
                               const char *errors)
384
0
{
385
0
    return codec_getstreamcodec(encoding, stream, errors, 2);
386
0
}
387
388
PyObject *PyCodec_StreamWriter(const char *encoding,
389
                               PyObject *stream,
390
                               const char *errors)
391
0
{
392
0
    return codec_getstreamcodec(encoding, stream, errors, 3);
393
0
}
394
395
/* Encode an object (e.g. a Unicode object) using the given encoding
396
   and return the resulting encoded object (usually a Python string).
397
398
   errors is passed to the encoder factory as argument if non-NULL. */
399
400
static PyObject *
401
_PyCodec_EncodeInternal(PyObject *object,
402
                        PyObject *encoder,
403
                        const char *encoding,
404
                        const char *errors)
405
906k
{
406
906k
    PyObject *args = NULL, *result = NULL;
407
906k
    PyObject *v = NULL;
408
409
906k
    args = args_tuple(object, errors);
410
906k
    if (args == NULL)
411
0
        goto onError;
412
413
906k
    result = PyObject_Call(encoder, args, NULL);
414
906k
    if (result == NULL) {
415
0
        _PyErr_FormatNote("%s with '%s' codec failed", "encoding", encoding);
416
0
        goto onError;
417
0
    }
418
419
906k
    if (!PyTuple_Check(result) ||
420
906k
        PyTuple_GET_SIZE(result) != 2) {
421
0
        PyErr_SetString(PyExc_TypeError,
422
0
                        "encoder must return a tuple (object, integer)");
423
0
        goto onError;
424
0
    }
425
906k
    v = Py_NewRef(PyTuple_GET_ITEM(result,0));
426
    /* We don't check or use the second (integer) entry. */
427
428
906k
    Py_DECREF(args);
429
906k
    Py_DECREF(encoder);
430
906k
    Py_DECREF(result);
431
906k
    return v;
432
433
0
 onError:
434
0
    Py_XDECREF(result);
435
0
    Py_XDECREF(args);
436
0
    Py_XDECREF(encoder);
437
0
    return NULL;
438
906k
}
439
440
/* Decode an object (usually a Python string) using the given encoding
441
   and return an equivalent object (e.g. a Unicode object).
442
443
   errors is passed to the decoder factory as argument if non-NULL. */
444
445
static PyObject *
446
_PyCodec_DecodeInternal(PyObject *object,
447
                        PyObject *decoder,
448
                        const char *encoding,
449
                        const char *errors)
450
1.13M
{
451
1.13M
    PyObject *args = NULL, *result = NULL;
452
1.13M
    PyObject *v;
453
454
1.13M
    args = args_tuple(object, errors);
455
1.13M
    if (args == NULL)
456
0
        goto onError;
457
458
1.13M
    result = PyObject_Call(decoder, args, NULL);
459
1.13M
    if (result == NULL) {
460
59.0k
        _PyErr_FormatNote("%s with '%s' codec failed", "decoding", encoding);
461
59.0k
        goto onError;
462
59.0k
    }
463
1.08M
    if (!PyTuple_Check(result) ||
464
1.08M
        PyTuple_GET_SIZE(result) != 2) {
465
0
        PyErr_SetString(PyExc_TypeError,
466
0
                        "decoder must return a tuple (object,integer)");
467
0
        goto onError;
468
0
    }
469
1.08M
    v = Py_NewRef(PyTuple_GET_ITEM(result,0));
470
    /* We don't check or use the second (integer) entry. */
471
472
1.08M
    Py_DECREF(args);
473
1.08M
    Py_DECREF(decoder);
474
1.08M
    Py_DECREF(result);
475
1.08M
    return v;
476
477
59.0k
 onError:
478
59.0k
    Py_XDECREF(args);
479
59.0k
    Py_XDECREF(decoder);
480
59.0k
    Py_XDECREF(result);
481
59.0k
    return NULL;
482
1.08M
}
483
484
/* Generic encoding/decoding API */
485
PyObject *PyCodec_Encode(PyObject *object,
486
                         const char *encoding,
487
                         const char *errors)
488
0
{
489
0
    PyObject *encoder;
490
491
0
    encoder = PyCodec_Encoder(encoding);
492
0
    if (encoder == NULL)
493
0
        return NULL;
494
495
0
    return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
496
0
}
497
498
PyObject *PyCodec_Decode(PyObject *object,
499
                         const char *encoding,
500
                         const char *errors)
501
0
{
502
0
    PyObject *decoder;
503
504
0
    decoder = PyCodec_Decoder(encoding);
505
0
    if (decoder == NULL)
506
0
        return NULL;
507
508
0
    return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
509
0
}
510
511
/* Text encoding/decoding API */
512
PyObject * _PyCodec_LookupTextEncoding(const char *encoding,
513
                                       const char *alternate_command)
514
2.05M
{
515
2.05M
    PyObject *codec;
516
2.05M
    PyObject *attr;
517
2.05M
    int is_text_codec;
518
519
2.05M
    codec = _PyCodec_Lookup(encoding);
520
2.05M
    if (codec == NULL)
521
10.0k
        return NULL;
522
523
    /* Backwards compatibility: assume any raw tuple describes a text
524
     * encoding, and the same for anything lacking the private
525
     * attribute.
526
     */
527
2.04M
    if (!PyTuple_CheckExact(codec)) {
528
2.04M
        if (PyObject_GetOptionalAttr(codec, &_Py_ID(_is_text_encoding), &attr) < 0) {
529
0
            Py_DECREF(codec);
530
0
            return NULL;
531
0
        }
532
2.04M
        if (attr != NULL) {
533
2.04M
            is_text_codec = PyObject_IsTrue(attr);
534
2.04M
            Py_DECREF(attr);
535
2.04M
            if (is_text_codec <= 0) {
536
2.70k
                Py_DECREF(codec);
537
2.70k
                if (!is_text_codec) {
538
2.70k
                    if (alternate_command != NULL) {
539
2.70k
                        PyErr_Format(PyExc_LookupError,
540
2.70k
                                     "'%.400s' is not a text encoding; "
541
2.70k
                                     "use %s to handle arbitrary codecs",
542
2.70k
                                     encoding, alternate_command);
543
2.70k
                    }
544
0
                    else {
545
0
                        PyErr_Format(PyExc_LookupError,
546
0
                                     "'%.400s' is not a text encoding",
547
0
                                     encoding);
548
0
                    }
549
2.70k
                }
550
2.70k
                return NULL;
551
2.70k
            }
552
2.04M
        }
553
2.04M
    }
554
555
    /* This appears to be a valid text encoding */
556
2.04M
    return codec;
557
2.04M
}
558
559
560
static
561
PyObject *codec_getitem_checked(const char *encoding,
562
                                const char *alternate_command,
563
                                int index)
564
2.05M
{
565
2.05M
    PyObject *codec;
566
2.05M
    PyObject *v;
567
568
2.05M
    codec = _PyCodec_LookupTextEncoding(encoding, alternate_command);
569
2.05M
    if (codec == NULL)
570
12.7k
        return NULL;
571
572
2.04M
    v = Py_NewRef(PyTuple_GET_ITEM(codec, index));
573
2.04M
    Py_DECREF(codec);
574
2.04M
    return v;
575
2.05M
}
576
577
static PyObject * _PyCodec_TextEncoder(const char *encoding)
578
906k
{
579
906k
    return codec_getitem_checked(encoding, "codecs.encode()", 0);
580
906k
}
581
582
static PyObject * _PyCodec_TextDecoder(const char *encoding)
583
1.15M
{
584
1.15M
    return codec_getitem_checked(encoding, "codecs.decode()", 1);
585
1.15M
}
586
587
PyObject *_PyCodec_EncodeText(PyObject *object,
588
                              const char *encoding,
589
                              const char *errors)
590
906k
{
591
906k
    PyObject *encoder;
592
593
906k
    encoder = _PyCodec_TextEncoder(encoding);
594
906k
    if (encoder == NULL)
595
0
        return NULL;
596
597
906k
    return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
598
906k
}
599
600
PyObject *_PyCodec_DecodeText(PyObject *object,
601
                              const char *encoding,
602
                              const char *errors)
603
1.15M
{
604
1.15M
    PyObject *decoder;
605
606
1.15M
    decoder = _PyCodec_TextDecoder(encoding);
607
1.15M
    if (decoder == NULL)
608
12.7k
        return NULL;
609
610
1.13M
    return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
611
1.15M
}
612
613
/* Register the error handling callback function error under the name
614
   name. This function will be called by the codec when it encounters
615
   an unencodable characters/undecodable bytes and doesn't know the
616
   callback name, when name is specified as the error parameter
617
   in the call to the encode/decode function.
618
   Return 0 on success, -1 on error */
619
int PyCodec_RegisterError(const char *name, PyObject *error)
620
0
{
621
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
622
0
    assert(interp->codecs.initialized);
623
0
    if (!PyCallable_Check(error)) {
624
0
        PyErr_SetString(PyExc_TypeError, "handler must be callable");
625
0
        return -1;
626
0
    }
627
0
    return PyDict_SetItemString(interp->codecs.error_registry,
628
0
                                name, error);
629
0
}
630
631
int _PyCodec_UnregisterError(const char *name)
632
0
{
633
0
    for (size_t i = 0; i < Py_ARRAY_LENGTH(codecs_builtin_error_handlers); ++i) {
634
0
        if (strcmp(name, codecs_builtin_error_handlers[i]) == 0) {
635
0
            PyErr_Format(PyExc_ValueError,
636
0
                         "cannot un-register built-in error handler '%s'", name);
637
0
            return -1;
638
0
        }
639
0
    }
640
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
641
0
    assert(interp->codecs.initialized);
642
0
    return PyDict_PopString(interp->codecs.error_registry, name, NULL);
643
0
}
644
645
/* Lookup the error handling callback function registered under the
646
   name error. As a special case NULL can be passed, in which case
647
   the error handling callback for strict encoding will be returned. */
648
PyObject *PyCodec_LookupError(const char *name)
649
2.42M
{
650
2.42M
    PyInterpreterState *interp = _PyInterpreterState_GET();
651
2.42M
    assert(interp->codecs.initialized);
652
653
2.42M
    if (name==NULL)
654
158k
        name = "strict";
655
2.42M
    PyObject *handler;
656
2.42M
    if (PyDict_GetItemStringRef(interp->codecs.error_registry, name, &handler) < 0) {
657
0
        return NULL;
658
0
    }
659
2.42M
    if (handler == NULL) {
660
0
        PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
661
0
        return NULL;
662
0
    }
663
2.42M
    return handler;
664
2.42M
}
665
666
667
static inline void
668
wrong_exception_type(PyObject *exc)
669
0
{
670
0
    PyErr_Format(PyExc_TypeError,
671
0
                 "don't know how to handle %T in error callback", exc);
672
0
}
673
674
675
#define _PyIsUnicodeEncodeError(EXC)    \
676
281k
    PyObject_TypeCheck(EXC, (PyTypeObject *)PyExc_UnicodeEncodeError)
677
#define _PyIsUnicodeDecodeError(EXC)    \
678
269k
    PyObject_TypeCheck(EXC, (PyTypeObject *)PyExc_UnicodeDecodeError)
679
#define _PyIsUnicodeTranslateError(EXC) \
680
0
    PyObject_TypeCheck(EXC, (PyTypeObject *)PyExc_UnicodeTranslateError)
681
682
683
// --- codecs handlers: utilities ---------------------------------------------
684
685
/*
686
 * Return the number of characters (including special prefixes)
687
 * needed to represent 'ch' by codec_handler_write_unicode_hex().
688
 */
689
static inline Py_ssize_t
690
codec_handler_unicode_hex_width(Py_UCS4 ch)
691
0
{
692
0
    if (ch >= 0x10000) {
693
        // format: '\\' + 'U' + 8 hex digits
694
0
        return 1 + 1 + 8;
695
0
    }
696
0
    else if (ch >= 0x100) {
697
        // format: '\\' + 'u' + 4 hex digits
698
0
        return 1 + 1 + 4;
699
0
    }
700
0
    else {
701
        // format: '\\' + 'x' + 2 hex digits
702
0
        return 1 + 1 + 2;
703
0
    }
704
0
}
705
706
707
/*
708
 * Write the hexadecimal representation of 'ch' to the buffer pointed by 'p'
709
 * using 2, 4, or 8 characters prefixed by '\x', '\u', or '\U' respectively.
710
 */
711
static inline void
712
codec_handler_write_unicode_hex(Py_UCS1 **p, Py_UCS4 ch)
713
0
{
714
0
    *(*p)++ = '\\';
715
0
    if (ch >= 0x10000) {
716
0
        *(*p)++ = 'U';
717
0
        *(*p)++ = Py_hexdigits[(ch >> 28) & 0xf];
718
0
        *(*p)++ = Py_hexdigits[(ch >> 24) & 0xf];
719
0
        *(*p)++ = Py_hexdigits[(ch >> 20) & 0xf];
720
0
        *(*p)++ = Py_hexdigits[(ch >> 16) & 0xf];
721
0
        *(*p)++ = Py_hexdigits[(ch >> 12) & 0xf];
722
0
        *(*p)++ = Py_hexdigits[(ch >> 8) & 0xf];
723
0
    }
724
0
    else if (ch >= 0x100) {
725
0
        *(*p)++ = 'u';
726
0
        *(*p)++ = Py_hexdigits[(ch >> 12) & 0xf];
727
0
        *(*p)++ = Py_hexdigits[(ch >> 8) & 0xf];
728
0
    }
729
0
    else {
730
0
        *(*p)++ = 'x';
731
0
    }
732
0
    *(*p)++ = Py_hexdigits[(ch >> 4) & 0xf];
733
0
    *(*p)++ = Py_hexdigits[ch & 0xf];
734
0
}
735
736
737
/*
738
 * Determine the number of digits for a decimal representation of Unicode
739
 * codepoint 'ch' (by design, Unicode codepoints are limited to 7 digits).
740
 */
741
static inline int
742
n_decimal_digits_for_codepoint(Py_UCS4 ch)
743
0
{
744
0
    if (ch < 10) return 1;
745
0
    if (ch < 100) return 2;
746
0
    if (ch < 1000) return 3;
747
0
    if (ch < 10000) return 4;
748
0
    if (ch < 100000) return 5;
749
0
    if (ch < 1000000) return 6;
750
0
    if (ch < 10000000) return 7;
751
    // Unicode codepoints are limited to 1114111 (7 decimal digits)
752
0
    Py_UNREACHABLE();
753
0
}
754
755
756
/*
757
 * Create a Unicode string containing 'count' copies of the official
758
 * Unicode REPLACEMENT CHARACTER (0xFFFD).
759
 */
760
static PyObject *
761
codec_handler_unicode_replacement_character(Py_ssize_t count)
762
211k
{
763
211k
    PyObject *res = PyUnicode_New(count, Py_UNICODE_REPLACEMENT_CHARACTER);
764
211k
    if (res == NULL) {
765
0
        return NULL;
766
0
    }
767
211k
    assert(count == 0 || PyUnicode_KIND(res) == PyUnicode_2BYTE_KIND);
768
211k
    Py_UCS2 *outp = PyUnicode_2BYTE_DATA(res);
769
423k
    for (Py_ssize_t i = 0; i < count; ++i) {
770
211k
        outp[i] = Py_UNICODE_REPLACEMENT_CHARACTER;
771
211k
    }
772
211k
    assert(_PyUnicode_CheckConsistency(res, 1));
773
211k
    return res;
774
211k
}
775
776
777
// --- handler: 'strict' ------------------------------------------------------
778
779
PyObject *PyCodec_StrictErrors(PyObject *exc)
780
2.93M
{
781
2.93M
    if (PyExceptionInstance_Check(exc)) {
782
2.93M
        PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
783
2.93M
    }
784
0
    else {
785
0
        PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
786
0
    }
787
2.93M
    return NULL;
788
2.93M
}
789
790
791
// --- handler: 'ignore' ------------------------------------------------------
792
793
static PyObject *
794
_PyCodec_IgnoreError(PyObject *exc, int as_bytes)
795
0
{
796
0
    Py_ssize_t end;
797
0
    if (_PyUnicodeError_GetParams(exc, NULL, NULL, NULL,
798
0
                                  &end, NULL, as_bytes) < 0)
799
0
    {
800
0
        return NULL;
801
0
    }
802
0
    return Py_BuildValue("(Nn)", Py_GetConstant(Py_CONSTANT_EMPTY_STR), end);
803
0
}
804
805
806
PyObject *PyCodec_IgnoreErrors(PyObject *exc)
807
0
{
808
0
    if (_PyIsUnicodeEncodeError(exc) || _PyIsUnicodeTranslateError(exc)) {
809
0
        return _PyCodec_IgnoreError(exc, false);
810
0
    }
811
0
    else if (_PyIsUnicodeDecodeError(exc)) {
812
0
        return _PyCodec_IgnoreError(exc, true);
813
0
    }
814
0
    else {
815
0
        wrong_exception_type(exc);
816
0
        return NULL;
817
0
    }
818
0
}
819
820
821
// --- handler: 'replace' -----------------------------------------------------
822
823
static PyObject *
824
_PyCodec_ReplaceUnicodeEncodeError(PyObject *exc)
825
0
{
826
0
    Py_ssize_t start, end, slen;
827
0
    if (_PyUnicodeError_GetParams(exc, NULL, NULL,
828
0
                                  &start, &end, &slen, false) < 0)
829
0
    {
830
0
        return NULL;
831
0
    }
832
0
    PyObject *res = PyUnicode_New(slen, '?');
833
0
    if (res == NULL) {
834
0
        return NULL;
835
0
    }
836
0
    assert(PyUnicode_KIND(res) == PyUnicode_1BYTE_KIND);
837
0
    Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
838
0
    memset(outp, '?', sizeof(Py_UCS1) * slen);
839
0
    assert(_PyUnicode_CheckConsistency(res, 1));
840
0
    return Py_BuildValue("(Nn)", res, end);
841
0
}
842
843
844
static PyObject *
845
_PyCodec_ReplaceUnicodeDecodeError(PyObject *exc)
846
211k
{
847
211k
    Py_ssize_t end;
848
211k
    if (PyUnicodeDecodeError_GetEnd(exc, &end) < 0) {
849
0
        return NULL;
850
0
    }
851
211k
    PyObject *res = codec_handler_unicode_replacement_character(1);
852
211k
    if (res == NULL) {
853
0
        return NULL;
854
0
    }
855
211k
    return Py_BuildValue("(Nn)", res, end);
856
211k
}
857
858
859
static PyObject *
860
_PyCodec_ReplaceUnicodeTranslateError(PyObject *exc)
861
0
{
862
0
    Py_ssize_t start, end, slen;
863
0
    if (_PyUnicodeError_GetParams(exc, NULL, NULL,
864
0
                                  &start, &end, &slen, false) < 0)
865
0
    {
866
0
        return NULL;
867
0
    }
868
0
    PyObject *res = codec_handler_unicode_replacement_character(slen);
869
0
    if (res == NULL) {
870
0
        return NULL;
871
0
    }
872
0
    return Py_BuildValue("(Nn)", res, end);
873
0
}
874
875
876
PyObject *PyCodec_ReplaceErrors(PyObject *exc)
877
211k
{
878
211k
    if (_PyIsUnicodeEncodeError(exc)) {
879
0
        return _PyCodec_ReplaceUnicodeEncodeError(exc);
880
0
    }
881
211k
    else if (_PyIsUnicodeDecodeError(exc)) {
882
211k
        return _PyCodec_ReplaceUnicodeDecodeError(exc);
883
211k
    }
884
0
    else if (_PyIsUnicodeTranslateError(exc)) {
885
0
        return _PyCodec_ReplaceUnicodeTranslateError(exc);
886
0
    }
887
0
    else {
888
0
        wrong_exception_type(exc);
889
0
        return NULL;
890
0
    }
891
211k
}
892
893
894
// --- handler: 'xmlcharrefreplace' -------------------------------------------
895
896
PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
897
0
{
898
0
    if (!_PyIsUnicodeEncodeError(exc)) {
899
0
        wrong_exception_type(exc);
900
0
        return NULL;
901
0
    }
902
903
0
    PyObject *obj;
904
0
    Py_ssize_t objlen, start, end, slen;
905
0
    if (_PyUnicodeError_GetParams(exc,
906
0
                                  &obj, &objlen,
907
0
                                  &start, &end, &slen, false) < 0)
908
0
    {
909
0
        return NULL;
910
0
    }
911
912
    // The number of characters that each character 'ch' contributes
913
    // in the result is 2 + k + 1, where k = min{t >= 1 | 10^t > ch}
914
    // and will be formatted as "&#" + DIGITS + ";". Since the Unicode
915
    // range is below 10^7, each "block" requires at most 2 + 7 + 1
916
    // characters.
917
0
    if (slen > PY_SSIZE_T_MAX / (2 + 7 + 1)) {
918
0
        end = start + PY_SSIZE_T_MAX / (2 + 7 + 1);
919
0
        end = Py_MIN(end, objlen);
920
0
        slen = Py_MAX(0, end - start);
921
0
    }
922
923
0
    Py_ssize_t ressize = 0;
924
0
    for (Py_ssize_t i = start; i < end; ++i) {
925
0
        Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
926
0
        int k = n_decimal_digits_for_codepoint(ch);
927
0
        assert(k != 0);
928
0
        assert(k <= 7);
929
0
        ressize += 2 + k + 1;
930
0
    }
931
932
    /* allocate replacement */
933
0
    PyObject *res = PyUnicode_New(ressize, 127);
934
0
    if (res == NULL) {
935
0
        Py_DECREF(obj);
936
0
        return NULL;
937
0
    }
938
0
    Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
939
    /* generate replacement */
940
0
    for (Py_ssize_t i = start; i < end; ++i) {
941
0
        Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
942
        /*
943
         * Write the decimal representation of 'ch' to the buffer pointed by 'p'
944
         * using at most 7 characters prefixed by '&#' and suffixed by ';'.
945
         */
946
0
        *outp++ = '&';
947
0
        *outp++ = '#';
948
0
        Py_UCS1 *digit_end = outp + n_decimal_digits_for_codepoint(ch);
949
0
        for (Py_UCS1 *p_digit = digit_end - 1; p_digit >= outp; --p_digit) {
950
0
            *p_digit = '0' + (ch % 10);
951
0
            ch /= 10;
952
0
        }
953
0
        assert(ch == 0);
954
0
        outp = digit_end;
955
0
        *outp++ = ';';
956
0
    }
957
0
    assert(_PyUnicode_CheckConsistency(res, 1));
958
0
    PyObject *restuple = Py_BuildValue("(Nn)", res, end);
959
0
    Py_DECREF(obj);
960
0
    return restuple;
961
0
}
962
963
964
// --- handler: 'backslashreplace' --------------------------------------------
965
966
static PyObject *
967
_PyCodec_BackslashReplaceUnicodeEncodeError(PyObject *exc)
968
0
{
969
0
    PyObject *obj;
970
0
    Py_ssize_t objlen, start, end, slen;
971
0
    if (_PyUnicodeError_GetParams(exc,
972
0
                                  &obj, &objlen,
973
0
                                  &start, &end, &slen, false) < 0)
974
0
    {
975
0
        return NULL;
976
0
    }
977
978
    // The number of characters that each character 'ch' contributes
979
    // in the result is 1 + 1 + k, where k >= min{t >= 1 | 16^t > ch}
980
    // and will be formatted as "\\" + ('U'|'u'|'x') + HEXDIGITS,
981
    // where the number of hexdigits is either 2, 4, or 8 (not 6).
982
    // Since the Unicode range is below 10^7, we choose k = 8 whence
983
    // each "block" requires at most 1 + 1 + 8 characters.
984
0
    if (slen > PY_SSIZE_T_MAX / (1 + 1 + 8)) {
985
0
        end = start + PY_SSIZE_T_MAX / (1 + 1 + 8);
986
0
        end = Py_MIN(end, objlen);
987
0
        slen = Py_MAX(0, end - start);
988
0
    }
989
990
0
    Py_ssize_t ressize = 0;
991
0
    for (Py_ssize_t i = start; i < end; ++i) {
992
0
        Py_UCS4 c = PyUnicode_READ_CHAR(obj, i);
993
0
        ressize += codec_handler_unicode_hex_width(c);
994
0
    }
995
0
    PyObject *res = PyUnicode_New(ressize, 127);
996
0
    if (res == NULL) {
997
0
        Py_DECREF(obj);
998
0
        return NULL;
999
0
    }
1000
0
    Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
1001
0
    for (Py_ssize_t i = start; i < end; ++i) {
1002
0
        Py_UCS4 c = PyUnicode_READ_CHAR(obj, i);
1003
0
        codec_handler_write_unicode_hex(&outp, c);
1004
0
    }
1005
0
    assert(_PyUnicode_CheckConsistency(res, 1));
1006
0
    Py_DECREF(obj);
1007
0
    return Py_BuildValue("(Nn)", res, end);
1008
0
}
1009
1010
1011
static PyObject *
1012
_PyCodec_BackslashReplaceUnicodeDecodeError(PyObject *exc)
1013
0
{
1014
0
    PyObject *obj;
1015
0
    Py_ssize_t objlen, start, end, slen;
1016
0
    if (_PyUnicodeError_GetParams(exc,
1017
0
                                  &obj, &objlen,
1018
0
                                  &start, &end, &slen, true) < 0)
1019
0
    {
1020
0
        return NULL;
1021
0
    }
1022
1023
0
    PyObject *res = PyUnicode_New(4 * slen, 127);
1024
0
    if (res == NULL) {
1025
0
        Py_DECREF(obj);
1026
0
        return NULL;
1027
0
    }
1028
1029
0
    Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
1030
0
    const unsigned char *p = (const unsigned char *)PyBytes_AS_STRING(obj);
1031
0
    for (Py_ssize_t i = start; i < end; i++, outp += 4) {
1032
0
        const unsigned char ch = p[i];
1033
0
        outp[0] = '\\';
1034
0
        outp[1] = 'x';
1035
0
        outp[2] = Py_hexdigits[(ch >> 4) & 0xf];
1036
0
        outp[3] = Py_hexdigits[ch & 0xf];
1037
0
    }
1038
0
    assert(_PyUnicode_CheckConsistency(res, 1));
1039
0
    Py_DECREF(obj);
1040
0
    return Py_BuildValue("(Nn)", res, end);
1041
0
}
1042
1043
1044
static inline PyObject *
1045
_PyCodec_BackslashReplaceUnicodeTranslateError(PyObject *exc)
1046
0
{
1047
    // Same implementation as for UnicodeEncodeError objects.
1048
0
    return _PyCodec_BackslashReplaceUnicodeEncodeError(exc);
1049
0
}
1050
1051
1052
PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
1053
0
{
1054
0
    if (_PyIsUnicodeEncodeError(exc)) {
1055
0
        return _PyCodec_BackslashReplaceUnicodeEncodeError(exc);
1056
0
    }
1057
0
    else if (_PyIsUnicodeDecodeError(exc)) {
1058
0
        return _PyCodec_BackslashReplaceUnicodeDecodeError(exc);
1059
0
    }
1060
0
    else if (_PyIsUnicodeTranslateError(exc)) {
1061
0
        return _PyCodec_BackslashReplaceUnicodeTranslateError(exc);
1062
0
    }
1063
0
    else {
1064
0
        wrong_exception_type(exc);
1065
0
        return NULL;
1066
0
    }
1067
0
}
1068
1069
1070
// --- handler: 'namereplace' -------------------------------------------------
1071
1072
PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
1073
0
{
1074
0
    if (!_PyIsUnicodeEncodeError(exc)) {
1075
0
        wrong_exception_type(exc);
1076
0
        return NULL;
1077
0
    }
1078
1079
0
    _PyUnicode_Name_CAPI *ucnhash_capi = _PyUnicode_GetNameCAPI();
1080
0
    if (ucnhash_capi == NULL) {
1081
0
        return NULL;
1082
0
    }
1083
1084
0
    PyObject *obj;
1085
0
    Py_ssize_t start, end;
1086
0
    if (_PyUnicodeError_GetParams(exc,
1087
0
                                  &obj, NULL,
1088
0
                                  &start, &end, NULL, false) < 0)
1089
0
    {
1090
0
        return NULL;
1091
0
    }
1092
1093
0
    char buffer[256]; /* NAME_MAXLEN in unicodename_db.h */
1094
0
    Py_ssize_t imax = start, ressize = 0, replsize;
1095
0
    for (; imax < end; ++imax) {
1096
0
        Py_UCS4 c = PyUnicode_READ_CHAR(obj, imax);
1097
0
        if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) {
1098
            // If 'c' is recognized by getname(), the corresponding replacement
1099
            // is '\\' + 'N' + '{' + NAME + '}', i.e. 1 + 1 + 1 + len(NAME) + 1
1100
            // characters. Failures of getname() are ignored by the handler.
1101
0
            replsize = 1 + 1 + 1 + strlen(buffer) + 1;
1102
0
        }
1103
0
        else {
1104
0
            replsize = codec_handler_unicode_hex_width(c);
1105
0
        }
1106
0
        if (ressize > PY_SSIZE_T_MAX - replsize) {
1107
0
            break;
1108
0
        }
1109
0
        ressize += replsize;
1110
0
    }
1111
1112
0
    PyObject *res = PyUnicode_New(ressize, 127);
1113
0
    if (res == NULL) {
1114
0
        Py_DECREF(obj);
1115
0
        return NULL;
1116
0
    }
1117
1118
0
    Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
1119
0
    for (Py_ssize_t i = start; i < imax; ++i) {
1120
0
        Py_UCS4 c = PyUnicode_READ_CHAR(obj, i);
1121
0
        if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) {
1122
0
            *outp++ = '\\';
1123
0
            *outp++ = 'N';
1124
0
            *outp++ = '{';
1125
0
            (void)strcpy((char *)outp, buffer);
1126
0
            outp += strlen(buffer);
1127
0
            *outp++ = '}';
1128
0
        }
1129
0
        else {
1130
0
            codec_handler_write_unicode_hex(&outp, c);
1131
0
        }
1132
0
    }
1133
1134
0
    assert(outp == PyUnicode_1BYTE_DATA(res) + ressize);
1135
0
    assert(_PyUnicode_CheckConsistency(res, 1));
1136
0
    PyObject *restuple = Py_BuildValue("(Nn)", res, imax);
1137
0
    Py_DECREF(obj);
1138
0
    return restuple;
1139
0
}
1140
1141
1142
8
#define ENC_UNKNOWN     -1
1143
16
#define ENC_UTF8        0
1144
0
#define ENC_UTF16BE     1
1145
0
#define ENC_UTF16LE     2
1146
0
#define ENC_UTF32BE     3
1147
0
#define ENC_UTF32LE     4
1148
1149
static int
1150
get_standard_encoding_impl(const char *encoding, int *bytelength)
1151
8
{
1152
8
    if (Py_TOLOWER(encoding[0]) == 'u' &&
1153
8
        Py_TOLOWER(encoding[1]) == 't' &&
1154
8
        Py_TOLOWER(encoding[2]) == 'f') {
1155
8
        encoding += 3;
1156
8
        if (*encoding == '-' || *encoding == '_' )
1157
8
            encoding++;
1158
8
        if (encoding[0] == '8' && encoding[1] == '\0') {
1159
8
            *bytelength = 3;
1160
8
            return ENC_UTF8;
1161
8
        }
1162
0
        else if (encoding[0] == '1' && encoding[1] == '6') {
1163
0
            encoding += 2;
1164
0
            *bytelength = 2;
1165
0
            if (*encoding == '\0') {
1166
#ifdef WORDS_BIGENDIAN
1167
                return ENC_UTF16BE;
1168
#else
1169
0
                return ENC_UTF16LE;
1170
0
#endif
1171
0
            }
1172
0
            if (*encoding == '-' || *encoding == '_' )
1173
0
                encoding++;
1174
0
            if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
1175
0
                if (Py_TOLOWER(encoding[0]) == 'b')
1176
0
                    return ENC_UTF16BE;
1177
0
                if (Py_TOLOWER(encoding[0]) == 'l')
1178
0
                    return ENC_UTF16LE;
1179
0
            }
1180
0
        }
1181
0
        else if (encoding[0] == '3' && encoding[1] == '2') {
1182
0
            encoding += 2;
1183
0
            *bytelength = 4;
1184
0
            if (*encoding == '\0') {
1185
#ifdef WORDS_BIGENDIAN
1186
                return ENC_UTF32BE;
1187
#else
1188
0
                return ENC_UTF32LE;
1189
0
#endif
1190
0
            }
1191
0
            if (*encoding == '-' || *encoding == '_' )
1192
0
                encoding++;
1193
0
            if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
1194
0
                if (Py_TOLOWER(encoding[0]) == 'b')
1195
0
                    return ENC_UTF32BE;
1196
0
                if (Py_TOLOWER(encoding[0]) == 'l')
1197
0
                    return ENC_UTF32LE;
1198
0
            }
1199
0
        }
1200
8
    }
1201
0
    else if (strcmp(encoding, "cp65001") == 0) {
1202
0
        *bytelength = 3;
1203
0
        return ENC_UTF8;
1204
0
    }
1205
0
    return ENC_UNKNOWN;
1206
8
}
1207
1208
1209
static int
1210
get_standard_encoding(PyObject *encoding, int *code, int *bytelength)
1211
8
{
1212
8
    const char *encoding_cstr = PyUnicode_AsUTF8(encoding);
1213
8
    if (encoding_cstr == NULL) {
1214
0
        return -1;
1215
0
    }
1216
8
    *code = get_standard_encoding_impl(encoding_cstr, bytelength);
1217
8
    return 0;
1218
8
}
1219
1220
1221
// --- handler: 'surrogatepass' -----------------------------------------------
1222
1223
static PyObject *
1224
_PyCodec_SurrogatePassUnicodeEncodeError(PyObject *exc)
1225
0
{
1226
0
    PyObject *encoding = PyUnicodeEncodeError_GetEncoding(exc);
1227
0
    if (encoding == NULL) {
1228
0
        return NULL;
1229
0
    }
1230
0
    int code, bytelength;
1231
0
    int rc = get_standard_encoding(encoding, &code, &bytelength);
1232
0
    Py_DECREF(encoding);
1233
0
    if (rc < 0) {
1234
0
        return NULL;
1235
0
    }
1236
0
    if (code == ENC_UNKNOWN) {
1237
0
        goto bail;
1238
0
    }
1239
1240
0
    PyObject *obj;
1241
0
    Py_ssize_t objlen, start, end, slen;
1242
0
    if (_PyUnicodeError_GetParams(exc,
1243
0
                                  &obj, &objlen,
1244
0
                                  &start, &end, &slen, false) < 0)
1245
0
    {
1246
0
        return NULL;
1247
0
    }
1248
1249
0
    if (slen > PY_SSIZE_T_MAX / bytelength) {
1250
0
        end = start + PY_SSIZE_T_MAX / bytelength;
1251
0
        end = Py_MIN(end, objlen);
1252
0
        slen = Py_MAX(0, end - start);
1253
0
    }
1254
1255
0
    PyObject *res = PyBytes_FromStringAndSize(NULL, bytelength * slen);
1256
0
    if (res == NULL) {
1257
0
        Py_DECREF(obj);
1258
0
        return NULL;
1259
0
    }
1260
1261
0
    unsigned char *outp = (unsigned char *)PyBytes_AsString(res);
1262
0
    for (Py_ssize_t i = start; i < end; i++) {
1263
0
        Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
1264
0
        if (!Py_UNICODE_IS_SURROGATE(ch)) {
1265
            /* Not a surrogate, fail with original exception */
1266
0
            Py_DECREF(obj);
1267
0
            Py_DECREF(res);
1268
0
            goto bail;
1269
0
        }
1270
0
        switch (code) {
1271
0
            case ENC_UTF8: {
1272
0
                *outp++ = (unsigned char)(0xe0 | (ch >> 12));
1273
0
                *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
1274
0
                *outp++ = (unsigned char)(0x80 | (ch & 0x3f));
1275
0
                break;
1276
0
            }
1277
0
            case ENC_UTF16LE: {
1278
0
                *outp++ = (unsigned char)ch;
1279
0
                *outp++ = (unsigned char)(ch >> 8);
1280
0
                break;
1281
0
            }
1282
0
            case ENC_UTF16BE: {
1283
0
                *outp++ = (unsigned char)(ch >> 8);
1284
0
                *outp++ = (unsigned char)ch;
1285
0
                break;
1286
0
            }
1287
0
            case ENC_UTF32LE: {
1288
0
                *outp++ = (unsigned char)ch;
1289
0
                *outp++ = (unsigned char)(ch >> 8);
1290
0
                *outp++ = (unsigned char)(ch >> 16);
1291
0
                *outp++ = (unsigned char)(ch >> 24);
1292
0
                break;
1293
0
            }
1294
0
            case ENC_UTF32BE: {
1295
0
                *outp++ = (unsigned char)(ch >> 24);
1296
0
                *outp++ = (unsigned char)(ch >> 16);
1297
0
                *outp++ = (unsigned char)(ch >> 8);
1298
0
                *outp++ = (unsigned char)ch;
1299
0
                break;
1300
0
            }
1301
0
        }
1302
0
    }
1303
1304
0
    Py_DECREF(obj);
1305
0
    PyObject *restuple = Py_BuildValue("(Nn)", res, end);
1306
0
    return restuple;
1307
1308
0
bail:
1309
0
    PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1310
0
    return NULL;
1311
0
}
1312
1313
1314
static PyObject *
1315
_PyCodec_SurrogatePassUnicodeDecodeError(PyObject *exc)
1316
8
{
1317
8
    PyObject *encoding = PyUnicodeDecodeError_GetEncoding(exc);
1318
8
    if (encoding == NULL) {
1319
0
        return NULL;
1320
0
    }
1321
8
    int code, bytelength;
1322
8
    int rc = get_standard_encoding(encoding, &code, &bytelength);
1323
8
    Py_DECREF(encoding);
1324
8
    if (rc < 0) {
1325
0
        return NULL;
1326
0
    }
1327
8
    if (code == ENC_UNKNOWN) {
1328
0
        goto bail;
1329
0
    }
1330
1331
8
    PyObject *obj;
1332
8
    Py_ssize_t objlen, start, end, slen;
1333
8
    if (_PyUnicodeError_GetParams(exc,
1334
8
                                  &obj, &objlen,
1335
8
                                  &start, &end, &slen, true) < 0)
1336
0
    {
1337
0
        return NULL;
1338
0
    }
1339
1340
    /* Try decoding a single surrogate character. If
1341
       there are more, let the codec call us again. */
1342
8
    Py_UCS4 ch = 0;
1343
8
    const unsigned char *p = (const unsigned char *)PyBytes_AS_STRING(obj);
1344
8
    p += start;
1345
1346
8
    if (objlen - start >= bytelength) {
1347
8
        switch (code) {
1348
8
            case ENC_UTF8: {
1349
8
                if ((p[0] & 0xf0) == 0xe0 &&
1350
8
                    (p[1] & 0xc0) == 0x80 &&
1351
8
                    (p[2] & 0xc0) == 0x80)
1352
8
                {
1353
                    /* it's a three-byte code */
1354
8
                    ch = ((p[0] & 0x0f) << 12) +
1355
8
                         ((p[1] & 0x3f) << 6)  +
1356
8
                          (p[2] & 0x3f);
1357
8
                }
1358
8
                break;
1359
0
            }
1360
0
            case ENC_UTF16LE: {
1361
0
                ch = p[1] << 8 | p[0];
1362
0
                break;
1363
0
            }
1364
0
            case ENC_UTF16BE: {
1365
0
                ch = p[0] << 8 | p[1];
1366
0
                break;
1367
0
            }
1368
0
            case ENC_UTF32LE: {
1369
0
                ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
1370
0
                break;
1371
0
            }
1372
0
            case ENC_UTF32BE: {
1373
0
                ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
1374
0
                break;
1375
0
            }
1376
8
        }
1377
8
    }
1378
8
    Py_DECREF(obj);
1379
8
    if (!Py_UNICODE_IS_SURROGATE(ch)) {
1380
0
        goto bail;
1381
0
    }
1382
1383
8
    PyObject *res = PyUnicode_FromOrdinal(ch);
1384
8
    if (res == NULL) {
1385
0
        return NULL;
1386
0
    }
1387
8
    return Py_BuildValue("(Nn)", res, start + bytelength);
1388
1389
0
bail:
1390
0
    PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1391
0
    return NULL;
1392
8
}
1393
1394
1395
/* This handler is declared static until someone demonstrates
1396
   a need to call it directly. */
1397
static PyObject *
1398
PyCodec_SurrogatePassErrors(PyObject *exc)
1399
8
{
1400
8
    if (_PyIsUnicodeEncodeError(exc)) {
1401
0
        return _PyCodec_SurrogatePassUnicodeEncodeError(exc);
1402
0
    }
1403
8
    else if (_PyIsUnicodeDecodeError(exc)) {
1404
8
        return _PyCodec_SurrogatePassUnicodeDecodeError(exc);
1405
8
    }
1406
0
    else {
1407
0
        wrong_exception_type(exc);
1408
0
        return NULL;
1409
0
    }
1410
8
}
1411
1412
1413
// --- handler: 'surrogateescape' ---------------------------------------------
1414
1415
static PyObject *
1416
_PyCodec_SurrogateEscapeUnicodeEncodeError(PyObject *exc)
1417
11.5k
{
1418
11.5k
    PyObject *obj;
1419
11.5k
    Py_ssize_t start, end, slen;
1420
11.5k
    if (_PyUnicodeError_GetParams(exc,
1421
11.5k
                                  &obj, NULL,
1422
11.5k
                                  &start, &end, &slen, false) < 0)
1423
0
    {
1424
0
        return NULL;
1425
0
    }
1426
1427
11.5k
    PyObject *res = PyBytes_FromStringAndSize(NULL, slen);
1428
11.5k
    if (res == NULL) {
1429
0
        Py_DECREF(obj);
1430
0
        return NULL;
1431
0
    }
1432
1433
11.5k
    char *outp = PyBytes_AsString(res);
1434
11.5k
    for (Py_ssize_t i = start; i < end; i++) {
1435
11.5k
        Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
1436
11.5k
        if (ch < 0xdc80 || ch > 0xdcff) {
1437
            /* Not a UTF-8b surrogate, fail with original exception. */
1438
11.5k
            Py_DECREF(obj);
1439
11.5k
            Py_DECREF(res);
1440
11.5k
            PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1441
11.5k
            return NULL;
1442
11.5k
        }
1443
0
        *outp++ = ch - 0xdc00;
1444
0
    }
1445
0
    Py_DECREF(obj);
1446
1447
0
    return Py_BuildValue("(Nn)", res, end);
1448
11.5k
}
1449
1450
1451
static PyObject *
1452
_PyCodec_SurrogateEscapeUnicodeDecodeError(PyObject *exc)
1453
58.1k
{
1454
58.1k
    PyObject *obj;
1455
58.1k
    Py_ssize_t start, end, slen;
1456
58.1k
    if (_PyUnicodeError_GetParams(exc,
1457
58.1k
                                  &obj, NULL,
1458
58.1k
                                  &start, &end, &slen, true) < 0)
1459
0
    {
1460
0
        return NULL;
1461
0
    }
1462
1463
58.1k
    Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */
1464
58.1k
    int consumed = 0;
1465
58.1k
    const unsigned char *p = (const unsigned char *)PyBytes_AS_STRING(obj);
1466
134k
    while (consumed < 4 && consumed < slen) {
1467
        /* Refuse to escape ASCII bytes. */
1468
105k
        if (p[start + consumed] < 128) {
1469
29.6k
            break;
1470
29.6k
        }
1471
76.1k
        ch[consumed] = 0xdc00 + p[start + consumed];
1472
76.1k
        consumed++;
1473
76.1k
    }
1474
58.1k
    Py_DECREF(obj);
1475
1476
58.1k
    if (consumed == 0) {
1477
        /* Codec complained about ASCII byte. */
1478
17.4k
        PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1479
17.4k
        return NULL;
1480
17.4k
    }
1481
1482
40.6k
    PyObject *str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed);
1483
40.6k
    if (str == NULL) {
1484
0
        return NULL;
1485
0
    }
1486
40.6k
    return Py_BuildValue("(Nn)", str, start + consumed);
1487
40.6k
}
1488
1489
1490
static PyObject *
1491
PyCodec_SurrogateEscapeErrors(PyObject *exc)
1492
69.6k
{
1493
69.6k
    if (_PyIsUnicodeEncodeError(exc)) {
1494
11.5k
        return _PyCodec_SurrogateEscapeUnicodeEncodeError(exc);
1495
11.5k
    }
1496
58.1k
    else if (_PyIsUnicodeDecodeError(exc)) {
1497
58.1k
        return _PyCodec_SurrogateEscapeUnicodeDecodeError(exc);
1498
58.1k
    }
1499
0
    else {
1500
0
        wrong_exception_type(exc);
1501
0
        return NULL;
1502
0
    }
1503
69.6k
}
1504
1505
1506
// --- Codecs registry handlers -----------------------------------------------
1507
1508
static inline PyObject *
1509
strict_errors(PyObject *Py_UNUSED(self), PyObject *exc)
1510
2.36M
{
1511
2.36M
    return PyCodec_StrictErrors(exc);
1512
2.36M
}
1513
1514
1515
static inline PyObject *
1516
ignore_errors(PyObject *Py_UNUSED(self), PyObject *exc)
1517
0
{
1518
0
    return PyCodec_IgnoreErrors(exc);
1519
0
}
1520
1521
1522
static inline PyObject *
1523
replace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
1524
211k
{
1525
211k
    return PyCodec_ReplaceErrors(exc);
1526
211k
}
1527
1528
1529
static inline PyObject *
1530
xmlcharrefreplace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
1531
0
{
1532
0
    return PyCodec_XMLCharRefReplaceErrors(exc);
1533
0
}
1534
1535
1536
static inline PyObject *
1537
backslashreplace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
1538
0
{
1539
0
    return PyCodec_BackslashReplaceErrors(exc);
1540
0
}
1541
1542
1543
static inline PyObject *
1544
namereplace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
1545
0
{
1546
0
    return PyCodec_NameReplaceErrors(exc);
1547
0
}
1548
1549
1550
static inline PyObject *
1551
surrogatepass_errors(PyObject *Py_UNUSED(self), PyObject *exc)
1552
8
{
1553
8
    return PyCodec_SurrogatePassErrors(exc);
1554
8
}
1555
1556
1557
static inline PyObject *
1558
surrogateescape_errors(PyObject *Py_UNUSED(self), PyObject *exc)
1559
69.6k
{
1560
69.6k
    return PyCodec_SurrogateEscapeErrors(exc);
1561
69.6k
}
1562
1563
1564
PyStatus
1565
_PyCodec_InitRegistry(PyInterpreterState *interp)
1566
37
{
1567
37
    static struct {
1568
37
        const char *name;
1569
37
        PyMethodDef def;
1570
37
    } methods[] =
1571
37
    {
1572
37
        {
1573
37
            "strict",
1574
37
            {
1575
37
                "strict_errors",
1576
37
                strict_errors,
1577
37
                METH_O,
1578
37
                PyDoc_STR("Implements the 'strict' error handling, which "
1579
37
                          "raises a UnicodeError on coding errors.")
1580
37
            }
1581
37
        },
1582
37
        {
1583
37
            "ignore",
1584
37
            {
1585
37
                "ignore_errors",
1586
37
                ignore_errors,
1587
37
                METH_O,
1588
37
                PyDoc_STR("Implements the 'ignore' error handling, which "
1589
37
                          "ignores malformed data and continues.")
1590
37
            }
1591
37
        },
1592
37
        {
1593
37
            "replace",
1594
37
            {
1595
37
                "replace_errors",
1596
37
                replace_errors,
1597
37
                METH_O,
1598
37
                PyDoc_STR("Implements the 'replace' error handling, which "
1599
37
                          "replaces malformed data with a replacement marker.")
1600
37
            }
1601
37
        },
1602
37
        {
1603
37
            "xmlcharrefreplace",
1604
37
            {
1605
37
                "xmlcharrefreplace_errors",
1606
37
                xmlcharrefreplace_errors,
1607
37
                METH_O,
1608
37
                PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
1609
37
                          "which replaces an unencodable character with the "
1610
37
                          "appropriate XML character reference.")
1611
37
            }
1612
37
        },
1613
37
        {
1614
37
            "backslashreplace",
1615
37
            {
1616
37
                "backslashreplace_errors",
1617
37
                backslashreplace_errors,
1618
37
                METH_O,
1619
37
                PyDoc_STR("Implements the 'backslashreplace' error handling, "
1620
37
                          "which replaces malformed data with a backslashed "
1621
37
                          "escape sequence.")
1622
37
            }
1623
37
        },
1624
37
        {
1625
37
            "namereplace",
1626
37
            {
1627
37
                "namereplace_errors",
1628
37
                namereplace_errors,
1629
37
                METH_O,
1630
37
                PyDoc_STR("Implements the 'namereplace' error handling, "
1631
37
                          "which replaces an unencodable character with a "
1632
37
                          "\\N{...} escape sequence.")
1633
37
            }
1634
37
        },
1635
37
        {
1636
37
            "surrogatepass",
1637
37
            {
1638
37
                "surrogatepass",
1639
37
                surrogatepass_errors,
1640
37
                METH_O
1641
37
            }
1642
37
        },
1643
37
        {
1644
37
            "surrogateescape",
1645
37
            {
1646
37
                "surrogateescape",
1647
37
                surrogateescape_errors,
1648
37
                METH_O
1649
37
            }
1650
37
        }
1651
37
    };
1652
    // ensure that the built-in error handlers' names are kept in sync
1653
37
    assert(Py_ARRAY_LENGTH(methods) == Py_ARRAY_LENGTH(codecs_builtin_error_handlers));
1654
1655
37
    assert(interp->codecs.initialized == 0);
1656
37
    interp->codecs.search_path = PyList_New(0);
1657
37
    if (interp->codecs.search_path == NULL) {
1658
0
        return PyStatus_NoMemory();
1659
0
    }
1660
37
    interp->codecs.search_cache = PyDict_New();
1661
37
    if (interp->codecs.search_cache == NULL) {
1662
0
        return PyStatus_NoMemory();
1663
0
    }
1664
37
    interp->codecs.error_registry = PyDict_New();
1665
37
    if (interp->codecs.error_registry == NULL) {
1666
0
        return PyStatus_NoMemory();
1667
0
    }
1668
333
    for (size_t i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
1669
296
        PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL);
1670
296
        if (func == NULL) {
1671
0
            return PyStatus_NoMemory();
1672
0
        }
1673
1674
296
        int res = PyDict_SetItemString(interp->codecs.error_registry,
1675
296
                                       methods[i].name, func);
1676
296
        Py_DECREF(func);
1677
296
        if (res < 0) {
1678
0
            return PyStatus_Error("Failed to insert into codec error registry");
1679
0
        }
1680
296
    }
1681
1682
37
    interp->codecs.initialized = 1;
1683
1684
    // Importing `encodings' will call back into this module to register codec
1685
    // search functions, so this is done after everything else is initialized.
1686
37
    PyObject *mod = PyImport_ImportModule("encodings");
1687
37
    if (mod == NULL) {
1688
0
        return PyStatus_Error("Failed to import encodings module");
1689
0
    }
1690
37
    Py_DECREF(mod);
1691
1692
37
    return PyStatus_Ok();
1693
37
}
1694
1695
void
1696
_PyCodec_Fini(PyInterpreterState *interp)
1697
0
{
1698
0
    Py_CLEAR(interp->codecs.search_path);
1699
0
    Py_CLEAR(interp->codecs.search_cache);
1700
    Py_CLEAR(interp->codecs.error_registry);
1701
0
    interp->codecs.initialized = 0;
1702
0
}