Coverage Report

Created: 2026-05-30 06:18

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/cpython/Modules/_codecsmodule.c
Line
Count
Source
1
/* ------------------------------------------------------------------------
2
3
   _codecs -- Provides access to the codec registry and the builtin
4
              codecs.
5
6
   This module should never be imported directly. The standard library
7
   module "codecs" wraps this builtin module for use within Python.
8
9
   The codec registry is accessible via:
10
11
     register(search_function) -> None
12
13
     lookup(encoding) -> CodecInfo object
14
15
   The builtin Unicode codecs use the following interface:
16
17
     <encoding>_encode(Unicode_object[,errors='strict']) ->
18
        (string object, bytes consumed)
19
20
     <encoding>_decode(char_buffer_obj[,errors='strict']) ->
21
        (Unicode object, bytes consumed)
22
23
   These <encoding>s are available: utf_8, unicode_escape,
24
   raw_unicode_escape, latin_1, ascii (7-bit), mbcs (on win32).
25
26
27
Written by Marc-Andre Lemburg (mal@lemburg.com).
28
29
Copyright (c) Corporation for National Research Initiatives.
30
31
   ------------------------------------------------------------------------ */
32
33
#include "Python.h"
34
#include "pycore_codecs.h"        // _PyCodec_Lookup()
35
#include "pycore_unicodeobject.h" // _PyUnicode_EncodeCharmap
36
37
#ifdef MS_WINDOWS
38
#include <windows.h>
39
#endif
40
41
/*[clinic input]
42
module _codecs
43
[clinic start generated code]*/
44
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=e1390e3da3cb9deb]*/
45
46
#include "pycore_runtime.h"
47
#include "clinic/_codecsmodule.c.h"
48
49
/* --- Registry ----------------------------------------------------------- */
50
51
/*[clinic input]
52
_codecs.register
53
    search_function: object
54
    /
55
56
Register a codec search function.
57
58
Search functions are expected to take one argument, the encoding
59
name in all lower case letters, and either return None, or a tuple
60
of functions (encoder, decoder, stream_reader, stream_writer) (or
61
a CodecInfo object).
62
[clinic start generated code]*/
63
64
static PyObject *
65
_codecs_register(PyObject *module, PyObject *search_function)
66
/*[clinic end generated code: output=d1bf21e99db7d6d3 input=2321d8c8c0420dfc]*/
67
37
{
68
37
    if (PyCodec_Register(search_function))
69
0
        return NULL;
70
71
37
    Py_RETURN_NONE;
72
37
}
73
74
/*[clinic input]
75
_codecs.unregister
76
    search_function: object
77
    /
78
79
Unregister a codec search function and clear the registry's cache.
80
81
If the search function is not registered, do nothing.
82
[clinic start generated code]*/
83
84
static PyObject *
85
_codecs_unregister(PyObject *module, PyObject *search_function)
86
/*[clinic end generated code: output=1f0edee9cf246399 input=dd7c004c652d345e]*/
87
0
{
88
0
    if (PyCodec_Unregister(search_function) < 0) {
89
0
        return NULL;
90
0
    }
91
92
0
    Py_RETURN_NONE;
93
0
}
94
95
/*[clinic input]
96
@permit_long_summary
97
_codecs.lookup
98
    encoding: str
99
    /
100
101
Looks up a codec tuple in the Python codec registry and returns a CodecInfo object.
102
[clinic start generated code]*/
103
104
static PyObject *
105
_codecs_lookup_impl(PyObject *module, const char *encoding)
106
/*[clinic end generated code: output=9f0afa572080c36d input=02227d5429491ab3]*/
107
333k
{
108
333k
    return _PyCodec_Lookup(encoding);
109
333k
}
110
111
/*[clinic input]
112
_codecs.encode
113
    obj: object
114
    encoding: str(c_default="NULL") = "utf-8"
115
    errors: str(c_default="NULL") = "strict"
116
117
Encodes obj using the codec registered for encoding.
118
119
The default encoding is 'utf-8'.  errors may be given to set a
120
different error handling scheme.  Default is 'strict' meaning that
121
encoding errors raise a ValueError.  Other possible values are 'ignore',
122
'replace' and 'backslashreplace' as well as any other name registered
123
with codecs.register_error that can handle ValueErrors.
124
[clinic start generated code]*/
125
126
static PyObject *
127
_codecs_encode_impl(PyObject *module, PyObject *obj, const char *encoding,
128
                    const char *errors)
129
/*[clinic end generated code: output=385148eb9a067c86 input=e5271d443e391d7f]*/
130
0
{
131
0
    if (encoding == NULL)
132
0
        encoding = PyUnicode_GetDefaultEncoding();
133
134
    /* Encode via the codec registry */
135
0
    return PyCodec_Encode(obj, encoding, errors);
136
0
}
137
138
/*[clinic input]
139
_codecs.decode
140
    obj: object
141
    encoding: str(c_default="NULL") = "utf-8"
142
    errors: str(c_default="NULL") = "strict"
143
144
Decodes obj using the codec registered for encoding.
145
146
Default encoding is 'utf-8'.  errors may be given to set a
147
different error handling scheme.  Default is 'strict' meaning that
148
encoding errors raise a ValueError.  Other possible values are 'ignore',
149
'replace' and 'backslashreplace' as well as any other name registered
150
with codecs.register_error that can handle ValueErrors.
151
[clinic start generated code]*/
152
153
static PyObject *
154
_codecs_decode_impl(PyObject *module, PyObject *obj, const char *encoding,
155
                    const char *errors)
156
/*[clinic end generated code: output=679882417dc3a0bd input=3e6254628f9ca538]*/
157
0
{
158
0
    if (encoding == NULL)
159
0
        encoding = PyUnicode_GetDefaultEncoding();
160
161
    /* Decode via the codec registry */
162
0
    return PyCodec_Decode(obj, encoding, errors);
163
0
}
164
165
/* --- Helpers ------------------------------------------------------------ */
166
167
static
168
PyObject *codec_tuple(PyObject *decoded,
169
                      Py_ssize_t len)
170
1.42M
{
171
1.42M
    if (decoded == NULL)
172
74.1k
        return NULL;
173
1.34M
    return Py_BuildValue("Nn", decoded, len);
174
1.42M
}
175
176
/* --- String codecs ------------------------------------------------------ */
177
/*[clinic input]
178
_codecs.escape_decode
179
    data: Py_buffer(accept={str, buffer})
180
    errors: str(accept={str, NoneType}) = None
181
    /
182
[clinic start generated code]*/
183
184
static PyObject *
185
_codecs_escape_decode_impl(PyObject *module, Py_buffer *data,
186
                           const char *errors)
187
/*[clinic end generated code: output=505200ba8056979a input=77298a561c90bd82]*/
188
0
{
189
0
    PyObject *decoded = PyBytes_DecodeEscape(data->buf, data->len,
190
0
                                             errors, 0, NULL);
191
0
    return codec_tuple(decoded, data->len);
192
0
}
193
194
/*[clinic input]
195
_codecs.escape_encode
196
    data: object(subclass_of='&PyBytes_Type')
197
    errors: str(accept={str, NoneType}) = None
198
    /
199
[clinic start generated code]*/
200
201
static PyObject *
202
_codecs_escape_encode_impl(PyObject *module, PyObject *data,
203
                           const char *errors)
204
/*[clinic end generated code: output=4af1d477834bab34 input=8f4b144799a94245]*/
205
0
{
206
0
    Py_ssize_t size = PyBytes_GET_SIZE(data);
207
0
    if (size > PY_SSIZE_T_MAX / 4) {
208
0
        PyErr_SetString(PyExc_OverflowError,
209
0
            "string is too large to encode");
210
0
            return NULL;
211
0
    }
212
0
    Py_ssize_t newsize = 4*size;
213
214
0
    PyBytesWriter *writer = PyBytesWriter_Create(newsize);
215
0
    if (writer == NULL) {
216
0
        return NULL;
217
0
    }
218
0
    char *p = PyBytesWriter_GetData(writer);
219
220
0
    for (Py_ssize_t i = 0; i < size; i++) {
221
        /* There's at least enough room for a hex escape */
222
0
        assert(newsize - (p - (char*)PyBytesWriter_GetData(writer)) >= 4);
223
224
0
        char c = PyBytes_AS_STRING(data)[i];
225
0
        if (c == '\'' || c == '\\') {
226
0
            *p++ = '\\'; *p++ = c;
227
0
        }
228
0
        else if (c == '\t') {
229
0
            *p++ = '\\'; *p++ = 't';
230
0
        }
231
0
        else if (c == '\n') {
232
0
            *p++ = '\\'; *p++ = 'n';
233
0
        }
234
0
        else if (c == '\r') {
235
0
            *p++ = '\\'; *p++ = 'r';
236
0
        }
237
0
        else if (c < ' ' || c >= 0x7f) {
238
0
            *p++ = '\\';
239
0
            *p++ = 'x';
240
0
            *p++ = Py_hexdigits[(c & 0xf0) >> 4];
241
0
            *p++ = Py_hexdigits[c & 0xf];
242
0
        }
243
0
        else {
244
0
            *p++ = c;
245
0
        }
246
0
    }
247
248
0
    PyObject *decoded = PyBytesWriter_FinishWithPointer(writer, p);
249
0
    return codec_tuple(decoded, size);
250
0
}
251
252
/* --- Decoder ------------------------------------------------------------ */
253
/*[clinic input]
254
_codecs.utf_7_decode
255
    data: Py_buffer
256
    errors: str(accept={str, NoneType}) = None
257
    final: bool = False
258
    /
259
[clinic start generated code]*/
260
261
static PyObject *
262
_codecs_utf_7_decode_impl(PyObject *module, Py_buffer *data,
263
                          const char *errors, int final)
264
/*[clinic end generated code: output=0cd3a944a32a4089 input=dbf8c8998102dc7d]*/
265
22.1k
{
266
22.1k
    Py_ssize_t consumed = data->len;
267
22.1k
    PyObject *decoded = PyUnicode_DecodeUTF7Stateful(data->buf, data->len,
268
22.1k
                                                     errors,
269
22.1k
                                                     final ? NULL : &consumed);
270
22.1k
    return codec_tuple(decoded, consumed);
271
22.1k
}
272
273
/*[clinic input]
274
_codecs.utf_8_decode
275
    data: Py_buffer
276
    errors: str(accept={str, NoneType}) = None
277
    final: bool = False
278
    /
279
[clinic start generated code]*/
280
281
static PyObject *
282
_codecs_utf_8_decode_impl(PyObject *module, Py_buffer *data,
283
                          const char *errors, int final)
284
/*[clinic end generated code: output=10f74dec8d9bb8bf input=ca06bc8a9c970e25]*/
285
62.1k
{
286
62.1k
    Py_ssize_t consumed = data->len;
287
62.1k
    PyObject *decoded = PyUnicode_DecodeUTF8Stateful(data->buf, data->len,
288
62.1k
                                                     errors,
289
62.1k
                                                     final ? NULL : &consumed);
290
62.1k
    return codec_tuple(decoded, consumed);
291
62.1k
}
292
293
/*[clinic input]
294
_codecs.utf_16_decode
295
    data: Py_buffer
296
    errors: str(accept={str, NoneType}) = None
297
    final: bool = False
298
    /
299
[clinic start generated code]*/
300
301
static PyObject *
302
_codecs_utf_16_decode_impl(PyObject *module, Py_buffer *data,
303
                           const char *errors, int final)
304
/*[clinic end generated code: output=783b442abcbcc2d0 input=5b0f52071ba6cadc]*/
305
16.7k
{
306
16.7k
    int byteorder = 0;
307
    /* This is overwritten unless final is true. */
308
16.7k
    Py_ssize_t consumed = data->len;
309
16.7k
    PyObject *decoded = PyUnicode_DecodeUTF16Stateful(data->buf, data->len,
310
16.7k
                                                      errors, &byteorder,
311
16.7k
                                                      final ? NULL : &consumed);
312
16.7k
    return codec_tuple(decoded, consumed);
313
16.7k
}
314
315
/*[clinic input]
316
_codecs.utf_16_le_decode
317
    data: Py_buffer
318
    errors: str(accept={str, NoneType}) = None
319
    final: bool = False
320
    /
321
[clinic start generated code]*/
322
323
static PyObject *
324
_codecs_utf_16_le_decode_impl(PyObject *module, Py_buffer *data,
325
                              const char *errors, int final)
326
/*[clinic end generated code: output=899b9e6364379dcd input=115bd8c7b783d0bf]*/
327
48
{
328
48
    int byteorder = -1;
329
    /* This is overwritten unless final is true. */
330
48
    Py_ssize_t consumed = data->len;
331
48
    PyObject *decoded = PyUnicode_DecodeUTF16Stateful(data->buf, data->len,
332
48
                                                      errors, &byteorder,
333
48
                                                      final ? NULL : &consumed);
334
48
    return codec_tuple(decoded, consumed);
335
48
}
336
337
/*[clinic input]
338
_codecs.utf_16_be_decode
339
    data: Py_buffer
340
    errors: str(accept={str, NoneType}) = None
341
    final: bool = False
342
    /
343
[clinic start generated code]*/
344
345
static PyObject *
346
_codecs_utf_16_be_decode_impl(PyObject *module, Py_buffer *data,
347
                              const char *errors, int final)
348
/*[clinic end generated code: output=49f6465ea07669c8 input=63131422b01f9cb4]*/
349
224
{
350
224
    int byteorder = 1;
351
    /* This is overwritten unless final is true. */
352
224
    Py_ssize_t consumed = data->len;
353
224
    PyObject *decoded = PyUnicode_DecodeUTF16Stateful(data->buf, data->len,
354
224
                                                      errors, &byteorder,
355
224
                                                      final ? NULL : &consumed);
356
224
    return codec_tuple(decoded, consumed);
357
224
}
358
359
/* This non-standard version also provides access to the byteorder
360
   parameter of the builtin UTF-16 codec.
361
362
   It returns a tuple (unicode, bytesread, byteorder) with byteorder
363
   being the value in effect at the end of data.
364
365
*/
366
/*[clinic input]
367
_codecs.utf_16_ex_decode
368
    data: Py_buffer
369
    errors: str(accept={str, NoneType}) = None
370
    byteorder: int = 0
371
    final: bool = False
372
    /
373
[clinic start generated code]*/
374
375
static PyObject *
376
_codecs_utf_16_ex_decode_impl(PyObject *module, Py_buffer *data,
377
                              const char *errors, int byteorder, int final)
378
/*[clinic end generated code: output=0f385f251ecc1988 input=f368a51cf384bf4c]*/
379
0
{
380
    /* This is overwritten unless final is true. */
381
0
    Py_ssize_t consumed = data->len;
382
383
0
    PyObject *decoded = PyUnicode_DecodeUTF16Stateful(data->buf, data->len,
384
0
                                                      errors, &byteorder,
385
0
                                                      final ? NULL : &consumed);
386
0
    if (decoded == NULL)
387
0
        return NULL;
388
0
    return Py_BuildValue("Nni", decoded, consumed, byteorder);
389
0
}
390
391
/*[clinic input]
392
_codecs.utf_32_decode
393
    data: Py_buffer
394
    errors: str(accept={str, NoneType}) = None
395
    final: bool = False
396
    /
397
[clinic start generated code]*/
398
399
static PyObject *
400
_codecs_utf_32_decode_impl(PyObject *module, Py_buffer *data,
401
                           const char *errors, int final)
402
/*[clinic end generated code: output=2fc961807f7b145f input=fcdf3658c5e9b5f3]*/
403
51.0k
{
404
51.0k
    int byteorder = 0;
405
    /* This is overwritten unless final is true. */
406
51.0k
    Py_ssize_t consumed = data->len;
407
51.0k
    PyObject *decoded = PyUnicode_DecodeUTF32Stateful(data->buf, data->len,
408
51.0k
                                                      errors, &byteorder,
409
51.0k
                                                      final ? NULL : &consumed);
410
51.0k
    return codec_tuple(decoded, consumed);
411
51.0k
}
412
413
/*[clinic input]
414
_codecs.utf_32_le_decode
415
    data: Py_buffer
416
    errors: str(accept={str, NoneType}) = None
417
    final: bool = False
418
    /
419
[clinic start generated code]*/
420
421
static PyObject *
422
_codecs_utf_32_le_decode_impl(PyObject *module, Py_buffer *data,
423
                              const char *errors, int final)
424
/*[clinic end generated code: output=ec8f46b67a94f3e6 input=12220556e885f817]*/
425
29
{
426
29
    int byteorder = -1;
427
    /* This is overwritten unless final is true. */
428
29
    Py_ssize_t consumed = data->len;
429
29
    PyObject *decoded = PyUnicode_DecodeUTF32Stateful(data->buf, data->len,
430
29
                                                      errors, &byteorder,
431
29
                                                      final ? NULL : &consumed);
432
29
    return codec_tuple(decoded, consumed);
433
29
}
434
435
/*[clinic input]
436
_codecs.utf_32_be_decode
437
    data: Py_buffer
438
    errors: str(accept={str, NoneType}) = None
439
    final: bool = False
440
    /
441
[clinic start generated code]*/
442
443
static PyObject *
444
_codecs_utf_32_be_decode_impl(PyObject *module, Py_buffer *data,
445
                              const char *errors, int final)
446
/*[clinic end generated code: output=ff82bae862c92c4e input=2bc669b4781598db]*/
447
81
{
448
81
    int byteorder = 1;
449
    /* This is overwritten unless final is true. */
450
81
    Py_ssize_t consumed = data->len;
451
81
    PyObject *decoded = PyUnicode_DecodeUTF32Stateful(data->buf, data->len,
452
81
                                                      errors, &byteorder,
453
81
                                                      final ? NULL : &consumed);
454
81
    return codec_tuple(decoded, consumed);
455
81
}
456
457
/* This non-standard version also provides access to the byteorder
458
   parameter of the builtin UTF-32 codec.
459
460
   It returns a tuple (unicode, bytesread, byteorder) with byteorder
461
   being the value in effect at the end of data.
462
463
*/
464
/*[clinic input]
465
_codecs.utf_32_ex_decode
466
    data: Py_buffer
467
    errors: str(accept={str, NoneType}) = None
468
    byteorder: int = 0
469
    final: bool = False
470
    /
471
[clinic start generated code]*/
472
473
static PyObject *
474
_codecs_utf_32_ex_decode_impl(PyObject *module, Py_buffer *data,
475
                              const char *errors, int byteorder, int final)
476
/*[clinic end generated code: output=6bfb177dceaf4848 input=4a2323d0013620df]*/
477
0
{
478
0
    Py_ssize_t consumed = data->len;
479
0
    PyObject *decoded = PyUnicode_DecodeUTF32Stateful(data->buf, data->len,
480
0
                                                      errors, &byteorder,
481
0
                                                      final ? NULL : &consumed);
482
0
    if (decoded == NULL)
483
0
        return NULL;
484
0
    return Py_BuildValue("Nni", decoded, consumed, byteorder);
485
0
}
486
487
/*[clinic input]
488
_codecs.unicode_escape_decode
489
    data: Py_buffer(accept={str, buffer})
490
    errors: str(accept={str, NoneType}) = None
491
    final: bool = True
492
    /
493
[clinic start generated code]*/
494
495
static PyObject *
496
_codecs_unicode_escape_decode_impl(PyObject *module, Py_buffer *data,
497
                                   const char *errors, int final)
498
/*[clinic end generated code: output=b284f97b12c635ee input=15019f081ffe272b]*/
499
545
{
500
545
    Py_ssize_t consumed = data->len;
501
545
    PyObject *decoded = _PyUnicode_DecodeUnicodeEscapeStateful(data->buf, data->len,
502
545
                                                               errors,
503
545
                                                               final ? NULL : &consumed);
504
545
    return codec_tuple(decoded, consumed);
505
545
}
506
507
/*[clinic input]
508
_codecs.raw_unicode_escape_decode
509
    data: Py_buffer(accept={str, buffer})
510
    errors: str(accept={str, NoneType}) = None
511
    final: bool = True
512
    /
513
[clinic start generated code]*/
514
515
static PyObject *
516
_codecs_raw_unicode_escape_decode_impl(PyObject *module, Py_buffer *data,
517
                                       const char *errors, int final)
518
/*[clinic end generated code: output=11dbd96301e2879e input=b93f823aa8c343ad]*/
519
112
{
520
112
    Py_ssize_t consumed = data->len;
521
112
    PyObject *decoded = _PyUnicode_DecodeRawUnicodeEscapeStateful(data->buf, data->len,
522
112
                                                                  errors,
523
112
                                                                  final ? NULL : &consumed);
524
112
    return codec_tuple(decoded, consumed);
525
112
}
526
527
/*[clinic input]
528
_codecs.latin_1_decode
529
    data: Py_buffer
530
    errors: str(accept={str, NoneType}) = None
531
    /
532
[clinic start generated code]*/
533
534
static PyObject *
535
_codecs_latin_1_decode_impl(PyObject *module, Py_buffer *data,
536
                            const char *errors)
537
/*[clinic end generated code: output=07f3dfa3f72c7d8f input=76ca58fd6dcd08c7]*/
538
4.09k
{
539
4.09k
    PyObject *decoded = PyUnicode_DecodeLatin1(data->buf, data->len, errors);
540
4.09k
    return codec_tuple(decoded, data->len);
541
4.09k
}
542
543
/*[clinic input]
544
_codecs.ascii_decode
545
    data: Py_buffer
546
    errors: str(accept={str, NoneType}) = None
547
    /
548
[clinic start generated code]*/
549
550
static PyObject *
551
_codecs_ascii_decode_impl(PyObject *module, Py_buffer *data,
552
                          const char *errors)
553
/*[clinic end generated code: output=2627d72058d42429 input=e428a267a04b4481]*/
554
22.0k
{
555
22.0k
    PyObject *decoded = PyUnicode_DecodeASCII(data->buf, data->len, errors);
556
22.0k
    return codec_tuple(decoded, data->len);
557
22.0k
}
558
559
/*[clinic input]
560
_codecs.charmap_decode
561
    data: Py_buffer
562
    errors: str(accept={str, NoneType}) = None
563
    mapping: object = None
564
    /
565
[clinic start generated code]*/
566
567
static PyObject *
568
_codecs_charmap_decode_impl(PyObject *module, Py_buffer *data,
569
                            const char *errors, PyObject *mapping)
570
/*[clinic end generated code: output=2c335b09778cf895 input=15b69df43458eb40]*/
571
655k
{
572
655k
    PyObject *decoded;
573
574
655k
    if (mapping == Py_None)
575
21
        mapping = NULL;
576
577
655k
    decoded = PyUnicode_DecodeCharmap(data->buf, data->len, mapping, errors);
578
655k
    return codec_tuple(decoded, data->len);
579
655k
}
580
581
#ifdef MS_WINDOWS
582
583
/*[clinic input]
584
_codecs.mbcs_decode
585
    data: Py_buffer
586
    errors: str(accept={str, NoneType}) = None
587
    final: bool = False
588
    /
589
[clinic start generated code]*/
590
591
static PyObject *
592
_codecs_mbcs_decode_impl(PyObject *module, Py_buffer *data,
593
                         const char *errors, int final)
594
/*[clinic end generated code: output=39b65b8598938c4b input=f144ad1ed6d8f5a6]*/
595
{
596
    Py_ssize_t consumed = data->len;
597
    PyObject *decoded = PyUnicode_DecodeMBCSStateful(data->buf, data->len,
598
            errors, final ? NULL : &consumed);
599
    return codec_tuple(decoded, consumed);
600
}
601
602
/*[clinic input]
603
_codecs.oem_decode
604
    data: Py_buffer
605
    errors: str(accept={str, NoneType}) = None
606
    final: bool = False
607
    /
608
[clinic start generated code]*/
609
610
static PyObject *
611
_codecs_oem_decode_impl(PyObject *module, Py_buffer *data,
612
                        const char *errors, int final)
613
/*[clinic end generated code: output=da1617612f3fcad8 input=629bf87376d211b4]*/
614
{
615
    Py_ssize_t consumed = data->len;
616
    PyObject *decoded = PyUnicode_DecodeCodePageStateful(CP_OEMCP,
617
        data->buf, data->len, errors, final ? NULL : &consumed);
618
    return codec_tuple(decoded, consumed);
619
}
620
621
/*[clinic input]
622
_codecs.code_page_decode
623
    codepage: int
624
    data: Py_buffer
625
    errors: str(accept={str, NoneType}) = None
626
    final: bool = False
627
    /
628
[clinic start generated code]*/
629
630
static PyObject *
631
_codecs_code_page_decode_impl(PyObject *module, int codepage,
632
                              Py_buffer *data, const char *errors, int final)
633
/*[clinic end generated code: output=53008ea967da3fff input=6a32589b0658c277]*/
634
{
635
    Py_ssize_t consumed = data->len;
636
    PyObject *decoded = PyUnicode_DecodeCodePageStateful(codepage,
637
                                                         data->buf, data->len,
638
                                                         errors,
639
                                                         final ? NULL : &consumed);
640
    return codec_tuple(decoded, consumed);
641
}
642
643
#endif /* MS_WINDOWS */
644
645
/* --- Encoder ------------------------------------------------------------ */
646
647
/*[clinic input]
648
_codecs.readbuffer_encode
649
    data: Py_buffer(accept={str, buffer})
650
    errors: str(accept={str, NoneType}) = None
651
    /
652
[clinic start generated code]*/
653
654
static PyObject *
655
_codecs_readbuffer_encode_impl(PyObject *module, Py_buffer *data,
656
                               const char *errors)
657
/*[clinic end generated code: output=c645ea7cdb3d6e86 input=aa10cfdf252455c5]*/
658
0
{
659
0
    PyObject *result = PyBytes_FromStringAndSize(data->buf, data->len);
660
0
    return codec_tuple(result, data->len);
661
0
}
662
663
/*[clinic input]
664
_codecs.utf_7_encode
665
    str: unicode
666
    errors: str(accept={str, NoneType}) = None
667
    /
668
[clinic start generated code]*/
669
670
static PyObject *
671
_codecs_utf_7_encode_impl(PyObject *module, PyObject *str,
672
                          const char *errors)
673
/*[clinic end generated code: output=0feda21ffc921bc8 input=2546dbbb3fa53114]*/
674
0
{
675
0
    return codec_tuple(_PyUnicode_EncodeUTF7(str, errors),
676
0
                       PyUnicode_GET_LENGTH(str));
677
0
}
678
679
/*[clinic input]
680
_codecs.utf_8_encode
681
    str: unicode
682
    errors: str(accept={str, NoneType}) = None
683
    /
684
[clinic start generated code]*/
685
686
static PyObject *
687
_codecs_utf_8_encode_impl(PyObject *module, PyObject *str,
688
                          const char *errors)
689
/*[clinic end generated code: output=02bf47332b9c796c input=a3e71ae01c3f93f3]*/
690
0
{
691
0
    return codec_tuple(_PyUnicode_AsUTF8String(str, errors),
692
0
                       PyUnicode_GET_LENGTH(str));
693
0
}
694
695
/* This version provides access to the byteorder parameter of the
696
   builtin UTF-16 codecs as optional third argument. It defaults to 0
697
   which means: use the native byte order and prepend the data with a
698
   BOM mark.
699
700
*/
701
702
/*[clinic input]
703
_codecs.utf_16_encode
704
    str: unicode
705
    errors: str(accept={str, NoneType}) = None
706
    byteorder: int = 0
707
    /
708
[clinic start generated code]*/
709
710
static PyObject *
711
_codecs_utf_16_encode_impl(PyObject *module, PyObject *str,
712
                           const char *errors, int byteorder)
713
/*[clinic end generated code: output=c654e13efa2e64e4 input=68cdc2eb8338555d]*/
714
0
{
715
0
    return codec_tuple(_PyUnicode_EncodeUTF16(str, errors, byteorder),
716
0
                       PyUnicode_GET_LENGTH(str));
717
0
}
718
719
/*[clinic input]
720
_codecs.utf_16_le_encode
721
    str: unicode
722
    errors: str(accept={str, NoneType}) = None
723
    /
724
[clinic start generated code]*/
725
726
static PyObject *
727
_codecs_utf_16_le_encode_impl(PyObject *module, PyObject *str,
728
                              const char *errors)
729
/*[clinic end generated code: output=431b01e55f2d4995 input=83d042706eed6798]*/
730
3.47k
{
731
3.47k
    return codec_tuple(_PyUnicode_EncodeUTF16(str, errors, -1),
732
3.47k
                       PyUnicode_GET_LENGTH(str));
733
3.47k
}
734
735
/*[clinic input]
736
_codecs.utf_16_be_encode
737
    str: unicode
738
    errors: str(accept={str, NoneType}) = None
739
    /
740
[clinic start generated code]*/
741
742
static PyObject *
743
_codecs_utf_16_be_encode_impl(PyObject *module, PyObject *str,
744
                              const char *errors)
745
/*[clinic end generated code: output=96886a6fd54dcae3 input=6f1e9e623b03071b]*/
746
3.65k
{
747
3.65k
    return codec_tuple(_PyUnicode_EncodeUTF16(str, errors, +1),
748
3.65k
                       PyUnicode_GET_LENGTH(str));
749
3.65k
}
750
751
/* This version provides access to the byteorder parameter of the
752
   builtin UTF-32 codecs as optional third argument. It defaults to 0
753
   which means: use the native byte order and prepend the data with a
754
   BOM mark.
755
756
*/
757
758
/*[clinic input]
759
_codecs.utf_32_encode
760
    str: unicode
761
    errors: str(accept={str, NoneType}) = None
762
    byteorder: int = 0
763
    /
764
[clinic start generated code]*/
765
766
static PyObject *
767
_codecs_utf_32_encode_impl(PyObject *module, PyObject *str,
768
                           const char *errors, int byteorder)
769
/*[clinic end generated code: output=5c760da0c09a8b83 input=8ec4c64d983bc52b]*/
770
0
{
771
0
    return codec_tuple(_PyUnicode_EncodeUTF32(str, errors, byteorder),
772
0
                       PyUnicode_GET_LENGTH(str));
773
0
}
774
775
/*[clinic input]
776
_codecs.utf_32_le_encode
777
    str: unicode
778
    errors: str(accept={str, NoneType}) = None
779
    /
780
[clinic start generated code]*/
781
782
static PyObject *
783
_codecs_utf_32_le_encode_impl(PyObject *module, PyObject *str,
784
                              const char *errors)
785
/*[clinic end generated code: output=b65cd176de8e36d6 input=f0918d41de3eb1b1]*/
786
0
{
787
0
    return codec_tuple(_PyUnicode_EncodeUTF32(str, errors, -1),
788
0
                       PyUnicode_GET_LENGTH(str));
789
0
}
790
791
/*[clinic input]
792
_codecs.utf_32_be_encode
793
    str: unicode
794
    errors: str(accept={str, NoneType}) = None
795
    /
796
[clinic start generated code]*/
797
798
static PyObject *
799
_codecs_utf_32_be_encode_impl(PyObject *module, PyObject *str,
800
                              const char *errors)
801
/*[clinic end generated code: output=1d9e71a9358709e9 input=967a99a95748b557]*/
802
0
{
803
0
    return codec_tuple(_PyUnicode_EncodeUTF32(str, errors, +1),
804
0
                       PyUnicode_GET_LENGTH(str));
805
0
}
806
807
/*[clinic input]
808
_codecs.unicode_escape_encode
809
    str: unicode
810
    errors: str(accept={str, NoneType}) = None
811
    /
812
[clinic start generated code]*/
813
814
static PyObject *
815
_codecs_unicode_escape_encode_impl(PyObject *module, PyObject *str,
816
                                   const char *errors)
817
/*[clinic end generated code: output=66271b30bc4f7a3c input=8c4de07597054e33]*/
818
330k
{
819
330k
    return codec_tuple(PyUnicode_AsUnicodeEscapeString(str),
820
330k
                       PyUnicode_GET_LENGTH(str));
821
330k
}
822
823
/*[clinic input]
824
_codecs.raw_unicode_escape_encode
825
    str: unicode
826
    errors: str(accept={str, NoneType}) = None
827
    /
828
[clinic start generated code]*/
829
830
static PyObject *
831
_codecs_raw_unicode_escape_encode_impl(PyObject *module, PyObject *str,
832
                                       const char *errors)
833
/*[clinic end generated code: output=a66a806ed01c830a input=4aa6f280d78e4574]*/
834
247k
{
835
247k
    return codec_tuple(PyUnicode_AsRawUnicodeEscapeString(str),
836
247k
                       PyUnicode_GET_LENGTH(str));
837
247k
}
838
839
/*[clinic input]
840
_codecs.latin_1_encode
841
    str: unicode
842
    errors: str(accept={str, NoneType}) = None
843
    /
844
[clinic start generated code]*/
845
846
static PyObject *
847
_codecs_latin_1_encode_impl(PyObject *module, PyObject *str,
848
                            const char *errors)
849
/*[clinic end generated code: output=2c28c83a27884e08 input=ec3ef74bf85c5c5d]*/
850
0
{
851
0
    return codec_tuple(_PyUnicode_AsLatin1String(str, errors),
852
0
                       PyUnicode_GET_LENGTH(str));
853
0
}
854
855
/*[clinic input]
856
_codecs.ascii_encode
857
    str: unicode
858
    errors: str(accept={str, NoneType}) = None
859
    /
860
[clinic start generated code]*/
861
862
static PyObject *
863
_codecs_ascii_encode_impl(PyObject *module, PyObject *str,
864
                          const char *errors)
865
/*[clinic end generated code: output=b5e035182d33befc input=93e6e602838bd3de]*/
866
0
{
867
0
    return codec_tuple(_PyUnicode_AsASCIIString(str, errors),
868
0
                       PyUnicode_GET_LENGTH(str));
869
0
}
870
871
/*[clinic input]
872
_codecs.charmap_encode
873
    str: unicode
874
    errors: str(accept={str, NoneType}) = None
875
    mapping: object = None
876
    /
877
[clinic start generated code]*/
878
879
static PyObject *
880
_codecs_charmap_encode_impl(PyObject *module, PyObject *str,
881
                            const char *errors, PyObject *mapping)
882
/*[clinic end generated code: output=047476f48495a9e9 input=2a98feae73dadce8]*/
883
0
{
884
0
    if (mapping == Py_None)
885
0
        mapping = NULL;
886
887
0
    return codec_tuple(_PyUnicode_EncodeCharmap(str, mapping, errors),
888
0
                       PyUnicode_GET_LENGTH(str));
889
0
}
890
891
/*[clinic input]
892
_codecs.charmap_build
893
    map: unicode
894
    /
895
[clinic start generated code]*/
896
897
static PyObject *
898
_codecs_charmap_build_impl(PyObject *module, PyObject *map)
899
/*[clinic end generated code: output=bb073c27031db9ac input=d91a91d1717dbc6d]*/
900
136
{
901
136
    return PyUnicode_BuildEncodingMap(map);
902
136
}
903
904
#ifdef MS_WINDOWS
905
906
/*[clinic input]
907
_codecs.mbcs_encode
908
    str: unicode
909
    errors: str(accept={str, NoneType}) = None
910
    /
911
[clinic start generated code]*/
912
913
static PyObject *
914
_codecs_mbcs_encode_impl(PyObject *module, PyObject *str, const char *errors)
915
/*[clinic end generated code: output=76e2e170c966c080 input=2e932fc289ea5a5b]*/
916
{
917
    return codec_tuple(PyUnicode_EncodeCodePage(CP_ACP, str, errors),
918
                       PyUnicode_GET_LENGTH(str));
919
}
920
921
/*[clinic input]
922
_codecs.oem_encode
923
    str: unicode
924
    errors: str(accept={str, NoneType}) = None
925
    /
926
[clinic start generated code]*/
927
928
static PyObject *
929
_codecs_oem_encode_impl(PyObject *module, PyObject *str, const char *errors)
930
/*[clinic end generated code: output=65d5982c737de649 input=9eac86dc21eb14f2]*/
931
{
932
    return codec_tuple(PyUnicode_EncodeCodePage(CP_OEMCP, str, errors),
933
        PyUnicode_GET_LENGTH(str));
934
}
935
936
/*[clinic input]
937
_codecs.code_page_encode
938
    code_page: int
939
    str: unicode
940
    errors: str(accept={str, NoneType}) = None
941
    /
942
[clinic start generated code]*/
943
944
static PyObject *
945
_codecs_code_page_encode_impl(PyObject *module, int code_page, PyObject *str,
946
                              const char *errors)
947
/*[clinic end generated code: output=45673f6085657a9e input=7d18a33bc8cd0f94]*/
948
{
949
    return codec_tuple(PyUnicode_EncodeCodePage(code_page, str, errors),
950
                       PyUnicode_GET_LENGTH(str));
951
}
952
953
#endif /* MS_WINDOWS */
954
955
/* --- Error handler registry --------------------------------------------- */
956
957
/*[clinic input]
958
_codecs.register_error
959
    errors: str
960
    handler: object
961
    /
962
963
Register the specified error handler under the name errors.
964
965
handler must be a callable object, that will be called with an exception
966
instance containing information about the location of the
967
encoding/decoding error and must return a (replacement, new position)
968
tuple.
969
[clinic start generated code]*/
970
971
static PyObject *
972
_codecs_register_error_impl(PyObject *module, const char *errors,
973
                            PyObject *handler)
974
/*[clinic end generated code: output=fa2f7d1879b3067d input=5bea01dfe835d9d8]*/
975
0
{
976
0
    if (PyCodec_RegisterError(errors, handler))
977
0
        return NULL;
978
0
    Py_RETURN_NONE;
979
0
}
980
981
/*[clinic input]
982
_codecs._unregister_error -> bool
983
    errors: str
984
    /
985
986
Un-register the specified error handler for the error handling `errors'.
987
988
Only custom error handlers can be un-registered. An exception is raised
989
if the error handling is a built-in one (e.g., 'strict'), or if an error
990
occurs.
991
992
Otherwise, this returns True if a custom handler has been successfully
993
un-registered, and False if no custom handler for the specified error
994
handling exists.
995
996
[clinic start generated code]*/
997
998
static int
999
_codecs__unregister_error_impl(PyObject *module, const char *errors)
1000
/*[clinic end generated code: output=28c22be667465503 input=a63ab9e9ce1686d4]*/
1001
0
{
1002
0
    return _PyCodec_UnregisterError(errors);
1003
0
}
1004
1005
/*[clinic input]
1006
_codecs.lookup_error
1007
    name: str
1008
    /
1009
1010
lookup_error(errors) -> handler
1011
1012
Return the error handler for the specified error handling name or raise
1013
a LookupError, if no handler exists under this name.
1014
[clinic start generated code]*/
1015
1016
static PyObject *
1017
_codecs_lookup_error_impl(PyObject *module, const char *name)
1018
/*[clinic end generated code: output=087f05dc0c9a98cc input=86cfb6a7a9c67113]*/
1019
222
{
1020
222
    return PyCodec_LookupError(name);
1021
222
}
1022
1023
extern int _Py_normalize_encoding(const char *, char *, size_t, int);
1024
1025
/*[clinic input]
1026
_codecs._normalize_encoding
1027
    encoding: unicode
1028
1029
Normalize an encoding name *encoding*.
1030
1031
Used for encodings.normalize_encoding. Does not convert to lower case.
1032
[clinic start generated code]*/
1033
1034
static PyObject *
1035
_codecs__normalize_encoding_impl(PyObject *module, PyObject *encoding)
1036
/*[clinic end generated code: output=d27465d81e361f8e input=3ff3f4d64995b988]*/
1037
3.57k
{
1038
3.57k
    Py_ssize_t len;
1039
3.57k
    const char *cstr = PyUnicode_AsUTF8AndSize(encoding, &len);
1040
3.57k
    if (cstr == NULL) {
1041
0
        return NULL;
1042
0
    }
1043
1044
3.57k
    if (len > PY_SSIZE_T_MAX) {
1045
0
        PyErr_SetString(PyExc_OverflowError, "encoding is too large");
1046
0
        return NULL;
1047
0
    }
1048
1049
3.57k
    char *normalized = PyMem_Malloc(len + 1);
1050
3.57k
    if (normalized == NULL) {
1051
0
        return PyErr_NoMemory();
1052
0
    }
1053
1054
3.57k
    if (!_Py_normalize_encoding(cstr, normalized, len + 1, 0)) {
1055
0
        PyMem_Free(normalized);
1056
0
        return NULL;
1057
0
    }
1058
1059
3.57k
    PyObject *result = PyUnicode_FromString(normalized);
1060
3.57k
    PyMem_Free(normalized);
1061
3.57k
    return result;
1062
3.57k
}
1063
1064
/* --- Module API --------------------------------------------------------- */
1065
1066
static PyMethodDef _codecs_functions[] = {
1067
    _CODECS_REGISTER_METHODDEF
1068
    _CODECS_UNREGISTER_METHODDEF
1069
    _CODECS_LOOKUP_METHODDEF
1070
    _CODECS_ENCODE_METHODDEF
1071
    _CODECS_DECODE_METHODDEF
1072
    _CODECS_ESCAPE_ENCODE_METHODDEF
1073
    _CODECS_ESCAPE_DECODE_METHODDEF
1074
    _CODECS_UTF_8_ENCODE_METHODDEF
1075
    _CODECS_UTF_8_DECODE_METHODDEF
1076
    _CODECS_UTF_7_ENCODE_METHODDEF
1077
    _CODECS_UTF_7_DECODE_METHODDEF
1078
    _CODECS_UTF_16_ENCODE_METHODDEF
1079
    _CODECS_UTF_16_LE_ENCODE_METHODDEF
1080
    _CODECS_UTF_16_BE_ENCODE_METHODDEF
1081
    _CODECS_UTF_16_DECODE_METHODDEF
1082
    _CODECS_UTF_16_LE_DECODE_METHODDEF
1083
    _CODECS_UTF_16_BE_DECODE_METHODDEF
1084
    _CODECS_UTF_16_EX_DECODE_METHODDEF
1085
    _CODECS_UTF_32_ENCODE_METHODDEF
1086
    _CODECS_UTF_32_LE_ENCODE_METHODDEF
1087
    _CODECS_UTF_32_BE_ENCODE_METHODDEF
1088
    _CODECS_UTF_32_DECODE_METHODDEF
1089
    _CODECS_UTF_32_LE_DECODE_METHODDEF
1090
    _CODECS_UTF_32_BE_DECODE_METHODDEF
1091
    _CODECS_UTF_32_EX_DECODE_METHODDEF
1092
    _CODECS_UNICODE_ESCAPE_ENCODE_METHODDEF
1093
    _CODECS_UNICODE_ESCAPE_DECODE_METHODDEF
1094
    _CODECS_RAW_UNICODE_ESCAPE_ENCODE_METHODDEF
1095
    _CODECS_RAW_UNICODE_ESCAPE_DECODE_METHODDEF
1096
    _CODECS_LATIN_1_ENCODE_METHODDEF
1097
    _CODECS_LATIN_1_DECODE_METHODDEF
1098
    _CODECS_ASCII_ENCODE_METHODDEF
1099
    _CODECS_ASCII_DECODE_METHODDEF
1100
    _CODECS_CHARMAP_ENCODE_METHODDEF
1101
    _CODECS_CHARMAP_DECODE_METHODDEF
1102
    _CODECS_CHARMAP_BUILD_METHODDEF
1103
    _CODECS_READBUFFER_ENCODE_METHODDEF
1104
    _CODECS_MBCS_ENCODE_METHODDEF
1105
    _CODECS_MBCS_DECODE_METHODDEF
1106
    _CODECS_OEM_ENCODE_METHODDEF
1107
    _CODECS_OEM_DECODE_METHODDEF
1108
    _CODECS_CODE_PAGE_ENCODE_METHODDEF
1109
    _CODECS_CODE_PAGE_DECODE_METHODDEF
1110
    _CODECS_REGISTER_ERROR_METHODDEF
1111
    _CODECS__UNREGISTER_ERROR_METHODDEF
1112
    _CODECS_LOOKUP_ERROR_METHODDEF
1113
    _CODECS__NORMALIZE_ENCODING_METHODDEF
1114
    {NULL, NULL}                /* sentinel */
1115
};
1116
1117
static PyModuleDef_Slot _codecs_slots[] = {
1118
    _Py_ABI_SLOT,
1119
    {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
1120
    {Py_mod_gil, Py_MOD_GIL_NOT_USED},
1121
    {0, NULL}
1122
};
1123
1124
static struct PyModuleDef codecsmodule = {
1125
        PyModuleDef_HEAD_INIT,
1126
        "_codecs",
1127
        NULL,
1128
        0,
1129
        _codecs_functions,
1130
        _codecs_slots,
1131
        NULL,
1132
        NULL,
1133
        NULL
1134
};
1135
1136
PyMODINIT_FUNC
1137
PyInit__codecs(void)
1138
37
{
1139
37
    return PyModuleDef_Init(&codecsmodule);
1140
37
}