Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/chardet/registry.py: 91%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""Encoding registry with metadata for all supported encodings."""
3from __future__ import annotations
5import codecs
6import dataclasses
7import functools
8from types import MappingProxyType
9from typing import Literal
11from chardet.enums import EncodingEra
# Closed set of canonical encoding names.  Used as the type of
# ``EncodingInfo.name`` and as the (non-None) return type of
# ``lookup_encoding``.  Entries are kept in plain string sort order.
EncodingName = Literal[
    "ascii",
    "big5hkscs",
    "cp1006",
    "cp1026",
    "cp1125",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp273",
    "cp424",
    "cp437",
    "cp500",
    "cp720",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp858",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "euc_jis_2004",
    "euc_kr",
    "gb18030",
    "hp-roman8",
    "hz",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859-1",
    "iso8859-10",
    "iso8859-13",
    "iso8859-14",
    "iso8859-15",
    "iso8859-16",
    "iso8859-2",
    "iso8859-3",
    "iso8859-4",
    "iso8859-5",
    "iso8859-6",
    "iso8859-7",
    "iso8859-8",
    "iso8859-9",
    "johab",
    "koi8-r",
    "koi8-t",
    "koi8-u",
    "kz1048",
    "mac-cyrillic",
    "mac-greek",
    "mac-iceland",
    "mac-latin2",
    "mac-roman",
    "mac-turkish",
    "ptcp154",
    "shift_jis_2004",
    "tis-620",
    "utf-16",
    "utf-16-be",
    "utf-16-le",
    "utf-32",
    "utf-32-be",
    "utf-32-le",
    "utf-7",
    "utf-8",
    "utf-8-sig",
]
# Language-code groups reused by several EncodingInfo entries further down.
_WESTERN = (
    "en", "fr", "de", "es", "pt", "it", "nl",
    "da", "sv", "no", "fi", "is", "id", "ms",
)  # fmt: skip
# Western group plus Turkish.
_WESTERN_TR = _WESTERN + ("tr",)
_CYRILLIC = ("ru", "bg", "uk", "sr", "mk", "be")
_CENTRAL_EU = ("pl", "cs", "hu", "hr", "ro", "sk", "sl")
# Central-European group with Romanian removed (relative order preserved).
_CENTRAL_EU_NO_RO = tuple(code for code in _CENTRAL_EU if code != "ro")
_BALTIC = ("et", "lt", "lv")
_ARABIC = ("ar", "fa")
127@dataclasses.dataclass(frozen=True, slots=True)
128class EncodingInfo:
129 """Metadata for a single encoding."""
131 name: EncodingName
132 aliases: tuple[str, ...]
133 era: EncodingEra
134 is_multibyte: bool
135 languages: tuple[str, ...]
@functools.cache
def get_candidates(era: EncodingEra) -> tuple[EncodingInfo, ...]:
    """Select the registry entries whose era overlaps *era*.

    Results are memoised per *era* value.

    :param era: Bit flags specifying which encoding eras to include.
    :returns: A tuple of matching :class:`EncodingInfo` entries, in
        registry order.
    """
    selected = []
    for info in REGISTRY.values():
        # Keep the entry when it shares at least one flag bit with the filter.
        if info.era & era:
            selected.append(info)
    return tuple(selected)
# Era assignments match chardet 6.0.0's chardet/metadata/charsets.py
#
# One EncodingInfo per canonical encoding name.  The "===" banners below are
# only a rough visual grouping: each entry's ``era=`` field is authoritative,
# and a few entries carry an era that differs from the banner they sit under
# (flagged with NOTE comments).

_REGISTRY_ENTRIES = (
    # === MODERN_WEB ===
    EncodingInfo(
        name="ascii",
        aliases=("us-ascii",),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=False,
        languages=(),
    ),
    EncodingInfo(
        name="utf-8",
        aliases=("utf-8", "utf8"),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=False,
        languages=(),
    ),
    EncodingInfo(
        name="utf-8-sig",
        aliases=("UTF-8-SIG", "utf-8-bom"),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=False,
        languages=(),
    ),
    EncodingInfo(
        name="utf-16",
        aliases=("UTF-16", "utf16"),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=False,
        languages=(),
    ),
    EncodingInfo(
        name="utf-16-be",
        aliases=("UTF-16-BE", "utf-16be"),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=False,
        languages=(),
    ),
    EncodingInfo(
        name="utf-16-le",
        aliases=("UTF-16-LE", "utf-16le"),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=False,
        languages=(),
    ),
    EncodingInfo(
        name="utf-32",
        aliases=("UTF-32", "utf32"),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=False,
        languages=(),
    ),
    EncodingInfo(
        name="utf-32-be",
        aliases=("UTF-32-BE", "utf-32be"),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=False,
        languages=(),
    ),
    EncodingInfo(
        name="utf-32-le",
        aliases=("UTF-32-LE", "utf-32le"),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=False,
        languages=(),
    ),
    # NOTE: utf-7 is tagged LEGACY_REGIONAL although it is listed with the
    # other UTF encodings.
    EncodingInfo(
        name="utf-7",
        aliases=("UTF-7", "utf7"),
        era=EncodingEra.LEGACY_REGIONAL,
        is_multibyte=False,
        languages=(),
    ),
    # CJK - Modern Web (except hz and iso2022_kr, which are LEGACY_REGIONAL)
    EncodingInfo(
        name="big5hkscs",
        aliases=("Big5-HKSCS", "Big5HKSCS", "big5", "big5-tw", "csbig5", "cp950"),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=True,
        languages=("zh",),
    ),
    EncodingInfo(
        name="cp932",
        aliases=("CP932", "ms932", "mskanji", "ms-kanji"),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=True,
        languages=("ja",),
    ),
    EncodingInfo(
        name="cp949",
        aliases=("CP949", "ms949", "uhc"),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=True,
        languages=("ko",),
    ),
    EncodingInfo(
        name="euc_jis_2004",
        aliases=("EUC-JIS-2004", "euc-jp", "eucjp", "ujis", "u-jis", "euc-jisx0213"),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=True,
        languages=("ja",),
    ),
    EncodingInfo(
        name="euc_kr",
        aliases=("EUC-KR", "euckr"),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=True,
        languages=("ko",),
    ),
    EncodingInfo(
        name="gb18030",
        aliases=("GB18030", "gb-18030", "gb2312", "gbk"),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=True,
        languages=("zh",),
    ),
    # NOTE: LEGACY_REGIONAL, not MODERN_WEB.
    EncodingInfo(
        name="hz",
        aliases=("HZ-GB-2312", "hz"),
        era=EncodingEra.LEGACY_REGIONAL,
        is_multibyte=True,
        languages=("zh",),
    ),
    EncodingInfo(
        name="iso2022_jp_2",
        aliases=("ISO-2022-JP-2", "iso-2022-jp", "csiso2022jp", "iso2022-jp-1"),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=True,
        languages=("ja",),
    ),
    EncodingInfo(
        name="iso2022_jp_2004",
        aliases=("ISO-2022-JP-2004", "iso2022-jp-3"),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=True,
        languages=("ja",),
    ),
    EncodingInfo(
        name="iso2022_jp_ext",
        aliases=("ISO-2022-JP-EXT",),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=True,
        languages=("ja",),
    ),
    # NOTE: LEGACY_REGIONAL, not MODERN_WEB.
    EncodingInfo(
        name="iso2022_kr",
        aliases=("ISO-2022-KR", "csiso2022kr"),
        era=EncodingEra.LEGACY_REGIONAL,
        is_multibyte=True,
        languages=("ko",),
    ),
    EncodingInfo(
        name="shift_jis_2004",
        aliases=(
            "Shift-JIS-2004",
            "Shift_JIS_2004",
            "shift_jis",
            "sjis",
            "shiftjis",
            "s_jis",
            "shift-jisx0213",
        ),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=True,
        languages=("ja",),
    ),
    # Windows code pages - Modern Web
    EncodingInfo(
        name="cp874",
        aliases=("CP874", "windows-874"),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=False,
        languages=("th",),
    ),
    EncodingInfo(
        name="cp1250",
        aliases=("Windows-1250", "cp1250"),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=False,
        languages=_CENTRAL_EU,
    ),
    EncodingInfo(
        name="cp1251",
        aliases=("Windows-1251", "cp1251"),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=False,
        languages=_CYRILLIC,
    ),
    EncodingInfo(
        name="cp1252",
        aliases=("Windows-1252", "cp1252"),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=False,
        languages=_WESTERN,
    ),
    EncodingInfo(
        name="cp1253",
        aliases=("Windows-1253", "cp1253"),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=False,
        languages=("el",),
    ),
    EncodingInfo(
        name="cp1254",
        aliases=("Windows-1254", "cp1254"),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=False,
        languages=("tr",),
    ),
    EncodingInfo(
        name="cp1255",
        aliases=("Windows-1255", "cp1255"),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=False,
        languages=("he",),
    ),
    EncodingInfo(
        name="cp1256",
        aliases=("Windows-1256", "cp1256"),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=False,
        languages=_ARABIC,
    ),
    EncodingInfo(
        name="cp1257",
        aliases=("Windows-1257", "cp1257"),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=False,
        languages=_BALTIC,
    ),
    EncodingInfo(
        name="cp1258",
        aliases=("Windows-1258", "cp1258"),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=False,
        languages=("vi",),
    ),
    # KOI8 - Modern Web
    EncodingInfo(
        name="koi8-r",
        aliases=("KOI8-R", "koi8r"),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=False,
        languages=("ru",),
    ),
    EncodingInfo(
        name="koi8-u",
        aliases=("KOI8-U", "koi8u"),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=False,
        languages=("uk",),
    ),
    # TIS-620 - Modern Web
    EncodingInfo(
        name="tis-620",
        aliases=("TIS-620", "tis620", "iso-8859-11"),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=False,
        languages=("th",),
    ),
    # === LEGACY_ISO ===
    EncodingInfo(
        name="iso8859-1",
        aliases=("ISO-8859-1", "latin-1", "latin1", "iso8859-1"),
        era=EncodingEra.LEGACY_ISO,
        is_multibyte=False,
        languages=_WESTERN,
    ),
    EncodingInfo(
        name="iso8859-2",
        aliases=("ISO-8859-2", "latin-2", "latin2", "iso8859-2"),
        era=EncodingEra.LEGACY_ISO,
        is_multibyte=False,
        languages=_CENTRAL_EU,
    ),
    EncodingInfo(
        name="iso8859-3",
        aliases=("ISO-8859-3", "latin-3", "latin3", "iso8859-3"),
        era=EncodingEra.LEGACY_ISO,
        is_multibyte=False,
        languages=("eo", "mt", "tr"),
    ),
    EncodingInfo(
        name="iso8859-4",
        aliases=("ISO-8859-4", "latin-4", "latin4", "iso8859-4"),
        era=EncodingEra.LEGACY_ISO,
        is_multibyte=False,
        languages=_BALTIC,
    ),
    EncodingInfo(
        name="iso8859-5",
        aliases=("ISO-8859-5", "iso8859-5", "cyrillic"),
        era=EncodingEra.LEGACY_ISO,
        is_multibyte=False,
        languages=_CYRILLIC,
    ),
    EncodingInfo(
        name="iso8859-6",
        aliases=("ISO-8859-6", "iso8859-6", "arabic"),
        era=EncodingEra.LEGACY_ISO,
        is_multibyte=False,
        languages=_ARABIC,
    ),
    EncodingInfo(
        name="iso8859-7",
        aliases=("ISO-8859-7", "iso8859-7", "greek"),
        era=EncodingEra.LEGACY_ISO,
        is_multibyte=False,
        languages=("el",),
    ),
    EncodingInfo(
        name="iso8859-8",
        aliases=("ISO-8859-8", "iso8859-8", "hebrew"),
        era=EncodingEra.LEGACY_ISO,
        is_multibyte=False,
        languages=("he",),
    ),
    EncodingInfo(
        name="iso8859-9",
        aliases=("ISO-8859-9", "latin-5", "latin5", "iso8859-9"),
        era=EncodingEra.LEGACY_ISO,
        is_multibyte=False,
        languages=("tr",),
    ),
    EncodingInfo(
        name="iso8859-10",
        aliases=("ISO-8859-10", "latin-6", "latin6", "iso8859-10"),
        era=EncodingEra.LEGACY_ISO,
        is_multibyte=False,
        languages=("is", "fi"),
    ),
    EncodingInfo(
        name="iso8859-13",
        aliases=("ISO-8859-13", "latin-7", "latin7", "iso8859-13"),
        era=EncodingEra.LEGACY_ISO,
        is_multibyte=False,
        languages=_BALTIC,
    ),
    EncodingInfo(
        name="iso8859-14",
        aliases=("ISO-8859-14", "latin-8", "latin8", "iso8859-14"),
        era=EncodingEra.LEGACY_ISO,
        is_multibyte=False,
        languages=("cy", "ga", "br", "gd"),
    ),
    EncodingInfo(
        name="iso8859-15",
        aliases=("ISO-8859-15", "latin-9", "latin9", "iso8859-15"),
        era=EncodingEra.LEGACY_ISO,
        is_multibyte=False,
        languages=_WESTERN,
    ),
    EncodingInfo(
        name="iso8859-16",
        aliases=("ISO-8859-16", "latin-10", "latin10", "iso8859-16"),
        era=EncodingEra.LEGACY_ISO,
        is_multibyte=False,
        languages=("ro", "pl", "hr", "hu", "sk", "sl"),
    ),
    # Johab - Legacy ISO per chardet 6.0.0
    EncodingInfo(
        name="johab",
        aliases=("Johab",),
        era=EncodingEra.LEGACY_ISO,
        is_multibyte=True,
        languages=("ko",),
    ),
    # === LEGACY_MAC ===
    EncodingInfo(
        name="mac-cyrillic",
        aliases=("Mac-Cyrillic", "MacCyrillic", "maccyrillic"),
        era=EncodingEra.LEGACY_MAC,
        is_multibyte=False,
        languages=_CYRILLIC,
    ),
    EncodingInfo(
        name="mac-greek",
        aliases=("Mac-Greek", "MacGreek", "macgreek"),
        era=EncodingEra.LEGACY_MAC,
        is_multibyte=False,
        languages=("el",),
    ),
    EncodingInfo(
        name="mac-iceland",
        aliases=("Mac-Iceland", "MacIceland", "maciceland"),
        era=EncodingEra.LEGACY_MAC,
        is_multibyte=False,
        languages=("is",),
    ),
    EncodingInfo(
        name="mac-latin2",
        aliases=("Mac-Latin2", "MacLatin2", "maclatin2", "maccentraleurope"),
        era=EncodingEra.LEGACY_MAC,
        is_multibyte=False,
        languages=_CENTRAL_EU_NO_RO,
    ),
    EncodingInfo(
        name="mac-roman",
        aliases=("Mac-Roman", "MacRoman", "macroman", "macintosh"),
        era=EncodingEra.LEGACY_MAC,
        is_multibyte=False,
        languages=_WESTERN,
    ),
    EncodingInfo(
        name="mac-turkish",
        aliases=("Mac-Turkish", "MacTurkish", "macturkish"),
        era=EncodingEra.LEGACY_MAC,
        is_multibyte=False,
        languages=("tr",),
    ),
    # === LEGACY_REGIONAL ===
    EncodingInfo(
        name="cp720",
        aliases=("CP720",),
        era=EncodingEra.LEGACY_REGIONAL,
        is_multibyte=False,
        languages=_ARABIC,
    ),
    EncodingInfo(
        name="cp1006",
        aliases=("CP1006",),
        era=EncodingEra.LEGACY_REGIONAL,
        is_multibyte=False,
        languages=("ur",),
    ),
    EncodingInfo(
        name="cp1125",
        aliases=("CP1125",),
        era=EncodingEra.LEGACY_REGIONAL,
        is_multibyte=False,
        languages=("uk",),
    ),
    EncodingInfo(
        name="koi8-t",
        aliases=("KOI8-T",),
        era=EncodingEra.LEGACY_REGIONAL,
        is_multibyte=False,
        languages=("tg",),
    ),
    EncodingInfo(
        name="kz1048",
        aliases=("KZ-1048", "kz1048", "strk1048-2002", "rk1048"),
        era=EncodingEra.LEGACY_REGIONAL,
        is_multibyte=False,
        languages=("kk",),
    ),
    EncodingInfo(
        name="ptcp154",
        aliases=("PTCP154", "pt154", "cp154"),
        era=EncodingEra.LEGACY_REGIONAL,
        is_multibyte=False,
        languages=("kk",),
    ),
    EncodingInfo(
        name="hp-roman8",
        aliases=("HP-Roman8", "roman8", "r8", "csHPRoman8"),
        era=EncodingEra.LEGACY_REGIONAL,
        is_multibyte=False,
        languages=_WESTERN,
    ),
    # === DOS ===
    EncodingInfo(
        name="cp437",
        aliases=("CP437",),
        era=EncodingEra.DOS,
        is_multibyte=False,
        languages=("en", "fr", "de", "es", "pt", "it", "nl", "da", "sv", "fi"),
    ),
    EncodingInfo(
        name="cp737",
        aliases=("CP737",),
        era=EncodingEra.DOS,
        is_multibyte=False,
        languages=("el",),
    ),
    EncodingInfo(
        name="cp775",
        aliases=("CP775",),
        era=EncodingEra.DOS,
        is_multibyte=False,
        languages=_BALTIC,
    ),
    EncodingInfo(
        name="cp850",
        aliases=("CP850",),
        era=EncodingEra.DOS,
        is_multibyte=False,
        languages=_WESTERN,
    ),
    EncodingInfo(
        name="cp852",
        aliases=("CP852",),
        era=EncodingEra.DOS,
        is_multibyte=False,
        languages=_CENTRAL_EU_NO_RO,
    ),
    EncodingInfo(
        name="cp855",
        aliases=("CP855",),
        era=EncodingEra.DOS,
        is_multibyte=False,
        languages=_CYRILLIC,
    ),
    EncodingInfo(
        name="cp856",
        aliases=("CP856",),
        era=EncodingEra.DOS,
        is_multibyte=False,
        languages=("he",),
    ),
    EncodingInfo(
        name="cp857",
        aliases=("CP857",),
        era=EncodingEra.DOS,
        is_multibyte=False,
        languages=("tr",),
    ),
    EncodingInfo(
        name="cp858",
        aliases=("CP858",),
        era=EncodingEra.DOS,
        is_multibyte=False,
        languages=_WESTERN,
    ),
    EncodingInfo(
        name="cp860",
        aliases=("CP860",),
        era=EncodingEra.DOS,
        is_multibyte=False,
        languages=("pt",),
    ),
    EncodingInfo(
        name="cp861",
        aliases=("CP861",),
        era=EncodingEra.DOS,
        is_multibyte=False,
        languages=("is",),
    ),
    EncodingInfo(
        name="cp862",
        aliases=("CP862",),
        era=EncodingEra.DOS,
        is_multibyte=False,
        languages=("he",),
    ),
    EncodingInfo(
        name="cp863",
        aliases=("CP863",),
        era=EncodingEra.DOS,
        is_multibyte=False,
        languages=("fr",),
    ),
    EncodingInfo(
        name="cp864",
        aliases=("CP864",),
        era=EncodingEra.DOS,
        is_multibyte=False,
        languages=("ar",),
    ),
    EncodingInfo(
        name="cp865",
        aliases=("CP865",),
        era=EncodingEra.DOS,
        is_multibyte=False,
        languages=("da", "no"),
    ),
    EncodingInfo(
        name="cp866",
        aliases=("CP866",),
        era=EncodingEra.DOS,
        is_multibyte=False,
        languages=_CYRILLIC,
    ),
    EncodingInfo(
        name="cp869",
        aliases=("CP869",),
        era=EncodingEra.DOS,
        is_multibyte=False,
        languages=("el",),
    ),
    # === MAINFRAME ===
    EncodingInfo(
        name="cp1140",
        aliases=("CP1140", "cp037"),
        era=EncodingEra.MAINFRAME,
        is_multibyte=False,
        languages=_WESTERN_TR,
    ),
    EncodingInfo(
        name="cp424",
        aliases=("CP424",),
        era=EncodingEra.MAINFRAME,
        is_multibyte=False,
        languages=("he",),
    ),
    EncodingInfo(
        name="cp500",
        aliases=("CP500",),
        era=EncodingEra.MAINFRAME,
        is_multibyte=False,
        languages=_WESTERN,
    ),
    EncodingInfo(
        name="cp875",
        aliases=("CP875",),
        era=EncodingEra.MAINFRAME,
        is_multibyte=False,
        languages=("el",),
    ),
    EncodingInfo(
        name="cp1026",
        aliases=("CP1026",),
        era=EncodingEra.MAINFRAME,
        is_multibyte=False,
        languages=("tr",),
    ),
    EncodingInfo(
        name="cp273",
        aliases=("CP273",),
        era=EncodingEra.MAINFRAME,
        is_multibyte=False,
        languages=("de",),
    ),
)
# Read-only view mapping each canonical encoding name to its EncodingInfo,
# in the order the entries are declared in _REGISTRY_ENTRIES.
REGISTRY: MappingProxyType[str, EncodingInfo] = MappingProxyType(
    {info.name: info for info in _REGISTRY_ENTRIES}
)
@functools.cache
def lookup_encoding(name: str) -> EncodingName | None:
    """Convert an encoding name string to the canonical EncodingName.

    Handles arbitrary casing, aliases, and Python codec names.  Results
    (including ``None``) are memoised per input string.

    The registry is consulted first (canonical names, then aliases, both
    compared case-insensitively).  If that fails, the name is normalised
    through Python's codec registry and the normalised name is looked up
    once more.

    :param name: Any encoding name string.
    :returns: The canonical :data:`EncodingName`, or ``None`` if unknown.
        Names that the codec machinery cannot even accept (embedded NUL
        characters, lone surrogates) are treated as unknown rather than
        raising.
    """
    lowered = name.lower()
    for entry in REGISTRY.values():
        if entry.name == lowered:
            return entry.name
        if any(alias.lower() == lowered for alias in entry.aliases):
            return entry.name
    # Fallback: resolve through Python's codec registry
    try:
        codec_name = codecs.lookup(name).name
    except (LookupError, ValueError):
        # LookupError: no such codec.
        # ValueError: the string cannot be handed to codecs.lookup at all
        # (embedded NUL -> ValueError, lone surrogate -> UnicodeEncodeError).
        return None
    # Only recurse when normalisation produced a different spelling;
    # otherwise the registry scan above has already rejected it.
    if codec_name != lowered:
        return lookup_encoding(codec_name)
    return None