Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/chardet/registry.py: 75%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""Encoding registry with metadata for all supported encodings."""
3from __future__ import annotations
5import codecs
6import dataclasses
7import functools
8from collections.abc import Iterable
9from types import MappingProxyType
10from typing import Literal
12from chardet.enums import EncodingEra
14EncodingName = Literal[
15 "ascii",
16 "big5hkscs",
17 "cp1006",
18 "cp1026",
19 "cp1125",
20 "cp1140",
21 "cp1250",
22 "cp1251",
23 "cp1252",
24 "cp1253",
25 "cp1254",
26 "cp1255",
27 "cp1256",
28 "cp1257",
29 "cp1258",
30 "cp273",
31 "cp424",
32 "cp437",
33 "cp500",
34 "cp720",
35 "cp737",
36 "cp775",
37 "cp850",
38 "cp852",
39 "cp855",
40 "cp856",
41 "cp857",
42 "cp858",
43 "cp860",
44 "cp861",
45 "cp862",
46 "cp863",
47 "cp864",
48 "cp865",
49 "cp866",
50 "cp869",
51 "cp874",
52 "cp875",
53 "cp932",
54 "cp949",
55 "euc_jis_2004",
56 "euc_kr",
57 "gb18030",
58 "hp-roman8",
59 "hz",
60 "iso2022_jp_2",
61 "iso2022_jp_2004",
62 "iso2022_jp_ext",
63 "iso2022_kr",
64 "iso8859-1",
65 "iso8859-10",
66 "iso8859-13",
67 "iso8859-14",
68 "iso8859-15",
69 "iso8859-16",
70 "iso8859-2",
71 "iso8859-3",
72 "iso8859-4",
73 "iso8859-5",
74 "iso8859-6",
75 "iso8859-7",
76 "iso8859-8",
77 "iso8859-9",
78 "johab",
79 "koi8-r",
80 "koi8-t",
81 "koi8-u",
82 "kz1048",
83 "mac-cyrillic",
84 "mac-greek",
85 "mac-iceland",
86 "mac-latin2",
87 "mac-roman",
88 "mac-turkish",
89 "ptcp154",
90 "shift_jis_2004",
91 "tis-620",
92 "utf-16",
93 "utf-16-be",
94 "utf-16-le",
95 "utf-32",
96 "utf-32-be",
97 "utf-32-le",
98 "utf-7",
99 "utf-8",
100 "utf-8-sig",
101]
# Shared language tuples — used by multiple EncodingInfo entries below.
# Each entry is a short lowercase language code (they look like ISO 639-1
# two-letter codes — TODO confirm against the consumers of ``languages``).
_WESTERN = (
    "br",
    "cy",
    "da",
    "de",
    "en",
    "es",
    "fi",
    "fr",
    "ga",
    "id",
    "is",
    "it",
    "ms",
    "nl",
    "no",
    "pt",
    "sv",
)
# _WESTERN plus Turkish.
_WESTERN_TR = (*_WESTERN, "tr")
_CYRILLIC = ("ru", "bg", "uk", "sr", "mk", "be")
_CENTRAL_EU = ("pl", "cs", "hu", "hr", "ro", "sk", "sl")
# _CENTRAL_EU without Romanian. Derived rather than repeated so the two
# tuples cannot drift apart; element order is preserved.
_CENTRAL_EU_NO_RO = tuple(lang for lang in _CENTRAL_EU if lang != "ro")
_BALTIC = ("et", "lt", "lv")
_ARABIC = ("ar", "fa")
131@dataclasses.dataclass(frozen=True, slots=True)
132class EncodingInfo:
133 """Metadata for a single encoding."""
135 name: EncodingName
136 aliases: tuple[str, ...]
137 era: EncodingEra
138 is_multibyte: bool
139 languages: tuple[str, ...]
@functools.lru_cache(maxsize=256)
def get_candidates(
    era: EncodingEra,
    include_encodings: frozenset[str] | None = None,
    exclude_encodings: frozenset[str] | None = None,
) -> tuple[EncodingInfo, ...]:
    """Return registry entries matching the given filters.

    Filters are applied in order: era, include, exclude.  Results are
    memoized, so every argument must be hashable (hence ``frozenset``
    rather than ``set``).  Entries are returned in registry order.

    :param era: Bit flags specifying which encoding eras to include.
    :param include_encodings: If not ``None``, only return encodings in this set.
    :param exclude_encodings: If not ``None``, exclude encodings in this set.
    :returns: A tuple of matching :class:`EncodingInfo` entries.
    """
    # One pass over the registry; the ``and`` chain short-circuits in the
    # documented era -> include -> exclude order.
    return tuple(
        enc
        for enc in REGISTRY.values()
        if enc.era & era
        and (include_encodings is None or enc.name in include_encodings)
        and (exclude_encodings is None or enc.name not in exclude_encodings)
    )
165# Era assignments match chardet 6.0.0's chardet/metadata/charsets.py
167_REGISTRY_ENTRIES = (
168 # === MODERN_WEB ===
169 EncodingInfo(
170 name="ascii",
171 aliases=("us-ascii",),
172 era=EncodingEra.MODERN_WEB,
173 is_multibyte=False,
174 languages=(),
175 ),
176 EncodingInfo(
177 name="utf-8",
178 aliases=(
179 "utf-8",
180 "utf8",
181 "csutf8",
182 "unicode-1-1-utf-8",
183 "unicode11utf8",
184 "unicode20utf8",
185 "x-unicode20utf8",
186 ),
187 era=EncodingEra.MODERN_WEB,
188 is_multibyte=False,
189 languages=(),
190 ),
191 EncodingInfo(
192 name="utf-8-sig",
193 aliases=("UTF-8-SIG", "utf-8-bom"),
194 era=EncodingEra.MODERN_WEB,
195 is_multibyte=False,
196 languages=(),
197 ),
198 EncodingInfo(
199 name="utf-16",
200 aliases=("UTF-16", "utf16", "csutf16"),
201 era=EncodingEra.MODERN_WEB,
202 is_multibyte=False,
203 languages=(),
204 ),
205 EncodingInfo(
206 name="utf-16-be",
207 aliases=("UTF-16-BE", "utf-16be", "csutf16be"),
208 era=EncodingEra.MODERN_WEB,
209 is_multibyte=False,
210 languages=(),
211 ),
212 EncodingInfo(
213 name="utf-16-le",
214 aliases=("UTF-16-LE", "utf-16le", "csutf16le"),
215 era=EncodingEra.MODERN_WEB,
216 is_multibyte=False,
217 languages=(),
218 ),
219 EncodingInfo(
220 name="utf-32",
221 aliases=("UTF-32", "utf32", "csutf32"),
222 era=EncodingEra.MODERN_WEB,
223 is_multibyte=False,
224 languages=(),
225 ),
226 EncodingInfo(
227 name="utf-32-be",
228 aliases=("UTF-32-BE", "utf-32be", "csutf32be"),
229 era=EncodingEra.MODERN_WEB,
230 is_multibyte=False,
231 languages=(),
232 ),
233 EncodingInfo(
234 name="utf-32-le",
235 aliases=("UTF-32-LE", "utf-32le", "csutf32le"),
236 era=EncodingEra.MODERN_WEB,
237 is_multibyte=False,
238 languages=(),
239 ),
240 EncodingInfo(
241 name="utf-7",
242 aliases=("UTF-7", "utf7", "csutf7"),
243 era=EncodingEra.LEGACY_REGIONAL,
244 is_multibyte=False,
245 languages=(),
246 ),
247 # CJK - Modern Web
248 EncodingInfo(
249 name="big5hkscs",
250 aliases=(
251 "Big5-HKSCS",
252 "Big5HKSCS",
253 "big5",
254 "big5-tw",
255 "csbig5",
256 "cp950",
257 "cn-big5",
258 "x-x-big5",
259 "csbig5hkscs",
260 ),
261 era=EncodingEra.MODERN_WEB,
262 is_multibyte=True,
263 languages=("zh",),
264 ),
265 EncodingInfo(
266 name="cp932",
267 aliases=(
268 "CP932",
269 "ms932",
270 "mskanji",
271 "ms-kanji",
272 "cswindows31j",
273 "windows-31j",
274 ),
275 era=EncodingEra.MODERN_WEB,
276 is_multibyte=True,
277 languages=("ja",),
278 ),
279 EncodingInfo(
280 # Note: "korean" is NOT an alias here. Python's codec table
281 # already resolves "korean" to ``euc_kr``, and WHATWG's primary
282 # name for the group is EUC-KR, so letting the default fall
283 # through is more spec-aligned than routing to cp949.
284 name="cp949",
285 aliases=(
286 "CP949",
287 "ms949",
288 "uhc",
289 "windows-949",
290 "csksc56011987",
291 "iso-ir-149",
292 "ks_c_5601-1987",
293 "ks_c_5601-1989",
294 "ksc5601",
295 "ksc_5601",
296 ),
297 era=EncodingEra.MODERN_WEB,
298 is_multibyte=True,
299 languages=("ko",),
300 ),
301 EncodingInfo(
302 name="euc_jis_2004",
303 aliases=(
304 "EUC-JIS-2004",
305 "euc-jp",
306 "eucjp",
307 "ujis",
308 "u-jis",
309 "euc-jisx0213",
310 "cseucpkdfmtjapanese",
311 "x-euc-jp",
312 ),
313 era=EncodingEra.MODERN_WEB,
314 is_multibyte=True,
315 languages=("ja",),
316 ),
317 EncodingInfo(
318 name="euc_kr",
319 aliases=("EUC-KR", "euckr", "cseuckr"),
320 era=EncodingEra.MODERN_WEB,
321 is_multibyte=True,
322 languages=("ko",),
323 ),
324 EncodingInfo(
325 # Note: "chinese" is NOT listed here because the label is
326 # semantically ambiguous between Traditional (Big5) and Simplified
327 # (GB18030) Chinese. Python's codec table still resolves
328 # "chinese" -> "gb2312" -> (via the gb2312 alias below) gb18030,
329 # so the label continues to work for Simplified content via the
330 # codec fallback in lookup_encoding(). We simply decline to
331 # bless the ambiguity in our own table.
332 name="gb18030",
333 aliases=(
334 "GB18030",
335 "gb-18030",
336 "gb2312",
337 "gbk",
338 "csgb2312",
339 "gb_2312",
340 "gb_2312-80",
341 "x-gbk",
342 "csiso58gb231280",
343 "iso-ir-58",
344 "csgb18030",
345 "csgbk",
346 "cp936",
347 "ms936",
348 "windows-936",
349 ),
350 era=EncodingEra.MODERN_WEB,
351 is_multibyte=True,
352 languages=("zh",),
353 ),
354 EncodingInfo(
355 name="hz",
356 aliases=("HZ-GB-2312", "hz"),
357 era=EncodingEra.LEGACY_REGIONAL,
358 is_multibyte=True,
359 languages=("zh",),
360 ),
361 EncodingInfo(
362 name="iso2022_jp_2",
363 aliases=(
364 "ISO-2022-JP-2",
365 "iso-2022-jp",
366 "csiso2022jp",
367 "iso2022-jp-1",
368 "csiso2022jp2",
369 ),
370 era=EncodingEra.MODERN_WEB,
371 is_multibyte=True,
372 languages=("ja",),
373 ),
374 EncodingInfo(
375 name="iso2022_jp_2004",
376 aliases=("ISO-2022-JP-2004", "iso2022-jp-3"),
377 era=EncodingEra.MODERN_WEB,
378 is_multibyte=True,
379 languages=("ja",),
380 ),
381 EncodingInfo(
382 name="iso2022_jp_ext",
383 aliases=("ISO-2022-JP-EXT",),
384 era=EncodingEra.MODERN_WEB,
385 is_multibyte=True,
386 languages=("ja",),
387 ),
388 EncodingInfo(
389 name="iso2022_kr",
390 aliases=("ISO-2022-KR", "csiso2022kr"),
391 era=EncodingEra.LEGACY_REGIONAL,
392 is_multibyte=True,
393 languages=("ko",),
394 ),
395 EncodingInfo(
396 name="shift_jis_2004",
397 aliases=(
398 "Shift-JIS-2004",
399 "Shift_JIS_2004",
400 "shift_jis",
401 "sjis",
402 "shiftjis",
403 "s_jis",
404 "shift-jisx0213",
405 "x-sjis",
406 "csshiftjis",
407 "ms_kanji",
408 ),
409 era=EncodingEra.MODERN_WEB,
410 is_multibyte=True,
411 languages=("ja",),
412 ),
413 # Windows code pages - Modern Web
414 EncodingInfo(
415 name="cp874",
416 aliases=("CP874", "windows-874", "dos-874"),
417 era=EncodingEra.MODERN_WEB,
418 is_multibyte=False,
419 languages=("th",),
420 ),
421 EncodingInfo(
422 name="cp1250",
423 aliases=("Windows-1250", "cp1250", "x-cp1250", "cswindows1250"),
424 era=EncodingEra.MODERN_WEB,
425 is_multibyte=False,
426 languages=(*_CENTRAL_EU, "sr"),
427 ),
428 EncodingInfo(
429 name="cp1251",
430 aliases=("Windows-1251", "cp1251", "x-cp1251", "cswindows1251"),
431 era=EncodingEra.MODERN_WEB,
432 is_multibyte=False,
433 languages=_CYRILLIC,
434 ),
435 EncodingInfo(
436 name="cp1252",
437 aliases=("Windows-1252", "cp1252", "x-cp1252", "cswindows1252"),
438 era=EncodingEra.MODERN_WEB,
439 is_multibyte=False,
440 languages=_WESTERN,
441 ),
442 EncodingInfo(
443 name="cp1253",
444 aliases=("Windows-1253", "cp1253", "x-cp1253", "cswindows1253"),
445 era=EncodingEra.MODERN_WEB,
446 is_multibyte=False,
447 languages=("el",),
448 ),
449 EncodingInfo(
450 name="cp1254",
451 aliases=("Windows-1254", "cp1254", "x-cp1254", "cswindows1254"),
452 era=EncodingEra.MODERN_WEB,
453 is_multibyte=False,
454 languages=("tr",),
455 ),
456 EncodingInfo(
457 name="cp1255",
458 aliases=("Windows-1255", "cp1255", "x-cp1255", "cswindows1255"),
459 era=EncodingEra.MODERN_WEB,
460 is_multibyte=False,
461 languages=("he",),
462 ),
463 EncodingInfo(
464 name="cp1256",
465 aliases=("Windows-1256", "cp1256", "x-cp1256", "cswindows1256"),
466 era=EncodingEra.MODERN_WEB,
467 is_multibyte=False,
468 languages=_ARABIC,
469 ),
470 EncodingInfo(
471 name="cp1257",
472 aliases=("Windows-1257", "cp1257", "x-cp1257", "cswindows1257"),
473 era=EncodingEra.MODERN_WEB,
474 is_multibyte=False,
475 languages=_BALTIC,
476 ),
477 EncodingInfo(
478 name="cp1258",
479 aliases=("Windows-1258", "cp1258", "x-cp1258", "cswindows1258"),
480 era=EncodingEra.MODERN_WEB,
481 is_multibyte=False,
482 languages=("vi",),
483 ),
484 # KOI8 - Modern Web
485 EncodingInfo(
486 name="koi8-r",
487 aliases=("KOI8-R", "koi8r", "koi", "koi8", "cskoi8r"),
488 era=EncodingEra.MODERN_WEB,
489 is_multibyte=False,
490 languages=("ru",),
491 ),
492 EncodingInfo(
493 name="koi8-u",
494 aliases=("KOI8-U", "koi8u", "koi8-ru", "cskoi8u"),
495 era=EncodingEra.MODERN_WEB,
496 is_multibyte=False,
497 languages=("uk",),
498 ),
499 # TIS-620 - Modern Web
500 EncodingInfo(
501 name="tis-620",
502 aliases=(
503 "TIS-620",
504 "tis620",
505 "iso-8859-11",
506 "iso8859-11",
507 "iso885911",
508 "cstis620",
509 ),
510 era=EncodingEra.MODERN_WEB,
511 is_multibyte=False,
512 languages=("th",),
513 ),
514 # === LEGACY_ISO ===
515 EncodingInfo(
516 name="iso8859-1",
517 aliases=("ISO-8859-1", "latin-1", "latin1", "iso8859-1", "iso88591"),
518 era=EncodingEra.LEGACY_ISO,
519 is_multibyte=False,
520 languages=_WESTERN,
521 ),
522 EncodingInfo(
523 name="iso8859-2",
524 aliases=("ISO-8859-2", "latin-2", "latin2", "iso8859-2", "iso88592"),
525 era=EncodingEra.LEGACY_ISO,
526 is_multibyte=False,
527 languages=_CENTRAL_EU,
528 ),
529 EncodingInfo(
530 name="iso8859-3",
531 aliases=("ISO-8859-3", "latin-3", "latin3", "iso8859-3", "iso88593"),
532 era=EncodingEra.LEGACY_ISO,
533 is_multibyte=False,
534 languages=("eo", "mt", "tr"),
535 ),
536 EncodingInfo(
537 name="iso8859-4",
538 aliases=("ISO-8859-4", "latin-4", "latin4", "iso8859-4", "iso88594"),
539 era=EncodingEra.LEGACY_ISO,
540 is_multibyte=False,
541 languages=_BALTIC,
542 ),
543 EncodingInfo(
544 name="iso8859-5",
545 aliases=("ISO-8859-5", "iso8859-5", "cyrillic", "iso88595"),
546 era=EncodingEra.LEGACY_ISO,
547 is_multibyte=False,
548 languages=_CYRILLIC,
549 ),
550 EncodingInfo(
551 # The -E ("explicit directionality") and -I ("implicit
552 # directionality") variants listed by IANA and WHATWG are
553 # higher-level bidi-ordering hints, not separate codecs -- Python
554 # stdlib has no distinct decoder for them, so all four resolve to
555 # the same ``iso8859-6`` canonical here.
556 name="iso8859-6",
557 aliases=(
558 "ISO-8859-6",
559 "iso8859-6",
560 "arabic",
561 "iso88596",
562 "iso-8859-6-e",
563 "iso-8859-6-i",
564 "csiso88596e",
565 "csiso88596i",
566 ),
567 era=EncodingEra.LEGACY_ISO,
568 is_multibyte=False,
569 languages=_ARABIC,
570 ),
571 EncodingInfo(
572 name="iso8859-7",
573 aliases=(
574 "ISO-8859-7",
575 "iso8859-7",
576 "greek",
577 "iso88597",
578 "sun_eu_greek",
579 ),
580 era=EncodingEra.LEGACY_ISO,
581 is_multibyte=False,
582 languages=("el",),
583 ),
584 EncodingInfo(
585 # WHATWG and IANA distinguish ISO-8859-8 ("visual", -E) from
586 # ISO-8859-8-I ("logical", -I) to signal Hebrew bidi ordering, but
587 # Python's stdlib has a single ``iso8859-8`` codec -- the bidi
588 # distinction is a higher-layer concern. All variants collapse
589 # onto this one canonical.
590 name="iso8859-8",
591 aliases=(
592 "ISO-8859-8",
593 "iso8859-8",
594 "hebrew",
595 "iso88598",
596 "iso-8859-8-e",
597 "iso-8859-8-i",
598 "csiso88598e",
599 "csiso88598i",
600 "visual",
601 "logical",
602 ),
603 era=EncodingEra.LEGACY_ISO,
604 is_multibyte=False,
605 languages=("he",),
606 ),
607 EncodingInfo(
608 name="iso8859-9",
609 aliases=("ISO-8859-9", "latin-5", "latin5", "iso8859-9", "iso88599"),
610 era=EncodingEra.LEGACY_ISO,
611 is_multibyte=False,
612 languages=("tr",),
613 ),
614 EncodingInfo(
615 name="iso8859-10",
616 aliases=("ISO-8859-10", "latin-6", "latin6", "iso8859-10", "iso885910"),
617 era=EncodingEra.LEGACY_ISO,
618 is_multibyte=False,
619 languages=("is", "fi"),
620 ),
621 EncodingInfo(
622 name="iso8859-13",
623 aliases=(
624 "ISO-8859-13",
625 "latin-7",
626 "latin7",
627 "iso8859-13",
628 "iso885913",
629 "csiso885913",
630 ),
631 era=EncodingEra.LEGACY_ISO,
632 is_multibyte=False,
633 languages=_BALTIC,
634 ),
635 EncodingInfo(
636 # ``iso-celtic`` is a Python stdlib alias, not a WHATWG/IANA name.
637 name="iso8859-14",
638 aliases=(
639 "ISO-8859-14",
640 "latin-8",
641 "latin8",
642 "iso8859-14",
643 "iso885914",
644 "csiso885914",
645 "iso-ir-199",
646 "iso-celtic",
647 "l8",
648 ),
649 era=EncodingEra.LEGACY_ISO,
650 is_multibyte=False,
651 languages=("cy", "ga", "br", "gd"),
652 ),
653 EncodingInfo(
654 name="iso8859-15",
655 aliases=(
656 "ISO-8859-15",
657 "latin-9",
658 "latin9",
659 "iso8859-15",
660 "iso885915",
661 "csisolatin9",
662 "csiso885915",
663 "l9",
664 ),
665 era=EncodingEra.LEGACY_ISO,
666 is_multibyte=False,
667 languages=_WESTERN,
668 ),
669 EncodingInfo(
670 name="iso8859-16",
671 aliases=(
672 "ISO-8859-16",
673 "latin-10",
674 "latin10",
675 "iso8859-16",
676 "iso885916",
677 "csiso885916",
678 "iso-ir-226",
679 "l10",
680 ),
681 era=EncodingEra.LEGACY_ISO,
682 is_multibyte=False,
683 languages=("ro", "pl", "hr", "hu", "sk", "sl"),
684 ),
685 # Johab - Legacy ISO per chardet 6.0.0
686 EncodingInfo(
687 name="johab",
688 aliases=("Johab",),
689 era=EncodingEra.LEGACY_ISO,
690 is_multibyte=True,
691 languages=("ko",),
692 ),
693 # === LEGACY_MAC ===
694 EncodingInfo(
695 name="mac-cyrillic",
696 aliases=(
697 "Mac-Cyrillic",
698 "MacCyrillic",
699 "maccyrillic",
700 "x-mac-cyrillic",
701 "x-mac-ukrainian",
702 ),
703 era=EncodingEra.LEGACY_MAC,
704 is_multibyte=False,
705 languages=_CYRILLIC,
706 ),
707 EncodingInfo(
708 name="mac-greek",
709 aliases=("Mac-Greek", "MacGreek", "macgreek"),
710 era=EncodingEra.LEGACY_MAC,
711 is_multibyte=False,
712 languages=("el",),
713 ),
714 EncodingInfo(
715 name="mac-iceland",
716 aliases=("Mac-Iceland", "MacIceland", "maciceland"),
717 era=EncodingEra.LEGACY_MAC,
718 is_multibyte=False,
719 languages=("is",),
720 ),
721 EncodingInfo(
722 name="mac-latin2",
723 aliases=("Mac-Latin2", "MacLatin2", "maclatin2", "maccentraleurope"),
724 era=EncodingEra.LEGACY_MAC,
725 is_multibyte=False,
726 languages=_CENTRAL_EU_NO_RO,
727 ),
728 EncodingInfo(
729 name="mac-roman",
730 aliases=(
731 "Mac-Roman",
732 "MacRoman",
733 "macroman",
734 "macintosh",
735 "csmacintosh",
736 "mac",
737 "x-mac-roman",
738 ),
739 era=EncodingEra.LEGACY_MAC,
740 is_multibyte=False,
741 languages=_WESTERN,
742 ),
743 EncodingInfo(
744 name="mac-turkish",
745 aliases=("Mac-Turkish", "MacTurkish", "macturkish"),
746 era=EncodingEra.LEGACY_MAC,
747 is_multibyte=False,
748 languages=("tr",),
749 ),
750 # === LEGACY_REGIONAL ===
751 EncodingInfo(
752 name="cp720",
753 aliases=("CP720",),
754 era=EncodingEra.LEGACY_REGIONAL,
755 is_multibyte=False,
756 languages=_ARABIC,
757 ),
758 EncodingInfo(
759 name="cp1006",
760 aliases=("CP1006",),
761 era=EncodingEra.LEGACY_REGIONAL,
762 is_multibyte=False,
763 languages=("ur",),
764 ),
765 EncodingInfo(
766 name="cp1125",
767 aliases=("CP1125",),
768 era=EncodingEra.LEGACY_REGIONAL,
769 is_multibyte=False,
770 languages=("uk",),
771 ),
772 EncodingInfo(
773 name="koi8-t",
774 aliases=("KOI8-T",),
775 era=EncodingEra.LEGACY_REGIONAL,
776 is_multibyte=False,
777 languages=("tg",),
778 ),
779 EncodingInfo(
780 name="kz1048",
781 aliases=("KZ-1048", "kz1048", "strk1048-2002", "rk1048"),
782 era=EncodingEra.LEGACY_REGIONAL,
783 is_multibyte=False,
784 languages=("kk",),
785 ),
786 EncodingInfo(
787 name="ptcp154",
788 aliases=("PTCP154", "pt154", "cp154"),
789 era=EncodingEra.LEGACY_REGIONAL,
790 is_multibyte=False,
791 languages=("kk",),
792 ),
793 EncodingInfo(
794 name="hp-roman8",
795 aliases=("HP-Roman8", "roman8", "r8", "csHPRoman8"),
796 era=EncodingEra.LEGACY_REGIONAL,
797 is_multibyte=False,
798 languages=_WESTERN,
799 ),
800 # === DOS ===
801 EncodingInfo(
802 name="cp437",
803 aliases=("CP437",),
804 era=EncodingEra.DOS,
805 is_multibyte=False,
806 languages=("en", "fr", "de", "es", "pt", "it", "nl", "da", "sv", "fi", "ga"),
807 ),
808 EncodingInfo(
809 name="cp737",
810 aliases=("CP737",),
811 era=EncodingEra.DOS,
812 is_multibyte=False,
813 languages=("el",),
814 ),
815 EncodingInfo(
816 name="cp775",
817 aliases=("CP775",),
818 era=EncodingEra.DOS,
819 is_multibyte=False,
820 languages=_BALTIC,
821 ),
822 EncodingInfo(
823 name="cp850",
824 aliases=("CP850",),
825 era=EncodingEra.DOS,
826 is_multibyte=False,
827 languages=_WESTERN,
828 ),
829 EncodingInfo(
830 name="cp852",
831 aliases=("CP852",),
832 era=EncodingEra.DOS,
833 is_multibyte=False,
834 languages=_CENTRAL_EU,
835 ),
836 EncodingInfo(
837 name="cp855",
838 aliases=("CP855",),
839 era=EncodingEra.DOS,
840 is_multibyte=False,
841 languages=_CYRILLIC,
842 ),
843 EncodingInfo(
844 name="cp856",
845 aliases=("CP856",),
846 era=EncodingEra.DOS,
847 is_multibyte=False,
848 languages=("he",),
849 ),
850 EncodingInfo(
851 name="cp857",
852 aliases=("CP857",),
853 era=EncodingEra.DOS,
854 is_multibyte=False,
855 languages=("tr",),
856 ),
857 EncodingInfo(
858 name="cp858",
859 aliases=("CP858",),
860 era=EncodingEra.DOS,
861 is_multibyte=False,
862 languages=_WESTERN,
863 ),
864 EncodingInfo(
865 name="cp860",
866 aliases=("CP860",),
867 era=EncodingEra.DOS,
868 is_multibyte=False,
869 languages=("pt",),
870 ),
871 EncodingInfo(
872 name="cp861",
873 aliases=("CP861",),
874 era=EncodingEra.DOS,
875 is_multibyte=False,
876 languages=("is",),
877 ),
878 EncodingInfo(
879 name="cp862",
880 aliases=("CP862",),
881 era=EncodingEra.DOS,
882 is_multibyte=False,
883 languages=("he",),
884 ),
885 EncodingInfo(
886 name="cp863",
887 aliases=("CP863",),
888 era=EncodingEra.DOS,
889 is_multibyte=False,
890 languages=("fr",),
891 ),
892 EncodingInfo(
893 name="cp864",
894 aliases=("CP864",),
895 era=EncodingEra.DOS,
896 is_multibyte=False,
897 languages=("ar",),
898 ),
899 EncodingInfo(
900 name="cp865",
901 aliases=("CP865",),
902 era=EncodingEra.DOS,
903 is_multibyte=False,
904 languages=("da", "no"),
905 ),
906 EncodingInfo(
907 name="cp866",
908 aliases=("CP866",),
909 era=EncodingEra.DOS,
910 is_multibyte=False,
911 languages=_CYRILLIC,
912 ),
913 EncodingInfo(
914 name="cp869",
915 aliases=("CP869",),
916 era=EncodingEra.DOS,
917 is_multibyte=False,
918 languages=("el",),
919 ),
920 # === MAINFRAME ===
921 EncodingInfo(
922 name="cp1140",
923 aliases=(
924 "CP1140",
925 "cp037",
926 "cp01140",
927 "ibm01140",
928 "ibm1140",
929 "csibm01140",
930 ),
931 era=EncodingEra.MAINFRAME,
932 is_multibyte=False,
933 languages=_WESTERN_TR,
934 ),
935 EncodingInfo(
936 name="cp424",
937 aliases=("CP424",),
938 era=EncodingEra.MAINFRAME,
939 is_multibyte=False,
940 languages=("he",),
941 ),
942 EncodingInfo(
943 name="cp500",
944 aliases=("CP500",),
945 era=EncodingEra.MAINFRAME,
946 is_multibyte=False,
947 languages=_WESTERN,
948 ),
949 EncodingInfo(
950 name="cp875",
951 aliases=("CP875",),
952 era=EncodingEra.MAINFRAME,
953 is_multibyte=False,
954 languages=("el",),
955 ),
956 EncodingInfo(
957 name="cp1026",
958 aliases=("CP1026",),
959 era=EncodingEra.MAINFRAME,
960 is_multibyte=False,
961 languages=("tr",),
962 ),
963 EncodingInfo(
964 name="cp273",
965 aliases=("CP273",),
966 era=EncodingEra.MAINFRAME,
967 is_multibyte=False,
968 languages=("de",),
969 ),
970)
# Read-only mapping from canonical encoding name to its EncodingInfo.
# Iteration order follows _REGISTRY_ENTRIES; the MappingProxyType wrapper
# prevents callers from mutating the table.
REGISTRY: MappingProxyType[str, EncodingInfo] = MappingProxyType(
    {entry.name: entry for entry in _REGISTRY_ENTRIES}
)
@functools.cache
def _alias_index() -> dict[str, EncodingName]:
    """Build, once, a map from canonical name / lowercase alias to canonical name."""
    index: dict[str, EncodingName] = {}
    for entry in REGISTRY.values():
        # setdefault keeps the first registration for a key, which is the
        # same winner a first-match scan in registry order would pick.
        index.setdefault(entry.name, entry.name)
        for alias in entry.aliases:
            index.setdefault(alias.lower(), entry.name)
    return index


# Bounded cache: names come from callers, so the key space is open-ended.
@functools.lru_cache(maxsize=1024)
def lookup_encoding(name: str) -> EncodingName | None:
    """Convert an encoding name string to the canonical EncodingName.

    Handles arbitrary casing, aliases, and Python codec names.  The
    registry's own names and aliases are consulted first; if none match,
    the name is resolved through Python's codec registry and the codec's
    reported name is looked up in turn.

    :param name: Any encoding name string.
    :returns: The canonical :data:`EncodingName`, or ``None`` if unknown.
    """
    lowered = name.lower()
    canonical = _alias_index().get(lowered)
    if canonical is not None:
        return canonical
    # Fallback: resolve through Python's codec registry
    try:
        codec_name = codecs.lookup(name).name
    except (LookupError, ValueError):
        return None
    if codec_name == lowered:
        # The codec layer offers no other spelling to try.
        return None
    return lookup_encoding(codec_name)
def _validate_encoding(name: str, param_name: str) -> str:
    """Validate and normalize a single encoding name.

    :param name: The encoding name to validate.
    :param param_name: Parameter name for error messages.
    :returns: The canonical encoding name as resolved by
        :func:`lookup_encoding`.
    :raises ValueError: If the encoding name is unknown.
    """
    canonical = lookup_encoding(name)
    if canonical is None:
        # Echo the caller's original spelling so the offending value is
        # easy to find.
        msg = f"Unknown encoding {name!r} in {param_name}"
        raise ValueError(msg)
    return canonical
def normalize_encodings(
    encodings: Iterable[str] | None,
    param_name: str,
) -> frozenset[str] | None:
    """Normalize an iterable of encoding names to canonical forms.

    A bare string is treated as a single encoding name rather than being
    iterated character by character.

    :param encodings: Encoding names to normalize, or ``None``.
    :param param_name: Parameter name for error messages.
    :returns: A frozenset of canonical encoding names, or ``None`` when
        *encodings* is ``None``.
    :raises ValueError: If any encoding name is unknown, or if *encodings*
        yields no names at all.
    """
    if encodings is None:
        return None
    if isinstance(encodings, str):
        # ``str`` satisfies Iterable[str]; without this guard "utf-8" would
        # be validated as "u", "t", "f", ... and fail with a misleading
        # "Unknown encoding 'u'" message.
        encodings = (encodings,)
    result = frozenset(_validate_encoding(name, param_name) for name in encodings)
    if not result:
        msg = f"{param_name} must not be empty; omit the argument or pass None to disable filtering"
        raise ValueError(msg)
    return result