Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/chardet/registry.py: 81%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""Encoding registry with metadata for all supported encodings."""
3from __future__ import annotations
5import codecs
6import dataclasses
7import functools
8from collections.abc import Iterable
9from types import MappingProxyType
10from typing import Literal
12from chardet.enums import EncodingEra
# Closed set of canonical encoding names, listed in lexicographic order.
# ``EncodingInfo.name`` and the return value of ``lookup_encoding`` are
# annotated with this type, so a static type checker can flag names that are
# not part of the registry.
EncodingName = Literal[
    "ascii",
    "big5hkscs",
    "cp1006",
    "cp1026",
    "cp1125",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp273",
    "cp424",
    "cp437",
    "cp500",
    "cp720",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp858",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "euc_jis_2004",
    "euc_kr",
    "gb18030",
    "hp-roman8",
    "hz",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859-1",
    "iso8859-10",
    "iso8859-13",
    "iso8859-14",
    "iso8859-15",
    "iso8859-16",
    "iso8859-2",
    "iso8859-3",
    "iso8859-4",
    "iso8859-5",
    "iso8859-6",
    "iso8859-7",
    "iso8859-8",
    "iso8859-9",
    "johab",
    "koi8-r",
    "koi8-t",
    "koi8-u",
    "kz1048",
    "mac-cyrillic",
    "mac-greek",
    "mac-iceland",
    "mac-latin2",
    "mac-roman",
    "mac-turkish",
    "ptcp154",
    "shift_jis_2004",
    "tis-620",
    "utf-16",
    "utf-16-be",
    "utf-16-le",
    "utf-32",
    "utf-32-be",
    "utf-32-le",
    "utf-7",
    "utf-8",
    "utf-8-sig",
]
# Language-code groups that several EncodingInfo entries below have in common.
# Defining them once keeps the per-encoding ``languages`` fields consistent.
_WESTERN = (
    "br", "cy", "da", "de", "en", "es", "fi", "fr", "ga",
    "id", "is", "it", "ms", "nl", "no", "pt", "sv",
)  # fmt: skip
# Western group plus Turkish.
_WESTERN_TR = _WESTERN + ("tr",)
_CYRILLIC = ("ru", "bg", "uk", "sr", "mk", "be")
_CENTRAL_EU = ("pl", "cs", "hu", "hr", "ro", "sk", "sl")
# Same as _CENTRAL_EU but without Romanian ("ro").
_CENTRAL_EU_NO_RO = ("pl", "cs", "hu", "hr", "sk", "sl")
_BALTIC = ("et", "lt", "lv")
_ARABIC = ("ar", "fa")
@dataclasses.dataclass(frozen=True, slots=True)
class EncodingInfo:
    """Metadata for a single encoding.

    Instances are immutable (``frozen=True``) and slot-based (``slots=True``),
    so they are hashable and can be shared freely between callers.
    """

    # Canonical encoding name; also the key under which the entry is stored
    # in ``REGISTRY``.
    name: EncodingName
    # Alternative spellings; ``lookup_encoding`` compares them
    # case-insensitively against the requested name.
    aliases: tuple[str, ...]
    # Era flag; ``get_candidates`` keeps an entry when ``era & requested`` is
    # truthy.
    era: EncodingEra
    # NOTE(review): every UTF-* entry in the registry sets this to False and
    # only the CJK encodings set it to True, so this presumably marks legacy
    # multi-byte encodings rather than "more than one byte per character" --
    # confirm against the consumers of this flag.
    is_multibyte: bool
    # Language codes associated with the encoding (two-letter codes such as
    # "ru" or "ja" in the registry); empty for language-neutral encodings.
    languages: tuple[str, ...]
@functools.lru_cache(maxsize=256)
def get_candidates(
    era: EncodingEra,
    include_encodings: frozenset[str] | None = None,
    exclude_encodings: frozenset[str] | None = None,
) -> tuple[EncodingInfo, ...]:
    """Select the registry entries that pass the era/include/exclude filters.

    An entry is kept when its era overlaps ``era`` and, if the respective
    filter is given, its name is in ``include_encodings`` and not in
    ``exclude_encodings``.  Results keep registry order and are memoised, which
    is why the filter sets must be hashable (``frozenset``).

    :param era: Bit flags specifying which encoding eras to include.
    :param include_encodings: If not ``None``, only return encodings in this set.
    :param exclude_encodings: If not ``None``, exclude encodings in this set.
    :returns: A tuple of matching :class:`EncodingInfo` entries.
    """
    selected = []
    for info in REGISTRY.values():
        # Era filter: the entry must share at least one flag with the request.
        if not (info.era & era):
            continue
        # Allow-list filter (skipped entirely when no allow-list was given).
        if include_encodings is not None and info.name not in include_encodings:
            continue
        # Deny-list filter (skipped entirely when no deny-list was given).
        if exclude_encodings is not None and info.name in exclude_encodings:
            continue
        selected.append(info)
    return tuple(selected)
# Era assignments match chardet 6.0.0's chardet/metadata/charsets.py
#
# Master list of supported encodings.  ``REGISTRY`` is built from this tuple
# keyed by ``name``, so registry iteration order (and therefore the order in
# which ``lookup_encoding`` tries names/aliases) follows the order here.
# Section comments group entries by family; where an entry's ``era`` differs
# from its section heading, a NOTE says so.

_REGISTRY_ENTRIES = (
    # === MODERN_WEB ===
    EncodingInfo(
        name="ascii",
        aliases=("us-ascii",),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=False,
        languages=(),
    ),
    EncodingInfo(
        name="utf-8",
        aliases=("utf-8", "utf8"),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=False,
        languages=(),
    ),
    EncodingInfo(
        name="utf-8-sig",
        aliases=("UTF-8-SIG", "utf-8-bom"),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=False,
        languages=(),
    ),
    EncodingInfo(
        name="utf-16",
        aliases=("UTF-16", "utf16"),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=False,
        languages=(),
    ),
    EncodingInfo(
        name="utf-16-be",
        aliases=("UTF-16-BE", "utf-16be"),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=False,
        languages=(),
    ),
    EncodingInfo(
        name="utf-16-le",
        aliases=("UTF-16-LE", "utf-16le"),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=False,
        languages=(),
    ),
    EncodingInfo(
        name="utf-32",
        aliases=("UTF-32", "utf32"),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=False,
        languages=(),
    ),
    EncodingInfo(
        name="utf-32-be",
        aliases=("UTF-32-BE", "utf-32be"),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=False,
        languages=(),
    ),
    EncodingInfo(
        name="utf-32-le",
        aliases=("UTF-32-LE", "utf-32le"),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=False,
        languages=(),
    ),
    # NOTE: utf-7 is listed with the other Unicode encodings, but its era is
    # LEGACY_REGIONAL rather than MODERN_WEB.
    EncodingInfo(
        name="utf-7",
        aliases=("UTF-7", "utf7"),
        era=EncodingEra.LEGACY_REGIONAL,
        is_multibyte=False,
        languages=(),
    ),
    # CJK - Modern Web
    EncodingInfo(
        name="big5hkscs",
        aliases=("Big5-HKSCS", "Big5HKSCS", "big5", "big5-tw", "csbig5", "cp950"),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=True,
        languages=("zh",),
    ),
    EncodingInfo(
        name="cp932",
        aliases=("CP932", "ms932", "mskanji", "ms-kanji"),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=True,
        languages=("ja",),
    ),
    EncodingInfo(
        name="cp949",
        aliases=("CP949", "ms949", "uhc"),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=True,
        languages=("ko",),
    ),
    EncodingInfo(
        name="euc_jis_2004",
        aliases=("EUC-JIS-2004", "euc-jp", "eucjp", "ujis", "u-jis", "euc-jisx0213"),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=True,
        languages=("ja",),
    ),
    EncodingInfo(
        name="euc_kr",
        aliases=("EUC-KR", "euckr"),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=True,
        languages=("ko",),
    ),
    EncodingInfo(
        name="gb18030",
        aliases=("GB18030", "gb-18030", "gb2312", "gbk"),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=True,
        languages=("zh",),
    ),
    # NOTE: hz is grouped with the CJK encodings, but its era is
    # LEGACY_REGIONAL rather than MODERN_WEB.
    EncodingInfo(
        name="hz",
        aliases=("HZ-GB-2312", "hz"),
        era=EncodingEra.LEGACY_REGIONAL,
        is_multibyte=True,
        languages=("zh",),
    ),
    EncodingInfo(
        name="iso2022_jp_2",
        aliases=("ISO-2022-JP-2", "iso-2022-jp", "csiso2022jp", "iso2022-jp-1"),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=True,
        languages=("ja",),
    ),
    EncodingInfo(
        name="iso2022_jp_2004",
        aliases=("ISO-2022-JP-2004", "iso2022-jp-3"),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=True,
        languages=("ja",),
    ),
    EncodingInfo(
        name="iso2022_jp_ext",
        aliases=("ISO-2022-JP-EXT",),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=True,
        languages=("ja",),
    ),
    # NOTE: iso2022_kr is grouped with the CJK encodings, but its era is
    # LEGACY_REGIONAL rather than MODERN_WEB.
    EncodingInfo(
        name="iso2022_kr",
        aliases=("ISO-2022-KR", "csiso2022kr"),
        era=EncodingEra.LEGACY_REGIONAL,
        is_multibyte=True,
        languages=("ko",),
    ),
    EncodingInfo(
        name="shift_jis_2004",
        aliases=(
            "Shift-JIS-2004",
            "Shift_JIS_2004",
            "shift_jis",
            "sjis",
            "shiftjis",
            "s_jis",
            "shift-jisx0213",
        ),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=True,
        languages=("ja",),
    ),
    # Windows code pages - Modern Web
    EncodingInfo(
        name="cp874",
        aliases=("CP874", "windows-874"),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=False,
        languages=("th",),
    ),
    EncodingInfo(
        name="cp1250",
        aliases=("Windows-1250", "cp1250"),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=False,
        languages=(*_CENTRAL_EU, "sr"),
    ),
    EncodingInfo(
        name="cp1251",
        aliases=("Windows-1251", "cp1251"),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=False,
        languages=_CYRILLIC,
    ),
    EncodingInfo(
        name="cp1252",
        aliases=("Windows-1252", "cp1252"),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=False,
        languages=_WESTERN,
    ),
    EncodingInfo(
        name="cp1253",
        aliases=("Windows-1253", "cp1253"),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=False,
        languages=("el",),
    ),
    EncodingInfo(
        name="cp1254",
        aliases=("Windows-1254", "cp1254"),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=False,
        languages=("tr",),
    ),
    EncodingInfo(
        name="cp1255",
        aliases=("Windows-1255", "cp1255"),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=False,
        languages=("he",),
    ),
    EncodingInfo(
        name="cp1256",
        aliases=("Windows-1256", "cp1256"),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=False,
        languages=_ARABIC,
    ),
    EncodingInfo(
        name="cp1257",
        aliases=("Windows-1257", "cp1257"),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=False,
        languages=_BALTIC,
    ),
    EncodingInfo(
        name="cp1258",
        aliases=("Windows-1258", "cp1258"),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=False,
        languages=("vi",),
    ),
    # KOI8 - Modern Web
    EncodingInfo(
        name="koi8-r",
        aliases=("KOI8-R", "koi8r"),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=False,
        languages=("ru",),
    ),
    EncodingInfo(
        name="koi8-u",
        aliases=("KOI8-U", "koi8u"),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=False,
        languages=("uk",),
    ),
    # TIS-620 - Modern Web
    EncodingInfo(
        name="tis-620",
        aliases=("TIS-620", "tis620", "iso-8859-11"),
        era=EncodingEra.MODERN_WEB,
        is_multibyte=False,
        languages=("th",),
    ),
    # === LEGACY_ISO ===
    EncodingInfo(
        name="iso8859-1",
        aliases=("ISO-8859-1", "latin-1", "latin1", "iso8859-1"),
        era=EncodingEra.LEGACY_ISO,
        is_multibyte=False,
        languages=_WESTERN,
    ),
    EncodingInfo(
        name="iso8859-2",
        aliases=("ISO-8859-2", "latin-2", "latin2", "iso8859-2"),
        era=EncodingEra.LEGACY_ISO,
        is_multibyte=False,
        languages=_CENTRAL_EU,
    ),
    EncodingInfo(
        name="iso8859-3",
        aliases=("ISO-8859-3", "latin-3", "latin3", "iso8859-3"),
        era=EncodingEra.LEGACY_ISO,
        is_multibyte=False,
        languages=("eo", "mt", "tr"),
    ),
    EncodingInfo(
        name="iso8859-4",
        aliases=("ISO-8859-4", "latin-4", "latin4", "iso8859-4"),
        era=EncodingEra.LEGACY_ISO,
        is_multibyte=False,
        languages=_BALTIC,
    ),
    EncodingInfo(
        name="iso8859-5",
        aliases=("ISO-8859-5", "iso8859-5", "cyrillic"),
        era=EncodingEra.LEGACY_ISO,
        is_multibyte=False,
        languages=_CYRILLIC,
    ),
    EncodingInfo(
        name="iso8859-6",
        aliases=("ISO-8859-6", "iso8859-6", "arabic"),
        era=EncodingEra.LEGACY_ISO,
        is_multibyte=False,
        languages=_ARABIC,
    ),
    EncodingInfo(
        name="iso8859-7",
        aliases=("ISO-8859-7", "iso8859-7", "greek"),
        era=EncodingEra.LEGACY_ISO,
        is_multibyte=False,
        languages=("el",),
    ),
    EncodingInfo(
        name="iso8859-8",
        aliases=("ISO-8859-8", "iso8859-8", "hebrew"),
        era=EncodingEra.LEGACY_ISO,
        is_multibyte=False,
        languages=("he",),
    ),
    EncodingInfo(
        name="iso8859-9",
        aliases=("ISO-8859-9", "latin-5", "latin5", "iso8859-9"),
        era=EncodingEra.LEGACY_ISO,
        is_multibyte=False,
        languages=("tr",),
    ),
    EncodingInfo(
        name="iso8859-10",
        aliases=("ISO-8859-10", "latin-6", "latin6", "iso8859-10"),
        era=EncodingEra.LEGACY_ISO,
        is_multibyte=False,
        languages=("is", "fi"),
    ),
    EncodingInfo(
        name="iso8859-13",
        aliases=("ISO-8859-13", "latin-7", "latin7", "iso8859-13"),
        era=EncodingEra.LEGACY_ISO,
        is_multibyte=False,
        languages=_BALTIC,
    ),
    EncodingInfo(
        name="iso8859-14",
        aliases=("ISO-8859-14", "latin-8", "latin8", "iso8859-14"),
        era=EncodingEra.LEGACY_ISO,
        is_multibyte=False,
        languages=("cy", "ga", "br", "gd"),
    ),
    EncodingInfo(
        name="iso8859-15",
        aliases=("ISO-8859-15", "latin-9", "latin9", "iso8859-15"),
        era=EncodingEra.LEGACY_ISO,
        is_multibyte=False,
        languages=_WESTERN,
    ),
    EncodingInfo(
        name="iso8859-16",
        aliases=("ISO-8859-16", "latin-10", "latin10", "iso8859-16"),
        era=EncodingEra.LEGACY_ISO,
        is_multibyte=False,
        languages=("ro", "pl", "hr", "hu", "sk", "sl"),
    ),
    # Johab - Legacy ISO per chardet 6.0.0
    EncodingInfo(
        name="johab",
        aliases=("Johab",),
        era=EncodingEra.LEGACY_ISO,
        is_multibyte=True,
        languages=("ko",),
    ),
    # === LEGACY_MAC ===
    EncodingInfo(
        name="mac-cyrillic",
        aliases=("Mac-Cyrillic", "MacCyrillic", "maccyrillic"),
        era=EncodingEra.LEGACY_MAC,
        is_multibyte=False,
        languages=_CYRILLIC,
    ),
    EncodingInfo(
        name="mac-greek",
        aliases=("Mac-Greek", "MacGreek", "macgreek"),
        era=EncodingEra.LEGACY_MAC,
        is_multibyte=False,
        languages=("el",),
    ),
    EncodingInfo(
        name="mac-iceland",
        aliases=("Mac-Iceland", "MacIceland", "maciceland"),
        era=EncodingEra.LEGACY_MAC,
        is_multibyte=False,
        languages=("is",),
    ),
    EncodingInfo(
        name="mac-latin2",
        aliases=("Mac-Latin2", "MacLatin2", "maclatin2", "maccentraleurope"),
        era=EncodingEra.LEGACY_MAC,
        is_multibyte=False,
        languages=_CENTRAL_EU_NO_RO,
    ),
    EncodingInfo(
        name="mac-roman",
        aliases=("Mac-Roman", "MacRoman", "macroman", "macintosh"),
        era=EncodingEra.LEGACY_MAC,
        is_multibyte=False,
        languages=_WESTERN,
    ),
    EncodingInfo(
        name="mac-turkish",
        aliases=("Mac-Turkish", "MacTurkish", "macturkish"),
        era=EncodingEra.LEGACY_MAC,
        is_multibyte=False,
        languages=("tr",),
    ),
    # === LEGACY_REGIONAL ===
    EncodingInfo(
        name="cp720",
        aliases=("CP720",),
        era=EncodingEra.LEGACY_REGIONAL,
        is_multibyte=False,
        languages=_ARABIC,
    ),
    EncodingInfo(
        name="cp1006",
        aliases=("CP1006",),
        era=EncodingEra.LEGACY_REGIONAL,
        is_multibyte=False,
        languages=("ur",),
    ),
    EncodingInfo(
        name="cp1125",
        aliases=("CP1125",),
        era=EncodingEra.LEGACY_REGIONAL,
        is_multibyte=False,
        languages=("uk",),
    ),
    EncodingInfo(
        name="koi8-t",
        aliases=("KOI8-T",),
        era=EncodingEra.LEGACY_REGIONAL,
        is_multibyte=False,
        languages=("tg",),
    ),
    EncodingInfo(
        name="kz1048",
        aliases=("KZ-1048", "kz1048", "strk1048-2002", "rk1048"),
        era=EncodingEra.LEGACY_REGIONAL,
        is_multibyte=False,
        languages=("kk",),
    ),
    EncodingInfo(
        name="ptcp154",
        aliases=("PTCP154", "pt154", "cp154"),
        era=EncodingEra.LEGACY_REGIONAL,
        is_multibyte=False,
        languages=("kk",),
    ),
    EncodingInfo(
        name="hp-roman8",
        aliases=("HP-Roman8", "roman8", "r8", "csHPRoman8"),
        era=EncodingEra.LEGACY_REGIONAL,
        is_multibyte=False,
        languages=_WESTERN,
    ),
    # === DOS ===
    EncodingInfo(
        name="cp437",
        aliases=("CP437",),
        era=EncodingEra.DOS,
        is_multibyte=False,
        languages=("en", "fr", "de", "es", "pt", "it", "nl", "da", "sv", "fi", "ga"),
    ),
    EncodingInfo(
        name="cp737",
        aliases=("CP737",),
        era=EncodingEra.DOS,
        is_multibyte=False,
        languages=("el",),
    ),
    EncodingInfo(
        name="cp775",
        aliases=("CP775",),
        era=EncodingEra.DOS,
        is_multibyte=False,
        languages=_BALTIC,
    ),
    EncodingInfo(
        name="cp850",
        aliases=("CP850",),
        era=EncodingEra.DOS,
        is_multibyte=False,
        languages=_WESTERN,
    ),
    EncodingInfo(
        name="cp852",
        aliases=("CP852",),
        era=EncodingEra.DOS,
        is_multibyte=False,
        languages=_CENTRAL_EU,
    ),
    EncodingInfo(
        name="cp855",
        aliases=("CP855",),
        era=EncodingEra.DOS,
        is_multibyte=False,
        languages=_CYRILLIC,
    ),
    EncodingInfo(
        name="cp856",
        aliases=("CP856",),
        era=EncodingEra.DOS,
        is_multibyte=False,
        languages=("he",),
    ),
    EncodingInfo(
        name="cp857",
        aliases=("CP857",),
        era=EncodingEra.DOS,
        is_multibyte=False,
        languages=("tr",),
    ),
    EncodingInfo(
        name="cp858",
        aliases=("CP858",),
        era=EncodingEra.DOS,
        is_multibyte=False,
        languages=_WESTERN,
    ),
    EncodingInfo(
        name="cp860",
        aliases=("CP860",),
        era=EncodingEra.DOS,
        is_multibyte=False,
        languages=("pt",),
    ),
    EncodingInfo(
        name="cp861",
        aliases=("CP861",),
        era=EncodingEra.DOS,
        is_multibyte=False,
        languages=("is",),
    ),
    EncodingInfo(
        name="cp862",
        aliases=("CP862",),
        era=EncodingEra.DOS,
        is_multibyte=False,
        languages=("he",),
    ),
    EncodingInfo(
        name="cp863",
        aliases=("CP863",),
        era=EncodingEra.DOS,
        is_multibyte=False,
        languages=("fr",),
    ),
    EncodingInfo(
        name="cp864",
        aliases=("CP864",),
        era=EncodingEra.DOS,
        is_multibyte=False,
        languages=("ar",),
    ),
    EncodingInfo(
        name="cp865",
        aliases=("CP865",),
        era=EncodingEra.DOS,
        is_multibyte=False,
        languages=("da", "no"),
    ),
    EncodingInfo(
        name="cp866",
        aliases=("CP866",),
        era=EncodingEra.DOS,
        is_multibyte=False,
        languages=_CYRILLIC,
    ),
    EncodingInfo(
        name="cp869",
        aliases=("CP869",),
        era=EncodingEra.DOS,
        is_multibyte=False,
        languages=("el",),
    ),
    # === MAINFRAME ===
    EncodingInfo(
        name="cp1140",
        aliases=("CP1140", "cp037"),
        era=EncodingEra.MAINFRAME,
        is_multibyte=False,
        languages=_WESTERN_TR,
    ),
    EncodingInfo(
        name="cp424",
        aliases=("CP424",),
        era=EncodingEra.MAINFRAME,
        is_multibyte=False,
        languages=("he",),
    ),
    EncodingInfo(
        name="cp500",
        aliases=("CP500",),
        era=EncodingEra.MAINFRAME,
        is_multibyte=False,
        languages=_WESTERN,
    ),
    EncodingInfo(
        name="cp875",
        aliases=("CP875",),
        era=EncodingEra.MAINFRAME,
        is_multibyte=False,
        languages=("el",),
    ),
    EncodingInfo(
        name="cp1026",
        aliases=("CP1026",),
        era=EncodingEra.MAINFRAME,
        is_multibyte=False,
        languages=("tr",),
    ),
    EncodingInfo(
        name="cp273",
        aliases=("CP273",),
        era=EncodingEra.MAINFRAME,
        is_multibyte=False,
        languages=("de",),
    ),
)
# Read-only view of the registry, keyed by canonical encoding name.  Wrapping
# the dict in MappingProxyType prevents callers from mutating it; iteration
# order follows _REGISTRY_ENTRIES.
REGISTRY: MappingProxyType[str, EncodingInfo] = MappingProxyType(
    {info.name: info for info in _REGISTRY_ENTRIES}
)
# Bounded (unlike ``functools.cache``) because ``name`` is caller-supplied and
# the key space is therefore unbounded; 1024 comfortably covers every
# canonical name and alias in the registry.
@functools.lru_cache(maxsize=1024)
def lookup_encoding(name: str) -> EncodingName | None:
    """Convert an encoding name string to the canonical EncodingName.

    Handles arbitrary casing, aliases, and Python codec names.  The registry
    is searched first (canonical name, then aliases, compared
    case-insensitively, in registry order); if nothing matches, the name is
    resolved through Python's codec registry and the resulting codec name is
    looked up again.

    :param name: Any encoding name string.
    :returns: The canonical :data:`EncodingName`, or ``None`` if unknown.
    """
    lowered = name.lower()
    for entry in REGISTRY.values():
        if entry.name == lowered or any(
            alias.lower() == lowered for alias in entry.aliases
        ):
            return entry.name
    # Fallback: resolve through Python's codec registry.
    try:
        codec_name = codecs.lookup(name).name
    except (LookupError, ValueError):
        # LookupError: no such codec.  ValueError (incl. UnicodeEncodeError):
        # the name cannot even be handed to the codec machinery, e.g. it
        # contains an embedded NUL or a lone surrogate.  Either way the name
        # is simply unknown.
        return None
    if codec_name == lowered:
        # The codec exists but is not part of the registry; retrying with the
        # same string would only recurse without making progress.
        return None
    return lookup_encoding(codec_name)
def _validate_encoding(name: str, param_name: str) -> str:
    """Resolve one encoding name to its canonical form or fail loudly.

    :param name: The encoding name to validate.
    :param param_name: Parameter name for error messages.
    :returns: The canonical encoding name.
    :raises ValueError: If the encoding name is unknown.
    """
    resolved = lookup_encoding(name)
    if resolved is not None:
        return resolved
    raise ValueError(f"Unknown encoding {name!r} in {param_name}")
def normalize_encodings(
    encodings: Iterable[str] | None,
    param_name: str,
) -> frozenset[str] | None:
    """Map each given encoding name to its canonical form.

    ``None`` means "no filtering" and is passed through unchanged.  An
    iterable that yields no names is rejected, since an empty filter set is
    almost certainly a caller mistake.

    :param encodings: Encoding names to normalize, or ``None``.
    :param param_name: Parameter name for error messages.
    :returns: A frozenset of canonical encoding names, or ``None``.
    :raises ValueError: If any encoding name is unknown.
    """
    if encodings is None:
        return None
    canonical: set[str] = set()
    for raw_name in encodings:
        # Raises ValueError on the first unknown name.
        canonical.add(_validate_encoding(raw_name, param_name))
    if canonical:
        return frozenset(canonical)
    raise ValueError(
        f"{param_name} must not be empty; omit the argument or pass None to disable filtering"
    )