1"""
Metadata about charsets used by our model training code and test file
generation code. Could be used for other things in the future.
4"""
5
6from dataclasses import dataclass
7
8from chardet.enums import EncodingEra, LanguageFilter
9
10
@dataclass(frozen=True)
class Charset:
    """
    Metadata about charsets useful for training models and generating test files.

    Instances are immutable because the dataclass is declared ``frozen``.
    """

    # Display name of the charset, e.g. "Big5" or "Windows-1252".
    name: str
    # True when the charset is flagged as a multi-byte encoding.
    is_multi_byte: bool
    # EncodingEra member this charset is filed under.
    encoding_era: EncodingEra
    # LanguageFilter member associated with this charset.
    language_filter: LanguageFilter
19
20
# Registry of known charsets. Keys are the upper-case, hyphenated form of the
# encoding name; each value's ``name`` keeps the conventional mixed-case
# spelling (e.g. key "BIG5" -> name "Big5").
#
# NOTE(review): several non-Macintosh code pages (CP720, CP1006, CP1125,
# KOI8-T, KZ1048, PTCP154) are filed under LEGACY_MAC and Johab under
# LEGACY_ISO -- confirm these era assignments are intentional.
CHARSETS = {
    "ASCII": Charset(
        name="ASCII",
        is_multi_byte=False,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "BIG5": Charset(
        name="Big5",
        is_multi_byte=True,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.CHINESE_TRADITIONAL,
    ),
    "CP037": Charset(
        name="CP037",
        is_multi_byte=False,
        encoding_era=EncodingEra.MAINFRAME,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "CP424": Charset(
        name="CP424",
        is_multi_byte=False,
        encoding_era=EncodingEra.MAINFRAME,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "CP437": Charset(
        name="CP437",
        is_multi_byte=False,
        encoding_era=EncodingEra.DOS,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "CP500": Charset(
        name="CP500",
        is_multi_byte=False,
        encoding_era=EncodingEra.MAINFRAME,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "CP720": Charset(
        name="CP720",
        is_multi_byte=False,
        encoding_era=EncodingEra.LEGACY_MAC,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "CP737": Charset(
        name="CP737",
        is_multi_byte=False,
        encoding_era=EncodingEra.DOS,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "CP775": Charset(
        name="CP775",
        is_multi_byte=False,
        encoding_era=EncodingEra.DOS,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "CP850": Charset(
        name="CP850",
        is_multi_byte=False,
        encoding_era=EncodingEra.DOS,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "CP852": Charset(
        name="CP852",
        is_multi_byte=False,
        encoding_era=EncodingEra.DOS,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "CP855": Charset(
        name="CP855",
        is_multi_byte=False,
        encoding_era=EncodingEra.DOS,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "CP856": Charset(
        name="CP856",
        is_multi_byte=False,
        encoding_era=EncodingEra.DOS,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "CP857": Charset(
        name="CP857",
        is_multi_byte=False,
        encoding_era=EncodingEra.DOS,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "CP858": Charset(
        name="CP858",
        is_multi_byte=False,
        encoding_era=EncodingEra.DOS,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "CP860": Charset(
        name="CP860",
        is_multi_byte=False,
        encoding_era=EncodingEra.DOS,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "CP861": Charset(
        name="CP861",
        is_multi_byte=False,
        encoding_era=EncodingEra.DOS,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "CP862": Charset(
        name="CP862",
        is_multi_byte=False,
        encoding_era=EncodingEra.DOS,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "CP863": Charset(
        name="CP863",
        is_multi_byte=False,
        encoding_era=EncodingEra.DOS,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "CP864": Charset(
        name="CP864",
        is_multi_byte=False,
        encoding_era=EncodingEra.DOS,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "CP865": Charset(
        name="CP865",
        is_multi_byte=False,
        encoding_era=EncodingEra.DOS,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "CP866": Charset(
        name="CP866",
        is_multi_byte=False,
        encoding_era=EncodingEra.DOS,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "CP869": Charset(
        name="CP869",
        is_multi_byte=False,
        encoding_era=EncodingEra.DOS,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "CP874": Charset(
        name="CP874",
        is_multi_byte=False,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "CP875": Charset(
        name="CP875",
        is_multi_byte=False,
        encoding_era=EncodingEra.MAINFRAME,
        language_filter=LanguageFilter.NON_CJK,
    ),
    # CP932 is Microsoft's double-byte extension of Shift-JIS, so it is
    # multi-byte just like the "SHIFT-JIS" entry below.
    "CP932": Charset(
        name="CP932",
        is_multi_byte=True,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.JAPANESE,
    ),
    "CP949": Charset(
        name="CP949",
        is_multi_byte=True,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.KOREAN,
    ),
    "CP1006": Charset(
        name="CP1006",
        is_multi_byte=False,
        encoding_era=EncodingEra.LEGACY_MAC,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "CP1026": Charset(
        name="CP1026",
        is_multi_byte=False,
        encoding_era=EncodingEra.MAINFRAME,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "CP1125": Charset(
        name="CP1125",
        is_multi_byte=False,
        encoding_era=EncodingEra.LEGACY_MAC,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "EUC-JP": Charset(
        name="EUC-JP",
        is_multi_byte=True,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.JAPANESE,
    ),
    "EUC-KR": Charset(
        name="EUC-KR",
        is_multi_byte=True,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.KOREAN,
    ),
    "GB18030": Charset(
        name="GB18030",
        is_multi_byte=True,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.CHINESE_SIMPLIFIED,
    ),
    "HZ-GB-2312": Charset(
        name="HZ-GB-2312",
        is_multi_byte=True,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.CHINESE_SIMPLIFIED,
    ),
    "ISO-2022-JP": Charset(
        name="ISO-2022-JP",
        is_multi_byte=True,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.JAPANESE,
    ),
    "ISO-2022-KR": Charset(
        name="ISO-2022-KR",
        is_multi_byte=True,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.KOREAN,
    ),
    "ISO-8859-1": Charset(
        name="ISO-8859-1",
        is_multi_byte=False,
        encoding_era=EncodingEra.LEGACY_ISO,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "ISO-8859-2": Charset(
        name="ISO-8859-2",
        is_multi_byte=False,
        encoding_era=EncodingEra.LEGACY_ISO,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "ISO-8859-3": Charset(
        name="ISO-8859-3",
        is_multi_byte=False,
        encoding_era=EncodingEra.LEGACY_ISO,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "ISO-8859-4": Charset(
        name="ISO-8859-4",
        is_multi_byte=False,
        encoding_era=EncodingEra.LEGACY_ISO,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "ISO-8859-5": Charset(
        name="ISO-8859-5",
        is_multi_byte=False,
        encoding_era=EncodingEra.LEGACY_ISO,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "ISO-8859-6": Charset(
        name="ISO-8859-6",
        is_multi_byte=False,
        encoding_era=EncodingEra.LEGACY_ISO,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "ISO-8859-7": Charset(
        name="ISO-8859-7",
        is_multi_byte=False,
        encoding_era=EncodingEra.LEGACY_ISO,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "ISO-8859-8": Charset(
        name="ISO-8859-8",
        is_multi_byte=False,
        encoding_era=EncodingEra.LEGACY_ISO,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "ISO-8859-9": Charset(
        name="ISO-8859-9",
        is_multi_byte=False,
        encoding_era=EncodingEra.LEGACY_ISO,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "ISO-8859-10": Charset(
        name="ISO-8859-10",
        is_multi_byte=False,
        encoding_era=EncodingEra.LEGACY_ISO,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "ISO-8859-11": Charset(
        name="ISO-8859-11",
        is_multi_byte=False,
        encoding_era=EncodingEra.LEGACY_ISO,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "ISO-8859-13": Charset(
        name="ISO-8859-13",
        is_multi_byte=False,
        encoding_era=EncodingEra.LEGACY_ISO,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "ISO-8859-14": Charset(
        name="ISO-8859-14",
        is_multi_byte=False,
        encoding_era=EncodingEra.LEGACY_ISO,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "ISO-8859-15": Charset(
        name="ISO-8859-15",
        is_multi_byte=False,
        encoding_era=EncodingEra.LEGACY_ISO,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "ISO-8859-16": Charset(
        name="ISO-8859-16",
        is_multi_byte=False,
        encoding_era=EncodingEra.LEGACY_ISO,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "JOHAB": Charset(
        name="Johab",
        is_multi_byte=True,
        encoding_era=EncodingEra.LEGACY_ISO,
        language_filter=LanguageFilter.KOREAN,
    ),
    "KOI8-R": Charset(
        name="KOI8-R",
        is_multi_byte=False,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "KOI8-U": Charset(
        name="KOI8-U",
        is_multi_byte=False,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "KOI8-T": Charset(
        name="KOI8-T",
        is_multi_byte=False,
        encoding_era=EncodingEra.LEGACY_MAC,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "KZ1048": Charset(
        name="KZ1048",
        is_multi_byte=False,
        encoding_era=EncodingEra.LEGACY_MAC,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "MACCYRILLIC": Charset(
        name="MacCyrillic",
        is_multi_byte=False,
        encoding_era=EncodingEra.LEGACY_MAC,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "MACGREEK": Charset(
        name="MacGreek",
        is_multi_byte=False,
        encoding_era=EncodingEra.LEGACY_MAC,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "MACICELAND": Charset(
        name="MacIceland",
        is_multi_byte=False,
        encoding_era=EncodingEra.LEGACY_MAC,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "MACLATIN2": Charset(
        name="MacLatin2",
        is_multi_byte=False,
        encoding_era=EncodingEra.LEGACY_MAC,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "MACROMAN": Charset(
        name="MacRoman",
        is_multi_byte=False,
        encoding_era=EncodingEra.LEGACY_MAC,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "MACTURKISH": Charset(
        name="MacTurkish",
        is_multi_byte=False,
        encoding_era=EncodingEra.LEGACY_MAC,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "PTCP154": Charset(
        name="PTCP154",
        is_multi_byte=False,
        encoding_era=EncodingEra.LEGACY_MAC,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "SHIFT-JIS": Charset(
        name="Shift-JIS",
        is_multi_byte=True,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.JAPANESE,
    ),
    "TIS-620": Charset(
        name="TIS-620",
        is_multi_byte=False,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "UTF-8": Charset(
        name="UTF-8",
        is_multi_byte=True,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.ALL,
    ),
    "UTF-8-SIG": Charset(
        name="UTF-8-SIG",
        is_multi_byte=True,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.ALL,
    ),
    "UTF-16": Charset(
        name="UTF-16",
        is_multi_byte=True,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.ALL,
    ),
    "UTF-16BE": Charset(
        name="UTF-16BE",
        is_multi_byte=True,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.ALL,
    ),
    "UTF-16LE": Charset(
        name="UTF-16LE",
        is_multi_byte=True,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.ALL,
    ),
    "UTF-32": Charset(
        name="UTF-32",
        is_multi_byte=True,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.ALL,
    ),
    "UTF-32BE": Charset(
        name="UTF-32BE",
        is_multi_byte=True,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.ALL,
    ),
    "UTF-32LE": Charset(
        name="UTF-32LE",
        is_multi_byte=True,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.ALL,
    ),
    "WINDOWS-1250": Charset(
        name="Windows-1250",
        is_multi_byte=False,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "WINDOWS-1251": Charset(
        name="Windows-1251",
        is_multi_byte=False,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "WINDOWS-1252": Charset(
        name="Windows-1252",
        is_multi_byte=False,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "WINDOWS-1253": Charset(
        name="Windows-1253",
        is_multi_byte=False,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "WINDOWS-1254": Charset(
        name="Windows-1254",
        is_multi_byte=False,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "WINDOWS-1255": Charset(
        name="Windows-1255",
        is_multi_byte=False,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "WINDOWS-1256": Charset(
        name="Windows-1256",
        is_multi_byte=False,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "WINDOWS-1257": Charset(
        name="Windows-1257",
        is_multi_byte=False,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "WINDOWS-1258": Charset(
        name="Windows-1258",
        is_multi_byte=False,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.NON_CJK,
    ),
}
515
516
def get_charset(encoding_name: str) -> Charset:
    """
    Get the Charset metadata for a given encoding name.

    The lookup is case-insensitive and treats underscores as hyphens, so
    ``"shift_jis"`` and ``"SHIFT-JIS"`` resolve to the same entry.

    :param encoding_name: The encoding name to look up
    :return: The Charset registered in ``CHARSETS`` for this encoding
    :raises KeyError: If the normalized name has no entry in ``CHARSETS``;
        there is no fallback Charset for unknown encodings
    """
    # CHARSETS keys are upper-case and hyphenated, so normalize to that form.
    normalized_name = encoding_name.upper().replace("_", "-")
    return CHARSETS[normalized_name]
526
527
def is_unicode_encoding(encoding_name: str) -> bool:
    """
    Tell whether an encoding name denotes a UTF-family (Unicode) encoding.

    The name is compared case-insensitively with underscores treated as
    hyphens; it counts as Unicode when the result begins with ``UTF-``
    (for example UTF-8, UTF-16 or UTF-32 and their variants).

    :param encoding_name: The encoding name to check
    :return: True if the encoding is Unicode, False otherwise
    """
    prefix = "UTF-"
    canonical = encoding_name.replace("_", "-").upper()
    # Comparing the leading slice is equivalent to a startswith() check.
    return canonical[: len(prefix)] == prefix