Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/chardet/metadata/charsets.py: 100%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

16 statements  

1""" 

2Metadata about charsets used by our model training code and test file 

3generation code. Could be used for other things in the future. 

4""" 

5 

6from dataclasses import dataclass 

7 

8from chardet.enums import EncodingEra, LanguageFilter 

9 

10 

@dataclass(frozen=True)
class Charset:
    """Immutable metadata record for a single character encoding.

    The fields are the ones consulted when training models and when
    generating test files.
    """

    # Display name of the encoding, e.g. "Big5" or "Windows-1252".
    name: str
    # Whether the encoding is flagged as a multi-byte encoding.
    is_multi_byte: bool
    # Era bucket this encoding is assigned to.
    encoding_era: EncodingEra
    # Language filter this encoding is associated with.
    language_filter: LanguageFilter

19 

20 

# Registry of known charsets, keyed by the upper-cased, hyphenated encoding
# name (the same normalization that get_charset() applies before lookup).
# NOTE(review): several non-Mac encodings (CP720, CP1006, CP1125, KOI8-T,
# KZ1048, PTCP154) are tagged LEGACY_MAC below -- confirm this is intended.
CHARSETS = {
    "ASCII": Charset(
        name="ASCII",
        is_multi_byte=False,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "BIG5": Charset(
        name="Big5",
        is_multi_byte=True,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.CHINESE_TRADITIONAL,
    ),
    "CP037": Charset(
        name="CP037",
        is_multi_byte=False,
        encoding_era=EncodingEra.MAINFRAME,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "CP424": Charset(
        name="CP424",
        is_multi_byte=False,
        encoding_era=EncodingEra.MAINFRAME,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "CP437": Charset(
        name="CP437",
        is_multi_byte=False,
        encoding_era=EncodingEra.DOS,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "CP500": Charset(
        name="CP500",
        is_multi_byte=False,
        encoding_era=EncodingEra.MAINFRAME,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "CP720": Charset(
        name="CP720",
        is_multi_byte=False,
        encoding_era=EncodingEra.LEGACY_MAC,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "CP737": Charset(
        name="CP737",
        is_multi_byte=False,
        encoding_era=EncodingEra.DOS,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "CP775": Charset(
        name="CP775",
        is_multi_byte=False,
        encoding_era=EncodingEra.DOS,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "CP850": Charset(
        name="CP850",
        is_multi_byte=False,
        encoding_era=EncodingEra.DOS,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "CP852": Charset(
        name="CP852",
        is_multi_byte=False,
        encoding_era=EncodingEra.DOS,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "CP855": Charset(
        name="CP855",
        is_multi_byte=False,
        encoding_era=EncodingEra.DOS,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "CP856": Charset(
        name="CP856",
        is_multi_byte=False,
        encoding_era=EncodingEra.DOS,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "CP857": Charset(
        name="CP857",
        is_multi_byte=False,
        encoding_era=EncodingEra.DOS,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "CP858": Charset(
        name="CP858",
        is_multi_byte=False,
        encoding_era=EncodingEra.DOS,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "CP860": Charset(
        name="CP860",
        is_multi_byte=False,
        encoding_era=EncodingEra.DOS,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "CP861": Charset(
        name="CP861",
        is_multi_byte=False,
        encoding_era=EncodingEra.DOS,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "CP862": Charset(
        name="CP862",
        is_multi_byte=False,
        encoding_era=EncodingEra.DOS,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "CP863": Charset(
        name="CP863",
        is_multi_byte=False,
        encoding_era=EncodingEra.DOS,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "CP864": Charset(
        name="CP864",
        is_multi_byte=False,
        encoding_era=EncodingEra.DOS,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "CP865": Charset(
        name="CP865",
        is_multi_byte=False,
        encoding_era=EncodingEra.DOS,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "CP866": Charset(
        name="CP866",
        is_multi_byte=False,
        encoding_era=EncodingEra.DOS,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "CP869": Charset(
        name="CP869",
        is_multi_byte=False,
        encoding_era=EncodingEra.DOS,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "CP874": Charset(
        name="CP874",
        is_multi_byte=False,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "CP875": Charset(
        name="CP875",
        is_multi_byte=False,
        encoding_era=EncodingEra.MAINFRAME,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "CP932": Charset(
        name="CP932",
        # CP932 is Microsoft's Shift-JIS superset, i.e. a multi-byte encoding
        # like the SHIFT-JIS entry below.
        is_multi_byte=True,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.JAPANESE,
    ),
    "CP949": Charset(
        name="CP949",
        is_multi_byte=True,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.KOREAN,
    ),
    "CP1006": Charset(
        name="CP1006",
        is_multi_byte=False,
        encoding_era=EncodingEra.LEGACY_MAC,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "CP1026": Charset(
        name="CP1026",
        is_multi_byte=False,
        encoding_era=EncodingEra.MAINFRAME,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "CP1125": Charset(
        name="CP1125",
        is_multi_byte=False,
        encoding_era=EncodingEra.LEGACY_MAC,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "EUC-JP": Charset(
        name="EUC-JP",
        is_multi_byte=True,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.JAPANESE,
    ),
    "EUC-KR": Charset(
        name="EUC-KR",
        is_multi_byte=True,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.KOREAN,
    ),
    "GB18030": Charset(
        name="GB18030",
        is_multi_byte=True,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.CHINESE_SIMPLIFIED,
    ),
    "HZ-GB-2312": Charset(
        name="HZ-GB-2312",
        is_multi_byte=True,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.CHINESE_SIMPLIFIED,
    ),
    "ISO-2022-JP": Charset(
        name="ISO-2022-JP",
        is_multi_byte=True,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.JAPANESE,
    ),
    "ISO-2022-KR": Charset(
        name="ISO-2022-KR",
        is_multi_byte=True,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.KOREAN,
    ),
    "ISO-8859-1": Charset(
        name="ISO-8859-1",
        is_multi_byte=False,
        encoding_era=EncodingEra.LEGACY_ISO,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "ISO-8859-2": Charset(
        name="ISO-8859-2",
        is_multi_byte=False,
        encoding_era=EncodingEra.LEGACY_ISO,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "ISO-8859-3": Charset(
        name="ISO-8859-3",
        is_multi_byte=False,
        encoding_era=EncodingEra.LEGACY_ISO,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "ISO-8859-4": Charset(
        name="ISO-8859-4",
        is_multi_byte=False,
        encoding_era=EncodingEra.LEGACY_ISO,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "ISO-8859-5": Charset(
        name="ISO-8859-5",
        is_multi_byte=False,
        encoding_era=EncodingEra.LEGACY_ISO,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "ISO-8859-6": Charset(
        name="ISO-8859-6",
        is_multi_byte=False,
        encoding_era=EncodingEra.LEGACY_ISO,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "ISO-8859-7": Charset(
        name="ISO-8859-7",
        is_multi_byte=False,
        encoding_era=EncodingEra.LEGACY_ISO,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "ISO-8859-8": Charset(
        name="ISO-8859-8",
        is_multi_byte=False,
        encoding_era=EncodingEra.LEGACY_ISO,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "ISO-8859-9": Charset(
        name="ISO-8859-9",
        is_multi_byte=False,
        encoding_era=EncodingEra.LEGACY_ISO,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "ISO-8859-10": Charset(
        name="ISO-8859-10",
        is_multi_byte=False,
        encoding_era=EncodingEra.LEGACY_ISO,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "ISO-8859-11": Charset(
        name="ISO-8859-11",
        is_multi_byte=False,
        encoding_era=EncodingEra.LEGACY_ISO,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "ISO-8859-13": Charset(
        name="ISO-8859-13",
        is_multi_byte=False,
        encoding_era=EncodingEra.LEGACY_ISO,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "ISO-8859-14": Charset(
        name="ISO-8859-14",
        is_multi_byte=False,
        encoding_era=EncodingEra.LEGACY_ISO,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "ISO-8859-15": Charset(
        name="ISO-8859-15",
        is_multi_byte=False,
        encoding_era=EncodingEra.LEGACY_ISO,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "ISO-8859-16": Charset(
        name="ISO-8859-16",
        is_multi_byte=False,
        encoding_era=EncodingEra.LEGACY_ISO,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "JOHAB": Charset(
        name="Johab",
        is_multi_byte=True,
        encoding_era=EncodingEra.LEGACY_ISO,
        language_filter=LanguageFilter.KOREAN,
    ),
    "KOI8-R": Charset(
        name="KOI8-R",
        is_multi_byte=False,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "KOI8-U": Charset(
        name="KOI8-U",
        is_multi_byte=False,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "KOI8-T": Charset(
        name="KOI8-T",
        is_multi_byte=False,
        encoding_era=EncodingEra.LEGACY_MAC,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "KZ1048": Charset(
        name="KZ1048",
        is_multi_byte=False,
        encoding_era=EncodingEra.LEGACY_MAC,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "MACCYRILLIC": Charset(
        name="MacCyrillic",
        is_multi_byte=False,
        encoding_era=EncodingEra.LEGACY_MAC,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "MACGREEK": Charset(
        name="MacGreek",
        is_multi_byte=False,
        encoding_era=EncodingEra.LEGACY_MAC,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "MACICELAND": Charset(
        name="MacIceland",
        is_multi_byte=False,
        encoding_era=EncodingEra.LEGACY_MAC,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "MACLATIN2": Charset(
        name="MacLatin2",
        is_multi_byte=False,
        encoding_era=EncodingEra.LEGACY_MAC,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "MACROMAN": Charset(
        name="MacRoman",
        is_multi_byte=False,
        encoding_era=EncodingEra.LEGACY_MAC,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "MACTURKISH": Charset(
        name="MacTurkish",
        is_multi_byte=False,
        encoding_era=EncodingEra.LEGACY_MAC,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "PTCP154": Charset(
        name="PTCP154",
        is_multi_byte=False,
        encoding_era=EncodingEra.LEGACY_MAC,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "SHIFT-JIS": Charset(
        name="Shift-JIS",
        is_multi_byte=True,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.JAPANESE,
    ),
    "TIS-620": Charset(
        name="TIS-620",
        is_multi_byte=False,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "UTF-8": Charset(
        name="UTF-8",
        is_multi_byte=True,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.ALL,
    ),
    "UTF-8-SIG": Charset(
        name="UTF-8-SIG",
        is_multi_byte=True,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.ALL,
    ),
    "UTF-16": Charset(
        name="UTF-16",
        is_multi_byte=True,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.ALL,
    ),
    "UTF-16BE": Charset(
        name="UTF-16BE",
        is_multi_byte=True,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.ALL,
    ),
    "UTF-16LE": Charset(
        name="UTF-16LE",
        is_multi_byte=True,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.ALL,
    ),
    "UTF-32": Charset(
        name="UTF-32",
        is_multi_byte=True,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.ALL,
    ),
    "UTF-32BE": Charset(
        name="UTF-32BE",
        is_multi_byte=True,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.ALL,
    ),
    "UTF-32LE": Charset(
        name="UTF-32LE",
        is_multi_byte=True,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.ALL,
    ),
    "WINDOWS-1250": Charset(
        name="Windows-1250",
        is_multi_byte=False,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "WINDOWS-1251": Charset(
        name="Windows-1251",
        is_multi_byte=False,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "WINDOWS-1252": Charset(
        name="Windows-1252",
        is_multi_byte=False,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "WINDOWS-1253": Charset(
        name="Windows-1253",
        is_multi_byte=False,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "WINDOWS-1254": Charset(
        name="Windows-1254",
        is_multi_byte=False,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "WINDOWS-1255": Charset(
        name="Windows-1255",
        is_multi_byte=False,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "WINDOWS-1256": Charset(
        name="Windows-1256",
        is_multi_byte=False,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "WINDOWS-1257": Charset(
        name="Windows-1257",
        is_multi_byte=False,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.NON_CJK,
    ),
    "WINDOWS-1258": Charset(
        name="Windows-1258",
        is_multi_byte=False,
        encoding_era=EncodingEra.MODERN_WEB,
        language_filter=LanguageFilter.NON_CJK,
    ),
}

515 

516 

def get_charset(encoding_name: str) -> Charset:
    """
    Get the Charset metadata for a given encoding name.

    The lookup is case-insensitive and treats underscores as hyphens, so
    ``"utf_8"`` and ``"UTF-8"`` resolve to the same entry.

    :param encoding_name: The encoding name to look up
    :return: The Charset registered in ``CHARSETS`` for this encoding
    :raises KeyError: If the normalized name has no entry in ``CHARSETS``
    """
    normalized_name = encoding_name.upper().replace("_", "-")
    # There is no fallback entry: an unknown encoding surfaces as KeyError.
    return CHARSETS[normalized_name]

526 

527 

def is_unicode_encoding(encoding_name: str) -> bool:
    """
    Report whether an encoding name denotes a UTF-family encoding.

    The name is upper-cased and underscores are turned into hyphens before
    it is compared against the ``UTF-`` prefix.

    :param encoding_name: The encoding name to check
    :return: True if the normalized name begins with ``UTF-``, False otherwise
    """
    canonical = encoding_name.replace("_", "-").upper()
    return canonical[:4] == "UTF-"