Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/charset_normalizer/utils.py: 31%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

211 statements  

1from __future__ import annotations 

2 

3import importlib 

4import logging 

5import unicodedata 

6from bisect import bisect_right 

7from codecs import IncrementalDecoder 

8from encodings.aliases import aliases 

9from functools import lru_cache 

10from re import findall 

11from typing import Generator 

12 

13from _multibytecodec import ( # type: ignore[import-not-found,import] 

14 MultibyteIncrementalDecoder, 

15) 

16 

17from .constant import ( 

18 ENCODING_MARKS, 

19 IANA_SUPPORTED_SIMILAR, 

20 RE_POSSIBLE_ENCODING_INDICATION, 

21 UNICODE_RANGES_COMBINED, 

22 UNICODE_SECONDARY_RANGE_KEYWORD, 

23 UTF8_MAXIMAL_ALLOCATION, 

24 COMMON_CJK_CHARACTERS, 

25 _LATIN, 

26 _CJK, 

27 _HANGUL, 

28 _KATAKANA, 

29 _HIRAGANA, 

30 _THAI, 

31 _ARABIC, 

32 _ARABIC_ISOLATED_FORM, 

33 _ACCENT_KEYWORDS, 

34 _ACCENTUATED, 

35) 

36 

37 

@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def _character_flags(character: str) -> int:
    """Derive every name-based classification flag from one unicodedata.name() lookup."""
    try:
        description: str = unicodedata.name(character)
    except ValueError:
        # Unnamed code point: no classification applies.
        return 0

    result: int = 0

    # (substring, flag) pairs mirroring the individual is_* predicates.
    for token, flag in (
        ("LATIN", _LATIN),
        ("CJK", _CJK),
        ("HANGUL", _HANGUL),
        ("KATAKANA", _KATAKANA),
        ("HIRAGANA", _HIRAGANA),
        ("THAI", _THAI),
        ("ARABIC", _ARABIC),
        ("ISOLATED FORM", _ARABIC_ISOLATED_FORM),
    ):
        if token in description:
            result |= flag

    if any(keyword in description for keyword in _ACCENT_KEYWORDS):
        result |= _ACCENTUATED

    return result

71 

72 

@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_accentuated(character: str) -> bool:
    """True when the character's Unicode name carries an accent keyword."""
    return (_character_flags(character) & _ACCENTUATED) != 0

76 

77 

@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def remove_accent(character: str) -> str:
    """Return the base character of *character* with its accent removed.

    Returns the character unchanged when it has no decomposition, or when
    the decomposition does not start with a usable code point (compatibility
    mappings such as '<compat> 0066 0066' begin with a non-hex tag token,
    which previously made this function raise ValueError).
    """
    decomposed: str = unicodedata.decomposition(character)
    if not decomposed:
        return character

    codes: list[str] = decomposed.split(" ")

    try:
        # First token is the base code point for canonical decompositions;
        # compatibility decompositions start with a "<tag>" token instead.
        return chr(int(codes[0], 16))
    except ValueError:
        return character

87 

88 

# Flattened, sorted view of UNICODE_RANGES_COMBINED enabling O(log n) binary
# search in unicode_range(). Each entry: (range_start, range_end_exclusive, name).
_UNICODE_RANGES_SORTED: list[tuple[int, int, str]] = sorted(
    [
        (ord_range.start, ord_range.stop, range_name)
        for range_name, ord_range in UNICODE_RANGES_COMBINED.items()
    ]
)
# Parallel list of range starts, the key array handed to bisect_right().
_UNICODE_RANGE_STARTS: list[int] = [entry[0] for entry in _UNICODE_RANGES_SORTED]

96 

97 

@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def unicode_range(character: str) -> str | None:
    """
    Retrieve the Unicode range official name from a single character.
    """
    code_point: int = ord(character)

    # Find the last range starting at or before the code point, then make
    # sure the code point actually falls inside that range.
    position = bisect_right(_UNICODE_RANGE_STARTS, code_point) - 1
    if position < 0:
        return None

    _, range_stop, range_name = _UNICODE_RANGES_SORTED[position]
    return range_name if code_point < range_stop else None

113 

114 

@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_latin(character: str) -> bool:
    """True when the character's Unicode name marks it as LATIN."""
    return (_character_flags(character) & _LATIN) != 0

118 

119 

@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_punctuation(character: str) -> bool:
    """True when the character is punctuation by category or by its Unicode range name."""
    if "P" in unicodedata.category(character):
        return True

    character_range: str | None = unicode_range(character)
    return character_range is not None and "Punctuation" in character_range

133 

134 

@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_symbol(character: str) -> bool:
    """True for symbol/number categories, or non-letter characters in a 'Forms' range."""
    category: str = unicodedata.category(character)

    if "S" in category or "N" in category:
        return True

    character_range: str | None = unicode_range(character)
    if character_range is None:
        return False

    return "Forms" in character_range and category != "Lo"

148 

149 

@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_emoticon(character: str) -> bool:
    """True when the character lives in an Emoticons or Pictographs range."""
    character_range: str | None = unicode_range(character)

    if character_range is None:
        return False

    return any(marker in character_range for marker in ("Emoticons", "Pictographs"))

158 

159 

@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_separator(character: str) -> bool:
    """True for whitespace, a few ASCII delimiters, and separator-like categories."""
    if character.isspace():
        return True
    if character in {"|", "+", "<", ">"}:
        return True

    category: str = unicodedata.category(character)
    return "Z" in category or category in {"Po", "Pd", "Pc"}

168 

169 

@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_case_variable(character: str) -> bool:
    """True when the character is cased (exactly one of islower/isupper holds)."""
    lowered: bool = character.islower()
    uppered: bool = character.isupper()
    return lowered != uppered

173 

174 

@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_cjk(character: str) -> bool:
    """True when the character's Unicode name marks it as CJK."""
    return (_character_flags(character) & _CJK) != 0

178 

179 

@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hiragana(character: str) -> bool:
    """True when the character's Unicode name marks it as HIRAGANA."""
    return (_character_flags(character) & _HIRAGANA) != 0

183 

184 

@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_katakana(character: str) -> bool:
    """True when the character's Unicode name marks it as KATAKANA."""
    return (_character_flags(character) & _KATAKANA) != 0

188 

189 

@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hangul(character: str) -> bool:
    """True when the character's Unicode name marks it as HANGUL."""
    return (_character_flags(character) & _HANGUL) != 0

193 

194 

@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_thai(character: str) -> bool:
    """True when the character's Unicode name marks it as THAI."""
    return (_character_flags(character) & _THAI) != 0

198 

199 

@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_arabic(character: str) -> bool:
    """True when the character's Unicode name marks it as ARABIC."""
    return (_character_flags(character) & _ARABIC) != 0

203 

204 

@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_arabic_isolated_form(character: str) -> bool:
    """True when the character's Unicode name contains 'ISOLATED FORM'."""
    return (_character_flags(character) & _ARABIC_ISOLATED_FORM) != 0

208 

209 

@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_cjk_uncommon(character: str) -> bool:
    """True when the character is outside the common-CJK character set."""
    is_common: bool = character in COMMON_CJK_CHARACTERS
    return not is_common

213 

214 

@lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
def is_unicode_range_secondary(range_name: str) -> bool:
    """True when the range name contains any secondary-range keyword."""
    for keyword in UNICODE_SECONDARY_RANGE_KEYWORD:
        if keyword in range_name:
            return True
    return False

218 

219 

@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_unprintable(character: str) -> bool:
    """True when the character is neither printable, whitespace, nor a tolerated control char."""
    if character.isspace() or character.isprintable():
        return False
    # "\x1a" is the ASCII substitute character, accepted on purpose.
    # "\ufeff" (Zero Width No-Break Space, Arabic Presentation Forms-B,
    # Unicode 1.1) is not acknowledged as whitespace by Python.
    return character not in ("\x1a", "\ufeff")

229 

230 

def any_specified_encoding(
    sequence: bytes | bytearray, search_zone: int = 8192
) -> str | None:
    """
    Extract using ASCII-only decoder any specified encoding in the first n-bytes.
    """
    if not isinstance(sequence, (bytes, bytearray)):
        raise TypeError

    # Only inspect the leading search zone, decoded leniently as ASCII.
    zone: str = sequence[: min(len(sequence), search_zone)].decode(
        "ascii", errors="ignore"
    )
    candidates: list[str] = findall(RE_POSSIBLE_ENCODING_INDICATION, zone)

    for candidate in candidates:
        normalized = candidate.lower().replace("-", "_")

        # Accept either a registered alias or a Python codec name.
        for encoding_alias, encoding_iana in aliases.items():
            if normalized in (encoding_alias, encoding_iana):
                return encoding_iana

    return None

263 

264 

@lru_cache(maxsize=128)
def is_multi_byte_encoding(name: str) -> bool:
    """
    Verify is a specific encoding is a multi byte one based on it IANA name
    """
    known_multi_byte = {
        "utf_8",
        "utf_8_sig",
        "utf_16",
        "utf_16_be",
        "utf_16_le",
        "utf_32",
        "utf_32_le",
        "utf_32_be",
        "utf_7",
    }
    if name in known_multi_byte:
        return True

    # Fall back to inspecting the codec: CJK multi-byte codecs derive their
    # incremental decoder from _multibytecodec.
    decoder = importlib.import_module(f"encodings.{name}").IncrementalDecoder
    return issubclass(decoder, MultibyteIncrementalDecoder)

284 

285 

def identify_sig_or_bom(sequence: bytes | bytearray) -> tuple[str | None, bytes]:
    """
    Identify and extract SIG/BOM in given sequence.
    """
    for iana_encoding, marks in ENCODING_MARKS.items():
        # An encoding may declare either a single mark or several.
        candidate_marks: list[bytes] = [marks] if isinstance(marks, bytes) else marks

        for mark in candidate_marks:
            if sequence.startswith(mark):
                return iana_encoding, mark

    return None, b""

302 

303 

def should_strip_sig_or_bom(iana_encoding: str) -> bool:
    """True unless the codec handles its own BOM (utf_16 / utf_32)."""
    keeps_bom = iana_encoding in {"utf_16", "utf_32"}
    return not keeps_bom

306 

307 

def iana_name(cp_name: str, strict: bool = True) -> str:
    """Returns the Python normalized encoding name (Not the IANA official name)."""
    normalized: str = cp_name.lower().replace("-", "_")

    # Match against both the alias and its canonical Python codec name.
    for encoding_alias, encoding_iana in aliases.items():
        if normalized in (encoding_alias, encoding_iana):
            return encoding_iana

    if strict:
        raise ValueError(f"Unable to retrieve IANA for '{normalized}'")

    return normalized

323 

324 

def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
    """Fraction (0..1) of the 256 single bytes both single-byte codecs decode identically."""
    # Similarity is only meaningful between single-byte code pages.
    if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
        return 0.0

    module_a = importlib.import_module(f"encodings.{iana_name_a}")
    module_b = importlib.import_module(f"encodings.{iana_name_b}")

    incremental_a: IncrementalDecoder = module_a.IncrementalDecoder(errors="ignore")
    incremental_b: IncrementalDecoder = module_b.IncrementalDecoder(errors="ignore")

    matches: int = 0
    for byte_value in range(256):
        payload: bytes = bytes([byte_value])
        if incremental_a.decode(payload) == incremental_b.decode(payload):
            matches += 1

    return matches / 256

343 

344 

def is_cp_similar(iana_name_a: str, iana_name_b: str) -> bool:
    """
    Determine if two code page are at least 80% similar. IANA_SUPPORTED_SIMILAR dict was generated using
    the function cp_similarity.
    """
    similar_to_a = IANA_SUPPORTED_SIMILAR.get(iana_name_a)
    return similar_to_a is not None and iana_name_b in similar_to_a

354 

355 

def set_logging_handler(
    name: str = "charset_normalizer",
    level: int = logging.INFO,
    format_string: str = "%(asctime)s | %(levelname)s | %(message)s",
) -> None:
    """Attach a stream handler with the given format to the named logger and set its level."""
    target_logger = logging.getLogger(name)
    target_logger.setLevel(level)

    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(logging.Formatter(format_string))
    target_logger.addHandler(stream_handler)

367 

368 

369def cut_sequence_chunks( 

370 sequences: bytes | bytearray, 

371 encoding_iana: str, 

372 offsets: range, 

373 chunk_size: int, 

374 bom_or_sig_available: bool, 

375 strip_sig_or_bom: bool, 

376 sig_payload: bytes, 

377 is_multi_byte_decoder: bool, 

378 decoded_payload: str | None = None, 

379) -> Generator[str, None, None]: 

380 if decoded_payload and is_multi_byte_decoder is False: 

381 for i in offsets: 

382 chunk = decoded_payload[i : i + chunk_size] 

383 if not chunk: 

384 break 

385 yield chunk 

386 else: 

387 for i in offsets: 

388 chunk_end = i + chunk_size 

389 if chunk_end > len(sequences) + 8: 

390 continue 

391 

392 cut_sequence = sequences[i : i + chunk_size] 

393 

394 if bom_or_sig_available and strip_sig_or_bom is False: 

395 cut_sequence = sig_payload + cut_sequence 

396 

397 chunk = cut_sequence.decode( 

398 encoding_iana, 

399 errors="ignore" if is_multi_byte_decoder else "strict", 

400 ) 

401 

402 # multi-byte bad cutting detector and adjustment 

403 # not the cleanest way to perform that fix but clever enough for now. 

404 if is_multi_byte_decoder and i > 0: 

405 chunk_partial_size_chk: int = min(chunk_size, 16) 

406 

407 if ( 

408 decoded_payload 

409 and chunk[:chunk_partial_size_chk] not in decoded_payload 

410 ): 

411 for j in range(i, i - 4, -1): 

412 cut_sequence = sequences[j:chunk_end] 

413 

414 if bom_or_sig_available and strip_sig_or_bom is False: 

415 cut_sequence = sig_payload + cut_sequence 

416 

417 chunk = cut_sequence.decode(encoding_iana, errors="ignore") 

418 

419 if chunk[:chunk_partial_size_chk] in decoded_payload: 

420 break 

421 

422 yield chunk