Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/charset_normalizer/utils.py: 26% (218 statements)
coverage.py v7.3.2, created at 2023-12-08 06:40 +0000

import importlib
import logging
import unicodedata
from codecs import IncrementalDecoder
from encodings.aliases import aliases
from functools import lru_cache
from re import findall
from typing import Generator, List, Optional, Set, Tuple, Union

from _multibytecodec import MultibyteIncrementalDecoder

from .constant import (
    ENCODING_MARKS,
    IANA_SUPPORTED_SIMILAR,
    RE_POSSIBLE_ENCODING_INDICATION,
    UNICODE_RANGES_COMBINED,
    UNICODE_SECONDARY_RANGE_KEYWORD,
    UTF8_MAXIMAL_ALLOCATION,
)


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_accentuated(character: str) -> bool:
    try:
        description: str = unicodedata.name(character)
    except ValueError:
        return False
    return (
        "WITH GRAVE" in description
        or "WITH ACUTE" in description
        or "WITH CEDILLA" in description
        or "WITH DIAERESIS" in description
        or "WITH CIRCUMFLEX" in description
        or "WITH TILDE" in description
        or "WITH MACRON" in description
        or "WITH RING ABOVE" in description
    )
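# Illustrative doctest-style example (a sketch, not part of the original
# source): U+00E9 is named "LATIN SMALL LETTER E WITH ACUTE", so it matches
# the "WITH ACUTE" check above.
#   >>> is_accentuated("é")
#   True
#   >>> is_accentuated("e")
#   False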


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def remove_accent(character: str) -> str:
    decomposed: str = unicodedata.decomposition(character)
    if not decomposed:
        return character

    codes: List[str] = decomposed.split(" ")

    return chr(int(codes[0], 16))
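# Illustrative example (a sketch): "é" canonically decomposes to "0065 0301"
# (base letter plus combining acute accent), so only the base code point is kept.
#   >>> remove_accent("é")
#   'e'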


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def unicode_range(character: str) -> Optional[str]:
    """
    Retrieve the official Unicode range name for a single character.
    """
    character_ord: int = ord(character)

    for range_name, ord_range in UNICODE_RANGES_COMBINED.items():
        if character_ord in ord_range:
            return range_name

    return None
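# Illustrative example (assumes "Basic Latin" and "Latin-1 Supplement" are
# among the block names in UNICODE_RANGES_COMBINED):
#   >>> unicode_range("a")
#   'Basic Latin'
#   >>> unicode_range("é")
#   'Latin-1 Supplement'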


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_latin(character: str) -> bool:
    try:
        description: str = unicodedata.name(character)
    except ValueError:
        return False
    return "LATIN" in description
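# Illustrative example: the Unicode name of "a" is "LATIN SMALL LETTER A",
# while Cyrillic character names never contain "LATIN".
#   >>> is_latin("a"), is_latin("Я")
#   (True, False)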


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_punctuation(character: str) -> bool:
    character_category: str = unicodedata.category(character)

    if "P" in character_category:
        return True

    character_range: Optional[str] = unicode_range(character)

    if character_range is None:
        return False

    return "Punctuation" in character_range


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_symbol(character: str) -> bool:
    character_category: str = unicodedata.category(character)

    if "S" in character_category or "N" in character_category:
        return True

    character_range: Optional[str] = unicode_range(character)

    if character_range is None:
        return False

    return "Forms" in character_range and character_category != "Lo"
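# Illustrative examples (based on standard Unicode categories: "," is Po,
# "+" is Sm, "5" is Nd):
#   >>> is_punctuation(",")
#   True
#   >>> is_symbol("+"), is_symbol("5")  # "N" (numeric) categories count too
#   (True, True)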


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_emoticon(character: str) -> bool:
    character_range: Optional[str] = unicode_range(character)

    if character_range is None:
        return False

    return "Emoticons" in character_range or "Pictographs" in character_range
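# Illustrative example (assumes U+1F600 falls in the "Emoticons" block of
# UNICODE_RANGES_COMBINED):
#   >>> is_emoticon("😀")
#   True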


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_separator(character: str) -> bool:
    if character.isspace() or character in {"|", "+", "<", ">"}:
        return True

    character_category: str = unicodedata.category(character)

    return "Z" in character_category or character_category in {"Po", "Pd", "Pc"}
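# Illustrative example: "|" is in the explicit separator set above, while "-"
# is category Pd (dash punctuation).
#   >>> is_separator("|"), is_separator("-")
#   (True, True)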


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_case_variable(character: str) -> bool:
    return character.islower() != character.isupper()
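# Illustrative example (a sketch): a cased letter reports exactly one of
# islower()/isupper() as True, whereas a digit reports both as False.
#   >>> is_case_variable("a"), is_case_variable("1")
#   (True, False)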


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_cjk(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "CJK" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hiragana(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "HIRAGANA" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_katakana(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "KATAKANA" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hangul(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "HANGUL" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_thai(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "THAI" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_arabic(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "ARABIC" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_arabic_isolated_form(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "ARABIC" in character_name and "ISOLATED FORM" in character_name
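# Illustrative examples for the script detectors above (based on standard
# Unicode character names, e.g. U+4E2D is "CJK UNIFIED IDEOGRAPH-4E2D"):
#   >>> is_cjk("中"), is_hiragana("あ"), is_katakana("カ")
#   (True, True, True)
#   >>> is_hangul("한"), is_thai("ก"), is_arabic("ا")
#   (True, True, True)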


@lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
def is_unicode_range_secondary(range_name: str) -> bool:
    return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_unprintable(character: str) -> bool:
    return (
        character.isspace() is False  # includes \n \t \r \v
        and character.isprintable() is False
        and character != "\x1A"  # Why? It's the ASCII substitute character.
        and character != "\ufeff"  # bug discovered in Python: Zero Width
        # No-Break Space, located in Arabic Presentation Forms-B (Unicode 1.1),
        # is not acknowledged as a space.
    )
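# Illustrative example: NUL is neither whitespace nor printable, while "\n"
# is excluded by the isspace() check.
#   >>> is_unprintable("\x00"), is_unprintable("\n")
#   (True, False)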


def any_specified_encoding(sequence: bytes, search_zone: int = 8192) -> Optional[str]:
    """
    Extract, using an ASCII-only decoder, any specified encoding found in the first n bytes.
    """
    if not isinstance(sequence, bytes):
        raise TypeError

    seq_len: int = len(sequence)

    results: List[str] = findall(
        RE_POSSIBLE_ENCODING_INDICATION,
        sequence[: min(seq_len, search_zone)].decode("ascii", errors="ignore"),
    )

    if len(results) == 0:
        return None

    for specified_encoding in results:
        specified_encoding = specified_encoding.lower().replace("-", "_")

        encoding_alias: str
        encoding_iana: str

        for encoding_alias, encoding_iana in aliases.items():
            if encoding_alias == specified_encoding:
                return encoding_iana
            if encoding_iana == specified_encoding:
                return encoding_iana

    return None
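# Illustrative example (assumes RE_POSSIBLE_ENCODING_INDICATION matches
# charset/encoding declarations such as XML prologs):
#   >>> any_specified_encoding(b'<?xml version="1.0" encoding="UTF-8"?>')
#   'utf_8'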


@lru_cache(maxsize=128)
def is_multi_byte_encoding(name: str) -> bool:
    """
    Verify whether a specific encoding is a multi-byte one, based on its IANA name.
    """
    return name in {
        "utf_8",
        "utf_8_sig",
        "utf_16",
        "utf_16_be",
        "utf_16_le",
        "utf_32",
        "utf_32_le",
        "utf_32_be",
        "utf_7",
    } or issubclass(
        importlib.import_module("encodings.{}".format(name)).IncrementalDecoder,
        MultibyteIncrementalDecoder,
    )
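# Illustrative example: "utf_8" hits the literal set above, while a legacy CJK
# codec such as big5 is caught by the MultibyteIncrementalDecoder subclass
# check (an assumption about CPython's codec internals).
#   >>> is_multi_byte_encoding("big5"), is_multi_byte_encoding("ascii")
#   (True, False)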


def identify_sig_or_bom(sequence: bytes) -> Tuple[Optional[str], bytes]:
    """
    Identify and extract a SIG/BOM from a given sequence.
    """

    for iana_encoding in ENCODING_MARKS:
        marks: Union[bytes, List[bytes]] = ENCODING_MARKS[iana_encoding]

        if isinstance(marks, bytes):
            marks = [marks]

        for mark in marks:
            if sequence.startswith(mark):
                return iana_encoding, mark

    return None, b""


def should_strip_sig_or_bom(iana_encoding: str) -> bool:
    return iana_encoding not in {"utf_16", "utf_32"}
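# Illustrative example (assumes ENCODING_MARKS maps "utf_8" to the UTF-8 BOM
# b"\xef\xbb\xbf"):
#   >>> identify_sig_or_bom(b"\xef\xbb\xbfhello")
#   ('utf_8', b'\xef\xbb\xbf')
#   >>> should_strip_sig_or_bom("utf_8"), should_strip_sig_or_bom("utf_16")
#   (True, False)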


def iana_name(cp_name: str, strict: bool = True) -> str:
    cp_name = cp_name.lower().replace("-", "_")

    encoding_alias: str
    encoding_iana: str

    for encoding_alias, encoding_iana in aliases.items():
        if cp_name in [encoding_alias, encoding_iana]:
            return encoding_iana

    if strict:
        raise ValueError("Unable to retrieve IANA for '{}'".format(cp_name))

    return cp_name
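# Illustrative example: input is normalized ("-" to "_", lowercased) before
# being matched against encodings.aliases.
#   >>> iana_name("UTF-8")
#   'utf_8'
#   >>> iana_name("latin-1")
#   'latin_1'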


def range_scan(decoded_sequence: str) -> List[str]:
    ranges: Set[str] = set()

    for character in decoded_sequence:
        character_range: Optional[str] = unicode_range(character)

        if character_range is None:
            continue

        ranges.add(character_range)

    return list(ranges)
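# Illustrative example (the result is set-backed, so its order is not
# guaranteed; the block names are assumptions about UNICODE_RANGES_COMBINED):
#   >>> sorted(range_scan("aé中"))
#   ['Basic Latin', 'CJK Unified Ideographs', 'Latin-1 Supplement']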


def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
    if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
        return 0.0

    decoder_a = importlib.import_module(
        "encodings.{}".format(iana_name_a)
    ).IncrementalDecoder
    decoder_b = importlib.import_module(
        "encodings.{}".format(iana_name_b)
    ).IncrementalDecoder

    id_a: IncrementalDecoder = decoder_a(errors="ignore")
    id_b: IncrementalDecoder = decoder_b(errors="ignore")

    character_match_count: int = 0

    for i in range(255):
        to_be_decoded: bytes = bytes([i])
        if id_a.decode(to_be_decoded) == id_b.decode(to_be_decoded):
            character_match_count += 1

    return character_match_count / 254


def is_cp_similar(iana_name_a: str, iana_name_b: str) -> bool:
    """
    Determine if two code pages are at least 80% similar. The IANA_SUPPORTED_SIMILAR
    dict was generated using the cp_similarity function.
    """
    return (
        iana_name_a in IANA_SUPPORTED_SIMILAR
        and iana_name_b in IANA_SUPPORTED_SIMILAR[iana_name_a]
    )
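# Illustrative example (an assumption about the precomputed table): cp1252 and
# latin_1 disagree only on the 0x80-0x9F block, so they decode well over 80%
# of single bytes identically.
#   >>> cp_similarity("cp1252", "latin_1") > 0.8
#   True
#   >>> is_cp_similar("cp1252", "latin_1")
#   True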


def set_logging_handler(
    name: str = "charset_normalizer",
    level: int = logging.INFO,
    format_string: str = "%(asctime)s | %(levelname)s | %(message)s",
) -> None:
    logger = logging.getLogger(name)
    logger.setLevel(level)

    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter(format_string))
    logger.addHandler(handler)
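# Illustrative usage (a sketch): attach a stderr stream handler and enable
# DEBUG output for the package logger.
#   >>> set_logging_handler(level=logging.DEBUG)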


def cut_sequence_chunks(
    sequences: bytes,
    encoding_iana: str,
    offsets: range,
    chunk_size: int,
    bom_or_sig_available: bool,
    strip_sig_or_bom: bool,
    sig_payload: bytes,
    is_multi_byte_decoder: bool,
    decoded_payload: Optional[str] = None,
) -> Generator[str, None, None]:
    if decoded_payload and is_multi_byte_decoder is False:
        for i in offsets:
            chunk = decoded_payload[i : i + chunk_size]
            if not chunk:
                break
            yield chunk
    else:
        for i in offsets:
            chunk_end = i + chunk_size
            if chunk_end > len(sequences) + 8:
                continue

            cut_sequence = sequences[i : i + chunk_size]

            if bom_or_sig_available and strip_sig_or_bom is False:
                cut_sequence = sig_payload + cut_sequence

            chunk = cut_sequence.decode(
                encoding_iana,
                errors="ignore" if is_multi_byte_decoder else "strict",
            )

            # multi-byte bad-cutting detector and adjustment:
            # not the cleanest way to perform this fix, but clever enough for now
            if is_multi_byte_decoder and i > 0:
                chunk_partial_size_chk: int = min(chunk_size, 16)

                if (
                    decoded_payload
                    and chunk[:chunk_partial_size_chk] not in decoded_payload
                ):
                    for j in range(i, i - 4, -1):
                        cut_sequence = sequences[j:chunk_end]

                        if bom_or_sig_available and strip_sig_or_bom is False:
                            cut_sequence = sig_payload + cut_sequence

                        chunk = cut_sequence.decode(encoding_iana, errors="ignore")

                        if chunk[:chunk_partial_size_chk] in decoded_payload:
                            break

            yield chunk
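# Illustrative usage (a sketch, not from the original source): decode a small
# single-byte payload in 5-byte chunks with no BOM involved.
#   >>> list(
#   ...     cut_sequence_chunks(
#   ...         b"hello world", "ascii", range(0, 11, 5), 5,
#   ...         bom_or_sig_available=False, strip_sig_or_bom=False,
#   ...         sig_payload=b"", is_multi_byte_decoder=False,
#   ...     )
#   ... )
#   ['hello', ' worl', 'd']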