# charset_normalizer/utils.py

import importlib
import logging
import unicodedata
from codecs import IncrementalDecoder
from encodings.aliases import aliases
from functools import lru_cache
from re import findall
from typing import Generator, List, Optional, Set, Tuple, Union

from _multibytecodec import MultibyteIncrementalDecoder

from .constant import (
    ENCODING_MARKS,
    IANA_SUPPORTED_SIMILAR,
    RE_POSSIBLE_ENCODING_INDICATION,
    UNICODE_RANGES_COMBINED,
    UNICODE_SECONDARY_RANGE_KEYWORD,
    UTF8_MAXIMAL_ALLOCATION,
)

@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_accentuated(character: str) -> bool:
    try:
        description: str = unicodedata.name(character)
    except ValueError:
        return False
    return (
        "WITH GRAVE" in description
        or "WITH ACUTE" in description
        or "WITH CEDILLA" in description
        or "WITH DIAERESIS" in description
        or "WITH CIRCUMFLEX" in description
        or "WITH TILDE" in description
    )
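# Illustrative example (not part of the original module): unicodedata.name("é")
# is "LATIN SMALL LETTER E WITH ACUTE", so the "WITH ACUTE" probe matches.
#
#   >>> is_accentuated("é")
#   True
#   >>> is_accentuated("e")
#   False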

@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def remove_accent(character: str) -> str:
    decomposed: str = unicodedata.decomposition(character)
    if not decomposed:
        return character

    codes: List[str] = decomposed.split(" ")

    return chr(int(codes[0], 16))
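# Illustrative example: "é" canonically decomposes to "0065 0301", so the first
# code point, U+0065 ("e"), is returned; characters with no decomposition pass
# through unchanged.
#
#   >>> remove_accent("é")
#   'e'
#   >>> remove_accent("e")
#   'e'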

@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def unicode_range(character: str) -> Optional[str]:
    """
    Retrieve the official Unicode range name for a single character.
    """
    character_ord: int = ord(character)

    for range_name, ord_range in UNICODE_RANGES_COMBINED.items():
        if character_ord in ord_range:
            return range_name

    return None
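# Illustrative example, assuming UNICODE_RANGES_COMBINED keys follow the
# official Unicode block names:
#
#   >>> unicode_range("a")
#   'Basic Latin'
#   >>> unicode_range("é")
#   'Latin-1 Supplement'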

@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_latin(character: str) -> bool:
    try:
        description: str = unicodedata.name(character)
    except ValueError:
        return False
    return "LATIN" in description


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_ascii(character: str) -> bool:
    try:
        character.encode("ascii")
    except UnicodeEncodeError:
        return False
    return True


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_punctuation(character: str) -> bool:
    character_category: str = unicodedata.category(character)

    if "P" in character_category:
        return True

    character_range: Optional[str] = unicode_range(character)

    if character_range is None:
        return False

    return "Punctuation" in character_range
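# Illustrative example: "!" is in Unicode category "Po", so the category check
# alone answers; a plain letter falls through to the range lookup and fails.
#
#   >>> is_punctuation("!")
#   True
#   >>> is_punctuation("a")
#   False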

@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_symbol(character: str) -> bool:
    character_category: str = unicodedata.category(character)

    if "S" in character_category or "N" in character_category:
        return True

    character_range: Optional[str] = unicode_range(character)

    if character_range is None:
        return False

    return "Forms" in character_range


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_emoticon(character: str) -> bool:
    character_range: Optional[str] = unicode_range(character)

    if character_range is None:
        return False

    return "Emoticons" in character_range


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_separator(character: str) -> bool:
    if character.isspace() or character in {"|", "+", ",", ";", "<", ">"}:
        return True

    character_category: str = unicodedata.category(character)

    return "Z" in character_category
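# Illustrative example: whitespace and the hard-coded delimiter set short-circuit;
# anything else counts as a separator only if its category contains "Z".
#
#   >>> is_separator(" ")
#   True
#   >>> is_separator(";")
#   True
#   >>> is_separator("a")
#   False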

@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_case_variable(character: str) -> bool:
    return character.islower() != character.isupper()


def is_private_use_only(character: str) -> bool:
    character_category: str = unicodedata.category(character)

    return character_category == "Co"


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_cjk(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "CJK" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hiragana(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "HIRAGANA" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_katakana(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "KATAKANA" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hangul(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "HANGUL" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_thai(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "THAI" in character_name


@lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
def is_unicode_range_secondary(range_name: str) -> bool:
    return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_unprintable(character: str) -> bool:
    return (
        character.isspace() is False  # includes \n \t \r \v
        and character.isprintable() is False
        and character != "\x1A"  # Why? It's the ASCII substitute character.
        and character != "\ufeff"  # bug discovered in Python: the Zero Width
        # No-Break Space (Arabic Presentation Forms-B, Unicode 1.1) is not
        # acknowledged as a space.
    )
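# Illustrative example: NUL is neither whitespace nor printable and is not one
# of the two tolerated exceptions, so it is reported as unprintable.
#
#   >>> is_unprintable("\x00")
#   True
#   >>> is_unprintable(" ")
#   False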

def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional[str]:
    """
    Extract any declared encoding from the first n bytes, using an ASCII-only decoder.
    """
    if not isinstance(sequence, bytes):
        raise TypeError

    seq_len: int = len(sequence)

    results: List[str] = findall(
        RE_POSSIBLE_ENCODING_INDICATION,
        sequence[: min(seq_len, search_zone)].decode("ascii", errors="ignore"),
    )

    if len(results) == 0:
        return None

    for specified_encoding in results:
        specified_encoding = specified_encoding.lower().replace("-", "_")

        encoding_alias: str
        encoding_iana: str

        for encoding_alias, encoding_iana in aliases.items():
            if encoding_alias == specified_encoding:
                return encoding_iana
            if encoding_iana == specified_encoding:
                return encoding_iana

    return None
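# Illustrative example, assuming RE_POSSIBLE_ENCODING_INDICATION matches the
# usual XML/HTML declaration forms; "utf-8" is normalized to "utf_8" and then
# resolved through encodings.aliases:
#
#   >>> any_specified_encoding(b'<?xml version="1.0" encoding="utf-8"?>')
#   'utf_8'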

@lru_cache(maxsize=128)
def is_multi_byte_encoding(name: str) -> bool:
    """
    Verify whether a specific encoding is a multi-byte one, based on its IANA name.
    """
    return name in {
        "utf_8",
        "utf_8_sig",
        "utf_16",
        "utf_16_be",
        "utf_16_le",
        "utf_32",
        "utf_32_le",
        "utf_32_be",
        "utf_7",
    } or issubclass(
        importlib.import_module("encodings.{}".format(name)).IncrementalDecoder,
        MultibyteIncrementalDecoder,
    )
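# Illustrative example: utf_8 is in the hard-coded set; big5 is detected via its
# MultibyteIncrementalDecoder subclass; cp1252 is a single-byte charmap codec.
#
#   >>> is_multi_byte_encoding("utf_8")
#   True
#   >>> is_multi_byte_encoding("big5")
#   True
#   >>> is_multi_byte_encoding("cp1252")
#   False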

def identify_sig_or_bom(sequence: bytes) -> Tuple[Optional[str], bytes]:
    """
    Identify and extract a SIG/BOM mark in the given sequence.
    """

    for iana_encoding in ENCODING_MARKS:
        marks: Union[bytes, List[bytes]] = ENCODING_MARKS[iana_encoding]

        if isinstance(marks, bytes):
            marks = [marks]

        for mark in marks:
            if sequence.startswith(mark):
                return iana_encoding, mark

    return None, b""
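# Illustrative example, assuming ENCODING_MARKS carries the standard UTF-8 BOM
# b"\xef\xbb\xbf":
#
#   >>> identify_sig_or_bom(b"\xef\xbb\xbfhello")
#   ('utf_8', b'\xef\xbb\xbf')
#   >>> identify_sig_or_bom(b"hello")
#   (None, b'')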

def should_strip_sig_or_bom(iana_encoding: str) -> bool:
    return iana_encoding not in {"utf_16", "utf_32"}


def iana_name(cp_name: str, strict: bool = True) -> str:
    cp_name = cp_name.lower().replace("-", "_")

    encoding_alias: str
    encoding_iana: str

    for encoding_alias, encoding_iana in aliases.items():
        if cp_name in [encoding_alias, encoding_iana]:
            return encoding_iana

    if strict:
        raise ValueError("Unable to retrieve IANA for '{}'".format(cp_name))

    return cp_name
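# Illustrative example: names are lower-cased, "-" becomes "_", and the result
# is resolved against encodings.aliases; unknown names raise under strict mode.
#
#   >>> iana_name("UTF-8")
#   'utf_8'
#   >>> iana_name("latin-1")
#   'latin_1'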

def range_scan(decoded_sequence: str) -> List[str]:
    ranges: Set[str] = set()

    for character in decoded_sequence:
        character_range: Optional[str] = unicode_range(character)

        if character_range is None:
            continue

        ranges.add(character_range)

    return list(ranges)
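# Illustrative example (order not guaranteed, since the result comes from a set;
# range names assume the official Unicode block names in UNICODE_RANGES_COMBINED):
#
#   >>> sorted(range_scan("Héllo"))
#   ['Basic Latin', 'Latin-1 Supplement']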

def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
    if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
        return 0.0

    decoder_a = importlib.import_module(
        "encodings.{}".format(iana_name_a)
    ).IncrementalDecoder
    decoder_b = importlib.import_module(
        "encodings.{}".format(iana_name_b)
    ).IncrementalDecoder

    id_a: IncrementalDecoder = decoder_a(errors="ignore")
    id_b: IncrementalDecoder = decoder_b(errors="ignore")

    character_match_count: int = 0

    # Note: range(255) compares bytes 0x00-0xFE; 0xFF is never tested, and the
    # ratio is taken over 254 rather than the 255 values actually compared.
    for i in range(255):
        to_be_decoded: bytes = bytes([i])
        if id_a.decode(to_be_decoded) == id_b.decode(to_be_decoded):
            character_match_count += 1

    return character_match_count / 254
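# Illustrative expectation (not a guaranteed figure): cp1252 and latin_1 agree
# everywhere outside the 0x80-0x9F window, so their similarity ratio should land
# well above the 0.8 threshold used to build IANA_SUPPORTED_SIMILAR.
#
#   >>> cp_similarity("cp1252", "latin_1") > 0.8
#   True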

def is_cp_similar(iana_name_a: str, iana_name_b: str) -> bool:
    """
    Determine if two code pages are at least 80% similar. The IANA_SUPPORTED_SIMILAR
    dict was generated using the function cp_similarity.
    """
    return (
        iana_name_a in IANA_SUPPORTED_SIMILAR
        and iana_name_b in IANA_SUPPORTED_SIMILAR[iana_name_a]
    )


def set_logging_handler(
    name: str = "charset_normalizer",
    level: int = logging.INFO,
    format_string: str = "%(asctime)s | %(levelname)s | %(message)s",
) -> None:
    logger = logging.getLogger(name)
    logger.setLevel(level)

    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter(format_string))
    logger.addHandler(handler)
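# Illustrative usage: attach a stream handler with the default format at DEBUG
# level. Note that each call adds a fresh handler to the named logger.
#
#   >>> set_logging_handler(level=logging.DEBUG)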

def cut_sequence_chunks(
    sequences: bytes,
    encoding_iana: str,
    offsets: range,
    chunk_size: int,
    bom_or_sig_available: bool,
    strip_sig_or_bom: bool,
    sig_payload: bytes,
    is_multi_byte_decoder: bool,
    decoded_payload: Optional[str] = None,
) -> Generator[str, None, None]:
    if decoded_payload and is_multi_byte_decoder is False:
        for i in offsets:
            chunk = decoded_payload[i : i + chunk_size]
            if not chunk:
                break
            yield chunk
    else:
        for i in offsets:
            chunk_end = i + chunk_size
            if chunk_end > len(sequences) + 8:
                continue

            cut_sequence = sequences[i : i + chunk_size]

            if bom_or_sig_available and strip_sig_or_bom is False:
                cut_sequence = sig_payload + cut_sequence

            chunk = cut_sequence.decode(
                encoding_iana,
                errors="ignore" if is_multi_byte_decoder else "strict",
            )

            # Multi-byte bad-cutting detector and adjustment.
            # Not the cleanest way to perform the fix, but clever enough for now.
            if is_multi_byte_decoder and i > 0:
                chunk_partial_size_chk: int = min(chunk_size, 16)

                if (
                    decoded_payload
                    and chunk[:chunk_partial_size_chk] not in decoded_payload
                ):
                    for j in range(i, i - 4, -1):
                        cut_sequence = sequences[j:chunk_end]

                        if bom_or_sig_available and strip_sig_or_bom is False:
                            cut_sequence = sig_payload + cut_sequence

                        chunk = cut_sequence.decode(encoding_iana, errors="ignore")

                        if chunk[:chunk_partial_size_chk] in decoded_payload:
                            break

            yield chunk
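# Illustrative driver (hypothetical parameters, not part of the original module):
# walk a UTF-8 payload in 64-byte windows. utf_8 is a multi-byte codec, so decode
# errors at window boundaries are ignored, and the bad-cut adjustment above only
# engages when decoded_payload is supplied.
#
#   payload = ("naïve café " * 50).encode("utf_8")
#   for chunk in cut_sequence_chunks(
#       payload,
#       "utf_8",
#       offsets=range(0, len(payload), 64),
#       chunk_size=64,
#       bom_or_sig_available=False,
#       strip_sig_or_bom=False,
#       sig_payload=b"",
#       is_multi_byte_decoder=True,
#   ):
#       ...  # inspect each decoded chunk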