Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/charset_normalizer/utils.py: 26%
218 statements · coverage.py v7.3.2, created at 2023-12-08 06:40 +0000

import importlib
import logging
import unicodedata
from codecs import IncrementalDecoder
from encodings.aliases import aliases
from functools import lru_cache
from re import findall
from typing import Generator, List, Optional, Set, Tuple, Union

from _multibytecodec import MultibyteIncrementalDecoder

from .constant import (
    ENCODING_MARKS,
    IANA_SUPPORTED_SIMILAR,
    RE_POSSIBLE_ENCODING_INDICATION,
    UNICODE_RANGES_COMBINED,
    UNICODE_SECONDARY_RANGE_KEYWORD,
    UTF8_MAXIMAL_ALLOCATION,
)


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_accentuated(character: str) -> bool:
    try:
        description: str = unicodedata.name(character)
    except ValueError:
        return False
    return (
        "WITH GRAVE" in description
        or "WITH ACUTE" in description
        or "WITH CEDILLA" in description
        or "WITH DIAERESIS" in description
        or "WITH CIRCUMFLEX" in description
        or "WITH TILDE" in description
        or "WITH MACRON" in description
        or "WITH RING ABOVE" in description
    )
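# Illustrative usage (added for clarity, not part of the original module). The check
# relies on the official Unicode character name:
#     is_accentuated("é")  # -> True, name is "LATIN SMALL LETTER E WITH ACUTE"
#     is_accentuated("e")  # -> False, no "WITH ..." accent marker in the name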


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def remove_accent(character: str) -> str:
    decomposed: str = unicodedata.decomposition(character)
    if not decomposed:
        return character

    codes: List[str] = decomposed.split(" ")

    return chr(int(codes[0], 16))
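# Example (illustrative): the first code point of the canonical decomposition is kept,
# so remove_accent("é") should return "e" (decomposition "0065 0301"), while a
# character with no decomposition is returned unchanged.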


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def unicode_range(character: str) -> Optional[str]:
    """
    Retrieve the Unicode range official name from a single character.
    """
    character_ord: int = ord(character)

    for range_name, ord_range in UNICODE_RANGES_COMBINED.items():
        if character_ord in ord_range:
            return range_name

    return None
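# Illustrative expectation, assuming UNICODE_RANGES_COMBINED uses the standard
# Unicode block names: unicode_range("é") would return something like
# "Latin-1 Supplement", and unicode_range("あ") something like "Hiragana".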


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_latin(character: str) -> bool:
    try:
        description: str = unicodedata.name(character)
    except ValueError:
        return False
    return "LATIN" in description


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_punctuation(character: str) -> bool:
    character_category: str = unicodedata.category(character)

    if "P" in character_category:
        return True

    character_range: Optional[str] = unicode_range(character)

    if character_range is None:
        return False

    return "Punctuation" in character_range


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_symbol(character: str) -> bool:
    character_category: str = unicodedata.category(character)

    if "S" in character_category or "N" in character_category:
        return True

    character_range: Optional[str] = unicode_range(character)

    if character_range is None:
        return False

    return "Forms" in character_range and character_category != "Lo"


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_emoticon(character: str) -> bool:
    character_range: Optional[str] = unicode_range(character)

    if character_range is None:
        return False

    return "Emoticons" in character_range or "Pictographs" in character_range


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_separator(character: str) -> bool:
    if character.isspace() or character in {"|", "+", "<", ">"}:
        return True

    character_category: str = unicodedata.category(character)

    return "Z" in character_category or character_category in {"Po", "Pd", "Pc"}


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_case_variable(character: str) -> bool:
    return character.islower() != character.isupper()


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_cjk(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "CJK" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hiragana(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "HIRAGANA" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_katakana(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "KATAKANA" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hangul(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "HANGUL" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_thai(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "THAI" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_arabic(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "ARABIC" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_arabic_isolated_form(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "ARABIC" in character_name and "ISOLATED FORM" in character_name


@lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
def is_unicode_range_secondary(range_name: str) -> bool:
    return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)
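# A few illustrative expectations for the classifiers above (not from the original
# sources): is_punctuation(",") and is_separator(" ") should both be True,
# is_cjk("字") and is_hiragana("あ") should be True, while is_latin("字") is False.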


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_unprintable(character: str) -> bool:
    return (
        character.isspace() is False  # includes \n \t \r \v
        and character.isprintable() is False
        and character != "\x1A"  # Why? It's the ASCII substitute character.
        and character != "\ufeff"  # bug discovered in Python: the Zero Width No-Break Space
        # (Arabic Presentation Forms-B, Unicode 1.1) is not acknowledged as a space.
    )
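# Sketch of expected behaviour: is_unprintable("\x00") should be True (not a space,
# not printable), while is_unprintable("\n") and is_unprintable("a") are False;
# "\x1a" and "\ufeff" are explicitly exempted above.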


def any_specified_encoding(sequence: bytes, search_zone: int = 8192) -> Optional[str]:
    """
    Extract any specified encoding from the first n bytes, using an ASCII-only decoder.
    """
    if not isinstance(sequence, bytes):
        raise TypeError

    seq_len: int = len(sequence)

    results: List[str] = findall(
        RE_POSSIBLE_ENCODING_INDICATION,
        sequence[: min(seq_len, search_zone)].decode("ascii", errors="ignore"),
    )

    if len(results) == 0:
        return None

    for specified_encoding in results:
        specified_encoding = specified_encoding.lower().replace("-", "_")

        encoding_alias: str
        encoding_iana: str

        for encoding_alias, encoding_iana in aliases.items():
            if encoding_alias == specified_encoding:
                return encoding_iana
            if encoding_iana == specified_encoding:
                return encoding_iana

    return None
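# Illustrative call (the exact markers recognised depend on
# RE_POSSIBLE_ENCODING_INDICATION): for a payload such as
#     b'<?xml version="1.0" encoding="ISO-8859-1"?>'
# any_specified_encoding() would be expected to return the Python codec name
# "latin_1", resolved through encodings.aliases.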


@lru_cache(maxsize=128)
def is_multi_byte_encoding(name: str) -> bool:
    """
    Verify whether a specific encoding is a multi-byte one, based on its IANA name.
    """
    return name in {
        "utf_8",
        "utf_8_sig",
        "utf_16",
        "utf_16_be",
        "utf_16_le",
        "utf_32",
        "utf_32_le",
        "utf_32_be",
        "utf_7",
    } or issubclass(
        importlib.import_module("encodings.{}".format(name)).IncrementalDecoder,
        MultibyteIncrementalDecoder,
    )
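# Expected results (illustrative): is_multi_byte_encoding("utf_8") is True via the
# hard-coded set; is_multi_byte_encoding("big5") should also be True because the
# encodings.big5 IncrementalDecoder derives from MultibyteIncrementalDecoder,
# whereas is_multi_byte_encoding("cp1252") should be False.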


def identify_sig_or_bom(sequence: bytes) -> Tuple[Optional[str], bytes]:
    """
    Identify and extract a SIG/BOM from the given sequence.
    """
    for iana_encoding in ENCODING_MARKS:
        marks: Union[bytes, List[bytes]] = ENCODING_MARKS[iana_encoding]

        if isinstance(marks, bytes):
            marks = [marks]

        for mark in marks:
            if sequence.startswith(mark):
                return iana_encoding, mark

    return None, b""
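# Illustrative expectation, assuming ENCODING_MARKS maps "utf_8" to the UTF-8 BOM:
#     identify_sig_or_bom(b"\xef\xbb\xbfhello")  # -> ("utf_8", b"\xef\xbb\xbf")
#     identify_sig_or_bom(b"hello")              # -> (None, b"")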


def should_strip_sig_or_bom(iana_encoding: str) -> bool:
    return iana_encoding not in {"utf_16", "utf_32"}


def iana_name(cp_name: str, strict: bool = True) -> str:
    cp_name = cp_name.lower().replace("-", "_")

    encoding_alias: str
    encoding_iana: str

    for encoding_alias, encoding_iana in aliases.items():
        if cp_name in [encoding_alias, encoding_iana]:
            return encoding_iana

    if strict:
        raise ValueError("Unable to retrieve IANA for '{}'".format(cp_name))

    return cp_name
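# Illustrative usage: iana_name("ISO-8859-1") should resolve to "latin_1" through
# encodings.aliases, and iana_name("not-a-codec") raises ValueError unless
# strict=False, in which case the normalised input is returned as-is.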


def range_scan(decoded_sequence: str) -> List[str]:
    ranges: Set[str] = set()

    for character in decoded_sequence:
        character_range: Optional[str] = unicode_range(character)

        if character_range is None:
            continue

        ranges.add(character_range)

    return list(ranges)
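# Example (illustrative): range_scan("abcé") would typically return a list containing
# "Basic Latin" and "Latin-1 Supplement" (order not guaranteed, since a set is used).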


def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
    if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
        return 0.0

    decoder_a = importlib.import_module(
        "encodings.{}".format(iana_name_a)
    ).IncrementalDecoder
    decoder_b = importlib.import_module(
        "encodings.{}".format(iana_name_b)
    ).IncrementalDecoder

    id_a: IncrementalDecoder = decoder_a(errors="ignore")
    id_b: IncrementalDecoder = decoder_b(errors="ignore")

    character_match_count: int = 0

    for i in range(255):
        to_be_decoded: bytes = bytes([i])
        if id_a.decode(to_be_decoded) == id_b.decode(to_be_decoded):
            character_match_count += 1

    return character_match_count / 254
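# Rough expectation (illustrative): cp_similarity("cp1252", "latin_1") should come
# out well above 0.8, since the two single-byte tables only differ in the
# 0x80-0x9F region, while two unrelated code pages score much lower.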


def is_cp_similar(iana_name_a: str, iana_name_b: str) -> bool:
    """
    Determine if two code pages are at least 80% similar. The IANA_SUPPORTED_SIMILAR
    dict was generated using the cp_similarity function.
    """
    return (
        iana_name_a in IANA_SUPPORTED_SIMILAR
        and iana_name_b in IANA_SUPPORTED_SIMILAR[iana_name_a]
    )


def set_logging_handler(
    name: str = "charset_normalizer",
    level: int = logging.INFO,
    format_string: str = "%(asctime)s | %(levelname)s | %(message)s",
) -> None:
    logger = logging.getLogger(name)
    logger.setLevel(level)

    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter(format_string))
    logger.addHandler(handler)
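# Illustrative usage: attach a stream handler to the package logger, e.g.
#     set_logging_handler(level=logging.DEBUG)
# Subsequent charset_normalizer calls would then emit their diagnostics on stderr
# (the StreamHandler default).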


def cut_sequence_chunks(
    sequences: bytes,
    encoding_iana: str,
    offsets: range,
    chunk_size: int,
    bom_or_sig_available: bool,
    strip_sig_or_bom: bool,
    sig_payload: bytes,
    is_multi_byte_decoder: bool,
    decoded_payload: Optional[str] = None,
) -> Generator[str, None, None]:
    if decoded_payload and is_multi_byte_decoder is False:
        for i in offsets:
            chunk = decoded_payload[i : i + chunk_size]
            if not chunk:
                break
            yield chunk
    else:
        for i in offsets:
            chunk_end = i + chunk_size
            if chunk_end > len(sequences) + 8:
                continue

            cut_sequence = sequences[i : i + chunk_size]

            if bom_or_sig_available and strip_sig_or_bom is False:
                cut_sequence = sig_payload + cut_sequence

            chunk = cut_sequence.decode(
                encoding_iana,
                errors="ignore" if is_multi_byte_decoder else "strict",
            )

            # multi-byte bad cutting detector and adjustment
            # not the cleanest way to perform that fix but clever enough for now.
            if is_multi_byte_decoder and i > 0:
                chunk_partial_size_chk: int = min(chunk_size, 16)

                if (
                    decoded_payload
                    and chunk[:chunk_partial_size_chk] not in decoded_payload
                ):
                    for j in range(i, i - 4, -1):
                        cut_sequence = sequences[j:chunk_end]

                        if bom_or_sig_available and strip_sig_or_bom is False:
                            cut_sequence = sig_payload + cut_sequence

                        chunk = cut_sequence.decode(encoding_iana, errors="ignore")

                        if chunk[:chunk_partial_size_chk] in decoded_payload:
                            break

            yield chunk
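# Illustrative call (parameter values chosen for the sketch, not taken from the
# library's internal callers):
#     payload = "hello world ".encode("utf_8") * 64
#     chunks = cut_sequence_chunks(
#         payload, "utf_8", offsets=range(0, len(payload), 64), chunk_size=64,
#         bom_or_sig_available=False, strip_sig_or_bom=False, sig_payload=b"",
#         is_multi_byte_decoder=True,
#     )
#     first = next(chunks)  # first decoded 64-byte window of the payload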