Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/charset_normalizer/utils.py: 26% (218 statements)
coverage.py v7.3.2, created at 2023-12-08 06:40 +0000

import importlib
import logging
import unicodedata
from codecs import IncrementalDecoder
from encodings.aliases import aliases
from functools import lru_cache
from re import findall
from typing import Generator, List, Optional, Set, Tuple, Union

from _multibytecodec import MultibyteIncrementalDecoder

from .constant import (
    ENCODING_MARKS,
    IANA_SUPPORTED_SIMILAR,
    RE_POSSIBLE_ENCODING_INDICATION,
    UNICODE_RANGES_COMBINED,
    UNICODE_SECONDARY_RANGE_KEYWORD,
    UTF8_MAXIMAL_ALLOCATION,
)


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_accentuated(character: str) -> bool:
    try:
        description: str = unicodedata.name(character)
    except ValueError:
        return False
    return (
        "WITH GRAVE" in description
        or "WITH ACUTE" in description
        or "WITH CEDILLA" in description
        or "WITH DIAERESIS" in description
        or "WITH CIRCUMFLEX" in description
        or "WITH TILDE" in description
        or "WITH MACRON" in description
        or "WITH RING ABOVE" in description
    )
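# Illustrative doctest-style example (a sketch, not part of the original
# source): U+00E9 is named "LATIN SMALL LETTER E WITH ACUTE", so it matches
# the "WITH ACUTE" check above.
#   >>> is_accentuated("é")
#   True
#   >>> is_accentuated("e")
#   False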


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def remove_accent(character: str) -> str:
    decomposed: str = unicodedata.decomposition(character)
    if not decomposed:
        return character

    codes: List[str] = decomposed.split(" ")

    return chr(int(codes[0], 16))
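# Illustrative example (a sketch): "é" canonically decomposes to "0065 0301"
# (base letter plus combining acute accent), so only the base code point is kept.
#   >>> remove_accent("é")
#   'e'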


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def unicode_range(character: str) -> Optional[str]:
    """
    Retrieve the official Unicode range name for a single character.
    """
    character_ord: int = ord(character)

    for range_name, ord_range in UNICODE_RANGES_COMBINED.items():
        if character_ord in ord_range:
            return range_name

    return None
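# Illustrative example (assumes "Basic Latin" and "Latin-1 Supplement" are
# among the block names in UNICODE_RANGES_COMBINED):
#   >>> unicode_range("a")
#   'Basic Latin'
#   >>> unicode_range("é")
#   'Latin-1 Supplement'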


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_latin(character: str) -> bool:
    try:
        description: str = unicodedata.name(character)
    except ValueError:
        return False
    return "LATIN" in description
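# Illustrative example: the Unicode name of "a" is "LATIN SMALL LETTER A",
# while Cyrillic character names never contain "LATIN".
#   >>> is_latin("a"), is_latin("Я")
#   (True, False)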


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_punctuation(character: str) -> bool:
    character_category: str = unicodedata.category(character)

    if "P" in character_category:
        return True

    character_range: Optional[str] = unicode_range(character)

    if character_range is None:
        return False

    return "Punctuation" in character_range


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_symbol(character: str) -> bool:
    character_category: str = unicodedata.category(character)

    if "S" in character_category or "N" in character_category:
        return True

    character_range: Optional[str] = unicode_range(character)

    if character_range is None:
        return False

    return "Forms" in character_range and character_category != "Lo"
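# Illustrative examples (based on standard Unicode categories: "," is Po,
# "+" is Sm, "5" is Nd):
#   >>> is_punctuation(",")
#   True
#   >>> is_symbol("+"), is_symbol("5")  # "N" (numeric) categories count too
#   (True, True)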


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_emoticon(character: str) -> bool:
    character_range: Optional[str] = unicode_range(character)

    if character_range is None:
        return False

    return "Emoticons" in character_range or "Pictographs" in character_range
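# Illustrative example (assumes U+1F600 falls in the "Emoticons" block of
# UNICODE_RANGES_COMBINED):
#   >>> is_emoticon("😀")
#   True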


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_separator(character: str) -> bool:
    if character.isspace() or character in {"|", "+", "<", ">"}:
        return True

    character_category: str = unicodedata.category(character)

    return "Z" in character_category or character_category in {"Po", "Pd", "Pc"}
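# Illustrative example: "|" is in the explicit separator set above, while "-"
# is category Pd (dash punctuation).
#   >>> is_separator("|"), is_separator("-")
#   (True, True)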


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_case_variable(character: str) -> bool:
    return character.islower() != character.isupper()
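# Illustrative example (a sketch): a cased letter reports exactly one of
# islower()/isupper() as True, whereas a digit reports both as False.
#   >>> is_case_variable("a"), is_case_variable("1")
#   (True, False)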


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_cjk(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "CJK" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hiragana(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "HIRAGANA" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_katakana(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "KATAKANA" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hangul(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "HANGUL" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_thai(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "THAI" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_arabic(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "ARABIC" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_arabic_isolated_form(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "ARABIC" in character_name and "ISOLATED FORM" in character_name
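# Illustrative examples for the script detectors above (based on standard
# Unicode character names, e.g. U+4E2D is "CJK UNIFIED IDEOGRAPH-4E2D"):
#   >>> is_cjk("中"), is_hiragana("あ"), is_katakana("カ")
#   (True, True, True)
#   >>> is_hangul("한"), is_thai("ก"), is_arabic("ا")
#   (True, True, True)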


@lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
def is_unicode_range_secondary(range_name: str) -> bool:
    return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_unprintable(character: str) -> bool:
    return (
        character.isspace() is False  # includes \n \t \r \v
        and character.isprintable() is False
        and character != "\x1A"  # Why? It's the ASCII substitute character.
        and character != "\ufeff"  # bug discovered in Python: Zero Width
        # No-Break Space, located in Arabic Presentation Forms-B (Unicode 1.1),
        # is not acknowledged as a space.
    )
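# Illustrative example: NUL is neither whitespace nor printable, while "\n"
# is excluded by the isspace() check.
#   >>> is_unprintable("\x00"), is_unprintable("\n")
#   (True, False)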


def any_specified_encoding(sequence: bytes, search_zone: int = 8192) -> Optional[str]:
    """
    Extract, using an ASCII-only decoder, any specified encoding found in the first n bytes.
    """
    if not isinstance(sequence, bytes):
        raise TypeError

    seq_len: int = len(sequence)

    results: List[str] = findall(
        RE_POSSIBLE_ENCODING_INDICATION,
        sequence[: min(seq_len, search_zone)].decode("ascii", errors="ignore"),
    )

    if len(results) == 0:
        return None

    for specified_encoding in results:
        specified_encoding = specified_encoding.lower().replace("-", "_")

        encoding_alias: str
        encoding_iana: str

        for encoding_alias, encoding_iana in aliases.items():
            if encoding_alias == specified_encoding:
                return encoding_iana
            if encoding_iana == specified_encoding:
                return encoding_iana

    return None
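# Illustrative example (assumes RE_POSSIBLE_ENCODING_INDICATION matches
# charset/encoding declarations such as XML prologs):
#   >>> any_specified_encoding(b'<?xml version="1.0" encoding="UTF-8"?>')
#   'utf_8'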


@lru_cache(maxsize=128)
def is_multi_byte_encoding(name: str) -> bool:
    """
    Verify whether a specific encoding is a multi-byte one, based on its IANA name.
    """
    return name in {
        "utf_8",
        "utf_8_sig",
        "utf_16",
        "utf_16_be",
        "utf_16_le",
        "utf_32",
        "utf_32_le",
        "utf_32_be",
        "utf_7",
    } or issubclass(
        importlib.import_module("encodings.{}".format(name)).IncrementalDecoder,
        MultibyteIncrementalDecoder,
    )
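# Illustrative example: "utf_8" hits the literal set above, while a legacy CJK
# codec such as big5 is caught by the MultibyteIncrementalDecoder subclass
# check (an assumption about CPython's codec internals).
#   >>> is_multi_byte_encoding("big5"), is_multi_byte_encoding("ascii")
#   (True, False)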


def identify_sig_or_bom(sequence: bytes) -> Tuple[Optional[str], bytes]:
    """
    Identify and extract a SIG/BOM from a given sequence.
    """

    for iana_encoding in ENCODING_MARKS:
        marks: Union[bytes, List[bytes]] = ENCODING_MARKS[iana_encoding]

        if isinstance(marks, bytes):
            marks = [marks]

        for mark in marks:
            if sequence.startswith(mark):
                return iana_encoding, mark

    return None, b""


def should_strip_sig_or_bom(iana_encoding: str) -> bool:
    return iana_encoding not in {"utf_16", "utf_32"}
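# Illustrative example (assumes ENCODING_MARKS maps "utf_8" to the UTF-8 BOM
# b"\xef\xbb\xbf"):
#   >>> identify_sig_or_bom(b"\xef\xbb\xbfhello")
#   ('utf_8', b'\xef\xbb\xbf')
#   >>> should_strip_sig_or_bom("utf_8"), should_strip_sig_or_bom("utf_16")
#   (True, False)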


def iana_name(cp_name: str, strict: bool = True) -> str:
    cp_name = cp_name.lower().replace("-", "_")

    encoding_alias: str
    encoding_iana: str

    for encoding_alias, encoding_iana in aliases.items():
        if cp_name in [encoding_alias, encoding_iana]:
            return encoding_iana

    if strict:
        raise ValueError("Unable to retrieve IANA for '{}'".format(cp_name))

    return cp_name
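# Illustrative example: input is normalized ("-" to "_", lowercased) before
# being matched against encodings.aliases.
#   >>> iana_name("UTF-8")
#   'utf_8'
#   >>> iana_name("latin-1")
#   'latin_1'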


def range_scan(decoded_sequence: str) -> List[str]:
    ranges: Set[str] = set()

    for character in decoded_sequence:
        character_range: Optional[str] = unicode_range(character)

        if character_range is None:
            continue

        ranges.add(character_range)

    return list(ranges)
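# Illustrative example (the result is set-backed, so its order is not
# guaranteed; the block names are assumptions about UNICODE_RANGES_COMBINED):
#   >>> sorted(range_scan("aé中"))
#   ['Basic Latin', 'CJK Unified Ideographs', 'Latin-1 Supplement']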


def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
    if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
        return 0.0

    decoder_a = importlib.import_module(
        "encodings.{}".format(iana_name_a)
    ).IncrementalDecoder
    decoder_b = importlib.import_module(
        "encodings.{}".format(iana_name_b)
    ).IncrementalDecoder

    id_a: IncrementalDecoder = decoder_a(errors="ignore")
    id_b: IncrementalDecoder = decoder_b(errors="ignore")

    character_match_count: int = 0

    for i in range(255):
        to_be_decoded: bytes = bytes([i])
        if id_a.decode(to_be_decoded) == id_b.decode(to_be_decoded):
            character_match_count += 1

    return character_match_count / 254


def is_cp_similar(iana_name_a: str, iana_name_b: str) -> bool:
    """
    Determine if two code pages are at least 80% similar. The IANA_SUPPORTED_SIMILAR
    dict was generated using the cp_similarity function.
    """
    return (
        iana_name_a in IANA_SUPPORTED_SIMILAR
        and iana_name_b in IANA_SUPPORTED_SIMILAR[iana_name_a]
    )
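# Illustrative example (an assumption about the precomputed table): cp1252 and
# latin_1 disagree only on the 0x80-0x9F block, so they decode well over 80%
# of single bytes identically.
#   >>> cp_similarity("cp1252", "latin_1") > 0.8
#   True
#   >>> is_cp_similar("cp1252", "latin_1")
#   True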


def set_logging_handler(
    name: str = "charset_normalizer",
    level: int = logging.INFO,
    format_string: str = "%(asctime)s | %(levelname)s | %(message)s",
) -> None:
    logger = logging.getLogger(name)
    logger.setLevel(level)

    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter(format_string))
    logger.addHandler(handler)
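# Illustrative usage (a sketch): attach a stderr stream handler and enable
# DEBUG output for the package logger.
#   >>> set_logging_handler(level=logging.DEBUG)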


def cut_sequence_chunks(
    sequences: bytes,
    encoding_iana: str,
    offsets: range,
    chunk_size: int,
    bom_or_sig_available: bool,
    strip_sig_or_bom: bool,
    sig_payload: bytes,
    is_multi_byte_decoder: bool,
    decoded_payload: Optional[str] = None,
) -> Generator[str, None, None]:
    if decoded_payload and is_multi_byte_decoder is False:
        for i in offsets:
            chunk = decoded_payload[i : i + chunk_size]
            if not chunk:
                break
            yield chunk
    else:
        for i in offsets:
            chunk_end = i + chunk_size
            if chunk_end > len(sequences) + 8:
                continue

            cut_sequence = sequences[i : i + chunk_size]

            if bom_or_sig_available and strip_sig_or_bom is False:
                cut_sequence = sig_payload + cut_sequence

            chunk = cut_sequence.decode(
                encoding_iana,
                errors="ignore" if is_multi_byte_decoder else "strict",
            )

            # multi-byte bad-cutting detector and adjustment:
            # not the cleanest way to perform this fix, but clever enough for now
            if is_multi_byte_decoder and i > 0:
                chunk_partial_size_chk: int = min(chunk_size, 16)

                if (
                    decoded_payload
                    and chunk[:chunk_partial_size_chk] not in decoded_payload
                ):
                    for j in range(i, i - 4, -1):
                        cut_sequence = sequences[j:chunk_end]

                        if bom_or_sig_available and strip_sig_or_bom is False:
                            cut_sequence = sig_payload + cut_sequence

                        chunk = cut_sequence.decode(encoding_iana, errors="ignore")

                        if chunk[:chunk_partial_size_chk] in decoded_payload:
                            break

            yield chunk
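# Illustrative usage (a sketch, not from the original source): decode a small
# single-byte payload in 5-byte chunks with no BOM involved.
#   >>> list(
#   ...     cut_sequence_chunks(
#   ...         b"hello world", "ascii", range(0, 11, 5), 5,
#   ...         bom_or_sig_available=False, strip_sig_or_bom=False,
#   ...         sig_payload=b"", is_multi_byte_decoder=False,
#   ...     )
#   ... )
#   ['hello', ' worl', 'd']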