Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/charset_normalizer/cd.py: 97%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

220 statements  

1from __future__ import annotations 

2 

3import importlib 

4from codecs import IncrementalDecoder 

5from collections import Counter 

6from functools import lru_cache 

7from typing import Counter as TypeCounter 

8 

9from .constant import ( 

10 FREQUENCIES, 

11 KO_NAMES, 

12 LANGUAGE_SUPPORTED_COUNT, 

13 TOO_SMALL_SEQUENCE, 

14 ZH_NAMES, 

15 _FREQUENCIES_SET, 

16 _FREQUENCIES_RANK, 

17) 

18from .md import is_suspiciously_successive_range 

19from .models import CoherenceMatches 

20from .utils import ( 

21 is_accentuated, 

22 is_latin, 

23 is_multi_byte_encoding, 

24 is_unicode_range_secondary, 

25 unicode_range, 

26) 

27 

28 

def encoding_unicode_range(iana_name: str) -> list[str]:
    """
    Return associated unicode ranges in a single byte code page.

    :param iana_name: IANA name of a single-byte encoding (e.g. "cp1252").
    :return: Sorted list of unicode range names that cover at least 15% of
        the non-secondary decodable characters of the code page.
    :raises OSError: If the given encoding is multi-byte.
    """
    if is_multi_byte_encoding(iana_name):
        raise OSError(  # Defensive:
            "Function not supported on multi-byte code page"
        )

    decoder = importlib.import_module(f"encodings.{iana_name}").IncrementalDecoder

    p: IncrementalDecoder = decoder(errors="ignore")
    seen_ranges: dict[str, int] = {}
    character_count: int = 0

    # Probe the upper half of the code page (0x40-0xFE) one byte at a time;
    # undecodable bytes yield an empty chunk because errors="ignore".
    for i in range(0x40, 0xFF):
        chunk: str = p.decode(bytes([i]))

        if chunk:
            character_range: str | None = unicode_range(chunk)

            if character_range is None:
                continue

            # Only count characters from primary (non-secondary) ranges.
            if is_unicode_range_secondary(character_range) is False:
                if character_range not in seen_ranges:
                    seen_ranges[character_range] = 0
                seen_ranges[character_range] += 1
                character_count += 1

    # Robustness: if no primary-range character decoded at all, the ratio
    # below would raise ZeroDivisionError. Return an empty list instead.
    if character_count == 0:
        return []

    return sorted(
        [
            character_range
            for character_range in seen_ranges
            if seen_ranges[character_range] / character_count >= 0.15
        ]
    )

66 

67 

def unicode_range_languages(primary_range: str) -> list[str]:
    """
    Return inferred languages used with a unicode range.

    A language qualifies as soon as one of its frequent characters
    belongs to the given range.
    """
    # `any` short-circuits exactly like the original inner loop's `break`.
    return [
        language
        for language, characters in FREQUENCIES.items()
        if any(unicode_range(character) == primary_range for character in characters)
    ]

81 

82 

@lru_cache()
def encoding_languages(iana_name: str) -> list[str]:
    """
    Single-byte encoding language association. Some code page are heavily linked to particular language(s).
    This function does the correspondence.
    """
    discovered_ranges: list[str] = encoding_unicode_range(iana_name)

    # The first non-Latin range (if any) drives the language inference.
    primary_range: str | None = next(
        (candidate for candidate in discovered_ranges if "Latin" not in candidate),
        None,
    )

    if primary_range is None:
        return ["Latin Based"]

    return unicode_range_languages(primary_range)

101 

102 

103@lru_cache() 

104def mb_encoding_languages(iana_name: str) -> list[str]: 

105 """ 

106 Multi-byte encoding language association. Some code page are heavily linked to particular language(s). 

107 This function does the correspondence. 

108 """ 

109 if ( 

110 iana_name.startswith("shift_") 

111 or iana_name.startswith("iso2022_jp") 

112 or iana_name.startswith("euc_j") 

113 or iana_name == "cp932" 

114 ): 

115 return ["Japanese"] 

116 if iana_name.startswith("gb") or iana_name in ZH_NAMES: 

117 return ["Chinese"] 

118 if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES: 

119 return ["Korean"] 

120 

121 return [] 

122 

123 

@lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
def get_target_features(language: str) -> tuple[bool, bool]:
    """
    Determine main aspects from a supported language if it contains accents and if is pure Latin.

    :return: Pair of (has_accentuated_characters, is_pure_latin).
    """
    frequent_characters = FREQUENCIES[language]

    has_accents: bool = any(is_accentuated(c) for c in frequent_characters)
    pure_latin: bool = all(is_latin(c) for c in frequent_characters)

    return has_accents, pure_latin

139 

140 

def alphabet_languages(
    characters: list[str], ignore_non_latin: bool = False
) -> list[str]:
    """
    Return associated languages associated to given characters.

    Languages are ranked by the fraction of their frequent characters
    present in the input (only ratios >= 0.2 qualify).
    """
    scored_languages: list[tuple[str, float]] = []

    input_set: frozenset[str] = frozenset(characters)
    source_have_accents: bool = any(is_accentuated(c) for c in characters)

    for language, language_characters in FREQUENCIES.items():
        target_have_accents, target_pure_latin = get_target_features(language)

        # Skip non-Latin languages when the caller restricts to Latin-based.
        if ignore_non_latin and not target_pure_latin:
            continue

        # An accent-free language cannot explain accentuated input.
        if not target_have_accents and source_have_accents:
            continue

        overlap: int = len(_FREQUENCIES_SET[language] & input_set)
        match_ratio: float = overlap / len(language_characters)

        if match_ratio >= 0.2:
            scored_languages.append((language, match_ratio))

    scored_languages.sort(key=lambda entry: entry[1], reverse=True)

    return [name for name, _ in scored_languages]

173 

174 

def characters_popularity_compare(
    language: str, ordered_characters: list[str]
) -> float:
    """
    Determine if a ordered characters list (by occurrence from most appearance to rarest) match a particular language.
    The result is a ratio between 0. (absolutely no correspondence) and 1. (near perfect fit).
    Beware that is function is not strict on the match in order to ease the detection. (Meaning close match is 1.)

    :param language: A key of FREQUENCIES; raises ValueError otherwise.
    :param ordered_characters: Characters sorted from most to least frequent
        in the analysed text.
    :raises ValueError: If the language is not present in FREQUENCIES.
    """
    if language not in FREQUENCIES:
        raise ValueError(f"{language} not available")  # Defensive:

    character_approved_count: int = 0
    frequencies_language_set: frozenset[str] = _FREQUENCIES_SET[language]
    lang_rank: dict[str, int] = _FREQUENCIES_RANK[language]

    ordered_characters_count: int = len(ordered_characters)
    target_language_characters_count: int = len(FREQUENCIES[language])

    # Languages with more than 26 frequent characters get a looser,
    # projection-based acceptance test (see the large_alphabet branch below).
    large_alphabet: bool = target_language_characters_count > 26

    # Scale factor mapping a rank in the input ordering onto the language's
    # (possibly longer or shorter) frequency table.
    expected_projection_ratio: float = (
        target_language_characters_count / ordered_characters_count
    )

    # Pre-built rank dict for ordered_characters (avoids repeated list slicing).
    ordered_rank: dict[str, int] = {
        char: rank for rank, char in enumerate(ordered_characters)
    }

    # Pre-compute characters common to both orderings.
    # Avoids repeated `c in ordered_rank` dict lookups in the inner counts.
    common_chars: list[tuple[int, int]] = [
        (lr, ordered_rank[c]) for c, lr in lang_rank.items() if c in ordered_rank
    ]

    # Pre-extract lr and orr arrays for faster iteration in the inner loop.
    # Plain integer loops with local arrays are much faster under mypyc than
    # generator expression sums over a list of tuples.
    common_count: int = len(common_chars)
    common_lr: list[int] = [p[0] for p in common_chars]
    common_orr: list[int] = [p[1] for p in common_chars]

    for character, character_rank in zip(
        ordered_characters, range(0, ordered_characters_count)
    ):
        # Characters unknown to the language's frequency table cannot vote.
        if character not in frequencies_language_set:
            continue

        character_rank_in_language: int = lang_rank[character]
        character_rank_projection: int = int(character_rank * expected_projection_ratio)

        # Small alphabets: reject outright if the projected rank is more than
        # 4 positions away from the language's rank for this character.
        if (
            large_alphabet is False
            and abs(character_rank_projection - character_rank_in_language) > 4
        ):
            continue

        # Large alphabets: accept directly when the projection lands within a
        # third of the table; ordering noise is expected for big tables.
        if (
            large_alphabet is True
            and abs(character_rank_projection - character_rank_in_language)
            < target_language_characters_count / 3
        ):
            character_approved_count += 1
            continue

        # Count how many characters appear "before" in both orderings,
        # and how many appear "at or after" in both orderings.
        # Single pass over pre-extracted arrays — much faster under mypyc
        # than two generator expression sums.
        before_match_count: int = 0
        after_match_count: int = 0
        for i in range(common_count):
            lr_i: int = common_lr[i]
            orr_i: int = common_orr[i]
            if lr_i < character_rank_in_language:
                if orr_i < character_rank:
                    before_match_count += 1
            else:
                if orr_i >= character_rank:
                    after_match_count += 1

        # Number of language-table entries ranked at or after this character.
        after_len: int = target_language_characters_count - character_rank_in_language

        # Edge case: most frequent character of the language — approve when
        # few characters wrongly precede it in the input ordering.
        if character_rank_in_language == 0 and before_match_count <= 4:
            character_approved_count += 1
            continue

        # Edge case: nothing ranked after in the language table.
        if after_len == 0 and after_match_count <= 4:
            character_approved_count += 1
            continue

        # General case: approve when at least 40% of the neighbourhood on
        # either side agrees between the two orderings.
        if (
            character_rank_in_language > 0
            and before_match_count / character_rank_in_language >= 0.4
        ) or (after_len > 0 and after_match_count / after_len >= 0.4):
            character_approved_count += 1
            continue

    return character_approved_count / len(ordered_characters)

274 

275 

def alpha_unicode_split(decoded_sequence: str) -> list[str]:
    """
    Given a decoded text sequence, return a list of str. Unicode range / alphabet separation.
    Ex. a text containing English/Latin with a bit a Hebrew will return two items in the resulting list;
    One containing the latin letters and the other hebrew.

    :param decoded_sequence: Any decoded text; non-alphabetic characters are dropped.
    :return: One lowercased string per detected layer (range grouping).
    """
    # Maps a representative range name to the characters assigned to it.
    layers: dict[str, list[str]] = {}

    # Fast path: track single-layer key to skip dict iteration for single-script text.
    single_layer_key: str | None = None
    multi_layer: bool = False

    # Cache the last character_range and its resolved layer to avoid repeated
    # is_suspiciously_successive_range calls for consecutive same-range chars.
    prev_character_range: str | None = None
    prev_layer_target: str | None = None

    for character in decoded_sequence:
        # Only alphabetic characters participate in layer separation.
        if character.isalpha() is False:
            continue

        # ASCII fast-path: a-z and A-Z are always "Basic Latin".
        # Avoids unicode_range() function call overhead for the most common case.
        character_ord: int = ord(character)
        if character_ord < 128:
            character_range: str | None = "Basic Latin"
        else:
            character_range = unicode_range(character)

        if character_range is None:
            continue

        # Fast path: same range as previous character → reuse cached layer target.
        if character_range == prev_character_range:
            if prev_layer_target is not None:
                layers[prev_layer_target].append(character)
                continue

        layer_target_range: str | None = None

        # Try to merge this character into an existing, compatible layer:
        # a range that is NOT "suspiciously successive" with the current one.
        if multi_layer:
            for discovered_range in layers:
                if (
                    is_suspiciously_successive_range(discovered_range, character_range)
                    is False
                ):
                    layer_target_range = discovered_range
                    break
        elif single_layer_key is not None:
            # Single-layer shortcut: only one candidate to check.
            if (
                is_suspiciously_successive_range(single_layer_key, character_range)
                is False
            ):
                layer_target_range = single_layer_key

        # No compatible layer found → this range starts (or names) its own layer.
        if layer_target_range is None:
            layer_target_range = character_range

        if layer_target_range not in layers:
            layers[layer_target_range] = []
            if single_layer_key is None:
                single_layer_key = layer_target_range
            else:
                # A second distinct layer appeared; switch to the multi-layer scan.
                multi_layer = True

        layers[layer_target_range].append(character)

        # Cache for next iteration
        prev_character_range = character_range
        prev_layer_target = layer_target_range

    return ["".join(chars).lower() for chars in layers.values()]

348 

349 

def merge_coherence_ratios(results: list[CoherenceMatches]) -> CoherenceMatches:
    """
    This function merge results previously given by the function coherence_ratio.
    The return type is the same as coherence_ratio.

    Each language's final ratio is the arithmetic mean of all its observed
    ratios, rounded to 4 decimal places.
    """
    per_language: dict[str, list[float]] = {}

    for result in results:
        for language, ratio in result:
            per_language.setdefault(language, []).append(ratio)

    averaged = [
        (language, round(sum(ratios) / len(ratios), 4))
        for language, ratios in per_language.items()
    ]

    return sorted(averaged, key=lambda entry: entry[1], reverse=True)

376 

377 

def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
    """
    We shall NOT return "English—" in CoherenceMatches because it is an alternative
    of "English". This function only keeps the best match and remove the em-dash in it.
    """
    grouped: dict[str, list[float]] = {}

    for language, ratio in results:
        canonical_name: str = language.replace("—", "")
        grouped.setdefault(canonical_name, []).append(ratio)

    # No language had both a base and an alternative form → nothing to merge.
    if all(len(ratios) == 1 for ratios in grouped.values()):
        return results

    # Keep the best ratio for each canonical (em-dash-free) name.
    return [(language, max(ratios)) for language, ratios in grouped.items()]

403 

404 

@lru_cache(maxsize=2048)
def coherence_ratio(
    decoded_sequence: str, threshold: float = 0.1, lg_inclusion: str | None = None
) -> CoherenceMatches:
    """
    Detect ANY language that can be identified in given sequence. The sequence will be analysed by layers.
    A layer = Character extraction by alphabets/ranges.

    :param decoded_sequence: Text to analyse.
    :param threshold: Minimum ratio for a language to be reported.
    :param lg_inclusion: Optional comma-separated list restricting candidate languages.
    """
    results: list[tuple[str, float]] = []
    sufficient_match_count: int = 0

    lg_inclusion_list: list[str] = (
        lg_inclusion.split(",") if lg_inclusion is not None else []
    )

    # "Latin Based" is a pseudo-language: it flags Latin-only filtering
    # rather than naming a concrete candidate.
    ignore_non_latin: bool = "Latin Based" in lg_inclusion_list
    if ignore_non_latin:
        lg_inclusion_list.remove("Latin Based")

    for layer in alpha_unicode_split(decoded_sequence):
        # Too little material in this layer to draw any conclusion.
        if len(layer) <= TOO_SMALL_SEQUENCE:
            continue

        occurrence_ordered = Counter(layer).most_common()
        popular_character_ordered: list[str] = [
            character for character, _ in occurrence_ordered
        ]

        candidates = lg_inclusion_list or alphabet_languages(
            popular_character_ordered, ignore_non_latin
        )

        for language in candidates:
            ratio: float = characters_popularity_compare(
                language, popular_character_ordered
            )

            if ratio < threshold:
                continue
            if ratio >= 0.8:
                sufficient_match_count += 1

            results.append((language, round(ratio, 4)))

            # Enough strong matches: stop scanning languages for this layer.
            if sufficient_match_count >= 3:
                break

    return sorted(
        filter_alt_coherence_matches(results), key=lambda x: x[1], reverse=True
    )