Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/charset_normalizer/cd.py: 97%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

188 statements  

1import importlib 

2from codecs import IncrementalDecoder 

3from collections import Counter 

4from functools import lru_cache 

5from typing import Counter as TypeCounter, Dict, List, Optional, Tuple 

6 

7from .constant import ( 

8 FREQUENCIES, 

9 KO_NAMES, 

10 LANGUAGE_SUPPORTED_COUNT, 

11 TOO_SMALL_SEQUENCE, 

12 ZH_NAMES, 

13) 

14from .md import is_suspiciously_successive_range 

15from .models import CoherenceMatches 

16from .utils import ( 

17 is_accentuated, 

18 is_latin, 

19 is_multi_byte_encoding, 

20 is_unicode_range_secondary, 

21 unicode_range, 

22) 

23 

24 

def encoding_unicode_range(iana_name: str) -> List[str]:
    """
    List the dominant unicode ranges produced by a single-byte code page.

    Each byte value from 0x40 up to (but not including) 0xFF is decoded on its own.
    Decoded characters whose unicode range is known and not flagged as secondary are
    tallied per range. A range is kept when it represents at least 15% of the tally.

    :param iana_name: Codec name; the module ``encodings.<iana_name>`` is imported.
    :return: Sorted list of the retained unicode range names.
    :raises IOError: When the code page is reported as multi-byte.
    """
    if is_multi_byte_encoding(iana_name):
        raise IOError("Function not supported on multi-byte code page")

    codec_module = importlib.import_module("encodings.{}".format(iana_name))
    # Undecodable bytes are dropped silently and simply yield an empty chunk.
    byte_decoder: IncrementalDecoder = codec_module.IncrementalDecoder(errors="ignore")

    range_hits: Dict[str, int] = {}
    total_hits: int = 0

    for byte_value in range(0x40, 0xFF):
        decoded: str = byte_decoder.decode(bytes([byte_value]))

        if not decoded:
            continue

        detected_range: Optional[str] = unicode_range(decoded)

        if detected_range is None:
            continue

        if is_unicode_range_secondary(detected_range) is not False:
            continue

        range_hits[detected_range] = range_hits.get(detected_range, 0) + 1
        total_hits += 1

    # total_hits can only be zero when range_hits is empty, so no division happens then.
    dominant_ranges = [
        range_name
        for range_name, hits in range_hits.items()
        if hits / total_hits >= 0.15
    ]
    dominant_ranges.sort()

    return dominant_ranges

62 

63 

def unicode_range_languages(primary_range: str) -> List[str]:
    """
    Collect the languages whose frequency table touches a given unicode range.

    A language from FREQUENCIES is selected as soon as one of its characters is
    reported by unicode_range() as belonging to ``primary_range``.

    :param primary_range: Unicode range name to look for.
    :return: Matching language names, in FREQUENCIES iteration order.
    """
    return [
        language
        for language, characters in FREQUENCIES.items()
        if any(unicode_range(character) == primary_range for character in characters)
    ]

77 

78 

@lru_cache()
def encoding_languages(iana_name: str) -> List[str]:
    """
    Associate a single-byte encoding with the language(s) it most likely carries.

    The first unicode range of the code page whose name does not contain "Latin" is
    treated as the primary one and mapped to languages. When every range contains
    "Latin" (or there is none), the generic ["Latin Based"] marker is returned.

    :param iana_name: Codec name of a single-byte code page.
    :return: Language names, or ["Latin Based"].
    """
    code_page_ranges: List[str] = encoding_unicode_range(iana_name)

    primary_range: Optional[str] = next(
        (
            candidate_range
            for candidate_range in code_page_ranges
            if "Latin" not in candidate_range
        ),
        None,
    )

    if primary_range is not None:
        return unicode_range_languages(primary_range)

    return ["Latin Based"]

97 

98 

@lru_cache()
def mb_encoding_languages(iana_name: str) -> List[str]:
    """
    Associate a multi-byte encoding with the language it is tied to.

    The decision is made from the codec name alone: Japanese, then Chinese, then
    Korean naming patterns are tried in that order.

    :param iana_name: Codec name of a multi-byte code page.
    :return: A one-item list with the language, or an empty list when nothing matches.
    """
    japanese_prefixes = ("shift_", "iso2022_jp", "euc_j")

    if iana_name.startswith(japanese_prefixes) or iana_name == "cp932":
        return ["Japanese"]

    if iana_name.startswith("gb") or iana_name in ZH_NAMES:
        return ["Chinese"]

    if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
        return ["Korean"]

    return []

118 

119 

@lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
def get_target_features(language: str) -> Tuple[bool, bool]:
    """
    Summarise two traits of a supported language from its frequency table.

    :param language: Key of FREQUENCIES.
    :return: ``(have_accents, pure_latin)`` where the first flag is True when at least
             one character is accentuated, and the second is True unless at least one
             character is reported as non-Latin.
    """
    language_characters = FREQUENCIES[language]

    target_have_accents: bool = any(
        is_accentuated(character) for character in language_characters
    )
    target_pure_latin: bool = not any(
        is_latin(character) is False for character in language_characters
    )

    return target_have_accents, target_pure_latin

135 

136 

def alphabet_languages(
    characters: List[str], ignore_non_latin: bool = False
) -> List[str]:
    """
    Return the languages whose alphabet overlaps enough with the given characters.

    For every language in FREQUENCIES the share of its characters that also appear in
    ``characters`` is computed. Languages reaching a share of 0.2 are returned, best
    share first.

    :param characters: Characters observed in the source text.
    :param ignore_non_latin: When True, languages that are not purely Latin are skipped.
    :return: Language names ordered by decreasing overlap ratio.
    """
    # Built once: membership is probed for every character of every language, and a
    # list lookup would make each probe linear in len(characters).
    source_characters = set(characters)

    source_have_accents = any(is_accentuated(character) for character in characters)

    languages: List[Tuple[str, float]] = []

    for language, language_characters in FREQUENCIES.items():
        target_have_accents, target_pure_latin = get_target_features(language)

        if ignore_non_latin and target_pure_latin is False:
            continue

        # An accent-free language cannot explain a source that contains accents.
        if target_have_accents is False and source_have_accents:
            continue

        character_match_count: int = sum(
            1 for character in language_characters if character in source_characters
        )

        ratio: float = character_match_count / len(language_characters)

        if ratio >= 0.2:
            languages.append((language, ratio))

    # Stable sort: languages with equal ratios keep their FREQUENCIES order.
    languages.sort(key=lambda x: x[1], reverse=True)

    return [language for language, _ in languages]

170 

171 

def characters_popularity_compare(
    language: str, ordered_characters: List[str]
) -> float:
    """
    Score how well a popularity-ordered character list matches a language.

    ``ordered_characters`` goes from the most frequent character to the rarest. The
    result is the share of those characters that are "approved", between 0.
    (no correspondence) and 1. (near perfect fit). The match is deliberately lenient:
    a close rank is enough for approval.

    :param language: Key of FREQUENCIES to compare against.
    :param ordered_characters: Source characters ordered by decreasing occurrence.
    :return: Approved characters divided by the number of source characters;
             0.0 when ``ordered_characters`` is empty.
    :raises ValueError: When ``language`` is not a key of FREQUENCIES.
    """
    if language not in FREQUENCIES:
        raise ValueError("{} not available".format(language))

    ordered_characters_count: int = len(ordered_characters)

    if ordered_characters_count == 0:
        # Nothing to compare; the divisions below would otherwise fail on zero.
        return 0.0

    language_characters: List[str] = FREQUENCIES[language]
    target_language_characters_count: int = len(language_characters)

    # Rank of the first occurrence of each character (same answer as list.index),
    # computed once instead of scanning the list for every source character.
    rank_in_language: Dict[str, int] = {}
    for rank, language_character in enumerate(language_characters):
        rank_in_language.setdefault(language_character, rank)

    large_alphabet: bool = target_language_characters_count > 26

    # Loop-invariant: scales a source rank onto the language's rank axis.
    expected_projection_ratio: float = (
        target_language_characters_count / ordered_characters_count
    )

    character_approved_count: int = 0

    for character_rank, character in enumerate(ordered_characters):
        character_rank_in_language: Optional[int] = rank_in_language.get(character)

        if character_rank_in_language is None:
            continue

        character_rank_projection: int = int(character_rank * expected_projection_ratio)
        rank_gap: int = abs(character_rank_projection - character_rank_in_language)

        if not large_alphabet:
            # Small alphabets: a projected rank more than 4 places off is rejected.
            if rank_gap > 4:
                continue
        elif rank_gap < target_language_characters_count / 3:
            # Large alphabets: a rank within a third of the alphabet is accepted.
            character_approved_count += 1
            continue

        characters_before_source: List[str] = language_characters[
            :character_rank_in_language
        ]
        characters_after_source: List[str] = language_characters[
            character_rank_in_language:
        ]
        characters_before: List[str] = ordered_characters[:character_rank]
        characters_after: List[str] = ordered_characters[character_rank:]

        before_match_count: int = len(
            set(characters_before) & set(characters_before_source)
        )
        after_match_count: int = len(
            set(characters_after) & set(characters_after_source)
        )

        # These two guards also protect the divisions below against empty slices.
        if len(characters_before_source) == 0 and before_match_count <= 4:
            character_approved_count += 1
            continue

        if len(characters_after_source) == 0 and after_match_count <= 4:
            character_approved_count += 1
            continue

        if (
            before_match_count / len(characters_before_source) >= 0.4
            or after_match_count / len(characters_after_source) >= 0.4
        ):
            character_approved_count += 1
            continue

    return character_approved_count / ordered_characters_count

250 

251 

def alpha_unicode_split(decoded_sequence: str) -> List[str]:
    """
    Split a decoded text into layers, one per unicode range / alphabet.

    Only alphabetic characters with a known unicode range are kept, lower-cased.
    Ex. a text containing English/Latin with a bit of Hebrew returns two items:
    one with the latin letters and the other with the hebrew ones.

    :param decoded_sequence: Text to split.
    :return: One string per layer, in order of first appearance of the layer.
    """
    # Pieces are collected in lists and joined once at the end; growing a str with
    # += copies the whole layer on every character.
    layers: Dict[str, List[str]] = {}

    for character in decoded_sequence:
        if character.isalpha() is False:
            continue

        character_range: Optional[str] = unicode_range(character)

        if character_range is None:
            continue

        # Default to the character's own range; reuse the first already-discovered
        # layer that is not a suspicious neighbour of this range.
        layer_target_range: str = character_range

        for discovered_range in layers:
            if (
                is_suspiciously_successive_range(discovered_range, character_range)
                is False
            ):
                layer_target_range = discovered_range
                break

        layers.setdefault(layer_target_range, []).append(character.lower())

    return ["".join(pieces) for pieces in layers.values()]

289 

290 

def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches:
    """
    Merge several coherence_ratio outputs into a single one.

    The ratios reported for one language across all inputs are averaged and rounded to
    four decimals. The return type is the same as coherence_ratio.

    :param results: List of (language, ratio) lists.
    :return: (language, mean ratio) pairs, highest mean first.
    """
    ratios_by_language: Dict[str, List[float]] = {}

    for matches in results:
        for language, ratio in matches:
            ratios_by_language.setdefault(language, []).append(ratio)

    averaged = []

    for language, ratios in ratios_by_language.items():
        mean_ratio = round(sum(ratios) / len(ratios), 4)
        averaged.append((language, mean_ratio))

    averaged.sort(key=lambda entry: entry[1], reverse=True)

    return averaged

317 

318 

def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
    """
    Collapse alternative language entries onto their base language.

    We shall NOT return "English—" in CoherenceMatches because it is an alternative
    of "English". The em-dash is stripped from every language name and, when a name
    shows up more than once, only its best ratio is kept.

    A new list is always returned, so a lone "English—" entry is renamed as well
    rather than being passed through unchanged.

    :param results: (language, ratio) pairs, possibly holding em-dash alternatives.
    :return: (language, best ratio) pairs without em-dash, in order of first appearance.
    """
    best_ratio_by_language: Dict[str, float] = {}

    for language, ratio in results:
        no_em_name: str = language.replace("—", "")

        if (
            no_em_name not in best_ratio_by_language
            or ratio > best_ratio_by_language[no_em_name]
        ):
            best_ratio_by_language[no_em_name] = ratio

    return list(best_ratio_by_language.items())

344 

345 

@lru_cache(maxsize=2048)
def coherence_ratio(
    decoded_sequence: str, threshold: float = 0.1, lg_inclusion: Optional[str] = None
) -> CoherenceMatches:
    """
    Detect ANY language that can be identified in the given sequence.

    The sequence is analysed layer by layer, a layer being the characters extracted for
    one alphabet/range. Layers with no more than TOO_SMALL_SEQUENCE characters are
    skipped.

    :param decoded_sequence: Text to analyse.
    :param threshold: Minimal ratio a language must reach to be reported.
    :param lg_inclusion: Optional comma separated language names to test instead of
                         guessing them from the alphabet. The special entry
                         "Latin Based" is not a language: it restricts the guess to
                         purely Latin languages.
    :return: (language, ratio) pairs, best ratio first, ratios rounded to 4 decimals.
    """
    forced_languages: List[str] = (
        [] if lg_inclusion is None else lg_inclusion.split(",")
    )

    ignore_non_latin: bool = "Latin Based" in forced_languages
    if ignore_non_latin:
        forced_languages.remove("Latin Based")

    matches: List[Tuple[str, float]] = []
    strong_match_count: int = 0

    for layer in alpha_unicode_split(decoded_sequence):
        ranked_characters = Counter(layer).most_common()

        layer_size: int = sum(occurrences for _, occurrences in ranked_characters)

        if layer_size <= TOO_SMALL_SEQUENCE:
            continue

        popular_character_ordered: List[str] = [
            character for character, _ in ranked_characters
        ]

        candidate_languages = forced_languages or alphabet_languages(
            popular_character_ordered, ignore_non_latin
        )

        for language in candidate_languages:
            ratio: float = characters_popularity_compare(
                language, popular_character_ordered
            )

            if ratio < threshold:
                continue

            if ratio >= 0.8:
                strong_match_count += 1

            matches.append((language, round(ratio, 4)))

            # Only the current layer is abandoned; remaining layers are still visited.
            if strong_match_count >= 3:
                break

    deduplicated = filter_alt_coherence_matches(matches)

    return sorted(deduplicated, key=lambda x: x[1], reverse=True)