# charset_normalizer/cd.py

import importlib
from codecs import IncrementalDecoder
from collections import Counter
from functools import lru_cache
from typing import Counter as TypeCounter, Dict, List, Optional, Tuple

from .assets import FREQUENCIES
from .constant import KO_NAMES, LANGUAGE_SUPPORTED_COUNT, TOO_SMALL_SEQUENCE, ZH_NAMES
from .md import is_suspiciously_successive_range
from .models import CoherenceMatches
from .utils import (
    is_accentuated,
    is_latin,
    is_multi_byte_encoding,
    is_unicode_range_secondary,
    unicode_range,
)


def encoding_unicode_range(iana_name: str) -> List[str]:
    """
    Return associated unicode ranges in a single byte code page.
    """
    if is_multi_byte_encoding(iana_name):
        raise IOError("Function not supported on multi-byte code page")

    decoder = importlib.import_module(
        "encodings.{}".format(iana_name)
    ).IncrementalDecoder

    p: IncrementalDecoder = decoder(errors="ignore")
    seen_ranges: Dict[str, int] = {}
    character_count: int = 0

    for i in range(0x40, 0xFF):
        chunk: str = p.decode(bytes([i]))

        if chunk:
            character_range: Optional[str] = unicode_range(chunk)

            if character_range is None:
                continue

            if is_unicode_range_secondary(character_range) is False:
                if character_range not in seen_ranges:
                    seen_ranges[character_range] = 0
                seen_ranges[character_range] += 1
                character_count += 1

    return sorted(
        [
            character_range
            for character_range in seen_ranges
            if seen_ranges[character_range] / character_count >= 0.15
        ]
    )
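
# Illustrative sketch (not from the original file): for a Cyrillic code page
# such as cp1251, the bytes 0x40-0xFE decode mostly to "Basic Latin" and
# "Cyrillic" characters, so one would expect something like:
#
#     >>> encoding_unicode_range("cp1251")
#     ['Basic Latin', 'Cyrillic']
#
# The exact list depends on how unicode_range() and
# is_unicode_range_secondary() classify each decoded character.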


def unicode_range_languages(primary_range: str) -> List[str]:
    """
    Return inferred languages used with a unicode range.
    """
    languages: List[str] = []

    for language, characters in FREQUENCIES.items():
        for character in characters:
            if unicode_range(character) == primary_range:
                languages.append(language)
                break

    return languages


@lru_cache()
def encoding_languages(iana_name: str) -> List[str]:
    """
    Single-byte encoding language association. Some code pages are heavily
    linked to particular language(s); this function makes that correspondence.
    """
    unicode_ranges: List[str] = encoding_unicode_range(iana_name)
    primary_range: Optional[str] = None

    for specified_range in unicode_ranges:
        if "Latin" not in specified_range:
            primary_range = specified_range
            break

    if primary_range is None:
        return ["Latin Based"]

    return unicode_range_languages(primary_range)
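
# Illustrative sketch (not from the original file): continuing the cp1251
# example, the first non-Latin range ("Cyrillic") becomes the primary range,
# so encoding_languages("cp1251") would return the Cyrillic-script languages
# present in FREQUENCIES (e.g. Russian, Bulgarian; the exact list depends on
# that asset), while a purely Latin code page such as ascii has no non-Latin
# range and falls through to ["Latin Based"].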


@lru_cache()
def mb_encoding_languages(iana_name: str) -> List[str]:
    """
    Multi-byte encoding language association. Some code pages are heavily
    linked to particular language(s); this function makes that correspondence.
    """
    if (
        iana_name.startswith("shift_")
        or iana_name.startswith("iso2022_jp")
        or iana_name.startswith("euc_j")
        or iana_name == "cp932"
    ):
        return ["Japanese"]
    if iana_name.startswith("gb") or iana_name in ZH_NAMES:
        return ["Chinese"]
    if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
        return ["Korean"]

    return []
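
# Example (follows directly from the prefix checks above):
#
#     >>> mb_encoding_languages("shift_jis")
#     ['Japanese']
#     >>> mb_encoding_languages("gb18030")
#     ['Chinese']
#     >>> mb_encoding_languages("utf_8")
#     []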


@lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
def get_target_features(language: str) -> Tuple[bool, bool]:
    """
    Determine two main aspects of a supported language: whether it uses
    accented characters and whether it is purely Latin-based.
    """
    target_have_accents: bool = False
    target_pure_latin: bool = True

    for character in FREQUENCIES[language]:
        if not target_have_accents and is_accentuated(character):
            target_have_accents = True
        if target_pure_latin and is_latin(character) is False:
            target_pure_latin = False

    return target_have_accents, target_pure_latin
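
# Illustrative sketch (outputs depend on the FREQUENCIES asset): English is
# typically listed with unaccented Latin letters only, while French mixes in
# accented ones, so one would expect roughly:
#
#     >>> get_target_features("English")
#     (False, True)
#     >>> get_target_features("French")
#     (True, True)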


def alphabet_languages(
    characters: List[str], ignore_non_latin: bool = False
) -> List[str]:
    """
    Return the languages associated with the given characters.
    """
    languages: List[Tuple[str, float]] = []

    source_have_accents = any(is_accentuated(character) for character in characters)

    for language, language_characters in FREQUENCIES.items():
        target_have_accents, target_pure_latin = get_target_features(language)

        if ignore_non_latin and target_pure_latin is False:
            continue

        if target_have_accents is False and source_have_accents:
            continue

        character_count: int = len(language_characters)

        character_match_count: int = len(
            [c for c in language_characters if c in characters]
        )

        ratio: float = character_match_count / character_count

        if ratio >= 0.2:
            languages.append((language, ratio))

    languages = sorted(languages, key=lambda x: x[1], reverse=True)

    return [compatible_language[0] for compatible_language in languages]
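
# Illustrative sketch (not from the original file): a language is shortlisted
# when at least 20% of its frequency table appears among the input characters,
# so a handful of common letters is enough to match several Latin languages:
#
#     >>> alphabet_languages(list("etaoinshrdlu"))  # hypothetical input
#     ['English', 'French', ...]                    # ordered by match ratio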


def characters_popularity_compare(
    language: str, ordered_characters: List[str]
) -> float:
    """
    Determine if an ordered character list (by occurrence, from most frequent
    to rarest) matches a particular language.
    The result is a ratio between 0. (absolutely no correspondence) and 1.
    (near perfect fit).
    Beware that this function is not strict on the match in order to ease the
    detection. (Meaning a close match counts as 1.)
    """
    if language not in FREQUENCIES:
        raise ValueError("{} not available".format(language))

    character_approved_count: int = 0
    FREQUENCIES_language_set = set(FREQUENCIES[language])

    ordered_characters_count: int = len(ordered_characters)
    target_language_characters_count: int = len(FREQUENCIES[language])

    large_alphabet: bool = target_language_characters_count > 26

    for character, character_rank in zip(
        ordered_characters, range(0, ordered_characters_count)
    ):
        if character not in FREQUENCIES_language_set:
            continue

        character_rank_in_language: int = FREQUENCIES[language].index(character)
        expected_projection_ratio: float = (
            target_language_characters_count / ordered_characters_count
        )
        character_rank_projection: int = int(character_rank * expected_projection_ratio)

        if (
            large_alphabet is False
            and abs(character_rank_projection - character_rank_in_language) > 4
        ):
            continue

        if (
            large_alphabet is True
            and abs(character_rank_projection - character_rank_in_language)
            < target_language_characters_count / 3
        ):
            character_approved_count += 1
            continue

        characters_before_source: List[str] = FREQUENCIES[language][
            0:character_rank_in_language
        ]
        characters_after_source: List[str] = FREQUENCIES[language][
            character_rank_in_language:
        ]
        characters_before: List[str] = ordered_characters[0:character_rank]
        characters_after: List[str] = ordered_characters[character_rank:]

        before_match_count: int = len(
            set(characters_before) & set(characters_before_source)
        )

        after_match_count: int = len(
            set(characters_after) & set(characters_after_source)
        )

        if len(characters_before_source) == 0 and before_match_count <= 4:
            character_approved_count += 1
            continue

        if len(characters_after_source) == 0 and after_match_count <= 4:
            character_approved_count += 1
            continue

        if (
            before_match_count / len(characters_before_source) >= 0.4
            or after_match_count / len(characters_after_source) >= 0.4
        ):
            character_approved_count += 1
            continue

    return character_approved_count / len(ordered_characters)
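
# Illustrative arithmetic (not from the original file): with a 26-letter
# target language and 13 observed characters, expected_projection_ratio is
# 26 / 13 = 2.0, so the character observed at rank 5 is projected to rank
# int(5 * 2.0) = 10 in the language table and, for a small alphabet, is only
# examined further when its true rank lies within 4 positions of that
# projection.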


def alpha_unicode_split(decoded_sequence: str) -> List[str]:
    """
    Given a decoded text sequence, return a list of str. Unicode range /
    alphabet separation.
    Ex. a text containing English/Latin with a bit of Hebrew will return two
    items in the resulting list; one containing the Latin letters and the
    other the Hebrew ones.
    """
    layers: Dict[str, str] = {}

    for character in decoded_sequence:
        if character.isalpha() is False:
            continue

        character_range: Optional[str] = unicode_range(character)

        if character_range is None:
            continue

        layer_target_range: Optional[str] = None

        for discovered_range in layers:
            if (
                is_suspiciously_successive_range(discovered_range, character_range)
                is False
            ):
                layer_target_range = discovered_range
                break

        if layer_target_range is None:
            layer_target_range = character_range

        if layer_target_range not in layers:
            layers[layer_target_range] = character.lower()
            continue

        layers[layer_target_range] += character.lower()

    return list(layers.values())
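
# Illustrative sketch of the docstring example (exact behaviour depends on the
# range tables behind unicode_range() and is_suspiciously_successive_range()):
# Latin and Hebrew are treated as suspicious neighbours, so they land in
# separate lowercased layers:
#
#     >>> alpha_unicode_split("Hello שלום")
#     ['hello', 'שלום']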


def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches:
    """
    This function merges results previously given by the function
    coherence_ratio. The return type is the same as coherence_ratio.
    """
    per_language_ratios: Dict[str, List[float]] = {}
    for result in results:
        for sub_result in result:
            language, ratio = sub_result
            if language not in per_language_ratios:
                per_language_ratios[language] = [ratio]
                continue
            per_language_ratios[language].append(ratio)

    merge = [
        (
            language,
            round(
                sum(per_language_ratios[language]) / len(per_language_ratios[language]),
                4,
            ),
        )
        for language in per_language_ratios
    ]

    return sorted(merge, key=lambda x: x[1], reverse=True)
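
# Example (follows directly from the averaging above): ratios for the same
# language are averaged across the result lists, then sorted descending:
#
#     >>> merge_coherence_ratios([[("English", 0.9)], [("English", 0.7), ("French", 0.5)]])
#     [('English', 0.8), ('French', 0.5)]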


def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
    """
    We shall NOT return "English—" in CoherenceMatches because it is an
    alternative of "English". This function only keeps the best match and
    removes the em-dash from it.
    """
    index_results: Dict[str, List[float]] = dict()

    for result in results:
        language, ratio = result
        no_em_name: str = language.replace("—", "")

        if no_em_name not in index_results:
            index_results[no_em_name] = []

        index_results[no_em_name].append(ratio)

    if any(len(index_results[e]) > 1 for e in index_results):
        filtered_results: CoherenceMatches = []

        for language in index_results:
            filtered_results.append((language, max(index_results[language])))

        return filtered_results

    return results
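
# Example (follows directly from the de-duplication above): the em-dash
# variant collapses into its base language, keeping the best ratio:
#
#     >>> filter_alt_coherence_matches([("English", 0.8), ("English—", 0.9)])
#     [('English', 0.9)]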


@lru_cache(maxsize=2048)
def coherence_ratio(
    decoded_sequence: str, threshold: float = 0.1, lg_inclusion: Optional[str] = None
) -> CoherenceMatches:
    """
    Detect ANY language that can be identified in the given sequence. The
    sequence will be analysed by layers.
    A layer = character extraction by alphabets/ranges.
    """

    results: List[Tuple[str, float]] = []
    ignore_non_latin: bool = False

    sufficient_match_count: int = 0

    lg_inclusion_list = lg_inclusion.split(",") if lg_inclusion is not None else []
    if "Latin Based" in lg_inclusion_list:
        ignore_non_latin = True
        lg_inclusion_list.remove("Latin Based")

    for layer in alpha_unicode_split(decoded_sequence):
        sequence_frequencies: TypeCounter[str] = Counter(layer)
        most_common = sequence_frequencies.most_common()

        character_count: int = sum(o for c, o in most_common)

        if character_count <= TOO_SMALL_SEQUENCE:
            continue

        popular_character_ordered: List[str] = [c for c, o in most_common]

        for language in lg_inclusion_list or alphabet_languages(
            popular_character_ordered, ignore_non_latin
        ):
            ratio: float = characters_popularity_compare(
                language, popular_character_ordered
            )

            if ratio < threshold:
                continue
            elif ratio >= 0.8:
                sufficient_match_count += 1

            results.append((language, round(ratio, 4)))

            if sufficient_match_count >= 3:
                break

    return sorted(
        filter_alt_coherence_matches(results), key=lambda x: x[1], reverse=True
    )
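
# Illustrative usage (not from the original file): given a sufficiently long
# decoded text, the result is a descending list of (language, ratio) pairs:
#
#     >>> coherence_ratio("Le français est une langue indo-européenne de la famille des langues romanes.")
#     [('French', 0.66), ...]   # hypothetical ratios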