from __future__ import annotations

import importlib
from codecs import IncrementalDecoder
from collections import Counter
from functools import lru_cache
from typing import Counter as TypeCounter

from .constant import (
    FREQUENCIES,
    KO_NAMES,
    LANGUAGE_SUPPORTED_COUNT,
    TOO_SMALL_SEQUENCE,
    ZH_NAMES,
)
from .md import is_suspiciously_successive_range
from .models import CoherenceMatches
from .utils import (
    is_accentuated,
    is_latin,
    is_multi_byte_encoding,
    is_unicode_range_secondary,
    unicode_range,
)


def encoding_unicode_range(iana_name: str) -> list[str]:
    """
    Return the Unicode ranges associated with a single-byte code page.
    """
    if is_multi_byte_encoding(iana_name):
        raise OSError("Function not supported on multi-byte code page")

    decoder = importlib.import_module(f"encodings.{iana_name}").IncrementalDecoder

    p: IncrementalDecoder = decoder(errors="ignore")
    seen_ranges: dict[str, int] = {}
    character_count: int = 0

    # Walk byte values 0x40-0xFE and decode them one at a time.
    for i in range(0x40, 0xFF):
        chunk: str = p.decode(bytes([i]))

        if chunk:
            character_range: str | None = unicode_range(chunk)

            if character_range is None:
                continue

            if is_unicode_range_secondary(character_range) is False:
                if character_range not in seen_ranges:
                    seen_ranges[character_range] = 0
                seen_ranges[character_range] += 1
                character_count += 1

    # Keep only ranges that account for at least 15% of the decoded,
    # non-secondary characters.
    return sorted(
        [
            character_range
            for character_range in seen_ranges
            if seen_ranges[character_range] / character_count >= 0.15
        ]
    )
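
# Illustrative usage (a sketch, not part of the module's API): for a Cyrillic
# code page such as cp1251, both the ASCII letters and the Cyrillic block
# typically clear the 15% threshold. Exact output depends on the stdlib codec
# tables.
#
#   encoding_unicode_range("cp1251")  # e.g. ['Basic Latin', 'Cyrillic']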


def unicode_range_languages(primary_range: str) -> list[str]:
    """
    Return the languages inferred to use a given Unicode range.
    """
    languages: list[str] = []

    # A language qualifies as soon as one of its frequent characters falls
    # inside the target range.
    for language, characters in FREQUENCIES.items():
        for character in characters:
            if unicode_range(character) == primary_range:
                languages.append(language)
                break

    return languages
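
# Illustrative usage (a sketch; the exact list depends on the FREQUENCIES
# table shipped with the library):
#
#   unicode_range_languages("Cyrillic")  # e.g. ['Russian', 'Ukrainian', ...]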


@lru_cache()
def encoding_languages(iana_name: str) -> list[str]:
    """
    Single-byte encoding language association. Some code pages are heavily
    linked to particular language(s). This function provides that correspondence.
    """
    unicode_ranges: list[str] = encoding_unicode_range(iana_name)
    primary_range: str | None = None

    # The first non-Latin range is treated as the primary script of the code page.
    for specified_range in unicode_ranges:
        if "Latin" not in specified_range:
            primary_range = specified_range
            break

    if primary_range is None:
        return ["Latin Based"]

    return unicode_range_languages(primary_range)
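
# Illustrative usage (a sketch; outputs depend on FREQUENCIES and the codec
# tables): a purely Latin code page falls back to the generic marker, while a
# Cyrillic one maps to the Cyrillic languages.
#
#   encoding_languages("cp1252")  # -> ['Latin Based']
#   encoding_languages("cp1251")  # e.g. ['Russian', 'Ukrainian', ...]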


@lru_cache()
def mb_encoding_languages(iana_name: str) -> list[str]:
    """
    Multi-byte encoding language association. Some code pages are heavily
    linked to particular language(s). This function provides that correspondence.
    """
    if (
        iana_name.startswith("shift_")
        or iana_name.startswith("iso2022_jp")
        or iana_name.startswith("euc_j")
        or iana_name == "cp932"
    ):
        return ["Japanese"]
    if iana_name.startswith("gb") or iana_name in ZH_NAMES:
        return ["Chinese"]
    if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
        return ["Korean"]

    return []
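
# Illustrative usage (names follow the normalized IANA spelling used
# throughout the library):
#
#   mb_encoding_languages("shift_jis")  # -> ['Japanese']
#   mb_encoding_languages("gb18030")    # -> ['Chinese']
#   mb_encoding_languages("utf_8")      # -> []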


@lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
def get_target_features(language: str) -> tuple[bool, bool]:
    """
    Determine the main traits of a supported language: whether it uses
    accentuated characters and whether it is purely Latin-based.
    """
    target_have_accents: bool = False
    target_pure_latin: bool = True

    for character in FREQUENCIES[language]:
        if not target_have_accents and is_accentuated(character):
            target_have_accents = True
        if target_pure_latin and is_latin(character) is False:
            target_pure_latin = False

    return target_have_accents, target_pure_latin
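
# Illustrative usage (a sketch; results reflect the FREQUENCIES table):
#
#   get_target_features("French")   # likely (True, True): accents, pure Latin
#   get_target_features("Russian")  # likely (False, False): Cyrillic script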


def alphabet_languages(
    characters: list[str], ignore_non_latin: bool = False
) -> list[str]:
    """
    Return the languages associated with the given characters, best match first.
    """
    languages: list[tuple[str, float]] = []

    source_have_accents = any(is_accentuated(character) for character in characters)

    for language, language_characters in FREQUENCIES.items():
        target_have_accents, target_pure_latin = get_target_features(language)

        if ignore_non_latin and target_pure_latin is False:
            continue

        # A language without accents cannot explain an accentuated source.
        if target_have_accents is False and source_have_accents:
            continue

        character_count: int = len(language_characters)

        character_match_count: int = len(
            [c for c in language_characters if c in characters]
        )

        ratio: float = character_match_count / character_count

        # Keep languages for which at least 20% of the frequent characters
        # are present in the source.
        if ratio >= 0.2:
            languages.append((language, ratio))

    languages = sorted(languages, key=lambda x: x[1], reverse=True)

    return [compatible_language[0] for compatible_language in languages]
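
# Illustrative usage (a sketch; the candidate set depends on FREQUENCIES):
# feeding the most common English letters should surface English among other
# Latin-based candidates.
#
#   alphabet_languages(list("etaoinshrdlucm"))  # e.g. ['English', 'Dutch', ...]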


def characters_popularity_compare(
    language: str, ordered_characters: list[str]
) -> float:
    """
    Determine whether an ordered character list (most frequent first) matches
    a particular language. The result is a ratio between 0. (no correspondence)
    and 1. (near-perfect fit). Beware that this function is not strict on the
    match, in order to ease detection. (Meaning a close match counts as 1.)
    """
    if language not in FREQUENCIES:
        raise ValueError(f"{language} not available")

    character_approved_count: int = 0
    FREQUENCIES_language_set = set(FREQUENCIES[language])

    ordered_characters_count: int = len(ordered_characters)
    target_language_characters_count: int = len(FREQUENCIES[language])

    large_alphabet: bool = target_language_characters_count > 26

    for character, character_rank in zip(
        ordered_characters, range(0, ordered_characters_count)
    ):
        if character not in FREQUENCIES_language_set:
            continue

        character_rank_in_language: int = FREQUENCIES[language].index(character)
        # Project the observed rank onto the language's alphabet size so that
        # both ranks live on the same scale.
        expected_projection_ratio: float = (
            target_language_characters_count / ordered_characters_count
        )
        character_rank_projection: int = int(character_rank * expected_projection_ratio)

        if (
            large_alphabet is False
            and abs(character_rank_projection - character_rank_in_language) > 4
        ):
            continue

        if (
            large_alphabet is True
            and abs(character_rank_projection - character_rank_in_language)
            < target_language_characters_count / 3
        ):
            character_approved_count += 1
            continue

        # Fall back to comparing neighborhoods: the characters ranked before
        # and after the current one, in the source versus in the language.
        characters_before_source: list[str] = FREQUENCIES[language][
            0:character_rank_in_language
        ]
        characters_after_source: list[str] = FREQUENCIES[language][
            character_rank_in_language:
        ]
        characters_before: list[str] = ordered_characters[0:character_rank]
        characters_after: list[str] = ordered_characters[character_rank:]

        before_match_count: int = len(
            set(characters_before) & set(characters_before_source)
        )

        after_match_count: int = len(
            set(characters_after) & set(characters_after_source)
        )

        if len(characters_before_source) == 0 and before_match_count <= 4:
            character_approved_count += 1
            continue

        if len(characters_after_source) == 0 and after_match_count <= 4:
            character_approved_count += 1
            continue

        if (
            before_match_count / len(characters_before_source) >= 0.4
            or after_match_count / len(characters_after_source) >= 0.4
        ):
            character_approved_count += 1
            continue

    return character_approved_count / len(ordered_characters)
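
# Illustrative usage (a sketch): comparing a language's own frequency table
# against itself yields a perfect score, since every rank projects exactly
# onto itself.
#
#   characters_popularity_compare("English", FREQUENCIES["English"])  # -> 1.0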


def alpha_unicode_split(decoded_sequence: str) -> list[str]:
    """
    Given a decoded text sequence, return a list of str split by Unicode
    range / alphabet. E.g. a text mixing English/Latin with a bit of Hebrew
    will return two items: one containing the Latin letters, the other the
    Hebrew ones.
    """
    layers: dict[str, str] = {}

    for character in decoded_sequence:
        if character.isalpha() is False:
            continue

        character_range: str | None = unicode_range(character)

        if character_range is None:
            continue

        layer_target_range: str | None = None

        # Merge the character into an existing layer when its range is
        # plausibly used together with that layer's range; otherwise open a
        # new layer.
        for discovered_range in layers:
            if (
                is_suspiciously_successive_range(discovered_range, character_range)
                is False
            ):
                layer_target_range = discovered_range
                break

        if layer_target_range is None:
            layer_target_range = character_range

        if layer_target_range not in layers:
            layers[layer_target_range] = character.lower()
            continue

        layers[layer_target_range] += character.lower()

    return list(layers.values())
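
# Illustrative usage (a sketch): Latin and Hebrew are kept apart, and
# non-alphabetic characters are dropped.
#
#   alpha_unicode_split("Hello שלום")  # e.g. ['hello', 'שלום']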


def merge_coherence_ratios(results: list[CoherenceMatches]) -> CoherenceMatches:
    """
    Merge results previously given by the function coherence_ratio.
    The return type is the same as coherence_ratio.
    """
    per_language_ratios: dict[str, list[float]] = {}
    for result in results:
        for sub_result in result:
            language, ratio = sub_result
            if language not in per_language_ratios:
                per_language_ratios[language] = [ratio]
                continue
            per_language_ratios[language].append(ratio)

    # Average the ratios gathered for each language across all result sets.
    merge = [
        (
            language,
            round(
                sum(per_language_ratios[language]) / len(per_language_ratios[language]),
                4,
            ),
        )
        for language in per_language_ratios
    ]

    return sorted(merge, key=lambda x: x[1], reverse=True)
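
# Illustrative usage (pure arithmetic on the inputs): ratios for the same
# language are averaged, then sorted best first.
#
#   merge_coherence_ratios([[("English", 0.9)], [("English", 0.7), ("French", 0.5)]])
#   # -> [('English', 0.8), ('French', 0.5)]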


def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
    """
    We shall NOT return "English—" in CoherenceMatches because it is an
    alternative of "English". This function keeps only the best match and
    removes the em-dash from its name.
    """
    index_results: dict[str, list[float]] = dict()

    for result in results:
        language, ratio = result
        no_em_name: str = language.replace("—", "")

        if no_em_name not in index_results:
            index_results[no_em_name] = []

        index_results[no_em_name].append(ratio)

    # Collapse duplicates only if at least one language appeared more than once.
    if any(len(index_results[e]) > 1 for e in index_results):
        filtered_results: CoherenceMatches = []

        for language in index_results:
            filtered_results.append((language, max(index_results[language])))

        return filtered_results

    return results
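
# Illustrative usage: the alternative "English—" entry is folded into
# "English", keeping the higher ratio.
#
#   filter_alt_coherence_matches([("English", 0.8), ("English—", 0.85)])
#   # -> [('English', 0.85)]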


@lru_cache(maxsize=2048)
def coherence_ratio(
    decoded_sequence: str, threshold: float = 0.1, lg_inclusion: str | None = None
) -> CoherenceMatches:
    """
    Detect ANY language that can be identified in the given sequence. The
    sequence is analysed by layers: a layer is a character extraction by
    alphabet/Unicode range.
    """

    results: list[tuple[str, float]] = []
    ignore_non_latin: bool = False

    sufficient_match_count: int = 0

    lg_inclusion_list = lg_inclusion.split(",") if lg_inclusion is not None else []
    if "Latin Based" in lg_inclusion_list:
        ignore_non_latin = True
        lg_inclusion_list.remove("Latin Based")

    for layer in alpha_unicode_split(decoded_sequence):
        sequence_frequencies: TypeCounter[str] = Counter(layer)
        most_common = sequence_frequencies.most_common()

        character_count: int = sum(o for c, o in most_common)

        # Layers too small to be statistically meaningful are skipped.
        if character_count <= TOO_SMALL_SEQUENCE:
            continue

        popular_character_ordered: list[str] = [c for c, o in most_common]

        for language in lg_inclusion_list or alphabet_languages(
            popular_character_ordered, ignore_non_latin
        ):
            ratio: float = characters_popularity_compare(
                language, popular_character_ordered
            )

            if ratio < threshold:
                continue
            elif ratio >= 0.8:
                sufficient_match_count += 1

            results.append((language, round(ratio, 4)))

            # Stop early once enough strong matches have been collected.
            if sufficient_match_count >= 3:
                break

    return sorted(
        filter_alt_coherence_matches(results), key=lambda x: x[1], reverse=True
    )
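
# Illustrative usage (a sketch; exact ratios depend on the FREQUENCIES table):
# a sufficiently long English sentence should rank English near the top.
#
#   coherence_ratio("The quick brown fox jumps over the lazy dog and runs away")
#   # e.g. [('English', <ratio>), ('Dutch', <ratio>), ...]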